/* SPDX-License-Identifier: GPL-2.0
 *
 * page_pool/helpers.h
 *	Author:	Jesper Dangaard Brouer <netoptimizer@brouer.com>
 *	Copyright (C) 2016 Red Hat, Inc.
 */

/**
 * DOC: page_pool allocator
 *
 * The page_pool allocator is optimized for recycling pages or page fragments
 * used by skb packets and xdp frames.
 *
 * Basic use involves replacing any alloc_pages() calls with page_pool_alloc(),
 * which allocates memory with or without page splitting depending on the
 * requested memory size.
 *
 * If the driver knows that it always requires full pages or its allocations
 * are always smaller than half a page, it can use one of the more specific API
 * calls:
 *
 * 1. page_pool_alloc_pages(): allocate memory without page splitting when the
 * driver knows that the memory it needs is always bigger than half of the page
 * allocated from the page pool. There is no cache line dirtying for
 * 'struct page' when a page is recycled back to the page pool.
 *
 * 2. page_pool_alloc_frag(): allocate memory with page splitting when the
 * driver knows that the memory it needs is always smaller than or equal to
 * half of the page allocated from the page pool. Page splitting enables memory
 * saving and thus avoids TLB/cache misses for data access, but there is also
 * some cost to implement page splitting, mainly some cache line
 * dirtying/bouncing for 'struct page' and atomic operations for
 * page->pp_ref_count.
 *
 * The API keeps track of in-flight pages. In order to let API users know when
 * it is safe to free a page_pool object, the API users must call
 * page_pool_put_page() or page_pool_free_va() to free the page back to the
 * pool, or attach it to a page_pool-aware object like skbs marked with
 * skb_mark_for_recycle().
 *
 * page_pool_put_page() may be called multiple times on the same page if a page
 * is split into multiple fragments. For the last fragment, it will either
 * recycle the page, or in case of page->_refcount > 1, it will release the DMA
 * mapping and in-flight state accounting.
 *
 * dma_sync_single_range_for_device() is only called for the last fragment when
 * page_pool is created with the PP_FLAG_DMA_SYNC_DEV flag, so it depends on
 * the last freed fragment to do the sync_for_device operation for all
 * fragments in the same page when a page is split. The API user must set up
 * pool->p.max_len and pool->p.offset correctly and ensure that
 * page_pool_put_page() is called with dma_sync_size being -1 for the fragment
 * API.
 */
#ifndef _NET_PAGE_POOL_HELPERS_H
#define _NET_PAGE_POOL_HELPERS_H

#include <linux/dma-mapping.h>

#include <net/page_pool/types.h>
#include <net/net_debug.h>
#include <net/netmem.h>

#ifdef CONFIG_PAGE_POOL_STATS
/* Deprecated driver-facing API, use netlink instead */
int page_pool_ethtool_stats_get_count(void);
u8 *page_pool_ethtool_stats_get_strings(u8 *data);
u64 *page_pool_ethtool_stats_get(u64 *data, const void *stats);

bool page_pool_get_stats(const struct page_pool *pool,
			 struct page_pool_stats *stats);
#else
static inline int page_pool_ethtool_stats_get_count(void)
{
	return 0;
}

static inline u8 *page_pool_ethtool_stats_get_strings(u8 *data)
{
	return data;
}

static inline u64 *page_pool_ethtool_stats_get(u64 *data, const void *stats)
{
	return data;
}
#endif

/**
 * page_pool_dev_alloc_pages() - allocate a page.
 * @pool: pool from which to allocate
 *
 * Get a page from the page allocator or page_pool caches.
 */
static inline struct page *page_pool_dev_alloc_pages(struct page_pool *pool)
{
	gfp_t gfp = (GFP_ATOMIC | __GFP_NOWARN);

	return page_pool_alloc_pages(pool, gfp);
}

/**
 * page_pool_dev_alloc_frag() - allocate a page fragment.
 * @pool: pool from which to allocate
 * @offset: offset to the allocated page
 * @size: requested size
 *
 * Get a page fragment from the page allocator or page_pool caches.
 *
 * Return: allocated page fragment, otherwise return NULL.
 */
static inline struct page *page_pool_dev_alloc_frag(struct page_pool *pool,
						    unsigned int *offset,
						    unsigned int size)
{
	gfp_t gfp = (GFP_ATOMIC | __GFP_NOWARN);

	return page_pool_alloc_frag(pool, offset, size, gfp);
}

static inline netmem_ref page_pool_alloc_netmem(struct page_pool *pool,
						unsigned int *offset,
						unsigned int *size, gfp_t gfp)
{
	unsigned int max_size = PAGE_SIZE << pool->p.order;
	netmem_ref netmem;

	if ((*size << 1) > max_size) {
		*size = max_size;
		*offset = 0;
		return page_pool_alloc_netmems(pool, gfp);
	}

	netmem = page_pool_alloc_frag_netmem(pool, offset, *size, gfp);
	if (unlikely(!netmem))
		return 0;

	/* There is very likely not enough space for another fragment, so
	 * append the remaining size to the current fragment to avoid the
	 * truesize underestimate problem.
	 */
	if (pool->frag_offset + *size > max_size) {
		*size = max_size - *offset;
		pool->frag_offset = max_size;
	}

	return netmem;
}

static inline netmem_ref page_pool_dev_alloc_netmem(struct page_pool *pool,
						    unsigned int *offset,
						    unsigned int *size)
{
	gfp_t gfp = GFP_ATOMIC | __GFP_NOWARN;

	return page_pool_alloc_netmem(pool, offset, size, gfp);
}

static inline netmem_ref page_pool_dev_alloc_netmems(struct page_pool *pool)
{
	gfp_t gfp = GFP_ATOMIC | __GFP_NOWARN;

	return page_pool_alloc_netmems(pool, gfp);
}

static inline struct page *page_pool_alloc(struct page_pool *pool,
					   unsigned int *offset,
					   unsigned int *size, gfp_t gfp)
{
	return netmem_to_page(page_pool_alloc_netmem(pool, offset, size, gfp));
}

/**
 * page_pool_dev_alloc() - allocate a page or a page fragment.
 * @pool: pool from which to allocate
 * @offset: offset to the allocated page
 * @size: in as the requested size, out as the allocated size
 *
 * Get a page or a page fragment from the page allocator or page_pool caches
 * depending on the requested size, in order to allocate memory with the least
 * memory utilization and performance penalty.
 *
 * Return: allocated page or page fragment, otherwise return NULL.
 */
static inline struct page *page_pool_dev_alloc(struct page_pool *pool,
					       unsigned int *offset,
					       unsigned int *size)
{
	gfp_t gfp = (GFP_ATOMIC | __GFP_NOWARN);

	return page_pool_alloc(pool, offset, size, gfp);
}

static inline void *page_pool_alloc_va(struct page_pool *pool,
				       unsigned int *size, gfp_t gfp)
{
	unsigned int offset;
	struct page *page;

	/* Mask off __GFP_HIGHMEM to ensure we can use page_address() */
	page = page_pool_alloc(pool, &offset, size, gfp & ~__GFP_HIGHMEM);
	if (unlikely(!page))
		return NULL;

	return page_address(page) + offset;
}

/**
 * page_pool_dev_alloc_va() - allocate a page or a page fragment and return
 *			      its va.
 * @pool: pool from which to allocate
 * @size: in as the requested size, out as the allocated size
 *
 * This is just a thin wrapper around the page_pool_alloc() API, and it
 * returns the va of the allocated page or page fragment.
 *
 * Return: the va for the allocated page or page fragment, otherwise return
 * NULL.
 */
static inline void *page_pool_dev_alloc_va(struct page_pool *pool,
					   unsigned int *size)
{
	gfp_t gfp = (GFP_ATOMIC | __GFP_NOWARN);

	return page_pool_alloc_va(pool, size, gfp);
}

/**
 * page_pool_get_dma_dir() - Retrieve the stored DMA direction.
 * @pool: pool from which page was allocated
 *
 * Get the stored dma direction. A driver might decide to store this locally
 * and avoid the extra cache line from page_pool to determine the direction.
 */
static inline enum dma_data_direction
page_pool_get_dma_dir(const struct page_pool *pool)
{
	return pool->p.dma_dir;
}

static inline void page_pool_fragment_netmem(netmem_ref netmem, long nr)
{
	atomic_long_set(netmem_get_pp_ref_count_ref(netmem), nr);
}

/**
 * page_pool_fragment_page() - split a fresh page into fragments
 * @page: page to split
 * @nr: references to set
 *
 * pp_ref_count represents the number of outstanding references to the page,
 * which will be freed using page_pool APIs (rather than page allocator APIs
 * like put_page()). Such references are usually held by page_pool-aware
 * objects like skbs marked for page pool recycling.
 *
 * This helper allows the caller to take (set) multiple references to a
 * freshly allocated page. The page must be freshly allocated (have a
 * pp_ref_count of 1). This is commonly done by drivers and
 * "fragment allocators" to save atomic operations - either when they know
 * upfront how many references they will need; or to take MAX references and
 * return the unused ones with a single atomic dec(), instead of performing
 * multiple atomic inc() operations.
 */
static inline void page_pool_fragment_page(struct page *page, long nr)
{
	page_pool_fragment_netmem(page_to_netmem(page), nr);
}

static inline long page_pool_unref_netmem(netmem_ref netmem, long nr)
{
	atomic_long_t *pp_ref_count = netmem_get_pp_ref_count_ref(netmem);
	long ret;

	/* If nr == pp_ref_count then we have cleared all remaining
	 * references to the page:
	 * 1. 'n == 1': no need to actually overwrite it.
	 * 2. 'n != 1': overwrite it with one, which is the rare case
	 *              for pp_ref_count draining.
	 *
	 * The main advantage of doing this is that not only do we avoid an
	 * atomic update, as an atomic_read is generally a much cheaper
	 * operation than an atomic update, especially when dealing with a
	 * page that may be referenced by only 2 or 3 users; we also unify the
	 * pp_ref_count handling by ensuring all pages are initially
	 * partitioned into only 1 piece, and only overwrite it when the page
	 * is partitioned into more than one piece.
	 */
	if (atomic_long_read(pp_ref_count) == nr) {
		/* As we have ensured nr is always one for the constant case
		 * using the BUILD_BUG_ON(), we only need to handle the
		 * non-constant case here for pp_ref_count draining, which is
		 * a rare case.
		 */
		BUILD_BUG_ON(__builtin_constant_p(nr) && nr != 1);
		if (!__builtin_constant_p(nr))
			atomic_long_set(pp_ref_count, 1);

		return 0;
	}

	ret = atomic_long_sub_return(nr, pp_ref_count);
	WARN_ON(ret < 0);

	/* We are the last user here too, reset pp_ref_count back to 1 to
	 * ensure all pages have been partitioned into 1 piece initially;
	 * this should be the rare case when the last two fragment users call
	 * page_pool_unref_page() currently.
	 */
	if (unlikely(!ret))
		atomic_long_set(pp_ref_count, 1);

	return ret;
}

static inline long page_pool_unref_page(struct page *page, long nr)
{
	return page_pool_unref_netmem(page_to_netmem(page), nr);
}

static inline void page_pool_ref_netmem(netmem_ref netmem)
{
	atomic_long_inc(netmem_get_pp_ref_count_ref(netmem));
}

static inline void page_pool_ref_page(struct page *page)
{
	page_pool_ref_netmem(page_to_netmem(page));
}

static inline bool page_pool_unref_and_test(netmem_ref netmem)
{
	/* If page_pool_unref_page() returns 0, we were the last user */
	return page_pool_unref_netmem(netmem, 1) == 0;
}

static inline void page_pool_put_netmem(struct page_pool *pool,
					netmem_ref netmem,
					unsigned int dma_sync_size,
					bool allow_direct)
{
	/* When page_pool isn't compiled-in, net/core/xdp.c doesn't
	 * allow registering MEM_TYPE_PAGE_POOL, but shield linker.
	 */
#ifdef CONFIG_PAGE_POOL
	if (!page_pool_unref_and_test(netmem))
		return;

	page_pool_put_unrefed_netmem(pool, netmem, dma_sync_size,
				     allow_direct);
#endif
}

/**
 * page_pool_put_page() - release a reference to a page pool page
 * @pool: pool from which page was allocated
 * @page: page to release a reference on
 * @dma_sync_size: how much of the page may have been touched by the device
 * @allow_direct: released by the consumer, allow lockless caching
 *
 * The outcome of this depends on the page refcnt. If the driver bumps
 * the refcnt > 1 this will unmap the page. If the page refcnt is 1
 * the allocator owns the page and will try to recycle it in one of the pool
 * caches. If PP_FLAG_DMA_SYNC_DEV is set, the page will be synced for_device
 * using dma_sync_single_range_for_device().
 */
static inline void page_pool_put_page(struct page_pool *pool,
				      struct page *page,
				      unsigned int dma_sync_size,
				      bool allow_direct)
{
	page_pool_put_netmem(pool, page_to_netmem(page), dma_sync_size,
			     allow_direct);
}

static inline void page_pool_put_full_netmem(struct page_pool *pool,
					     netmem_ref netmem,
					     bool allow_direct)
{
	page_pool_put_netmem(pool, netmem, -1, allow_direct);
}

/**
 * page_pool_put_full_page() - release a reference on a page pool page
 * @pool: pool from which page was allocated
 * @page: page to release a reference on
 * @allow_direct: released by the consumer, allow lockless caching
 *
 * Similar to page_pool_put_page(), but will DMA sync the entire memory area
 * as configured in &page_pool_params.max_len.
 */
static inline void page_pool_put_full_page(struct page_pool *pool,
					   struct page *page,
					   bool allow_direct)
{
	page_pool_put_netmem(pool, page_to_netmem(page), -1, allow_direct);
}

/**
 * page_pool_recycle_direct() - release a reference on a page pool page
 * @pool: pool from which page was allocated
 * @page: page to release a reference on
 *
 * Similar to page_pool_put_full_page() but caller must guarantee safe context
 * (e.g. NAPI), since it will recycle the page directly into the pool fast
 * cache.
 */
static inline void page_pool_recycle_direct(struct page_pool *pool,
					    struct page *page)
{
	page_pool_put_full_page(pool, page, true);
}

static inline void page_pool_recycle_direct_netmem(struct page_pool *pool,
						   netmem_ref netmem)
{
	page_pool_put_full_netmem(pool, netmem, true);
}

#define PAGE_POOL_32BIT_ARCH_WITH_64BIT_DMA	\
		(sizeof(dma_addr_t) > sizeof(unsigned long))

/**
 * page_pool_free_va() - free a va into the page_pool
 * @pool: pool from which va was allocated
 * @va: va to be freed
 * @allow_direct: freed by the consumer, allow lockless caching
 *
 * Free a va allocated from page_pool_alloc_va().
 */
static inline void page_pool_free_va(struct page_pool *pool, void *va,
				     bool allow_direct)
{
	page_pool_put_page(pool, virt_to_head_page(va), -1, allow_direct);
}

static inline dma_addr_t page_pool_get_dma_addr_netmem(netmem_ref netmem)
{
	dma_addr_t ret = netmem_get_dma_addr(netmem);

	if (PAGE_POOL_32BIT_ARCH_WITH_64BIT_DMA)
		ret <<= PAGE_SHIFT;

	return ret;
}

/**
 * page_pool_get_dma_addr() - Retrieve the stored DMA address.
 * @page: page allocated from a page pool
 *
 * Fetch the DMA address of the page. The page pool to which the page belongs
 * must have been created with PP_FLAG_DMA_MAP.
 */
static inline dma_addr_t page_pool_get_dma_addr(const struct page *page)
{
	return page_pool_get_dma_addr_netmem(page_to_netmem(page));
}

static inline void __page_pool_dma_sync_for_cpu(const struct page_pool *pool,
						const dma_addr_t dma_addr,
						u32 offset, u32 dma_sync_size)
{
	dma_sync_single_range_for_cpu(pool->p.dev, dma_addr,
				      offset + pool->p.offset, dma_sync_size,
				      page_pool_get_dma_dir(pool));
}

/**
 * page_pool_dma_sync_for_cpu - sync Rx page for CPU after it's written by HW
 * @pool: &page_pool the @page belongs to
 * @page: page to sync
 * @offset: offset from page start to "hard" start if using PP frags
 * @dma_sync_size: size of the data written to the page
 *
 * Can be used as a shorthand to sync Rx pages before accessing them in the
 * driver. Caller must ensure the pool was created with ``PP_FLAG_DMA_MAP``.
 * Note that this version performs DMA sync unconditionally, even if the
 * associated PP doesn't perform sync-for-device.
 */
static inline void page_pool_dma_sync_for_cpu(const struct page_pool *pool,
					      const struct page *page,
					      u32 offset, u32 dma_sync_size)
{
	__page_pool_dma_sync_for_cpu(pool, page_pool_get_dma_addr(page),
				     offset, dma_sync_size);
}

static inline void
page_pool_dma_sync_netmem_for_cpu(const struct page_pool *pool,
				  const netmem_ref netmem, u32 offset,
				  u32 dma_sync_size)
{
	if (!pool->dma_sync_for_cpu)
		return;

	__page_pool_dma_sync_for_cpu(pool,
				     page_pool_get_dma_addr_netmem(netmem),
				     offset, dma_sync_size);
}

static inline bool page_pool_put(struct page_pool *pool)
{
	return refcount_dec_and_test(&pool->user_cnt);
}

static inline void page_pool_nid_changed(struct page_pool *pool, int new_nid)
{
	if (unlikely(pool->p.nid != new_nid))
		page_pool_update_nid(pool, new_nid);
}

static inline bool page_pool_is_unreadable(struct page_pool *pool)
{
	return !!pool->mp_ops;
}

#endif /* _NET_PAGE_POOL_HELPERS_H */
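/*
 * Hedged usage sketch (not part of the header above): one way a driver RX
 * path might combine the helpers documented here. my_rx_refill() and
 * MY_RX_BUF_SIZE are hypothetical names; the page_pool calls are the ones
 * declared above, and the pool is assumed to have been created with
 * PP_FLAG_DMA_MAP so that page_pool_get_dma_addr() is valid.
 */
#define MY_RX_BUF_SIZE	2048	/* hypothetical per-buffer size */

static int my_rx_refill(struct page_pool *pool)
{
	unsigned int offset;
	unsigned int size = MY_RX_BUF_SIZE;
	struct page *page;
	dma_addr_t dma;

	/* Picks full-page or fragment allocation based on 'size'; on return,
	 * 'size' holds the length actually reserved for this buffer (it may
	 * be rounded up to avoid the truesize underestimate problem).
	 */
	page = page_pool_dev_alloc(pool, &offset, &size);
	if (!page)
		return -ENOMEM;

	/* A real driver would post 'dma' to its RX descriptor ring here. */
	dma = page_pool_get_dma_addr(page) + offset;
	(void)dma;

	/* Error/teardown path: dma_sync_size of -1 syncs pool->p.max_len. */
	page_pool_put_full_page(pool, page, false);
	return 0;
}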
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * CAN driver for PEAK System micro-CAN based adapters
 *
 * Copyright (C) 2003-2011 PEAK System-Technik GmbH
 * Copyright (C) 2011-2013 Stephane Grosjean <s.grosjean@peak-system.com>
 */
#ifndef PUCAN_H
#define PUCAN_H

/* uCAN commands opcodes list (low-order 10 bits) */
#define PUCAN_CMD_NOP			0x000
#define PUCAN_CMD_RESET_MODE		0x001
#define PUCAN_CMD_NORMAL_MODE		0x002
#define PUCAN_CMD_LISTEN_ONLY_MODE	0x003
#define PUCAN_CMD_TIMING_SLOW		0x004
#define PUCAN_CMD_TIMING_FAST		0x005
#define PUCAN_CMD_SET_STD_FILTER	0x006
#define PUCAN_CMD_RESERVED2		0x007
#define PUCAN_CMD_FILTER_STD		0x008
#define PUCAN_CMD_TX_ABORT		0x009
#define PUCAN_CMD_WR_ERR_CNT		0x00a
#define PUCAN_CMD_SET_EN_OPTION		0x00b
#define PUCAN_CMD_CLR_DIS_OPTION	0x00c
#define PUCAN_CMD_RX_BARRIER		0x010
#define PUCAN_CMD_END_OF_COLLECTION	0x3ff

/* uCAN received messages list */
#define PUCAN_MSG_CAN_RX		0x0001
#define PUCAN_MSG_ERROR			0x0002
#define PUCAN_MSG_STATUS		0x0003
#define PUCAN_MSG_BUSLOAD		0x0004

#define PUCAN_MSG_CACHE_CRITICAL	0x0102

/* uCAN transmitted messages */
#define PUCAN_MSG_CAN_TX		0x1000

/* uCAN command common header */
struct __packed pucan_command {
	__le16	opcode_channel;
	u16	args[3];
};

/* return the opcode from the opcode_channel field of a command */
static inline u16 pucan_cmd_get_opcode(struct pucan_command *c)
{
	return le16_to_cpu(c->opcode_channel) & 0x3ff;
}

#define PUCAN_TSLOW_BRP_BITS		10
#define PUCAN_TSLOW_TSGEG1_BITS		8
#define PUCAN_TSLOW_TSGEG2_BITS		7
#define PUCAN_TSLOW_SJW_BITS		7

#define PUCAN_TSLOW_BRP_MASK		((1 << PUCAN_TSLOW_BRP_BITS) - 1)
#define PUCAN_TSLOW_TSEG1_MASK		((1 << PUCAN_TSLOW_TSGEG1_BITS) - 1)
#define PUCAN_TSLOW_TSEG2_MASK		((1 << PUCAN_TSLOW_TSGEG2_BITS) - 1)
#define PUCAN_TSLOW_SJW_MASK		((1 << PUCAN_TSLOW_SJW_BITS) - 1)

/* uCAN TIMING_SLOW command fields */
#define PUCAN_TSLOW_SJW_T(s, t)		(((s) & PUCAN_TSLOW_SJW_MASK) | \
					 ((!!(t)) << 7))
#define PUCAN_TSLOW_TSEG2(t)		((t) & PUCAN_TSLOW_TSEG2_MASK)
#define PUCAN_TSLOW_TSEG1(t)		((t) & PUCAN_TSLOW_TSEG1_MASK)
#define PUCAN_TSLOW_BRP(b)		((b) & PUCAN_TSLOW_BRP_MASK)

struct __packed pucan_timing_slow {
	__le16	opcode_channel;

	u8	ewl;		/* Error Warning limit */
	u8	sjw_t;		/* Sync Jump Width + Triple sampling */
	u8	tseg2;		/* Timing SEGment 2 */
	u8	tseg1;		/* Timing SEGment 1 */

	__le16	brp;		/* BaudRate Prescaler */
};

#define PUCAN_TFAST_BRP_BITS		10
#define PUCAN_TFAST_TSGEG1_BITS		5
#define PUCAN_TFAST_TSGEG2_BITS		4
#define PUCAN_TFAST_SJW_BITS		4

#define PUCAN_TFAST_BRP_MASK		((1 << PUCAN_TFAST_BRP_BITS) - 1)
#define PUCAN_TFAST_TSEG1_MASK		((1 << PUCAN_TFAST_TSGEG1_BITS) - 1)
#define PUCAN_TFAST_TSEG2_MASK		((1 << PUCAN_TFAST_TSGEG2_BITS) - 1)
#define PUCAN_TFAST_SJW_MASK		((1 << PUCAN_TFAST_SJW_BITS) - 1)

/* uCAN TIMING_FAST command fields */
#define PUCAN_TFAST_SJW(s)		((s) & PUCAN_TFAST_SJW_MASK)
#define PUCAN_TFAST_TSEG2(t)		((t) & PUCAN_TFAST_TSEG2_MASK)
#define PUCAN_TFAST_TSEG1(t)		((t) & PUCAN_TFAST_TSEG1_MASK)
#define PUCAN_TFAST_BRP(b)		((b) & PUCAN_TFAST_BRP_MASK)

struct __packed pucan_timing_fast {
	__le16	opcode_channel;

	u8	unused;
	u8	sjw;		/* Sync Jump Width */
	u8	tseg2;		/* Timing SEGment 2 */
	u8	tseg1;		/* Timing SEGment 1 */

	__le16	brp;		/* BaudRate Prescaler */
};

/* uCAN FILTER_STD command fields */
#define PUCAN_FLTSTD_ROW_IDX_BITS	6

struct __packed pucan_filter_std {
	__le16	opcode_channel;

	__le16	idx;
	__le32	mask;		/* CAN-ID bitmask in idx range */
};

#define PUCAN_FLTSTD_ROW_IDX_MAX	((1 << PUCAN_FLTSTD_ROW_IDX_BITS) - 1)

/* uCAN SET_STD_FILTER command fields */
struct __packed pucan_std_filter {
	__le16	opcode_channel;

	u8	unused;
	u8	idx;
	__le32	mask;		/* CAN-ID bitmask in idx range */
};

/* uCAN TX_ABORT commands fields */
#define PUCAN_TX_ABORT_FLUSH		0x0001

struct __packed pucan_tx_abort {
	__le16	opcode_channel;

	__le16	flags;
	u32	unused;
};

/* uCAN WR_ERR_CNT command fields */
#define PUCAN_WRERRCNT_TE		0x4000	/* Tx error cntr write Enable */
#define PUCAN_WRERRCNT_RE		0x8000	/* Rx error cntr write Enable */

struct __packed pucan_wr_err_cnt {
	__le16	opcode_channel;

	__le16	sel_mask;
	u8	tx_counter;	/* Tx error counter new value */
	u8	rx_counter;	/* Rx error counter new value */

	u16	unused;
};

/* uCAN SET_EN/CLR_DIS _OPTION command fields */
#define PUCAN_OPTION_ERROR		0x0001
#define PUCAN_OPTION_BUSLOAD		0x0002
#define PUCAN_OPTION_CANDFDISO		0x0004

struct __packed pucan_options {
	__le16	opcode_channel;

	__le16	options;
	u32	unused;
};

/* uCAN received messages global format */
struct __packed pucan_msg {
	__le16	size;
	__le16	type;
	__le32	ts_low;
	__le32	ts_high;
};

/* uCAN flags for CAN/CANFD messages */
#define PUCAN_MSG_SELF_RECEIVE		0x80
#define PUCAN_MSG_ERROR_STATE_IND	0x40	/* error state indicator */
#define PUCAN_MSG_BITRATE_SWITCH	0x20	/* bitrate switch */
#define PUCAN_MSG_EXT_DATA_LEN		0x10	/* extended data length */
#define PUCAN_MSG_SINGLE_SHOT		0x08
#define PUCAN_MSG_LOOPED_BACK		0x04
#define PUCAN_MSG_EXT_ID		0x02
#define PUCAN_MSG_RTR			0x01

struct __packed pucan_rx_msg {
	__le16	size;
	__le16	type;
	__le32	ts_low;
	__le32	ts_high;
	__le32	tag_low;
	__le32	tag_high;
	u8	channel_dlc;
	u8	client;
	__le16	flags;
	__le32	can_id;
	u8	d[];
};

/* uCAN error types */
#define PUCAN_ERMSG_BIT_ERROR		0
#define PUCAN_ERMSG_FORM_ERROR		1
#define PUCAN_ERMSG_STUFF_ERROR		2
#define PUCAN_ERMSG_OTHER_ERROR		3
#define PUCAN_ERMSG_ERR_CNT_DEC		4

struct __packed pucan_error_msg {
	__le16	size;
	__le16	type;
	__le32	ts_low;
	__le32	ts_high;
	u8	channel_type_d;
	u8	code_g;
	u8	tx_err_cnt;
	u8	rx_err_cnt;
};

static inline int pucan_error_get_channel(const struct pucan_error_msg *msg)
{
	return msg->channel_type_d & 0x0f;
}

#define PUCAN_RX_BARRIER		0x10
#define PUCAN_BUS_PASSIVE		0x20
#define PUCAN_BUS_WARNING		0x40
#define PUCAN_BUS_BUSOFF		0x80

struct __packed pucan_status_msg {
	__le16	size;
	__le16	type;
	__le32	ts_low;
	__le32	ts_high;
	u8	channel_p_w_b;
	u8	unused[3];
};

static inline int pucan_status_get_channel(const struct pucan_status_msg *msg)
{
	return msg->channel_p_w_b & 0x0f;
}

static inline int pucan_status_is_rx_barrier(const struct pucan_status_msg *msg)
{
	return msg->channel_p_w_b & PUCAN_RX_BARRIER;
}

static inline int pucan_status_is_passive(const struct pucan_status_msg *msg)
{
	return msg->channel_p_w_b & PUCAN_BUS_PASSIVE;
}

static inline int pucan_status_is_warning(const struct pucan_status_msg *msg)
{
	return msg->channel_p_w_b & PUCAN_BUS_WARNING;
}

static inline int pucan_status_is_busoff(const struct pucan_status_msg *msg)
{
	return msg->channel_p_w_b & PUCAN_BUS_BUSOFF;
}

/* uCAN transmitted message format */
#define PUCAN_MSG_CHANNEL_DLC(c, d)	(((c) & 0xf) | ((d) << 4))

struct __packed pucan_tx_msg {
	__le16	size;
	__le16	type;
	__le32	tag_low;
	__le32	tag_high;
	u8	channel_dlc;
	u8	client;
	__le16	flags;
	__le32	can_id;
	u8	d[];
};

/* build the cmd opcode_channel field with respect to the correct endianness */
static inline __le16 pucan_cmd_opcode_channel(int index, int opcode)
{
	return cpu_to_le16(((index) << 12) | ((opcode) & 0x3ff));
}

/* return the channel number part from any received message channel_dlc field */
static inline int pucan_msg_get_channel(const struct pucan_rx_msg *msg)
{
	return msg->channel_dlc & 0xf;
}

/* return the dlc value from any received message channel_dlc field */
static inline u8 pucan_msg_get_dlc(const struct pucan_rx_msg *msg)
{
	return msg->channel_dlc >> 4;
}

static inline int pucan_ermsg_get_channel(const struct pucan_error_msg *msg)
{
	return msg->channel_type_d & 0x0f;
}

static inline int pucan_stmsg_get_channel(const struct pucan_status_msg *msg)
{
	return msg->channel_p_w_b & 0x0f;
}

#endif
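/*
 * Hedged sketch (not part of pucan.h): filling a TIMING_SLOW command for
 * channel 0 with the field macros above. The bittiming values are made up
 * for illustration. pucan_cmd_opcode_channel() packs the channel index into
 * the top 4 bits and the opcode into the low 10 bits, which is what
 * pucan_cmd_get_opcode() strips back out.
 */
static void example_fill_timing_slow(struct pucan_timing_slow *cmd)
{
	cmd->opcode_channel = pucan_cmd_opcode_channel(0,
						       PUCAN_CMD_TIMING_SLOW);
	cmd->ewl = 96;				/* arbitrary warning limit */
	cmd->sjw_t = PUCAN_TSLOW_SJW_T(1, 0);	/* SJW=1, no triple sampling */
	cmd->tseg2 = PUCAN_TSLOW_TSEG2(3);
	cmd->tseg1 = PUCAN_TSLOW_TSEG1(12);
	cmd->brp = cpu_to_le16(PUCAN_TSLOW_BRP(4));
}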
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Driver for the Diolan u2c-12 USB-I2C adapter
 *
 * Copyright (c) 2010-2011 Ericsson AB
 *
 * Derived from:
 *  i2c-tiny-usb.c
 *  Copyright (C) 2006-2007 Till Harbaum (Till@Harbaum.org)
 */

#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/usb.h>
#include <linux/i2c.h>

#define DRIVER_NAME		"i2c-diolan-u2c"

#define USB_VENDOR_ID_DIOLAN		0x0abf
#define USB_DEVICE_ID_DIOLAN_U2C	0x3370

/* commands via USB, must match command ids in the firmware */
#define CMD_I2C_READ		0x01
#define CMD_I2C_WRITE		0x02
#define CMD_I2C_SCAN		0x03	/* Returns list of detected devices */
#define CMD_I2C_RELEASE_SDA	0x04
#define CMD_I2C_RELEASE_SCL	0x05
#define CMD_I2C_DROP_SDA	0x06
#define CMD_I2C_DROP_SCL	0x07
#define CMD_I2C_READ_SDA	0x08
#define CMD_I2C_READ_SCL	0x09
#define CMD_GET_FW_VERSION	0x0a
#define CMD_GET_SERIAL		0x0b
#define CMD_I2C_START		0x0c
#define CMD_I2C_STOP		0x0d
#define CMD_I2C_REPEATED_START	0x0e
#define CMD_I2C_PUT_BYTE	0x0f
#define CMD_I2C_GET_BYTE	0x10
#define CMD_I2C_PUT_ACK		0x11
#define CMD_I2C_GET_ACK		0x12
#define CMD_I2C_PUT_BYTE_ACK	0x13
#define CMD_I2C_GET_BYTE_ACK	0x14
#define CMD_I2C_SET_SPEED	0x1b
#define CMD_I2C_GET_SPEED	0x1c
#define CMD_I2C_SET_CLK_SYNC	0x24
#define CMD_I2C_GET_CLK_SYNC	0x25
#define CMD_I2C_SET_CLK_SYNC_TO	0x26
#define CMD_I2C_GET_CLK_SYNC_TO	0x27

#define RESP_OK			0x00
#define RESP_FAILED		0x01
#define RESP_BAD_MEMADDR	0x04
#define RESP_DATA_ERR		0x05
#define RESP_NOT_IMPLEMENTED	0x06
#define RESP_NACK		0x07
#define RESP_TIMEOUT		0x09

#define U2C_I2C_SPEED_FAST	0	/* 400 kHz */
#define U2C_I2C_SPEED_STD	1	/* 100 kHz */
#define U2C_I2C_SPEED_2KHZ	242	/* 2 kHz, minimum speed */
#define U2C_I2C_SPEED(f)	((DIV_ROUND_UP(1000000, (f)) - 10) / 2 + 1)
#define U2C_I2C_FREQ(s)		(1000000 / (2 * (s - 1) + 10))

#define DIOLAN_USB_TIMEOUT	100	/* in ms */
#define DIOLAN_SYNC_TIMEOUT	20	/* in ms */

#define DIOLAN_OUTBUF_LEN	128
#define DIOLAN_FLUSH_LEN	(DIOLAN_OUTBUF_LEN - 4)
#define DIOLAN_INBUF_LEN	256	/* Maximum supported receive length */

/* Structure to hold all of our device specific stuff */
struct i2c_diolan_u2c {
	u8 obuffer[DIOLAN_OUTBUF_LEN];	/* output buffer */
	u8 ibuffer[DIOLAN_INBUF_LEN];	/* input buffer */
	int ep_in, ep_out;		/* Endpoints */
	struct usb_device *usb_dev;	/* the usb device for this device */
	struct usb_interface *interface;/* the interface for this device */
	struct i2c_adapter adapter;	/* i2c related things */
	int olen;			/* Output buffer length */
	int ocount;			/* Number of enqueued messages */
};

static uint frequency = I2C_MAX_STANDARD_MODE_FREQ;	/* I2C clock frequency in Hz */

module_param(frequency, uint, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(frequency, "I2C clock frequency in hertz");

/* usb layer */

/* Send command to device, and get response. */
static int diolan_usb_transfer(struct i2c_diolan_u2c *dev)
{
	int ret = 0;
	int actual;
	int i;

	if (!dev->olen || !dev->ocount)
		return -EINVAL;

	ret = usb_bulk_msg(dev->usb_dev,
			   usb_sndbulkpipe(dev->usb_dev, dev->ep_out),
			   dev->obuffer, dev->olen, &actual,
			   DIOLAN_USB_TIMEOUT);
	if (!ret) {
		for (i = 0; i < dev->ocount; i++) {
			int tmpret;

			tmpret = usb_bulk_msg(dev->usb_dev,
					      usb_rcvbulkpipe(dev->usb_dev,
							      dev->ep_in),
					      dev->ibuffer,
					      sizeof(dev->ibuffer), &actual,
					      DIOLAN_USB_TIMEOUT);
			/*
			 * Stop command processing if a previous command
			 * returned an error.
			 * Note that we still need to retrieve all messages.
			 */
			if (ret < 0)
				continue;
			ret = tmpret;
			if (ret == 0 && actual > 0) {
				switch (dev->ibuffer[actual - 1]) {
				case RESP_NACK:
					/*
					 * Return ENXIO if NACK was received as
					 * response to the address phase,
					 * EIO otherwise
					 */
					ret = i == 1 ? -ENXIO : -EIO;
					break;
				case RESP_TIMEOUT:
					ret = -ETIMEDOUT;
					break;
				case RESP_OK:
					/* strip off return code */
					ret = actual - 1;
					break;
				default:
					ret = -EIO;
					break;
				}
			}
		}
	}
	dev->olen = 0;
	dev->ocount = 0;
	return ret;
}

static int diolan_write_cmd(struct i2c_diolan_u2c *dev, bool flush)
{
	if (flush || dev->olen >= DIOLAN_FLUSH_LEN)
		return diolan_usb_transfer(dev);
	return 0;
}

/* Send command (no data) */
static int diolan_usb_cmd(struct i2c_diolan_u2c *dev, u8 command, bool flush)
{
	dev->obuffer[dev->olen++] = command;
	dev->ocount++;
	return diolan_write_cmd(dev, flush);
}

/* Send command with one byte of data */
static int diolan_usb_cmd_data(struct i2c_diolan_u2c *dev, u8 command, u8 data,
			       bool flush)
{
	dev->obuffer[dev->olen++] = command;
	dev->obuffer[dev->olen++] = data;
	dev->ocount++;
	return diolan_write_cmd(dev, flush);
}

/* Send command with two bytes of data */
static int diolan_usb_cmd_data2(struct i2c_diolan_u2c *dev, u8 command, u8 d1,
				u8 d2, bool flush)
{
	dev->obuffer[dev->olen++] = command;
	dev->obuffer[dev->olen++] = d1;
	dev->obuffer[dev->olen++] = d2;
	dev->ocount++;
	return diolan_write_cmd(dev, flush);
}

/*
 * Flush input queue.
 * If we don't do this at startup and the controller has queued up
 * messages which were not retrieved, it will stop responding
 * at some point.
 */
static void diolan_flush_input(struct i2c_diolan_u2c *dev)
{
	int i;

	for (i = 0; i < 10; i++) {
		int actual = 0;
		int ret;

		ret = usb_bulk_msg(dev->usb_dev,
				   usb_rcvbulkpipe(dev->usb_dev, dev->ep_in),
				   dev->ibuffer, sizeof(dev->ibuffer),
				   &actual, DIOLAN_USB_TIMEOUT);
		if (ret < 0 || actual == 0)
			break;
	}
	if (i == 10)
		dev_err(&dev->interface->dev, "Failed to flush input buffer\n");
}

static int diolan_i2c_start(struct i2c_diolan_u2c *dev)
{
	return diolan_usb_cmd(dev, CMD_I2C_START, false);
}

static int diolan_i2c_repeated_start(struct i2c_diolan_u2c *dev)
{
	return diolan_usb_cmd(dev, CMD_I2C_REPEATED_START, false);
}

static int diolan_i2c_stop(struct i2c_diolan_u2c *dev)
{
	return diolan_usb_cmd(dev, CMD_I2C_STOP, true);
}

static int diolan_i2c_get_byte_ack(struct i2c_diolan_u2c *dev, bool ack,
				   u8 *byte)
{
	int ret;

	ret = diolan_usb_cmd_data(dev, CMD_I2C_GET_BYTE_ACK, ack, true);
	if (ret > 0)
		*byte = dev->ibuffer[0];
	else if (ret == 0)
		ret = -EIO;

	return ret;
}

static int diolan_i2c_put_byte_ack(struct i2c_diolan_u2c *dev, u8 byte)
{
	return diolan_usb_cmd_data(dev, CMD_I2C_PUT_BYTE_ACK, byte, false);
}

static int diolan_set_speed(struct i2c_diolan_u2c *dev, u8 speed)
{
	return diolan_usb_cmd_data(dev, CMD_I2C_SET_SPEED, speed, true);
}

/* Enable or disable clock synchronization (stretching) */
static int diolan_set_clock_synch(struct i2c_diolan_u2c *dev, bool enable)
{
	return diolan_usb_cmd_data(dev, CMD_I2C_SET_CLK_SYNC, enable, true);
}

/* Set clock synchronization timeout in ms */
static int diolan_set_clock_synch_timeout(struct i2c_diolan_u2c *dev, int ms)
{
	int to_val = ms * 10;

	return diolan_usb_cmd_data2(dev, CMD_I2C_SET_CLK_SYNC_TO,
				    to_val & 0xff, (to_val >> 8) & 0xff,
				    true);
}

static void diolan_fw_version(struct i2c_diolan_u2c *dev)
{
	int ret;

	ret = diolan_usb_cmd(dev, CMD_GET_FW_VERSION, true);
	if (ret >= 2)
		dev_info(&dev->interface->dev,
			 "Diolan U2C firmware version %u.%u\n",
			 (unsigned int)dev->ibuffer[0],
			 (unsigned int)dev->ibuffer[1]);
}

static void diolan_get_serial(struct i2c_diolan_u2c *dev)
{
	int ret;
	u32 serial;

	ret = diolan_usb_cmd(dev, CMD_GET_SERIAL, true);
	if (ret >= 4) {
		serial = le32_to_cpu(*(u32 *)dev->ibuffer);
		dev_info(&dev->interface->dev,
			 "Diolan U2C serial number %u\n", serial);
	}
}

static int diolan_init(struct i2c_diolan_u2c *dev)
{
	int speed, ret;

	if (frequency >= 2 * I2C_MAX_STANDARD_MODE_FREQ) {
		speed = U2C_I2C_SPEED_FAST;
		frequency = I2C_MAX_FAST_MODE_FREQ;
	} else if (frequency >= I2C_MAX_STANDARD_MODE_FREQ || frequency == 0) {
		speed = U2C_I2C_SPEED_STD;
		frequency = I2C_MAX_STANDARD_MODE_FREQ;
	} else {
		speed = U2C_I2C_SPEED(frequency);
		if (speed > U2C_I2C_SPEED_2KHZ)
			speed = U2C_I2C_SPEED_2KHZ;
		frequency = U2C_I2C_FREQ(speed);
	}

	dev_info(&dev->interface->dev,
		 "Diolan U2C at USB bus %03d address %03d speed %d Hz\n",
		 dev->usb_dev->bus->busnum, dev->usb_dev->devnum, frequency);

	diolan_flush_input(dev);
	diolan_fw_version(dev);
	diolan_get_serial(dev);

	/* Set I2C speed */
	ret = diolan_set_speed(dev, speed);
	if (ret < 0)
		return ret;

	/* Configure I2C clock synchronization */
	ret = diolan_set_clock_synch(dev, speed != U2C_I2C_SPEED_FAST);
	if (ret < 0)
		return ret;

	if (speed != U2C_I2C_SPEED_FAST)
		ret = diolan_set_clock_synch_timeout(dev,
						     DIOLAN_SYNC_TIMEOUT);

	return ret;
}

/* i2c layer */

static int diolan_usb_xfer(struct i2c_adapter *adapter, struct i2c_msg *msgs,
			   int num)
{
	struct i2c_diolan_u2c *dev = i2c_get_adapdata(adapter);
	struct i2c_msg *pmsg;
	int i, j;
	int ret, sret;

	ret = diolan_i2c_start(dev);
	if (ret < 0)
		return ret;

	for (i = 0; i < num; i++) {
		pmsg = &msgs[i];
		if (i) {
			ret = diolan_i2c_repeated_start(dev);
			if (ret < 0)
				goto abort;
		}
		ret = diolan_i2c_put_byte_ack(dev,
					      i2c_8bit_addr_from_msg(pmsg));
		if (ret < 0)
			goto abort;
		if (pmsg->flags & I2C_M_RD) {
			for (j = 0; j < pmsg->len; j++) {
				u8 byte;
				bool ack = j < pmsg->len - 1;

				/*
				 * Don't send NACK if this is the first byte
				 * of a SMBUS_BLOCK message.
				 */
				if (j == 0 && (pmsg->flags & I2C_M_RECV_LEN))
					ack = true;

				ret = diolan_i2c_get_byte_ack(dev, ack,
							      &byte);
				if (ret < 0)
					goto abort;
				/*
				 * Adjust count if first received byte is
				 * length
				 */
				if (j == 0 && (pmsg->flags & I2C_M_RECV_LEN)) {
					if (byte == 0 ||
					    byte > I2C_SMBUS_BLOCK_MAX) {
						ret = -EPROTO;
						goto abort;
					}
					pmsg->len += byte;
				}
				pmsg->buf[j] = byte;
			}
		} else {
			for (j = 0; j < pmsg->len; j++) {
				ret = diolan_i2c_put_byte_ack(dev,
							      pmsg->buf[j]);
				if (ret < 0)
					goto abort;
			}
		}
	}
	ret = num;
abort:
	sret = diolan_i2c_stop(dev);
	if (sret < 0 && ret >= 0)
		ret = sret;
	return ret;
}

/*
 * Return list of supported functionality.
 */
static u32 diolan_usb_func(struct i2c_adapter *a)
{
	return I2C_FUNC_I2C | I2C_FUNC_SMBUS_EMUL |
	       I2C_FUNC_SMBUS_READ_BLOCK_DATA |
	       I2C_FUNC_SMBUS_BLOCK_PROC_CALL;
}

static const struct i2c_algorithm diolan_usb_algorithm = {
	.xfer = diolan_usb_xfer,
	.functionality = diolan_usb_func,
};

/* device layer */

static const struct usb_device_id diolan_u2c_table[] = {
	{ USB_DEVICE(USB_VENDOR_ID_DIOLAN, USB_DEVICE_ID_DIOLAN_U2C) },
	{ }
};

MODULE_DEVICE_TABLE(usb, diolan_u2c_table);

static void diolan_u2c_free(struct i2c_diolan_u2c *dev)
{
	usb_put_dev(dev->usb_dev);
	kfree(dev);
}

static int diolan_u2c_probe(struct usb_interface *interface,
			    const struct usb_device_id *id)
{
	struct usb_host_interface *hostif = interface->cur_altsetting;
	struct i2c_diolan_u2c *dev;
	int ret;

	if (hostif->desc.bInterfaceNumber != 0 ||
	    hostif->desc.bNumEndpoints < 2)
		return -ENODEV;

	/* allocate memory for our device state and initialize it */
	dev = kzalloc(sizeof(*dev), GFP_KERNEL);
	if (dev == NULL) {
		ret = -ENOMEM;
		goto error;
	}
	dev->ep_out = hostif->endpoint[0].desc.bEndpointAddress;
	dev->ep_in = hostif->endpoint[1].desc.bEndpointAddress;

	dev->usb_dev = usb_get_dev(interface_to_usbdev(interface));
	dev->interface = interface;

	/* save our data pointer in this interface device */
	usb_set_intfdata(interface, dev);

	/* setup i2c adapter description */
	dev->adapter.owner = THIS_MODULE;
	dev->adapter.class = I2C_CLASS_HWMON;
	dev->adapter.algo = &diolan_usb_algorithm;
	i2c_set_adapdata(&dev->adapter, dev);
	snprintf(dev->adapter.name, sizeof(dev->adapter.name),
		 DRIVER_NAME " at bus %03d device %03d",
		 dev->usb_dev->bus->busnum, dev->usb_dev->devnum);

	dev->adapter.dev.parent = &dev->interface->dev;

	/* initialize diolan i2c interface */
	ret = diolan_init(dev);
	if (ret < 0) {
		dev_err(&interface->dev, "failed to initialize adapter\n");
		goto error_free;
	}

	/* and finally attach to i2c layer */
	ret = i2c_add_adapter(&dev->adapter);
	if (ret < 0)
		goto error_free;

	dev_dbg(&interface->dev, "connected " DRIVER_NAME "\n");

	return 0;

error_free:
	usb_set_intfdata(interface, NULL);
	diolan_u2c_free(dev);
error:
	return ret;
}

static void diolan_u2c_disconnect(struct usb_interface *interface)
{
	struct i2c_diolan_u2c *dev = usb_get_intfdata(interface);

	i2c_del_adapter(&dev->adapter);
	usb_set_intfdata(interface, NULL);
	diolan_u2c_free(dev);

	dev_dbg(&interface->dev, "disconnected\n");
}

static struct usb_driver diolan_u2c_driver = {
	.name = DRIVER_NAME,
	.probe = diolan_u2c_probe,
	.disconnect = diolan_u2c_disconnect,
	.id_table = diolan_u2c_table,
};

module_usb_driver(diolan_u2c_driver);

MODULE_AUTHOR("Guenter Roeck <linux@roeck-us.net>");
MODULE_DESCRIPTION(DRIVER_NAME " driver");
MODULE_LICENSE("GPL");
#ifndef __DRM_VMA_MANAGER_H__
#define __DRM_VMA_MANAGER_H__

/*
 * Copyright (c) 2013 David Herrmann <dh.herrmann@gmail.com>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */

#include <drm/drm_mm.h>
#include <linux/mm.h>
#include <linux/rbtree.h>
#include <linux/spinlock.h>
#include <linux/types.h>

/* We make up offsets for buffer objects so we can recognize them at
 * mmap time. pgoff in mmap is an unsigned long, so we need to make sure
 * that the faked up offset will fit.
 */
#if BITS_PER_LONG == 64
#define DRM_FILE_PAGE_OFFSET_START ((0xFFFFFFFFUL >> PAGE_SHIFT) + 1)
#define DRM_FILE_PAGE_OFFSET_SIZE ((0xFFFFFFFFUL >> PAGE_SHIFT) * 256)
#else
#define DRM_FILE_PAGE_OFFSET_START ((0xFFFFFFFUL >> PAGE_SHIFT) + 1)
#define DRM_FILE_PAGE_OFFSET_SIZE ((0xFFFFFFFUL >> PAGE_SHIFT) * 16)
#endif

struct drm_file;

struct drm_vma_offset_file {
	struct rb_node vm_rb;
	struct drm_file *vm_tag;
	unsigned long vm_count;
};

struct drm_vma_offset_node {
	rwlock_t vm_lock;
	struct drm_mm_node vm_node;
	struct rb_root vm_files;
	void *driver_private;
};

struct drm_vma_offset_manager {
	rwlock_t vm_lock;
	struct drm_mm vm_addr_space_mm;
};

void drm_vma_offset_manager_init(struct drm_vma_offset_manager *mgr,
				 unsigned long page_offset,
				 unsigned long size);
void drm_vma_offset_manager_destroy(struct drm_vma_offset_manager *mgr);

struct drm_vma_offset_node *
drm_vma_offset_lookup_locked(struct drm_vma_offset_manager *mgr,
			     unsigned long start, unsigned long pages);
int drm_vma_offset_add(struct drm_vma_offset_manager *mgr,
		       struct drm_vma_offset_node *node, unsigned long pages);
void drm_vma_offset_remove(struct drm_vma_offset_manager *mgr,
			   struct drm_vma_offset_node *node);

int drm_vma_node_allow(struct drm_vma_offset_node *node,
		       struct drm_file *tag);
int drm_vma_node_allow_once(struct drm_vma_offset_node *node,
			    struct drm_file *tag);
void drm_vma_node_revoke(struct drm_vma_offset_node *node,
			 struct drm_file *tag);
bool drm_vma_node_is_allowed(struct drm_vma_offset_node *node,
			     struct drm_file *tag);

/**
 * drm_vma_offset_exact_lookup_locked() - Look up node by exact address
 * @mgr: Manager object
 * @start: Start address (page-based, not byte-based)
 * @pages: Size of object (page-based)
 *
 * Same as drm_vma_offset_lookup_locked() but does not allow any offset into
 * the node. It only returns the exact object with the given start address.
 *
 * RETURNS:
 * Node at exact start address @start.
 */
static inline struct drm_vma_offset_node *
drm_vma_offset_exact_lookup_locked(struct drm_vma_offset_manager *mgr,
				   unsigned long start,
				   unsigned long pages)
{
	struct drm_vma_offset_node *node;

	node = drm_vma_offset_lookup_locked(mgr, start, pages);
	return (node && node->vm_node.start == start) ? node : NULL;
}

/**
 * drm_vma_offset_lock_lookup() - Lock lookup for extended private use
 * @mgr: Manager object
 *
 * Lock VMA manager for extended lookups. Only locked VMA function calls
 * are allowed while holding this lock. All other contexts are blocked from
 * VMA until the lock is released via drm_vma_offset_unlock_lookup().
 *
 * Use this if you need to take a reference to the objects returned by
 * drm_vma_offset_lookup_locked() before releasing this lock again.
 *
 * This lock must not be used for anything else than extended lookups. You
 * must not call any other VMA helpers while holding this lock.
 *
 * Note: You're in atomic-context while holding this lock!
 */
static inline void
drm_vma_offset_lock_lookup(struct drm_vma_offset_manager *mgr)
{
	read_lock(&mgr->vm_lock);
}

/**
 * drm_vma_offset_unlock_lookup() - Unlock lookup for extended private use
 * @mgr: Manager object
 *
 * Release lookup-lock. See drm_vma_offset_lock_lookup() for more
 * information.
 */
static inline void
drm_vma_offset_unlock_lookup(struct drm_vma_offset_manager *mgr)
{
	read_unlock(&mgr->vm_lock);
}

/**
 * drm_vma_node_reset() - Initialize or reset node object
 * @node: Node to initialize or reset
 *
 * Reset a node to its initial state. This must be called before using it
 * with any VMA offset manager.
 *
 * This must not be called on an already allocated node, or you will leak
 * memory.
 */
static inline void drm_vma_node_reset(struct drm_vma_offset_node *node)
{
	memset(node, 0, sizeof(*node));
	node->vm_files = RB_ROOT;
	rwlock_init(&node->vm_lock);
}

/**
 * drm_vma_node_start() - Return start address for page-based addressing
 * @node: Node to inspect
 *
 * Return the start address of the given node. This can be used as offset
 * into the linear VM space that is provided by the VMA offset manager. Note
 * that this can only be used for page-based addressing. If you need a proper
 * offset for user-space mappings, you must apply "<< PAGE_SHIFT" or use the
 * drm_vma_node_offset_addr() helper instead.
 *
 * RETURNS:
 * Start address of @node for page-based addressing. 0 if the node does not
 * have an offset allocated.
 */
static inline unsigned long
drm_vma_node_start(const struct drm_vma_offset_node *node)
{
	return node->vm_node.start;
}

/**
 * drm_vma_node_size() - Return size (page-based)
 * @node: Node to inspect
 *
 * Return the size as number of pages for the given node. This is the same
 * size that was passed to drm_vma_offset_add(). If no offset is allocated
 * for the node, this is 0.
 *
 * RETURNS:
 * Size of @node as number of pages. 0 if the node does not have an offset
 * allocated.
 */
static inline unsigned long drm_vma_node_size(struct drm_vma_offset_node *node)
{
	return node->vm_node.size;
}

/**
 * drm_vma_node_offset_addr() - Return sanitized offset for user-space mmaps
 * @node: Linked offset node
 *
 * Same as drm_vma_node_start() but returns the address as a valid offset
 * that can be used for user-space mappings during mmap().
 * This must not be called on unlinked nodes.
 *
 * RETURNS:
 * Offset of @node for byte-based addressing. 0 if the node does not have an
 * object allocated.
 */
static inline __u64 drm_vma_node_offset_addr(struct drm_vma_offset_node *node)
{
	return ((__u64)node->vm_node.start) << PAGE_SHIFT;
}

/**
 * drm_vma_node_unmap() - Unmap offset node
 * @node: Offset node
 * @file_mapping: Address space to unmap @node from
 *
 * Unmap all userspace mappings for a given offset node. The mappings must be
 * associated with the @file_mapping address-space. If no offset exists
 * nothing is done.
 *
 * This call is unlocked. The caller must guarantee that
 * drm_vma_offset_remove() is not called on this node concurrently.
 */
static inline void drm_vma_node_unmap(struct drm_vma_offset_node *node,
				      struct address_space *file_mapping)
{
	if (drm_mm_node_allocated(&node->vm_node))
		unmap_mapping_range(file_mapping,
				    drm_vma_node_offset_addr(node),
				    drm_vma_node_size(node) << PAGE_SHIFT, 1);
}

/**
 * drm_vma_node_verify_access() - Access verification helper for TTM
 * @node: Offset node
 * @tag: Tag of file to check
 *
 * This checks whether @tag is granted access to @node. It is the same as
 * drm_vma_node_is_allowed() but suitable as drop-in helper for TTM
 * verify_access() callbacks.
 *
 * RETURNS:
 * 0 if access is granted, -EACCES otherwise.
 */
static inline int drm_vma_node_verify_access(struct drm_vma_offset_node *node,
					     struct drm_file *tag)
{
	return drm_vma_node_is_allowed(node, tag) ? 0 : -EACCES;
}

#endif /* __DRM_VMA_MANAGER_H__ */
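/*
 * Hedged sketch (not part of the header): the extended-lookup pattern that
 * the drm_vma_offset_lock_lookup() documentation describes - resolving a
 * fake mmap offset to a driver object and taking a reference before the
 * lock is dropped. 'struct my_obj' is a hypothetical driver type embedding
 * a drm_vma_offset_node; kref_get_unless_zero() is usable here because the
 * lookup lock keeps us in atomic context.
 */
#include <linux/kref.h>

struct my_obj {
	struct kref ref;
	struct drm_vma_offset_node vma_node;
};

static struct my_obj *my_obj_lookup(struct drm_vma_offset_manager *mgr,
				    unsigned long start, unsigned long pages)
{
	struct drm_vma_offset_node *node;
	struct my_obj *obj = NULL;

	drm_vma_offset_lock_lookup(mgr);
	node = drm_vma_offset_lookup_locked(mgr, start, pages);
	if (node) {
		obj = container_of(node, struct my_obj, vma_node);
		if (!kref_get_unless_zero(&obj->ref))
			obj = NULL;	/* object already being destroyed */
	}
	drm_vma_offset_unlock_lookup(mgr);

	return obj;	/* caller owns a reference, or NULL */
}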
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 *      uvc_metadata.c  --  USB Video Class driver - Metadata handling
 *
 *      Copyright (C) 2016
 *          Guennadi Liakhovetski (guennadi.liakhovetski@intel.com)
 */

#include <linux/kernel.h>
#include <linux/list.h>
#include <linux/module.h>
#include <linux/usb.h>
#include <linux/usb/uvc.h>
#include <linux/videodev2.h>

#include <media/v4l2-ioctl.h>
#include <media/videobuf2-v4l2.h>
#include <media/videobuf2-vmalloc.h>

#include "uvcvideo.h"

/* -----------------------------------------------------------------------------
 * V4L2 ioctls
 */

static int uvc_meta_v4l2_querycap(struct file *file, void *fh,
				  struct v4l2_capability *cap)
{
	struct v4l2_fh *vfh = file->private_data;
	struct uvc_streaming *stream = video_get_drvdata(vfh->vdev);
	struct uvc_video_chain *chain = stream->chain;

	strscpy(cap->driver, "uvcvideo", sizeof(cap->driver));
	strscpy(cap->card, stream->dev->name, sizeof(cap->card));
	usb_make_path(stream->dev->udev, cap->bus_info, sizeof(cap->bus_info));
	cap->capabilities = V4L2_CAP_DEVICE_CAPS | V4L2_CAP_STREAMING
			  | chain->caps;

	return 0;
}

static int uvc_meta_v4l2_get_format(struct file *file, void *fh,
				    struct v4l2_format *format)
{
	struct v4l2_fh *vfh = file->private_data;
	struct uvc_streaming *stream = video_get_drvdata(vfh->vdev);
	struct v4l2_meta_format *fmt = &format->fmt.meta;

	if (format->type != vfh->vdev->queue->type)
		return -EINVAL;

	memset(fmt, 0, sizeof(*fmt));

	fmt->dataformat = stream->meta.format;
	fmt->buffersize = UVC_METADATA_BUF_SIZE;

	return 0;
}

static int uvc_meta_v4l2_try_format(struct file *file, void *fh,
				    struct v4l2_format *format)
{
	struct v4l2_fh *vfh = file->private_data;
	struct uvc_streaming *stream = video_get_drvdata(vfh->vdev);
	struct uvc_device *dev = stream->dev;
	struct v4l2_meta_format *fmt = &format->fmt.meta;
	u32 fmeta = V4L2_META_FMT_UVC;

	if (format->type != vfh->vdev->queue->type)
		return -EINVAL;

	for (unsigned int i = 0; i < dev->nmeta_formats; i++)
		if (dev->meta_formats[i] == fmt->dataformat) {
			fmeta = fmt->dataformat;
			break;
		}

	memset(fmt, 0, sizeof(*fmt));

	fmt->dataformat = fmeta;
	fmt->buffersize = UVC_METADATA_BUF_SIZE;

	return 0;
}

static int uvc_meta_v4l2_set_format(struct file *file, void *fh,
				    struct v4l2_format *format)
{
	struct v4l2_fh *vfh = file->private_data;
	struct uvc_streaming *stream = video_get_drvdata(vfh->vdev);
	struct v4l2_meta_format *fmt = &format->fmt.meta;
	int ret;

	ret = uvc_meta_v4l2_try_format(file, fh, format);
	if (ret < 0)
		return ret;

	/*
	 * We could in principle switch at any time, also during streaming.
	 * Metadata buffers would still be perfectly parseable, but it's more
	 * consistent and cleaner to disallow that.
	 */
	mutex_lock(&stream->mutex);

	if (vb2_is_busy(&stream->meta.queue.queue))
		ret = -EBUSY;
	else
		stream->meta.format = fmt->dataformat;

	mutex_unlock(&stream->mutex);

	return ret;
}

static int uvc_meta_v4l2_enum_formats(struct file *file, void *fh,
				      struct v4l2_fmtdesc *fdesc)
{
	struct v4l2_fh *vfh = file->private_data;
	struct uvc_streaming *stream = video_get_drvdata(vfh->vdev);
	struct uvc_device *dev = stream->dev;
	u32 i = fdesc->index;

	if (fdesc->type != vfh->vdev->queue->type)
		return -EINVAL;

	if (i >= dev->nmeta_formats)
		return -EINVAL;

	memset(fdesc, 0, sizeof(*fdesc));

	fdesc->type = vfh->vdev->queue->type;
	fdesc->index = i;
	fdesc->pixelformat = dev->meta_formats[i];

	return 0;
}

static const struct v4l2_ioctl_ops uvc_meta_ioctl_ops = {
	.vidioc_querycap		= uvc_meta_v4l2_querycap,
	.vidioc_g_fmt_meta_cap		= uvc_meta_v4l2_get_format,
	.vidioc_s_fmt_meta_cap		= uvc_meta_v4l2_set_format,
	.vidioc_try_fmt_meta_cap	= uvc_meta_v4l2_try_format,
	.vidioc_enum_fmt_meta_cap	= uvc_meta_v4l2_enum_formats,
	.vidioc_reqbufs			= vb2_ioctl_reqbufs,
	.vidioc_querybuf		= vb2_ioctl_querybuf,
	.vidioc_qbuf			= vb2_ioctl_qbuf,
	.vidioc_dqbuf			= vb2_ioctl_dqbuf,
	.vidioc_create_bufs		= vb2_ioctl_create_bufs,
	.vidioc_prepare_buf		= vb2_ioctl_prepare_buf,
	.vidioc_streamon		= vb2_ioctl_streamon,
	.vidioc_streamoff		= vb2_ioctl_streamoff,
};

/* -----------------------------------------------------------------------------
 * V4L2 File Operations
 */

static const struct v4l2_file_operations uvc_meta_fops = {
	.owner = THIS_MODULE,
	.unlocked_ioctl = video_ioctl2,
	.open = v4l2_fh_open,
	.release = vb2_fop_release,
	.poll = vb2_fop_poll,
	.mmap = vb2_fop_mmap,
};

static struct uvc_entity *uvc_meta_find_msxu(struct uvc_device *dev)
{
	static const u8 uvc_msxu_guid[16] = UVC_GUID_MSXU_1_5;
	struct uvc_entity *entity;

	list_for_each_entry(entity, &dev->entities, list) {
		if (!memcmp(entity->guid, uvc_msxu_guid,
			    sizeof(entity->guid)))
			return entity;
	}

	return NULL;
}

#define MSXU_CONTROL_METADATA 0x9
static int uvc_meta_detect_msxu(struct uvc_device *dev)
{
	u32 *data __free(kfree) = NULL;
	struct uvc_entity *entity;
	int ret;

	entity = uvc_meta_find_msxu(dev);
	if (!entity)
		return 0;

	/*
	 * USB requires buffers aligned in a special way; the simplest way to
	 * make sure that query_ctrl will work is to kmalloc() them.
	 */
	data = kmalloc(sizeof(*data), GFP_KERNEL);
	if (!data)
		return -ENOMEM;

	/* Check if the metadata is already enabled. */
	ret = uvc_query_ctrl(dev, UVC_GET_CUR, entity->id, dev->intfnum,
			     MSXU_CONTROL_METADATA, data, sizeof(*data));
	if (ret)
		return 0;

	if (*data) {
		dev->quirks |= UVC_QUIRK_MSXU_META;
		return 0;
	}

	/*
	 * We have seen devices that require 1 to enable the metadata, others
	 * requiring a value != 1 and others requiring a value >1. Luckily for
	 * us, the value from GET_MAX seems to work all the time.
	 */
	ret = uvc_query_ctrl(dev, UVC_GET_MAX, entity->id, dev->intfnum,
			     MSXU_CONTROL_METADATA, data, sizeof(*data));
	if (ret || !*data)
		return 0;

	/*
	 * If we can set MSXU_CONTROL_METADATA, the device will report
	 * metadata.
	 */
	ret = uvc_query_ctrl(dev, UVC_SET_CUR, entity->id, dev->intfnum,
			     MSXU_CONTROL_METADATA, data, sizeof(*data));
	if (!ret)
		dev->quirks |= UVC_QUIRK_MSXU_META;

	return 0;
}

int uvc_meta_register(struct uvc_streaming *stream)
{
	struct uvc_device *dev = stream->dev;
	struct video_device *vdev = &stream->meta.vdev;
	struct uvc_video_queue *queue = &stream->meta.queue;

	stream->meta.format = V4L2_META_FMT_UVC;

	return uvc_register_video_device(dev, stream, vdev, queue,
					 V4L2_BUF_TYPE_META_CAPTURE,
					 &uvc_meta_fops, &uvc_meta_ioctl_ops);
}

int uvc_meta_init(struct uvc_device *dev)
{
	unsigned int i = 0;
	int ret;

	ret = uvc_meta_detect_msxu(dev);
	if (ret)
		return ret;

	dev->meta_formats[i++] = V4L2_META_FMT_UVC;

	if (dev->info->meta_format &&
	    !WARN_ON(dev->info->meta_format == V4L2_META_FMT_UVC))
		dev->meta_formats[i++] = dev->info->meta_format;

	if (dev->quirks & UVC_QUIRK_MSXU_META &&
	    !WARN_ON(dev->info->meta_format == V4L2_META_FMT_UVC_MSXU_1_5))
		dev->meta_formats[i++] = V4L2_META_FMT_UVC_MSXU_1_5;

	/* IMPORTANT: for new meta-formats update UVC_MAX_META_DATA_FORMATS. */
	dev->nmeta_formats = i;

	return 0;
}
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2005,2006,2007,2008 IBM Corporation
 *
 * Authors:
 * Serge Hallyn <serue@us.ibm.com>
 * Reiner Sailer <sailer@watson.ibm.com>
 * Mimi Zohar <zohar@us.ibm.com>
 *
 * File: ima_queue.c
 *	Implements queues that store template measurements and
 *	maintains aggregate over the stored measurements
 *	in the pre-configured TPM PCR (if available).
 *	The measurement list is append-only. No entry is
 *	ever removed or changed during the boot-cycle.
 */

#include <linux/rculist.h>
#include <linux/reboot.h>
#include <linux/slab.h>
#include "ima.h"

#define AUDIT_CAUSE_LEN_MAX 32

/* pre-allocated array of tpm_digest structures to extend a PCR */
static struct tpm_digest *digests;

LIST_HEAD(ima_measurements);	/* list of all measurements */
#ifdef CONFIG_IMA_KEXEC
static unsigned long binary_runtime_size;
#else
static unsigned long binary_runtime_size = ULONG_MAX;
#endif

/* key: inode (before secure-hashing a file) */
struct ima_h_table ima_htable = {
	.len = ATOMIC_LONG_INIT(0),
	.violations = ATOMIC_LONG_INIT(0),
	.queue[0 ... IMA_MEASURE_HTABLE_SIZE - 1] = HLIST_HEAD_INIT
};

/* mutex protects atomicity of extending measurement list
 * and extending the TPM PCR aggregate. Since tpm_extend can take
 * long (and the tpm driver uses a mutex), we can't use the spinlock.
 */
static DEFINE_MUTEX(ima_extend_list_mutex);

/*
 * Used internally by the kernel to suspend measurements.
 * Protected by ima_extend_list_mutex.
 */
static bool ima_measurements_suspended;

/* look up the digest value in the hash table, and return the entry */
static struct ima_queue_entry *ima_lookup_digest_entry(u8 *digest_value,
						       int pcr)
{
	struct ima_queue_entry *qe, *ret = NULL;
	unsigned int key;
	int rc;

	key = ima_hash_key(digest_value);
	rcu_read_lock();
	hlist_for_each_entry_rcu(qe, &ima_htable.queue[key], hnext) {
		rc = memcmp(qe->entry->digests[ima_hash_algo_idx].digest,
			    digest_value, hash_digest_size[ima_hash_algo]);
		if ((rc == 0) && (qe->entry->pcr == pcr)) {
			ret = qe;
			break;
		}
	}
	rcu_read_unlock();
	return ret;
}

/*
 * Calculate the memory required for serializing a single
 * binary_runtime_measurement list entry, which contains a
 * couple of variable length fields (e.g. template name and data).
*/ static int get_binary_runtime_size(struct ima_template_entry *entry) { int size = 0; size += sizeof(u32); /* pcr */ size += TPM_DIGEST_SIZE; size += sizeof(int); /* template name size field */ size += strlen(entry->template_desc->name); size += sizeof(entry->template_data_len); size += entry->template_data_len; return size; } /* ima_add_template_entry helper function: * - Add template entry to the measurement list and hash table, for * all entries except those carried across kexec. * * (Called with ima_extend_list_mutex held.) */ static int ima_add_digest_entry(struct ima_template_entry *entry, bool update_htable) { struct ima_queue_entry *qe; unsigned int key; qe = kmalloc(sizeof(*qe), GFP_KERNEL); if (qe == NULL) { pr_err("OUT OF MEMORY ERROR creating queue entry\n"); return -ENOMEM; } qe->entry = entry; INIT_LIST_HEAD(&qe->later); list_add_tail_rcu(&qe->later, &ima_measurements); atomic_long_inc(&ima_htable.len); if (update_htable) { key = ima_hash_key(entry->digests[ima_hash_algo_idx].digest); hlist_add_head_rcu(&qe->hnext, &ima_htable.queue[key]); } if (binary_runtime_size != ULONG_MAX) { int size; size = get_binary_runtime_size(entry); binary_runtime_size = (binary_runtime_size < ULONG_MAX - size) ? binary_runtime_size + size : ULONG_MAX; } return 0; } /* * Return the amount of memory required for serializing the * entire binary_runtime_measurement list, including the ima_kexec_hdr * structure. */ unsigned long ima_get_binary_runtime_size(void) { if (binary_runtime_size >= (ULONG_MAX - sizeof(struct ima_kexec_hdr))) return ULONG_MAX; else return binary_runtime_size + sizeof(struct ima_kexec_hdr); } static int ima_pcr_extend(struct tpm_digest *digests_arg, int pcr) { int result = 0; if (!ima_tpm_chip) return result; result = tpm_pcr_extend(ima_tpm_chip, pcr, digests_arg); if (result != 0) pr_err("Error Communicating to TPM chip, result: %d\n", result); return result; } /* * Add template entry to the measurement list and hash table, and * extend the pcr. * * On systems which support carrying the IMA measurement list across * kexec, maintain the total memory size required for serializing the * binary_runtime_measurements. */ int ima_add_template_entry(struct ima_template_entry *entry, int violation, const char *op, struct inode *inode, const unsigned char *filename) { u8 *digest = entry->digests[ima_hash_algo_idx].digest; struct tpm_digest *digests_arg = entry->digests; const char *audit_cause = "hash_added"; char tpm_audit_cause[AUDIT_CAUSE_LEN_MAX]; int audit_info = 1; int result = 0, tpmresult = 0; mutex_lock(&ima_extend_list_mutex); /* * Avoid appending to the measurement log when the TPM subsystem has * been shut down while preparing for system reboot. 
*/ if (ima_measurements_suspended) { audit_cause = "measurements_suspended"; audit_info = 0; result = -ENODEV; goto out; } if (!violation && !IS_ENABLED(CONFIG_IMA_DISABLE_HTABLE)) { if (ima_lookup_digest_entry(digest, entry->pcr)) { audit_cause = "hash_exists"; result = -EEXIST; goto out; } } result = ima_add_digest_entry(entry, !IS_ENABLED(CONFIG_IMA_DISABLE_HTABLE)); if (result < 0) { audit_cause = "ENOMEM"; audit_info = 0; goto out; } if (violation) /* invalidate pcr */ digests_arg = digests; tpmresult = ima_pcr_extend(digests_arg, entry->pcr); if (tpmresult != 0) { snprintf(tpm_audit_cause, AUDIT_CAUSE_LEN_MAX, "TPM_error(%d)", tpmresult); audit_cause = tpm_audit_cause; audit_info = 0; } out: mutex_unlock(&ima_extend_list_mutex); integrity_audit_msg(AUDIT_INTEGRITY_PCR, inode, filename, op, audit_cause, result, audit_info); return result; } int ima_restore_measurement_entry(struct ima_template_entry *entry) { int result = 0; mutex_lock(&ima_extend_list_mutex); result = ima_add_digest_entry(entry, 0); mutex_unlock(&ima_extend_list_mutex); return result; } static void ima_measurements_suspend(void) { mutex_lock(&ima_extend_list_mutex); ima_measurements_suspended = true; mutex_unlock(&ima_extend_list_mutex); } static int ima_reboot_notifier(struct notifier_block *nb, unsigned long action, void *data) { #ifdef CONFIG_IMA_KEXEC if (action == SYS_RESTART && data && !strcmp(data, "kexec reboot")) ima_measure_kexec_event("kexec_execute"); #endif ima_measurements_suspend(); return NOTIFY_DONE; } static struct notifier_block ima_reboot_nb = { .notifier_call = ima_reboot_notifier, }; void __init ima_init_reboot_notifier(void) { register_reboot_notifier(&ima_reboot_nb); } int __init ima_init_digests(void) { u16 digest_size; u16 crypto_id; int i; if (!ima_tpm_chip) return 0; digests = kcalloc(ima_tpm_chip->nr_allocated_banks, sizeof(*digests), GFP_NOFS); if (!digests) return -ENOMEM; for (i = 0; i < ima_tpm_chip->nr_allocated_banks; i++) { digests[i].alg_id = ima_tpm_chip->allocated_banks[i].alg_id; digest_size = ima_tpm_chip->allocated_banks[i].digest_size; crypto_id = ima_tpm_chip->allocated_banks[i].crypto_id; /* for unmapped TPM algorithms digest is still a padded SHA1 */ if (crypto_id == HASH_ALGO__LAST) digest_size = SHA1_DIGEST_SIZE; memset(digests[i].digest, 0xff, digest_size); } return 0; }
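For reference, get_binary_runtime_size() above counts the fields of the serialized record layout, the same layout userspace reads back from the binary measurement list: a u32 PCR index, the TPM digest, a u32 template-name length and the name, then a u32 template-data length and the data. A hedged sketch of a userspace parser for one such record, with all names local to this example rather than taken from the kernel, might be:

/* Hedged userspace sketch of the record layout counted above. */
#include <stdint.h>
#include <string.h>

#define TPM_DIGEST_SIZE 20	/* SHA1 bank digest, matching the kernel define */

struct ima_record {
	uint32_t pcr;
	uint8_t digest[TPM_DIGEST_SIZE];
	char name[64];
	uint32_t data_len;
	const uint8_t *data;
};

/* Parse one serialized entry; returns bytes consumed, 0 on malformed input. */
static size_t parse_record(const uint8_t *buf, size_t len, struct ima_record *r)
{
	size_t pos = 0;
	uint32_t name_len;

	/* pcr + digest + name-length header must fit */
	if (len < 2 * sizeof(uint32_t) + TPM_DIGEST_SIZE)
		return 0;

	memcpy(&r->pcr, buf + pos, sizeof(uint32_t));	/* sizeof(u32): pcr */
	pos += sizeof(uint32_t);
	memcpy(r->digest, buf + pos, TPM_DIGEST_SIZE);	/* TPM_DIGEST_SIZE */
	pos += TPM_DIGEST_SIZE;
	memcpy(&name_len, buf + pos, sizeof(uint32_t));	/* name size field */
	pos += sizeof(uint32_t);

	if (name_len >= sizeof(r->name) || len - pos < name_len)
		return 0;
	memcpy(r->name, buf + pos, name_len);		/* strlen(name) bytes */
	r->name[name_len] = '\0';
	pos += name_len;

	if (len - pos < sizeof(uint32_t))
		return 0;
	memcpy(&r->data_len, buf + pos, sizeof(uint32_t)); /* template_data_len */
	pos += sizeof(uint32_t);

	if (len - pos < r->data_len)
		return 0;
	r->data = buf + pos;				/* template_data_len bytes */

	return pos + r->data_len;
}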
// SPDX-License-Identifier: GPL-2.0-only
/*
 * CAN driver for esd electronics gmbh CAN-USB/2, CAN-USB/3 and CAN-USB/Micro
 *
 * Copyright (C) 2010-2012 esd electronic system design gmbh, Matthias Fuchs <socketcan@esd.eu>
 * Copyright (C) 2022-2024 esd electronics gmbh, Frank Jungclaus <frank.jungclaus@esd.eu>
 */

#include <linux/can.h>
#include <linux/can/dev.h>
#include <linux/can/error.h>
#include <linux/ethtool.h>
#include <linux/module.h>
#include <linux/netdevice.h>
#include <linux/signal.h>
#include <linux/slab.h>
#include <linux/units.h>
#include <linux/usb.h>

MODULE_AUTHOR("Matthias Fuchs <socketcan@esd.eu>");
MODULE_AUTHOR("Frank Jungclaus <frank.jungclaus@esd.eu>");
MODULE_DESCRIPTION("CAN driver for esd electronics gmbh CAN-USB/2, CAN-USB/3 and CAN-USB/Micro interfaces");
MODULE_LICENSE("GPL v2");

/* USB vendor and product ID */
#define ESD_USB_ESDGMBH_VENDOR_ID 0x0ab4
#define ESD_USB_CANUSB2_PRODUCT_ID 0x0010
#define ESD_USB_CANUSBM_PRODUCT_ID 0x0011
#define ESD_USB_CANUSB3_PRODUCT_ID 0x0014

/* CAN controller clock frequencies */
#define ESD_USB_2_CAN_CLOCK (60 * MEGA) /* Hz */
#define ESD_USB_M_CAN_CLOCK (36 * MEGA) /* Hz */
#define ESD_USB_3_CAN_CLOCK (80 * MEGA) /* Hz */

/* Maximum number of CAN nets */
#define ESD_USB_MAX_NETS 2

/* USB commands */
#define ESD_USB_CMD_VERSION 1 /* also used for VERSION_REPLY */
#define ESD_USB_CMD_CAN_RX 2 /* device to host only */
#define ESD_USB_CMD_CAN_TX 3 /* also used for TX_DONE */
#define ESD_USB_CMD_SETBAUD 4 /* also used for SETBAUD_REPLY */
#define ESD_USB_CMD_TS 5 /* also used for TS_REPLY */
#define ESD_USB_CMD_IDADD 6 /* also used for IDADD_REPLY */

/* esd CAN message flags - dlc field */
#define ESD_USB_RTR BIT(4)
#define ESD_USB_NO_BRS BIT(4)
#define ESD_USB_ESI BIT(5)
#define ESD_USB_FD BIT(7)

/* esd CAN message flags - id field */
#define ESD_USB_EXTID BIT(29)
#define ESD_USB_EVENT BIT(30)
#define ESD_USB_IDMASK GENMASK(28, 0)

/* esd CAN event ids */
#define ESD_USB_EV_CAN_ERROR_EXT 2 /* CAN controller specific diagnostic data */

/* baudrate message flags */
#define ESD_USB_LOM BIT(30) /* Listen Only Mode */
#define ESD_USB_UBR BIT(31) /* User Bit Rate (controller BTR) in bits 0..27 */
#define ESD_USB_NO_BAUDRATE GENMASK(30, 0) /* bit rate unconfigured */

/* bit timing esd CAN-USB */
#define ESD_USB_2_TSEG1_SHIFT 16
#define ESD_USB_2_TSEG2_SHIFT 20
#define ESD_USB_2_SJW_SHIFT 14
#define ESD_USB_M_SJW_SHIFT 24
#define ESD_USB_TRIPLE_SAMPLES BIT(23)

/* Transmitter Delay Compensation */
#define ESD_USB_3_TDC_MODE_AUTO 0

/* esd IDADD message */
#define ESD_USB_ID_ENABLE BIT(7)
#define ESD_USB_MAX_ID_SEGMENT 64

/* SJA1000 ECC register (emulated by usb firmware) */
#define ESD_USB_SJA1000_ECC_SEG GENMASK(4, 0)
#define ESD_USB_SJA1000_ECC_DIR BIT(5)
#define ESD_USB_SJA1000_ECC_ERR GENMASK(2, 1)
#define ESD_USB_SJA1000_ECC_BIT 0x00
#define ESD_USB_SJA1000_ECC_FORM BIT(6)
#define ESD_USB_SJA1000_ECC_STUFF BIT(7)
#define ESD_USB_SJA1000_ECC_MASK GENMASK(7, 6)

/* esd bus state event codes */
#define ESD_USB_BUSSTATE_MASK GENMASK(7, 6)
#define ESD_USB_BUSSTATE_WARN BIT(6)
#define ESD_USB_BUSSTATE_ERRPASSIVE BIT(7)
#define ESD_USB_BUSSTATE_BUSOFF GENMASK(7, 6)

#define ESD_USB_RX_BUFFER_SIZE 1024
#define ESD_USB_MAX_RX_URBS 4
#define ESD_USB_MAX_TX_URBS 16 /* must be power of 2 */

/* Modes for CAN-USB/3, to be used for esd_usb_3_set_baudrate_msg_x.mode */
#define ESD_USB_3_BAUDRATE_MODE_DISABLE 0 /* remove from bus */
#define ESD_USB_3_BAUDRATE_MODE_INDEX 1 /* ESD (CiA) bit rate idx */
#define ESD_USB_3_BAUDRATE_MODE_BTR_CTRL 2 /* BTR values (controller) */
#define ESD_USB_3_BAUDRATE_MODE_BTR_CANONICAL 3 /* BTR values (canonical) */
#define ESD_USB_3_BAUDRATE_MODE_NUM 4 /* numerical bit rate */
#define ESD_USB_3_BAUDRATE_MODE_AUTOBAUD 5 /* autobaud */

/* Flags for CAN-USB/3, to be used for esd_usb_3_set_baudrate_msg_x.flags */
#define ESD_USB_3_BAUDRATE_FLAG_FD BIT(0) /* enable CAN FD mode */
#define ESD_USB_3_BAUDRATE_FLAG_LOM BIT(1) /* enable listen only mode */
#define ESD_USB_3_BAUDRATE_FLAG_STM BIT(2) /* enable self test mode */
#define ESD_USB_3_BAUDRATE_FLAG_TRS BIT(3) /* enable triple sampling */
#define ESD_USB_3_BAUDRATE_FLAG_TXP BIT(4) /* enable transmit pause */

struct esd_usb_header_msg {
	u8 len; /* total message length in 32bit words */
	u8 cmd;
	u8 rsvd[2];
};

struct esd_usb_version_msg {
	u8 len; /* total message length in 32bit words */
	u8 cmd;
	u8 rsvd;
	u8 flags;
	__le32 drv_version;
};

struct esd_usb_version_reply_msg {
	u8 len; /* total message length in 32bit words */
	u8 cmd;
	u8 nets;
	u8 features;
	__le32 version;
	u8 name[16];
	__le32 rsvd;
	__le32 ts;
};

struct esd_usb_rx_msg {
	u8 len; /* total message length in 32bit words */
	u8 cmd;
	u8 net;
	u8 dlc;
	__le32 ts;
	__le32 id; /* upper 3 bits contain flags */
	union {
		u8 data[CAN_MAX_DLEN];
		u8 data_fd[CANFD_MAX_DLEN];
		struct {
			u8 status; /* CAN Controller Status */
			u8 ecc; /* Error
Capture Register */ u8 rec; /* RX Error Counter */ u8 tec; /* TX Error Counter */ } ev_can_err_ext; /* For ESD_EV_CAN_ERROR_EXT */ }; }; struct esd_usb_tx_msg { u8 len; /* total message length in 32bit words */ u8 cmd; u8 net; u8 dlc; u32 hnd; /* opaque handle, not used by device */ __le32 id; /* upper 3 bits contain flags */ union { u8 data[CAN_MAX_DLEN]; u8 data_fd[CANFD_MAX_DLEN]; }; }; struct esd_usb_tx_done_msg { u8 len; /* total message length in 32bit words */ u8 cmd; u8 net; u8 status; u32 hnd; /* opaque handle, not used by device */ __le32 ts; }; struct esd_usb_id_filter_msg { u8 len; /* total message length in 32bit words */ u8 cmd; u8 net; u8 option; __le32 mask[ESD_USB_MAX_ID_SEGMENT + 1]; /* +1 for 29bit extended IDs */ }; struct esd_usb_set_baudrate_msg { u8 len; /* total message length in 32bit words */ u8 cmd; u8 net; u8 rsvd; __le32 baud; }; /* CAN-USB/3 baudrate configuration, used for nominal as well as for data bit rate */ struct esd_usb_3_baudrate_cfg { __le16 brp; /* bit rate pre-scaler */ __le16 tseg1; /* time segment before sample point */ __le16 tseg2; /* time segment after sample point */ __le16 sjw; /* synchronization jump Width */ }; /* In principle, the esd CAN-USB/3 supports Transmitter Delay Compensation (TDC), * but currently only the automatic TDC mode is supported by this driver. * An implementation for manual TDC configuration will follow. * * For information about struct esd_usb_3_tdc_cfg, see * NTCAN Application Developers Manual, 6.2.25 NTCAN_TDC_CFG + related chapters * https://esd.eu/fileadmin/esd/docs/manuals/NTCAN_Part1_Function_API_Manual_en_56.pdf */ struct esd_usb_3_tdc_cfg { u8 tdc_mode; /* transmitter delay compensation mode */ u8 ssp_offset; /* secondary sample point offset in mtq */ s8 ssp_shift; /* secondary sample point shift in mtq */ u8 tdc_filter; /* TDC filter in mtq */ }; /* Extended version of the above set_baudrate_msg for a CAN-USB/3 * to define the CAN bit timing configuration of the CAN controller in * CAN FD mode as well as in Classical CAN mode. * * The payload of this command is a NTCAN_BAUDRATE_X structure according to * esd electronics gmbh, NTCAN Application Developers Manual, 6.2.15 NTCAN_BAUDRATE_X * https://esd.eu/fileadmin/esd/docs/manuals/NTCAN_Part1_Function_API_Manual_en_56.pdf */ struct esd_usb_3_set_baudrate_msg_x { u8 len; /* total message length in 32bit words */ u8 cmd; u8 net; u8 rsvd; /*reserved */ /* Payload ... 
*/ __le16 mode; /* mode word, see ESD_USB_3_BAUDRATE_MODE_xxx */ __le16 flags; /* control flags, see ESD_USB_3_BAUDRATE_FLAG_xxx */ struct esd_usb_3_tdc_cfg tdc; /* TDC configuration */ struct esd_usb_3_baudrate_cfg nom; /* nominal bit rate */ struct esd_usb_3_baudrate_cfg data; /* data bit rate */ }; /* Main message type used between library and application */ union __packed esd_usb_msg { struct esd_usb_header_msg hdr; struct esd_usb_version_msg version; struct esd_usb_version_reply_msg version_reply; struct esd_usb_rx_msg rx; struct esd_usb_tx_msg tx; struct esd_usb_tx_done_msg txdone; struct esd_usb_set_baudrate_msg setbaud; struct esd_usb_3_set_baudrate_msg_x setbaud_x; struct esd_usb_id_filter_msg filter; }; static struct usb_device_id esd_usb_table[] = { {USB_DEVICE(ESD_USB_ESDGMBH_VENDOR_ID, ESD_USB_CANUSB2_PRODUCT_ID)}, {USB_DEVICE(ESD_USB_ESDGMBH_VENDOR_ID, ESD_USB_CANUSBM_PRODUCT_ID)}, {USB_DEVICE(ESD_USB_ESDGMBH_VENDOR_ID, ESD_USB_CANUSB3_PRODUCT_ID)}, {} }; MODULE_DEVICE_TABLE(usb, esd_usb_table); struct esd_usb_net_priv; struct esd_tx_urb_context { struct esd_usb_net_priv *priv; u32 echo_index; }; struct esd_usb { struct usb_device *udev; struct esd_usb_net_priv *nets[ESD_USB_MAX_NETS]; struct usb_anchor rx_submitted; int net_count; u32 version; int rxinitdone; void *rxbuf[ESD_USB_MAX_RX_URBS]; dma_addr_t rxbuf_dma[ESD_USB_MAX_RX_URBS]; }; struct esd_usb_net_priv { struct can_priv can; /* must be the first member */ atomic_t active_tx_jobs; struct usb_anchor tx_submitted; struct esd_tx_urb_context tx_contexts[ESD_USB_MAX_TX_URBS]; struct esd_usb *usb; struct net_device *netdev; int index; u8 old_state; struct can_berr_counter bec; }; static void esd_usb_rx_event(struct esd_usb_net_priv *priv, union esd_usb_msg *msg) { struct net_device_stats *stats = &priv->netdev->stats; struct can_frame *cf; struct sk_buff *skb; u32 id = le32_to_cpu(msg->rx.id) & ESD_USB_IDMASK; if (id == ESD_USB_EV_CAN_ERROR_EXT) { u8 state = msg->rx.ev_can_err_ext.status; u8 ecc = msg->rx.ev_can_err_ext.ecc; priv->bec.rxerr = msg->rx.ev_can_err_ext.rec; priv->bec.txerr = msg->rx.ev_can_err_ext.tec; netdev_dbg(priv->netdev, "CAN_ERR_EV_EXT: dlc=%#02x state=%02x ecc=%02x rec=%02x tec=%02x\n", msg->rx.dlc, state, ecc, priv->bec.rxerr, priv->bec.txerr); /* if berr-reporting is off, only pass through on state change ... */ if (!(priv->can.ctrlmode & CAN_CTRLMODE_BERR_REPORTING) && state == priv->old_state) return; skb = alloc_can_err_skb(priv->netdev, &cf); if (!skb) stats->rx_dropped++; if (state != priv->old_state) { enum can_state tx_state, rx_state; enum can_state new_state = CAN_STATE_ERROR_ACTIVE; priv->old_state = state; switch (state & ESD_USB_BUSSTATE_MASK) { case ESD_USB_BUSSTATE_BUSOFF: new_state = CAN_STATE_BUS_OFF; can_bus_off(priv->netdev); break; case ESD_USB_BUSSTATE_WARN: new_state = CAN_STATE_ERROR_WARNING; break; case ESD_USB_BUSSTATE_ERRPASSIVE: new_state = CAN_STATE_ERROR_PASSIVE; break; default: new_state = CAN_STATE_ERROR_ACTIVE; priv->bec.txerr = 0; priv->bec.rxerr = 0; break; } if (new_state != priv->can.state) { tx_state = (priv->bec.txerr >= priv->bec.rxerr) ? new_state : 0; rx_state = (priv->bec.txerr <= priv->bec.rxerr) ? 
new_state : 0; can_change_state(priv->netdev, cf, tx_state, rx_state); } } else if (skb) { priv->can.can_stats.bus_error++; stats->rx_errors++; cf->can_id |= CAN_ERR_PROT | CAN_ERR_BUSERROR; switch (ecc & ESD_USB_SJA1000_ECC_MASK) { case ESD_USB_SJA1000_ECC_BIT: cf->data[2] |= CAN_ERR_PROT_BIT; break; case ESD_USB_SJA1000_ECC_FORM: cf->data[2] |= CAN_ERR_PROT_FORM; break; case ESD_USB_SJA1000_ECC_STUFF: cf->data[2] |= CAN_ERR_PROT_STUFF; break; default: break; } /* Error occurred during transmission? */ if (!(ecc & ESD_USB_SJA1000_ECC_DIR)) cf->data[2] |= CAN_ERR_PROT_TX; /* Bit stream position in CAN frame as the error was detected */ cf->data[3] = ecc & ESD_USB_SJA1000_ECC_SEG; } if (skb) { cf->can_id |= CAN_ERR_CNT; cf->data[6] = priv->bec.txerr; cf->data[7] = priv->bec.rxerr; netif_rx(skb); } } } static void esd_usb_rx_can_msg(struct esd_usb_net_priv *priv, union esd_usb_msg *msg) { struct net_device_stats *stats = &priv->netdev->stats; struct can_frame *cf; struct canfd_frame *cfd; struct sk_buff *skb; u32 id; u8 len; if (!netif_device_present(priv->netdev)) return; id = le32_to_cpu(msg->rx.id); if (id & ESD_USB_EVENT) { esd_usb_rx_event(priv, msg); } else { if (msg->rx.dlc & ESD_USB_FD) { skb = alloc_canfd_skb(priv->netdev, &cfd); } else { skb = alloc_can_skb(priv->netdev, &cf); cfd = (struct canfd_frame *)cf; } if (skb == NULL) { stats->rx_dropped++; return; } cfd->can_id = id & ESD_USB_IDMASK; if (msg->rx.dlc & ESD_USB_FD) { /* masking by 0x0F is already done within can_fd_dlc2len() */ cfd->len = can_fd_dlc2len(msg->rx.dlc); len = cfd->len; if ((msg->rx.dlc & ESD_USB_NO_BRS) == 0) cfd->flags |= CANFD_BRS; if (msg->rx.dlc & ESD_USB_ESI) cfd->flags |= CANFD_ESI; } else { can_frame_set_cc_len(cf, msg->rx.dlc & ~ESD_USB_RTR, priv->can.ctrlmode); len = cf->len; if (msg->rx.dlc & ESD_USB_RTR) { cf->can_id |= CAN_RTR_FLAG; len = 0; } } if (id & ESD_USB_EXTID) cfd->can_id |= CAN_EFF_FLAG; memcpy(cfd->data, msg->rx.data_fd, len); stats->rx_bytes += len; stats->rx_packets++; netif_rx(skb); } } static void esd_usb_tx_done_msg(struct esd_usb_net_priv *priv, union esd_usb_msg *msg) { struct net_device_stats *stats = &priv->netdev->stats; struct net_device *netdev = priv->netdev; struct esd_tx_urb_context *context; if (!netif_device_present(netdev)) return; context = &priv->tx_contexts[msg->txdone.hnd & (ESD_USB_MAX_TX_URBS - 1)]; if (!msg->txdone.status) { stats->tx_packets++; stats->tx_bytes += can_get_echo_skb(netdev, context->echo_index, NULL); } else { stats->tx_errors++; can_free_echo_skb(netdev, context->echo_index, NULL); } /* Release context */ context->echo_index = ESD_USB_MAX_TX_URBS; atomic_dec(&priv->active_tx_jobs); netif_wake_queue(netdev); } static void esd_usb_read_bulk_callback(struct urb *urb) { struct esd_usb *dev = urb->context; int retval; int pos = 0; int i; switch (urb->status) { case 0: /* success */ break; case -ENOENT: case -EPIPE: case -EPROTO: case -ESHUTDOWN: return; default: dev_info(dev->udev->dev.parent, "Rx URB aborted (%d)\n", urb->status); goto resubmit_urb; } while (pos < urb->actual_length) { union esd_usb_msg *msg; msg = (union esd_usb_msg *)(urb->transfer_buffer + pos); switch (msg->hdr.cmd) { case ESD_USB_CMD_CAN_RX: if (msg->rx.net >= dev->net_count) { dev_err(dev->udev->dev.parent, "format error\n"); break; } esd_usb_rx_can_msg(dev->nets[msg->rx.net], msg); break; case ESD_USB_CMD_CAN_TX: if (msg->txdone.net >= dev->net_count) { dev_err(dev->udev->dev.parent, "format error\n"); break; } esd_usb_tx_done_msg(dev->nets[msg->txdone.net], msg); break; } pos 
+= msg->hdr.len * sizeof(u32); /* convert to # of bytes */
		if (pos > urb->actual_length) {
			dev_err(dev->udev->dev.parent, "format error\n");
			break;
		}
	}

resubmit_urb:
	usb_fill_bulk_urb(urb, dev->udev, usb_rcvbulkpipe(dev->udev, 1),
			  urb->transfer_buffer, ESD_USB_RX_BUFFER_SIZE,
			  esd_usb_read_bulk_callback, dev);

	retval = usb_submit_urb(urb, GFP_ATOMIC);
	if (retval == -ENODEV) {
		for (i = 0; i < dev->net_count; i++) {
			if (dev->nets[i])
				netif_device_detach(dev->nets[i]->netdev);
		}
	} else if (retval) {
		dev_err(dev->udev->dev.parent,
			"failed resubmitting read bulk urb: %d\n", retval);
	}
}

/* callback for bulk OUT urb */
static void esd_usb_write_bulk_callback(struct urb *urb)
{
	struct esd_tx_urb_context *context = urb->context;
	struct esd_usb_net_priv *priv;
	struct net_device *netdev;
	size_t size = sizeof(union esd_usb_msg);

	WARN_ON(!context);

	priv = context->priv;
	netdev = priv->netdev;

	/* free up our allocated buffer */
	usb_free_coherent(urb->dev, size, urb->transfer_buffer,
			  urb->transfer_dma);

	if (!netif_device_present(netdev))
		return;

	if (urb->status)
		netdev_info(netdev, "Tx URB aborted (%d)\n", urb->status);

	netif_trans_update(netdev);
}

static ssize_t firmware_show(struct device *d,
			     struct device_attribute *attr, char *buf)
{
	struct usb_interface *intf = to_usb_interface(d);
	struct esd_usb *dev = usb_get_intfdata(intf);

	return sprintf(buf, "%d.%d.%d\n",
		       (dev->version >> 12) & 0xf,
		       (dev->version >> 8) & 0xf,
		       dev->version & 0xff);
}
static DEVICE_ATTR_RO(firmware);

static ssize_t hardware_show(struct device *d,
			     struct device_attribute *attr, char *buf)
{
	struct usb_interface *intf = to_usb_interface(d);
	struct esd_usb *dev = usb_get_intfdata(intf);

	return sprintf(buf, "%d.%d.%d\n",
		       (dev->version >> 28) & 0xf,
		       (dev->version >> 24) & 0xf,
		       (dev->version >> 16) & 0xff);
}
static DEVICE_ATTR_RO(hardware);

static ssize_t nets_show(struct device *d,
			 struct device_attribute *attr, char *buf)
{
	struct usb_interface *intf = to_usb_interface(d);
	struct esd_usb *dev = usb_get_intfdata(intf);

	return sprintf(buf, "%d", dev->net_count);
}
static DEVICE_ATTR_RO(nets);

static int esd_usb_send_msg(struct esd_usb *dev, union esd_usb_msg *msg)
{
	int actual_length;

	return usb_bulk_msg(dev->udev, usb_sndbulkpipe(dev->udev, 2),
			    msg, msg->hdr.len * sizeof(u32), /* convert to # of bytes */
			    &actual_length, 1000);
}

static int esd_usb_wait_msg(struct esd_usb *dev, union esd_usb_msg *msg)
{
	int actual_length;

	return usb_bulk_msg(dev->udev, usb_rcvbulkpipe(dev->udev, 1),
			    msg, sizeof(*msg), &actual_length, 1000);
}

static int esd_usb_setup_rx_urbs(struct esd_usb *dev)
{
	int i, err = 0;

	if (dev->rxinitdone)
		return 0;

	for (i = 0; i < ESD_USB_MAX_RX_URBS; i++) {
		struct urb *urb = NULL;
		u8 *buf = NULL;
		dma_addr_t buf_dma;

		/* create a URB, and a buffer for it */
		urb = usb_alloc_urb(0, GFP_KERNEL);
		if (!urb) {
			err = -ENOMEM;
			break;
		}

		buf = usb_alloc_coherent(dev->udev, ESD_USB_RX_BUFFER_SIZE,
					 GFP_KERNEL, &buf_dma);
		if (!buf) {
			dev_warn(dev->udev->dev.parent,
				 "No memory left for USB buffer\n");
			err = -ENOMEM;
			goto freeurb;
		}

		urb->transfer_dma = buf_dma;

		usb_fill_bulk_urb(urb, dev->udev,
				  usb_rcvbulkpipe(dev->udev, 1),
				  buf, ESD_USB_RX_BUFFER_SIZE,
				  esd_usb_read_bulk_callback, dev);
		urb->transfer_flags |= URB_NO_TRANSFER_DMA_MAP;
		usb_anchor_urb(urb, &dev->rx_submitted);

		err = usb_submit_urb(urb, GFP_KERNEL);
		if (err) {
			usb_unanchor_urb(urb);
			usb_free_coherent(dev->udev, ESD_USB_RX_BUFFER_SIZE,
					  buf, urb->transfer_dma);
			goto freeurb;
		}

		dev->rxbuf[i] = buf;
		dev->rxbuf_dma[i] = buf_dma;

freeurb:
		/* Drop reference, USB core will take care
		 * of freeing it
		 */
		usb_free_urb(urb);

		if (err)
			break;
	}

	/* Did we submit any URBs? */
	if (i == 0) {
		dev_err(dev->udev->dev.parent, "couldn't setup read URBs\n");
		return err;
	}

	/* Warn if we couldn't submit all the URBs */
	if (i < ESD_USB_MAX_RX_URBS) {
		dev_warn(dev->udev->dev.parent, "rx performance may be slow\n");
	}

	dev->rxinitdone = 1;
	return 0;
}

/* Start interface */
static int esd_usb_start(struct esd_usb_net_priv *priv)
{
	struct esd_usb *dev = priv->usb;
	struct net_device *netdev = priv->netdev;
	union esd_usb_msg *msg;
	int err, i;

	msg = kmalloc(sizeof(*msg), GFP_KERNEL);
	if (!msg) {
		err = -ENOMEM;
		goto out;
	}

	/* Enable all IDs
	 * The IDADD message takes up to 64 32 bit bitmasks (2048 bits).
	 * Each bit represents one 11 bit CAN identifier. A set bit
	 * enables reception of the corresponding CAN identifier. A cleared
	 * bit disables this identifier. An additional bitmask value
	 * following the CAN 2.0A bits is used to enable reception of
	 * extended CAN frames. Only the LSB of this final mask is checked
	 * for the complete 29 bit ID range. The IDADD message also allows
	 * filter configuration for an ID subset. In this case you can add
	 * the number of the starting bitmask (0..64) to the filter.option
	 * field followed by only some bitmasks.
	 */
	msg->hdr.cmd = ESD_USB_CMD_IDADD;
	msg->hdr.len = sizeof(struct esd_usb_id_filter_msg) / sizeof(u32); /* # of 32bit words */
	msg->filter.net = priv->index;
	msg->filter.option = ESD_USB_ID_ENABLE; /* start with segment 0 */

	for (i = 0; i < ESD_USB_MAX_ID_SEGMENT; i++)
		msg->filter.mask[i] = cpu_to_le32(GENMASK(31, 0));
	/* enable 29bit extended IDs */
	msg->filter.mask[ESD_USB_MAX_ID_SEGMENT] = cpu_to_le32(BIT(0));

	err = esd_usb_send_msg(dev, msg);
	if (err)
		goto out;

	err = esd_usb_setup_rx_urbs(dev);
	if (err)
		goto out;

	priv->can.state = CAN_STATE_ERROR_ACTIVE;

out:
	if (err == -ENODEV)
		netif_device_detach(netdev);
	if (err)
		netdev_err(netdev, "couldn't start device: %d\n", err);

	kfree(msg);
	return err;
}

static void unlink_all_urbs(struct esd_usb *dev)
{
	struct esd_usb_net_priv *priv;
	int i, j;

	usb_kill_anchored_urbs(&dev->rx_submitted);

	for (i = 0; i < ESD_USB_MAX_RX_URBS; ++i)
		usb_free_coherent(dev->udev, ESD_USB_RX_BUFFER_SIZE,
				  dev->rxbuf[i], dev->rxbuf_dma[i]);

	for (i = 0; i < dev->net_count; i++) {
		priv = dev->nets[i];
		if (priv) {
			usb_kill_anchored_urbs(&priv->tx_submitted);
			atomic_set(&priv->active_tx_jobs, 0);

			for (j = 0; j < ESD_USB_MAX_TX_URBS; j++)
				priv->tx_contexts[j].echo_index = ESD_USB_MAX_TX_URBS;
		}
	}
}

static int esd_usb_open(struct net_device *netdev)
{
	struct esd_usb_net_priv *priv = netdev_priv(netdev);
	int err;

	/* common open */
	err = open_candev(netdev);
	if (err)
		return err;

	/* finally start device */
	err = esd_usb_start(priv);
	if (err) {
		netdev_warn(netdev, "couldn't start device: %d\n", err);
		close_candev(netdev);
		return err;
	}

	netif_start_queue(netdev);

	return 0;
}

static netdev_tx_t esd_usb_start_xmit(struct sk_buff *skb,
				      struct net_device *netdev)
{
	struct esd_usb_net_priv *priv = netdev_priv(netdev);
	struct esd_usb *dev = priv->usb;
	struct esd_tx_urb_context *context = NULL;
	struct net_device_stats *stats = &netdev->stats;
	struct canfd_frame *cfd = (struct canfd_frame *)skb->data;
	union esd_usb_msg *msg;
	struct urb *urb;
	u8 *buf;
	int i, err;
	int ret = NETDEV_TX_OK;
	size_t size = sizeof(union esd_usb_msg);

	if (can_dev_dropped_skb(netdev, skb))
		return NETDEV_TX_OK;

	/* create a URB, and a buffer for it, and copy the data to the URB */
	urb = usb_alloc_urb(0, GFP_ATOMIC);
	if (!urb) {
		stats->tx_dropped++;
		dev_kfree_skb(skb);
		goto
nourbmem; } buf = usb_alloc_coherent(dev->udev, size, GFP_ATOMIC, &urb->transfer_dma); if (!buf) { netdev_err(netdev, "No memory left for USB buffer\n"); stats->tx_dropped++; dev_kfree_skb(skb); goto nobufmem; } msg = (union esd_usb_msg *)buf; /* minimal length as # of 32bit words */ msg->hdr.len = offsetof(struct esd_usb_tx_msg, data) / sizeof(u32); msg->hdr.cmd = ESD_USB_CMD_CAN_TX; msg->tx.net = priv->index; if (can_is_canfd_skb(skb)) { msg->tx.dlc = can_fd_len2dlc(cfd->len); msg->tx.dlc |= ESD_USB_FD; if ((cfd->flags & CANFD_BRS) == 0) msg->tx.dlc |= ESD_USB_NO_BRS; } else { msg->tx.dlc = can_get_cc_dlc((struct can_frame *)cfd, priv->can.ctrlmode); if (cfd->can_id & CAN_RTR_FLAG) msg->tx.dlc |= ESD_USB_RTR; } msg->tx.id = cpu_to_le32(cfd->can_id & CAN_ERR_MASK); if (cfd->can_id & CAN_EFF_FLAG) msg->tx.id |= cpu_to_le32(ESD_USB_EXTID); memcpy(msg->tx.data_fd, cfd->data, cfd->len); /* round up, then divide by 4 to add the payload length as # of 32bit words */ msg->hdr.len += DIV_ROUND_UP(cfd->len, sizeof(u32)); for (i = 0; i < ESD_USB_MAX_TX_URBS; i++) { if (priv->tx_contexts[i].echo_index == ESD_USB_MAX_TX_URBS) { context = &priv->tx_contexts[i]; break; } } /* This may never happen */ if (!context) { netdev_warn(netdev, "couldn't find free context\n"); ret = NETDEV_TX_BUSY; goto releasebuf; } context->priv = priv; context->echo_index = i; /* hnd must not be 0 - MSB is stripped in txdone handling */ msg->tx.hnd = BIT(31) | i; /* returned in TX done message */ usb_fill_bulk_urb(urb, dev->udev, usb_sndbulkpipe(dev->udev, 2), buf, msg->hdr.len * sizeof(u32), /* convert to # of bytes */ esd_usb_write_bulk_callback, context); urb->transfer_flags |= URB_NO_TRANSFER_DMA_MAP; usb_anchor_urb(urb, &priv->tx_submitted); can_put_echo_skb(skb, netdev, context->echo_index, 0); atomic_inc(&priv->active_tx_jobs); /* Slow down tx path */ if (atomic_read(&priv->active_tx_jobs) >= ESD_USB_MAX_TX_URBS) netif_stop_queue(netdev); err = usb_submit_urb(urb, GFP_ATOMIC); if (err) { can_free_echo_skb(netdev, context->echo_index, NULL); atomic_dec(&priv->active_tx_jobs); usb_unanchor_urb(urb); stats->tx_dropped++; if (err == -ENODEV) netif_device_detach(netdev); else netdev_warn(netdev, "failed tx_urb %d\n", err); goto releasebuf; } netif_trans_update(netdev); /* Release our reference to this URB, the USB core will eventually free * it entirely. 
*/ usb_free_urb(urb); return NETDEV_TX_OK; releasebuf: usb_free_coherent(dev->udev, size, buf, urb->transfer_dma); nobufmem: usb_free_urb(urb); nourbmem: return ret; } static int esd_usb_close(struct net_device *netdev) { struct esd_usb_net_priv *priv = netdev_priv(netdev); union esd_usb_msg *msg; int i; msg = kmalloc(sizeof(*msg), GFP_KERNEL); if (!msg) return -ENOMEM; /* Disable all IDs (see esd_usb_start()) */ msg->hdr.cmd = ESD_USB_CMD_IDADD; msg->hdr.len = sizeof(struct esd_usb_id_filter_msg) / sizeof(u32);/* # of 32bit words */ msg->filter.net = priv->index; msg->filter.option = ESD_USB_ID_ENABLE; /* start with segment 0 */ for (i = 0; i <= ESD_USB_MAX_ID_SEGMENT; i++) msg->filter.mask[i] = 0; if (esd_usb_send_msg(priv->usb, msg) < 0) netdev_err(netdev, "sending idadd message failed\n"); /* set CAN controller to reset mode */ msg->hdr.len = sizeof(struct esd_usb_set_baudrate_msg) / sizeof(u32); /* # of 32bit words */ msg->hdr.cmd = ESD_USB_CMD_SETBAUD; msg->setbaud.net = priv->index; msg->setbaud.rsvd = 0; msg->setbaud.baud = cpu_to_le32(ESD_USB_NO_BAUDRATE); if (esd_usb_send_msg(priv->usb, msg) < 0) netdev_err(netdev, "sending setbaud message failed\n"); priv->can.state = CAN_STATE_STOPPED; netif_stop_queue(netdev); close_candev(netdev); kfree(msg); return 0; } static const struct net_device_ops esd_usb_netdev_ops = { .ndo_open = esd_usb_open, .ndo_stop = esd_usb_close, .ndo_start_xmit = esd_usb_start_xmit, .ndo_change_mtu = can_change_mtu, }; static const struct ethtool_ops esd_usb_ethtool_ops = { .get_ts_info = ethtool_op_get_ts_info, }; static const struct can_bittiming_const esd_usb_2_bittiming_const = { .name = "esd_usb_2", .tseg1_min = 1, .tseg1_max = 16, .tseg2_min = 1, .tseg2_max = 8, .sjw_max = 4, .brp_min = 1, .brp_max = 1024, .brp_inc = 1, }; static int esd_usb_2_set_bittiming(struct net_device *netdev) { const struct can_bittiming_const *btc = &esd_usb_2_bittiming_const; struct esd_usb_net_priv *priv = netdev_priv(netdev); struct can_bittiming *bt = &priv->can.bittiming; union esd_usb_msg *msg; int err; u32 canbtr; int sjw_shift; canbtr = ESD_USB_UBR; if (priv->can.ctrlmode & CAN_CTRLMODE_LISTENONLY) canbtr |= ESD_USB_LOM; canbtr |= (bt->brp - 1) & (btc->brp_max - 1); if (le16_to_cpu(priv->usb->udev->descriptor.idProduct) == ESD_USB_CANUSBM_PRODUCT_ID) sjw_shift = ESD_USB_M_SJW_SHIFT; else sjw_shift = ESD_USB_2_SJW_SHIFT; canbtr |= ((bt->sjw - 1) & (btc->sjw_max - 1)) << sjw_shift; canbtr |= ((bt->prop_seg + bt->phase_seg1 - 1) & (btc->tseg1_max - 1)) << ESD_USB_2_TSEG1_SHIFT; canbtr |= ((bt->phase_seg2 - 1) & (btc->tseg2_max - 1)) << ESD_USB_2_TSEG2_SHIFT; if (priv->can.ctrlmode & CAN_CTRLMODE_3_SAMPLES) canbtr |= ESD_USB_TRIPLE_SAMPLES; msg = kmalloc(sizeof(*msg), GFP_KERNEL); if (!msg) return -ENOMEM; msg->hdr.len = sizeof(struct esd_usb_set_baudrate_msg) / sizeof(u32); /* # of 32bit words */ msg->hdr.cmd = ESD_USB_CMD_SETBAUD; msg->setbaud.net = priv->index; msg->setbaud.rsvd = 0; msg->setbaud.baud = cpu_to_le32(canbtr); netdev_dbg(netdev, "setting BTR=%#x\n", canbtr); err = esd_usb_send_msg(priv->usb, msg); kfree(msg); return err; } /* Nominal bittiming constants, see * Microchip SAM E70/S70/V70/V71, Data Sheet, Rev. 
G - 07/2022 * 48.6.8 MCAN Nominal Bit Timing and Prescaler Register */ static const struct can_bittiming_const esd_usb_3_nom_bittiming_const = { .name = "esd_usb_3", .tseg1_min = 2, .tseg1_max = 256, .tseg2_min = 2, .tseg2_max = 128, .sjw_max = 128, .brp_min = 1, .brp_max = 512, .brp_inc = 1, }; /* Data bittiming constants, see * Microchip SAM E70/S70/V70/V71, Data Sheet, Rev. G - 07/2022 * 48.6.4 MCAN Data Bit Timing and Prescaler Register */ static const struct can_bittiming_const esd_usb_3_data_bittiming_const = { .name = "esd_usb_3", .tseg1_min = 2, .tseg1_max = 32, .tseg2_min = 1, .tseg2_max = 16, .sjw_max = 8, .brp_min = 1, .brp_max = 32, .brp_inc = 1, }; static int esd_usb_3_set_bittiming(struct net_device *netdev) { const struct can_bittiming_const *nom_btc = &esd_usb_3_nom_bittiming_const; const struct can_bittiming_const *data_btc = &esd_usb_3_data_bittiming_const; struct esd_usb_net_priv *priv = netdev_priv(netdev); struct can_bittiming *nom_bt = &priv->can.bittiming; struct can_bittiming *data_bt = &priv->can.fd.data_bittiming; struct esd_usb_3_set_baudrate_msg_x *baud_x; union esd_usb_msg *msg; u16 flags = 0; int err; msg = kmalloc(sizeof(*msg), GFP_KERNEL); if (!msg) return -ENOMEM; baud_x = &msg->setbaud_x; /* Canonical is the most reasonable mode for SocketCAN on CAN-USB/3 ... */ baud_x->mode = cpu_to_le16(ESD_USB_3_BAUDRATE_MODE_BTR_CANONICAL); if (priv->can.ctrlmode & CAN_CTRLMODE_LISTENONLY) flags |= ESD_USB_3_BAUDRATE_FLAG_LOM; baud_x->nom.brp = cpu_to_le16(nom_bt->brp & (nom_btc->brp_max - 1)); baud_x->nom.sjw = cpu_to_le16(nom_bt->sjw & (nom_btc->sjw_max - 1)); baud_x->nom.tseg1 = cpu_to_le16((nom_bt->prop_seg + nom_bt->phase_seg1) & (nom_btc->tseg1_max - 1)); baud_x->nom.tseg2 = cpu_to_le16(nom_bt->phase_seg2 & (nom_btc->tseg2_max - 1)); if (priv->can.ctrlmode & CAN_CTRLMODE_FD) { baud_x->data.brp = cpu_to_le16(data_bt->brp & (data_btc->brp_max - 1)); baud_x->data.sjw = cpu_to_le16(data_bt->sjw & (data_btc->sjw_max - 1)); baud_x->data.tseg1 = cpu_to_le16((data_bt->prop_seg + data_bt->phase_seg1) & (data_btc->tseg1_max - 1)); baud_x->data.tseg2 = cpu_to_le16(data_bt->phase_seg2 & (data_btc->tseg2_max - 1)); flags |= ESD_USB_3_BAUDRATE_FLAG_FD; } /* Currently this driver only supports the automatic TDC mode */ baud_x->tdc.tdc_mode = ESD_USB_3_TDC_MODE_AUTO; baud_x->tdc.ssp_offset = 0; baud_x->tdc.ssp_shift = 0; baud_x->tdc.tdc_filter = 0; baud_x->flags = cpu_to_le16(flags); baud_x->net = priv->index; baud_x->rsvd = 0; /* set len as # of 32bit words */ msg->hdr.len = sizeof(struct esd_usb_3_set_baudrate_msg_x) / sizeof(u32); msg->hdr.cmd = ESD_USB_CMD_SETBAUD; netdev_dbg(netdev, "ctrlmode=%#x/%#x, esd-net=%u, esd-mode=%#x, esd-flags=%#x\n", priv->can.ctrlmode, priv->can.ctrlmode_supported, priv->index, le16_to_cpu(baud_x->mode), flags); err = esd_usb_send_msg(priv->usb, msg); kfree(msg); return err; } static int esd_usb_get_berr_counter(const struct net_device *netdev, struct can_berr_counter *bec) { struct esd_usb_net_priv *priv = netdev_priv(netdev); bec->txerr = priv->bec.txerr; bec->rxerr = priv->bec.rxerr; return 0; } static int esd_usb_set_mode(struct net_device *netdev, enum can_mode mode) { switch (mode) { case CAN_MODE_START: netif_wake_queue(netdev); break; default: return -EOPNOTSUPP; } return 0; } static int esd_usb_probe_one_net(struct usb_interface *intf, int index) { struct esd_usb *dev = usb_get_intfdata(intf); struct net_device *netdev; struct esd_usb_net_priv *priv; int err = 0; int i; netdev = alloc_candev(sizeof(*priv), ESD_USB_MAX_TX_URBS); if 
(!netdev) { dev_err(&intf->dev, "couldn't alloc candev\n"); err = -ENOMEM; goto done; } priv = netdev_priv(netdev); init_usb_anchor(&priv->tx_submitted); atomic_set(&priv->active_tx_jobs, 0); for (i = 0; i < ESD_USB_MAX_TX_URBS; i++) priv->tx_contexts[i].echo_index = ESD_USB_MAX_TX_URBS; priv->usb = dev; priv->netdev = netdev; priv->index = index; priv->can.state = CAN_STATE_STOPPED; priv->can.ctrlmode_supported = CAN_CTRLMODE_LISTENONLY | CAN_CTRLMODE_CC_LEN8_DLC | CAN_CTRLMODE_BERR_REPORTING; switch (le16_to_cpu(dev->udev->descriptor.idProduct)) { case ESD_USB_CANUSB3_PRODUCT_ID: priv->can.clock.freq = ESD_USB_3_CAN_CLOCK; priv->can.ctrlmode_supported |= CAN_CTRLMODE_FD; priv->can.bittiming_const = &esd_usb_3_nom_bittiming_const; priv->can.fd.data_bittiming_const = &esd_usb_3_data_bittiming_const; priv->can.do_set_bittiming = esd_usb_3_set_bittiming; priv->can.fd.do_set_data_bittiming = esd_usb_3_set_bittiming; break; case ESD_USB_CANUSBM_PRODUCT_ID: priv->can.clock.freq = ESD_USB_M_CAN_CLOCK; priv->can.bittiming_const = &esd_usb_2_bittiming_const; priv->can.do_set_bittiming = esd_usb_2_set_bittiming; break; case ESD_USB_CANUSB2_PRODUCT_ID: default: priv->can.clock.freq = ESD_USB_2_CAN_CLOCK; priv->can.ctrlmode_supported |= CAN_CTRLMODE_3_SAMPLES; priv->can.bittiming_const = &esd_usb_2_bittiming_const; priv->can.do_set_bittiming = esd_usb_2_set_bittiming; break; } priv->can.do_set_mode = esd_usb_set_mode; priv->can.do_get_berr_counter = esd_usb_get_berr_counter; netdev->flags |= IFF_ECHO; /* we support local echo */ netdev->netdev_ops = &esd_usb_netdev_ops; netdev->ethtool_ops = &esd_usb_ethtool_ops; SET_NETDEV_DEV(netdev, &intf->dev); netdev->dev_id = index; err = register_candev(netdev); if (err) { dev_err(&intf->dev, "couldn't register CAN device: %d\n", err); free_candev(netdev); err = -ENOMEM; goto done; } dev->nets[index] = priv; netdev_info(netdev, "device %s registered\n", netdev->name); done: return err; } /* probe function for new USB devices * * check version information and number of available * CAN interfaces */ static int esd_usb_probe(struct usb_interface *intf, const struct usb_device_id *id) { struct esd_usb *dev; union esd_usb_msg *msg; int i, err; dev = kzalloc(sizeof(*dev), GFP_KERNEL); if (!dev) { err = -ENOMEM; goto done; } dev->udev = interface_to_usbdev(intf); init_usb_anchor(&dev->rx_submitted); usb_set_intfdata(intf, dev); msg = kmalloc(sizeof(*msg), GFP_KERNEL); if (!msg) { err = -ENOMEM; goto free_msg; } /* query number of CAN interfaces (nets) */ msg->hdr.cmd = ESD_USB_CMD_VERSION; msg->hdr.len = sizeof(struct esd_usb_version_msg) / sizeof(u32); /* # of 32bit words */ msg->version.rsvd = 0; msg->version.flags = 0; msg->version.drv_version = 0; err = esd_usb_send_msg(dev, msg); if (err < 0) { dev_err(&intf->dev, "sending version message failed\n"); goto free_msg; } err = esd_usb_wait_msg(dev, msg); if (err < 0) { dev_err(&intf->dev, "no version message answer\n"); goto free_msg; } dev->net_count = (int)msg->version_reply.nets; dev->version = le32_to_cpu(msg->version_reply.version); if (device_create_file(&intf->dev, &dev_attr_firmware)) dev_err(&intf->dev, "Couldn't create device file for firmware\n"); if (device_create_file(&intf->dev, &dev_attr_hardware)) dev_err(&intf->dev, "Couldn't create device file for hardware\n"); if (device_create_file(&intf->dev, &dev_attr_nets)) dev_err(&intf->dev, "Couldn't create device file for nets\n"); /* do per device probing */ for (i = 0; i < dev->net_count; i++) esd_usb_probe_one_net(intf, i); free_msg: kfree(msg); if 
(err) kfree(dev); done: return err; } /* called by the usb core when the device is removed from the system */ static void esd_usb_disconnect(struct usb_interface *intf) { struct esd_usb *dev = usb_get_intfdata(intf); struct net_device *netdev; int i; device_remove_file(&intf->dev, &dev_attr_firmware); device_remove_file(&intf->dev, &dev_attr_hardware); device_remove_file(&intf->dev, &dev_attr_nets); usb_set_intfdata(intf, NULL); if (dev) { for (i = 0; i < dev->net_count; i++) { if (dev->nets[i]) { netdev = dev->nets[i]->netdev; unregister_netdev(netdev); free_candev(netdev); } } unlink_all_urbs(dev); kfree(dev); } } /* usb specific object needed to register this driver with the usb subsystem */ static struct usb_driver esd_usb_driver = { .name = KBUILD_MODNAME, .probe = esd_usb_probe, .disconnect = esd_usb_disconnect, .id_table = esd_usb_table, }; module_usb_driver(esd_usb_driver);
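Because the driver registers plain candev network interfaces, no driver-specific userspace API is involved: frames reach esd_usb_start_xmit() through the ordinary SocketCAN path. As an illustrative sketch only (assuming the interface came up as can0 and was configured beforehand, e.g. with "ip link set can0 up type can bitrate 500000"), sending one frame could look like:

/* Hedged userspace sketch; "can0" is an assumption, not driver API. */
#include <string.h>
#include <unistd.h>
#include <net/if.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <linux/can.h>
#include <linux/can/raw.h>

int main(void)
{
	struct sockaddr_can addr = { .can_family = AF_CAN };
	struct ifreq ifr = { 0 };
	struct can_frame frame = {
		.can_id = 0x123,
		.len = 2,		/* 'len' replaces the legacy can_dlc */
		.data = { 0xde, 0xad },
	};
	int s = socket(PF_CAN, SOCK_RAW, CAN_RAW);

	if (s < 0)
		return 1;

	strcpy(ifr.ifr_name, "can0");
	if (ioctl(s, SIOCGIFINDEX, &ifr))	/* resolve ifindex of can0 */
		return 1;
	addr.can_ifindex = ifr.ifr_ifindex;
	if (bind(s, (struct sockaddr *)&addr, sizeof(addr)))
		return 1;

	write(s, &frame, sizeof(frame));	/* ends up in esd_usb_start_xmit() */
	return close(s);
}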
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2011 IBM Corporation
 *
 * Author:
 * Mimi Zohar <zohar@us.ibm.com>
 */

#include <linux/module.h>
#include <linux/init.h> #include <linux/file.h> #include <linux/binfmts.h> #include <linux/fs.h> #include <linux/xattr.h> #include <linux/magic.h> #include <linux/ima.h> #include <linux/evm.h> #include <linux/fsverity.h> #include <keys/system_keyring.h> #include <uapi/linux/fsverity.h> #include "ima.h" #ifdef CONFIG_IMA_APPRAISE_BOOTPARAM static char *ima_appraise_cmdline_default __initdata; core_param(ima_appraise, ima_appraise_cmdline_default, charp, 0); void __init ima_appraise_parse_cmdline(void) { const char *str = ima_appraise_cmdline_default; bool sb_state = arch_ima_get_secureboot(); int appraisal_state = ima_appraise; if (!str) return; if (strncmp(str, "off", 3) == 0) appraisal_state = 0; else if (strncmp(str, "log", 3) == 0) appraisal_state = IMA_APPRAISE_LOG; else if (strncmp(str, "fix", 3) == 0) appraisal_state = IMA_APPRAISE_FIX; else if (strncmp(str, "enforce", 7) == 0) appraisal_state = IMA_APPRAISE_ENFORCE; else pr_err("invalid \"%s\" appraise option", str); /* If appraisal state was changed, but secure boot is enabled, * keep its default */ if (sb_state) { if (!(appraisal_state & IMA_APPRAISE_ENFORCE)) pr_info("Secure boot enabled: ignoring ima_appraise=%s option", str); } else { ima_appraise = appraisal_state; } } #endif /* * is_ima_appraise_enabled - return appraise status * * Only return enabled, if not in ima_appraise="fix" or "log" modes. */ bool is_ima_appraise_enabled(void) { return ima_appraise & IMA_APPRAISE_ENFORCE; } /* * ima_must_appraise - set appraise flag * * Return 1 to appraise or hash */ int ima_must_appraise(struct mnt_idmap *idmap, struct inode *inode, int mask, enum ima_hooks func) { struct lsm_prop prop; if (!ima_appraise) return 0; security_current_getlsmprop_subj(&prop); return ima_match_policy(idmap, inode, current_cred(), &prop, func, mask, IMA_APPRAISE | IMA_HASH, NULL, NULL, NULL, NULL); } static int ima_fix_xattr(struct dentry *dentry, struct ima_iint_cache *iint) { int rc, offset; u8 algo = iint->ima_hash->algo; if (algo <= HASH_ALGO_SHA1) { offset = 1; iint->ima_hash->xattr.sha1.type = IMA_XATTR_DIGEST; } else { offset = 0; iint->ima_hash->xattr.ng.type = IMA_XATTR_DIGEST_NG; iint->ima_hash->xattr.ng.algo = algo; } rc = __vfs_setxattr_noperm(&nop_mnt_idmap, dentry, XATTR_NAME_IMA, &iint->ima_hash->xattr.data[offset], (sizeof(iint->ima_hash->xattr) - offset) + iint->ima_hash->length, 0); return rc; } /* Return specific func appraised cached result */ enum integrity_status ima_get_cache_status(struct ima_iint_cache *iint, enum ima_hooks func) { switch (func) { case MMAP_CHECK: case MMAP_CHECK_REQPROT: return iint->ima_mmap_status; case BPRM_CHECK: return iint->ima_bprm_status; case CREDS_CHECK: return iint->ima_creds_status; case FILE_CHECK: case POST_SETATTR: return iint->ima_file_status; case MODULE_CHECK ... MAX_CHECK - 1: default: return iint->ima_read_status; } } static void ima_set_cache_status(struct ima_iint_cache *iint, enum ima_hooks func, enum integrity_status status) { switch (func) { case MMAP_CHECK: case MMAP_CHECK_REQPROT: iint->ima_mmap_status = status; break; case BPRM_CHECK: iint->ima_bprm_status = status; break; case CREDS_CHECK: iint->ima_creds_status = status; break; case FILE_CHECK: case POST_SETATTR: iint->ima_file_status = status; break; case MODULE_CHECK ... 
MAX_CHECK - 1: default: iint->ima_read_status = status; break; } } static void ima_cache_flags(struct ima_iint_cache *iint, enum ima_hooks func) { switch (func) { case MMAP_CHECK: case MMAP_CHECK_REQPROT: iint->flags |= (IMA_MMAP_APPRAISED | IMA_APPRAISED); break; case BPRM_CHECK: iint->flags |= (IMA_BPRM_APPRAISED | IMA_APPRAISED); break; case CREDS_CHECK: iint->flags |= (IMA_CREDS_APPRAISED | IMA_APPRAISED); break; case FILE_CHECK: case POST_SETATTR: iint->flags |= (IMA_FILE_APPRAISED | IMA_APPRAISED); break; case MODULE_CHECK ... MAX_CHECK - 1: default: iint->flags |= (IMA_READ_APPRAISED | IMA_APPRAISED); break; } } enum hash_algo ima_get_hash_algo(const struct evm_ima_xattr_data *xattr_value, int xattr_len) { struct signature_v2_hdr *sig; enum hash_algo ret; if (!xattr_value || xattr_len < 2) /* return default hash algo */ return ima_hash_algo; switch (xattr_value->type) { case IMA_VERITY_DIGSIG: sig = (typeof(sig))xattr_value; if (sig->version != 3 || xattr_len <= sizeof(*sig) || sig->hash_algo >= HASH_ALGO__LAST) return ima_hash_algo; return sig->hash_algo; case EVM_IMA_XATTR_DIGSIG: sig = (typeof(sig))xattr_value; if (sig->version != 2 || xattr_len <= sizeof(*sig) || sig->hash_algo >= HASH_ALGO__LAST) return ima_hash_algo; return sig->hash_algo; case IMA_XATTR_DIGEST_NG: /* first byte contains algorithm id */ ret = xattr_value->data[0]; if (ret < HASH_ALGO__LAST) return ret; break; case IMA_XATTR_DIGEST: /* this is for backward compatibility */ if (xattr_len == 21) { unsigned int zero = 0; if (!memcmp(&xattr_value->data[16], &zero, 4)) return HASH_ALGO_MD5; else return HASH_ALGO_SHA1; } else if (xattr_len == 17) return HASH_ALGO_MD5; break; } /* return default hash algo */ return ima_hash_algo; } int ima_read_xattr(struct dentry *dentry, struct evm_ima_xattr_data **xattr_value, int xattr_len) { int ret; ret = vfs_getxattr_alloc(&nop_mnt_idmap, dentry, XATTR_NAME_IMA, (char **)xattr_value, xattr_len, GFP_NOFS); if (ret == -EOPNOTSUPP) ret = 0; return ret; } /* * calc_file_id_hash - calculate the hash of the ima_file_id struct data * @type: xattr type [enum evm_ima_xattr_type] * @algo: hash algorithm [enum hash_algo] * @digest: pointer to the digest to be hashed * @hash: (out) pointer to the hash * * IMA signature version 3 disambiguates the data that is signed by * indirectly signing the hash of the ima_file_id structure data. * * Signing the ima_file_id struct is currently only supported for * IMA_VERITY_DIGSIG type xattrs. * * Return 0 on success, error code otherwise. */ static int calc_file_id_hash(enum evm_ima_xattr_type type, enum hash_algo algo, const u8 *digest, struct ima_digest_data *hash) { struct ima_file_id file_id = { .hash_type = IMA_VERITY_DIGSIG, .hash_algorithm = algo}; unsigned int unused = HASH_MAX_DIGESTSIZE - hash_digest_size[algo]; if (type != IMA_VERITY_DIGSIG) return -EINVAL; memcpy(file_id.hash, digest, hash_digest_size[algo]); hash->algo = algo; hash->length = hash_digest_size[algo]; return ima_calc_buffer_hash(&file_id, sizeof(file_id) - unused, hash); } /* * xattr_verify - verify xattr digest or signature * * Verify whether the hash or signature matches the file contents. * * Return 0 on success, error code otherwise. 
*/ static int xattr_verify(enum ima_hooks func, struct ima_iint_cache *iint, struct evm_ima_xattr_data *xattr_value, int xattr_len, enum integrity_status *status, const char **cause) { struct ima_max_digest_data hash; struct signature_v2_hdr *sig; int rc = -EINVAL, hash_start = 0; int mask; switch (xattr_value->type) { case IMA_XATTR_DIGEST_NG: /* first byte contains algorithm id */ hash_start = 1; fallthrough; case IMA_XATTR_DIGEST: if (*status != INTEGRITY_PASS_IMMUTABLE) { if (iint->flags & IMA_DIGSIG_REQUIRED) { if (iint->flags & IMA_VERITY_REQUIRED) *cause = "verity-signature-required"; else *cause = "IMA-signature-required"; *status = INTEGRITY_FAIL; break; } clear_bit(IMA_DIGSIG, &iint->atomic_flags); } else { set_bit(IMA_DIGSIG, &iint->atomic_flags); } if (xattr_len - sizeof(xattr_value->type) - hash_start >= iint->ima_hash->length) /* * xattr length may be longer. md5 hash in previous * version occupied 20 bytes in xattr, instead of 16 */ rc = memcmp(&xattr_value->data[hash_start], iint->ima_hash->digest, iint->ima_hash->length); else rc = -EINVAL; if (rc) { *cause = "invalid-hash"; *status = INTEGRITY_FAIL; break; } *status = INTEGRITY_PASS; break; case EVM_IMA_XATTR_DIGSIG: set_bit(IMA_DIGSIG, &iint->atomic_flags); mask = IMA_DIGSIG_REQUIRED | IMA_VERITY_REQUIRED; if ((iint->flags & mask) == mask) { *cause = "verity-signature-required"; *status = INTEGRITY_FAIL; break; } sig = (typeof(sig))xattr_value; if (sig->version >= 3) { *cause = "invalid-signature-version"; *status = INTEGRITY_FAIL; break; } rc = integrity_digsig_verify(INTEGRITY_KEYRING_IMA, (const char *)xattr_value, xattr_len, iint->ima_hash->digest, iint->ima_hash->length); if (rc == -EOPNOTSUPP) { *status = INTEGRITY_UNKNOWN; break; } if (IS_ENABLED(CONFIG_INTEGRITY_PLATFORM_KEYRING) && rc && func == KEXEC_KERNEL_CHECK) rc = integrity_digsig_verify(INTEGRITY_KEYRING_PLATFORM, (const char *)xattr_value, xattr_len, iint->ima_hash->digest, iint->ima_hash->length); if (rc) { *cause = "invalid-signature"; *status = INTEGRITY_FAIL; } else { *status = INTEGRITY_PASS; } break; case IMA_VERITY_DIGSIG: set_bit(IMA_DIGSIG, &iint->atomic_flags); if (iint->flags & IMA_DIGSIG_REQUIRED) { if (!(iint->flags & IMA_VERITY_REQUIRED)) { *cause = "IMA-signature-required"; *status = INTEGRITY_FAIL; break; } } sig = (typeof(sig))xattr_value; if (sig->version != 3) { *cause = "invalid-signature-version"; *status = INTEGRITY_FAIL; break; } rc = calc_file_id_hash(IMA_VERITY_DIGSIG, iint->ima_hash->algo, iint->ima_hash->digest, container_of(&hash.hdr, struct ima_digest_data, hdr)); if (rc) { *cause = "sigv3-hashing-error"; *status = INTEGRITY_FAIL; break; } rc = integrity_digsig_verify(INTEGRITY_KEYRING_IMA, (const char *)xattr_value, xattr_len, hash.digest, hash.hdr.length); if (rc) { *cause = "invalid-verity-signature"; *status = INTEGRITY_FAIL; } else { *status = INTEGRITY_PASS; } break; default: *status = INTEGRITY_UNKNOWN; *cause = "unknown-ima-data"; break; } return rc; } /* * modsig_verify - verify modsig signature * * Verify whether the signature matches the file contents. * * Return 0 on success, error code otherwise. 
*/ static int modsig_verify(enum ima_hooks func, const struct modsig *modsig, enum integrity_status *status, const char **cause) { int rc; rc = integrity_modsig_verify(INTEGRITY_KEYRING_IMA, modsig); if (IS_ENABLED(CONFIG_INTEGRITY_PLATFORM_KEYRING) && rc && func == KEXEC_KERNEL_CHECK) rc = integrity_modsig_verify(INTEGRITY_KEYRING_PLATFORM, modsig); if (rc) { *cause = "invalid-signature"; *status = INTEGRITY_FAIL; } else { *status = INTEGRITY_PASS; } return rc; } /* * ima_check_blacklist - determine if the binary is blacklisted. * * Add the hash of the blacklisted binary to the measurement list, based * on policy. * * Returns -EPERM if the hash is blacklisted. */ int ima_check_blacklist(struct ima_iint_cache *iint, const struct modsig *modsig, int pcr) { enum hash_algo hash_algo; const u8 *digest = NULL; u32 digestsize = 0; int rc = 0; if (!(iint->flags & IMA_CHECK_BLACKLIST)) return 0; if (iint->flags & IMA_MODSIG_ALLOWED && modsig) { ima_get_modsig_digest(modsig, &hash_algo, &digest, &digestsize); rc = is_binary_blacklisted(digest, digestsize); } else if (iint->flags & IMA_DIGSIG_REQUIRED && iint->ima_hash) rc = is_binary_blacklisted(iint->ima_hash->digest, iint->ima_hash->length); if ((rc == -EPERM) && (iint->flags & IMA_MEASURE)) process_buffer_measurement(&nop_mnt_idmap, NULL, digest, digestsize, "blacklisted-hash", NONE, pcr, NULL, false, NULL, 0); return rc; } static bool is_bprm_creds_for_exec(enum ima_hooks func, struct file *file) { struct linux_binprm *bprm; if (func == BPRM_CHECK) { bprm = container_of(&file, struct linux_binprm, file); return bprm->is_check; } return false; } /* * ima_appraise_measurement - appraise file measurement * * Call evm_verifyxattr() to verify the integrity of 'security.ima'. * Assuming success, compare the xattr hash with the collected measurement. * * Return 0 on success, error code otherwise */ int ima_appraise_measurement(enum ima_hooks func, struct ima_iint_cache *iint, struct file *file, const unsigned char *filename, struct evm_ima_xattr_data *xattr_value, int xattr_len, const struct modsig *modsig) { static const char op[] = "appraise_data"; int audit_msgno = AUDIT_INTEGRITY_DATA; const char *cause = "unknown"; struct dentry *dentry = file_dentry(file); struct inode *inode = d_backing_inode(dentry); enum integrity_status status = INTEGRITY_UNKNOWN; int rc = xattr_len; bool try_modsig = iint->flags & IMA_MODSIG_ALLOWED && modsig; /* If not appraising a modsig, we need an xattr. */ if (!(inode->i_opflags & IOP_XATTR) && !try_modsig) return INTEGRITY_UNKNOWN; /* * Unlike any of the other LSM hooks where the kernel enforces file * integrity, enforcing file integrity for the bprm_creds_for_exec() * LSM hook with the AT_EXECVE_CHECK flag is left up to the discretion * of the script interpreter(userspace). Differentiate kernel and * userspace enforced integrity audit messages. */ if (is_bprm_creds_for_exec(func, file)) audit_msgno = AUDIT_INTEGRITY_USERSPACE; /* If reading the xattr failed and there's no modsig, error out. 
*/ if (rc <= 0 && !try_modsig) { if (rc && rc != -ENODATA) goto out; if (iint->flags & IMA_DIGSIG_REQUIRED) { if (iint->flags & IMA_VERITY_REQUIRED) cause = "verity-signature-required"; else cause = "IMA-signature-required"; } else { cause = "missing-hash"; } status = INTEGRITY_NOLABEL; if (file->f_mode & FMODE_CREATED) iint->flags |= IMA_NEW_FILE; if ((iint->flags & IMA_NEW_FILE) && (!(iint->flags & IMA_DIGSIG_REQUIRED) || (inode->i_size == 0))) status = INTEGRITY_PASS; goto out; } status = evm_verifyxattr(dentry, XATTR_NAME_IMA, xattr_value, rc < 0 ? 0 : rc); switch (status) { case INTEGRITY_PASS: case INTEGRITY_PASS_IMMUTABLE: case INTEGRITY_UNKNOWN: break; case INTEGRITY_NOXATTRS: /* No EVM protected xattrs. */ /* It's fine not to have xattrs when using a modsig. */ if (try_modsig) break; fallthrough; case INTEGRITY_NOLABEL: /* No security.evm xattr. */ cause = "missing-HMAC"; goto out; case INTEGRITY_FAIL_IMMUTABLE: set_bit(IMA_DIGSIG, &iint->atomic_flags); cause = "invalid-fail-immutable"; goto out; case INTEGRITY_FAIL: /* Invalid HMAC/signature. */ cause = "invalid-HMAC"; goto out; default: WARN_ONCE(true, "Unexpected integrity status %d\n", status); } if (xattr_value) rc = xattr_verify(func, iint, xattr_value, xattr_len, &status, &cause); /* * If we have a modsig and either no imasig or the imasig's key isn't * known, then try verifying the modsig. */ if (try_modsig && (!xattr_value || xattr_value->type == IMA_XATTR_DIGEST_NG || rc == -ENOKEY)) rc = modsig_verify(func, modsig, &status, &cause); out: /* * File signatures on some filesystems can not be properly verified. * When such filesystems are mounted by an untrusted mounter or on a * system not willing to accept such a risk, fail the file signature * verification. */ if ((inode->i_sb->s_iflags & SB_I_IMA_UNVERIFIABLE_SIGNATURE) && ((inode->i_sb->s_iflags & SB_I_UNTRUSTED_MOUNTER) || (iint->flags & IMA_FAIL_UNVERIFIABLE_SIGS))) { status = INTEGRITY_FAIL; cause = "unverifiable-signature"; integrity_audit_msg(audit_msgno, inode, filename, op, cause, rc, 0); } else if (status != INTEGRITY_PASS) { /* Fix mode, but don't replace file signatures. */ if ((ima_appraise & IMA_APPRAISE_FIX) && !try_modsig && (!xattr_value || xattr_value->type != EVM_IMA_XATTR_DIGSIG)) { if (!ima_fix_xattr(dentry, iint)) status = INTEGRITY_PASS; } /* * Permit new files with file/EVM portable signatures, but * without data. */ if (inode->i_size == 0 && iint->flags & IMA_NEW_FILE && test_bit(IMA_DIGSIG, &iint->atomic_flags)) { status = INTEGRITY_PASS; } integrity_audit_msg(audit_msgno, inode, filename, op, cause, rc, 0); } else { ima_cache_flags(iint, func); } ima_set_cache_status(iint, func, status); return status; } /* * ima_update_xattr - update 'security.ima' hash value */ void ima_update_xattr(struct ima_iint_cache *iint, struct file *file) { struct dentry *dentry = file_dentry(file); int rc = 0; /* do not collect and update hash for digital signatures */ if (test_bit(IMA_DIGSIG, &iint->atomic_flags)) return; if ((iint->ima_file_status != INTEGRITY_PASS) && !(iint->flags & IMA_HASH)) return; rc = ima_collect_measurement(iint, file, NULL, 0, ima_hash_algo, NULL); if (rc < 0) return; inode_lock(file_inode(file)); ima_fix_xattr(dentry, iint); inode_unlock(file_inode(file)); } /** * ima_inode_post_setattr - reflect file metadata changes * @idmap: idmap of the mount the inode was found from * @dentry: pointer to the affected dentry * @ia_valid: for the UID and GID status * * Changes to a dentry's metadata might result in needing to appraise. 
* * This function is called from notify_change(), which expects the caller * to lock the inode's i_mutex. */ static void ima_inode_post_setattr(struct mnt_idmap *idmap, struct dentry *dentry, int ia_valid) { struct inode *inode = d_backing_inode(dentry); struct ima_iint_cache *iint; int action; if (!(ima_policy_flag & IMA_APPRAISE) || !S_ISREG(inode->i_mode) || !(inode->i_opflags & IOP_XATTR)) return; action = ima_must_appraise(idmap, inode, MAY_ACCESS, POST_SETATTR); iint = ima_iint_find(inode); if (iint) { set_bit(IMA_CHANGE_ATTR, &iint->atomic_flags); if (!action) clear_bit(IMA_UPDATE_XATTR, &iint->atomic_flags); } } /* * ima_protect_xattr - protect 'security.ima' * * Ensure that not just anyone can modify or remove 'security.ima'. */ static int ima_protect_xattr(struct dentry *dentry, const char *xattr_name, const void *xattr_value, size_t xattr_value_len) { if (strcmp(xattr_name, XATTR_NAME_IMA) == 0) { if (!capable(CAP_SYS_ADMIN)) return -EPERM; return 1; } return 0; } static void ima_reset_appraise_flags(struct inode *inode, int digsig) { struct ima_iint_cache *iint; if (!(ima_policy_flag & IMA_APPRAISE) || !S_ISREG(inode->i_mode)) return; iint = ima_iint_find(inode); if (!iint) return; iint->measured_pcrs = 0; set_bit(IMA_CHANGE_XATTR, &iint->atomic_flags); if (digsig) set_bit(IMA_DIGSIG, &iint->atomic_flags); else clear_bit(IMA_DIGSIG, &iint->atomic_flags); } /** * validate_hash_algo() - Block setxattr with unsupported hash algorithms * @dentry: object of the setxattr() * @xattr_value: userland supplied xattr value * @xattr_value_len: length of xattr_value * * The xattr value is mapped to its hash algorithm, and this algorithm * must be built in the kernel for the setxattr to be allowed. * * Emit an audit message when the algorithm is invalid. * * Return: 0 on success, else an error. 
*/ static int validate_hash_algo(struct dentry *dentry, const struct evm_ima_xattr_data *xattr_value, size_t xattr_value_len) { char *path = NULL, *pathbuf = NULL; enum hash_algo xattr_hash_algo; const char *errmsg = "unavailable-hash-algorithm"; unsigned int allowed_hashes; xattr_hash_algo = ima_get_hash_algo(xattr_value, xattr_value_len); allowed_hashes = atomic_read(&ima_setxattr_allowed_hash_algorithms); if (allowed_hashes) { /* success if the algorithm is allowed in the ima policy */ if (allowed_hashes & (1U << xattr_hash_algo)) return 0; /* * We use a different audit message when the hash algorithm * is denied by a policy rule, instead of not being built * in the kernel image */ errmsg = "denied-hash-algorithm"; } else { if (likely(xattr_hash_algo == ima_hash_algo)) return 0; /* allow any xattr using an algorithm built in the kernel */ if (crypto_has_alg(hash_algo_name[xattr_hash_algo], 0, 0)) return 0; } pathbuf = kmalloc(PATH_MAX, GFP_KERNEL); if (!pathbuf) return -EACCES; path = dentry_path(dentry, pathbuf, PATH_MAX); integrity_audit_msg(AUDIT_INTEGRITY_DATA, d_inode(dentry), path, "set_data", errmsg, -EACCES, 0); kfree(pathbuf); return -EACCES; } static int ima_inode_setxattr(struct mnt_idmap *idmap, struct dentry *dentry, const char *xattr_name, const void *xattr_value, size_t xattr_value_len, int flags) { const struct evm_ima_xattr_data *xvalue = xattr_value; int digsig = 0; int result; int err; result = ima_protect_xattr(dentry, xattr_name, xattr_value, xattr_value_len); if (result == 1) { if (!xattr_value_len || (xvalue->type >= IMA_XATTR_LAST)) return -EINVAL; err = validate_hash_algo(dentry, xvalue, xattr_value_len); if (err) return err; digsig = (xvalue->type == EVM_IMA_XATTR_DIGSIG); } else if (!strcmp(xattr_name, XATTR_NAME_EVM) && xattr_value_len > 0) { digsig = (xvalue->type == EVM_XATTR_PORTABLE_DIGSIG); } if (result == 1 || evm_revalidate_status(xattr_name)) { ima_reset_appraise_flags(d_backing_inode(dentry), digsig); if (result == 1) result = 0; } return result; } static int ima_inode_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name, struct posix_acl *kacl) { if (evm_revalidate_status(acl_name)) ima_reset_appraise_flags(d_backing_inode(dentry), 0); return 0; } static int ima_inode_removexattr(struct mnt_idmap *idmap, struct dentry *dentry, const char *xattr_name) { int result; result = ima_protect_xattr(dentry, xattr_name, NULL, 0); if (result == 1 || evm_revalidate_status(xattr_name)) { ima_reset_appraise_flags(d_backing_inode(dentry), 0); if (result == 1) result = 0; } return result; } static int ima_inode_remove_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name) { return ima_inode_set_acl(idmap, dentry, acl_name, NULL); } static struct security_hook_list ima_appraise_hooks[] __ro_after_init = { LSM_HOOK_INIT(inode_post_setattr, ima_inode_post_setattr), LSM_HOOK_INIT(inode_setxattr, ima_inode_setxattr), LSM_HOOK_INIT(inode_set_acl, ima_inode_set_acl), LSM_HOOK_INIT(inode_removexattr, ima_inode_removexattr), LSM_HOOK_INIT(inode_remove_acl, ima_inode_remove_acl), }; void __init init_ima_appraise_lsm(const struct lsm_id *lsmid) { security_add_hooks(ima_appraise_hooks, ARRAY_SIZE(ima_appraise_hooks), lsmid); }
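/*
 * A minimal userspace sketch (hypothetical helper, not part of this file)
 * of the IMA_XATTR_DIGEST_NG layout that ima_inode_setxattr() and
 * validate_hash_algo() above vet on setxattr(2): one type byte, one
 * hash_algo byte (read back by ima_get_hash_algo()), then the raw digest.
 * Writing 'security.ima' requires CAP_SYS_ADMIN, per ima_protect_xattr().
 * The numeric values mirror enum evm_ima_xattr_type and enum hash_algo
 * from the kernel headers.
 */
#include <stdint.h>
#include <string.h>
#include <sys/xattr.h>

#define IMA_XATTR_DIGEST_NG	0x04	/* enum evm_ima_xattr_type */
#define HASH_ALGO_SHA256	4	/* enum hash_algo */

static int write_ima_sha256_xattr(const char *path, const uint8_t digest[32])
{
	uint8_t buf[2 + 32];

	buf[0] = IMA_XATTR_DIGEST_NG;	/* xattr_value->type */
	buf[1] = HASH_ALGO_SHA256;	/* data[0]: algorithm id */
	memcpy(&buf[2], digest, 32);	/* raw file content hash */
	return setxattr(path, "security.ima", buf, sizeof(buf), 0);
}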
// SPDX-License-Identifier: GPL-2.0-only /* * 9P entry point * * Copyright (C) 2007 by Latchesar Ionkov <lucho@ionkov.net> * Copyright (C) 2004 by Eric Van Hensbergen <ericvh@gmail.com> * Copyright (C) 2002 by Ron Minnich <rminnich@lanl.gov> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/kmod.h> #include <linux/errno.h> #include <linux/sched.h> #include <linux/moduleparam.h> #include <net/9p/9p.h> #include <linux/fs.h> #include <linux/parser.h> #include <net/9p/client.h> #include <net/9p/transport.h> #include <linux/list.h> #include <linux/spinlock.h> #ifdef CONFIG_NET_9P_DEBUG unsigned int p9_debug_level; /* feature-rific global debug level */ EXPORT_SYMBOL(p9_debug_level); module_param_named(debug, p9_debug_level, uint, 0); MODULE_PARM_DESC(debug, "9P debugging level"); void _p9_debug(enum p9_debug_flags level, const char *func, const char *fmt, ...) { struct va_format vaf; va_list args; if ((p9_debug_level & level) != level) return; va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; if (level == P9_DEBUG_9P) pr_notice("(%8.8d) %pV", task_pid_nr(current), &vaf); else pr_notice("-- %s (%d): %pV", func, task_pid_nr(current), &vaf); va_end(args); } EXPORT_SYMBOL(_p9_debug); #endif /* Dynamic Transport Registration Routines */ static DEFINE_SPINLOCK(v9fs_trans_lock); static LIST_HEAD(v9fs_trans_list); /** * v9fs_register_trans - register a new transport with 9p * @m: structure describing the transport module and entry points * */ void v9fs_register_trans(struct p9_trans_module *m) { spin_lock(&v9fs_trans_lock); list_add_tail(&m->list, &v9fs_trans_list); spin_unlock(&v9fs_trans_lock); } EXPORT_SYMBOL(v9fs_register_trans); /** * v9fs_unregister_trans - unregister a 9p transport * @m: the transport to remove * */ void v9fs_unregister_trans(struct p9_trans_module *m) { spin_lock(&v9fs_trans_lock); list_del_init(&m->list); spin_unlock(&v9fs_trans_lock); } EXPORT_SYMBOL(v9fs_unregister_trans); static struct p9_trans_module *_p9_get_trans_by_name(const char *s) { struct p9_trans_module *t, *found = NULL; spin_lock(&v9fs_trans_lock); list_for_each_entry(t, &v9fs_trans_list, list) if (strcmp(t->name, s) == 0 && try_module_get(t->owner)) { found = t; break; } spin_unlock(&v9fs_trans_lock); return found; } /** * v9fs_get_trans_by_name - get transport with the matching name * @s: string identifying transport * */ struct p9_trans_module *v9fs_get_trans_by_name(const char *s) { struct p9_trans_module *found = NULL; found = _p9_get_trans_by_name(s); #ifdef CONFIG_MODULES if (!found) { request_module("9p-%s", s); found = _p9_get_trans_by_name(s); } #endif return found; } EXPORT_SYMBOL(v9fs_get_trans_by_name); static const char * const v9fs_default_transports[] = { "virtio", "tcp", "fd", "unix",
"xen", "rdma", }; /** * v9fs_get_default_trans - get the default transport * */ struct p9_trans_module *v9fs_get_default_trans(void) { struct p9_trans_module *t, *found = NULL; int i; spin_lock(&v9fs_trans_lock); list_for_each_entry(t, &v9fs_trans_list, list) if (t->def && try_module_get(t->owner)) { found = t; break; } if (!found) list_for_each_entry(t, &v9fs_trans_list, list) if (try_module_get(t->owner)) { found = t; break; } spin_unlock(&v9fs_trans_lock); for (i = 0; !found && i < ARRAY_SIZE(v9fs_default_transports); i++) found = v9fs_get_trans_by_name(v9fs_default_transports[i]); return found; } EXPORT_SYMBOL(v9fs_get_default_trans); /** * v9fs_put_trans - put trans * @m: transport to put * */ void v9fs_put_trans(struct p9_trans_module *m) { if (m) module_put(m->owner); } /** * init_p9 - Initialize module * */ static int __init init_p9(void) { int ret; ret = p9_client_init(); if (ret) return ret; p9_error_init(); pr_info("Installing 9P2000 support\n"); return ret; } /** * exit_p9 - shutdown module * */ static void __exit exit_p9(void) { pr_info("Unloading 9P2000 support\n"); p9_client_exit(); } module_init(init_p9) module_exit(exit_p9) MODULE_AUTHOR("Latchesar Ionkov <lucho@ionkov.net>"); MODULE_AUTHOR("Eric Van Hensbergen <ericvh@gmail.com>"); MODULE_AUTHOR("Ron Minnich <rminnich@lanl.gov>"); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Plan 9 Resource Sharing Support (9P2000)");
// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/ext4/readpage.c * * Copyright (C) 2002, Linus Torvalds. * Copyright (C) 2015, Google, Inc. * * This was originally taken from fs/mpage.c * * The ext4_mpage_readpages() function here is intended to * replace mpage_readahead() in the general case, not just for * encrypted files. It has some limitations (see below), where it * will fall back to read_block_full_page(), but these limitations * should only be hit when page_size != block_size. * * This will allow us to attach a callback function to support ext4 * encryption. * * If anything unusual happens, such as: * * - encountering a page which has buffers * - encountering a page which has a non-hole after a hole * - encountering a page with non-contiguous blocks * * then this code just gives up and calls the buffer_head-based read function. * It does handle a page which has holes at the end - that is a common case: * the end-of-file on blocksize < PAGE_SIZE setups.
* */ #include <linux/kernel.h> #include <linux/export.h> #include <linux/mm.h> #include <linux/kdev_t.h> #include <linux/gfp.h> #include <linux/bio.h> #include <linux/fs.h> #include <linux/buffer_head.h> #include <linux/blkdev.h> #include <linux/highmem.h> #include <linux/prefetch.h> #include <linux/mpage.h> #include <linux/writeback.h> #include <linux/backing-dev.h> #include <linux/pagevec.h> #include "ext4.h" #define NUM_PREALLOC_POST_READ_CTXS 128 static struct kmem_cache *bio_post_read_ctx_cache; static mempool_t *bio_post_read_ctx_pool; /* postprocessing steps for read bios */ enum bio_post_read_step { STEP_INITIAL = 0, STEP_DECRYPT, STEP_VERITY, STEP_MAX, }; struct bio_post_read_ctx { struct bio *bio; struct work_struct work; unsigned int cur_step; unsigned int enabled_steps; }; static void __read_end_io(struct bio *bio) { struct folio_iter fi; bio_for_each_folio_all(fi, bio) folio_end_read(fi.folio, bio->bi_status == 0); if (bio->bi_private) mempool_free(bio->bi_private, bio_post_read_ctx_pool); bio_put(bio); } static void bio_post_read_processing(struct bio_post_read_ctx *ctx); static void decrypt_work(struct work_struct *work) { struct bio_post_read_ctx *ctx = container_of(work, struct bio_post_read_ctx, work); struct bio *bio = ctx->bio; if (fscrypt_decrypt_bio(bio)) bio_post_read_processing(ctx); else __read_end_io(bio); } static void verity_work(struct work_struct *work) { struct bio_post_read_ctx *ctx = container_of(work, struct bio_post_read_ctx, work); struct bio *bio = ctx->bio; /* * fsverity_verify_bio() may call readahead() again, and although verity * will be disabled for that, decryption may still be needed, causing * another bio_post_read_ctx to be allocated. So to guarantee that * mempool_alloc() never deadlocks we must free the current ctx first. * This is safe because verity is the last post-read step. */ BUILD_BUG_ON(STEP_VERITY + 1 != STEP_MAX); mempool_free(ctx, bio_post_read_ctx_pool); bio->bi_private = NULL; fsverity_verify_bio(bio); __read_end_io(bio); } static void bio_post_read_processing(struct bio_post_read_ctx *ctx) { /* * We use different work queues for decryption and for verity because * verity may require reading metadata pages that need decryption, and * we shouldn't recurse to the same workqueue. */ switch (++ctx->cur_step) { case STEP_DECRYPT: if (ctx->enabled_steps & (1 << STEP_DECRYPT)) { INIT_WORK(&ctx->work, decrypt_work); fscrypt_enqueue_decrypt_work(&ctx->work); return; } ctx->cur_step++; fallthrough; case STEP_VERITY: if (ctx->enabled_steps & (1 << STEP_VERITY)) { INIT_WORK(&ctx->work, verity_work); fsverity_enqueue_verify_work(&ctx->work); return; } ctx->cur_step++; fallthrough; default: __read_end_io(ctx->bio); } } static bool bio_post_read_required(struct bio *bio) { return bio->bi_private && !bio->bi_status; } /* * I/O completion handler for multipage BIOs. * * The mpage code never puts partial pages into a BIO (except for end-of-file). * If a page does not map to a contiguous run of blocks then it simply falls * back to block_read_full_folio(). * * Why is this? If a page's completion depends on a number of different BIOs * which can complete in any order (or at the same time) then determining the * status of that page is hard. See end_buffer_async_read() for the details. * There is no point in duplicating all that complexity. 
*/ static void mpage_end_io(struct bio *bio) { if (bio_post_read_required(bio)) { struct bio_post_read_ctx *ctx = bio->bi_private; ctx->cur_step = STEP_INITIAL; bio_post_read_processing(ctx); return; } __read_end_io(bio); } static inline bool ext4_need_verity(const struct inode *inode, pgoff_t idx) { return fsverity_active(inode) && idx < DIV_ROUND_UP(inode->i_size, PAGE_SIZE); } static void ext4_set_bio_post_read_ctx(struct bio *bio, const struct inode *inode, pgoff_t first_idx) { unsigned int post_read_steps = 0; if (fscrypt_inode_uses_fs_layer_crypto(inode)) post_read_steps |= 1 << STEP_DECRYPT; if (ext4_need_verity(inode, first_idx)) post_read_steps |= 1 << STEP_VERITY; if (post_read_steps) { /* Due to the mempool, this never fails. */ struct bio_post_read_ctx *ctx = mempool_alloc(bio_post_read_ctx_pool, GFP_NOFS); ctx->bio = bio; ctx->enabled_steps = post_read_steps; bio->bi_private = ctx; } } static inline loff_t ext4_readpage_limit(struct inode *inode) { if (IS_ENABLED(CONFIG_FS_VERITY) && IS_VERITY(inode)) return inode->i_sb->s_maxbytes; return i_size_read(inode); } int ext4_mpage_readpages(struct inode *inode, struct readahead_control *rac, struct folio *folio) { struct bio *bio = NULL; sector_t last_block_in_bio = 0; const unsigned blkbits = inode->i_blkbits; const unsigned blocks_per_page = PAGE_SIZE >> blkbits; const unsigned blocksize = 1 << blkbits; sector_t next_block; sector_t block_in_file; sector_t last_block; sector_t last_block_in_file; sector_t first_block; unsigned page_block; struct block_device *bdev = inode->i_sb->s_bdev; int length; unsigned relative_block = 0; struct ext4_map_blocks map; unsigned int nr_pages, folio_pages; map.m_pblk = 0; map.m_lblk = 0; map.m_len = 0; map.m_flags = 0; nr_pages = rac ? readahead_count(rac) : folio_nr_pages(folio); for (; nr_pages; nr_pages -= folio_pages) { int fully_mapped = 1; unsigned int first_hole; unsigned int blocks_per_folio; if (rac) folio = readahead_folio(rac); folio_pages = folio_nr_pages(folio); prefetchw(&folio->flags); if (folio_buffers(folio)) goto confused; blocks_per_folio = folio_size(folio) >> blkbits; first_hole = blocks_per_folio; block_in_file = next_block = (sector_t)folio->index << (PAGE_SHIFT - blkbits); last_block = block_in_file + nr_pages * blocks_per_page; last_block_in_file = (ext4_readpage_limit(inode) + blocksize - 1) >> blkbits; if (last_block > last_block_in_file) last_block = last_block_in_file; page_block = 0; /* * Map blocks using the previous result first. */ if ((map.m_flags & EXT4_MAP_MAPPED) && block_in_file > map.m_lblk && block_in_file < (map.m_lblk + map.m_len)) { unsigned map_offset = block_in_file - map.m_lblk; unsigned last = map.m_len - map_offset; first_block = map.m_pblk + map_offset; for (relative_block = 0; ; relative_block++) { if (relative_block == last) { /* needed? */ map.m_flags &= ~EXT4_MAP_MAPPED; break; } if (page_block == blocks_per_folio) break; page_block++; block_in_file++; } } /* * Then do more ext4_map_blocks() calls until we are * done with this folio. 
*/ while (page_block < blocks_per_folio) { if (block_in_file < last_block) { map.m_lblk = block_in_file; map.m_len = last_block - block_in_file; if (ext4_map_blocks(NULL, inode, &map, 0) < 0) { set_error_page: folio_zero_segment(folio, 0, folio_size(folio)); folio_unlock(folio); goto next_page; } } if ((map.m_flags & EXT4_MAP_MAPPED) == 0) { fully_mapped = 0; if (first_hole == blocks_per_folio) first_hole = page_block; page_block++; block_in_file++; continue; } if (first_hole != blocks_per_folio) goto confused; /* hole -> non-hole */ /* Contiguous blocks? */ if (!page_block) first_block = map.m_pblk; else if (first_block + page_block != map.m_pblk) goto confused; for (relative_block = 0; ; relative_block++) { if (relative_block == map.m_len) { /* needed? */ map.m_flags &= ~EXT4_MAP_MAPPED; break; } else if (page_block == blocks_per_folio) break; page_block++; block_in_file++; } } if (first_hole != blocks_per_folio) { folio_zero_segment(folio, first_hole << blkbits, folio_size(folio)); if (first_hole == 0) { if (ext4_need_verity(inode, folio->index) && !fsverity_verify_folio(folio)) goto set_error_page; folio_end_read(folio, true); continue; } } else if (fully_mapped) { folio_set_mappedtodisk(folio); } /* * This folio will go to BIO. Do we need to send this * BIO off first? */ if (bio && (last_block_in_bio != first_block - 1 || !fscrypt_mergeable_bio(bio, inode, next_block))) { submit_and_realloc: submit_bio(bio); bio = NULL; } if (bio == NULL) { /* * bio_alloc will _always_ be able to allocate a bio if * __GFP_DIRECT_RECLAIM is set, see bio_alloc_bioset(). */ bio = bio_alloc(bdev, bio_max_segs(nr_pages), REQ_OP_READ, GFP_KERNEL); fscrypt_set_bio_crypt_ctx(bio, inode, next_block, GFP_KERNEL); ext4_set_bio_post_read_ctx(bio, inode, folio->index); bio->bi_iter.bi_sector = first_block << (blkbits - 9); bio->bi_end_io = mpage_end_io; if (rac) bio->bi_opf |= REQ_RAHEAD; } length = first_hole << blkbits; if (!bio_add_folio(bio, folio, length, 0)) goto submit_and_realloc; if (((map.m_flags & EXT4_MAP_BOUNDARY) && (relative_block == map.m_len)) || (first_hole != blocks_per_folio)) { submit_bio(bio); bio = NULL; } else last_block_in_bio = first_block + blocks_per_folio - 1; continue; confused: if (bio) { submit_bio(bio); bio = NULL; } if (!folio_test_uptodate(folio)) block_read_full_folio(folio, ext4_get_block); else folio_unlock(folio); next_page: ; /* A label shall be followed by a statement until C23 */ } if (bio) submit_bio(bio); return 0; } int __init ext4_init_post_read_processing(void) { bio_post_read_ctx_cache = KMEM_CACHE(bio_post_read_ctx, SLAB_RECLAIM_ACCOUNT); if (!bio_post_read_ctx_cache) goto fail; bio_post_read_ctx_pool = mempool_create_slab_pool(NUM_PREALLOC_POST_READ_CTXS, bio_post_read_ctx_cache); if (!bio_post_read_ctx_pool) goto fail_free_cache; return 0; fail_free_cache: kmem_cache_destroy(bio_post_read_ctx_cache); fail: return -ENOMEM; } void ext4_exit_post_read_processing(void) { mempool_destroy(bio_post_read_ctx_pool); kmem_cache_destroy(bio_post_read_ctx_cache); }
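/*
 * A hedged sketch (hypothetical ext4_demo_* wrappers, modelled on the
 * real callers in fs/ext4/inode.c) of how ext4_mpage_readpages() above
 * is driven from the address_space operations: the readahead path
 * passes a readahead_control and no folio, while a single-folio read
 * passes the folio and a NULL rac, matching the
 * "nr_pages = rac ? readahead_count(rac) : folio_nr_pages(folio)"
 * logic above. Assumes the includes already pulled in by this file.
 */
static void ext4_demo_readahead(struct readahead_control *rac)
{
	struct inode *inode = rac->mapping->host;

	ext4_mpage_readpages(inode, rac, NULL);	/* whole readahead batch */
}

static int ext4_demo_read_folio(struct file *file, struct folio *folio)
{
	struct inode *inode = folio->mapping->host;

	return ext4_mpage_readpages(inode, NULL, folio);	/* one folio */
}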
// SPDX-License-Identifier: GPL-2.0-or-later /* * IPv6 tunneling device * Linux INET6 implementation * * Authors: * Ville Nuorvala <vnuorval@tcs.hut.fi> * Yasuyuki Kozakai <kozakai@linux-ipv6.org> * * Based on: * linux/net/ipv6/sit.c and linux/net/ipv4/ipip.c * * RFC 2473 */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/capability.h> #include <linux/errno.h> #include <linux/types.h> #include <linux/sockios.h> #include <linux/icmp.h> #include <linux/if.h> #include <linux/in.h> #include <linux/ip.h> #include <linux/net.h> #include <linux/in6.h> #include <linux/netdevice.h> #include <linux/if_arp.h> #include <linux/icmpv6.h> #include <linux/init.h> #include <linux/route.h> #include <linux/rtnetlink.h> #include <linux/netfilter_ipv6.h> #include <linux/slab.h> #include <linux/hash.h> #include <linux/etherdevice.h> #include <linux/uaccess.h> #include <linux/atomic.h> #include <net/icmp.h> #include <net/ip.h> #include <net/ip_tunnels.h> #include <net/ipv6.h> #include <net/ip6_route.h> #include <net/addrconf.h> #include <net/ip6_tunnel.h> #include <net/xfrm.h> #include <net/dsfield.h> #include <net/inet_ecn.h> #include <net/net_namespace.h> #include <net/netns/generic.h> #include <net/netdev_lock.h> #include <net/dst_metadata.h> #include <net/inet_dscp.h> MODULE_AUTHOR("Ville Nuorvala"); MODULE_DESCRIPTION("IPv6 tunneling device"); MODULE_LICENSE("GPL"); MODULE_ALIAS_RTNL_LINK("ip6tnl"); MODULE_ALIAS_NETDEV("ip6tnl0"); #define IP6_TUNNEL_HASH_SIZE_SHIFT 5 #define IP6_TUNNEL_HASH_SIZE (1 << IP6_TUNNEL_HASH_SIZE_SHIFT) static bool log_ecn_error = true; module_param(log_ecn_error, bool, 0644); MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN"); static u32 HASH(const struct in6_addr *addr1, const struct in6_addr *addr2) { u32 hash = ipv6_addr_hash(addr1) ^ ipv6_addr_hash(addr2); return hash_32(hash, IP6_TUNNEL_HASH_SIZE_SHIFT); } static int ip6_tnl_dev_init(struct net_device *dev); static void ip6_tnl_dev_setup(struct net_device *dev); static struct rtnl_link_ops ip6_link_ops __read_mostly; static unsigned int ip6_tnl_net_id __read_mostly; struct ip6_tnl_net { /* the IPv6 tunnel fallback device */ struct net_device *fb_tnl_dev; /* lists for storing tunnels in use */ struct ip6_tnl __rcu *tnls_r_l[IP6_TUNNEL_HASH_SIZE]; struct ip6_tnl __rcu *tnls_wc[1]; struct ip6_tnl __rcu **tnls[2]; struct ip6_tnl __rcu *collect_md_tun; }; static inline int ip6_tnl_mpls_supported(void) { return IS_ENABLED(CONFIG_MPLS); } #define for_each_ip6_tunnel_rcu(start) \ for (t = rcu_dereference(start); t; t = rcu_dereference(t->next)) /** * ip6_tnl_lookup
- fetch tunnel matching the end-point addresses * @net: network namespace * @link: ifindex of underlying interface * @remote: the address of the tunnel exit-point * @local: the address of the tunnel entry-point * * Return: * tunnel matching given end-points if found, * else fallback tunnel if its device is up, * else %NULL **/ static struct ip6_tnl * ip6_tnl_lookup(struct net *net, int link, const struct in6_addr *remote, const struct in6_addr *local) { unsigned int hash = HASH(remote, local); struct ip6_tnl *t, *cand = NULL; struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); struct in6_addr any; for_each_ip6_tunnel_rcu(ip6n->tnls_r_l[hash]) { if (!ipv6_addr_equal(local, &t->parms.laddr) || !ipv6_addr_equal(remote, &t->parms.raddr) || !(t->dev->flags & IFF_UP)) continue; if (link == t->parms.link) return t; else cand = t; } memset(&any, 0, sizeof(any)); hash = HASH(&any, local); for_each_ip6_tunnel_rcu(ip6n->tnls_r_l[hash]) { if (!ipv6_addr_equal(local, &t->parms.laddr) || !ipv6_addr_any(&t->parms.raddr) || !(t->dev->flags & IFF_UP)) continue; if (link == t->parms.link) return t; else if (!cand) cand = t; } hash = HASH(remote, &any); for_each_ip6_tunnel_rcu(ip6n->tnls_r_l[hash]) { if (!ipv6_addr_equal(remote, &t->parms.raddr) || !ipv6_addr_any(&t->parms.laddr) || !(t->dev->flags & IFF_UP)) continue; if (link == t->parms.link) return t; else if (!cand) cand = t; } if (cand) return cand; t = rcu_dereference(ip6n->collect_md_tun); if (t && t->dev->flags & IFF_UP) return t; t = rcu_dereference(ip6n->tnls_wc[0]); if (t && (t->dev->flags & IFF_UP)) return t; return NULL; } /** * ip6_tnl_bucket - get head of list matching given tunnel parameters * @ip6n: the private data for ip6_vti in the netns * @p: parameters containing tunnel end-points * * Description: * ip6_tnl_bucket() returns the head of the list matching the * &struct in6_addr entries laddr and raddr in @p. 
* * Return: head of IPv6 tunnel list **/ static struct ip6_tnl __rcu ** ip6_tnl_bucket(struct ip6_tnl_net *ip6n, const struct __ip6_tnl_parm *p) { const struct in6_addr *remote = &p->raddr; const struct in6_addr *local = &p->laddr; unsigned int h = 0; int prio = 0; if (!ipv6_addr_any(remote) || !ipv6_addr_any(local)) { prio = 1; h = HASH(remote, local); } return &ip6n->tnls[prio][h]; } /** * ip6_tnl_link - add tunnel to hash table * @ip6n: the private data for ip6_vti in the netns * @t: tunnel to be added **/ static void ip6_tnl_link(struct ip6_tnl_net *ip6n, struct ip6_tnl *t) { struct ip6_tnl __rcu **tp = ip6_tnl_bucket(ip6n, &t->parms); if (t->parms.collect_md) rcu_assign_pointer(ip6n->collect_md_tun, t); rcu_assign_pointer(t->next , rtnl_dereference(*tp)); rcu_assign_pointer(*tp, t); } /** * ip6_tnl_unlink - remove tunnel from hash table * @ip6n: the private data for ip6_vti in the netns * @t: tunnel to be removed **/ static void ip6_tnl_unlink(struct ip6_tnl_net *ip6n, struct ip6_tnl *t) { struct ip6_tnl __rcu **tp; struct ip6_tnl *iter; if (t->parms.collect_md) rcu_assign_pointer(ip6n->collect_md_tun, NULL); for (tp = ip6_tnl_bucket(ip6n, &t->parms); (iter = rtnl_dereference(*tp)) != NULL; tp = &iter->next) { if (t == iter) { rcu_assign_pointer(*tp, t->next); break; } } } static void ip6_dev_free(struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); gro_cells_destroy(&t->gro_cells); dst_cache_destroy(&t->dst_cache); } static int ip6_tnl_create2(struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); struct ip6_tnl_net *ip6n = net_generic(t->net, ip6_tnl_net_id); int err; dev->rtnl_link_ops = &ip6_link_ops; err = register_netdevice(dev); if (err < 0) goto out; strcpy(t->parms.name, dev->name); ip6_tnl_link(ip6n, t); return 0; out: return err; } /** * ip6_tnl_create - create a new tunnel * @net: network namespace * @p: tunnel parameters * * Description: * Create tunnel matching given parameters. * * Return: * created tunnel or error pointer **/ static struct ip6_tnl *ip6_tnl_create(struct net *net, struct __ip6_tnl_parm *p) { struct net_device *dev; struct ip6_tnl *t; char name[IFNAMSIZ]; int err = -E2BIG; if (p->name[0]) { if (!dev_valid_name(p->name)) goto failed; strscpy(name, p->name, IFNAMSIZ); } else { sprintf(name, "ip6tnl%%d"); } err = -ENOMEM; dev = alloc_netdev(sizeof(*t), name, NET_NAME_UNKNOWN, ip6_tnl_dev_setup); if (!dev) goto failed; dev_net_set(dev, net); t = netdev_priv(dev); t->parms = *p; t->net = dev_net(dev); err = ip6_tnl_create2(dev); if (err < 0) goto failed_free; return t; failed_free: free_netdev(dev); failed: return ERR_PTR(err); } /** * ip6_tnl_locate - find or create tunnel matching given parameters * @net: network namespace * @p: tunnel parameters * @create: != 0 if allowed to create new tunnel if no match found * * Description: * ip6_tnl_locate() first tries to locate an existing tunnel * based on @parms. If this is unsuccessful, but @create is set a new * tunnel device is created and registered for use. 
* * Return: * matching tunnel or error pointer **/ static struct ip6_tnl *ip6_tnl_locate(struct net *net, struct __ip6_tnl_parm *p, int create) { const struct in6_addr *remote = &p->raddr; const struct in6_addr *local = &p->laddr; struct ip6_tnl __rcu **tp; struct ip6_tnl *t; struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); for (tp = ip6_tnl_bucket(ip6n, p); (t = rtnl_dereference(*tp)) != NULL; tp = &t->next) { if (ipv6_addr_equal(local, &t->parms.laddr) && ipv6_addr_equal(remote, &t->parms.raddr) && p->link == t->parms.link) { if (create) return ERR_PTR(-EEXIST); return t; } } if (!create) return ERR_PTR(-ENODEV); return ip6_tnl_create(net, p); } /** * ip6_tnl_dev_uninit - tunnel device uninitializer * @dev: the device to be destroyed * * Description: * ip6_tnl_dev_uninit() removes tunnel from its list **/ static void ip6_tnl_dev_uninit(struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); struct net *net = t->net; struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); if (dev == ip6n->fb_tnl_dev) RCU_INIT_POINTER(ip6n->tnls_wc[0], NULL); else ip6_tnl_unlink(ip6n, t); dst_cache_reset(&t->dst_cache); netdev_put(dev, &t->dev_tracker); } /** * ip6_tnl_parse_tlv_enc_lim - handle encapsulation limit option * @skb: received socket buffer * @raw: the ICMPv6 error message data * * Return: * 0 if none was found, * else index to encapsulation limit **/ __u16 ip6_tnl_parse_tlv_enc_lim(struct sk_buff *skb, __u8 *raw) { const struct ipv6hdr *ipv6h = (const struct ipv6hdr *)raw; unsigned int nhoff = raw - skb->data; unsigned int off = nhoff + sizeof(*ipv6h); u8 nexthdr = ipv6h->nexthdr; while (ipv6_ext_hdr(nexthdr) && nexthdr != NEXTHDR_NONE) { struct ipv6_opt_hdr *hdr; u16 optlen; if (!pskb_may_pull(skb, off + sizeof(*hdr))) break; hdr = (struct ipv6_opt_hdr *)(skb->data + off); if (nexthdr == NEXTHDR_FRAGMENT) { optlen = 8; } else if (nexthdr == NEXTHDR_AUTH) { optlen = ipv6_authlen(hdr); } else { optlen = ipv6_optlen(hdr); } if (!pskb_may_pull(skb, off + optlen)) break; hdr = (struct ipv6_opt_hdr *)(skb->data + off); if (nexthdr == NEXTHDR_FRAGMENT) { struct frag_hdr *frag_hdr = (struct frag_hdr *)hdr; if (frag_hdr->frag_off) break; } if (nexthdr == NEXTHDR_DEST) { u16 i = 2; while (1) { struct ipv6_tlv_tnl_enc_lim *tel; /* No more room for encapsulation limit */ if (i + sizeof(*tel) > optlen) break; tel = (struct ipv6_tlv_tnl_enc_lim *)(skb->data + off + i); /* return index of option if found and valid */ if (tel->type == IPV6_TLV_TNL_ENCAP_LIMIT && tel->length == 1) return i + off - nhoff; /* else jump to next option */ if (tel->type) i += tel->length + 2; else i++; } } nexthdr = hdr->nexthdr; off += optlen; } return 0; } EXPORT_SYMBOL(ip6_tnl_parse_tlv_enc_lim); /* ip6_tnl_err() should handle errors in the tunnel according to the * specifications in RFC 2473. */ static int ip6_tnl_err(struct sk_buff *skb, __u8 ipproto, struct inet6_skb_parm *opt, u8 *type, u8 *code, int *msg, __u32 *info, int offset) { const struct ipv6hdr *ipv6h = (const struct ipv6hdr *)skb->data; struct net *net = dev_net(skb->dev); u8 rel_type = ICMPV6_DEST_UNREACH; u8 rel_code = ICMPV6_ADDR_UNREACH; __u32 rel_info = 0; struct ip6_tnl *t; int err = -ENOENT; int rel_msg = 0; u8 tproto; __u16 len; /* If the packet doesn't contain the original IPv6 header we are in trouble since we might need the source address for further processing of the error. 
*/ rcu_read_lock(); t = ip6_tnl_lookup(dev_net(skb->dev), skb->dev->ifindex, &ipv6h->daddr, &ipv6h->saddr); if (!t) goto out; tproto = READ_ONCE(t->parms.proto); if (tproto != ipproto && tproto != 0) goto out; err = 0; switch (*type) { case ICMPV6_DEST_UNREACH: net_dbg_ratelimited("%s: Path to destination invalid or inactive!\n", t->parms.name); rel_msg = 1; break; case ICMPV6_TIME_EXCEED: if ((*code) == ICMPV6_EXC_HOPLIMIT) { net_dbg_ratelimited("%s: Too small hop limit or routing loop in tunnel!\n", t->parms.name); rel_msg = 1; } break; case ICMPV6_PARAMPROB: { struct ipv6_tlv_tnl_enc_lim *tel; __u32 teli; teli = 0; if ((*code) == ICMPV6_HDR_FIELD) teli = ip6_tnl_parse_tlv_enc_lim(skb, skb->data); if (teli && teli == *info - 2) { tel = (struct ipv6_tlv_tnl_enc_lim *) &skb->data[teli]; if (tel->encap_limit == 0) { net_dbg_ratelimited("%s: Too small encapsulation limit or routing loop in tunnel!\n", t->parms.name); rel_msg = 1; } } else { net_dbg_ratelimited("%s: Recipient unable to parse tunneled packet!\n", t->parms.name); } break; } case ICMPV6_PKT_TOOBIG: { __u32 mtu; ip6_update_pmtu(skb, net, htonl(*info), 0, 0, sock_net_uid(net, NULL)); mtu = *info - offset; if (mtu < IPV6_MIN_MTU) mtu = IPV6_MIN_MTU; len = sizeof(*ipv6h) + ntohs(ipv6h->payload_len); if (len > mtu) { rel_type = ICMPV6_PKT_TOOBIG; rel_code = 0; rel_info = mtu; rel_msg = 1; } break; } case NDISC_REDIRECT: ip6_redirect(skb, net, skb->dev->ifindex, 0, sock_net_uid(net, NULL)); break; } *type = rel_type; *code = rel_code; *info = rel_info; *msg = rel_msg; out: rcu_read_unlock(); return err; } static int ip4ip6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, u8 type, u8 code, int offset, __be32 info) { __u32 rel_info = ntohl(info); const struct iphdr *eiph; struct sk_buff *skb2; int err, rel_msg = 0; u8 rel_type = type; u8 rel_code = code; struct rtable *rt; struct flowi4 fl4; err = ip6_tnl_err(skb, IPPROTO_IPIP, opt, &rel_type, &rel_code, &rel_msg, &rel_info, offset); if (err < 0) return err; if (rel_msg == 0) return 0; switch (rel_type) { case ICMPV6_DEST_UNREACH: if (rel_code != ICMPV6_ADDR_UNREACH) return 0; rel_type = ICMP_DEST_UNREACH; rel_code = ICMP_HOST_UNREACH; break; case ICMPV6_PKT_TOOBIG: if (rel_code != 0) return 0; rel_type = ICMP_DEST_UNREACH; rel_code = ICMP_FRAG_NEEDED; break; default: return 0; } if (!pskb_may_pull(skb, offset + sizeof(struct iphdr))) return 0; skb2 = skb_clone(skb, GFP_ATOMIC); if (!skb2) return 0; skb_dst_drop(skb2); skb_pull(skb2, offset); skb_reset_network_header(skb2); eiph = ip_hdr(skb2); /* Try to guess incoming interface */ rt = ip_route_output_ports(dev_net(skb->dev), &fl4, NULL, eiph->saddr, 0, 0, 0, IPPROTO_IPIP, eiph->tos & INET_DSCP_MASK, 0); if (IS_ERR(rt)) goto out; skb2->dev = rt->dst.dev; ip_rt_put(rt); /* route "incoming" packet */ if (rt->rt_flags & RTCF_LOCAL) { rt = ip_route_output_ports(dev_net(skb->dev), &fl4, NULL, eiph->daddr, eiph->saddr, 0, 0, IPPROTO_IPIP, eiph->tos & INET_DSCP_MASK, 0); if (IS_ERR(rt) || rt->dst.dev->type != ARPHRD_TUNNEL6) { if (!IS_ERR(rt)) ip_rt_put(rt); goto out; } skb_dst_set(skb2, &rt->dst); } else { if (ip_route_input(skb2, eiph->daddr, eiph->saddr, ip4h_dscp(eiph), skb2->dev) || skb_dst_dev(skb2)->type != ARPHRD_TUNNEL6) goto out; } /* change mtu on this route */ if (rel_type == ICMP_DEST_UNREACH && rel_code == ICMP_FRAG_NEEDED) { if (rel_info > dst_mtu(skb_dst(skb2))) goto out; skb_dst_update_pmtu_no_confirm(skb2, rel_info); } icmp_send(skb2, rel_type, rel_code, htonl(rel_info)); out: kfree_skb(skb2); return 0; } static int 
ip6ip6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, u8 type, u8 code, int offset, __be32 info) { __u32 rel_info = ntohl(info); int err, rel_msg = 0; u8 rel_type = type; u8 rel_code = code; err = ip6_tnl_err(skb, IPPROTO_IPV6, opt, &rel_type, &rel_code, &rel_msg, &rel_info, offset); if (err < 0) return err; if (rel_msg && pskb_may_pull(skb, offset + sizeof(struct ipv6hdr))) { struct rt6_info *rt; struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); if (!skb2) return 0; skb_dst_drop(skb2); skb_pull(skb2, offset); skb_reset_network_header(skb2); /* Try to guess incoming interface */ rt = rt6_lookup(dev_net(skb->dev), &ipv6_hdr(skb2)->saddr, NULL, 0, skb2, 0); if (rt && rt->dst.dev) skb2->dev = rt->dst.dev; icmpv6_send(skb2, rel_type, rel_code, rel_info); ip6_rt_put(rt); kfree_skb(skb2); } return 0; } static int mplsip6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, u8 type, u8 code, int offset, __be32 info) { __u32 rel_info = ntohl(info); int err, rel_msg = 0; u8 rel_type = type; u8 rel_code = code; err = ip6_tnl_err(skb, IPPROTO_MPLS, opt, &rel_type, &rel_code, &rel_msg, &rel_info, offset); return err; } static int ip4ip6_dscp_ecn_decapsulate(const struct ip6_tnl *t, const struct ipv6hdr *ipv6h, struct sk_buff *skb) { __u8 dsfield = ipv6_get_dsfield(ipv6h) & ~INET_ECN_MASK; if (t->parms.flags & IP6_TNL_F_RCV_DSCP_COPY) ipv4_change_dsfield(ip_hdr(skb), INET_ECN_MASK, dsfield); return IP6_ECN_decapsulate(ipv6h, skb); } static int ip6ip6_dscp_ecn_decapsulate(const struct ip6_tnl *t, const struct ipv6hdr *ipv6h, struct sk_buff *skb) { if (t->parms.flags & IP6_TNL_F_RCV_DSCP_COPY) ipv6_copy_dscp(ipv6_get_dsfield(ipv6h), ipv6_hdr(skb)); return IP6_ECN_decapsulate(ipv6h, skb); } static inline int mplsip6_dscp_ecn_decapsulate(const struct ip6_tnl *t, const struct ipv6hdr *ipv6h, struct sk_buff *skb) { /* ECN is not supported in AF_MPLS */ return 0; } __u32 ip6_tnl_get_cap(struct ip6_tnl *t, const struct in6_addr *laddr, const struct in6_addr *raddr) { struct __ip6_tnl_parm *p = &t->parms; int ltype = ipv6_addr_type(laddr); int rtype = ipv6_addr_type(raddr); __u32 flags = 0; if (ltype == IPV6_ADDR_ANY || rtype == IPV6_ADDR_ANY) { flags = IP6_TNL_F_CAP_PER_PACKET; } else if (ltype & (IPV6_ADDR_UNICAST|IPV6_ADDR_MULTICAST) && rtype & (IPV6_ADDR_UNICAST|IPV6_ADDR_MULTICAST) && !((ltype|rtype) & IPV6_ADDR_LOOPBACK) && (!((ltype|rtype) & IPV6_ADDR_LINKLOCAL) || p->link)) { if (ltype&IPV6_ADDR_UNICAST) flags |= IP6_TNL_F_CAP_XMIT; if (rtype&IPV6_ADDR_UNICAST) flags |= IP6_TNL_F_CAP_RCV; } return flags; } EXPORT_SYMBOL(ip6_tnl_get_cap); /* called with rcu_read_lock() */ int ip6_tnl_rcv_ctl(struct ip6_tnl *t, const struct in6_addr *laddr, const struct in6_addr *raddr) { struct __ip6_tnl_parm *p = &t->parms; int ret = 0; struct net *net = t->net; if ((p->flags & IP6_TNL_F_CAP_RCV) || ((p->flags & IP6_TNL_F_CAP_PER_PACKET) && (ip6_tnl_get_cap(t, laddr, raddr) & IP6_TNL_F_CAP_RCV))) { struct net_device *ldev = NULL; if (p->link) ldev = dev_get_by_index_rcu(net, p->link); if ((ipv6_addr_is_multicast(laddr) || likely(ipv6_chk_addr_and_flags(net, laddr, ldev, false, 0, IFA_F_TENTATIVE))) && ((p->flags & IP6_TNL_F_ALLOW_LOCAL_REMOTE) || likely(!ipv6_chk_addr_and_flags(net, raddr, ldev, true, 0, IFA_F_TENTATIVE)))) ret = 1; } return ret; } EXPORT_SYMBOL_GPL(ip6_tnl_rcv_ctl); static int __ip6_tnl_rcv(struct ip6_tnl *tunnel, struct sk_buff *skb, const struct tnl_ptk_info *tpi, struct metadata_dst *tun_dst, int (*dscp_ecn_decapsulate)(const struct ip6_tnl *t, const struct ipv6hdr *ipv6h, struct sk_buff 
*skb), bool log_ecn_err) { const struct ipv6hdr *ipv6h; int nh, err; if (test_bit(IP_TUNNEL_CSUM_BIT, tunnel->parms.i_flags) != test_bit(IP_TUNNEL_CSUM_BIT, tpi->flags)) { DEV_STATS_INC(tunnel->dev, rx_crc_errors); DEV_STATS_INC(tunnel->dev, rx_errors); goto drop; } if (test_bit(IP_TUNNEL_SEQ_BIT, tunnel->parms.i_flags)) { if (!test_bit(IP_TUNNEL_SEQ_BIT, tpi->flags) || (tunnel->i_seqno && (s32)(ntohl(tpi->seq) - tunnel->i_seqno) < 0)) { DEV_STATS_INC(tunnel->dev, rx_fifo_errors); DEV_STATS_INC(tunnel->dev, rx_errors); goto drop; } tunnel->i_seqno = ntohl(tpi->seq) + 1; } skb->protocol = tpi->proto; /* Warning: All skb pointers will be invalidated! */ if (tunnel->dev->type == ARPHRD_ETHER) { if (!pskb_may_pull(skb, ETH_HLEN)) { DEV_STATS_INC(tunnel->dev, rx_length_errors); DEV_STATS_INC(tunnel->dev, rx_errors); goto drop; } skb->protocol = eth_type_trans(skb, tunnel->dev); skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN); } else { skb->dev = tunnel->dev; skb_reset_mac_header(skb); } /* Save offset of outer header relative to skb->head, * because we are going to reset the network header to the inner header * and might change skb->head. */ nh = skb_network_header(skb) - skb->head; skb_reset_network_header(skb); if (!pskb_inet_may_pull(skb)) { DEV_STATS_INC(tunnel->dev, rx_length_errors); DEV_STATS_INC(tunnel->dev, rx_errors); goto drop; } /* Get the outer header. */ ipv6h = (struct ipv6hdr *)(skb->head + nh); memset(skb->cb, 0, sizeof(struct inet6_skb_parm)); __skb_tunnel_rx(skb, tunnel->dev, tunnel->net); err = dscp_ecn_decapsulate(tunnel, ipv6h, skb); if (unlikely(err)) { if (log_ecn_err) net_info_ratelimited("non-ECT from %pI6 with DS=%#x\n", &ipv6h->saddr, ipv6_get_dsfield(ipv6h)); if (err > 1) { DEV_STATS_INC(tunnel->dev, rx_frame_errors); DEV_STATS_INC(tunnel->dev, rx_errors); goto drop; } } dev_sw_netstats_rx_add(tunnel->dev, skb->len); skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(tunnel->dev))); if (tun_dst) skb_dst_set(skb, (struct dst_entry *)tun_dst); gro_cells_receive(&tunnel->gro_cells, skb); return 0; drop: if (tun_dst) dst_release((struct dst_entry *)tun_dst); kfree_skb(skb); return 0; } int ip6_tnl_rcv(struct ip6_tnl *t, struct sk_buff *skb, const struct tnl_ptk_info *tpi, struct metadata_dst *tun_dst, bool log_ecn_err) { int (*dscp_ecn_decapsulate)(const struct ip6_tnl *t, const struct ipv6hdr *ipv6h, struct sk_buff *skb); dscp_ecn_decapsulate = ip6ip6_dscp_ecn_decapsulate; if (tpi->proto == htons(ETH_P_IP)) dscp_ecn_decapsulate = ip4ip6_dscp_ecn_decapsulate; return __ip6_tnl_rcv(t, skb, tpi, tun_dst, dscp_ecn_decapsulate, log_ecn_err); } EXPORT_SYMBOL(ip6_tnl_rcv); static const struct tnl_ptk_info tpi_v6 = { /* no tunnel info required for ipxip6. */ .proto = htons(ETH_P_IPV6), }; static const struct tnl_ptk_info tpi_v4 = { /* no tunnel info required for ipxip6. */ .proto = htons(ETH_P_IP), }; static const struct tnl_ptk_info tpi_mpls = { /* no tunnel info required for mplsip6. 
*/ .proto = htons(ETH_P_MPLS_UC), }; static int ipxip6_rcv(struct sk_buff *skb, u8 ipproto, const struct tnl_ptk_info *tpi, int (*dscp_ecn_decapsulate)(const struct ip6_tnl *t, const struct ipv6hdr *ipv6h, struct sk_buff *skb)) { struct ip6_tnl *t; const struct ipv6hdr *ipv6h = ipv6_hdr(skb); struct metadata_dst *tun_dst = NULL; int ret = -1; rcu_read_lock(); t = ip6_tnl_lookup(dev_net(skb->dev), skb->dev->ifindex, &ipv6h->saddr, &ipv6h->daddr); if (t) { u8 tproto = READ_ONCE(t->parms.proto); if (tproto != ipproto && tproto != 0) goto drop; if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) goto drop; ipv6h = ipv6_hdr(skb); if (!ip6_tnl_rcv_ctl(t, &ipv6h->daddr, &ipv6h->saddr)) goto drop; if (iptunnel_pull_header(skb, 0, tpi->proto, false)) goto drop; if (t->parms.collect_md) { IP_TUNNEL_DECLARE_FLAGS(flags) = { }; tun_dst = ipv6_tun_rx_dst(skb, flags, 0, 0); if (!tun_dst) goto drop; } ret = __ip6_tnl_rcv(t, skb, tpi, tun_dst, dscp_ecn_decapsulate, log_ecn_error); } rcu_read_unlock(); return ret; drop: rcu_read_unlock(); kfree_skb(skb); return 0; } static int ip4ip6_rcv(struct sk_buff *skb) { return ipxip6_rcv(skb, IPPROTO_IPIP, &tpi_v4, ip4ip6_dscp_ecn_decapsulate); } static int ip6ip6_rcv(struct sk_buff *skb) { return ipxip6_rcv(skb, IPPROTO_IPV6, &tpi_v6, ip6ip6_dscp_ecn_decapsulate); } static int mplsip6_rcv(struct sk_buff *skb) { return ipxip6_rcv(skb, IPPROTO_MPLS, &tpi_mpls, mplsip6_dscp_ecn_decapsulate); } struct ipv6_tel_txoption { struct ipv6_txoptions ops; __u8 dst_opt[8]; }; static void init_tel_txopt(struct ipv6_tel_txoption *opt, __u8 encap_limit) { memset(opt, 0, sizeof(struct ipv6_tel_txoption)); opt->dst_opt[2] = IPV6_TLV_TNL_ENCAP_LIMIT; opt->dst_opt[3] = 1; opt->dst_opt[4] = encap_limit; opt->dst_opt[5] = IPV6_TLV_PADN; opt->dst_opt[6] = 1; opt->ops.dst1opt = (struct ipv6_opt_hdr *) opt->dst_opt; opt->ops.opt_nflen = 8; } /** * ip6_tnl_addr_conflict - compare packet addresses to tunnel's own * @t: the outgoing tunnel device * @hdr: IPv6 header from the incoming packet * * Description: * Avoid trivial tunneling loop by checking that tunnel exit-point * doesn't match source of incoming packet. * * Return: * 1 if conflict, * 0 else **/ static inline bool ip6_tnl_addr_conflict(const struct ip6_tnl *t, const struct ipv6hdr *hdr) { return ipv6_addr_equal(&t->parms.raddr, &hdr->saddr); } int ip6_tnl_xmit_ctl(struct ip6_tnl *t, const struct in6_addr *laddr, const struct in6_addr *raddr) { struct __ip6_tnl_parm *p = &t->parms; int ret = 0; struct net *net = t->net; if (t->parms.collect_md) return 1; if ((p->flags & IP6_TNL_F_CAP_XMIT) || ((p->flags & IP6_TNL_F_CAP_PER_PACKET) && (ip6_tnl_get_cap(t, laddr, raddr) & IP6_TNL_F_CAP_XMIT))) { struct net_device *ldev = NULL; rcu_read_lock(); if (p->link) ldev = dev_get_by_index_rcu(net, p->link); if (unlikely(!ipv6_chk_addr_and_flags(net, laddr, ldev, false, 0, IFA_F_TENTATIVE))) pr_warn_ratelimited("%s xmit: Local address not yet configured!\n", p->name); else if (!(p->flags & IP6_TNL_F_ALLOW_LOCAL_REMOTE) && !ipv6_addr_is_multicast(raddr) && unlikely(ipv6_chk_addr_and_flags(net, raddr, ldev, true, 0, IFA_F_TENTATIVE))) pr_warn_ratelimited("%s xmit: Routing loop! 
Remote address found on this node!\n", p->name); else ret = 1; rcu_read_unlock(); } return ret; } EXPORT_SYMBOL_GPL(ip6_tnl_xmit_ctl); /** * ip6_tnl_xmit - encapsulate packet and send * @skb: the outgoing socket buffer * @dev: the outgoing tunnel device * @dsfield: dscp code for outer header * @fl6: flow of tunneled packet * @encap_limit: encapsulation limit * @pmtu: Path MTU is stored if packet is too big * @proto: next header value * * Description: * Build new header and do some sanity checks on the packet before sending * it. * * Return: * 0 on success * -1 fail * %-EMSGSIZE message too big. return mtu in this case. **/ int ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev, __u8 dsfield, struct flowi6 *fl6, int encap_limit, __u32 *pmtu, __u8 proto) { struct ip6_tnl *t = netdev_priv(dev); struct net *net = t->net; struct ipv6hdr *ipv6h; struct ipv6_tel_txoption opt; struct dst_entry *dst = NULL, *ndst = NULL; struct net_device *tdev; int mtu; unsigned int eth_hlen = t->dev->type == ARPHRD_ETHER ? ETH_HLEN : 0; unsigned int psh_hlen = sizeof(struct ipv6hdr) + t->encap_hlen; unsigned int max_headroom = psh_hlen; __be16 payload_protocol; bool use_cache = false; u8 hop_limit; int err = -1; payload_protocol = skb_protocol(skb, true); if (t->parms.collect_md) { hop_limit = skb_tunnel_info(skb)->key.ttl; goto route_lookup; } else { hop_limit = t->parms.hop_limit; } /* NBMA tunnel */ if (ipv6_addr_any(&t->parms.raddr)) { if (payload_protocol == htons(ETH_P_IPV6)) { struct in6_addr *addr6; struct neighbour *neigh; int addr_type; if (!skb_dst(skb)) goto tx_err_link_failure; neigh = dst_neigh_lookup(skb_dst(skb), &ipv6_hdr(skb)->daddr); if (!neigh) goto tx_err_link_failure; addr6 = (struct in6_addr *)&neigh->primary_key; addr_type = ipv6_addr_type(addr6); if (addr_type == IPV6_ADDR_ANY) addr6 = &ipv6_hdr(skb)->daddr; memcpy(&fl6->daddr, addr6, sizeof(fl6->daddr)); neigh_release(neigh); } else if (payload_protocol == htons(ETH_P_IP)) { const struct rtable *rt = skb_rtable(skb); if (!rt) goto tx_err_link_failure; if (rt->rt_gw_family == AF_INET6) memcpy(&fl6->daddr, &rt->rt_gw6, sizeof(fl6->daddr)); } } else if (t->parms.proto != 0 && !(t->parms.flags & (IP6_TNL_F_USE_ORIG_TCLASS | IP6_TNL_F_USE_ORIG_FWMARK))) { /* enable the cache only if neither the outer protocol nor the * routing decision depends on the current inner header value */ use_cache = true; } if (use_cache) dst = dst_cache_get(&t->dst_cache); if (!ip6_tnl_xmit_ctl(t, &fl6->saddr, &fl6->daddr)) goto tx_err_link_failure; if (!dst) { route_lookup: /* add dsfield to flowlabel for route lookup */ fl6->flowlabel = ip6_make_flowinfo(dsfield, fl6->flowlabel); dst = ip6_route_output(net, NULL, fl6); if (dst->error) goto tx_err_link_failure; dst = xfrm_lookup(net, dst, flowi6_to_flowi(fl6), NULL, 0); if (IS_ERR(dst)) { err = PTR_ERR(dst); dst = NULL; goto tx_err_link_failure; } if (t->parms.collect_md && ipv6_addr_any(&fl6->saddr) && ipv6_dev_get_saddr(net, ip6_dst_idev(dst)->dev, &fl6->daddr, 0, &fl6->saddr)) goto tx_err_link_failure; ndst = dst; } tdev = dst_dev(dst); if (tdev == dev) { DEV_STATS_INC(dev, collisions); net_warn_ratelimited("%s: Local routing loop detected!\n", t->parms.name); goto tx_err_dst_release; } mtu = dst_mtu(dst) - eth_hlen - psh_hlen - t->tun_hlen; if (encap_limit >= 0) { max_headroom += 8; mtu -= 8; } mtu = max(mtu, skb->protocol == htons(ETH_P_IPV6) ? 
IPV6_MIN_MTU : IPV4_MIN_MTU); skb_dst_update_pmtu_no_confirm(skb, mtu); if (skb->len - t->tun_hlen - eth_hlen > mtu && !skb_is_gso(skb)) { *pmtu = mtu; err = -EMSGSIZE; goto tx_err_dst_release; } if (t->err_count > 0) { if (time_before(jiffies, t->err_time + IP6TUNNEL_ERR_TIMEO)) { t->err_count--; dst_link_failure(skb); } else { t->err_count = 0; } } skb_scrub_packet(skb, !net_eq(t->net, dev_net(dev))); /* * Okay, now see if we can stuff it in the buffer as-is. */ max_headroom += LL_RESERVED_SPACE(tdev); if (skb_headroom(skb) < max_headroom || skb_shared(skb) || (skb_cloned(skb) && !skb_clone_writable(skb, 0))) { struct sk_buff *new_skb; new_skb = skb_realloc_headroom(skb, max_headroom); if (!new_skb) goto tx_err_dst_release; if (skb->sk) skb_set_owner_w(new_skb, skb->sk); consume_skb(skb); skb = new_skb; } if (t->parms.collect_md) { if (t->encap.type != TUNNEL_ENCAP_NONE) goto tx_err_dst_release; } else { if (use_cache && ndst) dst_cache_set_ip6(&t->dst_cache, ndst, &fl6->saddr); } skb_dst_set(skb, dst); if (hop_limit == 0) { if (payload_protocol == htons(ETH_P_IP)) hop_limit = ip_hdr(skb)->ttl; else if (payload_protocol == htons(ETH_P_IPV6)) hop_limit = ipv6_hdr(skb)->hop_limit; else hop_limit = ip6_dst_hoplimit(dst); } /* Calculate max headroom for all the headers and adjust * needed_headroom if necessary. */ max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct ipv6hdr) + dst->header_len + t->hlen; if (max_headroom > READ_ONCE(dev->needed_headroom)) WRITE_ONCE(dev->needed_headroom, max_headroom); err = ip6_tnl_encap(skb, t, &proto, fl6); if (err) return err; if (encap_limit >= 0) { init_tel_txopt(&opt, encap_limit); ipv6_push_frag_opts(skb, &opt.ops, &proto); } skb_push(skb, sizeof(struct ipv6hdr)); skb_reset_network_header(skb); ipv6h = ipv6_hdr(skb); ip6_flow_hdr(ipv6h, dsfield, ip6_make_flowlabel(net, skb, fl6->flowlabel, true, fl6)); ipv6h->hop_limit = hop_limit; ipv6h->nexthdr = proto; ipv6h->saddr = fl6->saddr; ipv6h->daddr = fl6->daddr; ip6tunnel_xmit(NULL, skb, dev, 0); return 0; tx_err_link_failure: DEV_STATS_INC(dev, tx_carrier_errors); dst_link_failure(skb); tx_err_dst_release: dst_release(dst); return err; } EXPORT_SYMBOL(ip6_tnl_xmit); static inline int ipxip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev, u8 protocol) { struct ip6_tnl *t = netdev_priv(dev); struct ipv6hdr *ipv6h; const struct iphdr *iph; int encap_limit = -1; __u16 offset; struct flowi6 fl6; __u8 dsfield, orig_dsfield; __u32 mtu; u8 tproto; int err; tproto = READ_ONCE(t->parms.proto); if (tproto != protocol && tproto != 0) return -1; if (t->parms.collect_md) { struct ip_tunnel_info *tun_info; const struct ip_tunnel_key *key; tun_info = skb_tunnel_info(skb); if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) || ip_tunnel_info_af(tun_info) != AF_INET6)) return -1; key = &tun_info->key; memset(&fl6, 0, sizeof(fl6)); fl6.flowi6_proto = protocol; fl6.saddr = key->u.ipv6.src; fl6.daddr = key->u.ipv6.dst; fl6.flowlabel = key->label; dsfield = key->tos; switch (protocol) { case IPPROTO_IPIP: iph = ip_hdr(skb); orig_dsfield = ipv4_get_dsfield(iph); break; case IPPROTO_IPV6: ipv6h = ipv6_hdr(skb); orig_dsfield = ipv6_get_dsfield(ipv6h); break; default: orig_dsfield = dsfield; break; } } else { if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) encap_limit = t->parms.encap_limit; if (protocol == IPPROTO_IPV6) { offset = ip6_tnl_parse_tlv_enc_lim(skb, skb_network_header(skb)); /* ip6_tnl_parse_tlv_enc_lim() might have * reallocated skb->head */ if (offset > 0) { struct ipv6_tlv_tnl_enc_lim *tel; 
tel = (void *)&skb_network_header(skb)[offset]; if (tel->encap_limit == 0) { icmpv6_ndo_send(skb, ICMPV6_PARAMPROB, ICMPV6_HDR_FIELD, offset + 2); return -1; } encap_limit = tel->encap_limit - 1; } } memcpy(&fl6, &t->fl.u.ip6, sizeof(fl6)); fl6.flowi6_proto = protocol; if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK) fl6.flowi6_mark = skb->mark; else fl6.flowi6_mark = t->parms.fwmark; switch (protocol) { case IPPROTO_IPIP: iph = ip_hdr(skb); orig_dsfield = ipv4_get_dsfield(iph); if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS) dsfield = orig_dsfield; else dsfield = ip6_tclass(t->parms.flowinfo); break; case IPPROTO_IPV6: ipv6h = ipv6_hdr(skb); orig_dsfield = ipv6_get_dsfield(ipv6h); if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS) dsfield = orig_dsfield; else dsfield = ip6_tclass(t->parms.flowinfo); if (t->parms.flags & IP6_TNL_F_USE_ORIG_FLOWLABEL) fl6.flowlabel |= ip6_flowlabel(ipv6h); break; default: orig_dsfield = dsfield = ip6_tclass(t->parms.flowinfo); break; } } fl6.flowi6_uid = sock_net_uid(dev_net(dev), NULL); dsfield = INET_ECN_encapsulate(dsfield, orig_dsfield); if (iptunnel_handle_offloads(skb, SKB_GSO_IPXIP6)) return -1; skb_set_inner_ipproto(skb, protocol); err = ip6_tnl_xmit(skb, dev, dsfield, &fl6, encap_limit, &mtu, protocol); if (err != 0) { /* XXX: send ICMP error even if DF is not set. */ if (err == -EMSGSIZE) switch (protocol) { case IPPROTO_IPIP: icmp_ndo_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); break; case IPPROTO_IPV6: icmpv6_ndo_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); break; default: break; } return -1; } return 0; } static netdev_tx_t ip6_tnl_start_xmit(struct sk_buff *skb, struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); u8 ipproto; int ret; if (!pskb_inet_may_pull(skb)) goto tx_err; switch (skb->protocol) { case htons(ETH_P_IP): ipproto = IPPROTO_IPIP; break; case htons(ETH_P_IPV6): if (ip6_tnl_addr_conflict(t, ipv6_hdr(skb))) goto tx_err; ipproto = IPPROTO_IPV6; break; case htons(ETH_P_MPLS_UC): ipproto = IPPROTO_MPLS; break; default: goto tx_err; } ret = ipxip6_tnl_xmit(skb, dev, ipproto); if (ret < 0) goto tx_err; return NETDEV_TX_OK; tx_err: DEV_STATS_INC(dev, tx_errors); DEV_STATS_INC(dev, tx_dropped); kfree_skb(skb); return NETDEV_TX_OK; } static void ip6_tnl_link_config(struct ip6_tnl *t) { struct net_device *dev = t->dev; struct net_device *tdev = NULL; struct __ip6_tnl_parm *p = &t->parms; struct flowi6 *fl6 = &t->fl.u.ip6; int t_hlen; int mtu; __dev_addr_set(dev, &p->laddr, sizeof(struct in6_addr)); memcpy(dev->broadcast, &p->raddr, sizeof(struct in6_addr)); /* Set up flowi template */ fl6->saddr = p->laddr; fl6->daddr = p->raddr; fl6->flowi6_oif = p->link; fl6->flowlabel = 0; if (!(p->flags&IP6_TNL_F_USE_ORIG_TCLASS)) fl6->flowlabel |= IPV6_TCLASS_MASK & p->flowinfo; if (!(p->flags&IP6_TNL_F_USE_ORIG_FLOWLABEL)) fl6->flowlabel |= IPV6_FLOWLABEL_MASK & p->flowinfo; p->flags &= ~(IP6_TNL_F_CAP_XMIT|IP6_TNL_F_CAP_RCV|IP6_TNL_F_CAP_PER_PACKET); p->flags |= ip6_tnl_get_cap(t, &p->laddr, &p->raddr); if (p->flags&IP6_TNL_F_CAP_XMIT && p->flags&IP6_TNL_F_CAP_RCV) dev->flags |= IFF_POINTOPOINT; else dev->flags &= ~IFF_POINTOPOINT; t->tun_hlen = 0; t->hlen = t->encap_hlen + t->tun_hlen; t_hlen = t->hlen + sizeof(struct ipv6hdr); if (p->flags & IP6_TNL_F_CAP_XMIT) { int strict = (ipv6_addr_type(&p->raddr) & (IPV6_ADDR_MULTICAST|IPV6_ADDR_LINKLOCAL)); struct rt6_info *rt = rt6_lookup(t->net, &p->raddr, &p->laddr, p->link, NULL, strict); if (rt) { tdev = rt->dst.dev; ip6_rt_put(rt); } if (!tdev && p->link) tdev = 
__dev_get_by_index(t->net, p->link); if (tdev) { dev->needed_headroom = tdev->hard_header_len + tdev->needed_headroom + t_hlen; mtu = min_t(unsigned int, tdev->mtu, IP6_MAX_MTU); mtu = mtu - t_hlen; if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) mtu -= 8; if (mtu < IPV6_MIN_MTU) mtu = IPV6_MIN_MTU; WRITE_ONCE(dev->mtu, mtu); } } } /** * ip6_tnl_change - update the tunnel parameters * @t: tunnel to be changed * @p: tunnel configuration parameters * * Description: * ip6_tnl_change() updates the tunnel parameters **/ static void ip6_tnl_change(struct ip6_tnl *t, const struct __ip6_tnl_parm *p) { t->parms.laddr = p->laddr; t->parms.raddr = p->raddr; t->parms.flags = p->flags; t->parms.hop_limit = p->hop_limit; t->parms.encap_limit = p->encap_limit; t->parms.flowinfo = p->flowinfo; t->parms.link = p->link; t->parms.proto = p->proto; t->parms.fwmark = p->fwmark; dst_cache_reset(&t->dst_cache); ip6_tnl_link_config(t); } static void ip6_tnl_update(struct ip6_tnl *t, struct __ip6_tnl_parm *p) { struct net *net = t->net; struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); ip6_tnl_unlink(ip6n, t); synchronize_net(); ip6_tnl_change(t, p); ip6_tnl_link(ip6n, t); netdev_state_change(t->dev); } static int ip6_tnl0_update(struct ip6_tnl *t, struct __ip6_tnl_parm *p, bool strict) { /* For the default ip6tnl0 device, allow changing only the protocol * (the IP6_TNL_F_CAP_PER_PACKET flag is set on ip6tnl0, and all other * parameters are 0). */ if (strict && (!ipv6_addr_any(&p->laddr) || !ipv6_addr_any(&p->raddr) || p->flags != t->parms.flags || p->hop_limit || p->encap_limit || p->flowinfo || p->link || p->fwmark || p->collect_md)) return -EINVAL; t->parms.proto = p->proto; netdev_state_change(t->dev); return 0; } static void ip6_tnl_parm_from_user(struct __ip6_tnl_parm *p, const struct ip6_tnl_parm *u) { p->laddr = u->laddr; p->raddr = u->raddr; p->flags = u->flags; p->hop_limit = u->hop_limit; p->encap_limit = u->encap_limit; p->flowinfo = u->flowinfo; p->link = u->link; p->proto = u->proto; memcpy(p->name, u->name, sizeof(u->name)); } static void ip6_tnl_parm_to_user(struct ip6_tnl_parm *u, const struct __ip6_tnl_parm *p) { u->laddr = p->laddr; u->raddr = p->raddr; u->flags = p->flags; u->hop_limit = p->hop_limit; u->encap_limit = p->encap_limit; u->flowinfo = p->flowinfo; u->link = p->link; u->proto = p->proto; memcpy(u->name, p->name, sizeof(u->name)); } /** * ip6_tnl_siocdevprivate - configure ipv6 tunnels from userspace * @dev: virtual device associated with tunnel * @ifr: unused * @data: parameters passed from userspace * @cmd: command to be performed * * Description: * ip6_tnl_ioctl() is used for managing IPv6 tunnels * from userspace. * * The possible commands are the following: * %SIOCGETTUNNEL: get tunnel parameters for device * %SIOCADDTUNNEL: add tunnel matching given tunnel parameters * %SIOCCHGTUNNEL: change tunnel parameters to those given * %SIOCDELTUNNEL: delete tunnel * * The fallback device "ip6tnl0", created during module * initialization, can be used for creating other tunnel devices. 
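* * (For context: iproute2's "ip -6 tunnel" front end drives these ioctls; the netlink interface registered through ip6_link_ops below is the modern equivalent.)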
* * Return: * 0 on success, * %-EFAULT if unable to copy data to or from userspace, * %-EPERM if current process hasn't %CAP_NET_ADMIN set * %-EINVAL if passed tunnel parameters are invalid, * %-EEXIST if changing a tunnel's parameters would cause a conflict * %-ENODEV if attempting to change or delete a nonexisting device **/ static int ip6_tnl_siocdevprivate(struct net_device *dev, struct ifreq *ifr, void __user *data, int cmd) { int err = 0; struct ip6_tnl_parm p; struct __ip6_tnl_parm p1; struct ip6_tnl *t = netdev_priv(dev); struct net *net = t->net; struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); memset(&p1, 0, sizeof(p1)); switch (cmd) { case SIOCGETTUNNEL: if (dev == ip6n->fb_tnl_dev) { if (copy_from_user(&p, data, sizeof(p))) { err = -EFAULT; break; } ip6_tnl_parm_from_user(&p1, &p); t = ip6_tnl_locate(net, &p1, 0); if (IS_ERR(t)) t = netdev_priv(dev); } else { memset(&p, 0, sizeof(p)); } ip6_tnl_parm_to_user(&p, &t->parms); if (copy_to_user(data, &p, sizeof(p))) err = -EFAULT; break; case SIOCADDTUNNEL: case SIOCCHGTUNNEL: err = -EPERM; if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) break; err = -EFAULT; if (copy_from_user(&p, data, sizeof(p))) break; err = -EINVAL; if (p.proto != IPPROTO_IPV6 && p.proto != IPPROTO_IPIP && p.proto != 0) break; ip6_tnl_parm_from_user(&p1, &p); t = ip6_tnl_locate(net, &p1, cmd == SIOCADDTUNNEL); if (cmd == SIOCCHGTUNNEL) { if (!IS_ERR(t)) { if (t->dev != dev) { err = -EEXIST; break; } } else t = netdev_priv(dev); if (dev == ip6n->fb_tnl_dev) ip6_tnl0_update(t, &p1, false); else ip6_tnl_update(t, &p1); } if (!IS_ERR(t)) { err = 0; ip6_tnl_parm_to_user(&p, &t->parms); if (copy_to_user(data, &p, sizeof(p))) err = -EFAULT; } else { err = PTR_ERR(t); } break; case SIOCDELTUNNEL: err = -EPERM; if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) break; if (dev == ip6n->fb_tnl_dev) { err = -EFAULT; if (copy_from_user(&p, data, sizeof(p))) break; err = -ENOENT; ip6_tnl_parm_from_user(&p1, &p); t = ip6_tnl_locate(net, &p1, 0); if (IS_ERR(t)) break; err = -EPERM; if (t->dev == ip6n->fb_tnl_dev) break; dev = t->dev; } err = 0; unregister_netdevice(dev); break; default: err = -EINVAL; } return err; } /** * ip6_tnl_change_mtu - change mtu manually for tunnel device * @dev: virtual device associated with tunnel * @new_mtu: the new mtu * * Return: * 0 on success, * %-EINVAL if mtu too small **/ int ip6_tnl_change_mtu(struct net_device *dev, int new_mtu) { struct ip6_tnl *tnl = netdev_priv(dev); int t_hlen; t_hlen = tnl->hlen + sizeof(struct ipv6hdr); if (tnl->parms.proto == IPPROTO_IPV6) { if (new_mtu < IPV6_MIN_MTU) return -EINVAL; } else { if (new_mtu < ETH_MIN_MTU) return -EINVAL; } if (tnl->parms.proto == IPPROTO_IPV6 || tnl->parms.proto == 0) { if (new_mtu > IP6_MAX_MTU - dev->hard_header_len - t_hlen) return -EINVAL; } else { if (new_mtu > IP_MAX_MTU - dev->hard_header_len - t_hlen) return -EINVAL; } WRITE_ONCE(dev->mtu, new_mtu); return 0; } EXPORT_SYMBOL(ip6_tnl_change_mtu); int ip6_tnl_get_iflink(const struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); return READ_ONCE(t->parms.link); } EXPORT_SYMBOL(ip6_tnl_get_iflink); int ip6_tnl_encap_add_ops(const struct ip6_tnl_encap_ops *ops, unsigned int num) { if (num >= MAX_IPTUN_ENCAP_OPS) return -ERANGE; return !cmpxchg((const struct ip6_tnl_encap_ops **) &ip6tun_encaps[num], NULL, ops) ? 
0 : -1; } EXPORT_SYMBOL(ip6_tnl_encap_add_ops); int ip6_tnl_encap_del_ops(const struct ip6_tnl_encap_ops *ops, unsigned int num) { int ret; if (num >= MAX_IPTUN_ENCAP_OPS) return -ERANGE; ret = (cmpxchg((const struct ip6_tnl_encap_ops **) &ip6tun_encaps[num], ops, NULL) == ops) ? 0 : -1; synchronize_net(); return ret; } EXPORT_SYMBOL(ip6_tnl_encap_del_ops); int ip6_tnl_encap_setup(struct ip6_tnl *t, struct ip_tunnel_encap *ipencap) { int hlen; memset(&t->encap, 0, sizeof(t->encap)); hlen = ip6_encap_hlen(ipencap); if (hlen < 0) return hlen; t->encap.type = ipencap->type; t->encap.sport = ipencap->sport; t->encap.dport = ipencap->dport; t->encap.flags = ipencap->flags; t->encap_hlen = hlen; t->hlen = t->encap_hlen + t->tun_hlen; return 0; } EXPORT_SYMBOL_GPL(ip6_tnl_encap_setup); static const struct net_device_ops ip6_tnl_netdev_ops = { .ndo_init = ip6_tnl_dev_init, .ndo_uninit = ip6_tnl_dev_uninit, .ndo_start_xmit = ip6_tnl_start_xmit, .ndo_siocdevprivate = ip6_tnl_siocdevprivate, .ndo_change_mtu = ip6_tnl_change_mtu, .ndo_get_stats64 = dev_get_tstats64, .ndo_get_iflink = ip6_tnl_get_iflink, }; #define IPXIPX_FEATURES (NETIF_F_SG | \ NETIF_F_FRAGLIST | \ NETIF_F_HIGHDMA | \ NETIF_F_GSO_SOFTWARE | \ NETIF_F_HW_CSUM) /** * ip6_tnl_dev_setup - setup virtual tunnel device * @dev: virtual device associated with tunnel * * Description: * Initialize function pointers and device parameters **/ static void ip6_tnl_dev_setup(struct net_device *dev) { dev->netdev_ops = &ip6_tnl_netdev_ops; dev->header_ops = &ip_tunnel_header_ops; dev->needs_free_netdev = true; dev->priv_destructor = ip6_dev_free; dev->type = ARPHRD_TUNNEL6; dev->flags |= IFF_NOARP; dev->addr_len = sizeof(struct in6_addr); dev->lltx = true; dev->pcpu_stat_type = NETDEV_PCPU_STAT_TSTATS; netif_keep_dst(dev); dev->features |= IPXIPX_FEATURES; dev->hw_features |= IPXIPX_FEATURES; /* This perm addr will be used as interface identifier by IPv6 */ dev->addr_assign_type = NET_ADDR_RANDOM; eth_random_addr(dev->perm_addr); } /** * ip6_tnl_dev_init_gen - general initializer for all tunnel devices * @dev: virtual device associated with tunnel **/ static inline int ip6_tnl_dev_init_gen(struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); int ret; int t_hlen; t->dev = dev; ret = dst_cache_init(&t->dst_cache, GFP_KERNEL); if (ret) return ret; ret = gro_cells_init(&t->gro_cells, dev); if (ret) goto destroy_dst; t->tun_hlen = 0; t->hlen = t->encap_hlen + t->tun_hlen; t_hlen = t->hlen + sizeof(struct ipv6hdr); dev->type = ARPHRD_TUNNEL6; dev->mtu = ETH_DATA_LEN - t_hlen; if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) dev->mtu -= 8; dev->min_mtu = ETH_MIN_MTU; dev->max_mtu = IP6_MAX_MTU - dev->hard_header_len - t_hlen; netdev_hold(dev, &t->dev_tracker, GFP_KERNEL); netdev_lockdep_set_classes(dev); return 0; destroy_dst: dst_cache_destroy(&t->dst_cache); return ret; } /** * ip6_tnl_dev_init - initializer for all non fallback tunnel devices * @dev: virtual device associated with tunnel **/ static int ip6_tnl_dev_init(struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); int err = ip6_tnl_dev_init_gen(dev); if (err) return err; ip6_tnl_link_config(t); if (t->parms.collect_md) netif_keep_dst(dev); return 0; } /** * ip6_fb_tnl_dev_init - initializer for fallback tunnel device * @dev: fallback device * * Return: 0 **/ static int __net_init ip6_fb_tnl_dev_init(struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); struct net *net = dev_net(dev); struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); t->net = net; 
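/* The fallback device defaults to IPv6-in-IPv6; ip6_tnl0_update() later
 * allows changing only this protocol field. The device is published in the
 * wildcard bucket tnls_wc[0], where it catches packets that match no
 * specific tunnel.
 */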
t->parms.proto = IPPROTO_IPV6; rcu_assign_pointer(ip6n->tnls_wc[0], t); return 0; } static int ip6_tnl_validate(struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { u8 proto; if (!data || !data[IFLA_IPTUN_PROTO]) return 0; proto = nla_get_u8(data[IFLA_IPTUN_PROTO]); if (proto != IPPROTO_IPV6 && proto != IPPROTO_IPIP && proto != 0) return -EINVAL; return 0; } static void ip6_tnl_netlink_parms(struct nlattr *data[], struct __ip6_tnl_parm *parms) { memset(parms, 0, sizeof(*parms)); if (!data) return; if (data[IFLA_IPTUN_LINK]) parms->link = nla_get_u32(data[IFLA_IPTUN_LINK]); if (data[IFLA_IPTUN_LOCAL]) parms->laddr = nla_get_in6_addr(data[IFLA_IPTUN_LOCAL]); if (data[IFLA_IPTUN_REMOTE]) parms->raddr = nla_get_in6_addr(data[IFLA_IPTUN_REMOTE]); if (data[IFLA_IPTUN_TTL]) parms->hop_limit = nla_get_u8(data[IFLA_IPTUN_TTL]); if (data[IFLA_IPTUN_ENCAP_LIMIT]) parms->encap_limit = nla_get_u8(data[IFLA_IPTUN_ENCAP_LIMIT]); if (data[IFLA_IPTUN_FLOWINFO]) parms->flowinfo = nla_get_be32(data[IFLA_IPTUN_FLOWINFO]); if (data[IFLA_IPTUN_FLAGS]) parms->flags = nla_get_u32(data[IFLA_IPTUN_FLAGS]); if (data[IFLA_IPTUN_PROTO]) parms->proto = nla_get_u8(data[IFLA_IPTUN_PROTO]); if (data[IFLA_IPTUN_COLLECT_METADATA]) parms->collect_md = true; if (data[IFLA_IPTUN_FWMARK]) parms->fwmark = nla_get_u32(data[IFLA_IPTUN_FWMARK]); } static int ip6_tnl_newlink(struct net_device *dev, struct rtnl_newlink_params *params, struct netlink_ext_ack *extack) { struct nlattr **data = params->data; struct nlattr **tb = params->tb; struct ip_tunnel_encap ipencap; struct ip6_tnl_net *ip6n; struct ip6_tnl *nt, *t; struct net *net; int err; net = params->link_net ? : dev_net(dev); ip6n = net_generic(net, ip6_tnl_net_id); nt = netdev_priv(dev); nt->net = net; if (ip_tunnel_netlink_encap_parms(data, &ipencap)) { err = ip6_tnl_encap_setup(nt, &ipencap); if (err < 0) return err; } ip6_tnl_netlink_parms(data, &nt->parms); if (nt->parms.collect_md) { if (rtnl_dereference(ip6n->collect_md_tun)) return -EEXIST; } else { t = ip6_tnl_locate(net, &nt->parms, 0); if (!IS_ERR(t)) return -EEXIST; } err = ip6_tnl_create2(dev); if (!err && tb[IFLA_MTU]) ip6_tnl_change_mtu(dev, nla_get_u32(tb[IFLA_MTU])); return err; } static int ip6_tnl_changelink(struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { struct ip6_tnl *t = netdev_priv(dev); struct __ip6_tnl_parm p; struct net *net = t->net; struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); struct ip_tunnel_encap ipencap; if (dev == ip6n->fb_tnl_dev) { if (ip_tunnel_netlink_encap_parms(data, &ipencap)) { /* iproute2 always sets TUNNEL_ENCAP_FLAG_CSUM6, so * let's ignore this flag. 
*/ ipencap.flags &= ~TUNNEL_ENCAP_FLAG_CSUM6; if (memchr_inv(&ipencap, 0, sizeof(ipencap))) { NL_SET_ERR_MSG(extack, "Only protocol can be changed for fallback tunnel, not encap params"); return -EINVAL; } } ip6_tnl_netlink_parms(data, &p); if (ip6_tnl0_update(t, &p, true) < 0) { NL_SET_ERR_MSG(extack, "Only protocol can be changed for fallback tunnel"); return -EINVAL; } return 0; } if (ip_tunnel_netlink_encap_parms(data, &ipencap)) { int err = ip6_tnl_encap_setup(t, &ipencap); if (err < 0) return err; } ip6_tnl_netlink_parms(data, &p); if (p.collect_md) return -EINVAL; t = ip6_tnl_locate(net, &p, 0); if (!IS_ERR(t)) { if (t->dev != dev) return -EEXIST; } else t = netdev_priv(dev); ip6_tnl_update(t, &p); return 0; } static void ip6_tnl_dellink(struct net_device *dev, struct list_head *head) { struct net *net = dev_net(dev); struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); if (dev != ip6n->fb_tnl_dev) unregister_netdevice_queue(dev, head); } static size_t ip6_tnl_get_size(const struct net_device *dev) { return /* IFLA_IPTUN_LINK */ nla_total_size(4) + /* IFLA_IPTUN_LOCAL */ nla_total_size(sizeof(struct in6_addr)) + /* IFLA_IPTUN_REMOTE */ nla_total_size(sizeof(struct in6_addr)) + /* IFLA_IPTUN_TTL */ nla_total_size(1) + /* IFLA_IPTUN_ENCAP_LIMIT */ nla_total_size(1) + /* IFLA_IPTUN_FLOWINFO */ nla_total_size(4) + /* IFLA_IPTUN_FLAGS */ nla_total_size(4) + /* IFLA_IPTUN_PROTO */ nla_total_size(1) + /* IFLA_IPTUN_ENCAP_TYPE */ nla_total_size(2) + /* IFLA_IPTUN_ENCAP_FLAGS */ nla_total_size(2) + /* IFLA_IPTUN_ENCAP_SPORT */ nla_total_size(2) + /* IFLA_IPTUN_ENCAP_DPORT */ nla_total_size(2) + /* IFLA_IPTUN_COLLECT_METADATA */ nla_total_size(0) + /* IFLA_IPTUN_FWMARK */ nla_total_size(4) + 0; } static int ip6_tnl_fill_info(struct sk_buff *skb, const struct net_device *dev) { struct ip6_tnl *tunnel = netdev_priv(dev); struct __ip6_tnl_parm *parm = &tunnel->parms; if (nla_put_u32(skb, IFLA_IPTUN_LINK, parm->link) || nla_put_in6_addr(skb, IFLA_IPTUN_LOCAL, &parm->laddr) || nla_put_in6_addr(skb, IFLA_IPTUN_REMOTE, &parm->raddr) || nla_put_u8(skb, IFLA_IPTUN_TTL, parm->hop_limit) || nla_put_u8(skb, IFLA_IPTUN_ENCAP_LIMIT, parm->encap_limit) || nla_put_be32(skb, IFLA_IPTUN_FLOWINFO, parm->flowinfo) || nla_put_u32(skb, IFLA_IPTUN_FLAGS, parm->flags) || nla_put_u8(skb, IFLA_IPTUN_PROTO, parm->proto) || nla_put_u32(skb, IFLA_IPTUN_FWMARK, parm->fwmark)) goto nla_put_failure; if (nla_put_u16(skb, IFLA_IPTUN_ENCAP_TYPE, tunnel->encap.type) || nla_put_be16(skb, IFLA_IPTUN_ENCAP_SPORT, tunnel->encap.sport) || nla_put_be16(skb, IFLA_IPTUN_ENCAP_DPORT, tunnel->encap.dport) || nla_put_u16(skb, IFLA_IPTUN_ENCAP_FLAGS, tunnel->encap.flags)) goto nla_put_failure; if (parm->collect_md) if (nla_put_flag(skb, IFLA_IPTUN_COLLECT_METADATA)) goto nla_put_failure; return 0; nla_put_failure: return -EMSGSIZE; } struct net *ip6_tnl_get_link_net(const struct net_device *dev) { struct ip6_tnl *tunnel = netdev_priv(dev); return READ_ONCE(tunnel->net); } EXPORT_SYMBOL(ip6_tnl_get_link_net); static const struct nla_policy ip6_tnl_policy[IFLA_IPTUN_MAX + 1] = { [IFLA_IPTUN_LINK] = { .type = NLA_U32 }, [IFLA_IPTUN_LOCAL] = { .len = sizeof(struct in6_addr) }, [IFLA_IPTUN_REMOTE] = { .len = sizeof(struct in6_addr) }, [IFLA_IPTUN_TTL] = { .type = NLA_U8 }, [IFLA_IPTUN_ENCAP_LIMIT] = { .type = NLA_U8 }, [IFLA_IPTUN_FLOWINFO] = { .type = NLA_U32 }, [IFLA_IPTUN_FLAGS] = { .type = NLA_U32 }, [IFLA_IPTUN_PROTO] = { .type = NLA_U8 }, [IFLA_IPTUN_ENCAP_TYPE] = { .type = NLA_U16 }, [IFLA_IPTUN_ENCAP_FLAGS] = { .type = NLA_U16 
}, [IFLA_IPTUN_ENCAP_SPORT] = { .type = NLA_U16 }, [IFLA_IPTUN_ENCAP_DPORT] = { .type = NLA_U16 }, [IFLA_IPTUN_COLLECT_METADATA] = { .type = NLA_FLAG }, [IFLA_IPTUN_FWMARK] = { .type = NLA_U32 }, }; static struct rtnl_link_ops ip6_link_ops __read_mostly = { .kind = "ip6tnl", .maxtype = IFLA_IPTUN_MAX, .policy = ip6_tnl_policy, .priv_size = sizeof(struct ip6_tnl), .setup = ip6_tnl_dev_setup, .validate = ip6_tnl_validate, .newlink = ip6_tnl_newlink, .changelink = ip6_tnl_changelink, .dellink = ip6_tnl_dellink, .get_size = ip6_tnl_get_size, .fill_info = ip6_tnl_fill_info, .get_link_net = ip6_tnl_get_link_net, }; static struct xfrm6_tunnel ip4ip6_handler __read_mostly = { .handler = ip4ip6_rcv, .err_handler = ip4ip6_err, .priority = 1, }; static struct xfrm6_tunnel ip6ip6_handler __read_mostly = { .handler = ip6ip6_rcv, .err_handler = ip6ip6_err, .priority = 1, }; static struct xfrm6_tunnel mplsip6_handler __read_mostly = { .handler = mplsip6_rcv, .err_handler = mplsip6_err, .priority = 1, }; static void __net_exit ip6_tnl_exit_rtnl_net(struct net *net, struct list_head *list) { struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); struct net_device *dev, *aux; int h; struct ip6_tnl *t; for_each_netdev_safe(net, dev, aux) if (dev->rtnl_link_ops == &ip6_link_ops) unregister_netdevice_queue(dev, list); for (h = 0; h < IP6_TUNNEL_HASH_SIZE; h++) { t = rtnl_net_dereference(net, ip6n->tnls_r_l[h]); while (t) { /* If dev is in the same netns, it has already * been added to the list by the previous loop. */ if (!net_eq(dev_net(t->dev), net)) unregister_netdevice_queue(t->dev, list); t = rtnl_net_dereference(net, t->next); } } t = rtnl_net_dereference(net, ip6n->tnls_wc[0]); while (t) { /* If dev is in the same netns, it has already * been added to the list by the previous loop. */ if (!net_eq(dev_net(t->dev), net)) unregister_netdevice_queue(t->dev, list); t = rtnl_net_dereference(net, t->next); } } static int __net_init ip6_tnl_init_net(struct net *net) { struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); struct ip6_tnl *t = NULL; int err; ip6n->tnls[0] = ip6n->tnls_wc; ip6n->tnls[1] = ip6n->tnls_r_l; if (!net_has_fallback_tunnels(net)) return 0; err = -ENOMEM; ip6n->fb_tnl_dev = alloc_netdev(sizeof(struct ip6_tnl), "ip6tnl0", NET_NAME_UNKNOWN, ip6_tnl_dev_setup); if (!ip6n->fb_tnl_dev) goto err_alloc_dev; dev_net_set(ip6n->fb_tnl_dev, net); ip6n->fb_tnl_dev->rtnl_link_ops = &ip6_link_ops; /* FB netdevice is special: we have one, and only one per netns. * Allowing to move it to another netns is clearly unsafe. 
*/ ip6n->fb_tnl_dev->netns_immutable = true; err = ip6_fb_tnl_dev_init(ip6n->fb_tnl_dev); if (err < 0) goto err_register; err = register_netdev(ip6n->fb_tnl_dev); if (err < 0) goto err_register; t = netdev_priv(ip6n->fb_tnl_dev); strcpy(t->parms.name, ip6n->fb_tnl_dev->name); return 0; err_register: free_netdev(ip6n->fb_tnl_dev); err_alloc_dev: return err; } static struct pernet_operations ip6_tnl_net_ops = { .init = ip6_tnl_init_net, .exit_rtnl = ip6_tnl_exit_rtnl_net, .id = &ip6_tnl_net_id, .size = sizeof(struct ip6_tnl_net), }; /** * ip6_tunnel_init - register protocol and reserve needed resources * * Return: 0 on success **/ static int __init ip6_tunnel_init(void) { int err; if (!ipv6_mod_enabled()) return -EOPNOTSUPP; err = register_pernet_device(&ip6_tnl_net_ops); if (err < 0) goto out_pernet; err = xfrm6_tunnel_register(&ip4ip6_handler, AF_INET); if (err < 0) { pr_err("%s: can't register ip4ip6\n", __func__); goto out_ip4ip6; } err = xfrm6_tunnel_register(&ip6ip6_handler, AF_INET6); if (err < 0) { pr_err("%s: can't register ip6ip6\n", __func__); goto out_ip6ip6; } if (ip6_tnl_mpls_supported()) { err = xfrm6_tunnel_register(&mplsip6_handler, AF_MPLS); if (err < 0) { pr_err("%s: can't register mplsip6\n", __func__); goto out_mplsip6; } } err = rtnl_link_register(&ip6_link_ops); if (err < 0) goto rtnl_link_failed; return 0; rtnl_link_failed: if (ip6_tnl_mpls_supported()) xfrm6_tunnel_deregister(&mplsip6_handler, AF_MPLS); out_mplsip6: xfrm6_tunnel_deregister(&ip6ip6_handler, AF_INET6); out_ip6ip6: xfrm6_tunnel_deregister(&ip4ip6_handler, AF_INET); out_ip4ip6: unregister_pernet_device(&ip6_tnl_net_ops); out_pernet: return err; } /** * ip6_tunnel_cleanup - free resources and unregister protocol **/ static void __exit ip6_tunnel_cleanup(void) { rtnl_link_unregister(&ip6_link_ops); if (xfrm6_tunnel_deregister(&ip4ip6_handler, AF_INET)) pr_info("%s: can't deregister ip4ip6\n", __func__); if (xfrm6_tunnel_deregister(&ip6ip6_handler, AF_INET6)) pr_info("%s: can't deregister ip6ip6\n", __func__); if (ip6_tnl_mpls_supported() && xfrm6_tunnel_deregister(&mplsip6_handler, AF_MPLS)) pr_info("%s: can't deregister mplsip6\n", __func__); unregister_pernet_device(&ip6_tnl_net_ops); } module_init(ip6_tunnel_init); module_exit(ip6_tunnel_cleanup);
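
/*
 * Illustrative userspace sketch (not part of the kernel file above): one
 * plausible way to exercise the SIOCADDTUNNEL path handled by
 * ip6_tnl_siocdevprivate(). The device name "ip6tnl1" and the 2001:db8::
 * addresses are placeholders, error handling is minimal, and sane
 * uapi/libc header interplay on a reasonably recent glibc is assumed.
 */
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <net/if.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <linux/if_tunnel.h>	/* SIOCADDTUNNEL */
#include <linux/ip6_tunnel.h>	/* struct ip6_tnl_parm */

int main(void)
{
	struct ip6_tnl_parm p;
	struct ifreq ifr;
	int fd, ret;

	/* Describe the new tunnel: ip6ip6 mode between two endpoints. */
	memset(&p, 0, sizeof(p));
	strncpy(p.name, "ip6tnl1", IFNAMSIZ - 1);
	p.proto = IPPROTO_IPV6;
	p.hop_limit = 64;
	inet_pton(AF_INET6, "2001:db8::1", &p.laddr);
	inet_pton(AF_INET6, "2001:db8::2", &p.raddr);

	/* Requests are issued against the fallback device "ip6tnl0". */
	memset(&ifr, 0, sizeof(ifr));
	strncpy(ifr.ifr_name, "ip6tnl0", IFNAMSIZ - 1);
	ifr.ifr_data = (void *)&p;

	fd = socket(AF_INET6, SOCK_DGRAM, 0);
	if (fd < 0) {
		perror("socket");
		return 1;
	}
	ret = ioctl(fd, SIOCADDTUNNEL, &ifr);	/* needs CAP_NET_ADMIN */
	if (ret < 0)
		perror("SIOCADDTUNNEL");
	close(fd);
	return ret < 0;
}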
// SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (C) International Business Machines Corp., 2000-2004 */ /* * jfs_dtree.c: directory B+-tree manager * * B+-tree with variable length key directory: * * each directory page is structured as an array of 32-byte * directory entry slots initialized as a freelist * to avoid search/compaction of free space at insertion. * when an entry is inserted, a number of slots are allocated * from the freelist as required to store variable length data * of the entry; when the entry is deleted, slots of the entry * are returned to the freelist. * * leaf entry stores full name as key and file serial number * (aka inode number) as data. * internal/router entry stores suffix compressed name * as key and simple extent descriptor as data. 
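 * * (illustrative aside, not part of the original text: because slots are fixed 32-byte units, a longer name simply consumes more slots chained through each slot's next field; NDTLEAF()/NDTINTERNAL() below compute exactly this per-entry slot count before any insert is attempted)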
 * * each directory page maintains a sorted entry index table * which stores the start slot index of sorted entries * to allow binary search on the table. * * directory starts as a root/leaf page in on-disk inode * inline data area. * when it becomes full, it starts a leaf of an external extent * of length 1 block. each time the first leaf becomes full, * it is extended rather than split (its size is doubled), * until its length becomes 4 KBytes; from then on the extent is split * with a new 4-KByte extent when it becomes full * to reduce external fragmentation of small directories. * * the sorted entry index table also supports linear scan of the * directory in pieces by readdir(). * * * case-insensitive directory file system * * names are stored in case-sensitive way in leaf entry, * but searched and compared in case-insensitive (uppercase) order * (i.e., both search key and entry key are folded for search/compare): * (note that case-sensitive order is BROKEN in storage, e.g., * sensitive: Ad, aB, aC, aD -> insensitive: aB, aC, aD, Ad * * entries which fold to the same key make up an equivalent class * whose members are stored as a contiguous cluster (may cross page boundary) * but whose order is arbitrary and act as duplicates, e.g., * abc, Abc, aBc, abC) * * once a match is found at a leaf, a forward/backward scan is required, * either for duplicates, in case-insensitive search, * or for the exact match, in case-sensitive search * * router entry must be created/stored in case-insensitive way * in internal entry: * (rightmost key of left page and leftmost key of right page * are folded, and the suffix compression is propagated as router * key in parent) * (e.g., if a split occurs between <abc> and <aBd>, <ABD> rather than <aB> * should be made the router key for the split) * * case-insensitive search: * * fold search key; * * case-insensitive search of B-tree: * for internal entry, router key is already folded; * for leaf entry, fold the entry key before comparison. * * if (leaf entry case-insensitive match found) * if (next entry satisfies case-insensitive match) * return EDUPLICATE; * if (prev entry satisfies case-insensitive match) * return EDUPLICATE; * return match; * else * return no match; * * serialization: * target directory inode lock is held on entry/exit * of all main directory service routines. * * log based recovery: */ #include <linux/fs.h> #include <linux/quotaops.h> #include <linux/slab.h> #include "jfs_incore.h" #include "jfs_superblock.h" #include "jfs_filsys.h" #include "jfs_metapage.h" #include "jfs_dmap.h" #include "jfs_unicode.h" #include "jfs_debug.h" /* dtree split parameter */ struct dtsplit { struct metapage *mp; s16 index; s16 nslot; struct component_name *key; ddata_t *data; struct pxdlist *pxdlist; }; #define DT_PAGE(IP, MP) BT_PAGE(IP, MP, dtpage_t, i_dtroot) /* get page buffer for specified block address */ #define DT_GETPAGE(IP, BN, MP, SIZE, P, RC) \ do { \ BT_GETPAGE(IP, BN, MP, dtpage_t, SIZE, P, RC, i_dtroot); \ if (!(RC)) { \ if (((P)->header.nextindex > \ (((BN) == 0) ? 
DTROOTMAXSLOT : (P)->header.maxslot)) || \ ((BN) && (((P)->header.maxslot > DTPAGEMAXSLOT) || \ ((P)->header.stblindex >= DTPAGEMAXSLOT)))) { \ BT_PUTPAGE(MP); \ jfs_error((IP)->i_sb, \ "DT_GETPAGE: dtree page corrupt\n"); \ MP = NULL; \ RC = -EIO; \ } \ } \ } while (0) /* for consistency */ #define DT_PUTPAGE(MP) BT_PUTPAGE(MP) #define DT_GETSEARCH(IP, LEAF, BN, MP, P, INDEX) \ BT_GETSEARCH(IP, LEAF, BN, MP, dtpage_t, P, INDEX, i_dtroot) /* * forward references */ static int dtSplitUp(tid_t tid, struct inode *ip, struct dtsplit * split, struct btstack * btstack); static int dtSplitPage(tid_t tid, struct inode *ip, struct dtsplit * split, struct metapage ** rmpp, dtpage_t ** rpp, pxd_t * rxdp); static int dtExtendPage(tid_t tid, struct inode *ip, struct dtsplit * split, struct btstack * btstack); static int dtSplitRoot(tid_t tid, struct inode *ip, struct dtsplit * split, struct metapage ** rmpp); static int dtDeleteUp(tid_t tid, struct inode *ip, struct metapage * fmp, dtpage_t * fp, struct btstack * btstack); static int dtRelink(tid_t tid, struct inode *ip, dtpage_t * p); static int dtReadFirst(struct inode *ip, struct btstack * btstack); static int dtReadNext(struct inode *ip, loff_t * offset, struct btstack * btstack); static int dtCompare(struct component_name * key, dtpage_t * p, int si); static int ciCompare(struct component_name * key, dtpage_t * p, int si, int flag); static void dtGetKey(dtpage_t * p, int i, struct component_name * key, int flag); static int ciGetLeafPrefixKey(dtpage_t * lp, int li, dtpage_t * rp, int ri, struct component_name * key, int flag); static void dtInsertEntry(dtpage_t * p, int index, struct component_name * key, ddata_t * data, struct dt_lock **); static void dtMoveEntry(dtpage_t * sp, int si, dtpage_t * dp, struct dt_lock ** sdtlock, struct dt_lock ** ddtlock, int do_index); static void dtDeleteEntry(dtpage_t * p, int fi, struct dt_lock ** dtlock); static void dtTruncateEntry(dtpage_t * p, int ti, struct dt_lock ** dtlock); static void dtLinelockFreelist(dtpage_t * p, int m, struct dt_lock ** dtlock); #define ciToUpper(c) UniStrupr((c)->name) /* * read_index_page() * * Reads a page of a directory's index table. * Having metadata mapped into the directory inode's address space * presents a multitude of problems. We avoid this by mapping to * the absolute address space outside of the *_metapage routines */ static struct metapage *read_index_page(struct inode *inode, s64 blkno) { int rc; s64 xaddr; int xflag; s32 xlen; rc = xtLookup(inode, blkno, 1, &xflag, &xaddr, &xlen, 1); if (rc || (xaddr == 0)) return NULL; return read_metapage(inode, xaddr, PSIZE, 1); } /* * get_index_page() * * Same as read_index_page(), but gets a new page without reading */ static struct metapage *get_index_page(struct inode *inode, s64 blkno) { int rc; s64 xaddr; int xflag; s32 xlen; rc = xtLookup(inode, blkno, 1, &xflag, &xaddr, &xlen, 1); if (rc || (xaddr == 0)) return NULL; return get_metapage(inode, xaddr, PSIZE, 1); } /* * find_index() * * Returns dtree page containing directory table entry for specified * index and pointer to its entry. * * mp must be released by caller. 
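 * * (usage sketch, mirroring read_index() below: keep a cached <mp, lblock> pair across consecutive calls so a table page already read stays pinned - struct metapage *mp = NULL; s64 lblock; slot = find_index(ip, index, &mp, &lblock); ...; if (mp) release_metapage(mp);)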
 */ static struct dir_table_slot *find_index(struct inode *ip, u32 index, struct metapage ** mp, s64 *lblock) { struct jfs_inode_info *jfs_ip = JFS_IP(ip); s64 blkno; s64 offset; int page_offset; struct dir_table_slot *slot; static int maxWarnings = 10; if (index < 2) { if (maxWarnings) { jfs_warn("find_index called with index = %d", index); maxWarnings--; } return NULL; } if (index >= jfs_ip->next_index) { jfs_warn("find_index called with index >= next_index"); return NULL; } if (jfs_dirtable_inline(ip)) { /* * Inline directory table */ *mp = NULL; slot = &jfs_ip->i_dirtable[index - 2]; } else { offset = (index - 2) * sizeof(struct dir_table_slot); page_offset = offset & (PSIZE - 1); blkno = ((offset + 1) >> L2PSIZE) << JFS_SBI(ip->i_sb)->l2nbperpage; if (*mp && (*lblock != blkno)) { release_metapage(*mp); *mp = NULL; } if (!(*mp)) { *lblock = blkno; *mp = read_index_page(ip, blkno); } if (!(*mp)) { jfs_err("find_index: error reading directory table"); return NULL; } slot = (struct dir_table_slot *) ((char *) (*mp)->data + page_offset); } return slot; } static inline void lock_index(tid_t tid, struct inode *ip, struct metapage * mp, u32 index) { struct tlock *tlck; struct linelock *llck; struct lv *lv; tlck = txLock(tid, ip, mp, tlckDATA); llck = (struct linelock *) tlck->lock; if (llck->index >= llck->maxcnt) llck = txLinelock(llck); lv = &llck->lv[llck->index]; /* * Linelock slot size is twice the size of directory table * slot size. 512 entries per page. */ lv->offset = ((index - 2) & 511) >> 1; lv->length = 1; llck->index++; } /* * add_index() * * Adds an entry to the directory index table. This is used to provide * each directory entry with a persistent index in which to resume * directory traversals */ static u32 add_index(tid_t tid, struct inode *ip, s64 bn, int slot) { struct super_block *sb = ip->i_sb; struct jfs_sb_info *sbi = JFS_SBI(sb); struct jfs_inode_info *jfs_ip = JFS_IP(ip); u64 blkno; struct dir_table_slot *dirtab_slot; u32 index; struct linelock *llck; struct lv *lv; struct metapage *mp; s64 offset; uint page_offset; struct tlock *tlck; s64 xaddr; ASSERT(DO_INDEX(ip)); if (jfs_ip->next_index < 2) { jfs_warn("add_index: next_index = %d. Resetting!", jfs_ip->next_index); jfs_ip->next_index = 2; } index = jfs_ip->next_index++; if (index <= MAX_INLINE_DIRTABLE_ENTRY) { /* * i_size reflects size of index table, at 8 bytes per entry. 
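 * (indices start at 2, so an index value of n means n - 1 live entries; the << 3 below is (n - 1) * 8, i.e. (n - 1) * sizeof(struct dir_table_slot))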
*/ ip->i_size = (loff_t) (index - 1) << 3; /* * dir table fits inline within inode */ dirtab_slot = &jfs_ip->i_dirtable[index-2]; dirtab_slot->flag = DIR_INDEX_VALID; dirtab_slot->slot = slot; DTSaddress(dirtab_slot, bn); set_cflag(COMMIT_Dirtable, ip); return index; } if (index == (MAX_INLINE_DIRTABLE_ENTRY + 1)) { struct dir_table_slot temp_table[12]; /* * It's time to move the inline table to an external * page and begin to build the xtree */ if (dquot_alloc_block(ip, sbi->nbperpage)) goto clean_up; if (dbAlloc(ip, 0, sbi->nbperpage, &xaddr)) { dquot_free_block(ip, sbi->nbperpage); goto clean_up; } /* * Save the table, we're going to overwrite it with the * xtree root */ memcpy(temp_table, &jfs_ip->i_dirtable, sizeof(temp_table)); /* * Initialize empty x-tree */ xtInitRoot(tid, ip); /* * Add the first block to the xtree */ if (xtInsert(tid, ip, 0, 0, sbi->nbperpage, &xaddr, 0)) { /* This really shouldn't fail */ jfs_warn("add_index: xtInsert failed!"); memcpy(&jfs_ip->i_dirtable, temp_table, sizeof (temp_table)); dbFree(ip, xaddr, sbi->nbperpage); dquot_free_block(ip, sbi->nbperpage); goto clean_up; } ip->i_size = PSIZE; mp = get_index_page(ip, 0); if (!mp) { jfs_err("add_index: get_metapage failed!"); xtTruncate(tid, ip, 0, COMMIT_PWMAP); memcpy(&jfs_ip->i_dirtable, temp_table, sizeof (temp_table)); goto clean_up; } tlck = txLock(tid, ip, mp, tlckDATA); llck = (struct linelock *) & tlck->lock; ASSERT(llck->index == 0); lv = &llck->lv[0]; lv->offset = 0; lv->length = 6; /* tlckDATA slot size is 16 bytes */ llck->index++; memcpy(mp->data, temp_table, sizeof(temp_table)); mark_metapage_dirty(mp); release_metapage(mp); /* * Logging is now directed by xtree tlocks */ clear_cflag(COMMIT_Dirtable, ip); } offset = (index - 2) * sizeof(struct dir_table_slot); page_offset = offset & (PSIZE - 1); blkno = ((offset + 1) >> L2PSIZE) << sbi->l2nbperpage; if (page_offset == 0) { /* * This will be the beginning of a new page */ xaddr = 0; if (xtInsert(tid, ip, 0, blkno, sbi->nbperpage, &xaddr, 0)) { jfs_warn("add_index: xtInsert failed!"); goto clean_up; } ip->i_size += PSIZE; if ((mp = get_index_page(ip, blkno))) memset(mp->data, 0, PSIZE); /* Just looks better */ else xtTruncate(tid, ip, offset, COMMIT_PWMAP); } else mp = read_index_page(ip, blkno); if (!mp) { jfs_err("add_index: get/read_metapage failed!"); goto clean_up; } lock_index(tid, ip, mp, index); dirtab_slot = (struct dir_table_slot *) ((char *) mp->data + page_offset); dirtab_slot->flag = DIR_INDEX_VALID; dirtab_slot->slot = slot; DTSaddress(dirtab_slot, bn); mark_metapage_dirty(mp); release_metapage(mp); return index; clean_up: jfs_ip->next_index--; return 0; } /* * free_index() * * Marks an entry to the directory index table as free. 
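 * * (freed slots stay chained: the flag becomes DIR_INDEX_FREE and addr2 holds the next free index, which jfs_readdir() follows when a stale cookie lands on a freed entry)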
*/ static void free_index(tid_t tid, struct inode *ip, u32 index, u32 next) { struct dir_table_slot *dirtab_slot; s64 lblock; struct metapage *mp = NULL; dirtab_slot = find_index(ip, index, &mp, &lblock); if (!dirtab_slot) return; dirtab_slot->flag = DIR_INDEX_FREE; dirtab_slot->slot = dirtab_slot->addr1 = 0; dirtab_slot->addr2 = cpu_to_le32(next); if (mp) { lock_index(tid, ip, mp, index); mark_metapage_dirty(mp); release_metapage(mp); } else set_cflag(COMMIT_Dirtable, ip); } /* * modify_index() * * Changes an entry in the directory index table */ static void modify_index(tid_t tid, struct inode *ip, u32 index, s64 bn, int slot, struct metapage ** mp, s64 *lblock) { struct dir_table_slot *dirtab_slot; dirtab_slot = find_index(ip, index, mp, lblock); if (!dirtab_slot) return; DTSaddress(dirtab_slot, bn); dirtab_slot->slot = slot; if (*mp) { lock_index(tid, ip, *mp, index); mark_metapage_dirty(*mp); } else set_cflag(COMMIT_Dirtable, ip); } /* * read_index() * * reads a directory table slot */ static int read_index(struct inode *ip, u32 index, struct dir_table_slot * dirtab_slot) { s64 lblock; struct metapage *mp = NULL; struct dir_table_slot *slot; slot = find_index(ip, index, &mp, &lblock); if (!slot) { return -EIO; } memcpy(dirtab_slot, slot, sizeof(struct dir_table_slot)); if (mp) release_metapage(mp); return 0; } /* * dtSearch() * * function: * Search for the entry with specified key * * parameter: * * return: 0 - search result on stack, leaf page pinned; * errno - I/O error */ int dtSearch(struct inode *ip, struct component_name * key, ino_t * data, struct btstack * btstack, int flag) { int rc = 0; int cmp = 1; /* init for empty page */ s64 bn; struct metapage *mp; dtpage_t *p; s8 *stbl; int base, index, lim; struct btframe *btsp; pxd_t *pxd; int psize = 288; /* initial in-line directory */ ino_t inumber; struct component_name ciKey; struct super_block *sb = ip->i_sb; ciKey.name = kmalloc_array(JFS_NAME_MAX + 1, sizeof(wchar_t), GFP_NOFS); if (!ciKey.name) { rc = -ENOMEM; goto dtSearch_Exit2; } /* uppercase search key for c-i directory */ UniStrcpy(ciKey.name, key->name); ciKey.namlen = key->namlen; /* only uppercase if case-insensitive support is on */ if ((JFS_SBI(sb)->mntflag & JFS_OS2) == JFS_OS2) { ciToUpper(&ciKey); } BT_CLR(btstack); /* reset stack */ /* init level count for max pages to split */ btstack->nsplit = 1; /* * search down tree from root: * * between two consecutive entries of <Ki, Pi> and <Kj, Pj> of * internal page, child page Pi contains entry with k, Ki <= K < Kj. * * if entry with search key K is not found * internal page search find the entry with largest key Ki * less than K which point to the child page to search; * leaf page search find the entry with smallest key Kj * greater than K so that the returned index is the position of * the entry to be shifted right for insertion of new entry. * for empty tree, search key is greater than any key of the tree. * * by convention, root bn = 0. */ for (bn = 0;;) { /* get/pin the page to search */ DT_GETPAGE(ip, bn, mp, psize, p, rc); if (rc) goto dtSearch_Exit1; /* get sorted entry table of the page */ stbl = DT_GETSTBL(p); /* * binary search with search key K on the current page. 
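 * (an open-coded bsearch over the sorted stbl[]: base and lim bracket the candidates, each pass probes index = base + lim/2, keeping the left half by halving lim or moving base past the probe when cmp > 0)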
*/ for (base = 0, lim = p->header.nextindex; lim; lim >>= 1) { index = base + (lim >> 1); if (stbl[index] < 0) { rc = -EIO; goto out; } if (p->header.flag & BT_LEAF) { /* uppercase leaf name to compare */ cmp = ciCompare(&ciKey, p, stbl[index], JFS_SBI(sb)->mntflag); } else { /* router key is in uppercase */ cmp = dtCompare(&ciKey, p, stbl[index]); } if (cmp == 0) { /* * search hit */ /* search hit - leaf page: * return the entry found */ if (p->header.flag & BT_LEAF) { inumber = le32_to_cpu( ((struct ldtentry *) & p->slot[stbl[index]])->inumber); /* * search for JFS_LOOKUP */ if (flag == JFS_LOOKUP) { *data = inumber; rc = 0; goto out; } /* * search for JFS_CREATE */ if (flag == JFS_CREATE) { *data = inumber; rc = -EEXIST; goto out; } /* * search for JFS_REMOVE or JFS_RENAME */ if ((flag == JFS_REMOVE || flag == JFS_RENAME) && *data != inumber) { rc = -ESTALE; goto out; } /* * JFS_REMOVE|JFS_FINDDIR|JFS_RENAME */ /* save search result */ *data = inumber; btsp = btstack->top; btsp->bn = bn; btsp->index = index; btsp->mp = mp; rc = 0; goto dtSearch_Exit1; } /* search hit - internal page: * descend/search its child page */ goto getChild; } if (cmp > 0) { base = index + 1; --lim; } } /* * search miss * * base is the smallest index with key (Kj) greater than * search key (K) and may be zero or (maxindex + 1) index. */ /* * search miss - leaf page * * return location of entry (base) where new entry with * search key K is to be inserted. */ if (p->header.flag & BT_LEAF) { /* * search for JFS_LOOKUP, JFS_REMOVE, or JFS_RENAME */ if (flag == JFS_LOOKUP || flag == JFS_REMOVE || flag == JFS_RENAME) { rc = -ENOENT; goto out; } /* * search for JFS_CREATE|JFS_FINDDIR: * * save search result */ *data = 0; btsp = btstack->top; btsp->bn = bn; btsp->index = base; btsp->mp = mp; rc = 0; goto dtSearch_Exit1; } /* * search miss - internal page * * if base is non-zero, decrement base by one to get the parent * entry of the child page to search. */ index = base ? base - 1 : base; /* * go down to child page */ getChild: /* update max. number of pages to split */ if (BT_STACK_FULL(btstack)) { /* Something's corrupted, mark filesystem dirty so * chkdsk will fix it. */ jfs_error(sb, "stack overrun!\n"); BT_STACK_DUMP(btstack); rc = -EIO; goto out; } btstack->nsplit++; /* push (bn, index) of the parent page/entry */ BT_PUSH(btstack, bn, index); /* get the child page block number */ pxd = (pxd_t *) & p->slot[stbl[index]]; bn = addressPXD(pxd); psize = lengthPXD(pxd) << JFS_SBI(ip->i_sb)->l2bsize; /* unpin the parent page */ DT_PUTPAGE(mp); } out: DT_PUTPAGE(mp); dtSearch_Exit1: kfree(ciKey.name); dtSearch_Exit2: return rc; } /* * dtInsert() * * function: insert an entry to directory tree * * parameter: * * return: 0 - success; * errno - failure; */ int dtInsert(tid_t tid, struct inode *ip, struct component_name * name, ino_t * fsn, struct btstack * btstack) { int rc = 0; struct metapage *mp; /* meta-page buffer */ dtpage_t *p; /* base B+-tree index page */ s64 bn; int index; struct dtsplit split; /* split information */ ddata_t data; struct dt_lock *dtlck; int n; struct tlock *tlck; struct lv *lv; /* * retrieve search result * * dtSearch() returns (leaf page pinned, index at which to insert). * n.b. dtSearch() may return index of (maxindex + 1) of * the full page. 
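 * (note the guard below: a leaf whose header.freelist is 0 is rejected with -EINVAL before any insert is attempted, protecting against a corrupt on-disk page) */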
*/ DT_GETSEARCH(ip, btstack->top, bn, mp, p, index); if (p->header.freelist == 0) return -EINVAL; /* * insert entry for new key */ if (DO_INDEX(ip)) { if (JFS_IP(ip)->next_index == DIREND) { DT_PUTPAGE(mp); return -EMLINK; } n = NDTLEAF(name->namlen); data.leaf.tid = tid; data.leaf.ip = ip; } else { n = NDTLEAF_LEGACY(name->namlen); data.leaf.ip = NULL; /* signifies legacy directory format */ } data.leaf.ino = *fsn; /* * leaf page does not have enough room for new entry: * * extend/split the leaf page; * * dtSplitUp() will insert the entry and unpin the leaf page. */ if (n > p->header.freecnt) { split.mp = mp; split.index = index; split.nslot = n; split.key = name; split.data = &data; rc = dtSplitUp(tid, ip, &split, btstack); return rc; } /* * leaf page does have enough room for new entry: * * insert the new data entry into the leaf page; */ BT_MARK_DIRTY(mp, ip); /* * acquire a transaction lock on the leaf page */ tlck = txLock(tid, ip, mp, tlckDTREE | tlckENTRY); dtlck = (struct dt_lock *) & tlck->lock; ASSERT(dtlck->index == 0); lv = & dtlck->lv[0]; /* linelock header */ lv->offset = 0; lv->length = 1; dtlck->index++; dtInsertEntry(p, index, name, &data, &dtlck); /* linelock stbl of non-root leaf page */ if (!(p->header.flag & BT_ROOT)) { if (dtlck->index >= dtlck->maxcnt) dtlck = (struct dt_lock *) txLinelock(dtlck); lv = & dtlck->lv[dtlck->index]; n = index >> L2DTSLOTSIZE; lv->offset = p->header.stblindex + n; lv->length = ((p->header.nextindex - 1) >> L2DTSLOTSIZE) - n + 1; dtlck->index++; } /* unpin the leaf page */ DT_PUTPAGE(mp); return 0; } /* * dtSplitUp() * * function: propagate insertion bottom up; * * parameter: * * return: 0 - success; * errno - failure; * leaf page unpinned; */ static int dtSplitUp(tid_t tid, struct inode *ip, struct dtsplit * split, struct btstack * btstack) { struct jfs_sb_info *sbi = JFS_SBI(ip->i_sb); int rc = 0; struct metapage *smp; dtpage_t *sp; /* split page */ struct metapage *rmp; dtpage_t *rp; /* new right page split from sp */ pxd_t rpxd; /* new right page extent descriptor */ struct metapage *lmp; dtpage_t *lp; /* left child page */ int skip; /* index of entry of insertion */ struct btframe *parent; /* parent page entry on traverse stack */ s64 xaddr, nxaddr; int xlen, xsize; struct pxdlist pxdlist; pxd_t *pxd; struct component_name key = { 0, NULL }; ddata_t *data = split->data; int n; struct dt_lock *dtlck; struct tlock *tlck; struct lv *lv; int quota_allocation = 0; /* get split page */ smp = split->mp; sp = DT_PAGE(ip, smp); key.name = kmalloc_array(JFS_NAME_MAX + 2, sizeof(wchar_t), GFP_NOFS); if (!key.name) { DT_PUTPAGE(smp); rc = -ENOMEM; goto dtSplitUp_Exit; } /* * split leaf page * * The split routines insert the new entry, and * acquire txLock as appropriate. 
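 * * (three cases follow: a root leaf goes through dtSplitRoot(), a first/only leaf still smaller than PSIZE is grown in place by dtExtendPage(), and a full-size leaf is split by dtSplitPage() with the new router entry then propagated up the btstack)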
 */ /* * split root leaf page: */ if (sp->header.flag & BT_ROOT) { /* * allocate a single extent child page */ xlen = 1; n = sbi->bsize >> L2DTSLOTSIZE; n -= (n + 31) >> L2DTSLOTSIZE; /* stbl size */ n -= DTROOTMAXSLOT - sp->header.freecnt; /* header + entries */ if (n <= split->nslot) xlen++; if ((rc = dbAlloc(ip, 0, (s64) xlen, &xaddr))) { DT_PUTPAGE(smp); goto freeKeyName; } pxdlist.maxnpxd = 1; pxdlist.npxd = 0; pxd = &pxdlist.pxd[0]; PXDaddress(pxd, xaddr); PXDlength(pxd, xlen); split->pxdlist = &pxdlist; rc = dtSplitRoot(tid, ip, split, &rmp); if (rc) dbFree(ip, xaddr, xlen); else DT_PUTPAGE(rmp); DT_PUTPAGE(smp); if (!DO_INDEX(ip)) ip->i_size = xlen << sbi->l2bsize; goto freeKeyName; } /* * extend first leaf page * * extend the 1st extent if less than buffer page size * (dtExtendPage() returns leaf page unpinned) */ pxd = &sp->header.self; xlen = lengthPXD(pxd); xsize = xlen << sbi->l2bsize; if (xsize < PSIZE) { xaddr = addressPXD(pxd); n = xsize >> L2DTSLOTSIZE; n -= (n + 31) >> L2DTSLOTSIZE; /* stbl size */ if ((n + sp->header.freecnt) <= split->nslot) n = xlen + (xlen << 1); else n = xlen; /* Allocate blocks to quota. */ rc = dquot_alloc_block(ip, n); if (rc) goto extendOut; quota_allocation += n; if ((rc = dbReAlloc(sbi->ipbmap, xaddr, (s64) xlen, (s64) n, &nxaddr))) goto extendOut; pxdlist.maxnpxd = 1; pxdlist.npxd = 0; pxd = &pxdlist.pxd[0]; PXDaddress(pxd, nxaddr); PXDlength(pxd, xlen + n); split->pxdlist = &pxdlist; if ((rc = dtExtendPage(tid, ip, split, btstack))) { nxaddr = addressPXD(pxd); if (xaddr != nxaddr) { /* free relocated extent */ xlen = lengthPXD(pxd); dbFree(ip, nxaddr, (s64) xlen); } else { /* free extended delta */ xlen = lengthPXD(pxd) - n; xaddr = addressPXD(pxd) + xlen; dbFree(ip, xaddr, (s64) n); } } else if (!DO_INDEX(ip)) ip->i_size = lengthPXD(pxd) << sbi->l2bsize; extendOut: DT_PUTPAGE(smp); goto freeKeyName; } /* * split leaf page <sp> into <sp> and a new right page <rp>. * * return <rp> pinned and its extent descriptor <rpxd> */ /* * allocate new directory page extent and * new index page(s) to cover page split(s) * * allocation hint: ? */ n = btstack->nsplit; pxdlist.maxnpxd = pxdlist.npxd = 0; xlen = sbi->nbperpage; for (pxd = pxdlist.pxd; n > 0; n--, pxd++) { if ((rc = dbAlloc(ip, 0, (s64) xlen, &xaddr)) == 0) { PXDaddress(pxd, xaddr); PXDlength(pxd, xlen); pxdlist.maxnpxd++; continue; } DT_PUTPAGE(smp); /* undo allocation */ goto splitOut; } split->pxdlist = &pxdlist; if ((rc = dtSplitPage(tid, ip, split, &rmp, &rp, &rpxd))) { DT_PUTPAGE(smp); /* undo allocation */ goto splitOut; } if (!DO_INDEX(ip)) ip->i_size += PSIZE; /* * propagate up the router entry for the leaf page just split * * insert a router entry for the new page into the parent page, * propagate the insert/split up the tree by walking back the stack * of (bn of parent page, index of child page entry in parent page) * that were traversed during the search for the page that split. * * the propagation of insert/split up the tree stops if the root * splits or the page inserted into doesn't have to split to hold * the new entry. * * the parent entry for the split page remains the same, and * a new entry is inserted at its right with the first key and * block number of the new right page. * * There are a maximum of 4 pages pinned at any time: * two children, left parent and right parent (when the parent splits). * keep the child pages pinned while working on the parent. * make sure that all pins are released at exit. 
*/ while ((parent = BT_POP(btstack)) != NULL) { /* parent page specified by stack frame <parent> */ /* keep current child pages (<lp>, <rp>) pinned */ lmp = smp; lp = sp; /* * insert router entry in parent for new right child page <rp> */ /* get the parent page <sp> */ DT_GETPAGE(ip, parent->bn, smp, PSIZE, sp, rc); if (rc) { DT_PUTPAGE(lmp); DT_PUTPAGE(rmp); goto splitOut; } /* * The new key entry goes ONE AFTER the index of parent entry, * because the split was to the right. */ skip = parent->index + 1; /* * compute the key for the router entry * * key suffix compression: * for internal pages that have leaf pages as children, * retain only what's needed to distinguish between * the new entry and the entry on the page to its left. * If the keys compare equal, retain the entire key. * * note that compression is performed only at computing * router key at the lowest internal level. * further compression of the key between pairs of higher * level internal pages loses too much information and * the search may fail. * (e.g., two adjacent leaf pages of {a, ..., x} {xx, ...,} * results in two adjacent parent entries (a)(xx). * if split occurs between these two entries, and * if compression is applied, the router key of parent entry * of right page (x) will divert search for x into right * subtree and miss x in the left subtree.) * * the entire key must be retained for the next-to-leftmost * internal key at any level of the tree, or search may fail * (e.g., ?) */ switch (rp->header.flag & BT_TYPE) { case BT_LEAF: /* * compute the length of prefix for suffix compression * between last entry of left page and first entry * of right page */ if ((sp->header.flag & BT_ROOT && skip > 1) || sp->header.prev != 0 || skip > 1) { /* compute uppercase router prefix key */ rc = ciGetLeafPrefixKey(lp, lp->header.nextindex-1, rp, 0, &key, sbi->mntflag); if (rc) { DT_PUTPAGE(lmp); DT_PUTPAGE(rmp); DT_PUTPAGE(smp); goto splitOut; } } else { /* next to leftmost entry of lowest internal level */ /* compute uppercase router key */ dtGetKey(rp, 0, &key, sbi->mntflag); key.name[key.namlen] = 0; if ((sbi->mntflag & JFS_OS2) == JFS_OS2) ciToUpper(&key); } n = NDTINTERNAL(key.namlen); break; case BT_INTERNAL: dtGetKey(rp, 0, &key, sbi->mntflag); n = NDTINTERNAL(key.namlen); break; default: jfs_err("dtSplitUp(): UFO!"); break; } /* unpin left child page */ DT_PUTPAGE(lmp); /* * compute the data for the router entry */ data->xd = rpxd; /* child page xd */ /* * parent page is full - split the parent page */ if (n > sp->header.freecnt) { /* init for parent page split */ split->mp = smp; split->index = skip; /* index at insert */ split->nslot = n; split->key = &key; /* split->data = data; */ /* unpin right child page */ DT_PUTPAGE(rmp); /* The split routines insert the new entry, * acquire txLock as appropriate. * return <rp> pinned and its block number <rbn>. */ rc = (sp->header.flag & BT_ROOT) ? 
dtSplitRoot(tid, ip, split, &rmp) : dtSplitPage(tid, ip, split, &rmp, &rp, &rpxd); if (rc) { DT_PUTPAGE(smp); goto splitOut; } /* smp and rmp are pinned */ } /* * parent page is not full - insert router entry in parent page */ else { BT_MARK_DIRTY(smp, ip); /* * acquire a transaction lock on the parent page */ tlck = txLock(tid, ip, smp, tlckDTREE | tlckENTRY); dtlck = (struct dt_lock *) & tlck->lock; ASSERT(dtlck->index == 0); lv = & dtlck->lv[0]; /* linelock header */ lv->offset = 0; lv->length = 1; dtlck->index++; /* linelock stbl of non-root parent page */ if (!(sp->header.flag & BT_ROOT)) { lv++; n = skip >> L2DTSLOTSIZE; lv->offset = sp->header.stblindex + n; lv->length = ((sp->header.nextindex - 1) >> L2DTSLOTSIZE) - n + 1; dtlck->index++; } dtInsertEntry(sp, skip, &key, data, &dtlck); /* exit propagate up */ break; } } /* unpin current split and its right page */ DT_PUTPAGE(smp); DT_PUTPAGE(rmp); /* * free remaining extents allocated for split */ splitOut: n = pxdlist.npxd; pxd = &pxdlist.pxd[n]; for (; n < pxdlist.maxnpxd; n++, pxd++) dbFree(ip, addressPXD(pxd), (s64) lengthPXD(pxd)); freeKeyName: kfree(key.name); /* Rollback quota allocation */ if (rc && quota_allocation) dquot_free_block(ip, quota_allocation); dtSplitUp_Exit: return rc; } /* * dtSplitPage() * * function: Split a non-root page of a btree. * * parameter: * * return: 0 - success; * errno - failure; * return split and new page pinned; */ static int dtSplitPage(tid_t tid, struct inode *ip, struct dtsplit * split, struct metapage ** rmpp, dtpage_t ** rpp, pxd_t * rpxdp) { int rc = 0; struct metapage *smp; dtpage_t *sp; struct metapage *rmp; dtpage_t *rp; /* new right page allocated */ s64 rbn; /* new right page block number */ struct metapage *mp; dtpage_t *p; s64 nextbn; struct pxdlist *pxdlist; pxd_t *pxd; int skip, nextindex, half, left, nxt, off, si; struct ldtentry *ldtentry; struct idtentry *idtentry; u8 *stbl; struct dtslot *f; int fsi, stblsize; int n; struct dt_lock *sdtlck, *rdtlck; struct tlock *tlck; struct dt_lock *dtlck; struct lv *slv, *rlv, *lv; /* get split page */ smp = split->mp; sp = DT_PAGE(ip, smp); /* * allocate the new right page for the split */ pxdlist = split->pxdlist; pxd = &pxdlist->pxd[pxdlist->npxd]; pxdlist->npxd++; rbn = addressPXD(pxd); rmp = get_metapage(ip, rbn, PSIZE, 1); if (rmp == NULL) return -EIO; /* Allocate blocks to quota. 
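 * (charged in filesystem blocks, lengthPXD(pxd) for the new right page; if the charge fails the just-pinned rmp is released and the error returned)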
 */ rc = dquot_alloc_block(ip, lengthPXD(pxd)); if (rc) { release_metapage(rmp); return rc; } jfs_info("dtSplitPage: ip:0x%p smp:0x%p rmp:0x%p", ip, smp, rmp); BT_MARK_DIRTY(rmp, ip); /* * acquire a transaction lock on the new right page */ tlck = txLock(tid, ip, rmp, tlckDTREE | tlckNEW); rdtlck = (struct dt_lock *) & tlck->lock; rp = (dtpage_t *) rmp->data; *rpp = rp; rp->header.self = *pxd; BT_MARK_DIRTY(smp, ip); /* * acquire a transaction lock on the split page * * action: */ tlck = txLock(tid, ip, smp, tlckDTREE | tlckENTRY); sdtlck = (struct dt_lock *) & tlck->lock; /* linelock header of split page */ ASSERT(sdtlck->index == 0); slv = & sdtlck->lv[0]; slv->offset = 0; slv->length = 1; sdtlck->index++; /* * initialize/update sibling pointers between sp and rp */ nextbn = le64_to_cpu(sp->header.next); rp->header.next = cpu_to_le64(nextbn); rp->header.prev = cpu_to_le64(addressPXD(&sp->header.self)); sp->header.next = cpu_to_le64(rbn); /* * initialize new right page */ rp->header.flag = sp->header.flag; /* compute sorted entry table at start of extent data area */ rp->header.nextindex = 0; rp->header.stblindex = 1; n = PSIZE >> L2DTSLOTSIZE; rp->header.maxslot = n; stblsize = (n + 31) >> L2DTSLOTSIZE; /* in unit of slot */ /* init freelist */ fsi = rp->header.stblindex + stblsize; rp->header.freelist = fsi; rp->header.freecnt = rp->header.maxslot - fsi; /* * sequential append at tail: append without split * * If splitting the last page on a level because of appending * an entry to it (skip is maxentry), it's likely that the access is * sequential. Adding an empty page on the side of the level is less * work and can push the fill factor much higher than normal. * If we're wrong it's no big deal, we'll just do the split the right * way next time. * (It may look like it's equally easy to do a similar hack for * reverse sorted data, that is, split the tree left, * but it's not. Be my guest.) */ if (nextbn == 0 && split->index == sp->header.nextindex) { /* linelock header + stbl (first slot) of new page */ rlv = & rdtlck->lv[rdtlck->index]; rlv->offset = 0; rlv->length = 2; rdtlck->index++; /* * initialize freelist of new right page */ f = &rp->slot[fsi]; for (fsi++; fsi < rp->header.maxslot; f++, fsi++) f->next = fsi; f->next = -1; /* insert entry at the first entry of the new right page */ dtInsertEntry(rp, 0, split->key, split->data, &rdtlck); goto out; } /* * non-sequential insert (at possibly middle page) */ /* * update prev pointer of previous right sibling page; */ if (nextbn != 0) { DT_GETPAGE(ip, nextbn, mp, PSIZE, p, rc); if (rc) { discard_metapage(rmp); return rc; } BT_MARK_DIRTY(mp, ip); /* * acquire a transaction lock on the next page */ tlck = txLock(tid, ip, mp, tlckDTREE | tlckRELINK); jfs_info("dtSplitPage: tlck = 0x%p, ip = 0x%p, mp=0x%p", tlck, ip, mp); dtlck = (struct dt_lock *) & tlck->lock; /* linelock header of previous right sibling page */ lv = & dtlck->lv[dtlck->index]; lv->offset = 0; lv->length = 1; dtlck->index++; p->header.prev = cpu_to_le64(rbn); DT_PUTPAGE(mp); } /* * split the data between the split and right pages. 
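 * (fill-factor sketch: walk stbl[] summing per-entry slot counts into <left> until half the page is reached; entries from <nxt> onward move to the right page, and comparing <skip> with <off> decides which page receives the new entry)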
 */ skip = split->index; half = (PSIZE >> L2DTSLOTSIZE) >> 1; /* swag */ left = 0; /* * compute fill factor for split pages * * <nxt> traces the next entry to move to rp * <off> traces the next entry to stay in sp */ stbl = (u8 *) & sp->slot[sp->header.stblindex]; nextindex = sp->header.nextindex; for (nxt = off = 0; nxt < nextindex; ++off) { if (off == skip) /* check for fill factor with new entry size */ n = split->nslot; else { si = stbl[nxt]; switch (sp->header.flag & BT_TYPE) { case BT_LEAF: ldtentry = (struct ldtentry *) & sp->slot[si]; if (DO_INDEX(ip)) n = NDTLEAF(ldtentry->namlen); else n = NDTLEAF_LEGACY(ldtentry->namlen); break; case BT_INTERNAL: idtentry = (struct idtentry *) & sp->slot[si]; n = NDTINTERNAL(idtentry->namlen); break; default: break; } ++nxt; /* advance to next entry to move in sp */ } left += n; if (left >= half) break; } /* <nxt> points to the 1st entry to move */ /* * move entries to right page * * dtMoveEntry() initializes rp and reserves entry for insertion * * split page moved out entries are linelocked; * new/right page moved in entries are linelocked; */ /* linelock header + stbl of new right page */ rlv = & rdtlck->lv[rdtlck->index]; rlv->offset = 0; rlv->length = 5; rdtlck->index++; dtMoveEntry(sp, nxt, rp, &sdtlck, &rdtlck, DO_INDEX(ip)); sp->header.nextindex = nxt; /* * finalize freelist of new right page */ fsi = rp->header.freelist; f = &rp->slot[fsi]; for (fsi++; fsi < rp->header.maxslot; f++, fsi++) f->next = fsi; f->next = -1; /* * Update directory index table for entries now in right page */ if ((rp->header.flag & BT_LEAF) && DO_INDEX(ip)) { s64 lblock; mp = NULL; stbl = DT_GETSTBL(rp); for (n = 0; n < rp->header.nextindex; n++) { ldtentry = (struct ldtentry *) & rp->slot[stbl[n]]; modify_index(tid, ip, le32_to_cpu(ldtentry->index), rbn, n, &mp, &lblock); } if (mp) release_metapage(mp); } /* * the skipped index was on the left page, */ if (skip <= off) { /* insert the new entry in the split page */ dtInsertEntry(sp, skip, split->key, split->data, &sdtlck); /* linelock stbl of split page */ if (sdtlck->index >= sdtlck->maxcnt) sdtlck = (struct dt_lock *) txLinelock(sdtlck); slv = & sdtlck->lv[sdtlck->index]; n = skip >> L2DTSLOTSIZE; slv->offset = sp->header.stblindex + n; slv->length = ((sp->header.nextindex - 1) >> L2DTSLOTSIZE) - n + 1; sdtlck->index++; } /* * the skipped index was on the right page, */ else { /* adjust the skip index to reflect the new position */ skip -= nxt; /* insert the new entry in the right page */ dtInsertEntry(rp, skip, split->key, split->data, &rdtlck); } out: *rmpp = rmp; *rpxdp = *pxd; return rc; } /* * dtExtendPage() * * function: extend 1st/only directory leaf page * * parameter: * * return: 0 - success; * errno - failure; * return extended page pinned; */ static int dtExtendPage(tid_t tid, struct inode *ip, struct dtsplit * split, struct btstack * btstack) { struct super_block *sb = ip->i_sb; int rc; struct metapage *smp, *pmp, *mp; dtpage_t *sp, *pp; struct pxdlist *pxdlist; pxd_t *pxd, *tpxd; int xlen, xsize; int newstblindex, newstblsize; int oldstblindex, oldstblsize; int fsi, last; struct dtslot *f; struct btframe *parent; int n; struct dt_lock *dtlck; s64 xaddr, txaddr; struct tlock *tlck; struct pxd_lock *pxdlock; struct lv *lv; uint type; struct ldtentry *ldtentry; u8 *stbl; /* get page to extend */ smp = split->mp; sp = DT_PAGE(ip, smp); /* get parent/root page */ parent = BT_POP(btstack); DT_GETPAGE(ip, parent->bn, pmp, PSIZE, pp, rc); if (rc) return (rc); /* * extend the extent */ pxdlist = 
split->pxdlist; pxd = &pxdlist->pxd[pxdlist->npxd]; pxdlist->npxd++; xaddr = addressPXD(pxd); tpxd = &sp->header.self; txaddr = addressPXD(tpxd); /* in-place extension */ if (xaddr == txaddr) { type = tlckEXTEND; } /* relocation */ else { type = tlckNEW; /* save moved extent descriptor for later free */ tlck = txMaplock(tid, ip, tlckDTREE | tlckRELOCATE); pxdlock = (struct pxd_lock *) & tlck->lock; pxdlock->flag = mlckFREEPXD; pxdlock->pxd = sp->header.self; pxdlock->index = 1; /* * Update directory index table to reflect new page address */ if (DO_INDEX(ip)) { s64 lblock; mp = NULL; stbl = DT_GETSTBL(sp); for (n = 0; n < sp->header.nextindex; n++) { ldtentry = (struct ldtentry *) & sp->slot[stbl[n]]; modify_index(tid, ip, le32_to_cpu(ldtentry->index), xaddr, n, &mp, &lblock); } if (mp) release_metapage(mp); } } /* * extend the page */ sp->header.self = *pxd; jfs_info("dtExtendPage: ip:0x%p smp:0x%p sp:0x%p", ip, smp, sp); BT_MARK_DIRTY(smp, ip); /* * acquire a transaction lock on the extended/leaf page */ tlck = txLock(tid, ip, smp, tlckDTREE | type); dtlck = (struct dt_lock *) & tlck->lock; lv = & dtlck->lv[0]; /* update buffer extent descriptor of extended page */ xlen = lengthPXD(pxd); xsize = xlen << JFS_SBI(sb)->l2bsize; /* * copy old stbl to new stbl at start of extended area */ oldstblindex = sp->header.stblindex; oldstblsize = (sp->header.maxslot + 31) >> L2DTSLOTSIZE; newstblindex = sp->header.maxslot; n = xsize >> L2DTSLOTSIZE; newstblsize = (n + 31) >> L2DTSLOTSIZE; memcpy(&sp->slot[newstblindex], &sp->slot[oldstblindex], sp->header.nextindex); /* * in-line extension: linelock old area of extended page */ if (type == tlckEXTEND) { /* linelock header */ lv->offset = 0; lv->length = 1; dtlck->index++; lv++; /* linelock new stbl of extended page */ lv->offset = newstblindex; lv->length = newstblsize; } /* * relocation: linelock whole relocated area */ else { lv->offset = 0; lv->length = sp->header.maxslot + newstblsize; } dtlck->index++; sp->header.maxslot = n; sp->header.stblindex = newstblindex; /* sp->header.nextindex remains the same */ /* * add old stbl region at head of freelist */ fsi = oldstblindex; f = &sp->slot[fsi]; last = sp->header.freelist; for (n = 0; n < oldstblsize; n++, fsi++, f++) { f->next = last; last = fsi; } sp->header.freelist = last; sp->header.freecnt += oldstblsize; /* * append free region of newly extended area at tail of freelist */ /* init free region of newly extended area */ fsi = n = newstblindex + newstblsize; f = &sp->slot[fsi]; for (fsi++; fsi < sp->header.maxslot; f++, fsi++) f->next = fsi; f->next = -1; /* append new free region at tail of old freelist */ fsi = sp->header.freelist; if (fsi == -1) sp->header.freelist = n; else { do { f = &sp->slot[fsi]; fsi = f->next; } while (fsi != -1); f->next = n; } sp->header.freecnt += sp->header.maxslot - n; /* * insert the new entry */ dtInsertEntry(sp, split->index, split->key, split->data, &dtlck); BT_MARK_DIRTY(pmp, ip); /* * linelock any freeslots residing in old extent */ if (type == tlckEXTEND) { n = sp->header.maxslot >> 2; if (sp->header.freelist < n) dtLinelockFreelist(sp, n, &dtlck); } /* * update parent entry on the parent/root page */ /* * acquire a transaction lock on the parent/root page */ tlck = txLock(tid, ip, pmp, tlckDTREE | tlckENTRY); dtlck = (struct dt_lock *) & tlck->lock; lv = & dtlck->lv[dtlck->index]; /* linelock parent entry - 1st slot */ lv->offset = 1; lv->length = 1; dtlck->index++; /* update the parent pxd for page extension */ tpxd = (pxd_t *) & pp->slot[1]; *tpxd = *pxd; 
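/* (the parent popped off the btstack keeps a single router entry for this first/only leaf - the zero-length leftmost key - so refreshing its pxd in slot[1] is the only parent update needed) */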
DT_PUTPAGE(pmp); return 0; } /* * dtSplitRoot() * * function: * split the full root page into * original/root/split page and new right page * i.e., root remains fixed in tree anchor (inode) and * the root is copied to a single new right child page * since root page << non-root page, and * the split root page contains a single entry for the * new right child page. * * parameter: * * return: 0 - success; * errno - failure; * return new page pinned; */ static int dtSplitRoot(tid_t tid, struct inode *ip, struct dtsplit * split, struct metapage ** rmpp) { struct super_block *sb = ip->i_sb; struct metapage *smp; dtroot_t *sp; struct metapage *rmp; dtpage_t *rp; s64 rbn; int xlen; int xsize; struct dtslot *f; s8 *stbl; int fsi, stblsize, n; struct idtentry *s; pxd_t *ppxd; struct pxdlist *pxdlist; pxd_t *pxd; struct dt_lock *dtlck; struct tlock *tlck; struct lv *lv; int rc; /* get split root page */ smp = split->mp; sp = &JFS_IP(ip)->i_dtroot; /* * allocate/initialize a single (right) child page * * N.B. at first split, a one (or two) block to fit new entry * is allocated; at subsequent split, a full page is allocated; */ pxdlist = split->pxdlist; pxd = &pxdlist->pxd[pxdlist->npxd]; pxdlist->npxd++; rbn = addressPXD(pxd); xlen = lengthPXD(pxd); xsize = xlen << JFS_SBI(sb)->l2bsize; rmp = get_metapage(ip, rbn, xsize, 1); if (!rmp) return -EIO; rp = rmp->data; /* Allocate blocks to quota. */ rc = dquot_alloc_block(ip, lengthPXD(pxd)); if (rc) { release_metapage(rmp); return rc; } BT_MARK_DIRTY(rmp, ip); /* * acquire a transaction lock on the new right page */ tlck = txLock(tid, ip, rmp, tlckDTREE | tlckNEW); dtlck = (struct dt_lock *) & tlck->lock; rp->header.flag = (sp->header.flag & BT_LEAF) ? BT_LEAF : BT_INTERNAL; rp->header.self = *pxd; /* initialize sibling pointers */ rp->header.next = 0; rp->header.prev = 0; /* * move in-line root page into new right page extent */ /* linelock header + copied entries + new stbl (1st slot) in new page */ ASSERT(dtlck->index == 0); lv = & dtlck->lv[0]; lv->offset = 0; lv->length = 10; /* 1 + 8 + 1 */ dtlck->index++; n = xsize >> L2DTSLOTSIZE; rp->header.maxslot = n; stblsize = (n + 31) >> L2DTSLOTSIZE; /* copy old stbl to new stbl at start of extended area */ rp->header.stblindex = DTROOTMAXSLOT; stbl = (s8 *) & rp->slot[DTROOTMAXSLOT]; memcpy(stbl, sp->header.stbl, sp->header.nextindex); rp->header.nextindex = sp->header.nextindex; /* copy old data area to start of new data area */ memcpy(&rp->slot[1], &sp->slot[1], IDATASIZE); /* * append free region of newly extended area at tail of freelist */ /* init free region of newly extended area */ fsi = n = DTROOTMAXSLOT + stblsize; f = &rp->slot[fsi]; for (fsi++; fsi < rp->header.maxslot; f++, fsi++) f->next = fsi; f->next = -1; /* append new free region at tail of old freelist */ fsi = sp->header.freelist; if (fsi == -1) rp->header.freelist = n; else { rp->header.freelist = fsi; do { f = &rp->slot[fsi]; fsi = f->next; } while (fsi >= 0); f->next = n; } rp->header.freecnt = sp->header.freecnt + rp->header.maxslot - n; /* * Update directory index table for entries now in right page */ if ((rp->header.flag & BT_LEAF) && DO_INDEX(ip)) { s64 lblock; struct metapage *mp = NULL; struct ldtentry *ldtentry; stbl = DT_GETSTBL(rp); for (n = 0; n < rp->header.nextindex; n++) { ldtentry = (struct ldtentry *) & rp->slot[stbl[n]]; modify_index(tid, ip, le32_to_cpu(ldtentry->index), rbn, n, &mp, &lblock); } if (mp) release_metapage(mp); } /* * insert the new entry into the new right/child page * (skip index in the new right 
page will not change) */ dtInsertEntry(rp, split->index, split->key, split->data, &dtlck); /* * reset parent/root page * * set the 1st entry offset to 0, which forces the left-most key * at any level of the tree to be less than any search key. * * The btree comparison code guarantees that the left-most key on any * level of the tree is never used, so it doesn't need to be filled in. */ BT_MARK_DIRTY(smp, ip); /* * acquire a transaction lock on the root page (in-memory inode) */ tlck = txLock(tid, ip, smp, tlckDTREE | tlckNEW | tlckBTROOT); dtlck = (struct dt_lock *) & tlck->lock; /* linelock root */ ASSERT(dtlck->index == 0); lv = & dtlck->lv[0]; lv->offset = 0; lv->length = DTROOTMAXSLOT; dtlck->index++; /* update page header of root */ if (sp->header.flag & BT_LEAF) { sp->header.flag &= ~BT_LEAF; sp->header.flag |= BT_INTERNAL; } /* init the first entry */ s = (struct idtentry *) & sp->slot[DTENTRYSTART]; ppxd = (pxd_t *) s; *ppxd = *pxd; s->next = -1; s->namlen = 0; stbl = sp->header.stbl; stbl[0] = DTENTRYSTART; sp->header.nextindex = 1; /* init freelist */ fsi = DTENTRYSTART + 1; f = &sp->slot[fsi]; /* init free region of remaining area */ for (fsi++; fsi < DTROOTMAXSLOT; f++, fsi++) f->next = fsi; f->next = -1; sp->header.freelist = DTENTRYSTART + 1; sp->header.freecnt = DTROOTMAXSLOT - (DTENTRYSTART + 1); *rmpp = rmp; return 0; } /* * dtDelete() * * function: delete the entry(s) referenced by a key. * * parameter: * * return: */ int dtDelete(tid_t tid, struct inode *ip, struct component_name * key, ino_t * ino, int flag) { int rc = 0; s64 bn; struct metapage *mp, *imp; dtpage_t *p; int index; struct btstack btstack; struct dt_lock *dtlck; struct tlock *tlck; struct lv *lv; int i; struct ldtentry *ldtentry; u8 *stbl; u32 table_index, next_index; struct metapage *nmp; dtpage_t *np; /* * search for the entry to delete: * * dtSearch() returns (leaf page pinned, index at which to delete). */ if ((rc = dtSearch(ip, key, ino, &btstack, flag))) return rc; /* retrieve search result */ DT_GETSEARCH(ip, btstack.top, bn, mp, p, index); /* * We need to put the index of the next entry into the * directory index table in order to resume a readdir from this * entry. */ if (DO_INDEX(ip)) { stbl = DT_GETSTBL(p); ldtentry = (struct ldtentry *) & p->slot[stbl[index]]; table_index = le32_to_cpu(ldtentry->index); if (index == (p->header.nextindex - 1)) { /* * Last entry in this leaf page */ if ((p->header.flag & BT_ROOT) || (p->header.next == 0)) next_index = -1; else { /* Read next leaf page */ DT_GETPAGE(ip, le64_to_cpu(p->header.next), nmp, PSIZE, np, rc); if (rc) next_index = -1; else { stbl = DT_GETSTBL(np); ldtentry = (struct ldtentry *) & np->slot[stbl[0]]; next_index = le32_to_cpu(ldtentry->index); DT_PUTPAGE(nmp); } } } else { ldtentry = (struct ldtentry *) & p->slot[stbl[index + 1]]; next_index = le32_to_cpu(ldtentry->index); } free_index(tid, ip, table_index, next_index); } /* * the leaf page becomes empty, delete the page */ if (p->header.nextindex == 1) { /* delete empty page */ rc = dtDeleteUp(tid, ip, mp, p, &btstack); } /* * the leaf page has other entries remaining: * * delete the entry from the leaf page. */ else { BT_MARK_DIRTY(mp, ip); /* * acquire a transaction lock on the leaf page */ tlck = txLock(tid, ip, mp, tlckDTREE | tlckENTRY); dtlck = (struct dt_lock *) & tlck->lock; /* * Do not assume that dtlck->index will be zero. During a * rename within a directory, this transaction may have * modified this page already when adding the new entry. 
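 * (hence the maxcnt check below: when the current linelock chunk is full, chain a fresh one with txLinelock() instead of assuming lv[0] is free)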
*/ /* linelock header */ if (dtlck->index >= dtlck->maxcnt) dtlck = (struct dt_lock *) txLinelock(dtlck); lv = & dtlck->lv[dtlck->index]; lv->offset = 0; lv->length = 1; dtlck->index++; /* linelock stbl of non-root leaf page */ if (!(p->header.flag & BT_ROOT)) { if (dtlck->index >= dtlck->maxcnt) dtlck = (struct dt_lock *) txLinelock(dtlck); lv = & dtlck->lv[dtlck->index]; i = index >> L2DTSLOTSIZE; lv->offset = p->header.stblindex + i; lv->length = ((p->header.nextindex - 1) >> L2DTSLOTSIZE) - i + 1; dtlck->index++; } /* free the leaf entry */ dtDeleteEntry(p, index, &dtlck); /* * Update directory index table for entries moved in stbl */ if (DO_INDEX(ip) && index < p->header.nextindex) { s64 lblock; imp = NULL; stbl = DT_GETSTBL(p); for (i = index; i < p->header.nextindex; i++) { ldtentry = (struct ldtentry *) & p->slot[stbl[i]]; modify_index(tid, ip, le32_to_cpu(ldtentry->index), bn, i, &imp, &lblock); } if (imp) release_metapage(imp); } DT_PUTPAGE(mp); } return rc; } /* * dtDeleteUp() * * function: * free empty pages as propagating deletion up the tree * * parameter: * * return: */ static int dtDeleteUp(tid_t tid, struct inode *ip, struct metapage * fmp, dtpage_t * fp, struct btstack * btstack) { int rc = 0; struct metapage *mp; dtpage_t *p; int index, nextindex; int xlen; struct btframe *parent; struct dt_lock *dtlck; struct tlock *tlck; struct lv *lv; struct pxd_lock *pxdlock; int i; /* * keep the root leaf page which has become empty */ if (BT_IS_ROOT(fmp)) { /* * reset the root * * dtInitRoot() acquires txlock on the root */ dtInitRoot(tid, ip, PARENT(ip)); DT_PUTPAGE(fmp); return 0; } /* * free the non-root leaf page */ /* * acquire a transaction lock on the page * * write FREEXTENT|NOREDOPAGE log record * N.B. linelock is overlaid as freed extent descriptor, and * the buffer page is freed; */ tlck = txMaplock(tid, ip, tlckDTREE | tlckFREE); pxdlock = (struct pxd_lock *) & tlck->lock; pxdlock->flag = mlckFREEPXD; pxdlock->pxd = fp->header.self; pxdlock->index = 1; /* update sibling pointers */ if ((rc = dtRelink(tid, ip, fp))) { BT_PUTPAGE(fmp); return rc; } xlen = lengthPXD(&fp->header.self); /* Free quota allocation. */ dquot_free_block(ip, xlen); /* free/invalidate its buffer page */ discard_metapage(fmp); /* * propagate page deletion up the directory tree * * If the delete from the parent page makes it empty, * continue all the way up the tree. * stop if the root page is reached (which is never deleted) or * if the entry deletion does not empty the page. */ while ((parent = BT_POP(btstack)) != NULL) { /* pin the parent page <sp> */ DT_GETPAGE(ip, parent->bn, mp, PSIZE, p, rc); if (rc) return rc; /* * free the extent of the child page deleted */ index = parent->index; /* * delete the entry for the child page from parent */ nextindex = p->header.nextindex; /* * the parent has the single entry being deleted: * * free the parent page which has become empty. 
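 * (nextindex == 1 means the only remaining entry is the router to the child just freed, so this page empties as well and the loop keeps propagating upward) */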
*/ if (nextindex == 1) { /* * keep the root internal page which has become empty */ if (p->header.flag & BT_ROOT) { /* * reset the root * * dtInitRoot() acquires txlock on the root */ dtInitRoot(tid, ip, PARENT(ip)); DT_PUTPAGE(mp); return 0; } /* * free the parent page */ else { /* * acquire a transaction lock on the page * * write FREEXTENT|NOREDOPAGE log record */ tlck = txMaplock(tid, ip, tlckDTREE | tlckFREE); pxdlock = (struct pxd_lock *) & tlck->lock; pxdlock->flag = mlckFREEPXD; pxdlock->pxd = p->header.self; pxdlock->index = 1; /* update sibling pointers */ if ((rc = dtRelink(tid, ip, p))) { DT_PUTPAGE(mp); return rc; } xlen = lengthPXD(&p->header.self); /* Free quota allocation */ dquot_free_block(ip, xlen); /* free/invalidate its buffer page */ discard_metapage(mp); /* propagate up */ continue; } } /* * the parent has other entries remaining: * * delete the router entry from the parent page. */ BT_MARK_DIRTY(mp, ip); /* * acquire a transaction lock on the page * * action: router entry deletion */ tlck = txLock(tid, ip, mp, tlckDTREE | tlckENTRY); dtlck = (struct dt_lock *) & tlck->lock; /* linelock header */ if (dtlck->index >= dtlck->maxcnt) dtlck = (struct dt_lock *) txLinelock(dtlck); lv = & dtlck->lv[dtlck->index]; lv->offset = 0; lv->length = 1; dtlck->index++; /* linelock stbl of non-root leaf page */ if (!(p->header.flag & BT_ROOT)) { if (dtlck->index < dtlck->maxcnt) lv++; else { dtlck = (struct dt_lock *) txLinelock(dtlck); lv = & dtlck->lv[0]; } i = index >> L2DTSLOTSIZE; lv->offset = p->header.stblindex + i; lv->length = ((p->header.nextindex - 1) >> L2DTSLOTSIZE) - i + 1; dtlck->index++; } /* free the router entry */ dtDeleteEntry(p, index, &dtlck); /* reset key of new leftmost entry of level (for consistency) */ if (index == 0 && ((p->header.flag & BT_ROOT) || p->header.prev == 0)) dtTruncateEntry(p, 0, &dtlck); /* unpin the parent page */ DT_PUTPAGE(mp); /* exit propagation up */ break; } if (!DO_INDEX(ip)) ip->i_size -= PSIZE; return 0; } /* * dtRelink() * * function: * link around a freed page. 
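 * * (the page sits on a doubly-linked sibling chain; each neighbour gets its prev/next pointer rewired around the freed page under its own tlckRELINK transaction lock)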
 * * parameter: * p: page to be freed * * return: */ static int dtRelink(tid_t tid, struct inode *ip, dtpage_t * p) { int rc; struct metapage *mp; s64 nextbn, prevbn; struct tlock *tlck; struct dt_lock *dtlck; struct lv *lv; nextbn = le64_to_cpu(p->header.next); prevbn = le64_to_cpu(p->header.prev); /* update prev pointer of the next page */ if (nextbn != 0) { DT_GETPAGE(ip, nextbn, mp, PSIZE, p, rc); if (rc) return rc; BT_MARK_DIRTY(mp, ip); /* * acquire a transaction lock on the next page * * action: update prev pointer; */ tlck = txLock(tid, ip, mp, tlckDTREE | tlckRELINK); jfs_info("dtRelink nextbn: tlck = 0x%p, ip = 0x%p, mp=0x%p", tlck, ip, mp); dtlck = (struct dt_lock *) & tlck->lock; /* linelock header */ if (dtlck->index >= dtlck->maxcnt) dtlck = (struct dt_lock *) txLinelock(dtlck); lv = & dtlck->lv[dtlck->index]; lv->offset = 0; lv->length = 1; dtlck->index++; p->header.prev = cpu_to_le64(prevbn); DT_PUTPAGE(mp); } /* update next pointer of the previous page */ if (prevbn != 0) { DT_GETPAGE(ip, prevbn, mp, PSIZE, p, rc); if (rc) return rc; BT_MARK_DIRTY(mp, ip); /* * acquire a transaction lock on the prev page * * action: update next pointer; */ tlck = txLock(tid, ip, mp, tlckDTREE | tlckRELINK); jfs_info("dtRelink prevbn: tlck = 0x%p, ip = 0x%p, mp=0x%p", tlck, ip, mp); dtlck = (struct dt_lock *) & tlck->lock; /* linelock header */ if (dtlck->index >= dtlck->maxcnt) dtlck = (struct dt_lock *) txLinelock(dtlck); lv = & dtlck->lv[dtlck->index]; lv->offset = 0; lv->length = 1; dtlck->index++; p->header.next = cpu_to_le64(nextbn); DT_PUTPAGE(mp); } return 0; } /* * dtInitRoot() * * initialize directory root (inline in inode) */ void dtInitRoot(tid_t tid, struct inode *ip, u32 idotdot) { struct jfs_inode_info *jfs_ip = JFS_IP(ip); dtroot_t *p; int fsi; struct dtslot *f; struct tlock *tlck; struct dt_lock *dtlck; struct lv *lv; u16 xflag_save; /* * If this was previously a non-empty directory, we need to remove * the old directory table. */ if (DO_INDEX(ip)) { if (!jfs_dirtable_inline(ip)) { struct tblock *tblk = tid_to_tblock(tid); /* * We're playing games with the tid's xflag. If * we're removing a regular file, the file's xtree * is committed with COMMIT_PMAP, but we always * commit the directory's xtree with COMMIT_PWMAP. */ xflag_save = tblk->xflag; tblk->xflag = 0; /* * xtTruncate isn't guaranteed to fully truncate * the xtree. The caller needs to check i_size * after committing the transaction to see if * additional truncation is needed. The * COMMIT_Stale flag tells caller that we * initiated the truncation. */ xtTruncate(tid, ip, 0, COMMIT_PWMAP); set_cflag(COMMIT_Stale, ip); tblk->xflag = xflag_save; } else ip->i_size = 1; jfs_ip->next_index = 2; } else ip->i_size = IDATASIZE; /* * acquire a transaction lock on the root * * action: directory initialization; */ tlck = txLock(tid, ip, (struct metapage *) & jfs_ip->bxflag, tlckDTREE | tlckENTRY | tlckBTROOT); dtlck = (struct dt_lock *) & tlck->lock; /* linelock root */ ASSERT(dtlck->index == 0); lv = & dtlck->lv[0]; lv->offset = 0; lv->length = DTROOTMAXSLOT; dtlck->index++; p = &jfs_ip->i_dtroot; p->header.flag = DXD_INDEX | BT_ROOT | BT_LEAF; p->header.nextindex = 0; /* init freelist */ fsi = 1; f = &p->slot[fsi]; /* init data area of root */ for (fsi++; fsi < DTROOTMAXSLOT; f++, fsi++) f->next = fsi; f->next = -1; p->header.freelist = 1; p->header.freecnt = 8; /* init '..' 
entry */ p->header.idotdot = cpu_to_le32(idotdot); return; } /* * add_missing_indices() * * function: Fix dtree page in which one or more entries has an invalid index. * fsck.jfs should really fix this, but it currently does not. * Called from jfs_readdir when bad index is detected. */ static int add_missing_indices(struct inode *inode, s64 bn) { struct ldtentry *d; struct dt_lock *dtlck; int i; uint index; struct lv *lv; struct metapage *mp; dtpage_t *p; int rc = 0; s8 *stbl; tid_t tid; struct tlock *tlck; tid = txBegin(inode->i_sb, 0); DT_GETPAGE(inode, bn, mp, PSIZE, p, rc); if (rc) { printk(KERN_ERR "DT_GETPAGE failed!\n"); goto end; } BT_MARK_DIRTY(mp, inode); ASSERT(p->header.flag & BT_LEAF); tlck = txLock(tid, inode, mp, tlckDTREE | tlckENTRY); if (BT_IS_ROOT(mp)) tlck->type |= tlckBTROOT; dtlck = (struct dt_lock *) &tlck->lock; stbl = DT_GETSTBL(p); for (i = 0; i < p->header.nextindex; i++) { if (stbl[i] < 0) { jfs_err("jfs: add_missing_indices: Invalid stbl[%d] = %d for inode %ld, block = %lld", i, stbl[i], (long)inode->i_ino, (long long)bn); rc = -EIO; DT_PUTPAGE(mp); txAbort(tid, 0); goto end; } d = (struct ldtentry *) &p->slot[stbl[i]]; index = le32_to_cpu(d->index); if ((index < 2) || (index >= JFS_IP(inode)->next_index)) { d->index = cpu_to_le32(add_index(tid, inode, bn, i)); if (dtlck->index >= dtlck->maxcnt) dtlck = (struct dt_lock *) txLinelock(dtlck); lv = &dtlck->lv[dtlck->index]; lv->offset = stbl[i]; lv->length = 1; dtlck->index++; } } DT_PUTPAGE(mp); (void) txCommit(tid, 1, &inode, 0); end: txEnd(tid); return rc; } /* * Buffer to hold directory entry info while traversing a dtree page * before being fed to the filldir function */ struct jfs_dirent { loff_t position; int ino; u16 name_len; char name[]; }; /* * function to determine next variable-sized jfs_dirent in buffer */ static inline struct jfs_dirent *next_jfs_dirent(struct jfs_dirent *dirent) { return (struct jfs_dirent *) ((char *)dirent + ((sizeof (struct jfs_dirent) + dirent->name_len + 1 + sizeof (loff_t) - 1) & ~(sizeof (loff_t) - 1))); } /* * jfs_readdir() * * function: read directory entries sequentially * from the specified entry offset * * parameter: * * return: offset = (pn, index) of start entry * of next jfs_readdir()/dtRead() */ int jfs_readdir(struct file *file, struct dir_context *ctx) { struct inode *ip = file_inode(file); struct nls_table *codepage = JFS_SBI(ip->i_sb)->nls_tab; int rc = 0; loff_t dtpos; /* legacy OS/2 style position */ struct dtoffset { s16 pn; s16 index; s32 unused; } *dtoffset = (struct dtoffset *) &dtpos; s64 bn; struct metapage *mp; dtpage_t *p; int index; s8 *stbl; struct btstack btstack; int i, next; struct ldtentry *d; struct dtslot *t; int d_namleft, len, outlen; unsigned long dirent_buf; char *name_ptr; u32 dir_index; int do_index = 0; uint loop_count = 0; struct jfs_dirent *jfs_dirent; int jfs_dirents; int overflow, fix_page, page_fixed = 0; static int unique_pos = 2; /* If we can't fix broken index */ if (ctx->pos == DIREND) return 0; if (DO_INDEX(ip)) { /* * persistent index is stored in directory entries. * Special cases: 0 = . * 1 = .. * -1 = End of directory */ do_index = 1; dir_index = (u32) ctx->pos; /* * NFSv4 reserves cookies 1 and 2 for . and .. so the value * we return to the vfs is one greater than the one we use * internally. */ if (dir_index) dir_index--; if (dir_index > 1) { struct dir_table_slot dirtab_slot; if (dtEmpty(ip) || (dir_index >= JFS_IP(ip)->next_index)) { /* Stale position. 
Directory has shrunk */ ctx->pos = DIREND; return 0; } repeat: rc = read_index(ip, dir_index, &dirtab_slot); if (rc) { ctx->pos = DIREND; return rc; } if (dirtab_slot.flag == DIR_INDEX_FREE) { if (loop_count++ > JFS_IP(ip)->next_index) { jfs_err("jfs_readdir detected infinite loop!"); ctx->pos = DIREND; return 0; } dir_index = le32_to_cpu(dirtab_slot.addr2); if (dir_index == -1) { ctx->pos = DIREND; return 0; } goto repeat; } bn = addressDTS(&dirtab_slot); index = dirtab_slot.slot; DT_GETPAGE(ip, bn, mp, PSIZE, p, rc); if (rc) { ctx->pos = DIREND; return 0; } if (p->header.flag & BT_INTERNAL) { jfs_err("jfs_readdir: bad index table"); DT_PUTPAGE(mp); ctx->pos = DIREND; return 0; } } else { if (dir_index == 0) { /* * self "." */ ctx->pos = 1; if (!dir_emit(ctx, ".", 1, ip->i_ino, DT_DIR)) return 0; } /* * parent ".." */ ctx->pos = 2; if (!dir_emit(ctx, "..", 2, PARENT(ip), DT_DIR)) return 0; /* * Find first entry of left-most leaf */ if (dtEmpty(ip)) { ctx->pos = DIREND; return 0; } if ((rc = dtReadFirst(ip, &btstack))) return rc; DT_GETSEARCH(ip, btstack.top, bn, mp, p, index); } } else { /* * Legacy filesystem - OS/2 & Linux JFS < 0.3.6 * * pn = 0; index = 1: First entry "." * pn = 0; index = 2: Second entry ".." * pn > 0: Real entries, pn=1 -> leftmost page * pn = index = -1: No more entries */ dtpos = ctx->pos; if (dtpos < 2) { /* build "." entry */ ctx->pos = 1; if (!dir_emit(ctx, ".", 1, ip->i_ino, DT_DIR)) return 0; dtoffset->index = 2; ctx->pos = dtpos; } if (dtoffset->pn == 0) { if (dtoffset->index == 2) { /* build ".." entry */ if (!dir_emit(ctx, "..", 2, PARENT(ip), DT_DIR)) return 0; } else { jfs_err("jfs_readdir called with invalid offset!"); } dtoffset->pn = 1; dtoffset->index = 0; ctx->pos = dtpos; } if (dtEmpty(ip)) { ctx->pos = DIREND; return 0; } if ((rc = dtReadNext(ip, &ctx->pos, &btstack))) { jfs_err("jfs_readdir: unexpected rc = %d from dtReadNext", rc); ctx->pos = DIREND; return 0; } /* get start leaf page and index */ DT_GETSEARCH(ip, btstack.top, bn, mp, p, index); /* offset beyond directory eof ? */ if (bn < 0) { ctx->pos = DIREND; return 0; } } dirent_buf = __get_free_page(GFP_KERNEL); if (dirent_buf == 0) { DT_PUTPAGE(mp); jfs_warn("jfs_readdir: __get_free_page failed!"); ctx->pos = DIREND; return -ENOMEM; } while (1) { jfs_dirent = (struct jfs_dirent *) dirent_buf; jfs_dirents = 0; overflow = fix_page = 0; stbl = DT_GETSTBL(p); for (i = index; i < p->header.nextindex; i++) { if (stbl[i] < 0 || stbl[i] > 127) { jfs_err("JFS: Invalid stbl[%d] = %d for inode %ld, block = %lld", i, stbl[i], (long)ip->i_ino, (long long)bn); free_page(dirent_buf); DT_PUTPAGE(mp); return -EIO; } d = (struct ldtentry *) & p->slot[stbl[i]]; if (((long) jfs_dirent + d->namlen + 1) > (dirent_buf + PAGE_SIZE)) { /* DBCS codepages could overrun dirent_buf */ index = i; overflow = 1; break; } d_namleft = d->namlen; name_ptr = jfs_dirent->name; jfs_dirent->ino = le32_to_cpu(d->inumber); if (do_index) { len = min(d_namleft, DTLHDRDATALEN); jfs_dirent->position = le32_to_cpu(d->index); /* * d->index should always be valid, but it * isn't. fsck.jfs doesn't create the * directory index for the lost+found * directory. Rather than let it go, * we can try to fix it. 
*/ if ((jfs_dirent->position < 2) || (jfs_dirent->position >= JFS_IP(ip)->next_index)) { if (!page_fixed && !isReadOnly(ip)) { fix_page = 1; /* * setting overflow and setting * index to i will cause the * same page to be processed * again starting here */ overflow = 1; index = i; break; } jfs_dirent->position = unique_pos++; } /* * We add 1 to the index because we may * use a value of 2 internally, and NFSv4 * doesn't like that. */ jfs_dirent->position++; } else { jfs_dirent->position = dtpos; len = min(d_namleft, DTLHDRDATALEN_LEGACY); } /* copy the name of head/only segment */ outlen = jfs_strfromUCS_le(name_ptr, d->name, len, codepage); jfs_dirent->name_len = outlen; /* copy name in the additional segment(s) */ next = d->next; while (next >= 0) { t = (struct dtslot *) & p->slot[next]; name_ptr += outlen; d_namleft -= len; /* Sanity Check */ if (d_namleft == 0) { jfs_error(ip->i_sb, "JFS:Dtree error: ino = %ld, bn=%lld, index = %d\n", (long)ip->i_ino, (long long)bn, i); goto skip_one; } len = min(d_namleft, DTSLOTDATALEN); outlen = jfs_strfromUCS_le(name_ptr, t->name, len, codepage); jfs_dirent->name_len += outlen; next = t->next; } jfs_dirents++; jfs_dirent = next_jfs_dirent(jfs_dirent); skip_one: if (!do_index) dtoffset->index++; } if (!overflow) { /* Point to next leaf page */ if (p->header.flag & BT_ROOT) bn = 0; else { bn = le64_to_cpu(p->header.next); index = 0; /* update offset (pn:index) for new page */ if (!do_index) { dtoffset->pn++; dtoffset->index = 0; } } page_fixed = 0; } /* unpin previous leaf page */ DT_PUTPAGE(mp); jfs_dirent = (struct jfs_dirent *) dirent_buf; while (jfs_dirents--) { ctx->pos = jfs_dirent->position; if (!dir_emit(ctx, jfs_dirent->name, jfs_dirent->name_len, jfs_dirent->ino, DT_UNKNOWN)) goto out; jfs_dirent = next_jfs_dirent(jfs_dirent); } if (fix_page) { if ((rc = add_missing_indices(ip, bn))) goto out; page_fixed = 1; } if (!overflow && (bn == 0)) { ctx->pos = DIREND; break; } DT_GETPAGE(ip, bn, mp, PSIZE, p, rc); if (rc) { free_page(dirent_buf); return rc; } } out: free_page(dirent_buf); return rc; } /* * dtReadFirst() * * function: get the leftmost page of the directory */ static int dtReadFirst(struct inode *ip, struct btstack * btstack) { int rc = 0; s64 bn; int psize = 288; /* initial in-line directory */ struct metapage *mp; dtpage_t *p; s8 *stbl; struct btframe *btsp; pxd_t *xd; BT_CLR(btstack); /* reset stack */ /* * descend leftmost path of the tree * * by convention, root bn = 0. 
*/ for (bn = 0;;) { DT_GETPAGE(ip, bn, mp, psize, p, rc); if (rc) return rc; /* * leftmost leaf page */ if (p->header.flag & BT_LEAF) { /* return leftmost entry */ btsp = btstack->top; btsp->bn = bn; btsp->index = 0; btsp->mp = mp; return 0; } /* * descend down to leftmost child page */ if (BT_STACK_FULL(btstack)) { DT_PUTPAGE(mp); jfs_error(ip->i_sb, "btstack overrun\n"); BT_STACK_DUMP(btstack); return -EIO; } /* push (bn, index) of the parent page/entry */ BT_PUSH(btstack, bn, 0); /* get the leftmost entry */ stbl = DT_GETSTBL(p); if (stbl[0] < 0 || stbl[0] > 127) { DT_PUTPAGE(mp); jfs_error(ip->i_sb, "stbl[0] out of bound\n"); return -EIO; } xd = (pxd_t *) & p->slot[stbl[0]]; /* get the child page block address */ bn = addressPXD(xd); psize = lengthPXD(xd) << JFS_SBI(ip->i_sb)->l2bsize; /* unpin the parent page */ DT_PUTPAGE(mp); } } /* * dtReadNext() * * function: get the page of the specified offset (pn:index) * * return: if (offset > eof), bn = -1; * * note: if index > nextindex of the target leaf page, * start with 1st entry of next leaf page; */ static int dtReadNext(struct inode *ip, loff_t * offset, struct btstack * btstack) { int rc = 0; struct dtoffset { s16 pn; s16 index; s32 unused; } *dtoffset = (struct dtoffset *) offset; s64 bn; struct metapage *mp; dtpage_t *p; int index; int pn; s8 *stbl; struct btframe *btsp, *parent; pxd_t *xd; /* * get leftmost leaf page pinned */ if ((rc = dtReadFirst(ip, btstack))) return rc; /* get leaf page */ DT_GETSEARCH(ip, btstack->top, bn, mp, p, index); /* get the start offset (pn:index) */ pn = dtoffset->pn - 1; /* Now pn = 0 represents leftmost leaf */ index = dtoffset->index; /* start at leftmost page ? */ if (pn == 0) { /* offset beyond eof ? */ if (index < p->header.nextindex) goto out; if (p->header.flag & BT_ROOT) { bn = -1; goto out; } /* start with 1st entry of next leaf page */ dtoffset->pn++; dtoffset->index = index = 0; goto a; } /* start at non-leftmost page: scan parent pages for large pn */ if (p->header.flag & BT_ROOT) { bn = -1; goto out; } /* start after next leaf page ? */ if (pn > 1) goto b; /* get leaf page pn = 1 */ a: bn = le64_to_cpu(p->header.next); /* unpin leaf page */ DT_PUTPAGE(mp); /* offset beyond eof ? */ if (bn == 0) { bn = -1; goto out; } goto c; /* * scan last internal page level to get target leaf page */ b: /* unpin leftmost leaf page */ DT_PUTPAGE(mp); /* get left most parent page */ btsp = btstack->top; parent = btsp - 1; bn = parent->bn; DT_GETPAGE(ip, bn, mp, PSIZE, p, rc); if (rc) return rc; /* scan parent pages at last internal page level */ while (pn >= p->header.nextindex) { pn -= p->header.nextindex; /* get next parent page address */ bn = le64_to_cpu(p->header.next); /* unpin current parent page */ DT_PUTPAGE(mp); /* offset beyond eof ? */ if (bn == 0) { bn = -1; goto out; } /* get next parent page */ DT_GETPAGE(ip, bn, mp, PSIZE, p, rc); if (rc) return rc; /* update parent page stack frame */ parent->bn = bn; } /* get leaf page address */ stbl = DT_GETSTBL(p); xd = (pxd_t *) & p->slot[stbl[pn]]; bn = addressPXD(xd); /* unpin parent page */ DT_PUTPAGE(mp); /* * get target leaf page */ c: DT_GETPAGE(ip, bn, mp, PSIZE, p, rc); if (rc) return rc; /* * leaf page has been completed: * start with 1st entry of next leaf page */ if (index >= p->header.nextindex) { bn = le64_to_cpu(p->header.next); /* unpin leaf page */ DT_PUTPAGE(mp); /* offset beyond eof ? 
*/ if (bn == 0) { bn = -1; goto out; } /* get next leaf page */ DT_GETPAGE(ip, bn, mp, PSIZE, p, rc); if (rc) return rc; /* start with 1st entry of next leaf page */ dtoffset->pn++; dtoffset->index = 0; } out: /* return target leaf page pinned */ btsp = btstack->top; btsp->bn = bn; btsp->index = dtoffset->index; btsp->mp = mp; return 0; } /* * dtCompare() * * function: compare search key with an internal entry * * return: * < 0 if k is < record * = 0 if k is = record * > 0 if k is > record */ static int dtCompare(struct component_name * key, /* search key */ dtpage_t * p, /* directory page */ int si) { /* entry slot index */ wchar_t *kname; __le16 *name; int klen, namlen, len, rc; struct idtentry *ih; struct dtslot *t; /* * force the left-most key on internal pages, at any level of * the tree, to be less than any search key. * this obviates having to update the leftmost key on an internal * page when the user inserts a new key in the tree smaller than * anything that has been stored. * * (? if/when dtSearch() narrows down to 1st entry (index = 0), * at any internal page at any level of the tree, * it descends to child of the entry anyway - * ? make the entry as min size dummy entry) * * if (e->index == 0 && h->prevpg == P_INVALID && !(h->flags & BT_LEAF)) * return (1); */ kname = key->name; klen = key->namlen; ih = (struct idtentry *) & p->slot[si]; si = ih->next; name = ih->name; namlen = ih->namlen; len = min(namlen, DTIHDRDATALEN); /* compare with head/only segment */ len = min(klen, len); if ((rc = UniStrncmp_le(kname, name, len))) return rc; klen -= len; namlen -= len; /* compare with additional segment(s) */ kname += len; while (klen > 0 && namlen > 0) { /* compare with next name segment */ t = (struct dtslot *) & p->slot[si]; len = min(namlen, DTSLOTDATALEN); len = min(klen, len); name = t->name; if ((rc = UniStrncmp_le(kname, name, len))) return rc; klen -= len; namlen -= len; kname += len; si = t->next; } return (klen - namlen); } /* * ciCompare() * * function: compare search key with an (leaf/internal) entry * * return: * < 0 if k is < record * = 0 if k is = record * > 0 if k is > record */ static int ciCompare(struct component_name * key, /* search key */ dtpage_t * p, /* directory page */ int si, /* entry slot index */ int flag) { wchar_t *kname, x; __le16 *name; int klen, namlen, len, rc; struct ldtentry *lh; struct idtentry *ih; struct dtslot *t; int i; /* * force the left-most key on internal pages, at any level of * the tree, to be less than any search key. * this obviates having to update the leftmost key on an internal * page when the user inserts a new key in the tree smaller than * anything that has been stored. * * (? if/when dtSearch() narrows down to 1st entry (index = 0), * at any internal page at any level of the tree, * it descends to child of the entry anyway - * ? 
make the entry as min size dummy entry) * * if (e->index == 0 && h->prevpg == P_INVALID && !(h->flags & BT_LEAF)) * return (1); */ kname = key->name; klen = key->namlen; /* * leaf page entry */ if (p->header.flag & BT_LEAF) { lh = (struct ldtentry *) & p->slot[si]; si = lh->next; name = lh->name; namlen = lh->namlen; if (flag & JFS_DIR_INDEX) len = min(namlen, DTLHDRDATALEN); else len = min(namlen, DTLHDRDATALEN_LEGACY); } /* * internal page entry */ else { ih = (struct idtentry *) & p->slot[si]; si = ih->next; name = ih->name; namlen = ih->namlen; len = min(namlen, DTIHDRDATALEN); } /* compare with head/only segment */ len = min(klen, len); for (i = 0; i < len; i++, kname++, name++) { /* only uppercase if case-insensitive support is on */ if ((flag & JFS_OS2) == JFS_OS2) x = UniToupper(le16_to_cpu(*name)); else x = le16_to_cpu(*name); if ((rc = *kname - x)) return rc; } klen -= len; namlen -= len; /* compare with additional segment(s) */ while (klen > 0 && namlen > 0) { /* compare with next name segment */ t = (struct dtslot *) & p->slot[si]; len = min(namlen, DTSLOTDATALEN); len = min(klen, len); name = t->name; for (i = 0; i < len; i++, kname++, name++) { /* only uppercase if case-insensitive support is on */ if ((flag & JFS_OS2) == JFS_OS2) x = UniToupper(le16_to_cpu(*name)); else x = le16_to_cpu(*name); if ((rc = *kname - x)) return rc; } klen -= len; namlen -= len; si = t->next; } return (klen - namlen); } /* * ciGetLeafPrefixKey() * * function: compute prefix of suffix compression * from two adjacent leaf entries * across page boundary * * return: non-zero on error * */ static int ciGetLeafPrefixKey(dtpage_t * lp, int li, dtpage_t * rp, int ri, struct component_name * key, int flag) { int klen, namlen; wchar_t *pl, *pr, *kname; struct component_name lkey; struct component_name rkey; lkey.name = kmalloc_array(JFS_NAME_MAX + 1, sizeof(wchar_t), GFP_KERNEL); if (lkey.name == NULL) return -ENOMEM; rkey.name = kmalloc_array(JFS_NAME_MAX + 1, sizeof(wchar_t), GFP_KERNEL); if (rkey.name == NULL) { kfree(lkey.name); return -ENOMEM; } /* get left and right key */ dtGetKey(lp, li, &lkey, flag); lkey.name[lkey.namlen] = 0; if ((flag & JFS_OS2) == JFS_OS2) ciToUpper(&lkey); dtGetKey(rp, ri, &rkey, flag); rkey.name[rkey.namlen] = 0; if ((flag & JFS_OS2) == JFS_OS2) ciToUpper(&rkey); /* compute prefix */ klen = 0; kname = key->name; namlen = min(lkey.namlen, rkey.namlen); for (pl = lkey.name, pr = rkey.name; namlen; pl++, pr++, namlen--, klen++, kname++) { *kname = *pr; if (*pl != *pr) { key->namlen = klen + 1; goto free_names; } } /* l->namlen <= r->namlen since l <= r */ if (lkey.namlen < rkey.namlen) { *kname = *pr; key->namlen = klen + 1; } else /* l->namelen == r->namelen */ key->namlen = klen; free_names: kfree(lkey.name); kfree(rkey.name); return 0; } /* * dtGetKey() * * function: get key of the entry */ static void dtGetKey(dtpage_t * p, int i, /* entry index */ struct component_name * key, int flag) { int si; s8 *stbl; struct ldtentry *lh; struct idtentry *ih; struct dtslot *t; int namlen, len; wchar_t *kname; __le16 *name; /* get entry */ stbl = DT_GETSTBL(p); si = stbl[i]; if (p->header.flag & BT_LEAF) { lh = (struct ldtentry *) & p->slot[si]; si = lh->next; namlen = lh->namlen; name = lh->name; if (flag & JFS_DIR_INDEX) len = min(namlen, DTLHDRDATALEN); else len = min(namlen, DTLHDRDATALEN_LEGACY); } else { ih = (struct idtentry *) & p->slot[si]; si = ih->next; namlen = ih->namlen; name = ih->name; len = min(namlen, DTIHDRDATALEN); } key->namlen = namlen; kname = key->name; /* * move 
head/only segment */ UniStrncpy_from_le(kname, name, len); /* * move additional segment(s) */ while (si >= 0) { /* get next segment */ t = &p->slot[si]; kname += len; namlen -= len; len = min(namlen, DTSLOTDATALEN); UniStrncpy_from_le(kname, t->name, len); si = t->next; } } /* * dtInsertEntry() * * function: allocate free slot(s) and * write a leaf/internal entry * * return: entry slot index */ static void dtInsertEntry(dtpage_t * p, int index, struct component_name * key, ddata_t * data, struct dt_lock ** dtlock) { struct dtslot *h, *t; struct ldtentry *lh = NULL; struct idtentry *ih = NULL; int hsi, fsi, klen, len, nextindex; wchar_t *kname; __le16 *name; s8 *stbl; pxd_t *xd; struct dt_lock *dtlck = *dtlock; struct lv *lv; int xsi, n; s64 bn = 0; struct metapage *mp = NULL; klen = key->namlen; kname = key->name; /* allocate a free slot */ hsi = fsi = p->header.freelist; h = &p->slot[fsi]; p->header.freelist = h->next; --p->header.freecnt; /* open new linelock */ if (dtlck->index >= dtlck->maxcnt) dtlck = (struct dt_lock *) txLinelock(dtlck); lv = & dtlck->lv[dtlck->index]; lv->offset = hsi; /* write head/only segment */ if (p->header.flag & BT_LEAF) { lh = (struct ldtentry *) h; lh->next = h->next; lh->inumber = cpu_to_le32(data->leaf.ino); lh->namlen = klen; name = lh->name; if (data->leaf.ip) { len = min(klen, DTLHDRDATALEN); if (!(p->header.flag & BT_ROOT)) bn = addressPXD(&p->header.self); lh->index = cpu_to_le32(add_index(data->leaf.tid, data->leaf.ip, bn, index)); } else len = min(klen, DTLHDRDATALEN_LEGACY); } else { ih = (struct idtentry *) h; ih->next = h->next; xd = (pxd_t *) ih; *xd = data->xd; ih->namlen = klen; name = ih->name; len = min(klen, DTIHDRDATALEN); } UniStrncpy_to_le(name, kname, len); n = 1; xsi = hsi; /* write additional segment(s) */ t = h; klen -= len; while (klen) { /* get free slot */ fsi = p->header.freelist; t = &p->slot[fsi]; p->header.freelist = t->next; --p->header.freecnt; /* is next slot contiguous ? */ if (fsi != xsi + 1) { /* close current linelock */ lv->length = n; dtlck->index++; /* open new linelock */ if (dtlck->index < dtlck->maxcnt) lv++; else { dtlck = (struct dt_lock *) txLinelock(dtlck); lv = & dtlck->lv[0]; } lv->offset = fsi; n = 0; } kname += len; len = min(klen, DTSLOTDATALEN); UniStrncpy_to_le(t->name, kname, len); n++; xsi = fsi; klen -= len; } /* close current linelock */ lv->length = n; dtlck->index++; *dtlock = dtlck; /* terminate last/only segment */ if (h == t) { /* single segment entry */ if (p->header.flag & BT_LEAF) lh->next = -1; else ih->next = -1; } else /* multi-segment entry */ t->next = -1; /* if insert into middle, shift right succeeding entries in stbl */ stbl = DT_GETSTBL(p); nextindex = p->header.nextindex; if (index < nextindex) { memmove(stbl + index + 1, stbl + index, nextindex - index); if ((p->header.flag & BT_LEAF) && data->leaf.ip) { s64 lblock; /* * Need to update slot number for entries that moved * in the stbl */ mp = NULL; for (n = index + 1; n <= nextindex; n++) { lh = (struct ldtentry *) & (p->slot[stbl[n]]); modify_index(data->leaf.tid, data->leaf.ip, le32_to_cpu(lh->index), bn, n, &mp, &lblock); } if (mp) release_metapage(mp); } } stbl[index] = hsi; /* advance next available entry index of stbl */ ++p->header.nextindex; } /* * dtMoveEntry() * * function: move entries from split/left page to new/right page * * nextindex of dst page and freelist/freecnt of both pages * are updated. 
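* (slots vacated in the source page are pushed back onto its freelist as each segment is moved, which is why the source freecnt grows by the number of moved slots)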
*/ static void dtMoveEntry(dtpage_t * sp, int si, dtpage_t * dp, struct dt_lock ** sdtlock, struct dt_lock ** ddtlock, int do_index) { int ssi, next; /* src slot index */ int di; /* dst entry index */ int dsi; /* dst slot index */ s8 *sstbl, *dstbl; /* sorted entry table */ int snamlen, len; struct ldtentry *slh, *dlh = NULL; struct idtentry *sih, *dih = NULL; struct dtslot *h, *s, *d; struct dt_lock *sdtlck = *sdtlock, *ddtlck = *ddtlock; struct lv *slv, *dlv; int xssi, ns, nd; int sfsi; sstbl = (s8 *) & sp->slot[sp->header.stblindex]; dstbl = (s8 *) & dp->slot[dp->header.stblindex]; dsi = dp->header.freelist; /* first (whole page) free slot */ sfsi = sp->header.freelist; /* linelock destination entry slot */ dlv = & ddtlck->lv[ddtlck->index]; dlv->offset = dsi; /* linelock source entry slot */ slv = & sdtlck->lv[sdtlck->index]; slv->offset = sstbl[si]; xssi = slv->offset - 1; /* * move entries */ ns = nd = 0; for (di = 0; si < sp->header.nextindex; si++, di++) { ssi = sstbl[si]; dstbl[di] = dsi; /* is next slot contiguous ? */ if (ssi != xssi + 1) { /* close current linelock */ slv->length = ns; sdtlck->index++; /* open new linelock */ if (sdtlck->index < sdtlck->maxcnt) slv++; else { sdtlck = (struct dt_lock *) txLinelock(sdtlck); slv = & sdtlck->lv[0]; } slv->offset = ssi; ns = 0; } /* * move head/only segment of an entry */ /* get dst slot */ h = d = &dp->slot[dsi]; /* get src slot and move */ s = &sp->slot[ssi]; if (sp->header.flag & BT_LEAF) { /* get source entry */ slh = (struct ldtentry *) s; dlh = (struct ldtentry *) h; snamlen = slh->namlen; if (do_index) { len = min(snamlen, DTLHDRDATALEN); dlh->index = slh->index; /* little-endian */ } else len = min(snamlen, DTLHDRDATALEN_LEGACY); memcpy(dlh, slh, 6 + len * 2); next = slh->next; /* update dst head/only segment next field */ dsi++; dlh->next = dsi; } else { sih = (struct idtentry *) s; snamlen = sih->namlen; len = min(snamlen, DTIHDRDATALEN); dih = (struct idtentry *) h; memcpy(dih, sih, 10 + len * 2); next = sih->next; dsi++; dih->next = dsi; } /* free src head/only segment */ s->next = sfsi; s->cnt = 1; sfsi = ssi; ns++; nd++; xssi = ssi; /* * move additional segment(s) of the entry */ snamlen -= len; while ((ssi = next) >= 0) { /* is next slot contiguous ? 
*/ if (ssi != xssi + 1) { /* close current linelock */ slv->length = ns; sdtlck->index++; /* open new linelock */ if (sdtlck->index < sdtlck->maxcnt) slv++; else { sdtlck = (struct dt_lock *) txLinelock(sdtlck); slv = & sdtlck->lv[0]; } slv->offset = ssi; ns = 0; } /* get next source segment */ s = &sp->slot[ssi]; /* get next destination free slot */ d++; len = min(snamlen, DTSLOTDATALEN); UniStrncpy_le(d->name, s->name, len); ns++; nd++; xssi = ssi; dsi++; d->next = dsi; /* free source segment */ next = s->next; s->next = sfsi; s->cnt = 1; sfsi = ssi; snamlen -= len; } /* end while */ /* terminate dst last/only segment */ if (h == d) { /* single segment entry */ if (dp->header.flag & BT_LEAF) dlh->next = -1; else dih->next = -1; } else /* multi-segment entry */ d->next = -1; } /* end for */ /* close current linelock */ slv->length = ns; sdtlck->index++; *sdtlock = sdtlck; dlv->length = nd; ddtlck->index++; *ddtlock = ddtlck; /* update source header */ sp->header.freelist = sfsi; sp->header.freecnt += nd; /* update destination header */ dp->header.nextindex = di; dp->header.freelist = dsi; dp->header.freecnt -= nd; } /* * dtDeleteEntry() * * function: free a (leaf/internal) entry * * log freelist header, stbl, and each segment slot of entry * (even though last/only segment next field is modified, * physical image logging requires all segment slots of * the entry logged to avoid applying previous updates * to the same slots) */ static void dtDeleteEntry(dtpage_t * p, int fi, struct dt_lock ** dtlock) { int fsi; /* free entry slot index */ s8 *stbl; struct dtslot *t; int si, freecnt; struct dt_lock *dtlck = *dtlock; struct lv *lv; int xsi, n; /* get free entry slot index */ stbl = DT_GETSTBL(p); fsi = stbl[fi]; /* open new linelock */ if (dtlck->index >= dtlck->maxcnt) dtlck = (struct dt_lock *) txLinelock(dtlck); lv = & dtlck->lv[dtlck->index]; lv->offset = fsi; /* get the head/only segment */ t = &p->slot[fsi]; if (p->header.flag & BT_LEAF) si = ((struct ldtentry *) t)->next; else si = ((struct idtentry *) t)->next; t->next = si; t->cnt = 1; n = freecnt = 1; xsi = fsi; /* find the last/only segment */ while (si >= 0) { /* is next slot contiguous ? 
*/ if (si != xsi + 1) { /* close current linelock */ lv->length = n; dtlck->index++; /* open new linelock */ if (dtlck->index < dtlck->maxcnt) lv++; else { dtlck = (struct dt_lock *) txLinelock(dtlck); lv = & dtlck->lv[0]; } lv->offset = si; n = 0; } n++; xsi = si; freecnt++; t = &p->slot[si]; t->cnt = 1; si = t->next; } /* close current linelock */ lv->length = n; dtlck->index++; *dtlock = dtlck; /* update freelist */ t->next = p->header.freelist; p->header.freelist = fsi; p->header.freecnt += freecnt; /* if deleting from the middle, * shift left the succeeding entries in the stbl */ si = p->header.nextindex; if (fi < si - 1) memmove(&stbl[fi], &stbl[fi + 1], si - fi - 1); p->header.nextindex--; } /* * dtTruncateEntry() * * function: truncate a (leaf/internal) entry * * log freelist header, stbl, and each segment slot of entry * (even though last/only segment next field is modified, * physical image logging requires all segment slots of * the entry logged to avoid applying previous updates * to the same slots) */ static void dtTruncateEntry(dtpage_t * p, int ti, struct dt_lock ** dtlock) { int tsi; /* truncate entry slot index */ s8 *stbl; struct dtslot *t; int si, freecnt; struct dt_lock *dtlck = *dtlock; struct lv *lv; int fsi, xsi, n; /* get the truncate entry slot index */ stbl = DT_GETSTBL(p); tsi = stbl[ti]; /* open new linelock */ if (dtlck->index >= dtlck->maxcnt) dtlck = (struct dt_lock *) txLinelock(dtlck); lv = & dtlck->lv[dtlck->index]; lv->offset = tsi; /* get the head/only segment */ t = &p->slot[tsi]; ASSERT(p->header.flag & BT_INTERNAL); ((struct idtentry *) t)->namlen = 0; si = ((struct idtentry *) t)->next; ((struct idtentry *) t)->next = -1; n = 1; freecnt = 0; fsi = si; xsi = tsi; /* find the last/only segment */ while (si >= 0) { /* is next slot contiguous ? */ if (si != xsi + 1) { /* close current linelock */ lv->length = n; dtlck->index++; /* open new linelock */ if (dtlck->index < dtlck->maxcnt) lv++; else { dtlck = (struct dt_lock *) txLinelock(dtlck); lv = & dtlck->lv[0]; } lv->offset = si; n = 0; } n++; xsi = si; freecnt++; t = &p->slot[si]; t->cnt = 1; si = t->next; } /* close current linelock */ lv->length = n; dtlck->index++; *dtlock = dtlck; /* update freelist */ if (freecnt == 0) return; t->next = p->header.freelist; p->header.freelist = fsi; p->header.freecnt += freecnt; } /* * dtLinelockFreelist() */ static void dtLinelockFreelist(dtpage_t * p, /* directory page */ int m, /* max slot index */ struct dt_lock ** dtlock) { int fsi; /* free entry slot index */ struct dtslot *t; int si; struct dt_lock *dtlck = *dtlock; struct lv *lv; int xsi, n; /* get free entry slot index */ fsi = p->header.freelist; /* open new linelock */ if (dtlck->index >= dtlck->maxcnt) dtlck = (struct dt_lock *) txLinelock(dtlck); lv = & dtlck->lv[dtlck->index]; lv->offset = fsi; n = 1; xsi = fsi; t = &p->slot[fsi]; si = t->next; /* find the last/only segment */ while (si < m && si >= 0) { /* is next slot contiguous ?
*/ if (si != xsi + 1) { /* close current linelock */ lv->length = n; dtlck->index++; /* open new linelock */ if (dtlck->index < dtlck->maxcnt) lv++; else { dtlck = (struct dt_lock *) txLinelock(dtlck); lv = & dtlck->lv[0]; } lv->offset = si; n = 0; } n++; xsi = si; t = &p->slot[si]; si = t->next; } /* close current linelock */ lv->length = n; dtlck->index++; *dtlock = dtlck; } /* * NAME: dtModify * * FUNCTION: Modify the inode number part of a directory entry * * PARAMETERS: * tid - Transaction id * ip - Inode of parent directory * key - Name of entry to be modified * orig_ino - Original inode number expected in entry * new_ino - New inode number to put into entry * flag - JFS_RENAME * * RETURNS: * -ESTALE - If entry found does not match orig_ino passed in * -ENOENT - If no entry can be found to match key * 0 - If successfully modified entry */ int dtModify(tid_t tid, struct inode *ip, struct component_name * key, ino_t * orig_ino, ino_t new_ino, int flag) { int rc; s64 bn; struct metapage *mp; dtpage_t *p; int index; struct btstack btstack; struct tlock *tlck; struct dt_lock *dtlck; struct lv *lv; s8 *stbl; int entry_si; /* entry slot index */ struct ldtentry *entry; /* * search for the entry to modify: * * dtSearch() returns (leaf page pinned, index at which to modify). */ if ((rc = dtSearch(ip, key, orig_ino, &btstack, flag))) return rc; /* retrieve search result */ DT_GETSEARCH(ip, btstack.top, bn, mp, p, index); BT_MARK_DIRTY(mp, ip); /* * acquire a transaction lock on the leaf page of named entry */ tlck = txLock(tid, ip, mp, tlckDTREE | tlckENTRY); dtlck = (struct dt_lock *) & tlck->lock; /* get slot index of the entry */ stbl = DT_GETSTBL(p); entry_si = stbl[index]; /* linelock entry */ ASSERT(dtlck->index == 0); lv = & dtlck->lv[0]; lv->offset = entry_si; lv->length = 1; dtlck->index++; /* get the head/only segment */ entry = (struct ldtentry *) & p->slot[entry_si]; /* substitute the inode number of the entry */ entry->inumber = cpu_to_le32(new_ino); /* unpin the leaf page */ DT_PUTPAGE(mp); return 0; }
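/*
 * Illustration (not part of the JFS sources): jfs_readdir() and
 * dtReadNext() above reinterpret the VFS loff_t cookie as a (pn, index)
 * pair through a local struct dtoffset. The standalone sketch below
 * shows that packing, assuming a matching little-endian layout;
 * int16_t/int32_t stand in for the kernel's s16/s32 types.
 */
#include <stdio.h>
#include <stdint.h>

struct dtoffset {		/* mirrors the on-stack struct used above */
	int16_t pn;		/* leaf page number; 1 = leftmost, -1 = end */
	int16_t index;		/* entry index within that leaf page */
	int32_t unused;
};

int main(void)
{
	long long dtpos = 0;	/* stands in for ctx->pos (loff_t) */
	struct dtoffset *dto = (struct dtoffset *)&dtpos;	/* same cast the kernel code uses */

	/* what the legacy path sets up once "." and ".." have been emitted */
	dto->pn = 1;
	dto->index = 0;
	printf("cookie=0x%llx -> pn=%d index=%d\n",
	       (unsigned long long)dtpos, dto->pn, dto->index);
	return 0;
}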
// SPDX-License-Identifier: GPL-2.0-only /* * Authors: * (C) 2020 Alexander Aring <alex.aring@gmail.com> */ #include <linux/rpl_iptunnel.h> #include <net/dst_cache.h> #include <net/ip6_route.h> #include <net/lwtunnel.h> #include <net/ipv6.h> #include <net/rpl.h> struct rpl_iptunnel_encap { DECLARE_FLEX_ARRAY(struct ipv6_rpl_sr_hdr, srh); }; struct rpl_lwt { struct dst_cache cache; struct rpl_iptunnel_encap tuninfo; }; static inline struct rpl_lwt *rpl_lwt_lwtunnel(struct lwtunnel_state *lwt) { return (struct rpl_lwt *)lwt->data; } static inline struct rpl_iptunnel_encap * rpl_encap_lwtunnel(struct lwtunnel_state *lwt) { return &rpl_lwt_lwtunnel(lwt)->tuninfo; } static const struct nla_policy rpl_iptunnel_policy[RPL_IPTUNNEL_MAX + 1] = { [RPL_IPTUNNEL_SRH] = { .type = NLA_BINARY }, }; static bool rpl_validate_srh(struct net *net, struct ipv6_rpl_sr_hdr *srh, size_t seglen) { int err; if ((srh->hdrlen << 3) != seglen) return false; /* check at least one segment and seglen fit with segments_left */ if (!srh->segments_left || (srh->segments_left * sizeof(struct in6_addr)) != seglen) return false; if (srh->cmpri || srh->cmpre) return false; err = ipv6_chk_rpl_srh_loop(net, srh->rpl_segaddr, srh->segments_left); if (err) return false; if (ipv6_addr_type(&srh->rpl_segaddr[srh->segments_left - 1]) & IPV6_ADDR_MULTICAST) return false; return true; } static int rpl_build_state(struct net *net, struct nlattr *nla, unsigned int family, const void *cfg, struct lwtunnel_state **ts, struct netlink_ext_ack *extack) { struct nlattr *tb[RPL_IPTUNNEL_MAX + 1]; struct lwtunnel_state *newts; struct ipv6_rpl_sr_hdr *srh; struct rpl_lwt *rlwt; int err, srh_len; if (family != AF_INET6) return -EINVAL; err = nla_parse_nested(tb, RPL_IPTUNNEL_MAX, nla, rpl_iptunnel_policy, extack); if (err < 0) return err; if (!tb[RPL_IPTUNNEL_SRH]) return -EINVAL; srh = nla_data(tb[RPL_IPTUNNEL_SRH]); srh_len = nla_len(tb[RPL_IPTUNNEL_SRH]); if (srh_len < sizeof(*srh)) return -EINVAL; /* verify that SRH is consistent */ if (!rpl_validate_srh(net, srh, srh_len - sizeof(*srh)))
return -EINVAL; newts = lwtunnel_state_alloc(srh_len + sizeof(*rlwt)); if (!newts) return -ENOMEM; rlwt = rpl_lwt_lwtunnel(newts); err = dst_cache_init(&rlwt->cache, GFP_ATOMIC); if (err) { kfree(newts); return err; } memcpy(&rlwt->tuninfo.srh, srh, srh_len); newts->type = LWTUNNEL_ENCAP_RPL; newts->flags |= LWTUNNEL_STATE_INPUT_REDIRECT; newts->flags |= LWTUNNEL_STATE_OUTPUT_REDIRECT; *ts = newts; return 0; } static void rpl_destroy_state(struct lwtunnel_state *lwt) { dst_cache_destroy(&rpl_lwt_lwtunnel(lwt)->cache); } static int rpl_do_srh_inline(struct sk_buff *skb, const struct rpl_lwt *rlwt, const struct ipv6_rpl_sr_hdr *srh, struct dst_entry *cache_dst) { struct ipv6_rpl_sr_hdr *isrh, *csrh; struct ipv6hdr oldhdr; struct ipv6hdr *hdr; unsigned char *buf; size_t hdrlen; int err; memcpy(&oldhdr, ipv6_hdr(skb), sizeof(oldhdr)); buf = kcalloc(struct_size(srh, segments.addr, srh->segments_left), 2, GFP_ATOMIC); if (!buf) return -ENOMEM; isrh = (struct ipv6_rpl_sr_hdr *)buf; csrh = (struct ipv6_rpl_sr_hdr *)(buf + ((srh->hdrlen + 1) << 3)); memcpy(isrh, srh, sizeof(*isrh)); memcpy(isrh->rpl_segaddr, &srh->rpl_segaddr[1], (srh->segments_left - 1) * 16); isrh->rpl_segaddr[srh->segments_left - 1] = oldhdr.daddr; ipv6_rpl_srh_compress(csrh, isrh, &srh->rpl_segaddr[0], isrh->segments_left - 1); hdrlen = ((csrh->hdrlen + 1) << 3); err = skb_cow_head(skb, hdrlen + dst_dev_overhead(cache_dst, skb)); if (unlikely(err)) { kfree(buf); return err; } skb_pull(skb, sizeof(struct ipv6hdr)); skb_postpull_rcsum(skb, skb_network_header(skb), sizeof(struct ipv6hdr)); skb_push(skb, sizeof(struct ipv6hdr) + hdrlen); skb_reset_network_header(skb); skb_mac_header_rebuild(skb); hdr = ipv6_hdr(skb); memmove(hdr, &oldhdr, sizeof(*hdr)); isrh = (void *)hdr + sizeof(*hdr); memcpy(isrh, csrh, hdrlen); isrh->nexthdr = hdr->nexthdr; hdr->nexthdr = NEXTHDR_ROUTING; hdr->daddr = srh->rpl_segaddr[0]; ipv6_hdr(skb)->payload_len = htons(skb->len - sizeof(struct ipv6hdr)); skb_set_transport_header(skb, sizeof(struct ipv6hdr)); skb_postpush_rcsum(skb, hdr, sizeof(struct ipv6hdr) + hdrlen); kfree(buf); return 0; } static int rpl_do_srh(struct sk_buff *skb, const struct rpl_lwt *rlwt, struct dst_entry *cache_dst) { struct dst_entry *dst = skb_dst(skb); struct rpl_iptunnel_encap *tinfo; if (skb->protocol != htons(ETH_P_IPV6)) return -EINVAL; tinfo = rpl_encap_lwtunnel(dst->lwtstate); return rpl_do_srh_inline(skb, rlwt, tinfo->srh, cache_dst); } static int rpl_output(struct net *net, struct sock *sk, struct sk_buff *skb) { struct dst_entry *orig_dst = skb_dst(skb); struct dst_entry *dst = NULL; struct rpl_lwt *rlwt; int err; rlwt = rpl_lwt_lwtunnel(orig_dst->lwtstate); local_bh_disable(); dst = dst_cache_get(&rlwt->cache); local_bh_enable(); err = rpl_do_srh(skb, rlwt, dst); if (unlikely(err)) goto drop; if (unlikely(!dst)) { struct ipv6hdr *hdr = ipv6_hdr(skb); struct flowi6 fl6; memset(&fl6, 0, sizeof(fl6)); fl6.daddr = hdr->daddr; fl6.saddr = hdr->saddr; fl6.flowlabel = ip6_flowinfo(hdr); fl6.flowi6_mark = skb->mark; fl6.flowi6_proto = hdr->nexthdr; dst = ip6_route_output(net, NULL, &fl6); if (dst->error) { err = dst->error; goto drop; } /* cache only if we don't create a dst reference loop */ if (orig_dst->lwtstate != dst->lwtstate) { local_bh_disable(); dst_cache_set_ip6(&rlwt->cache, dst, &fl6.saddr); local_bh_enable(); } err = skb_cow_head(skb, LL_RESERVED_SPACE(dst_dev(dst))); if (unlikely(err)) goto drop; } skb_dst_drop(skb); skb_dst_set(skb, dst); return dst_output(net, sk, skb); drop: dst_release(dst); kfree_skb(skb); 
return err; } static int rpl_input(struct sk_buff *skb) { struct dst_entry *orig_dst = skb_dst(skb); struct dst_entry *dst = NULL; struct lwtunnel_state *lwtst; struct rpl_lwt *rlwt; int err; /* We cannot dereference "orig_dst" once ip6_route_input() or * skb_dst_drop() is called. However, in order to detect a dst loop, we * need the address of its lwtstate. So, save the address of lwtstate * now and use it later as a comparison. */ lwtst = orig_dst->lwtstate; rlwt = rpl_lwt_lwtunnel(lwtst); local_bh_disable(); dst = dst_cache_get(&rlwt->cache); local_bh_enable(); err = rpl_do_srh(skb, rlwt, dst); if (unlikely(err)) { dst_release(dst); goto drop; } if (!dst) { ip6_route_input(skb); dst = skb_dst(skb); /* cache only if we don't create a dst reference loop */ if (!dst->error && lwtst != dst->lwtstate) { local_bh_disable(); dst_cache_set_ip6(&rlwt->cache, dst, &ipv6_hdr(skb)->saddr); local_bh_enable(); } err = skb_cow_head(skb, LL_RESERVED_SPACE(dst_dev(dst))); if (unlikely(err)) goto drop; } else { skb_dst_drop(skb); skb_dst_set(skb, dst); } return dst_input(skb); drop: kfree_skb(skb); return err; } static int nla_put_rpl_srh(struct sk_buff *skb, int attrtype, struct rpl_iptunnel_encap *tuninfo) { struct rpl_iptunnel_encap *data; struct nlattr *nla; int len; len = RPL_IPTUNNEL_SRH_SIZE(tuninfo->srh); nla = nla_reserve(skb, attrtype, len); if (!nla) return -EMSGSIZE; data = nla_data(nla); memcpy(data, tuninfo->srh, len); return 0; } static int rpl_fill_encap_info(struct sk_buff *skb, struct lwtunnel_state *lwtstate) { struct rpl_iptunnel_encap *tuninfo = rpl_encap_lwtunnel(lwtstate); if (nla_put_rpl_srh(skb, RPL_IPTUNNEL_SRH, tuninfo)) return -EMSGSIZE; return 0; } static int rpl_encap_nlsize(struct lwtunnel_state *lwtstate) { struct rpl_iptunnel_encap *tuninfo = rpl_encap_lwtunnel(lwtstate); return nla_total_size(RPL_IPTUNNEL_SRH_SIZE(tuninfo->srh)); } static int rpl_encap_cmp(struct lwtunnel_state *a, struct lwtunnel_state *b) { struct rpl_iptunnel_encap *a_hdr = rpl_encap_lwtunnel(a); struct rpl_iptunnel_encap *b_hdr = rpl_encap_lwtunnel(b); int len = RPL_IPTUNNEL_SRH_SIZE(a_hdr->srh); if (len != RPL_IPTUNNEL_SRH_SIZE(b_hdr->srh)) return 1; return memcmp(a_hdr, b_hdr, len); } static const struct lwtunnel_encap_ops rpl_ops = { .build_state = rpl_build_state, .destroy_state = rpl_destroy_state, .output = rpl_output, .input = rpl_input, .fill_encap = rpl_fill_encap_info, .get_encap_size = rpl_encap_nlsize, .cmp_encap = rpl_encap_cmp, .owner = THIS_MODULE, }; int __init rpl_init(void) { int err; err = lwtunnel_encap_add_ops(&rpl_ops, LWTUNNEL_ENCAP_RPL); if (err) goto out; pr_info("RPL Segment Routing with IPv6\n"); return 0; out: return err; } void rpl_exit(void) { lwtunnel_encap_del_ops(&rpl_ops, LWTUNNEL_ENCAP_RPL); }
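/*
 * Illustration (not part of the RPL sources): rpl_validate_srh() above
 * only accepts an uncompressed SRH whose hdrlen and segments_left agree
 * with the byte length of the segment list. The sketch below derives the
 * one consistent combination for n segments; struct srh_fields is a
 * hypothetical stand-in carrying just the fields the check reads, not
 * the real struct ipv6_rpl_sr_hdr.
 */
#include <stdio.h>
#include <stddef.h>
#include <stdint.h>

#define IN6_ADDR_LEN 16			/* sizeof(struct in6_addr) */

struct srh_fields {
	uint8_t hdrlen;			/* header length in 8-octet units */
	uint8_t segments_left;
	uint8_t cmpri, cmpre;		/* must be 0: compression is rejected */
};

static void srh_fill(struct srh_fields *h, unsigned int n)
{
	size_t seglen = n * IN6_ADDR_LEN;	/* segment area, in bytes */

	h->hdrlen = seglen >> 3;	/* each 16-byte address is 2 units */
	h->segments_left = n;		/* segments_left * 16 == seglen */
	h->cmpri = 0;
	h->cmpre = 0;
}

int main(void)
{
	struct srh_fields h;

	srh_fill(&h, 3);		/* e.g. a 3-hop source route */
	printf("hdrlen=%d segments_left=%d\n", h.hdrlen, h.segments_left);
	return 0;
}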
/* * Copyright (C) 2014 Red Hat * Copyright (C) 2014 Intel Corp. * Copyright (C) 2018 Intel Corp. * Copyright (c) 2020, The Linux Foundation. All rights reserved.
* * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), * to deal in the Software without restriction, including without limitation * the rights to use, copy, modify, merge, publish, distribute, sublicense, * and/or sell copies of the Software, and to permit persons to whom the * Software is furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR * OTHER DEALINGS IN THE SOFTWARE. * * Authors: * Rob Clark <robdclark@gmail.com> * Daniel Vetter <daniel.vetter@ffwll.ch> */ #include <drm/drm_atomic.h> #include <drm/drm_atomic_helper.h> #include <drm/drm_atomic_uapi.h> #include <drm/drm_framebuffer.h> #include <drm/drm_print.h> #include <drm/drm_drv.h> #include <drm/drm_writeback.h> #include <drm/drm_vblank.h> #include <linux/export.h> #include <linux/dma-fence.h> #include <linux/uaccess.h> #include <linux/sync_file.h> #include <linux/file.h> #include "drm_crtc_internal.h" /** * DOC: overview * * This file contains the marshalling and demarshalling glue for the atomic UAPI * in all its forms: The monster ATOMIC IOCTL itself, code for GET_PROPERTY and * SET_PROPERTY IOCTLs. Plus interface functions for compatibility helpers and * drivers which have special needs to construct their own atomic updates, e.g. * for load detect or similar. */ /** * drm_atomic_set_mode_for_crtc - set mode for CRTC * @state: the CRTC whose incoming state to update * @mode: kernel-internal mode to use for the CRTC, or NULL to disable * * Set a mode (originating from the kernel) on the desired CRTC state and update * the enable property. * * RETURNS: * Zero on success, error code on failure. Cannot return -EDEADLK. */ int drm_atomic_set_mode_for_crtc(struct drm_crtc_state *state, const struct drm_display_mode *mode) { struct drm_crtc *crtc = state->crtc; struct drm_mode_modeinfo umode; /* Early return for no change. */ if (mode && memcmp(&state->mode, mode, sizeof(*mode)) == 0) return 0; drm_property_blob_put(state->mode_blob); state->mode_blob = NULL; if (mode) { struct drm_property_blob *blob; drm_mode_convert_to_umode(&umode, mode); blob = drm_property_create_blob(crtc->dev, sizeof(umode), &umode); if (IS_ERR(blob)) return PTR_ERR(blob); drm_mode_copy(&state->mode, mode); state->mode_blob = blob; state->enable = true; drm_dbg_atomic(crtc->dev, "Set [MODE:%s] for [CRTC:%d:%s] state %p\n", mode->name, crtc->base.id, crtc->name, state); } else { memset(&state->mode, 0, sizeof(state->mode)); state->enable = false; drm_dbg_atomic(crtc->dev, "Set [NOMODE] for [CRTC:%d:%s] state %p\n", crtc->base.id, crtc->name, state); } return 0; } EXPORT_SYMBOL(drm_atomic_set_mode_for_crtc); /** * drm_atomic_set_mode_prop_for_crtc - set mode for CRTC * @state: the CRTC whose incoming state to update * @blob: pointer to blob property to use for mode * * Set a mode (originating from a blob property) on the desired CRTC state. 
* This function will take a reference on the blob property for the CRTC state, * and release the reference held on the state's existing mode property, if any * was set. * * RETURNS: * Zero on success, error code on failure. Cannot return -EDEADLK. */ int drm_atomic_set_mode_prop_for_crtc(struct drm_crtc_state *state, struct drm_property_blob *blob) { struct drm_crtc *crtc = state->crtc; if (blob == state->mode_blob) return 0; drm_property_blob_put(state->mode_blob); state->mode_blob = NULL; memset(&state->mode, 0, sizeof(state->mode)); if (blob) { int ret; if (blob->length != sizeof(struct drm_mode_modeinfo)) { drm_dbg_atomic(crtc->dev, "[CRTC:%d:%s] bad mode blob length: %zu\n", crtc->base.id, crtc->name, blob->length); return -EINVAL; } ret = drm_mode_convert_umode(crtc->dev, &state->mode, blob->data); if (ret) { drm_dbg_atomic(crtc->dev, "[CRTC:%d:%s] invalid mode (%s, %pe): " DRM_MODE_FMT "\n", crtc->base.id, crtc->name, drm_get_mode_status_name(state->mode.status), ERR_PTR(ret), DRM_MODE_ARG(&state->mode)); return -EINVAL; } state->mode_blob = drm_property_blob_get(blob); state->enable = true; drm_dbg_atomic(crtc->dev, "Set [MODE:%s] for [CRTC:%d:%s] state %p\n", state->mode.name, crtc->base.id, crtc->name, state); } else { state->enable = false; drm_dbg_atomic(crtc->dev, "Set [NOMODE] for [CRTC:%d:%s] state %p\n", crtc->base.id, crtc->name, state); } return 0; } EXPORT_SYMBOL(drm_atomic_set_mode_prop_for_crtc); /** * drm_atomic_set_crtc_for_plane - set CRTC for plane * @plane_state: the plane whose incoming state to update * @crtc: CRTC to use for the plane * * Changing the assigned CRTC for a plane requires us to grab the lock and state * for the new CRTC, as needed. This function takes care of all these details * besides updating the pointer in the state object itself. * * Returns: * 0 on success or can fail with -EDEADLK or -ENOMEM. When the error is EDEADLK * then the w/w mutex code has detected a deadlock and the entire atomic * sequence must be restarted. All other errors are fatal. */ int drm_atomic_set_crtc_for_plane(struct drm_plane_state *plane_state, struct drm_crtc *crtc) { struct drm_plane *plane = plane_state->plane; struct drm_crtc_state *crtc_state; /* Nothing to do for same crtc*/ if (plane_state->crtc == crtc) return 0; if (plane_state->crtc) { crtc_state = drm_atomic_get_crtc_state(plane_state->state, plane_state->crtc); if (WARN_ON(IS_ERR(crtc_state))) return PTR_ERR(crtc_state); crtc_state->plane_mask &= ~drm_plane_mask(plane); } plane_state->crtc = crtc; if (crtc) { crtc_state = drm_atomic_get_crtc_state(plane_state->state, crtc); if (IS_ERR(crtc_state)) return PTR_ERR(crtc_state); crtc_state->plane_mask |= drm_plane_mask(plane); } if (crtc) drm_dbg_atomic(plane->dev, "Link [PLANE:%d:%s] state %p to [CRTC:%d:%s]\n", plane->base.id, plane->name, plane_state, crtc->base.id, crtc->name); else drm_dbg_atomic(plane->dev, "Link [PLANE:%d:%s] state %p to [NOCRTC]\n", plane->base.id, plane->name, plane_state); return 0; } EXPORT_SYMBOL(drm_atomic_set_crtc_for_plane); /** * drm_atomic_set_fb_for_plane - set framebuffer for plane * @plane_state: atomic state object for the plane * @fb: fb to use for the plane * * Changing the assigned framebuffer for a plane requires us to grab a reference * to the new fb and drop the reference to the old fb, if there is one. This * function takes care of all these details besides updating the pointer in the * state object itself. 
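* Passing a NULL @fb is allowed and simply drops the current framebuffer; the debug log records this as [NOFB].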
*/ void drm_atomic_set_fb_for_plane(struct drm_plane_state *plane_state, struct drm_framebuffer *fb) { struct drm_plane *plane = plane_state->plane; if (fb) drm_dbg_atomic(plane->dev, "Set [FB:%d] for [PLANE:%d:%s] state %p\n", fb->base.id, plane->base.id, plane->name, plane_state); else drm_dbg_atomic(plane->dev, "Set [NOFB] for [PLANE:%d:%s] state %p\n", plane->base.id, plane->name, plane_state); drm_framebuffer_assign(&plane_state->fb, fb); } EXPORT_SYMBOL(drm_atomic_set_fb_for_plane); /** * drm_atomic_set_crtc_for_connector - set CRTC for connector * @conn_state: atomic state object for the connector * @crtc: CRTC to use for the connector * * Changing the assigned CRTC for a connector requires us to grab the lock and * state for the new CRTC, as needed. This function takes care of all these * details besides updating the pointer in the state object itself. * * Returns: * 0 on success or can fail with -EDEADLK or -ENOMEM. When the error is EDEADLK * then the w/w mutex code has detected a deadlock and the entire atomic * sequence must be restarted. All other errors are fatal. */ int drm_atomic_set_crtc_for_connector(struct drm_connector_state *conn_state, struct drm_crtc *crtc) { struct drm_connector *connector = conn_state->connector; struct drm_crtc_state *crtc_state; if (conn_state->crtc == crtc) return 0; if (conn_state->crtc) { crtc_state = drm_atomic_get_new_crtc_state(conn_state->state, conn_state->crtc); crtc_state->connector_mask &= ~drm_connector_mask(conn_state->connector); drm_connector_put(conn_state->connector); conn_state->crtc = NULL; } if (crtc) { crtc_state = drm_atomic_get_crtc_state(conn_state->state, crtc); if (IS_ERR(crtc_state)) return PTR_ERR(crtc_state); crtc_state->connector_mask |= drm_connector_mask(conn_state->connector); drm_connector_get(conn_state->connector); conn_state->crtc = crtc; drm_dbg_atomic(connector->dev, "Link [CONNECTOR:%d:%s] state %p to [CRTC:%d:%s]\n", connector->base.id, connector->name, conn_state, crtc->base.id, crtc->name); } else { drm_dbg_atomic(connector->dev, "Link [CONNECTOR:%d:%s] state %p to [NOCRTC]\n", connector->base.id, connector->name, conn_state); } return 0; } EXPORT_SYMBOL(drm_atomic_set_crtc_for_connector); static void set_out_fence_for_crtc(struct drm_atomic_state *state, struct drm_crtc *crtc, s32 __user *fence_ptr) { state->crtcs[drm_crtc_index(crtc)].out_fence_ptr = fence_ptr; } static s32 __user *get_out_fence_for_crtc(struct drm_atomic_state *state, struct drm_crtc *crtc) { s32 __user *fence_ptr; fence_ptr = state->crtcs[drm_crtc_index(crtc)].out_fence_ptr; state->crtcs[drm_crtc_index(crtc)].out_fence_ptr = NULL; return fence_ptr; } static int set_out_fence_for_connector(struct drm_atomic_state *state, struct drm_connector *connector, s32 __user *fence_ptr) { unsigned int index = drm_connector_index(connector); if (!fence_ptr) return 0; if (put_user(-1, fence_ptr)) return -EFAULT; state->connectors[index].out_fence_ptr = fence_ptr; return 0; } static s32 __user *get_out_fence_for_connector(struct drm_atomic_state *state, struct drm_connector *connector) { unsigned int index = drm_connector_index(connector); s32 __user *fence_ptr; fence_ptr = state->connectors[index].out_fence_ptr; state->connectors[index].out_fence_ptr = NULL; return fence_ptr; } static int drm_atomic_crtc_set_property(struct drm_crtc *crtc, struct drm_crtc_state *state, struct drm_property *property, uint64_t val) { struct drm_device *dev = crtc->dev; struct drm_mode_config *config = &dev->mode_config; bool replaced = false; int ret; if (property 
== config->prop_active) state->active = val; else if (property == config->prop_mode_id) { struct drm_property_blob *mode = drm_property_lookup_blob(dev, val); ret = drm_atomic_set_mode_prop_for_crtc(state, mode); drm_property_blob_put(mode); return ret; } else if (property == config->prop_vrr_enabled) { state->vrr_enabled = val; } else if (property == config->degamma_lut_property) { ret = drm_property_replace_blob_from_id(dev, &state->degamma_lut, val, -1, sizeof(struct drm_color_lut), &replaced); state->color_mgmt_changed |= replaced; return ret; } else if (property == config->ctm_property) { ret = drm_property_replace_blob_from_id(dev, &state->ctm, val, sizeof(struct drm_color_ctm), -1, &replaced); state->color_mgmt_changed |= replaced; return ret; } else if (property == config->gamma_lut_property) { ret = drm_property_replace_blob_from_id(dev, &state->gamma_lut, val, -1, sizeof(struct drm_color_lut), &replaced); state->color_mgmt_changed |= replaced; return ret; } else if (property == config->prop_out_fence_ptr) { s32 __user *fence_ptr = u64_to_user_ptr(val); if (!fence_ptr) return 0; if (put_user(-1, fence_ptr)) return -EFAULT; set_out_fence_for_crtc(state->state, crtc, fence_ptr); } else if (property == crtc->scaling_filter_property) { state->scaling_filter = val; } else if (crtc->funcs->atomic_set_property) { return crtc->funcs->atomic_set_property(crtc, state, property, val); } else { drm_dbg_atomic(crtc->dev, "[CRTC:%d:%s] unknown property [PROP:%d:%s]\n", crtc->base.id, crtc->name, property->base.id, property->name); return -EINVAL; } return 0; } static int drm_atomic_crtc_get_property(struct drm_crtc *crtc, const struct drm_crtc_state *state, struct drm_property *property, uint64_t *val) { struct drm_device *dev = crtc->dev; struct drm_mode_config *config = &dev->mode_config; if (property == config->prop_active) *val = drm_atomic_crtc_effectively_active(state); else if (property == config->prop_mode_id) *val = (state->mode_blob) ? state->mode_blob->base.id : 0; else if (property == config->prop_vrr_enabled) *val = state->vrr_enabled; else if (property == config->degamma_lut_property) *val = (state->degamma_lut) ? state->degamma_lut->base.id : 0; else if (property == config->ctm_property) *val = (state->ctm) ? state->ctm->base.id : 0; else if (property == config->gamma_lut_property) *val = (state->gamma_lut) ? 
state->gamma_lut->base.id : 0; else if (property == config->prop_out_fence_ptr) *val = 0; else if (property == crtc->scaling_filter_property) *val = state->scaling_filter; else if (crtc->funcs->atomic_get_property) return crtc->funcs->atomic_get_property(crtc, state, property, val); else { drm_dbg_atomic(dev, "[CRTC:%d:%s] unknown property [PROP:%d:%s]\n", crtc->base.id, crtc->name, property->base.id, property->name); return -EINVAL; } return 0; } static int drm_atomic_plane_set_property(struct drm_plane *plane, struct drm_plane_state *state, struct drm_file *file_priv, struct drm_property *property, uint64_t val) { struct drm_device *dev = plane->dev; struct drm_mode_config *config = &dev->mode_config; bool replaced = false; int ret; if (property == config->prop_fb_id) { struct drm_framebuffer *fb; fb = drm_framebuffer_lookup(dev, file_priv, val); drm_atomic_set_fb_for_plane(state, fb); if (fb) drm_framebuffer_put(fb); } else if (property == config->prop_in_fence_fd) { if (state->fence) return -EINVAL; if (U642I64(val) == -1) return 0; state->fence = sync_file_get_fence(val); if (!state->fence) return -EINVAL; } else if (property == config->prop_crtc_id) { struct drm_crtc *crtc = drm_crtc_find(dev, file_priv, val); if (val && !crtc) { drm_dbg_atomic(dev, "[PROP:%d:%s] cannot find CRTC with ID %llu\n", property->base.id, property->name, val); return -EACCES; } return drm_atomic_set_crtc_for_plane(state, crtc); } else if (property == config->prop_crtc_x) { state->crtc_x = U642I64(val); } else if (property == config->prop_crtc_y) { state->crtc_y = U642I64(val); } else if (property == config->prop_crtc_w) { state->crtc_w = val; } else if (property == config->prop_crtc_h) { state->crtc_h = val; } else if (property == config->prop_src_x) { state->src_x = val; } else if (property == config->prop_src_y) { state->src_y = val; } else if (property == config->prop_src_w) { state->src_w = val; } else if (property == config->prop_src_h) { state->src_h = val; } else if (property == plane->alpha_property) { state->alpha = val; } else if (property == plane->blend_mode_property) { state->pixel_blend_mode = val; } else if (property == plane->rotation_property) { if (!is_power_of_2(val & DRM_MODE_ROTATE_MASK)) { drm_dbg_atomic(plane->dev, "[PLANE:%d:%s] bad rotation bitmask: 0x%llx\n", plane->base.id, plane->name, val); return -EINVAL; } state->rotation = val; } else if (property == plane->zpos_property) { state->zpos = val; } else if (property == plane->color_encoding_property) { state->color_encoding = val; } else if (property == plane->color_range_property) { state->color_range = val; } else if (property == config->prop_fb_damage_clips) { ret = drm_property_replace_blob_from_id(dev, &state->fb_damage_clips, val, -1, sizeof(struct drm_mode_rect), &replaced); return ret; } else if (property == plane->scaling_filter_property) { state->scaling_filter = val; } else if (plane->funcs->atomic_set_property) { return plane->funcs->atomic_set_property(plane, state, property, val); } else if (property == plane->hotspot_x_property) { if (plane->type != DRM_PLANE_TYPE_CURSOR) { drm_dbg_atomic(plane->dev, "[PLANE:%d:%s] is not a cursor plane: 0x%llx\n", plane->base.id, plane->name, val); return -EINVAL; } state->hotspot_x = val; } else if (property == plane->hotspot_y_property) { if (plane->type != DRM_PLANE_TYPE_CURSOR) { drm_dbg_atomic(plane->dev, "[PLANE:%d:%s] is not a cursor plane: 0x%llx\n", plane->base.id, plane->name, val); return -EINVAL; } state->hotspot_y = val; } else { drm_dbg_atomic(plane->dev, 
"[PLANE:%d:%s] unknown property [PROP:%d:%s]\n", plane->base.id, plane->name, property->base.id, property->name); return -EINVAL; } return 0; } static int drm_atomic_plane_get_property(struct drm_plane *plane, const struct drm_plane_state *state, struct drm_property *property, uint64_t *val) { struct drm_device *dev = plane->dev; struct drm_mode_config *config = &dev->mode_config; if (property == config->prop_fb_id) { *val = (state->fb) ? state->fb->base.id : 0; } else if (property == config->prop_in_fence_fd) { *val = -1; } else if (property == config->prop_crtc_id) { *val = (state->crtc) ? state->crtc->base.id : 0; } else if (property == config->prop_crtc_x) { *val = I642U64(state->crtc_x); } else if (property == config->prop_crtc_y) { *val = I642U64(state->crtc_y); } else if (property == config->prop_crtc_w) { *val = state->crtc_w; } else if (property == config->prop_crtc_h) { *val = state->crtc_h; } else if (property == config->prop_src_x) { *val = state->src_x; } else if (property == config->prop_src_y) { *val = state->src_y; } else if (property == config->prop_src_w) { *val = state->src_w; } else if (property == config->prop_src_h) { *val = state->src_h; } else if (property == plane->alpha_property) { *val = state->alpha; } else if (property == plane->blend_mode_property) { *val = state->pixel_blend_mode; } else if (property == plane->rotation_property) { *val = state->rotation; } else if (property == plane->zpos_property) { *val = state->zpos; } else if (property == plane->color_encoding_property) { *val = state->color_encoding; } else if (property == plane->color_range_property) { *val = state->color_range; } else if (property == config->prop_fb_damage_clips) { *val = (state->fb_damage_clips) ? state->fb_damage_clips->base.id : 0; } else if (property == plane->scaling_filter_property) { *val = state->scaling_filter; } else if (plane->funcs->atomic_get_property) { return plane->funcs->atomic_get_property(plane, state, property, val); } else if (property == plane->hotspot_x_property) { *val = state->hotspot_x; } else if (property == plane->hotspot_y_property) { *val = state->hotspot_y; } else { drm_dbg_atomic(dev, "[PLANE:%d:%s] unknown property [PROP:%d:%s]\n", plane->base.id, plane->name, property->base.id, property->name); return -EINVAL; } return 0; } static int drm_atomic_set_writeback_fb_for_connector( struct drm_connector_state *conn_state, struct drm_framebuffer *fb) { int ret; struct drm_connector *conn = conn_state->connector; ret = drm_writeback_set_fb(conn_state, fb); if (ret < 0) return ret; if (fb) drm_dbg_atomic(conn->dev, "Set [FB:%d] for connector state %p\n", fb->base.id, conn_state); else drm_dbg_atomic(conn->dev, "Set [NOFB] for connector state %p\n", conn_state); return 0; } static int drm_atomic_connector_set_property(struct drm_connector *connector, struct drm_connector_state *state, struct drm_file *file_priv, struct drm_property *property, uint64_t val) { struct drm_device *dev = connector->dev; struct drm_mode_config *config = &dev->mode_config; bool replaced = false; int ret; if (property == config->prop_crtc_id) { struct drm_crtc *crtc = drm_crtc_find(dev, file_priv, val); if (val && !crtc) { drm_dbg_atomic(dev, "[PROP:%d:%s] cannot find CRTC with ID %llu\n", property->base.id, property->name, val); return -EACCES; } return drm_atomic_set_crtc_for_connector(state, crtc); } else if (property == config->dpms_property) { /* setting DPMS property requires special handling, which * is done in legacy setprop path for us. Disallow (for * now?) 
atomic writes to DPMS property: */ drm_dbg_atomic(dev, "legacy [PROP:%d:%s] can only be set via legacy uAPI\n", property->base.id, property->name); return -EINVAL; } else if (property == config->tv_select_subconnector_property) { state->tv.select_subconnector = val; } else if (property == config->tv_subconnector_property) { state->tv.subconnector = val; } else if (property == config->tv_left_margin_property) { state->tv.margins.left = val; } else if (property == config->tv_right_margin_property) { state->tv.margins.right = val; } else if (property == config->tv_top_margin_property) { state->tv.margins.top = val; } else if (property == config->tv_bottom_margin_property) { state->tv.margins.bottom = val; } else if (property == config->legacy_tv_mode_property) { state->tv.legacy_mode = val; } else if (property == config->tv_mode_property) { state->tv.mode = val; } else if (property == config->tv_brightness_property) { state->tv.brightness = val; } else if (property == config->tv_contrast_property) { state->tv.contrast = val; } else if (property == config->tv_flicker_reduction_property) { state->tv.flicker_reduction = val; } else if (property == config->tv_overscan_property) { state->tv.overscan = val; } else if (property == config->tv_saturation_property) { state->tv.saturation = val; } else if (property == config->tv_hue_property) { state->tv.hue = val; } else if (property == config->link_status_property) { /* Never downgrade from GOOD to BAD on userspace's request here, * only hw issues can do that. * * For an atomic property the userspace doesn't need to be able * to understand all the properties, but needs to be able to * restore the state it wants on VT switch. So if the userspace * tries to change the link_status from GOOD to BAD, driver * silently rejects it and returns a 0. This prevents userspace * from accidentally breaking the display when it restores the * state. 
*/ if (state->link_status != DRM_LINK_STATUS_GOOD) state->link_status = val; } else if (property == config->hdr_output_metadata_property) { ret = drm_property_replace_blob_from_id(dev, &state->hdr_output_metadata, val, sizeof(struct hdr_output_metadata), -1, &replaced); return ret; } else if (property == config->aspect_ratio_property) { state->picture_aspect_ratio = val; } else if (property == config->content_type_property) { state->content_type = val; } else if (property == connector->scaling_mode_property) { state->scaling_mode = val; } else if (property == config->content_protection_property) { if (val == DRM_MODE_CONTENT_PROTECTION_ENABLED) { drm_dbg_kms(dev, "only drivers can set CP Enabled\n"); return -EINVAL; } state->content_protection = val; } else if (property == config->hdcp_content_type_property) { state->hdcp_content_type = val; } else if (property == connector->colorspace_property) { state->colorspace = val; } else if (property == config->writeback_fb_id_property) { struct drm_framebuffer *fb; int ret; fb = drm_framebuffer_lookup(dev, file_priv, val); ret = drm_atomic_set_writeback_fb_for_connector(state, fb); if (fb) drm_framebuffer_put(fb); return ret; } else if (property == config->writeback_out_fence_ptr_property) { s32 __user *fence_ptr = u64_to_user_ptr(val); return set_out_fence_for_connector(state->state, connector, fence_ptr); } else if (property == connector->max_bpc_property) { state->max_requested_bpc = val; } else if (property == connector->privacy_screen_sw_state_property) { state->privacy_screen_sw_state = val; } else if (property == connector->broadcast_rgb_property) { state->hdmi.broadcast_rgb = val; } else if (connector->funcs->atomic_set_property) { return connector->funcs->atomic_set_property(connector, state, property, val); } else { drm_dbg_atomic(connector->dev, "[CONNECTOR:%d:%s] unknown property [PROP:%d:%s]\n", connector->base.id, connector->name, property->base.id, property->name); return -EINVAL; } return 0; } static int drm_atomic_connector_get_property(struct drm_connector *connector, const struct drm_connector_state *state, struct drm_property *property, uint64_t *val) { struct drm_device *dev = connector->dev; struct drm_mode_config *config = &dev->mode_config; if (property == config->prop_crtc_id) { *val = (state->crtc) ? 
state->crtc->base.id : 0; } else if (property == config->dpms_property) { if (state->crtc && state->crtc->state->self_refresh_active) *val = DRM_MODE_DPMS_ON; else *val = connector->dpms; } else if (property == config->tv_select_subconnector_property) { *val = state->tv.select_subconnector; } else if (property == config->tv_subconnector_property) { *val = state->tv.subconnector; } else if (property == config->tv_left_margin_property) { *val = state->tv.margins.left; } else if (property == config->tv_right_margin_property) { *val = state->tv.margins.right; } else if (property == config->tv_top_margin_property) { *val = state->tv.margins.top; } else if (property == config->tv_bottom_margin_property) { *val = state->tv.margins.bottom; } else if (property == config->legacy_tv_mode_property) { *val = state->tv.legacy_mode; } else if (property == config->tv_mode_property) { *val = state->tv.mode; } else if (property == config->tv_brightness_property) { *val = state->tv.brightness; } else if (property == config->tv_contrast_property) { *val = state->tv.contrast; } else if (property == config->tv_flicker_reduction_property) { *val = state->tv.flicker_reduction; } else if (property == config->tv_overscan_property) { *val = state->tv.overscan; } else if (property == config->tv_saturation_property) { *val = state->tv.saturation; } else if (property == config->tv_hue_property) { *val = state->tv.hue; } else if (property == config->link_status_property) { *val = state->link_status; } else if (property == config->aspect_ratio_property) { *val = state->picture_aspect_ratio; } else if (property == config->content_type_property) { *val = state->content_type; } else if (property == connector->colorspace_property) { *val = state->colorspace; } else if (property == connector->scaling_mode_property) { *val = state->scaling_mode; } else if (property == config->hdr_output_metadata_property) { *val = state->hdr_output_metadata ? 
state->hdr_output_metadata->base.id : 0; } else if (property == config->content_protection_property) { *val = state->content_protection; } else if (property == config->hdcp_content_type_property) { *val = state->hdcp_content_type; } else if (property == config->writeback_fb_id_property) { /* Writeback framebuffer is one-shot, write and forget */ *val = 0; } else if (property == config->writeback_out_fence_ptr_property) { *val = 0; } else if (property == connector->max_bpc_property) { *val = state->max_requested_bpc; } else if (property == connector->privacy_screen_sw_state_property) { *val = state->privacy_screen_sw_state; } else if (property == connector->broadcast_rgb_property) { *val = state->hdmi.broadcast_rgb; } else if (connector->funcs->atomic_get_property) { return connector->funcs->atomic_get_property(connector, state, property, val); } else { drm_dbg_atomic(dev, "[CONNECTOR:%d:%s] unknown property [PROP:%d:%s]\n", connector->base.id, connector->name, property->base.id, property->name); return -EINVAL; } return 0; } int drm_atomic_get_property(struct drm_mode_object *obj, struct drm_property *property, uint64_t *val) { struct drm_device *dev = property->dev; int ret; switch (obj->type) { case DRM_MODE_OBJECT_CONNECTOR: { struct drm_connector *connector = obj_to_connector(obj); WARN_ON(!drm_modeset_is_locked(&dev->mode_config.connection_mutex)); ret = drm_atomic_connector_get_property(connector, connector->state, property, val); break; } case DRM_MODE_OBJECT_CRTC: { struct drm_crtc *crtc = obj_to_crtc(obj); WARN_ON(!drm_modeset_is_locked(&crtc->mutex)); ret = drm_atomic_crtc_get_property(crtc, crtc->state, property, val); break; } case DRM_MODE_OBJECT_PLANE: { struct drm_plane *plane = obj_to_plane(obj); WARN_ON(!drm_modeset_is_locked(&plane->mutex)); ret = drm_atomic_plane_get_property(plane, plane->state, property, val); break; } default: drm_dbg_atomic(dev, "[OBJECT:%d] has no properties\n", obj->id); ret = -EINVAL; break; } return ret; } /* * The big monster ioctl */ static struct drm_pending_vblank_event *create_vblank_event( struct drm_crtc *crtc, uint64_t user_data) { struct drm_pending_vblank_event *e = NULL; e = kzalloc(sizeof *e, GFP_KERNEL); if (!e) return NULL; e->event.base.type = DRM_EVENT_FLIP_COMPLETE; e->event.base.length = sizeof(e->event); e->event.vbl.crtc_id = crtc->base.id; e->event.vbl.user_data = user_data; return e; } int drm_atomic_connector_commit_dpms(struct drm_atomic_state *state, struct drm_connector *connector, int mode) { struct drm_connector *tmp_connector; struct drm_connector_state *new_conn_state; struct drm_crtc *crtc; struct drm_crtc_state *crtc_state; int i, ret, old_mode = connector->dpms; bool active = false; ret = drm_modeset_lock(&state->dev->mode_config.connection_mutex, state->acquire_ctx); if (ret) return ret; if (mode != DRM_MODE_DPMS_ON) mode = DRM_MODE_DPMS_OFF; if (connector->dpms == mode) goto out; connector->dpms = mode; crtc = connector->state->crtc; if (!crtc) goto out; ret = drm_atomic_add_affected_connectors(state, crtc); if (ret) goto out; crtc_state = drm_atomic_get_crtc_state(state, crtc); if (IS_ERR(crtc_state)) { ret = PTR_ERR(crtc_state); goto out; } for_each_new_connector_in_state(state, tmp_connector, new_conn_state, i) { if (new_conn_state->crtc != crtc) continue; if (tmp_connector->dpms == DRM_MODE_DPMS_ON) { active = true; break; } } crtc_state->active = active; ret = drm_atomic_commit(state); out: if (ret != 0) connector->dpms = old_mode; return ret; } static int drm_atomic_check_prop_changes(int ret, uint64_t 
old_val, uint64_t prop_value, struct drm_property *prop) { if (ret != 0 || old_val != prop_value) { drm_dbg_atomic(prop->dev, "[PROP:%d:%s] No prop can be changed during async flip\n", prop->base.id, prop->name); return -EINVAL; } return 0; } int drm_atomic_set_property(struct drm_atomic_state *state, struct drm_file *file_priv, struct drm_mode_object *obj, struct drm_property *prop, u64 prop_value, bool async_flip) { struct drm_mode_object *ref; u64 old_val; int ret; if (!drm_property_change_valid_get(prop, prop_value, &ref)) return -EINVAL; switch (obj->type) { case DRM_MODE_OBJECT_CONNECTOR: { struct drm_connector *connector = obj_to_connector(obj); struct drm_connector_state *connector_state; connector_state = drm_atomic_get_connector_state(state, connector); if (IS_ERR(connector_state)) { ret = PTR_ERR(connector_state); break; } if (async_flip) { ret = drm_atomic_connector_get_property(connector, connector_state, prop, &old_val); ret = drm_atomic_check_prop_changes(ret, old_val, prop_value, prop); break; } ret = drm_atomic_connector_set_property(connector, connector_state, file_priv, prop, prop_value); break; } case DRM_MODE_OBJECT_CRTC: { struct drm_crtc *crtc = obj_to_crtc(obj); struct drm_crtc_state *crtc_state; crtc_state = drm_atomic_get_crtc_state(state, crtc); if (IS_ERR(crtc_state)) { ret = PTR_ERR(crtc_state); break; } if (async_flip) { ret = drm_atomic_crtc_get_property(crtc, crtc_state, prop, &old_val); ret = drm_atomic_check_prop_changes(ret, old_val, prop_value, prop); break; } ret = drm_atomic_crtc_set_property(crtc, crtc_state, prop, prop_value); break; } case DRM_MODE_OBJECT_PLANE: { struct drm_plane *plane = obj_to_plane(obj); struct drm_plane_state *plane_state; struct drm_mode_config *config = &plane->dev->mode_config; const struct drm_plane_helper_funcs *plane_funcs = plane->helper_private; plane_state = drm_atomic_get_plane_state(state, plane); if (IS_ERR(plane_state)) { ret = PTR_ERR(plane_state); break; } if (async_flip) { /* check if the prop does a nop change */ if ((prop != config->prop_fb_id && prop != config->prop_in_fence_fd && prop != config->prop_fb_damage_clips)) { ret = drm_atomic_plane_get_property(plane, plane_state, prop, &old_val); ret = drm_atomic_check_prop_changes(ret, old_val, prop_value, prop); } /* ask the driver if this non-primary plane is supported */ if (plane->type != DRM_PLANE_TYPE_PRIMARY) { ret = -EINVAL; if (plane_funcs && plane_funcs->atomic_async_check) ret = plane_funcs->atomic_async_check(plane, state, true); if (ret) { drm_dbg_atomic(prop->dev, "[PLANE:%d:%s] does not support async flips\n", obj->id, plane->name); break; } } } ret = drm_atomic_plane_set_property(plane, plane_state, file_priv, prop, prop_value); break; } default: drm_dbg_atomic(prop->dev, "[OBJECT:%d] has no properties\n", obj->id); ret = -EINVAL; break; } drm_property_change_valid_put(prop, ref); return ret; } /** * DOC: explicit fencing properties * * Explicit fencing allows userspace to control the buffer synchronization * between devices. A Fence or a group of fences are transferred to/from * userspace using Sync File fds and there are two DRM properties for that. * IN_FENCE_FD on each DRM Plane to send fences to the kernel and * OUT_FENCE_PTR on each DRM CRTC to receive fences from the kernel. * * As a contrast, with implicit fencing the kernel keeps track of any * ongoing rendering, and automatically ensures that the atomic update waits * for any pending rendering to complete. 
This is usually tracked in &struct * dma_resv which can also contain mandatory kernel fences. Implicit syncing * is how Linux traditionally worked (e.g. DRI2/3 on X.org), whereas explicit * fencing is what Android wants. * * "IN_FENCE_FD": * Use this property to pass a fence that DRM should wait on before * proceeding with the Atomic Commit request and show the framebuffer for * the plane on the screen. The fence can be either a normal fence or a * merged one; the sync_file framework will handle both cases and use a * fence_array if a merged fence is received. Passing -1 here means no * fences to wait on. * * If the Atomic Commit request has the DRM_MODE_ATOMIC_TEST_ONLY flag * it will only check if the Sync File is a valid one. * * On the driver side the fence is stored on the @fence parameter of * &struct drm_plane_state. Drivers which also support implicit fencing * should extract the implicit fence using drm_gem_plane_helper_prepare_fb(), * to make sure there's consistent behaviour between drivers in precedence * of implicit vs. explicit fencing. * * "OUT_FENCE_PTR": * Use this property to pass a file descriptor pointer to DRM. Once the * Atomic Commit request call returns, OUT_FENCE_PTR will be filled with * the file descriptor number of a Sync File. This Sync File contains the * CRTC fence that will be signaled when all framebuffers present on the * Atomic Commit * request for that given CRTC are scanned out on the * screen. * * The Atomic Commit request fails if an invalid pointer is passed. If the * Atomic Commit request fails for any other reason the out fence fd * returned will be -1. On an Atomic Commit with the * DRM_MODE_ATOMIC_TEST_ONLY flag the out fence will also be set to -1. * * Note that out-fences don't have a special interface to drivers and are * internally represented by a &struct drm_pending_vblank_event in struct * &drm_crtc_state, which is also used by the nonblocking atomic commit * helpers and for the DRM event handling for existing userspace.
*/ struct drm_out_fence_state { s32 __user *out_fence_ptr; struct sync_file *sync_file; int fd; }; static int setup_out_fence(struct drm_out_fence_state *fence_state, struct dma_fence *fence) { fence_state->fd = get_unused_fd_flags(O_CLOEXEC); if (fence_state->fd < 0) return fence_state->fd; if (put_user(fence_state->fd, fence_state->out_fence_ptr)) return -EFAULT; fence_state->sync_file = sync_file_create(fence); if (!fence_state->sync_file) return -ENOMEM; return 0; } static int prepare_signaling(struct drm_device *dev, struct drm_atomic_state *state, struct drm_mode_atomic *arg, struct drm_file *file_priv, struct drm_out_fence_state **fence_state, unsigned int *num_fences) { struct drm_crtc *crtc; struct drm_crtc_state *crtc_state; struct drm_connector *conn; struct drm_connector_state *conn_state; int i, c = 0, ret; if (arg->flags & DRM_MODE_ATOMIC_TEST_ONLY) return 0; for_each_new_crtc_in_state(state, crtc, crtc_state, i) { s32 __user *fence_ptr; fence_ptr = get_out_fence_for_crtc(crtc_state->state, crtc); if (arg->flags & DRM_MODE_PAGE_FLIP_EVENT || fence_ptr) { struct drm_pending_vblank_event *e; e = create_vblank_event(crtc, arg->user_data); if (!e) return -ENOMEM; crtc_state->event = e; } if (arg->flags & DRM_MODE_PAGE_FLIP_EVENT) { struct drm_pending_vblank_event *e = crtc_state->event; if (!file_priv) continue; ret = drm_event_reserve_init(dev, file_priv, &e->base, &e->event.base); if (ret) { kfree(e); crtc_state->event = NULL; return ret; } } if (fence_ptr) { struct dma_fence *fence; struct drm_out_fence_state *f; f = krealloc(*fence_state, sizeof(**fence_state) * (*num_fences + 1), GFP_KERNEL); if (!f) return -ENOMEM; memset(&f[*num_fences], 0, sizeof(*f)); f[*num_fences].out_fence_ptr = fence_ptr; *fence_state = f; fence = drm_crtc_create_fence(crtc); if (!fence) return -ENOMEM; ret = setup_out_fence(&f[(*num_fences)++], fence); if (ret) { dma_fence_put(fence); return ret; } crtc_state->event->base.fence = fence; } c++; } for_each_new_connector_in_state(state, conn, conn_state, i) { struct drm_writeback_connector *wb_conn; struct drm_out_fence_state *f; struct dma_fence *fence; s32 __user *fence_ptr; if (!conn_state->writeback_job) continue; fence_ptr = get_out_fence_for_connector(state, conn); if (!fence_ptr) continue; f = krealloc(*fence_state, sizeof(**fence_state) * (*num_fences + 1), GFP_KERNEL); if (!f) return -ENOMEM; memset(&f[*num_fences], 0, sizeof(*f)); f[*num_fences].out_fence_ptr = fence_ptr; *fence_state = f; wb_conn = drm_connector_to_writeback(conn); fence = drm_writeback_get_out_fence(wb_conn); if (!fence) return -ENOMEM; ret = setup_out_fence(&f[(*num_fences)++], fence); if (ret) { dma_fence_put(fence); return ret; } conn_state->writeback_job->out_fence = fence; } /* * Having this flag means user mode pends on event which will never * reach due to lack of at least one CRTC for signaling */ if (c == 0 && (arg->flags & DRM_MODE_PAGE_FLIP_EVENT)) { drm_dbg_atomic(dev, "need at least one CRTC for DRM_MODE_PAGE_FLIP_EVENT"); return -EINVAL; } return 0; } static void complete_signaling(struct drm_device *dev, struct drm_atomic_state *state, struct drm_out_fence_state *fence_state, unsigned int num_fences, bool install_fds) { struct drm_crtc *crtc; struct drm_crtc_state *crtc_state; int i; if (install_fds) { for (i = 0; i < num_fences; i++) fd_install(fence_state[i].fd, fence_state[i].sync_file->file); kfree(fence_state); return; } for_each_new_crtc_in_state(state, crtc, crtc_state, i) { struct drm_pending_vblank_event *event = crtc_state->event; /* * Free the 
allocated event. drm_atomic_helper_setup_commit * can allocate an event too, so only free it if it's ours * to prevent a double free in drm_atomic_state_clear. */ if (event && (event->base.fence || event->base.file_priv)) { drm_event_cancel_free(dev, &event->base); crtc_state->event = NULL; } } if (!fence_state) return; for (i = 0; i < num_fences; i++) { if (fence_state[i].sync_file) fput(fence_state[i].sync_file->file); if (fence_state[i].fd >= 0) put_unused_fd(fence_state[i].fd); /* If this fails log error to the user */ if (fence_state[i].out_fence_ptr && put_user(-1, fence_state[i].out_fence_ptr)) drm_dbg_atomic(dev, "Couldn't clear out_fence_ptr\n"); } kfree(fence_state); } static void set_async_flip(struct drm_atomic_state *state) { struct drm_crtc *crtc; struct drm_crtc_state *crtc_state; int i; for_each_new_crtc_in_state(state, crtc, crtc_state, i) { crtc_state->async_flip = true; } } int drm_mode_atomic_ioctl(struct drm_device *dev, void *data, struct drm_file *file_priv) { struct drm_mode_atomic *arg = data; uint32_t __user *objs_ptr = (uint32_t __user *)(unsigned long)(arg->objs_ptr); uint32_t __user *count_props_ptr = (uint32_t __user *)(unsigned long)(arg->count_props_ptr); uint32_t __user *props_ptr = (uint32_t __user *)(unsigned long)(arg->props_ptr); uint64_t __user *prop_values_ptr = (uint64_t __user *)(unsigned long)(arg->prop_values_ptr); unsigned int copied_objs, copied_props; struct drm_atomic_state *state; struct drm_modeset_acquire_ctx ctx; struct drm_out_fence_state *fence_state; int ret = 0; unsigned int i, j, num_fences; bool async_flip = false; /* disallow for drivers not supporting atomic: */ if (!drm_core_check_feature(dev, DRIVER_ATOMIC)) return -EOPNOTSUPP; /* disallow for userspace that has not enabled atomic cap (even * though this may be a bit overkill, since legacy userspace * wouldn't know how to call this ioctl) */ if (!file_priv->atomic) { drm_dbg_atomic(dev, "commit failed: atomic cap not enabled\n"); return -EINVAL; } if (arg->flags & ~DRM_MODE_ATOMIC_FLAGS) { drm_dbg_atomic(dev, "commit failed: invalid flag\n"); return -EINVAL; } if (arg->reserved) { drm_dbg_atomic(dev, "commit failed: reserved field set\n"); return -EINVAL; } if (arg->flags & DRM_MODE_PAGE_FLIP_ASYNC) { if (!dev->mode_config.async_page_flip) { drm_dbg_atomic(dev, "commit failed: DRM_MODE_PAGE_FLIP_ASYNC not supported\n"); return -EINVAL; } async_flip = true; } /* can't test and expect an event at the same time. 
*/ if ((arg->flags & DRM_MODE_ATOMIC_TEST_ONLY) && (arg->flags & DRM_MODE_PAGE_FLIP_EVENT)) { drm_dbg_atomic(dev, "commit failed: page-flip event requested with test-only commit\n"); return -EINVAL; } state = drm_atomic_state_alloc(dev); if (!state) return -ENOMEM; drm_modeset_acquire_init(&ctx, DRM_MODESET_ACQUIRE_INTERRUPTIBLE); state->acquire_ctx = &ctx; state->allow_modeset = !!(arg->flags & DRM_MODE_ATOMIC_ALLOW_MODESET); retry: copied_objs = 0; copied_props = 0; fence_state = NULL; num_fences = 0; for (i = 0; i < arg->count_objs; i++) { uint32_t obj_id, count_props; struct drm_mode_object *obj; if (get_user(obj_id, objs_ptr + copied_objs)) { ret = -EFAULT; goto out; } obj = drm_mode_object_find(dev, file_priv, obj_id, DRM_MODE_OBJECT_ANY); if (!obj) { drm_dbg_atomic(dev, "cannot find object ID %d", obj_id); ret = -ENOENT; goto out; } if (!obj->properties) { drm_dbg_atomic(dev, "[OBJECT:%d] has no properties", obj_id); drm_mode_object_put(obj); ret = -ENOENT; goto out; } if (get_user(count_props, count_props_ptr + copied_objs)) { drm_mode_object_put(obj); ret = -EFAULT; goto out; } copied_objs++; for (j = 0; j < count_props; j++) { uint32_t prop_id; uint64_t prop_value; struct drm_property *prop; if (get_user(prop_id, props_ptr + copied_props)) { drm_mode_object_put(obj); ret = -EFAULT; goto out; } prop = drm_mode_obj_find_prop_id(obj, prop_id); if (!prop) { drm_dbg_atomic(dev, "[OBJECT:%d] cannot find property ID %d", obj_id, prop_id); drm_mode_object_put(obj); ret = -ENOENT; goto out; } if (copy_from_user(&prop_value, prop_values_ptr + copied_props, sizeof(prop_value))) { drm_mode_object_put(obj); ret = -EFAULT; goto out; } ret = drm_atomic_set_property(state, file_priv, obj, prop, prop_value, async_flip); if (ret) { drm_mode_object_put(obj); goto out; } copied_props++; } drm_mode_object_put(obj); } ret = prepare_signaling(dev, state, arg, file_priv, &fence_state, &num_fences); if (ret) goto out; if (arg->flags & DRM_MODE_PAGE_FLIP_ASYNC) set_async_flip(state); if (arg->flags & DRM_MODE_ATOMIC_TEST_ONLY) { ret = drm_atomic_check_only(state); } else if (arg->flags & DRM_MODE_ATOMIC_NONBLOCK) { ret = drm_atomic_nonblocking_commit(state); } else { ret = drm_atomic_commit(state); } out: complete_signaling(dev, state, fence_state, num_fences, !ret); if (ret == -EDEADLK) { drm_atomic_state_clear(state); ret = drm_modeset_backoff(&ctx); if (!ret) goto retry; } drm_atomic_state_put(state); drm_modeset_drop_locks(&ctx); drm_modeset_acquire_fini(&ctx); return ret; }
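/*
 * Editor's sketch (not part of the kernel source): minimal userspace use
 * of the atomic ioctl above via libdrm, exercising the IN_FENCE_FD and
 * OUT_FENCE_PTR properties described in the DOC block. The object and
 * property IDs (plane_id, crtc_id, prop_in_fence, prop_out_fence) are
 * assumed to have been looked up beforehand with
 * drmModeObjectGetProperties(); error handling is kept to a minimum.
 */
#include <errno.h>
#include <stdint.h>
#include <xf86drm.h>
#include <xf86drmMode.h>

static int commit_with_fences(int drm_fd, uint32_t plane_id, uint32_t crtc_id,
			      uint32_t prop_in_fence, uint32_t prop_out_fence,
			      int in_fence_fd)
{
	drmModeAtomicReq *req = drmModeAtomicAlloc();
	int out_fence_fd = -1;
	int ret;

	if (!req)
		return -ENOMEM;

	/* IN_FENCE_FD: the kernel waits on this sync_file before scanout. */
	drmModeAtomicAddProperty(req, plane_id, prop_in_fence, in_fence_fd);
	/* OUT_FENCE_PTR: the kernel stores a sync_file fd through this pointer. */
	drmModeAtomicAddProperty(req, crtc_id, prop_out_fence,
				 (uint64_t)(uintptr_t)&out_fence_fd);

	ret = drmModeAtomicCommit(drm_fd, req, DRM_MODE_ATOMIC_NONBLOCK, NULL);
	drmModeAtomicFree(req);
	if (ret)
		return ret;

	/* out_fence_fd signals once the new state is actually on screen. */
	return out_fence_fd;
}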
// SPDX-License-Identifier: GPL-2.0-only /* * linux/fs/hpfs/super.c * * Mikulas Patocka (mikulas@artax.karlin.mff.cuni.cz), 1998-1999 * * mounting, unmounting, error handling */ #include "hpfs_fn.h" #include <linux/module.h> #include <linux/fs_context.h> #include <linux/fs_parser.h> #include
<linux/init.h> #include <linux/statfs.h> #include <linux/magic.h> #include <linux/sched.h> #include <linux/bitmap.h> #include <linux/slab.h> #include <linux/seq_file.h> /* Mark the filesystem dirty, so that chkdsk checks it when OS/2 is booted */ static void mark_dirty(struct super_block *s, int remount) { if (hpfs_sb(s)->sb_chkdsk && (remount || !sb_rdonly(s))) { struct buffer_head *bh; struct hpfs_spare_block *sb; if ((sb = hpfs_map_sector(s, 17, &bh, 0))) { sb->dirty = 1; sb->old_wrote = 0; mark_buffer_dirty(bh); sync_dirty_buffer(bh); brelse(bh); } } } /* Mark the filesystem clean (mark it dirty for chkdsk if chkdsk==2 or if there were errors) */ static void unmark_dirty(struct super_block *s) { struct buffer_head *bh; struct hpfs_spare_block *sb; if (sb_rdonly(s)) return; sync_blockdev(s->s_bdev); if ((sb = hpfs_map_sector(s, 17, &bh, 0))) { sb->dirty = hpfs_sb(s)->sb_chkdsk > 1 - hpfs_sb(s)->sb_was_error; sb->old_wrote = hpfs_sb(s)->sb_chkdsk >= 2 && !hpfs_sb(s)->sb_was_error; mark_buffer_dirty(bh); sync_dirty_buffer(bh); brelse(bh); } } /* Filesystem error... */ void hpfs_error(struct super_block *s, const char *fmt, ...) { struct va_format vaf; va_list args; va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; pr_err("filesystem error: %pV", &vaf); va_end(args); if (!hpfs_sb(s)->sb_was_error) { if (hpfs_sb(s)->sb_err == 2) { pr_cont("; crashing the system because you wanted it\n"); mark_dirty(s, 0); panic("HPFS panic"); } else if (hpfs_sb(s)->sb_err == 1) { if (sb_rdonly(s)) pr_cont("; already mounted read-only\n"); else { pr_cont("; remounting read-only\n"); mark_dirty(s, 0); s->s_flags |= SB_RDONLY; } } else if (sb_rdonly(s)) pr_cont("; going on - but nothing will be destroyed because it's read-only\n"); else pr_cont("; corrupted filesystem mounted read/write - your computer will explode within 20 seconds ... but you wanted it so!\n"); } else pr_cont("\n"); hpfs_sb(s)->sb_was_error = 1; } /* * A little trick to detect cycles in many hpfs structures, so that a * corrupted filesystem doesn't crash the kernel. When first called, set c2 to 0. * * BTW. chkdsk doesn't detect cycles correctly. When I had 2 lost directories * nested inside each other, chkdsk locked up happily.
*/ int hpfs_stop_cycles(struct super_block *s, int key, int *c1, int *c2, char *msg) { if (*c2 && *c1 == key) { hpfs_error(s, "cycle detected on key %08x in %s", key, msg); return 1; } (*c2)++; if (!((*c2 - 1) & *c2)) *c1 = key; return 0; } static void free_sbi(struct hpfs_sb_info *sbi) { kfree(sbi->sb_cp_table); kfree(sbi->sb_bmp_dir); kfree(sbi); } static void lazy_free_sbi(struct rcu_head *rcu) { free_sbi(container_of(rcu, struct hpfs_sb_info, rcu)); } static void hpfs_put_super(struct super_block *s) { hpfs_lock(s); unmark_dirty(s); hpfs_unlock(s); call_rcu(&hpfs_sb(s)->rcu, lazy_free_sbi); } static unsigned hpfs_count_one_bitmap(struct super_block *s, secno secno) { struct quad_buffer_head qbh; unsigned long *bits; unsigned count; bits = hpfs_map_4sectors(s, secno, &qbh, 0); if (!bits) return (unsigned)-1; count = bitmap_weight(bits, 2048 * BITS_PER_BYTE); hpfs_brelse4(&qbh); return count; } static unsigned count_bitmaps(struct super_block *s) { unsigned n, count, n_bands; n_bands = (hpfs_sb(s)->sb_fs_size + 0x3fff) >> 14; count = 0; for (n = 0; n < COUNT_RD_AHEAD; n++) { hpfs_prefetch_bitmap(s, n); } for (n = 0; n < n_bands; n++) { unsigned c; hpfs_prefetch_bitmap(s, n + COUNT_RD_AHEAD); c = hpfs_count_one_bitmap(s, le32_to_cpu(hpfs_sb(s)->sb_bmp_dir[n])); if (c != (unsigned)-1) count += c; } return count; } unsigned hpfs_get_free_dnodes(struct super_block *s) { struct hpfs_sb_info *sbi = hpfs_sb(s); if (sbi->sb_n_free_dnodes == (unsigned)-1) { unsigned c = hpfs_count_one_bitmap(s, sbi->sb_dmap); if (c == (unsigned)-1) return 0; sbi->sb_n_free_dnodes = c; } return sbi->sb_n_free_dnodes; } static int hpfs_statfs(struct dentry *dentry, struct kstatfs *buf) { struct super_block *s = dentry->d_sb; struct hpfs_sb_info *sbi = hpfs_sb(s); u64 id = huge_encode_dev(s->s_bdev->bd_dev); hpfs_lock(s); if (sbi->sb_n_free == (unsigned)-1) sbi->sb_n_free = count_bitmaps(s); buf->f_type = s->s_magic; buf->f_bsize = 512; buf->f_blocks = sbi->sb_fs_size; buf->f_bfree = sbi->sb_n_free; buf->f_bavail = sbi->sb_n_free; buf->f_files = sbi->sb_dirband_size / 4; buf->f_ffree = hpfs_get_free_dnodes(s); buf->f_fsid = u64_to_fsid(id); buf->f_namelen = 254; hpfs_unlock(s); return 0; } long hpfs_ioctl(struct file *file, unsigned cmd, unsigned long arg) { switch (cmd) { case FITRIM: { struct fstrim_range range; secno n_trimmed; int r; if (!capable(CAP_SYS_ADMIN)) return -EPERM; if (copy_from_user(&range, (struct fstrim_range __user *)arg, sizeof(range))) return -EFAULT; r = hpfs_trim_fs(file_inode(file)->i_sb, range.start >> 9, (range.start + range.len) >> 9, (range.minlen + 511) >> 9, &n_trimmed); if (r) return r; range.len = (u64)n_trimmed << 9; if (copy_to_user((struct fstrim_range __user *)arg, &range, sizeof(range))) return -EFAULT; return 0; } default: { return -ENOIOCTLCMD; } } } static struct kmem_cache * hpfs_inode_cachep; static struct inode *hpfs_alloc_inode(struct super_block *sb) { struct hpfs_inode_info *ei; ei = alloc_inode_sb(sb, hpfs_inode_cachep, GFP_NOFS); if (!ei) return NULL; return &ei->vfs_inode; } static void hpfs_free_inode(struct inode *inode) { kmem_cache_free(hpfs_inode_cachep, hpfs_i(inode)); } static void init_once(void *foo) { struct hpfs_inode_info *ei = (struct hpfs_inode_info *) foo; inode_init_once(&ei->vfs_inode); } static int init_inodecache(void) { hpfs_inode_cachep = kmem_cache_create("hpfs_inode_cache", sizeof(struct hpfs_inode_info), 0, (SLAB_RECLAIM_ACCOUNT| SLAB_ACCOUNT), init_once); if (hpfs_inode_cachep == NULL) return -ENOMEM; return 0; } static void 
destroy_inodecache(void) { /* * Make sure all delayed rcu free inodes are flushed before we * destroy cache. */ rcu_barrier(); kmem_cache_destroy(hpfs_inode_cachep); } enum { Opt_help, Opt_uid, Opt_gid, Opt_umask, Opt_case, Opt_check, Opt_err, Opt_eas, Opt_chkdsk, Opt_timeshift, }; static const struct constant_table hpfs_param_case[] = { {"asis", 0}, {"lower", 1}, {} }; static const struct constant_table hpfs_param_check[] = { {"none", 0}, {"normal", 1}, {"strict", 2}, {} }; static const struct constant_table hpfs_param_err[] = { {"continue", 0}, {"remount-ro", 1}, {"panic", 2}, {} }; static const struct constant_table hpfs_param_eas[] = { {"no", 0}, {"ro", 1}, {"rw", 2}, {} }; static const struct constant_table hpfs_param_chkdsk[] = { {"no", 0}, {"errors", 1}, {"always", 2}, {} }; static const struct fs_parameter_spec hpfs_param_spec[] = { fsparam_flag ("help", Opt_help), fsparam_uid ("uid", Opt_uid), fsparam_gid ("gid", Opt_gid), fsparam_u32oct ("umask", Opt_umask), fsparam_enum ("case", Opt_case, hpfs_param_case), fsparam_enum ("check", Opt_check, hpfs_param_check), fsparam_enum ("errors", Opt_err, hpfs_param_err), fsparam_enum ("eas", Opt_eas, hpfs_param_eas), fsparam_enum ("chkdsk", Opt_chkdsk, hpfs_param_chkdsk), fsparam_s32 ("timeshift", Opt_timeshift), {} }; struct hpfs_fc_context { kuid_t uid; kgid_t gid; umode_t umask; int lowercase; int eas; int chk; int errs; int chkdsk; int timeshift; }; static inline void hpfs_help(void) { pr_info("\n\ HPFS filesystem options:\n\ help do not mount and display this text\n\ uid=xxx set uid of files that don't have uid specified in eas\n\ gid=xxx set gid of files that don't have gid specified in eas\n\ umask=xxx set mode of files that don't have mode specified in eas\n\ case=lower lowercase all files\n\ case=asis do not lowercase files (default)\n\ check=none no fs checks - kernel may crash on corrupted filesystem\n\ check=normal do some checks - it should not crash (default)\n\ check=strict do extra time-consuming checks, used for debugging\n\ errors=continue continue on errors\n\ errors=remount-ro remount read-only if errors found (default)\n\ errors=panic panic on errors\n\ chkdsk=no do not mark fs for chkdsking even if there were errors\n\ chkdsk=errors mark fs dirty if errors found (default)\n\ chkdsk=always always mark fs dirty - used for debugging\n\ eas=no ignore extended attributes\n\ eas=ro read but do not write extended attributes\n\ eas=rw r/w eas => enables chmod, chown, mknod, ln -s (default)\n\ timeshift=nnn add nnn seconds to file times\n\ \n"); } static int hpfs_parse_param(struct fs_context *fc, struct fs_parameter *param) { struct hpfs_fc_context *ctx = fc->fs_private; struct fs_parse_result result; int opt; opt = fs_parse(fc, hpfs_param_spec, param, &result); if (opt < 0) return opt; switch (opt) { case Opt_help: hpfs_help(); return -EINVAL; case Opt_uid: ctx->uid = result.uid; break; case Opt_gid: ctx->gid = result.gid; break; case Opt_umask: ctx->umask = result.uint_32; break; case Opt_case: ctx->lowercase = result.uint_32; break; case Opt_check: ctx->chk = result.uint_32; break; case Opt_err: ctx->errs = result.uint_32; break; case Opt_eas: ctx->eas = result.uint_32; break; case Opt_chkdsk: ctx->chkdsk = result.uint_32; break; case Opt_timeshift: { int m = 1; char *rhs = param->string; int timeshift; if (*rhs == '-') m = -1; if (*rhs == '+' || *rhs == '-') rhs++; timeshift = simple_strtoul(rhs, &rhs, 0) * m; if (*rhs) return -EINVAL; ctx->timeshift = timeshift; break; } default: return -EINVAL; } return 0; } static int 
hpfs_reconfigure(struct fs_context *fc) { struct hpfs_fc_context *ctx = fc->fs_private; struct super_block *s = fc->root->d_sb; struct hpfs_sb_info *sbi = hpfs_sb(s); sync_filesystem(s); fc->sb_flags |= SB_NOATIME; hpfs_lock(s); if (ctx->timeshift != sbi->sb_timeshift) { pr_err("timeshift can't be changed using remount.\n"); goto out_err; } unmark_dirty(s); sbi->sb_uid = ctx->uid; sbi->sb_gid = ctx->gid; sbi->sb_mode = 0777 & ~ctx->umask; sbi->sb_lowercase = ctx->lowercase; sbi->sb_eas = ctx->eas; sbi->sb_chk = ctx->chk; sbi->sb_chkdsk = ctx->chkdsk; sbi->sb_err = ctx->errs; sbi->sb_timeshift = ctx->timeshift; if (!(fc->sb_flags & SB_RDONLY)) mark_dirty(s, 1); hpfs_unlock(s); return 0; out_err: hpfs_unlock(s); return -EINVAL; } static int hpfs_show_options(struct seq_file *seq, struct dentry *root) { struct hpfs_sb_info *sbi = hpfs_sb(root->d_sb); seq_printf(seq, ",uid=%u", from_kuid_munged(&init_user_ns, sbi->sb_uid)); seq_printf(seq, ",gid=%u", from_kgid_munged(&init_user_ns, sbi->sb_gid)); seq_printf(seq, ",umask=%03o", (~sbi->sb_mode & 0777)); if (sbi->sb_lowercase) seq_printf(seq, ",case=lower"); if (!sbi->sb_chk) seq_printf(seq, ",check=none"); if (sbi->sb_chk == 2) seq_printf(seq, ",check=strict"); if (!sbi->sb_err) seq_printf(seq, ",errors=continue"); if (sbi->sb_err == 2) seq_printf(seq, ",errors=panic"); if (!sbi->sb_chkdsk) seq_printf(seq, ",chkdsk=no"); if (sbi->sb_chkdsk == 2) seq_printf(seq, ",chkdsk=always"); if (!sbi->sb_eas) seq_printf(seq, ",eas=no"); if (sbi->sb_eas == 1) seq_printf(seq, ",eas=ro"); if (sbi->sb_timeshift) seq_printf(seq, ",timeshift=%d", sbi->sb_timeshift); return 0; } /* Super operations */ static const struct super_operations hpfs_sops = { .alloc_inode = hpfs_alloc_inode, .free_inode = hpfs_free_inode, .evict_inode = hpfs_evict_inode, .put_super = hpfs_put_super, .statfs = hpfs_statfs, .show_options = hpfs_show_options, }; static int hpfs_fill_super(struct super_block *s, struct fs_context *fc) { struct hpfs_fc_context *ctx = fc->fs_private; struct buffer_head *bh0, *bh1, *bh2; struct hpfs_boot_block *bootblock; struct hpfs_super_block *superblock; struct hpfs_spare_block *spareblock; struct hpfs_sb_info *sbi; struct inode *root; int silent = fc->sb_flags & SB_SILENT; dnode_secno root_dno; struct hpfs_dirent *de = NULL; struct quad_buffer_head qbh; sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); if (!sbi) { return -ENOMEM; } s->s_fs_info = sbi; mutex_init(&sbi->hpfs_mutex); hpfs_lock(s); /*sbi->sb_mounting = 1;*/ sb_set_blocksize(s, 512); sbi->sb_fs_size = -1; if (!(bootblock = hpfs_map_sector(s, 0, &bh0, 0))) goto bail1; if (!(superblock = hpfs_map_sector(s, 16, &bh1, 1))) goto bail2; if (!(spareblock = hpfs_map_sector(s, 17, &bh2, 0))) goto bail3; /* Check magics */ if (/*le16_to_cpu(bootblock->magic) != BB_MAGIC ||*/ le32_to_cpu(superblock->magic) != SB_MAGIC || le32_to_cpu(spareblock->magic) != SP_MAGIC) { if (!silent) pr_err("Bad magic ... probably not HPFS\n"); goto bail4; } /* Check version */ if (!sb_rdonly(s) && superblock->funcversion != 2 && superblock->funcversion != 3) { pr_err("Bad version %d,%d. 
Mount readonly to go around\n", (int)superblock->version, (int)superblock->funcversion); pr_err("please try recent version of HPFS driver at http://artax.karlin.mff.cuni.cz/~mikulas/vyplody/hpfs/index-e.cgi and if it still can't understand this format, contact author - mikulas@artax.karlin.mff.cuni.cz\n"); goto bail4; } s->s_flags |= SB_NOATIME; /* Fill superblock stuff */ s->s_magic = HPFS_SUPER_MAGIC; s->s_op = &hpfs_sops; set_default_d_op(s, &hpfs_dentry_operations); s->s_time_min = local_to_gmt(s, 0); s->s_time_max = local_to_gmt(s, U32_MAX); sbi->sb_root = le32_to_cpu(superblock->root); sbi->sb_fs_size = le32_to_cpu(superblock->n_sectors); sbi->sb_bitmaps = le32_to_cpu(superblock->bitmaps); sbi->sb_dirband_start = le32_to_cpu(superblock->dir_band_start); sbi->sb_dirband_size = le32_to_cpu(superblock->n_dir_band); sbi->sb_dmap = le32_to_cpu(superblock->dir_band_bitmap); sbi->sb_uid = ctx->uid; sbi->sb_gid = ctx->gid; sbi->sb_mode = 0777 & ~ctx->umask; sbi->sb_n_free = -1; sbi->sb_n_free_dnodes = -1; sbi->sb_lowercase = ctx->lowercase; sbi->sb_eas = ctx->eas; sbi->sb_chk = ctx->chk; sbi->sb_chkdsk = ctx->chkdsk; sbi->sb_err = ctx->errs; sbi->sb_timeshift = ctx->timeshift; sbi->sb_was_error = 0; sbi->sb_cp_table = NULL; sbi->sb_c_bitmap = -1; sbi->sb_max_fwd_alloc = 0xffffff; if (sbi->sb_fs_size >= 0x80000000) { hpfs_error(s, "invalid size in superblock: %08x", (unsigned)sbi->sb_fs_size); goto bail4; } if (spareblock->n_spares_used) hpfs_load_hotfix_map(s, spareblock); /* Load bitmap directory */ if (!(sbi->sb_bmp_dir = hpfs_load_bitmap_directory(s, le32_to_cpu(superblock->bitmaps)))) goto bail4; /* Check for general fs errors*/ if (spareblock->dirty && !spareblock->old_wrote) { if (sbi->sb_err == 2) { pr_err("Improperly stopped, not mounted\n"); goto bail4; } hpfs_error(s, "improperly stopped"); } if (!sb_rdonly(s)) { spareblock->dirty = 1; spareblock->old_wrote = 0; mark_buffer_dirty(bh2); } if (le32_to_cpu(spareblock->n_dnode_spares) != le32_to_cpu(spareblock->n_dnode_spares_free)) { if (sbi->sb_err >= 2) { pr_err("Spare dnodes used, try chkdsk\n"); mark_dirty(s, 0); goto bail4; } hpfs_error(s, "warning: spare dnodes used, try chkdsk"); if (sbi->sb_err == 0) pr_err("Proceeding, but your filesystem could be corrupted if you delete files or directories\n"); } if (sbi->sb_chk) { unsigned a; if (le32_to_cpu(superblock->dir_band_end) - le32_to_cpu(superblock->dir_band_start) + 1 != le32_to_cpu(superblock->n_dir_band) || le32_to_cpu(superblock->dir_band_end) < le32_to_cpu(superblock->dir_band_start) || le32_to_cpu(superblock->n_dir_band) > 0x4000) { hpfs_error(s, "dir band size mismatch: dir_band_start==%08x, dir_band_end==%08x, n_dir_band==%08x", le32_to_cpu(superblock->dir_band_start), le32_to_cpu(superblock->dir_band_end), le32_to_cpu(superblock->n_dir_band)); goto bail4; } a = sbi->sb_dirband_size; sbi->sb_dirband_size = 0; if (hpfs_chk_sectors(s, le32_to_cpu(superblock->dir_band_start), le32_to_cpu(superblock->n_dir_band), "dir_band") || hpfs_chk_sectors(s, le32_to_cpu(superblock->dir_band_bitmap), 4, "dir_band_bitmap") || hpfs_chk_sectors(s, le32_to_cpu(superblock->bitmaps), 4, "bitmaps")) { mark_dirty(s, 0); goto bail4; } sbi->sb_dirband_size = a; } else pr_err("You really don't want any checks? 
You are crazy...\n"); /* Load code page table */ if (le32_to_cpu(spareblock->n_code_pages)) if (!(sbi->sb_cp_table = hpfs_load_code_page(s, le32_to_cpu(spareblock->code_page_dir)))) pr_err("code page support is disabled\n"); brelse(bh2); brelse(bh1); brelse(bh0); root = iget_locked(s, sbi->sb_root); if (!root) goto bail0; hpfs_init_inode(root); hpfs_read_inode(root); unlock_new_inode(root); s->s_root = d_make_root(root); if (!s->s_root) goto bail0; /* * find the root directory's . pointer & finish filling in the inode */ root_dno = hpfs_fnode_dno(s, sbi->sb_root); if (root_dno) de = map_dirent(root, root_dno, "\001\001", 2, NULL, &qbh); if (!de) hpfs_error(s, "unable to find root dir"); else { inode_set_atime(root, local_to_gmt(s, le32_to_cpu(de->read_date)), 0); inode_set_mtime(root, local_to_gmt(s, le32_to_cpu(de->write_date)), 0); inode_set_ctime(root, local_to_gmt(s, le32_to_cpu(de->creation_date)), 0); hpfs_i(root)->i_ea_size = le32_to_cpu(de->ea_size); hpfs_i(root)->i_parent_dir = root->i_ino; if (root->i_size == -1) root->i_size = 2048; if (root->i_blocks == -1) root->i_blocks = 5; hpfs_brelse4(&qbh); } hpfs_unlock(s); return 0; bail4: brelse(bh2); bail3: brelse(bh1); bail2: brelse(bh0); bail1: bail0: hpfs_unlock(s); free_sbi(sbi); return -EINVAL; } static int hpfs_get_tree(struct fs_context *fc) { return get_tree_bdev(fc, hpfs_fill_super); } static void hpfs_free_fc(struct fs_context *fc) { kfree(fc->fs_private); } static const struct fs_context_operations hpfs_fc_context_ops = { .parse_param = hpfs_parse_param, .get_tree = hpfs_get_tree, .reconfigure = hpfs_reconfigure, .free = hpfs_free_fc, }; static int hpfs_init_fs_context(struct fs_context *fc) { struct hpfs_fc_context *ctx; ctx = kzalloc(sizeof(struct hpfs_fc_context), GFP_KERNEL); if (!ctx) return -ENOMEM; if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE) { struct super_block *sb = fc->root->d_sb; struct hpfs_sb_info *sbi = hpfs_sb(sb); ctx->uid = sbi->sb_uid; ctx->gid = sbi->sb_gid; ctx->umask = 0777 & ~sbi->sb_mode; ctx->lowercase = sbi->sb_lowercase; ctx->eas = sbi->sb_eas; ctx->chk = sbi->sb_chk; ctx->chkdsk = sbi->sb_chkdsk; ctx->errs = sbi->sb_err; ctx->timeshift = sbi->sb_timeshift; } else { ctx->uid = current_uid(); ctx->gid = current_gid(); ctx->umask = current_umask(); ctx->lowercase = 0; ctx->eas = 2; ctx->chk = 1; ctx->errs = 1; ctx->chkdsk = 1; ctx->timeshift = 0; } fc->fs_private = ctx; fc->ops = &hpfs_fc_context_ops; return 0; }; static struct file_system_type hpfs_fs_type = { .owner = THIS_MODULE, .name = "hpfs", .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, .init_fs_context = hpfs_init_fs_context, .parameters = hpfs_param_spec, }; MODULE_ALIAS_FS("hpfs"); static int __init init_hpfs_fs(void) { int err = init_inodecache(); if (err) goto out1; err = register_filesystem(&hpfs_fs_type); if (err) goto out; return 0; out: destroy_inodecache(); out1: return err; } static void __exit exit_hpfs_fs(void) { unregister_filesystem(&hpfs_fs_type); destroy_inodecache(); } module_init(init_hpfs_fs) module_exit(exit_hpfs_fs) MODULE_DESCRIPTION("OS/2 HPFS file system support"); MODULE_LICENSE("GPL");
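/*
 * Editor's sketch (not part of the kernel source): the check in
 * hpfs_stop_cycles() above is a miniature Brent-style cycle detector --
 * it records the key at every power-of-two call count ((*c2 - 1) & *c2
 * is zero exactly when *c2 is a power of two) and reports a cycle when a
 * recorded key comes around again. A self-contained userspace model,
 * with a hypothetical next[] array standing in for the on-disk chain:
 */
#include <stdio.h>

static int stop_cycles(int key, int *c1, int *c2)
{
	if (*c2 && *c1 == key)
		return 1;		/* key matches the checkpoint: cycle */
	(*c2)++;
	if (!((*c2 - 1) & *c2))
		*c1 = key;		/* power-of-two step count: new checkpoint */
	return 0;
}

int main(void)
{
	/* A six-node chain whose tail loops back to node 2. */
	static const int next[] = { 1, 2, 3, 4, 5, 2 };
	int c1 = 0, c2 = 0, node = 0;

	while (!stop_cycles(node, &c1, &c2))
		node = next[node];
	printf("cycle detected at node %d after %d steps\n", node, c2);
	return 0;
}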
49 49 46 39 49 49 49 23 49 49 49 49 49 32 43 5 45 32 37 3 49 49 49 49 49 49 53 48 5 1 1 1 1 1 1 1 2 2 52 52 52 52 48 48 49 23 49 48 49 49 49 49 49 53 5 207 203 207 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 
// SPDX-License-Identifier: GPL-2.0
/*
 *  MQ Deadline i/o scheduler - adaptation of the legacy deadline scheduler,
 *  for the blk-mq scheduling framework
 *
 *  Copyright (C) 2016 Jens Axboe <axboe@kernel.dk>
 */
#include <linux/kernel.h>
#include <linux/fs.h>
#include <linux/blkdev.h>
#include <linux/bio.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/compiler.h>
#include <linux/rbtree.h>
#include <linux/sbitmap.h>

#include <trace/events/block.h>

#include "elevator.h"
#include "blk.h"
#include "blk-mq.h"
#include "blk-mq-debugfs.h"
#include "blk-mq-sched.h"

/*
 * See Documentation/block/deadline-iosched.rst
 */
static const int read_expire = HZ / 2;	/* max time before a read is submitted. */
static const int write_expire = 5 * HZ;	/* ditto for writes, these limits are SOFT! */
/*
 * Time after which to dispatch lower priority requests even if higher
 * priority requests are pending.
 */
static const int prio_aging_expire = 10 * HZ;
static const int writes_starved = 2;	/* max times reads can starve a write */
static const int fifo_batch = 16;	/* # of sequential requests treated as one
					   by the above parameters. For throughput. */

enum dd_data_dir {
	DD_READ		= READ,
	DD_WRITE	= WRITE,
};

enum { DD_DIR_COUNT = 2 };

enum dd_prio {
	DD_RT_PRIO	= 0,
	DD_BE_PRIO	= 1,
	DD_IDLE_PRIO	= 2,
	DD_PRIO_MAX	= 2,
};

enum { DD_PRIO_COUNT = 3 };

/*
 * I/O statistics per I/O priority. It is fine if these counters overflow.
 * What matters is that these counters are at least as wide as
 * log2(max_outstanding_requests).
 */
struct io_stats_per_prio {
	uint32_t inserted;
	uint32_t merged;
	uint32_t dispatched;
	atomic_t completed;
};

/*
 * Deadline scheduler data per I/O priority (enum dd_prio). Requests are
 * present on both sort_list[] and fifo_list[].
 */
struct dd_per_prio {
	struct list_head dispatch;
	struct rb_root sort_list[DD_DIR_COUNT];
	struct list_head fifo_list[DD_DIR_COUNT];
	/* Position of the most recently dispatched request. */
	sector_t latest_pos[DD_DIR_COUNT];
	struct io_stats_per_prio stats;
};

struct deadline_data {
	/*
	 * run time data
	 */
	struct dd_per_prio per_prio[DD_PRIO_COUNT];

	/* Data direction of latest dispatched request. */
	enum dd_data_dir last_dir;
	unsigned int batching;		/* number of sequential requests made */
	unsigned int starved;		/* times reads have starved writes */

	/*
	 * settings that change how the i/o scheduler behaves
	 */
	int fifo_expire[DD_DIR_COUNT];
	int fifo_batch;
	int writes_starved;
	int front_merges;
	u32 async_depth;
	int prio_aging_expire;

	spinlock_t lock;
};

/*
 * Maps an I/O priority class to a deadline scheduler priority.
 */
static const enum dd_prio ioprio_class_to_prio[] = {
	[IOPRIO_CLASS_NONE]	= DD_BE_PRIO,
	[IOPRIO_CLASS_RT]	= DD_RT_PRIO,
	[IOPRIO_CLASS_BE]	= DD_BE_PRIO,
	[IOPRIO_CLASS_IDLE]	= DD_IDLE_PRIO,
};

static inline struct rb_root *
deadline_rb_root(struct dd_per_prio *per_prio, struct request *rq)
{
	return &per_prio->sort_list[rq_data_dir(rq)];
}

/*
 * Returns the I/O priority class (IOPRIO_CLASS_*) that has been assigned to a
 * request.
 */
static u8 dd_rq_ioclass(struct request *rq)
{
	return IOPRIO_PRIO_CLASS(req_get_ioprio(rq));
}

/*
 * Return the first request for which blk_rq_pos() >= @pos.
 */
static inline struct request *deadline_from_pos(struct dd_per_prio *per_prio,
				enum dd_data_dir data_dir, sector_t pos)
{
	struct rb_node *node = per_prio->sort_list[data_dir].rb_node;
	struct request *rq, *res = NULL;

	if (!node)
		return NULL;

	rq = rb_entry_rq(node);
	while (node) {
		rq = rb_entry_rq(node);
		if (blk_rq_pos(rq) >= pos) {
			res = rq;
			node = node->rb_left;
		} else {
			node = node->rb_right;
		}
	}
	return res;
}

static void
deadline_add_rq_rb(struct dd_per_prio *per_prio, struct request *rq)
{
	struct rb_root *root = deadline_rb_root(per_prio, rq);

	elv_rb_add(root, rq);
}

static inline void
deadline_del_rq_rb(struct dd_per_prio *per_prio, struct request *rq)
{
	elv_rb_del(deadline_rb_root(per_prio, rq), rq);
}

/*
 * remove rq from rbtree and fifo.
 */
static void deadline_remove_request(struct request_queue *q,
				    struct dd_per_prio *per_prio,
				    struct request *rq)
{
	list_del_init(&rq->queuelist);

	/*
	 * We might not be on the rbtree, if we are doing an insert merge
	 */
	if (!RB_EMPTY_NODE(&rq->rb_node))
		deadline_del_rq_rb(per_prio, rq);

	elv_rqhash_del(q, rq);
	if (q->last_merge == rq)
		q->last_merge = NULL;
}

static void dd_request_merged(struct request_queue *q, struct request *req,
			      enum elv_merge type)
{
	struct deadline_data *dd = q->elevator->elevator_data;
	const u8 ioprio_class = dd_rq_ioclass(req);
	const enum dd_prio prio = ioprio_class_to_prio[ioprio_class];
	struct dd_per_prio *per_prio = &dd->per_prio[prio];

	/*
	 * if the merge was a front merge, we need to reposition request
	 */
	if (type == ELEVATOR_FRONT_MERGE) {
		elv_rb_del(deadline_rb_root(per_prio, req), req);
		deadline_add_rq_rb(per_prio, req);
	}
}

/*
 * Callback function that is invoked after @next has been merged into @req.
 */
static void dd_merged_requests(struct request_queue *q, struct request *req,
			       struct request *next)
{
	struct deadline_data *dd = q->elevator->elevator_data;
	const u8 ioprio_class = dd_rq_ioclass(next);
	const enum dd_prio prio = ioprio_class_to_prio[ioprio_class];

	lockdep_assert_held(&dd->lock);

	dd->per_prio[prio].stats.merged++;

	/*
	 * if next expires before rq, assign its expire time to rq
	 * and move into next position (next will be deleted) in fifo
	 */
	if (!list_empty(&req->queuelist) && !list_empty(&next->queuelist)) {
		if (time_before((unsigned long)next->fifo_time,
				(unsigned long)req->fifo_time)) {
			list_move(&req->queuelist, &next->queuelist);
			req->fifo_time = next->fifo_time;
		}
	}

	/*
	 * kill knowledge of next, this one is a goner
	 */
	deadline_remove_request(q, &dd->per_prio[prio], next);
}

/*
 * move an entry to dispatch queue
 */
static void
deadline_move_request(struct deadline_data *dd, struct dd_per_prio *per_prio,
		      struct request *rq)
{
	/*
	 * take it off the sort and fifo list
	 */
	deadline_remove_request(rq->q, per_prio, rq);
}
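/*
 * Illustrative note (not part of the original source): deadline_from_pos()
 * above is a plain rbtree successor lookup. With requests at sectors 8, 64
 * and 128 on the read sort_list, the following holds:
 *
 *	deadline_from_pos(per_prio, DD_READ, 64)  -> request at sector 64
 *	deadline_from_pos(per_prio, DD_READ, 65)  -> request at sector 128
 *	deadline_from_pos(per_prio, DD_READ, 200) -> NULL
 *
 * Dispatch combines it with latest_pos[] to continue a batch at the first
 * request at or beyond the most recently dispatched sector.
 */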
/*
 * Number of requests queued for a given priority level.
 */
static u32 dd_queued(struct deadline_data *dd, enum dd_prio prio)
{
	const struct io_stats_per_prio *stats = &dd->per_prio[prio].stats;

	lockdep_assert_held(&dd->lock);

	return stats->inserted - atomic_read(&stats->completed);
}

/*
 * deadline_check_fifo returns true if and only if there are expired requests
 * in the FIFO list. Requires !list_empty(&dd->fifo_list[data_dir]).
 */
static inline bool deadline_check_fifo(struct dd_per_prio *per_prio,
				       enum dd_data_dir data_dir)
{
	struct request *rq = rq_entry_fifo(per_prio->fifo_list[data_dir].next);

	return time_is_before_eq_jiffies((unsigned long)rq->fifo_time);
}

/*
 * For the specified data direction, return the next request to
 * dispatch using arrival ordered lists.
 */
static struct request *
deadline_fifo_request(struct deadline_data *dd, struct dd_per_prio *per_prio,
		      enum dd_data_dir data_dir)
{
	if (list_empty(&per_prio->fifo_list[data_dir]))
		return NULL;

	return rq_entry_fifo(per_prio->fifo_list[data_dir].next);
}

/*
 * For the specified data direction, return the next request to
 * dispatch using sector position sorted lists.
 */
static struct request *
deadline_next_request(struct deadline_data *dd, struct dd_per_prio *per_prio,
		      enum dd_data_dir data_dir)
{
	return deadline_from_pos(per_prio, data_dir,
				 per_prio->latest_pos[data_dir]);
}

/*
 * Returns true if and only if @rq started after @latest_start where
 * @latest_start is in jiffies.
 */
static bool started_after(struct deadline_data *dd, struct request *rq,
			  unsigned long latest_start)
{
	unsigned long start_time = (unsigned long)rq->fifo_time;

	start_time -= dd->fifo_expire[rq_data_dir(rq)];

	return time_after(start_time, latest_start);
}

/*
 * deadline_dispatch_requests selects the best request according to
 * read/write expire, fifo_batch, etc and with a start time <= @latest_start.
 */
static struct request *__dd_dispatch_request(struct deadline_data *dd,
					     struct dd_per_prio *per_prio,
					     unsigned long latest_start)
{
	struct request *rq, *next_rq;
	enum dd_data_dir data_dir;
	enum dd_prio prio;
	u8 ioprio_class;

	lockdep_assert_held(&dd->lock);

	if (!list_empty(&per_prio->dispatch)) {
		rq = list_first_entry(&per_prio->dispatch, struct request,
				      queuelist);
		if (started_after(dd, rq, latest_start))
			return NULL;
		list_del_init(&rq->queuelist);
		data_dir = rq_data_dir(rq);
		goto done;
	}

	/*
	 * batches are currently reads XOR writes
	 */
	rq = deadline_next_request(dd, per_prio, dd->last_dir);
	if (rq && dd->batching < dd->fifo_batch) {
		/* we have a next request and are still entitled to batch */
		data_dir = rq_data_dir(rq);
		goto dispatch_request;
	}

	/*
	 * at this point we are not running a batch. select the appropriate
	 * data direction (read / write)
	 */
	if (!list_empty(&per_prio->fifo_list[DD_READ])) {
		BUG_ON(RB_EMPTY_ROOT(&per_prio->sort_list[DD_READ]));

		if (deadline_fifo_request(dd, per_prio, DD_WRITE) &&
		    (dd->starved++ >= dd->writes_starved))
			goto dispatch_writes;

		data_dir = DD_READ;

		goto dispatch_find_request;
	}

	/*
	 * there are either no reads or writes have been starved
	 */
	if (!list_empty(&per_prio->fifo_list[DD_WRITE])) {
dispatch_writes:
		BUG_ON(RB_EMPTY_ROOT(&per_prio->sort_list[DD_WRITE]));

		dd->starved = 0;

		data_dir = DD_WRITE;

		goto dispatch_find_request;
	}

	return NULL;

dispatch_find_request:
	/*
	 * we are not running a batch, find best request for selected data_dir
	 */
	next_rq = deadline_next_request(dd, per_prio, data_dir);
	if (deadline_check_fifo(per_prio, data_dir) || !next_rq) {
		/*
		 * A deadline has expired, the last request was in the other
		 * direction, or we have run out of higher-sectored requests.
		 * Start again from the request with the earliest expiry time.
		 */
		rq = deadline_fifo_request(dd, per_prio, data_dir);
	} else {
		/*
		 * The last req was the same dir and we have a next request in
		 * sort order. No expired requests so continue on from here.
		 */
		rq = next_rq;
	}

	if (!rq)
		return NULL;

	dd->last_dir = data_dir;
	dd->batching = 0;

dispatch_request:
	if (started_after(dd, rq, latest_start))
		return NULL;

	/*
	 * rq is the selected appropriate request.
	 */
	dd->batching++;
	deadline_move_request(dd, per_prio, rq);
done:
	ioprio_class = dd_rq_ioclass(rq);
	prio = ioprio_class_to_prio[ioprio_class];
	dd->per_prio[prio].latest_pos[data_dir] = blk_rq_pos(rq);
	dd->per_prio[prio].stats.dispatched++;
	rq->rq_flags |= RQF_STARTED;
	return rq;
}

/*
 * Check whether there are any requests with priority other than DD_RT_PRIO
 * that were inserted more than prio_aging_expire jiffies ago.
 */
static struct request *dd_dispatch_prio_aged_requests(struct deadline_data *dd,
						      unsigned long now)
{
	struct request *rq;
	enum dd_prio prio;
	int prio_cnt;

	lockdep_assert_held(&dd->lock);

	prio_cnt = !!dd_queued(dd, DD_RT_PRIO) + !!dd_queued(dd, DD_BE_PRIO) +
		   !!dd_queued(dd, DD_IDLE_PRIO);
	if (prio_cnt < 2)
		return NULL;

	for (prio = DD_BE_PRIO; prio <= DD_PRIO_MAX; prio++) {
		rq = __dd_dispatch_request(dd, &dd->per_prio[prio],
					   now - dd->prio_aging_expire);
		if (rq)
			return rq;
	}

	return NULL;
}

/*
 * Called from blk_mq_run_hw_queue() -> __blk_mq_sched_dispatch_requests().
 *
 * One confusing aspect here is that we get called for a specific
 * hardware queue, but we may return a request that is for a
 * different hardware queue. This is because mq-deadline has shared
 * state for all hardware queues, in terms of sorting, FIFOs, etc.
 */
static struct request *dd_dispatch_request(struct blk_mq_hw_ctx *hctx)
{
	struct deadline_data *dd = hctx->queue->elevator->elevator_data;
	const unsigned long now = jiffies;
	struct request *rq;
	enum dd_prio prio;

	spin_lock(&dd->lock);
	rq = dd_dispatch_prio_aged_requests(dd, now);
	if (rq)
		goto unlock;

	/*
	 * Next, dispatch requests in priority order. Ignore lower priority
	 * requests if any higher priority requests are pending.
	 */
	for (prio = 0; prio <= DD_PRIO_MAX; prio++) {
		rq = __dd_dispatch_request(dd, &dd->per_prio[prio], now);
		if (rq || dd_queued(dd, prio))
			break;
	}

unlock:
	spin_unlock(&dd->lock);

	return rq;
}

/*
 * Called by __blk_mq_alloc_request(). The shallow_depth value set by this
 * function is used by __blk_mq_get_tag().
 */
static void dd_limit_depth(blk_opf_t opf, struct blk_mq_alloc_data *data)
{
	struct deadline_data *dd = data->q->elevator->elevator_data;

	/* Do not throttle synchronous reads. */
	if (op_is_sync(opf) && !op_is_write(opf))
		return;

	/*
	 * Throttle asynchronous requests and writes such that these requests
	 * do not block the allocation of synchronous requests.
	 */
	data->shallow_depth = dd->async_depth;
}

/* Called by blk_mq_update_nr_requests(). */
static void dd_depth_updated(struct blk_mq_hw_ctx *hctx)
{
	struct request_queue *q = hctx->queue;
	struct deadline_data *dd = q->elevator->elevator_data;
	struct blk_mq_tags *tags = hctx->sched_tags;

	dd->async_depth = q->nr_requests;

	sbitmap_queue_min_shallow_depth(&tags->bitmap_tags, 1);
}
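/*
 * Illustrative note (not part of the original source): with the code above,
 * dd_limit_depth() leaves synchronous reads alone and caps everything else
 * at dd->async_depth tags:
 *
 *	opf = REQ_OP_READ | REQ_SYNC	-> no cap, full tag depth
 *	opf = REQ_OP_WRITE		-> data->shallow_depth = dd->async_depth
 *
 * Since dd_depth_updated() initializes async_depth to q->nr_requests, the
 * cap only bites once async_depth has been lowered through the sysfs
 * attribute defined further down.
 */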
/* Called by blk_mq_init_hctx() and blk_mq_init_sched(). */
static int dd_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
{
	dd_depth_updated(hctx);
	return 0;
}

static void dd_exit_sched(struct elevator_queue *e)
{
	struct deadline_data *dd = e->elevator_data;
	enum dd_prio prio;

	for (prio = 0; prio <= DD_PRIO_MAX; prio++) {
		struct dd_per_prio *per_prio = &dd->per_prio[prio];
		const struct io_stats_per_prio *stats = &per_prio->stats;
		uint32_t queued;

		WARN_ON_ONCE(!list_empty(&per_prio->fifo_list[DD_READ]));
		WARN_ON_ONCE(!list_empty(&per_prio->fifo_list[DD_WRITE]));

		spin_lock(&dd->lock);
		queued = dd_queued(dd, prio);
		spin_unlock(&dd->lock);

		WARN_ONCE(queued != 0,
			  "statistics for priority %d: i %u m %u d %u c %u\n",
			  prio, stats->inserted, stats->merged,
			  stats->dispatched, atomic_read(&stats->completed));
	}

	kfree(dd);
}

/*
 * initialize elevator private data (deadline_data).
 */
static int dd_init_sched(struct request_queue *q, struct elevator_queue *eq)
{
	struct deadline_data *dd;
	enum dd_prio prio;

	dd = kzalloc_node(sizeof(*dd), GFP_KERNEL, q->node);
	if (!dd)
		return -ENOMEM;

	eq->elevator_data = dd;

	for (prio = 0; prio <= DD_PRIO_MAX; prio++) {
		struct dd_per_prio *per_prio = &dd->per_prio[prio];

		INIT_LIST_HEAD(&per_prio->dispatch);
		INIT_LIST_HEAD(&per_prio->fifo_list[DD_READ]);
		INIT_LIST_HEAD(&per_prio->fifo_list[DD_WRITE]);
		per_prio->sort_list[DD_READ] = RB_ROOT;
		per_prio->sort_list[DD_WRITE] = RB_ROOT;
	}

	dd->fifo_expire[DD_READ] = read_expire;
	dd->fifo_expire[DD_WRITE] = write_expire;
	dd->writes_starved = writes_starved;
	dd->front_merges = 1;
	dd->last_dir = DD_WRITE;
	dd->fifo_batch = fifo_batch;
	dd->prio_aging_expire = prio_aging_expire;
	spin_lock_init(&dd->lock);

	/* We dispatch from request queue wide instead of hw queue */
	blk_queue_flag_set(QUEUE_FLAG_SQ_SCHED, q);

	q->elevator = eq;
	return 0;
}

/*
 * Try to merge @bio into an existing request. If @bio has been merged into
 * an existing request, store the pointer to that request into *@rq.
 */
static int dd_request_merge(struct request_queue *q, struct request **rq,
			    struct bio *bio)
{
	struct deadline_data *dd = q->elevator->elevator_data;
	const u8 ioprio_class = IOPRIO_PRIO_CLASS(bio->bi_ioprio);
	const enum dd_prio prio = ioprio_class_to_prio[ioprio_class];
	struct dd_per_prio *per_prio = &dd->per_prio[prio];
	sector_t sector = bio_end_sector(bio);
	struct request *__rq;

	if (!dd->front_merges)
		return ELEVATOR_NO_MERGE;

	__rq = elv_rb_find(&per_prio->sort_list[bio_data_dir(bio)], sector);
	if (__rq) {
		BUG_ON(sector != blk_rq_pos(__rq));

		if (elv_bio_merge_ok(__rq, bio)) {
			*rq = __rq;
			if (blk_discard_mergable(__rq))
				return ELEVATOR_DISCARD_MERGE;
			return ELEVATOR_FRONT_MERGE;
		}
	}

	return ELEVATOR_NO_MERGE;
}
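/*
 * Illustrative note (not part of the original source): a front merge is only
 * possible when the bio ends exactly where a queued request starts, which is
 * why dd_request_merge() looks up bio_end_sector(bio) in the sector-sorted
 * rbtree:
 *
 *	bio:	 sectors [100, 108)	-> bio_end_sector() == 108
 *	request: sectors [108, 116)	-> blk_rq_pos() == 108
 *
 * elv_rb_find() returns the request keyed at sector 108 and the bio is
 * merged in front of it (ELEVATOR_FRONT_MERGE), after which
 * dd_request_merged() re-sorts the grown request in the rbtree.
 */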
/*
 * Attempt to merge a bio into an existing request. This function is called
 * before @bio is associated with a request.
 */
static bool dd_bio_merge(struct request_queue *q, struct bio *bio,
			 unsigned int nr_segs)
{
	struct deadline_data *dd = q->elevator->elevator_data;
	struct request *free = NULL;
	bool ret;

	spin_lock(&dd->lock);
	ret = blk_mq_sched_try_merge(q, bio, nr_segs, &free);
	spin_unlock(&dd->lock);

	if (free)
		blk_mq_free_request(free);

	return ret;
}

/*
 * add rq to rbtree and fifo
 */
static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
			      blk_insert_t flags, struct list_head *free)
{
	struct request_queue *q = hctx->queue;
	struct deadline_data *dd = q->elevator->elevator_data;
	const enum dd_data_dir data_dir = rq_data_dir(rq);
	u16 ioprio = req_get_ioprio(rq);
	u8 ioprio_class = IOPRIO_PRIO_CLASS(ioprio);
	struct dd_per_prio *per_prio;
	enum dd_prio prio;

	lockdep_assert_held(&dd->lock);

	prio = ioprio_class_to_prio[ioprio_class];
	per_prio = &dd->per_prio[prio];
	if (!rq->elv.priv[0])
		per_prio->stats.inserted++;
	rq->elv.priv[0] = per_prio;

	if (blk_mq_sched_try_insert_merge(q, rq, free))
		return;

	trace_block_rq_insert(rq);

	if (flags & BLK_MQ_INSERT_AT_HEAD) {
		list_add(&rq->queuelist, &per_prio->dispatch);
		rq->fifo_time = jiffies;
	} else {
		deadline_add_rq_rb(per_prio, rq);

		if (rq_mergeable(rq)) {
			elv_rqhash_add(q, rq);
			if (!q->last_merge)
				q->last_merge = rq;
		}

		/*
		 * set expire time and add to fifo list
		 */
		rq->fifo_time = jiffies + dd->fifo_expire[data_dir];
		list_add_tail(&rq->queuelist, &per_prio->fifo_list[data_dir]);
	}
}

/*
 * Called from blk_mq_insert_request() or blk_mq_dispatch_list().
 */
static void dd_insert_requests(struct blk_mq_hw_ctx *hctx,
			       struct list_head *list,
			       blk_insert_t flags)
{
	struct request_queue *q = hctx->queue;
	struct deadline_data *dd = q->elevator->elevator_data;
	LIST_HEAD(free);

	spin_lock(&dd->lock);
	while (!list_empty(list)) {
		struct request *rq;

		rq = list_first_entry(list, struct request, queuelist);
		list_del_init(&rq->queuelist);
		dd_insert_request(hctx, rq, flags, &free);
	}
	spin_unlock(&dd->lock);

	blk_mq_free_requests(&free);
}

/* Callback from inside blk_mq_rq_ctx_init(). */
static void dd_prepare_request(struct request *rq)
{
	rq->elv.priv[0] = NULL;
}
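/*
 * Illustrative note (not part of the original source): rq->elv.priv[0]
 * carries the per-priority bookkeeping through the request's lifetime:
 *
 *	dd_prepare_request():	priv[0] = NULL
 *	dd_insert_request():	priv[0] = per_prio, stats.inserted++
 *	dd_finish_request():	if (priv[0]) stats.completed++
 *
 * Requests that bypass the scheduler never pass through the insert path,
 * keep priv[0] == NULL and are therefore skipped on completion, keeping the
 * inserted/completed counters balanced for dd_queued().
 */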
/*
 * Callback from inside blk_mq_free_request().
 */
static void dd_finish_request(struct request *rq)
{
	struct dd_per_prio *per_prio = rq->elv.priv[0];

	/*
	 * The block layer core may call dd_finish_request() without having
	 * called dd_insert_requests(). Skip requests that bypassed I/O
	 * scheduling. See also blk_mq_request_bypass_insert().
	 */
	if (per_prio)
		atomic_inc(&per_prio->stats.completed);
}

static bool dd_has_work_for_prio(struct dd_per_prio *per_prio)
{
	return !list_empty_careful(&per_prio->dispatch) ||
		!list_empty_careful(&per_prio->fifo_list[DD_READ]) ||
		!list_empty_careful(&per_prio->fifo_list[DD_WRITE]);
}

static bool dd_has_work(struct blk_mq_hw_ctx *hctx)
{
	struct deadline_data *dd = hctx->queue->elevator->elevator_data;
	enum dd_prio prio;

	for (prio = 0; prio <= DD_PRIO_MAX; prio++)
		if (dd_has_work_for_prio(&dd->per_prio[prio]))
			return true;

	return false;
}

/*
 * sysfs parts below
 */
#define SHOW_INT(__FUNC, __VAR)						\
static ssize_t __FUNC(struct elevator_queue *e, char *page)		\
{									\
	struct deadline_data *dd = e->elevator_data;			\
									\
	return sysfs_emit(page, "%d\n", __VAR);				\
}

#define SHOW_JIFFIES(__FUNC, __VAR)					\
	SHOW_INT(__FUNC, jiffies_to_msecs(__VAR))

SHOW_JIFFIES(deadline_read_expire_show, dd->fifo_expire[DD_READ]);
SHOW_JIFFIES(deadline_write_expire_show, dd->fifo_expire[DD_WRITE]);
SHOW_JIFFIES(deadline_prio_aging_expire_show, dd->prio_aging_expire);
SHOW_INT(deadline_writes_starved_show, dd->writes_starved);
SHOW_INT(deadline_front_merges_show, dd->front_merges);
SHOW_INT(deadline_async_depth_show, dd->async_depth);
SHOW_INT(deadline_fifo_batch_show, dd->fifo_batch);
#undef SHOW_INT
#undef SHOW_JIFFIES

#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV)			\
static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count) \
{									\
	struct deadline_data *dd = e->elevator_data;			\
	int __data, __ret;						\
									\
	__ret = kstrtoint(page, 0, &__data);				\
	if (__ret < 0)							\
		return __ret;						\
	if (__data < (MIN))						\
		__data = (MIN);						\
	else if (__data > (MAX))					\
		__data = (MAX);						\
	*(__PTR) = __CONV(__data);					\
	return count;							\
}

#define STORE_INT(__FUNC, __PTR, MIN, MAX)				\
	STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, )
#define STORE_JIFFIES(__FUNC, __PTR, MIN, MAX)				\
	STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, msecs_to_jiffies)

STORE_JIFFIES(deadline_read_expire_store, &dd->fifo_expire[DD_READ], 0, INT_MAX);
STORE_JIFFIES(deadline_write_expire_store, &dd->fifo_expire[DD_WRITE], 0, INT_MAX);
STORE_JIFFIES(deadline_prio_aging_expire_store, &dd->prio_aging_expire, 0, INT_MAX);
STORE_INT(deadline_writes_starved_store, &dd->writes_starved, INT_MIN, INT_MAX);
STORE_INT(deadline_front_merges_store, &dd->front_merges, 0, 1);
STORE_INT(deadline_async_depth_store, &dd->async_depth, 1, INT_MAX);
STORE_INT(deadline_fifo_batch_store, &dd->fifo_batch, 0, INT_MAX);
#undef STORE_FUNCTION
#undef STORE_INT
#undef STORE_JIFFIES

#define DD_ATTR(name) \
	__ATTR(name, 0644, deadline_##name##_show, deadline_##name##_store)

static const struct elv_fs_entry deadline_attrs[] = {
	DD_ATTR(read_expire),
	DD_ATTR(write_expire),
	DD_ATTR(writes_starved),
	DD_ATTR(front_merges),
	DD_ATTR(async_depth),
	DD_ATTR(fifo_batch),
	DD_ATTR(prio_aging_expire),
	__ATTR_NULL
};
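/*
 * Illustrative usage (not part of the original source): the attributes above
 * surface under /sys/block/<dev>/queue/iosched/ while mq-deadline is the
 * active scheduler. The *_expire values are exchanged in milliseconds and
 * converted with msecs_to_jiffies()/jiffies_to_msecs(), e.g.:
 *
 *	# cat /sys/block/sda/queue/iosched/read_expire
 *	500
 *	# echo 250 > /sys/block/sda/queue/iosched/read_expire
 *
 * ("sda" is an example device name.)
 */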
#ifdef CONFIG_BLK_DEBUG_FS
#define DEADLINE_DEBUGFS_DDIR_ATTRS(prio, data_dir, name)		\
static void *deadline_##name##_fifo_start(struct seq_file *m,		\
					  loff_t *pos)			\
	__acquires(&dd->lock)						\
{									\
	struct request_queue *q = m->private;				\
	struct deadline_data *dd = q->elevator->elevator_data;		\
	struct dd_per_prio *per_prio = &dd->per_prio[prio];		\
									\
	spin_lock(&dd->lock);						\
	return seq_list_start(&per_prio->fifo_list[data_dir], *pos);	\
}									\
									\
static void *deadline_##name##_fifo_next(struct seq_file *m, void *v,	\
					 loff_t *pos)			\
{									\
	struct request_queue *q = m->private;				\
	struct deadline_data *dd = q->elevator->elevator_data;		\
	struct dd_per_prio *per_prio = &dd->per_prio[prio];		\
									\
	return seq_list_next(v, &per_prio->fifo_list[data_dir], pos);	\
}									\
									\
static void deadline_##name##_fifo_stop(struct seq_file *m, void *v)	\
	__releases(&dd->lock)						\
{									\
	struct request_queue *q = m->private;				\
	struct deadline_data *dd = q->elevator->elevator_data;		\
									\
	spin_unlock(&dd->lock);						\
}									\
									\
static const struct seq_operations deadline_##name##_fifo_seq_ops = {	\
	.start	= deadline_##name##_fifo_start,				\
	.next	= deadline_##name##_fifo_next,				\
	.stop	= deadline_##name##_fifo_stop,				\
	.show	= blk_mq_debugfs_rq_show,				\
};									\
									\
static int deadline_##name##_next_rq_show(void *data,			\
					  struct seq_file *m)		\
{									\
	struct request_queue *q = data;					\
	struct deadline_data *dd = q->elevator->elevator_data;		\
	struct dd_per_prio *per_prio = &dd->per_prio[prio];		\
	struct request *rq;						\
									\
	rq = deadline_from_pos(per_prio, data_dir,			\
			       per_prio->latest_pos[data_dir]);		\
	if (rq)								\
		__blk_mq_debugfs_rq_show(m, rq);			\
	return 0;							\
}

DEADLINE_DEBUGFS_DDIR_ATTRS(DD_RT_PRIO, DD_READ, read0);
DEADLINE_DEBUGFS_DDIR_ATTRS(DD_RT_PRIO, DD_WRITE, write0);
DEADLINE_DEBUGFS_DDIR_ATTRS(DD_BE_PRIO, DD_READ, read1);
DEADLINE_DEBUGFS_DDIR_ATTRS(DD_BE_PRIO, DD_WRITE, write1);
DEADLINE_DEBUGFS_DDIR_ATTRS(DD_IDLE_PRIO, DD_READ, read2);
DEADLINE_DEBUGFS_DDIR_ATTRS(DD_IDLE_PRIO, DD_WRITE, write2);
#undef DEADLINE_DEBUGFS_DDIR_ATTRS

static int deadline_batching_show(void *data, struct seq_file *m)
{
	struct request_queue *q = data;
	struct deadline_data *dd = q->elevator->elevator_data;

	seq_printf(m, "%u\n", dd->batching);
	return 0;
}

static int deadline_starved_show(void *data, struct seq_file *m)
{
	struct request_queue *q = data;
	struct deadline_data *dd = q->elevator->elevator_data;

	seq_printf(m, "%u\n", dd->starved);
	return 0;
}

static int dd_async_depth_show(void *data, struct seq_file *m)
{
	struct request_queue *q = data;
	struct deadline_data *dd = q->elevator->elevator_data;

	seq_printf(m, "%u\n", dd->async_depth);
	return 0;
}

static int dd_queued_show(void *data, struct seq_file *m)
{
	struct request_queue *q = data;
	struct deadline_data *dd = q->elevator->elevator_data;
	u32 rt, be, idle;

	spin_lock(&dd->lock);
	rt = dd_queued(dd, DD_RT_PRIO);
	be = dd_queued(dd, DD_BE_PRIO);
	idle = dd_queued(dd, DD_IDLE_PRIO);
	spin_unlock(&dd->lock);

	seq_printf(m, "%u %u %u\n", rt, be, idle);

	return 0;
}

/*
 * Number of requests owned by the block driver for a given priority.
 */
static u32 dd_owned_by_driver(struct deadline_data *dd, enum dd_prio prio)
{
	const struct io_stats_per_prio *stats = &dd->per_prio[prio].stats;

	lockdep_assert_held(&dd->lock);

	return stats->dispatched + stats->merged -
		atomic_read(&stats->completed);
}

static int dd_owned_by_driver_show(void *data, struct seq_file *m)
{
	struct request_queue *q = data;
	struct deadline_data *dd = q->elevator->elevator_data;
	u32 rt, be, idle;

	spin_lock(&dd->lock);
	rt = dd_owned_by_driver(dd, DD_RT_PRIO);
	be = dd_owned_by_driver(dd, DD_BE_PRIO);
	idle = dd_owned_by_driver(dd, DD_IDLE_PRIO);
	spin_unlock(&dd->lock);

	seq_printf(m, "%u %u %u\n", rt, be, idle);

	return 0;
}

#define DEADLINE_DISPATCH_ATTR(prio)					\
static void *deadline_dispatch##prio##_start(struct seq_file *m,	\
					     loff_t *pos)		\
	__acquires(&dd->lock)						\
{									\
	struct request_queue *q = m->private;				\
	struct deadline_data *dd = q->elevator->elevator_data;		\
	struct dd_per_prio *per_prio = &dd->per_prio[prio];		\
									\
	spin_lock(&dd->lock);						\
	return seq_list_start(&per_prio->dispatch, *pos);		\
}									\
									\
static void *deadline_dispatch##prio##_next(struct seq_file *m,		\
					    void *v, loff_t *pos)	\
{									\
	struct request_queue *q = m->private;				\
	struct deadline_data *dd = q->elevator->elevator_data;		\
	struct dd_per_prio *per_prio = &dd->per_prio[prio];		\
									\
	return seq_list_next(v, &per_prio->dispatch, pos);		\
}									\
									\
static void deadline_dispatch##prio##_stop(struct seq_file *m, void *v)	\
	__releases(&dd->lock)						\
{									\
	struct request_queue *q = m->private;				\
	struct deadline_data *dd = q->elevator->elevator_data;		\
									\
	spin_unlock(&dd->lock);						\
}									\
									\
static const struct seq_operations deadline_dispatch##prio##_seq_ops = { \
	.start	= deadline_dispatch##prio##_start,			\
	.next	= deadline_dispatch##prio##_next,			\
	.stop	= deadline_dispatch##prio##_stop,			\
	.show	= blk_mq_debugfs_rq_show,				\
}

DEADLINE_DISPATCH_ATTR(0);
DEADLINE_DISPATCH_ATTR(1);
DEADLINE_DISPATCH_ATTR(2);
#undef DEADLINE_DISPATCH_ATTR

#define DEADLINE_QUEUE_DDIR_ATTRS(name)					\
	{#name "_fifo_list", 0400,					\
		.seq_ops = &deadline_##name##_fifo_seq_ops}
#define DEADLINE_NEXT_RQ_ATTR(name)					\
	{#name "_next_rq", 0400, deadline_##name##_next_rq_show}
static const struct blk_mq_debugfs_attr deadline_queue_debugfs_attrs[] = {
	DEADLINE_QUEUE_DDIR_ATTRS(read0),
	DEADLINE_QUEUE_DDIR_ATTRS(write0),
	DEADLINE_QUEUE_DDIR_ATTRS(read1),
	DEADLINE_QUEUE_DDIR_ATTRS(write1),
	DEADLINE_QUEUE_DDIR_ATTRS(read2),
	DEADLINE_QUEUE_DDIR_ATTRS(write2),
	DEADLINE_NEXT_RQ_ATTR(read0),
	DEADLINE_NEXT_RQ_ATTR(write0),
	DEADLINE_NEXT_RQ_ATTR(read1),
	DEADLINE_NEXT_RQ_ATTR(write1),
	DEADLINE_NEXT_RQ_ATTR(read2),
	DEADLINE_NEXT_RQ_ATTR(write2),
	{"batching", 0400, deadline_batching_show},
	{"starved", 0400, deadline_starved_show},
	{"async_depth", 0400, dd_async_depth_show},
	{"dispatch0", 0400, .seq_ops = &deadline_dispatch0_seq_ops},
	{"dispatch1", 0400, .seq_ops = &deadline_dispatch1_seq_ops},
	{"dispatch2", 0400, .seq_ops = &deadline_dispatch2_seq_ops},
	{"owned_by_driver", 0400, dd_owned_by_driver_show},
	{"queued", 0400, dd_queued_show},
	{},
};
#undef DEADLINE_QUEUE_DDIR_ATTRS
#endif

static struct elevator_type mq_deadline = {
	.ops = {
		.depth_updated		= dd_depth_updated,
		.limit_depth		= dd_limit_depth,
		.insert_requests	= dd_insert_requests,
		.dispatch_request	= dd_dispatch_request,
		.prepare_request	= dd_prepare_request,
		.finish_request		= dd_finish_request,
		.next_request		= elv_rb_latter_request,
		.former_request		= elv_rb_former_request,
		.bio_merge		= dd_bio_merge,
		.request_merge		= dd_request_merge,
		.requests_merged	= dd_merged_requests,
		.request_merged		= dd_request_merged,
		.has_work		= dd_has_work,
		.init_sched		= dd_init_sched,
		.exit_sched		= dd_exit_sched,
		.init_hctx		= dd_init_hctx,
	},
#ifdef CONFIG_BLK_DEBUG_FS
	.queue_debugfs_attrs = deadline_queue_debugfs_attrs,
#endif
	.elevator_attrs = deadline_attrs,
	.elevator_name = "mq-deadline",
	.elevator_alias = "deadline",
	.elevator_owner = THIS_MODULE,
};
MODULE_ALIAS("mq-deadline-iosched");

static int __init deadline_init(void)
{
	return elv_register(&mq_deadline);
}

static void __exit deadline_exit(void)
{
	elv_unregister(&mq_deadline);
}

module_init(deadline_init);
module_exit(deadline_exit);

MODULE_AUTHOR("Jens Axboe, Damien Le Moal and Bart Van Assche");
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("MQ deadline IO scheduler");
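/*
 * Illustrative usage (not part of the original source): elv_register() makes
 * the scheduler selectable per queue under its name or the alias registered
 * above:
 *
 *	# cat /sys/block/sda/queue/scheduler
 *	none [mq-deadline] kyber bfq
 *	# echo mq-deadline > /sys/block/sda/queue/scheduler
 *
 * ("sda" and the list of compiled-in schedulers are examples.)
 */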
// SPDX-License-Identifier: GPL-2.0
/*
 * This file contains functions which emulate a local clock-event
 * device via a broadcast event source.
 *
 * Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de>
 * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar
 * Copyright(C) 2006-2007, Timesys Corp., Thomas Gleixner
 */
#include <linux/cpu.h>
#include <linux/err.h>
#include <linux/hrtimer.h>
#include <linux/interrupt.h>
#include <linux/percpu.h>
#include <linux/profile.h>
#include <linux/sched.h>
#include <linux/smp.h>
#include <linux/module.h>

#include "tick-internal.h"

/*
 * Broadcast support for broken x86 hardware, where the local apic
 * timer stops in C3 state.
 */
static struct tick_device tick_broadcast_device;
static cpumask_var_t tick_broadcast_mask __cpumask_var_read_mostly;
static cpumask_var_t tick_broadcast_on __cpumask_var_read_mostly;
static cpumask_var_t tmpmask __cpumask_var_read_mostly;
static int tick_broadcast_forced;

static __cacheline_aligned_in_smp DEFINE_RAW_SPINLOCK(tick_broadcast_lock);

#ifdef CONFIG_TICK_ONESHOT
static DEFINE_PER_CPU(struct clock_event_device *, tick_oneshot_wakeup_device);

static void tick_broadcast_setup_oneshot(struct clock_event_device *bc, bool from_periodic);
static void tick_broadcast_clear_oneshot(int cpu);
static void tick_resume_broadcast_oneshot(struct clock_event_device *bc);
# ifdef CONFIG_HOTPLUG_CPU
static void tick_broadcast_oneshot_offline(unsigned int cpu);
# endif
#else
static inline void
tick_broadcast_setup_oneshot(struct clock_event_device *bc, bool from_periodic) { BUG(); }
static inline void tick_broadcast_clear_oneshot(int cpu) { }
static inline void tick_resume_broadcast_oneshot(struct clock_event_device *bc) { }
# ifdef CONFIG_HOTPLUG_CPU
static inline void tick_broadcast_oneshot_offline(unsigned int cpu) { }
# endif
#endif

/*
 * Debugging: see timer_list.c
 */
struct tick_device *tick_get_broadcast_device(void)
{
	return &tick_broadcast_device;
}

struct cpumask *tick_get_broadcast_mask(void)
{
	return tick_broadcast_mask;
}

static struct clock_event_device *tick_get_oneshot_wakeup_device(int cpu);

const struct clock_event_device *tick_get_wakeup_device(int cpu)
{
	return tick_get_oneshot_wakeup_device(cpu);
}

/*
 * Start the device in periodic mode
 */
static void tick_broadcast_start_periodic(struct clock_event_device *bc)
{
	if (bc)
		tick_setup_periodic(bc, 1);
}

/*
 * Check, if the device can be utilized as broadcast device:
 */
static bool tick_check_broadcast_device(struct clock_event_device *curdev,
					struct clock_event_device *newdev)
{
	if ((newdev->features & CLOCK_EVT_FEAT_DUMMY) ||
	    (newdev->features & CLOCK_EVT_FEAT_PERCPU) ||
	    (newdev->features & CLOCK_EVT_FEAT_C3STOP))
		return false;

	if (tick_broadcast_device.mode == TICKDEV_MODE_ONESHOT &&
	    !(newdev->features & CLOCK_EVT_FEAT_ONESHOT))
		return false;

	return !curdev || newdev->rating > curdev->rating;
}
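/*
 * Illustrative note (not part of the original source): per the check above,
 * a clock_event_device can only become the broadcast device if it is not a
 * dummy, not per-CPU and, most importantly, keeps running in deep C-states
 * (no CLOCK_EVT_FEAT_C3STOP). On x86 this is typically the HPET or the PIT,
 * while the C3STOP-afflicted local APIC timers are exactly the devices the
 * broadcast mechanism has to cover. Among eligible devices the one with the
 * highest rating wins.
 */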
#ifdef CONFIG_TICK_ONESHOT
static struct clock_event_device *tick_get_oneshot_wakeup_device(int cpu)
{
	return per_cpu(tick_oneshot_wakeup_device, cpu);
}

static void tick_oneshot_wakeup_handler(struct clock_event_device *wd)
{
	/*
	 * If we woke up early and the tick was reprogrammed in the
	 * meantime then this may be spurious but harmless.
	 */
	tick_receive_broadcast();
}

static bool tick_set_oneshot_wakeup_device(struct clock_event_device *newdev,
					   int cpu)
{
	struct clock_event_device *curdev = tick_get_oneshot_wakeup_device(cpu);

	if (!newdev)
		goto set_device;

	if ((newdev->features & CLOCK_EVT_FEAT_DUMMY) ||
	    (newdev->features & CLOCK_EVT_FEAT_C3STOP))
		return false;

	if (!(newdev->features & CLOCK_EVT_FEAT_PERCPU) ||
	    !(newdev->features & CLOCK_EVT_FEAT_ONESHOT))
		return false;

	if (!cpumask_equal(newdev->cpumask, cpumask_of(cpu)))
		return false;

	if (curdev && newdev->rating <= curdev->rating)
		return false;

	if (!try_module_get(newdev->owner))
		return false;

	newdev->event_handler = tick_oneshot_wakeup_handler;
set_device:
	clockevents_exchange_device(curdev, newdev);
	per_cpu(tick_oneshot_wakeup_device, cpu) = newdev;
	return true;
}
#else
static struct clock_event_device *tick_get_oneshot_wakeup_device(int cpu)
{
	return NULL;
}

static bool tick_set_oneshot_wakeup_device(struct clock_event_device *newdev,
					   int cpu)
{
	return false;
}
#endif

/*
 * Conditionally install/replace broadcast device
 */
void tick_install_broadcast_device(struct clock_event_device *dev, int cpu)
{
	struct clock_event_device *cur = tick_broadcast_device.evtdev;

	if (tick_set_oneshot_wakeup_device(dev, cpu))
		return;

	if (!tick_check_broadcast_device(cur, dev))
		return;

	if (!try_module_get(dev->owner))
		return;

	clockevents_exchange_device(cur, dev);
	if (cur)
		cur->event_handler = clockevents_handle_noop;
	tick_broadcast_device.evtdev = dev;
	if (!cpumask_empty(tick_broadcast_mask))
		tick_broadcast_start_periodic(dev);

	if (!(dev->features & CLOCK_EVT_FEAT_ONESHOT))
		return;

	/*
	 * If the system already runs in oneshot mode, switch the newly
	 * registered broadcast device to oneshot mode explicitly.
	 */
	if (tick_broadcast_oneshot_active()) {
		tick_broadcast_switch_to_oneshot();
		return;
	}

	/*
	 * Inform all cpus about this. We might be in a situation
	 * where we did not switch to oneshot mode because the per cpu
	 * devices are affected by CLOCK_EVT_FEAT_C3STOP and the lack
	 * of a oneshot capable broadcast device. Without that
	 * notification the systems stays stuck in periodic mode
	 * forever.
	 */
	tick_clock_notify();
}

/*
 * Check, if the device is the broadcast device
 */
int tick_is_broadcast_device(struct clock_event_device *dev)
{
	return (dev && tick_broadcast_device.evtdev == dev);
}

int tick_broadcast_update_freq(struct clock_event_device *dev, u32 freq)
{
	int ret = -ENODEV;

	if (tick_is_broadcast_device(dev)) {
		raw_spin_lock(&tick_broadcast_lock);
		ret = __clockevents_update_freq(dev, freq);
		raw_spin_unlock(&tick_broadcast_lock);
	}
	return ret;
}

static void err_broadcast(const struct cpumask *mask)
{
	pr_crit_once("Failed to broadcast timer tick. Some CPUs may be unresponsive.\n");
}

static void tick_device_setup_broadcast_func(struct clock_event_device *dev)
{
	if (!dev->broadcast)
		dev->broadcast = tick_broadcast;
	if (!dev->broadcast) {
		pr_warn_once("%s depends on broadcast, but no broadcast function available\n",
			     dev->name);
		dev->broadcast = err_broadcast;
	}
}

/*
 * Check, if the device is dysfunctional and a placeholder, which
 * needs to be handled by the broadcast device.
 */
int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu)
{
	struct clock_event_device *bc = tick_broadcast_device.evtdev;
	unsigned long flags;
	int ret = 0;

	raw_spin_lock_irqsave(&tick_broadcast_lock, flags);

	/*
	 * Devices might be registered with both periodic and oneshot
	 * mode disabled.
	 * This signals, that the device needs to be
	 * operated from the broadcast device and is a placeholder for
	 * the cpu local device.
	 */
	if (!tick_device_is_functional(dev)) {
		dev->event_handler = tick_handle_periodic;
		tick_device_setup_broadcast_func(dev);
		cpumask_set_cpu(cpu, tick_broadcast_mask);
		if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC)
			tick_broadcast_start_periodic(bc);
		else
			tick_broadcast_setup_oneshot(bc, false);
		ret = 1;
	} else {
		/*
		 * Clear the broadcast bit for this cpu if the
		 * device is not power state affected.
		 */
		if (!(dev->features & CLOCK_EVT_FEAT_C3STOP))
			cpumask_clear_cpu(cpu, tick_broadcast_mask);
		else
			tick_device_setup_broadcast_func(dev);

		/*
		 * Clear the broadcast bit if the CPU is not in
		 * periodic broadcast on state.
		 */
		if (!cpumask_test_cpu(cpu, tick_broadcast_on))
			cpumask_clear_cpu(cpu, tick_broadcast_mask);

		switch (tick_broadcast_device.mode) {
		case TICKDEV_MODE_ONESHOT:
			/*
			 * If the system is in oneshot mode we can
			 * unconditionally clear the oneshot mask bit,
			 * because the CPU is running and therefore
			 * not in an idle state which causes the power
			 * state affected device to stop. Let the
			 * caller initialize the device.
			 */
			tick_broadcast_clear_oneshot(cpu);
			ret = 0;
			break;

		case TICKDEV_MODE_PERIODIC:
			/*
			 * If the system is in periodic mode, check
			 * whether the broadcast device can be
			 * switched off now.
			 */
			if (cpumask_empty(tick_broadcast_mask) && bc)
				clockevents_shutdown(bc);

			/*
			 * If we kept the cpu in the broadcast mask,
			 * tell the caller to leave the per cpu device
			 * in shutdown state. The periodic interrupt
			 * is delivered by the broadcast device, if
			 * the broadcast device exists and is not
			 * hrtimer based.
			 */
			if (bc && !(bc->features & CLOCK_EVT_FEAT_HRTIMER))
				ret = cpumask_test_cpu(cpu, tick_broadcast_mask);
			break;
		default:
			break;
		}
	}
	raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
	return ret;
}

int tick_receive_broadcast(void)
{
	struct tick_device *td = this_cpu_ptr(&tick_cpu_device);
	struct clock_event_device *evt = td->evtdev;

	if (!evt)
		return -ENODEV;

	if (!evt->event_handler)
		return -EINVAL;

	evt->event_handler(evt);
	return 0;
}
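/*
 * Illustrative note (not part of the original source): architecture code
 * invokes tick_receive_broadcast() from its timer-broadcast IPI handler, so
 * the sequence for a woken CPU is roughly:
 *
 *	broadcast device fires on CPU A
 *	  tick_do_broadcast() -> td->evtdev->broadcast(mask)	(send IPI)
 *	CPU B takes the IPI
 *	  tick_receive_broadcast() -> evt->event_handler(evt)
 *
 * i.e. the stopped local clockevent's handler runs on behalf of the
 * broadcast device rather than from local timer hardware.
 */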
/*
 * Broadcast the event to the cpus, which are set in the mask (mangled).
 */
static bool tick_do_broadcast(struct cpumask *mask)
{
	int cpu = smp_processor_id();
	struct tick_device *td;
	bool local = false;

	/*
	 * Check, if the current cpu is in the mask
	 */
	if (cpumask_test_cpu(cpu, mask)) {
		struct clock_event_device *bc = tick_broadcast_device.evtdev;

		cpumask_clear_cpu(cpu, mask);
		/*
		 * We only run the local handler, if the broadcast
		 * device is not hrtimer based. Otherwise we run into
		 * a hrtimer recursion.
		 *
		 * local timer_interrupt()
		 *   local_handler()
		 *     expire_hrtimers()
		 *       bc_handler()
		 *         local_handler()
		 *           expire_hrtimers()
		 */
		local = !(bc->features & CLOCK_EVT_FEAT_HRTIMER);
	}

	if (!cpumask_empty(mask)) {
		/*
		 * It might be necessary to actually check whether the devices
		 * have different broadcast functions. For now, just use the
		 * one of the first device. This works as long as we have this
		 * misfeature only on x86 (lapic)
		 */
		td = &per_cpu(tick_cpu_device, cpumask_first(mask));
		td->evtdev->broadcast(mask);
	}
	return local;
}

/*
 * Periodic broadcast:
 * - invoke the broadcast handlers
 */
static bool tick_do_periodic_broadcast(void)
{
	cpumask_and(tmpmask, cpu_online_mask, tick_broadcast_mask);
	return tick_do_broadcast(tmpmask);
}

/*
 * Event handler for periodic broadcast ticks
 */
static void tick_handle_periodic_broadcast(struct clock_event_device *dev)
{
	struct tick_device *td = this_cpu_ptr(&tick_cpu_device);
	bool bc_local;

	raw_spin_lock(&tick_broadcast_lock);

	/* Handle spurious interrupts gracefully */
	if (clockevent_state_shutdown(tick_broadcast_device.evtdev)) {
		raw_spin_unlock(&tick_broadcast_lock);
		return;
	}

	bc_local = tick_do_periodic_broadcast();

	if (clockevent_state_oneshot(dev)) {
		ktime_t next = ktime_add_ns(dev->next_event, TICK_NSEC);

		clockevents_program_event(dev, next, true);
	}
	raw_spin_unlock(&tick_broadcast_lock);

	/*
	 * We run the handler of the local cpu after dropping
	 * tick_broadcast_lock because the handler might deadlock when
	 * trying to switch to oneshot mode.
	 */
	if (bc_local)
		td->evtdev->event_handler(td->evtdev);
}

/**
 * tick_broadcast_control - Enable/disable or force broadcast mode
 * @mode:	The selected broadcast mode
 *
 * Called when the system enters a state where affected tick devices
 * might stop. Note: TICK_BROADCAST_FORCE cannot be undone.
 */
void tick_broadcast_control(enum tick_broadcast_mode mode)
{
	struct clock_event_device *bc, *dev;
	struct tick_device *td;
	int cpu, bc_stopped;
	unsigned long flags;

	/* Protects also the local clockevent device. */
	raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
	td = this_cpu_ptr(&tick_cpu_device);
	dev = td->evtdev;

	/*
	 * Is the device not affected by the powerstate ?
	 */
	if (!dev || !(dev->features & CLOCK_EVT_FEAT_C3STOP))
		goto out;

	if (!tick_device_is_functional(dev))
		goto out;

	cpu = smp_processor_id();
	bc = tick_broadcast_device.evtdev;
	bc_stopped = cpumask_empty(tick_broadcast_mask);

	switch (mode) {
	case TICK_BROADCAST_FORCE:
		tick_broadcast_forced = 1;
		fallthrough;
	case TICK_BROADCAST_ON:
		cpumask_set_cpu(cpu, tick_broadcast_on);
		if (!cpumask_test_and_set_cpu(cpu, tick_broadcast_mask)) {
			/*
			 * Only shutdown the cpu local device, if:
			 *
			 * - the broadcast device exists
			 * - the broadcast device is not a hrtimer based one
			 * - the broadcast device is in periodic mode to
			 *   avoid a hiccup during switch to oneshot mode
			 */
			if (bc && !(bc->features & CLOCK_EVT_FEAT_HRTIMER) &&
			    tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC)
				clockevents_shutdown(dev);
		}
		break;

	case TICK_BROADCAST_OFF:
		if (tick_broadcast_forced)
			break;
		cpumask_clear_cpu(cpu, tick_broadcast_on);
		if (cpumask_test_and_clear_cpu(cpu, tick_broadcast_mask)) {
			if (tick_broadcast_device.mode ==
			    TICKDEV_MODE_PERIODIC)
				tick_setup_periodic(dev, 0);
		}
		break;
	}

	if (bc) {
		if (cpumask_empty(tick_broadcast_mask)) {
			if (!bc_stopped)
				clockevents_shutdown(bc);
		} else if (bc_stopped) {
			if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC)
				tick_broadcast_start_periodic(bc);
			else
				tick_broadcast_setup_oneshot(bc, false);
		}
	}
out:
	raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
}
EXPORT_SYMBOL_GPL(tick_broadcast_control);

/*
 * Set the periodic handler depending on broadcast on/off
 */
void tick_set_periodic_handler(struct clock_event_device *dev, int broadcast)
{
	if (!broadcast)
		dev->event_handler = tick_handle_periodic;
	else
		dev->event_handler = tick_handle_periodic_broadcast;
}

#ifdef CONFIG_HOTPLUG_CPU
static void tick_shutdown_broadcast(void)
{
	struct clock_event_device *bc = tick_broadcast_device.evtdev;

	if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) {
		if (bc && cpumask_empty(tick_broadcast_mask))
			clockevents_shutdown(bc);
	}
}

/*
 * Remove a CPU from broadcasting
 */
void tick_broadcast_offline(unsigned int cpu)
{
	raw_spin_lock(&tick_broadcast_lock);
	cpumask_clear_cpu(cpu, tick_broadcast_mask);
	cpumask_clear_cpu(cpu, tick_broadcast_on);
	tick_broadcast_oneshot_offline(cpu);
	tick_shutdown_broadcast();
	raw_spin_unlock(&tick_broadcast_lock);
}
#endif

void tick_suspend_broadcast(void)
{
	struct clock_event_device *bc;
	unsigned long flags;

	raw_spin_lock_irqsave(&tick_broadcast_lock, flags);

	bc = tick_broadcast_device.evtdev;
	if (bc)
		clockevents_shutdown(bc);

	raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
}
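/*
 * Illustrative usage (not part of the original source): callers such as
 * cpuidle drivers normally do not invoke tick_broadcast_control() directly
 * but go through the tick_broadcast_enable()/tick_broadcast_disable()
 * helpers in <linux/tick.h>, which pass TICK_BROADCAST_ON/OFF. Platforms
 * whose local timer always stops in idle use TICK_BROADCAST_FORCE, which,
 * as noted above, cannot be undone.
 */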
/*
 * This is called from tick_resume_local() on a resuming CPU. That's
 * called from the core resume function, tick_unfreeze() and the magic XEN
 * resume hackery.
 *
 * In none of these cases the broadcast device mode can change and the
 * bit of the resuming CPU in the broadcast mask is safe as well.
 */
bool tick_resume_check_broadcast(void)
{
	if (tick_broadcast_device.mode == TICKDEV_MODE_ONESHOT)
		return false;
	else
		return cpumask_test_cpu(smp_processor_id(), tick_broadcast_mask);
}

void tick_resume_broadcast(void)
{
	struct clock_event_device *bc;
	unsigned long flags;

	raw_spin_lock_irqsave(&tick_broadcast_lock, flags);

	bc = tick_broadcast_device.evtdev;

	if (bc) {
		clockevents_tick_resume(bc);

		switch (tick_broadcast_device.mode) {
		case TICKDEV_MODE_PERIODIC:
			if (!cpumask_empty(tick_broadcast_mask))
				tick_broadcast_start_periodic(bc);
			break;
		case TICKDEV_MODE_ONESHOT:
			if (!cpumask_empty(tick_broadcast_mask))
				tick_resume_broadcast_oneshot(bc);
			break;
		}
	}
	raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
}

#ifdef CONFIG_TICK_ONESHOT

static cpumask_var_t tick_broadcast_oneshot_mask __cpumask_var_read_mostly;
static cpumask_var_t tick_broadcast_pending_mask __cpumask_var_read_mostly;
static cpumask_var_t tick_broadcast_force_mask __cpumask_var_read_mostly;

/*
 * Exposed for debugging: see timer_list.c
 */
struct cpumask *tick_get_broadcast_oneshot_mask(void)
{
	return tick_broadcast_oneshot_mask;
}

/*
 * Called before going idle with interrupts disabled. Checks whether a
 * broadcast event from the other core is about to happen. We detected
 * that in tick_broadcast_oneshot_control(). The callsite can use this
 * to avoid a deep idle transition as we are about to get the
 * broadcast IPI right away.
 */
noinstr int tick_check_broadcast_expired(void)
{
#ifdef _ASM_GENERIC_BITOPS_INSTRUMENTED_NON_ATOMIC_H
	return arch_test_bit(smp_processor_id(), cpumask_bits(tick_broadcast_force_mask));
#else
	return cpumask_test_cpu(smp_processor_id(), tick_broadcast_force_mask);
#endif
}

/*
 * Set broadcast interrupt affinity
 */
static void tick_broadcast_set_affinity(struct clock_event_device *bc,
					const struct cpumask *cpumask)
{
	if (!(bc->features & CLOCK_EVT_FEAT_DYNIRQ))
		return;

	if (cpumask_equal(bc->cpumask, cpumask))
		return;

	bc->cpumask = cpumask;
	irq_set_affinity(bc->irq, bc->cpumask);
}

static void tick_broadcast_set_event(struct clock_event_device *bc, int cpu,
				     ktime_t expires)
{
	if (!clockevent_state_oneshot(bc))
		clockevents_switch_state(bc, CLOCK_EVT_STATE_ONESHOT);

	clockevents_program_event(bc, expires, 1);
	tick_broadcast_set_affinity(bc, cpumask_of(cpu));
}

static void tick_resume_broadcast_oneshot(struct clock_event_device *bc)
{
	clockevents_switch_state(bc, CLOCK_EVT_STATE_ONESHOT);
}

/*
 * Called from irq_enter() when idle was interrupted to reenable the
 * per cpu device.
 */
void tick_check_oneshot_broadcast_this_cpu(void)
{
	if (cpumask_test_cpu(smp_processor_id(), tick_broadcast_oneshot_mask)) {
		struct tick_device *td = this_cpu_ptr(&tick_cpu_device);

		/*
		 * We might be in the middle of switching over from
		 * periodic to oneshot. If the CPU has not yet
		 * switched over, leave the device alone.
		 */
		if (td->mode == TICKDEV_MODE_ONESHOT) {
			clockevents_switch_state(td->evtdev,
						 CLOCK_EVT_STATE_ONESHOT);
		}
	}
}
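/*
 * Illustrative note (not part of the original source): the force mask read
 * by tick_check_broadcast_expired() above is consulted from the idle loop
 * (see do_idle() in kernel/sched/idle.c): when the bit is set the CPU polls
 * instead of entering a deep idle state, because the broadcast IPI that
 * will deliver its already-expired event is about to arrive anyway.
 */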
/*
 * Handle oneshot mode broadcasting
 */
static void tick_handle_oneshot_broadcast(struct clock_event_device *dev)
{
	struct tick_device *td;
	ktime_t now, next_event;
	int cpu, next_cpu = 0;
	bool bc_local;

	raw_spin_lock(&tick_broadcast_lock);
	dev->next_event = KTIME_MAX;
	next_event = KTIME_MAX;
	cpumask_clear(tmpmask);
	now = ktime_get();
	/* Find all expired events */
	for_each_cpu(cpu, tick_broadcast_oneshot_mask) {
		/*
		 * Required for !SMP because for_each_cpu() reports
		 * unconditionally CPU0 as set on UP kernels.
		 */
		if (!IS_ENABLED(CONFIG_SMP) &&
		    cpumask_empty(tick_broadcast_oneshot_mask))
			break;

		td = &per_cpu(tick_cpu_device, cpu);
		if (td->evtdev->next_event <= now) {
			cpumask_set_cpu(cpu, tmpmask);
			/*
			 * Mark the remote cpu in the pending mask, so
			 * it can avoid reprogramming the cpu local
			 * timer in tick_broadcast_oneshot_control().
			 */
			cpumask_set_cpu(cpu, tick_broadcast_pending_mask);
		} else if (td->evtdev->next_event < next_event) {
			next_event = td->evtdev->next_event;
			next_cpu = cpu;
		}
	}

	/*
	 * Remove the current cpu from the pending mask. The event is
	 * delivered immediately in tick_do_broadcast() !
	 */
	cpumask_clear_cpu(smp_processor_id(), tick_broadcast_pending_mask);

	/* Take care of enforced broadcast requests */
	cpumask_or(tmpmask, tmpmask, tick_broadcast_force_mask);
	cpumask_clear(tick_broadcast_force_mask);

	/*
	 * Sanity check. Catch the case where we try to broadcast to
	 * offline cpus.
	 */
	if (WARN_ON_ONCE(!cpumask_subset(tmpmask, cpu_online_mask)))
		cpumask_and(tmpmask, tmpmask, cpu_online_mask);

	/*
	 * Wakeup the cpus which have an expired event.
	 */
	bc_local = tick_do_broadcast(tmpmask);

	/*
	 * Two reasons for reprogram:
	 *
	 * - The global event did not expire any CPU local
	 * events. This happens in dyntick mode, as the maximum PIT
	 * delta is quite small.
	 *
	 * - There are pending events on sleeping CPUs which were not
	 * in the event mask
	 */
	if (next_event != KTIME_MAX)
		tick_broadcast_set_event(dev, next_cpu, next_event);

	raw_spin_unlock(&tick_broadcast_lock);

	if (bc_local) {
		td = this_cpu_ptr(&tick_cpu_device);
		td->evtdev->event_handler(td->evtdev);
	}
}

static int broadcast_needs_cpu(struct clock_event_device *bc, int cpu)
{
	if (!(bc->features & CLOCK_EVT_FEAT_HRTIMER))
		return 0;
	if (bc->next_event == KTIME_MAX)
		return 0;
	return bc->bound_on == cpu ? -EBUSY : 0;
}

static void broadcast_shutdown_local(struct clock_event_device *bc,
				     struct clock_event_device *dev)
{
	/*
	 * For hrtimer based broadcasting we cannot shutdown the cpu
	 * local device if our own event is the first one to expire or
	 * if we own the broadcast timer.
	 */
	if (bc->features & CLOCK_EVT_FEAT_HRTIMER) {
		if (broadcast_needs_cpu(bc, smp_processor_id()))
			return;
		if (dev->next_event < bc->next_event)
			return;
	}
	clockevents_switch_state(dev, CLOCK_EVT_STATE_SHUTDOWN);
}

static int ___tick_broadcast_oneshot_control(enum tick_broadcast_state state,
					     struct tick_device *td,
					     int cpu)
{
	struct clock_event_device *bc, *dev = td->evtdev;
	int ret = 0;
	ktime_t now;

	raw_spin_lock(&tick_broadcast_lock);
	bc = tick_broadcast_device.evtdev;

	if (state == TICK_BROADCAST_ENTER) {
		/*
		 * If the current CPU owns the hrtimer broadcast
		 * mechanism, it cannot go deep idle and we do not add
		 * the CPU to the broadcast mask. We don't have to go
		 * through the EXIT path as the local timer is not
		 * shutdown.
		 */
		ret = broadcast_needs_cpu(bc, cpu);
		if (ret)
			goto out;

		/*
		 * If the broadcast device is in periodic mode, we
		 * return.
		 */
		if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) {
			/* If it is a hrtimer based broadcast, return busy */
			if (bc->features & CLOCK_EVT_FEAT_HRTIMER)
				ret = -EBUSY;
			goto out;
		}

		if (!cpumask_test_and_set_cpu(cpu, tick_broadcast_oneshot_mask)) {
			WARN_ON_ONCE(cpumask_test_cpu(cpu, tick_broadcast_pending_mask));

			/* Conditionally shut down the local timer. */
			broadcast_shutdown_local(bc, dev);

			/*
			 * We only reprogram the broadcast timer if we
			 * did not mark ourself in the force mask and
			 * if the cpu local event is earlier than the
			 * broadcast event.
			 * If the current CPU is in
			 * the force mask, then we are going to be
			 * woken by the IPI right away; we return
			 * busy, so the CPU does not try to go deep
			 * idle.
			 */
			if (cpumask_test_cpu(cpu, tick_broadcast_force_mask)) {
				ret = -EBUSY;
			} else if (dev->next_event < bc->next_event) {
				tick_broadcast_set_event(bc, cpu, dev->next_event);
				/*
				 * In case of hrtimer broadcasts the
				 * programming might have moved the
				 * timer to this cpu. If yes, remove
				 * us from the broadcast mask and
				 * return busy.
				 */
				ret = broadcast_needs_cpu(bc, cpu);
				if (ret) {
					cpumask_clear_cpu(cpu,
						tick_broadcast_oneshot_mask);
				}
			}
		}
	} else {
		if (cpumask_test_and_clear_cpu(cpu, tick_broadcast_oneshot_mask)) {
			clockevents_switch_state(dev, CLOCK_EVT_STATE_ONESHOT);
			/*
			 * The cpu which was handling the broadcast
			 * timer marked this cpu in the broadcast
			 * pending mask and fired the broadcast
			 * IPI. So we are going to handle the expired
			 * event anyway via the broadcast IPI
			 * handler. No need to reprogram the timer
			 * with an already expired event.
			 */
			if (cpumask_test_and_clear_cpu(cpu,
				       tick_broadcast_pending_mask))
				goto out;

			/*
			 * Bail out if there is no next event.
			 */
			if (dev->next_event == KTIME_MAX)
				goto out;
			/*
			 * If the pending bit is not set, then we are
			 * either the CPU handling the broadcast
			 * interrupt or we got woken by something else.
			 *
			 * We are no longer in the broadcast mask, so
			 * if the cpu local expiry time is already
			 * reached, we would reprogram the cpu local
			 * timer with an already expired event.
			 *
			 * This can lead to a ping-pong when we return
			 * to idle and therefore rearm the broadcast
			 * timer before the cpu local timer was able
			 * to fire. This happens because the forced
			 * reprogramming makes sure that the event
			 * will happen in the future and depending on
			 * the min_delta setting this might be far
			 * enough out that the ping-pong starts.
			 *
			 * If the cpu local next_event has expired
			 * then we know that the broadcast timer
			 * next_event has expired as well and
			 * broadcast is about to be handled. So we
			 * avoid reprogramming and enforce that the
			 * broadcast handler, which did not run yet,
			 * will invoke the cpu local handler.
			 *
			 * We cannot call the handler directly from
			 * here, because we might be in a NOHZ phase
			 * and we did not go through the irq_enter()
			 * nohz fixups.
			 */
			now = ktime_get();
			if (dev->next_event <= now) {
				cpumask_set_cpu(cpu, tick_broadcast_force_mask);
				goto out;
			}
			/*
			 * We got woken by something else. Reprogram
			 * the cpu local timer device.
*/ tick_program_event(dev->next_event, 1); } } out: raw_spin_unlock(&tick_broadcast_lock); return ret; } static int tick_oneshot_wakeup_control(enum tick_broadcast_state state, struct tick_device *td, int cpu) { struct clock_event_device *dev, *wd; dev = td->evtdev; if (td->mode != TICKDEV_MODE_ONESHOT) return -EINVAL; wd = tick_get_oneshot_wakeup_device(cpu); if (!wd) return -ENODEV; switch (state) { case TICK_BROADCAST_ENTER: clockevents_switch_state(dev, CLOCK_EVT_STATE_ONESHOT_STOPPED); clockevents_switch_state(wd, CLOCK_EVT_STATE_ONESHOT); clockevents_program_event(wd, dev->next_event, 1); break; case TICK_BROADCAST_EXIT: /* We may have transitioned to oneshot mode while idle */ if (clockevent_get_state(wd) != CLOCK_EVT_STATE_ONESHOT) return -ENODEV; } return 0; } int __tick_broadcast_oneshot_control(enum tick_broadcast_state state) { struct tick_device *td = this_cpu_ptr(&tick_cpu_device); int cpu = smp_processor_id(); if (!tick_oneshot_wakeup_control(state, td, cpu)) return 0; if (tick_broadcast_device.evtdev) return ___tick_broadcast_oneshot_control(state, td, cpu); /* * If there is no broadcast or wakeup device, tell the caller not * to go into deep idle. */ return -EBUSY; } /* * Reset the one shot broadcast for a cpu * * Called with tick_broadcast_lock held */ static void tick_broadcast_clear_oneshot(int cpu) { cpumask_clear_cpu(cpu, tick_broadcast_oneshot_mask); cpumask_clear_cpu(cpu, tick_broadcast_pending_mask); } static void tick_broadcast_init_next_event(struct cpumask *mask, ktime_t expires) { struct tick_device *td; int cpu; for_each_cpu(cpu, mask) { td = &per_cpu(tick_cpu_device, cpu); if (td->evtdev) td->evtdev->next_event = expires; } } static inline ktime_t tick_get_next_period(void) { ktime_t next; /* * Protect against concurrent updates (store /load tearing on * 32bit). It does not matter if the time is already in the * past. The broadcast device which is about to be programmed will * fire in any case. */ raw_spin_lock(&jiffies_lock); next = tick_next_period; raw_spin_unlock(&jiffies_lock); return next; } /** * tick_broadcast_setup_oneshot - setup the broadcast device * @bc: the broadcast device * @from_periodic: true if called from periodic mode */ static void tick_broadcast_setup_oneshot(struct clock_event_device *bc, bool from_periodic) { int cpu = smp_processor_id(); ktime_t nexttick = 0; if (!bc) return; /* * When the broadcast device was switched to oneshot by the first * CPU handling the NOHZ change, the other CPUs will reach this * code via hrtimer_run_queues() -> tick_check_oneshot_change() * too. Set up the broadcast device only once! */ if (bc->event_handler == tick_handle_oneshot_broadcast) { /* * The CPU which switched from periodic to oneshot mode * set the broadcast oneshot bit for all other CPUs which * are in the general (periodic) broadcast mask to ensure * that CPUs which wait for the periodic broadcast are * woken up. * * Clear the bit for the local CPU as the set bit would * prevent the first tick_broadcast_enter() after this CPU * switched to oneshot state to program the broadcast * device. * * This code can also be reached via tick_broadcast_control(), * but this cannot avoid the tick_broadcast_clear_oneshot() * as that would break the periodic to oneshot transition of * secondary CPUs. But that's harmless as the below only * clears already cleared bits. 
*/ tick_broadcast_clear_oneshot(cpu); return; } bc->event_handler = tick_handle_oneshot_broadcast; bc->next_event = KTIME_MAX; /* * When the tick mode is switched from periodic to oneshot it must * be ensured that CPUs which are waiting for periodic broadcast * get their wake-up at the next tick. This is achieved by ORing * tick_broadcast_mask into tick_broadcast_oneshot_mask. * * For other callers, e.g. broadcast device replacement, * tick_broadcast_oneshot_mask must not be touched as this would * set bits for CPUs which are already NOHZ, but not idle. Their * next tick_broadcast_enter() would observe the bit set and fail * to update the expiry time and the broadcast event device. */ if (from_periodic) { cpumask_copy(tmpmask, tick_broadcast_mask); /* Remove the local CPU as it is obviously not idle */ cpumask_clear_cpu(cpu, tmpmask); cpumask_or(tick_broadcast_oneshot_mask, tick_broadcast_oneshot_mask, tmpmask); /* * Ensure that the oneshot broadcast handler will wake the * CPUs which are still waiting for periodic broadcast. */ nexttick = tick_get_next_period(); tick_broadcast_init_next_event(tmpmask, nexttick); /* * If the underlying broadcast clock event device is * already in oneshot state, then there is nothing to do. * The device was already armed for the next tick * in tick_handle_broadcast_periodic() */ if (clockevent_state_oneshot(bc)) return; } /* * When switching from periodic to oneshot mode arm the broadcast * device for the next tick. * * If the broadcast device has been replaced in oneshot mode and * the oneshot broadcast mask is not empty, then arm it to expire * immediately in order to reevaluate the next expiring timer. * @nexttick is 0 and therefore in the past which will cause the * clockevent code to force an event. * * For both cases the programming can be avoided when the oneshot * broadcast mask is empty. * * tick_broadcast_set_event() implicitly switches the broadcast * device to oneshot state. */ if (!cpumask_empty(tick_broadcast_oneshot_mask)) tick_broadcast_set_event(bc, cpu, nexttick); } /* * Select oneshot operating mode for the broadcast device */ void tick_broadcast_switch_to_oneshot(void) { struct clock_event_device *bc; enum tick_device_mode oldmode; unsigned long flags; raw_spin_lock_irqsave(&tick_broadcast_lock, flags); oldmode = tick_broadcast_device.mode; tick_broadcast_device.mode = TICKDEV_MODE_ONESHOT; bc = tick_broadcast_device.evtdev; if (bc) tick_broadcast_setup_oneshot(bc, oldmode == TICKDEV_MODE_PERIODIC); raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); } #ifdef CONFIG_HOTPLUG_CPU void hotplug_cpu__broadcast_tick_pull(int deadcpu) { struct clock_event_device *bc; unsigned long flags; raw_spin_lock_irqsave(&tick_broadcast_lock, flags); bc = tick_broadcast_device.evtdev; if (bc && broadcast_needs_cpu(bc, deadcpu)) { /* * If the broadcast force bit of the current CPU is set, * then the current CPU has not yet reprogrammed the local * timer device to avoid a ping-pong race. See * ___tick_broadcast_oneshot_control(). * * If the broadcast device is hrtimer based then * programming the broadcast event below does not have any * effect because the local clockevent device is not * running and not programmed because the broadcast event * is not earlier than the pending event of the local clock * event device. As a consequence all CPUs waiting for a * broadcast event are stuck forever. * * Detect this condition and reprogram the cpu local timer * device to avoid the starvation. 
*/ if (tick_check_broadcast_expired()) { struct tick_device *td = this_cpu_ptr(&tick_cpu_device); cpumask_clear_cpu(smp_processor_id(), tick_broadcast_force_mask); tick_program_event(td->evtdev->next_event, 1); } /* This moves the broadcast assignment to this CPU: */ clockevents_program_event(bc, bc->next_event, 1); } raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); } /* * Remove a dying CPU from broadcasting */ static void tick_broadcast_oneshot_offline(unsigned int cpu) { if (tick_get_oneshot_wakeup_device(cpu)) tick_set_oneshot_wakeup_device(NULL, cpu); /* * Clear the broadcast masks for the dead cpu, but do not stop * the broadcast device! */ cpumask_clear_cpu(cpu, tick_broadcast_oneshot_mask); cpumask_clear_cpu(cpu, tick_broadcast_pending_mask); cpumask_clear_cpu(cpu, tick_broadcast_force_mask); } #endif /* * Check, whether the broadcast device is in one shot mode */ int tick_broadcast_oneshot_active(void) { return tick_broadcast_device.mode == TICKDEV_MODE_ONESHOT; } /* * Check whether the broadcast device supports oneshot. */ bool tick_broadcast_oneshot_available(void) { struct clock_event_device *bc = tick_broadcast_device.evtdev; return bc ? bc->features & CLOCK_EVT_FEAT_ONESHOT : false; } #else int __tick_broadcast_oneshot_control(enum tick_broadcast_state state) { struct clock_event_device *bc = tick_broadcast_device.evtdev; if (!bc || (bc->features & CLOCK_EVT_FEAT_HRTIMER)) return -EBUSY; return 0; } #endif void __init tick_broadcast_init(void) { zalloc_cpumask_var(&tick_broadcast_mask, GFP_NOWAIT); zalloc_cpumask_var(&tick_broadcast_on, GFP_NOWAIT); zalloc_cpumask_var(&tmpmask, GFP_NOWAIT); #ifdef CONFIG_TICK_ONESHOT zalloc_cpumask_var(&tick_broadcast_oneshot_mask, GFP_NOWAIT); zalloc_cpumask_var(&tick_broadcast_pending_mask, GFP_NOWAIT); zalloc_cpumask_var(&tick_broadcast_force_mask, GFP_NOWAIT); #endif }
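/*
 * Illustrative sketch (not from this file): how an idle-path caller is
 * expected to bracket a deep-idle state that stops the CPU-local timer
 * with the oneshot broadcast control implemented above. The function
 * name example_enter_deep_idle() is hypothetical; tick_broadcast_enter()
 * and tick_broadcast_exit() are the <linux/tick.h> wrappers around
 * __tick_broadcast_oneshot_control().
 */
#include <linux/tick.h>
#include <linux/cpu.h>

static int example_enter_deep_idle(void)	/* hypothetical */
{
	/*
	 * Hand the wakeup duty to the broadcast device. A non-zero
	 * return (-EBUSY) means this CPU may not go deep, e.g. because
	 * it owns the hrtimer based broadcast; the caller should fall
	 * back to a shallower idle state in that case.
	 */
	if (tick_broadcast_enter())
		return -EBUSY;

	arch_cpu_idle();	/* deep C-state; the local timer is stopped */

	/* Reprogram the CPU-local timer after the broadcast wakeup. */
	tick_broadcast_exit();
	return 0;
}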
// SPDX-License-Identifier: GPL-2.0-or-later /* linux/net/ipv4/arp.c * * Copyright (C) 1994 by Florian La Roche * * This module implements the Address Resolution Protocol ARP (RFC 826), * which is used to convert
IP addresses (or in the future maybe other * high-level addresses) into a low-level hardware address (like an Ethernet * address). * * Fixes: * Alan Cox : Removed the Ethernet assumptions in * Florian's code * Alan Cox : Fixed some small errors in the ARP * logic * Alan Cox : Allow >4K in /proc * Alan Cox : Make ARP add its own protocol entry * Ross Martin : Rewrote arp_rcv() and arp_get_info() * Stephen Henson : Add AX25 support to arp_get_info() * Alan Cox : Drop data when a device is downed. * Alan Cox : Use init_timer(). * Alan Cox : Double lock fixes. * Martin Seine : Move the arphdr structure * to if_arp.h for compatibility. * with BSD based programs. * Andrew Tridgell : Added ARP netmask code and * re-arranged proxy handling. * Alan Cox : Changed to use notifiers. * Niibe Yutaka : Reply for this device or proxies only. * Alan Cox : Don't proxy across hardware types! * Jonathan Naylor : Added support for NET/ROM. * Mike Shaver : RFC1122 checks. * Jonathan Naylor : Only lookup the hardware address for * the correct hardware type. * Germano Caronni : Assorted subtle races. * Craig Schlenter : Don't modify permanent entry * during arp_rcv. * Russ Nelson : Tidied up a few bits. * Alexey Kuznetsov: Major changes to caching and behaviour, * eg intelligent arp probing and * generation * of host down events. * Alan Cox : Missing unlock in device events. * Eckes : ARP ioctl control errors. * Alexey Kuznetsov: Arp free fix. * Manuel Rodriguez: Gratuitous ARP. * Jonathan Layes : Added arpd support through kerneld * message queue (960314) * Mike Shaver : /proc/sys/net/ipv4/arp_* support * Mike McLagan : Routing by source * Stuart Cheshire : Metricom and grat arp fixes * *** FOR 2.1 clean this up *** * Lawrence V. Stefani: (08/12/96) Added FDDI support. * Alan Cox : Took the AP1000 nasty FDDI hack and * folded into the mainstream FDDI code. * Ack spit, Linus how did you allow that * one in... * Jes Sorensen : Make FDDI work again in 2.1.x and * clean up the APFDDI & gen. FDDI bits. * Alexey Kuznetsov: new arp state machine; * now it is in net/core/neighbour.c. * Krzysztof Halasa: Added Frame Relay ARP support. * Arnaldo C. Melo : convert /proc/net/arp to seq_file * Shmulik Hen: Split arp_send to arp_create and * arp_xmit so intermediate drivers like * bonding can change the skb before * sending (e.g. insert 8021q tag). * Harald Welte : convert to make use of jenkins hash * Jesper D. Brouer: Proxy ARP PVLAN RFC 3069 support. 
*/ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/types.h> #include <linux/string.h> #include <linux/kernel.h> #include <linux/capability.h> #include <linux/socket.h> #include <linux/sockios.h> #include <linux/errno.h> #include <linux/in.h> #include <linux/mm.h> #include <linux/inet.h> #include <linux/inetdevice.h> #include <linux/netdevice.h> #include <linux/etherdevice.h> #include <linux/fddidevice.h> #include <linux/if_arp.h> #include <linux/skbuff.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/stat.h> #include <linux/init.h> #include <linux/net.h> #include <linux/rcupdate.h> #include <linux/slab.h> #ifdef CONFIG_SYSCTL #include <linux/sysctl.h> #endif #include <net/net_namespace.h> #include <net/ip.h> #include <net/icmp.h> #include <net/route.h> #include <net/protocol.h> #include <net/tcp.h> #include <net/sock.h> #include <net/arp.h> #include <net/ax25.h> #include <net/netrom.h> #include <net/dst_metadata.h> #include <net/ip_tunnels.h> #include <linux/uaccess.h> #include <linux/netfilter_arp.h> /* * Interface to generic neighbour cache. */ static u32 arp_hash(const void *pkey, const struct net_device *dev, __u32 *hash_rnd); static bool arp_key_eq(const struct neighbour *n, const void *pkey); static int arp_constructor(struct neighbour *neigh); static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb); static void arp_error_report(struct neighbour *neigh, struct sk_buff *skb); static void parp_redo(struct sk_buff *skb); static int arp_is_multicast(const void *pkey); static const struct neigh_ops arp_generic_ops = { .family = AF_INET, .solicit = arp_solicit, .error_report = arp_error_report, .output = neigh_resolve_output, .connected_output = neigh_connected_output, }; static const struct neigh_ops arp_hh_ops = { .family = AF_INET, .solicit = arp_solicit, .error_report = arp_error_report, .output = neigh_resolve_output, .connected_output = neigh_resolve_output, }; static const struct neigh_ops arp_direct_ops = { .family = AF_INET, .output = neigh_direct_output, .connected_output = neigh_direct_output, }; struct neigh_table arp_tbl = { .family = AF_INET, .key_len = 4, .protocol = cpu_to_be16(ETH_P_IP), .hash = arp_hash, .key_eq = arp_key_eq, .constructor = arp_constructor, .proxy_redo = parp_redo, .is_multicast = arp_is_multicast, .id = "arp_cache", .parms = { .tbl = &arp_tbl, .reachable_time = 30 * HZ, .data = { [NEIGH_VAR_MCAST_PROBES] = 3, [NEIGH_VAR_UCAST_PROBES] = 3, [NEIGH_VAR_RETRANS_TIME] = 1 * HZ, [NEIGH_VAR_BASE_REACHABLE_TIME] = 30 * HZ, [NEIGH_VAR_DELAY_PROBE_TIME] = 5 * HZ, [NEIGH_VAR_INTERVAL_PROBE_TIME_MS] = 5 * HZ, [NEIGH_VAR_GC_STALETIME] = 60 * HZ, [NEIGH_VAR_QUEUE_LEN_BYTES] = SK_WMEM_MAX, [NEIGH_VAR_PROXY_QLEN] = 64, [NEIGH_VAR_ANYCAST_DELAY] = 1 * HZ, [NEIGH_VAR_PROXY_DELAY] = (8 * HZ) / 10, [NEIGH_VAR_LOCKTIME] = 1 * HZ, }, }, .gc_interval = 30 * HZ, .gc_thresh1 = 128, .gc_thresh2 = 512, .gc_thresh3 = 1024, }; EXPORT_SYMBOL(arp_tbl); int arp_mc_map(__be32 addr, u8 *haddr, struct net_device *dev, int dir) { switch (dev->type) { case ARPHRD_ETHER: case ARPHRD_FDDI: case ARPHRD_IEEE802: ip_eth_mc_map(addr, haddr); return 0; case ARPHRD_INFINIBAND: ip_ib_mc_map(addr, dev->broadcast, haddr); return 0; case ARPHRD_IPGRE: ip_ipgre_mc_map(addr, dev->broadcast, haddr); return 0; default: if (dir) { memcpy(haddr, dev->broadcast, dev->addr_len); return 0; } } return -EINVAL; } static u32 arp_hash(const void *pkey, const struct net_device *dev, __u32 *hash_rnd) { return arp_hashfn(pkey, dev, hash_rnd); 
} static bool arp_key_eq(const struct neighbour *neigh, const void *pkey) { return neigh_key_eq32(neigh, pkey); } static int arp_constructor(struct neighbour *neigh) { __be32 addr; struct net_device *dev = neigh->dev; struct in_device *in_dev; struct neigh_parms *parms; u32 inaddr_any = INADDR_ANY; if (dev->flags & (IFF_LOOPBACK | IFF_POINTOPOINT)) memcpy(neigh->primary_key, &inaddr_any, arp_tbl.key_len); addr = *(__be32 *)neigh->primary_key; rcu_read_lock(); in_dev = __in_dev_get_rcu(dev); if (!in_dev) { rcu_read_unlock(); return -EINVAL; } neigh->type = inet_addr_type_dev_table(dev_net(dev), dev, addr); parms = in_dev->arp_parms; __neigh_parms_put(neigh->parms); neigh->parms = neigh_parms_clone(parms); rcu_read_unlock(); if (!dev->header_ops) { neigh->nud_state = NUD_NOARP; neigh->ops = &arp_direct_ops; neigh->output = neigh_direct_output; } else { /* Good devices (checked by reading texts, but only Ethernet is tested) ARPHRD_ETHER: (ethernet, apfddi) ARPHRD_FDDI: (fddi) ARPHRD_IEEE802: (tr) ARPHRD_METRICOM: (strip) ARPHRD_ARCNET: etc. etc. etc. ARPHRD_IPDDP will also work, if author repairs it. I did not it, because this driver does not work even in old paradigm. */ if (neigh->type == RTN_MULTICAST) { neigh->nud_state = NUD_NOARP; arp_mc_map(addr, neigh->ha, dev, 1); } else if (dev->flags & (IFF_NOARP | IFF_LOOPBACK)) { neigh->nud_state = NUD_NOARP; memcpy(neigh->ha, dev->dev_addr, dev->addr_len); } else if (neigh->type == RTN_BROADCAST || (dev->flags & IFF_POINTOPOINT)) { neigh->nud_state = NUD_NOARP; memcpy(neigh->ha, dev->broadcast, dev->addr_len); } if (dev->header_ops->cache) neigh->ops = &arp_hh_ops; else neigh->ops = &arp_generic_ops; if (neigh->nud_state & NUD_VALID) neigh->output = neigh->ops->connected_output; else neigh->output = neigh->ops->output; } return 0; } static void arp_error_report(struct neighbour *neigh, struct sk_buff *skb) { dst_link_failure(skb); kfree_skb_reason(skb, SKB_DROP_REASON_NEIGH_FAILED); } /* Create and send an arp packet. */ static void arp_send_dst(int type, int ptype, __be32 dest_ip, struct net_device *dev, __be32 src_ip, const unsigned char *dest_hw, const unsigned char *src_hw, const unsigned char *target_hw, struct dst_entry *dst) { struct sk_buff *skb; /* arp on this interface. 
*/ if (dev->flags & IFF_NOARP) return; skb = arp_create(type, ptype, dest_ip, dev, src_ip, dest_hw, src_hw, target_hw); if (!skb) return; skb_dst_set(skb, dst_clone(dst)); arp_xmit(skb); } void arp_send(int type, int ptype, __be32 dest_ip, struct net_device *dev, __be32 src_ip, const unsigned char *dest_hw, const unsigned char *src_hw, const unsigned char *target_hw) { arp_send_dst(type, ptype, dest_ip, dev, src_ip, dest_hw, src_hw, target_hw, NULL); } EXPORT_SYMBOL(arp_send); static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb) { __be32 saddr = 0; u8 dst_ha[MAX_ADDR_LEN], *dst_hw = NULL; struct net_device *dev = neigh->dev; __be32 target = *(__be32 *)neigh->primary_key; int probes = atomic_read(&neigh->probes); struct in_device *in_dev; struct dst_entry *dst = NULL; rcu_read_lock(); in_dev = __in_dev_get_rcu(dev); if (!in_dev) { rcu_read_unlock(); return; } switch (IN_DEV_ARP_ANNOUNCE(in_dev)) { default: case 0: /* By default announce any local IP */ if (skb && inet_addr_type_dev_table(dev_net(dev), dev, ip_hdr(skb)->saddr) == RTN_LOCAL) saddr = ip_hdr(skb)->saddr; break; case 1: /* Restrict announcements of saddr in same subnet */ if (!skb) break; saddr = ip_hdr(skb)->saddr; if (inet_addr_type_dev_table(dev_net(dev), dev, saddr) == RTN_LOCAL) { /* saddr should be known to target */ if (inet_addr_onlink(in_dev, target, saddr)) break; } saddr = 0; break; case 2: /* Avoid secondary IPs, get a primary/preferred one */ break; } rcu_read_unlock(); if (!saddr) saddr = inet_select_addr(dev, target, RT_SCOPE_LINK); probes -= NEIGH_VAR(neigh->parms, UCAST_PROBES); if (probes < 0) { if (!(READ_ONCE(neigh->nud_state) & NUD_VALID)) pr_debug("trying to ucast probe in NUD_INVALID\n"); neigh_ha_snapshot(dst_ha, neigh, dev); dst_hw = dst_ha; } else { probes -= NEIGH_VAR(neigh->parms, APP_PROBES); if (probes < 0) { neigh_app_ns(neigh); return; } } if (skb && !(dev->priv_flags & IFF_XMIT_DST_RELEASE)) dst = skb_dst(skb); arp_send_dst(ARPOP_REQUEST, ETH_P_ARP, target, dev, saddr, dst_hw, dev->dev_addr, NULL, dst); } static int arp_ignore(struct in_device *in_dev, __be32 sip, __be32 tip) { struct net *net = dev_net(in_dev->dev); int scope; switch (IN_DEV_ARP_IGNORE(in_dev)) { case 0: /* Reply, the tip is already validated */ return 0; case 1: /* Reply only if tip is configured on the incoming interface */ sip = 0; scope = RT_SCOPE_HOST; break; case 2: /* * Reply only if tip is configured on the incoming interface * and is in same subnet as sip */ scope = RT_SCOPE_HOST; break; case 3: /* Do not reply for scope host addresses */ sip = 0; scope = RT_SCOPE_LINK; in_dev = NULL; break; case 4: /* Reserved */ case 5: case 6: case 7: return 0; case 8: /* Do not reply */ return 1; default: return 0; } return !inet_confirm_addr(net, in_dev, sip, tip, scope); } static int arp_accept(struct in_device *in_dev, __be32 sip) { struct net *net = dev_net(in_dev->dev); int scope = RT_SCOPE_LINK; switch (IN_DEV_ARP_ACCEPT(in_dev)) { case 0: /* Don't create new entries from garp */ return 0; case 1: /* Create new entries from garp */ return 1; case 2: /* Create a neighbor in the arp table only if sip * is in the same subnet as an address configured * on the interface that received the garp message */ return !!inet_confirm_addr(net, in_dev, sip, 0, scope); default: return 0; } } static int arp_filter(__be32 sip, __be32 tip, struct net_device *dev) { struct rtable *rt; int flag = 0; /*unsigned long now; */ struct net *net = dev_net(dev); rt = ip_route_output(net, sip, tip, 0, l3mdev_master_ifindex_rcu(dev), 
RT_SCOPE_UNIVERSE); if (IS_ERR(rt)) return 1; if (rt->dst.dev != dev) { __NET_INC_STATS(net, LINUX_MIB_ARPFILTER); flag = 1; } ip_rt_put(rt); return flag; } /* * Check if we can use proxy ARP for this path */ static inline int arp_fwd_proxy(struct in_device *in_dev, struct net_device *dev, struct rtable *rt) { struct in_device *out_dev; int imi, omi = -1; if (rt->dst.dev == dev) return 0; if (!IN_DEV_PROXY_ARP(in_dev)) return 0; imi = IN_DEV_MEDIUM_ID(in_dev); if (imi == 0) return 1; if (imi == -1) return 0; /* place to check for proxy_arp for routes */ out_dev = __in_dev_get_rcu(rt->dst.dev); if (out_dev) omi = IN_DEV_MEDIUM_ID(out_dev); return omi != imi && omi != -1; } /* * Check for RFC3069 proxy arp private VLAN (allow to send back to same dev) * * RFC3069 supports proxy arp replies back to the same interface. This * is done to support (ethernet) switch features, like RFC 3069, where * the individual ports are not allowed to communicate with each * other, BUT they are allowed to talk to the upstream router. As * described in RFC 3069, it is possible to allow these hosts to * communicate through the upstream router, by proxy_arp'ing. * * RFC 3069: "VLAN Aggregation for Efficient IP Address Allocation" * * This technology is known by different names: * In RFC 3069 it is called VLAN Aggregation. * Cisco and Allied Telesyn call it Private VLAN. * Hewlett-Packard call it Source-Port filtering or port-isolation. * Ericsson call it MAC-Forced Forwarding (RFC Draft). * */ static inline int arp_fwd_pvlan(struct in_device *in_dev, struct net_device *dev, struct rtable *rt, __be32 sip, __be32 tip) { /* Private VLAN is only concerned about the same ethernet segment */ if (rt->dst.dev != dev) return 0; /* Don't reply on self probes (often done by windowz boxes)*/ if (sip == tip) return 0; if (IN_DEV_PROXY_ARP_PVLAN(in_dev)) return 1; else return 0; } /* * Interface to link layer: send routine and receive handler. */ /* * Create an arp packet. If dest_hw is not set, we create a broadcast * message. */ struct sk_buff *arp_create(int type, int ptype, __be32 dest_ip, struct net_device *dev, __be32 src_ip, const unsigned char *dest_hw, const unsigned char *src_hw, const unsigned char *target_hw) { struct sk_buff *skb; struct arphdr *arp; unsigned char *arp_ptr; int hlen = LL_RESERVED_SPACE(dev); int tlen = dev->needed_tailroom; /* * Allocate a buffer */ skb = alloc_skb(arp_hdr_len(dev) + hlen + tlen, GFP_ATOMIC); if (!skb) return NULL; skb_reserve(skb, hlen); skb_reset_network_header(skb); arp = skb_put(skb, arp_hdr_len(dev)); skb->dev = dev; skb->protocol = htons(ETH_P_ARP); if (!src_hw) src_hw = dev->dev_addr; if (!dest_hw) dest_hw = dev->broadcast; /* * Fill the device header for the ARP frame */ if (dev_hard_header(skb, dev, ptype, dest_hw, src_hw, skb->len) < 0) goto out; /* * Fill out the arp protocol part. * * The arp hardware type should match the device type, except for FDDI, * which (according to RFC 1390) should always equal 1 (Ethernet). */ /* * Exceptions everywhere. AX.25 uses the AX.25 PID value not the * DIX code for the protocol. Make these device structure fields. 
*/ switch (dev->type) { default: arp->ar_hrd = htons(dev->type); arp->ar_pro = htons(ETH_P_IP); break; #if IS_ENABLED(CONFIG_AX25) case ARPHRD_AX25: arp->ar_hrd = htons(ARPHRD_AX25); arp->ar_pro = htons(AX25_P_IP); break; #if IS_ENABLED(CONFIG_NETROM) case ARPHRD_NETROM: arp->ar_hrd = htons(ARPHRD_NETROM); arp->ar_pro = htons(AX25_P_IP); break; #endif #endif #if IS_ENABLED(CONFIG_FDDI) case ARPHRD_FDDI: arp->ar_hrd = htons(ARPHRD_ETHER); arp->ar_pro = htons(ETH_P_IP); break; #endif } arp->ar_hln = dev->addr_len; arp->ar_pln = 4; arp->ar_op = htons(type); arp_ptr = (unsigned char *)(arp + 1); memcpy(arp_ptr, src_hw, dev->addr_len); arp_ptr += dev->addr_len; memcpy(arp_ptr, &src_ip, 4); arp_ptr += 4; switch (dev->type) { #if IS_ENABLED(CONFIG_FIREWIRE_NET) case ARPHRD_IEEE1394: break; #endif default: if (target_hw) memcpy(arp_ptr, target_hw, dev->addr_len); else memset(arp_ptr, 0, dev->addr_len); arp_ptr += dev->addr_len; } memcpy(arp_ptr, &dest_ip, 4); return skb; out: kfree_skb(skb); return NULL; } EXPORT_SYMBOL(arp_create); static int arp_xmit_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { return dev_queue_xmit(skb); } /* * Send an arp packet. */ void arp_xmit(struct sk_buff *skb) { rcu_read_lock(); /* Send it off, maybe filter it using firewalling first. */ NF_HOOK(NFPROTO_ARP, NF_ARP_OUT, dev_net_rcu(skb->dev), NULL, skb, NULL, skb->dev, arp_xmit_finish); rcu_read_unlock(); } EXPORT_SYMBOL(arp_xmit); static bool arp_is_garp(struct net *net, struct net_device *dev, int *addr_type, __be16 ar_op, __be32 sip, __be32 tip, unsigned char *sha, unsigned char *tha) { bool is_garp = tip == sip; /* Gratuitous ARP _replies_ also require target hwaddr to be * the same as source. */ if (is_garp && ar_op == htons(ARPOP_REPLY)) is_garp = /* IPv4 over IEEE 1394 doesn't provide target * hardware address field in its ARP payload. */ tha && !memcmp(tha, sha, dev->addr_len); if (is_garp) { *addr_type = inet_addr_type_dev_table(net, dev, sip); if (*addr_type != RTN_UNICAST) is_garp = false; } return is_garp; } /* * Process an arp request. */ static int arp_process(struct net *net, struct sock *sk, struct sk_buff *skb) { struct net_device *dev = skb->dev; struct in_device *in_dev = __in_dev_get_rcu(dev); struct arphdr *arp; unsigned char *arp_ptr; struct rtable *rt; unsigned char *sha; unsigned char *tha = NULL; __be32 sip, tip; u16 dev_type = dev->type; int addr_type; struct neighbour *n; struct dst_entry *reply_dst = NULL; bool is_garp = false; /* arp_rcv below verifies the ARP header and verifies the device * is ARP'able. */ if (!in_dev) goto out_free_skb; arp = arp_hdr(skb); switch (dev_type) { default: if (arp->ar_pro != htons(ETH_P_IP) || htons(dev_type) != arp->ar_hrd) goto out_free_skb; break; case ARPHRD_ETHER: case ARPHRD_FDDI: case ARPHRD_IEEE802: /* * ETHERNET, and Fibre Channel (which are IEEE 802 * devices, according to RFC 2625) devices will accept ARP * hardware types of either 1 (Ethernet) or 6 (IEEE 802.2). 
* This is the case also of FDDI, where the RFC 1390 says that * FDDI devices should accept ARP hardware of (1) Ethernet, * however, to be more robust, we'll accept both 1 (Ethernet) * or 6 (IEEE 802.2) */ if ((arp->ar_hrd != htons(ARPHRD_ETHER) && arp->ar_hrd != htons(ARPHRD_IEEE802)) || arp->ar_pro != htons(ETH_P_IP)) goto out_free_skb; break; case ARPHRD_AX25: if (arp->ar_pro != htons(AX25_P_IP) || arp->ar_hrd != htons(ARPHRD_AX25)) goto out_free_skb; break; case ARPHRD_NETROM: if (arp->ar_pro != htons(AX25_P_IP) || arp->ar_hrd != htons(ARPHRD_NETROM)) goto out_free_skb; break; } /* Understand only these message types */ if (arp->ar_op != htons(ARPOP_REPLY) && arp->ar_op != htons(ARPOP_REQUEST)) goto out_free_skb; /* * Extract fields */ arp_ptr = (unsigned char *)(arp + 1); sha = arp_ptr; arp_ptr += dev->addr_len; memcpy(&sip, arp_ptr, 4); arp_ptr += 4; switch (dev_type) { #if IS_ENABLED(CONFIG_FIREWIRE_NET) case ARPHRD_IEEE1394: break; #endif default: tha = arp_ptr; arp_ptr += dev->addr_len; } memcpy(&tip, arp_ptr, 4); /* * Check for bad requests for 127.x.x.x and requests for multicast * addresses. If this is one such, delete it. */ if (ipv4_is_multicast(tip) || (!IN_DEV_ROUTE_LOCALNET(in_dev) && ipv4_is_loopback(tip))) goto out_free_skb; /* * For some 802.11 wireless deployments (and possibly other networks), * there will be an ARP proxy and gratuitous ARP frames are attacks * and thus should not be accepted. */ if (sip == tip && IN_DEV_ORCONF(in_dev, DROP_GRATUITOUS_ARP)) goto out_free_skb; /* * Special case: We must set Frame Relay source Q.922 address */ if (dev_type == ARPHRD_DLCI) sha = dev->broadcast; /* * Process entry. The idea here is we want to send a reply if it is a * request for us or if it is a request for someone else that we hold * a proxy for. We want to add an entry to our cache if it is a reply * to us or if it is a request for our address. * (The assumption for this last is that if someone is requesting our * address, they are probably intending to talk to us, so it saves time * if we cache their address. Their address is also probably not in * our cache, since ours is not in their cache.) * * Putting this another way, we only care about replies if they are to * us, in which case we add them to the cache. For requests, we care * about those for us and those for our proxies. We reply to both, * and in the case of requests for us we add the requester to the arp * cache. 
*/ if (arp->ar_op == htons(ARPOP_REQUEST) && skb_metadata_dst(skb)) reply_dst = (struct dst_entry *) iptunnel_metadata_reply(skb_metadata_dst(skb), GFP_ATOMIC); /* Special case: IPv4 duplicate address detection packet (RFC2131) */ if (sip == 0) { if (arp->ar_op == htons(ARPOP_REQUEST) && inet_addr_type_dev_table(net, dev, tip) == RTN_LOCAL && !arp_ignore(in_dev, sip, tip)) arp_send_dst(ARPOP_REPLY, ETH_P_ARP, sip, dev, tip, sha, dev->dev_addr, sha, reply_dst); goto out_consume_skb; } if (arp->ar_op == htons(ARPOP_REQUEST) && ip_route_input_noref(skb, tip, sip, 0, dev) == 0) { rt = skb_rtable(skb); addr_type = rt->rt_type; if (addr_type == RTN_LOCAL) { int dont_send; dont_send = arp_ignore(in_dev, sip, tip); if (!dont_send && IN_DEV_ARPFILTER(in_dev)) dont_send = arp_filter(sip, tip, dev); if (!dont_send) { n = neigh_event_ns(&arp_tbl, sha, &sip, dev); if (n) { arp_send_dst(ARPOP_REPLY, ETH_P_ARP, sip, dev, tip, sha, dev->dev_addr, sha, reply_dst); neigh_release(n); } } goto out_consume_skb; } else if (IN_DEV_FORWARD(in_dev)) { if (addr_type == RTN_UNICAST && (arp_fwd_proxy(in_dev, dev, rt) || arp_fwd_pvlan(in_dev, dev, rt, sip, tip) || (rt->dst.dev != dev && pneigh_lookup(&arp_tbl, net, &tip, dev)))) { n = neigh_event_ns(&arp_tbl, sha, &sip, dev); if (n) neigh_release(n); if (NEIGH_CB(skb)->flags & LOCALLY_ENQUEUED || skb->pkt_type == PACKET_HOST || NEIGH_VAR(in_dev->arp_parms, PROXY_DELAY) == 0) { arp_send_dst(ARPOP_REPLY, ETH_P_ARP, sip, dev, tip, sha, dev->dev_addr, sha, reply_dst); } else { pneigh_enqueue(&arp_tbl, in_dev->arp_parms, skb); goto out_free_dst; } goto out_consume_skb; } } } /* Update our ARP tables */ n = __neigh_lookup(&arp_tbl, &sip, dev, 0); addr_type = -1; if (n || arp_accept(in_dev, sip)) { is_garp = arp_is_garp(net, dev, &addr_type, arp->ar_op, sip, tip, sha, tha); } if (arp_accept(in_dev, sip)) { /* Unsolicited ARP is not accepted by default. It is possible, that this option should be enabled for some devices (strip is candidate) */ if (!n && (is_garp || (arp->ar_op == htons(ARPOP_REPLY) && (addr_type == RTN_UNICAST || (addr_type < 0 && /* postpone calculation to as late as possible */ inet_addr_type_dev_table(net, dev, sip) == RTN_UNICAST))))) n = __neigh_lookup(&arp_tbl, &sip, dev, 1); } if (n) { int state = NUD_REACHABLE; int override; /* If several different ARP replies follows back-to-back, use the FIRST one. It is possible, if several proxy agents are active. Taking the first reply prevents arp trashing and chooses the fastest router. */ override = time_after(jiffies, n->updated + NEIGH_VAR(n->parms, LOCKTIME)) || is_garp; /* Broadcast replies and request packets do not assert neighbour reachability. */ if (arp->ar_op != htons(ARPOP_REPLY) || skb->pkt_type != PACKET_HOST) state = NUD_STALE; neigh_update(n, sha, state, override ? NEIGH_UPDATE_F_OVERRIDE : 0, 0); neigh_release(n); } out_consume_skb: consume_skb(skb); out_free_dst: dst_release(reply_dst); return NET_RX_SUCCESS; out_free_skb: kfree_skb(skb); return NET_RX_DROP; } static void parp_redo(struct sk_buff *skb) { arp_process(dev_net(skb->dev), NULL, skb); } static int arp_is_multicast(const void *pkey) { return ipv4_is_multicast(*((__be32 *)pkey)); } /* * Receive an arp request from the device layer. 
*/ static int arp_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { enum skb_drop_reason drop_reason; const struct arphdr *arp; /* do not tweak dropwatch on an ARP we will ignore */ if (dev->flags & IFF_NOARP || skb->pkt_type == PACKET_OTHERHOST || skb->pkt_type == PACKET_LOOPBACK) goto consumeskb; skb = skb_share_check(skb, GFP_ATOMIC); if (!skb) goto out_of_mem; /* ARP header, plus 2 device addresses, plus 2 IP addresses. */ drop_reason = pskb_may_pull_reason(skb, arp_hdr_len(dev)); if (drop_reason != SKB_NOT_DROPPED_YET) goto freeskb; arp = arp_hdr(skb); if (arp->ar_hln != dev->addr_len || arp->ar_pln != 4) { drop_reason = SKB_DROP_REASON_NOT_SPECIFIED; goto freeskb; } memset(NEIGH_CB(skb), 0, sizeof(struct neighbour_cb)); return NF_HOOK(NFPROTO_ARP, NF_ARP_IN, dev_net(dev), NULL, skb, dev, NULL, arp_process); consumeskb: consume_skb(skb); return NET_RX_SUCCESS; freeskb: kfree_skb_reason(skb, drop_reason); out_of_mem: return NET_RX_DROP; } /* * User level interface (ioctl) */ static struct net_device *arp_req_dev_by_name(struct net *net, struct arpreq *r, bool getarp) { struct net_device *dev; if (getarp) dev = dev_get_by_name_rcu(net, r->arp_dev); else dev = __dev_get_by_name(net, r->arp_dev); if (!dev) return ERR_PTR(-ENODEV); /* Mmmm... It is wrong... ARPHRD_NETROM == 0 */ if (!r->arp_ha.sa_family) r->arp_ha.sa_family = dev->type; if ((r->arp_flags & ATF_COM) && r->arp_ha.sa_family != dev->type) return ERR_PTR(-EINVAL); return dev; } static struct net_device *arp_req_dev(struct net *net, struct arpreq *r) { struct net_device *dev; struct rtable *rt; __be32 ip; if (r->arp_dev[0]) return arp_req_dev_by_name(net, r, false); if (r->arp_flags & ATF_PUBL) return NULL; ip = ((struct sockaddr_in *)&r->arp_pa)->sin_addr.s_addr; rt = ip_route_output(net, ip, 0, 0, 0, RT_SCOPE_LINK); if (IS_ERR(rt)) return ERR_CAST(rt); dev = rt->dst.dev; ip_rt_put(rt); if (!dev) return ERR_PTR(-EINVAL); return dev; } /* * Set (create) an ARP cache entry. */ static int arp_req_set_proxy(struct net *net, struct net_device *dev, int on) { if (!dev) { IPV4_DEVCONF_ALL(net, PROXY_ARP) = on; return 0; } if (__in_dev_get_rtnl_net(dev)) { IN_DEV_CONF_SET(__in_dev_get_rtnl_net(dev), PROXY_ARP, on); return 0; } return -ENXIO; } static int arp_req_set_public(struct net *net, struct arpreq *r, struct net_device *dev) { __be32 mask = ((struct sockaddr_in *)&r->arp_netmask)->sin_addr.s_addr; if (!dev && (r->arp_flags & ATF_COM)) { dev = dev_getbyhwaddr(net, r->arp_ha.sa_family, r->arp_ha.sa_data); if (!dev) return -ENODEV; } if (mask) { __be32 ip = ((struct sockaddr_in *)&r->arp_pa)->sin_addr.s_addr; return pneigh_create(&arp_tbl, net, &ip, dev, 0, 0, false); } return arp_req_set_proxy(net, dev, 1); } static int arp_req_set(struct net *net, struct arpreq *r) { struct neighbour *neigh; struct net_device *dev; __be32 ip; int err; dev = arp_req_dev(net, r); if (IS_ERR(dev)) return PTR_ERR(dev); if (r->arp_flags & ATF_PUBL) return arp_req_set_public(net, r, dev); switch (dev->type) { #if IS_ENABLED(CONFIG_FDDI) case ARPHRD_FDDI: /* * According to RFC 1390, FDDI devices should accept ARP * hardware types of 1 (Ethernet). However, to be more * robust, we'll accept hardware types of either 1 (Ethernet) * or 6 (IEEE 802.2). 
*/ if (r->arp_ha.sa_family != ARPHRD_FDDI && r->arp_ha.sa_family != ARPHRD_ETHER && r->arp_ha.sa_family != ARPHRD_IEEE802) return -EINVAL; break; #endif default: if (r->arp_ha.sa_family != dev->type) return -EINVAL; break; } ip = ((struct sockaddr_in *)&r->arp_pa)->sin_addr.s_addr; neigh = __neigh_lookup_errno(&arp_tbl, &ip, dev); err = PTR_ERR(neigh); if (!IS_ERR(neigh)) { unsigned int state = NUD_STALE; if (r->arp_flags & ATF_PERM) { r->arp_flags |= ATF_COM; state = NUD_PERMANENT; } err = neigh_update(neigh, (r->arp_flags & ATF_COM) ? r->arp_ha.sa_data : NULL, state, NEIGH_UPDATE_F_OVERRIDE | NEIGH_UPDATE_F_ADMIN, 0); neigh_release(neigh); } return err; } static unsigned int arp_state_to_flags(struct neighbour *neigh) { if (neigh->nud_state&NUD_PERMANENT) return ATF_PERM | ATF_COM; else if (neigh->nud_state&NUD_VALID) return ATF_COM; else return 0; } /* * Get an ARP cache entry. */ static int arp_req_get(struct net *net, struct arpreq *r) { __be32 ip = ((struct sockaddr_in *) &r->arp_pa)->sin_addr.s_addr; struct neighbour *neigh; struct net_device *dev; if (!r->arp_dev[0]) return -ENODEV; dev = arp_req_dev_by_name(net, r, true); if (IS_ERR(dev)) return PTR_ERR(dev); neigh = neigh_lookup(&arp_tbl, &ip, dev); if (!neigh) return -ENXIO; if (READ_ONCE(neigh->nud_state) & NUD_NOARP) { neigh_release(neigh); return -ENXIO; } read_lock_bh(&neigh->lock); memcpy(r->arp_ha.sa_data, neigh->ha, min(dev->addr_len, sizeof(r->arp_ha.sa_data_min))); r->arp_flags = arp_state_to_flags(neigh); read_unlock_bh(&neigh->lock); neigh_release(neigh); r->arp_ha.sa_family = dev->type; netdev_copy_name(dev, r->arp_dev); return 0; } int arp_invalidate(struct net_device *dev, __be32 ip, bool force) { struct neighbour *neigh = neigh_lookup(&arp_tbl, &ip, dev); int err = -ENXIO; struct neigh_table *tbl = &arp_tbl; if (neigh) { if ((READ_ONCE(neigh->nud_state) & NUD_VALID) && !force) { neigh_release(neigh); return 0; } if (READ_ONCE(neigh->nud_state) & ~NUD_NOARP) err = neigh_update(neigh, NULL, NUD_FAILED, NEIGH_UPDATE_F_OVERRIDE| NEIGH_UPDATE_F_ADMIN, 0); write_lock_bh(&tbl->lock); neigh_release(neigh); neigh_remove_one(neigh); write_unlock_bh(&tbl->lock); } return err; } static int arp_req_delete_public(struct net *net, struct arpreq *r, struct net_device *dev) { __be32 mask = ((struct sockaddr_in *)&r->arp_netmask)->sin_addr.s_addr; if (mask) { __be32 ip = ((struct sockaddr_in *)&r->arp_pa)->sin_addr.s_addr; return pneigh_delete(&arp_tbl, net, &ip, dev); } return arp_req_set_proxy(net, dev, 0); } static int arp_req_delete(struct net *net, struct arpreq *r) { struct net_device *dev; __be32 ip; dev = arp_req_dev(net, r); if (IS_ERR(dev)) return PTR_ERR(dev); if (r->arp_flags & ATF_PUBL) return arp_req_delete_public(net, r, dev); ip = ((struct sockaddr_in *)&r->arp_pa)->sin_addr.s_addr; return arp_invalidate(dev, ip, true); } /* * Handle an ARP layer I/O control request. 
*/ int arp_ioctl(struct net *net, unsigned int cmd, void __user *arg) { struct arpreq r; __be32 *netmask; int err; switch (cmd) { case SIOCDARP: case SIOCSARP: if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) return -EPERM; fallthrough; case SIOCGARP: err = copy_from_user(&r, arg, sizeof(struct arpreq)); if (err) return -EFAULT; break; default: return -EINVAL; } if (r.arp_pa.sa_family != AF_INET) return -EPFNOSUPPORT; if (!(r.arp_flags & ATF_PUBL) && (r.arp_flags & (ATF_NETMASK | ATF_DONTPUB))) return -EINVAL; netmask = &((struct sockaddr_in *)&r.arp_netmask)->sin_addr.s_addr; if (!(r.arp_flags & ATF_NETMASK)) *netmask = htonl(0xFFFFFFFFUL); else if (*netmask && *netmask != htonl(0xFFFFFFFFUL)) return -EINVAL; switch (cmd) { case SIOCDARP: rtnl_net_lock(net); err = arp_req_delete(net, &r); rtnl_net_unlock(net); break; case SIOCSARP: rtnl_net_lock(net); err = arp_req_set(net, &r); rtnl_net_unlock(net); break; case SIOCGARP: rcu_read_lock(); err = arp_req_get(net, &r); rcu_read_unlock(); if (!err && copy_to_user(arg, &r, sizeof(r))) err = -EFAULT; break; } return err; } static int arp_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct netdev_notifier_change_info *change_info; struct in_device *in_dev; bool evict_nocarrier; switch (event) { case NETDEV_CHANGEADDR: neigh_changeaddr(&arp_tbl, dev); rt_cache_flush(dev_net(dev)); break; case NETDEV_CHANGE: change_info = ptr; if (change_info->flags_changed & IFF_NOARP) neigh_changeaddr(&arp_tbl, dev); in_dev = __in_dev_get_rtnl(dev); if (!in_dev) evict_nocarrier = true; else evict_nocarrier = IN_DEV_ARP_EVICT_NOCARRIER(in_dev); if (evict_nocarrier && !netif_carrier_ok(dev)) neigh_carrier_down(&arp_tbl, dev); break; default: break; } return NOTIFY_DONE; } static struct notifier_block arp_netdev_notifier = { .notifier_call = arp_netdev_event, }; /* Note, that it is not on notifier chain. It is necessary, that this routine was called after route cache will be flushed. */ void arp_ifdown(struct net_device *dev) { neigh_ifdown(&arp_tbl, dev); } /* * Called once on startup. */ static struct packet_type arp_packet_type __read_mostly = { .type = cpu_to_be16(ETH_P_ARP), .func = arp_rcv, }; #ifdef CONFIG_PROC_FS #if IS_ENABLED(CONFIG_AX25) /* * ax25 -> ASCII conversion */ static void ax2asc2(ax25_address *a, char *buf) { char c, *s; int n; for (n = 0, s = buf; n < 6; n++) { c = (a->ax25_call[n] >> 1) & 0x7F; if (c != ' ') *s++ = c; } *s++ = '-'; n = (a->ax25_call[6] >> 1) & 0x0F; if (n > 9) { *s++ = '1'; n -= 10; } *s++ = n + '0'; *s++ = '\0'; if (*buf == '\0' || *buf == '-') { buf[0] = '*'; buf[1] = '\0'; } } #endif /* CONFIG_AX25 */ #define HBUFFERLEN 30 static void arp_format_neigh_entry(struct seq_file *seq, struct neighbour *n) { char hbuffer[HBUFFERLEN]; int k, j; char tbuf[16]; struct net_device *dev = n->dev; int hatype = dev->type; read_lock(&n->lock); /* Convert hardware address to XX:XX:XX:XX ... form. 
*/ #if IS_ENABLED(CONFIG_AX25) if (hatype == ARPHRD_AX25 || hatype == ARPHRD_NETROM) ax2asc2((ax25_address *)n->ha, hbuffer); else { #endif for (k = 0, j = 0; k < HBUFFERLEN - 3 && j < dev->addr_len; j++) { hbuffer[k++] = hex_asc_hi(n->ha[j]); hbuffer[k++] = hex_asc_lo(n->ha[j]); hbuffer[k++] = ':'; } if (k != 0) --k; hbuffer[k] = 0; #if IS_ENABLED(CONFIG_AX25) } #endif sprintf(tbuf, "%pI4", n->primary_key); seq_printf(seq, "%-16s 0x%-10x0x%-10x%-17s * %s\n", tbuf, hatype, arp_state_to_flags(n), hbuffer, dev->name); read_unlock(&n->lock); } static void arp_format_pneigh_entry(struct seq_file *seq, struct pneigh_entry *n) { struct net_device *dev = n->dev; int hatype = dev ? dev->type : 0; char tbuf[16]; sprintf(tbuf, "%pI4", n->key); seq_printf(seq, "%-16s 0x%-10x0x%-10x%s * %s\n", tbuf, hatype, ATF_PUBL | ATF_PERM, "00:00:00:00:00:00", dev ? dev->name : "*"); } static int arp_seq_show(struct seq_file *seq, void *v) { if (v == SEQ_START_TOKEN) { seq_puts(seq, "IP address HW type Flags " "HW address Mask Device\n"); } else { struct neigh_seq_state *state = seq->private; if (state->flags & NEIGH_SEQ_IS_PNEIGH) arp_format_pneigh_entry(seq, v); else arp_format_neigh_entry(seq, v); } return 0; } static void *arp_seq_start(struct seq_file *seq, loff_t *pos) { /* Don't want to confuse "arp -a" w/ magic entries, * so we tell the generic iterator to skip NUD_NOARP. */ return neigh_seq_start(seq, pos, &arp_tbl, NEIGH_SEQ_SKIP_NOARP); } static const struct seq_operations arp_seq_ops = { .start = arp_seq_start, .next = neigh_seq_next, .stop = neigh_seq_stop, .show = arp_seq_show, }; #endif /* CONFIG_PROC_FS */ static int __net_init arp_net_init(struct net *net) { if (!proc_create_net("arp", 0444, net->proc_net, &arp_seq_ops, sizeof(struct neigh_seq_state))) return -ENOMEM; return 0; } static void __net_exit arp_net_exit(struct net *net) { remove_proc_entry("arp", net->proc_net); } static struct pernet_operations arp_net_ops = { .init = arp_net_init, .exit = arp_net_exit, }; void __init arp_init(void) { neigh_table_init(NEIGH_ARP_TABLE, &arp_tbl); dev_add_pack(&arp_packet_type); register_pernet_subsys(&arp_net_ops); #ifdef CONFIG_SYSCTL neigh_sysctl_register(NULL, &arp_tbl.parms, NULL); #endif register_netdevice_notifier(&arp_netdev_notifier); }
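/*
 * Illustrative sketch (not from this file): how a driver such as
 * bonding might use the exported arp_send() helper above to emit a
 * broadcast ARP request probing for a peer address. The name
 * example_send_arp_probe() is hypothetical; the arp_send() signature
 * is the one defined in this file.
 */
#include <net/arp.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>

static void example_send_arp_probe(struct net_device *dev,	/* hypothetical */
				   __be32 target_ip, __be32 src_ip)
{
	/*
	 * dest_hw == NULL makes arp_create() fall back to dev->broadcast,
	 * so the request goes out as a link-level broadcast; target_hw ==
	 * NULL leaves the target hardware address zeroed, as is usual for
	 * an ARP request.
	 */
	arp_send(ARPOP_REQUEST, ETH_P_ARP, target_ip, dev, src_ip,
		 NULL, dev->dev_addr, NULL);
}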
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * USB Wacom tablet support - system specific code
 */

#include "wacom_wac.h"
#include "wacom.h"
#include <linux/input/mt.h>

#define WAC_MSG_RETRIES		5
#define WAC_CMD_RETRIES		10

#define DEV_ATTR_RW_PERM (S_IRUGO | S_IWUSR | S_IWGRP)
#define DEV_ATTR_WO_PERM (S_IWUSR | S_IWGRP)
#define DEV_ATTR_RO_PERM (S_IRUSR | S_IRGRP)
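/*
 * For reference: the attribute permission macros above expand to octal
 * 0664 (user/group writable, world readable), 0220 (user/group write-only)
 * and 0440 (user/group read-only), respectively.
 */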
static int wacom_get_report(struct hid_device *hdev, u8 type, u8 *buf,
			    size_t size, unsigned int retries)
{
	int retval;

	do {
		retval = hid_hw_raw_request(hdev, buf[0], buf, size, type,
					    HID_REQ_GET_REPORT);
	} while ((retval == -ETIMEDOUT || retval == -EAGAIN) && --retries);

	if (retval < 0)
		hid_err(hdev, "wacom_get_report: ran out of retries (last error = %d)\n",
			retval);

	return retval;
}

static int wacom_set_report(struct hid_device *hdev, u8 type, u8 *buf,
			    size_t size, unsigned int retries)
{
	int retval;

	do {
		retval = hid_hw_raw_request(hdev, buf[0], buf, size, type,
					    HID_REQ_SET_REPORT);
	} while ((retval == -ETIMEDOUT || retval == -EAGAIN) && --retries);

	if (retval < 0)
		hid_err(hdev, "wacom_set_report: ran out of retries (last error = %d)\n",
			retval);

	return retval;
}

static void wacom_wac_queue_insert(struct hid_device *hdev,
				   struct kfifo_rec_ptr_2 *fifo,
				   u8 *raw_data, int size)
{
	bool warned = false;

	while (kfifo_avail(fifo) < size) {
		if (!warned)
			hid_warn(hdev, "%s: kfifo has filled, starting to drop events\n",
				 __func__);
		warned = true;

		kfifo_skip(fifo);
	}

	kfifo_in(fifo, raw_data, size);
}

static void wacom_wac_queue_flush(struct hid_device *hdev,
				  struct kfifo_rec_ptr_2 *fifo)
{
	while (!kfifo_is_empty(fifo)) {
		int size = kfifo_peek_len(fifo);
		u8 *buf;
		unsigned int count;
		int err;

		buf = kzalloc(size, GFP_KERNEL);
		if (!buf) {
			kfifo_skip(fifo);
			continue;
		}

		count = kfifo_out(fifo, buf, size);
		if (count != size) {
			// Hard to say what is the "right" action in this
			// circumstance. Skipping the entry and continuing
			// to flush seems reasonable enough, however.
			hid_warn(hdev, "%s: removed fifo entry with unexpected size\n",
				 __func__);
			kfree(buf);
			continue;
		}

		err = hid_report_raw_event(hdev, HID_INPUT_REPORT, buf, size, false);
		if (err) {
			hid_warn(hdev, "%s: unable to flush event due to error %d\n",
				 __func__, err);
		}

		kfree(buf);
	}
}
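/*
 * Illustrative sketch (hypothetical helper, not part of the driver): the
 * pen fifo used above is a record fifo, so kfifo_in() stores a length
 * header next to each payload; that is what lets kfifo_peek_len() recover
 * the record size during the flush. The round trip looks like this:
 */
static void __maybe_unused wacom_pen_fifo_example(struct kfifo_rec_ptr_2 *fifo)
{
	u8 report[8] = { 0x02 };	/* made-up raw report, ID 0x02 */
	u8 out[8];

	if (kfifo_avail(fifo) >= sizeof(report))
		kfifo_in(fifo, report, sizeof(report));	/* one record */

	while (!kfifo_is_empty(fifo)) {
		unsigned int size = kfifo_peek_len(fifo); /* next record's size */

		if (size > sizeof(out) || kfifo_out(fifo, out, size) != size)
			break;
		/* 'out' now holds one complete raw report of 'size' bytes */
	}
}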
static int wacom_wac_pen_serial_enforce(struct hid_device *hdev,
					struct hid_report *report,
					u8 *raw_data, int report_size)
{
	struct wacom *wacom = hid_get_drvdata(hdev);
	struct wacom_wac *wacom_wac = &wacom->wacom_wac;
	struct wacom_features *features = &wacom_wac->features;
	bool flush = false;
	bool insert = false;
	int i, j;

	if (wacom_wac->serial[0] || !(features->quirks & WACOM_QUIRK_TOOLSERIAL))
		return 0;

	/* Queue events which have invalid tool type or serial number */
	for (i = 0; i < report->maxfield; i++) {
		for (j = 0; j < report->field[i]->maxusage; j++) {
			struct hid_field *field = report->field[i];
			struct hid_usage *usage = &field->usage[j];
			unsigned int equivalent_usage =
				wacom_equivalent_usage(usage->hid);
			unsigned int offset;
			unsigned int size;
			unsigned int value;

			if (equivalent_usage != HID_DG_INRANGE &&
			    equivalent_usage != HID_DG_TOOLSERIALNUMBER &&
			    equivalent_usage != WACOM_HID_WD_SERIALHI &&
			    equivalent_usage != WACOM_HID_WD_TOOLTYPE)
				continue;

			offset = field->report_offset;
			size = field->report_size;
			value = hid_field_extract(hdev, raw_data+1,
						  offset + j * size, size);

			/* If we go out of range, we need to flush the queue ASAP */
			if (equivalent_usage == HID_DG_INRANGE)
				value = !value;

			if (value) {
				flush = true;
				switch (equivalent_usage) {
				case HID_DG_TOOLSERIALNUMBER:
					wacom_wac->serial[0] = value;
					break;

				case WACOM_HID_WD_SERIALHI:
					wacom_wac->serial[0] |= ((__u64)value) << 32;
					break;

				case WACOM_HID_WD_TOOLTYPE:
					wacom_wac->id[0] = value;
					break;
				}
			} else {
				insert = true;
			}
		}
	}

	if (flush)
		wacom_wac_queue_flush(hdev, wacom_wac->pen_fifo);
	else if (insert)
		wacom_wac_queue_insert(hdev, wacom_wac->pen_fifo,
				       raw_data, report_size);

	return insert && !flush;
}

static int wacom_raw_event(struct hid_device *hdev, struct hid_report *report,
			   u8 *raw_data, int size)
{
	struct wacom *wacom = hid_get_drvdata(hdev);

	if (wacom->wacom_wac.features.type == BOOTLOADER)
		return 0;

	if (wacom_wac_pen_serial_enforce(hdev, report, raw_data, size))
		return -1;

	wacom->wacom_wac.data = raw_data;
	wacom_wac_irq(&wacom->wacom_wac, size);

	return 0;
}

static int wacom_open(struct input_dev *dev)
{
	struct wacom *wacom = input_get_drvdata(dev);

	return hid_hw_open(wacom->hdev);
}

static void wacom_close(struct input_dev *dev)
{
	struct wacom *wacom = input_get_drvdata(dev);

	/*
	 * wacom->hdev should never be null, but surprisingly, I had the case
	 * once while unplugging the Wacom Wireless Receiver.
	 */
	if (wacom->hdev)
		hid_hw_close(wacom->hdev);
}

/*
 * Calculate the resolution of the X or Y axis using hidinput_calc_abs_res.
 */
static int wacom_calc_hid_res(int logical_extents, int physical_extents,
			      unsigned unit, int exponent)
{
	struct hid_field field = {
		.logical_maximum = logical_extents,
		.physical_maximum = physical_extents,
		.unit = unit,
		.unit_exponent = exponent,
	};

	return hidinput_calc_abs_res(&field, ABS_X);
}
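/*
 * Worked example (illustrative numbers): with unit 0x11 (centimeters) and
 * exponent -3, physical extents are expressed in thousandths of a cm, i.e.
 * hundredths of a mm. A pen axis with a logical maximum of 43200 over a
 * physical maximum of 21600 (216 mm) therefore yields
 * 43200 / 216 = 200 units/mm.
 */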
static void wacom_hid_usage_quirk(struct hid_device *hdev,
				  struct hid_field *field, struct hid_usage *usage)
{
	struct wacom *wacom = hid_get_drvdata(hdev);
	struct wacom_features *features = &wacom->wacom_wac.features;
	unsigned int equivalent_usage = wacom_equivalent_usage(usage->hid);

	/*
	 * The Dell Canvas 27 needs to be switched to its vendor-defined
	 * report to provide the best resolution.
	 */
	if (hdev->vendor == USB_VENDOR_ID_WACOM &&
	    hdev->product == 0x4200 &&
	    field->application == HID_UP_MSVENDOR) {
		wacom->wacom_wac.mode_report = field->report->id;
		wacom->wacom_wac.mode_value = 2;
	}

	/*
	 * ISDv4 devices which predate HID's adoption of the
	 * HID_DG_BARRELSWITCH2 usage use 0x000D0000 in its
	 * place. We can accurately detect if a usage with that
	 * value should be HID_DG_BARRELSWITCH2 based on the
	 * surrounding usages, which have remained constant
	 * across generations.
	 */
	if (features->type == HID_GENERIC &&
	    usage->hid == 0x000D0000 &&
	    field->application == HID_DG_PEN &&
	    field->physical == HID_DG_STYLUS) {
		int i = usage->usage_index;

		if (i-4 >= 0 && i+1 < field->maxusage &&
		    field->usage[i-4].hid == HID_DG_TIPSWITCH &&
		    field->usage[i-3].hid == HID_DG_BARRELSWITCH &&
		    field->usage[i-2].hid == HID_DG_ERASER &&
		    field->usage[i-1].hid == HID_DG_INVERT &&
		    field->usage[i+1].hid == HID_DG_INRANGE) {
			usage->hid = HID_DG_BARRELSWITCH2;
		}
	}

	/*
	 * Wacom's AES devices use different vendor-defined usages to
	 * report serial number information compared to their branded
	 * hardware. The usages are also sometimes ill-defined and do
	 * not have the correct logical min/max values set. Let's patch
	 * the descriptor to use the branded usage convention and fix
	 * the errors.
	 */
	if (usage->hid == WACOM_HID_WT_SERIALNUMBER &&
	    field->report_size == 16 &&
	    field->index + 2 < field->report->maxfield) {
		struct hid_field *a = field->report->field[field->index + 1];
		struct hid_field *b = field->report->field[field->index + 2];

		if (a->maxusage > 0 &&
		    a->usage[0].hid == HID_DG_TOOLSERIALNUMBER &&
		    a->report_size == 32 &&
		    b->maxusage > 0 &&
		    b->usage[0].hid == 0xFF000000 &&
		    b->report_size == 8) {
			features->quirks |= WACOM_QUIRK_AESPEN;
			usage->hid = WACOM_HID_WD_TOOLTYPE;
			field->logical_minimum = S16_MIN;
			field->logical_maximum = S16_MAX;
			a->logical_minimum = S32_MIN;
			a->logical_maximum = S32_MAX;
			b->usage[0].hid = WACOM_HID_WD_SERIALHI;
			b->logical_minimum = 0;
			b->logical_maximum = U8_MAX;
		}
	}

	/* 2nd-generation Intuos Pro Large has incorrect Y maximum */
	if (hdev->vendor == USB_VENDOR_ID_WACOM &&
	    hdev->product == 0x0358 &&
	    WACOM_PEN_FIELD(field) &&
	    equivalent_usage == HID_GD_Y) {
		field->logical_maximum = 43200;
	}
}

static void wacom_feature_mapping(struct hid_device *hdev,
				  struct hid_field *field, struct hid_usage *usage)
{
	struct wacom *wacom = hid_get_drvdata(hdev);
	struct wacom_features *features = &wacom->wacom_wac.features;
	struct hid_data *hid_data = &wacom->wacom_wac.hid_data;
	unsigned int equivalent_usage = wacom_equivalent_usage(usage->hid);
	u8 *data;
	int ret;
	u32 n;

	wacom_hid_usage_quirk(hdev, field, usage);

	switch (equivalent_usage) {
	case WACOM_HID_WD_TOUCH_RING_SETTING:
		wacom->generic_has_leds = true;
		break;
	case HID_DG_CONTACTMAX:
		/* leave touch_max as is if predefined */
		if (!features->touch_max) {
			/* read manually */
			n = hid_report_len(field->report);
			data = hid_alloc_report_buf(field->report, GFP_KERNEL);
			if (!data)
				break;
			data[0] = field->report->id;
			ret = wacom_get_report(hdev, HID_FEATURE_REPORT,
					       data, n, WAC_CMD_RETRIES);
			if (ret == n && features->type == HID_GENERIC) {
				ret = hid_report_raw_event(hdev,
					HID_FEATURE_REPORT, data, n, 0);
			} else if (ret == 2 && features->type != HID_GENERIC) {
				features->touch_max = data[1];
			} else {
				features->touch_max = 16;
				hid_warn(hdev, "wacom_feature_mapping: could not get HID_DG_CONTACTMAX, defaulting to %d\n",
					 features->touch_max);
			}
			kfree(data);
		}
		break;
	case HID_DG_INPUTMODE:
		/* Ignore if value index is out of bounds. */
		if (usage->usage_index >= field->report_count) {
			dev_err(&hdev->dev, "HID_DG_INPUTMODE out of range\n");
			break;
		}

		hid_data->inputmode = field->report->id;
		hid_data->inputmode_index = usage->usage_index;
		break;

	case HID_UP_DIGITIZER:
		if (field->report->id == 0x0B &&
		    (field->application == WACOM_HID_G9_PEN ||
		     field->application == WACOM_HID_G11_PEN)) {
			wacom->wacom_wac.mode_report = field->report->id;
			wacom->wacom_wac.mode_value = 0;
		}
		break;

	case WACOM_HID_WD_DATAMODE:
		wacom->wacom_wac.mode_report = field->report->id;
		wacom->wacom_wac.mode_value = 2;
		break;

	case WACOM_HID_UP_G9:
	case WACOM_HID_UP_G11:
		if (field->report->id == 0x03 &&
		    (field->application == WACOM_HID_G9_TOUCHSCREEN ||
		     field->application == WACOM_HID_G11_TOUCHSCREEN)) {
			wacom->wacom_wac.mode_report = field->report->id;
			wacom->wacom_wac.mode_value = 0;
		}
		break;

	case WACOM_HID_WD_OFFSETLEFT:
	case WACOM_HID_WD_OFFSETTOP:
	case WACOM_HID_WD_OFFSETRIGHT:
	case WACOM_HID_WD_OFFSETBOTTOM:
		/* read manually */
		n = hid_report_len(field->report);
		data = hid_alloc_report_buf(field->report, GFP_KERNEL);
		if (!data)
			break;
		data[0] = field->report->id;
		ret = wacom_get_report(hdev, HID_FEATURE_REPORT,
				       data, n, WAC_CMD_RETRIES);
		if (ret == n) {
			ret = hid_report_raw_event(hdev, HID_FEATURE_REPORT,
						   data, n, 0);
		} else {
			hid_warn(hdev, "%s: could not retrieve sensor offsets\n",
				 __func__);
		}
		kfree(data);
		break;
	}
}
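/*
 * Illustrative sketch (hypothetical helper, not part of the driver): the
 * "read manually" pattern used twice above -- size the buffer from the
 * report descriptor, seed it with the report ID, then fetch the feature
 * report with retries.
 */
static int __maybe_unused wacom_read_feature_example(struct hid_device *hdev,
						     struct hid_report *report)
{
	u32 n = hid_report_len(report);
	u8 *data = hid_alloc_report_buf(report, GFP_KERNEL);
	int ret;

	if (!data)
		return -ENOMEM;

	data[0] = report->id;
	ret = wacom_get_report(hdev, HID_FEATURE_REPORT, data, n, WAC_CMD_RETRIES);
	/* on success ret == n and data[] holds the raw feature report */
	kfree(data);

	return ret < 0 ? ret : 0;
}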
/*
 * The interface descriptors of Wacom devices can be incomplete and
 * inconsistent, so the wacom_features table is used to store each stylus
 * device's packet length, various maximum values, and tablet
 * resolution, keyed by product ID.
 *
 * For devices that contain two interfaces, the wacom_features table is
 * inaccurate for the touch interface. Since the interface descriptor
 * for touch interfaces has fairly complete data, this function exists
 * to query the tablet for this missing information instead of hard-coding
 * it in an additional table.
 *
 * A typical interface descriptor for a stylus will contain a
 * boot mouse application collection that is not of interest and this
 * function will ignore it.
 *
 * It also contains a digitizer application collection that likewise is not
 * of interest, since any information it contains would duplicate
 * what is in wacom_features. Usually it defines a report of an array
 * of bytes that could be used as the maximum length of the stylus packet
 * returned. If it happens to define a Digitizer-Stylus Physical Collection,
 * the X and Y logical values contain valid data but are ignored.
 *
 * A typical interface descriptor for a touch interface will contain a
 * Digitizer-Finger Physical Collection which will define both the logical
 * X/Y maximum as well as the physical size of the tablet. Since touch
 * interfaces haven't supported pressure or distance, this is enough
 * information to override invalid values in the wacom_features table.
 *
 * Intuos5 touch interface and 3rd gen Bamboo Touch do not contain useful
 * data. We deal with them after returning from this function.
 */
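/*
 * Worked example (hypothetical descriptor values): a Digitizer-Finger
 * collection reporting an HID_GD_X logical maximum of 4096 and a physical
 * maximum of 150 in unit 0x11 (cm) with exponent -1 describes a 15 cm wide
 * sensor -- enough for the driver to later derive a resolution of roughly
 * 27 units/mm without consulting the wacom_features table.
 */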
static void wacom_usage_mapping(struct hid_device *hdev,
				struct hid_field *field, struct hid_usage *usage)
{
	struct wacom *wacom = hid_get_drvdata(hdev);
	struct wacom_features *features = &wacom->wacom_wac.features;
	bool finger = WACOM_FINGER_FIELD(field);
	bool pen = WACOM_PEN_FIELD(field);
	unsigned equivalent_usage = wacom_equivalent_usage(usage->hid);

	/*
	 * Requiring Stylus Usage will ignore boot mouse
	 * X/Y values and some cases of invalid Digitizer X/Y
	 * values commonly reported.
	 */
	if (pen)
		features->device_type |= WACOM_DEVICETYPE_PEN;
	else if (finger)
		features->device_type |= WACOM_DEVICETYPE_TOUCH;
	else
		return;

	wacom_hid_usage_quirk(hdev, field, usage);

	switch (equivalent_usage) {
	case HID_GD_X:
		features->x_max = field->logical_maximum;
		if (finger) {
			features->x_phy = field->physical_maximum;
			if ((features->type != BAMBOO_PT) &&
			    (features->type != BAMBOO_TOUCH)) {
				features->unit = field->unit;
				features->unitExpo = field->unit_exponent;
			}
		}
		break;
	case HID_GD_Y:
		features->y_max = field->logical_maximum;
		if (finger) {
			features->y_phy = field->physical_maximum;
			if ((features->type != BAMBOO_PT) &&
			    (features->type != BAMBOO_TOUCH)) {
				features->unit = field->unit;
				features->unitExpo = field->unit_exponent;
			}
		}
		break;
	case HID_DG_TIPPRESSURE:
		if (pen)
			features->pressure_max = field->logical_maximum;
		break;
	}

	if (features->type == HID_GENERIC)
		wacom_wac_usage_mapping(hdev, field, usage);
}

static void wacom_post_parse_hid(struct hid_device *hdev,
				 struct wacom_features *features)
{
	struct wacom *wacom = hid_get_drvdata(hdev);
	struct wacom_wac *wacom_wac = &wacom->wacom_wac;

	if (features->type == HID_GENERIC) {
		/* Any last-minute generic device setup */
		if (wacom_wac->has_mode_change) {
			if (wacom_wac->is_direct_mode)
				features->device_type |= WACOM_DEVICETYPE_DIRECT;
			else
				features->device_type &= ~WACOM_DEVICETYPE_DIRECT;
		}

		if (features->touch_max > 1) {
			if (features->device_type & WACOM_DEVICETYPE_DIRECT)
				input_mt_init_slots(wacom_wac->touch_input,
						    wacom_wac->features.touch_max,
						    INPUT_MT_DIRECT);
			else
				input_mt_init_slots(wacom_wac->touch_input,
						    wacom_wac->features.touch_max,
						    INPUT_MT_POINTER);
		}
	}
}

static void wacom_parse_hid(struct hid_device *hdev,
			    struct wacom_features *features)
{
	struct hid_report_enum *rep_enum;
	struct hid_report *hreport;
	int i, j;

	/* check features first */
	rep_enum = &hdev->report_enum[HID_FEATURE_REPORT];
	list_for_each_entry(hreport, &rep_enum->report_list, list) {
		for (i = 0; i < hreport->maxfield; i++) {
			/* Ignore if report count is out of bounds.
*/ if (hreport->field[i]->report_count < 1) continue; for (j = 0; j < hreport->field[i]->maxusage; j++) { wacom_feature_mapping(hdev, hreport->field[i], hreport->field[i]->usage + j); } } } /* now check the input usages */ rep_enum = &hdev->report_enum[HID_INPUT_REPORT]; list_for_each_entry(hreport, &rep_enum->report_list, list) { if (!hreport->maxfield) continue; for (i = 0; i < hreport->maxfield; i++) for (j = 0; j < hreport->field[i]->maxusage; j++) wacom_usage_mapping(hdev, hreport->field[i], hreport->field[i]->usage + j); } wacom_post_parse_hid(hdev, features); } static int wacom_hid_set_device_mode(struct hid_device *hdev) { struct wacom *wacom = hid_get_drvdata(hdev); struct hid_data *hid_data = &wacom->wacom_wac.hid_data; struct hid_report *r; struct hid_report_enum *re; if (hid_data->inputmode < 0) return 0; re = &(hdev->report_enum[HID_FEATURE_REPORT]); r = re->report_id_hash[hid_data->inputmode]; if (r) { r->field[0]->value[hid_data->inputmode_index] = 2; hid_hw_request(hdev, r, HID_REQ_SET_REPORT); } return 0; } static int wacom_set_device_mode(struct hid_device *hdev, struct wacom_wac *wacom_wac) { u8 *rep_data; struct hid_report *r; struct hid_report_enum *re; u32 length; int error = -ENOMEM, limit = 0; if (wacom_wac->mode_report < 0) return 0; re = &(hdev->report_enum[HID_FEATURE_REPORT]); r = re->report_id_hash[wacom_wac->mode_report]; if (!r) return -EINVAL; rep_data = hid_alloc_report_buf(r, GFP_KERNEL); if (!rep_data) return -ENOMEM; length = hid_report_len(r); do { rep_data[0] = wacom_wac->mode_report; rep_data[1] = wacom_wac->mode_value; error = wacom_set_report(hdev, HID_FEATURE_REPORT, rep_data, length, 1); if (error >= 0) error = wacom_get_report(hdev, HID_FEATURE_REPORT, rep_data, length, 1); } while (error >= 0 && rep_data[1] != wacom_wac->mode_report && limit++ < WAC_MSG_RETRIES); kfree(rep_data); return error < 0 ? error : 0; } static int wacom_bt_query_tablet_data(struct hid_device *hdev, u8 speed, struct wacom_features *features) { struct wacom *wacom = hid_get_drvdata(hdev); int ret; u8 rep_data[2]; switch (features->type) { case GRAPHIRE_BT: rep_data[0] = 0x03; rep_data[1] = 0x00; ret = wacom_set_report(hdev, HID_FEATURE_REPORT, rep_data, 2, 3); if (ret >= 0) { rep_data[0] = speed == 0 ? 0x05 : 0x06; rep_data[1] = 0x00; ret = wacom_set_report(hdev, HID_FEATURE_REPORT, rep_data, 2, 3); if (ret >= 0) { wacom->wacom_wac.bt_high_speed = speed; return 0; } } /* * Note that if the raw queries fail, it's not a hard failure * and it is safe to continue */ hid_warn(hdev, "failed to poke device, command %d, err %d\n", rep_data[0], ret); break; case INTUOS4WL: if (speed == 1) wacom->wacom_wac.bt_features &= ~0x20; else wacom->wacom_wac.bt_features |= 0x20; rep_data[0] = 0x03; rep_data[1] = wacom->wacom_wac.bt_features; ret = wacom_set_report(hdev, HID_FEATURE_REPORT, rep_data, 2, 1); if (ret >= 0) wacom->wacom_wac.bt_high_speed = speed; break; } return 0; } /* * Switch the tablet into its most-capable mode. Wacom tablets are * typically configured to power-up in a mode which sends mouse-like * reports to the OS. To get absolute position, pressure data, etc. * from the tablet, it is necessary to switch the tablet out of this * mode and into one which sends the full range of tablet data. 
*/ static int _wacom_query_tablet_data(struct wacom *wacom) { struct hid_device *hdev = wacom->hdev; struct wacom_wac *wacom_wac = &wacom->wacom_wac; struct wacom_features *features = &wacom_wac->features; if (hdev->bus == BUS_BLUETOOTH) return wacom_bt_query_tablet_data(hdev, 1, features); if (features->type != HID_GENERIC) { if (features->device_type & WACOM_DEVICETYPE_TOUCH) { if (features->type > TABLETPC) { /* MT Tablet PC touch */ wacom_wac->mode_report = 3; wacom_wac->mode_value = 4; } else if (features->type == WACOM_24HDT) { wacom_wac->mode_report = 18; wacom_wac->mode_value = 2; } else if (features->type == WACOM_27QHDT) { wacom_wac->mode_report = 131; wacom_wac->mode_value = 2; } else if (features->type == BAMBOO_PAD) { wacom_wac->mode_report = 2; wacom_wac->mode_value = 2; } } else if (features->device_type & WACOM_DEVICETYPE_PEN) { if (features->type <= BAMBOO_PT) { wacom_wac->mode_report = 2; wacom_wac->mode_value = 2; } } } wacom_set_device_mode(hdev, wacom_wac); if (features->type == HID_GENERIC) return wacom_hid_set_device_mode(hdev); return 0; } static void wacom_retrieve_hid_descriptor(struct hid_device *hdev, struct wacom_features *features) { struct wacom *wacom = hid_get_drvdata(hdev); struct usb_interface *intf = wacom->intf; /* default features */ features->x_fuzz = 4; features->y_fuzz = 4; features->pressure_fuzz = 0; features->distance_fuzz = 1; features->tilt_fuzz = 1; /* * The wireless device HID is basic and layout conflicts with * other tablets (monitor and touch interface can look like pen). * Skip the query for this type and modify defaults based on * interface number. */ if (features->type == WIRELESS && intf) { if (intf->cur_altsetting->desc.bInterfaceNumber == 0) features->device_type = WACOM_DEVICETYPE_WL_MONITOR; else features->device_type = WACOM_DEVICETYPE_NONE; return; } wacom_parse_hid(hdev, features); } struct wacom_hdev_data { struct list_head list; struct kref kref; struct hid_device *dev; struct wacom_shared shared; }; static LIST_HEAD(wacom_udev_list); static DEFINE_MUTEX(wacom_udev_list_lock); static bool wacom_are_sibling(struct hid_device *hdev, struct hid_device *sibling) { struct wacom *wacom = hid_get_drvdata(hdev); struct wacom_features *features = &wacom->wacom_wac.features; struct wacom *sibling_wacom = hid_get_drvdata(sibling); struct wacom_features *sibling_features = &sibling_wacom->wacom_wac.features; __u32 oVid = features->oVid ? features->oVid : hdev->vendor; __u32 oPid = features->oPid ? features->oPid : hdev->product; /* The defined oVid/oPid must match that of the sibling */ if (features->oVid != HID_ANY_ID && sibling->vendor != oVid) return false; if (features->oPid != HID_ANY_ID && sibling->product != oPid) return false; /* * Devices with the same VID/PID must share the same physical * device path, while those with different VID/PID must share * the same physical parent device path. */ if (hdev->vendor == sibling->vendor && hdev->product == sibling->product) { if (!hid_compare_device_paths(hdev, sibling, '/')) return false; } else { if (!hid_compare_device_paths(hdev, sibling, '.')) return false; } /* Skip the remaining heuristics unless you are a HID_GENERIC device */ if (features->type != HID_GENERIC) return true; /* * Direct-input devices may not be siblings of indirect-input * devices. */ if ((features->device_type & WACOM_DEVICETYPE_DIRECT) && !(sibling_features->device_type & WACOM_DEVICETYPE_DIRECT)) return false; /* * Indirect-input devices may not be siblings of direct-input * devices. 
*/ if (!(features->device_type & WACOM_DEVICETYPE_DIRECT) && (sibling_features->device_type & WACOM_DEVICETYPE_DIRECT)) return false; /* Pen devices may only be siblings of touch devices */ if ((features->device_type & WACOM_DEVICETYPE_PEN) && !(sibling_features->device_type & WACOM_DEVICETYPE_TOUCH)) return false; /* Touch devices may only be siblings of pen devices */ if ((features->device_type & WACOM_DEVICETYPE_TOUCH) && !(sibling_features->device_type & WACOM_DEVICETYPE_PEN)) return false; /* * No reason could be found for these two devices to NOT be * siblings, so there's a good chance they ARE siblings */ return true; } static struct wacom_hdev_data *wacom_get_hdev_data(struct hid_device *hdev) { struct wacom_hdev_data *data; /* Try to find an already-probed interface from the same device */ list_for_each_entry(data, &wacom_udev_list, list) { if (hid_compare_device_paths(hdev, data->dev, '/')) { kref_get(&data->kref); return data; } } /* Fallback to finding devices that appear to be "siblings" */ list_for_each_entry(data, &wacom_udev_list, list) { if (wacom_are_sibling(hdev, data->dev)) { kref_get(&data->kref); return data; } } return NULL; } static void wacom_release_shared_data(struct kref *kref) { struct wacom_hdev_data *data = container_of(kref, struct wacom_hdev_data, kref); mutex_lock(&wacom_udev_list_lock); list_del(&data->list); mutex_unlock(&wacom_udev_list_lock); kfree(data); } static void wacom_remove_shared_data(void *res) { struct wacom *wacom = res; struct wacom_hdev_data *data; struct wacom_wac *wacom_wac = &wacom->wacom_wac; if (wacom_wac->shared) { data = container_of(wacom_wac->shared, struct wacom_hdev_data, shared); if (wacom_wac->shared->touch == wacom->hdev) wacom_wac->shared->touch = NULL; else if (wacom_wac->shared->pen == wacom->hdev) wacom_wac->shared->pen = NULL; kref_put(&data->kref, wacom_release_shared_data); wacom_wac->shared = NULL; } } static int wacom_add_shared_data(struct hid_device *hdev) { struct wacom *wacom = hid_get_drvdata(hdev); struct wacom_wac *wacom_wac = &wacom->wacom_wac; struct wacom_hdev_data *data; int retval = 0; mutex_lock(&wacom_udev_list_lock); data = wacom_get_hdev_data(hdev); if (!data) { data = kzalloc(sizeof(struct wacom_hdev_data), GFP_KERNEL); if (!data) { mutex_unlock(&wacom_udev_list_lock); return -ENOMEM; } kref_init(&data->kref); data->dev = hdev; list_add_tail(&data->list, &wacom_udev_list); } mutex_unlock(&wacom_udev_list_lock); wacom_wac->shared = &data->shared; retval = devm_add_action_or_reset(&hdev->dev, wacom_remove_shared_data, wacom); if (retval) return retval; if (wacom_wac->features.device_type & WACOM_DEVICETYPE_TOUCH) wacom_wac->shared->touch = hdev; else if (wacom_wac->features.device_type & WACOM_DEVICETYPE_PEN) wacom_wac->shared->pen = hdev; return retval; } static int wacom_led_control(struct wacom *wacom) { unsigned char *buf; int retval; unsigned char report_id = WAC_CMD_LED_CONTROL; int buf_size = 9; if (!wacom->led.groups) return -ENOTSUPP; if (wacom->wacom_wac.features.type == REMOTE) return -ENOTSUPP; if (wacom->wacom_wac.pid) { /* wireless connected */ report_id = WAC_CMD_WL_LED_CONTROL; buf_size = 13; } else if (wacom->wacom_wac.features.type == INTUOSP2_BT) { report_id = WAC_CMD_WL_INTUOSP2; buf_size = 51; } buf = kzalloc(buf_size, GFP_KERNEL); if (!buf) return -ENOMEM; if (wacom->wacom_wac.features.type == HID_GENERIC) { buf[0] = WAC_CMD_LED_CONTROL_GENERIC; buf[1] = wacom->led.llv; buf[2] = wacom->led.groups[0].select & 0x03; } else if ((wacom->wacom_wac.features.type >= INTUOS5S && 
wacom->wacom_wac.features.type <= INTUOSPL)) { /* * Touch Ring and crop mark LED luminance may take on * one of four values: * 0 = Low; 1 = Medium; 2 = High; 3 = Off */ int ring_led = wacom->led.groups[0].select & 0x03; int ring_lum = (((wacom->led.llv & 0x60) >> 5) - 1) & 0x03; int crop_lum = 0; unsigned char led_bits = (crop_lum << 4) | (ring_lum << 2) | (ring_led); buf[0] = report_id; if (wacom->wacom_wac.pid) { wacom_get_report(wacom->hdev, HID_FEATURE_REPORT, buf, buf_size, WAC_CMD_RETRIES); buf[0] = report_id; buf[4] = led_bits; } else buf[1] = led_bits; } else if (wacom->wacom_wac.features.type == INTUOSP2_BT) { buf[0] = report_id; buf[4] = 100; // Power Connection LED (ORANGE) buf[5] = 100; // BT Connection LED (BLUE) buf[6] = 100; // Paper Mode (RED?) buf[7] = 100; // Paper Mode (GREEN?) buf[8] = 100; // Paper Mode (BLUE?) buf[9] = wacom->led.llv; buf[10] = wacom->led.groups[0].select & 0x03; } else { int led = wacom->led.groups[0].select | 0x4; if (wacom->wacom_wac.features.type == WACOM_21UX2 || wacom->wacom_wac.features.type == WACOM_24HD) led |= (wacom->led.groups[1].select << 4) | 0x40; buf[0] = report_id; buf[1] = led; buf[2] = wacom->led.llv; buf[3] = wacom->led.hlv; buf[4] = wacom->led.img_lum; } retval = wacom_set_report(wacom->hdev, HID_FEATURE_REPORT, buf, buf_size, WAC_CMD_RETRIES); kfree(buf); return retval; } static int wacom_led_putimage(struct wacom *wacom, int button_id, u8 xfer_id, const unsigned len, const void *img) { unsigned char *buf; int i, retval; const unsigned chunk_len = len / 4; /* 4 chunks are needed to be sent */ buf = kzalloc(chunk_len + 3 , GFP_KERNEL); if (!buf) return -ENOMEM; /* Send 'start' command */ buf[0] = WAC_CMD_ICON_START; buf[1] = 1; retval = wacom_set_report(wacom->hdev, HID_FEATURE_REPORT, buf, 2, WAC_CMD_RETRIES); if (retval < 0) goto out; buf[0] = xfer_id; buf[1] = button_id & 0x07; for (i = 0; i < 4; i++) { buf[2] = i; memcpy(buf + 3, img + i * chunk_len, chunk_len); retval = wacom_set_report(wacom->hdev, HID_FEATURE_REPORT, buf, chunk_len + 3, WAC_CMD_RETRIES); if (retval < 0) break; } /* Send 'stop' */ buf[0] = WAC_CMD_ICON_START; buf[1] = 0; wacom_set_report(wacom->hdev, HID_FEATURE_REPORT, buf, 2, WAC_CMD_RETRIES); out: kfree(buf); return retval; } static ssize_t wacom_led_select_store(struct device *dev, int set_id, const char *buf, size_t count) { struct hid_device *hdev = to_hid_device(dev); struct wacom *wacom = hid_get_drvdata(hdev); unsigned int id; int err; err = kstrtouint(buf, 10, &id); if (err) return err; mutex_lock(&wacom->lock); wacom->led.groups[set_id].select = id & 0x3; err = wacom_led_control(wacom); mutex_unlock(&wacom->lock); return err < 0 ? 
err : count; } #define DEVICE_LED_SELECT_ATTR(SET_ID) \ static ssize_t wacom_led##SET_ID##_select_store(struct device *dev, \ struct device_attribute *attr, const char *buf, size_t count) \ { \ return wacom_led_select_store(dev, SET_ID, buf, count); \ } \ static ssize_t wacom_led##SET_ID##_select_show(struct device *dev, \ struct device_attribute *attr, char *buf) \ { \ struct hid_device *hdev = to_hid_device(dev);\ struct wacom *wacom = hid_get_drvdata(hdev); \ return scnprintf(buf, PAGE_SIZE, "%d\n", \ wacom->led.groups[SET_ID].select); \ } \ static DEVICE_ATTR(status_led##SET_ID##_select, DEV_ATTR_RW_PERM, \ wacom_led##SET_ID##_select_show, \ wacom_led##SET_ID##_select_store) DEVICE_LED_SELECT_ATTR(0); DEVICE_LED_SELECT_ATTR(1); static ssize_t wacom_luminance_store(struct wacom *wacom, u8 *dest, const char *buf, size_t count) { unsigned int value; int err; err = kstrtouint(buf, 10, &value); if (err) return err; mutex_lock(&wacom->lock); *dest = value & 0x7f; for (unsigned int i = 0; i < wacom->led.count; i++) { struct wacom_group_leds *group = &wacom->led.groups[i]; for (unsigned int j = 0; j < group->count; j++) { if (dest == &wacom->led.llv) group->leds[j].llv = *dest; else if (dest == &wacom->led.hlv) group->leds[j].hlv = *dest; } } err = wacom_led_control(wacom); mutex_unlock(&wacom->lock); return err < 0 ? err : count; } #define DEVICE_LUMINANCE_ATTR(name, field) \ static ssize_t wacom_##name##_luminance_store(struct device *dev, \ struct device_attribute *attr, const char *buf, size_t count) \ { \ struct hid_device *hdev = to_hid_device(dev);\ struct wacom *wacom = hid_get_drvdata(hdev); \ \ return wacom_luminance_store(wacom, &wacom->led.field, \ buf, count); \ } \ static ssize_t wacom_##name##_luminance_show(struct device *dev, \ struct device_attribute *attr, char *buf) \ { \ struct wacom *wacom = dev_get_drvdata(dev); \ return scnprintf(buf, PAGE_SIZE, "%d\n", wacom->led.field); \ } \ static DEVICE_ATTR(name##_luminance, DEV_ATTR_RW_PERM, \ wacom_##name##_luminance_show, \ wacom_##name##_luminance_store) DEVICE_LUMINANCE_ATTR(status0, llv); DEVICE_LUMINANCE_ATTR(status1, hlv); DEVICE_LUMINANCE_ATTR(buttons, img_lum); static ssize_t wacom_button_image_store(struct device *dev, int button_id, const char *buf, size_t count) { struct hid_device *hdev = to_hid_device(dev); struct wacom *wacom = hid_get_drvdata(hdev); int err; unsigned len; u8 xfer_id; if (hdev->bus == BUS_BLUETOOTH) { len = 256; xfer_id = WAC_CMD_ICON_BT_XFER; } else { len = 1024; xfer_id = WAC_CMD_ICON_XFER; } if (count != len) return -EINVAL; mutex_lock(&wacom->lock); err = wacom_led_putimage(wacom, button_id, xfer_id, len, buf); mutex_unlock(&wacom->lock); return err < 0 ? 
err : count; } #define DEVICE_BTNIMG_ATTR(BUTTON_ID) \ static ssize_t wacom_btnimg##BUTTON_ID##_store(struct device *dev, \ struct device_attribute *attr, const char *buf, size_t count) \ { \ return wacom_button_image_store(dev, BUTTON_ID, buf, count); \ } \ static DEVICE_ATTR(button##BUTTON_ID##_rawimg, DEV_ATTR_WO_PERM, \ NULL, wacom_btnimg##BUTTON_ID##_store) DEVICE_BTNIMG_ATTR(0); DEVICE_BTNIMG_ATTR(1); DEVICE_BTNIMG_ATTR(2); DEVICE_BTNIMG_ATTR(3); DEVICE_BTNIMG_ATTR(4); DEVICE_BTNIMG_ATTR(5); DEVICE_BTNIMG_ATTR(6); DEVICE_BTNIMG_ATTR(7); static struct attribute *cintiq_led_attrs[] = { &dev_attr_status_led0_select.attr, &dev_attr_status_led1_select.attr, NULL }; static const struct attribute_group cintiq_led_attr_group = { .name = "wacom_led", .attrs = cintiq_led_attrs, }; static struct attribute *intuos4_led_attrs[] = { &dev_attr_status0_luminance.attr, &dev_attr_status1_luminance.attr, &dev_attr_status_led0_select.attr, &dev_attr_buttons_luminance.attr, &dev_attr_button0_rawimg.attr, &dev_attr_button1_rawimg.attr, &dev_attr_button2_rawimg.attr, &dev_attr_button3_rawimg.attr, &dev_attr_button4_rawimg.attr, &dev_attr_button5_rawimg.attr, &dev_attr_button6_rawimg.attr, &dev_attr_button7_rawimg.attr, NULL }; static const struct attribute_group intuos4_led_attr_group = { .name = "wacom_led", .attrs = intuos4_led_attrs, }; static struct attribute *intuos5_led_attrs[] = { &dev_attr_status0_luminance.attr, &dev_attr_status_led0_select.attr, NULL }; static const struct attribute_group intuos5_led_attr_group = { .name = "wacom_led", .attrs = intuos5_led_attrs, }; static struct attribute *generic_led_attrs[] = { &dev_attr_status0_luminance.attr, &dev_attr_status_led0_select.attr, NULL }; static const struct attribute_group generic_led_attr_group = { .name = "wacom_led", .attrs = generic_led_attrs, }; struct wacom_sysfs_group_devres { const struct attribute_group *group; struct kobject *root; }; static void wacom_devm_sysfs_group_release(struct device *dev, void *res) { struct wacom_sysfs_group_devres *devres = res; struct kobject *kobj = devres->root; dev_dbg(dev, "%s: dropping reference to %s\n", __func__, devres->group->name); sysfs_remove_group(kobj, devres->group); } static int __wacom_devm_sysfs_create_group(struct wacom *wacom, struct kobject *root, const struct attribute_group *group) { struct wacom_sysfs_group_devres *devres; int error; devres = devres_alloc(wacom_devm_sysfs_group_release, sizeof(struct wacom_sysfs_group_devres), GFP_KERNEL); if (!devres) return -ENOMEM; devres->group = group; devres->root = root; error = sysfs_create_group(devres->root, group); if (error) { devres_free(devres); return error; } devres_add(&wacom->hdev->dev, devres); return 0; } static int wacom_devm_sysfs_create_group(struct wacom *wacom, const struct attribute_group *group) { return __wacom_devm_sysfs_create_group(wacom, &wacom->hdev->dev.kobj, group); } static void wacom_devm_kfifo_release(struct device *dev, void *res) { struct kfifo_rec_ptr_2 *devres = res; kfifo_free(devres); } static int wacom_devm_kfifo_alloc(struct wacom *wacom) { struct wacom_wac *wacom_wac = &wacom->wacom_wac; int fifo_size = min(PAGE_SIZE, 10 * wacom_wac->features.pktlen); struct kfifo_rec_ptr_2 *pen_fifo; int error; pen_fifo = devres_alloc(wacom_devm_kfifo_release, sizeof(struct kfifo_rec_ptr_2), GFP_KERNEL); if (!pen_fifo) return -ENOMEM; error = kfifo_alloc(pen_fifo, fifo_size, GFP_KERNEL); if (error) { devres_free(pen_fifo); return error; } devres_add(&wacom->hdev->dev, pen_fifo); wacom_wac->pen_fifo = pen_fifo; return 0; 
}

enum led_brightness wacom_leds_brightness_get(struct wacom_led *led)
{
	struct wacom *wacom = led->wacom;

	if (wacom->led.max_hlv)
		return wacom_rescale(led->hlv, wacom->led.max_hlv, LED_FULL);

	if (wacom->led.max_llv)
		return wacom_rescale(led->llv, wacom->led.max_llv, LED_FULL);

	/* device doesn't support brightness tuning */
	return LED_FULL;
}

static enum led_brightness __wacom_led_brightness_get(struct led_classdev *cdev)
{
	struct wacom_led *led = container_of(cdev, struct wacom_led, cdev);
	struct wacom *wacom = led->wacom;

	if (wacom->led.groups[led->group].select != led->id)
		return LED_OFF;

	return wacom_leds_brightness_get(led);
}

static int wacom_led_brightness_set(struct led_classdev *cdev,
				    enum led_brightness brightness)
{
	struct wacom_led *led = container_of(cdev, struct wacom_led, cdev);
	struct wacom *wacom = led->wacom;
	int error;

	mutex_lock(&wacom->lock);

	if (!wacom->led.groups || (brightness == LED_OFF &&
	    wacom->led.groups[led->group].select != led->id)) {
		error = 0;
		goto out;
	}

	led->llv = wacom->led.llv = wacom_rescale(brightness, LED_FULL,
						  wacom->led.max_llv);
	led->hlv = wacom->led.hlv = wacom_rescale(brightness, LED_FULL,
						  wacom->led.max_hlv);

	wacom->led.groups[led->group].select = led->id;

	error = wacom_led_control(wacom);

out:
	mutex_unlock(&wacom->lock);
	return error;
}

static void wacom_led_readonly_brightness_set(struct led_classdev *cdev,
					      enum led_brightness brightness)
{
}

static int wacom_led_register_one(struct device *dev, struct wacom *wacom,
				  struct wacom_led *led, unsigned int group,
				  unsigned int id, bool read_only)
{
	int error;
	char *name;

	name = devm_kasprintf(dev, GFP_KERNEL, "%s::wacom-%d.%d",
			      dev_name(dev), group, id);
	if (!name)
		return -ENOMEM;

	led->group = group;
	led->id = id;
	led->wacom = wacom;
	led->llv = wacom->led.llv;
	led->hlv = wacom->led.hlv;
	led->cdev.name = name;
	led->cdev.max_brightness = LED_FULL;
	led->cdev.flags = LED_HW_PLUGGABLE;
	led->cdev.brightness_get = __wacom_led_brightness_get;
	if (!read_only) {
		led->cdev.brightness_set_blocking = wacom_led_brightness_set;
		led->cdev.default_trigger = led->cdev.name;
	} else {
		led->cdev.brightness_set = wacom_led_readonly_brightness_set;
	}

	if (!read_only) {
		led->trigger.name = name;
		if (id == wacom->led.groups[group].select)
			led->trigger.brightness = wacom_leds_brightness_get(led);
		error = devm_led_trigger_register(dev, &led->trigger);
		if (error) {
			hid_err(wacom->hdev,
				"failed to register LED trigger %s: %d\n",
				led->cdev.name, error);
			return error;
		}
	}

	error = devm_led_classdev_register(dev, &led->cdev);
	if (error) {
		hid_err(wacom->hdev, "failed to register LED %s: %d\n",
			led->cdev.name, error);
		led->cdev.name = NULL;
		return error;
	}

	return 0;
}

static void wacom_led_groups_release_one(void *data)
{
	struct wacom_group_leds *group = data;

	devres_release_group(group->dev, group);
}

static int wacom_led_groups_alloc_and_register_one(struct device *dev,
						   struct wacom *wacom,
						   int group_id, int count,
						   bool read_only)
{
	struct wacom_led *leds;
	int i, error;

	if (group_id >= wacom->led.count || count <= 0)
		return -EINVAL;

	if (!devres_open_group(dev, &wacom->led.groups[group_id], GFP_KERNEL))
		return -ENOMEM;

	leds = devm_kcalloc(dev, count, sizeof(struct wacom_led), GFP_KERNEL);
	if (!leds) {
		error = -ENOMEM;
		goto err;
	}

	wacom->led.groups[group_id].leds = leds;
	wacom->led.groups[group_id].count = count;

	for (i = 0; i < count; i++) {
		error = wacom_led_register_one(dev, wacom, &leds[i],
					       group_id, i, read_only);
		if (error)
			goto err;
	}

	wacom->led.groups[group_id].dev = dev;

	devres_close_group(dev, &wacom->led.groups[group_id]);

	/*
	 * There is a bug (?) in devm_led_classdev_register() in which it
	 * increments the refcount of the parent. If the parent is an input
	 * device, that means the ref count never reaches 0 when
	 * devm_input_device_release() gets called.
	 * This means that the LEDs are still there after disconnect.
	 * Manually force the release of the group so that the LEDs are
	 * released once we are done using them.
	 */
	error = devm_add_action_or_reset(&wacom->hdev->dev,
					 wacom_led_groups_release_one,
					 &wacom->led.groups[group_id]);
	if (error)
		return error;

	return 0;

err:
	devres_release_group(dev, &wacom->led.groups[group_id]);
	return error;
}
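/*
 * Illustrative sketch (hypothetical helper, not part of the driver): the
 * devres group pattern used above. Everything allocated between
 * devres_open_group() and devres_close_group() can later be torn down as
 * a unit with devres_release_group(), which is how the LED group above
 * forces its resources to be released at disconnect time.
 */
static int __maybe_unused wacom_devres_group_example(struct device *dev)
{
	void *grp = devres_open_group(dev, NULL, GFP_KERNEL);
	u8 *buf;

	if (!grp)
		return -ENOMEM;

	buf = devm_kzalloc(dev, 64, GFP_KERNEL);	/* joins the open group */
	if (!buf) {
		devres_release_group(dev, grp);	/* frees group members now */
		return -ENOMEM;
	}

	devres_close_group(dev, grp);
	/* later: devres_release_group(dev, grp) releases 'buf' early */
	return 0;
}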
struct wacom_led *wacom_led_find(struct wacom *wacom, unsigned int group_id,
				 unsigned int id)
{
	struct wacom_group_leds *group;

	if (group_id >= wacom->led.count)
		return NULL;

	group = &wacom->led.groups[group_id];

	if (!group->leds)
		return NULL;

	id %= group->count;

	return &group->leds[id];
}

/*
 * wacom_led_next: gives the next available LED with a wacom trigger.
 *
 * Returns the next available struct wacom_led which has its default trigger
 * or the current one if none is available.
 */
struct wacom_led *wacom_led_next(struct wacom *wacom, struct wacom_led *cur)
{
	struct wacom_led *next_led;
	int group, next;

	if (!wacom || !cur)
		return NULL;

	group = cur->group;
	next = cur->id;

	do {
		next_led = wacom_led_find(wacom, group, ++next);
		if (!next_led || next_led == cur)
			return next_led;
	} while (next_led->cdev.trigger != &next_led->trigger);

	return next_led;
}

static void wacom_led_groups_release(void *data)
{
	struct wacom *wacom = data;

	wacom->led.groups = NULL;
	wacom->led.count = 0;
}

static int wacom_led_groups_allocate(struct wacom *wacom, int count)
{
	struct device *dev = &wacom->hdev->dev;
	struct wacom_group_leds *groups;
	int error;

	groups = devm_kcalloc(dev, count, sizeof(struct wacom_group_leds),
			      GFP_KERNEL);
	if (!groups)
		return -ENOMEM;

	error = devm_add_action_or_reset(dev, wacom_led_groups_release, wacom);
	if (error)
		return error;

	wacom->led.groups = groups;
	wacom->led.count = count;

	return 0;
}

static int wacom_leds_alloc_and_register(struct wacom *wacom, int group_count,
					 int led_per_group, bool read_only)
{
	struct device *dev;
	int i, error;

	if (!wacom->wacom_wac.pad_input)
		return -EINVAL;

	dev = &wacom->wacom_wac.pad_input->dev;

	error = wacom_led_groups_allocate(wacom, group_count);
	if (error)
		return error;

	for (i = 0; i < group_count; i++) {
		error = wacom_led_groups_alloc_and_register_one(dev, wacom, i,
								led_per_group,
								read_only);
		if (error)
			return error;
	}

	return 0;
}

int wacom_initialize_leds(struct wacom *wacom)
{
	int error;

	if (!(wacom->wacom_wac.features.device_type & WACOM_DEVICETYPE_PAD))
		return 0;

	/* Initialize default values */
	switch (wacom->wacom_wac.features.type) {
	case HID_GENERIC:
		if (!wacom->generic_has_leds)
			return 0;
		wacom->led.llv = 100;
		wacom->led.max_llv = 100;

		error = wacom_leds_alloc_and_register(wacom, 1, 4, false);
		if (error) {
			hid_err(wacom->hdev,
				"cannot create leds err: %d\n", error);
			return error;
		}

		error = wacom_devm_sysfs_create_group(wacom,
						      &generic_led_attr_group);
		break;

	case INTUOS4S:
	case INTUOS4:
	case INTUOS4WL:
	case INTUOS4L:
		wacom->led.llv = 10;
		wacom->led.hlv = 20;
		wacom->led.max_llv = 127;
		wacom->led.max_hlv = 127;
		wacom->led.img_lum = 10;

		error = wacom_leds_alloc_and_register(wacom, 1, 4, false);
		if (error) {
			hid_err(wacom->hdev,
				"cannot create leds err: %d\n", error);
			return error;
		}

		error = wacom_devm_sysfs_create_group(wacom,
						      &intuos4_led_attr_group);
		break;

	case WACOM_24HD:
	case WACOM_21UX2:
wacom->led.llv = 0; wacom->led.hlv = 0; wacom->led.img_lum = 0; error = wacom_leds_alloc_and_register(wacom, 2, 4, false); if (error) { hid_err(wacom->hdev, "cannot create leds err: %d\n", error); return error; } error = wacom_devm_sysfs_create_group(wacom, &cintiq_led_attr_group); break; case INTUOS5S: case INTUOS5: case INTUOS5L: case INTUOSPS: case INTUOSPM: case INTUOSPL: wacom->led.llv = 32; wacom->led.max_llv = 96; error = wacom_leds_alloc_and_register(wacom, 1, 4, false); if (error) { hid_err(wacom->hdev, "cannot create leds err: %d\n", error); return error; } error = wacom_devm_sysfs_create_group(wacom, &intuos5_led_attr_group); break; case INTUOSP2_BT: wacom->led.llv = 50; wacom->led.max_llv = 100; error = wacom_leds_alloc_and_register(wacom, 1, 4, false); if (error) { hid_err(wacom->hdev, "cannot create leds err: %d\n", error); return error; } return 0; case REMOTE: wacom->led.llv = 255; wacom->led.max_llv = 255; error = wacom_led_groups_allocate(wacom, 5); if (error) { hid_err(wacom->hdev, "cannot create leds err: %d\n", error); return error; } return 0; default: return 0; } if (error) { hid_err(wacom->hdev, "cannot create sysfs group err: %d\n", error); return error; } return 0; } static void wacom_init_work(struct work_struct *work) { struct wacom *wacom = container_of(work, struct wacom, init_work.work); _wacom_query_tablet_data(wacom); wacom_led_control(wacom); } static void wacom_query_tablet_data(struct wacom *wacom) { schedule_delayed_work(&wacom->init_work, msecs_to_jiffies(1000)); } static enum power_supply_property wacom_battery_props[] = { POWER_SUPPLY_PROP_MODEL_NAME, POWER_SUPPLY_PROP_PRESENT, POWER_SUPPLY_PROP_STATUS, POWER_SUPPLY_PROP_SCOPE, POWER_SUPPLY_PROP_CAPACITY }; static int wacom_battery_get_property(struct power_supply *psy, enum power_supply_property psp, union power_supply_propval *val) { struct wacom_battery *battery = power_supply_get_drvdata(psy); int ret = 0; switch (psp) { case POWER_SUPPLY_PROP_MODEL_NAME: val->strval = battery->wacom->wacom_wac.name; break; case POWER_SUPPLY_PROP_PRESENT: val->intval = battery->bat_connected; break; case POWER_SUPPLY_PROP_SCOPE: val->intval = POWER_SUPPLY_SCOPE_DEVICE; break; case POWER_SUPPLY_PROP_CAPACITY: val->intval = battery->battery_capacity; break; case POWER_SUPPLY_PROP_STATUS: if (battery->bat_status != WACOM_POWER_SUPPLY_STATUS_AUTO) val->intval = battery->bat_status; else if (battery->bat_charging) val->intval = POWER_SUPPLY_STATUS_CHARGING; else if (battery->battery_capacity == 100 && battery->ps_connected) val->intval = POWER_SUPPLY_STATUS_FULL; else if (battery->ps_connected) val->intval = POWER_SUPPLY_STATUS_NOT_CHARGING; else val->intval = POWER_SUPPLY_STATUS_DISCHARGING; break; default: ret = -EINVAL; break; } return ret; } static int __wacom_initialize_battery(struct wacom *wacom, struct wacom_battery *battery) { static atomic_t battery_no = ATOMIC_INIT(0); struct device *dev = &wacom->hdev->dev; struct power_supply_config psy_cfg = { .drv_data = battery, }; struct power_supply *ps_bat; struct power_supply_desc *bat_desc = &battery->bat_desc; unsigned long n; int error; if (!devres_open_group(dev, bat_desc, GFP_KERNEL)) return -ENOMEM; battery->wacom = wacom; n = atomic_inc_return(&battery_no) - 1; bat_desc->properties = wacom_battery_props; bat_desc->num_properties = ARRAY_SIZE(wacom_battery_props); bat_desc->get_property = wacom_battery_get_property; sprintf(battery->bat_name, "wacom_battery_%ld", n); bat_desc->name = battery->bat_name; bat_desc->type = POWER_SUPPLY_TYPE_BATTERY; 
bat_desc->use_for_apm = 0; ps_bat = devm_power_supply_register(dev, bat_desc, &psy_cfg); if (IS_ERR(ps_bat)) { error = PTR_ERR(ps_bat); goto err; } power_supply_powers(ps_bat, &wacom->hdev->dev); battery->battery = ps_bat; devres_close_group(dev, bat_desc); return 0; err: devres_release_group(dev, bat_desc); return error; } static int wacom_initialize_battery(struct wacom *wacom) { if (wacom->wacom_wac.features.quirks & WACOM_QUIRK_BATTERY) return __wacom_initialize_battery(wacom, &wacom->battery); return 0; } static void wacom_destroy_battery(struct wacom *wacom) { if (wacom->battery.battery) { devres_release_group(&wacom->hdev->dev, &wacom->battery.bat_desc); wacom->battery.battery = NULL; } } static void wacom_aes_battery_handler(struct work_struct *work) { struct wacom *wacom = container_of(work, struct wacom, aes_battery_work.work); wacom_destroy_battery(wacom); } static ssize_t wacom_show_speed(struct device *dev, struct device_attribute *attr, char *buf) { struct hid_device *hdev = to_hid_device(dev); struct wacom *wacom = hid_get_drvdata(hdev); return sysfs_emit(buf, "%i\n", wacom->wacom_wac.bt_high_speed); } static ssize_t wacom_store_speed(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct hid_device *hdev = to_hid_device(dev); struct wacom *wacom = hid_get_drvdata(hdev); u8 new_speed; if (kstrtou8(buf, 0, &new_speed)) return -EINVAL; if (new_speed != 0 && new_speed != 1) return -EINVAL; wacom_bt_query_tablet_data(hdev, new_speed, &wacom->wacom_wac.features); return count; } static DEVICE_ATTR(speed, DEV_ATTR_RW_PERM, wacom_show_speed, wacom_store_speed); static ssize_t wacom_show_remote_mode(struct kobject *kobj, struct kobj_attribute *kattr, char *buf, int index) { struct device *dev = kobj_to_dev(kobj->parent); struct hid_device *hdev = to_hid_device(dev); struct wacom *wacom = hid_get_drvdata(hdev); u8 mode; mode = wacom->led.groups[index].select; return sprintf(buf, "%d\n", mode < 3 ? 
mode : -1); } #define DEVICE_EKR_ATTR_GROUP(SET_ID) \ static ssize_t wacom_show_remote##SET_ID##_mode(struct kobject *kobj, \ struct kobj_attribute *kattr, char *buf) \ { \ return wacom_show_remote_mode(kobj, kattr, buf, SET_ID); \ } \ static struct kobj_attribute remote##SET_ID##_mode_attr = { \ .attr = {.name = "remote_mode", \ .mode = DEV_ATTR_RO_PERM}, \ .show = wacom_show_remote##SET_ID##_mode, \ }; \ static struct attribute *remote##SET_ID##_serial_attrs[] = { \ &remote##SET_ID##_mode_attr.attr, \ NULL \ }; \ static const struct attribute_group remote##SET_ID##_serial_group = { \ .name = NULL, \ .attrs = remote##SET_ID##_serial_attrs, \ } DEVICE_EKR_ATTR_GROUP(0); DEVICE_EKR_ATTR_GROUP(1); DEVICE_EKR_ATTR_GROUP(2); DEVICE_EKR_ATTR_GROUP(3); DEVICE_EKR_ATTR_GROUP(4); static int wacom_remote_create_attr_group(struct wacom *wacom, __u32 serial, int index) { int error = 0; struct wacom_remote *remote = wacom->remote; remote->remotes[index].group.name = devm_kasprintf(&wacom->hdev->dev, GFP_KERNEL, "%d", serial); if (!remote->remotes[index].group.name) return -ENOMEM; error = __wacom_devm_sysfs_create_group(wacom, remote->remote_dir, &remote->remotes[index].group); if (error) { remote->remotes[index].group.name = NULL; hid_err(wacom->hdev, "cannot create sysfs group err: %d\n", error); return error; } return 0; } static int wacom_cmd_unpair_remote(struct wacom *wacom, unsigned char selector) { const size_t buf_size = 2; unsigned char *buf; int retval; buf = kzalloc(buf_size, GFP_KERNEL); if (!buf) return -ENOMEM; buf[0] = WAC_CMD_DELETE_PAIRING; buf[1] = selector; retval = wacom_set_report(wacom->hdev, HID_OUTPUT_REPORT, buf, buf_size, WAC_CMD_RETRIES); kfree(buf); return retval; } static ssize_t wacom_store_unpair_remote(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { unsigned char selector = 0; struct device *dev = kobj_to_dev(kobj->parent); struct hid_device *hdev = to_hid_device(dev); struct wacom *wacom = hid_get_drvdata(hdev); int err; if (!strncmp(buf, "*\n", 2)) { selector = WAC_CMD_UNPAIR_ALL; } else { hid_info(wacom->hdev, "remote: unrecognized unpair code: %s\n", buf); return -1; } mutex_lock(&wacom->lock); err = wacom_cmd_unpair_remote(wacom, selector); mutex_unlock(&wacom->lock); return err < 0 ? 
err : count; } static struct kobj_attribute unpair_remote_attr = { .attr = {.name = "unpair_remote", .mode = 0200}, .store = wacom_store_unpair_remote, }; static const struct attribute *remote_unpair_attrs[] = { &unpair_remote_attr.attr, NULL }; static void wacom_remotes_destroy(void *data) { struct wacom *wacom = data; struct wacom_remote *remote = wacom->remote; if (!remote) return; kobject_put(remote->remote_dir); kfifo_free(&remote->remote_fifo); wacom->remote = NULL; } static int wacom_initialize_remotes(struct wacom *wacom) { int error = 0; struct wacom_remote *remote; int i; if (wacom->wacom_wac.features.type != REMOTE) return 0; remote = devm_kzalloc(&wacom->hdev->dev, sizeof(*wacom->remote), GFP_KERNEL); if (!remote) return -ENOMEM; wacom->remote = remote; spin_lock_init(&remote->remote_lock); error = kfifo_alloc(&remote->remote_fifo, 5 * sizeof(struct wacom_remote_work_data), GFP_KERNEL); if (error) { hid_err(wacom->hdev, "failed allocating remote_fifo\n"); return -ENOMEM; } remote->remotes[0].group = remote0_serial_group; remote->remotes[1].group = remote1_serial_group; remote->remotes[2].group = remote2_serial_group; remote->remotes[3].group = remote3_serial_group; remote->remotes[4].group = remote4_serial_group; remote->remote_dir = kobject_create_and_add("wacom_remote", &wacom->hdev->dev.kobj); if (!remote->remote_dir) { kfifo_free(&remote->remote_fifo); return -ENOMEM; } error = sysfs_create_files(remote->remote_dir, remote_unpair_attrs); if (error) { hid_err(wacom->hdev, "cannot create sysfs group err: %d\n", error); kfifo_free(&remote->remote_fifo); kobject_put(remote->remote_dir); return error; } for (i = 0; i < WACOM_MAX_REMOTES; i++) { wacom->led.groups[i].select = WACOM_STATUS_UNKNOWN; remote->remotes[i].serial = 0; } error = devm_add_action_or_reset(&wacom->hdev->dev, wacom_remotes_destroy, wacom); if (error) return error; return 0; } static struct input_dev *wacom_allocate_input(struct wacom *wacom) { struct input_dev *input_dev; struct hid_device *hdev = wacom->hdev; struct wacom_wac *wacom_wac = &(wacom->wacom_wac); input_dev = devm_input_allocate_device(&hdev->dev); if (!input_dev) return NULL; input_dev->name = wacom_wac->features.name; input_dev->phys = hdev->phys; input_dev->dev.parent = &hdev->dev; input_dev->open = wacom_open; input_dev->close = wacom_close; input_dev->uniq = hdev->uniq; input_dev->id.bustype = hdev->bus; input_dev->id.vendor = hdev->vendor; input_dev->id.product = wacom_wac->pid ? 
wacom_wac->pid : hdev->product; input_dev->id.version = hdev->version; input_set_drvdata(input_dev, wacom); return input_dev; } static int wacom_allocate_inputs(struct wacom *wacom) { struct wacom_wac *wacom_wac = &(wacom->wacom_wac); wacom_wac->pen_input = wacom_allocate_input(wacom); wacom_wac->touch_input = wacom_allocate_input(wacom); wacom_wac->pad_input = wacom_allocate_input(wacom); if (!wacom_wac->pen_input || !wacom_wac->touch_input || !wacom_wac->pad_input) return -ENOMEM; wacom_wac->pen_input->name = wacom_wac->pen_name; wacom_wac->touch_input->name = wacom_wac->touch_name; wacom_wac->pad_input->name = wacom_wac->pad_name; return 0; } static int wacom_setup_inputs(struct wacom *wacom) { struct input_dev *pen_input_dev, *touch_input_dev, *pad_input_dev; struct wacom_wac *wacom_wac = &(wacom->wacom_wac); int error = 0; pen_input_dev = wacom_wac->pen_input; touch_input_dev = wacom_wac->touch_input; pad_input_dev = wacom_wac->pad_input; if (!pen_input_dev || !touch_input_dev || !pad_input_dev) return -EINVAL; error = wacom_setup_pen_input_capabilities(pen_input_dev, wacom_wac); if (error) { /* no pen in use on this interface */ input_free_device(pen_input_dev); wacom_wac->pen_input = NULL; pen_input_dev = NULL; } error = wacom_setup_touch_input_capabilities(touch_input_dev, wacom_wac); if (error) { /* no touch in use on this interface */ input_free_device(touch_input_dev); wacom_wac->touch_input = NULL; touch_input_dev = NULL; } error = wacom_setup_pad_input_capabilities(pad_input_dev, wacom_wac); if (error) { /* no pad events using this interface */ input_free_device(pad_input_dev); wacom_wac->pad_input = NULL; pad_input_dev = NULL; } return 0; } static int wacom_register_inputs(struct wacom *wacom) { struct input_dev *pen_input_dev, *touch_input_dev, *pad_input_dev; struct wacom_wac *wacom_wac = &(wacom->wacom_wac); int error = 0; pen_input_dev = wacom_wac->pen_input; touch_input_dev = wacom_wac->touch_input; pad_input_dev = wacom_wac->pad_input; if (pen_input_dev) { error = input_register_device(pen_input_dev); if (error) goto fail; } if (touch_input_dev) { error = input_register_device(touch_input_dev); if (error) goto fail; } if (pad_input_dev) { error = input_register_device(pad_input_dev); if (error) goto fail; } return 0; fail: wacom_wac->pad_input = NULL; wacom_wac->touch_input = NULL; wacom_wac->pen_input = NULL; return error; } /* * Not all devices report physical dimensions from HID. * Compute the default from hardcoded logical dimension * and resolution before driver overwrites them. 
*/ static void wacom_set_default_phy(struct wacom_features *features) { if (features->x_resolution) { features->x_phy = (features->x_max * 100) / features->x_resolution; features->y_phy = (features->y_max * 100) / features->y_resolution; } } static void wacom_calculate_res(struct wacom_features *features) { /* set unit to "100th of a mm" for devices not reported by HID */ if (!features->unit) { features->unit = 0x11; features->unitExpo = -3; } features->x_resolution = wacom_calc_hid_res(features->x_max, features->x_phy, features->unit, features->unitExpo); features->y_resolution = wacom_calc_hid_res(features->y_max, features->y_phy, features->unit, features->unitExpo); } void wacom_battery_work(struct work_struct *work) { struct wacom *wacom = container_of(work, struct wacom, battery_work); if ((wacom->wacom_wac.features.quirks & WACOM_QUIRK_BATTERY) && !wacom->battery.battery) { wacom_initialize_battery(wacom); } else if (!(wacom->wacom_wac.features.quirks & WACOM_QUIRK_BATTERY) && wacom->battery.battery) { wacom_destroy_battery(wacom); } } static size_t wacom_compute_pktlen(struct hid_device *hdev) { struct hid_report_enum *report_enum; struct hid_report *report; size_t size = 0; report_enum = hdev->report_enum + HID_INPUT_REPORT; list_for_each_entry(report, &report_enum->report_list, list) { size_t report_size = hid_report_len(report); if (report_size > size) size = report_size; } return size; } static void wacom_update_name(struct wacom *wacom, const char *suffix) { struct wacom_wac *wacom_wac = &wacom->wacom_wac; struct wacom_features *features = &wacom_wac->features; char name[WACOM_NAME_MAX - 20]; /* Leave some room for suffixes */ /* Generic devices name unspecified */ if ((features->type == HID_GENERIC) && !strcmp("Wacom HID", features->name)) { char *product_name = wacom->hdev->name; if (hid_is_usb(wacom->hdev)) { struct usb_interface *intf = to_usb_interface(wacom->hdev->dev.parent); struct usb_device *dev = interface_to_usbdev(intf); if (dev->product != NULL) product_name = dev->product; } if (wacom->hdev->bus == BUS_I2C) { snprintf(name, sizeof(name), "%s %X", features->name, wacom->hdev->product); } else if (strstr(product_name, "Wacom") || strstr(product_name, "wacom") || strstr(product_name, "WACOM")) { if (strscpy(name, product_name, sizeof(name)) < 0) { hid_warn(wacom->hdev, "String overflow while assembling device name"); } } else { snprintf(name, sizeof(name), "Wacom %s", product_name); } /* strip out excess whitespaces */ while (1) { char *gap = strstr(name, " "); if (gap == NULL) break; /* shift everything including the terminator */ memmove(gap, gap+1, strlen(gap)); } /* get rid of trailing whitespace */ if (name[strlen(name)-1] == ' ') name[strlen(name)-1] = '\0'; } else { if (strscpy(name, features->name, sizeof(name)) < 0) { hid_warn(wacom->hdev, "String overflow while assembling device name"); } } snprintf(wacom_wac->name, sizeof(wacom_wac->name), "%s%s", name, suffix); /* Append the device type to the name */ snprintf(wacom_wac->pen_name, sizeof(wacom_wac->pen_name), "%s%s Pen", name, suffix); snprintf(wacom_wac->touch_name, sizeof(wacom_wac->touch_name), "%s%s Finger", name, suffix); snprintf(wacom_wac->pad_name, sizeof(wacom_wac->pad_name), "%s%s Pad", name, suffix); } static void wacom_release_resources(struct wacom *wacom) { struct hid_device *hdev = wacom->hdev; if (!wacom->resources) return; devres_release_group(&hdev->dev, wacom); wacom->resources = false; wacom->wacom_wac.pen_input = NULL; wacom->wacom_wac.touch_input = NULL; wacom->wacom_wac.pad_input = 
NULL; } static void wacom_set_shared_values(struct wacom_wac *wacom_wac) { if (wacom_wac->features.device_type & WACOM_DEVICETYPE_TOUCH) { wacom_wac->shared->type = wacom_wac->features.type; wacom_wac->shared->touch_input = wacom_wac->touch_input; } if (wacom_wac->has_mute_touch_switch) { wacom_wac->shared->has_mute_touch_switch = true; /* Hardware touch switch may be off. Wait until * we know the switch state to decide is_touch_on. * Softkey state should be initialized to "on" to * match historic default. */ if (wacom_wac->is_soft_touch_switch) wacom_wac->shared->is_touch_on = true; } if (wacom_wac->shared->has_mute_touch_switch && wacom_wac->shared->touch_input) { set_bit(EV_SW, wacom_wac->shared->touch_input->evbit); input_set_capability(wacom_wac->shared->touch_input, EV_SW, SW_MUTE_DEVICE); } } static int wacom_parse_and_register(struct wacom *wacom, bool wireless) { struct wacom_wac *wacom_wac = &wacom->wacom_wac; struct wacom_features *features = &wacom_wac->features; struct hid_device *hdev = wacom->hdev; int error; unsigned int connect_mask = HID_CONNECT_HIDRAW; features->pktlen = wacom_compute_pktlen(hdev); if (!features->pktlen) return -ENODEV; if (!devres_open_group(&hdev->dev, wacom, GFP_KERNEL)) return -ENOMEM; error = wacom_devm_kfifo_alloc(wacom); if (error) goto fail; wacom->resources = true; error = wacom_allocate_inputs(wacom); if (error) goto fail; /* * Bamboo Pad has a generic hid handling for the Pen, and we switch it * into debug mode for the touch part. * We ignore the other interfaces. */ if (features->type == BAMBOO_PAD) { if (features->pktlen == WACOM_PKGLEN_PENABLED) { features->type = HID_GENERIC; } else if ((features->pktlen != WACOM_PKGLEN_BPAD_TOUCH) && (features->pktlen != WACOM_PKGLEN_BPAD_TOUCH_USB)) { error = -ENODEV; goto fail; } } /* set the default size in case we do not get them from hid */ wacom_set_default_phy(features); /* Retrieve the physical and logical size for touch devices */ wacom_retrieve_hid_descriptor(hdev, features); wacom_setup_device_quirks(wacom); if (features->device_type == WACOM_DEVICETYPE_NONE && features->type != WIRELESS) { error = features->type == HID_GENERIC ? -ENODEV : 0; dev_warn(&hdev->dev, "Unknown device_type for '%s'. %s.", hdev->name, error ? "Ignoring" : "Assuming pen"); if (error) goto fail; features->device_type |= WACOM_DEVICETYPE_PEN; } wacom_calculate_res(features); wacom_update_name(wacom, wireless ? 
" (WL)" : ""); /* pen only Bamboo neither support touch nor pad */ if ((features->type == BAMBOO_PEN) && ((features->device_type & WACOM_DEVICETYPE_TOUCH) || (features->device_type & WACOM_DEVICETYPE_PAD))) { error = -ENODEV; goto fail; } error = wacom_add_shared_data(hdev); if (error) goto fail; error = wacom_setup_inputs(wacom); if (error) goto fail; if (features->type == HID_GENERIC) connect_mask |= HID_CONNECT_DRIVER; /* Regular HID work starts now */ error = hid_hw_start(hdev, connect_mask); if (error) { hid_err(hdev, "hw start failed\n"); goto fail; } error = wacom_register_inputs(wacom); if (error) goto fail; if (wacom->wacom_wac.features.device_type & WACOM_DEVICETYPE_PAD) { error = wacom_initialize_leds(wacom); if (error) goto fail; error = wacom_initialize_remotes(wacom); if (error) goto fail; } if (!wireless) { /* Note that if query fails it is not a hard failure */ wacom_query_tablet_data(wacom); } /* touch only Bamboo doesn't support pen */ if ((features->type == BAMBOO_TOUCH) && (features->device_type & WACOM_DEVICETYPE_PEN)) { cancel_delayed_work_sync(&wacom->init_work); _wacom_query_tablet_data(wacom); error = -ENODEV; goto fail_quirks; } if (features->device_type & WACOM_DEVICETYPE_WL_MONITOR) { error = hid_hw_open(hdev); if (error) { hid_err(hdev, "hw open failed\n"); goto fail_quirks; } } wacom_set_shared_values(wacom_wac); devres_close_group(&hdev->dev, wacom); return 0; fail_quirks: hid_hw_stop(hdev); fail: wacom_release_resources(wacom); return error; } static void wacom_wireless_work(struct work_struct *work) { struct wacom *wacom = container_of(work, struct wacom, wireless_work); struct usb_device *usbdev = wacom->usbdev; struct wacom_wac *wacom_wac = &wacom->wacom_wac; struct hid_device *hdev1, *hdev2; struct wacom *wacom1, *wacom2; struct wacom_wac *wacom_wac1, *wacom_wac2; int error; /* * Regardless if this is a disconnect or a new tablet, * remove any existing input and battery devices. 
*/ wacom_destroy_battery(wacom); if (!usbdev) return; /* Stylus interface */ hdev1 = usb_get_intfdata(usbdev->config->interface[1]); wacom1 = hid_get_drvdata(hdev1); wacom_wac1 = &(wacom1->wacom_wac); wacom_release_resources(wacom1); /* Touch interface */ hdev2 = usb_get_intfdata(usbdev->config->interface[2]); wacom2 = hid_get_drvdata(hdev2); wacom_wac2 = &(wacom2->wacom_wac); wacom_release_resources(wacom2); if (wacom_wac->pid == 0) { hid_info(wacom->hdev, "wireless tablet disconnected\n"); } else { const struct hid_device_id *id = wacom_ids; hid_info(wacom->hdev, "wireless tablet connected with PID %x\n", wacom_wac->pid); while (id->bus) { if (id->vendor == USB_VENDOR_ID_WACOM && id->product == wacom_wac->pid) break; id++; } if (!id->bus) { hid_info(wacom->hdev, "ignoring unknown PID.\n"); return; } /* Stylus interface */ wacom_wac1->features = *((struct wacom_features *)id->driver_data); wacom_wac1->pid = wacom_wac->pid; hid_hw_stop(hdev1); error = wacom_parse_and_register(wacom1, true); if (error) goto fail; /* Touch interface */ if (wacom_wac1->features.touch_max || (wacom_wac1->features.type >= INTUOSHT && wacom_wac1->features.type <= BAMBOO_PT)) { wacom_wac2->features = *((struct wacom_features *)id->driver_data); wacom_wac2->pid = wacom_wac->pid; hid_hw_stop(hdev2); error = wacom_parse_and_register(wacom2, true); if (error) goto fail; } if (strscpy(wacom_wac->name, wacom_wac1->name, sizeof(wacom_wac->name)) < 0) { hid_warn(wacom->hdev, "String overflow while assembling device name"); } } return; fail: wacom_release_resources(wacom1); wacom_release_resources(wacom2); return; } static void wacom_remote_destroy_battery(struct wacom *wacom, int index) { struct wacom_remote *remote = wacom->remote; if (remote->remotes[index].battery.battery) { devres_release_group(&wacom->hdev->dev, &remote->remotes[index].battery.bat_desc); remote->remotes[index].battery.battery = NULL; remote->remotes[index].active_time = 0; } } static void wacom_remote_destroy_one(struct wacom *wacom, unsigned int index) { struct wacom_remote *remote = wacom->remote; u32 serial = remote->remotes[index].serial; int i; unsigned long flags; for (i = 0; i < WACOM_MAX_REMOTES; i++) { if (remote->remotes[i].serial == serial) { spin_lock_irqsave(&remote->remote_lock, flags); remote->remotes[i].registered = false; spin_unlock_irqrestore(&remote->remote_lock, flags); wacom_remote_destroy_battery(wacom, i); if (remote->remotes[i].group.name) devres_release_group(&wacom->hdev->dev, &remote->remotes[i]); remote->remotes[i].serial = 0; remote->remotes[i].group.name = NULL; wacom->led.groups[i].select = WACOM_STATUS_UNKNOWN; } } } static int wacom_remote_create_one(struct wacom *wacom, u32 serial, unsigned int index) { struct wacom_remote *remote = wacom->remote; struct device *dev = &wacom->hdev->dev; int error, k; /* A remote can pair more than once with an EKR, * check to make sure this serial isn't already paired. 
*/ for (k = 0; k < WACOM_MAX_REMOTES; k++) { if (remote->remotes[k].serial == serial) break; } if (k < WACOM_MAX_REMOTES) { remote->remotes[index].serial = serial; return 0; } if (!devres_open_group(dev, &remote->remotes[index], GFP_KERNEL)) return -ENOMEM; error = wacom_remote_create_attr_group(wacom, serial, index); if (error) goto fail; remote->remotes[index].input = wacom_allocate_input(wacom); if (!remote->remotes[index].input) { error = -ENOMEM; goto fail; } remote->remotes[index].input->uniq = remote->remotes[index].group.name; remote->remotes[index].input->name = wacom->wacom_wac.pad_name; if (!remote->remotes[index].input->name) { error = -EINVAL; goto fail; } error = wacom_setup_pad_input_capabilities(remote->remotes[index].input, &wacom->wacom_wac); if (error) goto fail; remote->remotes[index].serial = serial; error = input_register_device(remote->remotes[index].input); if (error) goto fail; error = wacom_led_groups_alloc_and_register_one( &remote->remotes[index].input->dev, wacom, index, 3, true); if (error) goto fail; remote->remotes[index].registered = true; devres_close_group(dev, &remote->remotes[index]); return 0; fail: devres_release_group(dev, &remote->remotes[index]); remote->remotes[index].serial = 0; return error; } static int wacom_remote_attach_battery(struct wacom *wacom, int index) { struct wacom_remote *remote = wacom->remote; int error; if (!remote->remotes[index].registered) return 0; if (remote->remotes[index].battery.battery) return 0; if (!remote->remotes[index].active_time) return 0; if (wacom->led.groups[index].select == WACOM_STATUS_UNKNOWN) return 0; error = __wacom_initialize_battery(wacom, &wacom->remote->remotes[index].battery); if (error) return error; return 0; } static void wacom_remote_work(struct work_struct *work) { struct wacom *wacom = container_of(work, struct wacom, remote_work); struct wacom_remote *remote = wacom->remote; ktime_t kt = ktime_get(); struct wacom_remote_work_data remote_work_data; unsigned long flags; unsigned int count; u32 work_serial; int i; spin_lock_irqsave(&remote->remote_lock, flags); count = kfifo_out(&remote->remote_fifo, &remote_work_data, sizeof(remote_work_data)); if (count != sizeof(remote_work_data)) { hid_err(wacom->hdev, "workitem triggered without status available\n"); spin_unlock_irqrestore(&remote->remote_lock, flags); return; } if (!kfifo_is_empty(&remote->remote_fifo)) wacom_schedule_work(&wacom->wacom_wac, WACOM_WORKER_REMOTE); spin_unlock_irqrestore(&remote->remote_lock, flags); for (i = 0; i < WACOM_MAX_REMOTES; i++) { work_serial = remote_work_data.remote[i].serial; if (work_serial) { if (kt - remote->remotes[i].active_time > WACOM_REMOTE_BATTERY_TIMEOUT && remote->remotes[i].active_time != 0) wacom_remote_destroy_battery(wacom, i); if (remote->remotes[i].serial == work_serial) { wacom_remote_attach_battery(wacom, i); continue; } if (remote->remotes[i].serial) wacom_remote_destroy_one(wacom, i); wacom_remote_create_one(wacom, work_serial, i); } else if (remote->remotes[i].serial) { wacom_remote_destroy_one(wacom, i); } } } static void wacom_mode_change_work(struct work_struct *work) { struct wacom *wacom = container_of(work, struct wacom, mode_change_work); struct wacom_shared *shared = wacom->wacom_wac.shared; struct wacom *wacom1 = NULL; struct wacom *wacom2 = NULL; bool is_direct = wacom->wacom_wac.is_direct_mode; int error = 0; if (shared->pen) { wacom1 = hid_get_drvdata(shared->pen); wacom_release_resources(wacom1); hid_hw_stop(wacom1->hdev); wacom1->wacom_wac.has_mode_change = true; 
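/* Mark the restart as a display-mode switch and propagate the current direct/indirect mapping to the re-created inputs (the touch interface below gets the same treatment). */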
wacom1->wacom_wac.is_direct_mode = is_direct; } if (shared->touch) { wacom2 = hid_get_drvdata(shared->touch); wacom_release_resources(wacom2); hid_hw_stop(wacom2->hdev); wacom2->wacom_wac.has_mode_change = true; wacom2->wacom_wac.is_direct_mode = is_direct; } if (wacom1) { error = wacom_parse_and_register(wacom1, false); if (error) return; } if (wacom2) { error = wacom_parse_and_register(wacom2, false); if (error) return; } return; } static int wacom_probe(struct hid_device *hdev, const struct hid_device_id *id) { struct wacom *wacom; struct wacom_wac *wacom_wac; struct wacom_features *features; int error; if (!id->driver_data) return -EINVAL; hdev->quirks |= HID_QUIRK_NO_INIT_REPORTS; /* hid-core sets this quirk for the boot interface */ hdev->quirks &= ~HID_QUIRK_NOGET; wacom = devm_kzalloc(&hdev->dev, sizeof(struct wacom), GFP_KERNEL); if (!wacom) return -ENOMEM; hid_set_drvdata(hdev, wacom); wacom->hdev = hdev; wacom_wac = &wacom->wacom_wac; wacom_wac->features = *((struct wacom_features *)id->driver_data); features = &wacom_wac->features; if (features->check_for_hid_type && features->hid_type != hdev->type) return -ENODEV; wacom_wac->hid_data.inputmode = -1; wacom_wac->mode_report = -1; if (hid_is_usb(hdev)) { struct usb_interface *intf = to_usb_interface(hdev->dev.parent); struct usb_device *dev = interface_to_usbdev(intf); wacom->usbdev = dev; wacom->intf = intf; } mutex_init(&wacom->lock); INIT_DELAYED_WORK(&wacom->init_work, wacom_init_work); INIT_DELAYED_WORK(&wacom->aes_battery_work, wacom_aes_battery_handler); INIT_WORK(&wacom->wireless_work, wacom_wireless_work); INIT_WORK(&wacom->battery_work, wacom_battery_work); INIT_WORK(&wacom->remote_work, wacom_remote_work); INIT_WORK(&wacom->mode_change_work, wacom_mode_change_work); timer_setup(&wacom->idleprox_timer, &wacom_idleprox_timeout, TIMER_DEFERRABLE); /* ask for the report descriptor to be loaded by HID */ error = hid_parse(hdev); if (error) { hid_err(hdev, "parse failed\n"); return error; } if (features->type == BOOTLOADER) { hid_warn(hdev, "Using device in hidraw-only mode"); return hid_hw_start(hdev, HID_CONNECT_HIDRAW); } error = wacom_parse_and_register(wacom, false); if (error) return error; if (hdev->bus == BUS_BLUETOOTH) { error = device_create_file(&hdev->dev, &dev_attr_speed); if (error) hid_warn(hdev, "can't create sysfs speed attribute err: %d\n", error); } wacom_wac->probe_complete = true; return 0; } static void wacom_remove(struct hid_device *hdev) { struct wacom *wacom = hid_get_drvdata(hdev); struct wacom_wac *wacom_wac = &wacom->wacom_wac; struct wacom_features *features = &wacom_wac->features; if (features->device_type & WACOM_DEVICETYPE_WL_MONITOR) hid_hw_close(hdev); hid_hw_stop(hdev); cancel_delayed_work_sync(&wacom->init_work); cancel_delayed_work_sync(&wacom->aes_battery_work); cancel_work_sync(&wacom->wireless_work); cancel_work_sync(&wacom->battery_work); cancel_work_sync(&wacom->remote_work); cancel_work_sync(&wacom->mode_change_work); timer_delete_sync(&wacom->idleprox_timer); if (hdev->bus == BUS_BLUETOOTH) device_remove_file(&hdev->dev, &dev_attr_speed); /* make sure we don't trigger the LEDs */ wacom_led_groups_release(wacom); if (wacom->wacom_wac.features.type != REMOTE) wacom_release_resources(wacom); } #ifdef CONFIG_PM static int wacom_resume(struct hid_device *hdev) { struct wacom *wacom = hid_get_drvdata(hdev); mutex_lock(&wacom->lock); /* switch to wacom mode first */ _wacom_query_tablet_data(wacom); wacom_led_control(wacom); mutex_unlock(&wacom->lock); return 0; } static int 
wacom_reset_resume(struct hid_device *hdev) { return wacom_resume(hdev); } #endif /* CONFIG_PM */ static struct hid_driver wacom_driver = { .name = "wacom", .id_table = wacom_ids, .probe = wacom_probe, .remove = wacom_remove, .report = wacom_wac_report, #ifdef CONFIG_PM .resume = wacom_resume, .reset_resume = wacom_reset_resume, #endif .raw_event = wacom_raw_event, }; module_hid_driver(wacom_driver); MODULE_VERSION(DRIVER_VERSION); MODULE_AUTHOR(DRIVER_AUTHOR); MODULE_DESCRIPTION(DRIVER_DESC); MODULE_LICENSE("GPL");
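The devres group pattern above (devres_open_group()/devres_close_group() in wacom_parse_and_register(), devres_release_group() in wacom_release_resources()) is what lets the driver tear down everything from a failed or stale registration in one call. A minimal sketch of the pattern, outside this driver; the example_ names are hypothetical:

#include <linux/device.h>

/* Sketch only: group-scoped device resources, as used by the wacom driver. */
static int example_grouped_alloc(struct device *dev, void *group_id)
{
	void *buf;

	/* Everything devm-allocated from here on belongs to group_id. */
	if (!devres_open_group(dev, group_id, GFP_KERNEL))
		return -ENOMEM;

	buf = devm_kzalloc(dev, 64, GFP_KERNEL);
	if (!buf) {
		/* Frees every resource opened in this group, in one call. */
		devres_release_group(dev, group_id);
		return -ENOMEM;
	}

	/* Success: close the group but keep its resources alive. */
	devres_close_group(dev, group_id);
	return 0;
}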
// SPDX-License-Identifier: MIT /* * Copyright 2018 Noralf Trønnes * Copyright (c) 2006-2009 Red Hat Inc. * Copyright (c) 2006-2008 Intel Corporation * Jesse Barnes <jesse.barnes@intel.com> * Copyright (c) 2007 Dave Airlie <airlied@linux.ie> */ #include "drm/drm_modeset_lock.h" #include <linux/export.h> #include <linux/module.h> #include <linux/mutex.h> #include <linux/slab.h> #include <linux/string_helpers.h> #include <drm/drm_atomic.h> #include <drm/drm_client.h> #include <drm/drm_connector.h> #include <drm/drm_crtc.h> #include <drm/drm_device.h> #include <drm/drm_drv.h> #include <drm/drm_edid.h> #include <drm/drm_encoder.h> #include <drm/drm_print.h> #include "drm_crtc_internal.h" #include "drm_internal.h" #define DRM_CLIENT_MAX_CLONED_CONNECTORS 8 struct drm_client_offset { int x, y; }; int drm_client_modeset_create(struct drm_client_dev *client) { struct drm_device *dev = client->dev; unsigned int num_crtc = dev->mode_config.num_crtc; unsigned int max_connector_count = 1; struct drm_mode_set *modeset; struct drm_crtc *crtc; int i = 0; /* Add terminating zero entry to enable index less iteration */ client->modesets = kcalloc(num_crtc + 1, sizeof(*client->modesets), GFP_KERNEL); if (!client->modesets) return -ENOMEM; mutex_init(&client->modeset_mutex); drm_for_each_crtc(crtc, dev) client->modesets[i++].crtc = crtc; /* Cloning is only supported in the single crtc case.
*/ if (num_crtc == 1) max_connector_count = DRM_CLIENT_MAX_CLONED_CONNECTORS; for (modeset = client->modesets; modeset->crtc; modeset++) { modeset->connectors = kcalloc(max_connector_count, sizeof(*modeset->connectors), GFP_KERNEL); if (!modeset->connectors) goto err_free; } return 0; err_free: drm_client_modeset_free(client); return -ENOMEM; } static void drm_client_modeset_release(struct drm_client_dev *client) { struct drm_mode_set *modeset; drm_client_for_each_modeset(modeset, client) { int i; drm_mode_destroy(client->dev, modeset->mode); modeset->mode = NULL; modeset->fb = NULL; for (i = 0; i < modeset->num_connectors; i++) { drm_connector_put(modeset->connectors[i]); modeset->connectors[i] = NULL; } modeset->num_connectors = 0; } } void drm_client_modeset_free(struct drm_client_dev *client) { struct drm_mode_set *modeset; mutex_lock(&client->modeset_mutex); drm_client_modeset_release(client); drm_client_for_each_modeset(modeset, client) kfree(modeset->connectors); mutex_unlock(&client->modeset_mutex); mutex_destroy(&client->modeset_mutex); kfree(client->modesets); } static struct drm_mode_set * drm_client_find_modeset(struct drm_client_dev *client, struct drm_crtc *crtc) { struct drm_mode_set *modeset; drm_client_for_each_modeset(modeset, client) if (modeset->crtc == crtc) return modeset; return NULL; } static const struct drm_display_mode * drm_connector_get_tiled_mode(struct drm_connector *connector) { const struct drm_display_mode *mode; list_for_each_entry(mode, &connector->modes, head) { if (mode->hdisplay == connector->tile_h_size && mode->vdisplay == connector->tile_v_size) return mode; } return NULL; } static const struct drm_display_mode * drm_connector_fallback_non_tiled_mode(struct drm_connector *connector) { const struct drm_display_mode *mode; list_for_each_entry(mode, &connector->modes, head) { if (mode->hdisplay == connector->tile_h_size && mode->vdisplay == connector->tile_v_size) continue; return mode; } return NULL; } static const struct drm_display_mode * drm_connector_preferred_mode(struct drm_connector *connector, int width, int height) { const struct drm_display_mode *mode; list_for_each_entry(mode, &connector->modes, head) { if (mode->hdisplay > width || mode->vdisplay > height) continue; if (mode->type & DRM_MODE_TYPE_PREFERRED) return mode; } return NULL; } static const struct drm_display_mode * drm_connector_first_mode(struct drm_connector *connector) { return list_first_entry_or_null(&connector->modes, struct drm_display_mode, head); } static const struct drm_display_mode * drm_connector_pick_cmdline_mode(struct drm_connector *connector) { const struct drm_cmdline_mode *cmdline_mode; const struct drm_display_mode *mode; bool prefer_non_interlace; /* * Find a user-defined mode. If the user gave us a valid * mode on the kernel command line, it will show up in this * list. */ list_for_each_entry(mode, &connector->modes, head) { if (mode->type & DRM_MODE_TYPE_USERDEF) return mode; } cmdline_mode = &connector->cmdline_mode; if (cmdline_mode->specified == false) return NULL; /* * Attempt to find a matching mode in the list of modes we * have gotten so far. 
*/ prefer_non_interlace = !cmdline_mode->interlace; again: list_for_each_entry(mode, &connector->modes, head) { /* check width/height */ if (mode->hdisplay != cmdline_mode->xres || mode->vdisplay != cmdline_mode->yres) continue; if (cmdline_mode->refresh_specified) { if (drm_mode_vrefresh(mode) != cmdline_mode->refresh) continue; } if (cmdline_mode->interlace) { if (!(mode->flags & DRM_MODE_FLAG_INTERLACE)) continue; } else if (prefer_non_interlace) { if (mode->flags & DRM_MODE_FLAG_INTERLACE) continue; } return mode; } if (prefer_non_interlace) { prefer_non_interlace = false; goto again; } return NULL; } static bool drm_connector_enabled(struct drm_connector *connector, bool strict) { bool enable; if (connector->display_info.non_desktop) return false; if (strict) enable = connector->status == connector_status_connected; else enable = connector->status != connector_status_disconnected; return enable; } static void drm_client_connectors_enabled(struct drm_connector *connectors[], unsigned int connector_count, bool enabled[]) { bool any_enabled = false; struct drm_connector *connector; int i = 0; for (i = 0; i < connector_count; i++) { connector = connectors[i]; enabled[i] = drm_connector_enabled(connector, true); drm_dbg_kms(connector->dev, "[CONNECTOR:%d:%s] enabled? %s\n", connector->base.id, connector->name, connector->display_info.non_desktop ? "non desktop" : str_yes_no(enabled[i])); any_enabled |= enabled[i]; } if (any_enabled) return; for (i = 0; i < connector_count; i++) enabled[i] = drm_connector_enabled(connectors[i], false); } static void mode_replace(struct drm_device *dev, const struct drm_display_mode **dst, const struct drm_display_mode *src) { drm_mode_destroy(dev, (struct drm_display_mode *)*dst); *dst = src ? drm_mode_duplicate(dev, src) : NULL; } static void modes_destroy(struct drm_device *dev, const struct drm_display_mode *modes[], int count) { int i; for (i = 0; i < count; i++) mode_replace(dev, &modes[i], NULL); } static bool drm_client_target_cloned(struct drm_device *dev, struct drm_connector *connectors[], unsigned int connector_count, const struct drm_display_mode *modes[], struct drm_client_offset offsets[], bool enabled[], int width, int height) { int count, i; bool can_clone = false; struct drm_display_mode *dmt_mode; /* only contemplate cloning in the single crtc case */ if (dev->mode_config.num_crtc > 1) return false; count = 0; for (i = 0; i < connector_count; i++) { if (enabled[i]) count++; } /* only contemplate cloning if more than one connector is enabled */ if (count <= 1) return false; /* check the command line or if nothing common pick 1024x768 */ can_clone = true; for (i = 0; i < connector_count; i++) { int j; if (!enabled[i]) continue; mode_replace(dev, &modes[i], drm_connector_pick_cmdline_mode(connectors[i])); if (!modes[i]) { can_clone = false; break; } for (j = 0; j < i; j++) { if (!enabled[j]) continue; if (!drm_mode_match(modes[j], modes[i], DRM_MODE_MATCH_TIMINGS | DRM_MODE_MATCH_CLOCK | DRM_MODE_MATCH_FLAGS | DRM_MODE_MATCH_3D_FLAGS)) can_clone = false; } } if (can_clone) { drm_dbg_kms(dev, "can clone using command line\n"); return true; } /* try and find a 1024x768 mode on each connector */ can_clone = true; dmt_mode = drm_mode_find_dmt(dev, 1024, 768, 60, false); if (!dmt_mode) goto fail; for (i = 0; i < connector_count; i++) { const struct drm_display_mode *mode; if (!enabled[i]) continue; list_for_each_entry(mode, &connectors[i]->modes, head) { if (drm_mode_match(mode, dmt_mode, DRM_MODE_MATCH_TIMINGS | DRM_MODE_MATCH_CLOCK | 
DRM_MODE_MATCH_FLAGS | DRM_MODE_MATCH_3D_FLAGS)) mode_replace(dev, &modes[i], mode); } if (!modes[i]) can_clone = false; } drm_mode_destroy(dev, dmt_mode); if (can_clone) { drm_dbg_kms(dev, "can clone using 1024x768\n"); return true; } fail: drm_info(dev, "kms: can't enable cloning when we probably wanted to.\n"); return false; } static int drm_client_get_tile_offsets(struct drm_device *dev, struct drm_connector *connectors[], unsigned int connector_count, const struct drm_display_mode *modes[], struct drm_client_offset offsets[], int idx, int h_idx, int v_idx) { int i; int hoffset = 0, voffset = 0; for (i = 0; i < connector_count; i++) { struct drm_connector *connector = connectors[i]; if (!connector->has_tile) continue; if (!modes[i] && (h_idx || v_idx)) { drm_dbg_kms(dev, "[CONNECTOR:%d:%s] no modes for connector tiled %d\n", connector->base.id, connector->name, i); continue; } if (connector->tile_h_loc < h_idx) hoffset += modes[i]->hdisplay; if (connector->tile_v_loc < v_idx) voffset += modes[i]->vdisplay; } offsets[idx].x = hoffset; offsets[idx].y = voffset; drm_dbg_kms(dev, "returned %d %d for %d %d\n", hoffset, voffset, h_idx, v_idx); return 0; } static bool drm_client_target_preferred(struct drm_device *dev, struct drm_connector *connectors[], unsigned int connector_count, const struct drm_display_mode *modes[], struct drm_client_offset offsets[], bool enabled[], int width, int height) { const u64 mask = BIT_ULL(connector_count) - 1; u64 conn_configured = 0; int tile_pass = 0; int num_tiled_conns = 0; int i; for (i = 0; i < connector_count; i++) { if (connectors[i]->has_tile && connectors[i]->status == connector_status_connected) num_tiled_conns++; } retry: for (i = 0; i < connector_count; i++) { struct drm_connector *connector = connectors[i]; const char *mode_type; if (conn_configured & BIT_ULL(i)) continue; if (enabled[i] == false) { conn_configured |= BIT_ULL(i); continue; } /* first pass over all the untiled connectors */ if (tile_pass == 0 && connector->has_tile) continue; if (tile_pass == 1) { if (connector->tile_h_loc != 0 || connector->tile_v_loc != 0) continue; } else { if (connector->tile_h_loc != tile_pass - 1 && connector->tile_v_loc != tile_pass - 1) /* if this tile_pass doesn't cover any of the tiles - keep going */ continue; /* * find the tile offsets for this pass - need to find * all tiles left and above */ drm_client_get_tile_offsets(dev, connectors, connector_count, modes, offsets, i, connector->tile_h_loc, connector->tile_v_loc); } mode_type = "cmdline"; mode_replace(dev, &modes[i], drm_connector_pick_cmdline_mode(connector)); if (!modes[i]) { mode_type = "preferred"; mode_replace(dev, &modes[i], drm_connector_preferred_mode(connector, width, height)); } if (!modes[i]) { mode_type = "first"; mode_replace(dev, &modes[i], drm_connector_first_mode(connector)); } /* * In case of tiled mode if all tiles not present fallback to * first available non tiled mode. * After all tiles are present, try to find the tiled mode * for all and if tiled mode not present due to fbcon size * limitations, use first non tiled mode only for * tile 0,0 and set to no mode for all other tiles. 
*/ if (connector->has_tile) { if (num_tiled_conns < connector->num_h_tile * connector->num_v_tile || (connector->tile_h_loc == 0 && connector->tile_v_loc == 0 && !drm_connector_get_tiled_mode(connector))) { mode_type = "non tiled"; mode_replace(dev, &modes[i], drm_connector_fallback_non_tiled_mode(connector)); } else { mode_type = "tiled"; mode_replace(dev, &modes[i], drm_connector_get_tiled_mode(connector)); } } if (modes[i]) drm_dbg_kms(dev, "[CONNECTOR:%d:%s] found %s mode: %s\n", connector->base.id, connector->name, mode_type, modes[i]->name); else drm_dbg_kms(dev, "[CONNECTOR:%d:%s] no mode found\n", connector->base.id, connector->name); conn_configured |= BIT_ULL(i); } if ((conn_configured & mask) != mask) { tile_pass++; goto retry; } return true; } static bool connector_has_possible_crtc(struct drm_connector *connector, struct drm_crtc *crtc) { struct drm_encoder *encoder; drm_connector_for_each_possible_encoder(connector, encoder) { if (encoder->possible_crtcs & drm_crtc_mask(crtc)) return true; } return false; } static int drm_client_pick_crtcs(struct drm_client_dev *client, struct drm_connector *connectors[], unsigned int connector_count, struct drm_crtc *best_crtcs[], const struct drm_display_mode *modes[], int n, int width, int height) { struct drm_device *dev = client->dev; struct drm_connector *connector; int my_score, best_score, score; struct drm_crtc **crtcs; struct drm_mode_set *modeset; if (n == connector_count) return 0; connector = connectors[n]; best_crtcs[n] = NULL; best_score = drm_client_pick_crtcs(client, connectors, connector_count, best_crtcs, modes, n + 1, width, height); if (modes[n] == NULL) return best_score; crtcs = kcalloc(connector_count, sizeof(*crtcs), GFP_KERNEL); if (!crtcs) return best_score; my_score = 1; if (connector->status == connector_status_connected) my_score++; if (connector->cmdline_mode.specified) my_score++; if (drm_connector_preferred_mode(connector, width, height)) my_score++; /* * select a crtc for this connector and then attempt to configure * remaining connectors */ drm_client_for_each_modeset(modeset, client) { struct drm_crtc *crtc = modeset->crtc; int o; if (!connector_has_possible_crtc(connector, crtc)) continue; for (o = 0; o < n; o++) if (best_crtcs[o] == crtc) break; if (o < n) { /* ignore cloning unless only a single crtc */ if (dev->mode_config.num_crtc > 1) continue; if (!drm_mode_equal(modes[o], modes[n])) continue; } crtcs[n] = crtc; memcpy(crtcs, best_crtcs, n * sizeof(*crtcs)); score = my_score + drm_client_pick_crtcs(client, connectors, connector_count, crtcs, modes, n + 1, width, height); if (score > best_score) { best_score = score; memcpy(best_crtcs, crtcs, connector_count * sizeof(*crtcs)); } } kfree(crtcs); return best_score; } /* Try to read the BIOS display configuration and use it for the initial config */ static bool drm_client_firmware_config(struct drm_client_dev *client, struct drm_connector *connectors[], unsigned int connector_count, struct drm_crtc *crtcs[], const struct drm_display_mode *modes[], struct drm_client_offset offsets[], bool enabled[], int width, int height) { const int count = min_t(unsigned int, connector_count, BITS_PER_LONG); unsigned long conn_configured, conn_seq, mask; struct drm_device *dev = client->dev; int i; bool *save_enabled; bool fallback = true, ret = true; int num_connectors_enabled = 0; int num_connectors_detected = 0; int num_tiled_conns = 0; struct drm_modeset_acquire_ctx ctx; if (!drm_drv_uses_atomic_modeset(dev)) return false; if (drm_WARN_ON(dev, count <= 0)) return 
false; save_enabled = kcalloc(count, sizeof(bool), GFP_KERNEL); if (!save_enabled) return false; drm_modeset_acquire_init(&ctx, 0); while (drm_modeset_lock_all_ctx(dev, &ctx) != 0) drm_modeset_backoff(&ctx); memcpy(save_enabled, enabled, count); mask = GENMASK(count - 1, 0); conn_configured = 0; for (i = 0; i < count; i++) { if (connectors[i]->has_tile && connectors[i]->status == connector_status_connected) num_tiled_conns++; } retry: conn_seq = conn_configured; for (i = 0; i < count; i++) { struct drm_connector *connector = connectors[i]; struct drm_encoder *encoder; struct drm_crtc *crtc; const char *mode_type; int j; if (conn_configured & BIT(i)) continue; if (conn_seq == 0 && !connector->has_tile) continue; if (connector->status == connector_status_connected) num_connectors_detected++; if (!enabled[i]) { drm_dbg_kms(dev, "[CONNECTOR:%d:%s] not enabled, skipping\n", connector->base.id, connector->name); conn_configured |= BIT(i); continue; } if (connector->force == DRM_FORCE_OFF) { drm_dbg_kms(dev, "[CONNECTOR:%d:%s] disabled by user, skipping\n", connector->base.id, connector->name); enabled[i] = false; continue; } encoder = connector->state->best_encoder; if (!encoder || drm_WARN_ON(dev, !connector->state->crtc)) { if (connector->force > DRM_FORCE_OFF) goto bail; drm_dbg_kms(dev, "[CONNECTOR:%d:%s] has no encoder or crtc, skipping\n", connector->base.id, connector->name); enabled[i] = false; conn_configured |= BIT(i); continue; } num_connectors_enabled++; crtc = connector->state->crtc; /* * Make sure we're not trying to drive multiple connectors * with a single CRTC, since our cloning support may not * match the BIOS. */ for (j = 0; j < count; j++) { if (crtcs[j] == crtc) { drm_dbg_kms(dev, "[CONNECTOR:%d:%s] fallback: cloned configuration\n", connector->base.id, connector->name); goto bail; } } mode_type = "cmdline"; mode_replace(dev, &modes[i], drm_connector_pick_cmdline_mode(connector)); if (!modes[i]) { mode_type = "preferred"; mode_replace(dev, &modes[i], drm_connector_preferred_mode(connector, width, height)); } if (!modes[i]) { mode_type = "first"; mode_replace(dev, &modes[i], drm_connector_first_mode(connector)); } /* last resort: use current mode */ if (!modes[i]) { mode_type = "current"; mode_replace(dev, &modes[i], &crtc->state->mode); } /* * In case of tiled modes, if all tiles are not present * then fall back to a non-tiled mode. */ if (connector->has_tile && num_tiled_conns < connector->num_h_tile * connector->num_v_tile) { mode_type = "non tiled"; mode_replace(dev, &modes[i], drm_connector_fallback_non_tiled_mode(connector)); } crtcs[i] = crtc; drm_dbg_kms(dev, "[CONNECTOR:%d:%s] on [CRTC:%d:%s] using %s mode: %s\n", connector->base.id, connector->name, crtc->base.id, crtc->name, mode_type, modes[i]->name); fallback = false; conn_configured |= BIT(i); } if ((conn_configured & mask) != mask && conn_configured != conn_seq) goto retry; for (i = 0; i < count; i++) { struct drm_connector *connector = connectors[i]; if (connector->has_tile) drm_client_get_tile_offsets(dev, connectors, connector_count, modes, offsets, i, connector->tile_h_loc, connector->tile_v_loc); } /* * If the BIOS didn't enable everything it could, fall back to the same * user experience of lighting up as much as possible, like the fbdev * helper library.
*/ if (num_connectors_enabled != num_connectors_detected && num_connectors_enabled < dev->mode_config.num_crtc) { drm_dbg_kms(dev, "fallback: Not all outputs enabled\n"); drm_dbg_kms(dev, "Enabled: %i, detected: %i\n", num_connectors_enabled, num_connectors_detected); fallback = true; } if (fallback) { bail: drm_dbg_kms(dev, "Not using firmware configuration\n"); memcpy(enabled, save_enabled, count); ret = false; } drm_modeset_drop_locks(&ctx); drm_modeset_acquire_fini(&ctx); kfree(save_enabled); return ret; } /** * drm_client_modeset_probe() - Probe for displays * @client: DRM client * @width: Maximum display mode width (optional) * @height: Maximum display mode height (optional) * * This function sets up display pipelines for enabled connectors and stores the * config in the client's modeset array. * * Returns: * Zero on success or negative error code on failure. */ int drm_client_modeset_probe(struct drm_client_dev *client, unsigned int width, unsigned int height) { struct drm_connector *connector, **connectors = NULL; struct drm_connector_list_iter conn_iter; struct drm_device *dev = client->dev; unsigned int total_modes_count = 0; struct drm_client_offset *offsets; unsigned int connector_count = 0; const struct drm_display_mode **modes; struct drm_crtc **crtcs; int i, ret = 0; bool *enabled; drm_dbg_kms(dev, "\n"); if (!width) width = dev->mode_config.max_width; if (!height) height = dev->mode_config.max_height; drm_connector_list_iter_begin(dev, &conn_iter); drm_client_for_each_connector_iter(connector, &conn_iter) { struct drm_connector **tmp; tmp = krealloc(connectors, (connector_count + 1) * sizeof(*connectors), GFP_KERNEL); if (!tmp) { ret = -ENOMEM; goto free_connectors; } connectors = tmp; drm_connector_get(connector); connectors[connector_count++] = connector; } drm_connector_list_iter_end(&conn_iter); if (!connector_count) return 0; crtcs = kcalloc(connector_count, sizeof(*crtcs), GFP_KERNEL); modes = kcalloc(connector_count, sizeof(*modes), GFP_KERNEL); offsets = kcalloc(connector_count, sizeof(*offsets), GFP_KERNEL); enabled = kcalloc(connector_count, sizeof(bool), GFP_KERNEL); if (!crtcs || !modes || !enabled || !offsets) { ret = -ENOMEM; goto out; } mutex_lock(&client->modeset_mutex); mutex_lock(&dev->mode_config.mutex); for (i = 0; i < connector_count; i++) total_modes_count += connectors[i]->funcs->fill_modes(connectors[i], width, height); if (!total_modes_count) drm_dbg_kms(dev, "No connectors reported connected with modes\n"); drm_client_connectors_enabled(connectors, connector_count, enabled); if (!drm_client_firmware_config(client, connectors, connector_count, crtcs, modes, offsets, enabled, width, height)) { modes_destroy(dev, modes, connector_count); memset(crtcs, 0, connector_count * sizeof(*crtcs)); memset(offsets, 0, connector_count * sizeof(*offsets)); if (!drm_client_target_cloned(dev, connectors, connector_count, modes, offsets, enabled, width, height) && !drm_client_target_preferred(dev, connectors, connector_count, modes, offsets, enabled, width, height)) drm_err(dev, "Unable to find initial modes\n"); drm_dbg_kms(dev, "picking CRTCs for %dx%d config\n", width, height); drm_client_pick_crtcs(client, connectors, connector_count, crtcs, modes, 0, width, height); } mutex_unlock(&dev->mode_config.mutex); drm_client_modeset_release(client); for (i = 0; i < connector_count; i++) { const struct drm_display_mode *mode = modes[i]; struct drm_crtc *crtc = crtcs[i]; struct drm_client_offset *offset = &offsets[i]; if (mode && crtc) { struct drm_mode_set *modeset = 
drm_client_find_modeset(client, crtc); struct drm_connector *connector = connectors[i]; drm_dbg_kms(dev, "[CRTC:%d:%s] desired mode %s set (%d,%d)\n", crtc->base.id, crtc->name, mode->name, offset->x, offset->y); if (drm_WARN_ON_ONCE(dev, modeset->num_connectors == DRM_CLIENT_MAX_CLONED_CONNECTORS || (dev->mode_config.num_crtc > 1 && modeset->num_connectors == 1))) { ret = -EINVAL; break; } drm_mode_destroy(dev, modeset->mode); modeset->mode = drm_mode_duplicate(dev, mode); if (!modeset->mode) { ret = -ENOMEM; break; } drm_connector_get(connector); modeset->connectors[modeset->num_connectors++] = connector; modeset->x = offset->x; modeset->y = offset->y; } } mutex_unlock(&client->modeset_mutex); out: kfree(crtcs); modes_destroy(dev, modes, connector_count); kfree(modes); kfree(offsets); kfree(enabled); free_connectors: for (i = 0; i < connector_count; i++) drm_connector_put(connectors[i]); kfree(connectors); return ret; } EXPORT_SYMBOL(drm_client_modeset_probe); /** * drm_client_rotation() - Check the initial rotation value * @modeset: DRM modeset * @rotation: Returned rotation value * * This function checks if the primary plane in @modeset can hw rotate * to match the rotation needed on its connector. * * Note: Currently only 0 and 180 degrees are supported. * * Return: * True if the plane can do the rotation, false otherwise. */ bool drm_client_rotation(struct drm_mode_set *modeset, unsigned int *rotation) { struct drm_connector *connector = modeset->connectors[0]; struct drm_plane *plane = modeset->crtc->primary; struct drm_cmdline_mode *cmdline; u64 valid_mask = 0; int i; if (!modeset->num_connectors) return false; switch (connector->display_info.panel_orientation) { case DRM_MODE_PANEL_ORIENTATION_BOTTOM_UP: *rotation = DRM_MODE_ROTATE_180; break; case DRM_MODE_PANEL_ORIENTATION_LEFT_UP: *rotation = DRM_MODE_ROTATE_90; break; case DRM_MODE_PANEL_ORIENTATION_RIGHT_UP: *rotation = DRM_MODE_ROTATE_270; break; default: *rotation = DRM_MODE_ROTATE_0; } /* * The panel already defined the default rotation * through its orientation. Whatever has been provided * on the command line needs to be added to that. * * Unfortunately, the rotations are at different bit * indices, so the math to add them up is not as * trivial as it could be. For example, ROTATE_180 (bit index 2) * on the panel plus ROTATE_90 (bit index 1) on the command line * sums to index 3, i.e. DRM_MODE_ROTATE_270. * * Reflections on the other hand are pretty trivial to deal with, a * simple XOR between the two handles the addition nicely. */ cmdline = &connector->cmdline_mode; if (cmdline->specified && cmdline->rotation_reflection) { unsigned int cmdline_rest, panel_rest; unsigned int cmdline_rot, panel_rot; unsigned int sum_rot, sum_rest; panel_rot = ilog2(*rotation & DRM_MODE_ROTATE_MASK); cmdline_rot = ilog2(cmdline->rotation_reflection & DRM_MODE_ROTATE_MASK); sum_rot = (panel_rot + cmdline_rot) % 4; panel_rest = *rotation & ~DRM_MODE_ROTATE_MASK; cmdline_rest = cmdline->rotation_reflection & ~DRM_MODE_ROTATE_MASK; sum_rest = panel_rest ^ cmdline_rest; *rotation = (1 << sum_rot) | sum_rest; } /* * TODO: support 90 / 270 degree hardware rotation, * depending on the hardware this may require the framebuffer * to be in a specific tiling format.
*/ if (((*rotation & DRM_MODE_ROTATE_MASK) != DRM_MODE_ROTATE_0 && (*rotation & DRM_MODE_ROTATE_MASK) != DRM_MODE_ROTATE_180) || !plane->rotation_property) return false; for (i = 0; i < plane->rotation_property->num_values; i++) valid_mask |= (1ULL << plane->rotation_property->values[i]); if (!(*rotation & valid_mask)) return false; return true; } EXPORT_SYMBOL(drm_client_rotation); static int drm_client_modeset_commit_atomic(struct drm_client_dev *client, bool active, bool check) { struct drm_device *dev = client->dev; struct drm_plane *plane; struct drm_atomic_state *state; struct drm_modeset_acquire_ctx ctx; struct drm_mode_set *mode_set; int ret; drm_modeset_acquire_init(&ctx, 0); state = drm_atomic_state_alloc(dev); if (!state) { ret = -ENOMEM; goto out_ctx; } state->acquire_ctx = &ctx; retry: drm_for_each_plane(plane, dev) { struct drm_plane_state *plane_state; plane_state = drm_atomic_get_plane_state(state, plane); if (IS_ERR(plane_state)) { ret = PTR_ERR(plane_state); goto out_state; } plane_state->rotation = DRM_MODE_ROTATE_0; /* disable non-primary: */ if (plane->type == DRM_PLANE_TYPE_PRIMARY) continue; ret = __drm_atomic_helper_disable_plane(plane, plane_state); if (ret != 0) goto out_state; } drm_client_for_each_modeset(mode_set, client) { struct drm_plane *primary = mode_set->crtc->primary; unsigned int rotation; if (drm_client_rotation(mode_set, &rotation)) { struct drm_plane_state *plane_state; /* Cannot fail as we've already gotten the plane state above */ plane_state = drm_atomic_get_new_plane_state(state, primary); plane_state->rotation = rotation; } ret = __drm_atomic_helper_set_config(mode_set, state); if (ret != 0) goto out_state; /* * __drm_atomic_helper_set_config() sets active when a * mode is set, unconditionally clear it if we force DPMS off */ if (!active) { struct drm_crtc *crtc = mode_set->crtc; struct drm_crtc_state *crtc_state = drm_atomic_get_new_crtc_state(state, crtc); crtc_state->active = false; } } if (check) ret = drm_atomic_check_only(state); else ret = drm_atomic_commit(state); out_state: if (ret == -EDEADLK) goto backoff; drm_atomic_state_put(state); out_ctx: drm_modeset_drop_locks(&ctx); drm_modeset_acquire_fini(&ctx); return ret; backoff: drm_atomic_state_clear(state); drm_modeset_backoff(&ctx); goto retry; } static int drm_client_modeset_commit_legacy(struct drm_client_dev *client) { struct drm_device *dev = client->dev; struct drm_mode_set *mode_set; struct drm_plane *plane; int ret = 0; drm_modeset_lock_all(dev); drm_for_each_plane(plane, dev) { if (plane->type != DRM_PLANE_TYPE_PRIMARY) drm_plane_force_disable(plane); if (plane->rotation_property) drm_mode_plane_set_obj_prop(plane, plane->rotation_property, DRM_MODE_ROTATE_0); } drm_client_for_each_modeset(mode_set, client) { struct drm_crtc *crtc = mode_set->crtc; if (crtc->funcs->cursor_set2) { ret = crtc->funcs->cursor_set2(crtc, NULL, 0, 0, 0, 0, 0); if (ret) goto out; } else if (crtc->funcs->cursor_set) { ret = crtc->funcs->cursor_set(crtc, NULL, 0, 0, 0); if (ret) goto out; } ret = drm_mode_set_config_internal(mode_set); if (ret) goto out; } out: drm_modeset_unlock_all(dev); return ret; } /** * drm_client_modeset_check() - Check modeset configuration * @client: DRM client * * Check modeset configuration. * * Returns: * Zero on success or negative error code on failure. 
*/ int drm_client_modeset_check(struct drm_client_dev *client) { int ret; if (!drm_drv_uses_atomic_modeset(client->dev)) return 0; mutex_lock(&client->modeset_mutex); ret = drm_client_modeset_commit_atomic(client, true, true); mutex_unlock(&client->modeset_mutex); return ret; } EXPORT_SYMBOL(drm_client_modeset_check); /** * drm_client_modeset_commit_locked() - Force commit CRTC configuration * @client: DRM client * * Commit modeset configuration to crtcs without checking if there is a DRM * master. The assumption is that the caller already holds an internal DRM * master reference acquired with drm_master_internal_acquire(). * * Returns: * Zero on success or negative error code on failure. */ int drm_client_modeset_commit_locked(struct drm_client_dev *client) { struct drm_device *dev = client->dev; int ret; mutex_lock(&client->modeset_mutex); if (drm_drv_uses_atomic_modeset(dev)) ret = drm_client_modeset_commit_atomic(client, true, false); else ret = drm_client_modeset_commit_legacy(client); mutex_unlock(&client->modeset_mutex); return ret; } EXPORT_SYMBOL(drm_client_modeset_commit_locked); /** * drm_client_modeset_commit() - Commit CRTC configuration * @client: DRM client * * Commit modeset configuration to crtcs. * * Returns: * Zero on success or negative error code on failure. */ int drm_client_modeset_commit(struct drm_client_dev *client) { struct drm_device *dev = client->dev; int ret; if (!drm_master_internal_acquire(dev)) return -EBUSY; ret = drm_client_modeset_commit_locked(client); drm_master_internal_release(dev); return ret; } EXPORT_SYMBOL(drm_client_modeset_commit); static void drm_client_modeset_dpms_legacy(struct drm_client_dev *client, int dpms_mode) { struct drm_device *dev = client->dev; struct drm_connector *connector; struct drm_mode_set *modeset; struct drm_modeset_acquire_ctx ctx; int ret; DRM_MODESET_LOCK_ALL_BEGIN(dev, ctx, 0, ret); drm_client_for_each_modeset(modeset, client) { int j; if (!modeset->crtc->enabled) continue; for (j = 0; j < modeset->num_connectors; j++) { connector = modeset->connectors[j]; connector->funcs->dpms(connector, dpms_mode); drm_object_property_set_value(&connector->base, dev->mode_config.dpms_property, dpms_mode); } } DRM_MODESET_LOCK_ALL_END(dev, ctx, ret); } /** * drm_client_modeset_dpms() - Set DPMS mode * @client: DRM client * @mode: DPMS mode * * Note: For atomic drivers @mode is reduced to on/off. * * Returns: * Zero on success or negative error code on failure. */ int drm_client_modeset_dpms(struct drm_client_dev *client, int mode) { struct drm_device *dev = client->dev; int ret = 0; if (!drm_master_internal_acquire(dev)) return -EBUSY; mutex_lock(&client->modeset_mutex); if (drm_drv_uses_atomic_modeset(dev)) ret = drm_client_modeset_commit_atomic(client, mode == DRM_MODE_DPMS_ON, false); else drm_client_modeset_dpms_legacy(client, mode); mutex_unlock(&client->modeset_mutex); drm_master_internal_release(dev); return ret; } EXPORT_SYMBOL(drm_client_modeset_dpms); #ifdef CONFIG_DRM_KUNIT_TEST #include "tests/drm_client_modeset_test.c" #endif
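Taken together, the exported entry points above compose into a short bring-up sequence for a DRM client: drm_client_modeset_probe() fills the client's modeset array, and drm_client_modeset_commit() applies it if an internal master reference can be taken. A hedged sketch; the example_ helper is hypothetical:

#include <drm/drm_client.h>

/* Sketch only: probe display pipes and commit them for a client. */
static int example_client_light_up(struct drm_client_dev *client)
{
	int ret;

	/* Width/height of 0 fall back to the device's max_width/max_height. */
	ret = drm_client_modeset_probe(client, 0, 0);
	if (ret)
		return ret;

	/* Returns -EBUSY if an internal master reference cannot be acquired. */
	return drm_client_modeset_commit(client);
}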
/* SPDX-License-Identifier: GPL-2.0-or-later */ /* audit.h -- Auditing support * * Copyright 2003-2004 Red Hat Inc., Durham, North Carolina. * All Rights Reserved. * * Written by Rickard E.
(Rik) Faith <faith@redhat.com> */ #ifndef _LINUX_AUDIT_H_ #define _LINUX_AUDIT_H_ #include <linux/sched.h> #include <linux/ptrace.h> #include <linux/audit_arch.h> #include <uapi/linux/audit.h> #include <uapi/linux/netfilter/nf_tables.h> #include <uapi/linux/fanotify.h> #define AUDIT_INO_UNSET ((unsigned long)-1) #define AUDIT_DEV_UNSET ((dev_t)-1) struct audit_sig_info { uid_t uid; pid_t pid; char ctx[]; }; struct audit_buffer; struct audit_context; struct inode; struct netlink_skb_parms; struct path; struct linux_binprm; struct mq_attr; struct mqstat; struct audit_watch; struct audit_tree; struct sk_buff; struct kern_ipc_perm; struct audit_krule { u32 pflags; u32 flags; u32 listnr; u32 action; u32 mask[AUDIT_BITMASK_SIZE]; u32 buflen; /* for data alloc on list rules */ u32 field_count; char *filterkey; /* ties events to rules */ struct audit_field *fields; struct audit_field *arch_f; /* quick access to arch field */ struct audit_field *inode_f; /* quick access to an inode field */ struct audit_watch *watch; /* associated watch */ struct audit_tree *tree; /* associated watched tree */ struct audit_fsnotify_mark *exe; struct list_head rlist; /* entry in audit_{watch,tree}.rules list */ struct list_head list; /* for AUDIT_LIST* purposes only */ u64 prio; }; /* Flag to indicate legacy AUDIT_LOGINUID unset usage */ #define AUDIT_LOGINUID_LEGACY 0x1 struct audit_field { u32 type; union { u32 val; kuid_t uid; kgid_t gid; struct { char *lsm_str; void *lsm_rule; }; }; u32 op; }; enum audit_ntp_type { AUDIT_NTP_OFFSET, AUDIT_NTP_FREQ, AUDIT_NTP_STATUS, AUDIT_NTP_TAI, AUDIT_NTP_TICK, AUDIT_NTP_ADJUST, AUDIT_NTP_NVALS /* count */ }; #ifdef CONFIG_AUDITSYSCALL struct audit_ntp_val { long long oldval, newval; }; struct audit_ntp_data { struct audit_ntp_val vals[AUDIT_NTP_NVALS]; }; #else struct audit_ntp_data {}; #endif enum audit_nfcfgop { AUDIT_XT_OP_REGISTER, AUDIT_XT_OP_REPLACE, AUDIT_XT_OP_UNREGISTER, AUDIT_NFT_OP_TABLE_REGISTER, AUDIT_NFT_OP_TABLE_UNREGISTER, AUDIT_NFT_OP_CHAIN_REGISTER, AUDIT_NFT_OP_CHAIN_UNREGISTER, AUDIT_NFT_OP_RULE_REGISTER, AUDIT_NFT_OP_RULE_UNREGISTER, AUDIT_NFT_OP_SET_REGISTER, AUDIT_NFT_OP_SET_UNREGISTER, AUDIT_NFT_OP_SETELEM_REGISTER, AUDIT_NFT_OP_SETELEM_UNREGISTER, AUDIT_NFT_OP_GEN_REGISTER, AUDIT_NFT_OP_OBJ_REGISTER, AUDIT_NFT_OP_OBJ_UNREGISTER, AUDIT_NFT_OP_OBJ_RESET, AUDIT_NFT_OP_FLOWTABLE_REGISTER, AUDIT_NFT_OP_FLOWTABLE_UNREGISTER, AUDIT_NFT_OP_SETELEM_RESET, AUDIT_NFT_OP_RULE_RESET, AUDIT_NFT_OP_INVALID, }; extern int __init audit_register_class(int class, unsigned *list); extern int audit_classify_syscall(int abi, unsigned syscall); extern int audit_classify_arch(int arch); /* only for compat system calls */ extern unsigned compat_write_class[]; extern unsigned compat_read_class[]; extern unsigned compat_dir_class[]; extern unsigned compat_chattr_class[]; extern unsigned compat_signal_class[]; /* audit_names->type values */ #define AUDIT_TYPE_UNKNOWN 0 /* we don't know yet */ #define AUDIT_TYPE_NORMAL 1 /* a "normal" audit record */ #define AUDIT_TYPE_PARENT 2 /* a parent audit record */ #define AUDIT_TYPE_CHILD_DELETE 3 /* a child being deleted */ #define AUDIT_TYPE_CHILD_CREATE 4 /* a child being created */ /* maximized args number that audit_socketcall can process */ #define AUDITSC_ARGS 6 /* bit values for ->signal->audit_tty */ #define AUDIT_TTY_ENABLE BIT(0) #define AUDIT_TTY_LOG_PASSWD BIT(1) struct filename; #define AUDIT_OFF 0 #define AUDIT_ON 1 #define AUDIT_LOCKED 2 #ifdef CONFIG_AUDIT /* These are defined in audit.c */ /* Public API */ extern 
__printf(4, 5) void audit_log(struct audit_context *ctx, gfp_t gfp_mask, int type, const char *fmt, ...); extern struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask, int type); extern __printf(2, 3) void audit_log_format(struct audit_buffer *ab, const char *fmt, ...); extern void audit_log_end(struct audit_buffer *ab); extern bool audit_string_contains_control(const char *string, size_t len); extern void audit_log_n_hex(struct audit_buffer *ab, const unsigned char *buf, size_t len); extern void audit_log_n_string(struct audit_buffer *ab, const char *buf, size_t n); extern void audit_log_n_untrustedstring(struct audit_buffer *ab, const char *string, size_t n); extern void audit_log_untrustedstring(struct audit_buffer *ab, const char *string); extern void audit_log_d_path(struct audit_buffer *ab, const char *prefix, const struct path *path); extern void audit_log_key(struct audit_buffer *ab, char *key); extern void audit_log_path_denied(int type, const char *operation); extern void audit_log_lost(const char *message); extern int audit_log_task_context(struct audit_buffer *ab); extern void audit_log_task_info(struct audit_buffer *ab); extern int audit_update_lsm_rules(void); /* Private API (for audit.c only) */ extern int audit_rule_change(int type, int seq, void *data, size_t datasz); extern int audit_list_rules_send(struct sk_buff *request_skb, int seq); extern int audit_set_loginuid(kuid_t loginuid); static inline kuid_t audit_get_loginuid(struct task_struct *tsk) { return tsk->loginuid; } static inline unsigned int audit_get_sessionid(struct task_struct *tsk) { return tsk->sessionid; } extern u32 audit_enabled; extern int audit_signal_info(int sig, struct task_struct *t); #else /* CONFIG_AUDIT */ static inline __printf(4, 5) void audit_log(struct audit_context *ctx, gfp_t gfp_mask, int type, const char *fmt, ...) { } static inline struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask, int type) { return NULL; } static inline __printf(2, 3) void audit_log_format(struct audit_buffer *ab, const char *fmt, ...) 
{ } static inline void audit_log_end(struct audit_buffer *ab) { } static inline void audit_log_n_hex(struct audit_buffer *ab, const unsigned char *buf, size_t len) { } static inline void audit_log_n_string(struct audit_buffer *ab, const char *buf, size_t n) { } static inline void audit_log_n_untrustedstring(struct audit_buffer *ab, const char *string, size_t n) { } static inline void audit_log_untrustedstring(struct audit_buffer *ab, const char *string) { } static inline void audit_log_d_path(struct audit_buffer *ab, const char *prefix, const struct path *path) { } static inline void audit_log_key(struct audit_buffer *ab, char *key) { } static inline void audit_log_path_denied(int type, const char *operation) { } static inline int audit_log_task_context(struct audit_buffer *ab) { return 0; } static inline void audit_log_task_info(struct audit_buffer *ab) { } static inline kuid_t audit_get_loginuid(struct task_struct *tsk) { return INVALID_UID; } static inline unsigned int audit_get_sessionid(struct task_struct *tsk) { return AUDIT_SID_UNSET; } #define audit_enabled AUDIT_OFF static inline int audit_signal_info(int sig, struct task_struct *t) { return 0; } #endif /* CONFIG_AUDIT */ #ifdef CONFIG_AUDIT_COMPAT_GENERIC #define audit_is_compat(arch) (!((arch) & __AUDIT_ARCH_64BIT)) #else #define audit_is_compat(arch) false #endif #define AUDIT_INODE_PARENT 1 /* dentry represents the parent */ #define AUDIT_INODE_HIDDEN 2 /* audit record should be hidden */ #define AUDIT_INODE_NOEVAL 4 /* audit record incomplete */ #ifdef CONFIG_AUDITSYSCALL #include <asm/syscall.h> /* for syscall_get_arch() */ /* These are defined in auditsc.c */ /* Public API */ extern int audit_alloc(struct task_struct *task); extern void __audit_free(struct task_struct *task); extern void __audit_uring_entry(u8 op); extern void __audit_uring_exit(int success, long code); extern void __audit_syscall_entry(int major, unsigned long a0, unsigned long a1, unsigned long a2, unsigned long a3); extern void __audit_syscall_exit(int ret_success, long ret_value); extern struct filename *__audit_reusename(const __user char *uptr); extern void __audit_getname(struct filename *name); extern void __audit_inode(struct filename *name, const struct dentry *dentry, unsigned int flags); extern void __audit_file(const struct file *); extern void __audit_inode_child(struct inode *parent, const struct dentry *dentry, const unsigned char type); extern void audit_seccomp(unsigned long syscall, long signr, int code); extern void audit_seccomp_actions_logged(const char *names, const char *old_names, int res); extern void __audit_ptrace(struct task_struct *t); static inline void audit_set_context(struct task_struct *task, struct audit_context *ctx) { task->audit_context = ctx; } static inline struct audit_context *audit_context(void) { return current->audit_context; } static inline bool audit_dummy_context(void) { void *p = audit_context(); return !p || *(int *)p; } static inline void audit_free(struct task_struct *task) { if (unlikely(task->audit_context)) __audit_free(task); } static inline void audit_uring_entry(u8 op) { /* * We intentionally check audit_context() before audit_enabled as most * Linux systems (as of ~2021) rely on systemd which forces audit to * be enabled regardless of the user's audit configuration. 
*/ if (unlikely(audit_context() && audit_enabled)) __audit_uring_entry(op); } static inline void audit_uring_exit(int success, long code) { if (unlikely(audit_context())) __audit_uring_exit(success, code); } static inline void audit_syscall_entry(int major, unsigned long a0, unsigned long a1, unsigned long a2, unsigned long a3) { if (unlikely(audit_context())) __audit_syscall_entry(major, a0, a1, a2, a3); } static inline void audit_syscall_exit(void *pt_regs) { if (unlikely(audit_context())) { int success = is_syscall_success(pt_regs); long return_code = regs_return_value(pt_regs); __audit_syscall_exit(success, return_code); } } static inline struct filename *audit_reusename(const __user char *name) { if (unlikely(!audit_dummy_context())) return __audit_reusename(name); return NULL; } static inline void audit_getname(struct filename *name) { if (unlikely(!audit_dummy_context())) __audit_getname(name); } static inline void audit_inode(struct filename *name, const struct dentry *dentry, unsigned int aflags) { if (unlikely(!audit_dummy_context())) __audit_inode(name, dentry, aflags); } static inline void audit_file(struct file *file) { if (unlikely(!audit_dummy_context())) __audit_file(file); } static inline void audit_inode_parent_hidden(struct filename *name, const struct dentry *dentry) { if (unlikely(!audit_dummy_context())) __audit_inode(name, dentry, AUDIT_INODE_PARENT | AUDIT_INODE_HIDDEN); } static inline void audit_inode_child(struct inode *parent, const struct dentry *dentry, const unsigned char type) { if (unlikely(!audit_dummy_context())) __audit_inode_child(parent, dentry, type); } void audit_core_dumps(long signr); static inline void audit_ptrace(struct task_struct *t) { if (unlikely(!audit_dummy_context())) __audit_ptrace(t); } /* Private API (for audit.c only) */ extern void __audit_ipc_obj(struct kern_ipc_perm *ipcp); extern void __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, umode_t mode); extern void __audit_bprm(struct linux_binprm *bprm); extern int __audit_socketcall(int nargs, unsigned long *args); extern int __audit_sockaddr(int len, void *addr); extern void __audit_fd_pair(int fd1, int fd2); extern void __audit_mq_open(int oflag, umode_t mode, struct mq_attr *attr); extern void __audit_mq_sendrecv(mqd_t mqdes, size_t msg_len, unsigned int msg_prio, const struct timespec64 *abs_timeout); extern void __audit_mq_notify(mqd_t mqdes, const struct sigevent *notification); extern void __audit_mq_getsetattr(mqd_t mqdes, struct mq_attr *mqstat); extern int __audit_log_bprm_fcaps(struct linux_binprm *bprm, const struct cred *new, const struct cred *old); extern void __audit_log_capset(const struct cred *new, const struct cred *old); extern void __audit_mmap_fd(int fd, int flags); extern void __audit_openat2_how(struct open_how *how); extern void __audit_log_kern_module(const char *name); extern void __audit_fanotify(u32 response, struct fanotify_response_info_audit_rule *friar); extern void __audit_tk_injoffset(struct timespec64 offset); extern void __audit_ntp_log(const struct audit_ntp_data *ad); extern void __audit_log_nfcfg(const char *name, u8 af, unsigned int nentries, enum audit_nfcfgop op, gfp_t gfp); static inline void audit_ipc_obj(struct kern_ipc_perm *ipcp) { if (unlikely(!audit_dummy_context())) __audit_ipc_obj(ipcp); } static inline void audit_fd_pair(int fd1, int fd2) { if (unlikely(!audit_dummy_context())) __audit_fd_pair(fd1, fd2); } static inline void audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, umode_t mode) { if 
(unlikely(!audit_dummy_context())) __audit_ipc_set_perm(qbytes, uid, gid, mode); } static inline void audit_bprm(struct linux_binprm *bprm) { if (unlikely(!audit_dummy_context())) __audit_bprm(bprm); } static inline int audit_socketcall(int nargs, unsigned long *args) { if (unlikely(!audit_dummy_context())) return __audit_socketcall(nargs, args); return 0; } static inline int audit_socketcall_compat(int nargs, u32 *args) { unsigned long a[AUDITSC_ARGS]; int i; if (audit_dummy_context()) return 0; for (i = 0; i < nargs; i++) a[i] = (unsigned long)args[i]; return __audit_socketcall(nargs, a); } static inline int audit_sockaddr(int len, void *addr) { if (unlikely(!audit_dummy_context())) return __audit_sockaddr(len, addr); return 0; } static inline void audit_mq_open(int oflag, umode_t mode, struct mq_attr *attr) { if (unlikely(!audit_dummy_context())) __audit_mq_open(oflag, mode, attr); } static inline void audit_mq_sendrecv(mqd_t mqdes, size_t msg_len, unsigned int msg_prio, const struct timespec64 *abs_timeout) { if (unlikely(!audit_dummy_context())) __audit_mq_sendrecv(mqdes, msg_len, msg_prio, abs_timeout); } static inline void audit_mq_notify(mqd_t mqdes, const struct sigevent *notification) { if (unlikely(!audit_dummy_context())) __audit_mq_notify(mqdes, notification); } static inline void audit_mq_getsetattr(mqd_t mqdes, struct mq_attr *mqstat) { if (unlikely(!audit_dummy_context())) __audit_mq_getsetattr(mqdes, mqstat); } static inline int audit_log_bprm_fcaps(struct linux_binprm *bprm, const struct cred *new, const struct cred *old) { if (unlikely(!audit_dummy_context())) return __audit_log_bprm_fcaps(bprm, new, old); return 0; } static inline void audit_log_capset(const struct cred *new, const struct cred *old) { if (unlikely(!audit_dummy_context())) __audit_log_capset(new, old); } static inline void audit_mmap_fd(int fd, int flags) { if (unlikely(!audit_dummy_context())) __audit_mmap_fd(fd, flags); } static inline void audit_openat2_how(struct open_how *how) { if (unlikely(!audit_dummy_context())) __audit_openat2_how(how); } static inline void audit_log_kern_module(const char *name) { if (!audit_dummy_context()) __audit_log_kern_module(name); } static inline void audit_fanotify(u32 response, struct fanotify_response_info_audit_rule *friar) { if (!audit_dummy_context()) __audit_fanotify(response, friar); } static inline void audit_tk_injoffset(struct timespec64 offset) { /* ignore no-op events */ if (offset.tv_sec == 0 && offset.tv_nsec == 0) return; if (!audit_dummy_context()) __audit_tk_injoffset(offset); } static inline void audit_ntp_init(struct audit_ntp_data *ad) { memset(ad, 0, sizeof(*ad)); } static inline void audit_ntp_set_old(struct audit_ntp_data *ad, enum audit_ntp_type type, long long val) { ad->vals[type].oldval = val; } static inline void audit_ntp_set_new(struct audit_ntp_data *ad, enum audit_ntp_type type, long long val) { ad->vals[type].newval = val; } static inline void audit_ntp_log(const struct audit_ntp_data *ad) { if (!audit_dummy_context()) __audit_ntp_log(ad); } static inline void audit_log_nfcfg(const char *name, u8 af, unsigned int nentries, enum audit_nfcfgop op, gfp_t gfp) { if (audit_enabled) __audit_log_nfcfg(name, af, nentries, op, gfp); } extern int audit_n_rules; extern int audit_signals; #else /* CONFIG_AUDITSYSCALL */ static inline int audit_alloc(struct task_struct *task) { return 0; } static inline void audit_free(struct task_struct *task) { } static inline void audit_uring_entry(u8 op) { } static inline void audit_uring_exit(int success, long 
code) { } static inline void audit_syscall_entry(int major, unsigned long a0, unsigned long a1, unsigned long a2, unsigned long a3) { } static inline void audit_syscall_exit(void *pt_regs) { } static inline bool audit_dummy_context(void) { return true; } static inline void audit_set_context(struct task_struct *task, struct audit_context *ctx) { } static inline struct audit_context *audit_context(void) { return NULL; } static inline struct filename *audit_reusename(const __user char *name) { return NULL; } static inline void audit_getname(struct filename *name) { } static inline void audit_inode(struct filename *name, const struct dentry *dentry, unsigned int aflags) { } static inline void audit_file(struct file *file) { } static inline void audit_inode_parent_hidden(struct filename *name, const struct dentry *dentry) { } static inline void audit_inode_child(struct inode *parent, const struct dentry *dentry, const unsigned char type) { } static inline void audit_core_dumps(long signr) { } static inline void audit_seccomp(unsigned long syscall, long signr, int code) { } static inline void audit_seccomp_actions_logged(const char *names, const char *old_names, int res) { } static inline void audit_ipc_obj(struct kern_ipc_perm *ipcp) { } static inline void audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, umode_t mode) { } static inline void audit_bprm(struct linux_binprm *bprm) { } static inline int audit_socketcall(int nargs, unsigned long *args) { return 0; } static inline int audit_socketcall_compat(int nargs, u32 *args) { return 0; } static inline void audit_fd_pair(int fd1, int fd2) { } static inline int audit_sockaddr(int len, void *addr) { return 0; } static inline void audit_mq_open(int oflag, umode_t mode, struct mq_attr *attr) { } static inline void audit_mq_sendrecv(mqd_t mqdes, size_t msg_len, unsigned int msg_prio, const struct timespec64 *abs_timeout) { } static inline void audit_mq_notify(mqd_t mqdes, const struct sigevent *notification) { } static inline void audit_mq_getsetattr(mqd_t mqdes, struct mq_attr *mqstat) { } static inline int audit_log_bprm_fcaps(struct linux_binprm *bprm, const struct cred *new, const struct cred *old) { return 0; } static inline void audit_log_capset(const struct cred *new, const struct cred *old) { } static inline void audit_mmap_fd(int fd, int flags) { } static inline void audit_openat2_how(struct open_how *how) { } static inline void audit_log_kern_module(const char *name) { } static inline void audit_fanotify(u32 response, struct fanotify_response_info_audit_rule *friar) { } static inline void audit_tk_injoffset(struct timespec64 offset) { } static inline void audit_ntp_init(struct audit_ntp_data *ad) { } static inline void audit_ntp_set_old(struct audit_ntp_data *ad, enum audit_ntp_type type, long long val) { } static inline void audit_ntp_set_new(struct audit_ntp_data *ad, enum audit_ntp_type type, long long val) { } static inline void audit_ntp_log(const struct audit_ntp_data *ad) { } static inline void audit_ptrace(struct task_struct *t) { } static inline void audit_log_nfcfg(const char *name, u8 af, unsigned int nentries, enum audit_nfcfgop op, gfp_t gfp) { } #define audit_n_rules 0 #define audit_signals 0 #endif /* CONFIG_AUDITSYSCALL */ static inline bool audit_loginuid_set(struct task_struct *tsk) { return uid_valid(audit_get_loginuid(tsk)); } #endif
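For reference, the logging API declared above is used in two patterns: a one-shot audit_log() call, or an audit_log_start()/audit_log_format()/audit_log_end() sequence when a record is assembled from several fields. A hedged sketch, with AUDIT_KERNEL used only as a stand-in record type (real callers pick a specific type from uapi/linux/audit.h) and a hypothetical function name:

static void example_emit_audit_record(int fd)
{
	struct audit_buffer *ab;

	/* One-shot: allocate, format and queue the record in a single call. */
	audit_log(audit_context(), GFP_KERNEL, AUDIT_KERNEL,
		  "example op=open fd=%d", fd);

	/* Multi-step: start a buffer, append fields, then queue it. */
	ab = audit_log_start(audit_context(), GFP_KERNEL, AUDIT_KERNEL);
	if (!ab)
		return; /* auditing disabled or allocation failed */
	audit_log_format(ab, "example op=close fd=%d", fd);
	audit_log_end(ab);
}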
// SPDX-License-Identifier: GPL-2.0 /* * xfrm6_input.c: based on net/ipv4/xfrm4_input.c * * Authors: * Mitsuru KANDA @USAGI * Kazunori MIYAZAWA @USAGI * Kunihiro Ishiguro <kunihiro@ipinfusion.com> * YOSHIFUJI Hideaki @USAGI * IPv6 support */ #include <linux/module.h> #include <linux/string.h> #include <linux/netfilter.h> #include <linux/netfilter_ipv6.h> #include <net/ipv6.h> #include <net/xfrm.h> #include <net/protocol.h> #include <net/gro.h> int xfrm6_rcv_spi(struct sk_buff *skb, int nexthdr, __be32 spi, struct ip6_tnl *t) { XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6 = t; XFRM_SPI_SKB_CB(skb)->family = AF_INET6; XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct ipv6hdr, daddr); return xfrm_input(skb, nexthdr, spi, 0); } EXPORT_SYMBOL(xfrm6_rcv_spi); static int xfrm6_transport_finish2(struct net *net, struct sock *sk, struct sk_buff *skb) { if (xfrm_trans_queue(skb, ip6_rcv_finish)) { kfree_skb(skb); return NET_RX_DROP; } return 0; } int xfrm6_transport_finish(struct sk_buff *skb, int async) { struct xfrm_offload *xo = xfrm_offload(skb); int nhlen = -skb_network_offset(skb); skb_network_header(skb)[IP6CB(skb)->nhoff] = XFRM_MODE_SKB_CB(skb)->protocol; #ifndef CONFIG_NETFILTER if (!async) return 1; #endif __skb_push(skb, nhlen); ipv6_hdr(skb)->payload_len = htons(skb->len - sizeof(struct ipv6hdr)); skb_postpush_rcsum(skb, skb_network_header(skb), nhlen); if (xo && (xo->flags & XFRM_GRO)) { /* The full l2 header needs to be preserved so that re-injecting the packet at l2 * works correctly in the presence of vlan tags. */ skb_mac_header_rebuild_full(skb, xo->orig_mac_len); skb_reset_network_header(skb); skb_reset_transport_header(skb); return 0; } NF_HOOK(NFPROTO_IPV6, NF_INET_PRE_ROUTING, dev_net(skb->dev), NULL, skb, skb->dev, NULL, xfrm6_transport_finish2); return 0; } static int __xfrm6_udp_encap_rcv(struct sock *sk, struct sk_buff *skb, bool pull) { struct udp_sock *up = udp_sk(sk); struct udphdr *uh; struct ipv6hdr *ip6h; int len; int ip6hlen = sizeof(struct ipv6hdr); __u8 *udpdata; __be32 *udpdata32; u16 encap_type; encap_type = READ_ONCE(up->encap_type); /* if this is not encapsulated socket, then just return now */ if (!encap_type) return 1; /* If this is a paged skb, make sure we pull up * whatever data we need to look at.
*/ len = skb->len - sizeof(struct udphdr); if (!pskb_may_pull(skb, sizeof(struct udphdr) + min(len, 8))) return 1; /* Now we can get the pointers */ uh = udp_hdr(skb); udpdata = (__u8 *)uh + sizeof(struct udphdr); udpdata32 = (__be32 *)udpdata; switch (encap_type) { default: case UDP_ENCAP_ESPINUDP: /* Check if this is a keepalive packet. If so, eat it. */ if (len == 1 && udpdata[0] == 0xff) { return -EINVAL; } else if (len > sizeof(struct ip_esp_hdr) && udpdata32[0] != 0) { /* ESP Packet without Non-ESP header */ len = sizeof(struct udphdr); } else /* Must be an IKE packet.. pass it through */ return 1; break; } /* At this point we are sure that this is an ESPinUDP packet, * so we need to remove 'len' bytes from the packet (the UDP * header and optional ESP marker bytes) and then modify the * protocol to ESP, and then call into the transform receiver. */ if (skb_unclone(skb, GFP_ATOMIC)) return -EINVAL; /* Now we can update and verify the packet length... */ ip6h = ipv6_hdr(skb); ip6h->payload_len = htons(ntohs(ip6h->payload_len) - len); if (skb->len < ip6hlen + len) { /* packet is too small!?! */ return -EINVAL; } /* pull the data buffer up to the ESP header and set the * transport header to point to ESP. Keep UDP on the stack * for later. */ if (pull) { __skb_pull(skb, len); skb_reset_transport_header(skb); } else { skb_set_transport_header(skb, len); } /* process ESP */ return 0; } /* If it's a keepalive packet, then just eat it. * If it's an encapsulated packet, then pass it to the * IPsec xfrm input. * Returns 0 if skb passed to xfrm or was dropped. * Returns >0 if skb should be passed to UDP. * Returns <0 if skb should be resubmitted (-ret is protocol) */ int xfrm6_udp_encap_rcv(struct sock *sk, struct sk_buff *skb) { int ret; if (skb->protocol == htons(ETH_P_IP)) return xfrm4_udp_encap_rcv(sk, skb); ret = __xfrm6_udp_encap_rcv(sk, skb, true); if (!ret) return xfrm6_rcv_encap(skb, IPPROTO_ESP, 0, udp_sk(sk)->encap_type); if (ret < 0) { kfree_skb(skb); return 0; } return ret; } struct sk_buff *xfrm6_gro_udp_encap_rcv(struct sock *sk, struct list_head *head, struct sk_buff *skb) { int offset = skb_gro_offset(skb); const struct net_offload *ops; struct sk_buff *pp = NULL; int len, dlen; __u8 *udpdata; __be32 *udpdata32; if (skb->protocol == htons(ETH_P_IP)) return xfrm4_gro_udp_encap_rcv(sk, head, skb); len = skb->len - offset; dlen = offset + min(len, 8); udpdata = skb_gro_header(skb, dlen, offset); udpdata32 = (__be32 *)udpdata; if (unlikely(!udpdata)) return NULL; rcu_read_lock(); ops = rcu_dereference(inet6_offloads[IPPROTO_ESP]); if (!ops || !ops->callbacks.gro_receive) goto out; /* check if it is a keepalive or IKE packet */ if (len <= sizeof(struct ip_esp_hdr) || udpdata32[0] == 0) goto out; /* set the transport header to ESP */ skb_set_transport_header(skb, offset); NAPI_GRO_CB(skb)->proto = IPPROTO_UDP; pp = call_gro_receive(ops->callbacks.gro_receive, head, skb); rcu_read_unlock(); return pp; out: rcu_read_unlock(); NAPI_GRO_CB(skb)->same_flow = 0; NAPI_GRO_CB(skb)->flush = 1; return NULL; } int xfrm6_rcv_tnl(struct sk_buff *skb, struct ip6_tnl *t) { return xfrm6_rcv_spi(skb, skb_network_header(skb)[IP6CB(skb)->nhoff], 0, t); } EXPORT_SYMBOL(xfrm6_rcv_tnl); int xfrm6_rcv(struct sk_buff *skb) { return xfrm6_rcv_tnl(skb, NULL); } EXPORT_SYMBOL(xfrm6_rcv); int xfrm6_input_addr(struct sk_buff *skb, xfrm_address_t *daddr, xfrm_address_t *saddr, u8 proto) { struct net *net = dev_net(skb->dev); struct xfrm_state *x = NULL; struct sec_path *sp; int i = 0; sp = secpath_set(skb); if 
(!sp) { XFRM_INC_STATS(net, LINUX_MIB_XFRMINERROR); goto drop; } if (1 + sp->len == XFRM_MAX_DEPTH) { XFRM_INC_STATS(net, LINUX_MIB_XFRMINBUFFERERROR); goto drop; } for (i = 0; i < 3; i++) { xfrm_address_t *dst, *src; switch (i) { case 0: dst = daddr; src = saddr; break; case 1: /* lookup state with wild-card source address */ dst = daddr; src = (xfrm_address_t *)&in6addr_any; break; default: /* lookup state with wild-card addresses */ dst = (xfrm_address_t *)&in6addr_any; src = (xfrm_address_t *)&in6addr_any; break; } x = xfrm_state_lookup_byaddr(net, skb->mark, dst, src, proto, AF_INET6); if (!x) continue; if (unlikely(x->dir && x->dir != XFRM_SA_DIR_IN)) { XFRM_INC_STATS(net, LINUX_MIB_XFRMINSTATEDIRERROR); xfrm_state_put(x); x = NULL; continue; } spin_lock(&x->lock); if ((!i || (x->props.flags & XFRM_STATE_WILDRECV)) && likely(x->km.state == XFRM_STATE_VALID) && !xfrm_state_check_expire(x)) { spin_unlock(&x->lock); if (x->type->input(x, skb) > 0) { /* found a valid state */ break; } } else spin_unlock(&x->lock); xfrm_state_put(x); x = NULL; } if (!x) { XFRM_INC_STATS(net, LINUX_MIB_XFRMINNOSTATES); xfrm_audit_state_notfound_simple(skb, AF_INET6); goto drop; } sp->xvec[sp->len++] = x; spin_lock(&x->lock); x->curlft.bytes += skb->len; x->curlft.packets++; spin_unlock(&x->lock); return 1; drop: return -1; } EXPORT_SYMBOL(xfrm6_input_addr);
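The encap receive path above is only reached once userspace flags a UDP socket as an encapsulation socket. A hedged userspace sketch (not from this file) of how an IKE daemon would arm it; after this setsockopt() the kernel eats single-byte 0xff NAT-T keepalives, decapsulates ESP-in-UDP itself, and hands only genuine IKE messages back to the socket:

#include <sys/socket.h>
#include <netinet/in.h>
#include <linux/udp.h>	/* UDP_ENCAP, UDP_ENCAP_ESPINUDP */

static int open_natt_socket(void)
{
	int encap = UDP_ENCAP_ESPINUDP;
	int fd = socket(AF_INET6, SOCK_DGRAM, 0);

	if (fd < 0)
		return -1;
	/* Conventionally bound to UDP port 4500 for IKE NAT traversal. */
	if (setsockopt(fd, IPPROTO_UDP, UDP_ENCAP, &encap, sizeof(encap)) < 0)
		return -1;
	return fd;
}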
// SPDX-License-Identifier: GPL-2.0 /* erasure coding */ #include "bcachefs.h" #include "alloc_background.h" #include "alloc_foreground.h" #include "backpointers.h" #include "bkey_buf.h" #include "bset.h" #include "btree_gc.h" #include "btree_update.h" #include "btree_write_buffer.h" #include "buckets.h" #include "checksum.h" #include "disk_accounting.h" #include "disk_groups.h" #include "ec.h" #include "enumerated_ref.h" #include "error.h" #include "io_read.h" #include "io_write.h" #include "keylist.h" #include "lru.h" #include "recovery.h" #include "replicas.h" #include "super-io.h" #include "util.h" #include <linux/sort.h> #include <linux/string_choices.h> #ifdef __KERNEL__ #include <linux/raid/pq.h> #include <linux/raid/xor.h> static void raid5_recov(unsigned disks, unsigned failed_idx, size_t size, void **data) { unsigned i = 2, nr; BUG_ON(failed_idx >= disks); swap(data[0], data[failed_idx]); memcpy(data[0], data[1], size); while (i < disks) { nr = min_t(unsigned, disks - i, MAX_XOR_BLOCKS); xor_blocks(nr, size, data[0], data + i); i += nr; } swap(data[0], data[failed_idx]); } static void raid_gen(int nd, int np, size_t size, void **v) { if (np >= 1) raid5_recov(nd + np, nd, size, v); if (np >= 2) raid6_call.gen_syndrome(nd + np, size, v); BUG_ON(np > 2); } static void raid_rec(int nr, int *ir, int nd, int np, size_t size, void **v) { switch (nr) { case 0: break; case 1: if (ir[0] < nd + 1) raid5_recov(nd + 1, ir[0], size, v); else raid6_call.gen_syndrome(nd + np, size, v); break; case 2: if (ir[1] < nd) { /* data+data failure.
*/ raid6_2data_recov(nd + np, size, ir[0], ir[1], v); } else if (ir[0] < nd) { /* data + p/q failure */ if (ir[1] == nd) /* data + p failure */ raid6_datap_recov(nd + np, size, ir[0], v); else { /* data + q failure */ raid5_recov(nd + 1, ir[0], size, v); raid6_call.gen_syndrome(nd + np, size, v); } } else { raid_gen(nd, np, size, v); } break; default: BUG(); } } #else #include <raid/raid.h> #endif struct ec_bio { struct bch_dev *ca; struct ec_stripe_buf *buf; size_t idx; int rw; u64 submit_time; struct bio bio; }; /* Stripes btree keys: */ int bch2_stripe_validate(struct bch_fs *c, struct bkey_s_c k, struct bkey_validate_context from) { const struct bch_stripe *s = bkey_s_c_to_stripe(k).v; int ret = 0; bkey_fsck_err_on(bkey_eq(k.k->p, POS_MIN) || bpos_gt(k.k->p, POS(0, U32_MAX)), c, stripe_pos_bad, "stripe at bad pos"); bkey_fsck_err_on(bkey_val_u64s(k.k) < stripe_val_u64s(s), c, stripe_val_size_bad, "incorrect value size (%zu < %u)", bkey_val_u64s(k.k), stripe_val_u64s(s)); bkey_fsck_err_on(s->csum_granularity_bits >= 64, c, stripe_csum_granularity_bad, "invalid csum granularity (%u >= 64)", s->csum_granularity_bits); ret = bch2_bkey_ptrs_validate(c, k, from); fsck_err: return ret; } void bch2_stripe_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) { const struct bch_stripe *sp = bkey_s_c_to_stripe(k).v; struct bch_stripe s = {}; memcpy(&s, sp, min(sizeof(s), bkey_val_bytes(k.k))); unsigned nr_data = s.nr_blocks - s.nr_redundant; prt_printf(out, "algo %u sectors %u blocks %u:%u csum ", s.algorithm, le16_to_cpu(s.sectors), nr_data, s.nr_redundant); bch2_prt_csum_type(out, s.csum_type); prt_str(out, " gran "); if (s.csum_granularity_bits < 64) prt_printf(out, "%llu", 1ULL << s.csum_granularity_bits); else prt_printf(out, "(invalid shift %u)", s.csum_granularity_bits); if (s.disk_label) { prt_str(out, " label"); bch2_disk_path_to_text(out, c, s.disk_label - 1); } for (unsigned i = 0; i < s.nr_blocks; i++) { const struct bch_extent_ptr *ptr = sp->ptrs + i; if ((void *) ptr >= bkey_val_end(k)) break; prt_char(out, ' '); bch2_extent_ptr_to_text(out, c, ptr); if (s.csum_type < BCH_CSUM_NR && i < nr_data && stripe_blockcount_offset(&s, i) < bkey_val_bytes(k.k)) prt_printf(out, "#%u", stripe_blockcount_get(sp, i)); } } /* Triggers: */ static int __mark_stripe_bucket(struct btree_trans *trans, struct bch_dev *ca, struct bkey_s_c_stripe s, unsigned ptr_idx, bool deleting, struct bpos bucket, struct bch_alloc_v4 *a, enum btree_iter_update_trigger_flags flags) { const struct bch_extent_ptr *ptr = s.v->ptrs + ptr_idx; unsigned nr_data = s.v->nr_blocks - s.v->nr_redundant; bool parity = ptr_idx >= nr_data; enum bch_data_type data_type = parity ? BCH_DATA_parity : BCH_DATA_stripe; s64 sectors = parity ? 
le16_to_cpu(s.v->sectors) : 0; struct printbuf buf = PRINTBUF; int ret = 0; struct bch_fs *c = trans->c; if (deleting) sectors = -sectors; if (!deleting) { if (bch2_trans_inconsistent_on(a->stripe || a->stripe_redundancy, trans, "bucket %llu:%llu gen %u data type %s dirty_sectors %u: multiple stripes using same bucket (%u, %llu)\n%s", bucket.inode, bucket.offset, a->gen, bch2_data_type_str(a->data_type), a->dirty_sectors, a->stripe, s.k->p.offset, (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) { ret = bch_err_throw(c, mark_stripe); goto err; } if (bch2_trans_inconsistent_on(parity && bch2_bucket_sectors_total(*a), trans, "bucket %llu:%llu gen %u data type %s dirty_sectors %u cached_sectors %u: data already in parity bucket\n%s", bucket.inode, bucket.offset, a->gen, bch2_data_type_str(a->data_type), a->dirty_sectors, a->cached_sectors, (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) { ret = bch_err_throw(c, mark_stripe); goto err; } } else { if (bch2_trans_inconsistent_on(a->stripe != s.k->p.offset || a->stripe_redundancy != s.v->nr_redundant, trans, "bucket %llu:%llu gen %u: not marked as stripe when deleting stripe (got %u)\n%s", bucket.inode, bucket.offset, a->gen, a->stripe, (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) { ret = bch_err_throw(c, mark_stripe); goto err; } if (bch2_trans_inconsistent_on(a->data_type != data_type, trans, "bucket %llu:%llu gen %u data type %s: wrong data type when stripe, should be %s\n%s", bucket.inode, bucket.offset, a->gen, bch2_data_type_str(a->data_type), bch2_data_type_str(data_type), (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) { ret = bch_err_throw(c, mark_stripe); goto err; } if (bch2_trans_inconsistent_on(parity && (a->dirty_sectors != -sectors || a->cached_sectors), trans, "bucket %llu:%llu gen %u dirty_sectors %u cached_sectors %u: wrong sectors when deleting parity block of stripe\n%s", bucket.inode, bucket.offset, a->gen, a->dirty_sectors, a->cached_sectors, (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) { ret = bch_err_throw(c, mark_stripe); goto err; } } if (sectors) { ret = bch2_bucket_ref_update(trans, ca, s.s_c, ptr, sectors, data_type, a->gen, a->data_type, &a->dirty_sectors); if (ret) goto err; } if (!deleting) { a->stripe = s.k->p.offset; a->stripe_redundancy = s.v->nr_redundant; alloc_data_type_set(a, data_type); } else { a->stripe = 0; a->stripe_redundancy = 0; alloc_data_type_set(a, BCH_DATA_user); } err: printbuf_exit(&buf); return ret; } static int mark_stripe_bucket(struct btree_trans *trans, struct bkey_s_c_stripe s, unsigned ptr_idx, bool deleting, enum btree_iter_update_trigger_flags flags) { struct bch_fs *c = trans->c; const struct bch_extent_ptr *ptr = s.v->ptrs + ptr_idx; struct printbuf buf = PRINTBUF; int ret = 0; struct bch_dev *ca = bch2_dev_tryget(c, ptr->dev); if (unlikely(!ca)) { if (ptr->dev != BCH_SB_MEMBER_INVALID && !(flags & BTREE_TRIGGER_overwrite)) ret = bch_err_throw(c, mark_stripe); goto err; } struct bpos bucket = PTR_BUCKET_POS(ca, ptr); if (flags & BTREE_TRIGGER_transactional) { struct extent_ptr_decoded p = { .ptr = *ptr, .crc = bch2_extent_crc_unpack(s.k, NULL), }; struct bkey_i_backpointer bp; bch2_extent_ptr_to_bp(c, BTREE_ID_stripes, 0, s.s_c, p, (const union bch_extent_entry *) ptr, &bp); struct bkey_i_alloc_v4 *a = bch2_trans_start_alloc_update(trans, bucket, 0); ret = PTR_ERR_OR_ZERO(a) ?: __mark_stripe_bucket(trans, ca, s, ptr_idx, deleting, bucket, &a->v, flags) ?: bch2_bucket_backpointer_mod(trans, s.s_c, &bp, !(flags & BTREE_TRIGGER_overwrite)); if (ret) goto err; } if 
(flags & BTREE_TRIGGER_gc) { struct bucket *g = gc_bucket(ca, bucket.offset); if (bch2_fs_inconsistent_on(!g, c, "reference to invalid bucket on device %u\n%s", ptr->dev, (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) { ret = bch_err_throw(c, mark_stripe); goto err; } bucket_lock(g); struct bch_alloc_v4 old = bucket_m_to_alloc(*g), new = old; ret = __mark_stripe_bucket(trans, ca, s, ptr_idx, deleting, bucket, &new, flags); alloc_to_bucket(g, new); bucket_unlock(g); if (!ret) ret = bch2_alloc_key_to_dev_counters(trans, ca, &old, &new, flags); } err: bch2_dev_put(ca); printbuf_exit(&buf); return ret; } static int mark_stripe_buckets(struct btree_trans *trans, struct bkey_s_c old, struct bkey_s_c new, enum btree_iter_update_trigger_flags flags) { const struct bch_stripe *old_s = old.k->type == KEY_TYPE_stripe ? bkey_s_c_to_stripe(old).v : NULL; const struct bch_stripe *new_s = new.k->type == KEY_TYPE_stripe ? bkey_s_c_to_stripe(new).v : NULL; BUG_ON(old_s && new_s && old_s->nr_blocks != new_s->nr_blocks); unsigned nr_blocks = new_s ? new_s->nr_blocks : old_s->nr_blocks; for (unsigned i = 0; i < nr_blocks; i++) { if (new_s && old_s && !memcmp(&new_s->ptrs[i], &old_s->ptrs[i], sizeof(new_s->ptrs[i]))) continue; if (new_s) { int ret = mark_stripe_bucket(trans, bkey_s_c_to_stripe(new), i, false, flags); if (ret) return ret; } if (old_s) { int ret = mark_stripe_bucket(trans, bkey_s_c_to_stripe(old), i, true, flags); if (ret) return ret; } } return 0; } int bch2_trigger_stripe(struct btree_trans *trans, enum btree_id btree, unsigned level, struct bkey_s_c old, struct bkey_s _new, enum btree_iter_update_trigger_flags flags) { struct bkey_s_c new = _new.s_c; struct bch_fs *c = trans->c; u64 idx = new.k->p.offset; const struct bch_stripe *old_s = old.k->type == KEY_TYPE_stripe ? bkey_s_c_to_stripe(old).v : NULL; const struct bch_stripe *new_s = new.k->type == KEY_TYPE_stripe ? 
bkey_s_c_to_stripe(new).v : NULL; if (unlikely(flags & BTREE_TRIGGER_check_repair)) return bch2_check_fix_ptrs(trans, btree, level, _new.s_c, flags); BUG_ON(new_s && old_s && (new_s->nr_blocks != old_s->nr_blocks || new_s->nr_redundant != old_s->nr_redundant)); if (flags & BTREE_TRIGGER_transactional) { int ret = bch2_lru_change(trans, BCH_LRU_STRIPE_FRAGMENTATION, idx, stripe_lru_pos(old_s), stripe_lru_pos(new_s)); if (ret) return ret; } if (flags & (BTREE_TRIGGER_transactional|BTREE_TRIGGER_gc)) { /* * If the pointers aren't changing, we don't need to do anything: */ if (new_s && old_s && new_s->nr_blocks == old_s->nr_blocks && new_s->nr_redundant == old_s->nr_redundant && !memcmp(old_s->ptrs, new_s->ptrs, new_s->nr_blocks * sizeof(struct bch_extent_ptr))) return 0; struct gc_stripe *gc = NULL; if (flags & BTREE_TRIGGER_gc) { gc = genradix_ptr_alloc(&c->gc_stripes, idx, GFP_KERNEL); if (!gc) { bch_err(c, "error allocating memory for gc_stripes, idx %llu", idx); return bch_err_throw(c, ENOMEM_mark_stripe); } /* * This will be wrong when we bring back runtime gc: we should * be unmarking the old key and then marking the new key * * Also: when we bring back runtime gc, locking */ gc->alive = true; gc->sectors = le16_to_cpu(new_s->sectors); gc->nr_blocks = new_s->nr_blocks; gc->nr_redundant = new_s->nr_redundant; for (unsigned i = 0; i < new_s->nr_blocks; i++) gc->ptrs[i] = new_s->ptrs[i]; /* * gc recalculates this field from stripe ptr * references: */ memset(gc->block_sectors, 0, sizeof(gc->block_sectors)); } if (new_s) { s64 sectors = (u64) le16_to_cpu(new_s->sectors) * new_s->nr_redundant; struct disk_accounting_pos acc; memset(&acc, 0, sizeof(acc)); acc.type = BCH_DISK_ACCOUNTING_replicas; bch2_bkey_to_replicas(&acc.replicas, new); int ret = bch2_disk_accounting_mod(trans, &acc, &sectors, 1, gc); if (ret) return ret; if (gc) unsafe_memcpy(&gc->r.e, &acc.replicas, replicas_entry_bytes(&acc.replicas), "VLA"); } if (old_s) { s64 sectors = -((s64) le16_to_cpu(old_s->sectors)) * old_s->nr_redundant; struct disk_accounting_pos acc; memset(&acc, 0, sizeof(acc)); acc.type = BCH_DISK_ACCOUNTING_replicas; bch2_bkey_to_replicas(&acc.replicas, old); int ret = bch2_disk_accounting_mod(trans, &acc, &sectors, 1, gc); if (ret) return ret; } int ret = mark_stripe_buckets(trans, old, new, flags); if (ret) return ret; } return 0; } /* returns blocknr in stripe that we matched: */ static const struct bch_extent_ptr *bkey_matches_stripe(struct bch_stripe *s, struct bkey_s_c k, unsigned *block) { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); unsigned i, nr_data = s->nr_blocks - s->nr_redundant; bkey_for_each_ptr(ptrs, ptr) for (i = 0; i < nr_data; i++) if (__bch2_ptr_matches_stripe(&s->ptrs[i], ptr, le16_to_cpu(s->sectors))) { *block = i; return ptr; } return NULL; } static bool extent_has_stripe_ptr(struct bkey_s_c k, u64 idx) { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const union bch_extent_entry *entry; bkey_extent_entry_for_each(ptrs, entry) if (extent_entry_type(entry) == BCH_EXTENT_ENTRY_stripe_ptr && entry->stripe_ptr.idx == idx) return true; return false; } /* Stripe bufs: */ static void ec_stripe_buf_exit(struct ec_stripe_buf *buf) { if (buf->key.k.type == KEY_TYPE_stripe) { struct bkey_i_stripe *s = bkey_i_to_stripe(&buf->key); unsigned i; for (i = 0; i < s->v.nr_blocks; i++) { kvfree(buf->data[i]); buf->data[i] = NULL; } } } /* XXX: this is a non-mempoolified memory allocation: */ static int ec_stripe_buf_init(struct bch_fs *c, struct ec_stripe_buf *buf, unsigned offset, unsigned size) { 
struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v; unsigned csum_granularity = 1U << v->csum_granularity_bits; unsigned end = offset + size; unsigned i; BUG_ON(end > le16_to_cpu(v->sectors)); offset = round_down(offset, csum_granularity); end = min_t(unsigned, le16_to_cpu(v->sectors), round_up(end, csum_granularity)); buf->offset = offset; buf->size = end - offset; memset(buf->valid, 0xFF, sizeof(buf->valid)); for (i = 0; i < v->nr_blocks; i++) { buf->data[i] = kvmalloc(buf->size << 9, GFP_KERNEL); if (!buf->data[i]) goto err; } return 0; err: ec_stripe_buf_exit(buf); return bch_err_throw(c, ENOMEM_stripe_buf); } /* Checksumming: */ static struct bch_csum ec_block_checksum(struct ec_stripe_buf *buf, unsigned block, unsigned offset) { struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v; unsigned csum_granularity = 1 << v->csum_granularity_bits; unsigned end = buf->offset + buf->size; unsigned len = min(csum_granularity, end - offset); BUG_ON(offset >= end); BUG_ON(offset < buf->offset); BUG_ON(offset & (csum_granularity - 1)); BUG_ON(offset + len != le16_to_cpu(v->sectors) && (len & (csum_granularity - 1))); return bch2_checksum(NULL, v->csum_type, null_nonce(), buf->data[block] + ((offset - buf->offset) << 9), len << 9); } static void ec_generate_checksums(struct ec_stripe_buf *buf) { struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v; unsigned i, j, csums_per_device = stripe_csums_per_device(v); if (!v->csum_type) return; BUG_ON(buf->offset); BUG_ON(buf->size != le16_to_cpu(v->sectors)); for (i = 0; i < v->nr_blocks; i++) for (j = 0; j < csums_per_device; j++) stripe_csum_set(v, i, j, ec_block_checksum(buf, i, j << v->csum_granularity_bits)); } static void ec_validate_checksums(struct bch_fs *c, struct ec_stripe_buf *buf) { struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v; unsigned csum_granularity = 1 << v->csum_granularity_bits; unsigned i; if (!v->csum_type) return; for (i = 0; i < v->nr_blocks; i++) { unsigned offset = buf->offset; unsigned end = buf->offset + buf->size; if (!test_bit(i, buf->valid)) continue; while (offset < end) { unsigned j = offset >> v->csum_granularity_bits; unsigned len = min(csum_granularity, end - offset); struct bch_csum want = stripe_csum_get(v, i, j); struct bch_csum got = ec_block_checksum(buf, i, offset); if (bch2_crc_cmp(want, got)) { struct bch_dev *ca = bch2_dev_tryget(c, v->ptrs[i].dev); if (ca) { struct printbuf err = PRINTBUF; prt_str(&err, "stripe "); bch2_csum_err_msg(&err, v->csum_type, want, got); prt_printf(&err, " for %ps at %u of\n ", (void *) _RET_IP_, i); bch2_bkey_val_to_text(&err, c, bkey_i_to_s_c(&buf->key)); bch_err_ratelimited(ca, "%s", err.buf); printbuf_exit(&err); bch2_io_error(ca, BCH_MEMBER_ERROR_checksum); } clear_bit(i, buf->valid); break; } offset += len; } } } /* Erasure coding: */ static void ec_generate_ec(struct ec_stripe_buf *buf) { struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v; unsigned nr_data = v->nr_blocks - v->nr_redundant; unsigned bytes = le16_to_cpu(v->sectors) << 9; raid_gen(nr_data, v->nr_redundant, bytes, buf->data); } static unsigned ec_nr_failed(struct ec_stripe_buf *buf) { struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v; return v->nr_blocks - bitmap_weight(buf->valid, v->nr_blocks); } static int ec_do_recov(struct bch_fs *c, struct ec_stripe_buf *buf) { struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v; unsigned i, failed[BCH_BKEY_PTRS_MAX], nr_failed = 0; unsigned nr_data = v->nr_blocks - v->nr_redundant; unsigned bytes = buf->size << 9; if (ec_nr_failed(buf) > 
v->nr_redundant) { bch_err_ratelimited(c, "error doing reconstruct read: unable to read enough blocks"); return -1; } for (i = 0; i < nr_data; i++) if (!test_bit(i, buf->valid)) failed[nr_failed++] = i; raid_rec(nr_failed, failed, nr_data, v->nr_redundant, bytes, buf->data); return 0; } /* IO: */ static void ec_block_endio(struct bio *bio) { struct ec_bio *ec_bio = container_of(bio, struct ec_bio, bio); struct bch_stripe *v = &bkey_i_to_stripe(&ec_bio->buf->key)->v; struct bch_extent_ptr *ptr = &v->ptrs[ec_bio->idx]; struct bch_dev *ca = ec_bio->ca; struct closure *cl = bio->bi_private; int rw = ec_bio->rw; unsigned ref = rw == READ ? BCH_DEV_READ_REF_ec_block : BCH_DEV_WRITE_REF_ec_block; bch2_account_io_completion(ca, bio_data_dir(bio), ec_bio->submit_time, !bio->bi_status); if (bio->bi_status) { bch_err_dev_ratelimited(ca, "erasure coding %s error: %s", str_write_read(bio_data_dir(bio)), bch2_blk_status_to_str(bio->bi_status)); clear_bit(ec_bio->idx, ec_bio->buf->valid); } int stale = dev_ptr_stale(ca, ptr); if (stale) { bch_err_ratelimited(ca->fs, "error %s stripe: stale/invalid pointer (%i) after io", bio_data_dir(bio) == READ ? "reading from" : "writing to", stale); clear_bit(ec_bio->idx, ec_bio->buf->valid); } bio_put(&ec_bio->bio); enumerated_ref_put(&ca->io_ref[rw], ref); closure_put(cl); } static void ec_block_io(struct bch_fs *c, struct ec_stripe_buf *buf, blk_opf_t opf, unsigned idx, struct closure *cl) { struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v; unsigned offset = 0, bytes = buf->size << 9; struct bch_extent_ptr *ptr = &v->ptrs[idx]; enum bch_data_type data_type = idx < v->nr_blocks - v->nr_redundant ? BCH_DATA_user : BCH_DATA_parity; int rw = op_is_write(opf); unsigned ref = rw == READ ? BCH_DEV_READ_REF_ec_block : BCH_DEV_WRITE_REF_ec_block; struct bch_dev *ca = bch2_dev_get_ioref(c, ptr->dev, rw, ref); if (!ca) { clear_bit(idx, buf->valid); return; } int stale = dev_ptr_stale(ca, ptr); if (stale) { bch_err_ratelimited(c, "error %s stripe: stale pointer (%i)", rw == READ ? 
"reading from" : "writing to", stale); clear_bit(idx, buf->valid); return; } this_cpu_add(ca->io_done->sectors[rw][data_type], buf->size); while (offset < bytes) { unsigned nr_iovecs = min_t(size_t, BIO_MAX_VECS, DIV_ROUND_UP(bytes, PAGE_SIZE)); unsigned b = min_t(size_t, bytes - offset, nr_iovecs << PAGE_SHIFT); struct ec_bio *ec_bio; ec_bio = container_of(bio_alloc_bioset(ca->disk_sb.bdev, nr_iovecs, opf, GFP_KERNEL, &c->ec_bioset), struct ec_bio, bio); ec_bio->ca = ca; ec_bio->buf = buf; ec_bio->idx = idx; ec_bio->rw = rw; ec_bio->submit_time = local_clock(); ec_bio->bio.bi_iter.bi_sector = ptr->offset + buf->offset + (offset >> 9); ec_bio->bio.bi_end_io = ec_block_endio; ec_bio->bio.bi_private = cl; bch2_bio_map(&ec_bio->bio, buf->data[idx] + offset, b); closure_get(cl); enumerated_ref_get(&ca->io_ref[rw], ref); submit_bio(&ec_bio->bio); offset += b; } enumerated_ref_put(&ca->io_ref[rw], ref); } static int get_stripe_key_trans(struct btree_trans *trans, u64 idx, struct ec_stripe_buf *stripe) { struct btree_iter iter; struct bkey_s_c k; int ret; k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_stripes, POS(0, idx), BTREE_ITER_slots); ret = bkey_err(k); if (ret) goto err; if (k.k->type != KEY_TYPE_stripe) { ret = -ENOENT; goto err; } bkey_reassemble(&stripe->key, k); err: bch2_trans_iter_exit(trans, &iter); return ret; } /* recovery read path: */ int bch2_ec_read_extent(struct btree_trans *trans, struct bch_read_bio *rbio, struct bkey_s_c orig_k) { struct bch_fs *c = trans->c; struct ec_stripe_buf *buf = NULL; struct closure cl; struct bch_stripe *v; unsigned i, offset; const char *msg = NULL; struct printbuf msgbuf = PRINTBUF; int ret = 0; closure_init_stack(&cl); BUG_ON(!rbio->pick.has_ec); buf = kzalloc(sizeof(*buf), GFP_NOFS); if (!buf) return bch_err_throw(c, ENOMEM_ec_read_extent); ret = lockrestart_do(trans, get_stripe_key_trans(trans, rbio->pick.ec.idx, buf)); if (ret) { msg = "stripe not found"; goto err; } v = &bkey_i_to_stripe(&buf->key)->v; if (!bch2_ptr_matches_stripe(v, rbio->pick)) { msg = "pointer doesn't match stripe"; goto err; } offset = rbio->bio.bi_iter.bi_sector - v->ptrs[rbio->pick.ec.block].offset; if (offset + bio_sectors(&rbio->bio) > le16_to_cpu(v->sectors)) { msg = "read is bigger than stripe"; goto err; } ret = ec_stripe_buf_init(c, buf, offset, bio_sectors(&rbio->bio)); if (ret) { msg = "-ENOMEM"; goto err; } for (i = 0; i < v->nr_blocks; i++) ec_block_io(c, buf, REQ_OP_READ, i, &cl); closure_sync(&cl); if (ec_nr_failed(buf) > v->nr_redundant) { msg = "unable to read enough blocks"; goto err; } ec_validate_checksums(c, buf); ret = ec_do_recov(c, buf); if (ret) goto err; memcpy_to_bio(&rbio->bio, rbio->bio.bi_iter, buf->data[rbio->pick.ec.block] + ((offset - buf->offset) << 9)); out: ec_stripe_buf_exit(buf); kfree(buf); return ret; err: bch2_bkey_val_to_text(&msgbuf, c, orig_k); bch_err_ratelimited(c, "error doing reconstruct read: %s\n %s", msg, msgbuf.buf); printbuf_exit(&msgbuf); ret = bch_err_throw(c, stripe_reconstruct); goto out; } /* stripe bucket accounting: */ static int __ec_stripe_mem_alloc(struct bch_fs *c, size_t idx, gfp_t gfp) { if (c->gc_pos.phase != GC_PHASE_not_running && !genradix_ptr_alloc(&c->gc_stripes, idx, gfp)) return bch_err_throw(c, ENOMEM_ec_stripe_mem_alloc); return 0; } static int ec_stripe_mem_alloc(struct btree_trans *trans, struct btree_iter *iter) { return allocate_dropping_locks_errcode(trans, __ec_stripe_mem_alloc(trans->c, iter->pos.offset, _gfp)); } /* * Hash table of open stripes: * Stripes that are being created or 
modified are kept in a hash table, so that * stripe deletion can skip them. */ static bool __bch2_stripe_is_open(struct bch_fs *c, u64 idx) { unsigned hash = hash_64(idx, ilog2(ARRAY_SIZE(c->ec_stripes_new))); struct ec_stripe_new *s; hlist_for_each_entry(s, &c->ec_stripes_new[hash], hash) if (s->idx == idx) return true; return false; } static bool bch2_stripe_is_open(struct bch_fs *c, u64 idx) { bool ret = false; spin_lock(&c->ec_stripes_new_lock); ret = __bch2_stripe_is_open(c, idx); spin_unlock(&c->ec_stripes_new_lock); return ret; } static bool bch2_try_open_stripe(struct bch_fs *c, struct ec_stripe_new *s, u64 idx) { bool ret; spin_lock(&c->ec_stripes_new_lock); ret = !__bch2_stripe_is_open(c, idx); if (ret) { unsigned hash = hash_64(idx, ilog2(ARRAY_SIZE(c->ec_stripes_new))); s->idx = idx; hlist_add_head(&s->hash, &c->ec_stripes_new[hash]); } spin_unlock(&c->ec_stripes_new_lock); return ret; } static void bch2_stripe_close(struct bch_fs *c, struct ec_stripe_new *s) { BUG_ON(!s->idx); spin_lock(&c->ec_stripes_new_lock); hlist_del_init(&s->hash); spin_unlock(&c->ec_stripes_new_lock); s->idx = 0; } /* stripe deletion */ static int ec_stripe_delete(struct btree_trans *trans, u64 idx) { struct btree_iter iter; struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_stripes, POS(0, idx), BTREE_ITER_intent); int ret = bkey_err(k); if (ret) goto err; /* * We expect write buffer races here * Important: check stripe_is_open with stripe key locked: */ if (k.k->type == KEY_TYPE_stripe && !bch2_stripe_is_open(trans->c, idx) && stripe_lru_pos(bkey_s_c_to_stripe(k).v) == 1) ret = bch2_btree_delete_at(trans, &iter, 0); err: bch2_trans_iter_exit(trans, &iter); return ret; } /* * XXX * can we kill this and delete stripes from the trigger? */ static void ec_stripe_delete_work(struct work_struct *work) { struct bch_fs *c = container_of(work, struct bch_fs, ec_stripe_delete_work); bch2_trans_run(c, bch2_btree_write_buffer_tryflush(trans) ?: for_each_btree_key_max_commit(trans, lru_iter, BTREE_ID_lru, lru_pos(BCH_LRU_STRIPE_FRAGMENTATION, 1, 0), lru_pos(BCH_LRU_STRIPE_FRAGMENTATION, 1, LRU_TIME_MAX), 0, lru_k, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({ ec_stripe_delete(trans, lru_k.k->p.offset); }))); enumerated_ref_put(&c->writes, BCH_WRITE_REF_stripe_delete); } void bch2_do_stripe_deletes(struct bch_fs *c) { if (enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_stripe_delete) && !queue_work(c->write_ref_wq, &c->ec_stripe_delete_work)) enumerated_ref_put(&c->writes, BCH_WRITE_REF_stripe_delete); } /* stripe creation: */ static int ec_stripe_key_update(struct btree_trans *trans, struct bkey_i_stripe *old, struct bkey_i_stripe *new) { struct bch_fs *c = trans->c; bool create = !old; struct btree_iter iter; struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_stripes, new->k.p, BTREE_ITER_intent); int ret = bkey_err(k); if (ret) goto err; if (bch2_fs_inconsistent_on(k.k->type != (create ? KEY_TYPE_deleted : KEY_TYPE_stripe), c, "error %s stripe: got existing key type %s", create ? 
"creating" : "updating", bch2_bkey_types[k.k->type])) { ret = -EINVAL; goto err; } if (k.k->type == KEY_TYPE_stripe) { const struct bch_stripe *v = bkey_s_c_to_stripe(k).v; BUG_ON(old->v.nr_blocks != new->v.nr_blocks); BUG_ON(old->v.nr_blocks != v->nr_blocks); for (unsigned i = 0; i < new->v.nr_blocks; i++) { unsigned sectors = stripe_blockcount_get(v, i); if (!bch2_extent_ptr_eq(old->v.ptrs[i], new->v.ptrs[i]) && sectors) { struct printbuf buf = PRINTBUF; prt_printf(&buf, "stripe changed nonempty block %u", i); prt_str(&buf, "\nold: "); bch2_bkey_val_to_text(&buf, c, k); prt_str(&buf, "\nnew: "); bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&new->k_i)); bch2_fs_inconsistent(c, "%s", buf.buf); printbuf_exit(&buf); ret = -EINVAL; goto err; } /* * If the stripe ptr changed underneath us, it must have * been dev_remove_stripes() -> * invalidate_stripe_to_dev() */ if (!bch2_extent_ptr_eq(old->v.ptrs[i], v->ptrs[i])) { BUG_ON(v->ptrs[i].dev != BCH_SB_MEMBER_INVALID); if (bch2_extent_ptr_eq(old->v.ptrs[i], new->v.ptrs[i])) new->v.ptrs[i].dev = BCH_SB_MEMBER_INVALID; } stripe_blockcount_set(&new->v, i, sectors); } } ret = bch2_trans_update(trans, &iter, &new->k_i, 0); err: bch2_trans_iter_exit(trans, &iter); return ret; } static int ec_stripe_update_extent(struct btree_trans *trans, struct bch_dev *ca, struct bpos bucket, u8 gen, struct ec_stripe_buf *s, struct bkey_s_c_backpointer bp, struct bkey_buf *last_flushed) { struct bch_stripe *v = &bkey_i_to_stripe(&s->key)->v; struct bch_fs *c = trans->c; struct btree_iter iter; struct bkey_s_c k; const struct bch_extent_ptr *ptr_c; struct bch_extent_ptr *ec_ptr = NULL; struct bch_extent_stripe_ptr stripe_ptr; struct bkey_i *n; int ret, dev, block; if (bp.v->level) { struct printbuf buf = PRINTBUF; struct btree_iter node_iter; struct btree *b; b = bch2_backpointer_get_node(trans, bp, &node_iter, last_flushed); bch2_trans_iter_exit(trans, &node_iter); if (!b) return 0; prt_printf(&buf, "found btree node in erasure coded bucket: b=%px\n", b); bch2_bkey_val_to_text(&buf, c, bp.s_c); bch2_fs_inconsistent(c, "%s", buf.buf); printbuf_exit(&buf); return bch_err_throw(c, erasure_coding_found_btree_node); } k = bch2_backpointer_get_key(trans, bp, &iter, BTREE_ITER_intent, last_flushed); ret = bkey_err(k); if (ret) return ret; if (!k.k) { /* * extent no longer exists - we could flush the btree * write buffer and retry to verify, but no need: */ return 0; } if (extent_has_stripe_ptr(k, s->key.k.p.offset)) goto out; ptr_c = bkey_matches_stripe(v, k, &block); /* * It doesn't generally make sense to erasure code cached ptrs: * XXX: should we be incrementing a counter? 
*/ if (!ptr_c || ptr_c->cached) goto out; dev = v->ptrs[block].dev; n = bch2_trans_kmalloc(trans, bkey_bytes(k.k) + sizeof(stripe_ptr)); ret = PTR_ERR_OR_ZERO(n); if (ret) goto out; bkey_reassemble(n, k); bch2_bkey_drop_ptrs_noerror(bkey_i_to_s(n), ptr, ptr->dev != dev); ec_ptr = bch2_bkey_has_device(bkey_i_to_s(n), dev); BUG_ON(!ec_ptr); stripe_ptr = (struct bch_extent_stripe_ptr) { .type = 1 << BCH_EXTENT_ENTRY_stripe_ptr, .block = block, .redundancy = v->nr_redundant, .idx = s->key.k.p.offset, }; __extent_entry_insert(n, (union bch_extent_entry *) ec_ptr, (union bch_extent_entry *) &stripe_ptr); ret = bch2_trans_update(trans, &iter, n, 0); out: bch2_trans_iter_exit(trans, &iter); return ret; } static int ec_stripe_update_bucket(struct btree_trans *trans, struct ec_stripe_buf *s, unsigned block) { struct bch_fs *c = trans->c; struct bch_stripe *v = &bkey_i_to_stripe(&s->key)->v; struct bch_extent_ptr ptr = v->ptrs[block]; int ret = 0; struct bch_dev *ca = bch2_dev_tryget(c, ptr.dev); if (!ca) return bch_err_throw(c, ENOENT_dev_not_found); struct bpos bucket_pos = PTR_BUCKET_POS(ca, &ptr); struct bkey_buf last_flushed; bch2_bkey_buf_init(&last_flushed); bkey_init(&last_flushed.k->k); ret = for_each_btree_key_max_commit(trans, bp_iter, BTREE_ID_backpointers, bucket_pos_to_bp_start(ca, bucket_pos), bucket_pos_to_bp_end(ca, bucket_pos), 0, bp_k, NULL, NULL, BCH_TRANS_COMMIT_no_check_rw| BCH_TRANS_COMMIT_no_enospc, ({ if (bkey_ge(bp_k.k->p, bucket_pos_to_bp(ca, bpos_nosnap_successor(bucket_pos), 0))) break; if (bp_k.k->type != KEY_TYPE_backpointer) continue; struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(bp_k); if (bp.v->btree_id == BTREE_ID_stripes) continue; ec_stripe_update_extent(trans, ca, bucket_pos, ptr.gen, s, bp, &last_flushed); })); bch2_bkey_buf_exit(&last_flushed, c); bch2_dev_put(ca); return ret; } static int ec_stripe_update_extents(struct bch_fs *c, struct ec_stripe_buf *s) { struct btree_trans *trans = bch2_trans_get(c); struct bch_stripe *v = &bkey_i_to_stripe(&s->key)->v; unsigned nr_data = v->nr_blocks - v->nr_redundant; int ret = bch2_btree_write_buffer_flush_sync(trans); if (ret) goto err; for (unsigned i = 0; i < nr_data; i++) { ret = ec_stripe_update_bucket(trans, s, i); if (ret) break; } err: bch2_trans_put(trans); return ret; } static void zero_out_rest_of_ec_bucket(struct bch_fs *c, struct ec_stripe_new *s, unsigned block, struct open_bucket *ob) { struct bch_dev *ca = bch2_dev_get_ioref(c, ob->dev, WRITE, BCH_DEV_WRITE_REF_ec_bucket_zero); if (!ca) { s->err = bch_err_throw(c, erofs_no_writes); return; } unsigned offset = ca->mi.bucket_size - ob->sectors_free; memset(s->new_stripe.data[block] + (offset << 9), 0, ob->sectors_free << 9); int ret = blkdev_issue_zeroout(ca->disk_sb.bdev, ob->bucket * ca->mi.bucket_size + offset, ob->sectors_free, GFP_KERNEL, 0); enumerated_ref_put(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_ec_bucket_zero); if (ret) s->err = ret; } void bch2_ec_stripe_new_free(struct bch_fs *c, struct ec_stripe_new *s) { if (s->idx) bch2_stripe_close(c, s); kfree(s); } /* * data buckets of new stripe all written: create the stripe */ static void ec_stripe_create(struct ec_stripe_new *s) { struct bch_fs *c = s->c; struct open_bucket *ob; struct bch_stripe *v = &bkey_i_to_stripe(&s->new_stripe.key)->v; unsigned i, nr_data = v->nr_blocks - v->nr_redundant; int ret; BUG_ON(s->h->s == s); closure_sync(&s->iodone); if (!s->err) { for (i = 0; i < nr_data; i++) if (s->blocks[i]) { ob = c->open_buckets + s->blocks[i]; if (ob->sectors_free) 
zero_out_rest_of_ec_bucket(c, s, i, ob); } } if (s->err) { if (!bch2_err_matches(s->err, EROFS)) bch_err(c, "error creating stripe: error writing data buckets"); ret = s->err; goto err; } if (s->have_existing_stripe) { ec_validate_checksums(c, &s->existing_stripe); if (ec_do_recov(c, &s->existing_stripe)) { bch_err(c, "error creating stripe: error reading existing stripe"); ret = bch_err_throw(c, ec_block_read); goto err; } for (i = 0; i < nr_data; i++) if (stripe_blockcount_get(&bkey_i_to_stripe(&s->existing_stripe.key)->v, i)) swap(s->new_stripe.data[i], s->existing_stripe.data[i]); ec_stripe_buf_exit(&s->existing_stripe); } BUG_ON(!s->allocated); BUG_ON(!s->idx); ec_generate_ec(&s->new_stripe); ec_generate_checksums(&s->new_stripe); /* write p/q: */ for (i = nr_data; i < v->nr_blocks; i++) ec_block_io(c, &s->new_stripe, REQ_OP_WRITE, i, &s->iodone); closure_sync(&s->iodone); if (ec_nr_failed(&s->new_stripe)) { bch_err(c, "error creating stripe: error writing redundancy buckets"); ret = bch_err_throw(c, ec_block_write); goto err; } ret = bch2_trans_commit_do(c, &s->res, NULL, BCH_TRANS_COMMIT_no_check_rw| BCH_TRANS_COMMIT_no_enospc, ec_stripe_key_update(trans, s->have_existing_stripe ? bkey_i_to_stripe(&s->existing_stripe.key) : NULL, bkey_i_to_stripe(&s->new_stripe.key))); bch_err_msg(c, ret, "creating stripe key"); if (ret) { goto err; } ret = ec_stripe_update_extents(c, &s->new_stripe); bch_err_msg(c, ret, "error updating extents"); if (ret) goto err; err: trace_stripe_create(c, s->idx, ret); bch2_disk_reservation_put(c, &s->res); for (i = 0; i < v->nr_blocks; i++) if (s->blocks[i]) { ob = c->open_buckets + s->blocks[i]; if (i < nr_data) { ob->ec = NULL; __bch2_open_bucket_put(c, ob); } else { bch2_open_bucket_put(c, ob); } } mutex_lock(&c->ec_stripe_new_lock); list_del(&s->list); mutex_unlock(&c->ec_stripe_new_lock); wake_up(&c->ec_stripe_new_wait); ec_stripe_buf_exit(&s->existing_stripe); ec_stripe_buf_exit(&s->new_stripe); closure_debug_destroy(&s->iodone); ec_stripe_new_put(c, s, STRIPE_REF_stripe); } static struct ec_stripe_new *get_pending_stripe(struct bch_fs *c) { struct ec_stripe_new *s; mutex_lock(&c->ec_stripe_new_lock); list_for_each_entry(s, &c->ec_stripe_new_list, list) if (!atomic_read(&s->ref[STRIPE_REF_io])) goto out; s = NULL; out: mutex_unlock(&c->ec_stripe_new_lock); return s; } static void ec_stripe_create_work(struct work_struct *work) { struct bch_fs *c = container_of(work, struct bch_fs, ec_stripe_create_work); struct ec_stripe_new *s; while ((s = get_pending_stripe(c))) ec_stripe_create(s); enumerated_ref_put(&c->writes, BCH_WRITE_REF_stripe_create); } void bch2_ec_do_stripe_creates(struct bch_fs *c) { enumerated_ref_get(&c->writes, BCH_WRITE_REF_stripe_create); if (!queue_work(system_long_wq, &c->ec_stripe_create_work)) enumerated_ref_put(&c->writes, BCH_WRITE_REF_stripe_create); } static void ec_stripe_new_set_pending(struct bch_fs *c, struct ec_stripe_head *h) { struct ec_stripe_new *s = h->s; lockdep_assert_held(&h->lock); BUG_ON(!s->allocated && !s->err); h->s = NULL; s->pending = true; mutex_lock(&c->ec_stripe_new_lock); list_add(&s->list, &c->ec_stripe_new_list); mutex_unlock(&c->ec_stripe_new_lock); ec_stripe_new_put(c, s, STRIPE_REF_io); } static void ec_stripe_new_cancel(struct bch_fs *c, struct ec_stripe_head *h, int err) { h->s->err = err; ec_stripe_new_set_pending(c, h); } void bch2_ec_bucket_cancel(struct bch_fs *c, struct open_bucket *ob, int err) { struct ec_stripe_new *s = ob->ec; s->err = err; } void *bch2_writepoint_ec_buf(struct bch_fs *c, 
struct write_point *wp) { struct open_bucket *ob = ec_open_bucket(c, &wp->ptrs); if (!ob) return NULL; BUG_ON(!ob->ec->new_stripe.data[ob->ec_idx]); struct bch_dev *ca = ob_dev(c, ob); unsigned offset = ca->mi.bucket_size - ob->sectors_free; return ob->ec->new_stripe.data[ob->ec_idx] + (offset << 9); } static int unsigned_cmp(const void *_l, const void *_r) { unsigned l = *((const unsigned *) _l); unsigned r = *((const unsigned *) _r); return cmp_int(l, r); } /* pick most common bucket size: */ static unsigned pick_blocksize(struct bch_fs *c, struct bch_devs_mask *devs) { unsigned nr = 0, sizes[BCH_SB_MEMBERS_MAX]; struct { unsigned nr, size; } cur = { 0, 0 }, best = { 0, 0 }; for_each_member_device_rcu(c, ca, devs) sizes[nr++] = ca->mi.bucket_size; sort(sizes, nr, sizeof(unsigned), unsigned_cmp, NULL); for (unsigned i = 0; i < nr; i++) { if (sizes[i] != cur.size) { if (cur.nr > best.nr) best = cur; cur.nr = 0; cur.size = sizes[i]; } cur.nr++; } if (cur.nr > best.nr) best = cur; return best.size; } static bool may_create_new_stripe(struct bch_fs *c) { return false; } static void ec_stripe_key_init(struct bch_fs *c, struct bkey_i *k, unsigned nr_data, unsigned nr_parity, unsigned stripe_size, unsigned disk_label) { struct bkey_i_stripe *s = bkey_stripe_init(k); unsigned u64s; s->v.sectors = cpu_to_le16(stripe_size); s->v.algorithm = 0; s->v.nr_blocks = nr_data + nr_parity; s->v.nr_redundant = nr_parity; s->v.csum_granularity_bits = ilog2(c->opts.encoded_extent_max >> 9); s->v.csum_type = BCH_CSUM_crc32c; s->v.disk_label = disk_label; while ((u64s = stripe_val_u64s(&s->v)) > BKEY_VAL_U64s_MAX) { BUG_ON(1 << s->v.csum_granularity_bits >= le16_to_cpu(s->v.sectors) || s->v.csum_granularity_bits == U8_MAX); s->v.csum_granularity_bits++; } set_bkey_val_u64s(&s->k, u64s); } static struct ec_stripe_new *ec_new_stripe_alloc(struct bch_fs *c, struct ec_stripe_head *h) { struct ec_stripe_new *s; lockdep_assert_held(&h->lock); s = kzalloc(sizeof(*s), GFP_KERNEL); if (!s) return NULL; mutex_init(&s->lock); closure_init(&s->iodone, NULL); atomic_set(&s->ref[STRIPE_REF_stripe], 1); atomic_set(&s->ref[STRIPE_REF_io], 1); s->c = c; s->h = h; s->nr_data = min_t(unsigned, h->nr_active_devs, BCH_BKEY_PTRS_MAX) - h->redundancy; s->nr_parity = h->redundancy; ec_stripe_key_init(c, &s->new_stripe.key, s->nr_data, s->nr_parity, h->blocksize, h->disk_label); return s; } static void ec_stripe_head_devs_update(struct bch_fs *c, struct ec_stripe_head *h) { struct bch_devs_mask devs = h->devs; unsigned nr_devs, nr_devs_with_durability; scoped_guard(rcu) { h->devs = target_rw_devs(c, BCH_DATA_user, h->disk_label ? 
group_to_target(h->disk_label - 1) : 0); nr_devs = dev_mask_nr(&h->devs); for_each_member_device_rcu(c, ca, &h->devs) if (!ca->mi.durability) __clear_bit(ca->dev_idx, h->devs.d); nr_devs_with_durability = dev_mask_nr(&h->devs); h->blocksize = pick_blocksize(c, &h->devs); h->nr_active_devs = 0; for_each_member_device_rcu(c, ca, &h->devs) if (ca->mi.bucket_size == h->blocksize) h->nr_active_devs++; } /* * If we only have redundancy + 1 devices, we're better off with just * replication: */ h->insufficient_devs = h->nr_active_devs < h->redundancy + 2; if (h->insufficient_devs) { const char *err; if (nr_devs < h->redundancy + 2) err = NULL; else if (nr_devs_with_durability < h->redundancy + 2) err = "cannot use durability=0 devices"; else err = "mismatched bucket sizes"; if (err) bch_err(c, "insufficient devices available to create stripe (have %u, need %u): %s", h->nr_active_devs, h->redundancy + 2, err); } struct bch_devs_mask devs_leaving; bitmap_andnot(devs_leaving.d, devs.d, h->devs.d, BCH_SB_MEMBERS_MAX); if (h->s && !h->s->allocated && dev_mask_nr(&devs_leaving)) ec_stripe_new_cancel(c, h, -EINTR); h->rw_devs_change_count = c->rw_devs_change_count; } static struct ec_stripe_head * ec_new_stripe_head_alloc(struct bch_fs *c, unsigned disk_label, unsigned algo, unsigned redundancy, enum bch_watermark watermark) { struct ec_stripe_head *h; h = kzalloc(sizeof(*h), GFP_KERNEL); if (!h) return NULL; mutex_init(&h->lock); BUG_ON(!mutex_trylock(&h->lock)); h->disk_label = disk_label; h->algo = algo; h->redundancy = redundancy; h->watermark = watermark; list_add(&h->list, &c->ec_stripe_head_list); return h; } void bch2_ec_stripe_head_put(struct bch_fs *c, struct ec_stripe_head *h) { if (h->s && h->s->allocated && bitmap_weight(h->s->blocks_allocated, h->s->nr_data) == h->s->nr_data) ec_stripe_new_set_pending(c, h); mutex_unlock(&h->lock); } static struct ec_stripe_head * __bch2_ec_stripe_head_get(struct btree_trans *trans, unsigned disk_label, unsigned algo, unsigned redundancy, enum bch_watermark watermark) { struct bch_fs *c = trans->c; struct ec_stripe_head *h; int ret; if (!redundancy) return NULL; ret = bch2_trans_mutex_lock(trans, &c->ec_stripe_head_lock); if (ret) return ERR_PTR(ret); if (test_bit(BCH_FS_going_ro, &c->flags)) { h = ERR_PTR(-BCH_ERR_erofs_no_writes); goto err; } list_for_each_entry(h, &c->ec_stripe_head_list, list) if (h->disk_label == disk_label && h->algo == algo && h->redundancy == redundancy && h->watermark == watermark) { ret = bch2_trans_mutex_lock(trans, &h->lock); if (ret) { h = ERR_PTR(ret); goto err; } goto found; } h = ec_new_stripe_head_alloc(c, disk_label, algo, redundancy, watermark); if (!h) { h = ERR_PTR(-BCH_ERR_ENOMEM_stripe_head_alloc); goto err; } found: if (h->rw_devs_change_count != c->rw_devs_change_count) ec_stripe_head_devs_update(c, h); if (h->insufficient_devs) { mutex_unlock(&h->lock); h = NULL; } err: mutex_unlock(&c->ec_stripe_head_lock); return h; } static int new_stripe_alloc_buckets(struct btree_trans *trans, struct alloc_request *req, struct ec_stripe_head *h, struct ec_stripe_new *s, struct closure *cl) { struct bch_fs *c = trans->c; struct open_bucket *ob; struct bch_stripe *v = &bkey_i_to_stripe(&s->new_stripe.key)->v; unsigned i, j, nr_have_parity = 0, nr_have_data = 0; int ret = 0; req->scratch_data_type = req->data_type; req->scratch_ptrs = req->ptrs; req->scratch_nr_replicas = req->nr_replicas; req->scratch_nr_effective = req->nr_effective; req->scratch_have_cache = req->have_cache; req->scratch_devs_may_alloc = req->devs_may_alloc; 
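/*
 * The request fields saved above are restored at the err: label below:
 * this function temporarily repurposes the caller's alloc_request to
 * drive two nested allocations (parity blocks, then data blocks) with
 * different nr_replicas/nr_effective/data_type, and the caller must see
 * its original request unchanged afterwards.
 */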
req->devs_may_alloc = h->devs; req->have_cache = true; BUG_ON(v->nr_blocks != s->nr_data + s->nr_parity); BUG_ON(v->nr_redundant != s->nr_parity); /* * We bypass the sector allocator which normally does this: */ bitmap_and(req->devs_may_alloc.d, req->devs_may_alloc.d, c->rw_devs[BCH_DATA_user].d, BCH_SB_MEMBERS_MAX); for_each_set_bit(i, s->blocks_gotten, v->nr_blocks) { /* * Note: we don't yet repair invalid blocks (failed/removed * devices) when reusing stripes - we still need a codepath to * walk backpointers and update all extents that point to that * block when updating the stripe */ if (v->ptrs[i].dev != BCH_SB_MEMBER_INVALID) __clear_bit(v->ptrs[i].dev, req->devs_may_alloc.d); if (i < s->nr_data) nr_have_data++; else nr_have_parity++; } BUG_ON(nr_have_data > s->nr_data); BUG_ON(nr_have_parity > s->nr_parity); req->ptrs.nr = 0; if (nr_have_parity < s->nr_parity) { req->nr_replicas = s->nr_parity; req->nr_effective = nr_have_parity; req->data_type = BCH_DATA_parity; ret = bch2_bucket_alloc_set_trans(trans, req, &h->parity_stripe, cl); open_bucket_for_each(c, &req->ptrs, ob, i) { j = find_next_zero_bit(s->blocks_gotten, s->nr_data + s->nr_parity, s->nr_data); BUG_ON(j >= s->nr_data + s->nr_parity); s->blocks[j] = req->ptrs.v[i]; v->ptrs[j] = bch2_ob_ptr(c, ob); __set_bit(j, s->blocks_gotten); } if (ret) goto err; } req->ptrs.nr = 0; if (nr_have_data < s->nr_data) { req->nr_replicas = s->nr_data; req->nr_effective = nr_have_data; req->data_type = BCH_DATA_user; ret = bch2_bucket_alloc_set_trans(trans, req, &h->block_stripe, cl); open_bucket_for_each(c, &req->ptrs, ob, i) { j = find_next_zero_bit(s->blocks_gotten, s->nr_data, 0); BUG_ON(j >= s->nr_data); s->blocks[j] = req->ptrs.v[i]; v->ptrs[j] = bch2_ob_ptr(c, ob); __set_bit(j, s->blocks_gotten); } if (ret) goto err; } err: req->data_type = req->scratch_data_type; req->ptrs = req->scratch_ptrs; req->nr_replicas = req->scratch_nr_replicas; req->nr_effective = req->scratch_nr_effective; req->have_cache = req->scratch_have_cache; req->devs_may_alloc = req->scratch_devs_may_alloc; return ret; } static int __get_existing_stripe(struct btree_trans *trans, struct ec_stripe_head *head, struct ec_stripe_buf *stripe, u64 idx) { struct bch_fs *c = trans->c; struct btree_iter iter; struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_stripes, POS(0, idx), 0); int ret = bkey_err(k); if (ret) goto err; /* We expect write buffer races here */ if (k.k->type != KEY_TYPE_stripe) goto out; struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k); if (stripe_lru_pos(s.v) <= 1) goto out; if (s.v->disk_label == head->disk_label && s.v->algorithm == head->algo && s.v->nr_redundant == head->redundancy && le16_to_cpu(s.v->sectors) == head->blocksize && bch2_try_open_stripe(c, head->s, idx)) { bkey_reassemble(&stripe->key, k); ret = 1; } out: bch2_set_btree_iter_dontneed(trans, &iter); err: bch2_trans_iter_exit(trans, &iter); return ret; } static int init_new_stripe_from_existing(struct bch_fs *c, struct ec_stripe_new *s) { struct bch_stripe *new_v = &bkey_i_to_stripe(&s->new_stripe.key)->v; struct bch_stripe *existing_v = &bkey_i_to_stripe(&s->existing_stripe.key)->v; unsigned i; BUG_ON(existing_v->nr_redundant != s->nr_parity); s->nr_data = existing_v->nr_blocks - existing_v->nr_redundant; int ret = ec_stripe_buf_init(c, &s->existing_stripe, 0, le16_to_cpu(existing_v->sectors)); if (ret) { bch2_stripe_close(c, s); return ret; } BUG_ON(s->existing_stripe.size != le16_to_cpu(existing_v->sectors)); /* * Free buckets we initially allocated - they might conflict 
with * blocks from the stripe we're reusing: */ for_each_set_bit(i, s->blocks_gotten, new_v->nr_blocks) { bch2_open_bucket_put(c, c->open_buckets + s->blocks[i]); s->blocks[i] = 0; } memset(s->blocks_gotten, 0, sizeof(s->blocks_gotten)); memset(s->blocks_allocated, 0, sizeof(s->blocks_allocated)); for (unsigned i = 0; i < existing_v->nr_blocks; i++) { if (stripe_blockcount_get(existing_v, i)) { __set_bit(i, s->blocks_gotten); __set_bit(i, s->blocks_allocated); } ec_block_io(c, &s->existing_stripe, READ, i, &s->iodone); } bkey_copy(&s->new_stripe.key, &s->existing_stripe.key); s->have_existing_stripe = true; return 0; } static int __bch2_ec_stripe_head_reuse(struct btree_trans *trans, struct ec_stripe_head *h, struct ec_stripe_new *s) { struct bch_fs *c = trans->c; /* * If we can't allocate a new stripe, and there's no stripes with empty * blocks for us to reuse, that means we have to wait on copygc: */ if (may_create_new_stripe(c)) return -1; struct btree_iter lru_iter; struct bkey_s_c lru_k; int ret = 0; for_each_btree_key_max_norestart(trans, lru_iter, BTREE_ID_lru, lru_pos(BCH_LRU_STRIPE_FRAGMENTATION, 2, 0), lru_pos(BCH_LRU_STRIPE_FRAGMENTATION, 2, LRU_TIME_MAX), 0, lru_k, ret) { ret = __get_existing_stripe(trans, h, &s->existing_stripe, lru_k.k->p.offset); if (ret) break; } bch2_trans_iter_exit(trans, &lru_iter); if (!ret) ret = bch_err_throw(c, stripe_alloc_blocked); if (ret == 1) ret = 0; if (ret) return ret; return init_new_stripe_from_existing(c, s); } static int __bch2_ec_stripe_head_reserve(struct btree_trans *trans, struct ec_stripe_head *h, struct ec_stripe_new *s) { struct bch_fs *c = trans->c; struct btree_iter iter; struct bkey_s_c k; struct bpos min_pos = POS(0, 1); struct bpos start_pos = bpos_max(min_pos, POS(0, c->ec_stripe_hint)); int ret; if (!s->res.sectors) { ret = bch2_disk_reservation_get(c, &s->res, h->blocksize, s->nr_parity, BCH_DISK_RESERVATION_NOFAIL); if (ret) return ret; } /* * Allocate stripe slot * XXX: we're going to need a bitrange btree of free stripes */ for_each_btree_key_norestart(trans, iter, BTREE_ID_stripes, start_pos, BTREE_ITER_slots|BTREE_ITER_intent, k, ret) { if (bkey_gt(k.k->p, POS(0, U32_MAX))) { if (start_pos.offset) { start_pos = min_pos; bch2_btree_iter_set_pos(trans, &iter, start_pos); continue; } ret = bch_err_throw(c, ENOSPC_stripe_create); break; } if (bkey_deleted(k.k) && bch2_try_open_stripe(c, s, k.k->p.offset)) break; } c->ec_stripe_hint = iter.pos.offset; if (ret) goto err; ret = ec_stripe_mem_alloc(trans, &iter); if (ret) { bch2_stripe_close(c, s); goto err; } s->new_stripe.key.k.p = iter.pos; out: bch2_trans_iter_exit(trans, &iter); return ret; err: bch2_disk_reservation_put(c, &s->res); goto out; } struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *trans, struct alloc_request *req, unsigned algo, struct closure *cl) { struct bch_fs *c = trans->c; unsigned redundancy = req->nr_replicas - 1; unsigned disk_label = 0; struct target t = target_decode(req->target); bool waiting = false; int ret; if (t.type == TARGET_GROUP) { if (t.group > U8_MAX) { bch_err(c, "cannot create a stripe when disk_label > U8_MAX"); return NULL; } disk_label = t.group + 1; /* 0 == no label */ } struct ec_stripe_head *h = __bch2_ec_stripe_head_get(trans, disk_label, algo, redundancy, req->watermark); if (IS_ERR_OR_NULL(h)) return h; if (!h->s) { h->s = ec_new_stripe_alloc(c, h); if (!h->s) { ret = bch_err_throw(c, ENOMEM_ec_new_stripe_alloc); bch_err(c, "failed to allocate new stripe"); goto err; } h->nr_created++; } struct ec_stripe_new 
*s = h->s; if (s->allocated) goto allocated; if (s->have_existing_stripe) goto alloc_existing; /* First, try to allocate a full stripe: */ enum bch_watermark saved_watermark = BCH_WATERMARK_stripe; swap(req->watermark, saved_watermark); ret = new_stripe_alloc_buckets(trans, req, h, s, NULL) ?: __bch2_ec_stripe_head_reserve(trans, h, s); swap(req->watermark, saved_watermark); if (!ret) goto allocate_buf; if (bch2_err_matches(ret, BCH_ERR_transaction_restart) || bch2_err_matches(ret, ENOMEM)) goto err; /* * Not enough buckets available for a full stripe: we must reuse an * existing stripe: */ while (1) { ret = __bch2_ec_stripe_head_reuse(trans, h, s); if (!ret) break; if (waiting || !cl || ret != -BCH_ERR_stripe_alloc_blocked) goto err; if (req->watermark == BCH_WATERMARK_copygc) { ret = new_stripe_alloc_buckets(trans, req, h, s, NULL) ?: __bch2_ec_stripe_head_reserve(trans, h, s); if (ret) goto err; goto allocate_buf; } /* XXX freelist_wait? */ closure_wait(&c->freelist_wait, cl); waiting = true; } if (waiting) closure_wake_up(&c->freelist_wait); alloc_existing: /* * Retry allocating buckets, with the watermark for this * particular write: */ ret = new_stripe_alloc_buckets(trans, req, h, s, cl); if (ret) goto err; allocate_buf: ret = ec_stripe_buf_init(c, &s->new_stripe, 0, h->blocksize); if (ret) goto err; s->allocated = true; allocated: BUG_ON(!s->idx); BUG_ON(!s->new_stripe.data[0]); BUG_ON(trans->restarted); return h; err: bch2_ec_stripe_head_put(c, h); return ERR_PTR(ret); } /* device removal */ int bch2_invalidate_stripe_to_dev(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c k, unsigned dev_idx, unsigned flags) { if (k.k->type != KEY_TYPE_stripe) return 0; struct bch_fs *c = trans->c; struct bkey_i_stripe *s = bch2_bkey_make_mut_typed(trans, iter, &k, 0, stripe); int ret = PTR_ERR_OR_ZERO(s); if (ret) return ret; struct disk_accounting_pos acc; s64 sectors = 0; for (unsigned i = 0; i < s->v.nr_blocks; i++) sectors -= stripe_blockcount_get(&s->v, i); memset(&acc, 0, sizeof(acc)); acc.type = BCH_DISK_ACCOUNTING_replicas; bch2_bkey_to_replicas(&acc.replicas, bkey_i_to_s_c(&s->k_i)); acc.replicas.data_type = BCH_DATA_user; ret = bch2_disk_accounting_mod(trans, &acc, &sectors, 1, false); if (ret) return ret; struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(&s->k_i)); /* XXX: how much redundancy do we still have? 
check degraded flags */ unsigned nr_good = 0; scoped_guard(rcu) bkey_for_each_ptr(ptrs, ptr) { if (ptr->dev == dev_idx) ptr->dev = BCH_SB_MEMBER_INVALID; struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev); nr_good += ca && ca->mi.state != BCH_MEMBER_STATE_failed; } if (nr_good < s->v.nr_blocks && !(flags & BCH_FORCE_IF_DATA_DEGRADED)) return bch_err_throw(c, remove_would_lose_data); unsigned nr_data = s->v.nr_blocks - s->v.nr_redundant; if (nr_good < nr_data && !(flags & BCH_FORCE_IF_DATA_LOST)) return bch_err_throw(c, remove_would_lose_data); sectors = -sectors; memset(&acc, 0, sizeof(acc)); acc.type = BCH_DISK_ACCOUNTING_replicas; bch2_bkey_to_replicas(&acc.replicas, bkey_i_to_s_c(&s->k_i)); acc.replicas.data_type = BCH_DATA_user; return bch2_disk_accounting_mod(trans, &acc, &sectors, 1, false); } static int bch2_invalidate_stripe_to_dev_from_alloc(struct btree_trans *trans, struct bkey_s_c k_a, unsigned flags) { struct bch_alloc_v4 a_convert; const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k_a, &a_convert); if (!a->stripe) return 0; if (a->stripe_sectors) { struct bch_fs *c = trans->c; bch_err(c, "trying to invalidate device in stripe when bucket has stripe data"); return bch_err_throw(c, invalidate_stripe_to_dev); } struct btree_iter iter; struct bkey_s_c_stripe s = bch2_bkey_get_iter_typed(trans, &iter, BTREE_ID_stripes, POS(0, a->stripe), BTREE_ITER_slots, stripe); int ret = bkey_err(s); if (ret) return ret; ret = bch2_invalidate_stripe_to_dev(trans, &iter, s.s_c, k_a.k->p.inode, flags); bch2_trans_iter_exit(trans, &iter); return ret; } int bch2_dev_remove_stripes(struct bch_fs *c, unsigned dev_idx, unsigned flags) { int ret = bch2_trans_run(c, for_each_btree_key_max_commit(trans, iter, BTREE_ID_alloc, POS(dev_idx, 0), POS(dev_idx, U64_MAX), BTREE_ITER_intent, k, NULL, NULL, 0, ({ bch2_invalidate_stripe_to_dev_from_alloc(trans, k, flags); }))); bch_err_fn(c, ret); return ret; } /* startup/shutdown */ static void __bch2_ec_stop(struct bch_fs *c, struct bch_dev *ca) { struct ec_stripe_head *h; struct open_bucket *ob; unsigned i; mutex_lock(&c->ec_stripe_head_lock); list_for_each_entry(h, &c->ec_stripe_head_list, list) { mutex_lock(&h->lock); if (!h->s) goto unlock; if (!ca) goto found; for (i = 0; i < bkey_i_to_stripe(&h->s->new_stripe.key)->v.nr_blocks; i++) { if (!h->s->blocks[i]) continue; ob = c->open_buckets + h->s->blocks[i]; if (ob->dev == ca->dev_idx) goto found; } goto unlock; found: ec_stripe_new_cancel(c, h, -BCH_ERR_erofs_no_writes); unlock: mutex_unlock(&h->lock); } mutex_unlock(&c->ec_stripe_head_lock); } void bch2_ec_stop_dev(struct bch_fs *c, struct bch_dev *ca) { __bch2_ec_stop(c, ca); } void bch2_fs_ec_stop(struct bch_fs *c) { __bch2_ec_stop(c, NULL); } static bool bch2_fs_ec_flush_done(struct bch_fs *c) { sched_annotate_sleep(); mutex_lock(&c->ec_stripe_new_lock); bool ret = list_empty(&c->ec_stripe_new_list); mutex_unlock(&c->ec_stripe_new_lock); return ret; } void bch2_fs_ec_flush(struct bch_fs *c) { wait_event(c->ec_stripe_new_wait, bch2_fs_ec_flush_done(c)); } int bch2_stripes_read(struct bch_fs *c) { return 0; } static void bch2_new_stripe_to_text(struct printbuf *out, struct bch_fs *c, struct ec_stripe_new *s) { prt_printf(out, "\tidx %llu blocks %u+%u allocated %u ref %u %u %s obs", s->idx, s->nr_data, s->nr_parity, bitmap_weight(s->blocks_allocated, s->nr_data), atomic_read(&s->ref[STRIPE_REF_io]), atomic_read(&s->ref[STRIPE_REF_stripe]), bch2_watermarks[s->h->watermark]); struct bch_stripe *v = &bkey_i_to_stripe(&s->new_stripe.key)->v; unsigned i; 
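/* Append the open_bucket index for each block allocated so far: */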
for_each_set_bit(i, s->blocks_gotten, v->nr_blocks) prt_printf(out, " %u", s->blocks[i]); prt_newline(out); bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(&s->new_stripe.key)); prt_newline(out); } void bch2_new_stripes_to_text(struct printbuf *out, struct bch_fs *c) { struct ec_stripe_head *h; struct ec_stripe_new *s; mutex_lock(&c->ec_stripe_head_lock); list_for_each_entry(h, &c->ec_stripe_head_list, list) { prt_printf(out, "disk label %u algo %u redundancy %u %s nr created %llu:\n", h->disk_label, h->algo, h->redundancy, bch2_watermarks[h->watermark], h->nr_created); if (h->s) bch2_new_stripe_to_text(out, c, h->s); } mutex_unlock(&c->ec_stripe_head_lock); prt_printf(out, "in flight:\n"); mutex_lock(&c->ec_stripe_new_lock); list_for_each_entry(s, &c->ec_stripe_new_list, list) bch2_new_stripe_to_text(out, c, s); mutex_unlock(&c->ec_stripe_new_lock); } void bch2_fs_ec_exit(struct bch_fs *c) { struct ec_stripe_head *h; unsigned i; while (1) { mutex_lock(&c->ec_stripe_head_lock); h = list_pop_entry(&c->ec_stripe_head_list, struct ec_stripe_head, list); mutex_unlock(&c->ec_stripe_head_lock); if (!h) break; if (h->s) { for (i = 0; i < bkey_i_to_stripe(&h->s->new_stripe.key)->v.nr_blocks; i++) BUG_ON(h->s->blocks[i]); kfree(h->s); } kfree(h); } BUG_ON(!list_empty(&c->ec_stripe_new_list)); bioset_exit(&c->ec_bioset); } void bch2_fs_ec_init_early(struct bch_fs *c) { spin_lock_init(&c->ec_stripes_new_lock); INIT_LIST_HEAD(&c->ec_stripe_head_list); mutex_init(&c->ec_stripe_head_lock); INIT_LIST_HEAD(&c->ec_stripe_new_list); mutex_init(&c->ec_stripe_new_lock); init_waitqueue_head(&c->ec_stripe_new_wait); INIT_WORK(&c->ec_stripe_create_work, ec_stripe_create_work); INIT_WORK(&c->ec_stripe_delete_work, ec_stripe_delete_work); } int bch2_fs_ec_init(struct bch_fs *c) { return bioset_init(&c->ec_bioset, 1, offsetof(struct ec_bio, bio), BIOSET_NEED_BVECS); } static int bch2_check_stripe_to_lru_ref(struct btree_trans *trans, struct bkey_s_c k, struct bkey_buf *last_flushed) { if (k.k->type != KEY_TYPE_stripe) return 0; struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k); u64 lru_idx = stripe_lru_pos(s.v); if (lru_idx) { int ret = bch2_lru_check_set(trans, BCH_LRU_STRIPE_FRAGMENTATION, k.k->p.offset, lru_idx, k, last_flushed); if (ret) return ret; } return 0; } int bch2_check_stripe_to_lru_refs(struct bch_fs *c) { struct bkey_buf last_flushed; bch2_bkey_buf_init(&last_flushed); bkey_init(&last_flushed.k->k); int ret = bch2_trans_run(c, for_each_btree_key_commit(trans, iter, BTREE_ID_stripes, POS_MIN, BTREE_ITER_prefetch, k, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, bch2_check_stripe_to_lru_ref(trans, k, &last_flushed))); bch2_bkey_buf_exit(&last_flushed, c); bch_err_fn(c, ret); return ret; }
// SPDX-License-Identifier: GPL-2.0 #include <linux/err.h> #include <linux/igmp.h> #include <linux/kernel.h> #include <linux/netdevice.h> #include <linux/rculist.h> #include <linux/skbuff.h> #include <linux/if_ether.h> #include <net/ip.h> #include <net/netlink.h> #include <net/switchdev.h> #if IS_ENABLED(CONFIG_IPV6) #include <net/ipv6.h> #include <net/addrconf.h> #endif #include "br_private.h" static bool br_ip4_rports_get_timer(struct net_bridge_mcast_port *pmctx, unsigned long *timer) { *timer = br_timer_value(&pmctx->ip4_mc_router_timer); return !hlist_unhashed(&pmctx->ip4_rlist); } static bool br_ip6_rports_get_timer(struct net_bridge_mcast_port *pmctx, unsigned long *timer) { #if IS_ENABLED(CONFIG_IPV6) *timer = br_timer_value(&pmctx->ip6_mc_router_timer); return !hlist_unhashed(&pmctx->ip6_rlist); #else *timer = 0; return false; #endif } static size_t __br_rports_one_size(void) { return nla_total_size(sizeof(u32)) + /* MDBA_ROUTER_PORT */ nla_total_size(sizeof(u32)) + /* MDBA_ROUTER_PATTR_TIMER */ nla_total_size(sizeof(u8)) + /* MDBA_ROUTER_PATTR_TYPE */ nla_total_size(sizeof(u32)) + /* MDBA_ROUTER_PATTR_INET_TIMER */ nla_total_size(sizeof(u32)) + /* MDBA_ROUTER_PATTR_INET6_TIMER */ nla_total_size(sizeof(u32)); /* MDBA_ROUTER_PATTR_VID */ } size_t br_rports_size(const struct net_bridge_mcast *brmctx) { struct net_bridge_mcast_port *pmctx; size_t size = nla_total_size(0); /* MDBA_ROUTER */ rcu_read_lock(); hlist_for_each_entry_rcu(pmctx, &brmctx->ip4_mc_router_list, ip4_rlist) size += __br_rports_one_size(); #if IS_ENABLED(CONFIG_IPV6) hlist_for_each_entry_rcu(pmctx, &brmctx->ip6_mc_router_list, ip6_rlist) size += __br_rports_one_size(); #endif rcu_read_unlock(); return size; } int br_rports_fill_info(struct sk_buff *skb, const struct net_bridge_mcast *brmctx) { u16 vid = brmctx->vlan ?
brmctx->vlan->vid : 0; bool have_ip4_mc_rtr, have_ip6_mc_rtr; unsigned long ip4_timer, ip6_timer; struct nlattr *nest, *port_nest; struct net_bridge_port *p; if (!brmctx->multicast_router || !br_rports_have_mc_router(brmctx)) return 0; nest = nla_nest_start_noflag(skb, MDBA_ROUTER); if (nest == NULL) return -EMSGSIZE; list_for_each_entry_rcu(p, &brmctx->br->port_list, list) { struct net_bridge_mcast_port *pmctx; if (vid) { struct net_bridge_vlan *v; v = br_vlan_find(nbp_vlan_group(p), vid); if (!v) continue; pmctx = &v->port_mcast_ctx; } else { pmctx = &p->multicast_ctx; } have_ip4_mc_rtr = br_ip4_rports_get_timer(pmctx, &ip4_timer); have_ip6_mc_rtr = br_ip6_rports_get_timer(pmctx, &ip6_timer); if (!have_ip4_mc_rtr && !have_ip6_mc_rtr) continue; port_nest = nla_nest_start_noflag(skb, MDBA_ROUTER_PORT); if (!port_nest) goto fail; if (nla_put_nohdr(skb, sizeof(u32), &p->dev->ifindex) || nla_put_u32(skb, MDBA_ROUTER_PATTR_TIMER, max(ip4_timer, ip6_timer)) || nla_put_u8(skb, MDBA_ROUTER_PATTR_TYPE, p->multicast_ctx.multicast_router) || (have_ip4_mc_rtr && nla_put_u32(skb, MDBA_ROUTER_PATTR_INET_TIMER, ip4_timer)) || (have_ip6_mc_rtr && nla_put_u32(skb, MDBA_ROUTER_PATTR_INET6_TIMER, ip6_timer)) || (vid && nla_put_u16(skb, MDBA_ROUTER_PATTR_VID, vid))) { nla_nest_cancel(skb, port_nest); goto fail; } nla_nest_end(skb, port_nest); } nla_nest_end(skb, nest); return 0; fail: nla_nest_cancel(skb, nest); return -EMSGSIZE; } static void __mdb_entry_fill_flags(struct br_mdb_entry *e, unsigned char flags) { e->state = flags & MDB_PG_FLAGS_PERMANENT; e->flags = 0; if (flags & MDB_PG_FLAGS_OFFLOAD) e->flags |= MDB_FLAGS_OFFLOAD; if (flags & MDB_PG_FLAGS_FAST_LEAVE) e->flags |= MDB_FLAGS_FAST_LEAVE; if (flags & MDB_PG_FLAGS_STAR_EXCL) e->flags |= MDB_FLAGS_STAR_EXCL; if (flags & MDB_PG_FLAGS_BLOCKED) e->flags |= MDB_FLAGS_BLOCKED; if (flags & MDB_PG_FLAGS_OFFLOAD_FAILED) e->flags |= MDB_FLAGS_OFFLOAD_FAILED; } static void __mdb_entry_to_br_ip(struct br_mdb_entry *entry, struct br_ip *ip, struct nlattr **mdb_attrs) { memset(ip, 0, sizeof(struct br_ip)); ip->vid = entry->vid; ip->proto = entry->addr.proto; switch (ip->proto) { case htons(ETH_P_IP): ip->dst.ip4 = entry->addr.u.ip4; if (mdb_attrs && mdb_attrs[MDBE_ATTR_SOURCE]) ip->src.ip4 = nla_get_in_addr(mdb_attrs[MDBE_ATTR_SOURCE]); break; #if IS_ENABLED(CONFIG_IPV6) case htons(ETH_P_IPV6): ip->dst.ip6 = entry->addr.u.ip6; if (mdb_attrs && mdb_attrs[MDBE_ATTR_SOURCE]) ip->src.ip6 = nla_get_in6_addr(mdb_attrs[MDBE_ATTR_SOURCE]); break; #endif default: ether_addr_copy(ip->dst.mac_addr, entry->addr.u.mac_addr); } } static int __mdb_fill_srcs(struct sk_buff *skb, struct net_bridge_port_group *p) { struct net_bridge_group_src *ent; struct nlattr *nest, *nest_ent; if (hlist_empty(&p->src_list)) return 0; nest = nla_nest_start(skb, MDBA_MDB_EATTR_SRC_LIST); if (!nest) return -EMSGSIZE; hlist_for_each_entry_rcu(ent, &p->src_list, node, lockdep_is_held(&p->key.port->br->multicast_lock)) { nest_ent = nla_nest_start(skb, MDBA_MDB_SRCLIST_ENTRY); if (!nest_ent) goto out_cancel_err; switch (ent->addr.proto) { case htons(ETH_P_IP): if (nla_put_in_addr(skb, MDBA_MDB_SRCATTR_ADDRESS, ent->addr.src.ip4)) { nla_nest_cancel(skb, nest_ent); goto out_cancel_err; } break; #if IS_ENABLED(CONFIG_IPV6) case htons(ETH_P_IPV6): if (nla_put_in6_addr(skb, MDBA_MDB_SRCATTR_ADDRESS, &ent->addr.src.ip6)) { nla_nest_cancel(skb, nest_ent); goto out_cancel_err; } break; #endif default: nla_nest_cancel(skb, nest_ent); continue; } if (nla_put_u32(skb, MDBA_MDB_SRCATTR_TIMER, 
br_timer_value(&ent->timer))) { nla_nest_cancel(skb, nest_ent); goto out_cancel_err; } nla_nest_end(skb, nest_ent); } nla_nest_end(skb, nest); return 0; out_cancel_err: nla_nest_cancel(skb, nest); return -EMSGSIZE; } static int __mdb_fill_info(struct sk_buff *skb, struct net_bridge_mdb_entry *mp, struct net_bridge_port_group *p) { bool dump_srcs_mode = false; struct timer_list *mtimer; struct nlattr *nest_ent; struct br_mdb_entry e; u8 flags = 0; int ifindex; memset(&e, 0, sizeof(e)); if (p) { ifindex = p->key.port->dev->ifindex; mtimer = &p->timer; flags = p->flags; } else { ifindex = mp->br->dev->ifindex; mtimer = &mp->timer; } __mdb_entry_fill_flags(&e, flags); e.ifindex = ifindex; e.vid = mp->addr.vid; if (mp->addr.proto == htons(ETH_P_IP)) { e.addr.u.ip4 = mp->addr.dst.ip4; #if IS_ENABLED(CONFIG_IPV6) } else if (mp->addr.proto == htons(ETH_P_IPV6)) { e.addr.u.ip6 = mp->addr.dst.ip6; #endif } else { ether_addr_copy(e.addr.u.mac_addr, mp->addr.dst.mac_addr); e.state = MDB_PERMANENT; } e.addr.proto = mp->addr.proto; nest_ent = nla_nest_start_noflag(skb, MDBA_MDB_ENTRY_INFO); if (!nest_ent) return -EMSGSIZE; if (nla_put_nohdr(skb, sizeof(e), &e) || nla_put_u32(skb, MDBA_MDB_EATTR_TIMER, br_timer_value(mtimer))) goto nest_err; switch (mp->addr.proto) { case htons(ETH_P_IP): dump_srcs_mode = !!(mp->br->multicast_ctx.multicast_igmp_version == 3); if (mp->addr.src.ip4) { if (nla_put_in_addr(skb, MDBA_MDB_EATTR_SOURCE, mp->addr.src.ip4)) goto nest_err; break; } break; #if IS_ENABLED(CONFIG_IPV6) case htons(ETH_P_IPV6): dump_srcs_mode = !!(mp->br->multicast_ctx.multicast_mld_version == 2); if (!ipv6_addr_any(&mp->addr.src.ip6)) { if (nla_put_in6_addr(skb, MDBA_MDB_EATTR_SOURCE, &mp->addr.src.ip6)) goto nest_err; break; } break; #endif default: ether_addr_copy(e.addr.u.mac_addr, mp->addr.dst.mac_addr); } if (p) { if (nla_put_u8(skb, MDBA_MDB_EATTR_RTPROT, p->rt_protocol)) goto nest_err; if (dump_srcs_mode && (__mdb_fill_srcs(skb, p) || nla_put_u8(skb, MDBA_MDB_EATTR_GROUP_MODE, p->filter_mode))) goto nest_err; } nla_nest_end(skb, nest_ent); return 0; nest_err: nla_nest_cancel(skb, nest_ent); return -EMSGSIZE; } static int br_mdb_fill_info(struct sk_buff *skb, struct netlink_callback *cb, struct net_device *dev) { int idx = 0, s_idx = cb->args[1], err = 0, pidx = 0, s_pidx = cb->args[2]; struct net_bridge *br = netdev_priv(dev); struct net_bridge_mdb_entry *mp; struct nlattr *nest, *nest2; nest = nla_nest_start_noflag(skb, MDBA_MDB); if (nest == NULL) return -EMSGSIZE; hlist_for_each_entry_rcu(mp, &br->mdb_list, mdb_node) { struct net_bridge_port_group *p; struct net_bridge_port_group __rcu **pp; if (idx < s_idx) goto skip; nest2 = nla_nest_start_noflag(skb, MDBA_MDB_ENTRY); if (!nest2) { err = -EMSGSIZE; break; } if (!s_pidx && mp->host_joined) { err = __mdb_fill_info(skb, mp, NULL); if (err) { nla_nest_cancel(skb, nest2); break; } } for (pp = &mp->ports; (p = rcu_dereference(*pp)) != NULL; pp = &p->next) { if (!p->key.port) continue; if (pidx < s_pidx) goto skip_pg; err = __mdb_fill_info(skb, mp, p); if (err) { nla_nest_end(skb, nest2); goto out; } skip_pg: pidx++; } pidx = 0; s_pidx = 0; nla_nest_end(skb, nest2); skip: idx++; } out: cb->args[1] = idx; cb->args[2] = pidx; nla_nest_end(skb, nest); return err; } int br_mdb_dump(struct net_device *dev, struct sk_buff *skb, struct netlink_callback *cb) { struct net_bridge *br = netdev_priv(dev); struct br_port_msg *bpm; struct nlmsghdr *nlh; int err; nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, RTM_GETMDB, sizeof(*bpm), 
NLM_F_MULTI); if (!nlh) return -EMSGSIZE; bpm = nlmsg_data(nlh); memset(bpm, 0, sizeof(*bpm)); bpm->ifindex = dev->ifindex; rcu_read_lock(); err = br_mdb_fill_info(skb, cb, dev); if (err) goto out; err = br_rports_fill_info(skb, &br->multicast_ctx); if (err) goto out; out: rcu_read_unlock(); nlmsg_end(skb, nlh); return err; } static int nlmsg_populate_mdb_fill(struct sk_buff *skb, struct net_device *dev, struct net_bridge_mdb_entry *mp, struct net_bridge_port_group *pg, int type) { struct nlmsghdr *nlh; struct br_port_msg *bpm; struct nlattr *nest, *nest2; nlh = nlmsg_put(skb, 0, 0, type, sizeof(*bpm), 0); if (!nlh) return -EMSGSIZE; bpm = nlmsg_data(nlh); memset(bpm, 0, sizeof(*bpm)); bpm->family = AF_BRIDGE; bpm->ifindex = dev->ifindex; nest = nla_nest_start_noflag(skb, MDBA_MDB); if (nest == NULL) goto cancel; nest2 = nla_nest_start_noflag(skb, MDBA_MDB_ENTRY); if (nest2 == NULL) goto end; if (__mdb_fill_info(skb, mp, pg)) goto end; nla_nest_end(skb, nest2); nla_nest_end(skb, nest); nlmsg_end(skb, nlh); return 0; end: nla_nest_end(skb, nest); cancel: nlmsg_cancel(skb, nlh); return -EMSGSIZE; } static size_t rtnl_mdb_nlmsg_pg_size(const struct net_bridge_port_group *pg) { struct net_bridge_group_src *ent; size_t nlmsg_size, addr_size = 0; /* MDBA_MDB_ENTRY_INFO */ nlmsg_size = nla_total_size(sizeof(struct br_mdb_entry)) + /* MDBA_MDB_EATTR_TIMER */ nla_total_size(sizeof(u32)); if (!pg) goto out; /* MDBA_MDB_EATTR_RTPROT */ nlmsg_size += nla_total_size(sizeof(u8)); switch (pg->key.addr.proto) { case htons(ETH_P_IP): /* MDBA_MDB_EATTR_SOURCE */ if (pg->key.addr.src.ip4) nlmsg_size += nla_total_size(sizeof(__be32)); if (pg->key.port->br->multicast_ctx.multicast_igmp_version == 2) goto out; addr_size = sizeof(__be32); break; #if IS_ENABLED(CONFIG_IPV6) case htons(ETH_P_IPV6): /* MDBA_MDB_EATTR_SOURCE */ if (!ipv6_addr_any(&pg->key.addr.src.ip6)) nlmsg_size += nla_total_size(sizeof(struct in6_addr)); if (pg->key.port->br->multicast_ctx.multicast_mld_version == 1) goto out; addr_size = sizeof(struct in6_addr); break; #endif } /* MDBA_MDB_EATTR_GROUP_MODE */ nlmsg_size += nla_total_size(sizeof(u8)); /* MDBA_MDB_EATTR_SRC_LIST nested attr */ if (!hlist_empty(&pg->src_list)) nlmsg_size += nla_total_size(0); hlist_for_each_entry(ent, &pg->src_list, node) { /* MDBA_MDB_SRCLIST_ENTRY nested attr + * MDBA_MDB_SRCATTR_ADDRESS + MDBA_MDB_SRCATTR_TIMER */ nlmsg_size += nla_total_size(0) + nla_total_size(addr_size) + nla_total_size(sizeof(u32)); } out: return nlmsg_size; } static size_t rtnl_mdb_nlmsg_size(const struct net_bridge_port_group *pg) { return NLMSG_ALIGN(sizeof(struct br_port_msg)) + /* MDBA_MDB */ nla_total_size(0) + /* MDBA_MDB_ENTRY */ nla_total_size(0) + /* Port group entry */ rtnl_mdb_nlmsg_pg_size(pg); } static void __br_mdb_notify(struct net_device *dev, struct net_bridge_mdb_entry *mp, struct net_bridge_port_group *pg, int type, bool notify_switchdev) { struct net *net = dev_net(dev); struct sk_buff *skb; int err = -ENOBUFS; if (notify_switchdev) br_switchdev_mdb_notify(dev, mp, pg, type); skb = nlmsg_new(rtnl_mdb_nlmsg_size(pg), GFP_ATOMIC); if (!skb) goto errout; err = nlmsg_populate_mdb_fill(skb, dev, mp, pg, type); if (err < 0) { kfree_skb(skb); goto errout; } rtnl_notify(skb, net, 0, RTNLGRP_MDB, NULL, GFP_ATOMIC); return; errout: rtnl_set_sk_err(net, RTNLGRP_MDB, err); } void br_mdb_notify(struct net_device *dev, struct net_bridge_mdb_entry *mp, struct net_bridge_port_group *pg, int type) { __br_mdb_notify(dev, mp, pg, type, true); } void br_mdb_flag_change_notify(struct 
net_device *dev, struct net_bridge_mdb_entry *mp, struct net_bridge_port_group *pg) { __br_mdb_notify(dev, mp, pg, RTM_NEWMDB, false); } static int nlmsg_populate_rtr_fill(struct sk_buff *skb, struct net_device *dev, int ifindex, u16 vid, u32 pid, u32 seq, int type, unsigned int flags) { struct nlattr *nest, *port_nest; struct br_port_msg *bpm; struct nlmsghdr *nlh; nlh = nlmsg_put(skb, pid, seq, type, sizeof(*bpm), 0); if (!nlh) return -EMSGSIZE; bpm = nlmsg_data(nlh); memset(bpm, 0, sizeof(*bpm)); bpm->family = AF_BRIDGE; bpm->ifindex = dev->ifindex; nest = nla_nest_start_noflag(skb, MDBA_ROUTER); if (!nest) goto cancel; port_nest = nla_nest_start_noflag(skb, MDBA_ROUTER_PORT); if (!port_nest) goto end; if (nla_put_nohdr(skb, sizeof(u32), &ifindex)) { nla_nest_cancel(skb, port_nest); goto end; } if (vid && nla_put_u16(skb, MDBA_ROUTER_PATTR_VID, vid)) { nla_nest_cancel(skb, port_nest); goto end; } nla_nest_end(skb, port_nest); nla_nest_end(skb, nest); nlmsg_end(skb, nlh); return 0; end: nla_nest_end(skb, nest); cancel: nlmsg_cancel(skb, nlh); return -EMSGSIZE; } static inline size_t rtnl_rtr_nlmsg_size(void) { return NLMSG_ALIGN(sizeof(struct br_port_msg)) + nla_total_size(sizeof(__u32)) + nla_total_size(sizeof(u16)); } void br_rtr_notify(struct net_device *dev, struct net_bridge_mcast_port *pmctx, int type) { struct net *net = dev_net(dev); struct sk_buff *skb; int err = -ENOBUFS; int ifindex; u16 vid; ifindex = pmctx ? pmctx->port->dev->ifindex : 0; vid = pmctx && br_multicast_port_ctx_is_vlan(pmctx) ? pmctx->vlan->vid : 0; skb = nlmsg_new(rtnl_rtr_nlmsg_size(), GFP_ATOMIC); if (!skb) goto errout; err = nlmsg_populate_rtr_fill(skb, dev, ifindex, vid, 0, 0, type, NTF_SELF); if (err < 0) { kfree_skb(skb); goto errout; } rtnl_notify(skb, net, 0, RTNLGRP_MDB, NULL, GFP_ATOMIC); return; errout: rtnl_set_sk_err(net, RTNLGRP_MDB, err); } static const struct nla_policy br_mdbe_src_list_entry_pol[MDBE_SRCATTR_MAX + 1] = { [MDBE_SRCATTR_ADDRESS] = NLA_POLICY_RANGE(NLA_BINARY, sizeof(struct in_addr), sizeof(struct in6_addr)), }; static const struct nla_policy br_mdbe_src_list_pol[MDBE_SRC_LIST_MAX + 1] = { [MDBE_SRC_LIST_ENTRY] = NLA_POLICY_NESTED(br_mdbe_src_list_entry_pol), }; static const struct nla_policy br_mdbe_attrs_pol[MDBE_ATTR_MAX + 1] = { [MDBE_ATTR_SOURCE] = NLA_POLICY_RANGE(NLA_BINARY, sizeof(struct in_addr), sizeof(struct in6_addr)), [MDBE_ATTR_GROUP_MODE] = NLA_POLICY_RANGE(NLA_U8, MCAST_EXCLUDE, MCAST_INCLUDE), [MDBE_ATTR_SRC_LIST] = NLA_POLICY_NESTED(br_mdbe_src_list_pol), [MDBE_ATTR_RTPROT] = NLA_POLICY_MIN(NLA_U8, RTPROT_STATIC), }; static bool is_valid_mdb_source(struct nlattr *attr, __be16 proto, struct netlink_ext_ack *extack) { switch (proto) { case htons(ETH_P_IP): if (nla_len(attr) != sizeof(struct in_addr)) { NL_SET_ERR_MSG_MOD(extack, "IPv4 invalid source address length"); return false; } if (ipv4_is_multicast(nla_get_in_addr(attr))) { NL_SET_ERR_MSG_MOD(extack, "IPv4 multicast source address is not allowed"); return false; } break; #if IS_ENABLED(CONFIG_IPV6) case htons(ETH_P_IPV6): { struct in6_addr src; if (nla_len(attr) != sizeof(struct in6_addr)) { NL_SET_ERR_MSG_MOD(extack, "IPv6 invalid source address length"); return false; } src = nla_get_in6_addr(attr); if (ipv6_addr_is_multicast(&src)) { NL_SET_ERR_MSG_MOD(extack, "IPv6 multicast source address is not allowed"); return false; } break; } #endif default: NL_SET_ERR_MSG_MOD(extack, "Invalid protocol used with source address"); return false; } return true; } static struct net_bridge_mcast * 
__br_mdb_choose_context(struct net_bridge *br, const struct br_mdb_entry *entry, struct netlink_ext_ack *extack) { struct net_bridge_mcast *brmctx = NULL; struct net_bridge_vlan *v; if (!br_opt_get(br, BROPT_MCAST_VLAN_SNOOPING_ENABLED)) { brmctx = &br->multicast_ctx; goto out; } if (!entry->vid) { NL_SET_ERR_MSG_MOD(extack, "Cannot add an entry without a vlan when vlan snooping is enabled"); goto out; } v = br_vlan_find(br_vlan_group(br), entry->vid); if (!v) { NL_SET_ERR_MSG_MOD(extack, "Vlan is not configured"); goto out; } if (br_multicast_ctx_vlan_global_disabled(&v->br_mcast_ctx)) { NL_SET_ERR_MSG_MOD(extack, "Vlan's multicast processing is disabled"); goto out; } brmctx = &v->br_mcast_ctx; out: return brmctx; } static int br_mdb_replace_group_sg(const struct br_mdb_config *cfg, struct net_bridge_mdb_entry *mp, struct net_bridge_port_group *pg, struct net_bridge_mcast *brmctx, unsigned char flags) { unsigned long now = jiffies; pg->flags = flags; pg->rt_protocol = cfg->rt_protocol; if (!(flags & MDB_PG_FLAGS_PERMANENT) && !cfg->src_entry) mod_timer(&pg->timer, now + brmctx->multicast_membership_interval); else timer_delete(&pg->timer); br_mdb_notify(cfg->br->dev, mp, pg, RTM_NEWMDB); return 0; } static int br_mdb_add_group_sg(const struct br_mdb_config *cfg, struct net_bridge_mdb_entry *mp, struct net_bridge_mcast *brmctx, unsigned char flags, struct netlink_ext_ack *extack) { struct net_bridge_port_group __rcu **pp; struct net_bridge_port_group *p; unsigned long now = jiffies; for (pp = &mp->ports; (p = mlock_dereference(*pp, cfg->br)) != NULL; pp = &p->next) { if (p->key.port == cfg->p) { if (!(cfg->nlflags & NLM_F_REPLACE)) { NL_SET_ERR_MSG_MOD(extack, "(S, G) group is already joined by port"); return -EEXIST; } return br_mdb_replace_group_sg(cfg, mp, p, brmctx, flags); } if ((unsigned long)p->key.port < (unsigned long)cfg->p) break; } p = br_multicast_new_port_group(cfg->p, &cfg->group, *pp, flags, NULL, MCAST_INCLUDE, cfg->rt_protocol, extack); if (unlikely(!p)) return -ENOMEM; rcu_assign_pointer(*pp, p); if (!(flags & MDB_PG_FLAGS_PERMANENT) && !cfg->src_entry) mod_timer(&p->timer, now + brmctx->multicast_membership_interval); br_mdb_notify(cfg->br->dev, mp, p, RTM_NEWMDB); /* All of (*, G) EXCLUDE ports need to be added to the new (S, G) for * proper replication. 
*/ if (br_multicast_should_handle_mode(brmctx, cfg->group.proto)) { struct net_bridge_mdb_entry *star_mp; struct br_ip star_group; star_group = p->key.addr; memset(&star_group.src, 0, sizeof(star_group.src)); star_mp = br_mdb_ip_get(cfg->br, &star_group); if (star_mp) br_multicast_sg_add_exclude_ports(star_mp, p); } return 0; } static int br_mdb_add_group_src_fwd(const struct br_mdb_config *cfg, struct br_ip *src_ip, struct net_bridge_mcast *brmctx, struct netlink_ext_ack *extack) { struct net_bridge_mdb_entry *sgmp; struct br_mdb_config sg_cfg; struct br_ip sg_ip; u8 flags = 0; sg_ip = cfg->group; sg_ip.src = src_ip->src; sgmp = br_multicast_new_group(cfg->br, &sg_ip); if (IS_ERR(sgmp)) { NL_SET_ERR_MSG_MOD(extack, "Failed to add (S, G) MDB entry"); return PTR_ERR(sgmp); } if (cfg->entry->state == MDB_PERMANENT) flags |= MDB_PG_FLAGS_PERMANENT; if (cfg->filter_mode == MCAST_EXCLUDE) flags |= MDB_PG_FLAGS_BLOCKED; memset(&sg_cfg, 0, sizeof(sg_cfg)); sg_cfg.br = cfg->br; sg_cfg.p = cfg->p; sg_cfg.entry = cfg->entry; sg_cfg.group = sg_ip; sg_cfg.src_entry = true; sg_cfg.filter_mode = MCAST_INCLUDE; sg_cfg.rt_protocol = cfg->rt_protocol; sg_cfg.nlflags = cfg->nlflags; return br_mdb_add_group_sg(&sg_cfg, sgmp, brmctx, flags, extack); } static int br_mdb_add_group_src(const struct br_mdb_config *cfg, struct net_bridge_port_group *pg, struct net_bridge_mcast *brmctx, struct br_mdb_src_entry *src, struct netlink_ext_ack *extack) { struct net_bridge_group_src *ent; unsigned long now = jiffies; int err; ent = br_multicast_find_group_src(pg, &src->addr); if (!ent) { ent = br_multicast_new_group_src(pg, &src->addr); if (!ent) { NL_SET_ERR_MSG_MOD(extack, "Failed to add new source entry"); return -ENOSPC; } } else if (!(cfg->nlflags & NLM_F_REPLACE)) { NL_SET_ERR_MSG_MOD(extack, "Source entry already exists"); return -EEXIST; } if (cfg->filter_mode == MCAST_INCLUDE && cfg->entry->state == MDB_TEMPORARY) mod_timer(&ent->timer, now + br_multicast_gmi(brmctx)); else timer_delete(&ent->timer); /* Install a (S, G) forwarding entry for the source. 
*/ err = br_mdb_add_group_src_fwd(cfg, &src->addr, brmctx, extack); if (err) goto err_del_sg; ent->flags = BR_SGRP_F_INSTALLED | BR_SGRP_F_USER_ADDED; return 0; err_del_sg: __br_multicast_del_group_src(ent); return err; } static void br_mdb_del_group_src(struct net_bridge_port_group *pg, struct br_mdb_src_entry *src) { struct net_bridge_group_src *ent; ent = br_multicast_find_group_src(pg, &src->addr); if (WARN_ON_ONCE(!ent)) return; br_multicast_del_group_src(ent, false); } static int br_mdb_add_group_srcs(const struct br_mdb_config *cfg, struct net_bridge_port_group *pg, struct net_bridge_mcast *brmctx, struct netlink_ext_ack *extack) { int i, err; for (i = 0; i < cfg->num_src_entries; i++) { err = br_mdb_add_group_src(cfg, pg, brmctx, &cfg->src_entries[i], extack); if (err) goto err_del_group_srcs; } return 0; err_del_group_srcs: for (i--; i >= 0; i--) br_mdb_del_group_src(pg, &cfg->src_entries[i]); return err; } static int br_mdb_replace_group_srcs(const struct br_mdb_config *cfg, struct net_bridge_port_group *pg, struct net_bridge_mcast *brmctx, struct netlink_ext_ack *extack) { struct net_bridge_group_src *ent; struct hlist_node *tmp; int err; hlist_for_each_entry(ent, &pg->src_list, node) ent->flags |= BR_SGRP_F_DELETE; err = br_mdb_add_group_srcs(cfg, pg, brmctx, extack); if (err) goto err_clear_delete; hlist_for_each_entry_safe(ent, tmp, &pg->src_list, node) { if (ent->flags & BR_SGRP_F_DELETE) br_multicast_del_group_src(ent, false); } return 0; err_clear_delete: hlist_for_each_entry(ent, &pg->src_list, node) ent->flags &= ~BR_SGRP_F_DELETE; return err; } static int br_mdb_replace_group_star_g(const struct br_mdb_config *cfg, struct net_bridge_mdb_entry *mp, struct net_bridge_port_group *pg, struct net_bridge_mcast *brmctx, unsigned char flags, struct netlink_ext_ack *extack) { unsigned long now = jiffies; int err; err = br_mdb_replace_group_srcs(cfg, pg, brmctx, extack); if (err) return err; pg->flags = flags; pg->filter_mode = cfg->filter_mode; pg->rt_protocol = cfg->rt_protocol; if (!(flags & MDB_PG_FLAGS_PERMANENT) && cfg->filter_mode == MCAST_EXCLUDE) mod_timer(&pg->timer, now + brmctx->multicast_membership_interval); else timer_delete(&pg->timer); br_mdb_notify(cfg->br->dev, mp, pg, RTM_NEWMDB); if (br_multicast_should_handle_mode(brmctx, cfg->group.proto)) br_multicast_star_g_handle_mode(pg, cfg->filter_mode); return 0; } static int br_mdb_add_group_star_g(const struct br_mdb_config *cfg, struct net_bridge_mdb_entry *mp, struct net_bridge_mcast *brmctx, unsigned char flags, struct netlink_ext_ack *extack) { struct net_bridge_port_group __rcu **pp; struct net_bridge_port_group *p; unsigned long now = jiffies; int err; for (pp = &mp->ports; (p = mlock_dereference(*pp, cfg->br)) != NULL; pp = &p->next) { if (p->key.port == cfg->p) { if (!(cfg->nlflags & NLM_F_REPLACE)) { NL_SET_ERR_MSG_MOD(extack, "(*, G) group is already joined by port"); return -EEXIST; } return br_mdb_replace_group_star_g(cfg, mp, p, brmctx, flags, extack); } if ((unsigned long)p->key.port < (unsigned long)cfg->p) break; } p = br_multicast_new_port_group(cfg->p, &cfg->group, *pp, flags, NULL, cfg->filter_mode, cfg->rt_protocol, extack); if (unlikely(!p)) return -ENOMEM; err = br_mdb_add_group_srcs(cfg, p, brmctx, extack); if (err) goto err_del_port_group; rcu_assign_pointer(*pp, p); if (!(flags & MDB_PG_FLAGS_PERMANENT) && cfg->filter_mode == MCAST_EXCLUDE) mod_timer(&p->timer, now + brmctx->multicast_membership_interval); br_mdb_notify(cfg->br->dev, mp, p, RTM_NEWMDB); /* If we are adding a new EXCLUDE 
port group (*, G), it needs to be * also added to all (S, G) entries for proper replication. */ if (br_multicast_should_handle_mode(brmctx, cfg->group.proto) && cfg->filter_mode == MCAST_EXCLUDE) br_multicast_star_g_handle_mode(p, MCAST_EXCLUDE); return 0; err_del_port_group: br_multicast_del_port_group(p); return err; } static int br_mdb_add_group(const struct br_mdb_config *cfg, struct netlink_ext_ack *extack) { struct br_mdb_entry *entry = cfg->entry; struct net_bridge_port *port = cfg->p; struct net_bridge_mdb_entry *mp; struct net_bridge *br = cfg->br; struct net_bridge_mcast *brmctx; struct br_ip group = cfg->group; unsigned char flags = 0; brmctx = __br_mdb_choose_context(br, entry, extack); if (!brmctx) return -EINVAL; mp = br_multicast_new_group(br, &group); if (IS_ERR(mp)) return PTR_ERR(mp); /* host join */ if (!port) { if (mp->host_joined && !(cfg->nlflags & NLM_F_REPLACE)) { NL_SET_ERR_MSG_MOD(extack, "Group is already joined by host"); return -EEXIST; } br_multicast_host_join(brmctx, mp, false); br_mdb_notify(br->dev, mp, NULL, RTM_NEWMDB); return 0; } if (entry->state == MDB_PERMANENT) flags |= MDB_PG_FLAGS_PERMANENT; if (br_multicast_is_star_g(&group)) return br_mdb_add_group_star_g(cfg, mp, brmctx, flags, extack); else return br_mdb_add_group_sg(cfg, mp, brmctx, flags, extack); } static int __br_mdb_add(const struct br_mdb_config *cfg, struct netlink_ext_ack *extack) { int ret; spin_lock_bh(&cfg->br->multicast_lock); ret = br_mdb_add_group(cfg, extack); spin_unlock_bh(&cfg->br->multicast_lock); return ret; } static int br_mdb_config_src_entry_init(struct nlattr *src_entry, struct br_mdb_src_entry *src, __be16 proto, struct netlink_ext_ack *extack) { struct nlattr *tb[MDBE_SRCATTR_MAX + 1]; int err; err = nla_parse_nested(tb, MDBE_SRCATTR_MAX, src_entry, br_mdbe_src_list_entry_pol, extack); if (err) return err; if (NL_REQ_ATTR_CHECK(extack, src_entry, tb, MDBE_SRCATTR_ADDRESS)) return -EINVAL; if (!is_valid_mdb_source(tb[MDBE_SRCATTR_ADDRESS], proto, extack)) return -EINVAL; src->addr.proto = proto; nla_memcpy(&src->addr.src, tb[MDBE_SRCATTR_ADDRESS], nla_len(tb[MDBE_SRCATTR_ADDRESS])); return 0; } static int br_mdb_config_src_list_init(struct nlattr *src_list, struct br_mdb_config *cfg, struct netlink_ext_ack *extack) { struct nlattr *src_entry; int rem, err; int i = 0; nla_for_each_nested(src_entry, src_list, rem) cfg->num_src_entries++; if (cfg->num_src_entries >= PG_SRC_ENT_LIMIT) { NL_SET_ERR_MSG_FMT_MOD(extack, "Exceeded maximum number of source entries (%u)", PG_SRC_ENT_LIMIT - 1); return -EINVAL; } cfg->src_entries = kcalloc(cfg->num_src_entries, sizeof(struct br_mdb_src_entry), GFP_KERNEL); if (!cfg->src_entries) return -ENOMEM; nla_for_each_nested(src_entry, src_list, rem) { err = br_mdb_config_src_entry_init(src_entry, &cfg->src_entries[i], cfg->entry->addr.proto, extack); if (err) goto err_src_entry_init; i++; } return 0; err_src_entry_init: kfree(cfg->src_entries); return err; } static void br_mdb_config_src_list_fini(struct br_mdb_config *cfg) { kfree(cfg->src_entries); } static int br_mdb_config_attrs_init(struct nlattr *set_attrs, struct br_mdb_config *cfg, struct netlink_ext_ack *extack) { struct nlattr *mdb_attrs[MDBE_ATTR_MAX + 1]; int err; err = nla_parse_nested(mdb_attrs, MDBE_ATTR_MAX, set_attrs, br_mdbe_attrs_pol, extack); if (err) return err; if (mdb_attrs[MDBE_ATTR_SOURCE] && !is_valid_mdb_source(mdb_attrs[MDBE_ATTR_SOURCE], cfg->entry->addr.proto, extack)) return -EINVAL; __mdb_entry_to_br_ip(cfg->entry, &cfg->group, mdb_attrs); if 
(mdb_attrs[MDBE_ATTR_GROUP_MODE]) { if (!cfg->p) { NL_SET_ERR_MSG_MOD(extack, "Filter mode cannot be set for host groups"); return -EINVAL; } if (!br_multicast_is_star_g(&cfg->group)) { NL_SET_ERR_MSG_MOD(extack, "Filter mode can only be set for (*, G) entries"); return -EINVAL; } cfg->filter_mode = nla_get_u8(mdb_attrs[MDBE_ATTR_GROUP_MODE]); } else { cfg->filter_mode = MCAST_EXCLUDE; } if (mdb_attrs[MDBE_ATTR_SRC_LIST]) { if (!cfg->p) { NL_SET_ERR_MSG_MOD(extack, "Source list cannot be set for host groups"); return -EINVAL; } if (!br_multicast_is_star_g(&cfg->group)) { NL_SET_ERR_MSG_MOD(extack, "Source list can only be set for (*, G) entries"); return -EINVAL; } if (!mdb_attrs[MDBE_ATTR_GROUP_MODE]) { NL_SET_ERR_MSG_MOD(extack, "Source list cannot be set without filter mode"); return -EINVAL; } err = br_mdb_config_src_list_init(mdb_attrs[MDBE_ATTR_SRC_LIST], cfg, extack); if (err) return err; } if (!cfg->num_src_entries && cfg->filter_mode == MCAST_INCLUDE) { NL_SET_ERR_MSG_MOD(extack, "Cannot add (*, G) INCLUDE with an empty source list"); return -EINVAL; } if (mdb_attrs[MDBE_ATTR_RTPROT]) { if (!cfg->p) { NL_SET_ERR_MSG_MOD(extack, "Protocol cannot be set for host groups"); return -EINVAL; } cfg->rt_protocol = nla_get_u8(mdb_attrs[MDBE_ATTR_RTPROT]); } return 0; } static int br_mdb_config_init(struct br_mdb_config *cfg, struct net_device *dev, struct nlattr *tb[], u16 nlmsg_flags, struct netlink_ext_ack *extack) { struct net *net = dev_net(dev); memset(cfg, 0, sizeof(*cfg)); cfg->filter_mode = MCAST_EXCLUDE; cfg->rt_protocol = RTPROT_STATIC; cfg->nlflags = nlmsg_flags; cfg->br = netdev_priv(dev); if (!netif_running(cfg->br->dev)) { NL_SET_ERR_MSG_MOD(extack, "Bridge device is not running"); return -EINVAL; } if (!br_opt_get(cfg->br, BROPT_MULTICAST_ENABLED)) { NL_SET_ERR_MSG_MOD(extack, "Bridge's multicast processing is disabled"); return -EINVAL; } cfg->entry = nla_data(tb[MDBA_SET_ENTRY]); if (cfg->entry->ifindex != cfg->br->dev->ifindex) { struct net_device *pdev; pdev = __dev_get_by_index(net, cfg->entry->ifindex); if (!pdev) { NL_SET_ERR_MSG_MOD(extack, "Port net device doesn't exist"); return -ENODEV; } cfg->p = br_port_get_rtnl(pdev); if (!cfg->p) { NL_SET_ERR_MSG_MOD(extack, "Net device is not a bridge port"); return -EINVAL; } if (cfg->p->br != cfg->br) { NL_SET_ERR_MSG_MOD(extack, "Port belongs to a different bridge device"); return -EINVAL; } } if (cfg->entry->addr.proto == htons(ETH_P_IP) && ipv4_is_zeronet(cfg->entry->addr.u.ip4)) { NL_SET_ERR_MSG_MOD(extack, "IPv4 entry group address 0.0.0.0 is not allowed"); return -EINVAL; } if (tb[MDBA_SET_ENTRY_ATTRS]) return br_mdb_config_attrs_init(tb[MDBA_SET_ENTRY_ATTRS], cfg, extack); else __mdb_entry_to_br_ip(cfg->entry, &cfg->group, NULL); return 0; } static void br_mdb_config_fini(struct br_mdb_config *cfg) { br_mdb_config_src_list_fini(cfg); } int br_mdb_add(struct net_device *dev, struct nlattr *tb[], u16 nlmsg_flags, struct netlink_ext_ack *extack) { struct net_bridge_vlan_group *vg; struct net_bridge_vlan *v; struct br_mdb_config cfg; int err; err = br_mdb_config_init(&cfg, dev, tb, nlmsg_flags, extack); if (err) return err; err = -EINVAL; /* host join errors which can happen before creating the group */ if (!cfg.p && !br_group_is_l2(&cfg.group)) { /* don't allow any flags for host-joined IP groups */ if (cfg.entry->state) { NL_SET_ERR_MSG_MOD(extack, "Flags are not allowed for host groups"); goto out; } if (!br_multicast_is_star_g(&cfg.group)) { NL_SET_ERR_MSG_MOD(extack, "Groups with sources cannot be manually host 
joined"); goto out; } } if (br_group_is_l2(&cfg.group) && cfg.entry->state != MDB_PERMANENT) { NL_SET_ERR_MSG_MOD(extack, "Only permanent L2 entries allowed"); goto out; } if (cfg.p) { if (cfg.p->state == BR_STATE_DISABLED && cfg.entry->state != MDB_PERMANENT) { NL_SET_ERR_MSG_MOD(extack, "Port is in disabled state and entry is not permanent"); goto out; } vg = nbp_vlan_group(cfg.p); } else { vg = br_vlan_group(cfg.br); } /* If vlan filtering is enabled and VLAN is not specified * install mdb entry on all vlans configured on the port. */ if (br_vlan_enabled(cfg.br->dev) && vg && cfg.entry->vid == 0) { list_for_each_entry(v, &vg->vlan_list, vlist) { cfg.entry->vid = v->vid; cfg.group.vid = v->vid; err = __br_mdb_add(&cfg, extack); if (err) break; } } else { err = __br_mdb_add(&cfg, extack); } out: br_mdb_config_fini(&cfg); return err; } static int __br_mdb_del(const struct br_mdb_config *cfg) { struct br_mdb_entry *entry = cfg->entry; struct net_bridge *br = cfg->br; struct net_bridge_mdb_entry *mp; struct net_bridge_port_group *p; struct net_bridge_port_group __rcu **pp; struct br_ip ip = cfg->group; int err = -EINVAL; spin_lock_bh(&br->multicast_lock); mp = br_mdb_ip_get(br, &ip); if (!mp) goto unlock; /* host leave */ if (entry->ifindex == mp->br->dev->ifindex && mp->host_joined) { br_multicast_host_leave(mp, false); err = 0; br_mdb_notify(br->dev, mp, NULL, RTM_DELMDB); if (!mp->ports && netif_running(br->dev)) mod_timer(&mp->timer, jiffies); goto unlock; } for (pp = &mp->ports; (p = mlock_dereference(*pp, br)) != NULL; pp = &p->next) { if (!p->key.port || p->key.port->dev->ifindex != entry->ifindex) continue; br_multicast_del_pg(mp, p, pp); err = 0; break; } unlock: spin_unlock_bh(&br->multicast_lock); return err; } int br_mdb_del(struct net_device *dev, struct nlattr *tb[], struct netlink_ext_ack *extack) { struct net_bridge_vlan_group *vg; struct net_bridge_vlan *v; struct br_mdb_config cfg; int err; err = br_mdb_config_init(&cfg, dev, tb, 0, extack); if (err) return err; if (cfg.p) vg = nbp_vlan_group(cfg.p); else vg = br_vlan_group(cfg.br); /* If vlan filtering is enabled and VLAN is not specified * delete mdb entry on all vlans configured on the port. 
*/ if (br_vlan_enabled(cfg.br->dev) && vg && cfg.entry->vid == 0) { list_for_each_entry(v, &vg->vlan_list, vlist) { cfg.entry->vid = v->vid; cfg.group.vid = v->vid; err = __br_mdb_del(&cfg); } } else { err = __br_mdb_del(&cfg); } br_mdb_config_fini(&cfg); return err; } struct br_mdb_flush_desc { u32 port_ifindex; u16 vid; u8 rt_protocol; u8 state; u8 state_mask; }; static const struct nla_policy br_mdbe_attrs_del_bulk_pol[MDBE_ATTR_MAX + 1] = { [MDBE_ATTR_RTPROT] = NLA_POLICY_MIN(NLA_U8, RTPROT_STATIC), [MDBE_ATTR_STATE_MASK] = NLA_POLICY_MASK(NLA_U8, MDB_PERMANENT), }; static int br_mdb_flush_desc_init(struct br_mdb_flush_desc *desc, struct nlattr *tb[], struct netlink_ext_ack *extack) { struct br_mdb_entry *entry = nla_data(tb[MDBA_SET_ENTRY]); struct nlattr *mdbe_attrs[MDBE_ATTR_MAX + 1]; int err; desc->port_ifindex = entry->ifindex; desc->vid = entry->vid; desc->state = entry->state; if (!tb[MDBA_SET_ENTRY_ATTRS]) return 0; err = nla_parse_nested(mdbe_attrs, MDBE_ATTR_MAX, tb[MDBA_SET_ENTRY_ATTRS], br_mdbe_attrs_del_bulk_pol, extack); if (err) return err; if (mdbe_attrs[MDBE_ATTR_STATE_MASK]) desc->state_mask = nla_get_u8(mdbe_attrs[MDBE_ATTR_STATE_MASK]); if (mdbe_attrs[MDBE_ATTR_RTPROT]) desc->rt_protocol = nla_get_u8(mdbe_attrs[MDBE_ATTR_RTPROT]); return 0; } static void br_mdb_flush_host(struct net_bridge *br, struct net_bridge_mdb_entry *mp, const struct br_mdb_flush_desc *desc) { u8 state; if (desc->port_ifindex && desc->port_ifindex != br->dev->ifindex) return; if (desc->rt_protocol) return; state = br_group_is_l2(&mp->addr) ? MDB_PERMANENT : 0; if (desc->state_mask && (state & desc->state_mask) != desc->state) return; br_multicast_host_leave(mp, true); if (!mp->ports && netif_running(br->dev)) mod_timer(&mp->timer, jiffies); } static void br_mdb_flush_pgs(struct net_bridge *br, struct net_bridge_mdb_entry *mp, const struct br_mdb_flush_desc *desc) { struct net_bridge_port_group __rcu **pp; struct net_bridge_port_group *p; for (pp = &mp->ports; (p = mlock_dereference(*pp, br)) != NULL;) { u8 state; if (desc->port_ifindex && desc->port_ifindex != p->key.port->dev->ifindex) { pp = &p->next; continue; } if (desc->rt_protocol && desc->rt_protocol != p->rt_protocol) { pp = &p->next; continue; } state = p->flags & MDB_PG_FLAGS_PERMANENT ? MDB_PERMANENT : 0; if (desc->state_mask && (state & desc->state_mask) != desc->state) { pp = &p->next; continue; } br_multicast_del_pg(mp, p, pp); } } static void br_mdb_flush(struct net_bridge *br, const struct br_mdb_flush_desc *desc) { struct net_bridge_mdb_entry *mp; spin_lock_bh(&br->multicast_lock); /* Safe variant is not needed because entries are removed from the list * upon group timer expiration or bridge deletion. 
*/ hlist_for_each_entry(mp, &br->mdb_list, mdb_node) { if (desc->vid && desc->vid != mp->addr.vid) continue; br_mdb_flush_host(br, mp, desc); br_mdb_flush_pgs(br, mp, desc); } spin_unlock_bh(&br->multicast_lock); } int br_mdb_del_bulk(struct net_device *dev, struct nlattr *tb[], struct netlink_ext_ack *extack) { struct net_bridge *br = netdev_priv(dev); struct br_mdb_flush_desc desc = {}; int err; err = br_mdb_flush_desc_init(&desc, tb, extack); if (err) return err; br_mdb_flush(br, &desc); return 0; } static const struct nla_policy br_mdbe_attrs_get_pol[MDBE_ATTR_MAX + 1] = { [MDBE_ATTR_SOURCE] = NLA_POLICY_RANGE(NLA_BINARY, sizeof(struct in_addr), sizeof(struct in6_addr)), }; static int br_mdb_get_parse(struct net_device *dev, struct nlattr *tb[], struct br_ip *group, struct netlink_ext_ack *extack) { struct br_mdb_entry *entry = nla_data(tb[MDBA_GET_ENTRY]); struct nlattr *mdbe_attrs[MDBE_ATTR_MAX + 1]; int err; if (!tb[MDBA_GET_ENTRY_ATTRS]) { __mdb_entry_to_br_ip(entry, group, NULL); return 0; } err = nla_parse_nested(mdbe_attrs, MDBE_ATTR_MAX, tb[MDBA_GET_ENTRY_ATTRS], br_mdbe_attrs_get_pol, extack); if (err) return err; if (mdbe_attrs[MDBE_ATTR_SOURCE] && !is_valid_mdb_source(mdbe_attrs[MDBE_ATTR_SOURCE], entry->addr.proto, extack)) return -EINVAL; __mdb_entry_to_br_ip(entry, group, mdbe_attrs); return 0; } static struct sk_buff * br_mdb_get_reply_alloc(const struct net_bridge_mdb_entry *mp) { struct net_bridge_port_group *pg; size_t nlmsg_size; nlmsg_size = NLMSG_ALIGN(sizeof(struct br_port_msg)) + /* MDBA_MDB */ nla_total_size(0) + /* MDBA_MDB_ENTRY */ nla_total_size(0); if (mp->host_joined) nlmsg_size += rtnl_mdb_nlmsg_pg_size(NULL); for (pg = mlock_dereference(mp->ports, mp->br); pg; pg = mlock_dereference(pg->next, mp->br)) nlmsg_size += rtnl_mdb_nlmsg_pg_size(pg); return nlmsg_new(nlmsg_size, GFP_ATOMIC); } static int br_mdb_get_reply_fill(struct sk_buff *skb, struct net_bridge_mdb_entry *mp, u32 portid, u32 seq) { struct nlattr *mdb_nest, *mdb_entry_nest; struct net_bridge_port_group *pg; struct br_port_msg *bpm; struct nlmsghdr *nlh; int err; nlh = nlmsg_put(skb, portid, seq, RTM_NEWMDB, sizeof(*bpm), 0); if (!nlh) return -EMSGSIZE; bpm = nlmsg_data(nlh); memset(bpm, 0, sizeof(*bpm)); bpm->family = AF_BRIDGE; bpm->ifindex = mp->br->dev->ifindex; mdb_nest = nla_nest_start_noflag(skb, MDBA_MDB); if (!mdb_nest) { err = -EMSGSIZE; goto cancel; } mdb_entry_nest = nla_nest_start_noflag(skb, MDBA_MDB_ENTRY); if (!mdb_entry_nest) { err = -EMSGSIZE; goto cancel; } if (mp->host_joined) { err = __mdb_fill_info(skb, mp, NULL); if (err) goto cancel; } for (pg = mlock_dereference(mp->ports, mp->br); pg; pg = mlock_dereference(pg->next, mp->br)) { err = __mdb_fill_info(skb, mp, pg); if (err) goto cancel; } nla_nest_end(skb, mdb_entry_nest); nla_nest_end(skb, mdb_nest); nlmsg_end(skb, nlh); return 0; cancel: nlmsg_cancel(skb, nlh); return err; } int br_mdb_get(struct net_device *dev, struct nlattr *tb[], u32 portid, u32 seq, struct netlink_ext_ack *extack) { struct net_bridge *br = netdev_priv(dev); struct net_bridge_mdb_entry *mp; struct sk_buff *skb; struct br_ip group; int err; err = br_mdb_get_parse(dev, tb, &group, extack); if (err) return err; /* Hold the multicast lock to ensure that the MDB entry does not change * between the time the reply size is determined and when the reply is * filled in. 
*/ spin_lock_bh(&br->multicast_lock); mp = br_mdb_ip_get(br, &group); if (!mp || (!mp->ports && !mp->host_joined)) { NL_SET_ERR_MSG_MOD(extack, "MDB entry not found"); err = -ENOENT; goto unlock; } skb = br_mdb_get_reply_alloc(mp); if (!skb) { err = -ENOMEM; goto unlock; } err = br_mdb_get_reply_fill(skb, mp, portid, seq); if (err) { NL_SET_ERR_MSG_MOD(extack, "Failed to fill MDB get reply"); goto free; } spin_unlock_bh(&br->multicast_lock); return rtnl_unicast(skb, dev_net(dev), portid); free: kfree_skb(skb); unlock: spin_unlock_bh(&br->multicast_lock); return err; }
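/*
 * Illustrative userspace sketch (not part of the bridge sources above):
 * the dump path implemented by br_mdb_dump() is driven by an rtnetlink
 * RTM_GETMDB dump request, and each multipart reply carries type
 * RTM_GETMDB with NLM_F_MULTI, as filled in above. The bridge name "br0"
 * is an assumption of the example and error handling is trimmed.
 */
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <net/if.h>
#include <sys/socket.h>
#include <linux/netlink.h>
#include <linux/rtnetlink.h>
#include <linux/if_bridge.h>

int main(void)
{
	struct sockaddr_nl sa = { .nl_family = AF_NETLINK };
	struct {
		struct nlmsghdr nlh;
		struct br_port_msg bpm;
	} req = {
		.nlh = {
			.nlmsg_len = NLMSG_LENGTH(sizeof(struct br_port_msg)),
			.nlmsg_type = RTM_GETMDB,
			.nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP,
			.nlmsg_seq = 1,
		},
		.bpm = {
			.family = AF_BRIDGE,
			.ifindex = if_nametoindex("br0"), /* assumed bridge */
		},
	};
	char buf[8192];
	ssize_t len;
	int fd;

	fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
	if (fd < 0)
		return 1;

	if (sendto(fd, &req, req.nlh.nlmsg_len, 0,
		   (struct sockaddr *)&sa, sizeof(sa)) < 0)
		goto out;

	while ((len = recv(fd, buf, sizeof(buf), 0)) > 0) {
		struct nlmsghdr *nlh = (struct nlmsghdr *)buf;

		for (; NLMSG_OK(nlh, len); nlh = NLMSG_NEXT(nlh, len)) {
			if (nlh->nlmsg_type == NLMSG_DONE)
				goto out;
			/* Each part holds MDBA_MDB and MDBA_ROUTER nests */
			printf("mdb dump part: len=%u\n", nlh->nlmsg_len);
		}
	}
out:
	close(fd);
	return 0;
}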
// SPDX-License-Identifier: GPL-2.0
#include <linux/fanotify.h>
#include <linux/fcntl.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/anon_inodes.h>
#include <linux/fsnotify_backend.h>
#include <linux/init.h>
#include <linux/mount.h>
#include <linux/namei.h>
#include <linux/poll.h>
#include <linux/security.h>
#include <linux/syscalls.h>
#include <linux/slab.h>
#include <linux/types.h>
#include <linux/uaccess.h>
#include <linux/compat.h>
#include <linux/sched/signal.h>
#include <linux/memcontrol.h>
#include <linux/statfs.h>
#include <linux/exportfs.h>

#include <asm/ioctls.h>

#include "../fsnotify.h"
#include "../fdinfo.h"
#include "fanotify.h"

#define FANOTIFY_DEFAULT_MAX_EVENTS	16384
#define FANOTIFY_OLD_DEFAULT_MAX_MARKS	8192
#define FANOTIFY_DEFAULT_MAX_GROUPS	128
#define FANOTIFY_DEFAULT_FEE_POOL_SIZE	32

/*
 * The legacy fanotify marks limit (8192) is per group and we introduced a
 * tunable limit of marks per user, similar to inotify. Effectively, the
 * legacy limit of fanotify marks per user is
 * <max marks per group> * <max groups per user>. This default limit (1M)
 * also happens to match the increased limit of inotify max_user_watches
 * since v5.10.
 */
#define FANOTIFY_DEFAULT_MAX_USER_MARKS	\
	(FANOTIFY_OLD_DEFAULT_MAX_MARKS * FANOTIFY_DEFAULT_MAX_GROUPS)

/*
 * Most of the memory cost of adding an inode mark is pinning the marked
 * inode. The size of the filesystem inode struct is not uniform across
 * filesystems, so double the size of a VFS inode is used as a conservative
 * approximation.
 */
#define INODE_MARK_COST	(2 * sizeof(struct inode))

/* configurable via /proc/sys/fs/fanotify/ */
static int fanotify_max_queued_events __read_mostly;

#ifdef CONFIG_SYSCTL

#include <linux/sysctl.h>

static long ft_zero = 0;
static long ft_int_max = INT_MAX;

static const struct ctl_table fanotify_table[] = {
	{
		.procname	= "max_user_groups",
		.data		= &init_user_ns.ucount_max[UCOUNT_FANOTIFY_GROUPS],
		.maxlen		= sizeof(long),
		.mode		= 0644,
		.proc_handler	= proc_doulongvec_minmax,
		.extra1		= &ft_zero,
		.extra2		= &ft_int_max,
	},
	{
		.procname	= "max_user_marks",
		.data		= &init_user_ns.ucount_max[UCOUNT_FANOTIFY_MARKS],
		.maxlen		= sizeof(long),
		.mode		= 0644,
		.proc_handler	= proc_doulongvec_minmax,
		.extra1		= &ft_zero,
		.extra2		= &ft_int_max,
	},
	{
		.procname	= "max_queued_events",
		.data		= &fanotify_max_queued_events,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= SYSCTL_ZERO
	},
};

static void __init fanotify_sysctls_init(void)
{
	register_sysctl("fs/fanotify", fanotify_table);
}
#else
#define fanotify_sysctls_init() do { } while (0)
#endif /* CONFIG_SYSCTL */

/*
 * All flags that may be specified in parameter event_f_flags of fanotify_init.
 *
 * Internal and external open flags are stored together in field f_flags of
 * struct file. Only external open flags shall be allowed in event_f_flags.
 * Internal flags like FMODE_EXEC shall be excluded.
 */
#define FANOTIFY_INIT_ALL_EVENT_F_BITS (	\
		O_ACCMODE | O_APPEND | O_NONBLOCK |	\
		__O_SYNC | O_DSYNC | O_CLOEXEC |	\
		O_LARGEFILE | O_NOATIME)

extern const struct fsnotify_ops fanotify_fsnotify_ops;

struct kmem_cache *fanotify_mark_cache __ro_after_init;
struct kmem_cache *fanotify_fid_event_cachep __ro_after_init;
struct kmem_cache *fanotify_path_event_cachep __ro_after_init;
struct kmem_cache *fanotify_perm_event_cachep __ro_after_init;
struct kmem_cache *fanotify_mnt_event_cachep __ro_after_init;

#define FANOTIFY_EVENT_ALIGN	4
#define FANOTIFY_FID_INFO_HDR_LEN \
	(sizeof(struct fanotify_event_info_fid) + sizeof(struct file_handle))
#define FANOTIFY_PIDFD_INFO_LEN \
	sizeof(struct fanotify_event_info_pidfd)
#define FANOTIFY_ERROR_INFO_LEN \
	(sizeof(struct fanotify_event_info_error))
#define FANOTIFY_RANGE_INFO_LEN \
	(sizeof(struct fanotify_event_info_range))
#define FANOTIFY_MNT_INFO_LEN \
	(sizeof(struct fanotify_event_info_mnt))

static int fanotify_fid_info_len(int fh_len, int name_len)
{
	int info_len = fh_len;

	if (name_len)
		info_len += name_len + 1;

	return roundup(FANOTIFY_FID_INFO_HDR_LEN + info_len,
		       FANOTIFY_EVENT_ALIGN);
}

/* FAN_RENAME may have one or two dir+name info records */
static int fanotify_dir_name_info_len(struct fanotify_event *event)
{
	struct fanotify_info *info = fanotify_event_info(event);
	int dir_fh_len = fanotify_event_dir_fh_len(event);
	int dir2_fh_len = fanotify_event_dir2_fh_len(event);
	int info_len = 0;

	if (dir_fh_len)
		info_len += fanotify_fid_info_len(dir_fh_len, info->name_len);
	if (dir2_fh_len)
		info_len += fanotify_fid_info_len(dir2_fh_len, info->name2_len);

	return info_len;
}

static size_t fanotify_event_len(unsigned int info_mode,
				 struct fanotify_event *event)
{
	size_t event_len = FAN_EVENT_METADATA_LEN;
	int fh_len;
	int dot_len = 0;

	if (fanotify_is_error_event(event->mask))
		event_len += FANOTIFY_ERROR_INFO_LEN;

	if (fanotify_event_has_any_dir_fh(event)) {
		event_len += fanotify_dir_name_info_len(event);
	} else if ((info_mode & FAN_REPORT_NAME) &&
		   (event->mask & FAN_ONDIR)) {
		/*
		 * With group flag FAN_REPORT_NAME, if name was not recorded in
		 * event on a directory, we will report the name ".".
		 */
		dot_len = 1;
	}

	if (fanotify_event_has_object_fh(event)) {
		fh_len = fanotify_event_object_fh_len(event);
		event_len += fanotify_fid_info_len(fh_len, dot_len);
	}

	if (fanotify_is_mnt_event(event->mask))
		event_len += FANOTIFY_MNT_INFO_LEN;

	if (info_mode & FAN_REPORT_PIDFD)
		event_len += FANOTIFY_PIDFD_INFO_LEN;

	if (fanotify_event_has_access_range(event))
		event_len += FANOTIFY_RANGE_INFO_LEN;

	return event_len;
}

/*
 * Remove a hashed event from merge hash table.
 */
static void fanotify_unhash_event(struct fsnotify_group *group,
				  struct fanotify_event *event)
{
	assert_spin_locked(&group->notification_lock);

	pr_debug("%s: group=%p event=%p bucket=%u\n", __func__,
		 group, event, fanotify_event_hash_bucket(group, event));

	if (WARN_ON_ONCE(hlist_unhashed(&event->merge_list)))
		return;

	hlist_del_init(&event->merge_list);
}

/*
 * Get a fanotify notification event if one exists and is small
 * enough to fit in "count". Return an error pointer if the count
 * is not large enough. When permission event is dequeued, its state is
 * updated accordingly.
*/ static struct fanotify_event *get_one_event(struct fsnotify_group *group, size_t count) { size_t event_size; struct fanotify_event *event = NULL; struct fsnotify_event *fsn_event; unsigned int info_mode = FAN_GROUP_FLAG(group, FANOTIFY_INFO_MODES); pr_debug("%s: group=%p count=%zd\n", __func__, group, count); spin_lock(&group->notification_lock); fsn_event = fsnotify_peek_first_event(group); if (!fsn_event) goto out; event = FANOTIFY_E(fsn_event); event_size = fanotify_event_len(info_mode, event); if (event_size > count) { event = ERR_PTR(-EINVAL); goto out; } /* * Held the notification_lock the whole time, so this is the * same event we peeked above. */ fsnotify_remove_first_event(group); if (fanotify_is_perm_event(event->mask)) FANOTIFY_PERM(event)->state = FAN_EVENT_REPORTED; if (fanotify_is_hashed_event(event->mask)) fanotify_unhash_event(group, event); out: spin_unlock(&group->notification_lock); return event; } static int create_fd(struct fsnotify_group *group, const struct path *path, struct file **file) { int client_fd; struct file *new_file; client_fd = get_unused_fd_flags(group->fanotify_data.f_flags); if (client_fd < 0) return client_fd; /* * We provide an fd for the userspace program, so it could access the * file without generating fanotify events itself. */ new_file = dentry_open_nonotify(path, group->fanotify_data.f_flags, current_cred()); if (IS_ERR(new_file)) { put_unused_fd(client_fd); client_fd = PTR_ERR(new_file); } else { *file = new_file; } return client_fd; } static int process_access_response_info(const char __user *info, size_t info_len, struct fanotify_response_info_audit_rule *friar) { if (info_len != sizeof(*friar)) return -EINVAL; if (copy_from_user(friar, info, sizeof(*friar))) return -EFAULT; if (friar->hdr.type != FAN_RESPONSE_INFO_AUDIT_RULE) return -EINVAL; if (friar->hdr.pad != 0) return -EINVAL; if (friar->hdr.len != sizeof(*friar)) return -EINVAL; return info_len; } /* * Finish processing of permission event by setting it to ANSWERED state and * drop group->notification_lock. 
*/ static void finish_permission_event(struct fsnotify_group *group, struct fanotify_perm_event *event, u32 response, struct fanotify_response_info_audit_rule *friar) __releases(&group->notification_lock) { bool destroy = false; assert_spin_locked(&group->notification_lock); event->response = response & ~FAN_INFO; if (response & FAN_INFO) memcpy(&event->audit_rule, friar, sizeof(*friar)); if (event->state == FAN_EVENT_CANCELED) destroy = true; else event->state = FAN_EVENT_ANSWERED; spin_unlock(&group->notification_lock); if (destroy) fsnotify_destroy_event(group, &event->fae.fse); } static int process_access_response(struct fsnotify_group *group, struct fanotify_response *response_struct, const char __user *info, size_t info_len) { struct fanotify_perm_event *event; int fd = response_struct->fd; u32 response = response_struct->response; int errno = fanotify_get_response_errno(response); int ret = info_len; struct fanotify_response_info_audit_rule friar; pr_debug("%s: group=%p fd=%d response=%x errno=%d buf=%p size=%zu\n", __func__, group, fd, response, errno, info, info_len); /* * make sure the response is valid, if invalid we do nothing and either * userspace can send a valid response or we will clean it up after the * timeout */ if (response & ~FANOTIFY_RESPONSE_VALID_MASK) return -EINVAL; switch (response & FANOTIFY_RESPONSE_ACCESS) { case FAN_ALLOW: if (errno) return -EINVAL; break; case FAN_DENY: /* Custom errno is supported only for pre-content groups */ if (errno && group->priority != FSNOTIFY_PRIO_PRE_CONTENT) return -EINVAL; /* * Limit errno to values expected on open(2)/read(2)/write(2) * of regular files. */ switch (errno) { case 0: case EIO: case EPERM: case EBUSY: case ETXTBSY: case EAGAIN: case ENOSPC: case EDQUOT: break; default: return -EINVAL; } break; default: return -EINVAL; } if ((response & FAN_AUDIT) && !FAN_GROUP_FLAG(group, FAN_ENABLE_AUDIT)) return -EINVAL; if (response & FAN_INFO) { ret = process_access_response_info(info, info_len, &friar); if (ret < 0) return ret; if (fd == FAN_NOFD) return ret; } else { ret = 0; } if (fd < 0) return -EINVAL; spin_lock(&group->notification_lock); list_for_each_entry(event, &group->fanotify_data.access_list, fae.fse.list) { if (event->fd != fd) continue; list_del_init(&event->fae.fse.list); finish_permission_event(group, event, response, &friar); wake_up(&group->fanotify_data.access_waitq); return ret; } spin_unlock(&group->notification_lock); return -ENOENT; } static size_t copy_mnt_info_to_user(struct fanotify_event *event, char __user *buf, int count) { struct fanotify_event_info_mnt info = { }; info.hdr.info_type = FAN_EVENT_INFO_TYPE_MNT; info.hdr.len = FANOTIFY_MNT_INFO_LEN; if (WARN_ON(count < info.hdr.len)) return -EFAULT; info.mnt_id = FANOTIFY_ME(event)->mnt_id; if (copy_to_user(buf, &info, sizeof(info))) return -EFAULT; return info.hdr.len; } static size_t copy_error_info_to_user(struct fanotify_event *event, char __user *buf, int count) { struct fanotify_event_info_error info = { }; struct fanotify_error_event *fee = FANOTIFY_EE(event); info.hdr.info_type = FAN_EVENT_INFO_TYPE_ERROR; info.hdr.len = FANOTIFY_ERROR_INFO_LEN; if (WARN_ON(count < info.hdr.len)) return -EFAULT; info.error = fee->error; info.error_count = fee->err_count; if (copy_to_user(buf, &info, sizeof(info))) return -EFAULT; return info.hdr.len; } static int copy_fid_info_to_user(__kernel_fsid_t *fsid, struct fanotify_fh *fh, int info_type, const char *name, size_t name_len, char __user *buf, size_t count) { struct fanotify_event_info_fid info = { }; 
struct file_handle handle = { }; unsigned char bounce[FANOTIFY_INLINE_FH_LEN], *fh_buf; size_t fh_len = fh ? fh->len : 0; size_t info_len = fanotify_fid_info_len(fh_len, name_len); size_t len = info_len; pr_debug("%s: fh_len=%zu name_len=%zu, info_len=%zu, count=%zu\n", __func__, fh_len, name_len, info_len, count); if (WARN_ON_ONCE(len < sizeof(info) || len > count)) return -EFAULT; /* * Copy event info fid header followed by variable sized file handle * and optionally followed by variable sized filename. */ switch (info_type) { case FAN_EVENT_INFO_TYPE_FID: case FAN_EVENT_INFO_TYPE_DFID: if (WARN_ON_ONCE(name_len)) return -EFAULT; break; case FAN_EVENT_INFO_TYPE_DFID_NAME: case FAN_EVENT_INFO_TYPE_OLD_DFID_NAME: case FAN_EVENT_INFO_TYPE_NEW_DFID_NAME: if (WARN_ON_ONCE(!name || !name_len)) return -EFAULT; break; default: return -EFAULT; } info.hdr.info_type = info_type; info.hdr.len = len; info.fsid = *fsid; if (copy_to_user(buf, &info, sizeof(info))) return -EFAULT; buf += sizeof(info); len -= sizeof(info); if (WARN_ON_ONCE(len < sizeof(handle))) return -EFAULT; handle.handle_type = fh->type; handle.handle_bytes = fh_len; /* Mangle handle_type for bad file_handle */ if (!fh_len) handle.handle_type = FILEID_INVALID; if (copy_to_user(buf, &handle, sizeof(handle))) return -EFAULT; buf += sizeof(handle); len -= sizeof(handle); if (WARN_ON_ONCE(len < fh_len)) return -EFAULT; /* * For an inline fh and inline file name, copy through stack to exclude * the copy from usercopy hardening protections. */ fh_buf = fanotify_fh_buf(fh); if (fh_len <= FANOTIFY_INLINE_FH_LEN) { memcpy(bounce, fh_buf, fh_len); fh_buf = bounce; } if (copy_to_user(buf, fh_buf, fh_len)) return -EFAULT; buf += fh_len; len -= fh_len; if (name_len) { /* Copy the filename with terminating null */ name_len++; if (WARN_ON_ONCE(len < name_len)) return -EFAULT; if (copy_to_user(buf, name, name_len)) return -EFAULT; buf += name_len; len -= name_len; } /* Pad with 0's */ WARN_ON_ONCE(len < 0 || len >= FANOTIFY_EVENT_ALIGN); if (len > 0 && clear_user(buf, len)) return -EFAULT; return info_len; } static int copy_pidfd_info_to_user(int pidfd, char __user *buf, size_t count) { struct fanotify_event_info_pidfd info = { }; size_t info_len = FANOTIFY_PIDFD_INFO_LEN; if (WARN_ON_ONCE(info_len > count)) return -EFAULT; info.hdr.info_type = FAN_EVENT_INFO_TYPE_PIDFD; info.hdr.len = info_len; info.pidfd = pidfd; if (copy_to_user(buf, &info, info_len)) return -EFAULT; return info_len; } static size_t copy_range_info_to_user(struct fanotify_event *event, char __user *buf, int count) { struct fanotify_perm_event *pevent = FANOTIFY_PERM(event); struct fanotify_event_info_range info = { }; size_t info_len = FANOTIFY_RANGE_INFO_LEN; if (WARN_ON_ONCE(info_len > count)) return -EFAULT; if (WARN_ON_ONCE(!pevent->ppos)) return -EINVAL; info.hdr.info_type = FAN_EVENT_INFO_TYPE_RANGE; info.hdr.len = info_len; info.offset = *(pevent->ppos); info.count = pevent->count; if (copy_to_user(buf, &info, info_len)) return -EFAULT; return info_len; } static int copy_info_records_to_user(struct fanotify_event *event, struct fanotify_info *info, unsigned int info_mode, int pidfd, char __user *buf, size_t count) { int ret, total_bytes = 0, info_type = 0; unsigned int fid_mode = info_mode & FANOTIFY_FID_BITS; unsigned int pidfd_mode = info_mode & FAN_REPORT_PIDFD; /* * Event info records order is as follows: * 1. dir fid + name * 2. (optional) new dir fid + new name * 3. (optional) child fid */ if (fanotify_event_has_dir_fh(event)) { info_type = info->name_len ? 
FAN_EVENT_INFO_TYPE_DFID_NAME : FAN_EVENT_INFO_TYPE_DFID; /* FAN_RENAME uses special info types */ if (event->mask & FAN_RENAME) info_type = FAN_EVENT_INFO_TYPE_OLD_DFID_NAME; ret = copy_fid_info_to_user(fanotify_event_fsid(event), fanotify_info_dir_fh(info), info_type, fanotify_info_name(info), info->name_len, buf, count); if (ret < 0) return ret; buf += ret; count -= ret; total_bytes += ret; } /* New dir fid+name may be reported in addition to old dir fid+name */ if (fanotify_event_has_dir2_fh(event)) { info_type = FAN_EVENT_INFO_TYPE_NEW_DFID_NAME; ret = copy_fid_info_to_user(fanotify_event_fsid(event), fanotify_info_dir2_fh(info), info_type, fanotify_info_name2(info), info->name2_len, buf, count); if (ret < 0) return ret; buf += ret; count -= ret; total_bytes += ret; } if (fanotify_event_has_object_fh(event)) { const char *dot = NULL; int dot_len = 0; if (fid_mode == FAN_REPORT_FID || info_type) { /* * With only group flag FAN_REPORT_FID only type FID is * reported. Second info record type is always FID. */ info_type = FAN_EVENT_INFO_TYPE_FID; } else if ((fid_mode & FAN_REPORT_NAME) && (event->mask & FAN_ONDIR)) { /* * With group flag FAN_REPORT_NAME, if name was not * recorded in an event on a directory, report the name * "." with info type DFID_NAME. */ info_type = FAN_EVENT_INFO_TYPE_DFID_NAME; dot = "."; dot_len = 1; } else if ((event->mask & ALL_FSNOTIFY_DIRENT_EVENTS) || (event->mask & FAN_ONDIR)) { /* * With group flag FAN_REPORT_DIR_FID, a single info * record has type DFID for directory entry modification * event and for event on a directory. */ info_type = FAN_EVENT_INFO_TYPE_DFID; } else { /* * With group flags FAN_REPORT_DIR_FID|FAN_REPORT_FID, * a single info record has type FID for event on a * non-directory, when there is no directory to report. * For example, on FAN_DELETE_SELF event. 
*/ info_type = FAN_EVENT_INFO_TYPE_FID; } ret = copy_fid_info_to_user(fanotify_event_fsid(event), fanotify_event_object_fh(event), info_type, dot, dot_len, buf, count); if (ret < 0) return ret; buf += ret; count -= ret; total_bytes += ret; } if (pidfd_mode) { ret = copy_pidfd_info_to_user(pidfd, buf, count); if (ret < 0) return ret; buf += ret; count -= ret; total_bytes += ret; } if (fanotify_is_error_event(event->mask)) { ret = copy_error_info_to_user(event, buf, count); if (ret < 0) return ret; buf += ret; count -= ret; total_bytes += ret; } if (fanotify_event_has_access_range(event)) { ret = copy_range_info_to_user(event, buf, count); if (ret < 0) return ret; buf += ret; count -= ret; total_bytes += ret; } if (fanotify_is_mnt_event(event->mask)) { ret = copy_mnt_info_to_user(event, buf, count); if (ret < 0) return ret; buf += ret; count -= ret; total_bytes += ret; } return total_bytes; } static ssize_t copy_event_to_user(struct fsnotify_group *group, struct fanotify_event *event, char __user *buf, size_t count) { struct fanotify_event_metadata metadata; const struct path *path = fanotify_event_path(event); struct fanotify_info *info = fanotify_event_info(event); unsigned int info_mode = FAN_GROUP_FLAG(group, FANOTIFY_INFO_MODES); unsigned int pidfd_mode = info_mode & FAN_REPORT_PIDFD; struct file *f = NULL, *pidfd_file = NULL; int ret, pidfd = -ESRCH, fd = -EBADF; pr_debug("%s: group=%p event=%p\n", __func__, group, event); metadata.event_len = fanotify_event_len(info_mode, event); metadata.metadata_len = FAN_EVENT_METADATA_LEN; metadata.vers = FANOTIFY_METADATA_VERSION; metadata.reserved = 0; metadata.mask = event->mask & FANOTIFY_OUTGOING_EVENTS; metadata.pid = pid_vnr(event->pid); /* * For an unprivileged listener, event->pid can be used to identify the * events generated by the listener process itself, without disclosing * the pids of other processes. */ if (FAN_GROUP_FLAG(group, FANOTIFY_UNPRIV) && task_tgid(current) != event->pid) metadata.pid = 0; /* * For now, fid mode is required for an unprivileged listener and * fid mode does not report fd in events. Keep this check anyway * for safety in case fid mode requirement is relaxed in the future * to allow unprivileged listener to get events with no fd and no fid. */ if (!FAN_GROUP_FLAG(group, FANOTIFY_UNPRIV) && path && path->mnt && path->dentry) { fd = create_fd(group, path, &f); /* * Opening an fd from dentry can fail for several reasons. * For example, when tasks are gone and we try to open their * /proc files or we try to open a WRONLY file like in sysfs * or when trying to open a file that was deleted on the * remote network server. * * For a group with FAN_REPORT_FD_ERROR, we will send the * event with the error instead of the open fd, otherwise * userspace may not get the error at all. * In any case, userspace will not know which file failed to * open, so add a debug print for further investigation. */ if (fd < 0) { pr_debug("fanotify: create_fd(%pd2) failed err=%d\n", path->dentry, fd); if (!FAN_GROUP_FLAG(group, FAN_REPORT_FD_ERROR)) { /* * Historically, we've handled EOPENSTALE in a * special way and silently dropped such * events. Now we have to keep it to maintain * backward compatibility... */ if (fd == -EOPENSTALE) fd = 0; return fd; } } } if (FAN_GROUP_FLAG(group, FAN_REPORT_FD_ERROR)) metadata.fd = fd; else metadata.fd = fd >= 0 ? fd : FAN_NOFD; if (pidfd_mode) { /* * Complain if the FAN_REPORT_PIDFD and FAN_REPORT_TID mutual * exclusion is ever lifted.
At the time of incorporating pidfd * support within fanotify, the pidfd API only supported the * creation of pidfds for thread-group leaders. */ WARN_ON_ONCE(FAN_GROUP_FLAG(group, FAN_REPORT_TID)); /* * The PIDTYPE_TGID check for an event->pid is performed * preemptively in an attempt to catch out cases where the event * listener reads events after the event generating process has * already terminated. Depending on flag FAN_REPORT_FD_ERROR, * report either -ESRCH or FAN_NOPIDFD to the event listener in * those cases with all other pidfd creation errors reported as * the error code itself or as FAN_EPIDFD. */ if (metadata.pid && pid_has_task(event->pid, PIDTYPE_TGID)) pidfd = pidfd_prepare(event->pid, 0, &pidfd_file); if (!FAN_GROUP_FLAG(group, FAN_REPORT_FD_ERROR) && pidfd < 0) pidfd = pidfd == -ESRCH ? FAN_NOPIDFD : FAN_EPIDFD; } ret = -EFAULT; /* * Sanity check copy size in case get_one_event() and * event_len sizes ever get out of sync. */ if (WARN_ON_ONCE(metadata.event_len > count)) goto out_close_fd; if (copy_to_user(buf, &metadata, FAN_EVENT_METADATA_LEN)) goto out_close_fd; buf += FAN_EVENT_METADATA_LEN; count -= FAN_EVENT_METADATA_LEN; ret = copy_info_records_to_user(event, info, info_mode, pidfd, buf, count); if (ret < 0) goto out_close_fd; if (f) fd_install(fd, f); if (pidfd_file) fd_install(pidfd, pidfd_file); if (fanotify_is_perm_event(event->mask)) FANOTIFY_PERM(event)->fd = fd; return metadata.event_len; out_close_fd: if (f) { put_unused_fd(fd); fput(f); } if (pidfd_file) { put_unused_fd(pidfd); fput(pidfd_file); } return ret; } /* fanotify userspace file descriptor functions */ static __poll_t fanotify_poll(struct file *file, poll_table *wait) { struct fsnotify_group *group = file->private_data; __poll_t ret = 0; poll_wait(file, &group->notification_waitq, wait); spin_lock(&group->notification_lock); if (!fsnotify_notify_queue_is_empty(group)) ret = EPOLLIN | EPOLLRDNORM; spin_unlock(&group->notification_lock); return ret; } static ssize_t fanotify_read(struct file *file, char __user *buf, size_t count, loff_t *pos) { struct fsnotify_group *group; struct fanotify_event *event; char __user *start; int ret; DEFINE_WAIT_FUNC(wait, woken_wake_function); start = buf; group = file->private_data; pr_debug("%s: group=%p\n", __func__, group); add_wait_queue(&group->notification_waitq, &wait); while (1) { /* * User can supply arbitrarily large buffer. Avoid softlockups * in case there are lots of available events. */ cond_resched(); event = get_one_event(group, count); if (IS_ERR(event)) { ret = PTR_ERR(event); break; } if (!event) { ret = -EAGAIN; if (file->f_flags & O_NONBLOCK) break; ret = -ERESTARTSYS; if (signal_pending(current)) break; if (start != buf) break; wait_woken(&wait, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); continue; } ret = copy_event_to_user(group, event, buf, count); /* * Permission events get queued to wait for response. Other * events can be destroyed now.
*/ if (!fanotify_is_perm_event(event->mask)) { fsnotify_destroy_event(group, &event->fse); } else { if (ret <= 0 || FANOTIFY_PERM(event)->fd < 0) { spin_lock(&group->notification_lock); finish_permission_event(group, FANOTIFY_PERM(event), FAN_DENY, NULL); wake_up(&group->fanotify_data.access_waitq); } else { spin_lock(&group->notification_lock); list_add_tail(&event->fse.list, &group->fanotify_data.access_list); spin_unlock(&group->notification_lock); } } if (ret < 0) break; buf += ret; count -= ret; } remove_wait_queue(&group->notification_waitq, &wait); if (start != buf && ret != -EFAULT) ret = buf - start; return ret; } static ssize_t fanotify_write(struct file *file, const char __user *buf, size_t count, loff_t *pos) { struct fanotify_response response; struct fsnotify_group *group; int ret; const char __user *info_buf = buf + sizeof(struct fanotify_response); size_t info_len; if (!IS_ENABLED(CONFIG_FANOTIFY_ACCESS_PERMISSIONS)) return -EINVAL; group = file->private_data; pr_debug("%s: group=%p count=%zu\n", __func__, group, count); if (count < sizeof(response)) return -EINVAL; if (copy_from_user(&response, buf, sizeof(response))) return -EFAULT; info_len = count - sizeof(response); ret = process_access_response(group, &response, info_buf, info_len); if (ret < 0) count = ret; else count = sizeof(response) + ret; return count; } static int fanotify_release(struct inode *ignored, struct file *file) { struct fsnotify_group *group = file->private_data; struct fsnotify_event *fsn_event; /* * Stop new events from arriving in the notification queue. Since * userspace cannot use the fanotify fd anymore, no event can enter or * leave access_list by now either. */ fsnotify_group_stop_queueing(group); /* * Process all permission events on access_list and notification queue * and simulate reply from userspace. */ spin_lock(&group->notification_lock); while (!list_empty(&group->fanotify_data.access_list)) { struct fanotify_perm_event *event; event = list_first_entry(&group->fanotify_data.access_list, struct fanotify_perm_event, fae.fse.list); list_del_init(&event->fae.fse.list); finish_permission_event(group, event, FAN_ALLOW, NULL); spin_lock(&group->notification_lock); } /* * Destroy all non-permission events. For permission events just * dequeue them and set the response. They will be freed once the * response is consumed and fanotify_get_response() returns.
*/ while ((fsn_event = fsnotify_remove_first_event(group))) { struct fanotify_event *event = FANOTIFY_E(fsn_event); if (!(event->mask & FANOTIFY_PERM_EVENTS)) { spin_unlock(&group->notification_lock); fsnotify_destroy_event(group, fsn_event); } else { finish_permission_event(group, FANOTIFY_PERM(event), FAN_ALLOW, NULL); } spin_lock(&group->notification_lock); } spin_unlock(&group->notification_lock); /* Response for all permission events is set, wake up waiters */ wake_up(&group->fanotify_data.access_waitq); /* matches the fanotify_init->fsnotify_alloc_group */ fsnotify_destroy_group(group); return 0; } static long fanotify_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct fsnotify_group *group; struct fsnotify_event *fsn_event; void __user *p; int ret = -ENOTTY; size_t send_len = 0; group = file->private_data; p = (void __user *) arg; switch (cmd) { case FIONREAD: spin_lock(&group->notification_lock); list_for_each_entry(fsn_event, &group->notification_list, list) send_len += FAN_EVENT_METADATA_LEN; spin_unlock(&group->notification_lock); ret = put_user(send_len, (int __user *) p); break; } return ret; } static const struct file_operations fanotify_fops = { .show_fdinfo = fanotify_show_fdinfo, .poll = fanotify_poll, .read = fanotify_read, .write = fanotify_write, .fasync = NULL, .release = fanotify_release, .unlocked_ioctl = fanotify_ioctl, .compat_ioctl = compat_ptr_ioctl, .llseek = noop_llseek, }; static int fanotify_find_path(int dfd, const char __user *filename, struct path *path, unsigned int flags, __u64 mask, unsigned int obj_type) { int ret; pr_debug("%s: dfd=%d filename=%p flags=%x\n", __func__, dfd, filename, flags); if (filename == NULL) { CLASS(fd, f)(dfd); if (fd_empty(f)) return -EBADF; if ((flags & FAN_MARK_ONLYDIR) && !(S_ISDIR(file_inode(fd_file(f))->i_mode))) return -ENOTDIR; *path = fd_file(f)->f_path; path_get(path); } else { unsigned int lookup_flags = 0; if (!(flags & FAN_MARK_DONT_FOLLOW)) lookup_flags |= LOOKUP_FOLLOW; if (flags & FAN_MARK_ONLYDIR) lookup_flags |= LOOKUP_DIRECTORY; ret = user_path_at(dfd, filename, lookup_flags, path); if (ret) goto out; } /* you can only watch an inode if you have read permissions on it */ ret = path_permission(path, MAY_READ); if (ret) { path_put(path); goto out; } ret = security_path_notify(path, mask, obj_type); if (ret) path_put(path); out: return ret; } static __u32 fanotify_mark_remove_from_mask(struct fsnotify_mark *fsn_mark, __u32 mask, unsigned int flags, __u32 umask, int *destroy) { __u32 oldmask, newmask; /* umask bits cannot be removed by user */ mask &= ~umask; spin_lock(&fsn_mark->lock); oldmask = fsnotify_calc_mask(fsn_mark); if (!(flags & FANOTIFY_MARK_IGNORE_BITS)) { fsn_mark->mask &= ~mask; } else { fsn_mark->ignore_mask &= ~mask; } newmask = fsnotify_calc_mask(fsn_mark); /* * We need to keep the mark around even if remaining mask cannot * result in any events (e.g. mask == FAN_ONDIR) to support incremental * changes to the mask. * Destroy mark when only umask bits remain.
*/ *destroy = !((fsn_mark->mask | fsn_mark->ignore_mask) & ~umask); spin_unlock(&fsn_mark->lock); return oldmask & ~newmask; } static int fanotify_remove_mark(struct fsnotify_group *group, void *obj, unsigned int obj_type, __u32 mask, unsigned int flags, __u32 umask) { struct fsnotify_mark *fsn_mark = NULL; __u32 removed; int destroy_mark; fsnotify_group_lock(group); fsn_mark = fsnotify_find_mark(obj, obj_type, group); if (!fsn_mark) { fsnotify_group_unlock(group); return -ENOENT; } removed = fanotify_mark_remove_from_mask(fsn_mark, mask, flags, umask, &destroy_mark); if (removed & fsnotify_conn_mask(fsn_mark->connector)) fsnotify_recalc_mask(fsn_mark->connector); if (destroy_mark) fsnotify_detach_mark(fsn_mark); fsnotify_group_unlock(group); if (destroy_mark) fsnotify_free_mark(fsn_mark); /* matches the fsnotify_find_mark() */ fsnotify_put_mark(fsn_mark); return 0; } static bool fanotify_mark_update_flags(struct fsnotify_mark *fsn_mark, unsigned int fan_flags) { bool want_iref = !(fan_flags & FAN_MARK_EVICTABLE); unsigned int ignore = fan_flags & FANOTIFY_MARK_IGNORE_BITS; bool recalc = false; /* * When using FAN_MARK_IGNORE for the first time, mark starts using * independent event flags in ignore mask. After that, trying to * update the ignore mask with the old FAN_MARK_IGNORED_MASK API * will result in EEXIST error. */ if (ignore == FAN_MARK_IGNORE) fsn_mark->flags |= FSNOTIFY_MARK_FLAG_HAS_IGNORE_FLAGS; /* * Setting FAN_MARK_IGNORED_SURV_MODIFY for the first time may lead to * the removal of the FS_MODIFY bit in calculated mask if it was set * because of an ignore mask that is now going to survive FS_MODIFY. */ if (ignore && (fan_flags & FAN_MARK_IGNORED_SURV_MODIFY) && !(fsn_mark->flags & FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY)) { fsn_mark->flags |= FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY; if (!(fsn_mark->mask & FS_MODIFY)) recalc = true; } if (fsn_mark->connector->type != FSNOTIFY_OBJ_TYPE_INODE || want_iref == !(fsn_mark->flags & FSNOTIFY_MARK_FLAG_NO_IREF)) return recalc; /* * NO_IREF may be removed from a mark, but not added. * When removed, fsnotify_recalc_mask() will take the inode ref. */ WARN_ON_ONCE(!want_iref); fsn_mark->flags &= ~FSNOTIFY_MARK_FLAG_NO_IREF; return true; } static bool fanotify_mark_add_to_mask(struct fsnotify_mark *fsn_mark, __u32 mask, unsigned int fan_flags) { bool recalc; spin_lock(&fsn_mark->lock); if (!(fan_flags & FANOTIFY_MARK_IGNORE_BITS)) fsn_mark->mask |= mask; else fsn_mark->ignore_mask |= mask; recalc = fsnotify_calc_mask(fsn_mark) & ~fsnotify_conn_mask(fsn_mark->connector); recalc |= fanotify_mark_update_flags(fsn_mark, fan_flags); spin_unlock(&fsn_mark->lock); return recalc; } struct fan_fsid { struct super_block *sb; __kernel_fsid_t id; bool weak; }; static int fanotify_set_mark_fsid(struct fsnotify_group *group, struct fsnotify_mark *mark, struct fan_fsid *fsid) { struct fsnotify_mark_connector *conn; struct fsnotify_mark *old; struct super_block *old_sb = NULL; FANOTIFY_MARK(mark)->fsid = fsid->id; mark->flags |= FSNOTIFY_MARK_FLAG_HAS_FSID; if (fsid->weak) mark->flags |= FSNOTIFY_MARK_FLAG_WEAK_FSID; /* First mark added will determine if group is single or multi fsid */ if (list_empty(&group->marks_list)) return 0; /* Find sb of an existing mark */ list_for_each_entry(old, &group->marks_list, g_list) { conn = READ_ONCE(old->connector); if (!conn) continue; old_sb = fsnotify_connector_sb(conn); if (old_sb) break; } /* Only detached marks left? 
*/ if (!old_sb) return 0; /* Do not allow mixing of marks with weak and strong fsid */ if ((mark->flags ^ old->flags) & FSNOTIFY_MARK_FLAG_WEAK_FSID) return -EXDEV; /* Allow mixing of marks with strong fsid from different fs */ if (!fsid->weak) return 0; /* Do not allow mixing marks with weak fsid from different fs */ if (old_sb != fsid->sb) return -EXDEV; /* Do not allow mixing marks from different btrfs sub-volumes */ if (!fanotify_fsid_equal(&FANOTIFY_MARK(old)->fsid, &FANOTIFY_MARK(mark)->fsid)) return -EXDEV; return 0; } static struct fsnotify_mark *fanotify_add_new_mark(struct fsnotify_group *group, void *obj, unsigned int obj_type, unsigned int fan_flags, struct fan_fsid *fsid) { struct ucounts *ucounts = group->fanotify_data.ucounts; struct fanotify_mark *fan_mark; struct fsnotify_mark *mark; int ret; /* * Enforce per-user mark limits in all containing user ns. * A group with FAN_UNLIMITED_MARKS does not contribute to mark count * in the limited groups account. */ BUILD_BUG_ON(!(FANOTIFY_ADMIN_INIT_FLAGS & FAN_UNLIMITED_MARKS)); if (!FAN_GROUP_FLAG(group, FAN_UNLIMITED_MARKS) && !inc_ucount(ucounts->ns, ucounts->uid, UCOUNT_FANOTIFY_MARKS)) return ERR_PTR(-ENOSPC); fan_mark = kmem_cache_alloc(fanotify_mark_cache, GFP_KERNEL); if (!fan_mark) { ret = -ENOMEM; goto out_dec_ucounts; } mark = &fan_mark->fsn_mark; fsnotify_init_mark(mark, group); if (fan_flags & FAN_MARK_EVICTABLE) mark->flags |= FSNOTIFY_MARK_FLAG_NO_IREF; /* Cache fsid of filesystem containing the marked object */ if (fsid) { ret = fanotify_set_mark_fsid(group, mark, fsid); if (ret) goto out_put_mark; } else { fan_mark->fsid.val[0] = fan_mark->fsid.val[1] = 0; } ret = fsnotify_add_mark_locked(mark, obj, obj_type, 0); if (ret) goto out_put_mark; return mark; out_put_mark: fsnotify_put_mark(mark); out_dec_ucounts: if (!FAN_GROUP_FLAG(group, FAN_UNLIMITED_MARKS)) dec_ucount(ucounts, UCOUNT_FANOTIFY_MARKS); return ERR_PTR(ret); } static int fanotify_group_init_error_pool(struct fsnotify_group *group) { if (mempool_initialized(&group->fanotify_data.error_events_pool)) return 0; return mempool_init_kmalloc_pool(&group->fanotify_data.error_events_pool, FANOTIFY_DEFAULT_FEE_POOL_SIZE, sizeof(struct fanotify_error_event)); } static int fanotify_may_update_existing_mark(struct fsnotify_mark *fsn_mark, __u32 mask, unsigned int fan_flags) { /* * Non evictable mark cannot be downgraded to evictable mark. */ if (fan_flags & FAN_MARK_EVICTABLE && !(fsn_mark->flags & FSNOTIFY_MARK_FLAG_NO_IREF)) return -EEXIST; /* * New ignore mask semantics cannot be downgraded to old semantics. */ if (fan_flags & FAN_MARK_IGNORED_MASK && fsn_mark->flags & FSNOTIFY_MARK_FLAG_HAS_IGNORE_FLAGS) return -EEXIST; /* * An ignore mask that survives modify could never be downgraded to not * survive modify. With new FAN_MARK_IGNORE semantics we make that rule * explicit and return an error when trying to update the ignore mask * without the original FAN_MARK_IGNORED_SURV_MODIFY value.
*/ if (fan_flags & FAN_MARK_IGNORE && !(fan_flags & FAN_MARK_IGNORED_SURV_MODIFY) && fsn_mark->flags & FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY) return -EEXIST; /* For now pre-content events are not generated for directories */ mask |= fsn_mark->mask; if (mask & FANOTIFY_PRE_CONTENT_EVENTS && mask & FAN_ONDIR) return -EEXIST; return 0; } static int fanotify_add_mark(struct fsnotify_group *group, void *obj, unsigned int obj_type, __u32 mask, unsigned int fan_flags, struct fan_fsid *fsid) { struct fsnotify_mark *fsn_mark; bool recalc; int ret = 0; fsnotify_group_lock(group); fsn_mark = fsnotify_find_mark(obj, obj_type, group); if (!fsn_mark) { fsn_mark = fanotify_add_new_mark(group, obj, obj_type, fan_flags, fsid); if (IS_ERR(fsn_mark)) { fsnotify_group_unlock(group); return PTR_ERR(fsn_mark); } } /* * Check if the requested mark flags conflict with the existing mark's flags. */ ret = fanotify_may_update_existing_mark(fsn_mark, mask, fan_flags); if (ret) goto out; /* * Error events are pre-allocated per group, only if strictly * needed (i.e. FAN_FS_ERROR was requested). */ if (!(fan_flags & FANOTIFY_MARK_IGNORE_BITS) && (mask & FAN_FS_ERROR)) { ret = fanotify_group_init_error_pool(group); if (ret) goto out; } recalc = fanotify_mark_add_to_mask(fsn_mark, mask, fan_flags); if (recalc) fsnotify_recalc_mask(fsn_mark->connector); out: fsnotify_group_unlock(group); fsnotify_put_mark(fsn_mark); return ret; } static struct fsnotify_event *fanotify_alloc_overflow_event(void) { struct fanotify_event *oevent; oevent = kmalloc(sizeof(*oevent), GFP_KERNEL_ACCOUNT); if (!oevent) return NULL; fanotify_init_event(oevent, 0, FS_Q_OVERFLOW); oevent->type = FANOTIFY_EVENT_TYPE_OVERFLOW; return &oevent->fse; } static struct hlist_head *fanotify_alloc_merge_hash(void) { struct hlist_head *hash; hash = kmalloc(sizeof(struct hlist_head) << FANOTIFY_HTABLE_BITS, GFP_KERNEL_ACCOUNT); if (!hash) return NULL; __hash_init(hash, FANOTIFY_HTABLE_SIZE); return hash; } /* fanotify syscalls */ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags) { struct user_namespace *user_ns = current_user_ns(); struct fsnotify_group *group; int f_flags, fd; unsigned int fid_mode = flags & FANOTIFY_FID_BITS; unsigned int class = flags & FANOTIFY_CLASS_BITS; unsigned int internal_flags = 0; struct file *file; pr_debug("%s: flags=%x event_f_flags=%x\n", __func__, flags, event_f_flags); if (!capable(CAP_SYS_ADMIN)) { /* * An unprivileged user can set up a fanotify group with * limited functionality - an unprivileged group is limited to * notification events with file handles or mount ids and it * cannot use unlimited queue/marks. */ if ((flags & FANOTIFY_ADMIN_INIT_FLAGS) || !(flags & (FANOTIFY_FID_BITS | FAN_REPORT_MNT))) return -EPERM; /* * Setting the internal flag FANOTIFY_UNPRIV on the group * prevents setting mount/filesystem marks on this group and * prevents reporting pid and open fd in events. */ internal_flags |= FANOTIFY_UNPRIV; } #ifdef CONFIG_AUDITSYSCALL if (flags & ~(FANOTIFY_INIT_FLAGS | FAN_ENABLE_AUDIT)) #else if (flags & ~FANOTIFY_INIT_FLAGS) #endif return -EINVAL; /* * A pidfd can only be returned for a thread-group leader; thus * FAN_REPORT_PIDFD and FAN_REPORT_TID need to remain mutually * exclusive.
*/ if ((flags & FAN_REPORT_PIDFD) && (flags & FAN_REPORT_TID)) return -EINVAL; /* Don't allow mixing mnt events with inode events for now */ if (flags & FAN_REPORT_MNT) { if (class != FAN_CLASS_NOTIF) return -EINVAL; if (flags & (FANOTIFY_FID_BITS | FAN_REPORT_FD_ERROR)) return -EINVAL; } if (event_f_flags & ~FANOTIFY_INIT_ALL_EVENT_F_BITS) return -EINVAL; switch (event_f_flags & O_ACCMODE) { case O_RDONLY: case O_RDWR: case O_WRONLY: break; default: return -EINVAL; } if (fid_mode && class != FAN_CLASS_NOTIF) return -EINVAL; /* * Child name is reported with parent fid so requires dir fid. * We can report both child fid and dir fid with or without name. */ if ((fid_mode & FAN_REPORT_NAME) && !(fid_mode & FAN_REPORT_DIR_FID)) return -EINVAL; /* * FAN_REPORT_TARGET_FID requires FAN_REPORT_NAME and FAN_REPORT_FID * and is used as an indication to report both dir and child fid on all * dirent events. */ if ((fid_mode & FAN_REPORT_TARGET_FID) && (!(fid_mode & FAN_REPORT_NAME) || !(fid_mode & FAN_REPORT_FID))) return -EINVAL; f_flags = O_RDWR; if (flags & FAN_CLOEXEC) f_flags |= O_CLOEXEC; if (flags & FAN_NONBLOCK) f_flags |= O_NONBLOCK; /* fsnotify_alloc_group takes a ref. Dropped in fanotify_release */ group = fsnotify_alloc_group(&fanotify_fsnotify_ops, FSNOTIFY_GROUP_USER); if (IS_ERR(group)) { return PTR_ERR(group); } /* Enforce groups limits per user in all containing user ns */ group->fanotify_data.ucounts = inc_ucount(user_ns, current_euid(), UCOUNT_FANOTIFY_GROUPS); if (!group->fanotify_data.ucounts) { fd = -EMFILE; goto out_destroy_group; } group->fanotify_data.flags = flags | internal_flags; group->memcg = get_mem_cgroup_from_mm(current->mm); group->user_ns = get_user_ns(user_ns); group->fanotify_data.merge_hash = fanotify_alloc_merge_hash(); if (!group->fanotify_data.merge_hash) { fd = -ENOMEM; goto out_destroy_group; } group->overflow_event = fanotify_alloc_overflow_event(); if (unlikely(!group->overflow_event)) { fd = -ENOMEM; goto out_destroy_group; } if (force_o_largefile()) event_f_flags |= O_LARGEFILE; group->fanotify_data.f_flags = event_f_flags; init_waitqueue_head(&group->fanotify_data.access_waitq); INIT_LIST_HEAD(&group->fanotify_data.access_list); switch (class) { case FAN_CLASS_NOTIF: group->priority = FSNOTIFY_PRIO_NORMAL; break; case FAN_CLASS_CONTENT: group->priority = FSNOTIFY_PRIO_CONTENT; break; case FAN_CLASS_PRE_CONTENT: group->priority = FSNOTIFY_PRIO_PRE_CONTENT; break; default: fd = -EINVAL; goto out_destroy_group; } BUILD_BUG_ON(!(FANOTIFY_ADMIN_INIT_FLAGS & FAN_UNLIMITED_QUEUE)); if (flags & FAN_UNLIMITED_QUEUE) { group->max_events = UINT_MAX; } else { group->max_events = fanotify_max_queued_events; } if (flags & FAN_ENABLE_AUDIT) { fd = -EPERM; if (!capable(CAP_AUDIT_WRITE)) goto out_destroy_group; } fd = get_unused_fd_flags(f_flags); if (fd < 0) goto out_destroy_group; file = anon_inode_getfile_fmode("[fanotify]", &fanotify_fops, group, f_flags, FMODE_NONOTIFY); if (IS_ERR(file)) { put_unused_fd(fd); fd = PTR_ERR(file); goto out_destroy_group; } fd_install(fd, file); return fd; out_destroy_group: fsnotify_destroy_group(group); return fd; } static int fanotify_test_fsid(struct dentry *dentry, unsigned int flags, struct fan_fsid *fsid) { unsigned int mark_type = flags & FANOTIFY_MARK_TYPE_BITS; __kernel_fsid_t root_fsid; int err; /* * Make sure dentry is not of a filesystem with zero fsid (e.g. fuse). 
*/ err = vfs_get_fsid(dentry, &fsid->id); if (err) return err; fsid->sb = dentry->d_sb; if (!fsid->id.val[0] && !fsid->id.val[1]) { err = -ENODEV; goto weak; } /* * Make sure dentry is not of a filesystem subvolume (e.g. btrfs) * which uses a different fsid than sb root. */ err = vfs_get_fsid(dentry->d_sb->s_root, &root_fsid); if (err) return err; if (!fanotify_fsid_equal(&root_fsid, &fsid->id)) { err = -EXDEV; goto weak; } fsid->weak = false; return 0; weak: /* Allow weak fsid when marking inodes */ fsid->weak = true; return (mark_type == FAN_MARK_INODE) ? 0 : err; } /* Check if filesystem can encode a unique fid */ static int fanotify_test_fid(struct dentry *dentry, unsigned int flags) { unsigned int mark_type = flags & FANOTIFY_MARK_TYPE_BITS; const struct export_operations *nop = dentry->d_sb->s_export_op; /* * We need to make sure that the filesystem supports encoding of * file handles so user can use name_to_handle_at() to compare fids * reported with events to the file handle of watched objects. */ if (!exportfs_can_encode_fid(nop)) return -EOPNOTSUPP; /* * For sb/mount mark, we also need to make sure that the filesystem * supports decoding file handles, so user has a way to map back the * reported fids to filesystem objects. */ if (mark_type != FAN_MARK_INODE && !exportfs_can_decode_fh(nop)) return -EOPNOTSUPP; return 0; } static int fanotify_events_supported(struct fsnotify_group *group, const struct path *path, __u64 mask, unsigned int flags) { unsigned int mark_type = flags & FANOTIFY_MARK_TYPE_BITS; bool is_dir = d_is_dir(path->dentry); /* Strict validation of events in non-dir inode mask with v5.17+ APIs */ bool strict_dir_events = FAN_GROUP_FLAG(group, FAN_REPORT_TARGET_FID) || (mask & FAN_RENAME) || (flags & FAN_MARK_IGNORE); /* * Filesystems need to opt in to pre-content events (a.k.a. HSM) * and they are only supported on regular files and directories. */ if (mask & FANOTIFY_PRE_CONTENT_EVENTS) { if (!(path->mnt->mnt_sb->s_iflags & SB_I_ALLOW_HSM)) return -EOPNOTSUPP; if (!is_dir && !d_is_reg(path->dentry)) return -EINVAL; } /* * Some filesystems such as 'proc' acquire unusual locks when opening * files. For them fanotify permission events have high chances of * deadlocking the system - open done when reporting fanotify event * blocks on this "unusual" lock while another process holding the lock * waits for fanotify permission event to be answered. Just disallow * permission events for such filesystems. */ if (mask & FANOTIFY_PERM_EVENTS && path->mnt->mnt_sb->s_type->fs_flags & FS_DISALLOW_NOTIFY_PERM) return -EINVAL; /* * mount and sb marks are not allowed on kernel internal pseudo fs, * like pipe_mnt, because that would subscribe to events on all the * anonymous pipes in the system. * * SB_NOUSER covers all of the internal pseudo fs whose objects are not * exposed to user's mount namespace, but there are other SB_KERNMOUNT * fs, like nsfs, debugfs, for which the value of allowing sb and mount * mark is questionable. For now we leave them alone. */ if (mark_type != FAN_MARK_INODE && path->mnt->mnt_sb->s_flags & SB_NOUSER) return -EINVAL; /* * We shouldn't have allowed setting dirent events and the directory * flags FAN_ONDIR and FAN_EVENT_ON_CHILD in mask of non-dir inode, * but because we always allowed it, error only when using new APIs.
*/ if (strict_dir_events && mark_type == FAN_MARK_INODE && !is_dir && (mask & FANOTIFY_DIRONLY_EVENT_BITS)) return -ENOTDIR; return 0; } static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, int dfd, const char __user *pathname) { struct inode *inode = NULL; struct fsnotify_group *group; struct path path; struct fan_fsid __fsid, *fsid = NULL; struct user_namespace *user_ns = NULL; struct mnt_namespace *mntns; u32 valid_mask = FANOTIFY_EVENTS | FANOTIFY_EVENT_FLAGS; unsigned int mark_type = flags & FANOTIFY_MARK_TYPE_BITS; unsigned int mark_cmd = flags & FANOTIFY_MARK_CMD_BITS; unsigned int ignore = flags & FANOTIFY_MARK_IGNORE_BITS; unsigned int obj_type, fid_mode; void *obj = NULL; u32 umask = 0; int ret; pr_debug("%s: fanotify_fd=%d flags=%x dfd=%d pathname=%p mask=%llx\n", __func__, fanotify_fd, flags, dfd, pathname, mask); /* we only use the lower 32 bits as of right now. */ if (upper_32_bits(mask)) return -EINVAL; if (flags & ~FANOTIFY_MARK_FLAGS) return -EINVAL; switch (mark_type) { case FAN_MARK_INODE: obj_type = FSNOTIFY_OBJ_TYPE_INODE; break; case FAN_MARK_MOUNT: obj_type = FSNOTIFY_OBJ_TYPE_VFSMOUNT; break; case FAN_MARK_FILESYSTEM: obj_type = FSNOTIFY_OBJ_TYPE_SB; break; case FAN_MARK_MNTNS: obj_type = FSNOTIFY_OBJ_TYPE_MNTNS; break; default: return -EINVAL; } switch (mark_cmd) { case FAN_MARK_ADD: case FAN_MARK_REMOVE: if (!mask) return -EINVAL; break; case FAN_MARK_FLUSH: if (flags & ~(FANOTIFY_MARK_TYPE_BITS | FAN_MARK_FLUSH)) return -EINVAL; break; default: return -EINVAL; } if (IS_ENABLED(CONFIG_FANOTIFY_ACCESS_PERMISSIONS)) valid_mask |= FANOTIFY_PERM_EVENTS; if (mask & ~valid_mask) return -EINVAL; /* We don't allow FAN_MARK_IGNORE & FAN_MARK_IGNORED_MASK together */ if (ignore == (FAN_MARK_IGNORE | FAN_MARK_IGNORED_MASK)) return -EINVAL; /* * Event flags (FAN_ONDIR, FAN_EVENT_ON_CHILD) have no effect with * FAN_MARK_IGNORED_MASK. */ if (ignore == FAN_MARK_IGNORED_MASK) { mask &= ~FANOTIFY_EVENT_FLAGS; umask = FANOTIFY_EVENT_FLAGS; } CLASS(fd, f)(fanotify_fd); if (fd_empty(f)) return -EBADF; /* verify that this is indeed an fanotify instance */ if (unlikely(fd_file(f)->f_op != &fanotify_fops)) return -EINVAL; group = fd_file(f)->private_data; /* Only report mount events on mnt namespace */ if (FAN_GROUP_FLAG(group, FAN_REPORT_MNT)) { if (mask & ~FANOTIFY_MOUNT_EVENTS) return -EINVAL; if (mark_type != FAN_MARK_MNTNS) return -EINVAL; } else { if (mask & FANOTIFY_MOUNT_EVENTS) return -EINVAL; if (mark_type == FAN_MARK_MNTNS) return -EINVAL; } /* * A user is allowed to setup sb/mount/mntns marks only if it is * capable in the user ns where the group was created. */ if (!ns_capable(group->user_ns, CAP_SYS_ADMIN) && mark_type != FAN_MARK_INODE) return -EPERM; /* * Permission events are not allowed for FAN_CLASS_NOTIF. * Pre-content permission events are not allowed for FAN_CLASS_CONTENT. */ if (mask & FANOTIFY_PERM_EVENTS && group->priority == FSNOTIFY_PRIO_NORMAL) return -EINVAL; else if (mask & FANOTIFY_PRE_CONTENT_EVENTS && group->priority == FSNOTIFY_PRIO_CONTENT) return -EINVAL; if (mask & FAN_FS_ERROR && mark_type != FAN_MARK_FILESYSTEM) return -EINVAL; /* * Evictable is only relevant for inode marks, because only inode object * can be evicted on memory pressure. */ if (flags & FAN_MARK_EVICTABLE && mark_type != FAN_MARK_INODE) return -EINVAL; /* * Events that do not carry enough information to report * event->fd require a group that supports reporting fid. 
Those * events are not supported on a mount mark, because they do not * carry enough information (i.e. path) to be filtered by mount * point. */ fid_mode = FAN_GROUP_FLAG(group, FANOTIFY_FID_BITS); if (mask & ~(FANOTIFY_FD_EVENTS|FANOTIFY_MOUNT_EVENTS|FANOTIFY_EVENT_FLAGS) && (!fid_mode || mark_type == FAN_MARK_MOUNT)) return -EINVAL; /* * FAN_RENAME uses special info type records to report the old and * new parent+name. Reporting only old and new parent id is less * useful and was not implemented. */ if (mask & FAN_RENAME && !(fid_mode & FAN_REPORT_NAME)) return -EINVAL; /* Pre-content events are not currently generated for directories. */ if (mask & FANOTIFY_PRE_CONTENT_EVENTS && mask & FAN_ONDIR) return -EINVAL; if (mark_cmd == FAN_MARK_FLUSH) { fsnotify_clear_marks_by_group(group, obj_type); return 0; } ret = fanotify_find_path(dfd, pathname, &path, flags, (mask & ALL_FSNOTIFY_EVENTS), obj_type); if (ret) return ret; if (mark_cmd == FAN_MARK_ADD) { ret = fanotify_events_supported(group, &path, mask, flags); if (ret) goto path_put_and_out; } if (fid_mode) { ret = fanotify_test_fsid(path.dentry, flags, &__fsid); if (ret) goto path_put_and_out; ret = fanotify_test_fid(path.dentry, flags); if (ret) goto path_put_and_out; fsid = &__fsid; } /* * In addition to being capable in the user ns where group was created, * the user also needs to be capable in the user ns associated with * the filesystem or in the user ns associated with the mntns * (when marking mntns). */ if (obj_type == FSNOTIFY_OBJ_TYPE_INODE) { inode = path.dentry->d_inode; obj = inode; } else if (obj_type == FSNOTIFY_OBJ_TYPE_VFSMOUNT) { user_ns = path.mnt->mnt_sb->s_user_ns; obj = path.mnt; } else if (obj_type == FSNOTIFY_OBJ_TYPE_SB) { user_ns = path.mnt->mnt_sb->s_user_ns; obj = path.mnt->mnt_sb; } else if (obj_type == FSNOTIFY_OBJ_TYPE_MNTNS) { mntns = mnt_ns_from_dentry(path.dentry); user_ns = mntns->user_ns; obj = mntns; } ret = -EPERM; if (user_ns && !ns_capable(user_ns, CAP_SYS_ADMIN)) goto path_put_and_out; ret = -EINVAL; if (!obj) goto path_put_and_out; /* * If some other task has this inode open for write we should not add * an ignore mask, unless that ignore mask is supposed to survive * modification changes anyway. */ if (mark_cmd == FAN_MARK_ADD && (flags & FANOTIFY_MARK_IGNORE_BITS) && !(flags & FAN_MARK_IGNORED_SURV_MODIFY)) { ret = !inode ? -EINVAL : -EISDIR; /* FAN_MARK_IGNORE requires SURV_MODIFY for sb/mount/dir marks */ if (ignore == FAN_MARK_IGNORE && (!inode || S_ISDIR(inode->i_mode))) goto path_put_and_out; ret = 0; if (inode && inode_is_open_for_write(inode)) goto path_put_and_out; } /* Mask out FAN_EVENT_ON_CHILD flag for sb/mount/non-dir marks */ if (!inode || !S_ISDIR(inode->i_mode)) { mask &= ~FAN_EVENT_ON_CHILD; umask = FAN_EVENT_ON_CHILD; /* * If group needs to report parent fid, register for getting * events with parent/name info for non-directory. 
*/ if ((fid_mode & FAN_REPORT_DIR_FID) && (flags & FAN_MARK_ADD) && !ignore) mask |= FAN_EVENT_ON_CHILD; } /* create/update an inode mark */ switch (mark_cmd) { case FAN_MARK_ADD: ret = fanotify_add_mark(group, obj, obj_type, mask, flags, fsid); break; case FAN_MARK_REMOVE: ret = fanotify_remove_mark(group, obj, obj_type, mask, flags, umask); break; default: ret = -EINVAL; } path_put_and_out: path_put(&path); return ret; } #ifndef CONFIG_ARCH_SPLIT_ARG64 SYSCALL_DEFINE5(fanotify_mark, int, fanotify_fd, unsigned int, flags, __u64, mask, int, dfd, const char __user *, pathname) { return do_fanotify_mark(fanotify_fd, flags, mask, dfd, pathname); } #endif #if defined(CONFIG_ARCH_SPLIT_ARG64) || defined(CONFIG_COMPAT) SYSCALL32_DEFINE6(fanotify_mark, int, fanotify_fd, unsigned int, flags, SC_ARG64(mask), int, dfd, const char __user *, pathname) { return do_fanotify_mark(fanotify_fd, flags, SC_VAL64(__u64, mask), dfd, pathname); } #endif /* * fanotify_user_setup - Our initialization function. Note that we cannot return * error because we have compiled-in VFS hooks. So an (unlikely) failure here * must result in panic(). */ static int __init fanotify_user_setup(void) { struct sysinfo si; int max_marks; si_meminfo(&si); /* * Allow up to 1% of addressable memory to be accounted for per user * marks limited to the range [8192, 1048576]. mount and sb marks are * a lot cheaper than inode marks, but there is no reason for a user * to have many of those, so calculate by the cost of inode marks. */ max_marks = (((si.totalram - si.totalhigh) / 100) << PAGE_SHIFT) / INODE_MARK_COST; max_marks = clamp(max_marks, FANOTIFY_OLD_DEFAULT_MAX_MARKS, FANOTIFY_DEFAULT_MAX_USER_MARKS); BUILD_BUG_ON(FANOTIFY_INIT_FLAGS & FANOTIFY_INTERNAL_GROUP_FLAGS); BUILD_BUG_ON(HWEIGHT32(FANOTIFY_INIT_FLAGS) != 14); BUILD_BUG_ON(HWEIGHT32(FANOTIFY_MARK_FLAGS) != 11); fanotify_mark_cache = KMEM_CACHE(fanotify_mark, SLAB_PANIC|SLAB_ACCOUNT); fanotify_fid_event_cachep = KMEM_CACHE(fanotify_fid_event, SLAB_PANIC); fanotify_path_event_cachep = KMEM_CACHE(fanotify_path_event, SLAB_PANIC); if (IS_ENABLED(CONFIG_FANOTIFY_ACCESS_PERMISSIONS)) { fanotify_perm_event_cachep = KMEM_CACHE(fanotify_perm_event, SLAB_PANIC); } fanotify_mnt_event_cachep = KMEM_CACHE(fanotify_mnt_event, SLAB_PANIC); fanotify_max_queued_events = FANOTIFY_DEFAULT_MAX_EVENTS; init_user_ns.ucount_max[UCOUNT_FANOTIFY_GROUPS] = FANOTIFY_DEFAULT_MAX_GROUPS; init_user_ns.ucount_max[UCOUNT_FANOTIFY_MARKS] = max_marks; fanotify_sysctls_init(); return 0; } device_initcall(fanotify_user_setup);
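For reference, a minimal userspace sketch of the other side of this interface, following the event-loop pattern documented for fanotify(7). It assumes the glibc <sys/fanotify.h> wrappers, a kernel with CONFIG_FANOTIFY, and CAP_SYS_ADMIN in the caller; the "/tmp" path, buffer size, and event mask are arbitrary illustration choices and error handling is abbreviated:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/fanotify.h>

int main(void)
{
	/* Array of metadata structs keeps the read buffer suitably aligned */
	struct fanotify_event_metadata buf[200], *md;
	ssize_t len;
	/* Plain notification group; event fds will be opened O_RDONLY */
	int fd = fanotify_init(FAN_CLOEXEC | FAN_CLASS_NOTIF, O_RDONLY);

	if (fd < 0)
		return 1;
	/* Watch open and close-after-write on the mount containing /tmp */
	if (fanotify_mark(fd, FAN_MARK_ADD | FAN_MARK_MOUNT,
			  FAN_OPEN | FAN_CLOSE_WRITE, AT_FDCWD, "/tmp") < 0)
		return 1;

	while ((len = read(fd, buf, sizeof(buf))) > 0) {
		/* Walk the variable-length records built by copy_event_to_user() */
		for (md = buf; FAN_EVENT_OK(md, len);
		     md = FAN_EVENT_NEXT(md, len)) {
			if (md->vers != FANOTIFY_METADATA_VERSION)
				return 1;
			printf("mask=%llx pid=%d\n",
			       (unsigned long long)md->mask, (int)md->pid);
			if (md->fd >= 0)	/* fd installed via create_fd() */
				close(md->fd);
		}
	}
	return 0;
}

Note how the loop mirrors the kernel side above: each read() drains whole events via get_one_event()/copy_event_to_user(), and the listener is responsible for closing every metadata->fd it receives.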
/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ELEVATOR_H #define _ELEVATOR_H #include <linux/percpu.h> #include <linux/hashtable.h> #include "blk-mq.h" struct io_cq; struct elevator_type; struct blk_mq_debugfs_attr; /* * Return values from elevator merger */ enum elv_merge { ELEVATOR_NO_MERGE = 0, ELEVATOR_FRONT_MERGE = 1, ELEVATOR_BACK_MERGE = 2, ELEVATOR_DISCARD_MERGE = 3, }; struct blk_mq_alloc_data; struct blk_mq_hw_ctx; struct elevator_tags { /* num. of hardware queues for which tags are allocated */ unsigned int nr_hw_queues; /* depth used while allocating tags */ unsigned int nr_requests; /* shared tag is stored at index 0 */ struct blk_mq_tags *tags[]; }; struct elevator_mq_ops { int (*init_sched)(struct request_queue *, struct elevator_queue *); void (*exit_sched)(struct elevator_queue *); int (*init_hctx)(struct blk_mq_hw_ctx *, unsigned int); void (*exit_hctx)(struct blk_mq_hw_ctx *, unsigned int); void (*depth_updated)(struct blk_mq_hw_ctx *); bool (*allow_merge)(struct request_queue *, struct request *, struct bio *); bool (*bio_merge)(struct request_queue *, struct bio *, unsigned int); int (*request_merge)(struct request_queue *q, struct request **, struct bio *); void (*request_merged)(struct request_queue *, struct request *, enum elv_merge); void (*requests_merged)(struct request_queue *, struct request *, struct request *); void (*limit_depth)(blk_opf_t, struct blk_mq_alloc_data *); void (*prepare_request)(struct request *); void (*finish_request)(struct request *); void (*insert_requests)(struct blk_mq_hw_ctx *hctx, struct list_head *list, blk_insert_t flags); struct request *(*dispatch_request)(struct blk_mq_hw_ctx *); bool (*has_work)(struct blk_mq_hw_ctx *); void (*completed_request)(struct request *, u64); void (*requeue_request)(struct request *); struct request *(*former_request)(struct request_queue *, struct request *); struct request *(*next_request)(struct request_queue *, struct request *); void (*init_icq)(struct io_cq *); void (*exit_icq)(struct io_cq *); }; #define ELV_NAME_MAX (16) struct elv_fs_entry { struct attribute attr; ssize_t (*show)(struct elevator_queue *, char *); ssize_t (*store)(struct elevator_queue *, const char *, size_t); }; /* * identifies an elevator type, such as AS or deadline */ struct elevator_type { /* managed by elevator core */ struct kmem_cache *icq_cache; /* fields provided by elevator implementation */ struct elevator_mq_ops ops; size_t icq_size; /* see iocontext.h */ size_t icq_align; /* ditto */ const struct elv_fs_entry *elevator_attrs; const char *elevator_name; const char *elevator_alias; struct module *elevator_owner; #ifdef CONFIG_BLK_DEBUG_FS const struct blk_mq_debugfs_attr *queue_debugfs_attrs; const struct blk_mq_debugfs_attr *hctx_debugfs_attrs; #endif /* managed by elevator core */ char icq_cache_name[ELV_NAME_MAX + 6]; /* elvname
+ "_io_cq" */ struct list_head list; }; static inline bool elevator_tryget(struct elevator_type *e) { return try_module_get(e->elevator_owner); } static inline void __elevator_get(struct elevator_type *e) { __module_get(e->elevator_owner); } static inline void elevator_put(struct elevator_type *e) { module_put(e->elevator_owner); } #define ELV_HASH_BITS 6 void elv_rqhash_del(struct request_queue *q, struct request *rq); void elv_rqhash_add(struct request_queue *q, struct request *rq); void elv_rqhash_reposition(struct request_queue *q, struct request *rq); struct request *elv_rqhash_find(struct request_queue *q, sector_t offset); /* * each queue has an elevator_queue associated with it */ struct elevator_queue { struct elevator_type *type; struct elevator_tags *et; void *elevator_data; struct kobject kobj; struct mutex sysfs_lock; unsigned long flags; DECLARE_HASHTABLE(hash, ELV_HASH_BITS); }; #define ELEVATOR_FLAG_REGISTERED 0 #define ELEVATOR_FLAG_DYING 1 #define ELEVATOR_FLAG_ENABLE_WBT_ON_EXIT 2 /* * block elevator interface */ extern enum elv_merge elv_merge(struct request_queue *, struct request **, struct bio *); extern void elv_merge_requests(struct request_queue *, struct request *, struct request *); extern void elv_merged_request(struct request_queue *, struct request *, enum elv_merge); extern bool elv_attempt_insert_merge(struct request_queue *, struct request *, struct list_head *); extern struct request *elv_former_request(struct request_queue *, struct request *); extern struct request *elv_latter_request(struct request_queue *, struct request *); void elevator_init_mq(struct request_queue *q); /* * io scheduler registration */ extern int elv_register(struct elevator_type *); extern void elv_unregister(struct elevator_type *); /* * io scheduler sysfs switching */ ssize_t elv_iosched_show(struct gendisk *disk, char *page); ssize_t elv_iosched_store(struct gendisk *disk, const char *page, size_t count); extern bool elv_bio_merge_ok(struct request *, struct bio *); struct elevator_queue *elevator_alloc(struct request_queue *, struct elevator_type *, struct elevator_tags *); /* * Helper functions. */ extern struct request *elv_rb_former_request(struct request_queue *, struct request *); extern struct request *elv_rb_latter_request(struct request_queue *, struct request *); /* * rb support functions. */ extern void elv_rb_add(struct rb_root *, struct request *); extern void elv_rb_del(struct rb_root *, struct request *); extern struct request *elv_rb_find(struct rb_root *, sector_t); /* * Insertion selection */ #define ELEVATOR_INSERT_FRONT 1 #define ELEVATOR_INSERT_BACK 2 #define ELEVATOR_INSERT_SORT 3 #define ELEVATOR_INSERT_REQUEUE 4 #define ELEVATOR_INSERT_FLUSH 5 #define ELEVATOR_INSERT_SORT_MERGE 6 #define rb_entry_rq(node) rb_entry((node), struct request, rb_node) #define rq_entry_fifo(ptr) list_entry((ptr), struct request, queuelist) #define rq_fifo_clear(rq) list_del_init(&(rq)->queuelist) void blk_mq_sched_reg_debugfs(struct request_queue *q); void blk_mq_sched_unreg_debugfs(struct request_queue *q); #endif /* _ELEVATOR_H */
// SPDX-License-Identifier: GPL-2.0 /* * * Copyright (C) 2019-2021 Paragon Software GmbH, All rights reserved. * */ #include <linux/blkdev.h> #include <linux/buffer_head.h> #include <linux/fs.h> #include <linux/kernel.h> #include "debug.h" #include "ntfs.h" #include "ntfs_fs.h" static const struct INDEX_NAMES { const __le16 *name; u8 name_len; } s_index_names[INDEX_MUTEX_TOTAL] = { { I30_NAME, ARRAY_SIZE(I30_NAME) }, { SII_NAME, ARRAY_SIZE(SII_NAME) }, { SDH_NAME, ARRAY_SIZE(SDH_NAME) }, { SO_NAME, ARRAY_SIZE(SO_NAME) }, { SQ_NAME, ARRAY_SIZE(SQ_NAME) }, { SR_NAME, ARRAY_SIZE(SR_NAME) }, }; /* * cmp_fnames - Compare two names in index. * * if l1 != 0 * Both names are little endian on-disk ATTR_FILE_NAME structs.
* else * key1 - cpu_str, key2 - ATTR_FILE_NAME */ static int cmp_fnames(const void *key1, size_t l1, const void *key2, size_t l2, const void *data) { const struct ATTR_FILE_NAME *f2 = key2; const struct ntfs_sb_info *sbi = data; const struct ATTR_FILE_NAME *f1; u16 fsize2; bool both_case; if (l2 <= offsetof(struct ATTR_FILE_NAME, name)) return -1; fsize2 = fname_full_size(f2); if (l2 < fsize2) return -1; both_case = f2->type != FILE_NAME_DOS && !sbi->options->nocase; if (!l1) { const struct le_str *s2 = (struct le_str *)&f2->name_len; /* * If names are equal (case insensitive) * try to compare it case sensitive. */ return ntfs_cmp_names_cpu(key1, s2, sbi->upcase, both_case); } f1 = key1; return ntfs_cmp_names(f1->name, f1->name_len, f2->name, f2->name_len, sbi->upcase, both_case); } /* * cmp_uint - $SII of $Secure and $Q of Quota */ static int cmp_uint(const void *key1, size_t l1, const void *key2, size_t l2, const void *data) { const u32 *k1 = key1; const u32 *k2 = key2; if (l2 < sizeof(u32)) return -1; if (*k1 < *k2) return -1; if (*k1 > *k2) return 1; return 0; } /* * cmp_sdh - $SDH of $Secure */ static int cmp_sdh(const void *key1, size_t l1, const void *key2, size_t l2, const void *data) { const struct SECURITY_KEY *k1 = key1; const struct SECURITY_KEY *k2 = key2; u32 t1, t2; if (l2 < sizeof(struct SECURITY_KEY)) return -1; t1 = le32_to_cpu(k1->hash); t2 = le32_to_cpu(k2->hash); /* First value is a hash value itself. */ if (t1 < t2) return -1; if (t1 > t2) return 1; /* Second value is security Id. */ if (data) { t1 = le32_to_cpu(k1->sec_id); t2 = le32_to_cpu(k2->sec_id); if (t1 < t2) return -1; if (t1 > t2) return 1; } return 0; } /* * cmp_uints - $O of ObjId and "$R" for Reparse. */ static int cmp_uints(const void *key1, size_t l1, const void *key2, size_t l2, const void *data) { const __le32 *k1 = key1; const __le32 *k2 = key2; size_t count; if ((size_t)data == 1) { /* * ni_delete_all -> ntfs_remove_reparse -> * delete all with this reference. 
* k1, k2 - pointers to REPARSE_KEY */ k1 += 1; // Skip REPARSE_KEY.ReparseTag k2 += 1; // Skip REPARSE_KEY.ReparseTag if (l2 <= sizeof(int)) return -1; l2 -= sizeof(int); if (l1 <= sizeof(int)) return 1; l1 -= sizeof(int); } if (l2 < sizeof(int)) return -1; for (count = min(l1, l2) >> 2; count > 0; --count, ++k1, ++k2) { u32 t1 = le32_to_cpu(*k1); u32 t2 = le32_to_cpu(*k2); if (t1 > t2) return 1; if (t1 < t2) return -1; } if (l1 > l2) return 1; if (l1 < l2) return -1; return 0; } static inline NTFS_CMP_FUNC get_cmp_func(const struct INDEX_ROOT *root) { switch (root->type) { case ATTR_NAME: if (root->rule == NTFS_COLLATION_TYPE_FILENAME) return &cmp_fnames; break; case ATTR_ZERO: switch (root->rule) { case NTFS_COLLATION_TYPE_UINT: return &cmp_uint; case NTFS_COLLATION_TYPE_SECURITY_HASH: return &cmp_sdh; case NTFS_COLLATION_TYPE_UINTS: return &cmp_uints; default: break; } break; default: break; } return NULL; } struct bmp_buf { struct ATTRIB *b; struct mft_inode *mi; struct buffer_head *bh; ulong *buf; size_t bit; u32 nbits; u64 new_valid; }; static int bmp_buf_get(struct ntfs_index *indx, struct ntfs_inode *ni, size_t bit, struct bmp_buf *bbuf) { struct ATTRIB *b; size_t data_size, valid_size, vbo, off = bit >> 3; struct ntfs_sb_info *sbi = ni->mi.sbi; CLST vcn = off >> sbi->cluster_bits; struct ATTR_LIST_ENTRY *le = NULL; struct buffer_head *bh; struct super_block *sb; u32 blocksize; const struct INDEX_NAMES *in = &s_index_names[indx->type]; bbuf->bh = NULL; b = ni_find_attr(ni, NULL, &le, ATTR_BITMAP, in->name, in->name_len, &vcn, &bbuf->mi); bbuf->b = b; if (!b) return -EINVAL; if (!b->non_res) { data_size = le32_to_cpu(b->res.data_size); if (off >= data_size) return -EINVAL; bbuf->buf = (ulong *)resident_data(b); bbuf->bit = 0; bbuf->nbits = data_size * 8; return 0; } data_size = le64_to_cpu(b->nres.data_size); if (WARN_ON(off >= data_size)) { /* Looks like filesystem error. */ return -EINVAL; } valid_size = le64_to_cpu(b->nres.valid_size); bh = ntfs_bread_run(sbi, &indx->bitmap_run, off); if (!bh) return -EIO; if (IS_ERR(bh)) return PTR_ERR(bh); bbuf->bh = bh; if (buffer_locked(bh)) __wait_on_buffer(bh); lock_buffer(bh); sb = sbi->sb; blocksize = sb->s_blocksize; vbo = off & ~(size_t)sbi->block_mask; bbuf->new_valid = vbo + blocksize; if (bbuf->new_valid <= valid_size) bbuf->new_valid = 0; else if (bbuf->new_valid > data_size) bbuf->new_valid = data_size; if (vbo >= valid_size) { memset(bh->b_data, 0, blocksize); } else if (vbo + blocksize > valid_size) { u32 voff = valid_size & sbi->block_mask; memset(bh->b_data + voff, 0, blocksize - voff); } bbuf->buf = (ulong *)bh->b_data; bbuf->bit = 8 * (off & ~(size_t)sbi->block_mask); bbuf->nbits = 8 * blocksize; return 0; } static void bmp_buf_put(struct bmp_buf *bbuf, bool dirty) { struct buffer_head *bh = bbuf->bh; struct ATTRIB *b = bbuf->b; if (!bh) { if (b && !b->non_res && dirty) bbuf->mi->dirty = true; return; } if (!dirty) goto out; if (bbuf->new_valid) { b->nres.valid_size = cpu_to_le64(bbuf->new_valid); bbuf->mi->dirty = true; } set_buffer_uptodate(bh); mark_buffer_dirty(bh); out: unlock_buffer(bh); put_bh(bh); } /* * indx_mark_used - Mark the bit @bit as used. */ static int indx_mark_used(struct ntfs_index *indx, struct ntfs_inode *ni, size_t bit) { int err; struct bmp_buf bbuf; err = bmp_buf_get(indx, ni, bit, &bbuf); if (err) return err; __set_bit_le(bit - bbuf.bit, bbuf.buf); bmp_buf_put(&bbuf, true); return 0; } /* * indx_mark_free - Mark the bit @bit as free. 
*/ static int indx_mark_free(struct ntfs_index *indx, struct ntfs_inode *ni, size_t bit) { int err; struct bmp_buf bbuf; err = bmp_buf_get(indx, ni, bit, &bbuf); if (err) return err; __clear_bit_le(bit - bbuf.bit, bbuf.buf); bmp_buf_put(&bbuf, true); return 0; } /* * scan_nres_bitmap * * If ntfs_readdir calls this function (indx_used_bit -> scan_nres_bitmap), * inode is shared locked and no ni_lock. * Use rw_semaphore for read/write access to bitmap_run. */ static int scan_nres_bitmap(struct ntfs_inode *ni, struct ATTRIB *bitmap, struct ntfs_index *indx, size_t from, bool (*fn)(const ulong *buf, u32 bit, u32 bits, size_t *ret), size_t *ret) { struct ntfs_sb_info *sbi = ni->mi.sbi; struct super_block *sb = sbi->sb; struct runs_tree *run = &indx->bitmap_run; struct rw_semaphore *lock = &indx->run_lock; u32 nbits = sb->s_blocksize * 8; u32 blocksize = sb->s_blocksize; u64 valid_size = le64_to_cpu(bitmap->nres.valid_size); u64 data_size = le64_to_cpu(bitmap->nres.data_size); sector_t eblock = bytes_to_block(sb, data_size); size_t vbo = from >> 3; sector_t blk = (vbo & sbi->cluster_mask) >> sb->s_blocksize_bits; sector_t vblock = vbo >> sb->s_blocksize_bits; sector_t blen, block; CLST lcn, clen, vcn, vcn_next; size_t idx; struct buffer_head *bh; bool ok; *ret = MINUS_ONE_T; if (vblock >= eblock) return 0; from &= nbits - 1; vcn = vbo >> sbi->cluster_bits; down_read(lock); ok = run_lookup_entry(run, vcn, &lcn, &clen, &idx); up_read(lock); next_run: if (!ok) { int err; const struct INDEX_NAMES *name = &s_index_names[indx->type]; down_write(lock); err = attr_load_runs_vcn(ni, ATTR_BITMAP, name->name, name->name_len, run, vcn); up_write(lock); if (err) return err; down_read(lock); ok = run_lookup_entry(run, vcn, &lcn, &clen, &idx); up_read(lock); if (!ok) return -EINVAL; } blen = (sector_t)clen * sbi->blocks_per_cluster; block = (sector_t)lcn * sbi->blocks_per_cluster; for (; blk < blen; blk++, from = 0) { bh = ntfs_bread(sb, block + blk); if (!bh) return -EIO; vbo = (u64)vblock << sb->s_blocksize_bits; if (vbo >= valid_size) { memset(bh->b_data, 0, blocksize); } else if (vbo + blocksize > valid_size) { u32 voff = valid_size & sbi->block_mask; memset(bh->b_data + voff, 0, blocksize - voff); } if (vbo + blocksize > data_size) nbits = 8 * (data_size - vbo); ok = nbits > from ? (*fn)((ulong *)bh->b_data, from, nbits, ret) : false; put_bh(bh); if (ok) { *ret += 8 * vbo; return 0; } if (++vblock >= eblock) { *ret = MINUS_ONE_T; return 0; } } blk = 0; vcn_next = vcn + clen; down_read(lock); ok = run_get_entry(run, ++idx, &vcn, &lcn, &clen) && vcn == vcn_next; if (!ok) vcn = vcn_next; up_read(lock); goto next_run; } static bool scan_for_free(const ulong *buf, u32 bit, u32 bits, size_t *ret) { size_t pos = find_next_zero_bit_le(buf, bits, bit); if (pos >= bits) return false; *ret = pos; return true; } /* * indx_find_free - Look for free bit. * * Return: -1 if no free bits. 
*/ static int indx_find_free(struct ntfs_index *indx, struct ntfs_inode *ni, size_t *bit, struct ATTRIB **bitmap) { struct ATTRIB *b; struct ATTR_LIST_ENTRY *le = NULL; const struct INDEX_NAMES *in = &s_index_names[indx->type]; int err; b = ni_find_attr(ni, NULL, &le, ATTR_BITMAP, in->name, in->name_len, NULL, NULL); if (!b) return -ENOENT; *bitmap = b; *bit = MINUS_ONE_T; if (!b->non_res) { u32 nbits = 8 * le32_to_cpu(b->res.data_size); size_t pos = find_next_zero_bit_le(resident_data(b), nbits, 0); if (pos < nbits) *bit = pos; } else { err = scan_nres_bitmap(ni, b, indx, 0, &scan_for_free, bit); if (err) return err; } return 0; } static bool scan_for_used(const ulong *buf, u32 bit, u32 bits, size_t *ret) { size_t pos = find_next_bit_le(buf, bits, bit); if (pos >= bits) return false; *ret = pos; return true; } /* * indx_used_bit - Look for used bit. * * Return: MINUS_ONE_T if no used bits. */ int indx_used_bit(struct ntfs_index *indx, struct ntfs_inode *ni, size_t *bit) { struct ATTRIB *b; struct ATTR_LIST_ENTRY *le = NULL; size_t from = *bit; const struct INDEX_NAMES *in = &s_index_names[indx->type]; int err; b = ni_find_attr(ni, NULL, &le, ATTR_BITMAP, in->name, in->name_len, NULL, NULL); if (!b) return -ENOENT; *bit = MINUS_ONE_T; if (!b->non_res) { u32 nbits = le32_to_cpu(b->res.data_size) * 8; size_t pos = find_next_bit_le(resident_data(b), nbits, from); if (pos < nbits) *bit = pos; } else { err = scan_nres_bitmap(ni, b, indx, from, &scan_for_used, bit); if (err) return err; } return 0; } /* * hdr_find_split * * Find a point at which the index allocation buffer would like to be split. * NOTE: This function should never return 'END' entry NULL returns on error. */ static const struct NTFS_DE *hdr_find_split(const struct INDEX_HDR *hdr) { size_t o; const struct NTFS_DE *e = hdr_first_de(hdr); u32 used_2 = le32_to_cpu(hdr->used) >> 1; u16 esize; if (!e || de_is_last(e)) return NULL; esize = le16_to_cpu(e->size); for (o = le32_to_cpu(hdr->de_off) + esize; o < used_2; o += esize) { const struct NTFS_DE *p = e; e = Add2Ptr(hdr, o); /* We must not return END entry. */ if (de_is_last(e)) return p; esize = le16_to_cpu(e->size); } return e; } /* * hdr_insert_head - Insert some entries at the beginning of the buffer. * * It is used to insert entries into a newly-created buffer. */ static const struct NTFS_DE *hdr_insert_head(struct INDEX_HDR *hdr, const void *ins, u32 ins_bytes) { u32 to_move; struct NTFS_DE *e = hdr_first_de(hdr); u32 used = le32_to_cpu(hdr->used); if (!e) return NULL; /* Now we just make room for the inserted entries and jam it in. */ to_move = used - le32_to_cpu(hdr->de_off); memmove(Add2Ptr(e, ins_bytes), e, to_move); memcpy(e, ins, ins_bytes); hdr->used = cpu_to_le32(used + ins_bytes); return e; } /* * index_hdr_check * * return true if INDEX_HDR is valid */ static bool index_hdr_check(const struct INDEX_HDR *hdr, u32 bytes) { u32 end = le32_to_cpu(hdr->used); u32 tot = le32_to_cpu(hdr->total); u32 off = le32_to_cpu(hdr->de_off); if (!IS_ALIGNED(off, 8) || tot > bytes || end > tot || size_add(off, sizeof(struct NTFS_DE)) > end) { /* incorrect index buffer. 
*/ return false; } return true; } /* * index_buf_check * * return true if INDEX_BUFFER seems is valid */ static bool index_buf_check(const struct INDEX_BUFFER *ib, u32 bytes, const CLST *vbn) { const struct NTFS_RECORD_HEADER *rhdr = &ib->rhdr; u16 fo = le16_to_cpu(rhdr->fix_off); u16 fn = le16_to_cpu(rhdr->fix_num); if (bytes <= offsetof(struct INDEX_BUFFER, ihdr) || rhdr->sign != NTFS_INDX_SIGNATURE || fo < sizeof(struct INDEX_BUFFER) /* Check index buffer vbn. */ || (vbn && *vbn != le64_to_cpu(ib->vbn)) || (fo % sizeof(short)) || fo + fn * sizeof(short) >= bytes || fn != ((bytes >> SECTOR_SHIFT) + 1)) { /* incorrect index buffer. */ return false; } return index_hdr_check(&ib->ihdr, bytes - offsetof(struct INDEX_BUFFER, ihdr)); } void fnd_clear(struct ntfs_fnd *fnd) { int i; for (i = fnd->level - 1; i >= 0; i--) { struct indx_node *n = fnd->nodes[i]; if (!n) continue; put_indx_node(n); fnd->nodes[i] = NULL; } fnd->level = 0; fnd->root_de = NULL; } static int fnd_push(struct ntfs_fnd *fnd, struct indx_node *n, struct NTFS_DE *e) { int i = fnd->level; if (i < 0 || i >= ARRAY_SIZE(fnd->nodes)) return -EINVAL; fnd->nodes[i] = n; fnd->de[i] = e; fnd->level += 1; return 0; } static struct indx_node *fnd_pop(struct ntfs_fnd *fnd) { struct indx_node *n; int i = fnd->level; i -= 1; n = fnd->nodes[i]; fnd->nodes[i] = NULL; fnd->level = i; return n; } static bool fnd_is_empty(struct ntfs_fnd *fnd) { if (!fnd->level) return !fnd->root_de; return !fnd->de[fnd->level - 1]; } /* * hdr_find_e - Locate an entry the index buffer. * * If no matching entry is found, it returns the first entry which is greater * than the desired entry If the search key is greater than all the entries the * buffer, it returns the 'end' entry. This function does a binary search of the * current index buffer, for the first entry that is <= to the search value. * * Return: NULL if error. */ static struct NTFS_DE *hdr_find_e(const struct ntfs_index *indx, const struct INDEX_HDR *hdr, const void *key, size_t key_len, const void *ctx, int *diff) { struct NTFS_DE *e, *found = NULL; NTFS_CMP_FUNC cmp = indx->cmp; int min_idx = 0, mid_idx, max_idx = 0; int diff2; int table_size = 8; u32 e_size, e_key_len; u32 end = le32_to_cpu(hdr->used); u32 off = le32_to_cpu(hdr->de_off); u32 total = le32_to_cpu(hdr->total); u16 offs[128]; if (unlikely(!cmp)) return NULL; fill_table: if (end > total) return NULL; if (size_add(off, sizeof(struct NTFS_DE)) > end) return NULL; e = Add2Ptr(hdr, off); e_size = le16_to_cpu(e->size); if (e_size < sizeof(struct NTFS_DE) || off + e_size > end) return NULL; if (!de_is_last(e)) { offs[max_idx] = off; off += e_size; max_idx++; if (max_idx < table_size) goto fill_table; max_idx--; } binary_search: e_key_len = le16_to_cpu(e->key_size); diff2 = (*cmp)(key, key_len, e + 1, e_key_len, ctx); if (diff2 > 0) { if (found) { min_idx = mid_idx + 1; } else { if (de_is_last(e)) return NULL; max_idx = 0; table_size = min(table_size * 2, (int)ARRAY_SIZE(offs)); goto fill_table; } } else if (diff2 < 0) { if (found) max_idx = mid_idx - 1; else max_idx--; found = e; } else { *diff = 0; return e; } if (min_idx > max_idx) { *diff = -1; return found; } mid_idx = (min_idx + max_idx) >> 1; e = Add2Ptr(hdr, offs[mid_idx]); goto binary_search; } /* * hdr_insert_de - Insert an index entry into the buffer. * * 'before' should be a pointer previously returned from hdr_find_e. 
*/ static struct NTFS_DE *hdr_insert_de(const struct ntfs_index *indx, struct INDEX_HDR *hdr, const struct NTFS_DE *de, struct NTFS_DE *before, const void *ctx) { int diff; size_t off = PtrOffset(hdr, before); u32 used = le32_to_cpu(hdr->used); u32 total = le32_to_cpu(hdr->total); u16 de_size = le16_to_cpu(de->size); /* First, check to see if there's enough room. */ if (used + de_size > total) return NULL; /* We know there's enough space, so we know we'll succeed. */ if (before) { /* Check that before is inside Index. */ if (off >= used || off < le32_to_cpu(hdr->de_off) || off + le16_to_cpu(before->size) > total) { return NULL; } goto ok; } /* No insert point is applied. Get it manually. */ before = hdr_find_e(indx, hdr, de + 1, le16_to_cpu(de->key_size), ctx, &diff); if (!before) return NULL; off = PtrOffset(hdr, before); ok: /* Now we just make room for the entry and jam it in. */ memmove(Add2Ptr(before, de_size), before, used - off); hdr->used = cpu_to_le32(used + de_size); memcpy(before, de, de_size); return before; } /* * hdr_delete_de - Remove an entry from the index buffer. */ static inline struct NTFS_DE *hdr_delete_de(struct INDEX_HDR *hdr, struct NTFS_DE *re) { u32 used = le32_to_cpu(hdr->used); u16 esize = le16_to_cpu(re->size); u32 off = PtrOffset(hdr, re); int bytes = used - (off + esize); /* check INDEX_HDR valid before using INDEX_HDR */ if (!check_index_header(hdr, le32_to_cpu(hdr->total))) return NULL; if (off >= used || esize < sizeof(struct NTFS_DE) || bytes < sizeof(struct NTFS_DE)) return NULL; hdr->used = cpu_to_le32(used - esize); memmove(re, Add2Ptr(re, esize), bytes); return re; } void indx_clear(struct ntfs_index *indx) { run_close(&indx->alloc_run); run_close(&indx->bitmap_run); } int indx_init(struct ntfs_index *indx, struct ntfs_sb_info *sbi, const struct ATTRIB *attr, enum index_mutex_classed type) { u32 t32; const struct INDEX_ROOT *root = resident_data(attr); t32 = le32_to_cpu(attr->res.data_size); if (t32 <= offsetof(struct INDEX_ROOT, ihdr) || !index_hdr_check(&root->ihdr, t32 - offsetof(struct INDEX_ROOT, ihdr))) { goto out; } /* Check root fields. */ if (!root->index_block_clst) goto out; indx->type = type; indx->idx2vbn_bits = __ffs(root->index_block_clst); t32 = le32_to_cpu(root->index_block_size); indx->index_bits = blksize_bits(t32); /* Check index record size. */ if (t32 < sbi->cluster_size) { /* Index record is smaller than a cluster, use 512 blocks. */ if (t32 != root->index_block_clst * SECTOR_SIZE) goto out; /* Check alignment to a cluster. */ if ((sbi->cluster_size >> SECTOR_SHIFT) & (root->index_block_clst - 1)) { goto out; } indx->vbn2vbo_bits = SECTOR_SHIFT; } else { /* Index record must be a multiple of cluster size. 
*/ if (t32 != root->index_block_clst << sbi->cluster_bits) goto out; indx->vbn2vbo_bits = sbi->cluster_bits; } init_rwsem(&indx->run_lock); indx->cmp = get_cmp_func(root); if (!indx->cmp) goto out; return 0; out: ntfs_set_state(sbi, NTFS_DIRTY_DIRTY); return -EINVAL; } static struct indx_node *indx_new(struct ntfs_index *indx, struct ntfs_inode *ni, CLST vbn, const __le64 *sub_vbn) { int err; struct NTFS_DE *e; struct indx_node *r; struct INDEX_HDR *hdr; struct INDEX_BUFFER *index; u64 vbo = (u64)vbn << indx->vbn2vbo_bits; u32 bytes = 1u << indx->index_bits; u16 fn; u32 eo; r = kzalloc(sizeof(struct indx_node), GFP_NOFS); if (!r) return ERR_PTR(-ENOMEM); index = kzalloc(bytes, GFP_NOFS); if (!index) { kfree(r); return ERR_PTR(-ENOMEM); } err = ntfs_get_bh(ni->mi.sbi, &indx->alloc_run, vbo, bytes, &r->nb); if (err) { kfree(index); kfree(r); return ERR_PTR(err); } /* Create header. */ index->rhdr.sign = NTFS_INDX_SIGNATURE; index->rhdr.fix_off = cpu_to_le16(sizeof(struct INDEX_BUFFER)); // 0x28 fn = (bytes >> SECTOR_SHIFT) + 1; // 9 index->rhdr.fix_num = cpu_to_le16(fn); index->vbn = cpu_to_le64(vbn); hdr = &index->ihdr; eo = ALIGN(sizeof(struct INDEX_BUFFER) + fn * sizeof(short), 8); hdr->de_off = cpu_to_le32(eo); e = Add2Ptr(hdr, eo); if (sub_vbn) { e->flags = NTFS_IE_LAST | NTFS_IE_HAS_SUBNODES; e->size = cpu_to_le16(sizeof(struct NTFS_DE) + sizeof(u64)); hdr->used = cpu_to_le32(eo + sizeof(struct NTFS_DE) + sizeof(u64)); de_set_vbn_le(e, *sub_vbn); hdr->flags = NTFS_INDEX_HDR_HAS_SUBNODES; } else { e->size = cpu_to_le16(sizeof(struct NTFS_DE)); hdr->used = cpu_to_le32(eo + sizeof(struct NTFS_DE)); e->flags = NTFS_IE_LAST; } hdr->total = cpu_to_le32(bytes - offsetof(struct INDEX_BUFFER, ihdr)); r->index = index; return r; } struct INDEX_ROOT *indx_get_root(struct ntfs_index *indx, struct ntfs_inode *ni, struct ATTRIB **attr, struct mft_inode **mi) { struct ATTR_LIST_ENTRY *le = NULL; struct ATTRIB *a; const struct INDEX_NAMES *in = &s_index_names[indx->type]; struct INDEX_ROOT *root; a = ni_find_attr(ni, NULL, &le, ATTR_ROOT, in->name, in->name_len, NULL, mi); if (!a) return NULL; if (attr) *attr = a; root = resident_data_ex(a, sizeof(struct INDEX_ROOT)); /* length check */ if (root && offsetof(struct INDEX_ROOT, ihdr) + le32_to_cpu(root->ihdr.used) > le32_to_cpu(a->res.data_size)) { return NULL; } return root; } static int indx_write(struct ntfs_index *indx, struct ntfs_inode *ni, struct indx_node *node, int sync) { struct INDEX_BUFFER *ib = node->index; return ntfs_write_bh(ni->mi.sbi, &ib->rhdr, &node->nb, sync); } /* * indx_read * * If ntfs_readdir calls this function * inode is shared locked and no ni_lock. * Use rw_semaphore for read/write access to alloc_run. 
*/ int indx_read(struct ntfs_index *indx, struct ntfs_inode *ni, CLST vbn, struct indx_node **node) { int err; struct INDEX_BUFFER *ib; struct runs_tree *run = &indx->alloc_run; struct rw_semaphore *lock = &indx->run_lock; u64 vbo = (u64)vbn << indx->vbn2vbo_bits; u32 bytes = 1u << indx->index_bits; struct indx_node *in = *node; const struct INDEX_NAMES *name; if (!in) { in = kzalloc(sizeof(struct indx_node), GFP_NOFS); if (!in) return -ENOMEM; } else { nb_put(&in->nb); } ib = in->index; if (!ib) { ib = kmalloc(bytes, GFP_NOFS); if (!ib) { err = -ENOMEM; goto out; } } down_read(lock); err = ntfs_read_bh(ni->mi.sbi, run, vbo, &ib->rhdr, bytes, &in->nb); up_read(lock); if (!err) goto ok; if (err == -E_NTFS_FIXUP) goto ok; if (err != -ENOENT) goto out; name = &s_index_names[indx->type]; down_write(lock); err = attr_load_runs_range(ni, ATTR_ALLOC, name->name, name->name_len, run, vbo, vbo + bytes); up_write(lock); if (err) goto out; down_read(lock); err = ntfs_read_bh(ni->mi.sbi, run, vbo, &ib->rhdr, bytes, &in->nb); up_read(lock); if (err == -E_NTFS_FIXUP) goto ok; if (err) goto out; ok: if (!index_buf_check(ib, bytes, &vbn)) { _ntfs_bad_inode(&ni->vfs_inode); err = -EINVAL; goto out; } if (err == -E_NTFS_FIXUP) { ntfs_write_bh(ni->mi.sbi, &ib->rhdr, &in->nb, 0); err = 0; } /* check for index header length */ if (offsetof(struct INDEX_BUFFER, ihdr) + le32_to_cpu(ib->ihdr.used) > bytes) { err = -EINVAL; goto out; } in->index = ib; *node = in; out: if (err == -E_NTFS_CORRUPT) { _ntfs_bad_inode(&ni->vfs_inode); err = -EINVAL; } if (ib != in->index) kfree(ib); if (*node != in) { nb_put(&in->nb); kfree(in); } return err; } /* * indx_find - Scan NTFS directory for given entry. */ int indx_find(struct ntfs_index *indx, struct ntfs_inode *ni, const struct INDEX_ROOT *root, const void *key, size_t key_len, const void *ctx, int *diff, struct NTFS_DE **entry, struct ntfs_fnd *fnd) { int err; struct NTFS_DE *e; struct indx_node *node; if (!root) root = indx_get_root(&ni->dir, ni, NULL, NULL); if (!root) { /* Should not happen. */ return -EINVAL; } /* Check cache. */ e = fnd->level ? fnd->de[fnd->level - 1] : fnd->root_de; if (e && !de_is_last(e) && !(*indx->cmp)(key, key_len, e + 1, le16_to_cpu(e->key_size), ctx)) { *entry = e; *diff = 0; return 0; } /* Soft finder reset. */ fnd_clear(fnd); /* Lookup entry that is <= to the search value. */ e = hdr_find_e(indx, &root->ihdr, key, key_len, ctx, diff); if (!e) return -EINVAL; fnd->root_de = e; for (;;) { node = NULL; if (*diff >= 0 || !de_has_vcn_ex(e)) break; /* Read next level. */ err = indx_read(indx, ni, de_get_vbn(e), &node); if (err) { /* io error? */ return err; } /* Lookup entry that is <= to the search value. */ e = hdr_find_e(indx, &node->index->ihdr, key, key_len, ctx, diff); if (!e) { put_indx_node(node); return -EINVAL; } fnd_push(fnd, node, e); } *entry = e; return 0; } int indx_find_sort(struct ntfs_index *indx, struct ntfs_inode *ni, const struct INDEX_ROOT *root, struct NTFS_DE **entry, struct ntfs_fnd *fnd) { int err; struct indx_node *n = NULL; struct NTFS_DE *e; size_t iter = 0; int level = fnd->level; if (!*entry) { /* Start find. 
*/ e = hdr_first_de(&root->ihdr); if (!e) return 0; fnd_clear(fnd); fnd->root_de = e; } else if (!level) { if (de_is_last(fnd->root_de)) { *entry = NULL; return 0; } e = hdr_next_de(&root->ihdr, fnd->root_de); if (!e) return -EINVAL; fnd->root_de = e; } else { n = fnd->nodes[level - 1]; e = fnd->de[level - 1]; if (de_is_last(e)) goto pop_level; e = hdr_next_de(&n->index->ihdr, e); if (!e) return -EINVAL; fnd->de[level - 1] = e; } /* Just to avoid tree cycle. */ next_iter: if (iter++ >= 1000) return -EINVAL; while (de_has_vcn_ex(e)) { if (le16_to_cpu(e->size) < sizeof(struct NTFS_DE) + sizeof(u64)) { if (n) { fnd_pop(fnd); kfree(n); } return -EINVAL; } /* Read next level. */ err = indx_read(indx, ni, de_get_vbn(e), &n); if (err) return err; /* Try next level. */ e = hdr_first_de(&n->index->ihdr); if (!e) { kfree(n); return -EINVAL; } fnd_push(fnd, n, e); } if (le16_to_cpu(e->size) > sizeof(struct NTFS_DE)) { *entry = e; return 0; } pop_level: for (;;) { if (!de_is_last(e)) goto next_iter; /* Pop one level. */ if (n) { fnd_pop(fnd); kfree(n); } level = fnd->level; if (level) { n = fnd->nodes[level - 1]; e = fnd->de[level - 1]; } else if (fnd->root_de) { n = NULL; e = fnd->root_de; fnd->root_de = NULL; } else { *entry = NULL; return 0; } if (le16_to_cpu(e->size) > sizeof(struct NTFS_DE)) { *entry = e; if (!fnd->root_de) fnd->root_de = e; return 0; } } } int indx_find_raw(struct ntfs_index *indx, struct ntfs_inode *ni, const struct INDEX_ROOT *root, struct NTFS_DE **entry, size_t *off, struct ntfs_fnd *fnd) { int err; struct indx_node *n = NULL; struct NTFS_DE *e = NULL; struct NTFS_DE *e2; size_t bit; CLST next_used_vbn; CLST next_vbn; u32 record_size = ni->mi.sbi->record_size; /* Use non sorted algorithm. */ if (!*entry) { /* This is the first call. */ e = hdr_first_de(&root->ihdr); if (!e) return 0; fnd_clear(fnd); fnd->root_de = e; /* The first call with setup of initial element. */ if (*off >= record_size) { next_vbn = (((*off - record_size) >> indx->index_bits)) << indx->idx2vbn_bits; /* Jump inside cycle 'for'. */ goto next; } /* Start enumeration from root. */ *off = 0; } else if (!fnd->root_de) return -EINVAL; for (;;) { /* Check if current entry can be used. */ if (e && le16_to_cpu(e->size) > sizeof(struct NTFS_DE)) goto ok; if (!fnd->level) { /* Continue to enumerate root. */ if (!de_is_last(fnd->root_de)) { e = hdr_next_de(&root->ihdr, fnd->root_de); if (!e) return -EINVAL; fnd->root_de = e; continue; } /* Start to enumerate indexes from 0. */ next_vbn = 0; } else { /* Continue to enumerate indexes. */ e2 = fnd->de[fnd->level - 1]; n = fnd->nodes[fnd->level - 1]; if (!de_is_last(e2)) { e = hdr_next_de(&n->index->ihdr, e2); if (!e) return -EINVAL; fnd->de[fnd->level - 1] = e; continue; } /* Continue with next index. */ next_vbn = le64_to_cpu(n->index->vbn) + root->index_block_clst; } next: /* Release current index. */ if (n) { fnd_pop(fnd); put_indx_node(n); n = NULL; } /* Skip all free indexes. */ bit = next_vbn >> indx->idx2vbn_bits; err = indx_used_bit(indx, ni, &bit); if (err == -ENOENT || bit == MINUS_ONE_T) { /* No used indexes. */ *entry = NULL; return 0; } next_used_vbn = bit << indx->idx2vbn_bits; /* Read buffer into memory. */ err = indx_read(indx, ni, next_used_vbn, &n); if (err) return err; e = hdr_first_de(&n->index->ihdr); fnd_push(fnd, n, e); if (!e) return -EINVAL; } ok: /* Return offset to restore enumerator if necessary. 
*/ if (!n) { /* 'e' points in root, */ *off = PtrOffset(&root->ihdr, e); } else { /* 'e' points in index, */ *off = (le64_to_cpu(n->index->vbn) << indx->vbn2vbo_bits) + record_size + PtrOffset(&n->index->ihdr, e); } *entry = e; return 0; } /* * indx_create_allocate - Create "Allocation + Bitmap" attributes. */ static int indx_create_allocate(struct ntfs_index *indx, struct ntfs_inode *ni, CLST *vbn) { int err; struct ntfs_sb_info *sbi = ni->mi.sbi; struct ATTRIB *bitmap; struct ATTRIB *alloc; u32 data_size = 1u << indx->index_bits; u32 alloc_size = ntfs_up_cluster(sbi, data_size); CLST len = alloc_size >> sbi->cluster_bits; const struct INDEX_NAMES *in = &s_index_names[indx->type]; CLST alen; struct runs_tree run; run_init(&run); err = attr_allocate_clusters(sbi, &run, 0, 0, len, NULL, ALLOCATE_DEF, &alen, 0, NULL, NULL); if (err) goto out; err = ni_insert_nonresident(ni, ATTR_ALLOC, in->name, in->name_len, &run, 0, len, 0, &alloc, NULL, NULL); if (err) goto out1; alloc->nres.valid_size = alloc->nres.data_size = cpu_to_le64(data_size); err = ni_insert_resident(ni, ntfs3_bitmap_size(1), ATTR_BITMAP, in->name, in->name_len, &bitmap, NULL, NULL); if (err) goto out2; if (in->name == I30_NAME) { i_size_write(&ni->vfs_inode, data_size); inode_set_bytes(&ni->vfs_inode, alloc_size); } memcpy(&indx->alloc_run, &run, sizeof(run)); *vbn = 0; return 0; out2: mi_remove_attr(NULL, &ni->mi, alloc); out1: run_deallocate(sbi, &run, false); out: return err; } /* * indx_add_allocate - Add clusters to index. */ static int indx_add_allocate(struct ntfs_index *indx, struct ntfs_inode *ni, CLST *vbn) { int err; size_t bit; u64 data_size; u64 bmp_size, bmp_size_v; struct ATTRIB *bmp, *alloc; struct mft_inode *mi; const struct INDEX_NAMES *in = &s_index_names[indx->type]; err = indx_find_free(indx, ni, &bit, &bmp); if (err) goto out1; if (bit != MINUS_ONE_T) { bmp = NULL; } else { if (bmp->non_res) { bmp_size = le64_to_cpu(bmp->nres.data_size); bmp_size_v = le64_to_cpu(bmp->nres.valid_size); } else { bmp_size = bmp_size_v = le32_to_cpu(bmp->res.data_size); } bit = bmp_size << 3; } data_size = (u64)(bit + 1) << indx->index_bits; if (bmp) { /* Increase bitmap. */ err = attr_set_size(ni, ATTR_BITMAP, in->name, in->name_len, &indx->bitmap_run, ntfs3_bitmap_size(bit + 1), NULL, true, NULL); if (err) goto out1; } alloc = ni_find_attr(ni, NULL, NULL, ATTR_ALLOC, in->name, in->name_len, NULL, &mi); if (!alloc) { err = -EINVAL; if (bmp) goto out2; goto out1; } if (data_size <= le64_to_cpu(alloc->nres.data_size)) { /* Reuse index. */ goto out; } /* Increase allocation. */ err = attr_set_size(ni, ATTR_ALLOC, in->name, in->name_len, &indx->alloc_run, data_size, &data_size, true, NULL); if (err) { if (bmp) goto out2; goto out1; } if (in->name == I30_NAME) i_size_write(&ni->vfs_inode, data_size); out: *vbn = bit << indx->idx2vbn_bits; return 0; out2: /* Ops. No space? */ attr_set_size(ni, ATTR_BITMAP, in->name, in->name_len, &indx->bitmap_run, bmp_size, &bmp_size_v, false, NULL); out1: return err; } /* * indx_insert_into_root - Attempt to insert an entry into the index root. * * @undo - True if we undoing previous remove. * If necessary, it will twiddle the index b-tree. 
*/ static int indx_insert_into_root(struct ntfs_index *indx, struct ntfs_inode *ni, const struct NTFS_DE *new_de, struct NTFS_DE *root_de, const void *ctx, struct ntfs_fnd *fnd, bool undo) { int err = 0; struct NTFS_DE *e, *e0, *re; struct mft_inode *mi; struct ATTRIB *attr; struct INDEX_HDR *hdr; struct indx_node *n; CLST new_vbn; __le64 *sub_vbn, t_vbn; u16 new_de_size; u32 hdr_used, hdr_total, asize, to_move; u32 root_size, new_root_size; struct ntfs_sb_info *sbi; int ds_root; struct INDEX_ROOT *root, *a_root; /* Get the record this root placed in. */ root = indx_get_root(indx, ni, &attr, &mi); if (!root) return -EINVAL; /* * Try easy case: * hdr_insert_de will succeed if there's * room the root for the new entry. */ hdr = &root->ihdr; sbi = ni->mi.sbi; new_de_size = le16_to_cpu(new_de->size); hdr_used = le32_to_cpu(hdr->used); hdr_total = le32_to_cpu(hdr->total); asize = le32_to_cpu(attr->size); root_size = le32_to_cpu(attr->res.data_size); ds_root = new_de_size + hdr_used - hdr_total; /* If 'undo' is set then reduce requirements. */ if ((undo || asize + ds_root < sbi->max_bytes_per_attr) && mi_resize_attr(mi, attr, ds_root)) { hdr->total = cpu_to_le32(hdr_total + ds_root); e = hdr_insert_de(indx, hdr, new_de, root_de, ctx); WARN_ON(!e); fnd_clear(fnd); fnd->root_de = e; return 0; } /* Make a copy of root attribute to restore if error. */ a_root = kmemdup(attr, asize, GFP_NOFS); if (!a_root) return -ENOMEM; /* * Copy all the non-end entries from * the index root to the new buffer. */ to_move = 0; e0 = hdr_first_de(hdr); /* Calculate the size to copy. */ for (e = e0;; e = hdr_next_de(hdr, e)) { if (!e) { err = -EINVAL; goto out_free_root; } if (de_is_last(e)) break; to_move += le16_to_cpu(e->size); } if (!to_move) { re = NULL; } else { re = kmemdup(e0, to_move, GFP_NOFS); if (!re) { err = -ENOMEM; goto out_free_root; } } sub_vbn = NULL; if (de_has_vcn(e)) { t_vbn = de_get_vbn_le(e); sub_vbn = &t_vbn; } new_root_size = sizeof(struct INDEX_ROOT) + sizeof(struct NTFS_DE) + sizeof(u64); ds_root = new_root_size - root_size; if (ds_root > 0 && asize + ds_root > sbi->max_bytes_per_attr) { /* Make root external. */ err = -EOPNOTSUPP; goto out_free_re; } if (ds_root) mi_resize_attr(mi, attr, ds_root); /* Fill first entry (vcn will be set later). */ e = (struct NTFS_DE *)(root + 1); memset(e, 0, sizeof(struct NTFS_DE)); e->size = cpu_to_le16(sizeof(struct NTFS_DE) + sizeof(u64)); e->flags = NTFS_IE_HAS_SUBNODES | NTFS_IE_LAST; hdr->flags = NTFS_INDEX_HDR_HAS_SUBNODES; hdr->used = hdr->total = cpu_to_le32(new_root_size - offsetof(struct INDEX_ROOT, ihdr)); fnd->root_de = hdr_first_de(hdr); mi->dirty = true; /* Create alloc and bitmap attributes (if not). */ err = run_is_empty(&indx->alloc_run) ? indx_create_allocate(indx, ni, &new_vbn) : indx_add_allocate(indx, ni, &new_vbn); /* Layout of record may be changed, so rescan root. */ root = indx_get_root(indx, ni, &attr, &mi); if (!root) { /* Bug? */ ntfs_set_state(sbi, NTFS_DIRTY_ERROR); err = -EINVAL; goto out_free_re; } if (err) { /* Restore root. */ if (mi_resize_attr(mi, attr, -ds_root)) { memcpy(attr, a_root, asize); } else { /* Bug? */ ntfs_set_state(sbi, NTFS_DIRTY_ERROR); } goto out_free_re; } e = (struct NTFS_DE *)(root + 1); *(__le64 *)(e + 1) = cpu_to_le64(new_vbn); mi->dirty = true; /* Now we can create/format the new buffer and copy the entries into. 
*/ n = indx_new(indx, ni, new_vbn, sub_vbn); if (IS_ERR(n)) { err = PTR_ERR(n); goto out_free_re; } hdr = &n->index->ihdr; hdr_used = le32_to_cpu(hdr->used); hdr_total = le32_to_cpu(hdr->total); /* Copy root entries into new buffer. */ hdr_insert_head(hdr, re, to_move); /* Update bitmap attribute. */ indx_mark_used(indx, ni, new_vbn >> indx->idx2vbn_bits); /* Check if we can insert new entry new index buffer. */ if (hdr_used + new_de_size > hdr_total) { /* * This occurs if MFT record is the same or bigger than index * buffer. Move all root new index and have no space to add * new entry classic case when MFT record is 1K and index * buffer 4K the problem should not occurs. */ kfree(re); indx_write(indx, ni, n, 0); put_indx_node(n); fnd_clear(fnd); err = indx_insert_entry(indx, ni, new_de, ctx, fnd, undo); goto out_free_root; } /* * Now root is a parent for new index buffer. * Insert NewEntry a new buffer. */ e = hdr_insert_de(indx, hdr, new_de, NULL, ctx); if (!e) { err = -EINVAL; goto out_put_n; } fnd_push(fnd, n, e); /* Just write updates index into disk. */ indx_write(indx, ni, n, 0); n = NULL; out_put_n: put_indx_node(n); out_free_re: kfree(re); out_free_root: kfree(a_root); return err; } /* * indx_insert_into_buffer * * Attempt to insert an entry into an Index Allocation Buffer. * If necessary, it will split the buffer. */ static int indx_insert_into_buffer(struct ntfs_index *indx, struct ntfs_inode *ni, struct INDEX_ROOT *root, const struct NTFS_DE *new_de, const void *ctx, int level, struct ntfs_fnd *fnd) { int err; const struct NTFS_DE *sp; struct NTFS_DE *e, *de_t, *up_e; struct indx_node *n2; struct indx_node *n1 = fnd->nodes[level]; struct INDEX_HDR *hdr1 = &n1->index->ihdr; struct INDEX_HDR *hdr2; u32 to_copy, used, used1; CLST new_vbn; __le64 t_vbn, *sub_vbn; u16 sp_size; void *hdr1_saved = NULL; /* Try the most easy case. */ e = fnd->level - 1 == level ? fnd->de[level] : NULL; e = hdr_insert_de(indx, hdr1, new_de, e, ctx); fnd->de[level] = e; if (e) { /* Just write updated index into disk. */ indx_write(indx, ni, n1, 0); return 0; } /* * No space to insert into buffer. Split it. * To split we: * - Save split point ('cause index buffers will be changed) * - Allocate NewBuffer and copy all entries <= sp into new buffer * - Remove all entries (sp including) from TargetBuffer * - Insert NewEntry into left or right buffer (depending on sp <=> * NewEntry) * - Insert sp into parent buffer (or root) * - Make sp a parent for new buffer */ sp = hdr_find_split(hdr1); if (!sp) return -EINVAL; sp_size = le16_to_cpu(sp->size); up_e = kmalloc(sp_size + sizeof(u64), GFP_NOFS); if (!up_e) return -ENOMEM; memcpy(up_e, sp, sp_size); used1 = le32_to_cpu(hdr1->used); hdr1_saved = kmemdup(hdr1, used1, GFP_NOFS); if (!hdr1_saved) { err = -ENOMEM; goto out; } if (!hdr1->flags) { up_e->flags |= NTFS_IE_HAS_SUBNODES; up_e->size = cpu_to_le16(sp_size + sizeof(u64)); sub_vbn = NULL; } else { t_vbn = de_get_vbn_le(up_e); sub_vbn = &t_vbn; } /* Allocate on disk a new index allocation buffer. */ err = indx_add_allocate(indx, ni, &new_vbn); if (err) goto out; /* Allocate and format memory a new index buffer. */ n2 = indx_new(indx, ni, new_vbn, sub_vbn); if (IS_ERR(n2)) { err = PTR_ERR(n2); goto out; } hdr2 = &n2->index->ihdr; /* Make sp a parent for new buffer. */ de_set_vbn(up_e, new_vbn); /* Copy all the entries <= sp into the new buffer. */ de_t = hdr_first_de(hdr1); to_copy = PtrOffset(de_t, sp); hdr_insert_head(hdr2, de_t, to_copy); /* Remove all entries (sp including) from hdr1. 
*/ used = used1 - to_copy - sp_size; memmove(de_t, Add2Ptr(sp, sp_size), used - le32_to_cpu(hdr1->de_off)); hdr1->used = cpu_to_le32(used); /* * Insert new entry into left or right buffer * (depending on sp <=> new_de). */ hdr_insert_de(indx, (*indx->cmp)(new_de + 1, le16_to_cpu(new_de->key_size), up_e + 1, le16_to_cpu(up_e->key_size), ctx) < 0 ? hdr2 : hdr1, new_de, NULL, ctx); indx_mark_used(indx, ni, new_vbn >> indx->idx2vbn_bits); indx_write(indx, ni, n1, 0); indx_write(indx, ni, n2, 0); put_indx_node(n2); /* * We've finished splitting everybody, so we are ready to * insert the promoted entry into the parent. */ if (!level) { /* Insert in root. */ err = indx_insert_into_root(indx, ni, up_e, NULL, ctx, fnd, 0); } else { /* * The target buffer's parent is another index buffer. * TODO: Remove recursion. */ err = indx_insert_into_buffer(indx, ni, root, up_e, ctx, level - 1, fnd); } if (err) { /* * Undo critical operations. */ indx_mark_free(indx, ni, new_vbn >> indx->idx2vbn_bits); memcpy(hdr1, hdr1_saved, used1); indx_write(indx, ni, n1, 0); } out: kfree(up_e); kfree(hdr1_saved); return err; } /* * indx_insert_entry - Insert new entry into index. * * @undo - True if we undoing previous remove. */ int indx_insert_entry(struct ntfs_index *indx, struct ntfs_inode *ni, const struct NTFS_DE *new_de, const void *ctx, struct ntfs_fnd *fnd, bool undo) { int err; int diff; struct NTFS_DE *e; struct ntfs_fnd *fnd_a = NULL; struct INDEX_ROOT *root; if (!fnd) { fnd_a = fnd_get(); if (!fnd_a) { err = -ENOMEM; goto out1; } fnd = fnd_a; } root = indx_get_root(indx, ni, NULL, NULL); if (!root) { err = -EINVAL; goto out; } if (fnd_is_empty(fnd)) { /* * Find the spot the tree where we want to * insert the new entry. */ err = indx_find(indx, ni, root, new_de + 1, le16_to_cpu(new_de->key_size), ctx, &diff, &e, fnd); if (err) goto out; if (!diff) { err = -EEXIST; goto out; } } if (!fnd->level) { /* * The root is also a leaf, so we'll insert the * new entry into it. */ err = indx_insert_into_root(indx, ni, new_de, fnd->root_de, ctx, fnd, undo); } else { /* * Found a leaf buffer, so we'll insert the new entry into it. */ err = indx_insert_into_buffer(indx, ni, root, new_de, ctx, fnd->level - 1, fnd); } out: fnd_put(fnd_a); out1: return err; } /* * indx_find_buffer - Locate a buffer from the tree. */ static struct indx_node *indx_find_buffer(struct ntfs_index *indx, struct ntfs_inode *ni, const struct INDEX_ROOT *root, __le64 vbn, struct indx_node *n) { int err; const struct NTFS_DE *e; struct indx_node *r; const struct INDEX_HDR *hdr = n ? &n->index->ihdr : &root->ihdr; /* Step 1: Scan one level. */ for (e = hdr_first_de(hdr);; e = hdr_next_de(hdr, e)) { if (!e) return ERR_PTR(-EINVAL); if (de_has_vcn(e) && vbn == de_get_vbn_le(e)) return n; if (de_is_last(e)) break; } /* Step2: Do recursion. */ e = Add2Ptr(hdr, le32_to_cpu(hdr->de_off)); for (;;) { if (de_has_vcn_ex(e)) { err = indx_read(indx, ni, de_get_vbn(e), &n); if (err) return ERR_PTR(err); r = indx_find_buffer(indx, ni, root, vbn, n); if (r) return r; } if (de_is_last(e)) break; e = Add2Ptr(e, le16_to_cpu(e->size)); } return NULL; } /* * indx_shrink - Deallocate unused tail indexes. 
*/ static int indx_shrink(struct ntfs_index *indx, struct ntfs_inode *ni, size_t bit) { int err = 0; u64 bpb, new_data; size_t nbits; struct ATTRIB *b; struct ATTR_LIST_ENTRY *le = NULL; const struct INDEX_NAMES *in = &s_index_names[indx->type]; b = ni_find_attr(ni, NULL, &le, ATTR_BITMAP, in->name, in->name_len, NULL, NULL); if (!b) return -ENOENT; if (!b->non_res) { unsigned long pos; const unsigned long *bm = resident_data(b); nbits = (size_t)le32_to_cpu(b->res.data_size) * 8; if (bit >= nbits) return 0; pos = find_next_bit_le(bm, nbits, bit); if (pos < nbits) return 0; } else { size_t used = MINUS_ONE_T; nbits = le64_to_cpu(b->nres.data_size) * 8; if (bit >= nbits) return 0; err = scan_nres_bitmap(ni, b, indx, bit, &scan_for_used, &used); if (err) return err; if (used != MINUS_ONE_T) return 0; } new_data = (u64)bit << indx->index_bits; err = attr_set_size(ni, ATTR_ALLOC, in->name, in->name_len, &indx->alloc_run, new_data, &new_data, false, NULL); if (err) return err; if (in->name == I30_NAME) i_size_write(&ni->vfs_inode, new_data); bpb = ntfs3_bitmap_size(bit); if (bpb * 8 == nbits) return 0; err = attr_set_size(ni, ATTR_BITMAP, in->name, in->name_len, &indx->bitmap_run, bpb, &bpb, false, NULL); return err; } static int indx_free_children(struct ntfs_index *indx, struct ntfs_inode *ni, const struct NTFS_DE *e, bool trim) { int err; struct indx_node *n = NULL; struct INDEX_HDR *hdr; CLST vbn = de_get_vbn(e); size_t i; err = indx_read(indx, ni, vbn, &n); if (err) return err; hdr = &n->index->ihdr; /* First, recurse into the children, if any. */ if (hdr_has_subnode(hdr)) { for (e = hdr_first_de(hdr); e; e = hdr_next_de(hdr, e)) { indx_free_children(indx, ni, e, false); if (de_is_last(e)) break; } } put_indx_node(n); i = vbn >> indx->idx2vbn_bits; /* * We've gotten rid of the children; add this buffer to the free list. */ indx_mark_free(indx, ni, i); if (!trim) return 0; /* * If there are no used indexes after current free index * then we can truncate allocation and bitmap. * Use bitmap to estimate the case. */ indx_shrink(indx, ni, i + 1); return 0; } /* * indx_get_entry_to_replace * * Find a replacement entry for a deleted entry. * Always returns a node entry: * NTFS_IE_HAS_SUBNODES is set the flags and the size includes the sub_vcn. */ static int indx_get_entry_to_replace(struct ntfs_index *indx, struct ntfs_inode *ni, const struct NTFS_DE *de_next, struct NTFS_DE **de_to_replace, struct ntfs_fnd *fnd) { int err; int level = -1; CLST vbn; struct NTFS_DE *e, *te, *re; struct indx_node *n; struct INDEX_BUFFER *ib; *de_to_replace = NULL; /* Find first leaf entry down from de_next. */ vbn = de_get_vbn(de_next); for (;;) { n = NULL; err = indx_read(indx, ni, vbn, &n); if (err) goto out; e = hdr_first_de(&n->index->ihdr); fnd_push(fnd, n, e); if (!e) { err = -EINVAL; goto out; } if (!de_is_last(e)) { /* * This buffer is non-empty, so its first entry * could be used as the replacement entry. */ level = fnd->level - 1; } if (!de_has_vcn(e)) break; /* This buffer is a node. Continue to go down. */ vbn = de_get_vbn(e); } if (level == -1) goto out; n = fnd->nodes[level]; te = hdr_first_de(&n->index->ihdr); if (!te) { err = -EINVAL; goto out; } /* Copy the candidate entry into the replacement entry buffer. */ re = kmalloc(le16_to_cpu(te->size) + sizeof(u64), GFP_NOFS); if (!re) { err = -ENOMEM; goto out; } *de_to_replace = re; memcpy(re, te, le16_to_cpu(te->size)); if (!de_has_vcn(re)) { /* * The replacement entry we found doesn't have a sub_vcn. * increase its size to hold one. 
*/ le16_add_cpu(&re->size, sizeof(u64)); re->flags |= NTFS_IE_HAS_SUBNODES; } else { /* * The replacement entry we found was a node entry, which * means that all its child buffers are empty. Return them * to the free pool. */ indx_free_children(indx, ni, te, true); } /* * Expunge the replacement entry from its former location, * and then write that buffer. */ ib = n->index; e = hdr_delete_de(&ib->ihdr, te); fnd->de[level] = e; indx_write(indx, ni, n, 0); if (ib_is_leaf(ib) && ib_is_empty(ib)) { /* An empty leaf. */ return 0; } out: fnd_clear(fnd); return err; } /* * indx_delete_entry - Delete an entry from the index. */ int indx_delete_entry(struct ntfs_index *indx, struct ntfs_inode *ni, const void *key, u32 key_len, const void *ctx) { int err, diff; struct INDEX_ROOT *root; struct INDEX_HDR *hdr; struct ntfs_fnd *fnd, *fnd2; struct INDEX_BUFFER *ib; struct NTFS_DE *e, *re, *next, *prev, *me; struct indx_node *n, *n2d = NULL; __le64 sub_vbn; int level, level2; struct ATTRIB *attr; struct mft_inode *mi; u32 e_size, root_size, new_root_size; size_t trim_bit; const struct INDEX_NAMES *in; fnd = fnd_get(); if (!fnd) { err = -ENOMEM; goto out2; } fnd2 = fnd_get(); if (!fnd2) { err = -ENOMEM; goto out1; } root = indx_get_root(indx, ni, &attr, &mi); if (!root) { err = -EINVAL; goto out; } /* Locate the entry to remove. */ err = indx_find(indx, ni, root, key, key_len, ctx, &diff, &e, fnd); if (err) goto out; if (!e || diff) { err = -ENOENT; goto out; } level = fnd->level; if (level) { n = fnd->nodes[level - 1]; e = fnd->de[level - 1]; ib = n->index; hdr = &ib->ihdr; } else { hdr = &root->ihdr; e = fnd->root_de; n = NULL; } e_size = le16_to_cpu(e->size); if (!de_has_vcn_ex(e)) { /* The entry to delete is a leaf, so we can just rip it out. */ hdr_delete_de(hdr, e); if (!level) { hdr->total = hdr->used; /* Shrink resident root attribute. */ mi_resize_attr(mi, attr, 0 - e_size); goto out; } indx_write(indx, ni, n, 0); /* * Check to see if removing that entry made * the leaf empty. */ if (ib_is_leaf(ib) && ib_is_empty(ib)) { fnd_pop(fnd); fnd_push(fnd2, n, e); } } else { /* * The entry we wish to delete is a node buffer, so we * have to find a replacement for it. */ next = de_get_next(e); err = indx_get_entry_to_replace(indx, ni, next, &re, fnd2); if (err) goto out; if (re) { de_set_vbn_le(re, de_get_vbn_le(e)); hdr_delete_de(hdr, e); err = level ? indx_insert_into_buffer(indx, ni, root, re, ctx, fnd->level - 1, fnd) : indx_insert_into_root(indx, ni, re, e, ctx, fnd, 0); kfree(re); if (err) goto out; } else { /* * There is no replacement for the current entry. * This means that the subtree rooted at its node * is empty, and can be deleted, which turn means * that the node can just inherit the deleted * entry sub_vcn. */ indx_free_children(indx, ni, next, true); de_set_vbn_le(next, de_get_vbn_le(e)); hdr_delete_de(hdr, e); if (level) { indx_write(indx, ni, n, 0); } else { hdr->total = hdr->used; /* Shrink resident root attribute. */ mi_resize_attr(mi, attr, 0 - e_size); } } } /* Delete a branch of tree. */ if (!fnd2 || !fnd2->level) goto out; /* Reinit root 'cause it can be changed. */ root = indx_get_root(indx, ni, &attr, &mi); if (!root) { err = -EINVAL; goto out; } n2d = NULL; sub_vbn = fnd2->nodes[0]->index->vbn; level2 = 0; level = fnd->level; hdr = level ? &fnd->nodes[level - 1]->index->ihdr : &root->ihdr; /* Scan current level. 
*/ for (e = hdr_first_de(hdr);; e = hdr_next_de(hdr, e)) { if (!e) { err = -EINVAL; goto out; } if (de_has_vcn(e) && sub_vbn == de_get_vbn_le(e)) break; if (de_is_last(e)) { e = NULL; break; } } if (!e) { /* Do slow search from root. */ struct indx_node *in; fnd_clear(fnd); in = indx_find_buffer(indx, ni, root, sub_vbn, NULL); if (IS_ERR(in)) { err = PTR_ERR(in); goto out; } if (in) fnd_push(fnd, in, NULL); } /* Merge fnd2 -> fnd. */ for (level = 0; level < fnd2->level; level++) { fnd_push(fnd, fnd2->nodes[level], fnd2->de[level]); fnd2->nodes[level] = NULL; } fnd2->level = 0; hdr = NULL; for (level = fnd->level; level; level--) { struct indx_node *in = fnd->nodes[level - 1]; ib = in->index; if (ib_is_empty(ib)) { sub_vbn = ib->vbn; } else { hdr = &ib->ihdr; n2d = in; level2 = level; break; } } if (!hdr) hdr = &root->ihdr; e = hdr_first_de(hdr); if (!e) { err = -EINVAL; goto out; } if (hdr != &root->ihdr || !de_is_last(e)) { prev = NULL; while (!de_is_last(e)) { if (de_has_vcn(e) && sub_vbn == de_get_vbn_le(e)) break; prev = e; e = hdr_next_de(hdr, e); if (!e) { err = -EINVAL; goto out; } } if (sub_vbn != de_get_vbn_le(e)) { /* * Didn't find the parent entry, although this buffer * is the parent trail. Something is corrupt. */ err = -EINVAL; goto out; } if (de_is_last(e)) { /* * Since we can't remove the end entry, we'll remove * its predecessor instead. This means we have to * transfer the predecessor's sub_vcn to the end entry. * Note: This index block is not empty, so the * predecessor must exist. */ if (!prev) { err = -EINVAL; goto out; } if (de_has_vcn(prev)) { de_set_vbn_le(e, de_get_vbn_le(prev)); } else if (de_has_vcn(e)) { le16_sub_cpu(&e->size, sizeof(u64)); e->flags &= ~NTFS_IE_HAS_SUBNODES; le32_sub_cpu(&hdr->used, sizeof(u64)); } e = prev; } /* * Copy the current entry into a temporary buffer (stripping * off its down-pointer, if any) and delete it from the current * buffer or root, as appropriate. */ e_size = le16_to_cpu(e->size); me = kmemdup(e, e_size, GFP_NOFS); if (!me) { err = -ENOMEM; goto out; } if (de_has_vcn(me)) { me->flags &= ~NTFS_IE_HAS_SUBNODES; le16_sub_cpu(&me->size, sizeof(u64)); } hdr_delete_de(hdr, e); if (hdr == &root->ihdr) { level = 0; hdr->total = hdr->used; /* Shrink resident root attribute. */ mi_resize_attr(mi, attr, 0 - e_size); } else { indx_write(indx, ni, n2d, 0); level = level2; } /* Mark unused buffers as free. */ trim_bit = -1; for (; level < fnd->level; level++) { ib = fnd->nodes[level]->index; if (ib_is_empty(ib)) { size_t k = le64_to_cpu(ib->vbn) >> indx->idx2vbn_bits; indx_mark_free(indx, ni, k); if (k < trim_bit) trim_bit = k; } } fnd_clear(fnd); /*fnd->root_de = NULL;*/ /* * Re-insert the entry into the tree. * Find the spot the tree where we want to insert the new entry. */ err = indx_insert_entry(indx, ni, me, ctx, fnd, 0); kfree(me); if (err) goto out; if (trim_bit != -1) indx_shrink(indx, ni, trim_bit); } else { /* * This tree needs to be collapsed down to an empty root. * Recreate the index root as an empty leaf and free all * the bits the index allocation bitmap. 
*/ fnd_clear(fnd); fnd_clear(fnd2); in = &s_index_names[indx->type]; err = attr_set_size(ni, ATTR_ALLOC, in->name, in->name_len, &indx->alloc_run, 0, NULL, false, NULL); if (in->name == I30_NAME) i_size_write(&ni->vfs_inode, 0); err = ni_remove_attr(ni, ATTR_ALLOC, in->name, in->name_len, false, NULL); run_close(&indx->alloc_run); err = attr_set_size(ni, ATTR_BITMAP, in->name, in->name_len, &indx->bitmap_run, 0, NULL, false, NULL); err = ni_remove_attr(ni, ATTR_BITMAP, in->name, in->name_len, false, NULL); run_close(&indx->bitmap_run); root = indx_get_root(indx, ni, &attr, &mi); if (!root) { err = -EINVAL; goto out; } root_size = le32_to_cpu(attr->res.data_size); new_root_size = sizeof(struct INDEX_ROOT) + sizeof(struct NTFS_DE); if (new_root_size != root_size && !mi_resize_attr(mi, attr, new_root_size - root_size)) { err = -EINVAL; goto out; } /* Fill first entry. */ e = (struct NTFS_DE *)(root + 1); e->ref.low = 0; e->ref.high = 0; e->ref.seq = 0; e->size = cpu_to_le16(sizeof(struct NTFS_DE)); e->flags = NTFS_IE_LAST; // 0x02 e->key_size = 0; e->res = 0; hdr = &root->ihdr; hdr->flags = 0; hdr->used = hdr->total = cpu_to_le32( new_root_size - offsetof(struct INDEX_ROOT, ihdr)); mi->dirty = true; } out: fnd_put(fnd2); out1: fnd_put(fnd); out2: return err; } /* * Update duplicated information in directory entry * 'dup' - info from MFT record */ int indx_update_dup(struct ntfs_inode *ni, struct ntfs_sb_info *sbi, const struct ATTR_FILE_NAME *fname, const struct NTFS_DUP_INFO *dup, int sync) { int err, diff; struct NTFS_DE *e = NULL; struct ATTR_FILE_NAME *e_fname; struct ntfs_fnd *fnd; struct INDEX_ROOT *root; struct mft_inode *mi; struct ntfs_index *indx = &ni->dir; fnd = fnd_get(); if (!fnd) return -ENOMEM; root = indx_get_root(indx, ni, NULL, &mi); if (!root) { err = -EINVAL; goto out; } /* Find entry in directory. */ err = indx_find(indx, ni, root, fname, fname_full_size(fname), sbi, &diff, &e, fnd); if (err) goto out; if (!e) { err = -EINVAL; goto out; } if (diff) { err = -EINVAL; goto out; } e_fname = (struct ATTR_FILE_NAME *)(e + 1); if (!memcmp(&e_fname->dup, dup, sizeof(*dup))) { /* * Nothing to update in index! Try to avoid this call. */ goto out; } memcpy(&e_fname->dup, dup, sizeof(*dup)); if (fnd->level) { /* Directory entry in index. */ err = indx_write(indx, ni, fnd->nodes[fnd->level - 1], sync); } else { /* Directory entry in directory MFT record. */ mi->dirty = true; if (sync) err = mi_write(mi, 1); else mark_inode_dirty(&ni->vfs_inode); } out: fnd_put(fnd); return err; }
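/*
 * Editor's note: an illustrative, user-space sketch of the lookup scheme
 * hdr_find_e() implements above; it is not part of fs/ntfs3, and
 * demo_de/demo_find are hypothetical stand-ins for struct NTFS_DE and the
 * real on-disk layout. The point being shown: because index entries are
 * variable-sized, a direct binary search over the byte stream is not
 * possible, so the code first collects entry offsets into a small table and
 * then binary-searches that table with the index's collation function (see
 * get_cmp_func()). The real code also grows the offset table adaptively and
 * handles the END entry; both are omitted here for brevity.
 */
#include <stdint.h>
#include <stdio.h>

struct demo_de {
	uint16_t size;	/* total entry size, like NTFS_DE.size */
	uint32_t key;	/* fixed-size key, as in the $SII index */
};

/* Same contract as cmp_uint(): <0, 0 or >0. */
static int demo_cmp(uint32_t k1, uint32_t k2)
{
	return (k1 < k2) ? -1 : (k1 > k2) ? 1 : 0;
}

/*
 * Find the first entry with key >= @key; *diff is 0 on an exact match and
 * -1 otherwise, mirroring the (entry, diff) pair hdr_find_e() hands back.
 */
static const struct demo_de *demo_find(const uint8_t *hdr, uint32_t used,
				       uint32_t key, int *diff)
{
	uint16_t offs[128];
	int min_idx = 0, max_idx = 0;
	const struct demo_de *found = NULL;
	uint32_t off;

	/* Phase 1: record the offset of every entry in the buffer. */
	for (off = 0; off < used && max_idx < 128;) {
		const struct demo_de *e = (const struct demo_de *)(hdr + off);

		offs[max_idx++] = (uint16_t)off;
		off += e->size;
	}

	/* Phase 2: ordinary binary search over the offset table. */
	while (min_idx < max_idx) {
		int mid = (min_idx + max_idx) / 2;
		const struct demo_de *e =
			(const struct demo_de *)(hdr + offs[mid]);
		int d = demo_cmp(key, e->key);

		if (!d) {
			*diff = 0;
			return e;
		}
		if (d > 0) {
			min_idx = mid + 1;
		} else {
			found = e;
			max_idx = mid;
		}
	}
	*diff = -1;
	return found;	/* NULL if every entry compares below @key */
}

int main(void)
{
	struct demo_de entries[] = {
		{ sizeof(struct demo_de), 10 },
		{ sizeof(struct demo_de), 20 },
		{ sizeof(struct demo_de), 30 },
	};
	int diff;
	const struct demo_de *e =
		demo_find((const uint8_t *)entries, sizeof(entries), 20, &diff);

	printf("key=%u diff=%d\n", e ? (unsigned)e->key : 0u, diff); /* key=20 diff=0 */
	return 0;
}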
// SPDX-License-Identifier: GPL-2.0-only
/* DVB USB framework compliant Linux driver for the Hauppauge WinTV-NOVA-T usb2
 * DVB-T receiver.
 *
 * Copyright (C) 2004-5 Patrick Boettcher (patrick.boettcher@posteo.de)
 *
 * see Documentation/driver-api/media/drivers/dvb-usb.rst for more information
 */
#include "dibusb.h"

static int debug;
module_param(debug, int, 0644);
MODULE_PARM_DESC(debug, "set debugging level (1=rc,2=eeprom (|-able))." DVB_USB_DEBUG_STATUS);

DVB_DEFINE_MOD_OPT_ADAPTER_NR(adapter_nr);

#define deb_rc(args...) dprintk(debug, 0x01, args)
#define deb_ee(args...) dprintk(debug, 0x02, args)

/* Hauppauge NOVA-T USB2 keys */
static struct rc_map_table rc_map_haupp_table[] = {
	{ 0x1e00, KEY_0 },          { 0x1e01, KEY_1 },
	{ 0x1e02, KEY_2 },          { 0x1e03, KEY_3 },
	{ 0x1e04, KEY_4 },          { 0x1e05, KEY_5 },
	{ 0x1e06, KEY_6 },          { 0x1e07, KEY_7 },
	{ 0x1e08, KEY_8 },          { 0x1e09, KEY_9 },
	{ 0x1e0a, KEY_KPASTERISK }, { 0x1e0b, KEY_RED },
	{ 0x1e0c, KEY_RADIO },      { 0x1e0d, KEY_MENU },
	{ 0x1e0e, KEY_GRAVE },      /* # */
	{ 0x1e0f, KEY_MUTE },       { 0x1e10, KEY_VOLUMEUP },
	{ 0x1e11, KEY_VOLUMEDOWN }, { 0x1e12, KEY_CHANNEL },
	{ 0x1e14, KEY_UP },         { 0x1e15, KEY_DOWN },
	{ 0x1e16, KEY_LEFT },       { 0x1e17, KEY_RIGHT },
	{ 0x1e18, KEY_VIDEO },      { 0x1e19, KEY_AUDIO },
	{ 0x1e1a, KEY_IMAGES },     { 0x1e1b, KEY_EPG },
	{ 0x1e1c, KEY_TV },         { 0x1e1e, KEY_NEXT },
	{ 0x1e1f, KEY_BACK },       { 0x1e20, KEY_CHANNELUP },
	{ 0x1e21, KEY_CHANNELDOWN },
	{ 0x1e24, KEY_LAST },       /* Skip backwards */
	{ 0x1e25, KEY_OK },         { 0x1e29, KEY_BLUE },
	{ 0x1e2e, KEY_GREEN },      { 0x1e30, KEY_PAUSE },
	{ 0x1e32, KEY_REWIND },     { 0x1e34, KEY_FASTFORWARD },
	{ 0x1e35, KEY_PLAY },       { 0x1e36, KEY_STOP },
	{ 0x1e37, KEY_RECORD },     { 0x1e38, KEY_YELLOW },
	{ 0x1e3b, KEY_GOTO },       { 0x1e3d, KEY_POWER },
};

/* Firmware bug? Sometimes, when a new key is pressed, the previous pressed key
 * is delivered. No workaround yet, maybe a new firmware.
*/ static int nova_t_rc_query(struct dvb_usb_device *d, u32 *event, int *state) { u8 *buf, data, toggle, custom; u16 raw; int i, ret; struct dibusb_device_state *st = d->priv; buf = kmalloc(5, GFP_KERNEL); if (!buf) return -ENOMEM; buf[0] = DIBUSB_REQ_POLL_REMOTE; buf[1] = 0x35; ret = dvb_usb_generic_rw(d, buf, 2, buf, 5, 0); if (ret < 0) goto ret; *state = REMOTE_NO_KEY_PRESSED; switch (buf[0]) { case DIBUSB_RC_HAUPPAUGE_KEY_PRESSED: raw = ((buf[1] << 8) | buf[2]) >> 3; toggle = !!(raw & 0x800); data = raw & 0x3f; custom = (raw >> 6) & 0x1f; deb_rc("raw key code 0x%02x, 0x%02x, 0x%02x to c: %02x d: %02x toggle: %d\n", buf[1], buf[2], buf[3], custom, data, toggle); for (i = 0; i < ARRAY_SIZE(rc_map_haupp_table); i++) { if (rc5_data(&rc_map_haupp_table[i]) == data && rc5_custom(&rc_map_haupp_table[i]) == custom) { deb_rc("c: %x, d: %x\n", rc5_data(&rc_map_haupp_table[i]), rc5_custom(&rc_map_haupp_table[i])); *event = rc_map_haupp_table[i].keycode; *state = REMOTE_KEY_PRESSED; if (st->old_toggle == toggle) { if (st->last_repeat_count++ < 2) *state = REMOTE_NO_KEY_PRESSED; } else { st->last_repeat_count = 0; st->old_toggle = toggle; } break; } } break; case DIBUSB_RC_HAUPPAUGE_KEY_EMPTY: default: break; } ret: kfree(buf); return ret; } static int nova_t_read_mac_address (struct dvb_usb_device *d, u8 mac[6]) { int i, ret; u8 b; mac[0] = 0x00; mac[1] = 0x0d; mac[2] = 0xfe; /* this is a complete guess, but works for my box */ for (i = 136; i < 139; i++) { ret = dibusb_read_eeprom_byte(d, i, &b); if (ret) return ret; mac[5 - (i - 136)] = b; } return 0; } /* USB Driver stuff */ static struct dvb_usb_device_properties nova_t_properties; static int nova_t_probe(struct usb_interface *intf, const struct usb_device_id *id) { return dvb_usb_device_init(intf, &nova_t_properties, THIS_MODULE, NULL, adapter_nr); } /* do not change the order of the ID table */ enum { HAUPPAUGE_WINTV_NOVA_T_USB2_COLD, HAUPPAUGE_WINTV_NOVA_T_USB2_WARM, }; static const struct usb_device_id nova_t_table[] = { DVB_USB_DEV(HAUPPAUGE, HAUPPAUGE_WINTV_NOVA_T_USB2_COLD), DVB_USB_DEV(HAUPPAUGE, HAUPPAUGE_WINTV_NOVA_T_USB2_WARM), { } }; MODULE_DEVICE_TABLE(usb, nova_t_table); static struct dvb_usb_device_properties nova_t_properties = { .caps = DVB_USB_IS_AN_I2C_ADAPTER, .usb_ctrl = CYPRESS_FX2, .firmware = "dvb-usb-nova-t-usb2-02.fw", .num_adapters = 1, .adapter = { { .num_frontends = 1, .fe = {{ .caps = DVB_USB_ADAP_HAS_PID_FILTER | DVB_USB_ADAP_PID_FILTER_CAN_BE_TURNED_OFF, .pid_filter_count = 32, .streaming_ctrl = dibusb2_0_streaming_ctrl, .pid_filter = dibusb_pid_filter, .pid_filter_ctrl = dibusb_pid_filter_ctrl, .frontend_attach = dibusb_dib3000mc_frontend_attach, .tuner_attach = dibusb_dib3000mc_tuner_attach, /* parameter for the MPEG2-data transfer */ .stream = { .type = USB_BULK, .count = 7, .endpoint = 0x06, .u = { .bulk = { .buffersize = 4096, } } }, }}, .size_of_priv = sizeof(struct dibusb_state), } }, .size_of_priv = sizeof(struct dibusb_device_state), .power_ctrl = dibusb2_0_power_ctrl, .read_mac_address = nova_t_read_mac_address, .rc.legacy = { .rc_interval = 100, .rc_map_table = rc_map_haupp_table, .rc_map_size = ARRAY_SIZE(rc_map_haupp_table), .rc_query = nova_t_rc_query, }, .i2c_algo = &dibusb_i2c_algo, .generic_bulk_ctrl_endpoint = 0x01, .num_device_descs = 1, .devices = { { "Hauppauge WinTV-NOVA-T usb2", { &nova_t_table[HAUPPAUGE_WINTV_NOVA_T_USB2_COLD], NULL }, { &nova_t_table[HAUPPAUGE_WINTV_NOVA_T_USB2_WARM], NULL }, }, { NULL }, } }; static struct usb_driver nova_t_driver = { .name = "dvb_usb_nova_t_usb2", 
.probe = nova_t_probe, .disconnect = dvb_usb_device_exit, .id_table = nova_t_table, }; module_usb_driver(nova_t_driver); MODULE_AUTHOR("Patrick Boettcher <patrick.boettcher@posteo.de>"); MODULE_DESCRIPTION("Hauppauge WinTV-NOVA-T usb2"); MODULE_VERSION("1.0"); MODULE_LICENSE("GPL");
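As a rough standalone illustration of the RC5 unpacking done in nova_t_rc_query() above: the two payload bytes are joined, shifted down by three, and the result carries the toggle bit, the 5-bit custom (system) code and the 6-bit data (command) code. This is a minimal userspace sketch, not kernel code; the sample bytes are invented.

/* Sketch of the RC5 field extraction; sample input bytes are made up. */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint8_t buf1 = 0x8f, buf2 = 0x28;	/* as returned by the firmware */
	uint16_t raw = ((buf1 << 8) | buf2) >> 3;

	uint8_t toggle = !!(raw & 0x800);	/* flips on each new key press */
	uint8_t data   = raw & 0x3f;		/* RC5 command */
	uint8_t custom = (raw >> 6) & 0x1f;	/* RC5 system/address */

	printf("custom=0x%02x data=0x%02x toggle=%d\n", custom, data, toggle);
	return 0;
}

The driver uses the toggle bit exactly as the sketch extracts it: an unchanged toggle across polls means the same physical key press, which is how it suppresses the first two repeats.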
// SPDX-License-Identifier: GPL-2.0
/*
 * jump label x86 support
 *
 * Copyright (C) 2009 Jason Baron <jbaron@redhat.com>
 *
 */
#include <linux/jump_label.h>
#include <linux/memory.h>
#include <linux/uaccess.h>
#include <linux/module.h>
#include <linux/list.h>
#include <linux/jhash.h>
#include <linux/cpu.h>
#include <asm/kprobes.h>
#include <asm/alternative.h>
#include <asm/text-patching.h>
#include <asm/insn.h>

int arch_jump_entry_size(struct jump_entry *entry)
{
	struct insn insn = {};

	insn_decode_kernel(&insn, (void *)jump_entry_code(entry));
	BUG_ON(insn.length != 2 && insn.length != 5);

	return insn.length;
}

struct jump_label_patch {
	const void *code;
	int size;
};

static struct jump_label_patch __jump_label_patch(struct jump_entry *entry,
						  enum jump_label_type type)
{
	const void *expect, *code, *nop;
	const void *addr, *dest;
	int size;

	addr = (void *)jump_entry_code(entry);
	dest = (void *)jump_entry_target(entry);
	size = arch_jump_entry_size(entry);

	switch (size) {
	case JMP8_INSN_SIZE:
		code = text_gen_insn(JMP8_INSN_OPCODE, addr, dest);
		nop = x86_nops[size];
		break;

	case JMP32_INSN_SIZE:
		code = text_gen_insn(JMP32_INSN_OPCODE, addr, dest);
		nop = x86_nops[size];
		break;

	default:
		BUG();
	}

	if (type == JUMP_LABEL_JMP)
		expect = nop;
	else
		expect = code;

	if (memcmp(addr, expect, size)) {
		/*
		 * The location is not an op that we were expecting.
		 * Something went wrong. Crash the box, as something could be
		 * corrupting the kernel.
		 */
		pr_crit("jump_label: Fatal kernel bug, unexpected op at %pS [%p] (%5ph != %5ph) size:%d type:%d\n",
			addr, addr, addr, expect, size, type);
		BUG();
	}

	if (type == JUMP_LABEL_NOP)
		code = nop;

	return (struct jump_label_patch){.code = code, .size = size};
}

static __always_inline void
__jump_label_transform(struct jump_entry *entry,
		       enum jump_label_type type,
		       int init)
{
	const struct jump_label_patch jlp = __jump_label_patch(entry, type);

	/*
	 * As long as only a single processor is running and the code is still
	 * not marked as RO, text_poke_early() can be used; checking that
	 * system_state is SYSTEM_BOOTING guarantees it. It will be set to
	 * SYSTEM_SCHEDULING before other cores are awakened and before the
	 * code is write-protected.
	 *
	 * At the time the change is being done, just ignore whether we
	 * are doing a nop -> jump or jump -> nop transition, and always
	 * assume nop to be the 'currently valid' instruction.
	 */
	if (init || system_state == SYSTEM_BOOTING) {
		text_poke_early((void *)jump_entry_code(entry), jlp.code, jlp.size);
		return;
	}

	smp_text_poke_single((void *)jump_entry_code(entry), jlp.code, jlp.size, NULL);
}

static void __ref jump_label_transform(struct jump_entry *entry,
				       enum jump_label_type type,
				       int init)
{
	mutex_lock(&text_mutex);
	__jump_label_transform(entry, type, init);
	mutex_unlock(&text_mutex);
}

void arch_jump_label_transform(struct jump_entry *entry,
			       enum jump_label_type type)
{
	jump_label_transform(entry, type, 0);
}

bool arch_jump_label_transform_queue(struct jump_entry *entry,
				     enum jump_label_type type)
{
	struct jump_label_patch jlp;

	if (system_state == SYSTEM_BOOTING) {
		/*
		 * Fall back to the non-batching mode.
		 */
		arch_jump_label_transform(entry, type);
		return true;
	}

	mutex_lock(&text_mutex);
	jlp = __jump_label_patch(entry, type);
	smp_text_poke_batch_add((void *)jump_entry_code(entry),
				jlp.code, jlp.size, NULL);
	mutex_unlock(&text_mutex);
	return true;
}

void arch_jump_label_transform_apply(void)
{
	mutex_lock(&text_mutex);
	smp_text_poke_batch_finish();
	mutex_unlock(&text_mutex);
}
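The verify-before-patch step in __jump_label_patch() hinges on being able to regenerate the exact bytes that should already be at the jump site. A minimal userspace sketch of that idea for the 5-byte case follows; it is not the kernel's text_gen_insn(), and the addresses and "in-memory" bytes are invented. On x86, JMP rel32 is opcode 0xe9 followed by a displacement relative to the end of the instruction.

/* Userspace sketch: encode the expected JMP and memcmp it against the
 * bytes "found in memory", mirroring the expect/BUG() check above. */
#include <stdio.h>
#include <stdint.h>
#include <string.h>

static void gen_jmp32(uint8_t insn[5], uint32_t addr, uint32_t dest)
{
	int32_t rel = (int32_t)(dest - (addr + 5));	/* IP-relative */

	insn[0] = 0xe9;					/* JMP rel32 */
	memcpy(&insn[1], &rel, 4);
}

int main(void)
{
	uint8_t site[5] = { 0xe9, 0xfb, 0x00, 0x00, 0x00 };	/* fake text */
	uint8_t expect[5];

	gen_jmp32(expect, 0x1000, 0x1100);	/* jump 0x1000 -> 0x1100 */

	if (memcmp(site, expect, 5))
		printf("unexpected op at site -- the kernel would BUG()\n");
	else
		printf("site matches the expected JMP\n");
	return 0;
}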
// SPDX-License-Identifier: GPL-2.0
/*
 * Provide a default dump_stack() function for architectures
 * which don't implement their own.
 */
#include <linux/kernel.h>
#include <linux/buildid.h>
#include <linux/export.h>
#include <linux/sched.h>
#include <linux/sched/debug.h>
#include <linux/smp.h>
#include <linux/atomic.h>
#include <linux/kexec.h>
#include <linux/utsname.h>
#include <linux/stop_machine.h>

static char dump_stack_arch_desc_str[128];

/**
 * dump_stack_set_arch_desc - set arch-specific str to show with task dumps
 * @fmt: printf-style format string
 * @...: arguments for the format string
 *
 * The configured string will be printed right after utsname during task
 * dumps. Usually used to add arch-specific system identifiers. If an
 * arch wants to make use of such an ID string, it should initialize this
 * as soon as possible during boot.
 */
void __init dump_stack_set_arch_desc(const char *fmt, ...)
{
	va_list args;

	va_start(args, fmt);
	vsnprintf(dump_stack_arch_desc_str, sizeof(dump_stack_arch_desc_str),
		  fmt, args);
	va_end(args);
}

#if IS_ENABLED(CONFIG_STACKTRACE_BUILD_ID)
#define BUILD_ID_FMT " %20phN"
#define BUILD_ID_VAL vmlinux_build_id
#else
#define BUILD_ID_FMT "%s"
#define BUILD_ID_VAL ""
#endif

/**
 * dump_stack_print_info - print generic debug info for dump_stack()
 * @log_lvl: log level
 *
 * Arch-specific dump_stack() implementations can use this function to
 * print out the same debug information as the generic dump_stack().
 */
void dump_stack_print_info(const char *log_lvl)
{
	printk("%sCPU: %d UID: %u PID: %d Comm: %.20s %s%s %s %.*s %s " BUILD_ID_FMT "\n",
	       log_lvl, raw_smp_processor_id(),
	       __kuid_val(current_real_cred()->euid),
	       current->pid, current->comm,
	       kexec_crash_loaded() ? "Kdump: loaded " : "",
	       print_tainted(),
	       init_utsname()->release,
	       (int)strcspn(init_utsname()->version, " "),
	       init_utsname()->version, preempt_model_str(), BUILD_ID_VAL);

	if (get_taint())
		printk("%s%s\n", log_lvl, print_tainted_verbose());

	if (dump_stack_arch_desc_str[0] != '\0')
		printk("%sHardware name: %s\n",
		       log_lvl, dump_stack_arch_desc_str);

	print_worker_info(log_lvl, current);
	print_stop_info(log_lvl, current);
	print_scx_info(log_lvl, current);
}

/**
 * show_regs_print_info - print generic debug info for show_regs()
 * @log_lvl: log level
 *
 * show_regs() implementations can use this function to print out generic
 * debug information.
 */
void show_regs_print_info(const char *log_lvl)
{
	dump_stack_print_info(log_lvl);
}

static void __dump_stack(const char *log_lvl)
{
	dump_stack_print_info(log_lvl);
	show_stack(NULL, NULL, log_lvl);
}

/**
 * dump_stack_lvl - dump the current task information and its stack trace
 * @log_lvl: log level
 *
 * Architectures can override this implementation by implementing their own.
 */
asmlinkage __visible void dump_stack_lvl(const char *log_lvl)
{
	bool in_panic = this_cpu_in_panic();
	unsigned long flags;

	/*
	 * Permit this CPU to perform nested stack dumps while serialising
	 * against other CPUs, unless this CPU is in panic.
	 *
	 * When in panic, non-panic CPUs are not permitted to store new
	 * printk messages so there is no need to synchronize the output.
* This avoids potential deadlock in panic() if another CPU is * holding and unable to release the printk_cpu_sync. */ if (!in_panic) printk_cpu_sync_get_irqsave(flags); __dump_stack(log_lvl); if (!in_panic) printk_cpu_sync_put_irqrestore(flags); } EXPORT_SYMBOL(dump_stack_lvl); asmlinkage __visible void dump_stack(void) { dump_stack_lvl(KERN_DEFAULT); } EXPORT_SYMBOL(dump_stack);
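The format-once-into-a-static-buffer pattern of dump_stack_set_arch_desc() is easy to reproduce outside the kernel. A minimal userspace sketch, with an invented helper name and arbitrary strings:

/* Sketch of the vsnprintf-varargs pattern used above; not kernel code. */
#include <stdarg.h>
#include <stdio.h>

static char arch_desc[128];

static void set_arch_desc(const char *fmt, ...)
{
	va_list args;

	va_start(args, fmt);
	vsnprintf(arch_desc, sizeof(arch_desc), fmt, args);
	va_end(args);
}

int main(void)
{
	set_arch_desc("%s %s/%s, BIOS %s", "Vendor", "Board", "Rev1", "1.0");
	printf("Hardware name: %s\n", arch_desc);	/* printed with dumps */
	return 0;
}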
// SPDX-License-Identifier: GPL-2.0-or-later
/* Linux driver for Philips webcam
   Various miscellaneous functions and tables.
   (C) 1999-2003 Nemosoft Unv.
   (C) 2004-2006 Luc Saillard (luc@saillard.org)

   NOTE: this version of pwc is an unofficial (modified) release of pwc & pcwx
   driver and thus may have bugs that are not present in the original version.
   Please send bug reports and support requests to <luc@saillard.org>.
   The decompression routines have been implemented by reverse-engineering the
   Nemosoft binary pwcx module. Caveat emptor.
*/

#include "pwc.h"

const int pwc_image_sizes[PSZ_MAX][2] = {
	{ 128,  96 },	/* sqcif */
	{ 160, 120 },	/* qsif */
	{ 176, 144 },	/* qcif */
	{ 320, 240 },	/* sif */
	{ 352, 288 },	/* cif */
	{ 640, 480 },	/* vga */
};

/* x,y -> PSZ_ */
int pwc_get_size(struct pwc_device *pdev, int width, int height)
{
	int i;

	/* Find the largest size supported by the camera that fits into the
	   requested size. */
	for (i = PSZ_MAX - 1; i >= 0; i--) {
		if (!(pdev->image_mask & (1 << i)))
			continue;

		if (pwc_image_sizes[i][0] <= width &&
		    pwc_image_sizes[i][1] <= height)
			return i;
	}

	/* No mode found, return the smallest mode we have */
	for (i = 0; i < PSZ_MAX; i++) {
		if (pdev->image_mask & (1 << i))
			return i;
	}

	/* Never reached; there is always at least one supported mode */
	return 0;
}

/* initialize variables depending on type and decompressor */
void pwc_construct(struct pwc_device *pdev)
{
	if (DEVICE_USE_CODEC1(pdev->type)) {
		pdev->image_mask = 1 << PSZ_SQCIF | 1 << PSZ_QCIF | 1 << PSZ_CIF;
		pdev->vcinterface = 2;
		pdev->vendpoint = 4;
		pdev->frame_header_size = 0;
		pdev->frame_trailer_size = 0;
	} else if (DEVICE_USE_CODEC3(pdev->type)) {
		pdev->image_mask = 1 << PSZ_QSIF | 1 << PSZ_SIF | 1 << PSZ_VGA;
		pdev->vcinterface = 3;
		pdev->vendpoint = 5;
		pdev->frame_header_size = TOUCAM_HEADER_SIZE;
		pdev->frame_trailer_size = TOUCAM_TRAILER_SIZE;
	} else /* if (DEVICE_USE_CODEC2(pdev->type)) */ {
		pdev->image_mask = 1 << PSZ_SQCIF | 1 << PSZ_QSIF | 1 << PSZ_QCIF |
				   1 << PSZ_SIF | 1 << PSZ_CIF | 1 << PSZ_VGA;
		pdev->vcinterface = 3;
		pdev->vendpoint = 4;
		pdev->frame_header_size = 0;
		pdev->frame_trailer_size = 0;
	}
}
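The size-selection rule in pwc_get_size() can be exercised standalone: scan the table from largest to smallest, skip modes missing from the support mask, and fall back to the smallest supported mode. A self-contained sketch (the mask value below is invented; it matches what a CODEC3 device would advertise):

/* Standalone sketch of the pwc_get_size() selection rule. */
#include <stdio.h>

#define N_SIZES 6
static const int sizes[N_SIZES][2] = {
	{ 128, 96 }, { 160, 120 }, { 176, 144 },
	{ 320, 240 }, { 352, 288 }, { 640, 480 },
};

static int get_size(unsigned int mask, int width, int height)
{
	int i;

	for (i = N_SIZES - 1; i >= 0; i--) {
		if (!(mask & (1 << i)))
			continue;
		if (sizes[i][0] <= width && sizes[i][1] <= height)
			return i;	/* largest mode that fits */
	}
	for (i = 0; i < N_SIZES; i++)
		if (mask & (1 << i))
			return i;	/* smallest supported fallback */
	return 0;
}

int main(void)
{
	unsigned int mask = (1 << 1) | (1 << 3) | (1 << 5); /* qsif/sif/vga */
	int best = get_size(mask, 400, 300);

	printf("best fit: %dx%d\n", sizes[best][0], sizes[best][1]);
	return 0;
}

With a 400x300 request this picks sif (320x240): vga is too large and cif is not in the mask.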
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system. INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Definitions for the TCP protocol.
 *
 * Version:	@(#)tcp.h	1.0.2	04/28/93
 *
 * Author:	Fred N.
van Kempen, <waltje@uWalt.NL.Mugnet.ORG> */ #ifndef _LINUX_TCP_H #define _LINUX_TCP_H #include <linux/skbuff.h> #include <linux/win_minmax.h> #include <net/sock.h> #include <net/inet_connection_sock.h> #include <net/inet_timewait_sock.h> #include <uapi/linux/tcp.h> static inline struct tcphdr *tcp_hdr(const struct sk_buff *skb) { return (struct tcphdr *)skb_transport_header(skb); } static inline unsigned int __tcp_hdrlen(const struct tcphdr *th) { return th->doff * 4; } static inline unsigned int tcp_hdrlen(const struct sk_buff *skb) { return __tcp_hdrlen(tcp_hdr(skb)); } static inline struct tcphdr *inner_tcp_hdr(const struct sk_buff *skb) { return (struct tcphdr *)skb_inner_transport_header(skb); } static inline unsigned int inner_tcp_hdrlen(const struct sk_buff *skb) { return inner_tcp_hdr(skb)->doff * 4; } /** * skb_tcp_all_headers - Returns size of all headers for a TCP packet * @skb: buffer * * Used in TX path, for a packet known to be a TCP one. * * if (skb_is_gso(skb)) { * int hlen = skb_tcp_all_headers(skb); * ... */ static inline int skb_tcp_all_headers(const struct sk_buff *skb) { return skb_transport_offset(skb) + tcp_hdrlen(skb); } /** * skb_inner_tcp_all_headers - Returns size of all headers for an encap TCP packet * @skb: buffer * * Used in TX path, for a packet known to be a TCP one. * * if (skb_is_gso(skb) && skb->encapsulation) { * int hlen = skb_inner_tcp_all_headers(skb); * ... */ static inline int skb_inner_tcp_all_headers(const struct sk_buff *skb) { return skb_inner_transport_offset(skb) + inner_tcp_hdrlen(skb); } static inline unsigned int tcp_optlen(const struct sk_buff *skb) { return (tcp_hdr(skb)->doff - 5) * 4; } /* TCP Fast Open */ #define TCP_FASTOPEN_COOKIE_MIN 4 /* Min Fast Open Cookie size in bytes */ #define TCP_FASTOPEN_COOKIE_MAX 16 /* Max Fast Open Cookie size in bytes */ #define TCP_FASTOPEN_COOKIE_SIZE 8 /* the size employed by this impl. */ /* TCP Fast Open Cookie as stored in memory */ struct tcp_fastopen_cookie { __le64 val[DIV_ROUND_UP(TCP_FASTOPEN_COOKIE_MAX, sizeof(u64))]; s8 len; bool exp; /* In RFC6994 experimental option format */ }; /* This defines a selective acknowledgement block. 
*/ struct tcp_sack_block_wire { __be32 start_seq; __be32 end_seq; }; struct tcp_sack_block { u32 start_seq; u32 end_seq; }; /*These are used to set the sack_ok field in struct tcp_options_received */ #define TCP_SACK_SEEN (1 << 0) /*1 = peer is SACK capable, */ #define TCP_DSACK_SEEN (1 << 2) /*1 = DSACK was received from peer*/ struct tcp_options_received { /* PAWS/RTTM data */ int ts_recent_stamp;/* Time we stored ts_recent (for aging) */ u32 ts_recent; /* Time stamp to echo next */ u32 rcv_tsval; /* Time stamp value */ u32 rcv_tsecr; /* Time stamp echo reply */ u16 saw_tstamp : 1, /* Saw TIMESTAMP on last packet */ tstamp_ok : 1, /* TIMESTAMP seen on SYN packet */ dsack : 1, /* D-SACK is scheduled */ wscale_ok : 1, /* Wscale seen on SYN packet */ sack_ok : 3, /* SACK seen on SYN packet */ smc_ok : 1, /* SMC seen on SYN packet */ snd_wscale : 4, /* Window scaling received from sender */ rcv_wscale : 4; /* Window scaling to send to receiver */ u8 saw_unknown:1, /* Received unknown option */ unused:7; u8 num_sacks; /* Number of SACK blocks */ u16 user_mss; /* mss requested by user in ioctl */ u16 mss_clamp; /* Maximal mss, negotiated at connection setup */ }; static inline void tcp_clear_options(struct tcp_options_received *rx_opt) { rx_opt->tstamp_ok = rx_opt->sack_ok = 0; rx_opt->wscale_ok = rx_opt->snd_wscale = 0; #if IS_ENABLED(CONFIG_SMC) rx_opt->smc_ok = 0; #endif } /* This is the max number of SACKS that we'll generate and process. It's safe * to increase this, although since: * size = TCPOLEN_SACK_BASE_ALIGNED (4) + n * TCPOLEN_SACK_PERBLOCK (8) * only four options will fit in a standard TCP header */ #define TCP_NUM_SACKS 4 struct tcp_request_sock_ops; struct tcp_request_sock { struct inet_request_sock req; const struct tcp_request_sock_ops *af_specific; u64 snt_synack; /* first SYNACK sent time */ bool tfo_listener; bool is_mptcp; bool req_usec_ts; #if IS_ENABLED(CONFIG_MPTCP) bool drop_req; #endif u32 txhash; u32 rcv_isn; u32 snt_isn; u32 ts_off; u32 snt_tsval_first; u32 snt_tsval_last; u32 last_oow_ack_time; /* last SYNACK */ u32 rcv_nxt; /* the ack # by SYNACK. For * FastOpen it's the seq# * after data-in-SYN. */ u8 syn_tos; #ifdef CONFIG_TCP_AO u8 ao_keyid; u8 ao_rcv_next; bool used_tcp_ao; #endif }; static inline struct tcp_request_sock *tcp_rsk(const struct request_sock *req) { return (struct tcp_request_sock *)req; } static inline bool tcp_rsk_used_ao(const struct request_sock *req) { #ifndef CONFIG_TCP_AO return false; #else return tcp_rsk(req)->used_tcp_ao; #endif } #define TCP_RMEM_TO_WIN_SCALE 8 struct tcp_sock { /* Cacheline organization can be found documented in * Documentation/networking/net_cachelines/tcp_sock.rst. * Please update the document when adding new fields. */ /* inet_connection_sock has to be the first member of tcp_sock */ struct inet_connection_sock inet_conn; /* TX read-mostly hotpath cache lines */ __cacheline_group_begin(tcp_sock_read_tx); u32 max_window; /* Maximal window ever seen from peer */ u32 rcv_ssthresh; /* Current window clamp */ u32 reordering; /* Packet reordering metric. 
*/ u32 notsent_lowat; /* TCP_NOTSENT_LOWAT */ u16 gso_segs; /* Max number of segs per GSO packet */ /* from STCP, retrans queue hinting */ struct sk_buff *retransmit_skb_hint; __cacheline_group_end(tcp_sock_read_tx); /* TXRX read-mostly hotpath cache lines */ __cacheline_group_begin(tcp_sock_read_txrx); u32 tsoffset; /* timestamp offset */ u32 snd_wnd; /* The window we expect to receive */ u32 mss_cache; /* Cached effective mss, not including SACKS */ u32 snd_cwnd; /* Sending congestion window */ u32 prr_out; /* Total number of pkts sent during Recovery. */ u32 lost_out; /* Lost packets */ u32 sacked_out; /* SACK'd packets */ u16 tcp_header_len; /* Bytes of tcp header to send */ u8 scaling_ratio; /* see tcp_win_from_space() */ u8 chrono_type : 2, /* current chronograph type */ repair : 1, tcp_usec_ts : 1, /* TSval values in usec */ is_sack_reneg:1, /* in recovery from loss with SACK reneg? */ is_cwnd_limited:1;/* forward progress limited by snd_cwnd? */ __cacheline_group_end(tcp_sock_read_txrx); /* RX read-mostly hotpath cache lines */ __cacheline_group_begin(tcp_sock_read_rx); u32 copied_seq; /* Head of yet unread data */ u32 rcv_tstamp; /* timestamp of last received ACK (for keepalives) */ u32 snd_wl1; /* Sequence for window update */ u32 tlp_high_seq; /* snd_nxt at the time of TLP */ u32 rttvar_us; /* smoothed mdev_max */ u32 retrans_out; /* Retransmitted packets out */ u16 advmss; /* Advertised MSS */ u16 urg_data; /* Saved octet of OOB data and control flags */ u32 lost; /* Total data packets lost incl. rexmits */ struct minmax rtt_min; /* OOO segments go in this rbtree. Socket lock must be held. */ struct rb_root out_of_order_queue; #if defined(CONFIG_TLS_DEVICE) void (*tcp_clean_acked)(struct sock *sk, u32 acked_seq); #endif u32 snd_ssthresh; /* Slow start size threshold */ u8 recvmsg_inq : 1;/* Indicate # of bytes in queue upon recvmsg */ __cacheline_group_end(tcp_sock_read_rx); /* TX read-write hotpath cache lines */ __cacheline_group_begin(tcp_sock_write_tx) ____cacheline_aligned; u32 segs_out; /* RFC4898 tcpEStatsPerfSegsOut * The total number of segments sent. */ u32 data_segs_out; /* RFC4898 tcpEStatsPerfDataSegsOut * total number of data segments sent. */ u64 bytes_sent; /* RFC4898 tcpEStatsPerfHCDataOctetsOut * total number of data bytes sent. */ u32 snd_sml; /* Last byte of the most recently transmitted small packet */ u32 chrono_start; /* Start time in jiffies of a TCP chrono */ u32 chrono_stat[3]; /* Time in jiffies for chrono_stat stats */ u32 write_seq; /* Tail(+1) of data held in tcp send buffer */ u32 pushed_seq; /* Last pushed seq, required to talk to windows */ u32 lsndtime; /* timestamp of last sent data packet (for restart window) */ u32 mdev_us; /* medium deviation */ u32 rtt_seq; /* sequence number to update rttvar */ u64 tcp_wstamp_ns; /* departure time for next sent data packet */ struct list_head tsorted_sent_queue; /* time-sorted sent but un-SACKed skbs */ struct sk_buff *highest_sack; /* skb just after the highest * skb with SACKed bit set * (validity guaranteed only if * sacked_out > 0) */ u8 ecn_flags; /* ECN status bits. 
*/ __cacheline_group_end(tcp_sock_write_tx); /* TXRX read-write hotpath cache lines */ __cacheline_group_begin(tcp_sock_write_txrx); /* * Header prediction flags * 0x5?10 << 16 + snd_wnd in net byte order */ __be32 pred_flags; u64 tcp_clock_cache; /* cache last tcp_clock_ns() (see tcp_mstamp_refresh()) */ u64 tcp_mstamp; /* most recent packet received/sent */ u32 rcv_nxt; /* What we want to receive next */ u32 snd_nxt; /* Next sequence we send */ u32 snd_una; /* First byte we want an ack for */ u32 window_clamp; /* Maximal window to advertise */ u32 srtt_us; /* smoothed round trip time << 3 in usecs */ u32 packets_out; /* Packets which are "in flight" */ u32 snd_up; /* Urgent pointer */ u32 delivered; /* Total data packets delivered incl. rexmits */ u32 delivered_ce; /* Like the above but only ECE marked packets */ u32 app_limited; /* limited until "delivered" reaches this val */ u32 rcv_wnd; /* Current receiver window */ /* * Options received (usually on last packet, some only on SYN packets). */ struct tcp_options_received rx_opt; u8 nonagle : 4,/* Disable Nagle algorithm? */ rate_app_limited:1; /* rate_{delivered,interval_us} limited? */ __cacheline_group_end(tcp_sock_write_txrx); /* RX read-write hotpath cache lines */ __cacheline_group_begin(tcp_sock_write_rx) __aligned(8); u64 bytes_received; /* RFC4898 tcpEStatsAppHCThruOctetsReceived * sum(delta(rcv_nxt)), or how many bytes * were acked. */ u32 segs_in; /* RFC4898 tcpEStatsPerfSegsIn * total number of segments in. */ u32 data_segs_in; /* RFC4898 tcpEStatsPerfDataSegsIn * total number of data segments in. */ u32 rcv_wup; /* rcv_nxt on last window update sent */ u32 max_packets_out; /* max packets_out in last window */ u32 cwnd_usage_seq; /* right edge of cwnd usage tracking flight */ u32 rate_delivered; /* saved rate sample: packets delivered */ u32 rate_interval_us; /* saved rate sample: time elapsed */ u32 rcv_rtt_last_tsecr; u64 first_tx_mstamp; /* start of window send phase */ u64 delivered_mstamp; /* time we reached "delivered" */ u64 bytes_acked; /* RFC4898 tcpEStatsAppHCThruOctetsAcked * sum(delta(snd_una)), or how many bytes * were acked. */ struct { u32 rtt_us; u32 seq; u64 time; } rcv_rtt_est; /* Receiver queue space */ struct { int space; u32 seq; u64 time; } rcvq_space; __cacheline_group_end(tcp_sock_write_rx); /* End of Hot Path */ /* * RFC793 variables by their proper names. This means you can * read the code and the spec side by side (and laugh ...) * See RFC793 and RFC1122. The RFC writes these in capitals. */ u32 dsack_dups; /* RFC4898 tcpEStatsStackDSACKDups * total number of DSACK blocks received */ u32 compressed_ack_rcv_nxt; struct list_head tsq_node; /* anchor in tsq_tasklet.head list */ /* Information of the most recently (s)acked skb */ struct tcp_rack { u64 mstamp; /* (Re)sent time of the skb */ u32 rtt_us; /* Associated RTT */ u32 end_seq; /* Ending TCP sequence of the skb */ u32 last_delivered; /* tp->delivered at last reo_wnd adj */ u8 reo_wnd_steps; /* Allowed reordering window */ #define TCP_RACK_RECOVERY_THRESH 16 u8 reo_wnd_persist:5, /* No. 
of recovery since last adj */ dsack_seen:1, /* Whether DSACK seen after last adj */ advanced:1; /* mstamp advanced since last lost marking */ } rack; u8 compressed_ack; u8 dup_ack_counter:2, tlp_retrans:1, /* TLP is a retransmission */ unused:5; u8 thin_lto : 1,/* Use linear timeouts for thin streams */ fastopen_connect:1, /* FASTOPEN_CONNECT sockopt */ fastopen_no_cookie:1, /* Allow send/recv SYN+data without a cookie */ fastopen_client_fail:2, /* reason why fastopen failed */ frto : 1;/* F-RTO (RFC5682) activated in CA_Loss */ u8 repair_queue; u8 save_syn:2, /* Save headers of SYN packet */ syn_data:1, /* SYN includes data */ syn_fastopen:1, /* SYN includes Fast Open option */ syn_fastopen_exp:1,/* SYN includes Fast Open exp. option */ syn_fastopen_ch:1, /* Active TFO re-enabling probe */ syn_data_acked:1,/* data in SYN is acked by SYN-ACK */ syn_fastopen_child:1; /* created TFO passive child socket */ u8 keepalive_probes; /* num of allowed keep alive probes */ u32 tcp_tx_delay; /* delay (in usec) added to TX packets */ /* RTT measurement */ u32 mdev_max_us; /* maximal mdev for the last rtt period */ u32 reord_seen; /* number of data packet reordering events */ /* * Slow start and congestion control (see also Nagle, and Karn & Partridge) */ u32 snd_cwnd_cnt; /* Linear increase counter */ u32 snd_cwnd_clamp; /* Do not allow snd_cwnd to grow above this */ u32 snd_cwnd_used; u32 snd_cwnd_stamp; u32 prior_cwnd; /* cwnd right before starting loss recovery */ u32 prr_delivered; /* Number of newly delivered packets to * receiver in Recovery. */ u32 last_oow_ack_time; /* timestamp of last out-of-window ACK */ struct hrtimer pacing_timer; struct hrtimer compressed_ack_timer; struct sk_buff *ooo_last_skb; /* cache rb_last(out_of_order_queue) */ /* SACKs data, these 2 need to be together (see tcp_options_write) */ struct tcp_sack_block duplicate_sack[1]; /* D-SACK block */ struct tcp_sack_block selective_acks[4]; /* The SACKS themselves*/ struct tcp_sack_block recv_sack_cache[4]; u32 prior_ssthresh; /* ssthresh saved at recovery start */ u32 high_seq; /* snd_nxt at onset of congestion */ u32 retrans_stamp; /* Timestamp of the last retransmit, * also used in SYN-SENT to remember stamp of * the first SYN. */ u32 undo_marker; /* snd_una upon a new recovery episode. */ int undo_retrans; /* number of undoable retransmissions. */ u64 bytes_retrans; /* RFC4898 tcpEStatsPerfOctetsRetrans * Total data bytes retransmitted */ u32 total_retrans; /* Total retransmits for entire connection */ u32 rto_stamp; /* Start time (ms) of last CA_Loss recovery */ u16 total_rto; /* Total number of RTO timeouts, including * SYN/SYN-ACK and recurring timeouts. */ u16 total_rto_recoveries; /* Total number of RTO recoveries, * including any unfinished recovery. */ u32 total_rto_time; /* ms spent in (completed) RTO recoveries. */ u32 urg_seq; /* Seq of received urgent pointer */ unsigned int keepalive_time; /* time before keep alive takes place */ unsigned int keepalive_intvl; /* time interval between keep alive probes */ int linger2; /* Sock_ops bpf program related variables */ #ifdef CONFIG_BPF u8 bpf_sock_ops_cb_flags; /* Control calling BPF programs * values defined in uapi/linux/tcp.h */ u8 bpf_chg_cc_inprogress:1; /* In the middle of * bpf_setsockopt(TCP_CONGESTION), * it is to avoid the bpf_tcp_cc->init() * to recur itself by calling * bpf_setsockopt(TCP_CONGESTION, "itself"). 
*/ #define BPF_SOCK_OPS_TEST_FLAG(TP, ARG) (TP->bpf_sock_ops_cb_flags & ARG) #else #define BPF_SOCK_OPS_TEST_FLAG(TP, ARG) 0 #endif u16 timeout_rehash; /* Timeout-triggered rehash attempts */ u32 rcv_ooopack; /* Received out-of-order packets, for tcpinfo */ /* TCP-specific MTU probe information. */ struct { u32 probe_seq_start; u32 probe_seq_end; } mtu_probe; u32 plb_rehash; /* PLB-triggered rehash attempts */ u32 mtu_info; /* We received an ICMP_FRAG_NEEDED / ICMPV6_PKT_TOOBIG * while socket was owned by user. */ #if IS_ENABLED(CONFIG_MPTCP) bool is_mptcp; #endif #if IS_ENABLED(CONFIG_SMC) bool syn_smc; /* SYN includes SMC */ bool (*smc_hs_congested)(const struct sock *sk); #endif #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO) /* TCP AF-Specific parts; only used by TCP-AO/MD5 Signature support so far */ const struct tcp_sock_af_ops *af_specific; #ifdef CONFIG_TCP_MD5SIG /* TCP MD5 Signature Option information */ struct tcp_md5sig_info __rcu *md5sig_info; #endif #ifdef CONFIG_TCP_AO struct tcp_ao_info __rcu *ao_info; #endif #endif /* TCP fastopen related information */ struct tcp_fastopen_request *fastopen_req; /* fastopen_rsk points to request_sock that resulted in this big * socket. Used to retransmit SYNACKs etc. */ struct request_sock __rcu *fastopen_rsk; struct saved_syn *saved_syn; }; enum tsq_enum { TSQ_THROTTLED, TSQ_QUEUED, TCP_TSQ_DEFERRED, /* tcp_tasklet_func() found socket was owned */ TCP_WRITE_TIMER_DEFERRED, /* tcp_write_timer() found socket was owned */ TCP_DELACK_TIMER_DEFERRED, /* tcp_delack_timer() found socket was owned */ TCP_MTU_REDUCED_DEFERRED, /* tcp_v{4|6}_err() could not call * tcp_v{4|6}_mtu_reduced() */ TCP_ACK_DEFERRED, /* TX pure ack is deferred */ }; enum tsq_flags { TSQF_THROTTLED = BIT(TSQ_THROTTLED), TSQF_QUEUED = BIT(TSQ_QUEUED), TCPF_TSQ_DEFERRED = BIT(TCP_TSQ_DEFERRED), TCPF_WRITE_TIMER_DEFERRED = BIT(TCP_WRITE_TIMER_DEFERRED), TCPF_DELACK_TIMER_DEFERRED = BIT(TCP_DELACK_TIMER_DEFERRED), TCPF_MTU_REDUCED_DEFERRED = BIT(TCP_MTU_REDUCED_DEFERRED), TCPF_ACK_DEFERRED = BIT(TCP_ACK_DEFERRED), }; #define tcp_sk(ptr) container_of_const(ptr, struct tcp_sock, inet_conn.icsk_inet.sk) /* Variant of tcp_sk() upgrading a const sock to a read/write tcp socket. * Used in context of (lockless) tcp listeners. 
*/ #define tcp_sk_rw(ptr) container_of(ptr, struct tcp_sock, inet_conn.icsk_inet.sk) struct tcp_timewait_sock { struct inet_timewait_sock tw_sk; #define tw_rcv_nxt tw_sk.__tw_common.skc_tw_rcv_nxt #define tw_snd_nxt tw_sk.__tw_common.skc_tw_snd_nxt u32 tw_rcv_wnd; u32 tw_ts_offset; u32 tw_ts_recent; /* The time we sent the last out-of-window ACK: */ u32 tw_last_oow_ack_time; int tw_ts_recent_stamp; u32 tw_tx_delay; #ifdef CONFIG_TCP_MD5SIG struct tcp_md5sig_key *tw_md5_key; #endif #ifdef CONFIG_TCP_AO struct tcp_ao_info __rcu *ao_info; #endif }; static inline struct tcp_timewait_sock *tcp_twsk(const struct sock *sk) { return (struct tcp_timewait_sock *)sk; } static inline bool tcp_passive_fastopen(const struct sock *sk) { return sk->sk_state == TCP_SYN_RECV && rcu_access_pointer(tcp_sk(sk)->fastopen_rsk) != NULL; } static inline void fastopen_queue_tune(struct sock *sk, int backlog) { struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue; int somaxconn = READ_ONCE(sock_net(sk)->core.sysctl_somaxconn); WRITE_ONCE(queue->fastopenq.max_qlen, min_t(unsigned int, backlog, somaxconn)); } static inline void tcp_move_syn(struct tcp_sock *tp, struct request_sock *req) { tp->saved_syn = req->saved_syn; req->saved_syn = NULL; } static inline void tcp_saved_syn_free(struct tcp_sock *tp) { kfree(tp->saved_syn); tp->saved_syn = NULL; } static inline u32 tcp_saved_syn_len(const struct saved_syn *saved_syn) { return saved_syn->mac_hdrlen + saved_syn->network_hdrlen + saved_syn->tcp_hdrlen; } struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk, const struct sk_buff *orig_skb, const struct sk_buff *ack_skb); static inline u16 tcp_mss_clamp(const struct tcp_sock *tp, u16 mss) { /* We use READ_ONCE() here because socket might not be locked. * This happens for listeners. */ u16 user_mss = READ_ONCE(tp->rx_opt.user_mss); return (user_mss && user_mss < mss) ? user_mss : mss; } int tcp_skb_shift(struct sk_buff *to, struct sk_buff *from, int pcount, int shiftlen); void __tcp_sock_set_cork(struct sock *sk, bool on); void tcp_sock_set_cork(struct sock *sk, bool on); int tcp_sock_set_keepcnt(struct sock *sk, int val); int tcp_sock_set_keepidle_locked(struct sock *sk, int val); int tcp_sock_set_keepidle(struct sock *sk, int val); int tcp_sock_set_keepintvl(struct sock *sk, int val); void __tcp_sock_set_nodelay(struct sock *sk, bool on); void tcp_sock_set_nodelay(struct sock *sk); void tcp_sock_set_quickack(struct sock *sk, int val); int tcp_sock_set_syncnt(struct sock *sk, int val); int tcp_sock_set_user_timeout(struct sock *sk, int val); int tcp_sock_set_maxseg(struct sock *sk, int val); static inline bool dst_tcp_usec_ts(const struct dst_entry *dst) { return dst_feature(dst, RTAX_FEATURE_TCP_USEC_TS); } #endif /* _LINUX_TCP_H */
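The doff arithmetic behind tcp_hdrlen() and tcp_optlen() above is worth seeing in isolation: doff counts 32-bit words, five of which are the fixed TCP header. A quick standalone check (the doff value is just an example):

/* Userspace check of the tcp_hdrlen()/tcp_optlen() arithmetic. */
#include <stdio.h>

int main(void)
{
	unsigned int doff = 8;			/* e.g. 12 bytes of options */
	unsigned int hdrlen = doff * 4;		/* total TCP header length */
	unsigned int optlen = (doff - 5) * 4;	/* option bytes only */

	printf("header %u bytes, options %u bytes\n", hdrlen, optlen);
	return 0;
}

This prints "header 32 bytes, options 12 bytes"; doff = 5 is the minimum (no options) and 15 the maximum (40 option bytes), which is why TCP_NUM_SACKS is capped at 4.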
// SPDX-License-Identifier: GPL-2.0
/*
 * DMA memory management for framework level HCD code (hc_driver)
 *
 * This implementation plugs in through generic "usb_bus" level methods,
 * and should work with all USB controllers, regardless of bus type.
 *
 * Released under the GPLv2 only.
 */

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/device.h>
#include <linux/mm.h>
#include <linux/io.h>
#include <linux/dma-mapping.h>
#include <linux/dmapool.h>
#include <linux/genalloc.h>
#include <linux/usb.h>
#include <linux/usb/hcd.h>

/*
 * DMA-Coherent Buffers
 */

/* FIXME tune these based on pool statistics ... */
static size_t pool_max[HCD_BUFFER_POOLS] = {
	32, 128, 512, 2048,
};

void __init usb_init_pool_max(void)
{
	/*
	 * The pool_max values must never be smaller than
	 * ARCH_DMA_MINALIGN.
	 */
	if (ARCH_DMA_MINALIGN <= 32)
		;			/* Original value is okay */
	else if (ARCH_DMA_MINALIGN <= 64)
		pool_max[0] = 64;
	else if (ARCH_DMA_MINALIGN <= 128)
		pool_max[0] = 0;	/* Don't use this pool */
	else
		BUILD_BUG();		/* We don't allow this */
}

/* SETUP primitives */

/**
 * hcd_buffer_create - initialize buffer pools
 * @hcd: the bus whose buffer pools are to be initialized
 *
 * Context: task context, might sleep
 *
 * Call this as part of initializing a host controller that uses the dma
 * memory allocators. It initializes some pools of dma-coherent memory that
 * will be shared by all drivers using that controller.
 *
 * Call hcd_buffer_destroy() to clean up after using those pools.
 *
 * Return: 0 if successful. A negative errno value otherwise.
 */
int hcd_buffer_create(struct usb_hcd *hcd)
{
	char name[16];
	int i, size;

	if (hcd->localmem_pool || !hcd_uses_dma(hcd))
		return 0;

	for (i = 0; i < HCD_BUFFER_POOLS; i++) {
		size = pool_max[i];
		if (!size)
			continue;
		snprintf(name, sizeof(name), "buffer-%d", size);
		hcd->pool[i] = dma_pool_create(name, hcd->self.sysdev,
					       size, size, 0);
		if (!hcd->pool[i]) {
			hcd_buffer_destroy(hcd);
			return -ENOMEM;
		}
	}
	return 0;
}

/**
 * hcd_buffer_destroy - deallocate buffer pools
 * @hcd: the bus whose buffer pools are to be destroyed
 *
 * Context: task context, might sleep
 *
 * This frees the buffer pools created by hcd_buffer_create().
 */
void hcd_buffer_destroy(struct usb_hcd *hcd)
{
	int i;

	if (!IS_ENABLED(CONFIG_HAS_DMA))
		return;

	for (i = 0; i < HCD_BUFFER_POOLS; i++) {
		dma_pool_destroy(hcd->pool[i]);
		hcd->pool[i] = NULL;
	}
}

/* sometimes alloc/free could use kmalloc with GFP_DMA, for
 * better sharing and to leverage mm/slab.c intelligence.
*/ void *hcd_buffer_alloc( struct usb_bus *bus, size_t size, gfp_t mem_flags, dma_addr_t *dma ) { struct usb_hcd *hcd = bus_to_hcd(bus); int i; if (size == 0) return NULL; if (hcd->localmem_pool) return gen_pool_dma_alloc(hcd->localmem_pool, size, dma); /* some USB hosts just use PIO */ if (!hcd_uses_dma(hcd)) { *dma = ~(dma_addr_t) 0; return kmalloc(size, mem_flags); } for (i = 0; i < HCD_BUFFER_POOLS; i++) { if (size <= pool_max[i]) return dma_pool_alloc(hcd->pool[i], mem_flags, dma); } return dma_alloc_coherent(hcd->self.sysdev, size, dma, mem_flags); } void hcd_buffer_free( struct usb_bus *bus, size_t size, void *addr, dma_addr_t dma ) { struct usb_hcd *hcd = bus_to_hcd(bus); int i; if (!addr) return; if (hcd->localmem_pool) { gen_pool_free(hcd->localmem_pool, (unsigned long)addr, size); return; } if (!hcd_uses_dma(hcd)) { kfree(addr); return; } for (i = 0; i < HCD_BUFFER_POOLS; i++) { if (size <= pool_max[i]) { dma_pool_free(hcd->pool[i], addr, dma); return; } } dma_free_coherent(hcd->self.sysdev, size, addr, dma); } void *hcd_buffer_alloc_pages(struct usb_hcd *hcd, size_t size, gfp_t mem_flags, dma_addr_t *dma) { if (size == 0) return NULL; if (hcd->localmem_pool) return gen_pool_dma_alloc_align(hcd->localmem_pool, size, dma, PAGE_SIZE); /* some USB hosts just use PIO */ if (!hcd_uses_dma(hcd)) { *dma = DMA_MAPPING_ERROR; return (void *)__get_free_pages(mem_flags, get_order(size)); } return dma_alloc_coherent(hcd->self.sysdev, size, dma, mem_flags); } void hcd_buffer_free_pages(struct usb_hcd *hcd, size_t size, void *addr, dma_addr_t dma) { if (!addr) return; if (hcd->localmem_pool) { gen_pool_free(hcd->localmem_pool, (unsigned long)addr, size); return; } if (!hcd_uses_dma(hcd)) { free_pages((unsigned long)addr, get_order(size)); return; } dma_free_coherent(hcd->self.sysdev, size, addr, dma); }
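The allocation path above is a classic size-class lookup: find the first pool whose maximum size covers the request, otherwise fall through to the generic DMA allocator. A userspace sketch of just that decision (pool sizes mirror pool_max[]; the request sizes are invented, and the "generic allocator" branch only stands in for dma_alloc_coherent()):

/* Sketch of the size-class selection used by hcd_buffer_alloc(). */
#include <stddef.h>
#include <stdio.h>

#define NPOOLS 4
static const size_t pool_max_sz[NPOOLS] = { 32, 128, 512, 2048 };

static int pick_pool(size_t size)
{
	int i;

	for (i = 0; i < NPOOLS; i++)
		if (size <= pool_max_sz[i])
			return i;
	return -1;	/* too big: real code calls dma_alloc_coherent() */
}

int main(void)
{
	size_t reqs[] = { 8, 64, 512, 4096 };
	int i;

	for (i = 0; i < 4; i++) {
		int p = pick_pool(reqs[i]);

		if (p < 0)
			printf("%zu bytes: generic allocator\n", reqs[i]);
		else
			printf("%zu bytes: pool %d (<= %zu)\n",
			       reqs[i], p, pool_max_sz[p]);
	}
	return 0;
}

Note that hcd_buffer_free() repeats the same scan, which is why alloc and free must be given the same size.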
// SPDX-License-Identifier: GPL-2.0-only
#include <linux/module.h>

#include <net/sock.h>
#include <linux/netlink.h>
#include <linux/sock_diag.h>
#include <linux/netlink_diag.h>
#include <linux/rhashtable.h>

#include "af_netlink.h"

static int sk_diag_dump_groups(struct sock *sk, struct sk_buff *nlskb)
{
	struct netlink_sock *nlk = nlk_sk(sk);

	if (nlk->groups == NULL)
		return 0;

	return nla_put(nlskb, NETLINK_DIAG_GROUPS, NLGRPSZ(nlk->ngroups),
		       nlk->groups);
}

static int sk_diag_put_flags(struct sock *sk, struct sk_buff *skb)
{
	struct netlink_sock *nlk = nlk_sk(sk);
	u32 flags = 0;

	if (nlk->cb_running)
		flags |= NDIAG_FLAG_CB_RUNNING;
	if (nlk_test_bit(RECV_PKTINFO, sk))
		flags |= NDIAG_FLAG_PKTINFO;
	if (nlk_test_bit(BROADCAST_SEND_ERROR, sk))
		flags |= NDIAG_FLAG_BROADCAST_ERROR;
	if (nlk_test_bit(RECV_NO_ENOBUFS, sk))
		flags |= NDIAG_FLAG_NO_ENOBUFS;
	if (nlk_test_bit(LISTEN_ALL_NSID, sk))
		flags |= NDIAG_FLAG_LISTEN_ALL_NSID;
	if (nlk_test_bit(CAP_ACK, sk))
		flags |= NDIAG_FLAG_CAP_ACK;

	return nla_put_u32(skb, NETLINK_DIAG_FLAGS, flags);
}

static int sk_diag_fill(struct sock *sk, struct sk_buff *skb,
			struct netlink_diag_req *req,
			u32 portid, u32 seq, u32 flags, int sk_ino)
{
	struct nlmsghdr *nlh;
	struct netlink_diag_msg *rep;
	struct netlink_sock *nlk = nlk_sk(sk);

	nlh = nlmsg_put(skb, portid, seq, SOCK_DIAG_BY_FAMILY, sizeof(*rep),
			flags);
	if (!nlh)
		return -EMSGSIZE;

	rep = nlmsg_data(nlh);
	rep->ndiag_family = AF_NETLINK;
	rep->ndiag_type = sk->sk_type;
	rep->ndiag_protocol = sk->sk_protocol;
	rep->ndiag_state = sk->sk_state;

	rep->ndiag_ino = sk_ino;
	rep->ndiag_portid = nlk->portid;
	rep->ndiag_dst_portid = nlk->dst_portid;
	rep->ndiag_dst_group = nlk->dst_group;
	sock_diag_save_cookie(sk, rep->ndiag_cookie);

	if ((req->ndiag_show & NDIAG_SHOW_GROUPS) &&
	    sk_diag_dump_groups(sk, skb))
		goto out_nlmsg_trim;

	if ((req->ndiag_show & NDIAG_SHOW_MEMINFO) &&
	    sock_diag_put_meminfo(sk, skb, NETLINK_DIAG_MEMINFO))
		goto out_nlmsg_trim;

	if ((req->ndiag_show & NDIAG_SHOW_FLAGS) &&
	    sk_diag_put_flags(sk, skb))
		goto out_nlmsg_trim;

	nlmsg_end(skb, nlh);
	return 0;

out_nlmsg_trim:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}

static int __netlink_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
			       int protocol, int s_num)
{
	struct rhashtable_iter *hti = (void *)cb->args[2];
	struct netlink_table *tbl = &nl_table[protocol];
	struct net *net = sock_net(skb->sk);
	struct netlink_diag_req *req;
	struct netlink_sock *nlsk;
	unsigned long flags;
	struct sock *sk;
	int num = 2;
	int ret = 0;

	req = nlmsg_data(cb->nlh);

	if (s_num > 1)
		goto mc_list;
num--; if (!hti) { hti = kmalloc(sizeof(*hti), GFP_KERNEL); if (!hti) return -ENOMEM; cb->args[2] = (long)hti; } if (!s_num) rhashtable_walk_enter(&tbl->hash, hti); rhashtable_walk_start(hti); while ((nlsk = rhashtable_walk_next(hti))) { if (IS_ERR(nlsk)) { ret = PTR_ERR(nlsk); if (ret == -EAGAIN) { ret = 0; continue; } break; } sk = (struct sock *)nlsk; if (!net_eq(sock_net(sk), net)) continue; if (sk_diag_fill(sk, skb, req, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, sock_i_ino(sk)) < 0) { ret = 1; break; } } rhashtable_walk_stop(hti); if (ret) goto done; rhashtable_walk_exit(hti); num++; mc_list: read_lock_irqsave(&nl_table_lock, flags); sk_for_each_bound(sk, &tbl->mc_list) { if (sk_hashed(sk)) continue; if (!net_eq(sock_net(sk), net)) continue; if (num < s_num) { num++; continue; } if (sk_diag_fill(sk, skb, req, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, sock_i_ino(sk)) < 0) { ret = 1; break; } num++; } read_unlock_irqrestore(&nl_table_lock, flags); done: cb->args[0] = num; return ret; } static int netlink_diag_dump(struct sk_buff *skb, struct netlink_callback *cb) { struct netlink_diag_req *req; int s_num = cb->args[0]; int err = 0; req = nlmsg_data(cb->nlh); if (req->sdiag_protocol == NDIAG_PROTO_ALL) { int i; for (i = cb->args[1]; i < MAX_LINKS; i++) { err = __netlink_diag_dump(skb, cb, i, s_num); if (err) break; s_num = 0; } cb->args[1] = i; } else { if (req->sdiag_protocol >= MAX_LINKS) return -ENOENT; err = __netlink_diag_dump(skb, cb, req->sdiag_protocol, s_num); } return err <= 0 ? err : skb->len; } static int netlink_diag_dump_done(struct netlink_callback *cb) { struct rhashtable_iter *hti = (void *)cb->args[2]; if (cb->args[0] == 1) rhashtable_walk_exit(hti); kfree(hti); return 0; } static int netlink_diag_handler_dump(struct sk_buff *skb, struct nlmsghdr *h) { int hdrlen = sizeof(struct netlink_diag_req); struct net *net = sock_net(skb->sk); if (nlmsg_len(h) < hdrlen) return -EINVAL; if (h->nlmsg_flags & NLM_F_DUMP) { struct netlink_dump_control c = { .dump = netlink_diag_dump, .done = netlink_diag_dump_done, }; return netlink_dump_start(net->diag_nlsk, skb, h, &c); } else return -EOPNOTSUPP; } static const struct sock_diag_handler netlink_diag_handler = { .owner = THIS_MODULE, .family = AF_NETLINK, .dump = netlink_diag_handler_dump, }; static int __init netlink_diag_init(void) { return sock_diag_register(&netlink_diag_handler); } static void __exit netlink_diag_exit(void) { sock_diag_unregister(&netlink_diag_handler); } module_init(netlink_diag_init); module_exit(netlink_diag_exit); MODULE_DESCRIPTION("Netlink-based socket monitoring/diagnostic interface (sock_diag)"); MODULE_LICENSE("GPL"); MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 16 /* AF_NETLINK */);
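The dump above relies on the standard netlink resume protocol: each pass counts every visited entry in 'num', skips entries below the saved cursor from cb->args[0], and stops once the reply skb is full so the caller re-invokes the dump. A self-contained sketch of that cursor pattern, with an invented item list and batch size standing in for the socket tables and skb capacity:

/* Userspace sketch of the netlink dump-resume cursor pattern. */
#include <stdio.h>

#define N_ITEMS 7
#define BATCH   3

/* Emit up to BATCH items starting from cursor *s_num; update cursor. */
static int dump_batch(int *s_num)
{
	int num, emitted = 0;

	for (num = 0; num < N_ITEMS; num++) {
		if (num < *s_num)
			continue;		/* already sent earlier */
		if (emitted == BATCH) {
			*s_num = num;		/* resume here next time */
			return 1;		/* more to do */
		}
		printf("item %d\n", num);
		emitted++;
	}
	*s_num = num;
	return 0;				/* done */
}

int main(void)
{
	int cursor = 0;

	while (dump_batch(&cursor))
		printf("-- batch full, caller would re-invoke --\n");
	return 0;
}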
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __KVM_X86_VMX_H
#define __KVM_X86_VMX_H

#include <linux/kvm_host.h>

#include <asm/kvm.h>
#include <asm/intel_pt.h>
#include <asm/perf_event.h>
#include <asm/posted_intr.h>

#include "capabilities.h"
#include "../kvm_cache_regs.h"
#include "pmu_intel.h"
#include "vmcs.h"
#include "vmx_ops.h"
#include "../cpuid.h"
#include "run_flags.h"
#include "../mmu.h"
#include "common.h"

#ifdef CONFIG_X86_64
#define MAX_NR_USER_RETURN_MSRS	7
#else
#define MAX_NR_USER_RETURN_MSRS	4
#endif

#define MAX_NR_LOADSTORE_MSRS	8

struct vmx_msrs {
	unsigned int nr;
	struct vmx_msr_entry val[MAX_NR_LOADSTORE_MSRS];
};

struct vmx_uret_msr {
	bool load_into_hardware;
	u64 data;
	u64
mask; }; enum segment_cache_field { SEG_FIELD_SEL = 0, SEG_FIELD_BASE = 1, SEG_FIELD_LIMIT = 2, SEG_FIELD_AR = 3, SEG_FIELD_NR = 4 }; #define RTIT_ADDR_RANGE 4 struct pt_ctx { u64 ctl; u64 status; u64 output_base; u64 output_mask; u64 cr3_match; u64 addr_a[RTIT_ADDR_RANGE]; u64 addr_b[RTIT_ADDR_RANGE]; }; struct pt_desc { u64 ctl_bitmask; u32 num_address_ranges; u32 caps[PT_CPUID_REGS_NUM * PT_CPUID_LEAVES]; struct pt_ctx host; struct pt_ctx guest; }; /* * The nested_vmx structure is part of vcpu_vmx, and holds information we need * for correct emulation of VMX (i.e., nested VMX) on this vcpu. */ struct nested_vmx { /* Has the level1 guest done vmxon? */ bool vmxon; gpa_t vmxon_ptr; bool pml_full; /* The guest-physical address of the current VMCS L1 keeps for L2 */ gpa_t current_vmptr; /* * Cache of the guest's VMCS, existing outside of guest memory. * Loaded from guest memory during VMPTRLD. Flushed to guest * memory during VMCLEAR and VMPTRLD. */ struct vmcs12 *cached_vmcs12; /* * Cache of the guest's shadow VMCS, existing outside of guest * memory. Loaded from guest memory during VM entry. Flushed * to guest memory during VM exit. */ struct vmcs12 *cached_shadow_vmcs12; /* * GPA to HVA cache for accessing vmcs12->vmcs_link_pointer */ struct gfn_to_hva_cache shadow_vmcs12_cache; /* * GPA to HVA cache for VMCS12 */ struct gfn_to_hva_cache vmcs12_cache; /* * Indicates if the shadow vmcs or enlightened vmcs must be updated * with the data held by struct vmcs12. */ bool need_vmcs12_to_shadow_sync; bool dirty_vmcs12; /* * Indicates whether MSR bitmap for L2 needs to be rebuilt due to * changes in MSR bitmap for L1 or switching to a different L2. Note, * this flag can only be used reliably in conjunction with a paravirt L1 * which informs L0 whether any changes to MSR bitmap for L2 were done * on its side. */ bool force_msr_bitmap_recalc; /* * Indicates lazily loaded guest state has not yet been decached from * vmcs02. */ bool need_sync_vmcs02_to_vmcs12_rare; /* * vmcs02 has been initialized, i.e. state that is constant for * vmcs02 has been written to the backing VMCS. Initialization * is delayed until L1 actually attempts to run a nested VM. */ bool vmcs02_initialized; bool change_vmcs01_virtual_apic_mode; bool reload_vmcs01_apic_access_page; bool update_vmcs01_cpu_dirty_logging; bool update_vmcs01_apicv_status; bool update_vmcs01_hwapic_isr; /* * Enlightened VMCS has been enabled. It does not mean that L1 has to * use it. However, VMX features available to L1 will be limited based * on what the enlightened VMCS supports. */ bool enlightened_vmcs_enabled; /* L2 must run next, and mustn't decide to exit to L1. */ bool nested_run_pending; /* Pending MTF VM-exit into L1. */ bool mtf_pending; struct loaded_vmcs vmcs02; /* * Guest pages referred to in the vmcs02 with host-physical * pointers, so we must keep them pinned while L2 runs. */ struct kvm_host_map apic_access_page_map; struct kvm_host_map virtual_apic_map; struct kvm_host_map pi_desc_map; struct pi_desc *pi_desc; bool pi_pending; u16 posted_intr_nv; struct hrtimer preemption_timer; u64 preemption_timer_deadline; bool has_preemption_timer_deadline; bool preemption_timer_expired; /* * Used to snapshot MSRs that are conditionally loaded on VM-Enter in * order to propagate the guest's pre-VM-Enter value into vmcs02. For * emulation of VMLAUNCH/VMRESUME, the snapshot will be of L1's value. * For KVM_SET_NESTED_STATE, the snapshot is of L2's value, _if_ * userspace restores MSRs before nested state. 
If userspace restores * MSRs after nested state, the snapshot holds garbage, but KVM can't * detect that, and the garbage value in vmcs02 will be overwritten by * MSR restoration in any case. */ u64 pre_vmenter_debugctl; u64 pre_vmenter_bndcfgs; /* to migrate it to L1 if L2 writes to L1's CR8 directly */ int l1_tpr_threshold; u16 vpid02; u16 last_vpid; struct nested_vmx_msrs msrs; /* SMM related state */ struct { /* in VMX operation on SMM entry? */ bool vmxon; /* in guest mode on SMM entry? */ bool guest_mode; } smm; #ifdef CONFIG_KVM_HYPERV gpa_t hv_evmcs_vmptr; struct kvm_host_map hv_evmcs_map; struct hv_enlightened_vmcs *hv_evmcs; #endif }; struct vcpu_vmx { struct kvm_vcpu vcpu; struct vcpu_vt vt; u8 fail; u8 x2apic_msr_bitmap_mode; u32 idt_vectoring_info; ulong rflags; /* * User return MSRs are always emulated when enabled in the guest, but * only loaded into hardware when necessary, e.g. SYSCALL #UDs outside * of 64-bit mode or if EFER.SCE=1, thus the SYSCALL MSRs don't need to * be loaded into hardware if those conditions aren't met. */ struct vmx_uret_msr guest_uret_msrs[MAX_NR_USER_RETURN_MSRS]; bool guest_uret_msrs_loaded; #ifdef CONFIG_X86_64 u64 msr_guest_kernel_gs_base; #endif u64 spec_ctrl; u32 msr_ia32_umwait_control; /* * loaded_vmcs points to the VMCS currently used in this vcpu. For a * non-nested (L1) guest, it always points to vmcs01. For a nested * guest (L2), it points to a different VMCS. */ struct loaded_vmcs vmcs01; struct loaded_vmcs *loaded_vmcs; struct msr_autoload { struct vmx_msrs guest; struct vmx_msrs host; } msr_autoload; struct msr_autostore { struct vmx_msrs guest; } msr_autostore; struct { int vm86_active; ulong save_rflags; struct kvm_segment segs[8]; } rmode; struct { u32 bitmask; /* 4 bits per segment (1 bit per field) */ struct kvm_save_segment { u16 selector; unsigned long base; u32 limit; u32 ar; } seg[8]; } segment_cache; int vpid; /* Support for a guest hypervisor (nested VMX) */ struct nested_vmx nested; /* Dynamic PLE window. */ unsigned int ple_window; bool ple_window_dirty; /* Support for PML */ #define PML_LOG_NR_ENTRIES 512 /* PML is written backwards: this is the first entry written by the CPU */ #define PML_HEAD_INDEX (PML_LOG_NR_ENTRIES-1) struct page *pml_pg; /* apic deadline value in host tsc */ u64 hv_deadline_tsc; /* * Only bits masked by msr_ia32_feature_control_valid_bits can be set in * msr_ia32_feature_control. FEAT_CTL_LOCKED is always included * in msr_ia32_feature_control_valid_bits. */ u64 msr_ia32_feature_control; u64 msr_ia32_feature_control_valid_bits; /* SGX Launch Control public key hash */ u64 msr_ia32_sgxlepubkeyhash[4]; u64 msr_ia32_mcu_opt_ctrl; bool disable_fb_clear; struct pt_desc pt_desc; struct lbr_desc lbr_desc; /* ve_info must be page aligned. 
*/ struct vmx_ve_information *ve_info; }; struct kvm_vmx { struct kvm kvm; unsigned int tss_addr; bool ept_identity_pagetable_done; gpa_t ept_identity_map_addr; /* Posted Interrupt Descriptor (PID) table for IPI virtualization */ u64 *pid_table; }; static __always_inline struct vcpu_vt *to_vt(struct kvm_vcpu *vcpu) { return &(container_of(vcpu, struct vcpu_vmx, vcpu)->vt); } static __always_inline struct kvm_vcpu *vt_to_vcpu(struct vcpu_vt *vt) { return &(container_of(vt, struct vcpu_vmx, vt)->vcpu); } static __always_inline union vmx_exit_reason vmx_get_exit_reason(struct kvm_vcpu *vcpu) { return to_vt(vcpu)->exit_reason; } static __always_inline unsigned long vmx_get_exit_qual(struct kvm_vcpu *vcpu) { struct vcpu_vt *vt = to_vt(vcpu); if (!kvm_register_test_and_mark_available(vcpu, VCPU_EXREG_EXIT_INFO_1) && !WARN_ON_ONCE(is_td_vcpu(vcpu))) vt->exit_qualification = vmcs_readl(EXIT_QUALIFICATION); return vt->exit_qualification; } static __always_inline u32 vmx_get_intr_info(struct kvm_vcpu *vcpu) { struct vcpu_vt *vt = to_vt(vcpu); if (!kvm_register_test_and_mark_available(vcpu, VCPU_EXREG_EXIT_INFO_2) && !WARN_ON_ONCE(is_td_vcpu(vcpu))) vt->exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); return vt->exit_intr_info; } void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu); int allocate_vpid(void); void free_vpid(int vpid); void vmx_set_constant_host_state(struct vcpu_vmx *vmx); void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu); void vmx_set_host_fs_gs(struct vmcs_host_state *host, u16 fs_sel, u16 gs_sel, unsigned long fs_base, unsigned long gs_base); int vmx_get_cpl(struct kvm_vcpu *vcpu); int vmx_get_cpl_no_cache(struct kvm_vcpu *vcpu); bool vmx_emulation_required(struct kvm_vcpu *vcpu); unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu); void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags); u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu); void vmx_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask); int vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer); void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0); void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4); void set_cr4_guest_host_mask(struct vcpu_vmx *vmx); void ept_save_pdptrs(struct kvm_vcpu *vcpu); void vmx_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); void __vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); u64 construct_eptp(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level); bool vmx_guest_inject_ac(struct kvm_vcpu *vcpu); void vmx_update_exception_bitmap(struct kvm_vcpu *vcpu); bool vmx_nmi_blocked(struct kvm_vcpu *vcpu); bool __vmx_interrupt_blocked(struct kvm_vcpu *vcpu); bool vmx_interrupt_blocked(struct kvm_vcpu *vcpu); bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu); void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked); void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu); struct vmx_uret_msr *vmx_find_uret_msr(struct vcpu_vmx *vmx, u32 msr); void pt_update_intercept_for_msr(struct kvm_vcpu *vcpu); void vmx_update_host_rsp(struct vcpu_vmx *vmx, unsigned long host_rsp); void vmx_spec_ctrl_restore_host(struct vcpu_vmx *vmx, unsigned int flags); unsigned int __vmx_vcpu_run_flags(struct vcpu_vmx *vmx); bool __vmx_vcpu_run(struct vcpu_vmx *vmx, unsigned long *regs, unsigned int flags); int vmx_find_loadstore_msr_slot(struct vmx_msrs *m, u32 msr); void vmx_ept_load_pdptrs(struct kvm_vcpu *vcpu); void vmx_set_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type, bool set); static inline void vmx_disable_intercept_for_msr(struct 
kvm_vcpu *vcpu, u32 msr, int type) { vmx_set_intercept_for_msr(vcpu, msr, type, false); } static inline void vmx_enable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type) { vmx_set_intercept_for_msr(vcpu, msr, type, true); } u64 vmx_get_l2_tsc_offset(struct kvm_vcpu *vcpu); u64 vmx_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu); gva_t vmx_get_untagged_addr(struct kvm_vcpu *vcpu, gva_t gva, unsigned int flags); void vmx_update_cpu_dirty_logging(struct kvm_vcpu *vcpu); u64 vmx_get_supported_debugctl(struct kvm_vcpu *vcpu, bool host_initiated); bool vmx_is_valid_debugctl(struct kvm_vcpu *vcpu, u64 data, bool host_initiated); #define VMX_HOST_OWNED_DEBUGCTL_BITS (DEBUGCTLMSR_FREEZE_IN_SMM) static inline void vmx_guest_debugctl_write(struct kvm_vcpu *vcpu, u64 val) { WARN_ON_ONCE(val & VMX_HOST_OWNED_DEBUGCTL_BITS); val |= vcpu->arch.host_debugctl & VMX_HOST_OWNED_DEBUGCTL_BITS; vmcs_write64(GUEST_IA32_DEBUGCTL, val); } static inline u64 vmx_guest_debugctl_read(void) { return vmcs_read64(GUEST_IA32_DEBUGCTL) & ~VMX_HOST_OWNED_DEBUGCTL_BITS; } static inline void vmx_reload_guest_debugctl(struct kvm_vcpu *vcpu) { u64 val = vmcs_read64(GUEST_IA32_DEBUGCTL); if (!((val ^ vcpu->arch.host_debugctl) & VMX_HOST_OWNED_DEBUGCTL_BITS)) return; vmx_guest_debugctl_write(vcpu, val & ~VMX_HOST_OWNED_DEBUGCTL_BITS); } /* * Note, early Intel manuals have the write-low and read-high bitmap offsets * the wrong way round. The bitmaps control MSRs 0x00000000-0x00001fff and * 0xc0000000-0xc0001fff. The former (low) uses bytes 0-0x3ff for reads and * 0x800-0xbff for writes. The latter (high) uses 0x400-0x7ff for reads and * 0xc00-0xfff for writes. MSRs not covered by either of the ranges always * VM-Exit. */ #define __BUILD_VMX_MSR_BITMAP_HELPER(rtype, action, bitop, access, base) \ static inline rtype vmx_##action##_msr_bitmap_##access(unsigned long *bitmap, \ u32 msr) \ { \ int f = sizeof(unsigned long); \ \ if (msr <= 0x1fff) \ return bitop##_bit(msr, bitmap + base / f); \ else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) \ return bitop##_bit(msr & 0x1fff, bitmap + (base + 0x400) / f); \ return (rtype)true; \ } #define BUILD_VMX_MSR_BITMAP_HELPERS(ret_type, action, bitop) \ __BUILD_VMX_MSR_BITMAP_HELPER(ret_type, action, bitop, read, 0x0) \ __BUILD_VMX_MSR_BITMAP_HELPER(ret_type, action, bitop, write, 0x800) BUILD_VMX_MSR_BITMAP_HELPERS(bool, test, test) BUILD_VMX_MSR_BITMAP_HELPERS(void, clear, __clear) BUILD_VMX_MSR_BITMAP_HELPERS(void, set, __set) static inline u8 vmx_get_rvi(void) { return vmcs_read16(GUEST_INTR_STATUS) & 0xff; } #define __KVM_REQUIRED_VMX_VM_ENTRY_CONTROLS \ (VM_ENTRY_LOAD_DEBUG_CONTROLS) #ifdef CONFIG_X86_64 #define KVM_REQUIRED_VMX_VM_ENTRY_CONTROLS \ (__KVM_REQUIRED_VMX_VM_ENTRY_CONTROLS | \ VM_ENTRY_IA32E_MODE) #else #define KVM_REQUIRED_VMX_VM_ENTRY_CONTROLS \ __KVM_REQUIRED_VMX_VM_ENTRY_CONTROLS #endif #define KVM_OPTIONAL_VMX_VM_ENTRY_CONTROLS \ (VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL | \ VM_ENTRY_LOAD_IA32_PAT | \ VM_ENTRY_LOAD_IA32_EFER | \ VM_ENTRY_LOAD_BNDCFGS | \ VM_ENTRY_PT_CONCEAL_PIP | \ VM_ENTRY_LOAD_IA32_RTIT_CTL) #define __KVM_REQUIRED_VMX_VM_EXIT_CONTROLS \ (VM_EXIT_SAVE_DEBUG_CONTROLS | \ VM_EXIT_ACK_INTR_ON_EXIT) #ifdef CONFIG_X86_64 #define KVM_REQUIRED_VMX_VM_EXIT_CONTROLS \ (__KVM_REQUIRED_VMX_VM_EXIT_CONTROLS | \ VM_EXIT_HOST_ADDR_SPACE_SIZE) #else #define KVM_REQUIRED_VMX_VM_EXIT_CONTROLS \ __KVM_REQUIRED_VMX_VM_EXIT_CONTROLS #endif #define KVM_OPTIONAL_VMX_VM_EXIT_CONTROLS \ (VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL | \ VM_EXIT_SAVE_IA32_PAT | \ 
VM_EXIT_LOAD_IA32_PAT | \ VM_EXIT_SAVE_IA32_EFER | \ VM_EXIT_SAVE_VMX_PREEMPTION_TIMER | \ VM_EXIT_LOAD_IA32_EFER | \ VM_EXIT_CLEAR_BNDCFGS | \ VM_EXIT_PT_CONCEAL_PIP | \ VM_EXIT_CLEAR_IA32_RTIT_CTL) #define KVM_REQUIRED_VMX_PIN_BASED_VM_EXEC_CONTROL \ (PIN_BASED_EXT_INTR_MASK | \ PIN_BASED_NMI_EXITING) #define KVM_OPTIONAL_VMX_PIN_BASED_VM_EXEC_CONTROL \ (PIN_BASED_VIRTUAL_NMIS | \ PIN_BASED_POSTED_INTR | \ PIN_BASED_VMX_PREEMPTION_TIMER) #define __KVM_REQUIRED_VMX_CPU_BASED_VM_EXEC_CONTROL \ (CPU_BASED_HLT_EXITING | \ CPU_BASED_CR3_LOAD_EXITING | \ CPU_BASED_CR3_STORE_EXITING | \ CPU_BASED_UNCOND_IO_EXITING | \ CPU_BASED_MOV_DR_EXITING | \ CPU_BASED_USE_TSC_OFFSETTING | \ CPU_BASED_MWAIT_EXITING | \ CPU_BASED_MONITOR_EXITING | \ CPU_BASED_INVLPG_EXITING | \ CPU_BASED_RDPMC_EXITING | \ CPU_BASED_INTR_WINDOW_EXITING) #ifdef CONFIG_X86_64 #define KVM_REQUIRED_VMX_CPU_BASED_VM_EXEC_CONTROL \ (__KVM_REQUIRED_VMX_CPU_BASED_VM_EXEC_CONTROL | \ CPU_BASED_CR8_LOAD_EXITING | \ CPU_BASED_CR8_STORE_EXITING) #else #define KVM_REQUIRED_VMX_CPU_BASED_VM_EXEC_CONTROL \ __KVM_REQUIRED_VMX_CPU_BASED_VM_EXEC_CONTROL #endif #define KVM_OPTIONAL_VMX_CPU_BASED_VM_EXEC_CONTROL \ (CPU_BASED_RDTSC_EXITING | \ CPU_BASED_TPR_SHADOW | \ CPU_BASED_USE_IO_BITMAPS | \ CPU_BASED_MONITOR_TRAP_FLAG | \ CPU_BASED_USE_MSR_BITMAPS | \ CPU_BASED_NMI_WINDOW_EXITING | \ CPU_BASED_PAUSE_EXITING | \ CPU_BASED_ACTIVATE_SECONDARY_CONTROLS | \ CPU_BASED_ACTIVATE_TERTIARY_CONTROLS) #define KVM_REQUIRED_VMX_SECONDARY_VM_EXEC_CONTROL 0 #define KVM_OPTIONAL_VMX_SECONDARY_VM_EXEC_CONTROL \ (SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | \ SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | \ SECONDARY_EXEC_WBINVD_EXITING | \ SECONDARY_EXEC_ENABLE_VPID | \ SECONDARY_EXEC_ENABLE_EPT | \ SECONDARY_EXEC_UNRESTRICTED_GUEST | \ SECONDARY_EXEC_PAUSE_LOOP_EXITING | \ SECONDARY_EXEC_DESC | \ SECONDARY_EXEC_ENABLE_RDTSCP | \ SECONDARY_EXEC_ENABLE_INVPCID | \ SECONDARY_EXEC_APIC_REGISTER_VIRT | \ SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | \ SECONDARY_EXEC_SHADOW_VMCS | \ SECONDARY_EXEC_ENABLE_XSAVES | \ SECONDARY_EXEC_RDSEED_EXITING | \ SECONDARY_EXEC_RDRAND_EXITING | \ SECONDARY_EXEC_ENABLE_PML | \ SECONDARY_EXEC_TSC_SCALING | \ SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE | \ SECONDARY_EXEC_PT_USE_GPA | \ SECONDARY_EXEC_PT_CONCEAL_VMX | \ SECONDARY_EXEC_ENABLE_VMFUNC | \ SECONDARY_EXEC_BUS_LOCK_DETECTION | \ SECONDARY_EXEC_NOTIFY_VM_EXITING | \ SECONDARY_EXEC_ENCLS_EXITING | \ SECONDARY_EXEC_EPT_VIOLATION_VE) #define KVM_REQUIRED_VMX_TERTIARY_VM_EXEC_CONTROL 0 #define KVM_OPTIONAL_VMX_TERTIARY_VM_EXEC_CONTROL \ (TERTIARY_EXEC_IPI_VIRT) #define BUILD_CONTROLS_SHADOW(lname, uname, bits) \ static inline void lname##_controls_set(struct vcpu_vmx *vmx, u##bits val) \ { \ if (vmx->loaded_vmcs->controls_shadow.lname != val) { \ vmcs_write##bits(uname, val); \ vmx->loaded_vmcs->controls_shadow.lname = val; \ } \ } \ static inline u##bits __##lname##_controls_get(struct loaded_vmcs *vmcs) \ { \ return vmcs->controls_shadow.lname; \ } \ static inline u##bits lname##_controls_get(struct vcpu_vmx *vmx) \ { \ return __##lname##_controls_get(vmx->loaded_vmcs); \ } \ static __always_inline void lname##_controls_setbit(struct vcpu_vmx *vmx, u##bits val) \ { \ BUILD_BUG_ON(!(val & (KVM_REQUIRED_VMX_##uname | KVM_OPTIONAL_VMX_##uname))); \ lname##_controls_set(vmx, lname##_controls_get(vmx) | val); \ } \ static __always_inline void lname##_controls_clearbit(struct vcpu_vmx *vmx, u##bits val) \ { \ BUILD_BUG_ON(!(val & (KVM_REQUIRED_VMX_##uname | KVM_OPTIONAL_VMX_##uname))); \ 
lname##_controls_set(vmx, lname##_controls_get(vmx) & ~val); \ } BUILD_CONTROLS_SHADOW(vm_entry, VM_ENTRY_CONTROLS, 32) BUILD_CONTROLS_SHADOW(vm_exit, VM_EXIT_CONTROLS, 32) BUILD_CONTROLS_SHADOW(pin, PIN_BASED_VM_EXEC_CONTROL, 32) BUILD_CONTROLS_SHADOW(exec, CPU_BASED_VM_EXEC_CONTROL, 32) BUILD_CONTROLS_SHADOW(secondary_exec, SECONDARY_VM_EXEC_CONTROL, 32) BUILD_CONTROLS_SHADOW(tertiary_exec, TERTIARY_VM_EXEC_CONTROL, 64) /* * VMX_REGS_LAZY_LOAD_SET - The set of registers that will be updated in the * cache on demand. Other registers not listed here are synced to * the cache immediately after VM-Exit. */ #define VMX_REGS_LAZY_LOAD_SET ((1 << VCPU_REGS_RIP) | \ (1 << VCPU_REGS_RSP) | \ (1 << VCPU_EXREG_RFLAGS) | \ (1 << VCPU_EXREG_PDPTR) | \ (1 << VCPU_EXREG_SEGMENTS) | \ (1 << VCPU_EXREG_CR0) | \ (1 << VCPU_EXREG_CR3) | \ (1 << VCPU_EXREG_CR4) | \ (1 << VCPU_EXREG_EXIT_INFO_1) | \ (1 << VCPU_EXREG_EXIT_INFO_2)) static inline unsigned long vmx_l1_guest_owned_cr0_bits(void) { unsigned long bits = KVM_POSSIBLE_CR0_GUEST_BITS; /* * CR0.WP needs to be intercepted when KVM is shadowing legacy paging * in order to construct shadow PTEs with the correct protections. * Note! CR0.WP technically can be passed through to the guest if * paging is disabled, but checking CR0.PG would generate a cyclical * dependency of sorts due to forcing the caller to ensure CR0 holds * the correct value prior to determining which CR0 bits can be owned * by L1. Keep it simple and limit the optimization to EPT. */ if (!enable_ept) bits &= ~X86_CR0_WP; return bits; } static __always_inline struct kvm_vmx *to_kvm_vmx(struct kvm *kvm) { return container_of(kvm, struct kvm_vmx, kvm); } static __always_inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu) { return container_of(vcpu, struct vcpu_vmx, vcpu); } void intel_pmu_cross_mapped_check(struct kvm_pmu *pmu); int intel_pmu_create_guest_lbr_event(struct kvm_vcpu *vcpu); void vmx_passthrough_lbr_msrs(struct kvm_vcpu *vcpu); struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu, gfp_t flags); void free_vmcs(struct vmcs *vmcs); int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs); void free_loaded_vmcs(struct loaded_vmcs *loaded_vmcs); void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs); static inline struct vmcs *alloc_vmcs(bool shadow) { return alloc_vmcs_cpu(shadow, raw_smp_processor_id(), GFP_KERNEL_ACCOUNT); } static inline bool vmx_has_waitpkg(struct vcpu_vmx *vmx) { return secondary_exec_controls_get(vmx) & SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE; } static inline bool vmx_need_pf_intercept(struct kvm_vcpu *vcpu) { if (!enable_ept) return true; return allow_smaller_maxphyaddr && cpuid_maxphyaddr(vcpu) < kvm_host.maxphyaddr; } static inline bool is_unrestricted_guest(struct kvm_vcpu *vcpu) { return enable_unrestricted_guest && (!is_guest_mode(vcpu) || (secondary_exec_controls_get(to_vmx(vcpu)) & SECONDARY_EXEC_UNRESTRICTED_GUEST)); } bool __vmx_guest_state_valid(struct kvm_vcpu *vcpu); static inline bool vmx_guest_state_valid(struct kvm_vcpu *vcpu) { return is_unrestricted_guest(vcpu) || __vmx_guest_state_valid(vcpu); } void dump_vmcs(struct kvm_vcpu *vcpu); static inline int vmx_get_instr_info_reg2(u32 vmx_instr_info) { return (vmx_instr_info >> 28) & 0xf; } static inline bool vmx_can_use_ipiv(struct kvm_vcpu *vcpu) { return lapic_in_kernel(vcpu) && enable_ipiv; } static inline void vmx_segment_cache_clear(struct vcpu_vmx *vmx) { vmx->segment_cache.bitmask = 0; } int vmx_init(void); void vmx_exit(void); #endif /* __KVM_X86_VMX_H */
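The block comment above __BUILD_VMX_MSR_BITMAP_HELPER pins the bitmap layout down exactly: low MSRs (0x0-0x1fff) use bytes 0-0x3ff for reads and 0x800-0xbff for writes; high MSRs (0xc0000000-0xc0001fff) use 0x400-0x7ff for reads and 0xc00-0xfff for writes. A minimal user-space sketch of that same index arithmetic follows; msr_bitmap_byte() is an invented name for illustration, not a kernel helper.

/* Standalone sketch of the offset math the helper macros encode. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Byte offset into the 4 KiB MSR bitmap for @msr's read/write bit, or -1
 * if the MSR is outside both ranges (such MSRs always VM-Exit). */
static int msr_bitmap_byte(uint32_t msr, bool write)
{
	unsigned int base = write ? 0x800 : 0x0;

	if (msr <= 0x1fff)
		return base + msr / 8;
	if (msr >= 0xc0000000 && msr <= 0xc0001fff)
		return base + 0x400 + (msr & 0x1fff) / 8;
	return -1;
}

int main(void)
{
	/* MSR_STAR (0xc0000081) lives in the high range. */
	printf("read byte:  %#x\n", msr_bitmap_byte(0xc0000081, false)); /* 0x410 */
	printf("write byte: %#x\n", msr_bitmap_byte(0xc0000081, true));  /* 0xc10 */
	return 0;
}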
/* SPDX-License-Identifier: GPL-2.0-only */
#ifndef __KVM_IODEV_H__
#define __KVM_IODEV_H__

#include <linux/kvm_types.h>
#include <linux/errno.h>

struct kvm_io_device;
struct kvm_vcpu;

/**
 * kvm_io_device_ops are called under kvm slots_lock.
 * read and write handlers return 0 if the transaction has been handled,
 * or non-zero to have it passed to the next device.
 **/
struct kvm_io_device_ops {
	int (*read)(struct kvm_vcpu *vcpu,
		    struct kvm_io_device *this,
		    gpa_t addr,
		    int len,
		    void *val);
	int (*write)(struct kvm_vcpu *vcpu,
		     struct kvm_io_device *this,
		     gpa_t addr,
		     int len,
		     const void *val);
	void (*destructor)(struct kvm_io_device *this);
};

struct kvm_io_device {
	const struct kvm_io_device_ops *ops;
};

static inline void kvm_iodevice_init(struct kvm_io_device *dev,
				     const struct kvm_io_device_ops *ops)
{
	dev->ops = ops;
}

static inline int kvm_iodevice_read(struct kvm_vcpu *vcpu,
				    struct kvm_io_device *dev, gpa_t addr,
				    int l, void *v)
{
	return dev->ops->read ? dev->ops->read(vcpu, dev, addr, l, v)
				: -EOPNOTSUPP;
}

static inline int kvm_iodevice_write(struct kvm_vcpu *vcpu,
				     struct kvm_io_device *dev, gpa_t addr,
				     int l, const void *v)
{
	return dev->ops->write ? dev->ops->write(vcpu, dev, addr, l, v)
				 : -EOPNOTSUPP;
}

#endif /* __KVM_IODEV_H__ */
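The kvm_io_device_ops contract above (return 0 when the transaction is handled, non-zero to let the bus try the next device) can be illustrated with a minimal sketch. The zero_dev_* names and the MMIO window below are invented for illustration.

/* A device that claims a 4-byte MMIO window and reads back zeroes;
 * accesses outside the window fall through to the next device. */
#include <linux/errno.h>
#include <linux/string.h>

#define ZERO_DEV_BASE 0xfee10000ULL
#define ZERO_DEV_LEN  4

static int zero_dev_read(struct kvm_vcpu *vcpu, struct kvm_io_device *this,
			 gpa_t addr, int len, void *val)
{
	if (addr < ZERO_DEV_BASE || addr + len > ZERO_DEV_BASE + ZERO_DEV_LEN)
		return -EOPNOTSUPP;	/* not handled: try the next device */

	memset(val, 0, len);
	return 0;			/* handled */
}

static const struct kvm_io_device_ops zero_dev_ops = {
	.read = zero_dev_read,
	/* .write left NULL, so kvm_iodevice_write() reports -EOPNOTSUPP */
};

Such a device would be set up with kvm_iodevice_init(&dev, &zero_dev_ops) before being registered on an I/O bus.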
/* SPDX-License-Identifier: GPL-2.0 */
/* AF_XDP internal functions
 * Copyright(c) 2018 Intel Corporation.
 */

#ifndef _LINUX_XDP_SOCK_H
#define _LINUX_XDP_SOCK_H

#include <linux/bpf.h>
#include <linux/workqueue.h>
#include <linux/if_xdp.h>
#include <linux/mutex.h>
#include <linux/spinlock.h>
#include <linux/mm.h>
#include <net/sock.h>

#define XDP_UMEM_SG_FLAG (1 << 1)

struct net_device;
struct xsk_queue;
struct xdp_buff;

struct xdp_umem {
	void *addrs;
	u64 size;
	u32 headroom;
	u32 chunk_size;
	u32 chunks;
	u32 npgs;
	struct user_struct *user;
	refcount_t users;
	u8 flags;
	u8 tx_metadata_len;
	bool zc;
	struct page **pgs;
	int id;
	struct list_head xsk_dma_list;
	struct work_struct work;
};

struct xsk_map {
	struct bpf_map map;
	spinlock_t lock; /* Synchronize map updates */
	atomic_t count;
	struct xdp_sock __rcu *xsk_map[];
};

struct xdp_sock {
	/* struct sock must be the first member of struct xdp_sock */
	struct sock sk;
	struct xsk_queue *rx ____cacheline_aligned_in_smp;
	struct net_device *dev;
	struct xdp_umem *umem;
	struct list_head flush_node;
	struct xsk_buff_pool *pool;
	u16 queue_id;
	bool zc;
	bool sg;
	enum {
		XSK_READY = 0,
		XSK_BOUND,
		XSK_UNBOUND,
	} state;

	struct xsk_queue *tx ____cacheline_aligned_in_smp;
	struct list_head tx_list;
	/* record the number of tx descriptors sent by this xsk and
	 * when it exceeds MAX_PER_SOCKET_BUDGET, an opportunity needs
	 * to be given to other xsks for sending tx descriptors, thereby
	 * preventing other XSKs from being starved.
	 */
	u32 tx_budget_spent;

	/* Statistics */
	u64 rx_dropped;
	u64 rx_queue_full;

	/* When __xsk_generic_xmit() must return before it sees the EOP
	 * descriptor for the current packet, the partially built skb is
	 * saved here so that packet building can resume in next call of
	 * __xsk_generic_xmit().
	 */
	struct sk_buff *skb;

	struct list_head map_list;
	/* Protects map_list */
	spinlock_t map_list_lock;
	u32 max_tx_budget;
	/* Protects multiple processes in the control path */
	struct mutex mutex;
	struct xsk_queue *fq_tmp; /* Only as tmp storage before bind */
	struct xsk_queue *cq_tmp; /* Only as tmp storage before bind */
};

/*
 * AF_XDP TX metadata hooks for network devices.
 * The following hooks can be defined; unless noted otherwise, they are
 * optional and can be filled with a null pointer.
 *
 * void (*tmo_request_timestamp)(void *priv)
 *	Called when AF_XDP frame requested egress timestamp.
 *
 * u64 (*tmo_fill_timestamp)(void *priv)
 *	Called when AF_XDP frame, that had requested egress timestamp,
 *	received a completion. The hook needs to return the actual HW timestamp.
 *
 * void (*tmo_request_checksum)(u16 csum_start, u16 csum_offset, void *priv)
 *	Called when AF_XDP frame requested HW checksum offload.
 *	csum_start indicates position where checksumming should start.
 *	csum_offset indicates position where checksum should be stored.
 *
 * void (*tmo_request_launch_time)(u64 launch_time, void *priv)
 *	Called when AF_XDP frame requested launch time HW offload support.
 *	launch_time indicates the PTP time at which the device can schedule the
 *	packet for transmission.
 */
struct xsk_tx_metadata_ops {
	void	(*tmo_request_timestamp)(void *priv);
	u64	(*tmo_fill_timestamp)(void *priv);
	void	(*tmo_request_checksum)(u16 csum_start, u16 csum_offset, void *priv);
	void	(*tmo_request_launch_time)(u64 launch_time, void *priv);
};

#ifdef CONFIG_XDP_SOCKETS

int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp);
int __xsk_map_redirect(struct xdp_sock *xs, struct xdp_buff *xdp);
void __xsk_map_flush(struct list_head *flush_list);

/**
 * xsk_tx_metadata_to_compl - Save enough relevant metadata information
 * to perform tx completion in the future.
 * @meta: pointer to AF_XDP metadata area
 * @compl: pointer to output struct xsk_tx_metadata_to_compl
 *
 * This function should be called by the networking device when
 * it prepares AF_XDP egress packet. The value of @compl should be stored
 * and passed to xsk_tx_metadata_complete upon TX completion.
 */
static inline void xsk_tx_metadata_to_compl(struct xsk_tx_metadata *meta,
					    struct xsk_tx_metadata_compl *compl)
{
	if (!meta)
		return;

	if (meta->flags & XDP_TXMD_FLAGS_TIMESTAMP)
		compl->tx_timestamp = &meta->completion.tx_timestamp;
	else
		compl->tx_timestamp = NULL;
}

/**
 * xsk_tx_metadata_request - Evaluate AF_XDP TX metadata at submission
 * and call appropriate xsk_tx_metadata_ops operation.
 * @meta: pointer to AF_XDP metadata area
 * @ops: pointer to struct xsk_tx_metadata_ops
 * @priv: pointer to driver-private area
 *
 * This function should be called by the networking device when
 * it prepares AF_XDP egress packet.
 */
static inline void xsk_tx_metadata_request(const struct xsk_tx_metadata *meta,
					   const struct xsk_tx_metadata_ops *ops,
					   void *priv)
{
	if (!meta)
		return;

	if (ops->tmo_request_launch_time)
		if (meta->flags & XDP_TXMD_FLAGS_LAUNCH_TIME)
			ops->tmo_request_launch_time(meta->request.launch_time,
						     priv);

	if (ops->tmo_request_timestamp)
		if (meta->flags & XDP_TXMD_FLAGS_TIMESTAMP)
			ops->tmo_request_timestamp(priv);

	if (ops->tmo_request_checksum)
		if (meta->flags & XDP_TXMD_FLAGS_CHECKSUM)
			ops->tmo_request_checksum(meta->request.csum_start,
						  meta->request.csum_offset, priv);
}

/**
 * xsk_tx_metadata_complete - Evaluate AF_XDP TX metadata at completion
 * and call appropriate xsk_tx_metadata_ops operation.
 * @compl: pointer to completion metadata produced from xsk_tx_metadata_to_compl
 * @ops: pointer to struct xsk_tx_metadata_ops
 * @priv: pointer to driver-private area
 *
 * This function should be called by the networking device upon
 * AF_XDP egress completion.
 */
static inline void xsk_tx_metadata_complete(struct xsk_tx_metadata_compl *compl,
					    const struct xsk_tx_metadata_ops *ops,
					    void *priv)
{
	if (!compl)
		return;
	if (!compl->tx_timestamp)
		return;

	*compl->tx_timestamp = ops->tmo_fill_timestamp(priv);
}

#else

static inline int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
{
	return -ENOTSUPP;
}

static inline int __xsk_map_redirect(struct xdp_sock *xs, struct xdp_buff *xdp)
{
	return -EOPNOTSUPP;
}

static inline void __xsk_map_flush(struct list_head *flush_list)
{
}

static inline void xsk_tx_metadata_to_compl(struct xsk_tx_metadata *meta,
					    struct xsk_tx_metadata_compl *compl)
{
}

static inline void xsk_tx_metadata_request(struct xsk_tx_metadata *meta,
					   const struct xsk_tx_metadata_ops *ops,
					   void *priv)
{
}

static inline void xsk_tx_metadata_complete(struct xsk_tx_metadata_compl *compl,
					    const struct xsk_tx_metadata_ops *ops,
					    void *priv)
{
}

#endif /* CONFIG_XDP_SOCKETS */

#endif /* _LINUX_XDP_SOCK_H */
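As a sketch of how a driver might wire up these hooks: every mydrv_* name and the per-descriptor state structure below are invented for illustration, not taken from any real driver.

#include <linux/types.h>
#include <net/xdp_sock.h>

/* Hypothetical per-descriptor state the driver would hand in as @priv. */
struct mydrv_tx_priv {
	u64 hw_timestamp;	/* latched by HW at egress, read at completion */
	u16 csum_start;
	u16 csum_offset;
	bool want_tstamp;
};

static void mydrv_tmo_request_timestamp(void *priv)
{
	((struct mydrv_tx_priv *)priv)->want_tstamp = true;
}

static u64 mydrv_tmo_fill_timestamp(void *priv)
{
	return ((struct mydrv_tx_priv *)priv)->hw_timestamp;
}

static void mydrv_tmo_request_checksum(u16 csum_start, u16 csum_offset,
				       void *priv)
{
	struct mydrv_tx_priv *p = priv;

	p->csum_start = csum_start;	/* where HW starts checksumming */
	p->csum_offset = csum_offset;	/* where HW stores the result */
}

static const struct xsk_tx_metadata_ops mydrv_xsk_tmo = {
	.tmo_request_timestamp	= mydrv_tmo_request_timestamp,
	.tmo_fill_timestamp	= mydrv_tmo_fill_timestamp,
	.tmo_request_checksum	= mydrv_tmo_request_checksum,
	/* tmo_request_launch_time is optional and left NULL here */
};

At submission the driver would call xsk_tx_metadata_request(meta, &mydrv_xsk_tmo, &state) and save a completion cookie via xsk_tx_metadata_to_compl(); at TX completion it would pass the saved compl, the same ops and state to xsk_tx_metadata_complete().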
// SPDX-License-Identifier: GPL-2.0 /* * linux/kernel/capability.c * * Copyright (C) 1997 Andrew Main <zefram@fysh.org> * * Integrated into 2.1.97+, Andrew G. Morgan <morgan@kernel.org> * 30 May 2002: Cleanup, Robert M. Love <rml@tech9.net> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/audit.h> #include <linux/capability.h> #include <linux/mm.h> #include <linux/export.h> #include <linux/security.h> #include <linux/syscalls.h> #include <linux/pid_namespace.h> #include <linux/user_namespace.h> #include <linux/uaccess.h> int file_caps_enabled = 1; static int __init file_caps_disable(char *str) { file_caps_enabled = 0; return 1; } __setup("no_file_caps", file_caps_disable); #ifdef CONFIG_MULTIUSER /* * More recent versions of libcap are available from: * * http://www.kernel.org/pub/linux/libs/security/linux-privs/ */ static void warn_legacy_capability_use(void) { pr_info_once("warning: `%s' uses 32-bit capabilities (legacy support in use)\n", current->comm); } /* * Version 2 capabilities worked fine, but the linux/capability.h file * that accompanied their introduction encouraged their use without * the necessary user-space source code changes. As such, we have * created a version 3 with equivalent functionality to version 2, but * with a header change to protect legacy source code from using * version 2 when it wanted to use version 1. If your system has code * that trips the following warning, it is using version 2 specific * capabilities and may be doing so insecurely.
* * The remedy is to either upgrade your version of libcap (to 2.10+, * if the application is linked against it), or recompile your * application with modern kernel headers and this warning will go * away. */ static void warn_deprecated_v2(void) { pr_info_once("warning: `%s' uses deprecated v2 capabilities in a way that may be insecure\n", current->comm); } /* * Version check. Return the number of u32s in each capability flag * array, or a negative value on error. */ static int cap_validate_magic(cap_user_header_t header, unsigned *tocopy) { __u32 version; if (get_user(version, &header->version)) return -EFAULT; switch (version) { case _LINUX_CAPABILITY_VERSION_1: warn_legacy_capability_use(); *tocopy = _LINUX_CAPABILITY_U32S_1; break; case _LINUX_CAPABILITY_VERSION_2: warn_deprecated_v2(); fallthrough; /* v3 is otherwise equivalent to v2 */ case _LINUX_CAPABILITY_VERSION_3: *tocopy = _LINUX_CAPABILITY_U32S_3; break; default: if (put_user((u32)_KERNEL_CAPABILITY_VERSION, &header->version)) return -EFAULT; return -EINVAL; } return 0; } /* * The only thing that can change the capabilities of the current * process is the current process. As such, we can't be in this code * at the same time as we are in the process of setting capabilities * in this process. The net result is that we can limit our use of * locks to when we are reading the caps of another process. */ static inline int cap_get_target_pid(pid_t pid, kernel_cap_t *pEp, kernel_cap_t *pIp, kernel_cap_t *pPp) { int ret; if (pid && (pid != task_pid_vnr(current))) { const struct task_struct *target; rcu_read_lock(); target = find_task_by_vpid(pid); if (!target) ret = -ESRCH; else ret = security_capget(target, pEp, pIp, pPp); rcu_read_unlock(); } else ret = security_capget(current, pEp, pIp, pPp); return ret; } /** * sys_capget - get the capabilities of a given process. * @header: pointer to struct that contains capability version and * target pid data * @dataptr: pointer to struct that contains the effective, permitted, * and inheritable capabilities that are returned * * Returns 0 on success and < 0 on error. */ SYSCALL_DEFINE2(capget, cap_user_header_t, header, cap_user_data_t, dataptr) { int ret = 0; pid_t pid; unsigned tocopy; kernel_cap_t pE, pI, pP; struct __user_cap_data_struct kdata[2]; ret = cap_validate_magic(header, &tocopy); if ((dataptr == NULL) || (ret != 0)) return ((dataptr == NULL) && (ret == -EINVAL)) ? 0 : ret; if (get_user(pid, &header->pid)) return -EFAULT; if (pid < 0) return -EINVAL; ret = cap_get_target_pid(pid, &pE, &pI, &pP); if (ret) return ret; /* * Annoying legacy format with 64-bit capabilities exposed * as two sets of 32-bit fields, so we need to split the * capability values up. */ kdata[0].effective = pE.val; kdata[1].effective = pE.val >> 32; kdata[0].permitted = pP.val; kdata[1].permitted = pP.val >> 32; kdata[0].inheritable = pI.val; kdata[1].inheritable = pI.val >> 32; /* * Note, in the case, tocopy < _KERNEL_CAPABILITY_U32S, * we silently drop the upper capabilities here. This * has the effect of making older libcap * implementations implicitly drop upper capability * bits when they perform a: capget/modify/capset * sequence. * * This behavior is considered fail-safe * behavior. Upgrading the application to a newer * version of libcap will enable access to the newer * capabilities. * * An alternative would be to return an error here * (-ERANGE), but that causes legacy applications to * unexpectedly fail; the capget/modify/capset aborts * before modification is attempted and the application * fails. 
*/ if (copy_to_user(dataptr, kdata, tocopy * sizeof(kdata[0]))) return -EFAULT; return 0; } static kernel_cap_t mk_kernel_cap(u32 low, u32 high) { return (kernel_cap_t) { (low | ((u64)high << 32)) & CAP_VALID_MASK }; } /** * sys_capset - set capabilities for a process or (*) a group of processes * @header: pointer to struct that contains capability version and * target pid data * @data: pointer to struct that contains the effective, permitted, * and inheritable capabilities * * Set capabilities for the current process only. The ability to any other * process(es) has been deprecated and removed. * * The restrictions on setting capabilities are specified as: * * I: any raised capabilities must be a subset of the old permitted * P: any raised capabilities must be a subset of the old permitted * E: must be set to a subset of new permitted * * Returns 0 on success and < 0 on error. */ SYSCALL_DEFINE2(capset, cap_user_header_t, header, const cap_user_data_t, data) { struct __user_cap_data_struct kdata[2] = { { 0, }, }; unsigned tocopy, copybytes; kernel_cap_t inheritable, permitted, effective; struct cred *new; int ret; pid_t pid; ret = cap_validate_magic(header, &tocopy); if (ret != 0) return ret; if (get_user(pid, &header->pid)) return -EFAULT; /* may only affect current now */ if (pid != 0 && pid != task_pid_vnr(current)) return -EPERM; copybytes = tocopy * sizeof(struct __user_cap_data_struct); if (copybytes > sizeof(kdata)) return -EFAULT; if (copy_from_user(&kdata, data, copybytes)) return -EFAULT; effective = mk_kernel_cap(kdata[0].effective, kdata[1].effective); permitted = mk_kernel_cap(kdata[0].permitted, kdata[1].permitted); inheritable = mk_kernel_cap(kdata[0].inheritable, kdata[1].inheritable); new = prepare_creds(); if (!new) return -ENOMEM; ret = security_capset(new, current_cred(), &effective, &inheritable, &permitted); if (ret < 0) goto error; audit_log_capset(new, current_cred()); return commit_creds(new); error: abort_creds(new); return ret; } /** * has_ns_capability - Does a task have a capability in a specific user ns * @t: The task in question * @ns: target user namespace * @cap: The capability to be tested for * * Return true if the specified task has the given superior capability * currently in effect to the specified user namespace, false if not. * * Note that this does not set PF_SUPERPRIV on the task. */ bool has_ns_capability(struct task_struct *t, struct user_namespace *ns, int cap) { int ret; rcu_read_lock(); ret = security_capable(__task_cred(t), ns, cap, CAP_OPT_NONE); rcu_read_unlock(); return (ret == 0); } /** * has_ns_capability_noaudit - Does a task have a capability (unaudited) * in a specific user ns. * @t: The task in question * @ns: target user namespace * @cap: The capability to be tested for * * Return true if the specified task has the given superior capability * currently in effect to the specified user namespace, false if not. * Do not write an audit message for the check. * * Note that this does not set PF_SUPERPRIV on the task. */ bool has_ns_capability_noaudit(struct task_struct *t, struct user_namespace *ns, int cap) { int ret; rcu_read_lock(); ret = security_capable(__task_cred(t), ns, cap, CAP_OPT_NOAUDIT); rcu_read_unlock(); return (ret == 0); } /** * has_capability_noaudit - Does a task have a capability (unaudited) in the * initial user ns * @t: The task in question * @cap: The capability to be tested for * * Return true if the specified task has the given superior capability * currently in effect to init_user_ns, false if not. 
Don't write an * audit message for the check. * * Note that this does not set PF_SUPERPRIV on the task. */ bool has_capability_noaudit(struct task_struct *t, int cap) { return has_ns_capability_noaudit(t, &init_user_ns, cap); } EXPORT_SYMBOL(has_capability_noaudit); static bool ns_capable_common(struct user_namespace *ns, int cap, unsigned int opts) { int capable; if (unlikely(!cap_valid(cap))) { pr_crit("capable() called with invalid cap=%u\n", cap); BUG(); } capable = security_capable(current_cred(), ns, cap, opts); if (capable == 0) { current->flags |= PF_SUPERPRIV; return true; } return false; } /** * ns_capable - Determine if the current task has a superior capability in effect * @ns: The usernamespace we want the capability in * @cap: The capability to be tested for * * Return true if the current task has the given superior capability currently * available for use, false if not. * * This sets PF_SUPERPRIV on the task if the capability is available on the * assumption that it's about to be used. */ bool ns_capable(struct user_namespace *ns, int cap) { return ns_capable_common(ns, cap, CAP_OPT_NONE); } EXPORT_SYMBOL(ns_capable); /** * ns_capable_noaudit - Determine if the current task has a superior capability * (unaudited) in effect * @ns: The usernamespace we want the capability in * @cap: The capability to be tested for * * Return true if the current task has the given superior capability currently * available for use, false if not. * * This sets PF_SUPERPRIV on the task if the capability is available on the * assumption that it's about to be used. */ bool ns_capable_noaudit(struct user_namespace *ns, int cap) { return ns_capable_common(ns, cap, CAP_OPT_NOAUDIT); } EXPORT_SYMBOL(ns_capable_noaudit); /** * ns_capable_setid - Determine if the current task has a superior capability * in effect, while signalling that this check is being done from within a * setid or setgroups syscall. * @ns: The usernamespace we want the capability in * @cap: The capability to be tested for * * Return true if the current task has the given superior capability currently * available for use, false if not. * * This sets PF_SUPERPRIV on the task if the capability is available on the * assumption that it's about to be used. */ bool ns_capable_setid(struct user_namespace *ns, int cap) { return ns_capable_common(ns, cap, CAP_OPT_INSETID); } EXPORT_SYMBOL(ns_capable_setid); /** * capable - Determine if the current task has a superior capability in effect * @cap: The capability to be tested for * * Return true if the current task has the given superior capability currently * available for use, false if not. * * This sets PF_SUPERPRIV on the task if the capability is available on the * assumption that it's about to be used. */ bool capable(int cap) { return ns_capable(&init_user_ns, cap); } EXPORT_SYMBOL(capable); #endif /* CONFIG_MULTIUSER */ /** * file_ns_capable - Determine if the file's opener had a capability in effect * @file: The file we want to check * @ns: The usernamespace we want the capability in * @cap: The capability to be tested for * * Return true if task that opened the file had a capability in effect * when the file was opened. * * This does not set PF_SUPERPRIV because the caller may not * actually be privileged. 
*/ bool file_ns_capable(const struct file *file, struct user_namespace *ns, int cap) { if (WARN_ON_ONCE(!cap_valid(cap))) return false; if (security_capable(file->f_cred, ns, cap, CAP_OPT_NONE) == 0) return true; return false; } EXPORT_SYMBOL(file_ns_capable); /** * privileged_wrt_inode_uidgid - Do capabilities in the namespace work over the inode? * @ns: The user namespace in question * @idmap: idmap of the mount @inode was found from * @inode: The inode in question * * Return true if the inode uid and gid are within the namespace. */ bool privileged_wrt_inode_uidgid(struct user_namespace *ns, struct mnt_idmap *idmap, const struct inode *inode) { return vfsuid_has_mapping(ns, i_uid_into_vfsuid(idmap, inode)) && vfsgid_has_mapping(ns, i_gid_into_vfsgid(idmap, inode)); } /** * capable_wrt_inode_uidgid - Check nsown_capable and uid and gid mapped * @idmap: idmap of the mount @inode was found from * @inode: The inode in question * @cap: The capability in question * * Return true if the current task has the given capability targeted at * its own user namespace and that the given inode's uid and gid are * mapped into the current user namespace. */ bool capable_wrt_inode_uidgid(struct mnt_idmap *idmap, const struct inode *inode, int cap) { struct user_namespace *ns = current_user_ns(); return ns_capable(ns, cap) && privileged_wrt_inode_uidgid(ns, idmap, inode); } EXPORT_SYMBOL(capable_wrt_inode_uidgid); /** * ptracer_capable - Determine if the ptracer holds CAP_SYS_PTRACE in the namespace * @tsk: The task that may be ptraced * @ns: The user namespace to search for CAP_SYS_PTRACE in * * Return true if the task that is ptracing the current task had CAP_SYS_PTRACE * in the specified user namespace. */ bool ptracer_capable(struct task_struct *tsk, struct user_namespace *ns) { int ret = 0; /* An absent tracer adds no restrictions */ const struct cred *cred; rcu_read_lock(); cred = rcu_dereference(tsk->ptracer_cred); if (cred) ret = security_capable(cred, ns, CAP_SYS_PTRACE, CAP_OPT_NOAUDIT); rcu_read_unlock(); return (ret == 0); }
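The two-u32 split that sys_capget() performs above (each 64-bit capability set exposed as a low block at index 0 and a high block at index 1) is visible from user space. A minimal sketch against the v3 ABI, assuming a Linux host:

#include <linux/capability.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
	struct __user_cap_header_struct hdr = {
		.version = _LINUX_CAPABILITY_VERSION_3,
		.pid = 0,			/* 0 selects the calling process */
	};
	struct __user_cap_data_struct data[_LINUX_CAPABILITY_U32S_3];

	if (syscall(SYS_capget, &hdr, data))
		return 1;

	/* Stitch the two 32-bit blocks back into the 64-bit effective set. */
	printf("pE = %#llx\n",
	       (unsigned long long)data[1].effective << 32 | data[0].effective);
	return 0;
}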
// SPDX-License-Identifier: GPL-2.0-or-later /* * Network block device - make block devices work over TCP * * Note that you can not swap over this thing, yet. Seems to work but * deadlocks sometimes - you can not swap over TCP in general.
* * Copyright 1997-2000, 2008 Pavel Machek <pavel@ucw.cz> * Parts copyright 2001 Steven Whitehouse <steve@chygwyn.com> * * (part of code stolen from loop.c) */ #define pr_fmt(fmt) "nbd: " fmt #include <linux/major.h> #include <linux/blkdev.h> #include <linux/module.h> #include <linux/init.h> #include <linux/sched.h> #include <linux/sched/mm.h> #include <linux/fs.h> #include <linux/bio.h> #include <linux/stat.h> #include <linux/errno.h> #include <linux/file.h> #include <linux/ioctl.h> #include <linux/mutex.h> #include <linux/compiler.h> #include <linux/completion.h> #include <linux/err.h> #include <linux/kernel.h> #include <linux/slab.h> #include <net/sock.h> #include <linux/net.h> #include <linux/kthread.h> #include <linux/types.h> #include <linux/debugfs.h> #include <linux/blk-mq.h> #include <linux/uaccess.h> #include <asm/types.h> #include <linux/nbd.h> #include <linux/nbd-netlink.h> #include <net/genetlink.h> #define CREATE_TRACE_POINTS #include <trace/events/nbd.h> static DEFINE_IDR(nbd_index_idr); static DEFINE_MUTEX(nbd_index_mutex); static struct workqueue_struct *nbd_del_wq; static int nbd_total_devices = 0; struct nbd_sock { struct socket *sock; struct mutex tx_lock; struct request *pending; int sent; bool dead; int fallback_index; int cookie; struct work_struct work; }; struct recv_thread_args { struct work_struct work; struct nbd_device *nbd; struct nbd_sock *nsock; int index; }; struct link_dead_args { struct work_struct work; int index; }; #define NBD_RT_TIMEDOUT 0 #define NBD_RT_DISCONNECT_REQUESTED 1 #define NBD_RT_DISCONNECTED 2 #define NBD_RT_HAS_PID_FILE 3 #define NBD_RT_HAS_CONFIG_REF 4 #define NBD_RT_BOUND 5 #define NBD_RT_DISCONNECT_ON_CLOSE 6 #define NBD_RT_HAS_BACKEND_FILE 7 #define NBD_DESTROY_ON_DISCONNECT 0 #define NBD_DISCONNECT_REQUESTED 1 struct nbd_config { u32 flags; unsigned long runtime_flags; u64 dead_conn_timeout; struct nbd_sock **socks; int num_connections; atomic_t live_connections; wait_queue_head_t conn_wait; atomic_t recv_threads; wait_queue_head_t recv_wq; unsigned int blksize_bits; loff_t bytesize; #if IS_ENABLED(CONFIG_DEBUG_FS) struct dentry *dbg_dir; #endif }; static inline unsigned int nbd_blksize(struct nbd_config *config) { return 1u << config->blksize_bits; } struct nbd_device { struct blk_mq_tag_set tag_set; int index; refcount_t config_refs; refcount_t refs; struct nbd_config *config; struct mutex config_lock; struct gendisk *disk; struct workqueue_struct *recv_workq; struct work_struct remove_work; struct list_head list; struct task_struct *task_setup; unsigned long flags; pid_t pid; /* pid of nbd-client, if attached */ char *backend; }; #define NBD_CMD_REQUEUED 1 /* * This flag will be set if nbd_queue_rq() succeed, and will be checked and * cleared in completion. Both setting and clearing of the flag are protected * by cmd->lock. 
*/ #define NBD_CMD_INFLIGHT 2 /* Just part of request header or data payload is sent successfully */ #define NBD_CMD_PARTIAL_SEND 3 struct nbd_cmd { struct nbd_device *nbd; struct mutex lock; int index; int cookie; int retries; blk_status_t status; unsigned long flags; u32 cmd_cookie; }; #if IS_ENABLED(CONFIG_DEBUG_FS) static struct dentry *nbd_dbg_dir; #endif #define nbd_name(nbd) ((nbd)->disk->disk_name) #define NBD_DEF_BLKSIZE_BITS 10 static unsigned int nbds_max = 16; static int max_part = 16; static int part_shift; static int nbd_dev_dbg_init(struct nbd_device *nbd); static void nbd_dev_dbg_close(struct nbd_device *nbd); static void nbd_config_put(struct nbd_device *nbd); static void nbd_connect_reply(struct genl_info *info, int index); static int nbd_genl_status(struct sk_buff *skb, struct genl_info *info); static void nbd_dead_link_work(struct work_struct *work); static void nbd_disconnect_and_put(struct nbd_device *nbd); static inline struct device *nbd_to_dev(struct nbd_device *nbd) { return disk_to_dev(nbd->disk); } static void nbd_requeue_cmd(struct nbd_cmd *cmd) { struct request *req = blk_mq_rq_from_pdu(cmd); lockdep_assert_held(&cmd->lock); /* * Clear INFLIGHT flag so that this cmd won't be completed in * normal completion path * * INFLIGHT flag will be set when the cmd is queued to nbd next * time. */ __clear_bit(NBD_CMD_INFLIGHT, &cmd->flags); if (!test_and_set_bit(NBD_CMD_REQUEUED, &cmd->flags)) blk_mq_requeue_request(req, true); } #define NBD_COOKIE_BITS 32 static u64 nbd_cmd_handle(struct nbd_cmd *cmd) { struct request *req = blk_mq_rq_from_pdu(cmd); u32 tag = blk_mq_unique_tag(req); u64 cookie = cmd->cmd_cookie; return (cookie << NBD_COOKIE_BITS) | tag; } static u32 nbd_handle_to_tag(u64 handle) { return (u32)handle; } static u32 nbd_handle_to_cookie(u64 handle) { return (u32)(handle >> NBD_COOKIE_BITS); } static const char *nbdcmd_to_ascii(int cmd) { switch (cmd) { case NBD_CMD_READ: return "read"; case NBD_CMD_WRITE: return "write"; case NBD_CMD_DISC: return "disconnect"; case NBD_CMD_FLUSH: return "flush"; case NBD_CMD_TRIM: return "trim/discard"; } return "invalid"; } static ssize_t pid_show(struct device *dev, struct device_attribute *attr, char *buf) { struct gendisk *disk = dev_to_disk(dev); struct nbd_device *nbd = disk->private_data; return sprintf(buf, "%d\n", nbd->pid); } static const struct device_attribute pid_attr = { .attr = { .name = "pid", .mode = 0444}, .show = pid_show, }; static ssize_t backend_show(struct device *dev, struct device_attribute *attr, char *buf) { struct gendisk *disk = dev_to_disk(dev); struct nbd_device *nbd = disk->private_data; return sprintf(buf, "%s\n", nbd->backend ?: ""); } static const struct device_attribute backend_attr = { .attr = { .name = "backend", .mode = 0444}, .show = backend_show, }; static void nbd_dev_remove(struct nbd_device *nbd) { struct gendisk *disk = nbd->disk; del_gendisk(disk); blk_mq_free_tag_set(&nbd->tag_set); /* * Remove from idr after del_gendisk() completes, so if the same ID is * reused, the following add_disk() will succeed. 
*/ mutex_lock(&nbd_index_mutex); idr_remove(&nbd_index_idr, nbd->index); mutex_unlock(&nbd_index_mutex); destroy_workqueue(nbd->recv_workq); put_disk(disk); } static void nbd_dev_remove_work(struct work_struct *work) { nbd_dev_remove(container_of(work, struct nbd_device, remove_work)); } static void nbd_put(struct nbd_device *nbd) { if (!refcount_dec_and_test(&nbd->refs)) return; /* Call del_gendisk() asynchrounously to prevent deadlock */ if (test_bit(NBD_DESTROY_ON_DISCONNECT, &nbd->flags)) queue_work(nbd_del_wq, &nbd->remove_work); else nbd_dev_remove(nbd); } static int nbd_disconnected(struct nbd_config *config) { return test_bit(NBD_RT_DISCONNECTED, &config->runtime_flags) || test_bit(NBD_RT_DISCONNECT_REQUESTED, &config->runtime_flags); } static void nbd_mark_nsock_dead(struct nbd_device *nbd, struct nbd_sock *nsock, int notify) { if (!nsock->dead && notify && !nbd_disconnected(nbd->config)) { struct link_dead_args *args; args = kmalloc(sizeof(struct link_dead_args), GFP_NOIO); if (args) { INIT_WORK(&args->work, nbd_dead_link_work); args->index = nbd->index; queue_work(system_wq, &args->work); } } if (!nsock->dead) { kernel_sock_shutdown(nsock->sock, SHUT_RDWR); if (atomic_dec_return(&nbd->config->live_connections) == 0) { if (test_and_clear_bit(NBD_RT_DISCONNECT_REQUESTED, &nbd->config->runtime_flags)) { set_bit(NBD_RT_DISCONNECTED, &nbd->config->runtime_flags); dev_info(nbd_to_dev(nbd), "Disconnected due to user request.\n"); } } } nsock->dead = true; nsock->pending = NULL; nsock->sent = 0; } static int nbd_set_size(struct nbd_device *nbd, loff_t bytesize, loff_t blksize) { struct queue_limits lim; int error; if (!blksize) blksize = 1u << NBD_DEF_BLKSIZE_BITS; if (blk_validate_block_size(blksize)) return -EINVAL; if (bytesize < 0) return -EINVAL; nbd->config->bytesize = bytesize; nbd->config->blksize_bits = __ffs(blksize); if (!nbd->pid) return 0; lim = queue_limits_start_update(nbd->disk->queue); if (nbd->config->flags & NBD_FLAG_SEND_TRIM) lim.max_hw_discard_sectors = UINT_MAX >> SECTOR_SHIFT; else lim.max_hw_discard_sectors = 0; if (!(nbd->config->flags & NBD_FLAG_SEND_FLUSH)) { lim.features &= ~(BLK_FEAT_WRITE_CACHE | BLK_FEAT_FUA); } else if (nbd->config->flags & NBD_FLAG_SEND_FUA) { lim.features |= BLK_FEAT_WRITE_CACHE | BLK_FEAT_FUA; } else { lim.features |= BLK_FEAT_WRITE_CACHE; lim.features &= ~BLK_FEAT_FUA; } if (nbd->config->flags & NBD_FLAG_ROTATIONAL) lim.features |= BLK_FEAT_ROTATIONAL; if (nbd->config->flags & NBD_FLAG_SEND_WRITE_ZEROES) lim.max_write_zeroes_sectors = UINT_MAX >> SECTOR_SHIFT; lim.logical_block_size = blksize; lim.physical_block_size = blksize; error = queue_limits_commit_update_frozen(nbd->disk->queue, &lim); if (error) return error; if (max_part) set_bit(GD_NEED_PART_SCAN, &nbd->disk->state); if (!set_capacity_and_notify(nbd->disk, bytesize >> 9)) kobject_uevent(&nbd_to_dev(nbd)->kobj, KOBJ_CHANGE); return 0; } static void nbd_complete_rq(struct request *req) { struct nbd_cmd *cmd = blk_mq_rq_to_pdu(req); dev_dbg(nbd_to_dev(cmd->nbd), "request %p: %s\n", req, cmd->status ? 
"failed" : "done"); blk_mq_end_request(req, cmd->status); } /* * Forcibly shutdown the socket causing all listeners to error */ static void sock_shutdown(struct nbd_device *nbd) { struct nbd_config *config = nbd->config; int i; if (config->num_connections == 0) return; if (test_and_set_bit(NBD_RT_DISCONNECTED, &config->runtime_flags)) return; for (i = 0; i < config->num_connections; i++) { struct nbd_sock *nsock = config->socks[i]; mutex_lock(&nsock->tx_lock); nbd_mark_nsock_dead(nbd, nsock, 0); mutex_unlock(&nsock->tx_lock); } dev_warn(disk_to_dev(nbd->disk), "shutting down sockets\n"); } static u32 req_to_nbd_cmd_type(struct request *req) { switch (req_op(req)) { case REQ_OP_DISCARD: return NBD_CMD_TRIM; case REQ_OP_FLUSH: return NBD_CMD_FLUSH; case REQ_OP_WRITE: return NBD_CMD_WRITE; case REQ_OP_READ: return NBD_CMD_READ; case REQ_OP_WRITE_ZEROES: return NBD_CMD_WRITE_ZEROES; default: return U32_MAX; } } static struct nbd_config *nbd_get_config_unlocked(struct nbd_device *nbd) { if (refcount_inc_not_zero(&nbd->config_refs)) { /* * Add smp_mb__after_atomic to ensure that reading nbd->config_refs * and reading nbd->config is ordered. The pair is the barrier in * nbd_alloc_and_init_config(), avoid nbd->config_refs is set * before nbd->config. */ smp_mb__after_atomic(); return nbd->config; } return NULL; } static enum blk_eh_timer_return nbd_xmit_timeout(struct request *req) { struct nbd_cmd *cmd = blk_mq_rq_to_pdu(req); struct nbd_device *nbd = cmd->nbd; struct nbd_config *config; if (!mutex_trylock(&cmd->lock)) return BLK_EH_RESET_TIMER; /* partial send is handled in nbd_sock's work function */ if (test_bit(NBD_CMD_PARTIAL_SEND, &cmd->flags)) { mutex_unlock(&cmd->lock); return BLK_EH_RESET_TIMER; } if (!test_bit(NBD_CMD_INFLIGHT, &cmd->flags)) { mutex_unlock(&cmd->lock); return BLK_EH_DONE; } config = nbd_get_config_unlocked(nbd); if (!config) { cmd->status = BLK_STS_TIMEOUT; __clear_bit(NBD_CMD_INFLIGHT, &cmd->flags); mutex_unlock(&cmd->lock); goto done; } if (config->num_connections > 1 || (config->num_connections == 1 && nbd->tag_set.timeout)) { dev_err_ratelimited(nbd_to_dev(nbd), "Connection timed out, retrying (%d/%d alive)\n", atomic_read(&config->live_connections), config->num_connections); /* * Hooray we have more connections, requeue this IO, the submit * path will put it on a real connection. Or if only one * connection is configured, the submit path will wait util * a new connection is reconfigured or util dead timeout. */ if (config->socks) { if (cmd->index < config->num_connections) { struct nbd_sock *nsock = config->socks[cmd->index]; mutex_lock(&nsock->tx_lock); /* We can have multiple outstanding requests, so * we don't want to mark the nsock dead if we've * already reconnected with a new socket, so * only mark it dead if its the same socket we * were sent out on. */ if (cmd->cookie == nsock->cookie) nbd_mark_nsock_dead(nbd, nsock, 1); mutex_unlock(&nsock->tx_lock); } nbd_requeue_cmd(cmd); mutex_unlock(&cmd->lock); nbd_config_put(nbd); return BLK_EH_DONE; } } if (!nbd->tag_set.timeout) { /* * Userspace sets timeout=0 to disable socket disconnection, * so just warn and reset the timer. */ struct nbd_sock *nsock = config->socks[cmd->index]; cmd->retries++; dev_info(nbd_to_dev(nbd), "Possible stuck request %p: control (%s@%llu,%uB). 
Runtime %u seconds\n", req, nbdcmd_to_ascii(req_to_nbd_cmd_type(req)), (unsigned long long)blk_rq_pos(req) << 9, blk_rq_bytes(req), (req->timeout / HZ) * cmd->retries); mutex_lock(&nsock->tx_lock); if (cmd->cookie != nsock->cookie) { nbd_requeue_cmd(cmd); mutex_unlock(&nsock->tx_lock); mutex_unlock(&cmd->lock); nbd_config_put(nbd); return BLK_EH_DONE; } mutex_unlock(&nsock->tx_lock); mutex_unlock(&cmd->lock); nbd_config_put(nbd); return BLK_EH_RESET_TIMER; } dev_err_ratelimited(nbd_to_dev(nbd), "Connection timed out\n"); set_bit(NBD_RT_TIMEDOUT, &config->runtime_flags); cmd->status = BLK_STS_IOERR; __clear_bit(NBD_CMD_INFLIGHT, &cmd->flags); mutex_unlock(&cmd->lock); sock_shutdown(nbd); nbd_config_put(nbd); done: blk_mq_complete_request(req); return BLK_EH_DONE; }
static int __sock_xmit(struct nbd_device *nbd, struct socket *sock, int send, struct iov_iter *iter, int msg_flags, int *sent) { int result; struct msghdr msg = {}; unsigned int noreclaim_flag; if (unlikely(!sock)) { dev_err_ratelimited(disk_to_dev(nbd->disk), "Attempted %s on closed socket in sock_xmit\n", (send ? "send" : "recv")); return -EINVAL; } msg.msg_iter = *iter; noreclaim_flag = memalloc_noreclaim_save(); do { sock->sk->sk_allocation = GFP_NOIO | __GFP_MEMALLOC; sock->sk->sk_use_task_frag = false; msg.msg_flags = msg_flags | MSG_NOSIGNAL; if (send) result = sock_sendmsg(sock, &msg); else result = sock_recvmsg(sock, &msg, msg.msg_flags); if (result <= 0) { if (result == 0) result = -EPIPE; /* short read */ break; } if (sent) *sent += result; } while (msg_data_left(&msg)); memalloc_noreclaim_restore(noreclaim_flag); return result; }
/* * Send or receive packet. Return a positive value on success and a * negative value on failure, and never return 0. */ static int sock_xmit(struct nbd_device *nbd, int index, int send, struct iov_iter *iter, int msg_flags, int *sent) { struct nbd_config *config = nbd->config; struct socket *sock = config->socks[index]->sock; return __sock_xmit(nbd, sock, send, iter, msg_flags, sent); }
/* * Different settings for sk->sk_sndtimeo can result in different return values * if there is a signal pending when we enter sendmsg, because reasons? */ static inline int was_interrupted(int result) { return result == -ERESTARTSYS || result == -EINTR; }
/* * We've already sent the header or part of the data payload, so we have no * choice but to mark the request pending and finish the send from the work * item. * * And we have to return BLK_STS_OK to the block core, otherwise this same * request may be re-dispatched with a different tag, while our header has * already gone out with the old tag, which would confuse reply handling. */ static void nbd_sched_pending_work(struct nbd_device *nbd, struct nbd_sock *nsock, struct nbd_cmd *cmd, int sent) { struct request *req = blk_mq_rq_from_pdu(cmd); /* pending work should be scheduled only once */ WARN_ON_ONCE(test_bit(NBD_CMD_PARTIAL_SEND, &cmd->flags)); nsock->pending = req; nsock->sent = sent; set_bit(NBD_CMD_PARTIAL_SEND, &cmd->flags); refcount_inc(&nbd->config_refs); schedule_work(&nsock->work); }
/* * Returns BLK_STS_RESOURCE if the caller should retry after a delay. * Returns BLK_STS_IOERR if sending failed. 
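 * Returns BLK_STS_OK when the request was fully sent, internally
 * requeued, or handed off to the nsock work item after a partial send.
 *
 * Caller-side sketch of the contract (illustrative only; this mirrors
 * how nbd_handle_cmd() and the blk-mq core react to each value):
 *
 *	ret = nbd_send_cmd(nbd, cmd, index);
 *	BLK_STS_RESOURCE - nothing was sent, blk-mq may re-dispatch later
 *	BLK_STS_IOERR    - hard failure, the request is failed
 *	BLK_STS_OK       - sent, requeued, or deferred to nsock->work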
*/ static blk_status_t nbd_send_cmd(struct nbd_device *nbd, struct nbd_cmd *cmd, int index) { struct request *req = blk_mq_rq_from_pdu(cmd); struct nbd_config *config = nbd->config; struct nbd_sock *nsock = config->socks[index]; int result; struct nbd_request request = {.magic = htonl(NBD_REQUEST_MAGIC)}; struct kvec iov = {.iov_base = &request, .iov_len = sizeof(request)}; struct iov_iter from; struct bio *bio; u64 handle; u32 type; u32 nbd_cmd_flags = 0; int sent = nsock->sent, skip = 0;
lockdep_assert_held(&cmd->lock); lockdep_assert_held(&nsock->tx_lock);
iov_iter_kvec(&from, ITER_SOURCE, &iov, 1, sizeof(request)); type = req_to_nbd_cmd_type(req); if (type == U32_MAX) return BLK_STS_IOERR; if (rq_data_dir(req) == WRITE && (config->flags & NBD_FLAG_READ_ONLY)) { dev_err_ratelimited(disk_to_dev(nbd->disk), "Write on read-only\n"); return BLK_STS_IOERR; } if (req->cmd_flags & REQ_FUA) nbd_cmd_flags |= NBD_CMD_FLAG_FUA; if ((req->cmd_flags & REQ_NOUNMAP) && (type == NBD_CMD_WRITE_ZEROES)) nbd_cmd_flags |= NBD_CMD_FLAG_NO_HOLE;
/* We did a partial send previously, and we at least sent the whole * request struct, so just go and send the rest of the pages in the * request. */ if (sent) { if (sent >= sizeof(request)) { skip = sent - sizeof(request); /* initialize handle for tracing purposes */ handle = nbd_cmd_handle(cmd); goto send_pages; } iov_iter_advance(&from, sent); } else { cmd->cmd_cookie++; } cmd->index = index; cmd->cookie = nsock->cookie; cmd->retries = 0; request.type = htonl(type | nbd_cmd_flags); if (type != NBD_CMD_FLUSH) { request.from = cpu_to_be64((u64)blk_rq_pos(req) << 9); request.len = htonl(blk_rq_bytes(req)); } handle = nbd_cmd_handle(cmd); request.cookie = cpu_to_be64(handle);
trace_nbd_send_request(&request, nbd->index, blk_mq_rq_from_pdu(cmd)); dev_dbg(nbd_to_dev(nbd), "request %p: sending control (%s@%llu,%uB)\n", req, nbdcmd_to_ascii(type), (unsigned long long)blk_rq_pos(req) << 9, blk_rq_bytes(req)); result = sock_xmit(nbd, index, 1, &from, (type == NBD_CMD_WRITE) ? MSG_MORE : 0, &sent); trace_nbd_header_sent(req, handle); if (result < 0) { if (was_interrupted(result)) { /* If we haven't sent anything we can just return BUSY; * however, if we have sent something we need to make * sure we only allow this req to be sent until we are * completely done. */ if (sent) { nbd_sched_pending_work(nbd, nsock, cmd, sent); return BLK_STS_OK; } set_bit(NBD_CMD_REQUEUED, &cmd->flags); return BLK_STS_RESOURCE; } dev_err_ratelimited(disk_to_dev(nbd->disk), "Send control failed (result %d)\n", result); goto requeue; }
send_pages: if (type != NBD_CMD_WRITE) goto out; bio = req->bio; while (bio) { struct bio *next = bio->bi_next; struct bvec_iter iter; struct bio_vec bvec; bio_for_each_segment(bvec, bio, iter) { bool is_last = !next && bio_iter_last(bvec, iter); int flags = is_last ? 0 : MSG_MORE; dev_dbg(nbd_to_dev(nbd), "request %p: sending %d bytes data\n", req, bvec.bv_len); iov_iter_bvec(&from, ITER_SOURCE, &bvec, 1, bvec.bv_len); if (skip) { if (skip >= iov_iter_count(&from)) { skip -= iov_iter_count(&from); continue; } iov_iter_advance(&from, skip); skip = 0; } result = sock_xmit(nbd, index, 1, &from, flags, &sent); if (result < 0) { if (was_interrupted(result)) { nbd_sched_pending_work(nbd, nsock, cmd, sent); return BLK_STS_OK; } dev_err(disk_to_dev(nbd->disk), "Send data failed (result %d)\n", result); goto requeue; } /* * The completion might already have come in, * so break for the last one instead of letting * the iterator do it. This prevents use-after-free * of the bio. 
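 * (Once the final byte of the last segment is on the wire, the
 * server's reply may complete the request at any moment, so the bio
 * must not be touched again.)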
*/ if (is_last) break; } bio = next; } out: trace_nbd_payload_sent(req, handle); nsock->pending = NULL; nsock->sent = 0; __set_bit(NBD_CMD_INFLIGHT, &cmd->flags); return BLK_STS_OK;
requeue: /* * Can't requeue in case we are dealing with a partial send; * it must be completed from the pending work function. */ if (test_bit(NBD_CMD_PARTIAL_SEND, &cmd->flags)) return BLK_STS_OK; /* retry on a different socket */ dev_err_ratelimited(disk_to_dev(nbd->disk), "Request send failed, requeueing\n"); nbd_mark_nsock_dead(nbd, nsock, 1); nbd_requeue_cmd(cmd); return BLK_STS_OK; }
/* handle partial sending */ static void nbd_pending_cmd_work(struct work_struct *work) { struct nbd_sock *nsock = container_of(work, struct nbd_sock, work); struct request *req = nsock->pending; struct nbd_cmd *cmd = blk_mq_rq_to_pdu(req); struct nbd_device *nbd = cmd->nbd; unsigned long deadline = READ_ONCE(req->deadline); unsigned int wait_ms = 2; mutex_lock(&cmd->lock); WARN_ON_ONCE(test_bit(NBD_CMD_REQUEUED, &cmd->flags)); if (WARN_ON_ONCE(!test_bit(NBD_CMD_PARTIAL_SEND, &cmd->flags))) goto out; mutex_lock(&nsock->tx_lock); while (true) { nbd_send_cmd(nbd, cmd, cmd->index); if (!nsock->pending) break; /* don't bother timeout handler for partial sending */ if (READ_ONCE(jiffies) + msecs_to_jiffies(wait_ms) >= deadline) { cmd->status = BLK_STS_IOERR; blk_mq_complete_request(req); break; } msleep(wait_ms); wait_ms *= 2; } mutex_unlock(&nsock->tx_lock); clear_bit(NBD_CMD_PARTIAL_SEND, &cmd->flags); out: mutex_unlock(&cmd->lock); nbd_config_put(nbd); }
static int nbd_read_reply(struct nbd_device *nbd, struct socket *sock, struct nbd_reply *reply) { struct kvec iov = {.iov_base = reply, .iov_len = sizeof(*reply)}; struct iov_iter to; int result; reply->magic = 0; iov_iter_kvec(&to, ITER_DEST, &iov, 1, sizeof(*reply)); result = __sock_xmit(nbd, sock, 0, &to, MSG_WAITALL, NULL); if (result < 0) { if (!nbd_disconnected(nbd->config)) dev_err(disk_to_dev(nbd->disk), "Receive control failed (result %d)\n", result); return result; } if (ntohl(reply->magic) != NBD_REPLY_MAGIC) { dev_err(disk_to_dev(nbd->disk), "Wrong magic (0x%lx)\n", (unsigned long)ntohl(reply->magic)); return -EPROTO; } return 0; }
/* ERR_PTR returned = something went wrong, inform userspace */ static struct nbd_cmd *nbd_handle_reply(struct nbd_device *nbd, int index, struct nbd_reply *reply) { int result; struct nbd_cmd *cmd; struct request *req = NULL; u64 handle; u16 hwq; u32 tag; int ret = 0; handle = be64_to_cpu(reply->cookie); tag = nbd_handle_to_tag(handle); hwq = blk_mq_unique_tag_to_hwq(tag); if (hwq < nbd->tag_set.nr_hw_queues) req = blk_mq_tag_to_rq(nbd->tag_set.tags[hwq], blk_mq_unique_tag_to_tag(tag)); if (!req || !blk_mq_request_started(req)) { dev_err(disk_to_dev(nbd->disk), "Unexpected reply (%d) %p\n", tag, req); return ERR_PTR(-ENOENT); } trace_nbd_header_received(req, handle); cmd = blk_mq_rq_to_pdu(req); mutex_lock(&cmd->lock); if (!test_bit(NBD_CMD_INFLIGHT, &cmd->flags)) { dev_err(disk_to_dev(nbd->disk), "Suspicious reply %d (status %u flags %lu)", tag, cmd->status, cmd->flags); ret = -ENOENT; goto out; } if (cmd->index != index) { dev_err(disk_to_dev(nbd->disk), "Unexpected reply %d from different sock %d (expected %d)", tag, index, cmd->index); ret = -ENOENT; goto out; } if (cmd->cmd_cookie != nbd_handle_to_cookie(handle)) { dev_err(disk_to_dev(nbd->disk), "Double reply on req %p, cmd_cookie %u, handle cookie %u\n", req, cmd->cmd_cookie, nbd_handle_to_cookie(handle)); ret = -ENOENT; goto out; } if (cmd->status != BLK_STS_OK) { 
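/* status was already set, e.g. by the timeout handler, so this reply is stale */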
dev_err(disk_to_dev(nbd->disk), "Command already handled %p\n", req); ret = -ENOENT; goto out; } if (test_bit(NBD_CMD_REQUEUED, &cmd->flags)) { dev_err(disk_to_dev(nbd->disk), "Raced with timeout on req %p\n", req); ret = -ENOENT; goto out; } if (ntohl(reply->error)) { dev_err(disk_to_dev(nbd->disk), "Other side returned error (%d)\n", ntohl(reply->error)); cmd->status = BLK_STS_IOERR; goto out; }
dev_dbg(nbd_to_dev(nbd), "request %p: got reply\n", req); if (rq_data_dir(req) != WRITE) { struct req_iterator iter; struct bio_vec bvec; struct iov_iter to; rq_for_each_segment(bvec, req, iter) { iov_iter_bvec(&to, ITER_DEST, &bvec, 1, bvec.bv_len); result = sock_xmit(nbd, index, 0, &to, MSG_WAITALL, NULL); if (result < 0) { dev_err(disk_to_dev(nbd->disk), "Receive data failed (result %d)\n", result); /* * If we've disconnected, we need to make sure we * complete this request; otherwise, error out * and let the timeout path handle resubmitting * this request on another connection. */ if (nbd_disconnected(nbd->config)) { cmd->status = BLK_STS_IOERR; goto out; } ret = -EIO; goto out; } dev_dbg(nbd_to_dev(nbd), "request %p: got %d bytes data\n", req, bvec.bv_len); } } out: trace_nbd_payload_received(req, handle); mutex_unlock(&cmd->lock); return ret ? ERR_PTR(ret) : cmd; }
static void recv_work(struct work_struct *work) { struct recv_thread_args *args = container_of(work, struct recv_thread_args, work); struct nbd_device *nbd = args->nbd; struct nbd_config *config = nbd->config; struct request_queue *q = nbd->disk->queue; struct nbd_sock *nsock = args->nsock; struct nbd_cmd *cmd; struct request *rq; while (1) { struct nbd_reply reply; if (nbd_read_reply(nbd, nsock->sock, &reply)) break; /* * Grab .q_usage_counter so the request pool won't go away; then no * request use-after-free is possible during nbd_handle_reply(). * If the queue is frozen, there won't be any in-flight requests, so * we needn't handle the incoming garbage message. 
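 * Every successful percpu_ref_tryget() below is paired with a
 * percpu_ref_put() before the loop continues or exits.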
*/ if (!percpu_ref_tryget(&q->q_usage_counter)) { dev_err(disk_to_dev(nbd->disk), "%s: no io inflight\n", __func__); break; } cmd = nbd_handle_reply(nbd, args->index, &reply); if (IS_ERR(cmd)) { percpu_ref_put(&q->q_usage_counter); break; } rq = blk_mq_rq_from_pdu(cmd); if (likely(!blk_should_fake_timeout(rq->q))) { bool complete; mutex_lock(&cmd->lock); complete = __test_and_clear_bit(NBD_CMD_INFLIGHT, &cmd->flags); mutex_unlock(&cmd->lock); if (complete) blk_mq_complete_request(rq); } percpu_ref_put(&q->q_usage_counter); } mutex_lock(&nsock->tx_lock); nbd_mark_nsock_dead(nbd, nsock, 1); mutex_unlock(&nsock->tx_lock); nbd_config_put(nbd); atomic_dec(&config->recv_threads); wake_up(&config->recv_wq); kfree(args); } static bool nbd_clear_req(struct request *req, void *data) { struct nbd_cmd *cmd = blk_mq_rq_to_pdu(req); /* don't abort one completed request */ if (blk_mq_request_completed(req)) return true; mutex_lock(&cmd->lock); if (!__test_and_clear_bit(NBD_CMD_INFLIGHT, &cmd->flags)) { mutex_unlock(&cmd->lock); return true; } cmd->status = BLK_STS_IOERR; mutex_unlock(&cmd->lock); blk_mq_complete_request(req); return true; } static void nbd_clear_que(struct nbd_device *nbd) { blk_mq_quiesce_queue(nbd->disk->queue); blk_mq_tagset_busy_iter(&nbd->tag_set, nbd_clear_req, NULL); blk_mq_unquiesce_queue(nbd->disk->queue); dev_dbg(disk_to_dev(nbd->disk), "queue cleared\n"); } static int find_fallback(struct nbd_device *nbd, int index) { struct nbd_config *config = nbd->config; int new_index = -1; struct nbd_sock *nsock = config->socks[index]; int fallback = nsock->fallback_index; if (test_bit(NBD_RT_DISCONNECTED, &config->runtime_flags)) return new_index; if (config->num_connections <= 1) { dev_err_ratelimited(disk_to_dev(nbd->disk), "Dead connection, failed to find a fallback\n"); return new_index; } if (fallback >= 0 && fallback < config->num_connections && !config->socks[fallback]->dead) return fallback; if (nsock->fallback_index < 0 || nsock->fallback_index >= config->num_connections || config->socks[nsock->fallback_index]->dead) { int i; for (i = 0; i < config->num_connections; i++) { if (i == index) continue; if (!config->socks[i]->dead) { new_index = i; break; } } nsock->fallback_index = new_index; if (new_index < 0) { dev_err_ratelimited(disk_to_dev(nbd->disk), "Dead connection, failed to find a fallback\n"); return new_index; } } new_index = nsock->fallback_index; return new_index; } static int wait_for_reconnect(struct nbd_device *nbd) { struct nbd_config *config = nbd->config; if (!config->dead_conn_timeout) return 0; if (!wait_event_timeout(config->conn_wait, test_bit(NBD_RT_DISCONNECTED, &config->runtime_flags) || atomic_read(&config->live_connections) > 0, config->dead_conn_timeout)) return 0; return !test_bit(NBD_RT_DISCONNECTED, &config->runtime_flags); } static blk_status_t nbd_handle_cmd(struct nbd_cmd *cmd, int index) { struct request *req = blk_mq_rq_from_pdu(cmd); struct nbd_device *nbd = cmd->nbd; struct nbd_config *config; struct nbd_sock *nsock; blk_status_t ret; lockdep_assert_held(&cmd->lock); config = nbd_get_config_unlocked(nbd); if (!config) { dev_err_ratelimited(disk_to_dev(nbd->disk), "Socks array is empty\n"); return BLK_STS_IOERR; } if (index >= config->num_connections) { dev_err_ratelimited(disk_to_dev(nbd->disk), "Attempted send on invalid socket\n"); nbd_config_put(nbd); return BLK_STS_IOERR; } cmd->status = BLK_STS_OK; again: nsock = config->socks[index]; mutex_lock(&nsock->tx_lock); if (nsock->dead) { int old_index = index; index = find_fallback(nbd, index); 
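/* find_fallback() returned a live socket index, or -1 if none is left */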
mutex_unlock(&nsock->tx_lock); if (index < 0) { if (wait_for_reconnect(nbd)) { index = old_index; goto again; } /* All the sockets should already be down at this point; * we just want to make sure that DISCONNECTED is set so * any requests that come in that were queued waiting * for the reconnect timer don't trigger the timer again * and instead just error out. */ sock_shutdown(nbd); nbd_config_put(nbd); return BLK_STS_IOERR; } goto again; }
/* Handle the case where we have a pending request that was partially * transmitted and _has_ to be serviced first. We need to call requeue * here so that it gets put _after_ the request that is already on the * dispatch list. */ blk_mq_start_request(req); if (unlikely(nsock->pending && nsock->pending != req)) { nbd_requeue_cmd(cmd); ret = BLK_STS_OK; goto out; } ret = nbd_send_cmd(nbd, cmd, index); out: mutex_unlock(&nsock->tx_lock); nbd_config_put(nbd); return ret; }
static blk_status_t nbd_queue_rq(struct blk_mq_hw_ctx *hctx, const struct blk_mq_queue_data *bd) { struct nbd_cmd *cmd = blk_mq_rq_to_pdu(bd->rq); blk_status_t ret; /* * Since we look at the bios to send the request over the network we * need to make sure the completion work doesn't mark this request done * before we are done doing our send. This keeps us from dereferencing * freed data if we have particularly fast completions (i.e. we get the * completion before we exit sock_xmit on the last bvec) or in the case * that the server is misbehaving (or there was an error) before we're * done sending everything over the wire. */ mutex_lock(&cmd->lock); clear_bit(NBD_CMD_REQUEUED, &cmd->flags); /* We can be called directly from the user space process, which means we * could possibly have signals pending, so our sendmsg will fail. In * this case we need to return that we are busy; otherwise, error out as * appropriate. */ ret = nbd_handle_cmd(cmd, hctx->queue_num); mutex_unlock(&cmd->lock); return ret; }
static struct socket *nbd_get_socket(struct nbd_device *nbd, unsigned long fd, int *err) { struct socket *sock; *err = 0; sock = sockfd_lookup(fd, err); if (!sock) return NULL; if (sock->ops->shutdown == sock_no_shutdown) { dev_err(disk_to_dev(nbd->disk), "Unsupported socket: shutdown callout must be supported.\n"); *err = -EINVAL; sockfd_put(sock); return NULL; } return sock; }
static int nbd_add_socket(struct nbd_device *nbd, unsigned long arg, bool netlink) { struct nbd_config *config = nbd->config; struct socket *sock; struct nbd_sock **socks; struct nbd_sock *nsock; unsigned int memflags; int err; /* Arg will be cast to int, check it to avoid overflow */ if (arg > INT_MAX) return -EINVAL; sock = nbd_get_socket(nbd, arg, &err); if (!sock) return err; /* * We need to make sure we don't get any errant requests while we're * reallocating the ->socks array. 
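 * Freezing the queue below guarantees no nbd_queue_rq() instance can
 * be walking config->socks while it is krealloc()ed.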
*/ memflags = blk_mq_freeze_queue(nbd->disk->queue); if (!netlink && !nbd->task_setup && !test_bit(NBD_RT_BOUND, &config->runtime_flags)) nbd->task_setup = current; if (!netlink && (nbd->task_setup != current || test_bit(NBD_RT_BOUND, &config->runtime_flags))) { dev_err(disk_to_dev(nbd->disk), "Device being setup by another task"); err = -EBUSY; goto put_socket; } nsock = kzalloc(sizeof(*nsock), GFP_KERNEL); if (!nsock) { err = -ENOMEM; goto put_socket; } socks = krealloc(config->socks, (config->num_connections + 1) * sizeof(struct nbd_sock *), GFP_KERNEL); if (!socks) { kfree(nsock); err = -ENOMEM; goto put_socket; } config->socks = socks; nsock->fallback_index = -1; nsock->dead = false; mutex_init(&nsock->tx_lock); nsock->sock = sock; nsock->pending = NULL; nsock->sent = 0; nsock->cookie = 0; INIT_WORK(&nsock->work, nbd_pending_cmd_work); socks[config->num_connections++] = nsock; atomic_inc(&config->live_connections); blk_mq_unfreeze_queue(nbd->disk->queue, memflags); return 0; put_socket: blk_mq_unfreeze_queue(nbd->disk->queue, memflags); sockfd_put(sock); return err; } static int nbd_reconnect_socket(struct nbd_device *nbd, unsigned long arg) { struct nbd_config *config = nbd->config; struct socket *sock, *old; struct recv_thread_args *args; int i; int err; sock = nbd_get_socket(nbd, arg, &err); if (!sock) return err; args = kzalloc(sizeof(*args), GFP_KERNEL); if (!args) { sockfd_put(sock); return -ENOMEM; } for (i = 0; i < config->num_connections; i++) { struct nbd_sock *nsock = config->socks[i]; if (!nsock->dead) continue; mutex_lock(&nsock->tx_lock); if (!nsock->dead) { mutex_unlock(&nsock->tx_lock); continue; } sk_set_memalloc(sock->sk); if (nbd->tag_set.timeout) sock->sk->sk_sndtimeo = nbd->tag_set.timeout; atomic_inc(&config->recv_threads); refcount_inc(&nbd->config_refs); old = nsock->sock; nsock->fallback_index = -1; nsock->sock = sock; nsock->dead = false; INIT_WORK(&args->work, recv_work); args->index = i; args->nbd = nbd; args->nsock = nsock; nsock->cookie++; mutex_unlock(&nsock->tx_lock); sockfd_put(old); clear_bit(NBD_RT_DISCONNECTED, &config->runtime_flags); /* We take the tx_mutex in an error path in the recv_work, so we * need to queue_work outside of the tx_mutex. 
*/ queue_work(nbd->recv_workq, &args->work); atomic_inc(&config->live_connections); wake_up(&config->conn_wait); return 0; } sockfd_put(sock); kfree(args); return -ENOSPC; }
static void nbd_bdev_reset(struct nbd_device *nbd) { if (disk_openers(nbd->disk) > 1) return; set_capacity(nbd->disk, 0); }
static void nbd_parse_flags(struct nbd_device *nbd) { if (nbd->config->flags & NBD_FLAG_READ_ONLY) set_disk_ro(nbd->disk, true); else set_disk_ro(nbd->disk, false); }
static void send_disconnects(struct nbd_device *nbd) { struct nbd_config *config = nbd->config; struct nbd_request request = { .magic = htonl(NBD_REQUEST_MAGIC), .type = htonl(NBD_CMD_DISC), }; struct kvec iov = {.iov_base = &request, .iov_len = sizeof(request)}; struct iov_iter from; int i, ret; for (i = 0; i < config->num_connections; i++) { struct nbd_sock *nsock = config->socks[i]; iov_iter_kvec(&from, ITER_SOURCE, &iov, 1, sizeof(request)); mutex_lock(&nsock->tx_lock); ret = sock_xmit(nbd, i, 1, &from, 0, NULL); if (ret < 0) dev_err(disk_to_dev(nbd->disk), "Send disconnect failed %d\n", ret); mutex_unlock(&nsock->tx_lock); } }
static int nbd_disconnect(struct nbd_device *nbd) { struct nbd_config *config = nbd->config; dev_info(disk_to_dev(nbd->disk), "NBD_DISCONNECT\n"); set_bit(NBD_RT_DISCONNECT_REQUESTED, &config->runtime_flags); set_bit(NBD_DISCONNECT_REQUESTED, &nbd->flags); send_disconnects(nbd); return 0; }
static void nbd_clear_sock(struct nbd_device *nbd) { sock_shutdown(nbd); nbd_clear_que(nbd); nbd->task_setup = NULL; }
static void nbd_config_put(struct nbd_device *nbd) { if (refcount_dec_and_mutex_lock(&nbd->config_refs, &nbd->config_lock)) { struct nbd_config *config = nbd->config; nbd_dev_dbg_close(nbd); invalidate_disk(nbd->disk); if (nbd->config->bytesize) kobject_uevent(&nbd_to_dev(nbd)->kobj, KOBJ_CHANGE); if (test_and_clear_bit(NBD_RT_HAS_PID_FILE, &config->runtime_flags)) device_remove_file(disk_to_dev(nbd->disk), &pid_attr); nbd->pid = 0; if (test_and_clear_bit(NBD_RT_HAS_BACKEND_FILE, &config->runtime_flags)) { device_remove_file(disk_to_dev(nbd->disk), &backend_attr); kfree(nbd->backend); nbd->backend = NULL; } nbd_clear_sock(nbd); if (config->num_connections) { int i; for (i = 0; i < config->num_connections; i++) { sockfd_put(config->socks[i]->sock); kfree(config->socks[i]); } kfree(config->socks); } kfree(nbd->config); nbd->config = NULL; nbd->tag_set.timeout = 0; mutex_unlock(&nbd->config_lock); nbd_put(nbd); module_put(THIS_MODULE); } }
static int nbd_start_device(struct nbd_device *nbd) { struct nbd_config *config = nbd->config; int num_connections = config->num_connections; int error = 0, i; if (nbd->pid) return -EBUSY; if (!config->socks) return -EINVAL; if (num_connections > 1 && !(config->flags & NBD_FLAG_CAN_MULTI_CONN)) { dev_err(disk_to_dev(nbd->disk), "server does not support multiple connections per device.\n"); return -EINVAL; } retry: mutex_unlock(&nbd->config_lock); blk_mq_update_nr_hw_queues(&nbd->tag_set, num_connections); mutex_lock(&nbd->config_lock); /* if another code path updated nr_hw_queues, retry until it succeeds */ if (num_connections != config->num_connections) { num_connections = config->num_connections; goto retry; } nbd->pid = task_pid_nr(current); nbd_parse_flags(nbd); error = device_create_file(disk_to_dev(nbd->disk), &pid_attr); if (error) { dev_err(disk_to_dev(nbd->disk), "device_create_file failed for pid!\n"); return error; } set_bit(NBD_RT_HAS_PID_FILE, &config->runtime_flags); nbd_dev_dbg_init(nbd); for (i = 0; i < num_connections; i++) { struct recv_thread_args *args; 
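/* one args block per connection; recv_work() kfrees it when the thread exits */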
args = kzalloc(sizeof(*args), GFP_KERNEL); if (!args) { sock_shutdown(nbd); /* * If num_connections is m (m > 2) and the first n kzallocs * (1 < n < m) succeed but allocation n + 1 fails, we still * have n recv threads running. So add flush_workqueue here * to prevent those threads from dropping the last config_refs * and trying to destroy the workqueue from inside the * workqueue. */ if (i) flush_workqueue(nbd->recv_workq); return -ENOMEM; } sk_set_memalloc(config->socks[i]->sock->sk); if (nbd->tag_set.timeout) config->socks[i]->sock->sk->sk_sndtimeo = nbd->tag_set.timeout; atomic_inc(&config->recv_threads); refcount_inc(&nbd->config_refs); INIT_WORK(&args->work, recv_work); args->nbd = nbd; args->nsock = config->socks[i]; args->index = i; queue_work(nbd->recv_workq, &args->work); } return nbd_set_size(nbd, config->bytesize, nbd_blksize(config)); }
static int nbd_start_device_ioctl(struct nbd_device *nbd) { struct nbd_config *config = nbd->config; int ret; ret = nbd_start_device(nbd); if (ret) return ret; if (max_part) set_bit(GD_NEED_PART_SCAN, &nbd->disk->state); mutex_unlock(&nbd->config_lock); ret = wait_event_interruptible(config->recv_wq, atomic_read(&config->recv_threads) == 0); if (ret) { sock_shutdown(nbd); nbd_clear_que(nbd); } flush_workqueue(nbd->recv_workq); mutex_lock(&nbd->config_lock); nbd_bdev_reset(nbd); /* user requested, ignore socket errors */ if (test_bit(NBD_RT_DISCONNECT_REQUESTED, &config->runtime_flags)) ret = 0; if (test_bit(NBD_RT_TIMEDOUT, &config->runtime_flags)) ret = -ETIMEDOUT; return ret; }
static void nbd_clear_sock_ioctl(struct nbd_device *nbd) { nbd_clear_sock(nbd); disk_force_media_change(nbd->disk); nbd_bdev_reset(nbd); if (test_and_clear_bit(NBD_RT_HAS_CONFIG_REF, &nbd->config->runtime_flags)) nbd_config_put(nbd); }
static void nbd_set_cmd_timeout(struct nbd_device *nbd, u64 timeout) { nbd->tag_set.timeout = timeout * HZ; if (timeout) blk_queue_rq_timeout(nbd->disk->queue, timeout * HZ); else blk_queue_rq_timeout(nbd->disk->queue, 30 * HZ); }
/* Must be called with config_lock held */ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd, unsigned int cmd, unsigned long arg) { struct nbd_config *config = nbd->config; loff_t bytesize; switch (cmd) { case NBD_DISCONNECT: return nbd_disconnect(nbd); case NBD_CLEAR_SOCK: nbd_clear_sock_ioctl(nbd); return 0; case NBD_SET_SOCK: return nbd_add_socket(nbd, arg, false); case NBD_SET_BLKSIZE: return nbd_set_size(nbd, config->bytesize, arg); case NBD_SET_SIZE: return nbd_set_size(nbd, arg, nbd_blksize(config)); case NBD_SET_SIZE_BLOCKS: if (check_shl_overflow(arg, config->blksize_bits, &bytesize)) return -EINVAL; return nbd_set_size(nbd, bytesize, nbd_blksize(config)); case NBD_SET_TIMEOUT: nbd_set_cmd_timeout(nbd, arg); return 0; case NBD_SET_FLAGS: config->flags = arg; return 0; case NBD_DO_IT: return nbd_start_device_ioctl(nbd); case NBD_CLEAR_QUE: /* * This is for compatibility only. The queue is always cleared * by NBD_DO_IT or NBD_CLEAR_SOCK. */ return 0; case NBD_PRINT_DEBUG: /* * For compatibility only, we no longer keep a list of * outstanding requests. */ return 0; } return -ENOTTY; }
static int nbd_ioctl(struct block_device *bdev, blk_mode_t mode, unsigned int cmd, unsigned long arg) { struct nbd_device *nbd = bdev->bd_disk->private_data; struct nbd_config *config = nbd->config; int error = -EINVAL; if (!capable(CAP_SYS_ADMIN)) return -EPERM; /* The block layer will pass back some non-nbd ioctls in case we have * special handling for them, but we don't, so just return an error. 
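 * (0xab is the NBD ioctl magic that the NBD_* ioctl numbers in
 * <linux/nbd.h> are built from, e.g. NBD_DO_IT is _IO(0xab, 3).)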
*/ if (_IOC_TYPE(cmd) != 0xab) return -EINVAL; mutex_lock(&nbd->config_lock); /* Don't allow ioctl operations on a nbd device that was created with * netlink, unless it's DISCONNECT or CLEAR_SOCK, which are fine. */ if (!test_bit(NBD_RT_BOUND, &config->runtime_flags) || (cmd == NBD_DISCONNECT || cmd == NBD_CLEAR_SOCK)) error = __nbd_ioctl(bdev, nbd, cmd, arg); else dev_err(nbd_to_dev(nbd), "Cannot use ioctl interface on a netlink controlled device.\n"); mutex_unlock(&nbd->config_lock); return error; } static int nbd_alloc_and_init_config(struct nbd_device *nbd) { struct nbd_config *config; if (WARN_ON(nbd->config)) return -EINVAL; if (!try_module_get(THIS_MODULE)) return -ENODEV; config = kzalloc(sizeof(struct nbd_config), GFP_NOFS); if (!config) { module_put(THIS_MODULE); return -ENOMEM; } atomic_set(&config->recv_threads, 0); init_waitqueue_head(&config->recv_wq); init_waitqueue_head(&config->conn_wait); config->blksize_bits = NBD_DEF_BLKSIZE_BITS; atomic_set(&config->live_connections, 0); nbd->config = config; /* * Order refcount_set(&nbd->config_refs, 1) and nbd->config assignment, * its pair is the barrier in nbd_get_config_unlocked(). * So nbd_get_config_unlocked() won't see nbd->config as null after * refcount_inc_not_zero() succeed. */ smp_mb__before_atomic(); refcount_set(&nbd->config_refs, 1); return 0; } static int nbd_open(struct gendisk *disk, blk_mode_t mode) { struct nbd_device *nbd; struct nbd_config *config; int ret = 0; mutex_lock(&nbd_index_mutex); nbd = disk->private_data; if (!nbd) { ret = -ENXIO; goto out; } if (!refcount_inc_not_zero(&nbd->refs)) { ret = -ENXIO; goto out; } config = nbd_get_config_unlocked(nbd); if (!config) { mutex_lock(&nbd->config_lock); if (refcount_inc_not_zero(&nbd->config_refs)) { mutex_unlock(&nbd->config_lock); goto out; } ret = nbd_alloc_and_init_config(nbd); if (ret) { mutex_unlock(&nbd->config_lock); goto out; } refcount_inc(&nbd->refs); mutex_unlock(&nbd->config_lock); if (max_part) set_bit(GD_NEED_PART_SCAN, &disk->state); } else if (nbd_disconnected(config)) { if (max_part) set_bit(GD_NEED_PART_SCAN, &disk->state); } out: mutex_unlock(&nbd_index_mutex); return ret; } static void nbd_release(struct gendisk *disk) { struct nbd_device *nbd = disk->private_data; if (test_bit(NBD_RT_DISCONNECT_ON_CLOSE, &nbd->config->runtime_flags) && disk_openers(disk) == 0) nbd_disconnect_and_put(nbd); nbd_config_put(nbd); nbd_put(nbd); } static void nbd_free_disk(struct gendisk *disk) { struct nbd_device *nbd = disk->private_data; kfree(nbd); } static const struct block_device_operations nbd_fops = { .owner = THIS_MODULE, .open = nbd_open, .release = nbd_release, .ioctl = nbd_ioctl, .compat_ioctl = nbd_ioctl, .free_disk = nbd_free_disk, }; #if IS_ENABLED(CONFIG_DEBUG_FS) static int nbd_dbg_tasks_show(struct seq_file *s, void *unused) { struct nbd_device *nbd = s->private; if (nbd->pid) seq_printf(s, "recv: %d\n", nbd->pid); return 0; } DEFINE_SHOW_ATTRIBUTE(nbd_dbg_tasks); static int nbd_dbg_flags_show(struct seq_file *s, void *unused) { struct nbd_device *nbd = s->private; u32 flags = nbd->config->flags; seq_printf(s, "Hex: 0x%08x\n\n", flags); seq_puts(s, "Known flags:\n"); if (flags & NBD_FLAG_HAS_FLAGS) seq_puts(s, "NBD_FLAG_HAS_FLAGS\n"); if (flags & NBD_FLAG_READ_ONLY) seq_puts(s, "NBD_FLAG_READ_ONLY\n"); if (flags & NBD_FLAG_SEND_FLUSH) seq_puts(s, "NBD_FLAG_SEND_FLUSH\n"); if (flags & NBD_FLAG_SEND_FUA) seq_puts(s, "NBD_FLAG_SEND_FUA\n"); if (flags & NBD_FLAG_SEND_TRIM) seq_puts(s, "NBD_FLAG_SEND_TRIM\n"); if (flags & NBD_FLAG_SEND_WRITE_ZEROES) 
seq_puts(s, "NBD_FLAG_SEND_WRITE_ZEROES\n"); if (flags & NBD_FLAG_ROTATIONAL) seq_puts(s, "NBD_FLAG_ROTATIONAL\n"); return 0; } DEFINE_SHOW_ATTRIBUTE(nbd_dbg_flags); static int nbd_dev_dbg_init(struct nbd_device *nbd) { struct dentry *dir; struct nbd_config *config = nbd->config; if (!nbd_dbg_dir) return -EIO; dir = debugfs_create_dir(nbd_name(nbd), nbd_dbg_dir); if (IS_ERR(dir)) { dev_err(nbd_to_dev(nbd), "Failed to create debugfs dir for '%s'\n", nbd_name(nbd)); return -EIO; } config->dbg_dir = dir; debugfs_create_file("tasks", 0444, dir, nbd, &nbd_dbg_tasks_fops); debugfs_create_u64("size_bytes", 0444, dir, &config->bytesize); debugfs_create_u32("timeout", 0444, dir, &nbd->tag_set.timeout); debugfs_create_u32("blocksize_bits", 0444, dir, &config->blksize_bits); debugfs_create_file("flags", 0444, dir, nbd, &nbd_dbg_flags_fops); return 0; } static void nbd_dev_dbg_close(struct nbd_device *nbd) { debugfs_remove_recursive(nbd->config->dbg_dir); } static int nbd_dbg_init(void) { struct dentry *dbg_dir; dbg_dir = debugfs_create_dir("nbd", NULL); if (IS_ERR(dbg_dir)) return -EIO; nbd_dbg_dir = dbg_dir; return 0; } static void nbd_dbg_close(void) { debugfs_remove_recursive(nbd_dbg_dir); } #else /* IS_ENABLED(CONFIG_DEBUG_FS) */ static int nbd_dev_dbg_init(struct nbd_device *nbd) { return 0; } static void nbd_dev_dbg_close(struct nbd_device *nbd) { } static int nbd_dbg_init(void) { return 0; } static void nbd_dbg_close(void) { } #endif static int nbd_init_request(struct blk_mq_tag_set *set, struct request *rq, unsigned int hctx_idx, unsigned int numa_node) { struct nbd_cmd *cmd = blk_mq_rq_to_pdu(rq); cmd->nbd = set->driver_data; cmd->flags = 0; mutex_init(&cmd->lock); return 0; } static const struct blk_mq_ops nbd_mq_ops = { .queue_rq = nbd_queue_rq, .complete = nbd_complete_rq, .init_request = nbd_init_request, .timeout = nbd_xmit_timeout, }; static struct nbd_device *nbd_dev_add(int index, unsigned int refs) { struct queue_limits lim = { .max_hw_sectors = 65536, .io_opt = 256 << SECTOR_SHIFT, .max_segments = USHRT_MAX, .max_segment_size = UINT_MAX, }; struct nbd_device *nbd; struct gendisk *disk; int err = -ENOMEM; nbd = kzalloc(sizeof(struct nbd_device), GFP_KERNEL); if (!nbd) goto out; nbd->tag_set.ops = &nbd_mq_ops; nbd->tag_set.nr_hw_queues = 1; nbd->tag_set.queue_depth = 128; nbd->tag_set.numa_node = NUMA_NO_NODE; nbd->tag_set.cmd_size = sizeof(struct nbd_cmd); nbd->tag_set.flags = BLK_MQ_F_BLOCKING; nbd->tag_set.driver_data = nbd; INIT_WORK(&nbd->remove_work, nbd_dev_remove_work); nbd->backend = NULL; err = blk_mq_alloc_tag_set(&nbd->tag_set); if (err) goto out_free_nbd; mutex_lock(&nbd_index_mutex); if (index >= 0) { err = idr_alloc(&nbd_index_idr, nbd, index, index + 1, GFP_KERNEL); if (err == -ENOSPC) err = -EEXIST; } else { err = idr_alloc(&nbd_index_idr, nbd, 0, (MINORMASK >> part_shift) + 1, GFP_KERNEL); if (err >= 0) index = err; } nbd->index = index; mutex_unlock(&nbd_index_mutex); if (err < 0) goto out_free_tags; disk = blk_mq_alloc_disk(&nbd->tag_set, &lim, NULL); if (IS_ERR(disk)) { err = PTR_ERR(disk); goto out_free_idr; } nbd->disk = disk; nbd->recv_workq = alloc_workqueue("nbd%d-recv", WQ_MEM_RECLAIM | WQ_HIGHPRI | WQ_UNBOUND, 0, nbd->index); if (!nbd->recv_workq) { dev_err(disk_to_dev(nbd->disk), "Could not allocate knbd recv work queue.\n"); err = -ENOMEM; goto out_err_disk; } mutex_init(&nbd->config_lock); refcount_set(&nbd->config_refs, 0); /* * Start out with a zero references to keep other threads from using * this device until it is fully initialized. 
refcount_set(&nbd->refs, 0); INIT_LIST_HEAD(&nbd->list); disk->major = NBD_MAJOR; disk->first_minor = index << part_shift; disk->minors = 1 << part_shift; disk->fops = &nbd_fops; disk->private_data = nbd; sprintf(disk->disk_name, "nbd%d", index); err = add_disk(disk); if (err) goto out_free_work; /* * Now publish the device. */ refcount_set(&nbd->refs, refs); nbd_total_devices++; return nbd;
out_free_work: destroy_workqueue(nbd->recv_workq); out_err_disk: put_disk(disk); out_free_idr: mutex_lock(&nbd_index_mutex); idr_remove(&nbd_index_idr, index); mutex_unlock(&nbd_index_mutex); out_free_tags: blk_mq_free_tag_set(&nbd->tag_set); out_free_nbd: kfree(nbd); out: return ERR_PTR(err); }
static struct nbd_device *nbd_find_get_unused(void) { struct nbd_device *nbd; int id; lockdep_assert_held(&nbd_index_mutex); idr_for_each_entry(&nbd_index_idr, nbd, id) { if (refcount_read(&nbd->config_refs) || test_bit(NBD_DESTROY_ON_DISCONNECT, &nbd->flags)) continue; if (refcount_inc_not_zero(&nbd->refs)) return nbd; } return NULL; }
/* Netlink interface. */ static const struct nla_policy nbd_attr_policy[NBD_ATTR_MAX + 1] = { [NBD_ATTR_INDEX] = { .type = NLA_U32 }, [NBD_ATTR_SIZE_BYTES] = { .type = NLA_U64 }, [NBD_ATTR_BLOCK_SIZE_BYTES] = { .type = NLA_U64 }, [NBD_ATTR_TIMEOUT] = { .type = NLA_U64 }, [NBD_ATTR_SERVER_FLAGS] = { .type = NLA_U64 }, [NBD_ATTR_CLIENT_FLAGS] = { .type = NLA_U64 }, [NBD_ATTR_SOCKETS] = { .type = NLA_NESTED}, [NBD_ATTR_DEAD_CONN_TIMEOUT] = { .type = NLA_U64 }, [NBD_ATTR_DEVICE_LIST] = { .type = NLA_NESTED}, [NBD_ATTR_BACKEND_IDENTIFIER] = { .type = NLA_STRING}, };
static const struct nla_policy nbd_sock_policy[NBD_SOCK_MAX + 1] = { [NBD_SOCK_FD] = { .type = NLA_U32 }, };
/* We don't use this right now since we don't parse the incoming list, but we * still want it here so userspace knows what to expect. */ static const struct nla_policy __attribute__((unused)) nbd_device_policy[NBD_DEVICE_ATTR_MAX + 1] = { [NBD_DEVICE_INDEX] = { .type = NLA_U32 }, [NBD_DEVICE_CONNECTED] = { .type = NLA_U8 }, };
static int nbd_genl_size_set(struct genl_info *info, struct nbd_device *nbd) { struct nbd_config *config = nbd->config; u64 bsize = nbd_blksize(config); u64 bytes = config->bytesize; if (info->attrs[NBD_ATTR_SIZE_BYTES]) bytes = nla_get_u64(info->attrs[NBD_ATTR_SIZE_BYTES]); if (info->attrs[NBD_ATTR_BLOCK_SIZE_BYTES]) bsize = nla_get_u64(info->attrs[NBD_ATTR_BLOCK_SIZE_BYTES]); if (bytes != config->bytesize || bsize != nbd_blksize(config)) return nbd_set_size(nbd, bytes, bsize); return 0; }
static int nbd_genl_connect(struct sk_buff *skb, struct genl_info *info) { struct nbd_device *nbd; struct nbd_config *config; int index = -1; int ret; bool put_dev = false; if (!netlink_capable(skb, CAP_SYS_ADMIN)) return -EPERM; if (info->attrs[NBD_ATTR_INDEX]) { index = nla_get_u32(info->attrs[NBD_ATTR_INDEX]); /* * A too-large first_minor can cause duplicate creation of * sysfs files/links, since index << part_shift might overflow, and * MKDEV() expects first_minor to fit in at most 20 bits. */
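/*
 * Worked example (illustrative): with the default max_part = 16,
 * part_shift becomes fls(16) = 5, so the largest index accepted below
 * is MINORMASK >> 5 = 32767.
 */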
*/ if (index < 0 || index > MINORMASK >> part_shift) { pr_err("illegal input index %d\n", index); return -EINVAL; } } if (GENL_REQ_ATTR_CHECK(info, NBD_ATTR_SOCKETS)) { pr_err("must specify at least one socket\n"); return -EINVAL; } if (GENL_REQ_ATTR_CHECK(info, NBD_ATTR_SIZE_BYTES)) { pr_err("must specify a size in bytes for the device\n"); return -EINVAL; } again: mutex_lock(&nbd_index_mutex); if (index == -1) { nbd = nbd_find_get_unused(); } else { nbd = idr_find(&nbd_index_idr, index); if (nbd) { if ((test_bit(NBD_DESTROY_ON_DISCONNECT, &nbd->flags) && test_bit(NBD_DISCONNECT_REQUESTED, &nbd->flags)) || !refcount_inc_not_zero(&nbd->refs)) { mutex_unlock(&nbd_index_mutex); pr_err("device at index %d is going down\n", index); return -EINVAL; } } } mutex_unlock(&nbd_index_mutex); if (!nbd) { nbd = nbd_dev_add(index, 2); if (IS_ERR(nbd)) { pr_err("failed to add new device\n"); return PTR_ERR(nbd); } } mutex_lock(&nbd->config_lock); if (refcount_read(&nbd->config_refs)) { mutex_unlock(&nbd->config_lock); nbd_put(nbd); if (index == -1) goto again; pr_err("nbd%d already in use\n", index); return -EBUSY; } ret = nbd_alloc_and_init_config(nbd); if (ret) { mutex_unlock(&nbd->config_lock); nbd_put(nbd); pr_err("couldn't allocate config\n"); return ret; } config = nbd->config; set_bit(NBD_RT_BOUND, &config->runtime_flags); ret = nbd_genl_size_set(info, nbd); if (ret) goto out; if (info->attrs[NBD_ATTR_TIMEOUT]) nbd_set_cmd_timeout(nbd, nla_get_u64(info->attrs[NBD_ATTR_TIMEOUT])); if (info->attrs[NBD_ATTR_DEAD_CONN_TIMEOUT]) { config->dead_conn_timeout = nla_get_u64(info->attrs[NBD_ATTR_DEAD_CONN_TIMEOUT]); config->dead_conn_timeout *= HZ; } if (info->attrs[NBD_ATTR_SERVER_FLAGS]) config->flags = nla_get_u64(info->attrs[NBD_ATTR_SERVER_FLAGS]); if (info->attrs[NBD_ATTR_CLIENT_FLAGS]) { u64 flags = nla_get_u64(info->attrs[NBD_ATTR_CLIENT_FLAGS]); if (flags & NBD_CFLAG_DESTROY_ON_DISCONNECT) { /* * We have 1 ref to keep the device around, and then 1 * ref for our current operation here, which will be * inherited by the config. If we already have * DESTROY_ON_DISCONNECT set then we know we don't have * that extra ref already held so we don't need the * put_dev. 
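 * (Roughly: when the flag is newly set below, the long-lived device
 * ref is dropped at the end via put_dev, leaving the config-held ref
 * as the only thing pinning the device.)
 */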
*/ if (!test_and_set_bit(NBD_DESTROY_ON_DISCONNECT, &nbd->flags)) put_dev = true; } else { if (test_and_clear_bit(NBD_DESTROY_ON_DISCONNECT, &nbd->flags)) refcount_inc(&nbd->refs); } if (flags & NBD_CFLAG_DISCONNECT_ON_CLOSE) { set_bit(NBD_RT_DISCONNECT_ON_CLOSE, &config->runtime_flags); } } if (info->attrs[NBD_ATTR_SOCKETS]) { struct nlattr *attr; int rem, fd; nla_for_each_nested(attr, info->attrs[NBD_ATTR_SOCKETS], rem) { struct nlattr *socks[NBD_SOCK_MAX+1]; if (nla_type(attr) != NBD_SOCK_ITEM) { pr_err("socks must be embedded in a SOCK_ITEM attr\n"); ret = -EINVAL; goto out; } ret = nla_parse_nested_deprecated(socks, NBD_SOCK_MAX, attr, nbd_sock_policy, info->extack); if (ret != 0) { pr_err("error processing sock list\n"); ret = -EINVAL; goto out; } if (!socks[NBD_SOCK_FD]) continue; fd = (int)nla_get_u32(socks[NBD_SOCK_FD]); ret = nbd_add_socket(nbd, fd, true); if (ret) goto out; } } if (info->attrs[NBD_ATTR_BACKEND_IDENTIFIER]) { nbd->backend = nla_strdup(info->attrs[NBD_ATTR_BACKEND_IDENTIFIER], GFP_KERNEL); if (!nbd->backend) { ret = -ENOMEM; goto out; } } ret = device_create_file(disk_to_dev(nbd->disk), &backend_attr); if (ret) { dev_err(disk_to_dev(nbd->disk), "device_create_file failed for backend!\n"); goto out; } set_bit(NBD_RT_HAS_BACKEND_FILE, &config->runtime_flags); ret = nbd_start_device(nbd); out: mutex_unlock(&nbd->config_lock); if (!ret) { set_bit(NBD_RT_HAS_CONFIG_REF, &config->runtime_flags); refcount_inc(&nbd->config_refs); nbd_connect_reply(info, nbd->index); } nbd_config_put(nbd); if (put_dev) nbd_put(nbd); return ret; } static void nbd_disconnect_and_put(struct nbd_device *nbd) { mutex_lock(&nbd->config_lock); nbd_disconnect(nbd); sock_shutdown(nbd); wake_up(&nbd->config->conn_wait); /* * Make sure recv thread has finished, we can safely call nbd_clear_que() * to cancel the inflight I/Os. 
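 * (After the flush no recv_work() instance can still be completing
 * requests, so nbd_clear_que() below sees their final state.)
 */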
*/ flush_workqueue(nbd->recv_workq); nbd_clear_que(nbd); nbd->task_setup = NULL; clear_bit(NBD_RT_BOUND, &nbd->config->runtime_flags); mutex_unlock(&nbd->config_lock); if (test_and_clear_bit(NBD_RT_HAS_CONFIG_REF, &nbd->config->runtime_flags)) nbd_config_put(nbd); } static int nbd_genl_disconnect(struct sk_buff *skb, struct genl_info *info) { struct nbd_device *nbd; int index; if (!netlink_capable(skb, CAP_SYS_ADMIN)) return -EPERM; if (GENL_REQ_ATTR_CHECK(info, NBD_ATTR_INDEX)) { pr_err("must specify an index to disconnect\n"); return -EINVAL; } index = nla_get_u32(info->attrs[NBD_ATTR_INDEX]); mutex_lock(&nbd_index_mutex); nbd = idr_find(&nbd_index_idr, index); if (!nbd) { mutex_unlock(&nbd_index_mutex); pr_err("couldn't find device at index %d\n", index); return -EINVAL; } if (!refcount_inc_not_zero(&nbd->refs)) { mutex_unlock(&nbd_index_mutex); pr_err("device at index %d is going down\n", index); return -EINVAL; } mutex_unlock(&nbd_index_mutex); if (!refcount_inc_not_zero(&nbd->config_refs)) goto put_nbd; nbd_disconnect_and_put(nbd); nbd_config_put(nbd); put_nbd: nbd_put(nbd); return 0; } static int nbd_genl_reconfigure(struct sk_buff *skb, struct genl_info *info) { struct nbd_device *nbd = NULL; struct nbd_config *config; int index; int ret = 0; bool put_dev = false; if (!netlink_capable(skb, CAP_SYS_ADMIN)) return -EPERM; if (GENL_REQ_ATTR_CHECK(info, NBD_ATTR_INDEX)) { pr_err("must specify a device to reconfigure\n"); return -EINVAL; } index = nla_get_u32(info->attrs[NBD_ATTR_INDEX]); mutex_lock(&nbd_index_mutex); nbd = idr_find(&nbd_index_idr, index); if (!nbd) { mutex_unlock(&nbd_index_mutex); pr_err("couldn't find a device at index %d\n", index); return -EINVAL; } if (nbd->backend) { if (info->attrs[NBD_ATTR_BACKEND_IDENTIFIER]) { if (nla_strcmp(info->attrs[NBD_ATTR_BACKEND_IDENTIFIER], nbd->backend)) { mutex_unlock(&nbd_index_mutex); dev_err(nbd_to_dev(nbd), "backend image doesn't match with %s\n", nbd->backend); return -EINVAL; } } else { mutex_unlock(&nbd_index_mutex); dev_err(nbd_to_dev(nbd), "must specify backend\n"); return -EINVAL; } } if (!refcount_inc_not_zero(&nbd->refs)) { mutex_unlock(&nbd_index_mutex); pr_err("device at index %d is going down\n", index); return -EINVAL; } mutex_unlock(&nbd_index_mutex); config = nbd_get_config_unlocked(nbd); if (!config) { dev_err(nbd_to_dev(nbd), "not configured, cannot reconfigure\n"); nbd_put(nbd); return -EINVAL; } mutex_lock(&nbd->config_lock); if (!test_bit(NBD_RT_BOUND, &config->runtime_flags) || !nbd->pid) { dev_err(nbd_to_dev(nbd), "not configured, cannot reconfigure\n"); ret = -EINVAL; goto out; } ret = nbd_genl_size_set(info, nbd); if (ret) goto out; if (info->attrs[NBD_ATTR_TIMEOUT]) nbd_set_cmd_timeout(nbd, nla_get_u64(info->attrs[NBD_ATTR_TIMEOUT])); if (info->attrs[NBD_ATTR_DEAD_CONN_TIMEOUT]) { config->dead_conn_timeout = nla_get_u64(info->attrs[NBD_ATTR_DEAD_CONN_TIMEOUT]); config->dead_conn_timeout *= HZ; } if (info->attrs[NBD_ATTR_CLIENT_FLAGS]) { u64 flags = nla_get_u64(info->attrs[NBD_ATTR_CLIENT_FLAGS]); if (flags & NBD_CFLAG_DESTROY_ON_DISCONNECT) { if (!test_and_set_bit(NBD_DESTROY_ON_DISCONNECT, &nbd->flags)) put_dev = true; } else { if (test_and_clear_bit(NBD_DESTROY_ON_DISCONNECT, &nbd->flags)) refcount_inc(&nbd->refs); } if (flags & NBD_CFLAG_DISCONNECT_ON_CLOSE) { set_bit(NBD_RT_DISCONNECT_ON_CLOSE, &config->runtime_flags); } else { clear_bit(NBD_RT_DISCONNECT_ON_CLOSE, &config->runtime_flags); } } if (info->attrs[NBD_ATTR_SOCKETS]) { struct nlattr *attr; int rem, fd; nla_for_each_nested(attr, 
info->attrs[NBD_ATTR_SOCKETS], rem) { struct nlattr *socks[NBD_SOCK_MAX+1]; if (nla_type(attr) != NBD_SOCK_ITEM) { pr_err("socks must be embedded in a SOCK_ITEM attr\n"); ret = -EINVAL; goto out; } ret = nla_parse_nested_deprecated(socks, NBD_SOCK_MAX, attr, nbd_sock_policy, info->extack); if (ret != 0) { pr_err("error processing sock list\n"); ret = -EINVAL; goto out; } if (!socks[NBD_SOCK_FD]) continue; fd = (int)nla_get_u32(socks[NBD_SOCK_FD]); ret = nbd_reconnect_socket(nbd, fd); if (ret) { if (ret == -ENOSPC) ret = 0; goto out; } dev_info(nbd_to_dev(nbd), "reconnected socket\n"); } } out: mutex_unlock(&nbd->config_lock); nbd_config_put(nbd); nbd_put(nbd); if (put_dev) nbd_put(nbd); return ret; }
static const struct genl_small_ops nbd_connect_genl_ops[] = { { .cmd = NBD_CMD_CONNECT, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nbd_genl_connect, }, { .cmd = NBD_CMD_DISCONNECT, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nbd_genl_disconnect, }, { .cmd = NBD_CMD_RECONFIGURE, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nbd_genl_reconfigure, }, { .cmd = NBD_CMD_STATUS, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nbd_genl_status, }, };
static const struct genl_multicast_group nbd_mcast_grps[] = { { .name = NBD_GENL_MCAST_GROUP_NAME, }, };
static struct genl_family nbd_genl_family __ro_after_init = { .hdrsize = 0, .name = NBD_GENL_FAMILY_NAME, .version = NBD_GENL_VERSION, .module = THIS_MODULE, .small_ops = nbd_connect_genl_ops, .n_small_ops = ARRAY_SIZE(nbd_connect_genl_ops), .resv_start_op = NBD_CMD_STATUS + 1, .maxattr = NBD_ATTR_MAX, .netnsok = 1, .policy = nbd_attr_policy, .mcgrps = nbd_mcast_grps, .n_mcgrps = ARRAY_SIZE(nbd_mcast_grps), }; MODULE_ALIAS_GENL_FAMILY(NBD_GENL_FAMILY_NAME);
static int populate_nbd_status(struct nbd_device *nbd, struct sk_buff *reply) { struct nlattr *dev_opt; u8 connected = 0; int ret; /* This is a little racy, but for status it's ok. The * reason we don't take a ref here is that we can't * take a ref in the index == -1 case as we would need * to put under the nbd_index_mutex, which could * deadlock if we are configured to remove ourselves * once we're disconnected. */ if (refcount_read(&nbd->config_refs)) connected = 1; dev_opt = nla_nest_start_noflag(reply, NBD_DEVICE_ITEM); if (!dev_opt) return -EMSGSIZE; ret = nla_put_u32(reply, NBD_DEVICE_INDEX, nbd->index); if (ret) return -EMSGSIZE; ret = nla_put_u8(reply, NBD_DEVICE_CONNECTED, connected); if (ret) return -EMSGSIZE; nla_nest_end(reply, dev_opt); return 0; }
static int status_cb(int id, void *ptr, void *data) { struct nbd_device *nbd = ptr; return populate_nbd_status(nbd, (struct sk_buff *)data); }
static int nbd_genl_status(struct sk_buff *skb, struct genl_info *info) { struct nlattr *dev_list; struct sk_buff *reply; void *reply_head; size_t msg_size; int index = -1; int ret = -ENOMEM; if (info->attrs[NBD_ATTR_INDEX]) index = nla_get_u32(info->attrs[NBD_ATTR_INDEX]); mutex_lock(&nbd_index_mutex); msg_size = nla_total_size(nla_attr_size(sizeof(u32)) + nla_attr_size(sizeof(u8))); msg_size *= (index == -1) ? 
nbd_total_devices : 1; reply = genlmsg_new(msg_size, GFP_KERNEL); if (!reply) goto out; reply_head = genlmsg_put_reply(reply, info, &nbd_genl_family, 0, NBD_CMD_STATUS); if (!reply_head) { nlmsg_free(reply); goto out; } dev_list = nla_nest_start_noflag(reply, NBD_ATTR_DEVICE_LIST); if (!dev_list) { nlmsg_free(reply); ret = -EMSGSIZE; goto out; } if (index == -1) { ret = idr_for_each(&nbd_index_idr, &status_cb, reply); if (ret) { nlmsg_free(reply); goto out; } } else { struct nbd_device *nbd; nbd = idr_find(&nbd_index_idr, index); if (nbd) { ret = populate_nbd_status(nbd, reply); if (ret) { nlmsg_free(reply); goto out; } } } nla_nest_end(reply, dev_list); genlmsg_end(reply, reply_head); ret = genlmsg_reply(reply, info); out: mutex_unlock(&nbd_index_mutex); return ret; } static void nbd_connect_reply(struct genl_info *info, int index) { struct sk_buff *skb; void *msg_head; int ret; skb = genlmsg_new(nla_total_size(sizeof(u32)), GFP_KERNEL); if (!skb) return; msg_head = genlmsg_put_reply(skb, info, &nbd_genl_family, 0, NBD_CMD_CONNECT); if (!msg_head) { nlmsg_free(skb); return; } ret = nla_put_u32(skb, NBD_ATTR_INDEX, index); if (ret) { nlmsg_free(skb); return; } genlmsg_end(skb, msg_head); genlmsg_reply(skb, info); } static void nbd_mcast_index(int index) { struct sk_buff *skb; void *msg_head; int ret; skb = genlmsg_new(nla_total_size(sizeof(u32)), GFP_KERNEL); if (!skb) return; msg_head = genlmsg_put(skb, 0, 0, &nbd_genl_family, 0, NBD_CMD_LINK_DEAD); if (!msg_head) { nlmsg_free(skb); return; } ret = nla_put_u32(skb, NBD_ATTR_INDEX, index); if (ret) { nlmsg_free(skb); return; } genlmsg_end(skb, msg_head); genlmsg_multicast(&nbd_genl_family, skb, 0, 0, GFP_KERNEL); } static void nbd_dead_link_work(struct work_struct *work) { struct link_dead_args *args = container_of(work, struct link_dead_args, work); nbd_mcast_index(args->index); kfree(args); } static int __init nbd_init(void) { int i; BUILD_BUG_ON(sizeof(struct nbd_request) != 28); if (max_part < 0) { pr_err("max_part must be >= 0\n"); return -EINVAL; } part_shift = 0; if (max_part > 0) { part_shift = fls(max_part); /* * Adjust max_part according to part_shift as it is exported * to user space so that user can know the max number of * partition kernel should be able to manage. * * Note that -1 is required because partition 0 is reserved * for the whole disk. */ max_part = (1UL << part_shift) - 1; } if ((1UL << part_shift) > DISK_MAX_PARTS) return -EINVAL; if (nbds_max > 1UL << (MINORBITS - part_shift)) return -EINVAL; if (register_blkdev(NBD_MAJOR, "nbd")) return -EIO; nbd_del_wq = alloc_workqueue("nbd-del", WQ_UNBOUND, 0); if (!nbd_del_wq) { unregister_blkdev(NBD_MAJOR, "nbd"); return -ENOMEM; } if (genl_register_family(&nbd_genl_family)) { destroy_workqueue(nbd_del_wq); unregister_blkdev(NBD_MAJOR, "nbd"); return -EINVAL; } nbd_dbg_init(); for (i = 0; i < nbds_max; i++) nbd_dev_add(i, 1); return 0; } static int nbd_exit_cb(int id, void *ptr, void *data) { struct list_head *list = (struct list_head *)data; struct nbd_device *nbd = ptr; /* Skip nbd that is being removed asynchronously */ if (refcount_read(&nbd->refs)) list_add_tail(&nbd->list, list); return 0; } static void __exit nbd_cleanup(void) { struct nbd_device *nbd; LIST_HEAD(del_list); /* * Unregister netlink interface prior to waiting * for the completion of netlink commands. 
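 * (Once the family is unregistered, no new netlink command can grab a
 * ref on a device that this exit path is about to tear down.)
 */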
*/ genl_unregister_family(&nbd_genl_family); nbd_dbg_close(); mutex_lock(&nbd_index_mutex); idr_for_each(&nbd_index_idr, &nbd_exit_cb, &del_list); mutex_unlock(&nbd_index_mutex); while (!list_empty(&del_list)) { nbd = list_first_entry(&del_list, struct nbd_device, list); list_del_init(&nbd->list); if (refcount_read(&nbd->config_refs)) pr_err("possibly leaking nbd_config (ref %d)\n", refcount_read(&nbd->config_refs)); if (refcount_read(&nbd->refs) != 1) pr_err("possibly leaking a device\n"); nbd_put(nbd); } /* Also wait for nbd_dev_remove_work() completes */ destroy_workqueue(nbd_del_wq); idr_destroy(&nbd_index_idr); unregister_blkdev(NBD_MAJOR, "nbd"); } module_init(nbd_init); module_exit(nbd_cleanup); MODULE_DESCRIPTION("Network Block Device"); MODULE_LICENSE("GPL"); module_param(nbds_max, int, 0444); MODULE_PARM_DESC(nbds_max, "number of network block devices to initialize (default: 16)"); module_param(max_part, int, 0444); MODULE_PARM_DESC(max_part, "number of partitions per device (default: 16)");
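/*
 * Illustrative userspace setup for the ioctl interface above (a sketch,
 * not part of the driver; error handling omitted, and "sock" is assumed
 * to be a connected, already-negotiated socket fd to an NBD server):
 *
 *	int nbd = open("/dev/nbd0", O_RDWR);
 *	ioctl(nbd, NBD_SET_BLKSIZE, 4096UL);
 *	ioctl(nbd, NBD_SET_SIZE_BLOCKS, nr_blocks);
 *	ioctl(nbd, NBD_SET_SOCK, (unsigned long)sock);
 *	ioctl(nbd, NBD_DO_IT);		blocks until disconnect or timeout
 *	ioctl(nbd, NBD_CLEAR_SOCK);
 */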
/* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */ /* * Copyright (c) 2005 Henk Vergonet <Henk.Vergonet@gmail.com> */ #ifndef MAP_TO_7SEGMENT_H #define MAP_TO_7SEGMENT_H
/* This file provides translation primitives and tables for the conversion * of (ASCII) characters to a 7-segments notation. * * The 7 segment's wikipedia notation below is used as standard. * See: https://en.wikipedia.org/wiki/Seven_segment_display * * Notation: +-a-+ * f b * +-g-+ * e c * +-d-+ * * Usage: * * Register a map variable, and fill it with a character set: * static SEG7_DEFAULT_MAP(map_seg7); * * * Then use for conversion: * seg7 = map_to_seg7(&map_seg7, some_char); * ... * * In device drivers it is recommended, if required, to make the char map * accessible via the sysfs interface using the following scheme: * * static ssize_t map_seg7_show(struct device *dev, * struct device_attribute *attr, char *buf) * { * memcpy(buf, &map_seg7, sizeof(map_seg7)); * return sizeof(map_seg7); * } * static ssize_t map_seg7_store(struct device *dev, * struct device_attribute *attr, const char *buf, * size_t cnt) * { * if(cnt != sizeof(map_seg7)) * return -EINVAL; * memcpy(&map_seg7, buf, cnt); * return cnt; * } * static DEVICE_ATTR_RW(map_seg7); * * History: * 2005-05-31 RFC linux-kernel@vger.kernel.org */ #include <linux/errno.h>
#define BIT_SEG7_A 0 #define BIT_SEG7_B 1 #define BIT_SEG7_C 2 #define BIT_SEG7_D 3 #define BIT_SEG7_E 4 #define BIT_SEG7_F 5 #define BIT_SEG7_G 6 #define BIT_SEG7_RESERVED 7
struct seg7_conversion_map { unsigned char table[128]; };
static __inline__ int map_to_seg7(struct seg7_conversion_map *map, int c) { return c >= 0 && c < sizeof(map->table) ? map->table[c] : -EINVAL; }
#define SEG7_CONVERSION_MAP(_name, _map) \ struct seg7_conversion_map _name = { .table = { _map } }
/* * It is recommended to use a facility that allows user space to redefine * custom character sets for LCD devices. Please use a sysfs interface * as described above. 
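 *
 * From userspace the whole 128-byte table can then be replaced in one
 * write (illustrative; the actual sysfs path depends on the driver that
 * exposes the attribute):
 *
 *	dd if=my_charset.bin of=/sys/class/.../map_seg7 bs=128 count=1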
*/ #define MAP_TO_SEG7_SYSFS_FILE "map_seg7" /******************************************************************************* * ASCII conversion table ******************************************************************************/ #define _SEG7(l,a,b,c,d,e,f,g) \ ( a<<BIT_SEG7_A | b<<BIT_SEG7_B | c<<BIT_SEG7_C | d<<BIT_SEG7_D | \ e<<BIT_SEG7_E | f<<BIT_SEG7_F | g<<BIT_SEG7_G ) #define _MAP_0_32_ASCII_SEG7_NON_PRINTABLE \ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, #define _MAP_33_47_ASCII_SEG7_SYMBOL \ _SEG7('!',0,0,0,0,1,1,0), _SEG7('"',0,1,0,0,0,1,0), _SEG7('#',0,1,1,0,1,1,0),\ _SEG7('$',1,0,1,1,0,1,1), _SEG7('%',0,0,1,0,0,1,0), _SEG7('&',1,0,1,1,1,1,1),\ _SEG7('\'',0,0,0,0,0,1,0),_SEG7('(',1,0,0,1,1,1,0), _SEG7(')',1,1,1,1,0,0,0),\ _SEG7('*',0,1,1,0,1,1,1), _SEG7('+',0,1,1,0,0,0,1), _SEG7(',',0,0,0,0,1,0,0),\ _SEG7('-',0,0,0,0,0,0,1), _SEG7('.',0,0,0,0,1,0,0), _SEG7('/',0,1,0,0,1,0,1), #define _MAP_48_57_ASCII_SEG7_NUMERIC \ _SEG7('0',1,1,1,1,1,1,0), _SEG7('1',0,1,1,0,0,0,0), _SEG7('2',1,1,0,1,1,0,1),\ _SEG7('3',1,1,1,1,0,0,1), _SEG7('4',0,1,1,0,0,1,1), _SEG7('5',1,0,1,1,0,1,1),\ _SEG7('6',1,0,1,1,1,1,1), _SEG7('7',1,1,1,0,0,0,0), _SEG7('8',1,1,1,1,1,1,1),\ _SEG7('9',1,1,1,1,0,1,1), #define _MAP_58_64_ASCII_SEG7_SYMBOL \ _SEG7(':',0,0,0,1,0,0,1), _SEG7(';',0,0,0,1,0,0,1), _SEG7('<',1,0,0,0,0,1,1),\ _SEG7('=',0,0,0,1,0,0,1), _SEG7('>',1,1,0,0,0,0,1), _SEG7('?',1,1,1,0,0,1,0),\ _SEG7('@',1,1,0,1,1,1,1), #define _MAP_65_90_ASCII_SEG7_ALPHA_UPPR \ _SEG7('A',1,1,1,0,1,1,1), _SEG7('B',1,1,1,1,1,1,1), _SEG7('C',1,0,0,1,1,1,0),\ _SEG7('D',1,1,1,1,1,1,0), _SEG7('E',1,0,0,1,1,1,1), _SEG7('F',1,0,0,0,1,1,1),\ _SEG7('G',1,1,1,1,0,1,1), _SEG7('H',0,1,1,0,1,1,1), _SEG7('I',0,1,1,0,0,0,0),\ _SEG7('J',0,1,1,1,0,0,0), _SEG7('K',0,1,1,0,1,1,1), _SEG7('L',0,0,0,1,1,1,0),\ _SEG7('M',1,1,1,0,1,1,0), _SEG7('N',1,1,1,0,1,1,0), _SEG7('O',1,1,1,1,1,1,0),\ _SEG7('P',1,1,0,0,1,1,1), _SEG7('Q',1,1,1,1,1,1,0), _SEG7('R',1,1,1,0,1,1,1),\ _SEG7('S',1,0,1,1,0,1,1), _SEG7('T',0,0,0,1,1,1,1), _SEG7('U',0,1,1,1,1,1,0),\ _SEG7('V',0,1,1,1,1,1,0), _SEG7('W',0,1,1,1,1,1,1), _SEG7('X',0,1,1,0,1,1,1),\ _SEG7('Y',0,1,1,0,0,1,1), _SEG7('Z',1,1,0,1,1,0,1), #define _MAP_91_96_ASCII_SEG7_SYMBOL \ _SEG7('[',1,0,0,1,1,1,0), _SEG7('\\',0,0,1,0,0,1,1),_SEG7(']',1,1,1,1,0,0,0),\ _SEG7('^',1,1,0,0,0,1,0), _SEG7('_',0,0,0,1,0,0,0), _SEG7('`',0,1,0,0,0,0,0), #define _MAP_97_122_ASCII_SEG7_ALPHA_LOWER \ _SEG7('A',1,1,1,0,1,1,1), _SEG7('b',0,0,1,1,1,1,1), _SEG7('c',0,0,0,1,1,0,1),\ _SEG7('d',0,1,1,1,1,0,1), _SEG7('E',1,0,0,1,1,1,1), _SEG7('F',1,0,0,0,1,1,1),\ _SEG7('G',1,1,1,1,0,1,1), _SEG7('h',0,0,1,0,1,1,1), _SEG7('i',0,0,1,0,0,0,0),\ _SEG7('j',0,0,1,1,0,0,0), _SEG7('k',0,0,1,0,1,1,1), _SEG7('L',0,0,0,1,1,1,0),\ _SEG7('M',1,1,1,0,1,1,0), _SEG7('n',0,0,1,0,1,0,1), _SEG7('o',0,0,1,1,1,0,1),\ _SEG7('P',1,1,0,0,1,1,1), _SEG7('q',1,1,1,0,0,1,1), _SEG7('r',0,0,0,0,1,0,1),\ _SEG7('S',1,0,1,1,0,1,1), _SEG7('T',0,0,0,1,1,1,1), _SEG7('u',0,0,1,1,1,0,0),\ _SEG7('v',0,0,1,1,1,0,0), _SEG7('W',0,1,1,1,1,1,1), _SEG7('X',0,1,1,0,1,1,1),\ _SEG7('y',0,1,1,1,0,1,1), _SEG7('Z',1,1,0,1,1,0,1), #define _MAP_123_126_ASCII_SEG7_SYMBOL \ _SEG7('{',1,0,0,1,1,1,0), _SEG7('|',0,0,0,0,1,1,0), _SEG7('}',1,1,1,1,0,0,0),\ _SEG7('~',1,0,0,0,0,0,0), /* Maps */ /* This set tries to map as close as possible to the visible characteristics * of the ASCII symbol, lowercase and uppercase letters may differ in * presentation on the display. 
*/ #define MAP_ASCII7SEG_ALPHANUM \ _MAP_0_32_ASCII_SEG7_NON_PRINTABLE \ _MAP_33_47_ASCII_SEG7_SYMBOL \ _MAP_48_57_ASCII_SEG7_NUMERIC \ _MAP_58_64_ASCII_SEG7_SYMBOL \ _MAP_65_90_ASCII_SEG7_ALPHA_UPPR \ _MAP_91_96_ASCII_SEG7_SYMBOL \ _MAP_97_122_ASCII_SEG7_ALPHA_LOWER \ _MAP_123_126_ASCII_SEG7_SYMBOL /* This set tries to map as close as possible to the symbolic characteristics * of the ASCII character for maximum discrimination. * For now this means all alpha chars are in lower case representations. * (This for example facilitates the use of hex numbers with uppercase input.) */ #define MAP_ASCII7SEG_ALPHANUM_LC \ _MAP_0_32_ASCII_SEG7_NON_PRINTABLE \ _MAP_33_47_ASCII_SEG7_SYMBOL \ _MAP_48_57_ASCII_SEG7_NUMERIC \ _MAP_58_64_ASCII_SEG7_SYMBOL \ _MAP_97_122_ASCII_SEG7_ALPHA_LOWER \ _MAP_91_96_ASCII_SEG7_SYMBOL \ _MAP_97_122_ASCII_SEG7_ALPHA_LOWER \ _MAP_123_126_ASCII_SEG7_SYMBOL #define SEG7_DEFAULT_MAP(_name) \ SEG7_CONVERSION_MAP(_name,MAP_ASCII7SEG_ALPHANUM) #endif /* MAP_TO_7SEGMENT_H */
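/*
 * A minimal usage sketch assembled only from the primitives documented in
 * this header (SEG7_DEFAULT_MAP() and map_to_seg7()); demo_render_char()
 * and the display-register write are hypothetical placeholders for a real
 * driver's output path.
 */
#include <linux/map_to_7segment.h>

static SEG7_DEFAULT_MAP(map_seg7);

static int demo_render_char(char c)
{
	int seg = map_to_seg7(&map_seg7, c);

	if (seg < 0)
		return seg;	/* -EINVAL for characters outside the table */
	/*
	 * 'seg' now holds the a..g bit pattern (BIT_SEG7_A..BIT_SEG7_G);
	 * a real driver would write it to its display register here.
	 */
	return 0;
}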
// SPDX-License-Identifier: GPL-2.0+ /* * Restartable sequences system call * * Copyright (C) 2015, Google, Inc., * Paul Turner <pjt@google.com> and Andrew Hunter <ahh@google.com> * Copyright (C) 2015-2018, EfficiOS Inc., * Mathieu Desnoyers <mathieu.desnoyers@efficios.com> */ #include <linux/sched.h> #include <linux/uaccess.h> #include <linux/syscalls.h> #include <linux/rseq.h> #include <linux/types.h> #include <linux/ratelimit.h> #include <asm/ptrace.h> #define CREATE_TRACE_POINTS #include <trace/events/rseq.h> /* The original rseq structure size (including padding) is 32 bytes. */ #define ORIG_RSEQ_SIZE 32 #define RSEQ_CS_NO_RESTART_FLAGS (RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT | \ RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL | \ RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE) #ifdef CONFIG_DEBUG_RSEQ static struct rseq *rseq_kernel_fields(struct task_struct *t) { return (struct rseq *) t->rseq_fields; } static int rseq_validate_ro_fields(struct task_struct *t) { static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); u32 cpu_id_start, cpu_id, node_id, mm_cid; struct rseq __user *rseq = t->rseq; /* * Validate fields which are required to be read-only by * user-space.
*/ if (!user_read_access_begin(rseq, t->rseq_len)) goto efault; unsafe_get_user(cpu_id_start, &rseq->cpu_id_start, efault_end); unsafe_get_user(cpu_id, &rseq->cpu_id, efault_end); unsafe_get_user(node_id, &rseq->node_id, efault_end); unsafe_get_user(mm_cid, &rseq->mm_cid, efault_end); user_read_access_end(); if ((cpu_id_start != rseq_kernel_fields(t)->cpu_id_start || cpu_id != rseq_kernel_fields(t)->cpu_id || node_id != rseq_kernel_fields(t)->node_id || mm_cid != rseq_kernel_fields(t)->mm_cid) && __ratelimit(&_rs)) { pr_warn("Detected rseq corruption for pid: %d, name: %s\n" "\tcpu_id_start: %u ?= %u\n" "\tcpu_id: %u ?= %u\n" "\tnode_id: %u ?= %u\n" "\tmm_cid: %u ?= %u\n", t->pid, t->comm, cpu_id_start, rseq_kernel_fields(t)->cpu_id_start, cpu_id, rseq_kernel_fields(t)->cpu_id, node_id, rseq_kernel_fields(t)->node_id, mm_cid, rseq_kernel_fields(t)->mm_cid); } /* For now, only print a console warning on mismatch. */ return 0; efault_end: user_read_access_end(); efault: return -EFAULT; } /* * Update an rseq field and its in-kernel copy in lock-step to keep a coherent * state. */ #define rseq_unsafe_put_user(t, value, field, error_label) \ do { \ unsafe_put_user(value, &t->rseq->field, error_label); \ rseq_kernel_fields(t)->field = value; \ } while (0) #else static int rseq_validate_ro_fields(struct task_struct *t) { return 0; } #define rseq_unsafe_put_user(t, value, field, error_label) \ unsafe_put_user(value, &t->rseq->field, error_label) #endif /* * * Restartable sequences are a lightweight interface that allows * user-level code to be executed atomically relative to scheduler * preemption and signal delivery. Typically used for implementing * per-cpu operations. * * It allows user-space to perform update operations on per-cpu data * without requiring heavy-weight atomic operations. * * Detailed algorithm of rseq user-space assembly sequences: * * init(rseq_cs) * cpu = TLS->rseq::cpu_id_start * [1] TLS->rseq::rseq_cs = rseq_cs * [start_ip] ---------------------------- * [2] if (cpu != TLS->rseq::cpu_id) * goto abort_ip; * [3] <last_instruction_in_cs> * [post_commit_ip] ---------------------------- * * The address of jump target abort_ip must be outside the critical * region, i.e.: * * [abort_ip] < [start_ip] || [abort_ip] >= [post_commit_ip] * * Steps [2]-[3] (inclusive) need to be a sequence of instructions in * userspace that can handle being interrupted between any of those * instructions, and then resumed to the abort_ip. * * 1. Userspace stores the address of the struct rseq_cs assembly * block descriptor into the rseq_cs field of the registered * struct rseq TLS area. This update is performed through a single * store within the inline assembly instruction sequence. * [start_ip] * * 2. Userspace tests to check whether the current cpu_id field matches * the cpu number loaded before start_ip, branching to abort_ip * in case of a mismatch. * * If the sequence is preempted or interrupted by a signal * at or after start_ip and before post_commit_ip, then the kernel * clears TLS->__rseq_abi::rseq_cs, and sets the user-space return * ip to abort_ip before returning to user-space, so the preempted * execution resumes at abort_ip. * * 3. Userspace critical section final instruction before * post_commit_ip is the commit. The critical section is * self-terminating. * [post_commit_ip] * * 4. <success> * * On failure at [2], or if interrupted by preempt or signal delivery * between [1] and [3]: * * [abort_ip] * F1.
<failure> */ static int rseq_update_cpu_node_id(struct task_struct *t) { struct rseq __user *rseq = t->rseq; u32 cpu_id = raw_smp_processor_id(); u32 node_id = cpu_to_node(cpu_id); u32 mm_cid = task_mm_cid(t); /* * Validate read-only rseq fields. */ if (rseq_validate_ro_fields(t)) goto efault; WARN_ON_ONCE((int) mm_cid < 0); if (!user_write_access_begin(rseq, t->rseq_len)) goto efault; rseq_unsafe_put_user(t, cpu_id, cpu_id_start, efault_end); rseq_unsafe_put_user(t, cpu_id, cpu_id, efault_end); rseq_unsafe_put_user(t, node_id, node_id, efault_end); rseq_unsafe_put_user(t, mm_cid, mm_cid, efault_end); /* * Additional feature fields added after ORIG_RSEQ_SIZE * need to be conditionally updated only if * t->rseq_len != ORIG_RSEQ_SIZE. */ user_write_access_end(); trace_rseq_update(t); return 0; efault_end: user_write_access_end(); efault: return -EFAULT; } static int rseq_reset_rseq_cpu_node_id(struct task_struct *t) { struct rseq __user *rseq = t->rseq; u32 cpu_id_start = 0, cpu_id = RSEQ_CPU_ID_UNINITIALIZED, node_id = 0, mm_cid = 0; /* * Validate read-only rseq fields. */ if (rseq_validate_ro_fields(t)) goto efault; if (!user_write_access_begin(rseq, t->rseq_len)) goto efault; /* * Reset all fields to their initial state. * * All fields have an initial state of 0 except cpu_id which is set to * RSEQ_CPU_ID_UNINITIALIZED, so that any user coming in after * unregistration can figure out that rseq needs to be registered * again. */ rseq_unsafe_put_user(t, cpu_id_start, cpu_id_start, efault_end); rseq_unsafe_put_user(t, cpu_id, cpu_id, efault_end); rseq_unsafe_put_user(t, node_id, node_id, efault_end); rseq_unsafe_put_user(t, mm_cid, mm_cid, efault_end); /* * Additional feature fields added after ORIG_RSEQ_SIZE * need to be conditionally reset only if * t->rseq_len != ORIG_RSEQ_SIZE. */ user_write_access_end(); return 0; efault_end: user_write_access_end(); efault: return -EFAULT; } /* * Get the user-space pointer value stored in the 'rseq_cs' field. */ static int rseq_get_rseq_cs_ptr_val(struct rseq __user *rseq, u64 *rseq_cs) { if (!rseq_cs) return -EFAULT; #ifdef CONFIG_64BIT if (get_user(*rseq_cs, &rseq->rseq_cs)) return -EFAULT; #else if (copy_from_user(rseq_cs, &rseq->rseq_cs, sizeof(*rseq_cs))) return -EFAULT; #endif return 0; } /* * If the rseq_cs field of 'struct rseq' contains a valid pointer to * user-space, copy 'struct rseq_cs' from user-space and validate its fields. */ static int rseq_get_rseq_cs(struct task_struct *t, struct rseq_cs *rseq_cs) { struct rseq_cs __user *urseq_cs; u64 ptr; u32 __user *usig; u32 sig; int ret; ret = rseq_get_rseq_cs_ptr_val(t->rseq, &ptr); if (ret) return ret; /* If the rseq_cs pointer is NULL, return a cleared struct rseq_cs. */ if (!ptr) { memset(rseq_cs, 0, sizeof(*rseq_cs)); return 0; } /* Check that the pointer value fits in the user-space process space. */ if (ptr >= TASK_SIZE) return -EINVAL; urseq_cs = (struct rseq_cs __user *)(unsigned long)ptr; if (copy_from_user(rseq_cs, urseq_cs, sizeof(*rseq_cs))) return -EFAULT; if (rseq_cs->start_ip >= TASK_SIZE || rseq_cs->start_ip + rseq_cs->post_commit_offset >= TASK_SIZE || rseq_cs->abort_ip >= TASK_SIZE || rseq_cs->version > 0) return -EINVAL; /* Check for overflow. */ if (rseq_cs->start_ip + rseq_cs->post_commit_offset < rseq_cs->start_ip) return -EINVAL; /* Ensure that abort_ip is not in the critical section. 
*/ if (rseq_cs->abort_ip - rseq_cs->start_ip < rseq_cs->post_commit_offset) return -EINVAL; usig = (u32 __user *)(unsigned long)(rseq_cs->abort_ip - sizeof(u32)); ret = get_user(sig, usig); if (ret) return ret; if (current->rseq_sig != sig) { printk_ratelimited(KERN_WARNING "Possible attack attempt. Unexpected rseq signature 0x%x, expecting 0x%x (pid=%d, addr=%p).\n", sig, current->rseq_sig, current->pid, usig); return -EINVAL; } return 0; } static bool rseq_warn_flags(const char *str, u32 flags) { u32 test_flags; if (!flags) return false; test_flags = flags & RSEQ_CS_NO_RESTART_FLAGS; if (test_flags) pr_warn_once("Deprecated flags (%u) in %s ABI structure", test_flags, str); test_flags = flags & ~RSEQ_CS_NO_RESTART_FLAGS; if (test_flags) pr_warn_once("Unknown flags (%u) in %s ABI structure", test_flags, str); return true; } static int rseq_need_restart(struct task_struct *t, u32 cs_flags) { u32 flags, event_mask; int ret; if (rseq_warn_flags("rseq_cs", cs_flags)) return -EINVAL; /* Get thread flags. */ ret = get_user(flags, &t->rseq->flags); if (ret) return ret; if (rseq_warn_flags("rseq", flags)) return -EINVAL; /* * Load and clear event mask atomically with respect to * scheduler preemption. */ preempt_disable(); event_mask = t->rseq_event_mask; t->rseq_event_mask = 0; preempt_enable(); return !!event_mask; } static int clear_rseq_cs(struct rseq __user *rseq) { /* * The rseq_cs field is set to NULL on preemption or signal * delivery on top of rseq assembly block, as well as on top * of code outside of the rseq assembly block. This performs * a lazy clear of the rseq_cs field. * * Set rseq_cs to NULL. */ #ifdef CONFIG_64BIT return put_user(0UL, &rseq->rseq_cs); #else if (clear_user(&rseq->rseq_cs, sizeof(rseq->rseq_cs))) return -EFAULT; return 0; #endif } /* * Unsigned comparison will be true when ip >= start_ip, and when * ip < start_ip + post_commit_offset. */ static bool in_rseq_cs(unsigned long ip, struct rseq_cs *rseq_cs) { return ip - rseq_cs->start_ip < rseq_cs->post_commit_offset; } static int rseq_ip_fixup(struct pt_regs *regs) { unsigned long ip = instruction_pointer(regs); struct task_struct *t = current; struct rseq_cs rseq_cs; int ret; ret = rseq_get_rseq_cs(t, &rseq_cs); if (ret) return ret; /* * Handle potentially not being within a critical section. * If not nested over a rseq critical section, restart is useless. * Clear the rseq_cs pointer and return. */ if (!in_rseq_cs(ip, &rseq_cs)) return clear_rseq_cs(t->rseq); ret = rseq_need_restart(t, rseq_cs.flags); if (ret <= 0) return ret; ret = clear_rseq_cs(t->rseq); if (ret) return ret; trace_rseq_ip_fixup(ip, rseq_cs.start_ip, rseq_cs.post_commit_offset, rseq_cs.abort_ip); instruction_pointer_set(regs, (unsigned long)rseq_cs.abort_ip); return 0; } /* * This resume handler must always be executed between any of: * - preemption, * - signal delivery, * and return to user-space. * * This is how we can ensure that the entire rseq critical section * will issue the commit instruction only if executed atomically with * respect to other threads scheduled on the same CPU, and with respect * to signal handlers. */ void __rseq_handle_notify_resume(struct ksignal *ksig, struct pt_regs *regs) { struct task_struct *t = current; int ret, sig; if (unlikely(t->flags & PF_EXITING)) return; /* * regs is NULL if and only if the caller is in a syscall path. Skip * fixup and leave rseq_cs as is so that rseq_syscall() will detect and * kill a misbehaving userspace on debug kernels.
*/ if (regs) { ret = rseq_ip_fixup(regs); if (unlikely(ret < 0)) goto error; } if (unlikely(rseq_update_cpu_node_id(t))) goto error; return; error: sig = ksig ? ksig->sig : 0; force_sigsegv(sig); } #ifdef CONFIG_DEBUG_RSEQ /* * Terminate the process if a syscall is issued within a restartable * sequence. */ void rseq_syscall(struct pt_regs *regs) { unsigned long ip = instruction_pointer(regs); struct task_struct *t = current; struct rseq_cs rseq_cs; if (!t->rseq) return; if (rseq_get_rseq_cs(t, &rseq_cs) || in_rseq_cs(ip, &rseq_cs)) force_sig(SIGSEGV); } #endif /* * sys_rseq - setup restartable sequences for caller thread. */ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, int, flags, u32, sig) { int ret; u64 rseq_cs; if (flags & RSEQ_FLAG_UNREGISTER) { if (flags & ~RSEQ_FLAG_UNREGISTER) return -EINVAL; /* Unregister rseq for current thread. */ if (current->rseq != rseq || !current->rseq) return -EINVAL; if (rseq_len != current->rseq_len) return -EINVAL; if (current->rseq_sig != sig) return -EPERM; ret = rseq_reset_rseq_cpu_node_id(current); if (ret) return ret; current->rseq = NULL; current->rseq_sig = 0; current->rseq_len = 0; return 0; } if (unlikely(flags)) return -EINVAL; if (current->rseq) { /* * If rseq is already registered, check whether * the provided address differs from the prior * one. */ if (current->rseq != rseq || rseq_len != current->rseq_len) return -EINVAL; if (current->rseq_sig != sig) return -EPERM; /* Already registered. */ return -EBUSY; } /* * If there was no rseq previously registered, ensure the provided rseq * is properly aligned, as communicated to user-space through the ELF * auxiliary vector AT_RSEQ_ALIGN. If rseq_len is the original rseq * size, the required alignment is the original struct rseq alignment. * * In order to be valid, rseq_len is either the original rseq size, or * large enough to contain all supported fields, as communicated to * user-space through the ELF auxiliary vector AT_RSEQ_FEATURE_SIZE. */ if (rseq_len < ORIG_RSEQ_SIZE || (rseq_len == ORIG_RSEQ_SIZE && !IS_ALIGNED((unsigned long)rseq, ORIG_RSEQ_SIZE)) || (rseq_len != ORIG_RSEQ_SIZE && (!IS_ALIGNED((unsigned long)rseq, __alignof__(*rseq)) || rseq_len < offsetof(struct rseq, end)))) return -EINVAL; if (!access_ok(rseq, rseq_len)) return -EFAULT; /* * If the rseq_cs pointer is non-NULL on registration, clear it to * avoid a potential segfault on return to user-space. The proper thing * to do would have been to fail the registration but this would break * older libcs that reuse the rseq area for new threads without * clearing the fields. */ if (rseq_get_rseq_cs_ptr_val(rseq, &rseq_cs)) return -EFAULT; if (rseq_cs && clear_rseq_cs(rseq)) return -EFAULT; #ifdef CONFIG_DEBUG_RSEQ /* * Initialize the in-kernel rseq fields copy for validation of * read-only fields. */ if (get_user(rseq_kernel_fields(current)->cpu_id_start, &rseq->cpu_id_start) || get_user(rseq_kernel_fields(current)->cpu_id, &rseq->cpu_id) || get_user(rseq_kernel_fields(current)->node_id, &rseq->node_id) || get_user(rseq_kernel_fields(current)->mm_cid, &rseq->mm_cid)) return -EFAULT; #endif /* * Activate the registration by setting the rseq area address, length * and signature in the task struct. */ current->rseq = rseq; current->rseq_len = rseq_len; current->rseq_sig = sig; /* * If rseq was previously inactive, and has just been * registered, ensure the cpu_id_start and cpu_id fields * are updated before returning to user-space. */ rseq_set_notify_resume(current); return 0; }
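/*
 * A hedged user-space sketch of the registration contract implemented by
 * sys_rseq() above: the area must be at least ORIG_RSEQ_SIZE bytes, aligned
 * as advertised via AT_RSEQ_ALIGN, and tagged with a fixed signature. The
 * RSEQ_SIG value and the helper name are illustrative only; a real program
 * must also tolerate EBUSY when its libc has already registered the area.
 */
#include <linux/rseq.h>		/* UAPI struct rseq, declared 32-byte aligned */
#include <sys/syscall.h>
#include <unistd.h>

#define RSEQ_SIG	0x53053053	/* application-chosen abort signature */

static __thread struct rseq rseq_area;	/* one registration per thread */

static int rseq_register_current_thread(void)
{
	/* flags == 0 registers; RSEQ_FLAG_UNREGISTER tears the area down. */
	return syscall(__NR_rseq, &rseq_area, sizeof(rseq_area), 0, RSEQ_SIG);
}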
// SPDX-License-Identifier: GPL-2.0 /* Copyright (c) 2018, Intel Corporation. */ /* A common module to handle registrations and notifications for paravirtual * drivers to enable accelerated datapath and support VF live migration. * * The notifier and event handling code is based on netvsc driver. */ #include <linux/module.h> #include <linux/etherdevice.h> #include <uapi/linux/if_arp.h> #include <linux/rtnetlink.h> #include <linux/if_vlan.h> #include <net/failover.h> static LIST_HEAD(failover_list); static DEFINE_SPINLOCK(failover_lock); static struct net_device *failover_get_bymac(u8 *mac, struct failover_ops **ops) { struct net_device *failover_dev; struct failover *failover; spin_lock(&failover_lock); list_for_each_entry(failover, &failover_list, list) { failover_dev = rtnl_dereference(failover->failover_dev); if (ether_addr_equal(failover_dev->perm_addr, mac)) { *ops = rtnl_dereference(failover->ops); spin_unlock(&failover_lock); return failover_dev; } } spin_unlock(&failover_lock); return NULL; } /** * failover_slave_register - Register a slave netdev * * @slave_dev: slave netdev that is being registered * * Registers a slave device to a failover instance. Only ethernet devices * are supported.
*/ static int failover_slave_register(struct net_device *slave_dev) { struct netdev_lag_upper_info lag_upper_info; struct net_device *failover_dev; struct failover_ops *fops; int err; if (slave_dev->type != ARPHRD_ETHER) goto done; ASSERT_RTNL(); failover_dev = failover_get_bymac(slave_dev->perm_addr, &fops); if (!failover_dev) goto done; if (fops && fops->slave_pre_register && fops->slave_pre_register(slave_dev, failover_dev)) goto done; err = netdev_rx_handler_register(slave_dev, fops->slave_handle_frame, failover_dev); if (err) { netdev_err(slave_dev, "can not register failover rx handler (err = %d)\n", err); goto done; } lag_upper_info.tx_type = NETDEV_LAG_TX_TYPE_ACTIVEBACKUP; err = netdev_master_upper_dev_link(slave_dev, failover_dev, NULL, &lag_upper_info, NULL); if (err) { netdev_err(slave_dev, "can not set failover device %s (err = %d)\n", failover_dev->name, err); goto err_upper_link; } slave_dev->priv_flags |= (IFF_FAILOVER_SLAVE | IFF_NO_ADDRCONF); if (fops && fops->slave_register && !fops->slave_register(slave_dev, failover_dev)) return NOTIFY_OK; netdev_upper_dev_unlink(slave_dev, failover_dev); slave_dev->priv_flags &= ~(IFF_FAILOVER_SLAVE | IFF_NO_ADDRCONF); err_upper_link: netdev_rx_handler_unregister(slave_dev); done: return NOTIFY_DONE; } /** * failover_slave_unregister - Unregister a slave netdev * * @slave_dev: slave netdev that is being unregistered * * Unregisters a slave device from a failover instance. */ int failover_slave_unregister(struct net_device *slave_dev) { struct net_device *failover_dev; struct failover_ops *fops; if (!netif_is_failover_slave(slave_dev)) goto done; ASSERT_RTNL(); failover_dev = failover_get_bymac(slave_dev->perm_addr, &fops); if (!failover_dev) goto done; if (fops && fops->slave_pre_unregister && fops->slave_pre_unregister(slave_dev, failover_dev)) goto done; netdev_rx_handler_unregister(slave_dev); netdev_upper_dev_unlink(slave_dev, failover_dev); slave_dev->priv_flags &= ~(IFF_FAILOVER_SLAVE | IFF_NO_ADDRCONF); if (fops && fops->slave_unregister && !fops->slave_unregister(slave_dev, failover_dev)) return NOTIFY_OK; done: return NOTIFY_DONE; } EXPORT_SYMBOL_GPL(failover_slave_unregister); static int failover_slave_link_change(struct net_device *slave_dev) { struct net_device *failover_dev; struct failover_ops *fops; if (!netif_is_failover_slave(slave_dev)) goto done; ASSERT_RTNL(); failover_dev = failover_get_bymac(slave_dev->perm_addr, &fops); if (!failover_dev) goto done; if (!netif_running(failover_dev)) goto done; if (fops && fops->slave_link_change && !fops->slave_link_change(slave_dev, failover_dev)) return NOTIFY_OK; done: return NOTIFY_DONE; } static int failover_slave_name_change(struct net_device *slave_dev) { struct net_device *failover_dev; struct failover_ops *fops; if (!netif_is_failover_slave(slave_dev)) goto done; ASSERT_RTNL(); failover_dev = failover_get_bymac(slave_dev->perm_addr, &fops); if (!failover_dev) goto done; if (!netif_running(failover_dev)) goto done; if (fops && fops->slave_name_change && !fops->slave_name_change(slave_dev, failover_dev)) return NOTIFY_OK; done: return NOTIFY_DONE; } static int failover_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *event_dev = netdev_notifier_info_to_dev(ptr); /* Skip parent events */ if (netif_is_failover(event_dev)) return NOTIFY_DONE; switch (event) { case NETDEV_REGISTER: return failover_slave_register(event_dev); case NETDEV_UNREGISTER: return failover_slave_unregister(event_dev); case NETDEV_UP: case NETDEV_DOWN: case 
NETDEV_CHANGE: return failover_slave_link_change(event_dev); case NETDEV_CHANGENAME: return failover_slave_name_change(event_dev); default: return NOTIFY_DONE; } } static struct notifier_block failover_notifier = { .notifier_call = failover_event, }; static void failover_existing_slave_register(struct net_device *failover_dev) { struct net *net = dev_net(failover_dev); struct net_device *dev; rtnl_lock(); for_each_netdev(net, dev) { if (netif_is_failover(dev)) continue; if (ether_addr_equal(failover_dev->perm_addr, dev->perm_addr)) failover_slave_register(dev); } rtnl_unlock(); } /** * failover_register - Register a failover instance * * @dev: failover netdev * @ops: failover ops * * Allocate and register a failover instance for a failover netdev. ops * provides handlers for slave device register/unregister/link change/ * name change events. * * Return: pointer to failover instance */ struct failover *failover_register(struct net_device *dev, struct failover_ops *ops) { struct failover *failover; if (dev->type != ARPHRD_ETHER) return ERR_PTR(-EINVAL); failover = kzalloc(sizeof(*failover), GFP_KERNEL); if (!failover) return ERR_PTR(-ENOMEM); rcu_assign_pointer(failover->ops, ops); netdev_hold(dev, &failover->dev_tracker, GFP_KERNEL); dev->priv_flags |= IFF_FAILOVER; rcu_assign_pointer(failover->failover_dev, dev); spin_lock(&failover_lock); list_add_tail(&failover->list, &failover_list); spin_unlock(&failover_lock); netdev_info(dev, "failover master:%s registered\n", dev->name); failover_existing_slave_register(dev); return failover; } EXPORT_SYMBOL_GPL(failover_register); /** * failover_unregister - Unregister a failover instance * * @failover: pointer to failover instance * * Unregisters and frees a failover instance. */ void failover_unregister(struct failover *failover) { struct net_device *failover_dev; failover_dev = rcu_dereference(failover->failover_dev); netdev_info(failover_dev, "failover master:%s unregistered\n", failover_dev->name); failover_dev->priv_flags &= ~IFF_FAILOVER; netdev_put(failover_dev, &failover->dev_tracker); spin_lock(&failover_lock); list_del(&failover->list); spin_unlock(&failover_lock); kfree(failover); } EXPORT_SYMBOL_GPL(failover_unregister); static __init int failover_init(void) { register_netdevice_notifier(&failover_notifier); return 0; } module_init(failover_init); static __exit void failover_exit(void) { unregister_netdevice_notifier(&failover_notifier); } module_exit(failover_exit); MODULE_DESCRIPTION("Generic failover infrastructure/interface"); MODULE_LICENSE("GPL v2");
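/*
 * A hedged sketch of the driver side of this API; the myvirt_* names are
 * hypothetical, and only failover_register()/failover_unregister() and the
 * failover_ops callbacks exercised in this file are assumed (see
 * failover_slave_register() above, which installs slave_handle_frame as the
 * slave's rx handler with the failover master netdev as rx_handler_data).
 */
#include <linux/netdevice.h>
#include <net/failover.h>

static rx_handler_result_t myvirt_handle_frame(struct sk_buff **pskb)
{
	struct sk_buff *skb = *pskb;

	/* Redirect the frame so it is received on the failover master. */
	skb->dev = rcu_dereference(skb->dev->rx_handler_data);
	return RX_HANDLER_ANOTHER;
}

static struct failover_ops myvirt_failover_ops = {
	.slave_handle_frame = myvirt_handle_frame,
};

static struct failover *myvirt_failover;

static int myvirt_register_failover(struct net_device *dev)
{
	myvirt_failover = failover_register(dev, &myvirt_failover_ops);
	return PTR_ERR_OR_ZERO(myvirt_failover);
}

static void myvirt_unregister_failover(void)
{
	failover_unregister(myvirt_failover);
}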
// SPDX-License-Identifier: GPL-2.0-only /* * * Author Karsten Keil <kkeil@novell.com> * * Copyright 2008 by Karsten Keil <kkeil@novell.com> */ #include <linux/slab.h> #include <linux/mISDNif.h> #include <linux/kthread.h> #include <linux/sched.h> #include <linux/sched/cputime.h> #include <linux/signal.h> #include "core.h" static u_int *debug; static inline void _queue_message(struct mISDNstack *st, struct sk_buff *skb) { struct mISDNhead *hh = mISDN_HEAD_P(skb); if (*debug & DEBUG_QUEUE_FUNC) printk(KERN_DEBUG "%s prim(%x) id(%x) %p\n", __func__, hh->prim, hh->id, skb); skb_queue_tail(&st->msgq, skb); if (likely(!test_bit(mISDN_STACK_STOPPED, &st->status))) { test_and_set_bit(mISDN_STACK_WORK, &st->status); wake_up_interruptible(&st->workq); } } static int mISDN_queue_message(struct mISDNchannel *ch, struct sk_buff *skb) { _queue_message(ch->st, skb); return 0; } static struct mISDNchannel * get_channel4id(struct mISDNstack *st, u_int id) { struct mISDNchannel *ch; mutex_lock(&st->lmutex); list_for_each_entry(ch, &st->layer2,
list) { if (id == ch->nr) goto unlock; } ch = NULL; unlock: mutex_unlock(&st->lmutex); return ch; } static void send_socklist(struct mISDN_sock_list *sl, struct sk_buff *skb) { struct sock *sk; struct sk_buff *cskb = NULL; read_lock(&sl->lock); sk_for_each(sk, &sl->head) { if (sk->sk_state != MISDN_BOUND) continue; if (!cskb) cskb = skb_copy(skb, GFP_ATOMIC); if (!cskb) { printk(KERN_WARNING "%s no skb\n", __func__); break; } if (!sock_queue_rcv_skb(sk, cskb)) cskb = NULL; } read_unlock(&sl->lock); dev_kfree_skb(cskb); } static void send_layer2(struct mISDNstack *st, struct sk_buff *skb) { struct sk_buff *cskb; struct mISDNhead *hh = mISDN_HEAD_P(skb); struct mISDNchannel *ch; int ret; if (!st) return; mutex_lock(&st->lmutex); if ((hh->id & MISDN_ID_ADDR_MASK) == MISDN_ID_ANY) { /* L2 for all */ list_for_each_entry(ch, &st->layer2, list) { if (list_is_last(&ch->list, &st->layer2)) { cskb = skb; skb = NULL; } else { cskb = skb_copy(skb, GFP_KERNEL); } if (cskb) { ret = ch->send(ch, cskb); if (ret) { if (*debug & DEBUG_SEND_ERR) printk(KERN_DEBUG "%s ch%d prim(%x) addr(%x)" " err %d\n", __func__, ch->nr, hh->prim, ch->addr, ret); dev_kfree_skb(cskb); } } else { printk(KERN_WARNING "%s ch%d addr %x no mem\n", __func__, ch->nr, ch->addr); goto out; } } } else { list_for_each_entry(ch, &st->layer2, list) { if ((hh->id & MISDN_ID_ADDR_MASK) == ch->addr) { ret = ch->send(ch, skb); if (!ret) skb = NULL; goto out; } } ret = st->dev->teimgr->ctrl(st->dev->teimgr, CHECK_DATA, skb); if (!ret) skb = NULL; else if (*debug & DEBUG_SEND_ERR) printk(KERN_DEBUG "%s mgr prim(%x) err %d\n", __func__, hh->prim, ret); } out: mutex_unlock(&st->lmutex); dev_kfree_skb(skb); } static inline int send_msg_to_layer(struct mISDNstack *st, struct sk_buff *skb) { struct mISDNhead *hh = mISDN_HEAD_P(skb); struct mISDNchannel *ch; int lm; lm = hh->prim & MISDN_LAYERMASK; if (*debug & DEBUG_QUEUE_FUNC) printk(KERN_DEBUG "%s prim(%x) id(%x) %p\n", __func__, hh->prim, hh->id, skb); if (lm == 0x1) { if (!hlist_empty(&st->l1sock.head)) { __net_timestamp(skb); send_socklist(&st->l1sock, skb); } return st->layer1->send(st->layer1, skb); } else if (lm == 0x2) { if (!hlist_empty(&st->l1sock.head)) send_socklist(&st->l1sock, skb); send_layer2(st, skb); return 0; } else if (lm == 0x4) { ch = get_channel4id(st, hh->id); if (ch) return ch->send(ch, skb); else printk(KERN_WARNING "%s: dev(%s) prim(%x) id(%x) no channel\n", __func__, dev_name(&st->dev->dev), hh->prim, hh->id); } else if (lm == 0x8) { WARN_ON(lm == 0x8); ch = get_channel4id(st, hh->id); if (ch) return ch->send(ch, skb); else printk(KERN_WARNING "%s: dev(%s) prim(%x) id(%x) no channel\n", __func__, dev_name(&st->dev->dev), hh->prim, hh->id); } else { /* broadcast not handled yet */ printk(KERN_WARNING "%s: dev(%s) prim %x not delivered\n", __func__, dev_name(&st->dev->dev), hh->prim); } return -ESRCH; } static void do_clear_stack(struct mISDNstack *st) { } static int mISDNStackd(void *data) { struct mISDNstack *st = data; #ifdef MISDN_MSG_STATS u64 utime, stime; #endif int err = 0; sigfillset(&current->blocked); if (*debug & DEBUG_MSG_THREAD) printk(KERN_DEBUG "mISDNStackd %s started\n", dev_name(&st->dev->dev)); if (st->notify != NULL) { complete(st->notify); st->notify = NULL; } for (;;) { struct sk_buff *skb; if (unlikely(test_bit(mISDN_STACK_STOPPED, &st->status))) { test_and_clear_bit(mISDN_STACK_WORK, &st->status); test_and_clear_bit(mISDN_STACK_RUNNING, &st->status); } else test_and_set_bit(mISDN_STACK_RUNNING, &st->status); while (test_bit(mISDN_STACK_WORK, 
&st->status)) { skb = skb_dequeue(&st->msgq); if (!skb) { test_and_clear_bit(mISDN_STACK_WORK, &st->status); /* test if a race happens */ skb = skb_dequeue(&st->msgq); if (!skb) continue; test_and_set_bit(mISDN_STACK_WORK, &st->status); } #ifdef MISDN_MSG_STATS st->msg_cnt++; #endif err = send_msg_to_layer(st, skb); if (unlikely(err)) { if (*debug & DEBUG_SEND_ERR) printk(KERN_DEBUG "%s: %s prim(%x) id(%x) " "send call(%d)\n", __func__, dev_name(&st->dev->dev), mISDN_HEAD_PRIM(skb), mISDN_HEAD_ID(skb), err); dev_kfree_skb(skb); continue; } if (unlikely(test_bit(mISDN_STACK_STOPPED, &st->status))) { test_and_clear_bit(mISDN_STACK_WORK, &st->status); test_and_clear_bit(mISDN_STACK_RUNNING, &st->status); break; } } if (test_bit(mISDN_STACK_CLEARING, &st->status)) { test_and_set_bit(mISDN_STACK_STOPPED, &st->status); test_and_clear_bit(mISDN_STACK_RUNNING, &st->status); do_clear_stack(st); test_and_clear_bit(mISDN_STACK_CLEARING, &st->status); test_and_set_bit(mISDN_STACK_RESTART, &st->status); } if (test_and_clear_bit(mISDN_STACK_RESTART, &st->status)) { test_and_clear_bit(mISDN_STACK_STOPPED, &st->status); test_and_set_bit(mISDN_STACK_RUNNING, &st->status); if (!skb_queue_empty(&st->msgq)) test_and_set_bit(mISDN_STACK_WORK, &st->status); } if (test_bit(mISDN_STACK_ABORT, &st->status)) break; if (st->notify != NULL) { complete(st->notify); st->notify = NULL; } #ifdef MISDN_MSG_STATS st->sleep_cnt++; #endif test_and_clear_bit(mISDN_STACK_ACTIVE, &st->status); wait_event_interruptible(st->workq, (st->status & mISDN_STACK_ACTION_MASK)); if (*debug & DEBUG_MSG_THREAD) printk(KERN_DEBUG "%s: %s wake status %08lx\n", __func__, dev_name(&st->dev->dev), st->status); test_and_set_bit(mISDN_STACK_ACTIVE, &st->status); test_and_clear_bit(mISDN_STACK_WAKEUP, &st->status); if (test_bit(mISDN_STACK_STOPPED, &st->status)) { test_and_clear_bit(mISDN_STACK_RUNNING, &st->status); #ifdef MISDN_MSG_STATS st->stopped_cnt++; #endif } } #ifdef MISDN_MSG_STATS printk(KERN_DEBUG "mISDNStackd daemon for %s proceed %d " "msg %d sleep %d stopped\n", dev_name(&st->dev->dev), st->msg_cnt, st->sleep_cnt, st->stopped_cnt); task_cputime(st->thread, &utime, &stime); printk(KERN_DEBUG "mISDNStackd daemon for %s utime(%llu) stime(%llu)\n", dev_name(&st->dev->dev), utime, stime); printk(KERN_DEBUG "mISDNStackd daemon for %s nvcsw(%ld) nivcsw(%ld)\n", dev_name(&st->dev->dev), st->thread->nvcsw, st->thread->nivcsw); printk(KERN_DEBUG "mISDNStackd daemon for %s killed now\n", dev_name(&st->dev->dev)); #endif test_and_set_bit(mISDN_STACK_KILLED, &st->status); test_and_clear_bit(mISDN_STACK_RUNNING, &st->status); test_and_clear_bit(mISDN_STACK_ACTIVE, &st->status); test_and_clear_bit(mISDN_STACK_ABORT, &st->status); skb_queue_purge(&st->msgq); st->thread = NULL; if (st->notify != NULL) { complete(st->notify); st->notify = NULL; } return 0; } static int l1_receive(struct mISDNchannel *ch, struct sk_buff *skb) { if (!ch->st) return -ENODEV; __net_timestamp(skb); _queue_message(ch->st, skb); return 0; } void set_channel_address(struct mISDNchannel *ch, u_int sapi, u_int tei) { ch->addr = sapi | (tei << 8); } void __add_layer2(struct mISDNchannel *ch, struct mISDNstack *st) { list_add_tail(&ch->list, &st->layer2); } void add_layer2(struct mISDNchannel *ch, struct mISDNstack *st) { mutex_lock(&st->lmutex); __add_layer2(ch, st); mutex_unlock(&st->lmutex); } static int st_own_ctrl(struct mISDNchannel *ch, u_int cmd, void *arg) { if (!ch->st || !ch->st->layer1) return -EINVAL; return ch->st->layer1->ctrl(ch->st->layer1, cmd, arg); } int 
create_stack(struct mISDNdevice *dev) { struct mISDNstack *newst; int err; DECLARE_COMPLETION_ONSTACK(done); newst = kzalloc(sizeof(struct mISDNstack), GFP_KERNEL); if (!newst) { printk(KERN_ERR "kmalloc mISDN_stack failed\n"); return -ENOMEM; } newst->dev = dev; INIT_LIST_HEAD(&newst->layer2); INIT_HLIST_HEAD(&newst->l1sock.head); rwlock_init(&newst->l1sock.lock); init_waitqueue_head(&newst->workq); skb_queue_head_init(&newst->msgq); mutex_init(&newst->lmutex); dev->D.st = newst; err = create_teimanager(dev); if (err) { printk(KERN_ERR "kmalloc teimanager failed\n"); kfree(newst); return err; } dev->teimgr->peer = &newst->own; dev->teimgr->recv = mISDN_queue_message; dev->teimgr->st = newst; newst->layer1 = &dev->D; dev->D.recv = l1_receive; dev->D.peer = &newst->own; newst->own.st = newst; newst->own.ctrl = st_own_ctrl; newst->own.send = mISDN_queue_message; newst->own.recv = mISDN_queue_message; if (*debug & DEBUG_CORE_FUNC) printk(KERN_DEBUG "%s: st(%s)\n", __func__, dev_name(&newst->dev->dev)); newst->notify = &done; newst->thread = kthread_run(mISDNStackd, (void *)newst, "mISDN_%s", dev_name(&newst->dev->dev)); if (IS_ERR(newst->thread)) { err = PTR_ERR(newst->thread); printk(KERN_ERR "mISDN:cannot create kernel thread for %s (%d)\n", dev_name(&newst->dev->dev), err); delete_teimanager(dev->teimgr); kfree(newst); } else wait_for_completion(&done); return err; } int connect_layer1(struct mISDNdevice *dev, struct mISDNchannel *ch, u_int protocol, struct sockaddr_mISDN *adr) { struct mISDN_sock *msk = container_of(ch, struct mISDN_sock, ch); struct channel_req rq; int err; if (*debug & DEBUG_CORE_FUNC) printk(KERN_DEBUG "%s: %s proto(%x) adr(%d %d %d %d)\n", __func__, dev_name(&dev->dev), protocol, adr->dev, adr->channel, adr->sapi, adr->tei); switch (protocol) { case ISDN_P_NT_S0: case ISDN_P_NT_E1: case ISDN_P_TE_S0: case ISDN_P_TE_E1: ch->recv = mISDN_queue_message; ch->peer = &dev->D.st->own; ch->st = dev->D.st; rq.protocol = protocol; rq.adr.channel = adr->channel; err = dev->D.ctrl(&dev->D, OPEN_CHANNEL, &rq); printk(KERN_DEBUG "%s: ret %d (dev %d)\n", __func__, err, dev->id); if (err) return err; write_lock_bh(&dev->D.st->l1sock.lock); sk_add_node(&msk->sk, &dev->D.st->l1sock.head); write_unlock_bh(&dev->D.st->l1sock.lock); break; default: return -ENOPROTOOPT; } return 0; } int connect_Bstack(struct mISDNdevice *dev, struct mISDNchannel *ch, u_int protocol, struct sockaddr_mISDN *adr) { struct channel_req rq, rq2; int pmask, err; struct Bprotocol *bp; if (*debug & DEBUG_CORE_FUNC) printk(KERN_DEBUG "%s: %s proto(%x) adr(%d %d %d %d)\n", __func__, dev_name(&dev->dev), protocol, adr->dev, adr->channel, adr->sapi, adr->tei); ch->st = dev->D.st; pmask = 1 << (protocol & ISDN_P_B_MASK); if (pmask & dev->Bprotocols) { rq.protocol = protocol; rq.adr = *adr; err = dev->D.ctrl(&dev->D, OPEN_CHANNEL, &rq); if (err) return err; ch->recv = rq.ch->send; ch->peer = rq.ch; rq.ch->recv = ch->send; rq.ch->peer = ch; rq.ch->st = dev->D.st; } else { bp = get_Bprotocol4mask(pmask); if (!bp) return -ENOPROTOOPT; rq2.protocol = protocol; rq2.adr = *adr; rq2.ch = ch; err = bp->create(&rq2); if (err) return err; ch->recv = rq2.ch->send; ch->peer = rq2.ch; rq2.ch->st = dev->D.st; rq.protocol = rq2.protocol; rq.adr = *adr; err = dev->D.ctrl(&dev->D, OPEN_CHANNEL, &rq); if (err) { rq2.ch->ctrl(rq2.ch, CLOSE_CHANNEL, NULL); return err; } rq2.ch->recv = rq.ch->send; rq2.ch->peer = rq.ch; rq.ch->recv = rq2.ch->send; rq.ch->peer = rq2.ch; rq.ch->st = dev->D.st; } ch->protocol = protocol; ch->nr = rq.ch->nr; 
return 0; } int create_l2entity(struct mISDNdevice *dev, struct mISDNchannel *ch, u_int protocol, struct sockaddr_mISDN *adr) { struct channel_req rq; int err; if (*debug & DEBUG_CORE_FUNC) printk(KERN_DEBUG "%s: %s proto(%x) adr(%d %d %d %d)\n", __func__, dev_name(&dev->dev), protocol, adr->dev, adr->channel, adr->sapi, adr->tei); rq.protocol = ISDN_P_TE_S0; if (dev->Dprotocols & (1 << ISDN_P_TE_E1)) rq.protocol = ISDN_P_TE_E1; switch (protocol) { case ISDN_P_LAPD_NT: rq.protocol = ISDN_P_NT_S0; if (dev->Dprotocols & (1 << ISDN_P_NT_E1)) rq.protocol = ISDN_P_NT_E1; fallthrough; case ISDN_P_LAPD_TE: ch->recv = mISDN_queue_message; ch->peer = &dev->D.st->own; ch->st = dev->D.st; rq.adr.channel = 0; err = dev->D.ctrl(&dev->D, OPEN_CHANNEL, &rq); printk(KERN_DEBUG "%s: ret 1 %d\n", __func__, err); if (err) break; rq.protocol = protocol; rq.adr = *adr; rq.ch = ch; err = dev->teimgr->ctrl(dev->teimgr, OPEN_CHANNEL, &rq); printk(KERN_DEBUG "%s: ret 2 %d\n", __func__, err); if (!err) { if ((protocol == ISDN_P_LAPD_NT) && !rq.ch) break; add_layer2(rq.ch, dev->D.st); rq.ch->recv = mISDN_queue_message; rq.ch->peer = &dev->D.st->own; rq.ch->ctrl(rq.ch, OPEN_CHANNEL, NULL); /* can't fail */ } break; default: err = -EPROTONOSUPPORT; } return err; } void delete_channel(struct mISDNchannel *ch) { struct mISDN_sock *msk = container_of(ch, struct mISDN_sock, ch); struct mISDNchannel *pch; if (!ch->st) { printk(KERN_WARNING "%s: no stack\n", __func__); return; } if (*debug & DEBUG_CORE_FUNC) printk(KERN_DEBUG "%s: st(%s) protocol(%x)\n", __func__, dev_name(&ch->st->dev->dev), ch->protocol); if (ch->protocol >= ISDN_P_B_START) { if (ch->peer) { ch->peer->ctrl(ch->peer, CLOSE_CHANNEL, NULL); ch->peer = NULL; } return; } switch (ch->protocol) { case ISDN_P_NT_S0: case ISDN_P_TE_S0: case ISDN_P_NT_E1: case ISDN_P_TE_E1: write_lock_bh(&ch->st->l1sock.lock); sk_del_node_init(&msk->sk); write_unlock_bh(&ch->st->l1sock.lock); ch->st->dev->D.ctrl(&ch->st->dev->D, CLOSE_CHANNEL, NULL); break; case ISDN_P_LAPD_TE: pch = get_channel4id(ch->st, ch->nr); if (pch) { mutex_lock(&ch->st->lmutex); list_del(&pch->list); mutex_unlock(&ch->st->lmutex); pch->ctrl(pch, CLOSE_CHANNEL, NULL); pch = ch->st->dev->teimgr; pch->ctrl(pch, CLOSE_CHANNEL, NULL); } else printk(KERN_WARNING "%s: no l2 channel\n", __func__); break; case ISDN_P_LAPD_NT: pch = ch->st->dev->teimgr; if (pch) { pch->ctrl(pch, CLOSE_CHANNEL, NULL); } else printk(KERN_WARNING "%s: no l2 channel\n", __func__); break; default: break; } return; } void delete_stack(struct mISDNdevice *dev) { struct mISDNstack *st = dev->D.st; DECLARE_COMPLETION_ONSTACK(done); if (*debug & DEBUG_CORE_FUNC) printk(KERN_DEBUG "%s: st(%s)\n", __func__, dev_name(&st->dev->dev)); if (dev->teimgr) delete_teimanager(dev->teimgr); if (st->thread) { if (st->notify) { printk(KERN_WARNING "%s: notifier in use\n", __func__); complete(st->notify); } st->notify = &done; test_and_set_bit(mISDN_STACK_ABORT, &st->status); test_and_set_bit(mISDN_STACK_WAKEUP, &st->status); wake_up_interruptible(&st->workq); wait_for_completion(&done); } if (!list_empty(&st->layer2)) printk(KERN_WARNING "%s: layer2 list not empty\n", __func__); if (!hlist_empty(&st->l1sock.head)) printk(KERN_WARNING "%s: layer1 list not empty\n", __func__); kfree(st); } void mISDN_initstack(u_int *dp) { debug = dp; }
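/*
 * Illustrative decoding helpers, not part of the mISDN core:
 * set_channel_address() above packs the SAPI into the low byte of ch->addr
 * and the TEI into the next byte; the 0xff masks assume byte-wide fields.
 */
static inline u_int mISDN_addr_sapi(u_int addr)
{
	return addr & 0xff;		/* SAPI: bits 0..7 */
}

static inline u_int mISDN_addr_tei(u_int addr)
{
	return (addr >> 8) & 0xff;	/* TEI: bits 8..15 */
}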