Total coverage: 387187 (19%) of 2084730
1 1 1 1 3 3 3 3 3 3 3 3 8 5 3 8 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 // SPDX-License-Identifier: GPL-2.0-or-later /* System trusted keyring for trusted public keys * * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved. 
* Written by David Howells (dhowells@redhat.com) */ #include <linux/export.h> #include <linux/kernel.h> #include <linux/sched.h> #include <linux/cred.h> #include <linux/err.h> #include <linux/slab.h> #include <linux/uidgid.h> #include <linux/verification.h> #include <keys/asymmetric-type.h> #include <keys/system_keyring.h> #include <crypto/pkcs7.h> static struct key *builtin_trusted_keys; #ifdef CONFIG_SECONDARY_TRUSTED_KEYRING static struct key *secondary_trusted_keys; #endif #ifdef CONFIG_INTEGRITY_MACHINE_KEYRING static struct key *machine_trusted_keys; #endif #ifdef CONFIG_INTEGRITY_PLATFORM_KEYRING static struct key *platform_trusted_keys; #endif extern __initconst const u8 system_certificate_list[]; extern __initconst const unsigned long system_certificate_list_size; extern __initconst const unsigned long module_cert_size; /** * restrict_link_by_builtin_trusted - Restrict keyring addition by built-in CA * @dest_keyring: Keyring being linked to. * @type: The type of key being added. * @payload: The payload of the new key. * @restriction_key: A ring of keys that can be used to vouch for the new cert. * * Restrict the addition of keys into a keyring based on the key-to-be-added * being vouched for by a key in the built in system keyring. */ int restrict_link_by_builtin_trusted(struct key *dest_keyring, const struct key_type *type, const union key_payload *payload, struct key *restriction_key) { return restrict_link_by_signature(dest_keyring, type, payload, builtin_trusted_keys); } /** * restrict_link_by_digsig_builtin - Restrict digitalSignature key additions by the built-in keyring * @dest_keyring: Keyring being linked to. * @type: The type of key being added. * @payload: The payload of the new key. * @restriction_key: A ring of keys that can be used to vouch for the new cert. * * Restrict the addition of keys into a keyring based on the key-to-be-added * being vouched for by a key in the built in system keyring. 
The new key * must have the digitalSignature usage field set. */ int restrict_link_by_digsig_builtin(struct key *dest_keyring, const struct key_type *type, const union key_payload *payload, struct key *restriction_key) { return restrict_link_by_digsig(dest_keyring, type, payload, builtin_trusted_keys); } #ifdef CONFIG_SECONDARY_TRUSTED_KEYRING /** * restrict_link_by_builtin_and_secondary_trusted - Restrict keyring * addition by both built-in and secondary keyrings. * @dest_keyring: Keyring being linked to. * @type: The type of key being added. * @payload: The payload of the new key. * @restrict_key: A ring of keys that can be used to vouch for the new cert. * * Restrict the addition of keys into a keyring based on the key-to-be-added * being vouched for by a key in either the built-in or the secondary system * keyrings. */ int restrict_link_by_builtin_and_secondary_trusted( struct key *dest_keyring, const struct key_type *type, const union key_payload *payload, struct key *restrict_key) { /* If we have a secondary trusted keyring, then that contains a link * through to the builtin keyring and the search will follow that link. */ if (type == &key_type_keyring && dest_keyring == secondary_trusted_keys && payload == &builtin_trusted_keys->payload) /* Allow the builtin keyring to be added to the secondary */ return 0; return restrict_link_by_signature(dest_keyring, type, payload, secondary_trusted_keys); } /** * restrict_link_by_digsig_builtin_and_secondary - Restrict by digitalSignature. * @dest_keyring: Keyring being linked to. * @type: The type of key being added. * @payload: The payload of the new key. * @restrict_key: A ring of keys that can be used to vouch for the new cert. * * Restrict the addition of keys into a keyring based on the key-to-be-added * being vouched for by a key in either the built-in or the secondary system * keyrings. The new key must have the digitalSignature usage field set. 
*/ int restrict_link_by_digsig_builtin_and_secondary(struct key *dest_keyring, const struct key_type *type, const union key_payload *payload, struct key *restrict_key) { /* If we have a secondary trusted keyring, then that contains a link * through to the builtin keyring and the search will follow that link. */ if (type == &key_type_keyring && dest_keyring == secondary_trusted_keys && payload == &builtin_trusted_keys->payload) /* Allow the builtin keyring to be added to the secondary */ return 0; return restrict_link_by_digsig(dest_keyring, type, payload, secondary_trusted_keys); } /* * Allocate a struct key_restriction for the "builtin and secondary trust" * keyring. Only for use in system_trusted_keyring_init(). */ static __init struct key_restriction *get_builtin_and_secondary_restriction(void) { struct key_restriction *restriction; restriction = kzalloc_obj(struct key_restriction); if (!restriction) panic("Can't allocate secondary trusted keyring restriction\n"); if (IS_ENABLED(CONFIG_INTEGRITY_MACHINE_KEYRING)) restriction->check = restrict_link_by_builtin_secondary_and_machine; else restriction->check = restrict_link_by_builtin_and_secondary_trusted; return restriction; } /** * add_to_secondary_keyring - Add to secondary keyring. * @source: Source of key * @data: The blob holding the key * @len: The length of the data blob * * Add a key to the secondary keyring. The key must be vouched for by a key in the builtin, * machine or secondary keyring itself. 
*/ void __init add_to_secondary_keyring(const char *source, const void *data, size_t len) { key_ref_t key; key_perm_t perm; perm = (KEY_POS_ALL & ~KEY_POS_SETATTR) | KEY_USR_VIEW; key = key_create_or_update(make_key_ref(secondary_trusted_keys, 1), "asymmetric", NULL, data, len, perm, KEY_ALLOC_NOT_IN_QUOTA); if (IS_ERR(key)) { pr_err("Problem loading X.509 certificate from %s to secondary keyring %ld\n", source, PTR_ERR(key)); return; } pr_notice("Loaded X.509 cert '%s'\n", key_ref_to_ptr(key)->description); key_ref_put(key); } #endif #ifdef CONFIG_INTEGRITY_MACHINE_KEYRING void __init set_machine_trusted_keys(struct key *keyring) { machine_trusted_keys = keyring; if (key_link(secondary_trusted_keys, machine_trusted_keys) < 0) panic("Can't link (machine) trusted keyrings\n"); } /** * restrict_link_by_builtin_secondary_and_machine - Restrict keyring addition. * @dest_keyring: Keyring being linked to. * @type: The type of key being added. * @payload: The payload of the new key. * @restrict_key: A ring of keys that can be used to vouch for the new cert. * * Restrict the addition of keys into a keyring based on the key-to-be-added * being vouched for by a key in either the built-in, the secondary, or * the machine keyrings. 
*/ int restrict_link_by_builtin_secondary_and_machine( struct key *dest_keyring, const struct key_type *type, const union key_payload *payload, struct key *restrict_key) { if (machine_trusted_keys && type == &key_type_keyring && dest_keyring == secondary_trusted_keys && payload == &machine_trusted_keys->payload) /* Allow the machine keyring to be added to the secondary */ return 0; return restrict_link_by_builtin_and_secondary_trusted(dest_keyring, type, payload, restrict_key); } #endif /* * Create the trusted keyrings */ static __init int system_trusted_keyring_init(void) { pr_notice("Initialise system trusted keyrings\n"); builtin_trusted_keys = keyring_alloc(".builtin_trusted_keys", GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, current_cred(), ((KEY_POS_ALL & ~KEY_POS_SETATTR) | KEY_USR_VIEW | KEY_USR_READ | KEY_USR_SEARCH), KEY_ALLOC_NOT_IN_QUOTA, NULL, NULL); if (IS_ERR(builtin_trusted_keys)) panic("Can't allocate builtin trusted keyring\n"); #ifdef CONFIG_SECONDARY_TRUSTED_KEYRING secondary_trusted_keys = keyring_alloc(".secondary_trusted_keys", GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, current_cred(), ((KEY_POS_ALL & ~KEY_POS_SETATTR) | KEY_USR_VIEW | KEY_USR_READ | KEY_USR_SEARCH | KEY_USR_WRITE), KEY_ALLOC_NOT_IN_QUOTA, get_builtin_and_secondary_restriction(), NULL); if (IS_ERR(secondary_trusted_keys)) panic("Can't allocate secondary trusted keyring\n"); if (key_link(secondary_trusted_keys, builtin_trusted_keys) < 0) panic("Can't link trusted keyrings\n"); #endif return 0; } /* * Must be initialised before we try and load the keys into the keyring. */ device_initcall(system_trusted_keyring_init); __init int load_module_cert(struct key *keyring) { if (!IS_ENABLED(CONFIG_IMA_APPRAISE_MODSIG)) return 0; pr_notice("Loading compiled-in module X.509 certificates\n"); return x509_load_certificate_list(system_certificate_list, module_cert_size, keyring); } /* * Load the compiled-in list of X.509 certificates. 
*/ static __init int load_system_certificate_list(void) { const u8 *p; unsigned long size; pr_notice("Loading compiled-in X.509 certificates\n"); #ifdef CONFIG_MODULE_SIG p = system_certificate_list; size = system_certificate_list_size; #else p = system_certificate_list + module_cert_size; size = system_certificate_list_size - module_cert_size; #endif return x509_load_certificate_list(p, size, builtin_trusted_keys); } late_initcall(load_system_certificate_list); #ifdef CONFIG_SYSTEM_DATA_VERIFICATION /** * verify_pkcs7_message_sig - Verify a PKCS#7-based signature on system data. * @data: The data to be verified (NULL if expecting internal data). * @len: Size of @data. * @pkcs7: The PKCS#7 message that is the signature. * @trusted_keys: Trusted keys to use (NULL for builtin trusted keys only, * (void *)1UL for all trusted keys). * @usage: The use to which the key is being put. * @view_content: Callback to gain access to content. * @ctx: Context for callback. */ int verify_pkcs7_message_sig(const void *data, size_t len, struct pkcs7_message *pkcs7, struct key *trusted_keys, enum key_being_used_for usage, int (*view_content)(void *ctx, const void *data, size_t len, size_t asn1hdrlen), void *ctx) { int ret; /* The data should be detached - so we need to supply it. 
*/ if (data && pkcs7_supply_detached_data(pkcs7, data, len) < 0) { pr_err("PKCS#7 signature with non-detached data\n"); ret = -EBADMSG; goto error; } ret = pkcs7_verify(pkcs7, usage); if (ret < 0) goto error; ret = is_key_on_revocation_list(pkcs7); if (ret != -ENOKEY) { pr_devel("PKCS#7 key is on revocation list\n"); goto error; } if (!trusted_keys) { trusted_keys = builtin_trusted_keys; } else if (trusted_keys == VERIFY_USE_SECONDARY_KEYRING) { #ifdef CONFIG_SECONDARY_TRUSTED_KEYRING trusted_keys = secondary_trusted_keys; #else trusted_keys = builtin_trusted_keys; #endif } else if (trusted_keys == VERIFY_USE_PLATFORM_KEYRING) { #ifdef CONFIG_INTEGRITY_PLATFORM_KEYRING trusted_keys = platform_trusted_keys; #else trusted_keys = NULL; #endif if (!trusted_keys) { ret = -ENOKEY; pr_devel("PKCS#7 platform keyring is not available\n"); goto error; } } ret = pkcs7_validate_trust(pkcs7, trusted_keys); if (ret < 0) { if (ret == -ENOKEY) pr_devel("PKCS#7 signature not signed with a trusted key\n"); goto error; } if (view_content) { size_t asn1hdrlen; ret = pkcs7_get_content_data(pkcs7, &data, &len, &asn1hdrlen); if (ret < 0) { if (ret == -ENODATA) pr_devel("PKCS#7 message does not contain data\n"); goto error; } ret = view_content(ctx, data, len, asn1hdrlen); } error: pr_devel("<==%s() = %d\n", __func__, ret); return ret; } /** * verify_pkcs7_signature - Verify a PKCS#7-based signature on system data. * @data: The data to be verified (NULL if expecting internal data). * @len: Size of @data. * @raw_pkcs7: The PKCS#7 message that is the signature. * @pkcs7_len: The size of @raw_pkcs7. * @trusted_keys: Trusted keys to use (NULL for builtin trusted keys only, * (void *)1UL for all trusted keys). * @usage: The use to which the key is being put. * @view_content: Callback to gain access to content. * @ctx: Context for callback. 
*/ int verify_pkcs7_signature(const void *data, size_t len, const void *raw_pkcs7, size_t pkcs7_len, struct key *trusted_keys, enum key_being_used_for usage, int (*view_content)(void *ctx, const void *data, size_t len, size_t asn1hdrlen), void *ctx) { struct pkcs7_message *pkcs7; int ret; pkcs7 = pkcs7_parse_message(raw_pkcs7, pkcs7_len); if (IS_ERR(pkcs7)) return PTR_ERR(pkcs7); ret = verify_pkcs7_message_sig(data, len, pkcs7, trusted_keys, usage, view_content, ctx); pkcs7_free_message(pkcs7); pr_devel("<==%s() = %d\n", __func__, ret); return ret; } EXPORT_SYMBOL_GPL(verify_pkcs7_signature); #endif /* CONFIG_SYSTEM_DATA_VERIFICATION */ #ifdef CONFIG_INTEGRITY_PLATFORM_KEYRING void __init set_platform_trusted_keys(struct key *keyring) { platform_trusted_keys = keyring; } #endif
2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 // SPDX-License-Identifier: GPL-2.0 #include <linux/kernel.h> #include <linux/errno.h> #include <linux/err.h> #include <linux/mm.h> #include <linux/slab.h> #include <linux/vmalloc.h> #include <linux/pagemap.h> #include <linux/sched.h> #include <media/frame_vector.h> /** * get_vaddr_frames() - map virtual addresses to pfns * @start: starting user address * @nr_frames: number of pages / pfns from start to map * @write: the mapped address has write permission * @vec: structure which receives pages / pfns of the addresses mapped. * It should have space for at least nr_frames entries. * * This function maps virtual addresses from @start and fills @vec structure * with page frame numbers or page pointers to corresponding pages (choice * depends on the type of the vma underlying the virtual address). If @start * belongs to a normal vma, the function grabs reference to each of the pages * to pin them in memory. If @start belongs to VM_IO | VM_PFNMAP vma, we don't * touch page structures and the caller must make sure pfns aren't reused for * anything else while he is using them. * * The function returns number of pages mapped which may be less than * @nr_frames. 
In particular we stop mapping if there are more vmas of * different type underlying the specified range of virtual addresses. * When the function isn't able to map a single page, it returns error. * * Note that get_vaddr_frames() cannot follow VM_IO mappings. It used * to be able to do that, but that could (racily) return non-refcounted * pfns. * * This function takes care of grabbing mmap_lock as necessary. */ int get_vaddr_frames(unsigned long start, unsigned int nr_frames, bool write, struct frame_vector *vec) { int ret; unsigned int gup_flags = FOLL_LONGTERM; if (nr_frames == 0) return 0; if (WARN_ON_ONCE(nr_frames > vec->nr_allocated)) nr_frames = vec->nr_allocated; start = untagged_addr(start); if (write) gup_flags |= FOLL_WRITE; ret = pin_user_pages_fast(start, nr_frames, gup_flags, (struct page **)(vec->ptrs)); vec->got_ref = true; vec->is_pfns = false; vec->nr_frames = ret; if (likely(ret > 0)) return ret; vec->nr_frames = 0; return ret ? ret : -EFAULT; } EXPORT_SYMBOL(get_vaddr_frames); /** * put_vaddr_frames() - drop references to pages if get_vaddr_frames() acquired * them * @vec: frame vector to put * * Drop references to pages if get_vaddr_frames() acquired them. We also * invalidate the frame vector so that it is prepared for the next call into * get_vaddr_frames(). */ void put_vaddr_frames(struct frame_vector *vec) { struct page **pages; if (!vec->got_ref) goto out; pages = frame_vector_pages(vec); /* * frame_vector_pages() might needed to do a conversion when * get_vaddr_frames() got pages but vec was later converted to pfns. * But it shouldn't really fail to convert pfns back... */ if (WARN_ON(IS_ERR(pages))) goto out; unpin_user_pages(pages, vec->nr_frames); vec->got_ref = false; out: vec->nr_frames = 0; } EXPORT_SYMBOL(put_vaddr_frames); /** * frame_vector_to_pages - convert frame vector to contain page pointers * @vec: frame vector to convert * * Convert @vec to contain array of page pointers. If the conversion is * successful, return 0. 
Otherwise return an error. Note that we do not grab * page references for the page structures. */ int frame_vector_to_pages(struct frame_vector *vec) { int i; unsigned long *nums; struct page **pages; if (!vec->is_pfns) return 0; nums = frame_vector_pfns(vec); for (i = 0; i < vec->nr_frames; i++) if (!pfn_valid(nums[i])) return -EINVAL; pages = (struct page **)nums; for (i = 0; i < vec->nr_frames; i++) pages[i] = pfn_to_page(nums[i]); vec->is_pfns = false; return 0; } EXPORT_SYMBOL(frame_vector_to_pages); /** * frame_vector_to_pfns - convert frame vector to contain pfns * @vec: frame vector to convert * * Convert @vec to contain array of pfns. */ void frame_vector_to_pfns(struct frame_vector *vec) { int i; unsigned long *nums; struct page **pages; if (vec->is_pfns) return; pages = (struct page **)(vec->ptrs); nums = (unsigned long *)pages; for (i = 0; i < vec->nr_frames; i++) nums[i] = page_to_pfn(pages[i]); vec->is_pfns = true; } EXPORT_SYMBOL(frame_vector_to_pfns); /** * frame_vector_create() - allocate & initialize structure for pinned pfns * @nr_frames: number of pfns slots we should reserve * * Allocate and initialize struct pinned_pfns to be able to hold @nr_pfns * pfns. */ struct frame_vector *frame_vector_create(unsigned int nr_frames) { struct frame_vector *vec; int size = struct_size(vec, ptrs, nr_frames); if (WARN_ON_ONCE(nr_frames == 0)) return NULL; /* * This is absurdly high. It's here just to avoid strange effects when * arithmetics overflows. */ if (WARN_ON_ONCE(nr_frames > INT_MAX / sizeof(void *) / 2)) return NULL; /* * Avoid higher order allocations, use vmalloc instead. It should * be rare anyway. */ vec = kvmalloc(size, GFP_KERNEL); if (!vec) return NULL; vec->nr_allocated = nr_frames; vec->nr_frames = 0; return vec; } EXPORT_SYMBOL(frame_vector_create); /** * frame_vector_destroy() - free memory allocated to carry frame vector * @vec: Frame vector to free * * Free structure allocated by frame_vector_create() to carry frames. 
*/ void frame_vector_destroy(struct frame_vector *vec) { /* Make sure put_vaddr_frames() got called properly... */ VM_BUG_ON(vec->nr_frames > 0); kvfree(vec); } EXPORT_SYMBOL(frame_vector_destroy);
1 4 4 3 4 4 4 1 3 4 4 4 4 1 1 1 1 4 3 3 3 2 3 2 3 3 3 3 3 3 3 3 2 2 1 1 1 1 1 2 2 2 2 2 2 3 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 
504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 // SPDX-License-Identifier: GPL-2.0-only /* * net/sched/sch_choke.c CHOKE scheduler * * Copyright (c) 2011 Stephen Hemminger <shemminger@vyatta.com> * Copyright (c) 2011 Eric Dumazet <eric.dumazet@gmail.com> */ #include <linux/module.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/skbuff.h> #include <linux/vmalloc.h> #include <net/pkt_sched.h> #include <net/pkt_cls.h> #include <net/inet_ecn.h> #include <net/red.h> #include <net/flow_dissector.h> /* CHOKe stateless AQM for fair bandwidth allocation ================================================= CHOKe (CHOose and Keep for responsive flows, CHOose and Kill for unresponsive flows) is a variant of RED that penalizes misbehaving flows but maintains no flow state. The difference from RED is an additional step during the enqueuing process. If average queue size is over the low threshold (qmin), a packet is chosen at random from the queue. If both the new and chosen packet are from the same flow, both are dropped. Unlike RED, CHOKe is not really a "classful" qdisc because it needs to access packets in queue randomly. It has a minimal class interface to allow overriding the builtin flow classifier with filters. Source: R. Pan, B. Prabhakar, and K. Psounis, "CHOKe, A Stateless Active Queue Management Scheme for Approximating Fair Bandwidth Allocation", IEEE INFOCOM, 2000. A. Tang, J. Wang, S. 
Low, "Understanding CHOKe: Throughput and Spatial Characteristics", IEEE/ACM Transactions on Networking, 2004 */ /* Upper bound on size of sk_buff table (packets) */ #define CHOKE_MAX_QUEUE (128*1024 - 1) struct choke_sched_data { /* Parameters */ u32 limit; unsigned char flags; struct red_parms parms; /* Variables */ struct red_vars vars; struct { u32 prob_drop; /* Early probability drops */ u32 prob_mark; /* Early probability marks */ u32 forced_drop; /* Forced drops, qavg > max_thresh */ u32 forced_mark; /* Forced marks, qavg > max_thresh */ u32 pdrop; /* Drops due to queue limits */ u32 matched; /* Drops to flow match */ } stats; unsigned int head; unsigned int tail; unsigned int tab_mask; /* size - 1 */ struct sk_buff **tab; }; /* number of elements in queue including holes */ static unsigned int choke_len(const struct choke_sched_data *q) { return (q->tail - q->head) & q->tab_mask; } /* Is ECN parameter configured */ static int use_ecn(const struct choke_sched_data *q) { return q->flags & TC_RED_ECN; } /* Should packets over max just be dropped (versus marked) */ static int use_harddrop(const struct choke_sched_data *q) { return q->flags & TC_RED_HARDDROP; } /* Move head pointer forward to skip over holes */ static void choke_zap_head_holes(struct choke_sched_data *q) { do { q->head = (q->head + 1) & q->tab_mask; if (q->head == q->tail) break; } while (q->tab[q->head] == NULL); } /* Move tail pointer backwards to reuse holes */ static void choke_zap_tail_holes(struct choke_sched_data *q) { do { q->tail = (q->tail - 1) & q->tab_mask; if (q->head == q->tail) break; } while (q->tab[q->tail] == NULL); } /* Drop packet from queue array by creating a "hole" */ static void choke_drop_by_idx(struct Qdisc *sch, unsigned int idx, struct sk_buff **to_free) { struct choke_sched_data *q = qdisc_priv(sch); struct sk_buff *skb = q->tab[idx]; q->tab[idx] = NULL; if (idx == q->head) choke_zap_head_holes(q); if (idx == q->tail) choke_zap_tail_holes(q); --sch->q.qlen; 
qdisc_qstats_backlog_dec(sch, skb); qdisc_tree_reduce_backlog(sch, 1, qdisc_pkt_len(skb)); qdisc_drop(skb, sch, to_free); } struct choke_skb_cb { u8 keys_valid; struct flow_keys_digest keys; }; static inline struct choke_skb_cb *choke_skb_cb(const struct sk_buff *skb) { qdisc_cb_private_validate(skb, sizeof(struct choke_skb_cb)); return (struct choke_skb_cb *)qdisc_skb_cb(skb)->data; } /* * Compare flow of two packets * Returns true only if source and destination address and port match. * false for special cases */ static bool choke_match_flow(struct sk_buff *skb1, struct sk_buff *skb2) { struct flow_keys temp; if (skb1->protocol != skb2->protocol) return false; if (!choke_skb_cb(skb1)->keys_valid) { choke_skb_cb(skb1)->keys_valid = 1; skb_flow_dissect_flow_keys(skb1, &temp, 0); make_flow_keys_digest(&choke_skb_cb(skb1)->keys, &temp); } if (!choke_skb_cb(skb2)->keys_valid) { choke_skb_cb(skb2)->keys_valid = 1; skb_flow_dissect_flow_keys(skb2, &temp, 0); make_flow_keys_digest(&choke_skb_cb(skb2)->keys, &temp); } return !memcmp(&choke_skb_cb(skb1)->keys, &choke_skb_cb(skb2)->keys, sizeof(choke_skb_cb(skb1)->keys)); } /* * Select a packet at random from queue * HACK: since queue can have holes from previous deletion; retry several * times to find a random skb but then just give up and return the head * Will return NULL if queue is empty (q->head == q->tail) */ static struct sk_buff *choke_peek_random(const struct choke_sched_data *q, unsigned int *pidx) { struct sk_buff *skb; int retrys = 3; do { *pidx = (q->head + get_random_u32_below(choke_len(q))) & q->tab_mask; skb = q->tab[*pidx]; if (skb) return skb; } while (--retrys > 0); return q->tab[*pidx = q->head]; } /* * Compare new packet with random packet in queue * returns true if matched and sets *pidx */ static bool choke_match_random(const struct choke_sched_data *q, struct sk_buff *nskb, unsigned int *pidx) { struct sk_buff *oskb; if (q->head == q->tail) return false; oskb = choke_peek_random(q, pidx); return 
choke_match_flow(oskb, nskb); } static int choke_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) { struct choke_sched_data *q = qdisc_priv(sch); const struct red_parms *p = &q->parms; choke_skb_cb(skb)->keys_valid = 0; /* Compute average queue usage (see RED) */ q->vars.qavg = red_calc_qavg(p, &q->vars, sch->q.qlen); if (red_is_idling(&q->vars)) red_end_of_idle_period(&q->vars); /* Is queue small? */ if (q->vars.qavg <= p->qth_min) q->vars.qcount = -1; else { unsigned int idx; /* Draw a packet at random from queue and compare flow */ if (choke_match_random(q, skb, &idx)) { WRITE_ONCE(q->stats.matched, q->stats.matched + 1); choke_drop_by_idx(sch, idx, to_free); goto congestion_drop; } /* Queue is large, always mark/drop */ if (q->vars.qavg > p->qth_max) { q->vars.qcount = -1; qdisc_qstats_overlimit(sch); if (use_harddrop(q) || !use_ecn(q) || !INET_ECN_set_ce(skb)) { WRITE_ONCE(q->stats.forced_drop, q->stats.forced_drop + 1); goto congestion_drop; } WRITE_ONCE(q->stats.forced_mark, q->stats.forced_mark + 1); } else if (++q->vars.qcount) { if (red_mark_probability(p, &q->vars, q->vars.qavg)) { q->vars.qcount = 0; q->vars.qR = red_random(p); qdisc_qstats_overlimit(sch); if (!use_ecn(q) || !INET_ECN_set_ce(skb)) { WRITE_ONCE(q->stats.prob_drop, q->stats.prob_drop + 1); goto congestion_drop; } WRITE_ONCE(q->stats.prob_mark, q->stats.prob_mark + 1); } } else q->vars.qR = red_random(p); } /* Admit new packet */ if (sch->q.qlen < q->limit) { q->tab[q->tail] = skb; q->tail = (q->tail + 1) & q->tab_mask; ++sch->q.qlen; qdisc_qstats_backlog_inc(sch, skb); return NET_XMIT_SUCCESS; } WRITE_ONCE(q->stats.pdrop, q->stats.pdrop + 1); return qdisc_drop(skb, sch, to_free); congestion_drop: qdisc_drop(skb, sch, to_free); return NET_XMIT_CN; } static struct sk_buff *choke_dequeue(struct Qdisc *sch) { struct choke_sched_data *q = qdisc_priv(sch); struct sk_buff *skb; if (q->head == q->tail) { if (!red_is_idling(&q->vars)) red_start_of_idle_period(&q->vars); 
/* NOTE(review): the statements up to the first top-level '}' are the tail of a
 * function whose head precedes this chunk — presumably choke_dequeue(); confirm.
 */
		return NULL;
	}

	/* Detach the skb stored in the head slot and leave the slot empty. */
	skb = q->tab[q->head];
	q->tab[q->head] = NULL;
	choke_zap_head_holes(q);
	--sch->q.qlen;
	qdisc_qstats_backlog_dec(sch, skb);
	qdisc_bstats_update(sch, skb);

	return skb;
}

/*
 * choke_reset - drop everything that is queued and empty the ring.
 *
 * Walks the ring from head to tail, dropping every non-NULL slot via
 * rtnl_qdisc_drop(), then zeroes the table (if one is allocated), resets
 * head/tail to 0 and restarts the RED state in q->vars.
 */
static void choke_reset(struct Qdisc *sch)
{
	struct choke_sched_data *q = qdisc_priv(sch);

	while (q->head != q->tail) {
		struct sk_buff *skb = q->tab[q->head];

		/* Advance first; tab_mask wraps the index around the ring. */
		q->head = (q->head + 1) & q->tab_mask;
		if (!skb)
			continue;	/* empty slot (hole) */
		rtnl_qdisc_drop(skb, sch);
	}

	if (q->tab)
		memset(q->tab, 0, (q->tab_mask + 1) * sizeof(struct sk_buff *));
	q->head = q->tail = 0;
	red_restart(&q->vars);
}

/* Netlink policy for the TCA_CHOKE_* attributes parsed in choke_change(). */
static const struct nla_policy choke_policy[TCA_CHOKE_MAX + 1] = {
	[TCA_CHOKE_PARMS]	= { .len = sizeof(struct tc_red_qopt) },
	[TCA_CHOKE_STAB]	= { .len = RED_STAB_SIZE },
	[TCA_CHOKE_MAX_P]	= { .type = NLA_U32 },
};


/* Release a table obtained from kvzalloc_objs(); thin wrapper around kvfree(). */
static void choke_free(void *addr)
{
	kvfree(addr);
}

/*
 * choke_change - apply a new configuration to the qdisc.
 * @sch: qdisc being configured
 * @opt: nested netlink attribute carrying TCA_CHOKE_* options
 * @extack: extended ack (not forwarded to the attribute parser here)
 *
 * TCA_CHOKE_PARMS and TCA_CHOKE_STAB are mandatory; TCA_CHOKE_MAX_P
 * defaults to 0 when absent.  If the limit implies a different ring size,
 * a new zeroed table is allocated before taking the tree lock, the queued
 * skbs are moved over under the lock, and those that do not fit are dropped.
 *
 * Returns 0 on success, -EINVAL for missing/invalid parameters, -ENOMEM if
 * the new table cannot be allocated, or the negative error returned by
 * attribute parsing.
 */
static int choke_change(struct Qdisc *sch, struct nlattr *opt,
			struct netlink_ext_ack *extack)
{
	struct choke_sched_data *q = qdisc_priv(sch);
	struct nlattr *tb[TCA_CHOKE_MAX + 1];
	const struct tc_red_qopt *ctl;
	int err;
	struct sk_buff **old = NULL;
	unsigned int mask;
	u32 max_P;
	u8 *stab;

	if (opt == NULL)
		return -EINVAL;

	err = nla_parse_nested_deprecated(tb, TCA_CHOKE_MAX, opt,
					  choke_policy, NULL);
	if (err < 0)
		return err;

	if (tb[TCA_CHOKE_PARMS] == NULL ||
	    tb[TCA_CHOKE_STAB] == NULL)
		return -EINVAL;

	max_P = nla_get_u32_default(tb[TCA_CHOKE_MAX_P], 0);

	ctl = nla_data(tb[TCA_CHOKE_PARMS]);
	stab = nla_data(tb[TCA_CHOKE_STAB]);
	if (!red_check_params(ctl->qth_min, ctl->qth_max, ctl->Wlog, ctl->Scell_log, stab))
		return -EINVAL;

	if (ctl->limit > CHOKE_MAX_QUEUE)
		return -EINVAL;

	/* Ring size is the power of two above limit; mask is size - 1. */
	mask = roundup_pow_of_two(ctl->limit + 1) - 1;
	if (mask != q->tab_mask) {
		struct sk_buff **ntab;

		/* Allocate before locking so the failure path needs no unlock. */
		ntab = kvzalloc_objs(struct sk_buff *, mask + 1);
		if (!ntab)
			return -ENOMEM;

		sch_tree_lock(sch);
		old = q->tab;
		if (old) {
			unsigned int oqlen = sch->q.qlen, tail = 0;
			unsigned dropped = 0;

			/* Compact the old ring into the new one, skipping holes. */
			while (q->head != q->tail) {
				struct sk_buff *skb = q->tab[q->head];

				q->head = (q->head + 1) & q->tab_mask;
				if (!skb)
					continue;
				if (tail < mask) {
					ntab[tail++] = skb;
					continue;
				}
				/* New ring is full: account for and drop the rest. */
				dropped += qdisc_pkt_len(skb);
				qdisc_qstats_backlog_dec(sch, skb);
				--sch->q.qlen;
				rtnl_qdisc_drop(skb, sch);
			}
			/* Report the packet and byte reduction in one call. */
			qdisc_tree_reduce_backlog(sch, oqlen - sch->q.qlen, dropped);
			q->head = 0;
			q->tail = tail;
		}

		q->tab_mask = mask;
		q->tab = ntab;
	} else
		sch_tree_lock(sch);

	/* WRITE_ONCE pairs with the READ_ONCE accesses in choke_dump(). */
	WRITE_ONCE(q->flags, ctl->flags);
	WRITE_ONCE(q->limit, ctl->limit);

	red_set_parms(&q->parms, ctl->qth_min, ctl->qth_max, ctl->Wlog,
		      ctl->Plog, ctl->Scell_log,
		      stab,
		      max_P);
	red_set_vars(&q->vars);

	/* Ring is empty (head == tail). */
	if (q->head == q->tail)
		red_end_of_idle_period(&q->vars);

	sch_tree_unlock(sch);
	/* Free the replaced table (NULL if none was replaced) after unlocking. */
	choke_free(old);
	return 0;
}

/* Qdisc init hook: initial configuration goes through choke_change(). */
static int choke_init(struct Qdisc *sch, struct nlattr *opt,
		      struct netlink_ext_ack *extack)
{
	return choke_change(sch, opt, extack);
}

/*
 * choke_dump - emit the current configuration as netlink attributes.
 *
 * Writes a TCA_OPTIONS nest containing TCA_CHOKE_PARMS (a struct
 * tc_red_qopt snapshot) and TCA_CHOKE_MAX_P.  qth_min/qth_max are reported
 * shifted right by Wlog.  Returns the result of nla_nest_end() on success
 * or -EMSGSIZE after cancelling the nest.
 */
static int choke_dump(struct Qdisc *sch, struct sk_buff *skb)
{
	struct choke_sched_data *q = qdisc_priv(sch);
	u8 Wlog = READ_ONCE(q->parms.Wlog);
	struct nlattr *opts = NULL;
	struct tc_red_qopt opt = {
		.limit		= READ_ONCE(q->limit),
		.flags		= READ_ONCE(q->flags),
		.qth_min	= READ_ONCE(q->parms.qth_min) >> Wlog,
		.qth_max	= READ_ONCE(q->parms.qth_max) >> Wlog,
		.Wlog		= Wlog,
		.Plog		= READ_ONCE(q->parms.Plog),
		.Scell_log	= READ_ONCE(q->parms.Scell_log),
	};

	opts = nla_nest_start_noflag(skb, TCA_OPTIONS);
	if (opts == NULL)
		goto nla_put_failure;

	if (nla_put(skb, TCA_CHOKE_PARMS, sizeof(opt), &opt) ||
	    nla_put_u32(skb, TCA_CHOKE_MAX_P, READ_ONCE(q->parms.max_P)))
		goto nla_put_failure;
	return nla_nest_end(skb, opts);

nla_put_failure:
	nla_nest_cancel(skb, opts);
	return -EMSGSIZE;
}

/*
 * choke_dump_stats - export extended statistics.
 *
 * "early" sums prob_drop and forced_drop, "marked" sums prob_mark and
 * forced_mark; pdrop and matched are copied as-is.
 */
static int choke_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
{
	struct choke_sched_data *q = qdisc_priv(sch);
	struct tc_choke_xstats st = {
		.early	= READ_ONCE(q->stats.prob_drop) +
			  READ_ONCE(q->stats.forced_drop),
		.marked	= READ_ONCE(q->stats.prob_mark) +
			  READ_ONCE(q->stats.forced_mark),
		.pdrop	= READ_ONCE(q->stats.pdrop),
		.matched = READ_ONCE(q->stats.matched),
	};

	return gnet_stats_copy_app(d, &st, sizeof(st));
}

/* Qdisc destroy hook: release the ring table. */
static void choke_destroy(struct Qdisc *sch)
{
	struct choke_sched_data *q = qdisc_priv(sch);

	choke_free(q->tab);
}

/* Return the contents of the head slot, or NULL when head == tail. */
static struct sk_buff *choke_peek_head(struct Qdisc *sch)
{
	struct choke_sched_data *q = qdisc_priv(sch);

	return (q->head != q->tail) ? q->tab[q->head] : NULL;
}

/* Operations table registered under the id "choke". */
static struct Qdisc_ops choke_qdisc_ops __read_mostly = {
	.id		=	"choke",
	.priv_size	=	sizeof(struct choke_sched_data),

	.enqueue	=	choke_enqueue,
	.dequeue	=	choke_dequeue,
	.peek		=	choke_peek_head,
	.init		=	choke_init,
	.destroy	=	choke_destroy,
	.reset		=	choke_reset,
	.change		=	choke_change,
	.dump		=	choke_dump,
	.dump_stats	=	choke_dump_stats,
	.owner		=	THIS_MODULE,
};
MODULE_ALIAS_NET_SCH("choke");

/* Module load: register the qdisc operations. */
static int __init choke_module_init(void)
{
	return register_qdisc(&choke_qdisc_ops);
}

/* Module unload: unregister the qdisc operations. */
static void __exit choke_module_exit(void)
{
	unregister_qdisc(&choke_qdisc_ops);
}

module_init(choke_module_init)
module_exit(choke_module_exit)

MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Choose and keep responsive flows scheduler");
1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233
// SPDX-License-Identifier: GPL-2.0
/*
 * drivers/power/process.c - Functions for starting/stopping processes on
 *                           suspend transitions.
 *
 * Originally from swsusp.
 */

#include <linux/interrupt.h>
#include <linux/oom.h>
#include <linux/suspend.h>
#include <linux/module.h>
#include <linux/sched/debug.h>
#include <linux/sched/task.h>
#include <linux/syscalls.h>
#include <linux/freezer.h>
#include <linux/delay.h>
#include <linux/workqueue.h>
#include <linux/kmod.h>
#include <trace/events/power.h>
#include <linux/cpuset.h>

/*
 * Timeout for stopping processes, in milliseconds (20 seconds by default).
 */
unsigned int __read_mostly freeze_timeout_msecs = 20 * MSEC_PER_SEC;

/*
 * try_to_freeze_tasks - repeatedly ask tasks to freeze until none remain.
 * @user_only: when false, workqueues are frozen as well; it also selects
 *             the wording used in the log messages.
 *
 * Each pass calls freeze_task() on every thread except current and counts
 * those for which it returns true.  The loop ends when nothing is left to
 * do, when freeze_timeout_msecs has elapsed, or when pm_wakeup_pending()
 * reports a wakeup.
 *
 * Returns 0 if nothing was left to freeze, -EBUSY otherwise.
 */
static int try_to_freeze_tasks(bool user_only)
{
	const char *what = user_only ? "user space processes" :
					"remaining freezable tasks";
	struct task_struct *g, *p;
	unsigned long end_time;
	unsigned int todo;
	bool wq_busy = false;
	ktime_t start, end, elapsed;
	unsigned int elapsed_msecs;
	bool wakeup = false;
	int sleep_usecs = USEC_PER_MSEC;

	pr_info("Freezing %s\n", what);

	start = ktime_get_boottime();

	end_time = jiffies + msecs_to_jiffies(freeze_timeout_msecs);

	if (!user_only)
		freeze_workqueues_begin();

	while (true) {
		todo = 0;
		read_lock(&tasklist_lock);
		for_each_process_thread(g, p) {
			if (p == current || !freeze_task(p))
				continue;

			todo++;
		}
		read_unlock(&tasklist_lock);

		if (!user_only) {
			/* A busy workqueue counts as one more outstanding item. */
			wq_busy = freeze_workqueues_busy();
			todo += wq_busy;
		}

		if (!todo || time_after(jiffies, end_time))
			break;

		if (pm_wakeup_pending()) {
			wakeup = true;
			break;
		}

		/*
		 * We need to retry, but first give the freezing tasks some
		 * time to enter the refrigerator.  Start with an initial
		 * 1 ms sleep followed by exponential backoff until 8 ms.
		 */
		usleep_range(sleep_usecs / 2, sleep_usecs);
		if (sleep_usecs < 8 * USEC_PER_MSEC)
			sleep_usecs *= 2;
	}

	end = ktime_get_boottime();
	elapsed = ktime_sub(end, start);
	elapsed_msecs = ktime_to_ms(elapsed);

	if (todo) {
		/* todo includes wq_busy, so subtract it to report tasks only. */
		pr_err("Freezing %s %s after %d.%03d seconds "
		       "(%d tasks refusing to freeze, wq_busy=%d):\n", what,
		       wakeup ? "aborted" : "failed",
		       elapsed_msecs / 1000, elapsed_msecs % 1000,
		       todo - wq_busy, wq_busy);

		if (wq_busy)
			show_freezable_workqueues();

		/* On a wakeup abort, list tasks only if PM debug messages are on. */
		if (!wakeup || pm_debug_messages_on) {
			read_lock(&tasklist_lock);
			for_each_process_thread(g, p) {
				if (p != current && freezing(p) && !frozen(p))
					sched_show_task(p);
			}
			read_unlock(&tasklist_lock);
		}
	} else {
		pr_info("Freezing %s completed (elapsed %d.%03d seconds)\n",
			what, elapsed_msecs / 1000, elapsed_msecs % 1000);
	}

	return todo ? -EBUSY : 0;
}

/**
 * freeze_processes - Signal user space processes to enter the refrigerator.
 * The current thread will not be frozen.  The same process that calls
 * freeze_processes must later call thaw_processes.
 *
 * On success, returns 0.  On failure, -errno and system is fully thawed.
 */
int freeze_processes(void)
{
	int error;

	error = __usermodehelper_disable(UMH_FREEZING);
	if (error)
		return error;

	/* Make sure this task doesn't get frozen */
	current->flags |= PF_SUSPEND_TASK;

	/* Bump freezer_active only on the false -> true transition of pm_freezing. */
	if (!pm_freezing)
		static_branch_inc(&freezer_active);

	pm_wakeup_clear(0);
	pm_freezing = true;
	error = try_to_freeze_tasks(true);
	if (!error)
		__usermodehelper_set_disable_depth(UMH_DISABLED);

	BUG_ON(in_atomic());

	/*
	 * Now that the whole userspace is frozen we need to disable
	 * the OOM killer to disallow any further interference with
	 * killable tasks. There is no guarantee that OOM victims will
	 * ever reach a point where they go away, so we have to wait
	 * with a timeout.
	 */
	if (!error && !oom_killer_disable(msecs_to_jiffies(freeze_timeout_msecs)))
		error = -EBUSY;

	if (error)
		thaw_processes();
	return error;
}

/**
 * freeze_kernel_threads - Make freezable kernel threads go to the refrigerator.
 *
 * On success, returns 0.  On failure, -errno and only the kernel threads are
 * thawed, so as to give a chance to the caller to do additional cleanups
 * (if any) before thawing the userspace tasks. So, it is the responsibility
 * of the caller to thaw the userspace tasks, when the time is right.
 */
int freeze_kernel_threads(void)
{
	int error;

	pm_nosig_freezing = true;
	error = try_to_freeze_tasks(false);

	BUG_ON(in_atomic());

	if (error)
		thaw_kernel_threads();
	return error;
}

/*
 * thaw_processes - undo freeze_processes() and freeze_kernel_threads().
 *
 * Clears pm_freezing/pm_nosig_freezing, re-enables the OOM killer, thaws
 * workqueues and every thread, clears PF_SUSPEND_TASK on the caller (which
 * is expected to be the only task carrying it) and re-enables the
 * usermode helper.
 */
void thaw_processes(void)
{
	struct task_struct *g, *p;
	struct task_struct *curr = current;

	trace_suspend_resume(TPS("thaw_processes"), 0, true);
	/* Mirror of the static_branch_inc() done in freeze_processes(). */
	if (pm_freezing)
		static_branch_dec(&freezer_active);
	pm_freezing = false;
	pm_nosig_freezing = false;

	oom_killer_enable();

	pr_info("Restarting tasks: Starting\n");

	__usermodehelper_set_disable_depth(UMH_FREEZING);
	thaw_workqueues();

	read_lock(&tasklist_lock);
	for_each_process_thread(g, p) {
		/* No other threads should have PF_SUSPEND_TASK set */
		WARN_ON((p != curr) && (p->flags & PF_SUSPEND_TASK));
		__thaw_task(p);
	}
	read_unlock(&tasklist_lock);

	WARN_ON(!(curr->flags & PF_SUSPEND_TASK));
	curr->flags &= ~PF_SUSPEND_TASK;

	usermodehelper_enable();

	schedule();
	pr_info("Restarting tasks: Done\n");
	trace_suspend_resume(TPS("thaw_processes"), 0, false);
}

/*
 * thaw_kernel_threads - thaw workqueues and tasks flagged PF_KTHREAD only.
 *
 * Clears pm_nosig_freezing; pm_freezing is left untouched.
 */
void thaw_kernel_threads(void)
{
	struct task_struct *g, *p;

	pm_nosig_freezing = false;
	pr_info("Restarting kernel threads ...\n");

	thaw_workqueues();

	read_lock(&tasklist_lock);
	for_each_process_thread(g, p) {
		if (p->flags & PF_KTHREAD)
			__thaw_task(p);
	}
	read_unlock(&tasklist_lock);

	schedule();
	pr_info("Done restarting kernel threads.\n");
}
1 1 1 1 1 18 11 9 12 1 1 13 4 1 43 19 50 48 50 10 10 10 10 6 11 18 46 34 36 70 47 48 12 12 60 60 5 5 2 2 19 8 8 19 5 11 18 18 18 18 18 18 2 1 1 1 4 4 4 2 42 3 42 7 4 4 1 4 28 108 83 83 83 83 5 82 83 1 6 1 1 1 6 50 50 49 51 50 2 51 50 32 59 60 2 60 37 31 4 2 31 30 4 31 2 37 37 59 7 1 7 60 3 60 57 25 36 36 19 25 5 5 5 5 119 3 3 3 3 1 3 1 3 3 1 3 1 1 3 3 3 1 2 1 3 3 7 7 7 7 1 1 1 1 1 44 38 37 29 9 37 38 37 24 24 15 1 7 1 23 23 1 48 45 49 30 30 19 18 18 17 14 4 7 15 11 11 5 36 30 11 36 4 4 4 4 4 4 4 4 4 4 4 7 1 7 1 6 6 6 6 4 5 5 5 5 5 5 1 1 1 2 7 5 5 5 4 4 3 4 3 1 2 11 11 1 1 10 10 7 11 5 6 3 5 3 6 6 3 12 11 8 8 1 7 3 12 5 5 3 3 4 5 5 5 2 4 2 4 5 1 2 2 2 5 5 5 5 5 23 23 22 1 22 22 21 22 22 11 12 12 9 9 2 1 1 1 1 7 1 6 6 1 5 5 3 5 5 5 5 5 5 6 6 17 17 24 11 11 11 11 11 11 11 5 5 5 5 2 1 4 4 4 4 5 3 3 4 2 2 5 5 1 8 8 9 3 62 3 62 54 71 8 8 69 18 64 64 62 61 3 3 1 1 37 8 8 15 7 7 5 9 53 53 49 1 49 7 47 1 47 1 46 6 47 46 46 45 43 6 5 43 1 44 8 39 39 39 1 39 3 3 3 3 2 2 2 1 1 39 1 38 38 30 38 38 6 4 4 4 2 2 2 2 2 2 37 37 1 37 37 36 37 2 2 4 14 21 32 53 10 1 8 9 9 8 8 1 7 1 7 2 3 30 30 30 13 12 29 1 29 1 28 27 18 18 3 15 18 18 17 18 3 3 15 18 18 16 3 17 24 10 7 21 2 4 1 5 5 10 28 15 1 14 14 2 15 12 12 27 1 27 19 19 19 19 15 11 7 7 5 5 1 1 15 1 15 15 15 14 3 15 15 1 15 15 11 5 11 4 4 2 15 15 19 12 11 6 18 4 1 4 3 20 19 18 18 18 17 16 15 15 20 3 3 1 2 2 2 1 2 2 2 2 25 8 4 2 2 1 6 6 4 2 2 4 3 8 5 5 5 5 1 5 5 5 35 34 3 33 33 33 33 8 32 32 25 25 7 24 21 23 20 20 20 20 1 13 21 1 1 1 20 1 21 3 21 12 21 1 21 15 1 15 1 1 15 8 6 1 5 1 5 2 7 26 22 2 2 2 5 29 19 19 2 34 27 29 3 3 4 1 4 12 12 10 12 10 10 1 10 8 10 7 2 3 10 12 5 5 5 2 3 2 1 1 1 1 3 5 4 2 2 2 2 62 1 2 62 2 2 2 2 1 1 1 1 2 14 14 14 1 14 2 13 1 13 14 1 14 1 14 13 14 24 24 24 22 2 24 1 2 1 24 3 24 24 3 24 15 13 13 4 3 13 11 24 4 1 4 5 5 4 5 5 4 4 1 4 5 5 5 5 4 5 5 5 4 4 4 4 2 3 1 3 3 4 4 9 9 9 8 9 9 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 
47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 
560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 
1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 
1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 
1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 
2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 
2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 
3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 
3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 
3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 // SPDX-License-Identifier: GPL-2.0-or-later /* * NET4: Implementation of BSD Unix domain sockets. * * Authors: Alan Cox, <alan@lxorguk.ukuu.org.uk> * * Fixes: * Linus Torvalds : Assorted bug cures. * Niibe Yutaka : async I/O support. * Carsten Paeth : PF_UNIX check, address fixes. * Alan Cox : Limit size of allocated blocks. * Alan Cox : Fixed the stupid socketpair bug. * Alan Cox : BSD compatibility fine tuning. * Alan Cox : Fixed a bug in connect when interrupted. * Alan Cox : Sorted out a proper draft version of * file descriptor passing hacked up from * Mike Shaver's work. * Marty Leisner : Fixes to fd passing * Nick Nevin : recvmsg bugfix. * Alan Cox : Started proper garbage collector * Heiko EiBfeldt : Missing verify_area check * Alan Cox : Started POSIXisms * Andreas Schwab : Replace inode by dentry for proper * reference counting * Kirk Petersen : Made this a module * Christoph Rohland : Elegant non-blocking accept/connect algorithm. * Lots of bug fixes. * Alexey Kuznetosv : Repaired (I hope) bugs introduces * by above two patches. * Andrea Arcangeli : If possible we block in connect(2) * if the max backlog of the listen socket * is been reached. This won't break * old apps and it will avoid huge amount * of socks hashed (this for unix_gc() * performances reasons). 
* Security fix that limits the max * number of socks to 2*max_files and * the number of skb queueable in the * dgram receiver. * Artur Skawina : Hash function optimizations * Alexey Kuznetsov : Full scale SMP. Lot of bugs are introduced 8) * Malcolm Beattie : Set peercred for socketpair * Michal Ostrowski : Module initialization cleanup. * Arnaldo C. Melo : Remove MOD_{INC,DEC}_USE_COUNT, * the core infrastructure is doing that * for all net proto families now (2.5.69+) * * Known differences from reference BSD that was tested: * * [TO FIX] * ECONNREFUSED is not returned from one end of a connected() socket to the * other the moment one end closes. * fstat() doesn't return st_dev=0, and give the blksize as high water mark * and a fake inode identifier (nor the BSD first socket fstat twice bug). * [NOT TO FIX] * accept() returns a path name even if the connecting socket has closed * in the meantime (BSD loses the path and gives up). * accept() returns 0 length path for an unbound connector. BSD returns 16 * and a null first byte in the path (but not for gethost/peername - BSD bug ??) * socketpair(...SOCK_RAW..) doesn't panic the kernel. * BSD af_unix apparently has connect forgetting to block properly. * (need to check this with the POSIX spec in detail) * * Differences from 2.0.0-11-... (ANK) * Bug fixes and improvements. * - client shutdown killed server socket. * - removed all useless cli/sti pairs. * * Semantic changes/extensions. * - generic control message passing. * - SCM_CREDENTIALS control message. * - "Abstract" (not FS based) socket bindings. * Abstract names are sequences of bytes (not zero terminated) * started by 0, so that this name space does not intersect * with BSD names. 
*/ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/bpf-cgroup.h> #include <linux/btf_ids.h> #include <linux/dcache.h> #include <linux/errno.h> #include <linux/fcntl.h> #include <linux/file.h> #include <linux/filter.h> #include <linux/fs.h> #include <linux/fs_struct.h> #include <linux/init.h> #include <linux/kernel.h> #include <linux/mount.h> #include <linux/namei.h> #include <linux/net.h> #include <linux/pidfs.h> #include <linux/poll.h> #include <linux/proc_fs.h> #include <linux/sched/signal.h> #include <linux/security.h> #include <linux/seq_file.h> #include <linux/skbuff.h> #include <linux/slab.h> #include <linux/socket.h> #include <linux/splice.h> #include <linux/string.h> #include <linux/uaccess.h> #include <net/af_unix.h> #include <net/net_namespace.h> #include <net/scm.h> #include <net/tcp_states.h> #include <uapi/linux/sockios.h> #include <uapi/linux/termios.h> #include "af_unix.h" static atomic_long_t unix_nr_socks; static struct hlist_head bsd_socket_buckets[UNIX_HASH_SIZE / 2]; static spinlock_t bsd_socket_locks[UNIX_HASH_SIZE / 2]; /* SMP locking strategy: * hash table is protected with spinlock. * each socket state is protected by separate spinlock. */ #ifdef CONFIG_PROVE_LOCKING #define cmp_ptr(l, r) (((l) > (r)) - ((l) < (r))) static int unix_table_lock_cmp_fn(const struct lockdep_map *a, const struct lockdep_map *b) { return cmp_ptr(a, b); } static int unix_state_lock_cmp_fn(const struct lockdep_map *_a, const struct lockdep_map *_b) { const struct unix_sock *a, *b; a = container_of(_a, struct unix_sock, lock.dep_map); b = container_of(_b, struct unix_sock, lock.dep_map); if (a->sk.sk_state == TCP_LISTEN) { /* unix_stream_connect(): Before the 2nd unix_state_lock(), * * 1. a is TCP_LISTEN. * 2. b is not a. * 3. concurrent connect(b -> a) must fail. * * Except for 2. & 3., the b's state can be any possible * value due to concurrent connect() or listen(). * * 2. is detected in debug_spin_lock_before(), and 3. 
cannot * be expressed as lock_cmp_fn. */ switch (b->sk.sk_state) { case TCP_CLOSE: case TCP_ESTABLISHED: case TCP_LISTEN: return -1; default: /* Invalid case. */ return 0; } } /* Should never happen. Just to be symmetric. */ if (b->sk.sk_state == TCP_LISTEN) { switch (b->sk.sk_state) { case TCP_CLOSE: case TCP_ESTABLISHED: return 1; default: return 0; } } /* unix_state_double_lock(): ascending address order. */ return cmp_ptr(a, b); } static int unix_recvq_lock_cmp_fn(const struct lockdep_map *_a, const struct lockdep_map *_b) { const struct sock *a, *b; a = container_of(_a, struct sock, sk_receive_queue.lock.dep_map); b = container_of(_b, struct sock, sk_receive_queue.lock.dep_map); /* unix_collect_skb(): listener -> embryo order. */ if (a->sk_state == TCP_LISTEN && unix_sk(b)->listener == a) return -1; /* Should never happen. Just to be symmetric. */ if (b->sk_state == TCP_LISTEN && unix_sk(a)->listener == b) return 1; return 0; } #endif static unsigned int unix_unbound_hash(struct sock *sk) { unsigned long hash = (unsigned long)sk; hash ^= hash >> 16; hash ^= hash >> 8; hash ^= sk->sk_type; return hash & UNIX_HASH_MOD; } static unsigned int unix_bsd_hash(struct inode *i) { return i->i_ino & UNIX_HASH_MOD; } static unsigned int unix_abstract_hash(struct sockaddr_un *sunaddr, int addr_len, int type) { __wsum csum = csum_partial(sunaddr, addr_len, 0); unsigned int hash; hash = (__force unsigned int)csum_fold(csum); hash ^= hash >> 8; hash ^= type; return UNIX_HASH_MOD + 1 + (hash & UNIX_HASH_MOD); } static void unix_table_double_lock(struct net *net, unsigned int hash1, unsigned int hash2) { if (hash1 == hash2) { spin_lock(&net->unx.table.locks[hash1]); return; } if (hash1 > hash2) swap(hash1, hash2); spin_lock(&net->unx.table.locks[hash1]); spin_lock(&net->unx.table.locks[hash2]); } static void unix_table_double_unlock(struct net *net, unsigned int hash1, unsigned int hash2) { if (hash1 == hash2) { spin_unlock(&net->unx.table.locks[hash1]); return; } 
spin_unlock(&net->unx.table.locks[hash1]); spin_unlock(&net->unx.table.locks[hash2]); } #ifdef CONFIG_SECURITY_NETWORK static void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb) { UNIXCB(skb).secid = scm->secid; } static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb) { scm->secid = UNIXCB(skb).secid; } static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb) { return (scm->secid == UNIXCB(skb).secid); } #else static inline void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb) { } static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb) { } static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb) { return true; } #endif /* CONFIG_SECURITY_NETWORK */ static inline int unix_may_send(struct sock *sk, struct sock *osk) { return !unix_peer(osk) || unix_peer(osk) == sk; } static inline int unix_recvq_full_lockless(const struct sock *sk) { return skb_queue_len_lockless(&sk->sk_receive_queue) > sk->sk_max_ack_backlog; } struct sock *unix_peer_get(struct sock *s) { struct sock *peer; unix_state_lock(s); peer = unix_peer(s); if (peer) sock_hold(peer); unix_state_unlock(s); return peer; } EXPORT_SYMBOL_GPL(unix_peer_get); static struct unix_address *unix_create_addr(struct sockaddr_un *sunaddr, int addr_len) { struct unix_address *addr; addr = kmalloc(sizeof(*addr) + addr_len, GFP_KERNEL); if (!addr) return NULL; refcount_set(&addr->refcnt, 1); addr->len = addr_len; memcpy(addr->name, sunaddr, addr_len); return addr; } static inline void unix_release_addr(struct unix_address *addr) { if (refcount_dec_and_test(&addr->refcnt)) kfree(addr); } /* * Check unix socket name: * - should be not zero length. * - if started by not zero, should be NULL terminated (FS object) * - if started by zero, it is abstract name. 
*/

/* Validate the length and family of an AF_UNIX address.
 *
 * Returns -EINVAL when @addr_len does not extend past the start of sun_path,
 * when it is larger than struct sockaddr_un, or when sun_family is not
 * AF_UNIX; returns 0 otherwise.
 */
static int unix_validate_addr(struct sockaddr_un *sunaddr, int addr_len)
{
	if (addr_len <= offsetof(struct sockaddr_un, sun_path) ||
	    addr_len > sizeof(*sunaddr))
		return -EINVAL;

	if (sunaddr->sun_family != AF_UNIX)
		return -EINVAL;

	return 0;
}

/* NUL-terminate the path of a filesystem address at @addr_len and return
 * the address length recomputed from the string: strlen(path) plus the
 * sun_path offset plus one for the terminator.
 */
static int unix_mkname_bsd(struct sockaddr_un *sunaddr, int addr_len)
{
	struct sockaddr_storage *addr = (struct sockaddr_storage *)sunaddr;
	short offset = offsetof(struct sockaddr_storage, __data);

	/* The arithmetic below relies on both structs placing the payload at
	 * the same offset.
	 */
	BUILD_BUG_ON(offset != offsetof(struct sockaddr_un, sun_path));

	/* This may look like an off by one error but it is a bit more
	 * subtle.  108 is the longest valid AF_UNIX path for a binding.
	 * sun_path[108] doesn't as such exist.  However in kernel space
	 * we are guaranteed that it is a valid memory location in our
	 * kernel address buffer because syscall functions always pass
	 * a pointer of struct sockaddr_storage which has a bigger buffer
	 * than 108.  Also, we must terminate sun_path for strlen() in
	 * getname_kernel().
	 */
	addr->__data[addr_len - offset] = 0;

	/* Don't pass sunaddr->sun_path to strlen().  Otherwise, 108 will
	 * cause panic if CONFIG_FORTIFY_SOURCE=y.  Let __fortify_strlen()
	 * know the actual buffer.
	 */
	return strlen(addr->__data) + offset + 1;
}

/* Unlink @sk from its hash chain.  No locking is done here; the callers
 * visible in this file take the bucket lock(s) first.
 */
static void __unix_remove_socket(struct sock *sk)
{
	sk_del_node_init(sk);
}

/* Link @sk into the per-netns bucket selected by sk->sk_hash.  Warns (debug
 * builds) if the socket is still hashed.  No locking is done here.
 */
static void __unix_insert_socket(struct net *net, struct sock *sk)
{
	DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk));
	sk_add_node(sk, &net->unx.table.buckets[sk->sk_hash]);
}

/* Move @sk to bucket @hash and publish @addr as its address.  The address
 * pointer is stored with release semantics before the socket is re-hashed.
 * No locking is done here.
 */
static void __unix_set_addr_hash(struct net *net, struct sock *sk,
				 struct unix_address *addr, unsigned int hash)
{
	__unix_remove_socket(sk);
	smp_store_release(&unix_sk(sk)->addr, addr);

	sk->sk_hash = hash;
	__unix_insert_socket(net, sk);
}

/* Locked wrapper around __unix_remove_socket(). */
static void unix_remove_socket(struct net *net, struct sock *sk)
{
	spin_lock(&net->unx.table.locks[sk->sk_hash]);
	__unix_remove_socket(sk);
	spin_unlock(&net->unx.table.locks[sk->sk_hash]);
}

/* Locked wrapper around __unix_insert_socket(); sk->sk_hash must already
 * hold the bucket index.
 */
static void unix_insert_unbound_socket(struct net *net, struct sock *sk)
{
	spin_lock(&net->unx.table.locks[sk->sk_hash]);
	__unix_insert_socket(net, sk);
	spin_unlock(&net->unx.table.locks[sk->sk_hash]);
}

/* Add @sk (via its bind node) to the file-global bsd_socket_buckets table
 * under the matching bsd_socket_locks entry.
 */
static void unix_insert_bsd_socket(struct sock *sk)
{
	spin_lock(&bsd_socket_locks[sk->sk_hash]);
	sk_add_bind_node(sk, &bsd_socket_buckets[sk->sk_hash]);
	spin_unlock(&bsd_socket_locks[sk->sk_hash]);
}

/* Remove @sk from bsd_socket_buckets if its bind node is hashed; a no-op
 * otherwise.  The node is re-initialised after the lock is dropped.
 */
static void unix_remove_bsd_socket(struct sock *sk)
{
	if (!hlist_unhashed(&sk->sk_bind_node)) {
		spin_lock(&bsd_socket_locks[sk->sk_hash]);
		__sk_del_bind_node(sk);
		spin_unlock(&bsd_socket_locks[sk->sk_hash]);

		sk_node_init(&sk->sk_bind_node);
	}
}

/* Walk bucket @hash and return the first socket whose address has length
 * @len and identical bytes, or NULL.  Takes no lock and no reference.
 *
 * NOTE(review): u->addr is dereferenced unconditionally - presumably every
 * socket reachable through the buckets searched here is bound; confirm.
 */
static struct sock *__unix_find_socket_byname(struct net *net,
					      struct sockaddr_un *sunname,
					      int len, unsigned int hash)
{
	struct sock *s;

	sk_for_each(s, &net->unx.table.buckets[hash]) {
		struct unix_sock *u = unix_sk(s);

		if (u->addr->len == len &&
		    !memcmp(u->addr->name, sunname, len))
			return s;
	}
	return NULL;
}

/* Locked lookup by name: on a match a reference is taken (sock_hold())
 * before the bucket lock is released.  Returns NULL when nothing matches.
 */
static inline struct sock *unix_find_socket_byname(struct net *net,
						   struct sockaddr_un *sunname,
						   int len, unsigned int hash)
{
	struct sock *s;

	spin_lock(&net->unx.table.locks[hash]);
	s = __unix_find_socket_byname(net, sunname, len, hash);
	if (s)
		sock_hold(s);
	spin_unlock(&net->unx.table.locks[hash]);
	return s;
}
static struct sock *unix_find_socket_byinode(struct inode *i) { unsigned int hash = unix_bsd_hash(i); struct sock *s; spin_lock(&bsd_socket_locks[hash]); sk_for_each_bound(s, &bsd_socket_buckets[hash]) { struct dentry *dentry = unix_sk(s)->path.dentry; if (dentry && d_backing_inode(dentry) == i) { sock_hold(s); spin_unlock(&bsd_socket_locks[hash]); return s; } } spin_unlock(&bsd_socket_locks[hash]); return NULL; } /* Support code for asymmetrically connected dgram sockets * * If a datagram socket is connected to a socket not itself connected * to the first socket (eg, /dev/log), clients may only enqueue more * messages if the present receive queue of the server socket is not * "too large". This means there's a second writeability condition * poll and sendmsg need to test. The dgram recv code will do a wake * up on the peer_wait wait queue of a socket upon reception of a * datagram which needs to be propagated to sleeping would-be writers * since these might not have sent anything so far. This can't be * accomplished via poll_wait because the lifetime of the server * socket might be less than that of its clients if these break their * association with it or if the server socket is closed while clients * are still connected to it and there's no way to inform "a polling * implementation" that it should let go of a certain wait queue * * In order to propagate a wake up, a wait_queue_entry_t of the client * socket is enqueued on the peer_wait queue of the server socket * whose wake function does a wake_up on the ordinary client socket * wait queue. This connection is established whenever a write (or * poll for write) hit the flow control condition and broken when the * association to the server socket is dissolved or after a wake up * was relayed. 
*/

/* Wake function of a socket's peer_wake entry (registered with
 * init_waitqueue_func_entry() in unix_create1()).
 *
 * Unlinks the entry from the peer_wait queue it was enqueued on, clears
 * peer_wake.private, and forwards the wake-up to the sleepers of the owning
 * socket if it still has a wait queue.  Always returns 0.
 *
 * NOTE(review): uses the unlocked __remove_wait_queue(); presumably the
 * waker holds the peer_wait queue lock - confirm.
 */
static int unix_dgram_peer_wake_relay(wait_queue_entry_t *q, unsigned mode, int flags,
				      void *key)
{
	struct unix_sock *u;
	wait_queue_head_t *u_sleep;

	u = container_of(q, struct unix_sock, peer_wake);

	__remove_wait_queue(&unix_sk(u->peer_wake.private)->peer_wait,
			    q);
	u->peer_wake.private = NULL;

	/* relaying can only happen while the wq still exists */
	u_sleep = sk_sleep(&u->sk);
	if (u_sleep)
		wake_up_interruptible_poll(u_sleep, key_to_poll(key));

	return 0;
}

/* Enqueue @sk's peer_wake entry on @other's peer_wait queue unless the
 * entry is already in use (peer_wake.private non-NULL).
 *
 * Returns 1 when the entry was enqueued by this call, 0 otherwise.
 */
static int unix_dgram_peer_wake_connect(struct sock *sk, struct sock *other)
{
	struct unix_sock *u, *u_other;
	int rc;

	u = unix_sk(sk);
	u_other = unix_sk(other);
	rc = 0;
	spin_lock(&u_other->peer_wait.lock);

	if (!u->peer_wake.private) {
		/* private records which peer the entry is queued on */
		u->peer_wake.private = other;
		__add_wait_queue(&u_other->peer_wait, &u->peer_wake);

		rc = 1;
	}

	spin_unlock(&u_other->peer_wait.lock);
	return rc;
}

/* Dequeue @sk's peer_wake entry from @other's peer_wait queue, but only if
 * the entry is currently queued on @other.
 */
static void unix_dgram_peer_wake_disconnect(struct sock *sk,
					    struct sock *other)
{
	struct unix_sock *u, *u_other;

	u = unix_sk(sk);
	u_other = unix_sk(other);
	spin_lock(&u_other->peer_wait.lock);

	if (u->peer_wake.private == other) {
		__remove_wait_queue(&u_other->peer_wait, &u->peer_wake);
		u->peer_wake.private = NULL;
	}

	spin_unlock(&u_other->peer_wait.lock);
}

/* Disconnect as above, then wake @sk's own sleepers with the writable poll
 * events.
 */
static void unix_dgram_peer_wake_disconnect_wakeup(struct sock *sk,
						   struct sock *other)
{
	unix_dgram_peer_wake_disconnect(sk, other);
	wake_up_interruptible_poll(sk_sleep(sk),
				   EPOLLOUT |
				   EPOLLWRNORM |
				   EPOLLWRBAND);
}

/* preconditions:
 *	- unix_peer(sk) == other
 *	- association is stable
 *
 * Returns 1 when @other's receive queue is full and @other is not dead (the
 * wake entry stays enqueued); otherwise returns 0 after removing the entry
 * again if this call was the one that enqueued it.
 */
static int unix_dgram_peer_wake_me(struct sock *sk, struct sock *other)
{
	int connected;

	connected = unix_dgram_peer_wake_connect(sk, other);

	/* If other is SOCK_DEAD, we want to make sure we signal
	 * POLLOUT, such that a subsequent write() can get a
	 * -ECONNREFUSED. Otherwise, if we haven't queued any skbs
	 * to other and its full, we will hang waiting for POLLOUT.
	 */
	if (unix_recvq_full_lockless(other) && !sock_flag(other, SOCK_DEAD))
		return 1;

	if (connected)
		unix_dgram_peer_wake_disconnect(sk, other);

	return 0;
}

/* A socket counts as writable when @state is not TCP_LISTEN and four times
 * sk_wmem_alloc does not exceed sk_sndbuf.
 */
static int unix_writable(const struct sock *sk, unsigned char state)
{
	return state != TCP_LISTEN &&
		(refcount_read(&sk->sk_wmem_alloc) << 2) <= READ_ONCE(sk->sk_sndbuf);
}

/* sk_write_space callback (installed in unix_create1()): if the socket is
 * writable, wake poll sleepers with the writable events and send the async
 * notification.  Runs under rcu_read_lock() because sk_wq is dereferenced
 * with rcu_dereference().
 */
static void unix_write_space(struct sock *sk)
{
	struct socket_wq *wq;

	rcu_read_lock();
	if (unix_writable(sk, READ_ONCE(sk->sk_state))) {
		wq = rcu_dereference(sk->sk_wq);
		if (skwq_has_sleeper(wq))
			wake_up_interruptible_sync_poll(&wq->wait,
				EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND);
		sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT);
	}
	rcu_read_unlock();
}

/* When dgram socket disconnects (or changes its peer), we clear its receive
 * queue of packets arrived from previous peer. First, it allows to do
 * flow control based only on wmem_alloc; second, sk connected to peer
 * may receive messages only from that peer. */
static void unix_dgram_disconnected(struct sock *sk, struct sock *other)
{
	/* Nothing is purged, woken or signalled when the queue is empty. */
	if (!skb_queue_empty(&sk->sk_receive_queue)) {
		skb_queue_purge_reason(&sk->sk_receive_queue,
				       SKB_DROP_REASON_UNIX_DISCONNECT);

		wake_up_interruptible_all(&unix_sk(sk)->peer_wait);

		/* If one link of bidirectional dgram pipe is disconnected,
		 * we signal error. Messages are lost. Do not make this,
		 * when peer was not connected to us.
*/
		if (!sock_flag(other, SOCK_DEAD) && unix_peer(other) == sk) {
			WRITE_ONCE(other->sk_err, ECONNRESET);
			sk_error_report(other);
		}
	}
}

/* sk_destruct callback (installed in unix_create1()).
 *
 * Purges whatever is left in the receive queue and warns (debug builds) on
 * outstanding write memory, a still-hashed socket or a still-attached
 * struct socket.  If the socket is not marked SOCK_DEAD it only logs and
 * returns; otherwise it drops the address reference and the global and
 * per-protocol socket counters.
 */
static void unix_sock_destructor(struct sock *sk)
{
	struct unix_sock *u = unix_sk(sk);

	skb_queue_purge_reason(&sk->sk_receive_queue, SKB_DROP_REASON_SOCKET_CLOSE);

	DEBUG_NET_WARN_ON_ONCE(refcount_read(&sk->sk_wmem_alloc));
	DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk));
	DEBUG_NET_WARN_ON_ONCE(sk->sk_socket);
	if (!sock_flag(sk, SOCK_DEAD)) {
		pr_info("Attempt to release alive unix socket: %p\n", sk);
		return;
	}

	if (u->addr)
		unix_release_addr(u->addr);

	atomic_long_dec(&unix_nr_socks);
	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
#ifdef UNIX_REFCNT_DEBUG
	pr_debug("UNIX %p is destroyed, %ld are still alive.\n", sk,
		atomic_long_read(&unix_nr_socks));
#endif
}

/* Bytes of @skb not yet consumed: skb->len minus UNIXCB(skb).consumed. */
static unsigned int unix_skb_len(const struct sk_buff *skb)
{
	return skb->len - UNIXCB(skb).consumed;
}

/* Tear down @sk.  @embrion is non-zero when called for a not-yet-accepted
 * socket found in a listener's queue (see the recursive call below).
 *
 * Order of operations: unhash from both tables; under the state lock orphan
 * the socket, mark it shut down and closed and detach path and peer; wake
 * peer_wait sleepers; notify and release the peer; drain the receive queue;
 * drop the path and the socket reference; finally schedule the garbage
 * collector.
 */
static void unix_release_sock(struct sock *sk, int embrion)
{
	struct unix_sock *u = unix_sk(sk);
	struct sock *skpair;
	struct sk_buff *skb;
	struct path path;
	int state;

	unix_remove_socket(sock_net(sk), sk);
	unix_remove_bsd_socket(sk);

	/* Clear state */
	unix_state_lock(sk);
	sock_orphan(sk);
	WRITE_ONCE(sk->sk_shutdown, SHUTDOWN_MASK);
	/* path and peer are copied out here and released after unlocking */
	path	     = u->path;
	u->path.dentry = NULL;
	u->path.mnt = NULL;
	/* remember the pre-close state: a listener's queue holds embryos */
	state = sk->sk_state;
	WRITE_ONCE(sk->sk_state, TCP_CLOSE);

	skpair = unix_peer(sk);
	unix_peer(sk) = NULL;

	unix_state_unlock(sk);

#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
	u->oob_skb = NULL;
#endif

	wake_up_interruptible_all(&u->peer_wait);

	if (skpair != NULL) {
		if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) {
			struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);

#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
			/* a fully consumed head skb does not count as unread
			 * data; look at the one after it
			 */
			if (skb && !unix_skb_len(skb))
				skb = skb_peek_next(skb, &sk->sk_receive_queue);
#endif
			unix_state_lock(skpair);
			/* No more writes */
			WRITE_ONCE(skpair->sk_shutdown, SHUTDOWN_MASK);
			/* unread data left behind, or an embryo being
			 * dropped: the peer sees a reset
			 */
			if (skb || embrion)
				WRITE_ONCE(skpair->sk_err, ECONNRESET);
			unix_state_unlock(skpair);
			skpair->sk_state_change(skpair);
			sk_wake_async(skpair, SOCK_WAKE_WAITD, POLL_HUP);
		}

		unix_dgram_peer_wake_disconnect(sk, skpair);
		sock_put(skpair); /* It may now die */
	}

	/* Try to flush out this socket. Throw out buffers at least */

	while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
		/* each skb queued on a listener stands for an embryo socket */
		if (state == TCP_LISTEN)
			unix_release_sock(skb->sk, 1);

		/* passed fds are erased in the kfree_skb hook */
		kfree_skb_reason(skb, SKB_DROP_REASON_SOCKET_CLOSE);
	}

	if (path.dentry)
		path_put(&path);

	sock_put(sk);

	/* ---- Socket is dead now and most probably destroyed ---- */

	unix_schedule_gc(NULL);
}

/* A pid/cred pair, used to prepare peer credentials before a socket's
 * sk_peer_lock is taken.
 */
struct unix_peercred {
	struct pid *peer_pid;
	const struct cred *peer_cred;
};

/* Register the current thread group's pid with pidfs and, on success, fill
 * @peercred with references to that pid and to the current credentials.
 * Returns the result of pidfs_register_pid(); @peercred is left untouched
 * on failure.
 */
static inline int prepare_peercred(struct unix_peercred *peercred)
{
	struct pid *pid;
	int err;

	pid = task_tgid(current);
	err = pidfs_register_pid(pid);
	if (likely(!err)) {
		peercred->peer_pid = get_pid(pid);
		peercred->peer_cred = get_current_cred();
	}
	return err;
}

/* Release the references held in @peercred and leave both fields NULL
 * (the pointers are swapped out with NULL locals before being put).
 */
static void drop_peercred(struct unix_peercred *peercred)
{
	const struct cred *cred = NULL;
	struct pid *pid = NULL;

	might_sleep();

	swap(peercred->peer_pid, pid);
	swap(peercred->peer_cred, cred);

	put_pid(pid);
	put_cred(cred);
}

/* Plain assignment of @peercred's pointers to @sk: no locking and no
 * reference counting is done here.
 */
static inline void init_peercred(struct sock *sk,
				 const struct unix_peercred *peercred)
{
	sk->sk_peer_pid = peercred->peer_pid;
	sk->sk_peer_cred = peercred->peer_cred;
}

/* Exchange the peer credentials of @sk with those in @peercred under
 * sk_peer_lock.  On return @peercred holds the socket's previous pid/cred,
 * so the caller can release them (unix_listen() does so via
 * drop_peercred()).
 */
static void update_peercred(struct sock *sk, struct unix_peercred *peercred)
{
	const struct cred *old_cred;
	struct pid *old_pid;

	spin_lock(&sk->sk_peer_lock);
	old_pid = sk->sk_peer_pid;
	old_cred = sk->sk_peer_cred;
	init_peercred(sk, peercred);
	spin_unlock(&sk->sk_peer_lock);

	peercred->peer_pid = old_pid;
	peercred->peer_cred = old_cred;
}

/* Give @sk its own references to @peersk's peer pid and cred.  The caller
 * must hold @peersk's state lock (asserted via lockdep).
 */
static void copy_peercred(struct sock *sk, struct sock *peersk)
{
	lockdep_assert_held(&unix_sk(peersk)->lock);

	spin_lock(&sk->sk_peer_lock);
	sk->sk_peer_pid = get_pid(peersk->sk_peer_pid);
	sk->sk_peer_cred = get_cred(peersk->sk_peer_cred);
	spin_unlock(&sk->sk_peer_lock);
}

/* True when either sk_scm_credentials or sk_scm_pidfd is set on @sk. */
static bool unix_may_passcred(const struct sock *sk)
{ return sk->sk_scm_credentials || sk->sk_scm_pidfd; } static int unix_listen(struct socket *sock, int backlog) { int err; struct sock *sk = sock->sk; struct unix_sock *u = unix_sk(sk); struct unix_peercred peercred = {}; err = -EOPNOTSUPP; if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET) goto out; /* Only stream/seqpacket sockets accept */ err = -EINVAL; if (!READ_ONCE(u->addr)) goto out; /* No listens on an unbound socket */ err = prepare_peercred(&peercred); if (err) goto out; unix_state_lock(sk); if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN) goto out_unlock; if (backlog > sk->sk_max_ack_backlog) wake_up_interruptible_all(&u->peer_wait); sk->sk_max_ack_backlog = backlog; WRITE_ONCE(sk->sk_state, TCP_LISTEN); /* set credentials so connect can copy them */ update_peercred(sk, &peercred); err = 0; out_unlock: unix_state_unlock(sk); drop_peercred(&peercred); out: return err; } static int unix_release(struct socket *); static int unix_bind(struct socket *, struct sockaddr_unsized *, int); static int unix_stream_connect(struct socket *, struct sockaddr_unsized *, int addr_len, int flags); static int unix_socketpair(struct socket *, struct socket *); static int unix_accept(struct socket *, struct socket *, struct proto_accept_arg *arg); static int unix_getname(struct socket *, struct sockaddr *, int); static __poll_t unix_poll(struct file *, struct socket *, poll_table *); static __poll_t unix_dgram_poll(struct file *, struct socket *, poll_table *); static int unix_ioctl(struct socket *, unsigned int, unsigned long); #ifdef CONFIG_COMPAT static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg); #endif static int unix_shutdown(struct socket *, int); static int unix_stream_sendmsg(struct socket *, struct msghdr *, size_t); static int unix_stream_recvmsg(struct socket *, struct msghdr *, size_t, int); static ssize_t unix_stream_splice_read(struct socket *, loff_t *ppos, struct pipe_inode_info *, size_t size, 
unsigned int flags); static int unix_dgram_sendmsg(struct socket *, struct msghdr *, size_t); static int unix_dgram_recvmsg(struct socket *, struct msghdr *, size_t, int); static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor); static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor); static int unix_dgram_connect(struct socket *, struct sockaddr_unsized *, int, int); static int unix_seqpacket_sendmsg(struct socket *, struct msghdr *, size_t); static int unix_seqpacket_recvmsg(struct socket *, struct msghdr *, size_t, int); #ifdef CONFIG_PROC_FS static int unix_count_nr_fds(struct sock *sk) { struct sk_buff *skb; struct unix_sock *u; int nr_fds = 0; spin_lock(&sk->sk_receive_queue.lock); skb = skb_peek(&sk->sk_receive_queue); while (skb) { u = unix_sk(skb->sk); nr_fds += atomic_read(&u->scm_stat.nr_fds); skb = skb_peek_next(skb, &sk->sk_receive_queue); } spin_unlock(&sk->sk_receive_queue.lock); return nr_fds; } static void unix_show_fdinfo(struct seq_file *m, struct socket *sock) { struct sock *sk = sock->sk; unsigned char s_state; struct unix_sock *u; int nr_fds = 0; if (sk) { s_state = READ_ONCE(sk->sk_state); u = unix_sk(sk); /* SOCK_STREAM and SOCK_SEQPACKET sockets never change their * sk_state after switching to TCP_ESTABLISHED or TCP_LISTEN. * SOCK_DGRAM is ordinary. So, no lock is needed. 
*/
		/* datagram or established: the socket's own counter;
		 * listener: the sum over its queued skbs
		 */
		if (sock->type == SOCK_DGRAM || s_state == TCP_ESTABLISHED)
			nr_fds = atomic_read(&u->scm_stat.nr_fds);
		else if (s_state == TCP_LISTEN)
			nr_fds = unix_count_nr_fds(sk);

		seq_printf(m, "scm_fds: %u\n", nr_fds);
	}
}
#else
#define unix_show_fdinfo NULL
#endif

/* True for the socket options that unix_setsockopt() handles itself
 * (currently only SO_INQ).
 */
static bool unix_custom_sockopt(int optname)
{
	switch (optname) {
	case SO_INQ:
		return true;
	default:
		return false;
	}
}

/* setsockopt() for sockets using unix_stream_ops.
 *
 * Only SOL_SOCKET is accepted (-EOPNOTSUPP otherwise).  Options that are not
 * custom are passed to sock_setsockopt().  SO_INQ takes an int that must be
 * 0 or 1 and is only valid on SOCK_STREAM; it is stored in recvmsg_inq.
 *
 * Returns 0 on success, -EINVAL for a bad length, value or socket type,
 * -EFAULT when the value cannot be copied, -ENOPROTOOPT for an unhandled
 * custom option.
 */
static int unix_setsockopt(struct socket *sock, int level, int optname,
			   sockptr_t optval, unsigned int optlen)
{
	struct unix_sock *u = unix_sk(sock->sk);
	struct sock *sk = sock->sk;
	int val;

	if (level != SOL_SOCKET)
		return -EOPNOTSUPP;

	if (!unix_custom_sockopt(optname))
		return sock_setsockopt(sock, level, optname, optval, optlen);

	if (optlen != sizeof(int))
		return -EINVAL;

	if (copy_from_sockptr(&val, optval, sizeof(val)))
		return -EFAULT;

	switch (optname) {
	case SO_INQ:
		if (sk->sk_type != SOCK_STREAM)
			return -EINVAL;

		if (val > 1 || val < 0)
			return -EINVAL;

		WRITE_ONCE(u->recvmsg_inq, val);
		break;
	default:
		return -ENOPROTOOPT;
	}

	return 0;
}

/* Socket operations for SOCK_STREAM (selected in unix_create()).  The only
 * table with .setsockopt, .read_skb = unix_stream_read_skb and
 * .splice_read.
 */
static const struct proto_ops unix_stream_ops = {
	.family =	PF_UNIX,
	.owner =	THIS_MODULE,
	.release =	unix_release,
	.bind =		unix_bind,
	.connect =	unix_stream_connect,
	.socketpair =	unix_socketpair,
	.accept =	unix_accept,
	.getname =	unix_getname,
	.poll =		unix_poll,
	.ioctl =	unix_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl =	unix_compat_ioctl,
#endif
	.listen =	unix_listen,
	.shutdown =	unix_shutdown,
	.setsockopt =	unix_setsockopt,
	.sendmsg =	unix_stream_sendmsg,
	.recvmsg =	unix_stream_recvmsg,
	.read_skb =	unix_stream_read_skb,
	.mmap =		sock_no_mmap,
	.splice_read =	unix_stream_splice_read,
	.set_peek_off =	sk_set_peek_off,
	.show_fdinfo =	unix_show_fdinfo,
};

/* Socket operations for SOCK_DGRAM (also used for SOCK_RAW, which
 * unix_create() rewrites to SOCK_DGRAM).  accept and listen are the
 * sock_no_* stubs.
 */
static const struct proto_ops unix_dgram_ops = {
	.family =	PF_UNIX,
	.owner =	THIS_MODULE,
	.release =	unix_release,
	.bind =		unix_bind,
	.connect =	unix_dgram_connect,
	.socketpair =	unix_socketpair,
	.accept =	sock_no_accept,
	.getname =	unix_getname,
	.poll =		unix_dgram_poll,
	.ioctl =	unix_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl =	unix_compat_ioctl,
#endif
	.listen =	sock_no_listen,
	.shutdown =	unix_shutdown,
	.sendmsg =	unix_dgram_sendmsg,
	.read_skb =	unix_read_skb,
	.recvmsg =	unix_dgram_recvmsg,
	.mmap =		sock_no_mmap,
	.set_peek_off =	sk_set_peek_off,
	.show_fdinfo =	unix_show_fdinfo,
};

/* Socket operations for SOCK_SEQPACKET: connection handling shared with
 * the stream table (connect/accept/listen), polling shared with the dgram
 * table, and its own sendmsg/recvmsg.
 */
static const struct proto_ops unix_seqpacket_ops = {
	.family =	PF_UNIX,
	.owner =	THIS_MODULE,
	.release =	unix_release,
	.bind =		unix_bind,
	.connect =	unix_stream_connect,
	.socketpair =	unix_socketpair,
	.accept =	unix_accept,
	.getname =	unix_getname,
	.poll =		unix_dgram_poll,
	.ioctl =	unix_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl =	unix_compat_ioctl,
#endif
	.listen =	unix_listen,
	.shutdown =	unix_shutdown,
	.sendmsg =	unix_seqpacket_sendmsg,
	.recvmsg =	unix_seqpacket_recvmsg,
	.mmap =		sock_no_mmap,
	.set_peek_off =	sk_set_peek_off,
	.show_fdinfo =	unix_show_fdinfo,
};

static void unix_close(struct sock *sk, long timeout)
{
	/* Nothing to do here, unix socket does not need a ->close().
	 * This is merely for sockmap.
	 */
}

/* bpf_bypass_getsockopt hook: returns true only for SOL_SOCKET /
 * SO_PEERPIDFD.
 */
static bool unix_bpf_bypass_getsockopt(int level, int optname)
{
	if (level == SOL_SOCKET) {
		switch (optname) {
		case SO_PEERPIDFD:
			return true;
		default:
			return false;
		}
	}

	return false;
}

/* struct proto used by unix_create1() for every type except SOCK_STREAM. */
struct proto unix_dgram_proto = {
	.name			= "UNIX",
	.owner			= THIS_MODULE,
	.obj_size		= sizeof(struct unix_sock),
	.close			= unix_close,
	.bpf_bypass_getsockopt	= unix_bpf_bypass_getsockopt,
#ifdef CONFIG_BPF_SYSCALL
	.psock_update_sk_prot	= unix_dgram_bpf_update_proto,
#endif
};

/* struct proto used by unix_create1() for SOCK_STREAM. */
struct proto unix_stream_proto = {
	.name			= "UNIX-STREAM",
	.owner			= THIS_MODULE,
	.obj_size		= sizeof(struct unix_sock),
	.close			= unix_close,
	.bpf_bypass_getsockopt	= unix_bpf_bypass_getsockopt,
#ifdef CONFIG_BPF_SYSCALL
	.psock_update_sk_prot	= unix_stream_bpf_update_proto,
#endif
};

/* Allocate and initialise an AF_UNIX sock of the given @type, attach it to
 * @sock and insert it into the unbound part of the hash table.
 *
 * The global socket counter is bumped first and the call fails with -ENFILE
 * once it exceeds twice get_max_files(); allocation failure yields -ENOMEM.
 * Both failures undo the counter increment.  Returns the new sock or an
 * ERR_PTR().
 */
static struct sock *unix_create1(struct net *net, struct socket *sock, int kern, int type)
{
	struct unix_sock *u;
	struct sock *sk;
	int err;

	atomic_long_inc(&unix_nr_socks);
	if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files()) {
		err = -ENFILE;
		goto err;
	}

	if (type == SOCK_STREAM)
		sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_stream_proto, kern);
	else /*dgram and  seqpacket */
		sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_dgram_proto, kern);

	if (!sk) {
		err = -ENOMEM;
		goto err;
	}

	sock_init_data(sock, sk);

	sk->sk_scm_rights	= 1;
	sk->sk_hash		= unix_unbound_hash(sk);
	sk->sk_allocation	= GFP_KERNEL_ACCOUNT;
	sk->sk_write_space	= unix_write_space;
	sk->sk_max_ack_backlog	= READ_ONCE(net->unx.sysctl_max_dgram_qlen);
	sk->sk_destruct		= unix_sock_destructor;
	lock_set_cmp_fn(&sk->sk_receive_queue.lock, unix_recvq_lock_cmp_fn, NULL);

	u = unix_sk(sk);
	u->listener = NULL;
	u->vertex = NULL;
	u->path.dentry = NULL;
	u->path.mnt = NULL;
	spin_lock_init(&u->lock);
	lock_set_cmp_fn(&u->lock, unix_state_lock_cmp_fn, NULL);
	mutex_init(&u->iolock); /* single task reading lock */
	mutex_init(&u->bindlock); /* single task binding lock */
	init_waitqueue_head(&u->peer_wait);
	init_waitqueue_func_entry(&u->peer_wake, unix_dgram_peer_wake_relay);
	memset(&u->scm_stat, 0, sizeof(struct scm_stat));
	unix_insert_unbound_socket(net, sk);

	sock_prot_inuse_add(net, sk->sk_prot, 1);

	return sk;

err:
	atomic_long_dec(&unix_nr_socks);
	return ERR_PTR(err);
}

/* Create hook for the address family: validates @protocol (0 or PF_UNIX),
 * selects the proto_ops table from sock->type and creates the sock through
 * unix_create1().  Returns 0 or a negative errno.
 */
static int unix_create(struct net *net, struct socket *sock, int protocol,
		       int kern)
{
	struct sock *sk;

	if (protocol && protocol != PF_UNIX)
		return -EPROTONOSUPPORT;

	sock->state = SS_UNCONNECTED;

	switch (sock->type) {
	case SOCK_STREAM:
		/* route setsockopt() to unix_setsockopt() */
		set_bit(SOCK_CUSTOM_SOCKOPT, &sock->flags);
		sock->ops = &unix_stream_ops;
		break;
		/*
		 *	Believe it or not BSD has AF_UNIX, SOCK_RAW though
		 *	nothing uses it.
*/
	case SOCK_RAW:
		sock->type = SOCK_DGRAM;
		fallthrough;
	case SOCK_DGRAM:
		sock->ops = &unix_dgram_ops;
		break;
	case SOCK_SEQPACKET:
		sock->ops = &unix_seqpacket_ops;
		break;
	default:
		return -ESOCKTNOSUPPORT;
	}

	sk = unix_create1(net, sock, kern, sock->type);
	if (IS_ERR(sk))
		return PTR_ERR(sk);

	return 0;
}

/* release hook: calls the protocol's ->close(), tears the sock down with
 * unix_release_sock() and detaches it from @sock.  A socket without a sock
 * is a no-op.  Always returns 0.
 */
static int unix_release(struct socket *sock)
{
	struct sock *sk = sock->sk;

	if (!sk)
		return 0;

	sk->sk_prot->close(sk, 0);
	unix_release_sock(sk, 0);
	sock->sk = NULL;

	return 0;
}

/* Resolve a filesystem address to the socket bound to it.
 *
 * With SOCK_COREDUMP in @flags the path is looked up relative to init's
 * root, under kernel credentials and with LOOKUP_BENEATH |
 * LOOKUP_NO_SYMLINKS | LOOKUP_NO_MAGICLINKS; no path_permission() check is
 * made on that branch.  Otherwise the path is resolved with kern_path()
 * (following symlinks) and must pass a MAY_WRITE permission check.
 *
 * Returns the socket with a reference held, or ERR_PTR(): the lookup or
 * permission error, -ECONNREFUSED when the inode is not a socket or nothing
 * is bound to it, -EPROTOTYPE on a type mismatch, or the error from
 * security_unix_find().
 */
static struct sock *unix_find_bsd(struct sockaddr_un *sunaddr, int addr_len,
				  int type, int flags)
{
	struct inode *inode;
	struct path path;
	struct sock *sk;
	int err;

	/* called for its side effect of NUL-terminating sun_path; the
	 * returned length is not used
	 */
	unix_mkname_bsd(sunaddr, addr_len);

	if (flags & SOCK_COREDUMP) {
		struct path root;

		task_lock(&init_task);
		get_fs_root(init_task.fs, &root);
		task_unlock(&init_task);

		scoped_with_kernel_creds()
			err = vfs_path_lookup(root.dentry, root.mnt, sunaddr->sun_path,
					      LOOKUP_BENEATH | LOOKUP_NO_SYMLINKS |
					      LOOKUP_NO_MAGICLINKS, &path);
		path_put(&root);
		if (err)
			goto fail;
	} else {
		err = kern_path(sunaddr->sun_path, LOOKUP_FOLLOW, &path);
		if (err)
			goto fail;

		err = path_permission(&path, MAY_WRITE);
		if (err)
			goto path_put;
	}

	err = -ECONNREFUSED;
	inode = d_backing_inode(path.dentry);
	if (!S_ISSOCK(inode->i_mode))
		goto path_put;

	sk = unix_find_socket_byinode(inode);
	if (!sk)
		goto path_put;

	err = -EPROTOTYPE;
	if (sk->sk_type != type)
		goto sock_put;

	err = security_unix_find(&path, sk, flags);
	if (err)
		goto sock_put;

	touch_atime(&path);
	path_put(&path);

	return sk;

sock_put:
	sock_put(sk);
path_put:
	path_put(&path);
fail:
	return ERR_PTR(err);
}

/* Resolve an abstract address to the socket bound to it in @net.  Returns
 * the socket with a reference held or ERR_PTR(-ECONNREFUSED).  If the
 * socket found happens to carry a path, its atime is touched.
 */
static struct sock *unix_find_abstract(struct net *net,
				       struct sockaddr_un *sunaddr,
				       int addr_len, int type)
{
	unsigned int hash = unix_abstract_hash(sunaddr, addr_len, type);
	struct dentry *dentry;
	struct sock *sk;

	sk = unix_find_socket_byname(net, sunaddr, addr_len, hash);
	if (!sk)
		return ERR_PTR(-ECONNREFUSED);

	dentry = unix_sk(sk)->path.dentry;
	if (dentry)
		touch_atime(&unix_sk(sk)->path);

	return sk;
}

/* Dispatch on the first byte of sun_path: non-zero means a filesystem
 * address, zero an abstract one.  Returns what the selected helper returns.
 */
static struct sock *unix_find_other(struct net *net,
				    struct sockaddr_un *sunaddr,
				    int addr_len, int type, int flags)
{
	struct sock *sk;

	if (sunaddr->sun_path[0])
		sk = unix_find_bsd(sunaddr, addr_len, type, flags);
	else
		sk = unix_find_abstract(net, sunaddr, addr_len, type);

	return sk;
}

/* Bind @sk to an unused abstract name made of a NUL byte followed by five
 * hex digits.
 *
 * Runs under u->bindlock.  An already bound socket is left alone and 0 is
 * returned.  Candidates are tried starting from a random 20-bit number,
 * incrementing with wrap-around, until a free name is found or the starting
 * value comes round again.
 */
static int unix_autobind(struct sock *sk)
{
	struct unix_sock *u = unix_sk(sk);
	unsigned int new_hash, old_hash;
	struct net *net = sock_net(sk);
	struct unix_address *addr;
	u32 lastnum, ordernum;
	int err;

	err = mutex_lock_interruptible(&u->bindlock);
	if (err)
		return err;

	/* already bound: err is still 0 here */
	if (u->addr)
		goto out;

	err = -ENOMEM;
	addr = kzalloc(sizeof(*addr) +
		       offsetof(struct sockaddr_un, sun_path) + 16, GFP_KERNEL);
	if (!addr)
		goto out;

	/* 6 = leading NUL byte + 5 hex digits */
	addr->len = offsetof(struct sockaddr_un, sun_path) + 6;
	addr->name->sun_family = AF_UNIX;
	refcount_set(&addr->refcnt, 1);

	old_hash = sk->sk_hash;
	ordernum = get_random_u32();
	lastnum = ordernum & 0xFFFFF;
retry:
	ordernum = (ordernum + 1) & 0xFFFFF;
	sprintf(addr->name->sun_path + 1, "%05x", ordernum);

	new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type);
	unix_table_double_lock(net, old_hash, new_hash);

	if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash)) {
		unix_table_double_unlock(net, old_hash, new_hash);

		/* __unix_find_socket_byname() may take long time if many names
		 * are already in use.
		 */
		cond_resched();

		if (ordernum == lastnum) {
			/* Give up if all names seems to be in use.
*/ err = -ENOSPC; unix_release_addr(addr); goto out; } goto retry; } __unix_set_addr_hash(net, sk, addr, new_hash); unix_table_double_unlock(net, old_hash, new_hash); err = 0; out: mutex_unlock(&u->bindlock); return err; } static int unix_bind_bsd(struct sock *sk, struct sockaddr_un *sunaddr, int addr_len) { umode_t mode = S_IFSOCK | (SOCK_INODE(sk->sk_socket)->i_mode & ~current_umask()); struct unix_sock *u = unix_sk(sk); unsigned int new_hash, old_hash; struct net *net = sock_net(sk); struct mnt_idmap *idmap; struct unix_address *addr; struct dentry *dentry; struct path parent; int err; addr_len = unix_mkname_bsd(sunaddr, addr_len); addr = unix_create_addr(sunaddr, addr_len); if (!addr) return -ENOMEM; /* * Get the parent directory, calculate the hash for last * component. */ dentry = start_creating_path(AT_FDCWD, addr->name->sun_path, &parent, 0); if (IS_ERR(dentry)) { err = PTR_ERR(dentry); goto out; } /* * All right, let's create it. */ idmap = mnt_idmap(parent.mnt); err = security_path_mknod(&parent, dentry, mode, 0); if (!err) err = vfs_mknod(idmap, d_inode(parent.dentry), dentry, mode, 0, NULL); if (err) goto out_path; err = mutex_lock_interruptible(&u->bindlock); if (err) goto out_unlink; if (u->addr) goto out_unlock; old_hash = sk->sk_hash; new_hash = unix_bsd_hash(d_backing_inode(dentry)); unix_table_double_lock(net, old_hash, new_hash); u->path.mnt = mntget(parent.mnt); u->path.dentry = dget(dentry); __unix_set_addr_hash(net, sk, addr, new_hash); unix_table_double_unlock(net, old_hash, new_hash); unix_insert_bsd_socket(sk); mutex_unlock(&u->bindlock); end_creating_path(&parent, dentry); return 0; out_unlock: mutex_unlock(&u->bindlock); err = -EINVAL; out_unlink: /* failed after successful mknod? unlink what we'd created... */ vfs_unlink(idmap, d_inode(parent.dentry), dentry, NULL); out_path: end_creating_path(&parent, dentry); out: unix_release_addr(addr); return err == -EEXIST ? 
-EADDRINUSE : err; } static int unix_bind_abstract(struct sock *sk, struct sockaddr_un *sunaddr, int addr_len) { struct unix_sock *u = unix_sk(sk); unsigned int new_hash, old_hash; struct net *net = sock_net(sk); struct unix_address *addr; int err; addr = unix_create_addr(sunaddr, addr_len); if (!addr) return -ENOMEM; err = mutex_lock_interruptible(&u->bindlock); if (err) goto out; if (u->addr) { err = -EINVAL; goto out_mutex; } old_hash = sk->sk_hash; new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type); unix_table_double_lock(net, old_hash, new_hash); if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash)) goto out_spin; __unix_set_addr_hash(net, sk, addr, new_hash); unix_table_double_unlock(net, old_hash, new_hash); mutex_unlock(&u->bindlock); return 0; out_spin: unix_table_double_unlock(net, old_hash, new_hash); err = -EADDRINUSE; out_mutex: mutex_unlock(&u->bindlock); out: unix_release_addr(addr); return err; } static int unix_bind(struct socket *sock, struct sockaddr_unsized *uaddr, int addr_len) { struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr; struct sock *sk = sock->sk; int err; if (addr_len == offsetof(struct sockaddr_un, sun_path) && sunaddr->sun_family == AF_UNIX) return unix_autobind(sk); err = unix_validate_addr(sunaddr, addr_len); if (err) return err; if (sunaddr->sun_path[0]) err = unix_bind_bsd(sk, sunaddr, addr_len); else err = unix_bind_abstract(sk, sunaddr, addr_len); return err; } static void unix_state_double_lock(struct sock *sk1, struct sock *sk2) { if (unlikely(sk1 == sk2) || !sk2) { unix_state_lock(sk1); return; } if (sk1 > sk2) swap(sk1, sk2); unix_state_lock(sk1); unix_state_lock(sk2); } static void unix_state_double_unlock(struct sock *sk1, struct sock *sk2) { if (unlikely(sk1 == sk2) || !sk2) { unix_state_unlock(sk1); return; } unix_state_unlock(sk1); unix_state_unlock(sk2); } static int unix_dgram_connect(struct socket *sock, struct sockaddr_unsized *addr, int alen, int flags) { struct sockaddr_un 
*sunaddr = (struct sockaddr_un *)addr; struct sock *sk = sock->sk; struct sock *other; int err; err = -EINVAL; if (alen < offsetofend(struct sockaddr, sa_family)) goto out; if (addr->sa_family != AF_UNSPEC) { err = unix_validate_addr(sunaddr, alen); if (err) goto out; err = BPF_CGROUP_RUN_PROG_UNIX_CONNECT_LOCK(sk, addr, &alen); if (err) goto out; if (unix_may_passcred(sk) && !READ_ONCE(unix_sk(sk)->addr)) { err = unix_autobind(sk); if (err) goto out; } restart: other = unix_find_other(sock_net(sk), sunaddr, alen, sock->type, 0); if (IS_ERR(other)) { err = PTR_ERR(other); goto out; } unix_state_double_lock(sk, other); /* Apparently VFS overslept socket death. Retry. */ if (sock_flag(other, SOCK_DEAD)) { unix_state_double_unlock(sk, other); sock_put(other); goto restart; } err = -EPERM; if (!unix_may_send(sk, other)) goto out_unlock; err = security_unix_may_send(sk->sk_socket, other->sk_socket); if (err) goto out_unlock; WRITE_ONCE(sk->sk_state, TCP_ESTABLISHED); WRITE_ONCE(other->sk_state, TCP_ESTABLISHED); } else { /* * 1003.1g breaking connected state with AF_UNSPEC */ other = NULL; unix_state_double_lock(sk, other); } /* * If it was connected, reconnect. 
*/ if (unix_peer(sk)) { struct sock *old_peer = unix_peer(sk); unix_peer(sk) = other; if (!other) WRITE_ONCE(sk->sk_state, TCP_CLOSE); unix_dgram_peer_wake_disconnect_wakeup(sk, old_peer); unix_state_double_unlock(sk, other); if (other != old_peer) { unix_dgram_disconnected(sk, old_peer); unix_state_lock(old_peer); if (!unix_peer(old_peer)) WRITE_ONCE(old_peer->sk_state, TCP_CLOSE); unix_state_unlock(old_peer); } sock_put(old_peer); } else { unix_peer(sk) = other; unix_state_double_unlock(sk, other); } return 0; out_unlock: unix_state_double_unlock(sk, other); sock_put(other); out: return err; } static long unix_wait_for_peer(struct sock *other, long timeo) { struct unix_sock *u = unix_sk(other); int sched; DEFINE_WAIT(wait); prepare_to_wait_exclusive(&u->peer_wait, &wait, TASK_INTERRUPTIBLE); sched = !sock_flag(other, SOCK_DEAD) && !(other->sk_shutdown & RCV_SHUTDOWN) && unix_recvq_full_lockless(other); unix_state_unlock(other); if (sched) timeo = schedule_timeout(timeo); finish_wait(&u->peer_wait, &wait); return timeo; } static int unix_stream_connect(struct socket *sock, struct sockaddr_unsized *uaddr, int addr_len, int flags) { struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr; struct sock *sk = sock->sk, *newsk = NULL, *other = NULL; struct unix_sock *u = unix_sk(sk), *newu, *otheru; struct unix_peercred peercred = {}; struct net *net = sock_net(sk); struct sk_buff *skb = NULL; unsigned char state; long timeo; int err; err = unix_validate_addr(sunaddr, addr_len); if (err) goto out; err = BPF_CGROUP_RUN_PROG_UNIX_CONNECT_LOCK(sk, uaddr, &addr_len); if (err) goto out; if (unix_may_passcred(sk) && !READ_ONCE(u->addr)) { err = unix_autobind(sk); if (err) goto out; } timeo = sock_sndtimeo(sk, flags & O_NONBLOCK); err = prepare_peercred(&peercred); if (err) goto out; /* create new sock for complete connection */ newsk = unix_create1(net, NULL, 0, sock->type); if (IS_ERR(newsk)) { err = PTR_ERR(newsk); goto out; } /* Allocate skb for sending to listening sock 
*/ skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL); if (!skb) { err = -ENOMEM; goto out_free_sk; } restart: /* Find listening sock. */ other = unix_find_other(net, sunaddr, addr_len, sk->sk_type, flags); if (IS_ERR(other)) { err = PTR_ERR(other); goto out_free_skb; } unix_state_lock(other); /* Apparently VFS overslept socket death. Retry. */ if (sock_flag(other, SOCK_DEAD)) { unix_state_unlock(other); sock_put(other); goto restart; } if (other->sk_state != TCP_LISTEN || other->sk_shutdown & RCV_SHUTDOWN) { err = -ECONNREFUSED; goto out_unlock; } if (unix_recvq_full_lockless(other)) { if (!timeo) { err = -EAGAIN; goto out_unlock; } timeo = unix_wait_for_peer(other, timeo); sock_put(other); err = sock_intr_errno(timeo); if (signal_pending(current)) goto out_free_skb; goto restart; } /* self connect and simultaneous connect are eliminated * by rejecting TCP_LISTEN socket to avoid deadlock. */ state = READ_ONCE(sk->sk_state); if (unlikely(state != TCP_CLOSE)) { err = state == TCP_ESTABLISHED ? -EISCONN : -EINVAL; goto out_unlock; } unix_state_lock(sk); if (unlikely(sk->sk_state != TCP_CLOSE)) { err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EINVAL; unix_state_unlock(sk); goto out_unlock; } err = security_unix_stream_connect(sk, other, newsk); if (err) { unix_state_unlock(sk); goto out_unlock; } /* The way is open! Fastly set all the necessary fields... */ sock_hold(sk); unix_peer(newsk) = sk; newsk->sk_state = TCP_ESTABLISHED; newsk->sk_type = sk->sk_type; newsk->sk_scm_recv_flags = other->sk_scm_recv_flags; init_peercred(newsk, &peercred); newu = unix_sk(newsk); newu->listener = other; RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq); otheru = unix_sk(other); /* copy address information from listening to new sock * * The contents of *(otheru->addr) and otheru->path * are seen fully set up here, since we have found * otheru in hash under its lock. 
Insertion into the * hash chain we'd found it in had been done in an * earlier critical area protected by the chain's lock, * the same one where we'd set *(otheru->addr) contents, * as well as otheru->path and otheru->addr itself. * * Using smp_store_release() here to set newu->addr * is enough to make those stores, as well as stores * to newu->path visible to anyone who gets newu->addr * by smp_load_acquire(). IOW, the same warranties * as for unix_sock instances bound in unix_bind() or * in unix_autobind(). */ if (otheru->path.dentry) { path_get(&otheru->path); newu->path = otheru->path; } refcount_inc(&otheru->addr->refcnt); smp_store_release(&newu->addr, otheru->addr); /* Set credentials */ copy_peercred(sk, other); sock->state = SS_CONNECTED; WRITE_ONCE(sk->sk_state, TCP_ESTABLISHED); sock_hold(newsk); smp_mb__after_atomic(); /* sock_hold() does an atomic_inc() */ unix_peer(sk) = newsk; unix_state_unlock(sk); /* take ten and send info to listening sock */ spin_lock(&other->sk_receive_queue.lock); __skb_queue_tail(&other->sk_receive_queue, skb); spin_unlock(&other->sk_receive_queue.lock); unix_state_unlock(other); READ_ONCE(other->sk_data_ready)(other); sock_put(other); return 0; out_unlock: unix_state_unlock(other); sock_put(other); out_free_skb: consume_skb(skb); out_free_sk: unix_release_sock(newsk, 0); out: drop_peercred(&peercred); return err; } static int unix_socketpair(struct socket *socka, struct socket *sockb) { struct unix_peercred ska_peercred = {}, skb_peercred = {}; struct sock *ska = socka->sk, *skb = sockb->sk; int err; err = prepare_peercred(&ska_peercred); if (err) return err; err = prepare_peercred(&skb_peercred); if (err) { drop_peercred(&ska_peercred); return err; } /* Join our sockets back to back */ sock_hold(ska); sock_hold(skb); unix_peer(ska) = skb; unix_peer(skb) = ska; init_peercred(ska, &ska_peercred); init_peercred(skb, &skb_peercred); ska->sk_state = TCP_ESTABLISHED; skb->sk_state = TCP_ESTABLISHED; socka->state = SS_CONNECTED; 
sockb->state = SS_CONNECTED; return 0; } static int unix_accept(struct socket *sock, struct socket *newsock, struct proto_accept_arg *arg) { struct sock *sk = sock->sk; struct sk_buff *skb; struct sock *tsk; arg->err = -EOPNOTSUPP; if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET) goto out; arg->err = -EINVAL; if (READ_ONCE(sk->sk_state) != TCP_LISTEN) goto out; /* If socket state is TCP_LISTEN it cannot change (for now...), * so that no locks are necessary. */ skb = skb_recv_datagram(sk, (arg->flags & O_NONBLOCK) ? MSG_DONTWAIT : 0, &arg->err); if (!skb) { /* This means receive shutdown. */ if (arg->err == 0) arg->err = -EINVAL; goto out; } tsk = skb->sk; skb_free_datagram(sk, skb); wake_up_interruptible(&unix_sk(sk)->peer_wait); if (tsk->sk_type == SOCK_STREAM) set_bit(SOCK_CUSTOM_SOCKOPT, &newsock->flags); /* attach accepted sock to socket */ unix_state_lock(tsk); unix_update_edges(unix_sk(tsk)); newsock->state = SS_CONNECTED; sock_graft(tsk, newsock); unix_state_unlock(tsk); return 0; out: return arg->err; } static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int peer) { struct sock *sk = sock->sk; struct unix_address *addr; DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, uaddr); int err = 0; if (peer) { sk = unix_peer_get(sk); err = -ENOTCONN; if (!sk) goto out; err = 0; } else { sock_hold(sk); } addr = smp_load_acquire(&unix_sk(sk)->addr); if (!addr) { sunaddr->sun_family = AF_UNIX; sunaddr->sun_path[0] = 0; err = offsetof(struct sockaddr_un, sun_path); } else { err = addr->len; memcpy(sunaddr, addr->name, addr->len); if (peer) BPF_CGROUP_RUN_SA_PROG(sk, uaddr, &err, CGROUP_UNIX_GETPEERNAME); else BPF_CGROUP_RUN_SA_PROG(sk, uaddr, &err, CGROUP_UNIX_GETSOCKNAME); } sock_put(sk); out: return err; } /* The "user->unix_inflight" variable is protected by the garbage * collection lock, and we just read it locklessly here. If you go * over the limit, there might be a tiny race in actually noticing * it across threads. Tough. 
*/
/*
 * Returns true when the current user's in-flight fd count exceeds
 * RLIMIT_NOFILE of task @p and the caller has neither CAP_SYS_RESOURCE
 * nor CAP_SYS_ADMIN.  Note that the count is taken from current_user()
 * while the limit is read from @p.
 */
static inline bool too_many_unix_fds(struct task_struct *p)
{
	struct user_struct *user = current_user();

	if (unlikely(READ_ONCE(user->unix_inflight) > task_rlimit(p, RLIMIT_NOFILE)))
		return !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN);
	return false;
}

/*
 * Move the fd list from @scm to @skb.
 *
 * Returns -ETOOMANYREFS if the in-flight limit is exceeded (nothing is
 * moved in that case), -ENOMEM if unix_prepare_fpl() fails, 0 otherwise.
 * On the -ENOMEM path the list has already been moved to the skb and
 * scm->fp cleared.
 */
static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb)
{
	if (too_many_unix_fds(current))
		return -ETOOMANYREFS;

	UNIXCB(skb).fp = scm->fp;
	scm->fp = NULL;

	if (unix_prepare_fpl(UNIXCB(skb).fp))
		return -ENOMEM;

	return 0;
}

/*
 * Move the fd list from @skb back to @scm and hand it to
 * unix_destroy_fpl().
 */
static void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb)
{
	scm->fp = UNIXCB(skb).fp;
	UNIXCB(skb).fp = NULL;

	unix_destroy_fpl(scm->fp);
}

/*
 * Give @scm a duplicate of @skb's fd list (the skb keeps its own) and
 * pass the duplicate to unix_peek_fpl().
 */
static void unix_peek_fds(struct scm_cookie *scm, struct sk_buff *skb)
{
	scm->fp = scm_fp_dup(UNIXCB(skb).fp);

	unix_peek_fpl(scm->fp);
}

/*
 * Release the SCM state carried by @skb: the pid and any fd list are
 * moved into a temporary scm_cookie, which scm_destroy() then disposes
 * of.  The swap leaves UNIXCB(skb).pid NULL, since the cookie starts
 * zeroed.
 */
static void unix_destruct_scm(struct sk_buff *skb)
{
	struct scm_cookie scm = {};

	swap(scm.pid, UNIXCB(skb).pid);

	if (UNIXCB(skb).fp)
		unix_detach_fds(&scm, skb);

	scm_destroy(&scm);
}

/*
 * skb destructor installed by unix_scm_to_skb(): drop the SCM state,
 * then run the generic sock_wfree().
 */
static void unix_wfree(struct sk_buff *skb)
{
	unix_destruct_scm(skb);
	sock_wfree(skb);
}

/*
 * Copy credentials from @scm into @skb's control block, taking a
 * reference on the pid, and attach the fd list when @send_fds is true
 * and @scm carries one.
 *
 * The destructor is set to unix_wfree() even if attaching fds failed.
 * Returns 0 or the error from unix_attach_fds().
 */
static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds)
{
	int err = 0;

	UNIXCB(skb).pid = get_pid(scm->pid);
	UNIXCB(skb).uid = scm->creds.uid;
	UNIXCB(skb).gid = scm->creds.gid;
	UNIXCB(skb).fp = NULL;
	unix_get_secdata(scm, skb);
	if (scm->fp && send_fds)
		err = unix_attach_fds(scm, skb);

	skb->destructor = unix_wfree;
	return err;
}

/*
 * Receive-side counterpart of unix_scm_to_skb(): load pid/uid/gid and
 * security data from @skb's control block into @scm.  The fd list is
 * not touched here.
 */
static void unix_skb_to_scm(struct sk_buff *skb, struct scm_cookie *scm)
{
	scm_set_cred(scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
	unix_set_secdata(scm, skb);
}

/**
 * unix_maybe_add_creds() - Adds current task uid/gid and struct pid to skb if needed.
 * @skb: skb to attach creds to.
 * @sk: Sender sock.
 * @other: Receiver sock.
 *
 * Some apps rely on write() giving SCM_CREDENTIALS
 * We include credentials if source or destination socket
 * asserted SOCK_PASSCRED.
 *
 * Context: May sleep.
 * Return: On success zero, on error a negative error code is returned.
*/
static int unix_maybe_add_creds(struct sk_buff *skb, const struct sock *sk, const struct sock *other)
{
	/* A pid is already attached: nothing to add. */
	if (UNIXCB(skb).pid)
		return 0;

	/* Also add creds when the receiver has no struct socket attached. */
	if (unix_may_passcred(sk) || unix_may_passcred(other) ||
	    !other->sk_socket) {
		struct pid *pid;
		int err;

		pid = task_tgid(current);
		err = pidfs_register_pid(pid);
		if (unlikely(err))
			return err;

		UNIXCB(skb).pid = get_pid(pid);
		current_uid_gid(&UNIXCB(skb).uid, &UNIXCB(skb).gid);
	}

	return 0;
}

/*
 * Returns true when @skb carries the same pid pointer, uid, gid and
 * security data as @scm.
 */
static bool unix_skb_scm_eq(struct sk_buff *skb, struct scm_cookie *scm)
{
	return UNIXCB(skb).pid == scm->pid &&
	       uid_eq(UNIXCB(skb).uid, scm->creds.uid) &&
	       gid_eq(UNIXCB(skb).gid, scm->creds.gid) &&
	       unix_secdata_eq(scm, skb);
}

/*
 * Account the fds carried by @skb to receiving socket @sk: add their
 * count to scm_stat.nr_fds and call unix_add_edges().  No-op when the
 * skb carries no fds.
 */
static void scm_stat_add(struct sock *sk, struct sk_buff *skb)
{
	struct scm_fp_list *fp = UNIXCB(skb).fp;
	struct unix_sock *u = unix_sk(sk);

	if (unlikely(fp && fp->count)) {
		atomic_add(fp->count, &u->scm_stat.nr_fds);
		unix_add_edges(fp, u);
	}
}

/*
 * Inverse of scm_stat_add(): subtract @skb's fd count from @sk's
 * scm_stat.nr_fds and call unix_del_edges().
 */
static void scm_stat_del(struct sock *sk, struct sk_buff *skb)
{
	struct scm_fp_list *fp = UNIXCB(skb).fp;
	struct unix_sock *u = unix_sk(sk);

	if (unlikely(fp && fp->count)) {
		atomic_sub(fp->count, &u->scm_stat.nr_fds);
		unix_del_edges(fp);
	}
}

/*
 * Strip all SCM state from @skb: undo the fd accounting on @sk, release
 * the pid and fds, and downgrade the destructor to plain sock_wfree()
 * so that unix_destruct_scm() does not run again when the skb is freed.
 */
static void unix_orphan_scm(struct sock *sk, struct sk_buff *skb)
{
	scm_stat_del(sk, skb);
	unix_destruct_scm(skb);
	skb->destructor = sock_wfree;
}

/*
 *	Send AF_UNIX data.
*/
/*
 * sendmsg() for datagram-style sockets (also reached from
 * unix_seqpacket_sendmsg()).
 *
 * The whole message is copied into one skb, the receiver is resolved
 * either from msg_name or from the connected peer, and the skb is
 * queued on the receiver under its state lock.  If the receiver's queue
 * is full the sender sleeps (when it has a timeout) or fails with
 * -EAGAIN.
 *
 * Returns @len on success, and also when sk_filter() rejects the
 * packet.  Errors set here: -EOPNOTSUPP (MSG_OOB), -EMSGSIZE,
 * -ENOTCONN, -EPERM, -EPIPE, -ECONNREFUSED, -ECONNRESET, -EAGAIN;
 * others are passed through from the helpers called.
 */
static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
{
	struct sock *sk = sock->sk, *other = NULL;
	struct unix_sock *u = unix_sk(sk);
	struct scm_cookie scm;
	struct sk_buff *skb;
	int data_len = 0;
	int sk_locked;
	long timeo;
	int err;

	err = scm_send(sock, msg, &scm, false);
	if (err < 0)
		return err;

	if (msg->msg_flags & MSG_OOB) {
		err = -EOPNOTSUPP;
		goto out;
	}

	if (msg->msg_namelen) {
		err = unix_validate_addr(msg->msg_name, msg->msg_namelen);
		if (err)
			goto out;

		err = BPF_CGROUP_RUN_PROG_UNIX_SENDMSG_LOCK(sk, msg->msg_name, &msg->msg_namelen, NULL);
		if (err)
			goto out;
	}

	if (unix_may_passcred(sk) && !READ_ONCE(u->addr)) {
		err = unix_autobind(sk);
		if (err)
			goto out;
	}

	if (len > READ_ONCE(sk->sk_sndbuf) - 32) {
		err = -EMSGSIZE;
		goto out;
	}

	/* Anything beyond SKB_MAX_ALLOC goes into page fragments. */
	if (len > SKB_MAX_ALLOC) {
		data_len = min_t(size_t, len - SKB_MAX_ALLOC, MAX_SKB_FRAGS * PAGE_SIZE);
		data_len = PAGE_ALIGN(data_len);

		BUILD_BUG_ON(SKB_MAX_ALLOC < PAGE_SIZE);
	}

	skb = sock_alloc_send_pskb(sk, len - data_len, data_len, msg->msg_flags & MSG_DONTWAIT, &err, PAGE_ALLOC_COSTLY_ORDER);
	if (!skb)
		goto out;

	err = unix_scm_to_skb(&scm, skb, true);
	if (err < 0)
		goto out_free;

	skb_put(skb, len - data_len);
	skb->data_len = data_len;
	skb->len = len;
	err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, len);
	if (err)
		goto out_free;

	timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);

	if (msg->msg_namelen) {
lookup:
		other = unix_find_other(sock_net(sk), msg->msg_name, msg->msg_namelen, sk->sk_type, 0);
		if (IS_ERR(other)) {
			err = PTR_ERR(other);
			goto out_free;
		}
	} else {
		other = unix_peer_get(sk);
		if (!other) {
			err = -ENOTCONN;
			goto out_free;
		}
	}

	if (sk_filter(other, skb) < 0) {
		/* Toss the packet but do not return any error to the sender */
		err = len;
		goto out_sock_put;
	}

	err = unix_maybe_add_creds(skb, sk, other);
	if (err)
		goto out_sock_put;

restart:
	sk_locked = 0;
	unix_state_lock(other);
restart_locked:

	if (!unix_may_send(sk, other)) {
		err = -EPERM;
		goto out_unlock;
	}

	if (unlikely(sock_flag(other, SOCK_DEAD))) {
		/* Check with 1003.1g - what should datagram error */

		unix_state_unlock(other);

		if (sk->sk_type == SOCK_SEQPACKET) {
			/* We are here only when racing with unix_release_sock()
			 * is clearing @other. Never change state to TCP_CLOSE
			 * unlike SOCK_DGRAM wants.
			 */
			err = -EPIPE;
			goto out_sock_put;
		}

		if (!sk_locked)
			unix_state_lock(sk);

		if (unix_peer(sk) == other) {
			unix_peer(sk) = NULL;
			unix_dgram_peer_wake_disconnect_wakeup(sk, other);

			WRITE_ONCE(sk->sk_state, TCP_CLOSE);
			unix_state_unlock(sk);

			unix_dgram_disconnected(sk, other);
			/*
			 * Two references are dropped on this path: this one
			 * and the one at out_sock_put.
			 */
			sock_put(other);
			err = -ECONNREFUSED;
			goto out_sock_put;
		}

		unix_state_unlock(sk);

		if (!msg->msg_namelen) {
			err = -ECONNRESET;
			goto out_sock_put;
		}

		/* Addressed send: the name may now resolve to a live socket. */
		sock_put(other);
		goto lookup;
	}

	if (other->sk_shutdown & RCV_SHUTDOWN) {
		err = -EPIPE;
		goto out_unlock;
	}

	if (UNIXCB(skb).fp && !other->sk_scm_rights) {
		err = -EPERM;
		goto out_unlock;
	}

	if (sk->sk_type != SOCK_SEQPACKET) {
		err = security_unix_may_send(sk->sk_socket, other->sk_socket);
		if (err)
			goto out_unlock;
	}

	/* other == sk && unix_peer(other) != sk if
	 * - unix_peer(sk) == NULL, destination address bound to sk
	 * - unix_peer(sk) == sk by time of get but disconnected before lock
	 */
	if (other != sk &&
	    unlikely(unix_peer(other) != sk &&
	    unix_recvq_full_lockless(other))) {
		if (timeo) {
			/* unix_wait_for_peer() releases other's state lock. */
			timeo = unix_wait_for_peer(other, timeo);

			err = sock_intr_errno(timeo);
			if (signal_pending(current))
				goto out_sock_put;

			goto restart;
		}

		/* Non-blocking: take both locks, then re-run the checks once. */
		if (!sk_locked) {
			unix_state_unlock(other);
			unix_state_double_lock(sk, other);
		}

		if (unix_peer(sk) != other ||
		    unix_dgram_peer_wake_me(sk, other)) {
			err = -EAGAIN;
			sk_locked = 1;
			goto out_unlock;
		}

		if (!sk_locked) {
			sk_locked = 1;
			goto restart_locked;
		}
	}

	if (unlikely(sk_locked))
		unix_state_unlock(sk);

	if (sock_flag(other, SOCK_RCVTSTAMP))
		__net_timestamp(skb);

	scm_stat_add(other, skb);
	skb_queue_tail(&other->sk_receive_queue, skb);
	unix_state_unlock(other);
	READ_ONCE(other->sk_data_ready)(other);
	sock_put(other);
	scm_destroy(&scm);
	return len;

out_unlock:
	if (sk_locked)
		unix_state_unlock(sk);
	unix_state_unlock(other);
out_sock_put:
	sock_put(other);
out_free:
	consume_skb(skb);
out:
	scm_destroy(&scm);
	return err;
}

/* We use paged skbs for stream sockets, and limit occupancy to 32768
 * bytes, and a minimum of a full page.
 */
#define UNIX_SKB_FRAGS_SZ (PAGE_SIZE << get_order(32768))

#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
/*
 * Send one out-of-band byte from @msg to @other.
 *
 * The byte travels in its own one-byte skb, which is recorded as the
 * receiver's oob_skb and appended to its receive queue; the receiver is
 * then sent SIGURG.  File descriptors are attached only if an earlier
 * skb of the same sendmsg() has not already carried them (@fds_sent).
 *
 * Returns 0 on success, -EPIPE if @other is dead or shut down for
 * receive, -EPERM if fds are attached but @other does not accept them,
 * or an error from allocation, credential setup or the data copy.
 */
static int queue_oob(struct sock *sk, struct msghdr *msg, struct sock *other, struct scm_cookie *scm, bool fds_sent)
{
	struct unix_sock *ousk = unix_sk(other);
	struct sk_buff *skb;
	int err;

	skb = sock_alloc_send_skb(sk, 1, msg->msg_flags & MSG_DONTWAIT, &err);
	if (!skb)
		return err;

	err = unix_scm_to_skb(scm, skb, !fds_sent);
	if (err < 0)
		goto out;

	err = unix_maybe_add_creds(skb, sk, other);
	if (err)
		goto out;

	skb_put(skb, 1);
	err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, 1);
	if (err)
		goto out;

	unix_state_lock(other);

	if (sock_flag(other, SOCK_DEAD) ||
	    (other->sk_shutdown & RCV_SHUTDOWN)) {
		err = -EPIPE;
		goto out_unlock;
	}

	if (UNIXCB(skb).fp && !other->sk_scm_rights) {
		err = -EPERM;
		goto out_unlock;
	}

	scm_stat_add(other, skb);

	spin_lock(&other->sk_receive_queue.lock);
	WRITE_ONCE(ousk->oob_skb, skb);
	WRITE_ONCE(ousk->inq_len, ousk->inq_len + 1);
	__skb_queue_tail(&other->sk_receive_queue, skb);
	spin_unlock(&other->sk_receive_queue.lock);

	sk_send_sigurg(other);
	unix_state_unlock(other);
	READ_ONCE(other->sk_data_ready)(other);

	return 0;
out_unlock:
	unix_state_unlock(other);
out:
	consume_skb(skb);
	return err;
}
#endif

/*
 * sendmsg() for SOCK_STREAM sockets.
 *
 * The payload is cut into skbs and queued on the peer one at a time;
 * fds from the control message ride on the first skb only.  With
 * MSG_OOB (and CONFIG_AF_UNIX_OOB) the last byte is held back and sent
 * through queue_oob() after the loop.
 *
 * Returns the number of bytes queued if any were, otherwise the error:
 * -EOPNOTSUPP, -EISCONN, -ENOTCONN, -EPERM, -EPIPE (raising SIGPIPE
 * unless MSG_NOSIGNAL is set or data was already sent), or an error
 * from the helpers called.
 */
static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
{
	struct sock *sk = sock->sk;
	struct sk_buff *skb = NULL;
	struct sock *other = NULL;
	struct unix_sock *otheru;
	struct scm_cookie scm;
	bool fds_sent = false;
	int err, sent = 0;

	err = scm_send(sock, msg, &scm, false);
	if (err < 0)
		return err;

	/*
	 * With OOB support compiled in, reserve the final byte for
	 * queue_oob(); a zero-length MSG_OOB send fails.  Without OOB
	 * support the goto below is unconditional.
	 */
	if (msg->msg_flags & MSG_OOB) {
		err = -EOPNOTSUPP;
#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
		if (len)
			len--;
		else
#endif
			goto out_err;
	}

	/* Stream sockets do not accept a destination address. */
	if (msg->msg_namelen) {
		err = READ_ONCE(sk->sk_state) == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP;
		goto out_err;
	}

	other = unix_peer(sk);
	if (!other) {
		err = -ENOTCONN;
		goto out_err;
	}

	otheru = unix_sk(other);

	if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN)
		goto out_pipe;

	while (sent < len) {
		int size = len - sent;
		int data_len;

		if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) {
			skb = sock_alloc_send_pskb(sk, 0, 0, msg->msg_flags & MSG_DONTWAIT, &err, 0);
		} else {
			/* Keep two messages in the pipe so it schedules better */
			size = min_t(int, size, (READ_ONCE(sk->sk_sndbuf) >> 1) - 64);

			/* allow fallback to order-0 allocations */
			size = min_t(int, size, SKB_MAX_HEAD(0) + UNIX_SKB_FRAGS_SZ);

			data_len = max_t(int, 0, size - SKB_MAX_HEAD(0));

			data_len = min_t(size_t, size, PAGE_ALIGN(data_len));

			skb = sock_alloc_send_pskb(sk, size - data_len, data_len, msg->msg_flags & MSG_DONTWAIT, &err, get_order(UNIX_SKB_FRAGS_SZ));
		}
		if (!skb)
			goto out_err;

		/* Only send the fds in the first buffer */
		err = unix_scm_to_skb(&scm, skb, !fds_sent);
		if (err < 0)
			goto out_free;

		fds_sent = true;

		err = unix_maybe_add_creds(skb, sk, other);
		if (err)
			goto out_free;

		if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) {
			skb->ip_summed = CHECKSUM_UNNECESSARY;
			err = skb_splice_from_iter(skb, &msg->msg_iter, size);
			if (err < 0)
				goto out_free;

			/* The splice may take fewer bytes than asked for. */
			size = err;
			refcount_add(size, &sk->sk_wmem_alloc);
		} else {
			skb_put(skb, size - data_len);
			skb->data_len = data_len;
			skb->len = size;
			err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, size);
			if (err)
				goto out_free;
		}

		unix_state_lock(other);

		if (sock_flag(other, SOCK_DEAD) ||
		    (other->sk_shutdown & RCV_SHUTDOWN))
			goto out_pipe_unlock;

		if (UNIXCB(skb).fp && !other->sk_scm_rights) {
			unix_state_unlock(other);
			err = -EPERM;
			goto out_free;
		}

		scm_stat_add(other, skb);

		spin_lock(&other->sk_receive_queue.lock);
		WRITE_ONCE(otheru->inq_len, otheru->inq_len + skb->len);
		__skb_queue_tail(&other->sk_receive_queue, skb);
		spin_unlock(&other->sk_receive_queue.lock);

		unix_state_unlock(other);
		READ_ONCE(other->sk_data_ready)(other);
		sent += size;
	}

#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
	if (msg->msg_flags & MSG_OOB) {
		err = queue_oob(sk, msg, other, &scm, fds_sent);
		if (err)
			goto out_err;
		sent++;
	}
#endif

	scm_destroy(&scm);

	return sent;

out_pipe_unlock:
	unix_state_unlock(other);
out_pipe:
	if (!sent && !(msg->msg_flags & MSG_NOSIGNAL))
		send_sig(SIGPIPE, current, 0);
	err = -EPIPE;
out_free:
	consume_skb(skb);
out_err:
	scm_destroy(&scm);
	/* GNU "?:" shorthand: report partial progress in preference to the error. */
	return sent ? : err;
}

/*
 * sendmsg() for SOCK_SEQPACKET: report a pending socket error, require
 * TCP_ESTABLISHED (-ENOTCONN otherwise), discard any destination
 * address and defer to the datagram send path.
 */
static int unix_seqpacket_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
{
	int err;
	struct sock *sk = sock->sk;

	err = sock_error(sk);
	if (err)
		return err;

	if (READ_ONCE(sk->sk_state) != TCP_ESTABLISHED)
		return -ENOTCONN;

	if (msg->msg_namelen)
		msg->msg_namelen = 0;

	return unix_dgram_sendmsg(sock, msg, len);
}

/*
 * recvmsg() for SOCK_SEQPACKET: require TCP_ESTABLISHED (-ENOTCONN
 * otherwise) and defer to the datagram receive path.
 */
static int unix_seqpacket_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, int flags)
{
	struct sock *sk = sock->sk;

	if (READ_ONCE(sk->sk_state) != TCP_ESTABLISHED)
		return -ENOTCONN;

	return unix_dgram_recvmsg(sock, msg, size, flags);
}

/*
 * Copy @sk's bound address, if it has one, into msg->msg_name and set
 * msg->msg_namelen.  @msg is left untouched for an unbound socket.
 */
static void unix_copy_addr(struct msghdr *msg, struct sock *sk)
{
	struct unix_address *addr = smp_load_acquire(&unix_sk(sk)->addr);

	if (addr) {
		msg->msg_namelen = addr->len;
		memcpy(msg->msg_name, addr->name, addr->len);
	}
}

/*
 * Receive one datagram from @sk into @msg.
 *
 * Waits (subject to the receive timeout) until a datagram is available,
 * copies up to @size bytes, sets MSG_TRUNC in msg->msg_flags if the
 * datagram was longer, and passes credentials and fds on through
 * scm_recv_unix().  With MSG_PEEK the fds are duplicated rather than
 * detached and the peek offset is advanced.
 *
 * Returns the number of bytes copied, or the full remaining datagram
 * length when the caller passed MSG_TRUNC; 0 for a SEQPACKET socket
 * that got -EAGAIN after receive shutdown; -EOPNOTSUPP for MSG_OOB; or
 * the error from the receive/copy helpers.
 */
int __unix_dgram_recvmsg(struct sock *sk, struct msghdr *msg, size_t size, int flags)
{
	struct scm_cookie scm;
	struct socket *sock = sk->sk_socket;
	struct unix_sock *u = unix_sk(sk);
	struct sk_buff *skb, *last;
	long timeo;
	int skip;
	int err;

	err = -EOPNOTSUPP;
	if (flags&MSG_OOB)
		goto out;

	timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);

	/* On success the loop is left with u->iolock still held. */
	do {
		mutex_lock(&u->iolock);

		skip = sk_peek_offset(sk, flags);
		skb = __skb_try_recv_datagram(sk, &sk->sk_receive_queue, flags, &skip, &err, &last);
		if (skb) {
			if (!(flags & MSG_PEEK))
				scm_stat_del(sk, skb);
			break;
		}

		mutex_unlock(&u->iolock);

		if (err != -EAGAIN)
			break;
	} while (timeo &&
		 !__skb_wait_for_more_packets(sk, &sk->sk_receive_queue, &err, &timeo, last));

	if (!skb) { /* implies iolock unlocked */
		/* Signal EOF on disconnected non-blocking SEQPACKET socket. */
		if (sk->sk_type == SOCK_SEQPACKET && err == -EAGAIN &&
		    (READ_ONCE(sk->sk_shutdown) & RCV_SHUTDOWN))
			err = 0;
		goto out;
	}

	/* A queue slot was freed: wake senders sleeping on peer_wait. */
	if (wq_has_sleeper(&u->peer_wait))
		wake_up_interruptible_sync_poll(&u->peer_wait, EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND);

	if (msg->msg_name) {
		unix_copy_addr(msg, skb->sk);

		BPF_CGROUP_RUN_PROG_UNIX_RECVMSG_LOCK(sk, msg->msg_name, &msg->msg_namelen);
	}

	if (size > skb->len - skip)
		size = skb->len - skip;
	else if (size < skb->len - skip)
		msg->msg_flags |= MSG_TRUNC;

	err = skb_copy_datagram_msg(skb, skip, msg, size);
	if (err)
		goto out_free;

	if (sock_flag(sk, SOCK_RCVTSTAMP))
		__sock_recv_timestamp(msg, sk, skb);

	memset(&scm, 0, sizeof(scm));

	unix_skb_to_scm(skb, &scm);

	if (!(flags & MSG_PEEK)) {
		if (UNIXCB(skb).fp)
			unix_detach_fds(&scm, skb);

		sk_peek_offset_bwd(sk, skb->len);
	} else {
		/* It is questionable: on PEEK we could:
		   - do not return fds - good, but too simple 8)
		   - return fds, and do not return them on read (old strategy,
		     apparently wrong)
		   - clone fds (I chose it for now, it is the most universal
		     solution)

		   POSIX 1003.1g does not actually define this clearly
		   at all. POSIX 1003.1g doesn't define a lot of things
		   clearly however!

		*/

		sk_peek_offset_fwd(sk, size);

		if (UNIXCB(skb).fp)
			unix_peek_fds(&scm, skb);
	}
	err = (flags & MSG_TRUNC) ? skb->len - skip : size;

	scm_recv_unix(sock, msg, &scm, flags);

out_free:
	skb_free_datagram(sk, skb);
	mutex_unlock(&u->iolock);
out:
	return err;
}

/*
 * recvmsg() entry point for datagram sockets.  With CONFIG_BPF_SYSCALL,
 * a socket whose sk_prot is no longer unix_dgram_proto is served by
 * that proto's own recvmsg(); otherwise __unix_dgram_recvmsg() is used.
 */
static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, int flags)
{
	struct sock *sk = sock->sk;

#ifdef CONFIG_BPF_SYSCALL
	const struct proto *prot = READ_ONCE(sk->sk_prot);

	if (prot != &unix_dgram_proto)
		return prot->recvmsg(sk, msg, size, flags);
#endif
	return __unix_dgram_recvmsg(sk, msg, size, flags);
}

/*
 * Dequeue one datagram without blocking, strip its SCM state and hand
 * it to @recv_actor.  Returns the actor's result, or the error from
 * skb_recv_datagram() when nothing could be dequeued.
 */
static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
{
	struct unix_sock *u = unix_sk(sk);
	struct sk_buff *skb;
	int err;

	mutex_lock(&u->iolock);
	skb = skb_recv_datagram(sk, MSG_DONTWAIT, &err);
	if (!skb) {
		mutex_unlock(&u->iolock);
		return err;
	}

	unix_orphan_scm(sk, skb);
	mutex_unlock(&u->iolock);

	return recv_actor(sk, skb);
}

/*
 *	Sleep until more data has arrived. But check for races..
 *
 *	@last is the queue tail the caller last saw; a different tail
 *	means something was queued and ends the wait.  The wait also ends
 *	on a socket error, receive shutdown, a pending signal, an expired
 *	timeout, or the socket becoming dead.  @freezable adds
 *	TASK_FREEZABLE to the sleep state.  Returns the remaining timeout.
 */
static long unix_stream_data_wait(struct sock *sk, long timeo, struct sk_buff *last, bool freezable)
{
	unsigned int state = TASK_INTERRUPTIBLE | freezable * TASK_FREEZABLE;
	struct sk_buff *tail;
	DEFINE_WAIT(wait);

	unix_state_lock(sk);

	for (;;) {
		prepare_to_wait(sk_sleep(sk), &wait, state);

		tail = skb_peek_tail(&sk->sk_receive_queue);
		if (tail != last ||
		    sk->sk_err ||
		    (sk->sk_shutdown & RCV_SHUTDOWN) ||
		    signal_pending(current) ||
		    !timeo)
			break;

		sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
		unix_state_unlock(sk);
		timeo = schedule_timeout(timeo);
		unix_state_lock(sk);

		/* Leaves SOCKWQ_ASYNC_WAITDATA set when the socket died. */
		if (sock_flag(sk, SOCK_DEAD))
			break;

		sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
	}

	finish_wait(sk_sleep(sk), &wait);
	unix_state_unlock(sk);
	return timeo;
}

/*
 * Parameters of one stream read, shared by the stream receive helpers.
 * recv_actor is the callback that moves skb data to its destination;
 * unix_stream_recv_urg() calls it as (skb, 0, chunk, state) and treats
 * a negative return as failure.
 */
struct unix_stream_read_state {
	int (*recv_actor)(struct sk_buff *, int, int, struct unix_stream_read_state *);
	struct socket *socket;
	struct msghdr *msg;
	struct pipe_inode_info *pipe;
	size_t size;
	int flags;
	unsigned int splice_flags;
};

#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
/*
 * Receive the pending out-of-band byte (recvmsg() with MSG_OOB).
 *
 * Returns 1 on success with MSG_OOB set in msg_flags, -EINVAL if the
 * socket has SOCK_URGINLINE set or no OOB skb is pending, -EFAULT if
 * the actor failed.  Unless MSG_PEEK is given, the OOB mark is cleared
 * and the byte is marked consumed; a fully consumed skb sitting
 * directly in front of the OOB skb is unlinked and freed as well.
 */
static int unix_stream_recv_urg(struct unix_stream_read_state *state)
{
	struct sk_buff *oob_skb, *read_skb = NULL;
	struct socket *sock = state->socket;
	struct sock *sk = sock->sk;
	struct unix_sock *u = unix_sk(sk);
	int chunk = 1;

	mutex_lock(&u->iolock);
	unix_state_lock(sk);
	spin_lock(&sk->sk_receive_queue.lock);

	if (sock_flag(sk, SOCK_URGINLINE) || !u->oob_skb) {
		spin_unlock(&sk->sk_receive_queue.lock);
		unix_state_unlock(sk);
		mutex_unlock(&u->iolock);
		return -EINVAL;
	}

	oob_skb = u->oob_skb;

	if (!(state->flags & MSG_PEEK)) {
		WRITE_ONCE(u->oob_skb, NULL);
		WRITE_ONCE(u->inq_len, u->inq_len - 1);

		/* prev pointing at the queue head means there is no predecessor. */
		if (oob_skb->prev != (struct sk_buff *)&sk->sk_receive_queue &&
		    !unix_skb_len(oob_skb->prev)) {
			read_skb = oob_skb->prev;
			__skb_unlink(read_skb, &sk->sk_receive_queue);
		}
	}

	spin_unlock(&sk->sk_receive_queue.lock);
	unix_state_unlock(sk);

	chunk = state->recv_actor(oob_skb, 0, chunk, state);

	if (!(state->flags & MSG_PEEK))
		UNIXCB(oob_skb).consumed += 1;

	mutex_unlock(&u->iolock);

	consume_skb(read_skb);

	if (chunk < 0)
		return -EFAULT;

	state->msg->msg_flags |= MSG_OOB;
	return 1;
}

/*
 * Decide which skb a normal (non-MSG_OOB) stream read should continue
 * with when @skb is fully consumed or is the OOB skb.
 *
 * Returns @skb unchanged on the fast path (it has unread data and is
 * not the OOB skb).  Otherwise returns the skb to read next, or NULL
 * when the read must stop here: data was already copied (@copied) and
 * the OOB boundary was reached, or the queue has nothing further.
 * Skbs unlinked along the way are freed after the queue lock is
 * dropped.
 */
static struct sk_buff *manage_oob(struct sk_buff *skb, struct sock *sk, int flags, int copied)
{
	struct sk_buff *read_skb = NULL, *unread_skb = NULL;
	struct unix_sock *u = unix_sk(sk);

	if (likely(unix_skb_len(skb) && skb != READ_ONCE(u->oob_skb)))
		return skb;

	spin_lock(&sk->sk_receive_queue.lock);

	/* First deal with an skb that has no unread bytes left. */
	if (!unix_skb_len(skb)) {
		if (copied && (!u->oob_skb || skb == u->oob_skb)) {
			skb = NULL;
		} else if (flags & MSG_PEEK) {
			skb = skb_peek_next(skb, &sk->sk_receive_queue);
		} else {
			read_skb = skb;
			skb = skb_peek_next(skb, &sk->sk_receive_queue);
			__skb_unlink(read_skb, &sk->sk_receive_queue);
		}

		if (!skb)
			goto unlock;
	}

	if (skb != u->oob_skb)
		goto unlock;

	/* From here on skb is the OOB skb. */
	if (copied) {
		skb = NULL;
	} else if (!(flags & MSG_PEEK)) {
		WRITE_ONCE(u->oob_skb, NULL);

		/* Without SOCK_URGINLINE an unread OOB byte is dropped. */
		if (!sock_flag(sk, SOCK_URGINLINE)) {
			__skb_unlink(skb, &sk->sk_receive_queue);
			unread_skb = skb;
			skb = skb_peek(&sk->sk_receive_queue);
		}
	} else if (!sock_flag(sk, SOCK_URGINLINE)) {
		skb = skb_peek_next(skb, &sk->sk_receive_queue);
	}

unlock:
	spin_unlock(&sk->sk_receive_queue.lock);

	consume_skb(read_skb);
	kfree_skb_reason(unread_skb, SKB_DROP_REASON_UNIX_SKIP_OOB);

	return skb;
}
#endif

/*
 * Dequeue one skb from a connected stream socket, strip its SCM state
 * and hand it to @recv_actor.
 *
 * Returns the actor's result; -ENOTCONN if the socket is not
 * TCP_ESTABLISHED; a pending socket error; or -EAGAIN if the queue is
 * empty or the dequeued skb was the OOB skb, which is dropped here.
 */
static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
{
	struct sk_buff_head *queue = &sk->sk_receive_queue;
	struct unix_sock *u = unix_sk(sk);
	struct sk_buff *skb;
	int err;

	if (unlikely(READ_ONCE(sk->sk_state) != TCP_ESTABLISHED))
		return -ENOTCONN;

	err = sock_error(sk);
	if (err)
		return err;

	mutex_lock(&u->iolock);
	spin_lock(&queue->lock);

	skb = __skb_dequeue(queue);
	if (!skb) {
		spin_unlock(&queue->lock);
		mutex_unlock(&u->iolock);
		return -EAGAIN;
	}

	WRITE_ONCE(u->inq_len, u->inq_len - skb->len);

#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
	if (skb == u->oob_skb) {
		WRITE_ONCE(u->oob_skb, NULL);
		spin_unlock(&queue->lock);
		mutex_unlock(&u->iolock);

		kfree_skb_reason(skb, SKB_DROP_REASON_UNIX_SKIP_OOB);
		return -EAGAIN;
	}
#endif

	spin_unlock(&queue->lock);
	unix_orphan_scm(sk, skb);
	mutex_unlock(&u->iolock);

	return recv_actor(sk, skb);
}

static int unix_stream_read_generic(struct unix_stream_read_state *state, bool freezable)
{
	int noblock = state->flags & MSG_DONTWAIT;
	struct socket *sock = state->socket;
	struct msghdr *msg = state->msg;
	struct sock *sk = sock->sk;
	size_t size = state->size;
	int flags = state->flags;
	bool check_creds = false;
	struct scm_cookie scm;
	struct unix_sock *u;
	int copied = 0;
	int err = 0;
	long timeo;
	int target;
	int skip;

	if (unlikely(READ_ONCE(sk->sk_state) != TCP_ESTABLISHED)) {
		err = -EINVAL;
		goto out;
	}

	if (unlikely(flags & MSG_OOB)) {
		err = -EOPNOTSUPP;
#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
		err = unix_stream_recv_urg(state);
#endif
		goto out;
	}

	target = sock_rcvlowat(sk, flags & MSG_WAITALL, size);
	timeo = sock_rcvtimeo(sk, noblock);

	memset(&scm, 0, sizeof(scm));

	u = unix_sk(sk);

redo:
	/* Lock the socket to prevent queue disordering
	 * while sleeps in memcpy_tomsg
	 */
	mutex_lock(&u->iolock);

	skip = max(sk_peek_offset(sk, flags), 0);

	do {
		struct sk_buff *skb, *last;
		int chunk;

		unix_state_lock(sk);
		if (sock_flag(sk, SOCK_DEAD)) {
			err
= -ECONNRESET; goto unlock; } last = skb = skb_peek(&sk->sk_receive_queue); again: #if IS_ENABLED(CONFIG_AF_UNIX_OOB) if (skb) { skb = manage_oob(skb, sk, flags, copied); if (!skb && copied) { unix_state_unlock(sk); break; } } #endif if (skb == NULL) { if (copied >= target) goto unlock; /* * POSIX 1003.1g mandates this order. */ err = sock_error(sk); if (err) goto unlock; if (sk->sk_shutdown & RCV_SHUTDOWN) goto unlock; unix_state_unlock(sk); if (!timeo) { err = -EAGAIN; break; } mutex_unlock(&u->iolock); timeo = unix_stream_data_wait(sk, timeo, last, freezable); if (signal_pending(current)) { err = sock_intr_errno(timeo); scm_destroy(&scm); goto out; } goto redo; unlock: unix_state_unlock(sk); break; } while (skip >= unix_skb_len(skb)) { skip -= unix_skb_len(skb); last = skb; skb = skb_peek_next(skb, &sk->sk_receive_queue); if (!skb) goto again; } unix_state_unlock(sk); if (check_creds) { /* Never glue messages from different writers */ if (!unix_skb_scm_eq(skb, &scm)) break; } else if (unix_may_passcred(sk)) { /* Copy credentials */ unix_skb_to_scm(skb, &scm); check_creds = true; } /* Copy address just once */ if (msg && msg->msg_name) { DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, msg->msg_name); unix_copy_addr(msg, skb->sk); BPF_CGROUP_RUN_PROG_UNIX_RECVMSG_LOCK(sk, msg->msg_name, &msg->msg_namelen); sunaddr = NULL; } chunk = min_t(unsigned int, unix_skb_len(skb) - skip, size); chunk = state->recv_actor(skb, skip, chunk, state); if (chunk < 0) { if (copied == 0) copied = -EFAULT; break; } copied += chunk; size -= chunk; /* Mark read part of skb as used */ if (!(flags & MSG_PEEK)) { UNIXCB(skb).consumed += chunk; sk_peek_offset_bwd(sk, chunk); if (UNIXCB(skb).fp) { scm_stat_del(sk, skb); unix_detach_fds(&scm, skb); } if (unix_skb_len(skb)) break; spin_lock(&sk->sk_receive_queue.lock); WRITE_ONCE(u->inq_len, u->inq_len - skb->len); __skb_unlink(skb, &sk->sk_receive_queue); spin_unlock(&sk->sk_receive_queue.lock); consume_skb(skb); if (scm.fp) break; } else { 
/* It is questionable, see note in unix_dgram_recvmsg. */ if (UNIXCB(skb).fp) unix_peek_fds(&scm, skb); sk_peek_offset_fwd(sk, chunk); if (UNIXCB(skb).fp) break; skip = 0; last = skb; unix_state_lock(sk); skb = skb_peek_next(skb, &sk->sk_receive_queue); if (skb) goto again; unix_state_unlock(sk); break; } } while (size); mutex_unlock(&u->iolock); if (msg) { bool do_cmsg = READ_ONCE(u->recvmsg_inq); scm_recv_unix(sock, msg, &scm, flags); if ((do_cmsg | msg->msg_get_inq) && (copied ?: err) >= 0) { msg->msg_inq = READ_ONCE(u->inq_len); if (do_cmsg) put_cmsg(msg, SOL_SOCKET, SCM_INQ, sizeof(msg->msg_inq), &msg->msg_inq); } } else { scm_destroy(&scm); } out: return copied ? : err; } static int unix_stream_read_actor(struct sk_buff *skb, int skip, int chunk, struct unix_stream_read_state *state) { int ret; ret = skb_copy_datagram_msg(skb, UNIXCB(skb).consumed + skip, state->msg, chunk); return ret ?: chunk; } int __unix_stream_recvmsg(struct sock *sk, struct msghdr *msg, size_t size, int flags) { struct unix_stream_read_state state = { .recv_actor = unix_stream_read_actor, .socket = sk->sk_socket, .msg = msg, .size = size, .flags = flags }; return unix_stream_read_generic(&state, true); } static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, int flags) { struct unix_stream_read_state state = { .recv_actor = unix_stream_read_actor, .socket = sock, .msg = msg, .size = size, .flags = flags }; #ifdef CONFIG_BPF_SYSCALL struct sock *sk = sock->sk; const struct proto *prot = READ_ONCE(sk->sk_prot); if (prot != &unix_stream_proto) return prot->recvmsg(sk, msg, size, flags); #endif return unix_stream_read_generic(&state, true); } static int unix_stream_splice_actor(struct sk_buff *skb, int skip, int chunk, struct unix_stream_read_state *state) { return skb_splice_bits(skb, state->socket->sk, UNIXCB(skb).consumed + skip, state->pipe, chunk, state->splice_flags); } static ssize_t unix_stream_splice_read(struct socket *sock, loff_t *ppos, struct 
pipe_inode_info *pipe, size_t size, unsigned int flags) { struct unix_stream_read_state state = { .recv_actor = unix_stream_splice_actor, .socket = sock, .pipe = pipe, .size = size, .splice_flags = flags, }; if (unlikely(*ppos)) return -ESPIPE; if (sock->file->f_flags & O_NONBLOCK || flags & SPLICE_F_NONBLOCK) state.flags = MSG_DONTWAIT; return unix_stream_read_generic(&state, false); } static int unix_shutdown(struct socket *sock, int mode) { struct sock *sk = sock->sk; struct sock *other; if (mode < SHUT_RD || mode > SHUT_RDWR) return -EINVAL; /* This maps: * SHUT_RD (0) -> RCV_SHUTDOWN (1) * SHUT_WR (1) -> SEND_SHUTDOWN (2) * SHUT_RDWR (2) -> SHUTDOWN_MASK (3) */ ++mode; unix_state_lock(sk); WRITE_ONCE(sk->sk_shutdown, sk->sk_shutdown | mode); other = unix_peer(sk); if (other) sock_hold(other); unix_state_unlock(sk); sk->sk_state_change(sk); if (other && (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) { int peer_mode = 0; const struct proto *prot = READ_ONCE(other->sk_prot); if (prot->unhash) prot->unhash(other); if (mode&RCV_SHUTDOWN) peer_mode |= SEND_SHUTDOWN; if (mode&SEND_SHUTDOWN) peer_mode |= RCV_SHUTDOWN; unix_state_lock(other); WRITE_ONCE(other->sk_shutdown, other->sk_shutdown | peer_mode); unix_state_unlock(other); other->sk_state_change(other); if (peer_mode == SHUTDOWN_MASK) sk_wake_async(other, SOCK_WAKE_WAITD, POLL_HUP); else if (peer_mode & RCV_SHUTDOWN) sk_wake_async(other, SOCK_WAKE_WAITD, POLL_IN); } if (other) sock_put(other); return 0; } long unix_inq_len(struct sock *sk) { struct sk_buff *skb; long amount = 0; if (READ_ONCE(sk->sk_state) == TCP_LISTEN) return -EINVAL; if (sk->sk_type == SOCK_STREAM) return READ_ONCE(unix_sk(sk)->inq_len); spin_lock(&sk->sk_receive_queue.lock); if (sk->sk_type == SOCK_SEQPACKET) { skb_queue_walk(&sk->sk_receive_queue, skb) amount += unix_skb_len(skb); } else { skb = skb_peek(&sk->sk_receive_queue); if (skb) amount = skb->len; } spin_unlock(&sk->sk_receive_queue.lock); return amount; } 
EXPORT_SYMBOL_GPL(unix_inq_len); long unix_outq_len(struct sock *sk) { return sk_wmem_alloc_get(sk); } EXPORT_SYMBOL_GPL(unix_outq_len); static int unix_open_file(struct sock *sk) { if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) return -EPERM; if (!smp_load_acquire(&unix_sk(sk)->addr)) return -ENOENT; if (!unix_sk(sk)->path.dentry) return -ENOENT; return FD_ADD(O_CLOEXEC, dentry_open(&unix_sk(sk)->path, O_PATH, current_cred())); } static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { struct sock *sk = sock->sk; long amount = 0; int err; switch (cmd) { case SIOCOUTQ: amount = unix_outq_len(sk); err = put_user(amount, (int __user *)arg); break; case SIOCINQ: amount = unix_inq_len(sk); if (amount < 0) err = amount; else err = put_user(amount, (int __user *)arg); break; case SIOCUNIXFILE: err = unix_open_file(sk); break; #if IS_ENABLED(CONFIG_AF_UNIX_OOB) case SIOCATMARK: { struct unix_sock *u = unix_sk(sk); struct sk_buff *skb; int answ = 0; if (sk->sk_type != SOCK_STREAM) return -EOPNOTSUPP; mutex_lock(&u->iolock); skb = skb_peek(&sk->sk_receive_queue); if (skb) { struct sk_buff *oob_skb = READ_ONCE(u->oob_skb); struct sk_buff *next_skb; next_skb = skb_peek_next(skb, &sk->sk_receive_queue); if (skb == oob_skb || (!unix_skb_len(skb) && (!oob_skb || next_skb == oob_skb))) answ = 1; } mutex_unlock(&u->iolock); err = put_user(answ, (int __user *)arg); } break; #endif default: err = -ENOIOCTLCMD; break; } return err; } #ifdef CONFIG_COMPAT static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { return unix_ioctl(sock, cmd, (unsigned long)compat_ptr(arg)); } #endif static __poll_t unix_poll(struct file *file, struct socket *sock, poll_table *wait) { struct sock *sk = sock->sk; unsigned char state; __poll_t mask; u8 shutdown; sock_poll_wait(file, sock, wait); mask = 0; shutdown = READ_ONCE(sk->sk_shutdown); state = READ_ONCE(sk->sk_state); /* exceptional events? 
*/ if (READ_ONCE(sk->sk_err)) mask |= EPOLLERR; if (shutdown == SHUTDOWN_MASK) mask |= EPOLLHUP; if (shutdown & RCV_SHUTDOWN) mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM; /* readable? */ if (!skb_queue_empty_lockless(&sk->sk_receive_queue)) mask |= EPOLLIN | EPOLLRDNORM; if (sk_is_readable(sk)) mask |= EPOLLIN | EPOLLRDNORM; #if IS_ENABLED(CONFIG_AF_UNIX_OOB) if (READ_ONCE(unix_sk(sk)->oob_skb)) mask |= EPOLLPRI; #endif /* Connection-based need to check for termination and startup */ if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) && state == TCP_CLOSE) mask |= EPOLLHUP; /* * we set writable also when the other side has shut down the * connection. This prevents stuck sockets. */ if (unix_writable(sk, state)) mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND; return mask; } static __poll_t unix_dgram_poll(struct file *file, struct socket *sock, poll_table *wait) { struct sock *sk = sock->sk, *other; unsigned int writable; unsigned char state; __poll_t mask; u8 shutdown; sock_poll_wait(file, sock, wait); mask = 0; shutdown = READ_ONCE(sk->sk_shutdown); state = READ_ONCE(sk->sk_state); /* exceptional events? */ if (READ_ONCE(sk->sk_err) || !skb_queue_empty_lockless(&sk->sk_error_queue)) mask |= EPOLLERR | (sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? EPOLLPRI : 0); if (shutdown & RCV_SHUTDOWN) mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM; if (shutdown == SHUTDOWN_MASK) mask |= EPOLLHUP; /* readable? */ if (!skb_queue_empty_lockless(&sk->sk_receive_queue)) mask |= EPOLLIN | EPOLLRDNORM; if (sk_is_readable(sk)) mask |= EPOLLIN | EPOLLRDNORM; /* Connection-based need to check for termination and startup */ if (sk->sk_type == SOCK_SEQPACKET && state == TCP_CLOSE) mask |= EPOLLHUP; /* No write status requested, avoid expensive OUT tests. 
*/ if (!(poll_requested_events(wait) & (EPOLLWRBAND|EPOLLWRNORM|EPOLLOUT))) return mask; writable = unix_writable(sk, state); if (writable) { unix_state_lock(sk); other = unix_peer(sk); if (other && unix_peer(other) != sk && unix_recvq_full_lockless(other) && unix_dgram_peer_wake_me(sk, other)) writable = 0; unix_state_unlock(sk); } if (writable) mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND; else sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); return mask; } #ifdef CONFIG_PROC_FS #define BUCKET_SPACE (BITS_PER_LONG - (UNIX_HASH_BITS + 1) - 1) #define get_bucket(x) ((x) >> BUCKET_SPACE) #define get_offset(x) ((x) & ((1UL << BUCKET_SPACE) - 1)) #define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o)) static struct sock *unix_from_bucket(struct seq_file *seq, loff_t *pos) { unsigned long offset = get_offset(*pos); unsigned long bucket = get_bucket(*pos); unsigned long count = 0; struct sock *sk; for (sk = sk_head(&seq_file_net(seq)->unx.table.buckets[bucket]); sk; sk = sk_next(sk)) { if (++count == offset) break; } return sk; } static struct sock *unix_get_first(struct seq_file *seq, loff_t *pos) { unsigned long bucket = get_bucket(*pos); struct net *net = seq_file_net(seq); struct sock *sk; while (bucket < UNIX_HASH_SIZE) { spin_lock(&net->unx.table.locks[bucket]); sk = unix_from_bucket(seq, pos); if (sk) return sk; spin_unlock(&net->unx.table.locks[bucket]); *pos = set_bucket_offset(++bucket, 1); } return NULL; } static struct sock *unix_get_next(struct seq_file *seq, struct sock *sk, loff_t *pos) { unsigned long bucket = get_bucket(*pos); sk = sk_next(sk); if (sk) return sk; spin_unlock(&seq_file_net(seq)->unx.table.locks[bucket]); *pos = set_bucket_offset(++bucket, 1); return unix_get_first(seq, pos); } static void *unix_seq_start(struct seq_file *seq, loff_t *pos) { if (!*pos) return SEQ_START_TOKEN; return unix_get_first(seq, pos); } static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos) { ++*pos; if (v == SEQ_START_TOKEN) return unix_get_first(seq, 
pos); return unix_get_next(seq, v, pos); } static void unix_seq_stop(struct seq_file *seq, void *v) { struct sock *sk = v; if (sk) spin_unlock(&seq_file_net(seq)->unx.table.locks[sk->sk_hash]); } static int unix_seq_show(struct seq_file *seq, void *v) { if (v == SEQ_START_TOKEN) seq_puts(seq, "Num RefCount Protocol Flags Type St " "Inode Path\n"); else { struct sock *s = v; struct unix_sock *u = unix_sk(s); unix_state_lock(s); seq_printf(seq, "%pK: %08X %08X %08X %04X %02X %5llu", s, refcount_read(&s->sk_refcnt), 0, s->sk_state == TCP_LISTEN ? __SO_ACCEPTCON : 0, s->sk_type, s->sk_socket ? (s->sk_state == TCP_ESTABLISHED ? SS_CONNECTED : SS_UNCONNECTED) : (s->sk_state == TCP_ESTABLISHED ? SS_CONNECTING : SS_DISCONNECTING), sock_i_ino(s)); if (u->addr) { // under a hash table lock here int i, len; seq_putc(seq, ' '); i = 0; len = u->addr->len - offsetof(struct sockaddr_un, sun_path); if (u->addr->name->sun_path[0]) { len--; } else { seq_putc(seq, '@'); i++; } for ( ; i < len; i++) seq_putc(seq, u->addr->name->sun_path[i] ?: '@'); } unix_state_unlock(s); seq_putc(seq, '\n'); } return 0; } static const struct seq_operations unix_seq_ops = { .start = unix_seq_start, .next = unix_seq_next, .stop = unix_seq_stop, .show = unix_seq_show, }; #ifdef CONFIG_BPF_SYSCALL struct bpf_unix_iter_state { struct seq_net_private p; unsigned int cur_sk; unsigned int end_sk; unsigned int max_sk; struct sock **batch; bool st_bucket_done; }; struct bpf_iter__unix { __bpf_md_ptr(struct bpf_iter_meta *, meta); __bpf_md_ptr(struct unix_sock *, unix_sk); uid_t uid __aligned(8); }; static int unix_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta, struct unix_sock *unix_sk, uid_t uid) { struct bpf_iter__unix ctx; meta->seq_num--; /* skip SEQ_START_TOKEN */ ctx.meta = meta; ctx.unix_sk = unix_sk; ctx.uid = uid; return bpf_iter_run_prog(prog, &ctx); } static int bpf_iter_unix_hold_batch(struct seq_file *seq, struct sock *start_sk) { struct bpf_unix_iter_state *iter = seq->private; 
unsigned int expected = 1; struct sock *sk; sock_hold(start_sk); iter->batch[iter->end_sk++] = start_sk; for (sk = sk_next(start_sk); sk; sk = sk_next(sk)) { if (iter->end_sk < iter->max_sk) { sock_hold(sk); iter->batch[iter->end_sk++] = sk; } expected++; } spin_unlock(&seq_file_net(seq)->unx.table.locks[start_sk->sk_hash]); return expected; } static void bpf_iter_unix_put_batch(struct bpf_unix_iter_state *iter) { while (iter->cur_sk < iter->end_sk) sock_put(iter->batch[iter->cur_sk++]); } static int bpf_iter_unix_realloc_batch(struct bpf_unix_iter_state *iter, unsigned int new_batch_sz) { struct sock **new_batch; new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz, GFP_USER | __GFP_NOWARN); if (!new_batch) return -ENOMEM; bpf_iter_unix_put_batch(iter); kvfree(iter->batch); iter->batch = new_batch; iter->max_sk = new_batch_sz; return 0; } static struct sock *bpf_iter_unix_batch(struct seq_file *seq, loff_t *pos) { struct bpf_unix_iter_state *iter = seq->private; unsigned int expected; bool resized = false; struct sock *sk; if (iter->st_bucket_done) *pos = set_bucket_offset(get_bucket(*pos) + 1, 1); again: /* Get a new batch */ iter->cur_sk = 0; iter->end_sk = 0; sk = unix_get_first(seq, pos); if (!sk) return NULL; /* Done */ expected = bpf_iter_unix_hold_batch(seq, sk); if (iter->end_sk == expected) { iter->st_bucket_done = true; return sk; } if (!resized && !bpf_iter_unix_realloc_batch(iter, expected * 3 / 2)) { resized = true; goto again; } return sk; } static void *bpf_iter_unix_seq_start(struct seq_file *seq, loff_t *pos) { if (!*pos) return SEQ_START_TOKEN; /* bpf iter does not support lseek, so it always * continue from where it was stop()-ped. */ return bpf_iter_unix_batch(seq, pos); } static void *bpf_iter_unix_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct bpf_unix_iter_state *iter = seq->private; struct sock *sk; /* Whenever seq_next() is called, the iter->cur_sk is * done with seq_show(), so advance to the next sk in * the batch. 
*/ if (iter->cur_sk < iter->end_sk) sock_put(iter->batch[iter->cur_sk++]); ++*pos; if (iter->cur_sk < iter->end_sk) sk = iter->batch[iter->cur_sk]; else sk = bpf_iter_unix_batch(seq, pos); return sk; } static int bpf_iter_unix_seq_show(struct seq_file *seq, void *v) { struct bpf_iter_meta meta; struct bpf_prog *prog; struct sock *sk = v; uid_t uid; int ret; if (v == SEQ_START_TOKEN) return 0; lock_sock(sk); unix_state_lock(sk); if (unlikely(sock_flag(sk, SOCK_DEAD))) { ret = SEQ_SKIP; goto unlock; } uid = from_kuid_munged(seq_user_ns(seq), sk_uid(sk)); meta.seq = seq; prog = bpf_iter_get_info(&meta, false); ret = unix_prog_seq_show(prog, &meta, v, uid); unlock: unix_state_unlock(sk); release_sock(sk); return ret; } static void bpf_iter_unix_seq_stop(struct seq_file *seq, void *v) { struct bpf_unix_iter_state *iter = seq->private; struct bpf_iter_meta meta; struct bpf_prog *prog; if (!v) { meta.seq = seq; prog = bpf_iter_get_info(&meta, true); if (prog) (void)unix_prog_seq_show(prog, &meta, v, 0); } if (iter->cur_sk < iter->end_sk) bpf_iter_unix_put_batch(iter); } static const struct seq_operations bpf_iter_unix_seq_ops = { .start = bpf_iter_unix_seq_start, .next = bpf_iter_unix_seq_next, .stop = bpf_iter_unix_seq_stop, .show = bpf_iter_unix_seq_show, }; #endif #endif static const struct net_proto_family unix_family_ops = { .family = PF_UNIX, .create = unix_create, .owner = THIS_MODULE, }; static int __net_init unix_net_init(struct net *net) { int i; net->unx.sysctl_max_dgram_qlen = 10; if (unix_sysctl_register(net)) goto out; #ifdef CONFIG_PROC_FS if (!proc_create_net("unix", 0, net->proc_net, &unix_seq_ops, sizeof(struct seq_net_private))) goto err_sysctl; #endif net->unx.table.locks = kvmalloc_objs(spinlock_t, UNIX_HASH_SIZE); if (!net->unx.table.locks) goto err_proc; net->unx.table.buckets = kvmalloc_objs(struct hlist_head, UNIX_HASH_SIZE); if (!net->unx.table.buckets) goto free_locks; for (i = 0; i < UNIX_HASH_SIZE; i++) { 
spin_lock_init(&net->unx.table.locks[i]); lock_set_cmp_fn(&net->unx.table.locks[i], unix_table_lock_cmp_fn, NULL); INIT_HLIST_HEAD(&net->unx.table.buckets[i]); } return 0; free_locks: kvfree(net->unx.table.locks); err_proc: #ifdef CONFIG_PROC_FS remove_proc_entry("unix", net->proc_net); err_sysctl: #endif unix_sysctl_unregister(net); out: return -ENOMEM; } static void __net_exit unix_net_exit(struct net *net) { kvfree(net->unx.table.buckets); kvfree(net->unx.table.locks); unix_sysctl_unregister(net); remove_proc_entry("unix", net->proc_net); } static struct pernet_operations unix_net_ops = { .init = unix_net_init, .exit = unix_net_exit, }; #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) DEFINE_BPF_ITER_FUNC(unix, struct bpf_iter_meta *meta, struct unix_sock *unix_sk, uid_t uid) #define INIT_BATCH_SZ 16 static int bpf_iter_init_unix(void *priv_data, struct bpf_iter_aux_info *aux) { struct bpf_unix_iter_state *iter = priv_data; int err; err = bpf_iter_init_seq_net(priv_data, aux); if (err) return err; err = bpf_iter_unix_realloc_batch(iter, INIT_BATCH_SZ); if (err) { bpf_iter_fini_seq_net(priv_data); return err; } return 0; } static void bpf_iter_fini_unix(void *priv_data) { struct bpf_unix_iter_state *iter = priv_data; bpf_iter_fini_seq_net(priv_data); kvfree(iter->batch); } static const struct bpf_iter_seq_info unix_seq_info = { .seq_ops = &bpf_iter_unix_seq_ops, .init_seq_private = bpf_iter_init_unix, .fini_seq_private = bpf_iter_fini_unix, .seq_priv_size = sizeof(struct bpf_unix_iter_state), }; static const struct bpf_func_proto * bpf_iter_unix_get_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_setsockopt: return &bpf_sk_setsockopt_proto; case BPF_FUNC_getsockopt: return &bpf_sk_getsockopt_proto; default: return NULL; } } static struct bpf_iter_reg unix_reg_info = { .target = "unix", .ctx_arg_info_size = 1, .ctx_arg_info = { { offsetof(struct bpf_iter__unix, unix_sk), PTR_TO_BTF_ID_OR_NULL }, }, 
.get_func_proto = bpf_iter_unix_get_func_proto, .seq_info = &unix_seq_info, }; static void __init bpf_iter_register(void) { unix_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_UNIX]; if (bpf_iter_reg_target(&unix_reg_info)) pr_warn("Warning: could not register bpf iterator unix\n"); } #endif static int __init af_unix_init(void) { int i, rc = -1; BUILD_BUG_ON(sizeof(struct unix_skb_parms) > sizeof_field(struct sk_buff, cb)); for (i = 0; i < UNIX_HASH_SIZE / 2; i++) { spin_lock_init(&bsd_socket_locks[i]); INIT_HLIST_HEAD(&bsd_socket_buckets[i]); } rc = proto_register(&unix_dgram_proto, 1); if (rc != 0) { pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__); goto out; } rc = proto_register(&unix_stream_proto, 1); if (rc != 0) { pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__); proto_unregister(&unix_dgram_proto); goto out; } sock_register(&unix_family_ops); register_pernet_subsys(&unix_net_ops); unix_bpf_build_proto(); #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) bpf_iter_register(); #endif out: return rc; } /* Later than subsys_initcall() because we depend on stuff initialised there */ fs_initcall(af_unix_init);
1 1 1 1 1 1 1 1 1 3 3 3 3 3 5 5 5 5 5 5 5 5 5 5 3 3 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 
514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 
1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 
1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 
1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 // SPDX-License-Identifier: GPL-2.0 #define pr_fmt(fmt) "irq: " fmt #include <linux/acpi.h> #include <linux/debugfs.h> #include <linux/hardirq.h> #include <linux/interrupt.h> #include <linux/irq.h> #include <linux/irqdesc.h> #include <linux/irqdomain.h> 
#include <linux/module.h> #include <linux/mutex.h> #include <linux/of.h> #include <linux/of_address.h> #include <linux/of_irq.h> #include <linux/topology.h> #include <linux/seq_file.h> #include <linux/slab.h> #include <linux/smp.h> #include <linux/fs.h> static LIST_HEAD(irq_domain_list); static DEFINE_MUTEX(irq_domain_mutex); static struct irq_domain *irq_default_domain; static int irq_domain_alloc_irqs_locked(struct irq_domain *domain, int irq_base, unsigned int nr_irqs, int node, void *arg, bool realloc, const struct irq_affinity_desc *affinity); static void irq_domain_check_hierarchy(struct irq_domain *domain); static void irq_domain_free_one_irq(struct irq_domain *domain, unsigned int virq); struct irqchip_fwid { struct fwnode_handle fwnode; struct fwnode_handle *parent; unsigned int type; char *name; phys_addr_t *pa; }; #ifdef CONFIG_GENERIC_IRQ_DEBUGFS static void debugfs_add_domain_dir(struct irq_domain *d); static void debugfs_remove_domain_dir(struct irq_domain *d); #else static inline void debugfs_add_domain_dir(struct irq_domain *d) { } static inline void debugfs_remove_domain_dir(struct irq_domain *d) { } #endif static const char *irqchip_fwnode_get_name(const struct fwnode_handle *fwnode) { struct irqchip_fwid *fwid = container_of(fwnode, struct irqchip_fwid, fwnode); return fwid->name; } static struct fwnode_handle *irqchip_fwnode_get_parent(const struct fwnode_handle *fwnode) { struct irqchip_fwid *fwid = container_of(fwnode, struct irqchip_fwid, fwnode); return fwid->parent; } const struct fwnode_operations irqchip_fwnode_ops = { .get_name = irqchip_fwnode_get_name, .get_parent = irqchip_fwnode_get_parent, }; EXPORT_SYMBOL_GPL(irqchip_fwnode_ops); /** * __irq_domain_alloc_fwnode - Allocate a fwnode_handle suitable for * identifying an irq domain * @type: Type of irqchip_fwnode. 
See linux/irqdomain.h * @id: Optional user provided id if name != NULL * @name: Optional user provided domain name * @pa: Optional user-provided physical address * @parent: Optional parent fwnode_handle * * Allocate a struct irqchip_fwid, and return a pointer to the embedded * fwnode_handle (or NULL on failure). * * Note: The types IRQCHIP_FWNODE_NAMED and IRQCHIP_FWNODE_NAMED_ID are * solely to transport name information to irqdomain creation code. The * node is not stored. For other types the pointer is kept in the irq * domain struct. */ struct fwnode_handle *__irq_domain_alloc_fwnode(unsigned int type, int id, const char *name, phys_addr_t *pa, struct fwnode_handle *parent) { struct irqchip_fwid *fwid; char *n; fwid = kzalloc_obj(*fwid); switch (type) { case IRQCHIP_FWNODE_NAMED: n = kasprintf(GFP_KERNEL, "%s", name); break; case IRQCHIP_FWNODE_NAMED_ID: n = kasprintf(GFP_KERNEL, "%s-%d", name, id); break; default: n = kasprintf(GFP_KERNEL, "irqchip@%pa", pa); break; } if (!fwid || !n) { kfree(fwid); kfree(n); return NULL; } fwid->type = type; fwid->name = n; fwid->pa = pa; fwid->parent = parent; fwnode_init(&fwid->fwnode, &irqchip_fwnode_ops); return &fwid->fwnode; } EXPORT_SYMBOL_GPL(__irq_domain_alloc_fwnode); /** * irq_domain_free_fwnode - Free a non-OF-backed fwnode_handle * @fwnode: fwnode_handle to free * * Free a fwnode_handle allocated with irq_domain_alloc_fwnode. 
*/ void irq_domain_free_fwnode(struct fwnode_handle *fwnode) { struct irqchip_fwid *fwid; if (!fwnode || WARN_ON(!is_fwnode_irqchip(fwnode))) return; fwid = container_of(fwnode, struct irqchip_fwid, fwnode); kfree(fwid->name); kfree(fwid); } EXPORT_SYMBOL_GPL(irq_domain_free_fwnode); static int alloc_name(struct irq_domain *domain, char *base, enum irq_domain_bus_token bus_token) { if (bus_token == DOMAIN_BUS_ANY) domain->name = kasprintf(GFP_KERNEL, "%s", base); else domain->name = kasprintf(GFP_KERNEL, "%s-%d", base, bus_token); if (!domain->name) return -ENOMEM; domain->flags |= IRQ_DOMAIN_NAME_ALLOCATED; return 0; } static int alloc_fwnode_name(struct irq_domain *domain, const struct fwnode_handle *fwnode, enum irq_domain_bus_token bus_token, const char *suffix) { const char *sep = suffix ? "-" : ""; const char *suf = suffix ? : ""; char *name; if (bus_token == DOMAIN_BUS_ANY) name = kasprintf(GFP_KERNEL, "%pfw%s%s", fwnode, sep, suf); else name = kasprintf(GFP_KERNEL, "%pfw%s%s-%d", fwnode, sep, suf, bus_token); if (!name) return -ENOMEM; /* * fwnode paths contain '/', which debugfs is legitimately unhappy * about. Replace them with ':', which does the trick and is not as * offensive as '\'... 
*/ domain->name = strreplace(name, '/', ':'); domain->flags |= IRQ_DOMAIN_NAME_ALLOCATED; return 0; } static int alloc_unknown_name(struct irq_domain *domain, enum irq_domain_bus_token bus_token) { static atomic_t unknown_domains; int id = atomic_inc_return(&unknown_domains); if (bus_token == DOMAIN_BUS_ANY) domain->name = kasprintf(GFP_KERNEL, "unknown-%d", id); else domain->name = kasprintf(GFP_KERNEL, "unknown-%d-%d", id, bus_token); if (!domain->name) return -ENOMEM; domain->flags |= IRQ_DOMAIN_NAME_ALLOCATED; return 0; } static int irq_domain_set_name(struct irq_domain *domain, const struct irq_domain_info *info) { enum irq_domain_bus_token bus_token = info->bus_token; const struct fwnode_handle *fwnode = info->fwnode; if (is_fwnode_irqchip(fwnode)) { const struct irqchip_fwid *fwid = container_of(fwnode, struct irqchip_fwid, fwnode); /* * The name_suffix is only intended to be used to avoid a name * collision when multiple domains are created for a single * device and the name is picked using a real device node. * (Typical use-case is regmap-IRQ controllers for devices * providing more than one physical IRQ.) There should be no * need to use name_suffix with irqchip-fwnode. 
*/ if (info->name_suffix) return -EINVAL; switch (fwid->type) { case IRQCHIP_FWNODE_NAMED: case IRQCHIP_FWNODE_NAMED_ID: return alloc_name(domain, fwid->name, bus_token); default: domain->name = fwid->name; if (bus_token != DOMAIN_BUS_ANY) return alloc_name(domain, fwid->name, bus_token); } } else if (is_of_node(fwnode) || is_acpi_device_node(fwnode) || is_software_node(fwnode)) { return alloc_fwnode_name(domain, fwnode, bus_token, info->name_suffix); } if (domain->name) return 0; if (fwnode) pr_err("Invalid fwnode type for irqdomain\n"); return alloc_unknown_name(domain, bus_token); } static struct irq_domain *__irq_domain_create(const struct irq_domain_info *info) { struct irq_domain *domain; int err; if (WARN_ON((info->size && info->direct_max) || (!IS_ENABLED(CONFIG_IRQ_DOMAIN_NOMAP) && info->direct_max) || (info->direct_max && info->direct_max != info->hwirq_max))) return ERR_PTR(-EINVAL); domain = kzalloc_node(struct_size(domain, revmap, info->size), GFP_KERNEL, of_node_to_nid(to_of_node(info->fwnode))); if (!domain) return ERR_PTR(-ENOMEM); err = irq_domain_set_name(domain, info); if (err) { kfree(domain); return ERR_PTR(err); } domain->fwnode = fwnode_handle_get(info->fwnode); fwnode_dev_initialized(domain->fwnode, true); /* Fill structure */ INIT_RADIX_TREE(&domain->revmap_tree, GFP_KERNEL); domain->ops = info->ops; domain->host_data = info->host_data; domain->bus_token = info->bus_token; domain->hwirq_max = info->hwirq_max; if (info->direct_max) domain->flags |= IRQ_DOMAIN_FLAG_NO_MAP; domain->revmap_size = info->size; /* * Hierarchical domains use the domain lock of the root domain * (innermost domain). * * For non-hierarchical domains (as for root domains), the root * pointer is set to the domain itself so that &domain->root->mutex * always points to the right lock. 
*/ mutex_init(&domain->mutex); domain->root = domain; irq_domain_check_hierarchy(domain); return domain; } static void __irq_domain_publish(struct irq_domain *domain) { mutex_lock(&irq_domain_mutex); debugfs_add_domain_dir(domain); list_add(&domain->link, &irq_domain_list); mutex_unlock(&irq_domain_mutex); pr_debug("Added domain %s\n", domain->name); } static void irq_domain_free(struct irq_domain *domain) { fwnode_dev_initialized(domain->fwnode, false); fwnode_handle_put(domain->fwnode); if (domain->flags & IRQ_DOMAIN_NAME_ALLOCATED) kfree(domain->name); kfree(domain); } static void irq_domain_instantiate_descs(const struct irq_domain_info *info) { if (!IS_ENABLED(CONFIG_SPARSE_IRQ)) return; if (irq_alloc_descs(info->virq_base, info->virq_base, info->size, of_node_to_nid(to_of_node(info->fwnode))) < 0) { pr_info("Cannot allocate irq_descs @ IRQ%d, assuming pre-allocated\n", info->virq_base); } } static struct irq_domain *__irq_domain_instantiate(const struct irq_domain_info *info, bool cond_alloc_descs, bool force_associate) { struct irq_domain *domain; int err; domain = __irq_domain_create(info); if (IS_ERR(domain)) return domain; domain->flags |= info->domain_flags; domain->exit = info->exit; domain->dev = info->dev; #ifdef CONFIG_IRQ_DOMAIN_HIERARCHY if (info->parent) { domain->root = info->parent->root; domain->parent = info->parent; } #endif if (info->dgc_info) { err = irq_domain_alloc_generic_chips(domain, info->dgc_info); if (err) goto err_domain_free; } if (info->init) { err = info->init(domain); if (err) goto err_domain_gc_remove; } __irq_domain_publish(domain); if (cond_alloc_descs && info->virq_base > 0) irq_domain_instantiate_descs(info); /* * Legacy interrupt domains have a fixed Linux interrupt number * associated. Other interrupt domains can request association by * providing a Linux interrupt number > 0. 
*/ if (force_associate || info->virq_base > 0) { irq_domain_associate_many(domain, info->virq_base, info->hwirq_base, info->size - info->hwirq_base); } return domain; err_domain_gc_remove: if (info->dgc_info) irq_domain_remove_generic_chips(domain); err_domain_free: irq_domain_free(domain); return ERR_PTR(err); } /** * irq_domain_instantiate() - Instantiate a new irq domain data structure * @info: Domain information pointer pointing to the information for this domain * * Return: A pointer to the instantiated irq domain or an ERR_PTR value. */ struct irq_domain *irq_domain_instantiate(const struct irq_domain_info *info) { return __irq_domain_instantiate(info, false, false); } EXPORT_SYMBOL_GPL(irq_domain_instantiate); /** * irq_domain_remove() - Remove an irq domain. * @domain: domain to remove * * This routine is used to remove an irq domain. The caller must ensure * that all mappings within the domain have been disposed of prior to * use, depending on the revmap type. */ void irq_domain_remove(struct irq_domain *domain) { if (domain->exit) domain->exit(domain); mutex_lock(&irq_domain_mutex); debugfs_remove_domain_dir(domain); WARN_ON(!radix_tree_empty(&domain->revmap_tree)); list_del(&domain->link); /* * If the going away domain is the default one, reset it. 
*/ if (unlikely(irq_default_domain == domain)) irq_set_default_domain(NULL); mutex_unlock(&irq_domain_mutex); if (domain->flags & IRQ_DOMAIN_FLAG_DESTROY_GC) irq_domain_remove_generic_chips(domain); pr_debug("Removed domain %s\n", domain->name); irq_domain_free(domain); } EXPORT_SYMBOL_GPL(irq_domain_remove); void irq_domain_update_bus_token(struct irq_domain *domain, enum irq_domain_bus_token bus_token) { char *name; if (domain->bus_token == bus_token) return; mutex_lock(&irq_domain_mutex); domain->bus_token = bus_token; name = kasprintf(GFP_KERNEL, "%s-%d", domain->name, bus_token); if (!name) { mutex_unlock(&irq_domain_mutex); return; } debugfs_remove_domain_dir(domain); if (domain->flags & IRQ_DOMAIN_NAME_ALLOCATED) kfree(domain->name); else domain->flags |= IRQ_DOMAIN_NAME_ALLOCATED; domain->name = name; debugfs_add_domain_dir(domain); mutex_unlock(&irq_domain_mutex); } EXPORT_SYMBOL_GPL(irq_domain_update_bus_token); /** * irq_domain_create_simple() - Register an irq_domain and optionally map a range of irqs * @fwnode: firmware node for the interrupt controller * @size: total number of irqs in mapping * @first_irq: first number of irq block assigned to the domain, * pass zero to assign irqs on-the-fly. If first_irq is non-zero, then * pre-map all of the irqs in the domain to virqs starting at first_irq. * @ops: domain callbacks * @host_data: Controller private data pointer * * Allocates an irq_domain, and optionally if first_irq is positive then also * allocate irq_descs and map all of the hwirqs to virqs starting at first_irq. * * This is intended to implement the expected behaviour for most * interrupt controllers. If device tree is used, then first_irq will be 0 and * irqs get mapped dynamically on the fly. However, if the controller requires * static virq assignments (non-DT boot) then it will set that up correctly. 
*/ struct irq_domain *irq_domain_create_simple(struct fwnode_handle *fwnode, unsigned int size, unsigned int first_irq, const struct irq_domain_ops *ops, void *host_data) { struct irq_domain_info info = { .fwnode = fwnode, .size = size, .hwirq_max = size, .virq_base = first_irq, .ops = ops, .host_data = host_data, }; struct irq_domain *domain = __irq_domain_instantiate(&info, true, false); return IS_ERR(domain) ? NULL : domain; } EXPORT_SYMBOL_GPL(irq_domain_create_simple); struct irq_domain *irq_domain_create_legacy(struct fwnode_handle *fwnode, unsigned int size, unsigned int first_irq, irq_hw_number_t first_hwirq, const struct irq_domain_ops *ops, void *host_data) { struct irq_domain_info info = { .fwnode = fwnode, .size = first_hwirq + size, .hwirq_max = first_hwirq + size, .hwirq_base = first_hwirq, .virq_base = first_irq, .ops = ops, .host_data = host_data, }; struct irq_domain *domain = __irq_domain_instantiate(&info, false, true); return IS_ERR(domain) ? NULL : domain; } EXPORT_SYMBOL_GPL(irq_domain_create_legacy); /** * irq_find_matching_fwspec() - Locates a domain for a given fwspec * @fwspec: FW specifier for an interrupt * @bus_token: domain-specific data */ struct irq_domain *irq_find_matching_fwspec(struct irq_fwspec *fwspec, enum irq_domain_bus_token bus_token) { struct irq_domain *h, *found = NULL; struct fwnode_handle *fwnode = fwspec->fwnode; int rc; /* * We might want to match the legacy controller last since * it might potentially be set to match all interrupts in * the absence of a device node. This isn't a problem so far * yet though... * * bus_token == DOMAIN_BUS_ANY matches any domain, any other * values must generate an exact match for the domain to be * selected. 
*/ mutex_lock(&irq_domain_mutex); list_for_each_entry(h, &irq_domain_list, link) { if (h->ops->select && bus_token != DOMAIN_BUS_ANY) rc = h->ops->select(h, fwspec, bus_token); else if (h->ops->match) rc = h->ops->match(h, to_of_node(fwnode), bus_token); else rc = ((fwnode != NULL) && (h->fwnode == fwnode) && ((bus_token == DOMAIN_BUS_ANY) || (h->bus_token == bus_token))); if (rc) { found = h; break; } } mutex_unlock(&irq_domain_mutex); return found; } EXPORT_SYMBOL_GPL(irq_find_matching_fwspec); /** * irq_set_default_domain() - Set a "default" irq domain * @domain: default domain pointer * * For convenience, it's possible to set a "default" domain that will be used * whenever NULL is passed to irq_create_mapping(). It makes life easier for * platforms that want to manipulate a few hard coded interrupt numbers that * aren't properly represented in the device-tree. */ void irq_set_default_domain(struct irq_domain *domain) { pr_debug("Default domain set to @0x%p\n", domain); irq_default_domain = domain; } EXPORT_SYMBOL_GPL(irq_set_default_domain); /** * irq_get_default_domain() - Retrieve the "default" irq domain * * Returns: the default domain, if any. * * Modern code should never use this. This should only be used on * systems that cannot implement a firmware->fwnode mapping (which * both DT and ACPI provide). 
*/ struct irq_domain *irq_get_default_domain(void) { return irq_default_domain; } EXPORT_SYMBOL_GPL(irq_get_default_domain); static bool irq_domain_is_nomap(struct irq_domain *domain) { return IS_ENABLED(CONFIG_IRQ_DOMAIN_NOMAP) && (domain->flags & IRQ_DOMAIN_FLAG_NO_MAP); } static void irq_domain_clear_mapping(struct irq_domain *domain, irq_hw_number_t hwirq) { lockdep_assert_held(&domain->root->mutex); if (irq_domain_is_nomap(domain)) return; if (hwirq < domain->revmap_size) rcu_assign_pointer(domain->revmap[hwirq], NULL); else radix_tree_delete(&domain->revmap_tree, hwirq); } static void irq_domain_set_mapping(struct irq_domain *domain, irq_hw_number_t hwirq, struct irq_data *irq_data) { /* * This also makes sure that all domains point to the same root when * called from irq_domain_insert_irq() for each domain in a hierarchy. */ lockdep_assert_held(&domain->root->mutex); if (irq_domain_is_nomap(domain)) return; if (hwirq < domain->revmap_size) rcu_assign_pointer(domain->revmap[hwirq], irq_data); else radix_tree_insert(&domain->revmap_tree, hwirq, irq_data); } static void irq_domain_disassociate(struct irq_domain *domain, unsigned int irq) { struct irq_data *irq_data = irq_get_irq_data(irq); irq_hw_number_t hwirq; if (WARN(!irq_data || irq_data->domain != domain, "virq%i doesn't exist; cannot disassociate\n", irq)) return; hwirq = irq_data->hwirq; mutex_lock(&domain->root->mutex); irq_set_status_flags(irq, IRQ_NOREQUEST); /* remove chip and handler */ irq_set_chip_and_handler(irq, NULL, NULL); /* Make sure it's completed */ synchronize_irq(irq); /* Tell the PIC about it */ if (domain->ops->unmap) domain->ops->unmap(domain, irq); smp_mb(); irq_data->domain = NULL; irq_data->hwirq = 0; domain->mapcount--; /* Clear reverse map for this hwirq */ irq_domain_clear_mapping(domain, hwirq); mutex_unlock(&domain->root->mutex); } static int irq_domain_associate_locked(struct irq_domain *domain, unsigned int virq, irq_hw_number_t hwirq) { struct irq_data *irq_data = 
irq_get_irq_data(virq); int ret; if (WARN(hwirq >= domain->hwirq_max, "error: hwirq 0x%x is too large for %s\n", (int)hwirq, domain->name)) return -EINVAL; if (WARN(!irq_data, "error: virq%i is not allocated", virq)) return -EINVAL; if (WARN(irq_data->domain, "error: virq%i is already associated", virq)) return -EINVAL; irq_data->hwirq = hwirq; irq_data->domain = domain; if (domain->ops->map) { ret = domain->ops->map(domain, virq, hwirq); if (ret != 0) { /* * If map() returns -EPERM, this interrupt is protected * by the firmware or some other service and shall not * be mapped. Don't bother telling the user about it. */ if (ret != -EPERM) { pr_info("%s didn't like hwirq-0x%lx to VIRQ%i mapping (rc=%d)\n", domain->name, hwirq, virq, ret); } irq_data->domain = NULL; irq_data->hwirq = 0; return ret; } } domain->mapcount++; irq_domain_set_mapping(domain, hwirq, irq_data); irq_clear_status_flags(virq, IRQ_NOREQUEST); return 0; } int irq_domain_associate(struct irq_domain *domain, unsigned int virq, irq_hw_number_t hwirq) { int ret; mutex_lock(&domain->root->mutex); ret = irq_domain_associate_locked(domain, virq, hwirq); mutex_unlock(&domain->root->mutex); return ret; } EXPORT_SYMBOL_GPL(irq_domain_associate); void irq_domain_associate_many(struct irq_domain *domain, unsigned int irq_base, irq_hw_number_t hwirq_base, int count) { struct device_node *of_node; int i; of_node = irq_domain_get_of_node(domain); pr_debug("%s(%s, irqbase=%i, hwbase=%i, count=%i)\n", __func__, of_node_full_name(of_node), irq_base, (int)hwirq_base, count); for (i = 0; i < count; i++) irq_domain_associate(domain, irq_base + i, hwirq_base + i); } EXPORT_SYMBOL_GPL(irq_domain_associate_many); #ifdef CONFIG_IRQ_DOMAIN_NOMAP /** * irq_create_direct_mapping() - Allocate an irq for direct mapping * @domain: domain to allocate the irq for or NULL for default domain * * This routine is used for irq controllers which can choose the hardware * interrupt numbers they generate. 
In such a case it's simplest to use * the linux irq as the hardware interrupt number. It still uses the linear * or radix tree to store the mapping, but the irq controller can optimize * the revmap path by using the hwirq directly. */ unsigned int irq_create_direct_mapping(struct irq_domain *domain) { struct device_node *of_node; unsigned int virq; if (domain == NULL) domain = irq_default_domain; of_node = irq_domain_get_of_node(domain); virq = irq_alloc_desc_from(1, of_node_to_nid(of_node)); if (!virq) { pr_debug("create_direct virq allocation failed\n"); return 0; } if (virq >= domain->hwirq_max) { pr_err("ERROR: no free irqs available below %lu maximum\n", domain->hwirq_max); irq_free_desc(virq); return 0; } pr_debug("create_direct obtained virq %d\n", virq); if (irq_domain_associate(domain, virq, virq)) { irq_free_desc(virq); return 0; } return virq; } EXPORT_SYMBOL_GPL(irq_create_direct_mapping); #endif static unsigned int irq_create_mapping_affinity_locked(struct irq_domain *domain, irq_hw_number_t hwirq, const struct irq_affinity_desc *affinity) { struct device_node *of_node = irq_domain_get_of_node(domain); int virq; pr_debug("irq_create_mapping(0x%p, 0x%lx)\n", domain, hwirq); /* Allocate a virtual interrupt number */ virq = irq_domain_alloc_descs(-1, 1, hwirq, of_node_to_nid(of_node), affinity); if (virq <= 0) { pr_debug("-> virq allocation failed\n"); return 0; } if (irq_domain_associate_locked(domain, virq, hwirq)) { irq_free_desc(virq); return 0; } pr_debug("irq %lu on domain %s mapped to virtual irq %u\n", hwirq, of_node_full_name(of_node), virq); return virq; } /** * irq_create_mapping_affinity() - Map a hardware interrupt into linux irq space * @domain: domain owning this hardware interrupt or NULL for default domain * @hwirq: hardware irq number in that domain space * @affinity: irq affinity * * Only one mapping per hardware interrupt is permitted. Returns a linux * irq number. 
* If the sense/trigger is to be specified, set_irq_type() should be called * on the number returned from that call. */ unsigned int irq_create_mapping_affinity(struct irq_domain *domain, irq_hw_number_t hwirq, const struct irq_affinity_desc *affinity) { int virq; /* Look for default domain if necessary */ if (domain == NULL) domain = irq_default_domain; if (domain == NULL) { WARN(1, "%s(, %lx) called with NULL domain\n", __func__, hwirq); return 0; } mutex_lock(&domain->root->mutex); /* Check if mapping already exists */ virq = irq_find_mapping(domain, hwirq); if (virq) { pr_debug("existing mapping on virq %d\n", virq); goto out; } virq = irq_create_mapping_affinity_locked(domain, hwirq, affinity); out: mutex_unlock(&domain->root->mutex); return virq; } EXPORT_SYMBOL_GPL(irq_create_mapping_affinity); static int irq_domain_translate(struct irq_domain *d, struct irq_fwspec *fwspec, irq_hw_number_t *hwirq, unsigned int *type) { #ifdef CONFIG_IRQ_DOMAIN_HIERARCHY if (d->ops->translate) return d->ops->translate(d, fwspec, hwirq, type); #endif if (d->ops->xlate) return d->ops->xlate(d, to_of_node(fwspec->fwnode), fwspec->param, fwspec->param_count, hwirq, type); /* If domain has no translation, then we assume interrupt line */ *hwirq = fwspec->param[0]; return 0; } void of_phandle_args_to_fwspec(struct device_node *np, const u32 *args, unsigned int count, struct irq_fwspec *fwspec) { int i; fwspec->fwnode = of_fwnode_handle(np); fwspec->param_count = count; for (i = 0; i < count; i++) fwspec->param[i] = args[i]; } EXPORT_SYMBOL_GPL(of_phandle_args_to_fwspec); static struct irq_domain *fwspec_to_domain(struct irq_fwspec *fwspec) { struct irq_domain *domain; if (fwspec->fwnode) { domain = irq_find_matching_fwspec(fwspec, DOMAIN_BUS_WIRED); if (!domain) domain = irq_find_matching_fwspec(fwspec, DOMAIN_BUS_ANY); } else { domain = irq_default_domain; } return domain; } #ifdef CONFIG_IRQ_DOMAIN_HIERARCHY int irq_populate_fwspec_info(struct irq_fwspec *fwspec, struct 
irq_fwspec_info *info) { struct irq_domain *domain = fwspec_to_domain(fwspec); memset(info, 0, sizeof(*info)); if (!domain || !domain->ops->get_fwspec_info) return 0; return domain->ops->get_fwspec_info(fwspec, info); } #endif unsigned int irq_create_fwspec_mapping(struct irq_fwspec *fwspec) { unsigned int type = IRQ_TYPE_NONE; struct irq_domain *domain; struct irq_data *irq_data; irq_hw_number_t hwirq; int virq; domain = fwspec_to_domain(fwspec); if (!domain) { pr_warn("no irq domain found for %s !\n", of_node_full_name(to_of_node(fwspec->fwnode))); return 0; } if (irq_domain_translate(domain, fwspec, &hwirq, &type)) return 0; /* * WARN if the irqchip returns a type with bits * outside the sense mask set and clear these bits. */ if (WARN_ON(type & ~IRQ_TYPE_SENSE_MASK)) type &= IRQ_TYPE_SENSE_MASK; mutex_lock(&domain->root->mutex); /* * If we've already configured this interrupt, * don't do it again, or hell will break loose. */ virq = irq_find_mapping(domain, hwirq); if (virq) { /* * If the trigger type is not specified or matches the * current trigger type then we are done so return the * interrupt number. */ if (type == IRQ_TYPE_NONE || type == irq_get_trigger_type(virq)) goto out; /* * If the trigger type has not been set yet, then set * it now and return the interrupt number. 
*/ if (irq_get_trigger_type(virq) == IRQ_TYPE_NONE) { irq_data = irq_get_irq_data(virq); if (!irq_data) { virq = 0; goto out; } irqd_set_trigger_type(irq_data, type); goto out; } pr_warn("type mismatch, failed to map hwirq-%lu for %s!\n", hwirq, of_node_full_name(to_of_node(fwspec->fwnode))); virq = 0; goto out; } if (irq_domain_is_hierarchy(domain)) { if (irq_domain_is_msi_device(domain)) { mutex_unlock(&domain->root->mutex); virq = msi_device_domain_alloc_wired(domain, hwirq, type); mutex_lock(&domain->root->mutex); } else virq = irq_domain_alloc_irqs_locked(domain, -1, 1, NUMA_NO_NODE, fwspec, false, NULL); if (virq <= 0) { virq = 0; goto out; } } else { /* Create mapping */ virq = irq_create_mapping_affinity_locked(domain, hwirq, NULL); if (!virq) goto out; } irq_data = irq_get_irq_data(virq); if (WARN_ON(!irq_data)) { virq = 0; goto out; } /* Store trigger type */ irqd_set_trigger_type(irq_data, type); out: mutex_unlock(&domain->root->mutex); return virq; } EXPORT_SYMBOL_GPL(irq_create_fwspec_mapping); unsigned int irq_create_of_mapping(struct of_phandle_args *irq_data) { struct irq_fwspec fwspec; of_phandle_args_to_fwspec(irq_data->np, irq_data->args, irq_data->args_count, &fwspec); return irq_create_fwspec_mapping(&fwspec); } EXPORT_SYMBOL_GPL(irq_create_of_mapping); /** * irq_dispose_mapping() - Unmap an interrupt * @virq: linux irq number of the interrupt to unmap */ void irq_dispose_mapping(unsigned int virq) { struct irq_data *irq_data; struct irq_domain *domain; irq_data = virq ? irq_get_irq_data(virq) : NULL; if (!irq_data) return; domain = irq_data->domain; if (WARN_ON(domain == NULL)) return; if (irq_domain_is_hierarchy(domain)) { irq_domain_free_one_irq(domain, virq); } else { irq_domain_disassociate(domain, virq); irq_free_desc(virq); } } EXPORT_SYMBOL_GPL(irq_dispose_mapping); /** * __irq_resolve_mapping() - Find a linux irq from a hw irq number. 
* @domain: domain owning this hardware interrupt * @hwirq: hardware irq number in that domain space * @irq: optional pointer to return the Linux irq if required * * Returns the interrupt descriptor. */ struct irq_desc *__irq_resolve_mapping(struct irq_domain *domain, irq_hw_number_t hwirq, unsigned int *irq) { struct irq_desc *desc = NULL; struct irq_data *data; /* Look for default domain if necessary */ if (domain == NULL) domain = irq_default_domain; if (domain == NULL) return desc; if (irq_domain_is_nomap(domain)) { if (hwirq < domain->hwirq_max) { data = irq_domain_get_irq_data(domain, hwirq); if (data && data->hwirq == hwirq) desc = irq_data_to_desc(data); if (irq && desc) *irq = hwirq; } return desc; } rcu_read_lock(); /* Check if the hwirq is in the linear revmap. */ if (hwirq < domain->revmap_size) data = rcu_dereference(domain->revmap[hwirq]); else data = radix_tree_lookup(&domain->revmap_tree, hwirq); if (likely(data)) { desc = irq_data_to_desc(data); if (irq) *irq = data->irq; } rcu_read_unlock(); return desc; } EXPORT_SYMBOL_GPL(__irq_resolve_mapping); /** * irq_domain_xlate_onecell() - Generic xlate for direct one cell bindings * @d: Interrupt domain involved in the translation * @ctrlr: The device tree node for the device whose interrupt is translated * @intspec: The interrupt specifier data from the device tree * @intsize: The number of entries in @intspec * @out_hwirq: Pointer to storage for the hardware interrupt number * @out_type: Pointer to storage for the interrupt type * * Device Tree IRQ specifier translation function which works with one cell * bindings where the cell value maps directly to the hwirq number. 
*/ int irq_domain_xlate_onecell(struct irq_domain *d, struct device_node *ctrlr, const u32 *intspec, unsigned int intsize, unsigned long *out_hwirq, unsigned int *out_type) { if (WARN_ON(intsize < 1)) return -EINVAL; *out_hwirq = intspec[0]; *out_type = IRQ_TYPE_NONE; return 0; } EXPORT_SYMBOL_GPL(irq_domain_xlate_onecell); /** * irq_domain_xlate_twocell() - Generic xlate for direct two cell bindings * @d: Interrupt domain involved in the translation * @ctrlr: The device tree node for the device whose interrupt is translated * @intspec: The interrupt specifier data from the device tree * @intsize: The number of entries in @intspec * @out_hwirq: Pointer to storage for the hardware interrupt number * @out_type: Pointer to storage for the interrupt type * * Device Tree IRQ specifier translation function which works with two cell * bindings where the cell values map directly to the hwirq number * and linux irq flags. */ int irq_domain_xlate_twocell(struct irq_domain *d, struct device_node *ctrlr, const u32 *intspec, unsigned int intsize, irq_hw_number_t *out_hwirq, unsigned int *out_type) { struct irq_fwspec fwspec; of_phandle_args_to_fwspec(ctrlr, intspec, intsize, &fwspec); return irq_domain_translate_twocell(d, &fwspec, out_hwirq, out_type); } EXPORT_SYMBOL_GPL(irq_domain_xlate_twocell); /** * irq_domain_xlate_twothreecell() - Generic xlate for direct two or three cell bindings * @d: Interrupt domain involved in the translation * @ctrlr: The device tree node for the device whose interrupt is translated * @intspec: The interrupt specifier data from the device tree * @intsize: The number of entries in @intspec * @out_hwirq: Pointer to storage for the hardware interrupt number * @out_type: Pointer to storage for the interrupt type * * Device Tree interrupt specifier translation function for two or three * cell bindings, where the cell values map directly to the hardware * interrupt number and the type specifier. 
*/ int irq_domain_xlate_twothreecell(struct irq_domain *d, struct device_node *ctrlr, const u32 *intspec, unsigned int intsize, irq_hw_number_t *out_hwirq, unsigned int *out_type) { struct irq_fwspec fwspec; of_phandle_args_to_fwspec(ctrlr, intspec, intsize, &fwspec); return irq_domain_translate_twothreecell(d, &fwspec, out_hwirq, out_type); } EXPORT_SYMBOL_GPL(irq_domain_xlate_twothreecell); /** * irq_domain_xlate_onetwocell() - Generic xlate for one or two cell bindings * @d: Interrupt domain involved in the translation * @ctrlr: The device tree node for the device whose interrupt is translated * @intspec: The interrupt specifier data from the device tree * @intsize: The number of entries in @intspec * @out_hwirq: Pointer to storage for the hardware interrupt number * @out_type: Pointer to storage for the interrupt type * * Device Tree IRQ specifier translation function which works with either one * or two cell bindings where the cell values map directly to the hwirq number * and linux irq flags. * * Note: don't use this function unless your interrupt controller explicitly * supports both one and two cell bindings. For the majority of controllers * the _onecell() or _twocell() variants above should be used. 
*/ int irq_domain_xlate_onetwocell(struct irq_domain *d, struct device_node *ctrlr, const u32 *intspec, unsigned int intsize, unsigned long *out_hwirq, unsigned int *out_type) { if (WARN_ON(intsize < 1)) return -EINVAL; *out_hwirq = intspec[0]; if (intsize > 1) *out_type = intspec[1] & IRQ_TYPE_SENSE_MASK; else *out_type = IRQ_TYPE_NONE; return 0; } EXPORT_SYMBOL_GPL(irq_domain_xlate_onetwocell); const struct irq_domain_ops irq_domain_simple_ops = { .xlate = irq_domain_xlate_onetwocell, }; EXPORT_SYMBOL_GPL(irq_domain_simple_ops); /** * irq_domain_translate_onecell() - Generic translate for direct one cell * bindings * @d: Interrupt domain involved in the translation * @fwspec: The firmware interrupt specifier to translate * @out_hwirq: Pointer to storage for the hardware interrupt number * @out_type: Pointer to storage for the interrupt type */ int irq_domain_translate_onecell(struct irq_domain *d, struct irq_fwspec *fwspec, unsigned long *out_hwirq, unsigned int *out_type) { if (WARN_ON(fwspec->param_count < 1)) return -EINVAL; *out_hwirq = fwspec->param[0]; *out_type = IRQ_TYPE_NONE; return 0; } EXPORT_SYMBOL_GPL(irq_domain_translate_onecell); /** * irq_domain_translate_twocell() - Generic translate for direct two cell * bindings * @d: Interrupt domain involved in the translation * @fwspec: The firmware interrupt specifier to translate * @out_hwirq: Pointer to storage for the hardware interrupt number * @out_type: Pointer to storage for the interrupt type * * Device Tree IRQ specifier translation function which works with two cell * bindings where the cell values map directly to the hwirq number * and linux irq flags. 
*/ int irq_domain_translate_twocell(struct irq_domain *d, struct irq_fwspec *fwspec, unsigned long *out_hwirq, unsigned int *out_type) { if (WARN_ON(fwspec->param_count < 2)) return -EINVAL; *out_hwirq = fwspec->param[0]; *out_type = fwspec->param[1] & IRQ_TYPE_SENSE_MASK; return 0; } EXPORT_SYMBOL_GPL(irq_domain_translate_twocell); /** * irq_domain_translate_twothreecell() - Generic translate for direct two or three cell * bindings * @d: Interrupt domain involved in the translation * @fwspec: The firmware interrupt specifier to translate * @out_hwirq: Pointer to storage for the hardware interrupt number * @out_type: Pointer to storage for the interrupt type * * Firmware interrupt specifier translation function for two or three cell * specifications, where the parameter values map directly to the hardware * interrupt number and the type specifier. */ int irq_domain_translate_twothreecell(struct irq_domain *d, struct irq_fwspec *fwspec, unsigned long *out_hwirq, unsigned int *out_type) { if (fwspec->param_count == 2) { *out_hwirq = fwspec->param[0]; *out_type = fwspec->param[1] & IRQ_TYPE_SENSE_MASK; return 0; } if (fwspec->param_count == 3) { *out_hwirq = fwspec->param[1]; *out_type = fwspec->param[2] & IRQ_TYPE_SENSE_MASK; return 0; } return -EINVAL; } EXPORT_SYMBOL_GPL(irq_domain_translate_twothreecell); int irq_domain_alloc_descs(int virq, unsigned int cnt, irq_hw_number_t hwirq, int node, const struct irq_affinity_desc *affinity) { unsigned int hint; if (virq >= 0) { virq = __irq_alloc_descs(virq, virq, cnt, node, THIS_MODULE, affinity); } else { hint = hwirq % irq_get_nr_irqs(); if (hint == 0) hint++; virq = __irq_alloc_descs(-1, hint, cnt, node, THIS_MODULE, affinity); if (virq <= 0 && hint > 1) { virq = __irq_alloc_descs(-1, 1, cnt, node, THIS_MODULE, affinity); } } return virq; } /** * irq_domain_reset_irq_data - Clear hwirq, chip and chip_data in @irq_data * @irq_data: The pointer to irq_data */ void irq_domain_reset_irq_data(struct irq_data *irq_data) { 
irq_data->hwirq = 0; irq_data->chip = &no_irq_chip; irq_data->chip_data = NULL; } EXPORT_SYMBOL_GPL(irq_domain_reset_irq_data); #ifdef CONFIG_IRQ_DOMAIN_HIERARCHY static void irq_domain_insert_irq(int virq) { struct irq_data *data; for (data = irq_get_irq_data(virq); data; data = data->parent_data) { struct irq_domain *domain = data->domain; domain->mapcount++; irq_domain_set_mapping(domain, data->hwirq, data); } irq_clear_status_flags(virq, IRQ_NOREQUEST); } static void irq_domain_remove_irq(int virq) { struct irq_data *data; irq_set_status_flags(virq, IRQ_NOREQUEST); irq_set_chip_and_handler(virq, NULL, NULL); synchronize_irq(virq); smp_mb(); for (data = irq_get_irq_data(virq); data; data = data->parent_data) { struct irq_domain *domain = data->domain; irq_hw_number_t hwirq = data->hwirq; domain->mapcount--; irq_domain_clear_mapping(domain, hwirq); } } static struct irq_data *irq_domain_insert_irq_data(struct irq_domain *domain, struct irq_data *child) { struct irq_data *irq_data; irq_data = kzalloc_node(sizeof(*irq_data), GFP_KERNEL, irq_data_get_node(child)); if (irq_data) { child->parent_data = irq_data; irq_data->irq = child->irq; irq_data->common = child->common; irq_data->domain = domain; } return irq_data; } static void __irq_domain_free_hierarchy(struct irq_data *irq_data) { struct irq_data *tmp; while (irq_data) { tmp = irq_data; irq_data = irq_data->parent_data; kfree(tmp); } } static void irq_domain_free_irq_data(unsigned int virq, unsigned int nr_irqs) { struct irq_data *irq_data, *tmp; int i; for (i = 0; i < nr_irqs; i++) { irq_data = irq_get_irq_data(virq + i); tmp = irq_data->parent_data; irq_data->parent_data = NULL; irq_data->domain = NULL; __irq_domain_free_hierarchy(tmp); } } /** * irq_domain_disconnect_hierarchy - Mark the first unused level of a hierarchy * @domain: IRQ domain from which the hierarchy is to be disconnected * @virq: IRQ number where the hierarchy is to be trimmed * * Marks the @virq level belonging to @domain as disconnected. 
* Returns -EINVAL if @virq doesn't have a valid irq_data pointing * to @domain. * * Its only use is to be able to trim levels of hierarchy that do not * have any real meaning for this interrupt, and that the driver marks * as such from its .alloc() callback. */ int irq_domain_disconnect_hierarchy(struct irq_domain *domain, unsigned int virq) { struct irq_data *irqd; irqd = irq_domain_get_irq_data(domain, virq); if (!irqd) return -EINVAL; irqd->chip = ERR_PTR(-ENOTCONN); return 0; } EXPORT_SYMBOL_GPL(irq_domain_disconnect_hierarchy); static int irq_domain_trim_hierarchy(unsigned int virq) { struct irq_data *tail, *irqd, *irq_data; irq_data = irq_get_irq_data(virq); tail = NULL; /* The first entry must have a valid irqchip */ if (IS_ERR_OR_NULL(irq_data->chip)) return -EINVAL; /* * Validate that the irq_data chain is sane in the presence of * a hierarchy trimming marker. */ for (irqd = irq_data->parent_data; irqd; irq_data = irqd, irqd = irqd->parent_data) { /* Can't have a valid irqchip after a trim marker */ if (irqd->chip && tail) return -EINVAL; /* Can't have an empty irqchip before a trim marker */ if (!irqd->chip && !tail) return -EINVAL; if (IS_ERR(irqd->chip)) { /* Only -ENOTCONN is a valid trim marker */ if (PTR_ERR(irqd->chip) != -ENOTCONN) return -EINVAL; tail = irq_data; } } /* No trim marker, nothing to do */ if (!tail) return 0; pr_info("IRQ%d: trimming hierarchy from %s\n", virq, tail->parent_data->domain->name); /* Sever the inner part of the hierarchy... 
*/ irqd = tail; tail = tail->parent_data; irqd->parent_data = NULL; __irq_domain_free_hierarchy(tail); return 0; } static int irq_domain_alloc_irq_data(struct irq_domain *domain, unsigned int virq, unsigned int nr_irqs) { struct irq_data *irq_data; struct irq_domain *parent; int i; /* The outermost irq_data is embedded in struct irq_desc */ for (i = 0; i < nr_irqs; i++) { irq_data = irq_get_irq_data(virq + i); irq_data->domain = domain; for (parent = domain->parent; parent; parent = parent->parent) { irq_data = irq_domain_insert_irq_data(parent, irq_data); if (!irq_data) { irq_domain_free_irq_data(virq, i + 1); return -ENOMEM; } } } return 0; } /** * irq_domain_get_irq_data - Get irq_data associated with @virq and @domain * @domain: domain to match * @virq: IRQ number to get irq_data */ struct irq_data *irq_domain_get_irq_data(struct irq_domain *domain, unsigned int virq) { struct irq_data *irq_data; for (irq_data = irq_get_irq_data(virq); irq_data; irq_data = irq_data->parent_data) if (irq_data->domain == domain) return irq_data; return NULL; } EXPORT_SYMBOL_GPL(irq_domain_get_irq_data); /** * irq_domain_set_hwirq_and_chip - Set hwirq and irqchip of @virq at @domain * @domain: Interrupt domain to match * @virq: IRQ number * @hwirq: The hwirq number * @chip: The associated interrupt chip * @chip_data: The associated chip data */ int irq_domain_set_hwirq_and_chip(struct irq_domain *domain, unsigned int virq, irq_hw_number_t hwirq, const struct irq_chip *chip, void *chip_data) { struct irq_data *irq_data = irq_domain_get_irq_data(domain, virq); if (!irq_data) return -ENOENT; irq_data->hwirq = hwirq; irq_data->chip = (struct irq_chip *)(chip ? 
chip : &no_irq_chip); irq_data->chip_data = chip_data; return 0; } EXPORT_SYMBOL_GPL(irq_domain_set_hwirq_and_chip); /** * irq_domain_set_info - Set the complete data for a @virq in @domain * @domain: Interrupt domain to match * @virq: IRQ number * @hwirq: The hardware interrupt number * @chip: The associated interrupt chip * @chip_data: The associated interrupt chip data * @handler: The interrupt flow handler * @handler_data: The interrupt flow handler data * @handler_name: The interrupt handler name */ void irq_domain_set_info(struct irq_domain *domain, unsigned int virq, irq_hw_number_t hwirq, const struct irq_chip *chip, void *chip_data, irq_flow_handler_t handler, void *handler_data, const char *handler_name) { irq_domain_set_hwirq_and_chip(domain, virq, hwirq, chip, chip_data); __irq_set_handler(virq, handler, 0, handler_name); irq_set_handler_data(virq, handler_data); } EXPORT_SYMBOL(irq_domain_set_info); /** * irq_domain_free_irqs_common - Clear irq_data and free the parent * @domain: Interrupt domain to match * @virq: IRQ number to start with * @nr_irqs: The number of irqs to free */ void irq_domain_free_irqs_common(struct irq_domain *domain, unsigned int virq, unsigned int nr_irqs) { struct irq_data *irq_data; int i; for (i = 0; i < nr_irqs; i++) { irq_data = irq_domain_get_irq_data(domain, virq + i); if (irq_data) irq_domain_reset_irq_data(irq_data); } irq_domain_free_irqs_parent(domain, virq, nr_irqs); } EXPORT_SYMBOL_GPL(irq_domain_free_irqs_common); /** * irq_domain_free_irqs_top - Clear handler and handler data, clear irqdata and free parent * @domain: Interrupt domain to match * @virq: IRQ number to start with * @nr_irqs: The number of irqs to free */ void irq_domain_free_irqs_top(struct irq_domain *domain, unsigned int virq, unsigned int nr_irqs) { int i; for (i = 0; i < nr_irqs; i++) { irq_set_handler_data(virq + i, NULL); irq_set_handler(virq + i, NULL); } irq_domain_free_irqs_common(domain, virq, nr_irqs); } 
EXPORT_SYMBOL_GPL(irq_domain_free_irqs_top); static void irq_domain_free_irqs_hierarchy(struct irq_domain *domain, unsigned int irq_base, unsigned int nr_irqs) { unsigned int i; if (!domain->ops->free) return; for (i = 0; i < nr_irqs; i++) { if (irq_domain_get_irq_data(domain, irq_base + i)) domain->ops->free(domain, irq_base + i, 1); } } static int irq_domain_alloc_irqs_hierarchy(struct irq_domain *domain, unsigned int irq_base, unsigned int nr_irqs, void *arg) { if (!domain->ops->alloc) { pr_debug("domain->ops->alloc() is NULL\n"); return -ENOSYS; } return domain->ops->alloc(domain, irq_base, nr_irqs, arg); } static int irq_domain_alloc_irqs_locked(struct irq_domain *domain, int irq_base, unsigned int nr_irqs, int node, void *arg, bool realloc, const struct irq_affinity_desc *affinity) { int i, ret, virq; if (realloc && irq_base >= 0) { virq = irq_base; } else { virq = irq_domain_alloc_descs(irq_base, nr_irqs, 0, node, affinity); if (virq < 0) { pr_debug("cannot allocate IRQ(base %d, count %d)\n", irq_base, nr_irqs); return virq; } } if (irq_domain_alloc_irq_data(domain, virq, nr_irqs)) { pr_debug("cannot allocate memory for IRQ%d\n", virq); ret = -ENOMEM; goto out_free_desc; } ret = irq_domain_alloc_irqs_hierarchy(domain, virq, nr_irqs, arg); if (ret < 0) goto out_free_irq_data; for (i = 0; i < nr_irqs; i++) { ret = irq_domain_trim_hierarchy(virq + i); if (ret) goto out_free_irq_data; } for (i = 0; i < nr_irqs; i++) irq_domain_insert_irq(virq + i); return virq; out_free_irq_data: irq_domain_free_irq_data(virq, nr_irqs); out_free_desc: irq_free_descs(virq, nr_irqs); return ret; } /** * __irq_domain_alloc_irqs - Allocate IRQs from domain * @domain: domain to allocate from * @irq_base: allocate specified IRQ number if irq_base >= 0 * @nr_irqs: number of IRQs to allocate * @node: NUMA node id for memory allocation * @arg: domain specific argument * @realloc: IRQ descriptors have already been allocated if true * @affinity: Optional irq affinity mask for multiqueue 
devices * * Allocate IRQ numbers and initialized all data structures to support * hierarchy IRQ domains. * Parameter @realloc is mainly to support legacy IRQs. * Returns error code or allocated IRQ number * * The whole process to setup an IRQ has been split into two steps. * The first step, __irq_domain_alloc_irqs(), is to allocate IRQ * descriptor and required hardware resources. The second step, * irq_domain_activate_irq(), is to program the hardware with preallocated * resources. In this way, it's easier to rollback when failing to * allocate resources. */ int __irq_domain_alloc_irqs(struct irq_domain *domain, int irq_base, unsigned int nr_irqs, int node, void *arg, bool realloc, const struct irq_affinity_desc *affinity) { int ret; if (domain == NULL) { domain = irq_default_domain; if (WARN(!domain, "domain is NULL; cannot allocate IRQ\n")) return -EINVAL; } mutex_lock(&domain->root->mutex); ret = irq_domain_alloc_irqs_locked(domain, irq_base, nr_irqs, node, arg, realloc, affinity); mutex_unlock(&domain->root->mutex); return ret; } EXPORT_SYMBOL_GPL(__irq_domain_alloc_irqs); /* The irq_data was moved, fix the revmap to refer to the new location */ static void irq_domain_fix_revmap(struct irq_data *d) { void __rcu **slot; lockdep_assert_held(&d->domain->root->mutex); if (irq_domain_is_nomap(d->domain)) return; /* Fix up the revmap. */ if (d->hwirq < d->domain->revmap_size) { /* Not using radix tree */ rcu_assign_pointer(d->domain->revmap[d->hwirq], d); } else { slot = radix_tree_lookup_slot(&d->domain->revmap_tree, d->hwirq); if (slot) radix_tree_replace_slot(&d->domain->revmap_tree, slot, d); } } /** * irq_domain_push_irq() - Push a domain in to the top of a hierarchy. * @domain: Domain to push. * @virq: Irq to push the domain in to. * @arg: Passed to the irq_domain_ops alloc() function. * * For an already existing irqdomain hierarchy, as might be obtained * via a call to pci_enable_msix(), add an additional domain to the * head of the processing chain. 
Must be called before request_irq() * has been called. */ int irq_domain_push_irq(struct irq_domain *domain, int virq, void *arg) { struct irq_data *irq_data = irq_get_irq_data(virq); struct irq_data *parent_irq_data; struct irq_desc *desc; int rv = 0; /* * Check that no action has been set, which indicates the virq * is in a state where this function doesn't have to deal with * races between interrupt handling and maintaining the * hierarchy. This will catch gross misuse. Attempting to * make the check race free would require holding locks across * calls to struct irq_domain_ops->alloc(), which could lead * to deadlock, so we just do a simple check before starting. */ desc = irq_to_desc(virq); if (!desc) return -EINVAL; if (WARN_ON(desc->action)) return -EBUSY; if (domain == NULL) return -EINVAL; if (WARN_ON(!irq_domain_is_hierarchy(domain))) return -EINVAL; if (!irq_data) return -EINVAL; if (domain->parent != irq_data->domain) return -EINVAL; parent_irq_data = kzalloc_node(sizeof(*parent_irq_data), GFP_KERNEL, irq_data_get_node(irq_data)); if (!parent_irq_data) return -ENOMEM; mutex_lock(&domain->root->mutex); /* Copy the original irq_data. */ *parent_irq_data = *irq_data; /* * Overwrite the irq_data, which is embedded in struct irq_desc, with * values for this domain. */ irq_data->parent_data = parent_irq_data; irq_data->domain = domain; irq_data->mask = 0; irq_data->hwirq = 0; irq_data->chip = NULL; irq_data->chip_data = NULL; /* May (probably does) set hwirq, chip, etc. */ rv = irq_domain_alloc_irqs_hierarchy(domain, virq, 1, arg); if (rv) { /* Restore the original irq_data. */ *irq_data = *parent_irq_data; kfree(parent_irq_data); goto error; } irq_domain_fix_revmap(parent_irq_data); irq_domain_set_mapping(domain, irq_data->hwirq, irq_data); error: mutex_unlock(&domain->root->mutex); return rv; } EXPORT_SYMBOL_GPL(irq_domain_push_irq); /** * irq_domain_pop_irq() - Remove a domain from the top of a hierarchy. * @domain: Domain to remove. 
* @virq: Irq to remove the domain from. * * Undo the effects of a call to irq_domain_push_irq(). Must be * called either before request_irq() or after free_irq(). */ int irq_domain_pop_irq(struct irq_domain *domain, int virq) { struct irq_data *irq_data = irq_get_irq_data(virq); struct irq_data *parent_irq_data; struct irq_data *tmp_irq_data; struct irq_desc *desc; /* * Check that no action is set, which indicates the virq is in * a state where this function doesn't have to deal with races * between interrupt handling and maintaining the hierarchy. * This will catch gross misuse. Attempting to make the check * race free would require holding locks across calls to * struct irq_domain_ops->free(), which could lead to * deadlock, so we just do a simple check before starting. */ desc = irq_to_desc(virq); if (!desc) return -EINVAL; if (WARN_ON(desc->action)) return -EBUSY; if (domain == NULL) return -EINVAL; if (!irq_data) return -EINVAL; tmp_irq_data = irq_domain_get_irq_data(domain, virq); /* We can only "pop" if this domain is at the top of the list */ if (WARN_ON(irq_data != tmp_irq_data)) return -EINVAL; if (WARN_ON(irq_data->domain != domain)) return -EINVAL; parent_irq_data = irq_data->parent_data; if (WARN_ON(!parent_irq_data)) return -EINVAL; mutex_lock(&domain->root->mutex); irq_data->parent_data = NULL; irq_domain_clear_mapping(domain, irq_data->hwirq); irq_domain_free_irqs_hierarchy(domain, virq, 1); /* Restore the original irq_data. 
*/ *irq_data = *parent_irq_data; irq_domain_fix_revmap(irq_data); mutex_unlock(&domain->root->mutex); kfree(parent_irq_data); return 0; } EXPORT_SYMBOL_GPL(irq_domain_pop_irq); /** * irq_domain_free_irqs - Free IRQ number and associated data structures * @virq: base IRQ number * @nr_irqs: number of IRQs to free */ void irq_domain_free_irqs(unsigned int virq, unsigned int nr_irqs) { struct irq_data *data = irq_get_irq_data(virq); struct irq_domain *domain; int i; if (WARN(!data || !data->domain || !data->domain->ops->free, "NULL pointer, cannot free irq\n")) return; domain = data->domain; mutex_lock(&domain->root->mutex); for (i = 0; i < nr_irqs; i++) irq_domain_remove_irq(virq + i); irq_domain_free_irqs_hierarchy(domain, virq, nr_irqs); mutex_unlock(&domain->root->mutex); irq_domain_free_irq_data(virq, nr_irqs); irq_free_descs(virq, nr_irqs); } EXPORT_SYMBOL_GPL(irq_domain_free_irqs); static void irq_domain_free_one_irq(struct irq_domain *domain, unsigned int virq) { if (irq_domain_is_msi_device(domain)) msi_device_domain_free_wired(domain, virq); else irq_domain_free_irqs(virq, 1); } /** * irq_domain_alloc_irqs_parent - Allocate interrupts from parent domain * @domain: Domain below which interrupts must be allocated * @irq_base: Base IRQ number * @nr_irqs: Number of IRQs to allocate * @arg: Allocation data (arch/domain specific) */ int irq_domain_alloc_irqs_parent(struct irq_domain *domain, unsigned int irq_base, unsigned int nr_irqs, void *arg) { if (!domain->parent) return -ENOSYS; return irq_domain_alloc_irqs_hierarchy(domain->parent, irq_base, nr_irqs, arg); } EXPORT_SYMBOL_GPL(irq_domain_alloc_irqs_parent); /** * irq_domain_free_irqs_parent - Free interrupts from parent domain * @domain: Domain below which interrupts must be freed * @irq_base: Base IRQ number * @nr_irqs: Number of IRQs to free */ void irq_domain_free_irqs_parent(struct irq_domain *domain, unsigned int irq_base, unsigned int nr_irqs) { if (!domain->parent) return; 
irq_domain_free_irqs_hierarchy(domain->parent, irq_base, nr_irqs); } EXPORT_SYMBOL_GPL(irq_domain_free_irqs_parent); static void __irq_domain_deactivate_irq(struct irq_data *irq_data) { if (irq_data && irq_data->domain) { struct irq_domain *domain = irq_data->domain; if (domain->ops->deactivate) domain->ops->deactivate(domain, irq_data); if (irq_data->parent_data) __irq_domain_deactivate_irq(irq_data->parent_data); } } static int __irq_domain_activate_irq(struct irq_data *irqd, bool reserve) { int ret = 0; if (irqd && irqd->domain) { struct irq_domain *domain = irqd->domain; if (irqd->parent_data) ret = __irq_domain_activate_irq(irqd->parent_data, reserve); if (!ret && domain->ops->activate) { ret = domain->ops->activate(domain, irqd, reserve); /* Rollback in case of error */ if (ret && irqd->parent_data) __irq_domain_deactivate_irq(irqd->parent_data); } } return ret; } /** * irq_domain_activate_irq - Call domain_ops->activate recursively to activate * interrupt * @irq_data: Outermost irq_data associated with interrupt * @reserve: If set only reserve an interrupt vector instead of assigning one * * This is the second step to call domain_ops->activate to program interrupt * controllers, so the interrupt could actually get delivered. */ int irq_domain_activate_irq(struct irq_data *irq_data, bool reserve) { int ret = 0; if (!irqd_is_activated(irq_data)) ret = __irq_domain_activate_irq(irq_data, reserve); if (!ret) irqd_set_activated(irq_data); return ret; } /** * irq_domain_deactivate_irq - Call domain_ops->deactivate recursively to * deactivate interrupt * @irq_data: outermost irq_data associated with interrupt * * It calls domain_ops->deactivate to program interrupt controllers to disable * interrupt delivery. 
*/ void irq_domain_deactivate_irq(struct irq_data *irq_data) { if (irqd_is_activated(irq_data)) { __irq_domain_deactivate_irq(irq_data); irqd_clr_activated(irq_data); } } static void irq_domain_check_hierarchy(struct irq_domain *domain) { /* Hierarchy irq_domains must implement callback alloc() */ if (domain->ops->alloc) domain->flags |= IRQ_DOMAIN_FLAG_HIERARCHY; } #else /* CONFIG_IRQ_DOMAIN_HIERARCHY */ /* * irq_domain_get_irq_data - Get irq_data associated with @virq and @domain * @domain: domain to match * @virq: IRQ number to get irq_data */ struct irq_data *irq_domain_get_irq_data(struct irq_domain *domain, unsigned int virq) { struct irq_data *irq_data = irq_get_irq_data(virq); return (irq_data && irq_data->domain == domain) ? irq_data : NULL; } EXPORT_SYMBOL_GPL(irq_domain_get_irq_data); /* * irq_domain_set_info - Set the complete data for a @virq in @domain * @domain: Interrupt domain to match * @virq: IRQ number * @hwirq: The hardware interrupt number * @chip: The associated interrupt chip * @chip_data: The associated interrupt chip data * @handler: The interrupt flow handler * @handler_data: The interrupt flow handler data * @handler_name: The interrupt handler name */ void irq_domain_set_info(struct irq_domain *domain, unsigned int virq, irq_hw_number_t hwirq, const struct irq_chip *chip, void *chip_data, irq_flow_handler_t handler, void *handler_data, const char *handler_name) { irq_set_chip_and_handler_name(virq, chip, handler, handler_name); irq_set_chip_data(virq, chip_data); irq_set_handler_data(virq, handler_data); } static int irq_domain_alloc_irqs_locked(struct irq_domain *domain, int irq_base, unsigned int nr_irqs, int node, void *arg, bool realloc, const struct irq_affinity_desc *affinity) { return -EINVAL; } static void irq_domain_check_hierarchy(struct irq_domain *domain) { } static void irq_domain_free_one_irq(struct irq_domain *domain, unsigned int virq) { } #endif /* CONFIG_IRQ_DOMAIN_HIERARCHY */ #ifdef CONFIG_GENERIC_IRQ_DEBUGFS 
#include "internals.h"

/* The "domains" debugfs directory; NULL until irq_domain_debugfs_init() ran */
static struct dentry *domain_dir;

/* Flag bits decoded by irq_debug_show_bits() in the per-domain output */
static const struct irq_bit_descr irqdomain_flags[] = {
	BIT_MASK_DESCR(IRQ_DOMAIN_FLAG_HIERARCHY),
	BIT_MASK_DESCR(IRQ_DOMAIN_NAME_ALLOCATED),
	BIT_MASK_DESCR(IRQ_DOMAIN_FLAG_IPI_PER_CPU),
	BIT_MASK_DESCR(IRQ_DOMAIN_FLAG_IPI_SINGLE),
	BIT_MASK_DESCR(IRQ_DOMAIN_FLAG_MSI),
	BIT_MASK_DESCR(IRQ_DOMAIN_FLAG_ISOLATED_MSI),
	BIT_MASK_DESCR(IRQ_DOMAIN_FLAG_NO_MAP),
	BIT_MASK_DESCR(IRQ_DOMAIN_FLAG_MSI_PARENT),
	BIT_MASK_DESCR(IRQ_DOMAIN_FLAG_MSI_DEVICE),
	BIT_MASK_DESCR(IRQ_DOMAIN_FLAG_NONCORE),
};

/*
 * Print name, size, mapping count and flags of @d at indentation @ind,
 * followed by the domain's own ->debug_show() output if it has one.
 * With hierarchy support the parent chain is printed recursively, each
 * level indented further.
 */
static void irq_domain_debug_show_one(struct seq_file *m, struct irq_domain *d, int ind)
{
	seq_printf(m, "%*sname: %s\n", ind, "", d->name);
	seq_printf(m, "%*ssize: %u\n", ind + 1, "", d->revmap_size);
	seq_printf(m, "%*smapped: %u\n", ind + 1, "", d->mapcount);
	seq_printf(m, "%*sflags: 0x%08x\n", ind + 1, "", d->flags);
	irq_debug_show_bits(m, ind, d->flags, irqdomain_flags, ARRAY_SIZE(irqdomain_flags));
	if (d->ops && d->ops->debug_show)
		d->ops->debug_show(m, d, NULL, ind + 1);
#ifdef	CONFIG_IRQ_DOMAIN_HIERARCHY
	if (!d->parent)
		return;
	seq_printf(m, "%*sparent: %s\n", ind + 1, "", d->parent->name);
	irq_domain_debug_show_one(m, d->parent, ind + 4);
#endif
}

/*
 * seq_file show callback. A NULL private pointer selects the default
 * domain; if there is no default domain either, nothing is printed.
 */
static int irq_domain_debug_show(struct seq_file *m, void *p)
{
	struct irq_domain *d = m->private;

	/* Default domain? Might be NULL */
	if (!d) {
		if (!irq_default_domain)
			return 0;
		d = irq_default_domain;
	}
	irq_domain_debug_show_one(m, d, 0);
	return 0;
}
DEFINE_SHOW_ATTRIBUTE(irq_domain_debug);

/* Create the debugfs file for @d; skipped for unnamed domains or before init */
static void debugfs_add_domain_dir(struct irq_domain *d)
{
	if (!d->name || !domain_dir)
		return;

	debugfs_create_file(d->name, 0444, domain_dir, d,
			    &irq_domain_debug_fops);
}

/*
 * Remove the debugfs file of @d.
 *
 * Mirrors the guard in debugfs_add_domain_dir(): a domain without a name,
 * or one torn down before the "domains" directory exists, never had a
 * file created, so there is nothing to look up. Skipping the lookup also
 * avoids handing a NULL name or a NULL parent dentry to debugfs.
 */
static void debugfs_remove_domain_dir(struct irq_domain *d)
{
	if (!d->name || !domain_dir)
		return;

	debugfs_lookup_and_remove(d->name, domain_dir);
}

/*
 * Create the "domains" directory below @root, the "default" entry and one
 * file for every domain that is already on irq_domain_list.
 */
void __init irq_domain_debugfs_init(struct dentry *root)
{
	struct irq_domain *d;

	domain_dir = debugfs_create_dir("domains", root);

	/* NULL private data makes the show callback use the default domain */
	debugfs_create_file("default", 0444, domain_dir, NULL,
			    &irq_domain_debug_fops);
	mutex_lock(&irq_domain_mutex);
	list_for_each_entry(d, &irq_domain_list, link)
		debugfs_add_domain_dir(d);
	mutex_unlock(&irq_domain_mutex);
}
#endif
15 5 5 5 5 5 12 11 11 25 25 25 14 12 12 12 12 25 1 1 1 9 9 9 9 9 62 50 43 43 43 43 50 52 62 61 62 62 1 50 49 49 4 49 49 36 35 35 5 18 18 10 3 10 15 61 1 2 62 62 62 5 5 5 5 5 62 9 9 9 9 9 9 9 9 9 9 9 9 9 19 6 6 6 20 14 20 15 14 20 20 20 20 28 28 28 3 27 3 28 3 27 27 26 3 3 3 3 28 28 14 13 1 14 14 8 7 13 13 14 3 2 2 2 2 77 3 1 77 1 1 77 77 77 77 62 5 62 62 62 61 62 61 62 56 57 16 9 9 57 5 5 57 62 58 59 58 58 1 58 58 59 58 43 3 7 7 6 5 42 43 43 43 13 61 61 61 1 60 4 4 54 37 60 33 27 48 45 45 60 3 60 60 15 50 1 1 1 1 60 48 1 48 48 16 12 48 60 62 62 62 62 61 62 1 62 62 5 62 3 60 11 11 11 6 6 6 3 3 3 11 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 
376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 
876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 
1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2011 Novell Inc. * Copyright (C) 2016 Red Hat, Inc. 
*/ #include <linux/fs.h> #include <linux/cred.h> #include <linux/ctype.h> #include <linux/hex.h> #include <linux/namei.h> #include <linux/xattr.h> #include <linux/ratelimit.h> #include <linux/mount.h> #include <linux/exportfs.h> #include "overlayfs.h" struct ovl_lookup_data { struct super_block *sb; struct dentry *dentry; const struct ovl_layer *layer; struct qstr name; bool is_dir; bool opaque; bool xwhiteouts; bool stop; bool last; char *redirect; char *upperredirect; int metacopy; /* Referring to last redirect xattr */ bool absolute_redirect; }; static int ovl_check_redirect(const struct path *path, struct ovl_lookup_data *d, size_t prelen, const char *post) { int res; char *buf; struct ovl_fs *ofs = OVL_FS(d->sb); d->absolute_redirect = false; buf = ovl_get_redirect_xattr(ofs, path, prelen + strlen(post)); if (IS_ERR_OR_NULL(buf)) return PTR_ERR(buf); if (buf[0] == '/') { d->absolute_redirect = true; /* * One of the ancestor path elements in an absolute path * lookup in ovl_lookup_layer() could have been opaque and * that will stop further lookup in lower layers (d->stop=true) * But we have found an absolute redirect in descendant path * element and that should force continue lookup in lower * layers (reset d->stop). */ d->stop = false; } else { res = strlen(buf) + 1; memmove(buf + prelen, buf, res); memcpy(buf, d->name.name, prelen); } strcat(buf, post); kfree(d->redirect); d->redirect = buf; d->name.name = d->redirect; d->name.len = strlen(d->redirect); return 0; } static int ovl_acceptable(void *ctx, struct dentry *dentry) { /* * A non-dir origin may be disconnected, which is fine, because * we only need it for its unique inode number. */ if (!d_is_dir(dentry)) return 1; /* Don't decode a deleted empty directory */ if (d_unhashed(dentry)) return 0; /* Check if directory belongs to the layer we are decoding from */ return is_subdir(dentry, ((struct vfsmount *)ctx)->mnt_root); } /* * Check validity of an overlay file handle buffer. 
* * Return 0 for a valid file handle. * Return -ENODATA for "origin unknown". * Return <0 for an invalid file handle. */ int ovl_check_fb_len(struct ovl_fb *fb, int fb_len) { if (fb_len < sizeof(struct ovl_fb) || fb_len < fb->len) return -EINVAL; if (fb->magic != OVL_FH_MAGIC) return -EINVAL; /* Treat larger version and unknown flags as "origin unknown" */ if (fb->version > OVL_FH_VERSION || fb->flags & ~OVL_FH_FLAG_ALL) return -ENODATA; /* Treat endianness mismatch as "origin unknown" */ if (!(fb->flags & OVL_FH_FLAG_ANY_ENDIAN) && (fb->flags & OVL_FH_FLAG_BIG_ENDIAN) != OVL_FH_FLAG_CPU_ENDIAN) return -ENODATA; return 0; } static struct ovl_fh *ovl_get_fh(struct ovl_fs *ofs, struct dentry *upperdentry, enum ovl_xattr ox) { int res, err; struct ovl_fh *fh = NULL; res = ovl_getxattr_upper(ofs, upperdentry, ox, NULL, 0); if (res < 0) { if (res == -ENODATA || res == -EOPNOTSUPP) return NULL; goto fail; } /* Zero size value means "copied up but origin unknown" */ if (res == 0) return NULL; fh = kzalloc(res + OVL_FH_WIRE_OFFSET, GFP_KERNEL); if (!fh) return ERR_PTR(-ENOMEM); res = ovl_getxattr_upper(ofs, upperdentry, ox, fh->buf, res); if (res < 0) goto fail; err = ovl_check_fb_len(&fh->fb, res); if (err < 0) { if (err == -ENODATA) goto out; goto invalid; } return fh; out: kfree(fh); return NULL; fail: pr_warn_ratelimited("failed to get origin (%i)\n", res); goto out; invalid: pr_warn_ratelimited("invalid origin (%*phN)\n", res, fh); goto out; } bool ovl_uuid_match(struct ovl_fs *ofs, const struct super_block *sb, const uuid_t *uuid) { /* * Make sure that the stored uuid matches the uuid of the lower * layer where file handle will be decoded. * In case of uuid=off option just make sure that stored uuid is null. */ return ovl_origin_uuid(ofs) ? 
uuid_equal(uuid, &sb->s_uuid) : uuid_is_null(uuid); } struct dentry *ovl_decode_real_fh(struct ovl_fs *ofs, struct ovl_fh *fh, struct vfsmount *mnt, bool connected) { struct dentry *real; int bytes; if (!capable(CAP_DAC_READ_SEARCH)) return NULL; if (!ovl_uuid_match(ofs, mnt->mnt_sb, &fh->fb.uuid)) return NULL; bytes = (fh->fb.len - offsetof(struct ovl_fb, fid)); real = exportfs_decode_fh(mnt, (struct fid *)fh->fb.fid, bytes >> 2, (int)fh->fb.type, connected ? ovl_acceptable : NULL, mnt); if (IS_ERR(real)) { /* * Treat stale file handle to lower file as "origin unknown". * upper file handle could become stale when upper file is * unlinked and this information is needed to handle stale * index entries correctly. */ if (real == ERR_PTR(-ESTALE) && !(fh->fb.flags & OVL_FH_FLAG_PATH_UPPER)) real = NULL; return real; } if (ovl_dentry_weird(real)) { dput(real); return NULL; } return real; } static struct dentry *ovl_lookup_positive_unlocked(struct ovl_lookup_data *d, const char *name, struct dentry *base, int len, bool drop_negative) { struct dentry *ret = lookup_one_unlocked(mnt_idmap(d->layer->mnt), &QSTR_LEN(name, len), base); if (!IS_ERR(ret) && d_flags_negative(smp_load_acquire(&ret->d_flags))) { if (drop_negative && ret->d_lockref.count == 1) { spin_lock(&ret->d_lock); /* Recheck condition under lock */ if (d_is_negative(ret) && ret->d_lockref.count == 1) __d_drop(ret); spin_unlock(&ret->d_lock); } dput(ret); ret = ERR_PTR(-ENOENT); } return ret; } static int ovl_lookup_single(struct dentry *base, struct ovl_lookup_data *d, const char *name, unsigned int namelen, size_t prelen, const char *post, struct dentry **ret, bool drop_negative) { struct ovl_fs *ofs = OVL_FS(d->sb); struct dentry *this = NULL; const char *warn; struct path path; int err; bool last_element = !post[0]; bool is_upper = d->layer->idx == 0; char val; /* * We allow filesystems that are case-folding capable as long as the * layers are consistently enabled in the stack, enabled for every dir * or 
disabled in all dirs. If someone has modified case folding on a * directory on underlying layer, the warranty of the ovl stack is * voided. */ if (ofs->casefold != ovl_dentry_casefolded(base)) { warn = "parent wrong casefold"; err = -ESTALE; goto out_warn; } this = ovl_lookup_positive_unlocked(d, name, base, namelen, drop_negative); if (IS_ERR(this)) { err = PTR_ERR(this); this = NULL; if (err == -ENOENT || err == -ENAMETOOLONG) goto out; goto out_err; } if (ofs->casefold != ovl_dentry_casefolded(this)) { warn = "child wrong casefold"; err = -EREMOTE; goto out_warn; } if (ovl_dentry_weird(this)) { /* Don't support traversing automounts and other weirdness */ warn = "unsupported object type"; err = -EREMOTE; goto out_warn; } path.dentry = this; path.mnt = d->layer->mnt; if (ovl_path_is_whiteout(ofs, &path)) { d->stop = d->opaque = true; goto put_and_out; } /* * This dentry should be a regular file if previous layer lookup * found a metacopy dentry. */ if (last_element && d->metacopy && !d_is_reg(this)) { d->stop = true; goto put_and_out; } if (!d_can_lookup(this)) { if (d->is_dir || !last_element) { d->stop = true; goto put_and_out; } err = ovl_check_metacopy_xattr(ofs, &path, NULL); if (err < 0) goto out_err; d->metacopy = err; d->stop = !d->metacopy; if (!d->metacopy || d->last) goto out; } else { if (ovl_lookup_trap_inode(d->sb, this)) { /* Caught in a trap of overlapping layers */ warn = "overlapping layers"; err = -ELOOP; goto out_warn; } if (last_element) d->is_dir = true; if (d->last) goto out; /* overlay.opaque=x means xwhiteouts directory */ val = ovl_get_opaquedir_val(ofs, &path); if (last_element && !is_upper && val == 'x') { d->xwhiteouts = true; ovl_layer_set_xwhiteouts(ofs, d->layer); } else if (val == 'y') { d->stop = true; if (last_element) d->opaque = true; goto out; } } err = ovl_check_redirect(&path, d, prelen, post); if (err) goto out_err; out: *ret = this; return 0; put_and_out: dput(this); this = NULL; goto out; out_warn: 
pr_warn_ratelimited("failed lookup in %s (%pd2, name='%.*s', err=%i): %s\n", is_upper ? "upper" : "lower", base, namelen, name, err, warn); out_err: dput(this); return err; } static int ovl_lookup_layer(struct dentry *base, struct ovl_lookup_data *d, struct dentry **ret, bool drop_negative) { /* Counting down from the end, since the prefix can change */ size_t rem = d->name.len - 1; struct dentry *dentry = NULL; int err; if (d->name.name[0] != '/') return ovl_lookup_single(base, d, d->name.name, d->name.len, 0, "", ret, drop_negative); while (!IS_ERR_OR_NULL(base) && d_can_lookup(base)) { const char *s = d->name.name + d->name.len - rem; const char *next = strchrnul(s, '/'); size_t thislen = next - s; bool end = !next[0]; /* Verify we did not go off the rails */ if (WARN_ON(s[-1] != '/')) return -EIO; err = ovl_lookup_single(base, d, s, thislen, d->name.len - rem, next, &base, drop_negative); dput(dentry); if (err) return err; dentry = base; if (end) break; rem -= thislen + 1; if (WARN_ON(rem >= d->name.len)) return -EIO; } *ret = dentry; return 0; } static int ovl_lookup_data_layer(struct dentry *dentry, const char *redirect, const struct ovl_layer *layer, struct path *datapath) { int err; err = vfs_path_lookup(layer->mnt->mnt_root, layer->mnt, redirect, LOOKUP_BENEATH | LOOKUP_NO_SYMLINKS | LOOKUP_NO_XDEV, datapath); pr_debug("lookup lowerdata (%pd2, redirect=\"%s\", layer=%d, err=%i)\n", dentry, redirect, layer->idx, err); if (err) return err; err = -EREMOTE; if (ovl_dentry_weird(datapath->dentry)) goto out_path_put; err = -ENOENT; /* Only regular file is acceptable as lower data */ if (!d_is_reg(datapath->dentry)) goto out_path_put; return 0; out_path_put: path_put(datapath); return err; } /* Lookup in data-only layers by absolute redirect to layer root */ static int ovl_lookup_data_layers(struct dentry *dentry, const char *redirect, struct ovl_path *lowerdata) { struct ovl_fs *ofs = OVL_FS(dentry->d_sb); const struct ovl_layer *layer; struct path datapath; int 
err = -ENOENT; int i; layer = &ofs->layers[ofs->numlayer - ofs->numdatalayer]; for (i = 0; i < ofs->numdatalayer; i++, layer++) { err = ovl_lookup_data_layer(dentry, redirect, layer, &datapath); if (!err) { mntput(datapath.mnt); lowerdata->dentry = datapath.dentry; lowerdata->layer = layer; return 0; } } return err; } int ovl_check_origin_fh(struct ovl_fs *ofs, struct ovl_fh *fh, bool connected, struct dentry *upperdentry, struct ovl_path **stackp) { struct dentry *origin = NULL; int i; for (i = 1; i <= ovl_numlowerlayer(ofs); i++) { /* * If lower fs uuid is not unique among lower fs we cannot match * fh->uuid to layer. */ if (ofs->layers[i].fsid && ofs->layers[i].fs->bad_uuid) continue; origin = ovl_decode_real_fh(ofs, fh, ofs->layers[i].mnt, connected); if (origin) break; } if (!origin) return -ESTALE; else if (IS_ERR(origin)) return PTR_ERR(origin); if (upperdentry && !ovl_upper_is_whiteout(ofs, upperdentry) && inode_wrong_type(d_inode(upperdentry), d_inode(origin)->i_mode)) goto invalid; if (!*stackp) *stackp = kmalloc_obj(struct ovl_path); if (!*stackp) { dput(origin); return -ENOMEM; } **stackp = (struct ovl_path){ .dentry = origin, .layer = &ofs->layers[i] }; return 0; invalid: pr_warn_ratelimited("invalid origin (%pd2, ftype=%x, origin ftype=%x).\n", upperdentry, d_inode(upperdentry)->i_mode & S_IFMT, d_inode(origin)->i_mode & S_IFMT); dput(origin); return -ESTALE; } static int ovl_check_origin(struct ovl_fs *ofs, struct dentry *upperdentry, struct ovl_path **stackp) { struct ovl_fh *fh = ovl_get_fh(ofs, upperdentry, OVL_XATTR_ORIGIN); int err; if (IS_ERR_OR_NULL(fh)) return PTR_ERR(fh); err = ovl_check_origin_fh(ofs, fh, false, upperdentry, stackp); kfree(fh); if (err) { if (err == -ESTALE) return 0; return err; } return 0; } /* * Verify that @fh matches the file handle stored in xattr @name. * Return 0 on match, -ESTALE on mismatch, < 0 on error. 
*/ static int ovl_verify_fh(struct ovl_fs *ofs, struct dentry *dentry, enum ovl_xattr ox, const struct ovl_fh *fh) { struct ovl_fh *ofh = ovl_get_fh(ofs, dentry, ox); int err = 0; if (!ofh) return -ENODATA; if (IS_ERR(ofh)) return PTR_ERR(ofh); if (fh->fb.len != ofh->fb.len || memcmp(&fh->fb, &ofh->fb, fh->fb.len)) err = -ESTALE; kfree(ofh); return err; } int ovl_verify_set_fh(struct ovl_fs *ofs, struct dentry *dentry, enum ovl_xattr ox, const struct ovl_fh *fh, bool is_upper, bool set) { int err; err = ovl_verify_fh(ofs, dentry, ox, fh); if (set && err == -ENODATA) err = ovl_setxattr(ofs, dentry, ox, fh->buf, fh->fb.len); return err; } /* * Verify that @real dentry matches the file handle stored in xattr @name. * * If @set is true and there is no stored file handle, encode @real and store * file handle in xattr @name. * * Return 0 on match, -ESTALE on mismatch, -ENODATA on no xattr, < 0 on error. */ int ovl_verify_origin_xattr(struct ovl_fs *ofs, struct dentry *dentry, enum ovl_xattr ox, struct dentry *real, bool is_upper, bool set) { struct inode *inode; struct ovl_fh *fh; int err; fh = ovl_encode_real_fh(ofs, d_inode(real), is_upper); err = PTR_ERR(fh); if (IS_ERR(fh)) { fh = NULL; goto fail; } err = ovl_verify_set_fh(ofs, dentry, ox, fh, is_upper, set); if (err) goto fail; out: kfree(fh); return err; fail: inode = d_inode(real); pr_warn_ratelimited("failed to verify %s (%pd2, ino=%llu, err=%i)\n", is_upper ? "upper" : "origin", real, inode ? 
inode->i_ino : 0, err); goto out; } /* Get upper dentry from index */ struct dentry *ovl_index_upper(struct ovl_fs *ofs, struct dentry *index, bool connected) { struct ovl_fh *fh; struct dentry *upper; if (!d_is_dir(index)) return dget(index); fh = ovl_get_fh(ofs, index, OVL_XATTR_UPPER); if (IS_ERR_OR_NULL(fh)) return ERR_CAST(fh); upper = ovl_decode_real_fh(ofs, fh, ovl_upper_mnt(ofs), connected); kfree(fh); if (IS_ERR_OR_NULL(upper)) return upper ?: ERR_PTR(-ESTALE); if (!d_is_dir(upper)) { pr_warn_ratelimited("invalid index upper (%pd2, upper=%pd2).\n", index, upper); dput(upper); return ERR_PTR(-EIO); } return upper; } /* * Verify that an index entry name matches the origin file handle stored in * OVL_XATTR_ORIGIN and that origin file handle can be decoded to lower path. * Return 0 on match, -ESTALE on mismatch or stale origin, < 0 on error. */ int ovl_verify_index(struct ovl_fs *ofs, struct dentry *index) { struct ovl_fh *fh = NULL; size_t len; struct ovl_path origin = { }; struct ovl_path *stack = &origin; struct dentry *upper = NULL; int err; if (!d_inode(index)) return 0; err = -EINVAL; if (index->d_name.len < sizeof(struct ovl_fb)*2) goto fail; err = -ENOMEM; len = index->d_name.len / 2; fh = kzalloc(len + OVL_FH_WIRE_OFFSET, GFP_KERNEL); if (!fh) goto fail; err = -EINVAL; if (hex2bin(fh->buf, index->d_name.name, len)) goto fail; err = ovl_check_fb_len(&fh->fb, len); if (err) goto fail; /* * Whiteout index entries are used as an indication that an exported * overlay file handle should be treated as stale (i.e. after unlink * of the overlay inode). These entries contain no origin xattr. */ if (ovl_is_whiteout(index)) goto out; /* * Verifying directory index entries are not stale is expensive, so * only verify stale dir index if NFS export is enabled. */ if (d_is_dir(index) && !ofs->config.nfs_export) goto out; /* * Directory index entries should have 'upper' xattr pointing to the * real upper dir. 
Non-dir index entries are hardlinks to the upper * real inode. For non-dir index, we can read the copy up origin xattr * directly from the index dentry, but for dir index we first need to * decode the upper directory. */ upper = ovl_index_upper(ofs, index, false); if (IS_ERR_OR_NULL(upper)) { err = PTR_ERR(upper); /* * Directory index entries with no 'upper' xattr need to be * removed. When dir index entry has a stale 'upper' xattr, * we assume that upper dir was removed and we treat the dir * index as orphan entry that needs to be whited out. */ if (err == -ESTALE) goto orphan; else if (!err) err = -ESTALE; goto fail; } err = ovl_verify_fh(ofs, upper, OVL_XATTR_ORIGIN, fh); dput(upper); if (err) goto fail; /* Check if non-dir index is orphan and don't warn before cleaning it */ if (!d_is_dir(index) && d_inode(index)->i_nlink == 1) { err = ovl_check_origin_fh(ofs, fh, false, index, &stack); if (err) goto fail; if (ovl_get_nlink(ofs, origin.dentry, index, 0) == 0) goto orphan; } out: dput(origin.dentry); kfree(fh); return err; fail: pr_warn_ratelimited("failed to verify index (%pd2, ftype=%x, err=%i)\n", index, d_inode(index)->i_mode & S_IFMT, err); goto out; orphan: pr_warn_ratelimited("orphan index entry (%pd2, ftype=%x, nlink=%u)\n", index, d_inode(index)->i_mode & S_IFMT, d_inode(index)->i_nlink); err = -ENOENT; goto out; } int ovl_get_index_name_fh(const struct ovl_fh *fh, struct qstr *name) { char *n, *s; n = kcalloc(fh->fb.len, 2, GFP_KERNEL); if (!n) return -ENOMEM; s = bin2hex(n, fh->buf, fh->fb.len); *name = (struct qstr) QSTR_INIT(n, s - n); return 0; } /* * Lookup in indexdir for the index entry of a lower real inode or a copy up * origin inode. The index entry name is the hex representation of the lower * inode file handle. * * If the index dentry in negative, then either no lower aliases have been * copied up yet, or aliases have been copied up in older kernels and are * not indexed. 
* * If the index dentry for a copy up origin inode is positive, but points * to an inode different than the upper inode, then either the upper inode * has been copied up and not indexed or it was indexed, but since then * index dir was cleared. Either way, that index cannot be used to identify * the overlay inode. */ int ovl_get_index_name(struct ovl_fs *ofs, struct dentry *origin, struct qstr *name) { struct ovl_fh *fh; int err; fh = ovl_encode_real_fh(ofs, d_inode(origin), false); if (IS_ERR(fh)) return PTR_ERR(fh); err = ovl_get_index_name_fh(fh, name); kfree(fh); return err; } /* Lookup index by file handle for NFS export */ struct dentry *ovl_get_index_fh(struct ovl_fs *ofs, struct ovl_fh *fh) { struct dentry *index; struct qstr name; int err; err = ovl_get_index_name_fh(fh, &name); if (err) return ERR_PTR(err); index = lookup_noperm_positive_unlocked(&name, ofs->workdir); kfree(name.name); if (IS_ERR(index)) { if (PTR_ERR(index) == -ENOENT) index = NULL; return index; } if (ovl_is_whiteout(index)) err = -ESTALE; else if (ovl_dentry_weird(index)) err = -EIO; else return index; dput(index); return ERR_PTR(err); } struct dentry *ovl_lookup_index(struct ovl_fs *ofs, struct dentry *upper, struct dentry *origin, bool verify) { struct dentry *index; struct inode *inode; struct qstr name; bool is_dir = d_is_dir(origin); int err; err = ovl_get_index_name(ofs, origin, &name); if (err) return ERR_PTR(err); index = lookup_one_positive_unlocked(ovl_upper_mnt_idmap(ofs), &name, ofs->workdir); if (IS_ERR(index)) { err = PTR_ERR(index); if (err == -ENOENT) { index = NULL; goto out; } pr_warn_ratelimited("failed inode index lookup (ino=%llu, key=%.*s, err=%i);\n" "overlayfs: mount with '-o index=off' to disable inodes index.\n", d_inode(origin)->i_ino, name.len, name.name, err); goto out; } inode = d_inode(index); if (ovl_is_whiteout(index) && !verify) { /* * When index lookup is called with !verify for decoding an * overlay file handle, a whiteout index implies that decode * 
should treat file handle as stale and no need to print a * warning about it. */ dput(index); index = ERR_PTR(-ESTALE); goto out; } else if (ovl_dentry_weird(index) || ovl_is_whiteout(index) || inode_wrong_type(inode, d_inode(origin)->i_mode)) { /* * Index should always be of the same file type as origin * except for the case of a whiteout index. A whiteout * index should only exist if all lower aliases have been * unlinked, which means that finding a lower origin on lookup * whose index is a whiteout should be treated as an error. */ pr_warn_ratelimited("bad index found (index=%pd2, ftype=%x, origin ftype=%x).\n", index, d_inode(index)->i_mode & S_IFMT, d_inode(origin)->i_mode & S_IFMT); goto fail; } else if (is_dir && verify) { if (!upper) { pr_warn_ratelimited("suspected uncovered redirected dir found (origin=%pd2, index=%pd2).\n", origin, index); goto fail; } /* Verify that dir index 'upper' xattr points to upper dir */ err = ovl_verify_upper(ofs, index, upper, false); if (err) { if (err == -ESTALE) { pr_warn_ratelimited("suspected multiply redirected dir found (upper=%pd2, origin=%pd2, index=%pd2).\n", upper, origin, index); } goto fail; } } else if (upper && d_inode(upper) != inode) { goto out_dput; } out: kfree(name.name); return index; out_dput: dput(index); index = NULL; goto out; fail: dput(index); index = ERR_PTR(-EIO); goto out; } /* * Returns next layer in stack starting from top. * Returns -1 if this is the last layer. */ int ovl_path_next(int idx, struct dentry *dentry, struct path *path, const struct ovl_layer **layer) { struct ovl_entry *oe = OVL_E(dentry); struct ovl_path *lowerstack = ovl_lowerstack(oe); BUG_ON(idx < 0); if (idx == 0) { ovl_path_upper(dentry, path); if (path->dentry) { *layer = &OVL_FS(dentry->d_sb)->layers[0]; return ovl_numlower(oe) ? 1 : -1; } idx++; } BUG_ON(idx > ovl_numlower(oe)); path->dentry = lowerstack[idx - 1].dentry; *layer = lowerstack[idx - 1].layer; path->mnt = (*layer)->mnt; return (idx < ovl_numlower(oe)) ? 
idx + 1 : -1; } /* Fix missing 'origin' xattr */ static int ovl_fix_origin(struct ovl_fs *ofs, struct dentry *dentry, struct dentry *lower, struct dentry *upper) { const struct ovl_fh *fh; int err; if (ovl_check_origin_xattr(ofs, upper)) return 0; fh = ovl_get_origin_fh(ofs, lower); if (IS_ERR(fh)) return PTR_ERR(fh); err = ovl_want_write(dentry); if (err) goto out; err = ovl_set_origin_fh(ofs, fh, upper); if (!err) err = ovl_set_impure(dentry->d_parent, upper->d_parent); ovl_drop_write(dentry); out: kfree(fh); return err; } static int ovl_maybe_validate_verity(struct dentry *dentry) { struct ovl_fs *ofs = OVL_FS(dentry->d_sb); struct inode *inode = d_inode(dentry); struct path datapath, metapath; int err; if (!ofs->config.verity_mode || !ovl_is_metacopy_dentry(dentry) || ovl_test_flag(OVL_VERIFIED_DIGEST, inode)) return 0; if (!ovl_test_flag(OVL_HAS_DIGEST, inode)) { if (ofs->config.verity_mode == OVL_VERITY_REQUIRE) { pr_warn_ratelimited("metacopy file '%pd' has no digest specified\n", dentry); return -EIO; } return 0; } ovl_path_lowerdata(dentry, &datapath); if (!datapath.dentry) return -EIO; ovl_path_real(dentry, &metapath); if (!metapath.dentry) return -EIO; err = ovl_inode_lock_interruptible(inode); if (err) return err; if (!ovl_test_flag(OVL_VERIFIED_DIGEST, inode)) { with_ovl_creds(dentry->d_sb) err = ovl_validate_verity(ofs, &metapath, &datapath); if (err == 0) ovl_set_flag(OVL_VERIFIED_DIGEST, inode); } ovl_inode_unlock(inode); return err; } /* Lazy lookup of lowerdata */ static int ovl_maybe_lookup_lowerdata(struct dentry *dentry) { struct inode *inode = d_inode(dentry); const char *redirect = ovl_lowerdata_redirect(inode); struct ovl_path datapath = {}; int err; if (!redirect || ovl_dentry_lowerdata(dentry)) return 0; if (redirect[0] != '/') return -EIO; err = ovl_inode_lock_interruptible(inode); if (err) return err; err = 0; /* Someone got here before us? 
*/ if (ovl_dentry_lowerdata(dentry)) goto out; with_ovl_creds(dentry->d_sb) err = ovl_lookup_data_layers(dentry, redirect, &datapath); if (err) goto out_err; err = ovl_dentry_set_lowerdata(dentry, &datapath); if (err) goto out_err; out: ovl_inode_unlock(inode); dput(datapath.dentry); return err; out_err: pr_warn_ratelimited("lazy lowerdata lookup failed (%pd2, err=%i)\n", dentry, err); goto out; } int ovl_verify_lowerdata(struct dentry *dentry) { int err; err = ovl_maybe_lookup_lowerdata(dentry); if (err) return err; return ovl_maybe_validate_verity(dentry); } /* * Following redirects/metacopy can have security consequences: it's like a * symlink into the lower layer without the permission checks. * * This is only a problem if the upper layer is untrusted (e.g comes from an USB * drive). This can allow a non-readable file or directory to become readable. * * Only following redirects when redirects are enabled disables this attack * vector when not necessary. */ static bool ovl_check_follow_redirect(struct ovl_lookup_data *d) { struct ovl_fs *ofs = OVL_FS(d->sb); if (d->metacopy && !ofs->config.metacopy) { pr_warn_ratelimited("refusing to follow metacopy origin for (%pd2)\n", d->dentry); return false; } if ((d->redirect || d->upperredirect) && !ovl_redirect_follow(ofs)) { pr_warn_ratelimited("refusing to follow redirect for (%pd2)\n", d->dentry); return false; } return true; } struct ovl_lookup_ctx { struct dentry *dentry; struct ovl_entry *oe; struct ovl_path *stack; struct ovl_path *origin_path; struct dentry *upperdentry; struct dentry *index; struct inode *inode; unsigned int ctr; }; static int ovl_lookup_layers(struct ovl_lookup_ctx *ctx, struct ovl_lookup_data *d) { struct dentry *dentry = ctx->dentry; struct ovl_fs *ofs = OVL_FS(dentry->d_sb); struct ovl_entry *poe = OVL_E(dentry->d_parent); struct ovl_entry *roe = OVL_E(dentry->d_sb->s_root); bool check_redirect = (ovl_redirect_follow(ofs) || ofs->numdatalayer); struct dentry *upperdir; struct dentry *this; 
struct dentry *origin = NULL; bool upperopaque = false; bool uppermetacopy = false; int metacopy_size = 0; unsigned int i; int err; upperdir = ovl_dentry_upper(dentry->d_parent); if (upperdir) { d->layer = &ofs->layers[0]; err = ovl_lookup_layer(upperdir, d, &ctx->upperdentry, true); if (err) return err; if (ctx->upperdentry && ctx->upperdentry->d_flags & DCACHE_OP_REAL) return -EREMOTE; if (ctx->upperdentry && !d->is_dir) { /* * Lookup copy up origin by decoding origin file handle. * We may get a disconnected dentry, which is fine, * because we only need to hold the origin inode in * cache and use its inode number. We may even get a * connected dentry, that is not under any of the lower * layers root. That is also fine for using it's inode * number - it's the same as if we held a reference * to a dentry in lower layer that was moved under us. */ err = ovl_check_origin(ofs, ctx->upperdentry, &ctx->origin_path); if (err) return err; if (d->metacopy) uppermetacopy = true; metacopy_size = d->metacopy; } if (d->redirect) { err = -ENOMEM; d->upperredirect = kstrdup(d->redirect, GFP_KERNEL); if (!d->upperredirect) return err; if (d->redirect[0] == '/') poe = roe; } upperopaque = d->opaque; } if (!d->stop && ovl_numlower(poe)) { err = -ENOMEM; ctx->stack = ovl_stack_alloc(ofs->numlayer - 1); if (!ctx->stack) return err; } for (i = 0; !d->stop && i < ovl_numlower(poe); i++) { struct ovl_path lower = ovl_lowerstack(poe)[i]; if (!ovl_check_follow_redirect(d)) { err = -EPERM; return err; } if (!check_redirect) d->last = i == ovl_numlower(poe) - 1; else if (d->is_dir || !ofs->numdatalayer) d->last = lower.layer->idx == ovl_numlower(roe); d->layer = lower.layer; err = ovl_lookup_layer(lower.dentry, d, &this, false); if (err) return err; if (!this) continue; /* * If no origin fh is stored in upper of a merge dir, store fh * of lower dir and set upper parent "impure". 
*/ if (ctx->upperdentry && !ctx->ctr && !ofs->noxattr && d->is_dir) { err = ovl_fix_origin(ofs, dentry, this, ctx->upperdentry); if (err) { dput(this); return err; } } /* * When "verify_lower" feature is enabled, do not merge with a * lower dir that does not match a stored origin xattr. In any * case, only verified origin is used for index lookup. * * For non-dir dentry, if index=on, then ensure origin * matches the dentry found using path based lookup, * otherwise error out. */ if (ctx->upperdentry && !ctx->ctr && ((d->is_dir && ovl_verify_lower(dentry->d_sb)) || (!d->is_dir && ofs->config.index && ctx->origin_path))) { err = ovl_verify_origin(ofs, ctx->upperdentry, this, false); if (err) { dput(this); if (d->is_dir) break; return err; } origin = this; } if (!ctx->upperdentry && !d->is_dir && !ctx->ctr && d->metacopy) metacopy_size = d->metacopy; if (d->metacopy && ctx->ctr) { /* * Do not store intermediate metacopy dentries in * lower chain, except top most lower metacopy dentry. * Continue the loop so that if there is an absolute * redirect on this dentry, poe can be reset to roe. */ dput(this); this = NULL; } else { ctx->stack[ctx->ctr].dentry = this; ctx->stack[ctx->ctr].layer = lower.layer; ctx->ctr++; } if (d->stop) break; if (d->redirect && d->redirect[0] == '/' && poe != roe) { poe = roe; /* Find the current layer on the root dentry */ i = lower.layer->idx - 1; } } /* * Defer lookup of lowerdata in data-only layers to first access. * Don't require redirect=follow and metacopy=on in this case. */ if (d->metacopy && ctx->ctr && ofs->numdatalayer && d->absolute_redirect) { d->metacopy = 0; ctx->ctr++; } else if (!ovl_check_follow_redirect(d)) { err = -EPERM; return err; } /* * For regular non-metacopy upper dentries, there is no lower * path based lookup, hence ctr will be zero. If a dentry is found * using ORIGIN xattr on upper, install it in stack. * * For metacopy dentry, path based lookup will find lower dentries. 
* Just make sure a corresponding data dentry has been found. */ if (d->metacopy || (uppermetacopy && !ctx->ctr)) { pr_warn_ratelimited("metacopy with no lower data found - abort lookup (%pd2)\n", dentry); err = -EIO; return err; } else if (!d->is_dir && ctx->upperdentry && !ctx->ctr && ctx->origin_path) { if (WARN_ON(ctx->stack != NULL)) { err = -EIO; return err; } ctx->stack = ctx->origin_path; ctx->ctr = 1; origin = ctx->origin_path->dentry; ctx->origin_path = NULL; } /* * Always lookup index if there is no-upperdentry. * * For the case of upperdentry, we have set origin by now if it * needed to be set. There are basically three cases. * * For directories, lookup index by lower inode and verify it matches * upper inode. We only trust dir index if we verified that lower dir * matches origin, otherwise dir index entries may be inconsistent * and we ignore them. * * For regular upper, we already set origin if upper had ORIGIN * xattr. There is no verification though as there is no path * based dentry lookup in lower in this case. * * For metacopy upper, we set a verified origin already if index * is enabled and if upper had an ORIGIN xattr. 
* */ if (!ctx->upperdentry && ctx->ctr) origin = ctx->stack[0].dentry; if (origin && ovl_indexdir(dentry->d_sb) && (!d->is_dir || ovl_index_all(dentry->d_sb))) { ctx->index = ovl_lookup_index(ofs, ctx->upperdentry, origin, true); if (IS_ERR(ctx->index)) { err = PTR_ERR(ctx->index); ctx->index = NULL; return err; } } if (ctx->ctr) { ctx->oe = ovl_alloc_entry(ctx->ctr); err = -ENOMEM; if (!ctx->oe) return err; ovl_stack_cpy(ovl_lowerstack(ctx->oe), ctx->stack, ctx->ctr); } if (upperopaque) ovl_dentry_set_opaque(dentry); if (d->xwhiteouts) ovl_dentry_set_xwhiteouts(dentry); if (ctx->upperdentry) ovl_dentry_set_upper_alias(dentry); else if (ctx->index) { char *upperredirect; struct path upperpath = { .dentry = ctx->upperdentry = dget(ctx->index), .mnt = ovl_upper_mnt(ofs), }; /* * It's safe to assign upperredirect here: the previous * assignment happens only if upperdentry is non-NULL, and * this one only if upperdentry is NULL. */ upperredirect = ovl_get_redirect_xattr(ofs, &upperpath, 0); if (IS_ERR(upperredirect)) return PTR_ERR(upperredirect); d->upperredirect = upperredirect; err = ovl_check_metacopy_xattr(ofs, &upperpath, NULL); if (err < 0) return err; d->metacopy = uppermetacopy = err; metacopy_size = err; if (!ovl_check_follow_redirect(d)) { err = -EPERM; return err; } } if (ctx->upperdentry || ctx->ctr) { struct inode *inode; struct ovl_inode_params oip = { .upperdentry = ctx->upperdentry, .oe = ctx->oe, .index = ctx->index, .redirect = d->upperredirect, }; /* Store lowerdata redirect for lazy lookup */ if (ctx->ctr > 1 && !d->is_dir && !ctx->stack[ctx->ctr - 1].dentry) { oip.lowerdata_redirect = d->redirect; d->redirect = NULL; } inode = ovl_get_inode(dentry->d_sb, &oip); if (IS_ERR(inode)) return PTR_ERR(inode); ctx->inode = inode; if (ctx->upperdentry && !uppermetacopy) ovl_set_flag(OVL_UPPERDATA, ctx->inode); if (metacopy_size > OVL_METACOPY_MIN_SIZE) ovl_set_flag(OVL_HAS_DIGEST, ctx->inode); } ovl_dentry_init_reval(dentry, ctx->upperdentry, 
OVL_I_E(ctx->inode)); return 0; } struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { struct ovl_fs *ofs = OVL_FS(dentry->d_sb); struct ovl_entry *poe = OVL_E(dentry->d_parent); bool check_redirect = (ovl_redirect_follow(ofs) || ofs->numdatalayer); int err; struct ovl_lookup_ctx ctx = { .dentry = dentry, }; struct ovl_lookup_data d = { .sb = dentry->d_sb, .dentry = dentry, .name = dentry->d_name, .last = check_redirect ? false : !ovl_numlower(poe), }; if (dentry->d_name.len > ofs->namelen) return ERR_PTR(-ENAMETOOLONG); with_ovl_creds(dentry->d_sb) err = ovl_lookup_layers(&ctx, &d); if (ctx.origin_path) { dput(ctx.origin_path->dentry); kfree(ctx.origin_path); } dput(ctx.index); ovl_stack_free(ctx.stack, ctx.ctr); kfree(d.redirect); if (err) { ovl_free_entry(ctx.oe); dput(ctx.upperdentry); kfree(d.upperredirect); return ERR_PTR(err); } return d_splice_alias(ctx.inode, dentry); } bool ovl_lower_positive(struct dentry *dentry) { struct ovl_entry *poe = OVL_E(dentry->d_parent); const struct qstr *name = &dentry->d_name; unsigned int i; bool positive = false; bool done = false; /* * If dentry is negative, then lower is positive iff this is a * whiteout. */ if (!dentry->d_inode) return ovl_dentry_is_opaque(dentry); /* Negative upper -> positive lower */ if (!ovl_dentry_upper(dentry)) return true; with_ovl_creds(dentry->d_sb) { /* Positive upper -> have to look up lower to see whether it exists */ for (i = 0; !done && !positive && i < ovl_numlower(poe); i++) { struct dentry *this; struct ovl_path *parentpath = &ovl_lowerstack(poe)[i]; /* * We need to make a non-const copy of dentry->d_name, * because lookup_one_positive_unlocked() will hash name * with parentpath base, which is on another (lower fs). 
*/ this = lookup_one_positive_unlocked(mnt_idmap(parentpath->layer->mnt), &QSTR_LEN(name->name, name->len), parentpath->dentry); if (IS_ERR(this)) { switch (PTR_ERR(this)) { case -ENOENT: case -ENAMETOOLONG: break; default: /* * Assume something is there, we just couldn't * access it. */ positive = true; break; } } else { struct path path = { .dentry = this, .mnt = parentpath->layer->mnt, }; positive = !ovl_path_is_whiteout(OVL_FS(dentry->d_sb), &path); done = true; dput(this); } } } return positive; }
1 1 1 1 1 1 1 1 1 3 3 3 3 3 3 3 3 3 3 3 7 7 7 7 3 3 3 3 3 3 3 7 4 1 2 2 1 2 2 2 1 1 2 2 2 2 2 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 3 4 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 3 3 3 3 2 2 2 2 2 2 2 1 1 1 1 1 1 3 1 1 1 1 1 1 2 2 2 2 2 2 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 
450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 
950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 // SPDX-License-Identifier: GPL-2.0-only /* * HWSIM IEEE 802.15.4 interface * * (C) 2018 Mojatau, Alexander Aring <aring@mojatau.com> * Copyright 2007-2012 Siemens AG * * Based on fakelb, original Written by: * Sergey Lapin <slapin@ossfans.org> * Dmitry Eremin-Solenikov <dbaryshkov@gmail.com> * Alexander Smirnov <alex.bluesman.smirnov@gmail.com> */ #include <linux/module.h> #include <linux/timer.h> #include <linux/platform_device.h> #include <linux/rtnetlink.h> #include <linux/netdevice.h> #include <linux/device.h> #include <linux/spinlock.h> #include <net/ieee802154_netdev.h> #include <net/mac802154.h> #include <net/cfg802154.h> #include <net/genetlink.h> #include "mac802154_hwsim.h" MODULE_DESCRIPTION("Software simulator of IEEE 802.15.4 radio(s) for mac802154"); MODULE_LICENSE("GPL"); static LIST_HEAD(hwsim_phys); static DEFINE_MUTEX(hwsim_phys_lock); static struct platform_device *mac802154hwsim_dev; /* MAC802154_HWSIM netlink family */ static struct genl_family hwsim_genl_family; static int hwsim_radio_idx; enum hwsim_multicast_groups { HWSIM_MCGRP_CONFIG, }; static const struct genl_multicast_group hwsim_mcgrps[] = { [HWSIM_MCGRP_CONFIG] = { .name = "config", }, }; struct hwsim_pib { u8 page; u8 channel; struct ieee802154_hw_addr_filt filt; enum ieee802154_filtering_level filt_level; struct rcu_head rcu; }; 
struct hwsim_edge_info { u8 lqi; struct rcu_head rcu; }; struct hwsim_edge { struct hwsim_phy *endpoint; struct hwsim_edge_info __rcu *info; struct list_head list; struct rcu_head rcu; }; struct hwsim_phy { struct ieee802154_hw *hw; u32 idx; struct hwsim_pib __rcu *pib; bool suspended; struct list_head edges; struct list_head list; }; static int hwsim_add_one(struct genl_info *info, struct device *dev, bool init); static void hwsim_del(struct hwsim_phy *phy); static int hwsim_hw_ed(struct ieee802154_hw *hw, u8 *level) { *level = 0xbe; return 0; } static int hwsim_update_pib(struct ieee802154_hw *hw, u8 page, u8 channel, struct ieee802154_hw_addr_filt *filt, enum ieee802154_filtering_level filt_level) { struct hwsim_phy *phy = hw->priv; struct hwsim_pib *pib, *pib_old; pib = kzalloc_obj(*pib, GFP_ATOMIC); if (!pib) return -ENOMEM; pib_old = rtnl_dereference(phy->pib); pib->page = page; pib->channel = channel; pib->filt.short_addr = filt->short_addr; pib->filt.pan_id = filt->pan_id; pib->filt.ieee_addr = filt->ieee_addr; pib->filt.pan_coord = filt->pan_coord; pib->filt_level = filt_level; rcu_assign_pointer(phy->pib, pib); kfree_rcu(pib_old, rcu); return 0; } static int hwsim_hw_channel(struct ieee802154_hw *hw, u8 page, u8 channel) { struct hwsim_phy *phy = hw->priv; struct hwsim_pib *pib; int ret; rcu_read_lock(); pib = rcu_dereference(phy->pib); ret = hwsim_update_pib(hw, page, channel, &pib->filt, pib->filt_level); rcu_read_unlock(); return ret; } static int hwsim_hw_addr_filt(struct ieee802154_hw *hw, struct ieee802154_hw_addr_filt *filt, unsigned long changed) { struct hwsim_phy *phy = hw->priv; struct hwsim_pib *pib; int ret; rcu_read_lock(); pib = rcu_dereference(phy->pib); ret = hwsim_update_pib(hw, pib->page, pib->channel, filt, pib->filt_level); rcu_read_unlock(); return ret; } static void hwsim_hw_receive(struct ieee802154_hw *hw, struct sk_buff *skb, u8 lqi) { struct ieee802154_hdr hdr; struct hwsim_phy *phy = hw->priv; struct hwsim_pib *pib; 
rcu_read_lock(); pib = rcu_dereference(phy->pib); if (!pskb_may_pull(skb, 3)) { dev_dbg(hw->parent, "invalid frame\n"); goto drop; } memcpy(&hdr, skb->data, 3); /* Level 4 filtering: Frame fields validity */ if (pib->filt_level == IEEE802154_FILTERING_4_FRAME_FIELDS) { /* a) Drop reserved frame types */ switch (mac_cb(skb)->type) { case IEEE802154_FC_TYPE_BEACON: case IEEE802154_FC_TYPE_DATA: case IEEE802154_FC_TYPE_ACK: case IEEE802154_FC_TYPE_MAC_CMD: break; default: dev_dbg(hw->parent, "unrecognized frame type 0x%x\n", mac_cb(skb)->type); goto drop; } /* b) Drop reserved frame versions */ switch (hdr.fc.version) { case IEEE802154_2003_STD: case IEEE802154_2006_STD: case IEEE802154_STD: break; default: dev_dbg(hw->parent, "unrecognized frame version 0x%x\n", hdr.fc.version); goto drop; } /* c) PAN ID constraints */ if ((mac_cb(skb)->dest.mode == IEEE802154_ADDR_LONG || mac_cb(skb)->dest.mode == IEEE802154_ADDR_SHORT) && mac_cb(skb)->dest.pan_id != pib->filt.pan_id && mac_cb(skb)->dest.pan_id != cpu_to_le16(IEEE802154_PANID_BROADCAST)) { dev_dbg(hw->parent, "unrecognized PAN ID %04x\n", le16_to_cpu(mac_cb(skb)->dest.pan_id)); goto drop; } /* d1) Short address constraints */ if (mac_cb(skb)->dest.mode == IEEE802154_ADDR_SHORT && mac_cb(skb)->dest.short_addr != pib->filt.short_addr && mac_cb(skb)->dest.short_addr != cpu_to_le16(IEEE802154_ADDR_BROADCAST)) { dev_dbg(hw->parent, "unrecognized short address %04x\n", le16_to_cpu(mac_cb(skb)->dest.short_addr)); goto drop; } /* d2) Extended address constraints */ if (mac_cb(skb)->dest.mode == IEEE802154_ADDR_LONG && mac_cb(skb)->dest.extended_addr != pib->filt.ieee_addr) { dev_dbg(hw->parent, "unrecognized long address 0x%016llx\n", mac_cb(skb)->dest.extended_addr); goto drop; } /* d4) Specific PAN coordinator case (no parent) */ if ((mac_cb(skb)->type == IEEE802154_FC_TYPE_DATA || mac_cb(skb)->type == IEEE802154_FC_TYPE_MAC_CMD) && mac_cb(skb)->dest.mode == IEEE802154_ADDR_NONE) { dev_dbg(hw->parent, "relaying is not 
supported\n"); goto drop; } /* e) Beacon frames follow specific PAN ID rules */ if (mac_cb(skb)->type == IEEE802154_FC_TYPE_BEACON && pib->filt.pan_id != cpu_to_le16(IEEE802154_PANID_BROADCAST) && mac_cb(skb)->dest.pan_id != pib->filt.pan_id) { dev_dbg(hw->parent, "invalid beacon PAN ID %04x\n", le16_to_cpu(mac_cb(skb)->dest.pan_id)); goto drop; } } rcu_read_unlock(); ieee802154_rx_irqsafe(hw, skb, lqi); return; drop: rcu_read_unlock(); kfree_skb(skb); } static int hwsim_hw_xmit(struct ieee802154_hw *hw, struct sk_buff *skb) { struct hwsim_phy *current_phy = hw->priv; struct hwsim_pib *current_pib, *endpoint_pib; struct hwsim_edge_info *einfo; struct hwsim_edge *e; WARN_ON(current_phy->suspended); rcu_read_lock(); current_pib = rcu_dereference(current_phy->pib); list_for_each_entry_rcu(e, &current_phy->edges, list) { /* Can be changed later in rx_irqsafe, but this is only a * performance tweak. Received radio should drop the frame * in mac802154 stack anyway... so we don't need to be * 100% of locking here to check on suspended */ if (e->endpoint->suspended) continue; endpoint_pib = rcu_dereference(e->endpoint->pib); if (current_pib->page == endpoint_pib->page && current_pib->channel == endpoint_pib->channel) { struct sk_buff *newskb = pskb_copy(skb, GFP_ATOMIC); einfo = rcu_dereference(e->info); if (newskb) hwsim_hw_receive(e->endpoint->hw, newskb, einfo->lqi); } } rcu_read_unlock(); ieee802154_xmit_complete(hw, skb, false); return 0; } static int hwsim_hw_start(struct ieee802154_hw *hw) { struct hwsim_phy *phy = hw->priv; phy->suspended = false; return 0; } static void hwsim_hw_stop(struct ieee802154_hw *hw) { struct hwsim_phy *phy = hw->priv; phy->suspended = true; } static int hwsim_set_promiscuous_mode(struct ieee802154_hw *hw, const bool on) { enum ieee802154_filtering_level filt_level; struct hwsim_phy *phy = hw->priv; struct hwsim_pib *pib; int ret; if (on) filt_level = IEEE802154_FILTERING_NONE; else filt_level = IEEE802154_FILTERING_4_FRAME_FIELDS; 
rcu_read_lock(); pib = rcu_dereference(phy->pib); ret = hwsim_update_pib(hw, pib->page, pib->channel, &pib->filt, filt_level); rcu_read_unlock(); return ret; } static const struct ieee802154_ops hwsim_ops = { .owner = THIS_MODULE, .xmit_async = hwsim_hw_xmit, .ed = hwsim_hw_ed, .set_channel = hwsim_hw_channel, .start = hwsim_hw_start, .stop = hwsim_hw_stop, .set_promiscuous_mode = hwsim_set_promiscuous_mode, .set_hw_addr_filt = hwsim_hw_addr_filt, }; static int hwsim_new_radio_nl(struct sk_buff *msg, struct genl_info *info) { return hwsim_add_one(info, &mac802154hwsim_dev->dev, false); } static int hwsim_del_radio_nl(struct sk_buff *msg, struct genl_info *info) { struct hwsim_phy *phy, *tmp; s64 idx = -1; if (!info->attrs[MAC802154_HWSIM_ATTR_RADIO_ID]) return -EINVAL; idx = nla_get_u32(info->attrs[MAC802154_HWSIM_ATTR_RADIO_ID]); mutex_lock(&hwsim_phys_lock); list_for_each_entry_safe(phy, tmp, &hwsim_phys, list) { if (idx == phy->idx) { hwsim_del(phy); mutex_unlock(&hwsim_phys_lock); return 0; } } mutex_unlock(&hwsim_phys_lock); return -ENODEV; } static int append_radio_msg(struct sk_buff *skb, struct hwsim_phy *phy) { struct nlattr *nl_edges, *nl_edge; struct hwsim_edge_info *einfo; struct hwsim_edge *e; int ret; ret = nla_put_u32(skb, MAC802154_HWSIM_ATTR_RADIO_ID, phy->idx); if (ret < 0) return ret; rcu_read_lock(); if (list_empty(&phy->edges)) { rcu_read_unlock(); return 0; } nl_edges = nla_nest_start_noflag(skb, MAC802154_HWSIM_ATTR_RADIO_EDGES); if (!nl_edges) { rcu_read_unlock(); return -ENOBUFS; } list_for_each_entry_rcu(e, &phy->edges, list) { nl_edge = nla_nest_start_noflag(skb, MAC802154_HWSIM_ATTR_RADIO_EDGE); if (!nl_edge) { rcu_read_unlock(); nla_nest_cancel(skb, nl_edges); return -ENOBUFS; } ret = nla_put_u32(skb, MAC802154_HWSIM_EDGE_ATTR_ENDPOINT_ID, e->endpoint->idx); if (ret < 0) { rcu_read_unlock(); nla_nest_cancel(skb, nl_edge); nla_nest_cancel(skb, nl_edges); return ret; } einfo = rcu_dereference(e->info); ret = nla_put_u8(skb, 
MAC802154_HWSIM_EDGE_ATTR_LQI, einfo->lqi); if (ret < 0) { rcu_read_unlock(); nla_nest_cancel(skb, nl_edge); nla_nest_cancel(skb, nl_edges); return ret; } nla_nest_end(skb, nl_edge); } rcu_read_unlock(); nla_nest_end(skb, nl_edges); return 0; } static int hwsim_get_radio(struct sk_buff *skb, struct hwsim_phy *phy, u32 portid, u32 seq, struct netlink_callback *cb, int flags) { void *hdr; int res; hdr = genlmsg_put(skb, portid, seq, &hwsim_genl_family, flags, MAC802154_HWSIM_CMD_GET_RADIO); if (!hdr) return -EMSGSIZE; if (cb) genl_dump_check_consistent(cb, hdr); res = append_radio_msg(skb, phy); if (res < 0) goto out_err; genlmsg_end(skb, hdr); return 0; out_err: genlmsg_cancel(skb, hdr); return res; } static int hwsim_get_radio_nl(struct sk_buff *msg, struct genl_info *info) { struct hwsim_phy *phy; struct sk_buff *skb; int idx, res = -ENODEV; if (!info->attrs[MAC802154_HWSIM_ATTR_RADIO_ID]) return -EINVAL; idx = nla_get_u32(info->attrs[MAC802154_HWSIM_ATTR_RADIO_ID]); mutex_lock(&hwsim_phys_lock); list_for_each_entry(phy, &hwsim_phys, list) { if (phy->idx != idx) continue; skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC); if (!skb) { res = -ENOMEM; goto out_err; } res = hwsim_get_radio(skb, phy, info->snd_portid, info->snd_seq, NULL, 0); if (res < 0) { nlmsg_free(skb); goto out_err; } res = genlmsg_reply(skb, info); break; } out_err: mutex_unlock(&hwsim_phys_lock); return res; } static int hwsim_dump_radio_nl(struct sk_buff *skb, struct netlink_callback *cb) { int idx = cb->args[0]; struct hwsim_phy *phy; int res; mutex_lock(&hwsim_phys_lock); if (idx == hwsim_radio_idx) goto done; list_for_each_entry(phy, &hwsim_phys, list) { if (phy->idx < idx) continue; res = hwsim_get_radio(skb, phy, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, cb, NLM_F_MULTI); if (res < 0) break; idx = phy->idx + 1; } cb->args[0] = idx; done: mutex_unlock(&hwsim_phys_lock); return skb->len; } /* caller need to held hwsim_phys_lock */ static struct hwsim_phy *hwsim_get_radio_by_id(uint32_t 
idx) { struct hwsim_phy *phy; list_for_each_entry(phy, &hwsim_phys, list) { if (phy->idx == idx) return phy; } return NULL; } static const struct nla_policy hwsim_edge_policy[MAC802154_HWSIM_EDGE_ATTR_MAX + 1] = { [MAC802154_HWSIM_EDGE_ATTR_ENDPOINT_ID] = { .type = NLA_U32 }, [MAC802154_HWSIM_EDGE_ATTR_LQI] = { .type = NLA_U8 }, }; static struct hwsim_edge *hwsim_alloc_edge(struct hwsim_phy *endpoint, u8 lqi) { struct hwsim_edge_info *einfo; struct hwsim_edge *e; e = kzalloc_obj(*e); if (!e) return NULL; einfo = kzalloc_obj(*einfo); if (!einfo) { kfree(e); return NULL; } einfo->lqi = 0xff; rcu_assign_pointer(e->info, einfo); e->endpoint = endpoint; return e; } static void hwsim_free_edge(struct hwsim_edge *e) { struct hwsim_edge_info *einfo; rcu_read_lock(); einfo = rcu_dereference(e->info); rcu_read_unlock(); kfree_rcu(einfo, rcu); kfree_rcu(e, rcu); } static int hwsim_new_edge_nl(struct sk_buff *msg, struct genl_info *info) { struct nlattr *edge_attrs[MAC802154_HWSIM_EDGE_ATTR_MAX + 1]; struct hwsim_phy *phy_v0, *phy_v1; struct hwsim_edge *e; u32 v0, v1; if (!info->attrs[MAC802154_HWSIM_ATTR_RADIO_ID] || !info->attrs[MAC802154_HWSIM_ATTR_RADIO_EDGE]) return -EINVAL; if (nla_parse_nested_deprecated(edge_attrs, MAC802154_HWSIM_EDGE_ATTR_MAX, info->attrs[MAC802154_HWSIM_ATTR_RADIO_EDGE], hwsim_edge_policy, NULL)) return -EINVAL; if (!edge_attrs[MAC802154_HWSIM_EDGE_ATTR_ENDPOINT_ID]) return -EINVAL; v0 = nla_get_u32(info->attrs[MAC802154_HWSIM_ATTR_RADIO_ID]); v1 = nla_get_u32(edge_attrs[MAC802154_HWSIM_EDGE_ATTR_ENDPOINT_ID]); if (v0 == v1) return -EINVAL; mutex_lock(&hwsim_phys_lock); phy_v0 = hwsim_get_radio_by_id(v0); if (!phy_v0) { mutex_unlock(&hwsim_phys_lock); return -ENOENT; } phy_v1 = hwsim_get_radio_by_id(v1); if (!phy_v1) { mutex_unlock(&hwsim_phys_lock); return -ENOENT; } rcu_read_lock(); list_for_each_entry_rcu(e, &phy_v0->edges, list) { if (e->endpoint->idx == v1) { mutex_unlock(&hwsim_phys_lock); rcu_read_unlock(); return -EEXIST; } } 
rcu_read_unlock(); e = hwsim_alloc_edge(phy_v1, 0xff); if (!e) { mutex_unlock(&hwsim_phys_lock); return -ENOMEM; } list_add_rcu(&e->list, &phy_v0->edges); /* wait until changes are done under hwsim_phys_lock lock * should prevent of calling this function twice while * edges list has not the changes yet. */ synchronize_rcu(); mutex_unlock(&hwsim_phys_lock); return 0; } static int hwsim_del_edge_nl(struct sk_buff *msg, struct genl_info *info) { struct nlattr *edge_attrs[MAC802154_HWSIM_EDGE_ATTR_MAX + 1]; struct hwsim_phy *phy_v0; struct hwsim_edge *e; u32 v0, v1; if (!info->attrs[MAC802154_HWSIM_ATTR_RADIO_ID] || !info->attrs[MAC802154_HWSIM_ATTR_RADIO_EDGE]) return -EINVAL; if (nla_parse_nested_deprecated(edge_attrs, MAC802154_HWSIM_EDGE_ATTR_MAX, info->attrs[MAC802154_HWSIM_ATTR_RADIO_EDGE], hwsim_edge_policy, NULL)) return -EINVAL; if (!edge_attrs[MAC802154_HWSIM_EDGE_ATTR_ENDPOINT_ID]) return -EINVAL; v0 = nla_get_u32(info->attrs[MAC802154_HWSIM_ATTR_RADIO_ID]); v1 = nla_get_u32(edge_attrs[MAC802154_HWSIM_EDGE_ATTR_ENDPOINT_ID]); mutex_lock(&hwsim_phys_lock); phy_v0 = hwsim_get_radio_by_id(v0); if (!phy_v0) { mutex_unlock(&hwsim_phys_lock); return -ENOENT; } rcu_read_lock(); list_for_each_entry_rcu(e, &phy_v0->edges, list) { if (e->endpoint->idx == v1) { rcu_read_unlock(); list_del_rcu(&e->list); hwsim_free_edge(e); /* same again - wait until list changes are done */ synchronize_rcu(); mutex_unlock(&hwsim_phys_lock); return 0; } } rcu_read_unlock(); mutex_unlock(&hwsim_phys_lock); return -ENOENT; } static int hwsim_set_edge_lqi(struct sk_buff *msg, struct genl_info *info) { struct nlattr *edge_attrs[MAC802154_HWSIM_EDGE_ATTR_MAX + 1]; struct hwsim_edge_info *einfo, *einfo_old; struct hwsim_phy *phy_v0; struct hwsim_edge *e; u32 v0, v1; u8 lqi; if (!info->attrs[MAC802154_HWSIM_ATTR_RADIO_ID] || !info->attrs[MAC802154_HWSIM_ATTR_RADIO_EDGE]) return -EINVAL; if (nla_parse_nested_deprecated(edge_attrs, MAC802154_HWSIM_EDGE_ATTR_MAX, 
info->attrs[MAC802154_HWSIM_ATTR_RADIO_EDGE], hwsim_edge_policy, NULL)) return -EINVAL; if (!edge_attrs[MAC802154_HWSIM_EDGE_ATTR_ENDPOINT_ID] || !edge_attrs[MAC802154_HWSIM_EDGE_ATTR_LQI]) return -EINVAL; v0 = nla_get_u32(info->attrs[MAC802154_HWSIM_ATTR_RADIO_ID]); v1 = nla_get_u32(edge_attrs[MAC802154_HWSIM_EDGE_ATTR_ENDPOINT_ID]); lqi = nla_get_u8(edge_attrs[MAC802154_HWSIM_EDGE_ATTR_LQI]); mutex_lock(&hwsim_phys_lock); phy_v0 = hwsim_get_radio_by_id(v0); if (!phy_v0) { mutex_unlock(&hwsim_phys_lock); return -ENOENT; } einfo = kzalloc_obj(*einfo); if (!einfo) { mutex_unlock(&hwsim_phys_lock); return -ENOMEM; } rcu_read_lock(); list_for_each_entry_rcu(e, &phy_v0->edges, list) { if (e->endpoint->idx == v1) { einfo->lqi = lqi; einfo_old = rcu_replace_pointer(e->info, einfo, lockdep_is_held(&hwsim_phys_lock)); rcu_read_unlock(); kfree_rcu(einfo_old, rcu); mutex_unlock(&hwsim_phys_lock); return 0; } } rcu_read_unlock(); kfree(einfo); mutex_unlock(&hwsim_phys_lock); return -ENOENT; } /* MAC802154_HWSIM netlink policy */ static const struct nla_policy hwsim_genl_policy[MAC802154_HWSIM_ATTR_MAX + 1] = { [MAC802154_HWSIM_ATTR_RADIO_ID] = { .type = NLA_U32 }, [MAC802154_HWSIM_ATTR_RADIO_EDGE] = { .type = NLA_NESTED }, [MAC802154_HWSIM_ATTR_RADIO_EDGES] = { .type = NLA_NESTED }, }; /* Generic Netlink operations array */ static const struct genl_small_ops hwsim_nl_ops[] = { { .cmd = MAC802154_HWSIM_CMD_NEW_RADIO, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = hwsim_new_radio_nl, .flags = GENL_UNS_ADMIN_PERM, }, { .cmd = MAC802154_HWSIM_CMD_DEL_RADIO, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = hwsim_del_radio_nl, .flags = GENL_UNS_ADMIN_PERM, }, { .cmd = MAC802154_HWSIM_CMD_GET_RADIO, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = hwsim_get_radio_nl, .dumpit = hwsim_dump_radio_nl, }, { .cmd = MAC802154_HWSIM_CMD_NEW_EDGE, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = 
hwsim_new_edge_nl, .flags = GENL_UNS_ADMIN_PERM, }, { .cmd = MAC802154_HWSIM_CMD_DEL_EDGE, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = hwsim_del_edge_nl, .flags = GENL_UNS_ADMIN_PERM, }, { .cmd = MAC802154_HWSIM_CMD_SET_EDGE, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = hwsim_set_edge_lqi, .flags = GENL_UNS_ADMIN_PERM, }, }; static struct genl_family hwsim_genl_family __ro_after_init = { .name = "MAC802154_HWSIM", .version = 1, .maxattr = MAC802154_HWSIM_ATTR_MAX, .policy = hwsim_genl_policy, .module = THIS_MODULE, .small_ops = hwsim_nl_ops, .n_small_ops = ARRAY_SIZE(hwsim_nl_ops), .resv_start_op = MAC802154_HWSIM_CMD_NEW_EDGE + 1, .mcgrps = hwsim_mcgrps, .n_mcgrps = ARRAY_SIZE(hwsim_mcgrps), }; static void hwsim_mcast_config_msg(struct sk_buff *mcast_skb, struct genl_info *info) { if (info) genl_notify(&hwsim_genl_family, mcast_skb, info, HWSIM_MCGRP_CONFIG, GFP_KERNEL); else genlmsg_multicast(&hwsim_genl_family, mcast_skb, 0, HWSIM_MCGRP_CONFIG, GFP_KERNEL); } static void hwsim_mcast_new_radio(struct genl_info *info, struct hwsim_phy *phy) { struct sk_buff *mcast_skb; void *data; mcast_skb = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!mcast_skb) return; data = genlmsg_put(mcast_skb, 0, 0, &hwsim_genl_family, 0, MAC802154_HWSIM_CMD_NEW_RADIO); if (!data) goto out_err; if (append_radio_msg(mcast_skb, phy) < 0) goto out_err; genlmsg_end(mcast_skb, data); hwsim_mcast_config_msg(mcast_skb, info); return; out_err: genlmsg_cancel(mcast_skb, data); nlmsg_free(mcast_skb); } static void hwsim_edge_unsubscribe_me(struct hwsim_phy *phy) { struct hwsim_phy *tmp; struct hwsim_edge *e; rcu_read_lock(); /* going to all phy edges and remove phy from it */ list_for_each_entry(tmp, &hwsim_phys, list) { list_for_each_entry_rcu(e, &tmp->edges, list) { if (e->endpoint->idx == phy->idx) { list_del_rcu(&e->list); hwsim_free_edge(e); } } } rcu_read_unlock(); synchronize_rcu(); } static int hwsim_subscribe_all_others(struct 
hwsim_phy *phy) { struct hwsim_phy *sub; struct hwsim_edge *e; list_for_each_entry(sub, &hwsim_phys, list) { e = hwsim_alloc_edge(sub, 0xff); if (!e) goto me_fail; list_add_rcu(&e->list, &phy->edges); } list_for_each_entry(sub, &hwsim_phys, list) { e = hwsim_alloc_edge(phy, 0xff); if (!e) goto sub_fail; list_add_rcu(&e->list, &sub->edges); } return 0; sub_fail: hwsim_edge_unsubscribe_me(phy); me_fail: rcu_read_lock(); list_for_each_entry_rcu(e, &phy->edges, list) { list_del_rcu(&e->list); hwsim_free_edge(e); } rcu_read_unlock(); return -ENOMEM; } static int hwsim_add_one(struct genl_info *info, struct device *dev, bool init) { struct ieee802154_hw *hw; struct hwsim_phy *phy; struct hwsim_pib *pib; int idx; int err; idx = hwsim_radio_idx++; hw = ieee802154_alloc_hw(sizeof(*phy), &hwsim_ops); if (!hw) return -ENOMEM; phy = hw->priv; phy->hw = hw; /* 868 MHz BPSK 802.15.4-2003 */ hw->phy->supported.channels[0] |= 1; /* 915 MHz BPSK 802.15.4-2003 */ hw->phy->supported.channels[0] |= 0x7fe; /* 2.4 GHz O-QPSK 802.15.4-2003 */ hw->phy->supported.channels[0] |= 0x7FFF800; /* 868 MHz ASK 802.15.4-2006 */ hw->phy->supported.channels[1] |= 1; /* 915 MHz ASK 802.15.4-2006 */ hw->phy->supported.channels[1] |= 0x7fe; /* 868 MHz O-QPSK 802.15.4-2006 */ hw->phy->supported.channels[2] |= 1; /* 915 MHz O-QPSK 802.15.4-2006 */ hw->phy->supported.channels[2] |= 0x7fe; /* 2.4 GHz CSS 802.15.4a-2007 */ hw->phy->supported.channels[3] |= 0x3fff; /* UWB Sub-gigahertz 802.15.4a-2007 */ hw->phy->supported.channels[4] |= 1; /* UWB Low band 802.15.4a-2007 */ hw->phy->supported.channels[4] |= 0x1e; /* UWB High band 802.15.4a-2007 */ hw->phy->supported.channels[4] |= 0xffe0; /* 750 MHz O-QPSK 802.15.4c-2009 */ hw->phy->supported.channels[5] |= 0xf; /* 750 MHz MPSK 802.15.4c-2009 */ hw->phy->supported.channels[5] |= 0xf0; /* 950 MHz BPSK 802.15.4d-2009 */ hw->phy->supported.channels[6] |= 0x3ff; /* 950 MHz GFSK 802.15.4d-2009 */ hw->phy->supported.channels[6] |= 0x3ffc00; 
ieee802154_random_extended_addr(&hw->phy->perm_extended_addr); /* hwsim phy channel 13 as default */ hw->phy->current_channel = 13; pib = kzalloc_obj(*pib); if (!pib) { err = -ENOMEM; goto err_pib; } pib->channel = 13; pib->filt.short_addr = cpu_to_le16(IEEE802154_ADDR_BROADCAST); pib->filt.pan_id = cpu_to_le16(IEEE802154_PANID_BROADCAST); rcu_assign_pointer(phy->pib, pib); phy->idx = idx; INIT_LIST_HEAD(&phy->edges); hw->flags = IEEE802154_HW_PROMISCUOUS; hw->parent = dev; err = ieee802154_register_hw(hw); if (err) goto err_reg; mutex_lock(&hwsim_phys_lock); if (init) { err = hwsim_subscribe_all_others(phy); if (err < 0) { mutex_unlock(&hwsim_phys_lock); goto err_subscribe; } } list_add_tail(&phy->list, &hwsim_phys); mutex_unlock(&hwsim_phys_lock); hwsim_mcast_new_radio(info, phy); return idx; err_subscribe: ieee802154_unregister_hw(phy->hw); err_reg: kfree(pib); err_pib: ieee802154_free_hw(phy->hw); return err; } static void hwsim_del(struct hwsim_phy *phy) { struct hwsim_pib *pib; struct hwsim_edge *e; hwsim_edge_unsubscribe_me(phy); list_del(&phy->list); rcu_read_lock(); list_for_each_entry_rcu(e, &phy->edges, list) { list_del_rcu(&e->list); hwsim_free_edge(e); } pib = rcu_dereference(phy->pib); rcu_read_unlock(); kfree_rcu(pib, rcu); ieee802154_unregister_hw(phy->hw); ieee802154_free_hw(phy->hw); } static int hwsim_probe(struct platform_device *pdev) { struct hwsim_phy *phy, *tmp; int err, i; for (i = 0; i < 2; i++) { err = hwsim_add_one(NULL, &pdev->dev, true); if (err < 0) goto err_slave; } dev_info(&pdev->dev, "Added 2 mac802154 hwsim hardware radios\n"); return 0; err_slave: mutex_lock(&hwsim_phys_lock); list_for_each_entry_safe(phy, tmp, &hwsim_phys, list) hwsim_del(phy); mutex_unlock(&hwsim_phys_lock); return err; } static void hwsim_remove(struct platform_device *pdev) { struct hwsim_phy *phy, *tmp; mutex_lock(&hwsim_phys_lock); list_for_each_entry_safe(phy, tmp, &hwsim_phys, list) hwsim_del(phy); mutex_unlock(&hwsim_phys_lock); } static struct 
platform_driver mac802154hwsim_driver = { .probe = hwsim_probe, .remove = hwsim_remove, .driver = { .name = "mac802154_hwsim", }, }; static __init int hwsim_init_module(void) { int rc; rc = genl_register_family(&hwsim_genl_family); if (rc) return rc; mac802154hwsim_dev = platform_device_register_simple("mac802154_hwsim", -1, NULL, 0); if (IS_ERR(mac802154hwsim_dev)) { rc = PTR_ERR(mac802154hwsim_dev); goto platform_dev; } rc = platform_driver_register(&mac802154hwsim_driver); if (rc < 0) goto platform_drv; return 0; platform_drv: platform_device_unregister(mac802154hwsim_dev); platform_dev: genl_unregister_family(&hwsim_genl_family); return rc; } static __exit void hwsim_remove_module(void) { genl_unregister_family(&hwsim_genl_family); platform_driver_unregister(&mac802154hwsim_driver); platform_device_unregister(mac802154hwsim_dev); } module_init(hwsim_init_module); module_exit(hwsim_remove_module);
1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 // SPDX-License-Identifier: GPL-2.0-or-later /* * IPVS: Shortest Expected Delay scheduling module * * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> * * Changes: */ /* * The SED algorithm attempts to minimize each job's expected delay until * completion. The expected delay that the job will experience is * (Ci + 1) / Ui if sent to the ith server, in which Ci is the number of * jobs on the ith server and Ui is the fixed service rate (weight) of * the ith server. The SED algorithm adopts a greedy policy that each does * what is in its own best interest, i.e. to join the queue which would * minimize its expected delay of completion. * * See the following paper for more information: * A. Weinrib and S. Shenker, Greed is not enough: Adaptive load sharing * in large heterogeneous systems. In Proceedings IEEE INFOCOM'88, * pages 986-994, 1988. * * Thanks must go to Marko Buuri <marko@buuri.name> for talking SED to me. * * The difference between SED and WLC is that SED includes the incoming * job in the cost function (the increment of 1). SED may outperform * WLC, while scheduling big jobs under larger heterogeneous systems * (the server weight varies a lot). * */ #define pr_fmt(fmt) "IPVS: " fmt #include <linux/module.h> #include <linux/kernel.h> #include <net/ip_vs.h> static inline int ip_vs_sed_dest_overhead(struct ip_vs_dest *dest) { /* * We only use the active connection number in the cost * calculation here. 
*/ return atomic_read(&dest->activeconns) + 1; } /* * Weighted Least Connection scheduling */ static struct ip_vs_dest * ip_vs_sed_schedule(struct ip_vs_service *svc, const struct sk_buff *skb, struct ip_vs_iphdr *iph) { struct ip_vs_dest *dest, *least; int loh, doh; IP_VS_DBG(6, "%s(): Scheduling...\n", __func__); /* * We calculate the load of each dest server as follows: * (server expected overhead) / dest->weight * * Remember -- no floats in kernel mode!!! * The comparison of h1*w2 > h2*w1 is equivalent to that of * h1/w1 > h2/w2 * if every weight is larger than zero. * * The server with weight=0 is quiesced and will not receive any * new connections. */ list_for_each_entry_rcu(dest, &svc->destinations, n_list) { if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) && atomic_read(&dest->weight) > 0) { least = dest; loh = ip_vs_sed_dest_overhead(least); goto nextstage; } } ip_vs_scheduler_err(svc, "no destination available"); return NULL; /* * Find the destination with the least load. */ nextstage: list_for_each_entry_continue_rcu(dest, &svc->destinations, n_list) { if (dest->flags & IP_VS_DEST_F_OVERLOAD) continue; doh = ip_vs_sed_dest_overhead(dest); if ((__s64)loh * atomic_read(&dest->weight) > (__s64)doh * atomic_read(&least->weight)) { least = dest; loh = doh; } } IP_VS_DBG_BUF(6, "SED: server %s:%u " "activeconns %d refcnt %d weight %d overhead %d\n", IP_VS_DBG_ADDR(least->af, &least->addr), ntohs(least->port), atomic_read(&least->activeconns), refcount_read(&least->refcnt), atomic_read(&least->weight), loh); return least; } static struct ip_vs_scheduler ip_vs_sed_scheduler = { .name = "sed", .refcnt = ATOMIC_INIT(0), .module = THIS_MODULE, .n_list = LIST_HEAD_INIT(ip_vs_sed_scheduler.n_list), .schedule = ip_vs_sed_schedule, }; static int __init ip_vs_sed_init(void) { return register_ip_vs_scheduler(&ip_vs_sed_scheduler); } static void __exit ip_vs_sed_cleanup(void) { unregister_ip_vs_scheduler(&ip_vs_sed_scheduler); synchronize_rcu(); } module_init(ip_vs_sed_init); 
module_exit(ip_vs_sed_cleanup); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("ipvs shortest expected delay scheduler");
2 1 1 4 4 3 1 1 3 2 2 2 2 2 2 7 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 // SPDX-License-Identifier: GPL-2.0-or-later /* * SR-IPv6 implementation -- HMAC functions * * Author: * David Lebrun <david.lebrun@uclouvain.be> */ #include <linux/errno.h> #include <linux/kernel.h> #include <linux/types.h> #include <linux/socket.h> #include <linux/sockios.h> #include <linux/net.h> #include <linux/netdevice.h> #include <linux/in6.h> #include <linux/icmpv6.h> #include <linux/mroute6.h> #include <linux/rhashtable.h> #include <linux/netfilter.h> #include <linux/netfilter_ipv6.h> #include <net/sock.h> #include <net/snmp.h> #include <net/ipv6.h> #include <net/protocol.h> #include <net/transp_v6.h> #include <net/rawv6.h> #include <net/ndisc.h> #include <net/ip6_route.h> #include <net/addrconf.h> #include <net/xfrm.h> #include <crypto/sha1.h> #include <crypto/sha2.h> 
#include <crypto/utils.h>
#include <net/seg6.h>
#include <net/genetlink.h>
#include <net/seg6_hmac.h>
#include <linux/random.h>

/*
 * Per-CPU scratch area in which seg6_hmac_compute() assembles the HMAC input
 * text. bh_lock is the local lock taken around every use of hmac_ring.
 */
struct hmac_storage {
	local_lock_t bh_lock;
	char hmac_ring[SEG6_HMAC_RING_SIZE];
};

static DEFINE_PER_CPU(struct hmac_storage, hmac_storage) = {
	.bh_lock		= INIT_LOCAL_LOCK(bh_lock),
};

/*
 * rhashtable obj_cmpfn: evaluates to 0 when the entry's hmackeyid equals the
 * u32 key being looked up, and to non-zero otherwise.
 */
static int seg6_hmac_cmpfn(struct rhashtable_compare_arg *arg, const void *obj)
{
	const struct seg6_hmac_info *hinfo = obj;

	return (hinfo->hmackeyid != *(__u32 *)arg->key);
}

/* Free an HMAC info entry through kfree_rcu(). */
static inline void seg6_hinfo_release(struct seg6_hmac_info *hinfo)
{
	kfree_rcu(hinfo, rcu);
}

/*
 * Per-element callback handed to rhashtable_free_and_destroy() by
 * seg6_hmac_net_exit(). @arg is not used.
 */
static void seg6_free_hi(void *ptr, void *arg)
{
	struct seg6_hmac_info *hinfo = (struct seg6_hmac_info *)ptr;

	if (hinfo)
		seg6_hinfo_release(hinfo);
}

/* Hash table layout: entries are keyed by their 32-bit hmackeyid. */
static const struct rhashtable_params rht_params = {
	.head_offset		= offsetof(struct seg6_hmac_info, node),
	.key_offset		= offsetof(struct seg6_hmac_info, hmackeyid),
	.key_len		= sizeof(u32),
	.automatic_shrinking	= true,
	.obj_cmpfn		= seg6_hmac_cmpfn,
};

/*
 * Locate the HMAC TLV of a segment routing header.
 *
 * The TLV is read at byte offset ((hdrlen + 1) << 3) - 40 from the start of
 * the header. Returns NULL when hdrlen is smaller than
 * (first_segment + 1) * 2 + 5, when sr_has_hmac() is false for the header,
 * or when the TLV type is not SR6_TLV_HMAC or its length is not 38.
 */
static struct sr6_tlv_hmac *seg6_get_tlv_hmac(struct ipv6_sr_hdr *srh)
{
	struct sr6_tlv_hmac *tlv;

	if (srh->hdrlen < (srh->first_segment + 1) * 2 + 5)
		return NULL;

	if (!sr_has_hmac(srh))
		return NULL;

	tlv = (struct sr6_tlv_hmac *)
	      ((char *)srh + ((srh->hdrlen + 1) << 3) - 40);

	if (tlv->tlvhdr.type != SR6_TLV_HMAC || tlv->tlvhdr.len != 38)
		return NULL;

	return tlv;
}

/*
 * Compute the HMAC of a segment routing header into @output.
 *
 * The input text is assembled in this CPU's hmac_ring, between
 * local_bh_disable()/local_lock_nested_bh() and the matching unlock/enable,
 * and then hashed with the algorithm selected by hinfo->alg_id. For SHA-1
 * the bytes of @output from SHA1_DIGEST_SIZE up to SEG6_HMAC_FIELD_LEN are
 * zeroed.
 *
 * Returns 0 on success, -EMSGSIZE when the text length reaches
 * SEG6_HMAC_RING_SIZE, or -EINVAL (after WARN_ON_ONCE) for an unhandled
 * algorithm id.
 */
int seg6_hmac_compute(struct seg6_hmac_info *hinfo, struct ipv6_sr_hdr *hdr,
		      struct in6_addr *saddr, u8 *output)
{
	__be32 hmackeyid = cpu_to_be32(hinfo->hmackeyid);
	int plen, i, ret = 0;
	char *ring, *off;

	/* saddr(16) + first_seg(1) + flags(1) + keyid(4) + seglist(16n) */
	plen = 16 + 1 + 1 + 4 + (hdr->first_segment + 1) * 16;

	/* this limit allows for 14 segments */
	if (plen >= SEG6_HMAC_RING_SIZE)
		return -EMSGSIZE;

	/* Let's build the HMAC text on the ring buffer. The text is composed
	 * as follows, in order:
	 *
	 * 1. Source IPv6 address (128 bits)
	 * 2. first_segment value (8 bits)
	 * 3. Flags (8 bits)
	 * 4. HMAC Key ID (32 bits)
	 * 5. All segments in the segments list (n * 128 bits)
	 */

	local_bh_disable();
	local_lock_nested_bh(&hmac_storage.bh_lock);
	ring = this_cpu_ptr(hmac_storage.hmac_ring);
	off = ring;

	/* source address */
	memcpy(off, saddr, 16);
	off += 16;

	/* first_segment value */
	*off++ = hdr->first_segment;

	/* flags */
	*off++ = hdr->flags;

	/* HMAC Key ID */
	memcpy(off, &hmackeyid, 4);
	off += 4;

	/* all segments in the list */
	for (i = 0; i < hdr->first_segment + 1; i++) {
		memcpy(off, hdr->segments + i, 16);
		off += 16;
	}

	switch (hinfo->alg_id) {
	case SEG6_HMAC_ALGO_SHA1:
		hmac_sha1(&hinfo->key.sha1, ring, plen, output);
		static_assert(SEG6_HMAC_FIELD_LEN > SHA1_DIGEST_SIZE);
		/* clear the part of the field the SHA-1 digest does not fill */
		memset(&output[SHA1_DIGEST_SIZE], 0,
		       SEG6_HMAC_FIELD_LEN - SHA1_DIGEST_SIZE);
		break;
	case SEG6_HMAC_ALGO_SHA256:
		hmac_sha256(&hinfo->key.sha256, ring, plen, output);
		static_assert(SEG6_HMAC_FIELD_LEN == SHA256_DIGEST_SIZE);
		break;
	default:
		WARN_ON_ONCE(1);
		ret = -EINVAL;
		break;
	}
	local_unlock_nested_bh(&hmac_storage.bh_lock);
	local_bh_enable();
	return ret;
}
EXPORT_SYMBOL(seg6_hmac_compute);

/* checks if an incoming SR-enabled packet's HMAC status matches
 * the incoming policy.
 *
 * called with rcu_read_lock()
 *
 * Returns false when the device has no inet6_dev, when the policy value
 * seg6_require_hmac is > 0 but no HMAC TLV is found, when the key id is
 * unknown, when the HMAC cannot be computed, or when the computed HMAC
 * differs from the one in the TLV. Returns true otherwise.
 */
bool seg6_hmac_validate_skb(struct sk_buff *skb)
{
	u8 hmac_output[SEG6_HMAC_FIELD_LEN];
	struct net *net = dev_net(skb->dev);
	struct seg6_hmac_info *hinfo;
	struct sr6_tlv_hmac *tlv;
	struct ipv6_sr_hdr *srh;
	struct inet6_dev *idev;
	int require_hmac;

	idev = __in6_dev_get(skb->dev);
	if (!idev)
		return false;

	srh = (struct ipv6_sr_hdr *)skb_transport_header(skb);

	tlv = seg6_get_tlv_hmac(srh);

	require_hmac = READ_ONCE(idev->cnf.seg6_require_hmac);
	/* mandatory check but no tlv */
	if (require_hmac > 0 && !tlv)
		return false;

	/* no check */
	if (require_hmac < 0)
		return true;

	/* check only if present */
	if (require_hmac == 0 && !tlv)
		return true;

	/* now, seg6_require_hmac >= 0 && tlv */

	hinfo = seg6_hmac_info_lookup(net, be32_to_cpu(tlv->hmackeyid));
	if (!hinfo)
		return false;

	if (seg6_hmac_compute(hinfo, srh, &ipv6_hdr(skb)->saddr, hmac_output))
		return false;

	if (crypto_memneq(hmac_output, tlv->hmac, SEG6_HMAC_FIELD_LEN))
		return false;

	return true;
}
EXPORT_SYMBOL(seg6_hmac_validate_skb);

/* called with rcu_read_lock() */
/* Returns the entry stored under @key in @net's hmac_infos table, or NULL. */
struct seg6_hmac_info *seg6_hmac_info_lookup(struct net *net, u32 key)
{
	struct seg6_pernet_data *sdata = seg6_pernet(net);
	struct seg6_hmac_info *hinfo;

	hinfo = rhashtable_lookup_fast(&sdata->hmac_infos, &key, rht_params);

	return hinfo;
}
EXPORT_SYMBOL(seg6_hmac_info_lookup);

/*
 * Prepare @hinfo's HMAC key from hinfo->secret/hinfo->slen for its alg_id and
 * insert @hinfo into @net's hmac_infos table.
 *
 * @key is not read by this function; the table key comes from @hinfo itself
 * (see rht_params.key_offset).
 *
 * Returns -EINVAL for an unhandled alg_id, otherwise the result of
 * rhashtable_lookup_insert_fast().
 */
int seg6_hmac_info_add(struct net *net, u32 key, struct seg6_hmac_info *hinfo)
{
	struct seg6_pernet_data *sdata = seg6_pernet(net);
	int err;

	switch (hinfo->alg_id) {
	case SEG6_HMAC_ALGO_SHA1:
		hmac_sha1_preparekey(&hinfo->key.sha1,
				     hinfo->secret, hinfo->slen);
		break;
	case SEG6_HMAC_ALGO_SHA256:
		hmac_sha256_preparekey(&hinfo->key.sha256,
				       hinfo->secret, hinfo->slen);
		break;
	default:
		return -EINVAL;
	}

	err = rhashtable_lookup_insert_fast(&sdata->hmac_infos, &hinfo->node,
					    rht_params);

	return err;
}
EXPORT_SYMBOL(seg6_hmac_info_add);

/*
 * Remove the entry stored under @key from @net's hmac_infos table and release
 * it with seg6_hinfo_release().
 *
 * Returns -ENOENT when no entry is found, the rhashtable_remove_fast() error
 * when removal fails (the entry is then not released), and 0 on success.
 */
int seg6_hmac_info_del(struct net *net, u32 key)
{
	struct seg6_pernet_data *sdata = seg6_pernet(net);
	struct seg6_hmac_info *hinfo;
	int err = -ENOENT;

	hinfo = rhashtable_lookup_fast(&sdata->hmac_infos, &key, rht_params);
	if (!hinfo)
		goto out;

	err = rhashtable_remove_fast(&sdata->hmac_infos, &hinfo->node,
				     rht_params);
	if (err)
		goto out;

	seg6_hinfo_release(hinfo);

out:
	return err;
}
EXPORT_SYMBOL(seg6_hmac_info_del);

/*
 * Fill in the HMAC field of @srh's HMAC TLV.
 *
 * The key is looked up, under rcu_read_lock(), by the key id already present
 * in the TLV; the HMAC field is zeroed and then computed in place.
 *
 * Returns -EINVAL when @srh carries no usable HMAC TLV, -ENOENT when the key
 * id is unknown, otherwise the result of seg6_hmac_compute().
 */
int seg6_push_hmac(struct net *net, struct in6_addr *saddr,
		   struct ipv6_sr_hdr *srh)
{
	struct seg6_hmac_info *hinfo;
	struct sr6_tlv_hmac *tlv;
	int err = -ENOENT;

	tlv = seg6_get_tlv_hmac(srh);
	if (!tlv)
		return -EINVAL;

	rcu_read_lock();

	hinfo = seg6_hmac_info_lookup(net, be32_to_cpu(tlv->hmackeyid));
	if (!hinfo)
		goto out;

	memset(tlv->hmac, 0, SEG6_HMAC_FIELD_LEN);
	err = seg6_hmac_compute(hinfo, srh, saddr, tlv->hmac);

out:
	rcu_read_unlock();
	return err;
}
EXPORT_SYMBOL(seg6_push_hmac);

/* Per-netns setup: initialise the hmac_infos hash table. */
int __net_init seg6_hmac_net_init(struct net *net)
{
	struct seg6_pernet_data *sdata = seg6_pernet(net);

	return rhashtable_init(&sdata->hmac_infos, &rht_params);
}

/*
 * Per-netns teardown: destroy the hmac_infos hash table, passing every
 * remaining entry to seg6_free_hi().
 */
void __net_exit seg6_hmac_net_exit(struct net *net)
{
	struct seg6_pernet_data *sdata = seg6_pernet(net);

	rhashtable_free_and_destroy(&sdata->hmac_infos, seg6_free_hi, NULL);
}
EXPORT_SYMBOL(seg6_hmac_net_exit);
7 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96
/* SPDX-License-Identifier: GPL-2.0 */
/* Copyright (c) 2025 Christian Brauner <brauner@kernel.org> */
#ifndef _LINUX_NSTREE_H
#define _LINUX_NSTREE_H

#include <linux/ns/nstree_types.h>
#include <linux/nsproxy.h>
#include <linux/rbtree.h>
#include <linux/seqlock.h>
#include <linux/rculist.h>
#include <linux/cookie.h>
#include <uapi/linux/nsfs.h>

struct ns_common;

/* One tree root per namespace type; defined outside this header. */
extern struct ns_tree_root cgroup_ns_tree;
extern struct ns_tree_root ipc_ns_tree;
extern struct ns_tree_root mnt_ns_tree;
extern struct ns_tree_root net_ns_tree;
extern struct ns_tree_root pid_ns_tree;
extern struct ns_tree_root time_ns_tree;
extern struct ns_tree_root user_ns_tree;
extern struct ns_tree_root uts_ns_tree;

void ns_tree_node_init(struct ns_tree_node *node);
void ns_tree_root_init(struct ns_tree_root *root);
bool ns_tree_node_empty(const struct ns_tree_node *node);
struct rb_node *ns_tree_node_add(struct ns_tree_node *node,
				 struct ns_tree_root *root,
				 int (*cmp)(struct rb_node *, const struct rb_node *));
void ns_tree_node_del(struct ns_tree_node *node, struct ns_tree_root *root);

/*
 * to_ns_tree - select the tree root matching a namespace pointer's type
 *
 * Uses _Generic on the static type of @__ns and evaluates to the address of
 * the corresponding *_ns_tree root. A pointer type that is not listed does
 * not compile.
 */
#define to_ns_tree(__ns)					\
	_Generic((__ns),					\
		struct cgroup_namespace *: &(cgroup_ns_tree),	\
		struct ipc_namespace *:    &(ipc_ns_tree),	\
		struct net *:              &(net_ns_tree),	\
		struct pid_namespace *:    &(pid_ns_tree),	\
		struct mnt_namespace *:    &(mnt_ns_tree),	\
		struct time_namespace *:   &(time_ns_tree),	\
		struct user_namespace *:   &(user_ns_tree),	\
		struct uts_namespace *:    &(uts_ns_tree))

/*
 * ns_tree_gen_id - call __ns_tree_gen_id() for a typed namespace pointer
 *
 * Passes ns_init_id(__ns) as the id when @__ns equals ns_init_ns(__ns), and 0
 * otherwise. @__ns is evaluated more than once.
 */
#define ns_tree_gen_id(__ns)                 \
	__ns_tree_gen_id(to_ns_common(__ns), \
			 (((__ns) == ns_init_ns(__ns)) ? ns_init_id(__ns) : 0))

u64 __ns_tree_gen_id(struct ns_common *ns, u64 id);
void __ns_tree_add_raw(struct ns_common *ns, struct ns_tree_root *ns_tree);
void __ns_tree_remove(struct ns_common *ns, struct ns_tree_root *ns_tree);
struct ns_common *ns_tree_lookup_rcu(u64 ns_id, int ns_type);
struct ns_common *__ns_tree_adjoined_rcu(struct ns_common *ns,
					 struct ns_tree_root *ns_tree,
					 bool previous);

/*
 * Run __ns_tree_gen_id() on @ns with @id, then add @ns to @ns_tree with
 * __ns_tree_add_raw(). The value returned by __ns_tree_gen_id() is not used
 * here.
 */
static inline void __ns_tree_add(struct ns_common *ns, struct ns_tree_root *ns_tree, u64 id)
{
	__ns_tree_gen_id(ns, id);
	__ns_tree_add_raw(ns, ns_tree);
}

/**
 * ns_tree_add_raw - Add a namespace to a namespace tree
 * @__ns: Namespace to add
 *
 * This function adds a namespace to the appropriate namespace tree
 * without assigning an id.
 */
#define ns_tree_add_raw(__ns) __ns_tree_add_raw(to_ns_common(__ns), to_ns_tree(__ns))

/**
 * ns_tree_add - Add a namespace to a namespace tree
 * @__ns: Namespace to add
 *
 * This function assigns a new id to the namespace and adds it to the
 * appropriate namespace tree and list.
 */
#define ns_tree_add(__ns)						\
	__ns_tree_add(to_ns_common(__ns), to_ns_tree(__ns),		\
		      (((__ns) == ns_init_ns(__ns)) ? ns_init_id(__ns) : 0))

/**
 * ns_tree_remove - Remove a namespace from a namespace tree
 * @__ns: Namespace to remove
 *
 * This function removes a namespace from the appropriate namespace
 * tree and list.
 */
#define ns_tree_remove(__ns)  __ns_tree_remove(to_ns_common(__ns), to_ns_tree(__ns))

/*
 * Typed wrapper around __ns_tree_adjoined_rcu(); @__previous is forwarded as
 * its 'previous' argument.
 */
#define ns_tree_adjoined_rcu(__ns, __previous) \
	__ns_tree_adjoined_rcu(to_ns_common(__ns), to_ns_tree(__ns), __previous)

/* True when the namespace's ns_tree_node.ns_node is not RB_EMPTY_NODE(). */
#define ns_tree_active(__ns) (!RB_EMPTY_NODE(&to_ns_common(__ns)->ns_tree_node.ns_node))

#endif /* _LINUX_NSTREE_H */
1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * linux/include/linux/relay.h
 *
 * Copyright (C) 2002, 2003 - Tom Zanussi (zanussi@us.ibm.com), IBM Corp
 * Copyright (C) 1999, 2000, 2001, 2002 - Karim Yaghmour (karim@opersys.com)
 *
 * CONFIG_RELAY definitions and declarations
 */

#ifndef _LINUX_RELAY_H
#define _LINUX_RELAY_H

#include <linux/types.h>
#include <linux/sched.h>
#include <linux/timer.h>
#include <linux/wait.h>
#include <linux/list.h>
#include <linux/irq_work.h>
#include <linux/bug.h>
#include <linux/fs.h>
#include <linux/poll.h>
#include <linux/kref.h>
#include <linux/percpu.h>

/*
 * Tracks changes to rchan/rchan_buf structs
 */
#define RELAYFS_CHANNEL_VERSION		7

/*
 * Relay buffer statistics
 *
 * Bit flags; RELAY_STATS_LAST aliases the highest defined flag.
 */
enum {
	RELAY_STATS_BUF_FULL = (1 << 0),
	RELAY_STATS_WRT_BIG = (1 << 1),

	RELAY_STATS_LAST = RELAY_STATS_WRT_BIG,
};

struct rchan_buf_stats
{
	unsigned int full_count;	/* counter for buffer full */
	unsigned int big_count;		/* counter for too big to write */
};

/*
 * Per-cpu relay channel buffer
 */
struct rchan_buf
{
	void *start;			/* start of channel buffer */
	void *data;			/* start of current sub-buffer */
	size_t offset;			/* current offset into sub-buffer */
	size_t subbufs_produced;	/* count of sub-buffers produced */
	size_t subbufs_consumed;	/* count of sub-buffers consumed */
	struct rchan *chan;		/* associated channel */
	wait_queue_head_t read_wait;	/* reader wait queue */
	struct irq_work wakeup_work;	/* reader wakeup */
	struct dentry *dentry;		/* channel file dentry */
	struct kref kref;		/* channel buffer refcount */
	struct rchan_buf_stats stats;	/* buffer stats */
	struct page **page_array;	/* array of current buffer pages */
	unsigned int page_count;	/* number of current buffer pages */
	unsigned int finalized;		/* buffer has been finalized */
	size_t *padding;		/* padding counts per sub-buffer */
	size_t bytes_consumed;		/* bytes consumed in cur read subbuf */
	size_t early_bytes;		/* bytes consumed before VFS inited */
	unsigned int cpu;		/* this buf's cpu */
} ____cacheline_aligned;

/*
 * Relay channel data structure
 */
struct rchan
{
	u32 version;			/* the version of this struct */
	size_t subbuf_size;		/* sub-buffer size */
	size_t n_subbufs;		/* number of sub-buffers per buffer */
	size_t alloc_size;		/* total buffer size allocated */
	const struct rchan_callbacks *cb; /* client callbacks */
	struct kref kref;		/* channel refcount */
	void *private_data;		/* for user-defined data */
	struct rchan_buf * __percpu *buf; /* per-cpu channel buffers */
	int is_global;			/* One global buffer ? */
	struct list_head list;		/* for channel list */
	struct dentry *parent;		/* parent dentry passed to open */
	int has_base_filename;		/* has a filename associated? */
	char base_filename[NAME_MAX];	/* saved base filename */
};

/*
 * Relay channel client callbacks
 */
struct rchan_callbacks
{
	/*
	 * subbuf_start - called on buffer-switch to a new sub-buffer
	 * @buf: the channel buffer containing the new sub-buffer
	 * @subbuf: the start of the new sub-buffer
	 * @prev_subbuf: the start of the previous sub-buffer
	 *
	 * The client should return 1 to continue logging, 0 to stop
	 * logging.
	 *
	 * This callback is optional.
	 *
	 * NOTE: subbuf_start will also be invoked when the buffer is
	 *       created, so that the first sub-buffer can be initialized
	 *       if necessary.  In this case, prev_subbuf will be NULL.
	 *
	 * NOTE: the client can reserve bytes at the beginning of the new
	 *       sub-buffer by calling subbuf_start_reserve() in this callback.
	 */
	int (*subbuf_start) (struct rchan_buf *buf,
			     void *subbuf,
			     void *prev_subbuf);

	/*
	 * create_buf_file - create file to represent a relay channel buffer
	 * @filename: the name of the file to create
	 * @parent: the parent of the file to create
	 * @mode: the mode of the file to create
	 * @buf: the channel buffer
	 * @is_global: outparam - set non-zero if the buffer should be global
	 *
	 * Called during relay_open(), once for each per-cpu buffer,
	 * to allow the client to create a file to be used to
	 * represent the corresponding channel buffer.  If the file is
	 * created outside of relay, the parent must also exist in
	 * that filesystem.
	 *
	 * The callback should return the dentry of the file created
	 * to represent the relay buffer.
	 *
	 * Setting the is_global outparam to a non-zero value will
	 * cause relay_open() to create a single global buffer rather
	 * than the default set of per-cpu buffers.
	 *
	 * This callback is mandatory.
	 *
	 * See Documentation/filesystems/relay.rst for more info.
	 */
	struct dentry *(*create_buf_file)(const char *filename,
					  struct dentry *parent,
					  umode_t mode,
					  struct rchan_buf *buf,
					  int *is_global);

	/*
	 * remove_buf_file - remove file representing a relay channel buffer
	 * @dentry: the dentry of the file to remove
	 *
	 * Called during relay_close(), once for each per-cpu buffer,
	 * to allow the client to remove a file used to represent a
	 * channel buffer.
	 *
	 * The callback should return 0 if successful, negative if not.
	 *
	 * This callback is mandatory.
	 */
	int (*remove_buf_file)(struct dentry *dentry);
};

/*
 * CONFIG_RELAY kernel API, kernel/relay.c
 */

struct rchan *relay_open(const char *base_filename,
			 struct dentry *parent,
			 size_t subbuf_size,
			 size_t n_subbufs,
			 const struct rchan_callbacks *cb,
			 void *private_data);
extern void relay_close(struct rchan *chan);
extern void relay_flush(struct rchan *chan);
size_t relay_stats(struct rchan *chan, int flags);
extern void relay_subbufs_consumed(struct rchan *chan,
				   unsigned int cpu,
				   size_t consumed);
extern void relay_reset(struct rchan *chan);
extern int relay_buf_full(struct rchan_buf *buf);

extern size_t relay_switch_subbuf(struct rchan_buf *buf,
				  size_t length);

/**
 *	relay_write - write data into the channel
 *	@chan: relay channel
 *	@data: data to be written
 *	@length: number of bytes to write
 *
 *	Writes data into the current cpu's channel buffer.
 *
 *	Protects the buffer by disabling interrupts.  Use this
 *	if you might be logging from interrupt context.  Try
 *	__relay_write() if you know you won't be logging from
 *	interrupt context.
 */
static inline void relay_write(struct rchan *chan,
			       const void *data,
			       size_t length)
{
	unsigned long flags;
	struct rchan_buf *buf;

	local_irq_save(flags);
	buf = *this_cpu_ptr(chan->buf);
	/*
	 * The write would run past the current sub-buffer: switch, and copy
	 * only the length relay_switch_subbuf() hands back.
	 */
	if (unlikely(buf->offset + length > chan->subbuf_size))
		length = relay_switch_subbuf(buf, length);
	memcpy(buf->data + buf->offset, data, length);
	buf->offset += length;
	local_irq_restore(flags);
}

/**
 *	__relay_write - write data into the channel
 *	@chan: relay channel
 *	@data: data to be written
 *	@length: number of bytes to write
 *
 *	Writes data into the current cpu's channel buffer.
 *
 *	Protects the buffer by disabling preemption.  Use
 *	relay_write() if you might be logging from interrupt
 *	context.
 */
static inline void __relay_write(struct rchan *chan,
				 const void *data,
				 size_t length)
{
	struct rchan_buf *buf;

	buf = *get_cpu_ptr(chan->buf);
	/* same overflow handling as relay_write() */
	if (unlikely(buf->offset + length > buf->chan->subbuf_size))
		length = relay_switch_subbuf(buf, length);
	memcpy(buf->data + buf->offset, data, length);
	buf->offset += length;
	put_cpu_ptr(chan->buf);
}

/**
 *	relay_reserve - reserve slot in channel buffer
 *	@chan: relay channel
 *	@length: number of bytes to reserve
 *
 *	Returns pointer to reserved slot, NULL if full.
 *
 *	Reserves a slot in the current cpu's channel buffer.
 *	Does not protect the buffer at all - caller must provide
 *	appropriate synchronization.
 */
static inline void *relay_reserve(struct rchan *chan, size_t length)
{
	void *reserved = NULL;
	struct rchan_buf *buf = *get_cpu_ptr(chan->buf);

	if (unlikely(buf->offset + length > buf->chan->subbuf_size)) {
		length = relay_switch_subbuf(buf, length);
		/* a zero length from the switch means nothing is reserved */
		if (!length)
			goto end;
	}
	reserved = buf->data + buf->offset;
	buf->offset += length;

end:
	put_cpu_ptr(chan->buf);
	return reserved;
}

/**
 *	subbuf_start_reserve - reserve bytes at the start of a sub-buffer
 *	@buf: relay channel buffer
 *	@length: number of bytes to reserve
 *
 *	Helper function used to reserve bytes at the beginning of
 *	a sub-buffer in the subbuf_start() callback.
 *
 *	BUGs when @length >= subbuf_size - 1; otherwise sets the buffer
 *	offset to @length.
 */
static inline void subbuf_start_reserve(struct rchan_buf *buf,
					size_t length)
{
	BUG_ON(length >= buf->chan->subbuf_size - 1);
	buf->offset = length;
}

/*
 * exported relay file operations, kernel/relay.c
 */
extern const struct file_operations relay_file_operations;

/* Without CONFIG_RELAY the name expands to NULL instead of a function. */
#ifdef CONFIG_RELAY
int relay_prepare_cpu(unsigned int cpu);
#else
#define relay_prepare_cpu     NULL
#endif

#endif /* _LINUX_RELAY_H */
1 1 1 1 6 6 6 6 6 4 4 2 2 2 2 2 4 6 4 3 6 4 4 4 6 6 2 2 2 2 2 2 4 4 4 4 3 2 2 2 2 2 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476
// SPDX-License-Identifier: GPL-2.0-or-later
/* XTS: as defined in IEEE1619/D16
 *	http://grouper.ieee.org/groups/1619/email/pdf00086.pdf
 *
 * Copyright (c) 2007 Rik Snel <rsnel@cube.dyndns.org>
 *
 * Based on ecb.c
 * Copyright (c) 2006 Herbert Xu <herbert@gondor.apana.org.au>
 */
#include <crypto/internal/cipher.h>
#include <crypto/internal/skcipher.h>
#include <crypto/scatterwalk.h>
#include <linux/err.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/scatterlist.h>
#include <linux/slab.h>

#include <crypto/xts.h>
#include <crypto/b128ops.h>
#include <crypto/gf128mul.h>

/*
 * Per-transform context: @child is the skcipher that processes the data,
 * @tweak is the single-block cipher used to derive the initial tweak.
 */
struct xts_tfm_ctx {
	struct crypto_skcipher *child;
	struct crypto_cipher *tweak;
};

/* Per-instance context holding the spawns for the two ciphers above. */
struct xts_instance_ctx {
	struct crypto_skcipher_spawn spawn;
	struct crypto_cipher_spawn tweak_spawn;
};

/*
 * Per-request context.
 * @t:      tweak value; first set in xts_init_crypt()
 * @tail:   scatterlist position set up by xts_cts_final()
 * @sg:     storage handed to scatterwalk_ffwd() when setting @tail
 * @subreq: request issued to the child skcipher
 */
struct xts_request_ctx {
	le128 t;
	struct scatterlist *tail;
	struct scatterlist sg[2];
	struct skcipher_request subreq;
};

/*
 * Validate the key with xts_verify_key(), then split it in half: the second
 * half keys the tweak cipher, the first half keys the data cipher. The
 * parent's request flags (CRYPTO_TFM_REQ_MASK) are copied to both.
 */
static int xts_setkey(struct crypto_skcipher *parent, const u8 *key,
		      unsigned int keylen)
{
	struct xts_tfm_ctx *ctx = crypto_skcipher_ctx(parent);
	struct crypto_skcipher *child;
	struct crypto_cipher *tweak;
	int err;

	err = xts_verify_key(parent, key, keylen);
	if (err)
		return err;

	keylen /= 2;

	/* we need two cipher instances: one to compute the initial 'tweak'
	 * by encrypting the IV (usually the 'plain' iv) and the other
	 * one to encrypt and decrypt the data */

	/* tweak cipher, uses Key2 i.e. the second half of *key */
	tweak = ctx->tweak;
	crypto_cipher_clear_flags(tweak, CRYPTO_TFM_REQ_MASK);
	crypto_cipher_set_flags(tweak, crypto_skcipher_get_flags(parent) &
				       CRYPTO_TFM_REQ_MASK);
	err = crypto_cipher_setkey(tweak, key + keylen, keylen);
	if (err)
		return err;

	/* data cipher, uses Key1 i.e. the first half of *key */
	child = ctx->child;
	crypto_skcipher_clear_flags(child, CRYPTO_TFM_REQ_MASK);
	crypto_skcipher_set_flags(child, crypto_skcipher_get_flags(parent) &
					 CRYPTO_TFM_REQ_MASK);
	return crypto_skcipher_setkey(child, key, keylen);
}

/*
 * We compute the tweak masks twice (both before and after the ECB encryption or
 * decryption) to avoid having to allocate a temporary buffer and/or make
 * multiple calls to the 'ecb(..)' instance, which usually would be slower than
 * just doing the gf128mul_x_ble() calls again.
 *
 * @second_pass selects the post-cipher pass, which walks rctx->subreq instead
 * of @req. When cryptlen is not a multiple of XTS_BLOCK_SIZE, the block
 * handled when fewer than two blocks of the walk remain is special-cased and
 * rctx->t is updated there for later use by xts_cts_final().
 */
static int xts_xor_tweak(struct skcipher_request *req, bool second_pass,
			 bool enc)
{
	struct xts_request_ctx *rctx = skcipher_request_ctx(req);
	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
	const bool cts = (req->cryptlen % XTS_BLOCK_SIZE);
	const int bs = XTS_BLOCK_SIZE;
	struct skcipher_walk w;
	le128 t = rctx->t;
	int err;

	if (second_pass) {
		req = &rctx->subreq;
		/* set to our TFM to enforce correct alignment: */
		skcipher_request_set_tfm(req, tfm);
	}
	err = skcipher_walk_virt(&w, req, false);

	while (w.nbytes) {
		unsigned int avail = w.nbytes;
		const le128 *wsrc;
		le128 *wdst;

		wsrc = w.src.virt.addr;
		wdst = w.dst.virt.addr;

		do {
			if (unlikely(cts) &&
			    w.total - w.nbytes + avail < 2 * XTS_BLOCK_SIZE) {
				if (!enc) {
					/* save the current tweak, use the next one here */
					if (second_pass)
						rctx->t = t;
					gf128mul_x_ble(&t, &t);
				}
				le128_xor(wdst, &t, wsrc);
				if (enc && second_pass)
					gf128mul_x_ble(&rctx->t, &t);
				skcipher_walk_done(&w, avail - bs);
				return 0;
			}

			le128_xor(wdst++, &t, wsrc++);
			gf128mul_x_ble(&t, &t);
		} while ((avail -= bs) >= bs);

		err = skcipher_walk_done(&w, avail);
	}

	return err;
}

/* Tweak XOR before the child cipher runs (operates on @req). */
static int xts_xor_tweak_pre(struct skcipher_request *req, bool enc)
{
	return xts_xor_tweak(req, false, enc);
}

/* Tweak XOR after the child cipher has run (operates on the subrequest). */
static int xts_xor_tweak_post(struct skcipher_request *req, bool enc)
{
	return xts_xor_tweak(req, true, enc);
}

/*
 * Completion callback for the subrequest issued by xts_cts_final(): on
 * success, XOR the block at rctx->tail with rctx->t once more, then complete
 * the parent request with @err.
 */
static void xts_cts_done(void *data, int err)
{
	struct skcipher_request *req = data;
	le128 b;

	if (!err) {
		struct xts_request_ctx *rctx = skcipher_request_ctx(req);

		scatterwalk_map_and_copy(&b, rctx->tail, 0, XTS_BLOCK_SIZE, 0);
		le128_xor(&b, &rctx->t, &b);
		scatterwalk_map_and_copy(&b, rctx->tail, 0, XTS_BLOCK_SIZE, 1);
	}

	skcipher_request_complete(req, err);
}

/*
 * Handle the trailing partial block of a request whose length is not a
 * multiple of XTS_BLOCK_SIZE.
 *
 * rctx->tail is pointed at the last full block of req->dst. That block and
 * the @tail leftover bytes of req->src are combined in b[], XORed with
 * rctx->t, written back, and one block at rctx->tail is run through @crypt
 * on the child cipher. When @crypt returns 0 the final XOR with rctx->t is
 * done here; any non-zero return (including an in-progress status) is passed
 * back to the caller, with xts_cts_done() installed as the completion
 * callback.
 */
static int xts_cts_final(struct skcipher_request *req,
			 int (*crypt)(struct skcipher_request *req))
{
	const struct xts_tfm_ctx *ctx =
		crypto_skcipher_ctx(crypto_skcipher_reqtfm(req));
	int offset = req->cryptlen & ~(XTS_BLOCK_SIZE - 1);
	struct xts_request_ctx *rctx = skcipher_request_ctx(req);
	struct skcipher_request *subreq = &rctx->subreq;
	int tail = req->cryptlen % XTS_BLOCK_SIZE;
	le128 b[2];
	int err;

	rctx->tail = scatterwalk_ffwd(rctx->sg, req->dst,
				      offset - XTS_BLOCK_SIZE);

	scatterwalk_map_and_copy(b, rctx->tail, 0, XTS_BLOCK_SIZE, 0);
	b[1] = b[0];
	scatterwalk_map_and_copy(b, req->src, offset, tail, 0);

	le128_xor(b, &rctx->t, b);

	scatterwalk_map_and_copy(b, rctx->tail, 0, XTS_BLOCK_SIZE + tail, 1);

	skcipher_request_set_tfm(subreq, ctx->child);
	skcipher_request_set_callback(subreq, req->base.flags, xts_cts_done,
				      req);
	skcipher_request_set_crypt(subreq, rctx->tail, rctx->tail,
				   XTS_BLOCK_SIZE, NULL);

	err = crypt(subreq);
	if (err)
		return err;

	scatterwalk_map_and_copy(b, rctx->tail, 0, XTS_BLOCK_SIZE, 0);
	le128_xor(b, &rctx->t, b);
	scatterwalk_map_and_copy(b, rctx->tail, 0, XTS_BLOCK_SIZE, 1);

	return 0;
}

/*
 * Completion callback for the child encryption started by xts_encrypt().
 * On success it runs the post-cipher tweak pass and, for a length that is
 * not a block multiple, xts_cts_final(). If that returns -EINPROGRESS or
 * -EBUSY the parent request is not completed here.
 */
static void xts_encrypt_done(void *data, int err)
{
	struct skcipher_request *req = data;

	if (!err) {
		struct xts_request_ctx *rctx = skcipher_request_ctx(req);

		/* keep only the MAY_BACKLOG bit of the subrequest flags */
		rctx->subreq.base.flags &= CRYPTO_TFM_REQ_MAY_BACKLOG;
		err = xts_xor_tweak_post(req, true);

		if (!err && unlikely(req->cryptlen % XTS_BLOCK_SIZE)) {
			err = xts_cts_final(req, crypto_skcipher_encrypt);
			if (err == -EINPROGRESS || err == -EBUSY)
				return;
		}
	}

	skcipher_request_complete(req, err);
}

/* Decryption counterpart of xts_encrypt_done(). */
static void xts_decrypt_done(void *data, int err)
{
	struct skcipher_request *req = data;

	if (!err) {
		struct xts_request_ctx *rctx = skcipher_request_ctx(req);

		/* keep only the MAY_BACKLOG bit of the subrequest flags */
		rctx->subreq.base.flags &= CRYPTO_TFM_REQ_MAY_BACKLOG;
		err = xts_xor_tweak_post(req, false);

		if (!err && unlikely(req->cryptlen % XTS_BLOCK_SIZE)) {
			err = xts_cts_final(req, crypto_skcipher_decrypt);
			if (err == -EINPROGRESS || err == -EBUSY)
				return;
		}
	}

	skcipher_request_complete(req, err);
}

/*
 * Common request setup for encryption and decryption.
 *
 * Returns -EINVAL when cryptlen is shorter than one block. Otherwise it
 * prepares the subrequest to run on the child cipher in place over req->dst,
 * with the length rounded down to a multiple of XTS_BLOCK_SIZE and @compl as
 * completion callback, and stores the first tweak value in rctx->t.
 */
static int xts_init_crypt(struct skcipher_request *req,
			  crypto_completion_t compl)
{
	const struct xts_tfm_ctx *ctx =
		crypto_skcipher_ctx(crypto_skcipher_reqtfm(req));
	struct xts_request_ctx *rctx = skcipher_request_ctx(req);
	struct skcipher_request *subreq = &rctx->subreq;

	if (req->cryptlen < XTS_BLOCK_SIZE)
		return -EINVAL;

	skcipher_request_set_tfm(subreq, ctx->child);
	skcipher_request_set_callback(subreq, req->base.flags, compl, req);
	skcipher_request_set_crypt(subreq, req->dst, req->dst,
				   req->cryptlen & ~(XTS_BLOCK_SIZE - 1), NULL);

	/* calculate first value of T */
	crypto_cipher_encrypt_one(ctx->tweak, (u8 *)&rctx->t, req->iv);

	return 0;
}

/*
 * Encrypt entry point. The ?: chain runs setup, the pre-cipher tweak pass,
 * the child encryption and the post-cipher tweak pass, stopping at the first
 * step that returns non-zero. The partial-block path is only taken when all
 * steps returned 0 and cryptlen is not a block multiple.
 */
static int xts_encrypt(struct skcipher_request *req)
{
	struct xts_request_ctx *rctx = skcipher_request_ctx(req);
	struct skcipher_request *subreq = &rctx->subreq;
	int err;

	err = xts_init_crypt(req, xts_encrypt_done) ?:
	      xts_xor_tweak_pre(req, true) ?:
	      crypto_skcipher_encrypt(subreq) ?:
	      xts_xor_tweak_post(req, true);

	if (err || likely((req->cryptlen % XTS_BLOCK_SIZE) == 0))
		return err;

	return xts_cts_final(req, crypto_skcipher_encrypt);
}

/* Decrypt entry point; same sequence as xts_encrypt() with enc == false. */
static int xts_decrypt(struct skcipher_request *req)
{
	struct xts_request_ctx *rctx = skcipher_request_ctx(req);
	struct skcipher_request *subreq = &rctx->subreq;
	int err;

	err = xts_init_crypt(req, xts_decrypt_done) ?:
	      xts_xor_tweak_pre(req, false) ?:
	      crypto_skcipher_decrypt(subreq) ?:
	      xts_xor_tweak_post(req, false);

	if (err || likely((req->cryptlen % XTS_BLOCK_SIZE) == 0))
		return err;

	return xts_cts_final(req, crypto_skcipher_decrypt);
}

/*
 * Transform init: instantiate the child skcipher and the tweak cipher from
 * their spawns (freeing the child again if the tweak cipher fails) and set
 * the request size to the child's request size plus struct xts_request_ctx.
 */
static int xts_init_tfm(struct crypto_skcipher *tfm)
{
	struct skcipher_instance *inst = skcipher_alg_instance(tfm);
	struct xts_instance_ctx *ictx = skcipher_instance_ctx(inst);
	struct xts_tfm_ctx *ctx = crypto_skcipher_ctx(tfm);
	struct crypto_skcipher *child;
	struct crypto_cipher *tweak;

	child = crypto_spawn_skcipher(&ictx->spawn);
	if (IS_ERR(child))
		return PTR_ERR(child);

	ctx->child = child;

	tweak = crypto_spawn_cipher(&ictx->tweak_spawn);
	if (IS_ERR(tweak)) {
		crypto_free_skcipher(ctx->child);
		return PTR_ERR(tweak);
	}

	ctx->tweak = tweak;

	crypto_skcipher_set_reqsize(tfm, crypto_skcipher_reqsize(child) +
					 sizeof(struct xts_request_ctx));

	return 0;
}

/* Transform exit: free both ciphers obtained in xts_init_tfm(). */
static void xts_exit_tfm(struct crypto_skcipher *tfm)
{
	struct xts_tfm_ctx *ctx = crypto_skcipher_ctx(tfm);

	crypto_free_skcipher(ctx->child);
	crypto_free_cipher(ctx->tweak);
}

/* Drop both spawns and free the instance. */
static void xts_free_instance(struct skcipher_instance *inst)
{
	struct xts_instance_ctx *ictx = skcipher_instance_ctx(inst);

	crypto_drop_skcipher(&ictx->spawn);
	crypto_drop_cipher(&ictx->tweak_spawn);
	kfree(inst);
}

/*
 * Template ->create: build and register an "xts(...)" instance.
 *
 * The named algorithm is grabbed as an skcipher; if that fails with -ENOENT
 * and the name does not already start with "ecb(", "ecb(<name>)" is tried.
 * The underlying algorithm must have a block size of XTS_BLOCK_SIZE and an
 * ivsize of 0, and its cra_name must have the form "ecb(<cipher>)"; <cipher>
 * is then grabbed as the tweak cipher. Every failure path ends in
 * xts_free_instance().
 */
static int xts_create(struct crypto_template *tmpl, struct rtattr **tb)
{
	struct skcipher_alg_common *alg;
	char name[CRYPTO_MAX_ALG_NAME];
	struct skcipher_instance *inst;
	struct xts_instance_ctx *ctx;
	const char *cipher_name;
	u32 mask;
	int err;

	err = crypto_check_attr_type(tb, CRYPTO_ALG_TYPE_SKCIPHER, &mask);
	if (err)
		return err;

	cipher_name = crypto_attr_alg_name(tb[1]);
	if (IS_ERR(cipher_name))
		return PTR_ERR(cipher_name);

	inst = kzalloc(sizeof(*inst) + sizeof(*ctx), GFP_KERNEL);
	if (!inst)
		return -ENOMEM;

	ctx = skcipher_instance_ctx(inst);

	err = crypto_grab_skcipher(&ctx->spawn, skcipher_crypto_instance(inst),
				   cipher_name, 0, mask);
	if (err == -ENOENT && memcmp(cipher_name, "ecb(", 4)) {
		err = -ENAMETOOLONG;
		if (snprintf(name, CRYPTO_MAX_ALG_NAME, "ecb(%s)",
			     cipher_name) >= CRYPTO_MAX_ALG_NAME)
			goto err_free_inst;

		err = crypto_grab_skcipher(&ctx->spawn,
					   skcipher_crypto_instance(inst),
					   name, 0, mask);
	}

	if (err)
		goto err_free_inst;

	alg = crypto_spawn_skcipher_alg_common(&ctx->spawn);

	err = -EINVAL;
	if (alg->base.cra_blocksize != XTS_BLOCK_SIZE)
		goto err_free_inst;

	if (alg->ivsize)
		goto err_free_inst;

	err = crypto_inst_setname(skcipher_crypto_instance(inst), "xts",
				  &alg->base);
	if (err)
		goto err_free_inst;

	err = -EINVAL;
	cipher_name = alg->base.cra_name;

	/* Alas we screwed up the naming so we have to mangle the
	 * cipher name.
	 */
	if (!memcmp(cipher_name, "ecb(", 4)) {
		int len;

		len = strscpy(name, cipher_name + 4, sizeof(name));
		if (len < 2)
			goto err_free_inst;

		if (name[len - 1] != ')')
			goto err_free_inst;

		/* strip the closing parenthesis to get the bare cipher name */
		name[len - 1] = 0;

		if (snprintf(inst->alg.base.cra_name, CRYPTO_MAX_ALG_NAME,
			     "xts(%s)", name) >= CRYPTO_MAX_ALG_NAME) {
			err = -ENAMETOOLONG;
			goto err_free_inst;
		}
	} else
		goto err_free_inst;

	err = crypto_grab_cipher(&ctx->tweak_spawn,
				 skcipher_crypto_instance(inst), name, 0, mask);
	if (err)
		goto err_free_inst;

	inst->alg.base.cra_priority = alg->base.cra_priority;
	inst->alg.base.cra_blocksize = XTS_BLOCK_SIZE;
	inst->alg.base.cra_alignmask = alg->base.cra_alignmask |
				       (__alignof__(u64) - 1);

	inst->alg.ivsize = XTS_BLOCK_SIZE;
	/* the XTS key is two keys of the underlying cipher (see xts_setkey()) */
	inst->alg.min_keysize = alg->min_keysize * 2;
	inst->alg.max_keysize = alg->max_keysize * 2;

	inst->alg.base.cra_ctxsize = sizeof(struct xts_tfm_ctx);

	inst->alg.init = xts_init_tfm;
	inst->alg.exit = xts_exit_tfm;

	inst->alg.setkey = xts_setkey;
	inst->alg.encrypt = xts_encrypt;
	inst->alg.decrypt = xts_decrypt;

	inst->free = xts_free_instance;

	err = skcipher_register_instance(tmpl, inst);
	if (err) {
		/* label sits inside the block so all error paths share the free */
err_free_inst:
		xts_free_instance(inst);
	}
	return err;
}

static struct crypto_template xts_tmpl = {
	.name = "xts",
	.create = xts_create,
	.module = THIS_MODULE,
};

/* Module load: register the "xts" template. */
static int __init xts_module_init(void)
{
	return crypto_register_template(&xts_tmpl);
}

/* Module unload: unregister the "xts" template. */
static void __exit xts_module_exit(void)
{
	crypto_unregister_template(&xts_tmpl);
}

module_init(xts_module_init);
module_exit(xts_module_exit);

MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("XTS block cipher mode");
MODULE_ALIAS_CRYPTO("xts");
MODULE_IMPORT_NS("CRYPTO_INTERNAL");
MODULE_SOFTDEP("pre: ecb");
187 189 284 282 280 1 277 684 680 682 3 243 15 15 61 63 415 626 672 245 43 114 4 1 198 146 674 77 8 8 2 674 5 1 4 11 11 1 1 2 2 8 8 1 9 7 2 9 674 1 530 680 678 155 9 3 27 2 5 108 156 2 4370 4366 1704 157 4371 2994 313 310 2 4364 1093 274 1 767 768 1 4350 3 1 123 6 1 189 189 624 525 627 2339 2334 1084 1079 849 2148 2061 2064 181 289 280 283 284 284 1213 12 1201 2989 2988 2988 4374 687 156 86 85 27 4911 4903 4841 4954 4445 4412 162 1 4411 4400 4368 4878 768 768 762 4925 143 12 12 12 11 12 12 12 4858 98 84 98 1304 1307 1310 1298 7 1296 1308 1303 186 185 180 185 186 625 13 625 9 9 9 824 835 830 828 832 2105 1257 312 863 864 790 792 791 14 14 2090 1238 43 2111 2091 2086 1231 1244 1235 1238 44 44 44 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 
350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 
850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 // SPDX-License-Identifier: GPL-2.0 /* * NETLINK Netlink attributes * * Authors: Thomas Graf <tgraf@suug.ch> * Alexey Kuznetsov <kuznet@ms2.inr.ac.ru> */ #include <linux/export.h> #include <linux/kernel.h> #include <linux/errno.h> #include <linux/jiffies.h> #include <linux/nospec.h> #include <linux/skbuff.h> #include <linux/string.h> #include <linux/types.h> #include <net/netlink.h> /* For these data types, attribute length should be exactly the given * size. 
However, to maintain compatibility with broken commands, if the * attribute length does not match the expected size a warning is emitted * to the user that the command is sending invalid data and needs to be fixed. */ static const u8 nla_attr_len[NLA_TYPE_MAX+1] = { [NLA_U8] = sizeof(u8), [NLA_U16] = sizeof(u16), [NLA_U32] = sizeof(u32), [NLA_U64] = sizeof(u64), [NLA_S8] = sizeof(s8), [NLA_S16] = sizeof(s16), [NLA_S32] = sizeof(s32), [NLA_S64] = sizeof(s64), [NLA_BE16] = sizeof(__be16), [NLA_BE32] = sizeof(__be32), }; static const u8 nla_attr_minlen[NLA_TYPE_MAX+1] = { [NLA_U8] = sizeof(u8), [NLA_U16] = sizeof(u16), [NLA_U32] = sizeof(u32), [NLA_U64] = sizeof(u64), [NLA_MSECS] = sizeof(u64), [NLA_NESTED] = NLA_HDRLEN, [NLA_S8] = sizeof(s8), [NLA_S16] = sizeof(s16), [NLA_S32] = sizeof(s32), [NLA_S64] = sizeof(s64), [NLA_BE16] = sizeof(__be16), [NLA_BE32] = sizeof(__be32), }; /* * Nested policies might refer back to the original * policy in some cases, and userspace could try to * abuse that and recurse by nesting in the right * ways. Limit recursion to avoid this problem. 
*/
#define MAX_POLICY_RECURSION_DEPTH 10

/* Forward declaration: validate_nla() and __nla_validate_parse() recurse into each other. */
static int __nla_validate_parse(const struct nlattr *head, int len, int maxtype,
				const struct nla_policy *policy,
				unsigned int validate,
				struct netlink_ext_ack *extack,
				struct nlattr **tb, unsigned int depth);

/*
 * Check a bitfield32 payload against the mask of bits the policy allows.
 * Returns 0 if acceptable, -EINVAL otherwise (including an empty mask).
 */
static int validate_nla_bitfield32(const struct nlattr *nla,
				   const u32 valid_flags_mask)
{
	const struct nla_bitfield32 *bf = nla_data(nla);

	if (!valid_flags_mask)
		return -EINVAL;

	/* disallow invalid bit selector */
	if (bf->selector & ~valid_flags_mask)
		return -EINVAL;

	/* disallow invalid bit values */
	if (bf->value & ~valid_flags_mask)
		return -EINVAL;

	/* disallow valid bit values that are not selected */
	if (bf->value & ~bf->selector)
		return -EINVAL;

	return 0;
}

/*
 * Validate every element of a nested array: each non-empty element is
 * itself an attribute stream that is checked against @policy one
 * recursion level deeper. Returns 0 or the first negative error.
 */
static int nla_validate_array(const struct nlattr *head, int len, int maxtype,
			      const struct nla_policy *policy,
			      struct netlink_ext_ack *extack,
			      unsigned int validate, unsigned int depth)
{
	const struct nlattr *entry;
	int rem;

	nla_for_each_attr(entry, head, len, rem) {
		int ret;

		/* Empty elements are skipped, not rejected. */
		if (nla_len(entry) == 0)
			continue;

		if (nla_len(entry) < NLA_HDRLEN) {
			NL_SET_ERR_MSG_ATTR_POL(extack, entry, policy,
						"Array element too short");
			return -ERANGE;
		}

		ret = __nla_validate_parse(nla_data(entry), nla_len(entry),
					   maxtype, policy, validate, extack,
					   NULL, depth + 1);
		if (ret < 0)
			return ret;
	}

	return 0;
}

/*
 * Fill @range with the unsigned bounds that apply to policy entry @pt:
 * first the natural limits for the attribute type, then whatever the
 * entry's validation_type narrows them to. For an unsupported type a
 * one-time warning is issued and only range->min (0) has been written.
 */
void nla_get_range_unsigned(const struct nla_policy *pt,
			    struct netlink_range_validation *range)
{
	WARN_ON_ONCE(pt->validation_type != NLA_VALIDATE_RANGE_PTR &&
		     (pt->min < 0 || pt->max < 0));

	range->min = 0;

	switch (pt->type) {
	case NLA_U8:
		range->max = U8_MAX;
		break;
	case NLA_U16:
	case NLA_BE16:
	case NLA_BINARY:
		range->max = U16_MAX;
		break;
	case NLA_U32:
	case NLA_BE32:
		range->max = U32_MAX;
		break;
	case NLA_U64:
	case NLA_UINT:
	case NLA_MSECS:
		range->max = U64_MAX;
		break;
	default:
		WARN_ON_ONCE(1);
		return;
	}

	switch (pt->validation_type) {
	case NLA_VALIDATE_RANGE:
	case NLA_VALIDATE_RANGE_WARN_TOO_LONG:
		range->min = pt->min;
		range->max = pt->max;
		break;
	case NLA_VALIDATE_RANGE_PTR:
		*range = *pt->range;
		break;
	case NLA_VALIDATE_MIN:
		range->min = pt->min;
		break;
	case NLA_VALIDATE_MAX:
		range->max = pt->max;
		break;
	default:
		break;
	}
}

/*
 * Range-check an unsigned attribute (or, for NLA_BINARY, its length).
 * Returns 0 if within range, -ERANGE if outside, -EINVAL for a type this
 * helper does not handle or for an over-long binary under strict checking.
 */
static int nla_validate_range_unsigned(const struct nla_policy *pt,
				       const struct nlattr *nla,
				       struct netlink_ext_ack *extack,
				       unsigned int validate)
{
	struct netlink_range_validation range;
	u64 value;

	switch (pt->type) {
	case NLA_U8:
		value = nla_get_u8(nla);
		break;
	case NLA_U16:
		value = nla_get_u16(nla);
		break;
	case NLA_U32:
		value = nla_get_u32(nla);
		break;
	case NLA_U64:
		value = nla_get_u64(nla);
		break;
	case NLA_UINT:
		value = nla_get_uint(nla);
		break;
	case NLA_MSECS:
		value = nla_get_u64(nla);
		break;
	case NLA_BINARY:
		/* For binary attributes the checked quantity is the length. */
		value = nla_len(nla);
		break;
	case NLA_BE16:
		value = ntohs(nla_get_be16(nla));
		break;
	case NLA_BE32:
		value = ntohl(nla_get_be32(nla));
		break;
	default:
		return -EINVAL;
	}

	nla_get_range_unsigned(pt, &range);

	if (pt->validation_type == NLA_VALIDATE_RANGE_WARN_TOO_LONG &&
	    pt->type == NLA_BINARY && value > range.max) {
		pr_warn_ratelimited("netlink: '%s': attribute type %d has an invalid length.\n",
				    current->comm, pt->type);
		if (validate & NL_VALIDATE_STRICT_ATTRS) {
			NL_SET_ERR_MSG_ATTR_POL(extack, nla, pt,
						"invalid attribute length");
			return -EINVAL;
		}

		/* this assumes min <= max (don't validate against min) */
		return 0;
	}

	if (value < range.min || value > range.max) {
		bool binary = pt->type == NLA_BINARY;

		if (binary)
			NL_SET_ERR_MSG_ATTR_POL(extack, nla, pt,
						"binary attribute size out of range");
		else
			NL_SET_ERR_MSG_ATTR_POL(extack, nla, pt,
						"integer out of range");

		return -ERANGE;
	}

	return 0;
}

/*
 * Signed counterpart of nla_get_range_unsigned(): start from the type's
 * natural limits, then apply the entry's validation_type. For an
 * unsupported type a one-time warning is issued and @range is untouched.
 */
void nla_get_range_signed(const struct nla_policy *pt,
			  struct netlink_range_validation_signed *range)
{
	switch (pt->type) {
	case NLA_S8:
		range->min = S8_MIN;
		range->max = S8_MAX;
		break;
	case NLA_S16:
		range->min = S16_MIN;
		range->max = S16_MAX;
		break;
	case NLA_S32:
		range->min = S32_MIN;
		range->max = S32_MAX;
		break;
	case NLA_S64:
	case NLA_SINT:
		range->min = S64_MIN;
		range->max = S64_MAX;
		break;
	default:
		WARN_ON_ONCE(1);
		return;
	}

	switch (pt->validation_type) {
	case NLA_VALIDATE_RANGE:
		range->min = pt->min;
		range->max = pt->max;
		break;
	case NLA_VALIDATE_RANGE_PTR:
		*range = *pt->range_signed;
		break;
	case NLA_VALIDATE_MIN:
		range->min = pt->min;
		break;
	case NLA_VALIDATE_MAX:
		range->max = pt->max;
		break;
	default:
		break;
	}
}

/*
 * Range-check a signed integer attribute. Returns 0 if within range,
 * -ERANGE if outside, -EINVAL for a type this helper does not handle.
 */
static int nla_validate_int_range_signed(const struct nla_policy *pt,
					 const struct nlattr *nla,
					 struct netlink_ext_ack *extack)
{
	struct netlink_range_validation_signed range;
	s64 value;

	switch (pt->type) {
	case NLA_S8:
		value = nla_get_s8(nla);
		break;
	case NLA_S16:
		value = nla_get_s16(nla);
		break;
	case NLA_S32:
		value = nla_get_s32(nla);
		break;
	case NLA_S64:
		value = nla_get_s64(nla);
		break;
	case NLA_SINT:
		value = nla_get_sint(nla);
		break;
	default:
		return -EINVAL;
	}

	nla_get_range_signed(pt, &range);

	if (value < range.min || value > range.max) {
		NL_SET_ERR_MSG_ATTR_POL(extack, nla, pt,
					"integer out of range");
		return -ERANGE;
	}

	return 0;
}

/*
 * Dispatch a range check to the unsigned or signed helper according to
 * the policy entry's type; any other type warns and yields -EINVAL.
 */
static int nla_validate_int_range(const struct nla_policy *pt,
				  const struct nlattr *nla,
				  struct netlink_ext_ack *extack,
				  unsigned int validate)
{
	switch (pt->type) {
	case NLA_U8:
	case NLA_U16:
	case NLA_U32:
	case NLA_U64:
	case NLA_UINT:
	case NLA_MSECS:
	case NLA_BINARY:
	case NLA_BE16:
	case NLA_BE32:
		return nla_validate_range_unsigned(pt, nla, extack, validate);
	case NLA_S8:
	case NLA_S16:
	case NLA_S32:
	case NLA_S64:
	case NLA_SINT:
		return nla_validate_int_range_signed(pt, nla, extack);
	default:
		WARN_ON(1);
		return -EINVAL;
	}
}

/*
 * Reject an unsigned integer attribute that has any bit set outside
 * pt->mask. Returns 0 if acceptable, -EINVAL otherwise (also for types
 * this helper does not handle).
 */
static int nla_validate_mask(const struct nla_policy *pt,
			     const struct nlattr *nla,
			     struct netlink_ext_ack *extack)
{
	u64 value;

	switch (pt->type) {
	case NLA_U8:
		value = nla_get_u8(nla);
		break;
	case NLA_U16:
		value = nla_get_u16(nla);
		break;
	case NLA_U32:
		value = nla_get_u32(nla);
		break;
	case NLA_U64:
		value = nla_get_u64(nla);
		break;
	case NLA_UINT:
		value = nla_get_uint(nla);
		break;
	case NLA_BE16:
		value = ntohs(nla_get_be16(nla));
		break;
	case NLA_BE32:
		value = ntohl(nla_get_be32(nla));
		break;
	default:
		return -EINVAL;
	}

	if (value & ~(u64)pt->mask) {
		NL_SET_ERR_MSG_ATTR(extack, nla, "reserved bit set");
		return -EINVAL;
	}

	return 0;
}

/*
 * Validate one attribute against its policy entry.
 *
 * Stage 1 checks the payload length / structure for the entry's type.
 * Stage 2 applies the entry's validation_type (range, mask or callback).
 * Attributes whose type is <= 0 or above @maxtype are accepted as-is.
 *
 * Returns 0 on success or a negative errno. Paths that jump to out_err
 * report the generic "Attribute failed policy validation" message with
 * the current value of err (-ERANGE unless overridden).
 */
static int validate_nla(const struct nlattr *nla, int maxtype,
			const struct nla_policy *policy, unsigned int validate,
			struct netlink_ext_ack *extack, unsigned int depth)
{
	u16 strict_start_type = policy[0].strict_start_type;
	const struct nla_policy *pt;
	int minlen = 0, attrlen = nla_len(nla), type = nla_type(nla);
	int err = -ERANGE;

	/* Types at or past strict_start_type always get strict checking. */
	if (strict_start_type && type >= strict_start_type)
		validate |= NL_VALIDATE_STRICT;

	if (type <= 0 || type > maxtype)
		return 0;

	/* Clamp the index under speculation before using it on the policy. */
	type = array_index_nospec(type, maxtype + 1);
	pt = &policy[type];

	BUG_ON(pt->type > NLA_TYPE_MAX);

	/*
	 * Exact-length types: a mismatch is always logged, but is only an
	 * error when strict attribute checking was requested.
	 */
	if (nla_attr_len[pt->type] && attrlen != nla_attr_len[pt->type]) {
		pr_warn_ratelimited("netlink: '%s': attribute type %d has an invalid length.\n",
				    current->comm, type);
		if (validate & NL_VALIDATE_STRICT_ATTRS) {
			NL_SET_ERR_MSG_ATTR_POL(extack, nla, pt,
						"invalid attribute length");
			return -EINVAL;
		}
	}

	/* The NLA_F_NESTED flag must agree with the policy type. */
	if (validate & NL_VALIDATE_NESTED) {
		if ((pt->type == NLA_NESTED || pt->type == NLA_NESTED_ARRAY) &&
		    !(nla->nla_type & NLA_F_NESTED)) {
			NL_SET_ERR_MSG_ATTR_POL(extack, nla, pt,
						"NLA_F_NESTED is missing");
			return -EINVAL;
		}
		if (pt->type != NLA_NESTED && pt->type != NLA_NESTED_ARRAY &&
		    pt->type != NLA_UNSPEC && (nla->nla_type & NLA_F_NESTED)) {
			NL_SET_ERR_MSG_ATTR_POL(extack, nla, pt,
						"NLA_F_NESTED not expected");
			return -EINVAL;
		}
	}

	switch (pt->type) {
	case NLA_REJECT:
		/* Prefer the policy's own message when one can be reported. */
		if (extack && pt->reject_message) {
			NL_SET_BAD_ATTR(extack, nla);
			extack->_msg = pt->reject_message;
			return -EINVAL;
		}
		err = -EINVAL;
		goto out_err;

	case NLA_FLAG:
		if (attrlen > 0)
			goto out_err;
		break;

	case NLA_SINT:
	case NLA_UINT:
		/* Variable-width integers are either 32 or 64 bits wide. */
		if (attrlen != sizeof(u32) && attrlen != sizeof(u64)) {
			NL_SET_ERR_MSG_ATTR_POL(extack, nla, pt,
						"invalid attribute length");
			return -EINVAL;
		}
		break;

	case NLA_BITFIELD32:
		if (attrlen != sizeof(struct nla_bitfield32))
			goto out_err;

		err = validate_nla_bitfield32(nla, pt->bitfield32_valid);
		if (err)
			goto out_err;
		break;

	case NLA_NUL_STRING:
		/* A NUL must appear within the first min(attrlen, len + 1) bytes. */
		if (pt->len)
			minlen = min_t(int, attrlen, pt->len + 1);
		else
			minlen = attrlen;

		if (!minlen || memchr(nla_data(nla), '\0', minlen) == NULL) {
			err = -EINVAL;
			goto out_err;
		}
		fallthrough;

	case NLA_STRING:
		if (attrlen < 1)
			goto out_err;

		if (pt->len) {
			char *buf = nla_data(nla);

			/* A single trailing NUL does not count against pt->len. */
			if (buf[attrlen - 1] == '\0')
				attrlen--;

			if (attrlen > pt->len)
				goto out_err;
		}
		break;

	case NLA_BINARY:
		if (pt->len && attrlen > pt->len)
			goto out_err;
		break;

	case NLA_NESTED:
		/* a nested attribute is allowed to be empty; if it's not,
		 * it must have a size of at least NLA_HDRLEN.
		 */
		if (attrlen == 0)
			break;
		if (attrlen < NLA_HDRLEN)
			goto out_err;
		if (pt->nested_policy) {
			err = __nla_validate_parse(nla_data(nla), nla_len(nla),
						   pt->len, pt->nested_policy,
						   validate, extack, NULL,
						   depth + 1);
			if (err < 0) {
				/*
				 * return directly to preserve the inner
				 * error message/attribute pointer
				 */
				return err;
			}
		}
		break;
	case NLA_NESTED_ARRAY:
		/* a nested array attribute is allowed to be empty; if it's not,
		 * it must have a size of at least NLA_HDRLEN.
		 */
		if (attrlen == 0)
			break;
		if (attrlen < NLA_HDRLEN)
			goto out_err;
		if (pt->nested_policy) {
			/* NOTE(review): this local shadows the function-level err. */
			int err;

			/*
			 * depth is passed unchanged here; nla_validate_array()
			 * adds one itself for each element it descends into.
			 */
			err = nla_validate_array(nla_data(nla), nla_len(nla),
						 pt->len, pt->nested_policy,
						 extack, validate, depth);
			if (err < 0) {
				/*
				 * return directly to preserve the inner
				 * error message/attribute pointer
				 */
				return err;
			}
		}
		break;

	case NLA_UNSPEC:
		if (validate & NL_VALIDATE_UNSPEC) {
			NL_SET_ERR_MSG_ATTR(extack, nla,
					    "Unsupported attribute");
			return -EINVAL;
		}
		if (attrlen < pt->len)
			goto out_err;
		break;

	default:
		/* Remaining types: enforce a minimum length only. */
		if (pt->len)
			minlen = pt->len;
		else
			minlen = nla_attr_minlen[pt->type];

		if (attrlen < minlen)
			goto out_err;
	}

	/* further validation */
	switch (pt->validation_type) {
	case NLA_VALIDATE_NONE:
		/* nothing to do */
		break;
	case NLA_VALIDATE_RANGE_PTR:
	case NLA_VALIDATE_RANGE:
	case NLA_VALIDATE_RANGE_WARN_TOO_LONG:
	case NLA_VALIDATE_MIN:
	case NLA_VALIDATE_MAX:
		err = nla_validate_int_range(pt, nla, extack, validate);
		if (err)
			return err;
		break;
	case NLA_VALIDATE_MASK:
		err = nla_validate_mask(pt, nla, extack);
		if (err)
			return err;
		break;
	case NLA_VALIDATE_FUNCTION:
		if (pt->validate) {
			err = pt->validate(nla, extack);
			if (err)
				return err;
		}
		break;
	}

	return 0;
out_err:
	NL_SET_ERR_MSG_ATTR_POL(extack, nla, pt,
				"Attribute failed policy validation");
	return err;
}

/*
 * Common worker behind __nla_validate() and __nla_parse(): walk the
 * attribute stream, validate each known attribute against @policy (when
 * given) and record it in @tb (when given).
 *
 * Returns 0 on success, -EINVAL when @depth has reached
 * MAX_POLICY_RECURSION_DEPTH, or the first negative error encountered.
 */
static int __nla_validate_parse(const struct nlattr *head, int len, int maxtype,
				const struct nla_policy *policy,
				unsigned int validate,
				struct netlink_ext_ack *extack,
				struct nlattr **tb, unsigned int depth)
{
	const struct nlattr *nla;
	int rem;

	if (depth >= MAX_POLICY_RECURSION_DEPTH) {
		NL_SET_ERR_MSG(extack,
			       "allowed policy recursion depth exceeded");
		return -EINVAL;
	}

	if (tb)
		memset(tb, 0, sizeof(struct nlattr *) * (maxtype + 1));

	nla_for_each_attr(nla, head, len, rem) {
		u16 type = nla_type(nla);

		/* Unknown types are skipped unless NL_VALIDATE_MAXTYPE is set. */
		if (type == 0 || type > maxtype) {
			if (validate & NL_VALIDATE_MAXTYPE) {
				NL_SET_ERR_MSG_ATTR(extack, nla,
						    "Unknown attribute type");
				return -EINVAL;
			}
			continue;
		}
		type = array_index_nospec(type, maxtype + 1);
		if (policy) {
			int err = validate_nla(nla, maxtype, policy,
					       validate, extack, depth);

			if (err < 0)
				return err;
		}

		/* A later attribute of the same type replaces an earlier one. */
		if (tb)
			tb[type] = (struct nlattr *)nla;
	}

	/* Trailing bytes are always logged; fatal only with NL_VALIDATE_TRAILING. */
	if (unlikely(rem > 0)) {
		pr_warn_ratelimited("netlink: %d bytes leftover after parsing attributes in process `%s'.\n",
				    rem, current->comm);
		NL_SET_ERR_MSG(extack, "bytes leftover after parsing attributes");
		if (validate & NL_VALIDATE_TRAILING)
			return -EINVAL;
	}

	return 0;
}

/**
 * __nla_validate - Validate a stream of attributes
 * @head: head of attribute stream
 * @len: length of attribute stream
 * @maxtype: maximum attribute type to be expected
 * @policy: validation policy
 * @validate: validation strictness
 * @extack: extended ACK report struct
 *
 * Validates all attributes in the specified attribute stream against the
 * specified policy. Validation depends on the validate flags passed, see
 * &enum netlink_validation for more details on that.
 * See documentation of struct nla_policy for more details.
 *
 * Returns 0 on success or a negative error code.
 */
int __nla_validate(const struct nlattr *head, int len, int maxtype,
		   const struct nla_policy *policy, unsigned int validate,
		   struct netlink_ext_ack *extack)
{
	return __nla_validate_parse(head, len, maxtype, policy, validate,
				    extack, NULL, 0);
}
EXPORT_SYMBOL(__nla_validate);

/**
 * nla_policy_len - Determine the max. length of a policy
 * @p: policy to use
 * @n: number of policies
 *
 * Determines the max. length of the policy. It is currently used
 * to allocate Netlink buffers roughly the size of the actual
 * message.
 *
 * Returns the summed total size of all policy entries that have a length.
*/ int nla_policy_len(const struct nla_policy *p, int n) { int i, len = 0; for (i = 0; i < n; i++, p++) { if (p->len) len += nla_total_size(p->len); else if (nla_attr_len[p->type]) len += nla_total_size(nla_attr_len[p->type]); else if (nla_attr_minlen[p->type]) len += nla_total_size(nla_attr_minlen[p->type]); } return len; } EXPORT_SYMBOL(nla_policy_len); /** * __nla_parse - Parse a stream of attributes into a tb buffer * @tb: destination array with maxtype+1 elements * @maxtype: maximum attribute type to be expected * @head: head of attribute stream * @len: length of attribute stream * @policy: validation policy * @validate: validation strictness * @extack: extended ACK pointer * * Parses a stream of attributes and stores a pointer to each attribute in * the tb array accessible via the attribute type. * Validation is controlled by the @validate parameter. * * Returns 0 on success or a negative error code. */ int __nla_parse(struct nlattr **tb, int maxtype, const struct nlattr *head, int len, const struct nla_policy *policy, unsigned int validate, struct netlink_ext_ack *extack) { return __nla_validate_parse(head, len, maxtype, policy, validate, extack, tb, 0); } EXPORT_SYMBOL(__nla_parse); /** * nla_find - Find a specific attribute in a stream of attributes * @head: head of attribute stream * @len: length of attribute stream * @attrtype: type of attribute to look for * * Returns the first attribute in the stream matching the specified type. */ struct nlattr *nla_find(const struct nlattr *head, int len, int attrtype) { const struct nlattr *nla; int rem; nla_for_each_attr(nla, head, len, rem) if (nla_type(nla) == attrtype) return (struct nlattr *)nla; return NULL; } EXPORT_SYMBOL(nla_find); /** * nla_strscpy - Copy string attribute payload into a sized buffer * @dst: Where to copy the string to. * @nla: Attribute to copy the string from. * @dstsize: Size of destination buffer. * * Copies at most dstsize - 1 bytes into the destination buffer. 
* Unlike strscpy() the destination buffer is always padded out. * * Return: * * srclen - Returns @nla length (not including the trailing %NUL). * * -E2BIG - If @dstsize is 0 or greater than U16_MAX or @nla length greater * than @dstsize. */ ssize_t nla_strscpy(char *dst, const struct nlattr *nla, size_t dstsize) { size_t srclen = nla_len(nla); char *src = nla_data(nla); ssize_t ret; size_t len; if (dstsize == 0 || WARN_ON_ONCE(dstsize > U16_MAX)) return -E2BIG; if (srclen > 0 && src[srclen - 1] == '\0') srclen--; if (srclen >= dstsize) { len = dstsize - 1; ret = -E2BIG; } else { len = srclen; ret = len; } memcpy(dst, src, len); /* Zero pad end of dst. */ memset(dst + len, 0, dstsize - len); return ret; } EXPORT_SYMBOL(nla_strscpy); /** * nla_strdup - Copy string attribute payload into a newly allocated buffer * @nla: attribute to copy the string from * @flags: the type of memory to allocate (see kmalloc). * * Returns a pointer to the allocated buffer or NULL on error. */ char *nla_strdup(const struct nlattr *nla, gfp_t flags) { size_t srclen = nla_len(nla); char *src = nla_data(nla), *dst; if (srclen > 0 && src[srclen - 1] == '\0') srclen--; dst = kmalloc(srclen + 1, flags); if (dst != NULL) { memcpy(dst, src, srclen); dst[srclen] = '\0'; } return dst; } EXPORT_SYMBOL(nla_strdup); /** * nla_memcpy - Copy a netlink attribute into another memory area * @dest: where to copy to memcpy * @src: netlink attribute to copy from * @count: size of the destination area * * Note: The number of bytes copied is limited by the length of * attribute's payload. memcpy * * Returns the number of bytes copied. 
*/ int nla_memcpy(void *dest, const struct nlattr *src, int count) { int minlen = min_t(int, count, nla_len(src)); memcpy(dest, nla_data(src), minlen); if (count > minlen) memset(dest + minlen, 0, count - minlen); return minlen; } EXPORT_SYMBOL(nla_memcpy); /** * nla_memcmp - Compare an attribute with sized memory area * @nla: netlink attribute * @data: memory area * @size: size of memory area */ int nla_memcmp(const struct nlattr *nla, const void *data, size_t size) { int d = nla_len(nla) - size; if (d == 0) d = memcmp(nla_data(nla), data, size); return d; } EXPORT_SYMBOL(nla_memcmp); /** * nla_strcmp - Compare a string attribute against a string * @nla: netlink string attribute * @str: another string */ int nla_strcmp(const struct nlattr *nla, const char *str) { int len = strlen(str); char *buf = nla_data(nla); int attrlen = nla_len(nla); int d; while (attrlen > 0 && buf[attrlen - 1] == '\0') attrlen--; d = attrlen - len; if (d == 0) d = memcmp(nla_data(nla), str, len); return d; } EXPORT_SYMBOL(nla_strcmp); #ifdef CONFIG_NET /** * __nla_reserve - reserve room for attribute on the skb * @skb: socket buffer to reserve room on * @attrtype: attribute type * @attrlen: length of attribute payload * * Adds a netlink attribute header to a socket buffer and reserves * room for the payload but does not copy it. * * The caller is responsible to ensure that the skb provides enough * tailroom for the attribute header and payload. 
*/ struct nlattr *__nla_reserve(struct sk_buff *skb, int attrtype, int attrlen) { struct nlattr *nla; nla = skb_put(skb, nla_total_size(attrlen)); nla->nla_type = attrtype; nla->nla_len = nla_attr_size(attrlen); memset((unsigned char *) nla + nla->nla_len, 0, nla_padlen(attrlen)); return nla; } EXPORT_SYMBOL(__nla_reserve); /** * __nla_reserve_64bit - reserve room for attribute on the skb and align it * @skb: socket buffer to reserve room on * @attrtype: attribute type * @attrlen: length of attribute payload * @padattr: attribute type for the padding * * Adds a netlink attribute header to a socket buffer and reserves * room for the payload but does not copy it. It also ensure that this * attribute will have a 64-bit aligned nla_data() area. * * The caller is responsible to ensure that the skb provides enough * tailroom for the attribute header and payload. */ struct nlattr *__nla_reserve_64bit(struct sk_buff *skb, int attrtype, int attrlen, int padattr) { nla_align_64bit(skb, padattr); return __nla_reserve(skb, attrtype, attrlen); } EXPORT_SYMBOL(__nla_reserve_64bit); /** * __nla_reserve_nohdr - reserve room for attribute without header * @skb: socket buffer to reserve room on * @attrlen: length of attribute payload * * Reserves room for attribute payload without a header. * * The caller is responsible to ensure that the skb provides enough * tailroom for the payload. */ void *__nla_reserve_nohdr(struct sk_buff *skb, int attrlen) { return skb_put_zero(skb, NLA_ALIGN(attrlen)); } EXPORT_SYMBOL(__nla_reserve_nohdr); /** * nla_reserve - reserve room for attribute on the skb * @skb: socket buffer to reserve room on * @attrtype: attribute type * @attrlen: length of attribute payload * * Adds a netlink attribute header to a socket buffer and reserves * room for the payload but does not copy it. * * Returns NULL if the tailroom of the skb is insufficient to store * the attribute header and payload. 
*/ struct nlattr *nla_reserve(struct sk_buff *skb, int attrtype, int attrlen) { if (unlikely(skb_tailroom(skb) < nla_total_size(attrlen))) return NULL; return __nla_reserve(skb, attrtype, attrlen); } EXPORT_SYMBOL(nla_reserve); /** * nla_reserve_64bit - reserve room for attribute on the skb and align it * @skb: socket buffer to reserve room on * @attrtype: attribute type * @attrlen: length of attribute payload * @padattr: attribute type for the padding * * Adds a netlink attribute header to a socket buffer and reserves * room for the payload but does not copy it. It also ensure that this * attribute will have a 64-bit aligned nla_data() area. * * Returns NULL if the tailroom of the skb is insufficient to store * the attribute header and payload. */ struct nlattr *nla_reserve_64bit(struct sk_buff *skb, int attrtype, int attrlen, int padattr) { size_t len; if (nla_need_padding_for_64bit(skb)) len = nla_total_size_64bit(attrlen); else len = nla_total_size(attrlen); if (unlikely(skb_tailroom(skb) < len)) return NULL; return __nla_reserve_64bit(skb, attrtype, attrlen, padattr); } EXPORT_SYMBOL(nla_reserve_64bit); /** * nla_reserve_nohdr - reserve room for attribute without header * @skb: socket buffer to reserve room on * @attrlen: length of attribute payload * * Reserves room for attribute payload without a header. * * Returns NULL if the tailroom of the skb is insufficient to store * the attribute payload. */ void *nla_reserve_nohdr(struct sk_buff *skb, int attrlen) { if (unlikely(skb_tailroom(skb) < NLA_ALIGN(attrlen))) return NULL; return __nla_reserve_nohdr(skb, attrlen); } EXPORT_SYMBOL(nla_reserve_nohdr); /** * __nla_put - Add a netlink attribute to a socket buffer * @skb: socket buffer to add attribute to * @attrtype: attribute type * @attrlen: length of attribute payload * @data: head of attribute payload * * The caller is responsible to ensure that the skb provides enough * tailroom for the attribute header and payload. 
*/ void __nla_put(struct sk_buff *skb, int attrtype, int attrlen, const void *data) { struct nlattr *nla; nla = __nla_reserve(skb, attrtype, attrlen); memcpy(nla_data(nla), data, attrlen); } EXPORT_SYMBOL(__nla_put); /** * __nla_put_64bit - Add a netlink attribute to a socket buffer and align it * @skb: socket buffer to add attribute to * @attrtype: attribute type * @attrlen: length of attribute payload * @data: head of attribute payload * @padattr: attribute type for the padding * * The caller is responsible to ensure that the skb provides enough * tailroom for the attribute header and payload. */ void __nla_put_64bit(struct sk_buff *skb, int attrtype, int attrlen, const void *data, int padattr) { struct nlattr *nla; nla = __nla_reserve_64bit(skb, attrtype, attrlen, padattr); memcpy(nla_data(nla), data, attrlen); } EXPORT_SYMBOL(__nla_put_64bit); /** * __nla_put_nohdr - Add a netlink attribute without header * @skb: socket buffer to add attribute to * @attrlen: length of attribute payload * @data: head of attribute payload * * The caller is responsible to ensure that the skb provides enough * tailroom for the attribute payload. */ void __nla_put_nohdr(struct sk_buff *skb, int attrlen, const void *data) { void *start; start = __nla_reserve_nohdr(skb, attrlen); memcpy(start, data, attrlen); } EXPORT_SYMBOL(__nla_put_nohdr); /** * nla_put - Add a netlink attribute to a socket buffer * @skb: socket buffer to add attribute to * @attrtype: attribute type * @attrlen: length of attribute payload * @data: head of attribute payload * * Returns -EMSGSIZE if the tailroom of the skb is insufficient to store * the attribute header and payload. 
*/ int nla_put(struct sk_buff *skb, int attrtype, int attrlen, const void *data) { if (unlikely(skb_tailroom(skb) < nla_total_size(attrlen))) return -EMSGSIZE; __nla_put(skb, attrtype, attrlen, data); return 0; } EXPORT_SYMBOL(nla_put); /** * nla_put_64bit - Add a netlink attribute to a socket buffer and align it * @skb: socket buffer to add attribute to * @attrtype: attribute type * @attrlen: length of attribute payload * @data: head of attribute payload * @padattr: attribute type for the padding * * Returns -EMSGSIZE if the tailroom of the skb is insufficient to store * the attribute header and payload. */ int nla_put_64bit(struct sk_buff *skb, int attrtype, int attrlen, const void *data, int padattr) { size_t len; if (nla_need_padding_for_64bit(skb)) len = nla_total_size_64bit(attrlen); else len = nla_total_size(attrlen); if (unlikely(skb_tailroom(skb) < len)) return -EMSGSIZE; __nla_put_64bit(skb, attrtype, attrlen, data, padattr); return 0; } EXPORT_SYMBOL(nla_put_64bit); /** * nla_put_nohdr - Add a netlink attribute without header * @skb: socket buffer to add attribute to * @attrlen: length of attribute payload * @data: head of attribute payload * * Returns -EMSGSIZE if the tailroom of the skb is insufficient to store * the attribute payload. */ int nla_put_nohdr(struct sk_buff *skb, int attrlen, const void *data) { if (unlikely(skb_tailroom(skb) < NLA_ALIGN(attrlen))) return -EMSGSIZE; __nla_put_nohdr(skb, attrlen, data); return 0; } EXPORT_SYMBOL(nla_put_nohdr); /** * nla_append - Add a netlink attribute without header or padding * @skb: socket buffer to add attribute to * @attrlen: length of attribute payload * @data: head of attribute payload * * Returns -EMSGSIZE if the tailroom of the skb is insufficient to store * the attribute payload. 
*/ int nla_append(struct sk_buff *skb, int attrlen, const void *data) { if (unlikely(skb_tailroom(skb) < NLA_ALIGN(attrlen))) return -EMSGSIZE; skb_put_data(skb, data, attrlen); return 0; } EXPORT_SYMBOL(nla_append); #endif
199 198 199 194 199 362 142 1 142 143 37 37 8 8 37 360 363 223 362 356 179 144 37 364 342 191 163 343 344 333 331 329 335 11 5 344 333 185 327 364 362 179 178 179 177 363 364 178 358 301 302 300 303 298 303 302 318 317 1 1 1 318 302 3 302 59 162 162 59 47 187 236 237 186 186 110 109 120 90 4 4 4 4 2 2 2 2 1 2 1 1 1 2 3 3 3 3 51 53 53 53 52 51 51 54 7 7 34 33 34 34 33 35 35 34 35 7 7 409 410 410 402 409 42 42 408 407 411 411 406 215 214 214 214 215 1 357 358 355 185 353 357 356 86 4 84 261 262 151 262 262 262 262 258 6 6 551 553 427 256 17 12 156 156 197 196 196 1 25 141 142 126 142 145 145 20 142 142 141 3 3 145 1 1 1 1 1 1 1 145 304 299 303 301 303 325 326 145 323 324 302 301 304 14 304 303 304 300 293 304 302 166 304 297 302 302 299 304 300 301 302 414 304 304 17 304 304 167 417 376 161 161 4 418 304 142 143 141 142 9 7 19 3 19 4 2 2 2 2 4 2 2 2 4 9 6 9 9 9 6 9 416 413 416 413 196 195 3 195 195 24 23 24 24 22 22 22 2 2 2 9 6 8 8 8 8 9 6 9 7 8 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 
288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 
788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 
1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 // SPDX-License-Identifier: GPL-2.0-or-later /* * 
net/sched/sch_generic.c	Generic packet scheduler routines.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *              Jamal Hadi Salim, <hadi@cyberus.ca> 990601
 *              - Ingress support
 */

#include <linux/bitops.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
#include <linux/init.h>
#include <linux/rcupdate.h>
#include <linux/list.h>
#include <linux/slab.h>
#include <linux/if_vlan.h>
#include <linux/skb_array.h>
#include <linux/if_macvlan.h>
#include <linux/bpf.h>
#include <trace/events/qdisc.h>
#include <net/sch_generic.h>
#include <net/pkt_sched.h>
#include <net/dst.h>
#include <net/hotdata.h>
#include <trace/events/net.h>
#include <net/xfrm.h>

/* Qdisc to use by default */
const struct Qdisc_ops *default_qdisc_ops = &pfifo_fast_ops;
EXPORT_SYMBOL(default_qdisc_ops);

/* Free every skb on the ->next chain that starts at @skb.
 *
 * The drop reason stored in tc_skb_cb() decides how each skb is released:
 * a qdisc drop reason is reported through trace_qdisc_drop() and the skb is
 * freed with SKB_DROP_REASON_QDISC_DROP; any other value is handed to
 * kfree_skb_reason() as is.
 */
void __tcf_kfree_skb_list(struct sk_buff *skb, struct Qdisc *q,
			  struct netdev_queue *txq, struct net_device *dev)
{
	while (skb) {
		u32 reason = tc_skb_cb(skb)->drop_reason;
		/* Read ->next before the skb is freed below. */
		struct sk_buff *next = skb->next;
		enum skb_drop_reason skb_reason;

		prefetch(next);
		/* TC classifier and qdisc share drop_reason storage.
		 * Check subsystem mask to identify qdisc drop reasons,
		 * else pass through skb_drop_reason set by TC classifier.
		 */
		if ((reason & SKB_DROP_REASON_SUBSYS_MASK) == __QDISC_DROP_REASON) {
			trace_qdisc_drop(q, txq, dev, skb,
					 (enum qdisc_drop_reason)reason);
			skb_reason = SKB_DROP_REASON_QDISC_DROP;
		} else {
			skb_reason = (enum skb_drop_reason)reason;
		}
		kfree_skb_reason(skb, skb_reason);
		skb = next;
	}
}
EXPORT_SYMBOL(__tcf_kfree_skb_list);

/* Clear __QDISC_STATE_MISSED on @q and then re-check @txq: if the tx queue
 * is not frozen or stopped, MISSED is set again; otherwise
 * __QDISC_STATE_DRAINING is set.
 */
static void qdisc_maybe_clear_missed(struct Qdisc *q,
				     const struct netdev_queue *txq)
{
	clear_bit(__QDISC_STATE_MISSED, &q->state);

	/* Make sure the below netif_xmit_frozen_or_stopped()
	 * checking happens after clearing STATE_MISSED.
	 */
	smp_mb__after_atomic();

	/* Checking netif_xmit_frozen_or_stopped() again to
	 * make sure STATE_MISSED is set if the STATE_MISSED
	 * set by netif_tx_wake_queue()'s rescheduling of
	 * net_tx_action() is cleared by the above clear_bit().
	 */
	if (!netif_xmit_frozen_or_stopped(txq))
		set_bit(__QDISC_STATE_MISSED, &q->state);
	else
		set_bit(__QDISC_STATE_DRAINING, &q->state);
}

/* Main transmission queue. */

/* Modifications to data participating in scheduling must be protected with
 * qdisc_lock(qdisc) spinlock.
 *
 * The idea is the following:
 * - enqueue, dequeue are serialized via qdisc root lock
 * - ingress filtering is also serialized via qdisc root lock
 * - updates to tree and tree walking are only done under the rtnl mutex.
 */

/* Sentinel returned by __skb_dequeue_bad_txq() when the head skb's tx queue
 * is frozen or stopped; it is compared against, never dereferenced, in the
 * code visible here.
 */
#define SKB_XOFF_MAGIC ((struct sk_buff *)1UL)

/* Try to take the head skb off q->skb_bad_txq.
 *
 * Returns NULL when the list is empty, SKB_XOFF_MAGIC when the head skb's
 * tx queue is frozen or stopped (the skb stays queued), otherwise the
 * dequeued skb with backlog/qlen accounting already decremented.
 * For TCQ_F_NOLOCK qdiscs the list is accessed under qdisc_lock(q).
 */
static inline struct sk_buff *__skb_dequeue_bad_txq(struct Qdisc *q)
{
	const struct netdev_queue *txq = q->dev_queue;
	spinlock_t *lock = NULL;
	struct sk_buff *skb;

	if (q->flags & TCQ_F_NOLOCK) {
		lock = qdisc_lock(q);
		spin_lock(lock);
	}

	skb = skb_peek(&q->skb_bad_txq);
	if (skb) {
		/* check the reason of requeuing without tx lock first */
		txq = skb_get_tx_queue(txq->dev, skb);
		if (!netif_xmit_frozen_or_stopped(txq)) {
			skb = __skb_dequeue(&q->skb_bad_txq);
			if (qdisc_is_percpu_stats(q)) {
				qdisc_qstats_cpu_backlog_dec(q, skb);
				qdisc_qstats_cpu_qlen_dec(q);
			} else {
				qdisc_qstats_backlog_dec(q, skb);
				q->q.qlen--;
			}
		} else {
			skb = SKB_XOFF_MAGIC;
			qdisc_maybe_clear_missed(q, txq);
		}
	}

	if (lock)
		spin_unlock(lock);

	return skb;
}

/* Peek at q->skb_bad_txq first and only call __skb_dequeue_bad_txq() (which
 * may take the qdisc lock) when the list looks non-empty.
 */
static inline struct sk_buff *qdisc_dequeue_skb_bad_txq(struct Qdisc *q)
{
	struct sk_buff *skb = skb_peek(&q->skb_bad_txq);

	if (unlikely(skb))
		skb = __skb_dequeue_bad_txq(q);

	return skb;
}

/* Append @skb to q->skb_bad_txq and account it in backlog/qlen, using the
 * per-CPU counters when qdisc_is_percpu_stats(q) is true.
 * For TCQ_F_NOLOCK qdiscs this is done under qdisc_lock(q).
 */
static inline void qdisc_enqueue_skb_bad_txq(struct Qdisc *q,
					     struct sk_buff *skb)
{
	spinlock_t *lock = NULL;

	if (q->flags & TCQ_F_NOLOCK) {
		lock = qdisc_lock(q);
		spin_lock(lock);
	}

	__skb_queue_tail(&q->skb_bad_txq, skb);

	if (qdisc_is_percpu_stats(q)) {
		qdisc_qstats_cpu_backlog_inc(q, skb);
		qdisc_qstats_cpu_qlen_inc(q);
	} else {
		qdisc_qstats_backlog_inc(q, skb);
		q->q.qlen++;
	}

	if (lock)
		spin_unlock(lock);
}

/* Put the ->next chained list starting at @skb back on q->gso_skb, bumping
 * the requeue, backlog and qlen counters once per skb.
 * Afterwards a TCQ_F_NOLOCK qdisc gets __QDISC_STATE_MISSED set, any other
 * qdisc is handed to __netif_schedule().
 */
static inline void dev_requeue_skb(struct sk_buff *skb, struct Qdisc *q)
{
	spinlock_t *lock = NULL;

	if (q->flags & TCQ_F_NOLOCK) {
		lock = qdisc_lock(q);
		spin_lock(lock);
	}

	while (skb) {
		struct sk_buff *next = skb->next;

		__skb_queue_tail(&q->gso_skb, skb);

		/* it's still part of the queue */
		if (qdisc_is_percpu_stats(q)) {
			qdisc_qstats_cpu_requeues_inc(q);
			qdisc_qstats_cpu_backlog_inc(q, skb);
			qdisc_qstats_cpu_qlen_inc(q);
		} else {
			q->qstats.requeues++;
			qdisc_qstats_backlog_inc(q, skb);
			q->q.qlen++;
		}

		skb = next;
	}

	if (lock) {
		spin_unlock(lock);
		set_bit(__QDISC_STATE_MISSED, &q->state);
	} else {
		__netif_schedule(q);
	}
}

/* Chain further skbs from q->dequeue() behind @skb via ->next.
 *
 * Dequeuing continues while the byte allowance (qdisc_avail_bulklimit(txq)
 * minus the length of the skbs taken so far) is positive, the qdisc still
 * returns skbs, and fewer than @budget extra skbs have been chained.
 * The number of chained skbs is added to *@packets.
 */
static void try_bulk_dequeue_skb(struct Qdisc *q,
				 struct sk_buff *skb,
				 const struct netdev_queue *txq,
				 int *packets, int budget)
{
	int bytelimit = qdisc_avail_bulklimit(txq) - skb->len;
	int cnt = 0;

	while (bytelimit > 0) {
		struct sk_buff *nskb = q->dequeue(q);

		if (!nskb)
			break;

		bytelimit -= nskb->len; /* covers GSO len */
		skb->next = nskb;
		skb = nskb;
		if (++cnt >= budget)
			break;
	}
	(*packets) += cnt;
	/* @skb now points at the last skb of the chain. */
	skb_mark_not_on_list(skb);
}

/* This variant of try_bulk_dequeue_skb() makes sure
 * all skbs in the chain are for the same txq
 *
 * At most 8 extra skbs are chained. An skb whose queue mapping differs from
 * the first one is parked on q->skb_bad_txq and ends the loop.
 */
static void try_bulk_dequeue_skb_slow(struct Qdisc *q,
				      struct sk_buff *skb,
				      int *packets)
{
	int mapping = skb_get_queue_mapping(skb);
	struct sk_buff *nskb;
	int cnt = 0;

	do {
		nskb = q->dequeue(q);
		if (!nskb)
			break;
		if (unlikely(skb_get_queue_mapping(nskb) != mapping)) {
			qdisc_enqueue_skb_bad_txq(q, nskb);
			break;
		}
		skb->next = nskb;
		skb = nskb;
	} while (++cnt < 8);
	(*packets) += cnt;
	skb_mark_not_on_list(skb);
}

/* Note that dequeue_skb can possibly return a SKB list (via skb->next).
 * A requeued skb (via q->gso_skb) can also be a SKB list.
*/
/* Pick the next skb (or skb list) to transmit from @q.
 *
 * Sources are tried in this order: the requeue list q->gso_skb, the
 * q->skb_bad_txq list, then q->dequeue(). A freshly dequeued skb may be
 * extended into a list by one of the try_bulk_dequeue_skb*() helpers.
 *
 * @validate: set to false for an skb taken from gso_skb unless
 *            xfrm_offload() is true for it; set to true on every other path.
 * @packets:  set to 1 on entry and increased by the bulk helpers.
 * @budget:   forwarded to try_bulk_dequeue_skb().
 *
 * Returns NULL when nothing is available or when the relevant tx queue is
 * frozen or stopped.
 */
static struct sk_buff *dequeue_skb(struct Qdisc *q, bool *validate,
				   int *packets, int budget)
{
	const struct netdev_queue *txq = q->dev_queue;
	struct sk_buff *skb = NULL;

	*packets = 1;
	if (unlikely(!skb_queue_empty(&q->gso_skb))) {
		spinlock_t *lock = NULL;

		if (q->flags & TCQ_F_NOLOCK) {
			lock = qdisc_lock(q);
			spin_lock(lock);
		}

		skb = skb_peek(&q->gso_skb);

		/* skb may be null if another cpu pulls gso_skb off in between
		 * empty check and lock.
		 */
		if (!skb) {
			if (lock)
				spin_unlock(lock);
			goto validate;
		}

		/* skb in gso_skb were already validated */
		*validate = false;
		if (xfrm_offload(skb))
			*validate = true;
		/* check the reason of requeuing without tx lock first */
		txq = skb_get_tx_queue(txq->dev, skb);
		if (!netif_xmit_frozen_or_stopped(txq)) {
			skb = __skb_dequeue(&q->gso_skb);
			if (qdisc_is_percpu_stats(q)) {
				qdisc_qstats_cpu_backlog_dec(q, skb);
				qdisc_qstats_cpu_qlen_dec(q);
			} else {
				qdisc_qstats_backlog_dec(q, skb);
				q->q.qlen--;
			}
		} else {
			/* Leave the skb on gso_skb; nothing to send now. */
			skb = NULL;
			qdisc_maybe_clear_missed(q, txq);
		}
		if (lock)
			spin_unlock(lock);
		goto trace;
	}
validate:
	*validate = true;

	/* skb is still NULL here, so this returns NULL. */
	if ((q->flags & TCQ_F_ONETXQUEUE) &&
	    netif_xmit_frozen_or_stopped(txq)) {
		qdisc_maybe_clear_missed(q, txq);
		return skb;
	}

	skb = qdisc_dequeue_skb_bad_txq(q);
	if (unlikely(skb)) {
		/* The sentinel is not a real skb; report "nothing to send". */
		if (skb == SKB_XOFF_MAGIC)
			return NULL;
		goto bulk;
	}
	skb = q->dequeue(q);
	if (skb) {
bulk:
		if (qdisc_may_bulk(q))
			try_bulk_dequeue_skb(q, skb, txq, packets, budget);
		else
			try_bulk_dequeue_skb_slow(q, skb, packets);
	}
trace:
	trace_qdisc_dequeue(q, txq, *packets, skb);
	return skb;
}

/*
 * Transmit possibly several skbs, and handle the return status as
 * required. Owning qdisc running bit guarantees that only one CPU
 * can execute this function.
*
 * Returns to the caller:
 *				false  - hardware queue frozen backoff
 *				true   - feel free to send more pkts
 *
 * When @root_lock is non-NULL it is released on entry and taken again
 * before every return. The skb list is requeued with dev_requeue_skb()
 * when validation asks for a retry (CONFIG_XFRM_OFFLOAD builds only) or
 * when dev_xmit_complete() reports the transmit did not complete.
 */
bool sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q,
		     struct net_device *dev, struct netdev_queue *txq,
		     spinlock_t *root_lock, bool validate)
{
	/* Stays NETDEV_TX_BUSY if dev_hard_start_xmit() is never called. */
	int ret = NETDEV_TX_BUSY;
	bool again = false;

	/* And release qdisc */
	if (root_lock)
		spin_unlock(root_lock);

	/* Note that we validate skb (GSO, checksum, ...) outside of locks */
	if (validate)
		skb = validate_xmit_skb_list(skb, dev, &again);

#ifdef CONFIG_XFRM_OFFLOAD
	if (unlikely(again)) {
		if (root_lock)
			spin_lock(root_lock);

		dev_requeue_skb(skb, q);
		return false;
	}
#endif

	if (likely(skb)) {
		HARD_TX_LOCK(dev, txq, smp_processor_id());
		if (!netif_xmit_frozen_or_stopped(txq))
			skb = dev_hard_start_xmit(skb, dev, txq, &ret);
		else
			qdisc_maybe_clear_missed(q, txq);

		HARD_TX_UNLOCK(dev, txq);
	} else {
		/* Nothing left to send after validation. */
		if (root_lock)
			spin_lock(root_lock);
		return true;
	}

	if (root_lock)
		spin_lock(root_lock);

	if (!dev_xmit_complete(ret)) {
		/* Driver returned NETDEV_TX_BUSY - requeue skb */
		if (unlikely(ret != NETDEV_TX_BUSY))
			net_warn_ratelimited("BUG %s code %d qlen %d\n",
					     dev->name, ret, q->q.qlen);

		dev_requeue_skb(skb, q);
		return false;
	}

	return true;
}

/*
 * NOTE: Called under qdisc_lock(q) with locally disabled BH.
 *
 * running seqcount guarantees only one CPU can process
 * this qdisc at a time. qdisc_lock(q) serializes queue accesses for
 * this queue.
 *
 *  netif_tx_lock serializes accesses to device driver.
 *
 *  qdisc_lock(q) and netif_tx_lock are mutually exclusive,
 *  if one is grabbed, another must be free.
 *
 * Note, that this procedure can be called by a watchdog timer
 *
 * Returns to the caller:
 *				0  - queue is empty or throttled.
 *				>0 - queue is not empty.
*
 */
/* Dequeue one skb (or skb list) with dequeue_skb() and pass it to
 * sch_direct_xmit(). Returns false when nothing was dequeued, otherwise the
 * result of sch_direct_xmit(). root_lock stays NULL for TCQ_F_NOLOCK
 * qdiscs, so sch_direct_xmit() does no root-lock handling for them.
 */
static inline bool qdisc_restart(struct Qdisc *q, int *packets, int budget)
{
	spinlock_t *root_lock = NULL;
	struct netdev_queue *txq;
	struct net_device *dev;
	struct sk_buff *skb;
	bool validate;

	/* Dequeue packet */
	skb = dequeue_skb(q, &validate, packets, budget);
	if (unlikely(!skb))
		return false;

	if (!(q->flags & TCQ_F_NOLOCK))
		root_lock = qdisc_lock(q);

	dev = qdisc_dev(q);
	txq = skb_get_tx_queue(dev, skb);

	return sch_direct_xmit(skb, q, dev, txq, root_lock, validate);
}

/* Keep calling qdisc_restart() until it returns false or the quota read
 * from net_hotdata.dev_tx_weight is used up. When the quota runs out, a
 * TCQ_F_NOLOCK qdisc gets __QDISC_STATE_MISSED set and any other qdisc is
 * handed to __netif_schedule().
 */
void __qdisc_run(struct Qdisc *q)
{
	int quota = READ_ONCE(net_hotdata.dev_tx_weight);
	int packets;

	while (qdisc_restart(q, &packets, quota)) {
		quota -= packets;
		if (quota <= 0) {
			if (q->flags & TCQ_F_NOLOCK)
				set_bit(__QDISC_STATE_MISSED, &q->state);
			else
				__netif_schedule(q);

			break;
		}
	}
}

/* Return the most recent trans_start over all tx queues of @dev.
 * Queue 0 seeds the result; a later queue replaces it only when its value
 * is non-zero and time_after() the current result.
 */
unsigned long dev_trans_start(struct net_device *dev)
{
	unsigned long res = READ_ONCE(netdev_get_tx_queue(dev, 0)->trans_start);
	unsigned long val;
	unsigned int i;

	for (i = 1; i < dev->num_tx_queues; i++) {
		val = READ_ONCE(netdev_get_tx_queue(dev, i)->trans_start);
		if (val && time_after(val, res))
			res = val;
	}

	return res;
}
EXPORT_SYMBOL(dev_trans_start);

/* Set __QUEUE_STATE_FROZEN on every tx queue of @dev, taking and releasing
 * each queue's tx lock around the bit update.
 */
static void netif_freeze_queues(struct net_device *dev)
{
	unsigned int i;
	int cpu;

	cpu = smp_processor_id();
	for (i = 0; i < dev->num_tx_queues; i++) {
		struct netdev_queue *txq = netdev_get_tx_queue(dev, i);

		/* We are the only thread of execution doing a
		 * freeze, but we have to grab the _xmit_lock in
		 * order to synchronize with threads which are in
		 * the ->hard_start_xmit() handler and already
		 * checked the frozen bit.
		 */
		__netif_tx_lock(txq, cpu);
		set_bit(__QUEUE_STATE_FROZEN, &txq->state);
		__netif_tx_unlock(txq);
	}
}

/* Take dev->tx_global_lock and freeze all tx queues of @dev.
 * The lock is still held on return; netif_tx_unlock() releases it.
 */
void netif_tx_lock(struct net_device *dev)
{
	spin_lock(&dev->tx_global_lock);
	netif_freeze_queues(dev);
}
EXPORT_SYMBOL(netif_tx_lock);

/* Clear __QUEUE_STATE_FROZEN on every tx queue of @dev and call
 * netif_schedule_queue() on each of them.
 */
static void netif_unfreeze_queues(struct net_device *dev)
{
	unsigned int i;

	for (i = 0; i < dev->num_tx_queues; i++) {
		struct netdev_queue *txq = netdev_get_tx_queue(dev, i);

		/* No need to grab the _xmit_lock here.  If the
		 * queue is not stopped for another reason, we
		 * force a schedule.
		 */
		clear_bit(__QUEUE_STATE_FROZEN, &txq->state);
		netif_schedule_queue(txq);
	}
}

/* Counterpart of netif_tx_lock(): unfreeze all tx queues, then drop
 * dev->tx_global_lock.
 */
void netif_tx_unlock(struct net_device *dev)
{
	netif_unfreeze_queues(dev);
	spin_unlock(&dev->tx_global_lock);
}
EXPORT_SYMBOL(netif_tx_unlock);

/* Timer callback for dev->watchdog_timer.
 *
 * While the device is present, running and has carrier, every stopped tx
 * queue is checked: the first one whose trans_start is older than
 * dev->watchdog_timeo is reported and passed to ndo_tx_timeout() with all
 * queues frozen. The timer is then re-armed relative to the oldest
 * trans_start seen among the stopped queues examined.
 *
 * The reference tracked by dev->watchdog_dev_tracker is dropped on exit
 * unless the re-arming mod_timer() call returned 0.
 */
static void dev_watchdog(struct timer_list *t)
{
	struct net_device *dev = timer_container_of(dev, t, watchdog_timer);
	bool release = true;

	spin_lock(&dev->tx_global_lock);
	if (!qdisc_tx_is_noop(dev)) {
		if (netif_device_present(dev) &&
		    netif_running(dev) &&
		    netif_carrier_ok(dev)) {
			unsigned int timedout_ms = 0;
			unsigned int i;
			unsigned long trans_start;
			unsigned long oldest_start = jiffies;

			for (i = 0; i < dev->num_tx_queues; i++) {
				struct netdev_queue *txq;

				txq = netdev_get_tx_queue(dev, i);
				if (!netif_xmit_stopped(txq))
					continue;

				/* Paired with WRITE_ONCE() + smp_mb...() in
				 * netdev_tx_sent_queue() and netif_tx_stop_queue().
				 */
				smp_mb();
				trans_start = READ_ONCE(txq->trans_start);

				if (time_after(jiffies, trans_start + dev->watchdog_timeo)) {
					timedout_ms = jiffies_to_msecs(jiffies - trans_start);
					atomic_long_inc(&txq->trans_timeout);
					/* i keeps the index of the timed-out queue. */
					break;
				}
				if (time_after(oldest_start, trans_start))
					oldest_start = trans_start;
			}

			if (unlikely(timedout_ms)) {
				trace_net_dev_xmit_timeout(dev, i);
				netdev_crit(dev, "NETDEV WATCHDOG: CPU: %d: transmit queue %u timed out %u ms\n",
					    raw_smp_processor_id(),
					    i, timedout_ms);
				netif_freeze_queues(dev);
				dev->netdev_ops->ndo_tx_timeout(dev, i);
				netif_unfreeze_queues(dev);
			}
			if (!mod_timer(&dev->watchdog_timer,
				       round_jiffies(oldest_start +
						     dev->watchdog_timeo)))
				release = false;
		}
	}
	spin_unlock(&dev->tx_global_lock);

	if (release)
		netdev_put(dev, &dev->watchdog_dev_tracker);
}

/* Arm the tx watchdog timer of @dev.
 * Does nothing when the driver has no ndo_tx_timeout. A non-positive
 * watchdog_timeo is replaced by 5*HZ. A device reference is taken through
 * watchdog_dev_tracker when mod_timer() returns 0.
 */
void netdev_watchdog_up(struct net_device *dev)
{
	if (!dev->netdev_ops->ndo_tx_timeout)
		return;
	if (dev->watchdog_timeo <= 0)
		dev->watchdog_timeo = 5*HZ;
	if (!mod_timer(&dev->watchdog_timer,
		       round_jiffies(jiffies + dev->watchdog_timeo)))
		netdev_hold(dev, &dev->watchdog_dev_tracker,
			    GFP_ATOMIC);
}
EXPORT_SYMBOL_GPL(netdev_watchdog_up);

/* Delete the tx watchdog timer of @dev under netif_tx_lock_bh(); the
 * reference held through watchdog_dev_tracker is dropped when
 * timer_delete() returns non-zero.
 */
static void netdev_watchdog_down(struct net_device *dev)
{
	netif_tx_lock_bh(dev);
	if (timer_delete(&dev->watchdog_timer))
		netdev_put(dev, &dev->watchdog_dev_tracker);
	netif_tx_unlock_bh(dev);
}

/**
 *	netif_carrier_on - set carrier
 *	@dev: network device
 *
 * Device has detected acquisition of carrier.
 *
 * Only acts when __LINK_STATE_NOCARRIER was previously set. Nothing more is
 * done for a device whose reg_state is NETREG_UNINITIALIZED; otherwise
 * carrier_up_count is incremented, a linkwatch event is fired and, if the
 * device is running, the tx watchdog is armed.
 */
void netif_carrier_on(struct net_device *dev)
{
	if (test_and_clear_bit(__LINK_STATE_NOCARRIER, &dev->state)) {
		if (dev->reg_state == NETREG_UNINITIALIZED)
			return;
		atomic_inc(&dev->carrier_up_count);
		linkwatch_fire_event(dev);
		if (netif_running(dev))
			netdev_watchdog_up(dev);
	}
}
EXPORT_SYMBOL(netif_carrier_on);

/**
 *	netif_carrier_off - clear carrier
 *	@dev: network device
 *
 * Device has detected loss of carrier.
*/
/* Only acts when __LINK_STATE_NOCARRIER was previously clear. Nothing more
 * is done for a device whose reg_state is NETREG_UNINITIALIZED; otherwise
 * carrier_down_count is incremented and a linkwatch event is fired.
 */
void netif_carrier_off(struct net_device *dev)
{
	if (!test_and_set_bit(__LINK_STATE_NOCARRIER, &dev->state)) {
		if (dev->reg_state == NETREG_UNINITIALIZED)
			return;
		atomic_inc(&dev->carrier_down_count);
		linkwatch_fire_event(dev);
	}
}
EXPORT_SYMBOL(netif_carrier_off);

/**
 *	netif_carrier_event - report carrier state event
 *	@dev: network device
 *
 * Device has detected a carrier event but the carrier state wasn't changed.
 * Use in drivers when querying carrier state asynchronously, to avoid missing
 * events (link flaps) if link recovers before it's queried.
 *
 * Both carrier_up_count and carrier_down_count are incremented before the
 * linkwatch event is fired.
 */
void netif_carrier_event(struct net_device *dev)
{
	if (dev->reg_state == NETREG_UNINITIALIZED)
		return;
	atomic_inc(&dev->carrier_up_count);
	atomic_inc(&dev->carrier_down_count);
	linkwatch_fire_event(dev);
}
EXPORT_SYMBOL_GPL(netif_carrier_event);

/* "NOOP" scheduler: the best scheduler, recommended for all interfaces
   under all circumstances. It is difficult to invent anything faster or
   cheaper.
 */

/* Enqueue handler that never queues: counts a tx drop on skb->dev, drops
 * the skb through __qdisc_drop() and returns NET_XMIT_CN.
 */
static int noop_enqueue(struct sk_buff *skb, struct Qdisc *qdisc,
			struct sk_buff **to_free)
{
	dev_core_stats_tx_dropped_inc(skb->dev);
	__qdisc_drop(skb, to_free);
	return NET_XMIT_CN;
}

/* Dequeue/peek handler that always reports an empty queue. */
static struct sk_buff *noop_dequeue(struct Qdisc *qdisc)
{
	return NULL;
}

struct Qdisc_ops noop_qdisc_ops __read_mostly = {
	.id		=	"noop",
	.priv_size	=	0,
	.enqueue	=	noop_enqueue,
	.dequeue	=	noop_dequeue,
	.peek		=	noop_dequeue,
	.owner		=	THIS_MODULE,
};

/* Tx queue that noop_qdisc points at through .dev_queue; both of its qdisc
 * pointers refer back to noop_qdisc.
 */
static struct netdev_queue noop_netdev_queue = {
	RCU_POINTER_INITIALIZER(qdisc, &noop_qdisc),
	RCU_POINTER_INITIALIZER(qdisc_sleeping, &noop_qdisc),
};

/* Statically initialised qdisc flagged TCQ_F_BUILTIN. Its gso_skb and
 * skb_bad_txq list heads point at themselves with qlen 0.
 */
struct Qdisc noop_qdisc = {
	.enqueue	=	noop_enqueue,
	.dequeue	=	noop_dequeue,
	.flags		=	TCQ_F_BUILTIN,
	.ops		=	&noop_qdisc_ops,
	.q.lock		=	__SPIN_LOCK_UNLOCKED(noop_qdisc.q.lock),
	.dev_queue	=	&noop_netdev_queue,
	.gso_skb = {
		.next = (struct sk_buff *)&noop_qdisc.gso_skb,
		.prev = (struct sk_buff *)&noop_qdisc.gso_skb,
		.qlen = 0,
		.lock = __SPIN_LOCK_UNLOCKED(noop_qdisc.gso_skb.lock),
	},
	.skb_bad_txq = {
		.next = (struct sk_buff *)&noop_qdisc.skb_bad_txq,
		.prev = (struct sk_buff *)&noop_qdisc.skb_bad_txq,
		.qlen = 0,
		.lock = __SPIN_LOCK_UNLOCKED(noop_qdisc.skb_bad_txq.lock),
	},
};
EXPORT_SYMBOL(noop_qdisc);

/* Init handler of the "noqueue" qdisc: clears qdisc->enqueue and succeeds. */
static int noqueue_init(struct Qdisc *qdisc, struct nlattr *opt,
			struct netlink_ext_ack *extack)
{
	/* register_qdisc() assigns a default of noop_enqueue if unset,
	 * but __dev_queue_xmit() treats noqueue only as such
	 * if this is NULL - so clear it here.
	 */
	qdisc->enqueue = NULL;
	return 0;
}

struct Qdisc_ops noqueue_qdisc_ops __read_mostly = {
	.id		=	"noqueue",
	.priv_size	=	0,
	.init		=	noqueue_init,
	.enqueue	=	noop_enqueue,
	.dequeue	=	noop_dequeue,
	.peek		=	noop_dequeue,
	.owner		=	THIS_MODULE,
};

/* Band for each priority value; indexed with skb->priority & TC_PRIO_MAX
 * in pfifo_fast_enqueue() and copied out by pfifo_fast_dump().
 */
const u8 sch_default_prio2band[TC_PRIO_MAX + 1] = {
	1, 2, 2, 2, 1, 2, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1
};
EXPORT_SYMBOL(sch_default_prio2band);

/* 3-band FIFO queue: old style, but should be a bit faster than
   generic prio+fifo combination.
 */

#define PFIFO_FAST_BANDS 3

/*
 * Private data for a pfifo_fast scheduler containing:
 *	- rings for priority bands
 */
struct pfifo_fast_priv {
	struct skb_array q[PFIFO_FAST_BANDS];
};

/* Return the skb_array backing @band; @band is not range-checked here. */
static inline struct skb_array *band2list(struct pfifo_fast_priv *priv,
					  int band)
{
	return &priv->q[band];
}

/* Queue @skb on the band selected by its priority.
 * When the ring refuses the skb, the drop reason is set to
 * QDISC_DROP_OVERLIMIT and the skb is dropped through qdisc_drop_cpu() or
 * qdisc_drop(), whose return value is passed on. Otherwise enqueue
 * statistics are updated and NET_XMIT_SUCCESS is returned.
 */
static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc *qdisc,
				 struct sk_buff **to_free)
{
	int band = sch_default_prio2band[skb->priority & TC_PRIO_MAX];
	struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
	struct skb_array *q = band2list(priv, band);
	/* Length is read before the skb is handed to the ring. */
	unsigned int pkt_len = qdisc_pkt_len(skb);
	int err;

	err = skb_array_produce(q, skb);

	if (unlikely(err)) {
		tcf_set_qdisc_drop_reason(skb, QDISC_DROP_OVERLIMIT);

		if (qdisc_is_percpu_stats(qdisc))
			return qdisc_drop_cpu(skb, qdisc, to_free);
		else
			return qdisc_drop(skb, qdisc, to_free);
	}

	qdisc_update_stats_at_enqueue(qdisc, pkt_len);
	return NET_XMIT_SUCCESS;
}

/* Return the first skb found scanning the bands from 0 upwards, or NULL.
 * If all bands look empty while qdisc->state has a QDISC_STATE_NON_EMPTY
 * bit set, the MISSED and DRAINING bits are cleared and the scan is
 * repeated exactly once.
 */
static struct sk_buff *pfifo_fast_dequeue(struct Qdisc *qdisc)
{
	struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
	struct sk_buff *skb = NULL;
	bool need_retry = true;
	int band;

retry:
	for (band = 0; band < PFIFO_FAST_BANDS && !skb; band++) {
		struct skb_array *q = band2list(priv, band);

		if (__skb_array_empty(q))
			continue;

		skb = __skb_array_consume(q);
	}
	if (likely(skb)) {
		qdisc_update_stats_at_dequeue(qdisc, skb);
	} else if (need_retry &&
		   READ_ONCE(qdisc->state) & QDISC_STATE_NON_EMPTY) {
		/* Delay clearing the STATE_MISSED here to reduce
		 * the overhead of the second spin_trylock() in
		 * qdisc_run_begin() and __netif_schedule() calling
		 * in qdisc_run_end().
		 */
		clear_bit(__QDISC_STATE_MISSED, &qdisc->state);
		clear_bit(__QDISC_STATE_DRAINING, &qdisc->state);

		/* Make sure dequeuing happens after clearing
		 * STATE_MISSED.
		 */
		smp_mb__after_atomic();

		need_retry = false;

		goto retry;
	}

	return skb;
}

/* Return, without removing it, the first skb found scanning the bands from
 * 0 upwards, or NULL when every band's peek returns NULL.
 */
static struct sk_buff *pfifo_fast_peek(struct Qdisc *qdisc)
{
	struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
	struct sk_buff *skb = NULL;
	int band;

	for (band = 0; band < PFIFO_FAST_BANDS && !skb; band++) {
		struct skb_array *q = band2list(priv, band);

		skb = __skb_array_peek(q);
	}

	return skb;
}

/* Drain every initialised band, passing each skb to rtnl_kfree_skbs(), and
 * zero the per-CPU backlog/qlen counters when per-CPU stats are in use.
 */
static void pfifo_fast_reset(struct Qdisc *qdisc)
{
	int i, band;
	struct pfifo_fast_priv *priv = qdisc_priv(qdisc);

	for (band = 0; band < PFIFO_FAST_BANDS; band++) {
		struct skb_array *q = band2list(priv, band);
		struct sk_buff *skb;

		/* NULL ring is possible if destroy path is due to a failed
		 * skb_array_init() in pfifo_fast_init() case.
		 */
		if (!q->ring.queue)
			continue;

		while ((skb = __skb_array_consume(q)) != NULL)
			rtnl_kfree_skbs(skb, skb);
	}

	if (qdisc_is_percpu_stats(qdisc)) {
		for_each_possible_cpu(i) {
			struct gnet_stats_queue *q;

			q = per_cpu_ptr(qdisc->cpu_qstats, i);
			q->backlog = 0;
			q->qlen = 0;
		}
	}
}

/* Emit a TCA_OPTIONS attribute holding a struct tc_prio_qopt with the band
 * count and the default priority map. Returns skb->len on success and -1
 * when nla_put() fails.
 */
static int pfifo_fast_dump(struct Qdisc *qdisc, struct sk_buff *skb)
{
	struct tc_prio_qopt opt = { .bands = PFIFO_FAST_BANDS };

	memcpy(&opt.priomap, sch_default_prio2band, TC_PRIO_MAX + 1);
	if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt))
		goto nla_put_failure;
	return skb->len;

nla_put_failure:
	return -1;
}

/* Create one ring per band, each sized to the device's tx_queue_len.
 * Returns -EINVAL for a zero tx_queue_len and -ENOMEM when any
 * skb_array_init() call fails (rings already created are not torn down
 * here); on success TCQ_F_CAN_BYPASS is set and 0 is returned.
 */
static int pfifo_fast_init(struct Qdisc *qdisc, struct nlattr *opt,
			   struct netlink_ext_ack *extack)
{
	unsigned int qlen = qdisc_dev(qdisc)->tx_queue_len;
	struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
	int prio;

	/* guard against zero length rings */
	if (!qlen)
		return -EINVAL;

	for (prio = 0; prio < PFIFO_FAST_BANDS; prio++) {
		struct skb_array *q = band2list(priv, prio);
		int err;

		err = skb_array_init(q, qlen, GFP_KERNEL);
		if (err)
			return -ENOMEM;
	}

	/* Can by-pass the queue discipline */
	qdisc->flags |= TCQ_F_CAN_BYPASS;
	return 0;
}

/* Release the ring of every band that was initialised. */
static void pfifo_fast_destroy(struct Qdisc *sch)
{
	struct pfifo_fast_priv *priv = qdisc_priv(sch);
	int prio;

	for (prio = 0; prio < PFIFO_FAST_BANDS; prio++) {
		struct skb_array *q = band2list(priv, prio);

		/* NULL ring is possible if destroy path is due to a failed
		 * skb_array_init() in pfifo_fast_init() case.
		 */
		if (!q->ring.queue)
			continue;
		/* Destroy ring but no need to kfree_skb because a call to
		 * pfifo_fast_reset() has already done that work.
		 */
		ptr_ring_cleanup(&q->ring, NULL);
	}
}

/* Resize all bands to @new_len in a single
 * skb_array_resize_multiple_bh() call and return its result.
 */
static int pfifo_fast_change_tx_queue_len(struct Qdisc *sch,
					  unsigned int new_len)
{
	struct pfifo_fast_priv *priv = qdisc_priv(sch);
	struct skb_array *bands[PFIFO_FAST_BANDS];
	int prio;

	for (prio = 0; prio < PFIFO_FAST_BANDS; prio++) {
		struct skb_array *q = band2list(priv, prio);

		bands[prio] = q;
	}

	return skb_array_resize_multiple_bh(bands, PFIFO_FAST_BANDS, new_len,
					    GFP_KERNEL);
}

struct Qdisc_ops pfifo_fast_ops __read_mostly = {
	.id		=	"pfifo_fast",
	.priv_size	=	sizeof(struct pfifo_fast_priv),
	.enqueue	=	pfifo_fast_enqueue,
	.dequeue	=	pfifo_fast_dequeue,
	.peek		=	pfifo_fast_peek,
	.init		=	pfifo_fast_init,
	.destroy	=	pfifo_fast_destroy,
	.reset		=	pfifo_fast_reset,
	.dump		=	pfifo_fast_dump,
	.change_tx_queue_len =  pfifo_fast_change_tx_queue_len,
	.owner		=	THIS_MODULE,
	.static_flags	=	TCQ_F_NOLOCK | TCQ_F_CPUSTATS,
};
EXPORT_SYMBOL(pfifo_fast_ops);

/* Fallback lockdep class for sch->seqlock, used in qdisc_alloc() when the
 * device does not supply its own qdisc_tx_busylock.
 */
static struct lock_class_key qdisc_tx_busylock;

/* Allocate a Qdisc sized for the structure plus ops->priv_size bytes and
 * fill in its basic fields from @ops and @dev_queue.
 *
 * The new qdisc holds a reference on the device (sch->dev_tracker) and
 * starts with a refcount of 1. Per-CPU statistics are allocated when
 * ops->static_flags contains TCQ_F_CPUSTATS.
 *
 * Returns the qdisc, ERR_PTR(-EINVAL) when @dev_queue is NULL, or
 * ERR_PTR(-ENOBUFS) when an allocation fails.
 */
struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue,
			  const struct Qdisc_ops *ops,
			  struct netlink_ext_ack *extack)
{
	struct Qdisc *sch;
	unsigned int size = sizeof(*sch) + ops->priv_size;
	int err = -ENOBUFS;
	struct net_device *dev;

	if (!dev_queue) {
		NL_SET_ERR_MSG(extack, "No device queue given");
		err = -EINVAL;
		goto errout;
	}

	dev = dev_queue->dev;
	sch = kzalloc_node(size, GFP_KERNEL, netdev_queue_numa_node_read(dev_queue));

	if (!sch)
		goto errout;
	__skb_queue_head_init(&sch->gso_skb);
	__skb_queue_head_init(&sch->skb_bad_txq);
	gnet_stats_basic_sync_init(&sch->bstats);
	qdisc_lock_init(sch, ops);

	if (ops->static_flags & TCQ_F_CPUSTATS) {
		sch->cpu_bstats =
			netdev_alloc_pcpu_stats(struct gnet_stats_basic_sync);
		if (!sch->cpu_bstats)
			goto errout1;

		sch->cpu_qstats = alloc_percpu(struct gnet_stats_queue);
		if (!sch->cpu_qstats) {
			free_percpu(sch->cpu_bstats);
			goto errout1;
		}
	}

	/* seqlock has the same scope of busylock, for NOLOCK qdisc */
	spin_lock_init(&sch->seqlock);
	lockdep_set_class(&sch->seqlock,
			  dev->qdisc_tx_busylock ?: &qdisc_tx_busylock);

	sch->ops = ops;
	sch->flags = ops->static_flags;
	sch->enqueue = ops->enqueue;
	sch->dequeue = ops->dequeue;
	sch->dev_queue = dev_queue;
	netdev_hold(dev, &sch->dev_tracker, GFP_KERNEL);
	refcount_set(&sch->refcnt, 1);

	return sch;
errout1:
	qdisc_lock_uninit(sch, ops);
	kfree(sch);
errout:
	return ERR_PTR(err);
}

/* Allocate a qdisc of type @ops on @dev_queue with parent @parentid and
 * run ops->init() on it (with NULL options) when the ops provide one.
 *
 * Returns the qdisc, or NULL when the module reference cannot be taken,
 * allocation fails, or init fails. After a failed allocation the module
 * reference is dropped here; after a failed init the qdisc is released
 * with qdisc_put().
 */
struct Qdisc *qdisc_create_dflt(struct netdev_queue *dev_queue,
				const struct Qdisc_ops *ops,
				unsigned int parentid,
				struct netlink_ext_ack *extack)
{
	struct Qdisc *sch;

	if (!bpf_try_module_get(ops, ops->owner)) {
		NL_SET_ERR_MSG(extack, "Failed to increase module reference counter");
		return NULL;
	}

	sch = qdisc_alloc(dev_queue, ops, extack);
	if (IS_ERR(sch)) {
		bpf_module_put(ops, ops->owner);
		return NULL;
	}
	sch->parent = parentid;

	if (!ops->init || ops->init(sch, NULL, extack) == 0) {
		trace_qdisc_create(ops, dev_queue->dev, parentid);
		return sch;
	}

	qdisc_put(sch);
	return NULL;
}
EXPORT_SYMBOL(qdisc_create_dflt);

/* Under qdisc_lock(qdisc) and BH! */

/* Run the ops' reset handler if there is one, purge the gso_skb and
 * skb_bad_txq lists, and zero q.qlen and qstats.backlog.
 */
void qdisc_reset(struct Qdisc *qdisc)
{
	const struct Qdisc_ops *ops = qdisc->ops;

	trace_qdisc_reset(qdisc);

	if (ops->reset)
		ops->reset(qdisc);

	__skb_queue_purge(&qdisc->gso_skb);
	__skb_queue_purge(&qdisc->skb_bad_txq);

	qdisc->q.qlen = 0;
	qdisc->qstats.backlog = 0;
}
EXPORT_SYMBOL(qdisc_reset);

/* Free the per-CPU statistics (when in use) and the qdisc memory itself. */
void qdisc_free(struct Qdisc *qdisc)
{
	if (qdisc_is_percpu_stats(qdisc)) {
		free_percpu(qdisc->cpu_bstats);
		free_percpu(qdisc->cpu_qstats);
	}

	kfree(qdisc);
}

/* RCU callback queued by __qdisc_destroy(): frees the embedding Qdisc. */
static void qdisc_free_cb(struct rcu_head *head)
{
	struct Qdisc *q = container_of(head, struct Qdisc, rcu);

	qdisc_free(q);
}

/* Tear down @qdisc: reset it, run the ops' destroy handler if present,
 * drop the module and device references, and defer the final free to
 * qdisc_free_cb() via call_rcu().
 */
static void __qdisc_destroy(struct Qdisc *qdisc)
{
	const struct Qdisc_ops  *ops = qdisc->ops;
	struct net_device *dev = qdisc_dev(qdisc);

#ifdef CONFIG_NET_SCHED
	qdisc_hash_del(qdisc);

	qdisc_put_stab(rtnl_dereference(qdisc->stab));
#endif
	gen_kill_estimator(&qdisc->rate_est);

	qdisc_reset(qdisc);

	if (ops->destroy)
		ops->destroy(qdisc);

	qdisc_lock_uninit(qdisc, ops);
	bpf_module_put(ops, ops->owner);
	netdev_put(dev, &qdisc->dev_tracker);

	trace_qdisc_destroy(qdisc);

	call_rcu(&qdisc->rcu, qdisc_free_cb);
}

/* Destroy @qdisc without looking at its refcount; TCQ_F_BUILTIN qdiscs are
 * left alone.
 */
void qdisc_destroy(struct Qdisc *qdisc)
{
	if (qdisc->flags & TCQ_F_BUILTIN)
		return;

	__qdisc_destroy(qdisc);
}

/* Drop one reference on @qdisc and destroy it when that was the last one.
 * NULL and TCQ_F_BUILTIN qdiscs are ignored.
 */
void qdisc_put(struct Qdisc *qdisc)
{
	if (!qdisc)
		return;

	if (qdisc->flags & TCQ_F_BUILTIN ||
	    !refcount_dec_and_test(&qdisc->refcnt))
		return;

	__qdisc_destroy(qdisc);
}
EXPORT_SYMBOL(qdisc_put);

/* Version of qdisc_put() that is called with rtnl mutex unlocked.
 * Intended to be used as optimization, this function only takes rtnl lock if
 * qdisc reference counter reached zero.
 *
 * Unlike qdisc_put(), @qdisc is dereferenced without a NULL check.
 */
void qdisc_put_unlocked(struct Qdisc *qdisc)
{
	if (qdisc->flags & TCQ_F_BUILTIN ||
	    !refcount_dec_and_rtnl_lock(&qdisc->refcnt))
		return;

	__qdisc_destroy(qdisc);
	rtnl_unlock();
}
EXPORT_SYMBOL(qdisc_put_unlocked);

/* Attach toplevel qdisc to device queue. */

/* Install @qdisc (or noop_qdisc when @qdisc is NULL) as
 * dev_queue->qdisc_sleeping and point dev_queue->qdisc at noop_qdisc, all
 * under the previous qdisc's lock with BH disabled. Returns the previous
 * qdisc_sleeping.
 */
struct Qdisc *dev_graft_qdisc(struct netdev_queue *dev_queue,
			      struct Qdisc *qdisc)
{
	struct Qdisc *oqdisc = rtnl_dereference(dev_queue->qdisc_sleeping);
	spinlock_t *root_lock;

	root_lock = qdisc_lock(oqdisc);
	spin_lock_bh(root_lock);

	/* ... and graft new one */
	if (qdisc == NULL)
		qdisc = &noop_qdisc;
	rcu_assign_pointer(dev_queue->qdisc_sleeping, qdisc);
	rcu_assign_pointer(dev_queue->qdisc, &noop_qdisc);

	spin_unlock_bh(root_lock);

	return oqdisc;
}
EXPORT_SYMBOL(dev_graft_qdisc);

/* Per-queue callback: if the queue has a qdisc_sleeping, point both qdisc
 * pointers at @_qdisc_default and drop a reference on the old one.
 */
static void shutdown_scheduler_queue(struct net_device *dev,
				     struct netdev_queue *dev_queue,
				     void *_qdisc_default)
{
	struct Qdisc *qdisc = rtnl_dereference(dev_queue->qdisc_sleeping);
	struct Qdisc *qdisc_default = _qdisc_default;

	if (qdisc) {
		rcu_assign_pointer(dev_queue->qdisc, qdisc_default);
		rcu_assign_pointer(dev_queue->qdisc_sleeping, qdisc_default);

		qdisc_put(qdisc);
	}
}

/* Per-queue callback: create a root qdisc for @dev_queue and store it in
 * qdisc_sleeping. The ops are default_qdisc_ops, except noqueue for
 * IFF_NO_QUEUE devices and pfifo_fast for ARPHRD_CAN devices. On creation
 * failure the queue is left unchanged. Qdiscs of non-multiqueue devices
 * additionally get TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT.
 */
static void attach_one_default_qdisc(struct net_device *dev,
				     struct netdev_queue *dev_queue,
				     void *_unused)
{
	struct Qdisc *qdisc;
	const struct Qdisc_ops *ops = default_qdisc_ops;

	if (dev->priv_flags & IFF_NO_QUEUE)
		ops = &noqueue_qdisc_ops;
	else if(dev->type == ARPHRD_CAN)
		ops = &pfifo_fast_ops;

	qdisc = qdisc_create_dflt(dev_queue, ops, TC_H_ROOT, NULL);
	if (!qdisc)
		return;

	if (!netif_is_multiqueue(dev))
		qdisc->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT;
	rcu_assign_pointer(dev_queue->qdisc_sleeping, qdisc);
}

static void attach_default_qdiscs(struct net_device *dev)
{
	struct netdev_queue *txq;
	struct Qdisc *qdisc;

	txq = netdev_get_tx_queue(dev, 0);

	if (!netif_is_multiqueue(dev) ||
	    dev->priv_flags & IFF_NO_QUEUE) {
		netdev_for_each_tx_queue(dev, attach_one_default_qdisc, NULL);
		qdisc = rtnl_dereference(txq->qdisc_sleeping);
		rcu_assign_pointer(dev->qdisc, qdisc);
		qdisc_refcount_inc(qdisc);
	} else {
		qdisc = qdisc_create_dflt(txq, &mq_qdisc_ops, TC_H_ROOT, NULL);
		if (qdisc) {
			rcu_assign_pointer(dev->qdisc, qdisc);
			qdisc->ops->attach(qdisc);
		}
	}
	qdisc = rtnl_dereference(dev->qdisc);

	/* Detect default qdisc setup/init failed and fallback to "noqueue" */
	if (qdisc == &noop_qdisc) {
		netdev_warn(dev, "default qdisc (%s) fail, fallback to %s\n",
			    default_qdisc_ops->id, noqueue_qdisc_ops.id);
		netdev_for_each_tx_queue(dev, shutdown_scheduler_queue, &noop_qdisc);
dev->priv_flags |= IFF_NO_QUEUE; netdev_for_each_tx_queue(dev, attach_one_default_qdisc, NULL); qdisc = rtnl_dereference(txq->qdisc_sleeping); rcu_assign_pointer(dev->qdisc, qdisc); qdisc_refcount_inc(qdisc); dev->priv_flags ^= IFF_NO_QUEUE; } #ifdef CONFIG_NET_SCHED if (qdisc != &noop_qdisc) qdisc_hash_add(qdisc, false); #endif } static void transition_one_qdisc(struct net_device *dev, struct netdev_queue *dev_queue, void *_need_watchdog) { struct Qdisc *new_qdisc = rtnl_dereference(dev_queue->qdisc_sleeping); int *need_watchdog_p = _need_watchdog; if (!(new_qdisc->flags & TCQ_F_BUILTIN)) clear_bit(__QDISC_STATE_DEACTIVATED, &new_qdisc->state); rcu_assign_pointer(dev_queue->qdisc, new_qdisc); if (need_watchdog_p) { WRITE_ONCE(dev_queue->trans_start, 0); *need_watchdog_p = 1; } } void dev_activate(struct net_device *dev) { int need_watchdog; /* No queueing discipline is attached to device; * create default one for devices, which need queueing * and noqueue_qdisc for virtual interfaces */ if (rtnl_dereference(dev->qdisc) == &noop_qdisc) attach_default_qdiscs(dev); if (!netif_carrier_ok(dev)) /* Delay activation until next carrier-on event */ return; need_watchdog = 0; netdev_for_each_tx_queue(dev, transition_one_qdisc, &need_watchdog); if (dev_ingress_queue(dev)) transition_one_qdisc(dev, dev_ingress_queue(dev), NULL); if (need_watchdog) { netif_trans_update(dev); netdev_watchdog_up(dev); } } EXPORT_SYMBOL(dev_activate); static void qdisc_deactivate(struct Qdisc *qdisc) { if (qdisc->flags & TCQ_F_BUILTIN) return; set_bit(__QDISC_STATE_DEACTIVATED, &qdisc->state); } static void dev_deactivate_queue(struct net_device *dev, struct netdev_queue *dev_queue, void *_sync_needed) { bool *sync_needed = _sync_needed; struct Qdisc *qdisc; qdisc = rtnl_dereference(dev_queue->qdisc); if (qdisc) { if (qdisc->enqueue) *sync_needed = true; qdisc_deactivate(qdisc); rcu_assign_pointer(dev_queue->qdisc, &noop_qdisc); } } static bool some_qdisc_is_busy(struct net_device *dev) { 
unsigned int i; for (i = 0; i < dev->num_tx_queues; i++) { struct netdev_queue *dev_queue; spinlock_t *root_lock; struct Qdisc *q; int val; dev_queue = netdev_get_tx_queue(dev, i); q = rtnl_dereference(dev_queue->qdisc_sleeping); root_lock = qdisc_lock(q); spin_lock_bh(root_lock); val = (qdisc_is_running(q) || test_bit(__QDISC_STATE_SCHED, &q->state)); spin_unlock_bh(root_lock); if (val) return true; } return false; } /** * dev_deactivate_many - deactivate transmissions on several devices * @head: list of devices to deactivate * @reset_needed: qdisc should be reset if true. * * This function returns only when all outstanding transmissions * have completed, unless all devices are in dismantle phase. */ void dev_deactivate_many(struct list_head *head, bool reset_needed) { bool sync_needed = false; struct net_device *dev; list_for_each_entry(dev, head, close_list) { netdev_for_each_tx_queue(dev, dev_deactivate_queue, &sync_needed); if (dev_ingress_queue(dev)) dev_deactivate_queue(dev, dev_ingress_queue(dev), &sync_needed); netdev_watchdog_down(dev); } /* Wait for outstanding qdisc enqueuing calls. */ if (sync_needed) synchronize_net(); if (reset_needed) { list_for_each_entry(dev, head, close_list) { netdev_for_each_tx_queue(dev, dev_reset_queue, NULL); if (dev_ingress_queue(dev)) dev_reset_queue(dev, dev_ingress_queue(dev), NULL); } } /* Wait for outstanding qdisc_run calls. */ list_for_each_entry(dev, head, close_list) { while (some_qdisc_is_busy(dev)) { /* wait_event() would avoid this sleep-loop but would * require expensive checks in the fast paths of packet * processing which isn't worth it. 
*/ schedule_timeout_uninterruptible(1); } } } void dev_deactivate(struct net_device *dev, bool reset_needed) { LIST_HEAD(single); list_add(&dev->close_list, &single); dev_deactivate_many(&single, reset_needed); list_del(&single); } EXPORT_SYMBOL(dev_deactivate); static int qdisc_change_tx_queue_len(struct net_device *dev, struct netdev_queue *dev_queue) { struct Qdisc *qdisc = rtnl_dereference(dev_queue->qdisc_sleeping); const struct Qdisc_ops *ops = qdisc->ops; if (ops->change_tx_queue_len) return ops->change_tx_queue_len(qdisc, dev->tx_queue_len); return 0; } void dev_qdisc_change_real_num_tx(struct net_device *dev, unsigned int new_real_tx) { struct Qdisc *qdisc = rtnl_dereference(dev->qdisc); if (qdisc->ops->change_real_num_tx) qdisc->ops->change_real_num_tx(qdisc, new_real_tx); } void mq_change_real_num_tx(struct Qdisc *sch, unsigned int new_real_tx) { #ifdef CONFIG_NET_SCHED struct net_device *dev = qdisc_dev(sch); struct Qdisc *qdisc; unsigned int i; for (i = new_real_tx; i < dev->real_num_tx_queues; i++) { qdisc = rtnl_dereference(netdev_get_tx_queue(dev, i)->qdisc_sleeping); /* Only update the default qdiscs we created, * qdiscs with handles are always hashed. 
*/ if (qdisc != &noop_qdisc && !qdisc->handle) qdisc_hash_del(qdisc); } for (i = dev->real_num_tx_queues; i < new_real_tx; i++) { qdisc = rtnl_dereference(netdev_get_tx_queue(dev, i)->qdisc_sleeping); if (qdisc != &noop_qdisc && !qdisc->handle) qdisc_hash_add(qdisc, false); } #endif } EXPORT_SYMBOL(mq_change_real_num_tx); int dev_qdisc_change_tx_queue_len(struct net_device *dev) { bool up = dev->flags & IFF_UP; unsigned int i; int ret = 0; if (up) dev_deactivate(dev, false); for (i = 0; i < dev->num_tx_queues; i++) { ret = qdisc_change_tx_queue_len(dev, &dev->_tx[i]); /* TODO: revert changes on a partial failure */ if (ret) break; } if (up) dev_activate(dev); return ret; } static void dev_init_scheduler_queue(struct net_device *dev, struct netdev_queue *dev_queue, void *_qdisc) { struct Qdisc *qdisc = _qdisc; rcu_assign_pointer(dev_queue->qdisc, qdisc); rcu_assign_pointer(dev_queue->qdisc_sleeping, qdisc); } void dev_init_scheduler(struct net_device *dev) { rcu_assign_pointer(dev->qdisc, &noop_qdisc); netdev_for_each_tx_queue(dev, dev_init_scheduler_queue, &noop_qdisc); if (dev_ingress_queue(dev)) dev_init_scheduler_queue(dev, dev_ingress_queue(dev), &noop_qdisc); timer_setup(&dev->watchdog_timer, dev_watchdog, 0); } void dev_shutdown(struct net_device *dev) { netdev_for_each_tx_queue(dev, shutdown_scheduler_queue, &noop_qdisc); if (dev_ingress_queue(dev)) shutdown_scheduler_queue(dev, dev_ingress_queue(dev), &noop_qdisc); qdisc_put(rtnl_dereference(dev->qdisc)); rcu_assign_pointer(dev->qdisc, &noop_qdisc); WARN_ON(timer_pending(&dev->watchdog_timer)); } /** * psched_ratecfg_precompute__() - Pre-compute values for reciprocal division * @rate: Rate to compute reciprocal division values of * @mult: Multiplier for reciprocal division * @shift: Shift for reciprocal division * * The multiplier and shift for reciprocal division by rate are stored * in mult and shift. 
* * The deal here is to replace a divide by a reciprocal one * in fast path (a reciprocal divide is a multiply and a shift) * * Normal formula would be : * time_in_ns = (NSEC_PER_SEC * len) / rate_bps * * We compute mult/shift to use instead : * time_in_ns = (len * mult) >> shift; * * We try to get the highest possible mult value for accuracy, * but have to make sure no overflows will ever happen. * * reciprocal_value() is not used here it doesn't handle 64-bit values. */ static void psched_ratecfg_precompute__(u64 rate, u32 *mult, u8 *shift) { u64 factor = NSEC_PER_SEC; *mult = 1; *shift = 0; if (rate <= 0) return; for (;;) { *mult = div64_u64(factor, rate); if (*mult & (1U << 31) || factor & (1ULL << 63)) break; factor <<= 1; (*shift)++; } } void psched_ratecfg_precompute(struct psched_ratecfg *r, const struct tc_ratespec *conf, u64 rate64) { memset(r, 0, sizeof(*r)); r->overhead = conf->overhead; r->mpu = conf->mpu; r->rate_bytes_ps = max_t(u64, conf->rate, rate64); r->linklayer = (conf->linklayer & TC_LINKLAYER_MASK); psched_ratecfg_precompute__(r->rate_bytes_ps, &r->mult, &r->shift); } EXPORT_SYMBOL(psched_ratecfg_precompute); void psched_ppscfg_precompute(struct psched_pktrate *r, u64 pktrate64) { r->rate_pkts_ps = pktrate64; psched_ratecfg_precompute__(r->rate_pkts_ps, &r->mult, &r->shift); } EXPORT_SYMBOL(psched_ppscfg_precompute); void mini_qdisc_pair_swap(struct mini_Qdisc_pair *miniqp, struct tcf_proto *tp_head) { /* Protected with chain0->filter_chain_lock. * Can't access chain directly because tp_head can be NULL. */ struct mini_Qdisc *miniq_old = rcu_dereference_protected(*miniqp->p_miniq, 1); struct mini_Qdisc *miniq; if (!tp_head) { RCU_INIT_POINTER(*miniqp->p_miniq, NULL); } else { miniq = miniq_old != &miniqp->miniq1 ? &miniqp->miniq1 : &miniqp->miniq2; /* We need to make sure that readers won't see the miniq * we are about to modify. So ensure that at least one RCU * grace period has elapsed since the miniq was made * inactive. 
*/ if (IS_ENABLED(CONFIG_PREEMPT_RT)) cond_synchronize_rcu(miniq->rcu_state); else if (!poll_state_synchronize_rcu(miniq->rcu_state)) synchronize_rcu_expedited(); miniq->filter_list = tp_head; rcu_assign_pointer(*miniqp->p_miniq, miniq); } if (miniq_old) /* This is counterpart of the rcu sync above. We need to * block potential new user of miniq_old until all readers * are not seeing it. */ miniq_old->rcu_state = start_poll_synchronize_rcu(); } EXPORT_SYMBOL(mini_qdisc_pair_swap); void mini_qdisc_pair_block_init(struct mini_Qdisc_pair *miniqp, struct tcf_block *block) { miniqp->miniq1.block = block; miniqp->miniq2.block = block; } EXPORT_SYMBOL(mini_qdisc_pair_block_init); void mini_qdisc_pair_init(struct mini_Qdisc_pair *miniqp, struct Qdisc *qdisc, struct mini_Qdisc __rcu **p_miniq) { miniqp->miniq1.cpu_bstats = qdisc->cpu_bstats; miniqp->miniq1.cpu_qstats = qdisc->cpu_qstats; miniqp->miniq2.cpu_bstats = qdisc->cpu_bstats; miniqp->miniq2.cpu_qstats = qdisc->cpu_qstats; miniqp->miniq1.rcu_state = get_state_synchronize_rcu(); miniqp->miniq2.rcu_state = miniqp->miniq1.rcu_state; miniqp->p_miniq = p_miniq; } EXPORT_SYMBOL(mini_qdisc_pair_init);
2 2 1 2 2 1 3 3 1 3 2 3 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 // SPDX-License-Identifier: GPL-2.0 #include <linux/kernel.h> #include <linux/errno.h> #include <linux/fs.h> #include <linux/file.h> #include <linux/mm.h> #include <linux/slab.h> #include <linux/namei.h> #include <linux/io_uring.h> #include <uapi/linux/fadvise.h> #include <uapi/linux/io_uring.h> #include "io_uring.h" #include "advise.h" struct io_fadvise { struct file *file; u64 offset; u64 len; u32 advice; }; struct io_madvise { struct file *file; u64 addr; u64 len; u32 advice; }; int io_madvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { #if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU) struct io_madvise *ma = io_kiocb_to_cmd(req, struct io_madvise); if (sqe->buf_index || sqe->splice_fd_in) return -EINVAL; ma->addr = READ_ONCE(sqe->addr); ma->len = READ_ONCE(sqe->off); if (!ma->len) ma->len = READ_ONCE(sqe->len); ma->advice = READ_ONCE(sqe->fadvise_advice); req->flags |= REQ_F_FORCE_ASYNC; return 0; #else return -EOPNOTSUPP; #endif } int io_madvise(struct io_kiocb *req, unsigned int issue_flags) { #if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU) struct io_madvise *ma = io_kiocb_to_cmd(req, struct io_madvise); int ret; WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK); ret = do_madvise(current->mm, ma->addr, ma->len, ma->advice); io_req_set_res(req, ret, 0); return IOU_COMPLETE; #else return -EOPNOTSUPP; #endif } static bool io_fadvise_force_async(struct io_fadvise *fa) { switch (fa->advice) { case POSIX_FADV_NORMAL: case POSIX_FADV_RANDOM: case POSIX_FADV_SEQUENTIAL: return false; default: return true; } } int io_fadvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { 
struct io_fadvise *fa = io_kiocb_to_cmd(req, struct io_fadvise); if (sqe->buf_index || sqe->splice_fd_in) return -EINVAL; fa->offset = READ_ONCE(sqe->off); fa->len = READ_ONCE(sqe->addr); if (!fa->len) fa->len = READ_ONCE(sqe->len); fa->advice = READ_ONCE(sqe->fadvise_advice); if (io_fadvise_force_async(fa)) req->flags |= REQ_F_FORCE_ASYNC; return 0; } int io_fadvise(struct io_kiocb *req, unsigned int issue_flags) { struct io_fadvise *fa = io_kiocb_to_cmd(req, struct io_fadvise); int ret; WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK && io_fadvise_force_async(fa)); ret = vfs_fadvise(req->file, fa->offset, fa->len, fa->advice); if (ret < 0) req_set_fail(req); io_req_set_res(req, ret, 0); return IOU_COMPLETE; }
29 29 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 // SPDX-License-Identifier: GPL-2.0 /* * Provides code common for host and device side USB. * * If either host side (ie. CONFIG_USB=y) or device side USB stack * (ie. CONFIG_USB_GADGET=y) is compiled in the kernel, this module is * compiled-in as well. 
Otherwise, if either of the two stacks is * compiled as module, this file is compiled as module as well. */ #include <linux/kernel.h> #include <linux/module.h> #include <linux/of.h> #include <linux/platform_device.h> #include <linux/usb/ch9.h> #include <linux/usb/of.h> #include <linux/usb/otg.h> #include <linux/of_platform.h> #include <linux/debugfs.h> #include "common.h" static const char *const ep_type_names[] = { [USB_ENDPOINT_XFER_CONTROL] = "ctrl", [USB_ENDPOINT_XFER_ISOC] = "isoc", [USB_ENDPOINT_XFER_BULK] = "bulk", [USB_ENDPOINT_XFER_INT] = "intr", }; /** * usb_ep_type_string() - Returns human readable-name of the endpoint type. * @ep_type: The endpoint type to return human-readable name for. If it's not * any of the types: USB_ENDPOINT_XFER_{CONTROL, ISOC, BULK, INT}, * usually got by usb_endpoint_type(), the string 'unknown' will be returned. */ const char *usb_ep_type_string(int ep_type) { if (ep_type < 0 || ep_type >= ARRAY_SIZE(ep_type_names)) return "unknown"; return ep_type_names[ep_type]; } EXPORT_SYMBOL_GPL(usb_ep_type_string); /** * usb_otg_state_string() - returns human readable name of OTG state. * @state: the OTG state to return the human readable name of. If it's not * any of the states defined in usb_otg_state enum, 'UNDEFINED' will be * returned. 
*/ const char *usb_otg_state_string(enum usb_otg_state state) { static const char *const names[] = { [OTG_STATE_A_IDLE] = "a_idle", [OTG_STATE_A_WAIT_VRISE] = "a_wait_vrise", [OTG_STATE_A_WAIT_BCON] = "a_wait_bcon", [OTG_STATE_A_HOST] = "a_host", [OTG_STATE_A_SUSPEND] = "a_suspend", [OTG_STATE_A_PERIPHERAL] = "a_peripheral", [OTG_STATE_A_WAIT_VFALL] = "a_wait_vfall", [OTG_STATE_A_VBUS_ERR] = "a_vbus_err", [OTG_STATE_B_IDLE] = "b_idle", [OTG_STATE_B_SRP_INIT] = "b_srp_init", [OTG_STATE_B_PERIPHERAL] = "b_peripheral", [OTG_STATE_B_WAIT_ACON] = "b_wait_acon", [OTG_STATE_B_HOST] = "b_host", }; if (state < 0 || state >= ARRAY_SIZE(names)) return "UNDEFINED"; return names[state]; } EXPORT_SYMBOL_GPL(usb_otg_state_string); static const char *const speed_names[] = { [USB_SPEED_UNKNOWN] = "UNKNOWN", [USB_SPEED_LOW] = "low-speed", [USB_SPEED_FULL] = "full-speed", [USB_SPEED_HIGH] = "high-speed", [USB_SPEED_WIRELESS] = "wireless", [USB_SPEED_SUPER] = "super-speed", [USB_SPEED_SUPER_PLUS] = "super-speed-plus", }; static const char *const ssp_rate[] = { [USB_SSP_GEN_UNKNOWN] = "UNKNOWN", [USB_SSP_GEN_2x1] = "super-speed-plus-gen2x1", [USB_SSP_GEN_1x2] = "super-speed-plus-gen1x2", [USB_SSP_GEN_2x2] = "super-speed-plus-gen2x2", }; /** * usb_speed_string() - Returns human readable-name of the speed. * @speed: The speed to return human-readable name for. If it's not * any of the speeds defined in usb_device_speed enum, string for * USB_SPEED_UNKNOWN will be returned. */ const char *usb_speed_string(enum usb_device_speed speed) { if (speed < 0 || speed >= ARRAY_SIZE(speed_names)) speed = USB_SPEED_UNKNOWN; return speed_names[speed]; } EXPORT_SYMBOL_GPL(usb_speed_string); /** * usb_get_maximum_speed - Get maximum requested speed for a given USB * controller. * @dev: Pointer to the given USB controller device * * The function gets the maximum speed string from property "maximum-speed", * and returns the corresponding enum usb_device_speed. 
*/ enum usb_device_speed usb_get_maximum_speed(struct device *dev) { const char *p = "maximum-speed"; int ret; ret = device_property_match_property_string(dev, p, ssp_rate, ARRAY_SIZE(ssp_rate)); if (ret > 0) return USB_SPEED_SUPER_PLUS; ret = device_property_match_property_string(dev, p, speed_names, ARRAY_SIZE(speed_names)); if (ret > 0) return ret; return USB_SPEED_UNKNOWN; } EXPORT_SYMBOL_GPL(usb_get_maximum_speed); /** * usb_get_maximum_ssp_rate - Get the signaling rate generation and lane count * of a SuperSpeed Plus capable device. * @dev: Pointer to the given USB controller device * * If the string from "maximum-speed" property is super-speed-plus-genXxY where * 'X' is the generation number and 'Y' is the number of lanes, then this * function returns the corresponding enum usb_ssp_rate. */ enum usb_ssp_rate usb_get_maximum_ssp_rate(struct device *dev) { const char *maximum_speed; int ret; ret = device_property_read_string(dev, "maximum-speed", &maximum_speed); if (ret < 0) return USB_SSP_GEN_UNKNOWN; ret = match_string(ssp_rate, ARRAY_SIZE(ssp_rate), maximum_speed); return (ret < 0) ? USB_SSP_GEN_UNKNOWN : ret; } EXPORT_SYMBOL_GPL(usb_get_maximum_ssp_rate); /** * usb_state_string - Returns human readable name for the state. * @state: The state to return a human-readable name for. If it's not * any of the states devices in usb_device_state_string enum, * the string UNKNOWN will be returned. 
*/ const char *usb_state_string(enum usb_device_state state) { static const char *const names[] = { [USB_STATE_NOTATTACHED] = "not attached", [USB_STATE_ATTACHED] = "attached", [USB_STATE_POWERED] = "powered", [USB_STATE_RECONNECTING] = "reconnecting", [USB_STATE_UNAUTHENTICATED] = "unauthenticated", [USB_STATE_DEFAULT] = "default", [USB_STATE_ADDRESS] = "addressed", [USB_STATE_CONFIGURED] = "configured", [USB_STATE_SUSPENDED] = "suspended", }; if (state < 0 || state >= ARRAY_SIZE(names)) return "UNKNOWN"; return names[state]; } EXPORT_SYMBOL_GPL(usb_state_string); static const char *const usb_dr_modes[] = { [USB_DR_MODE_UNKNOWN] = "", [USB_DR_MODE_HOST] = "host", [USB_DR_MODE_PERIPHERAL] = "peripheral", [USB_DR_MODE_OTG] = "otg", }; /** * usb_get_dr_mode_from_string() - Get dual role mode for given string * @str: String to find the corresponding dual role mode for * * This function performs a lookup for the given string and returns the * corresponding enum usb_dr_mode. If no match for the string could be found, * 'USB_DR_MODE_UNKNOWN' is returned. */ static enum usb_dr_mode usb_get_dr_mode_from_string(const char *str) { int ret; ret = match_string(usb_dr_modes, ARRAY_SIZE(usb_dr_modes), str); return (ret < 0) ? USB_DR_MODE_UNKNOWN : ret; } enum usb_dr_mode usb_get_dr_mode(struct device *dev) { const char *dr_mode; int err; err = device_property_read_string(dev, "dr_mode", &dr_mode); if (err < 0) return USB_DR_MODE_UNKNOWN; return usb_get_dr_mode_from_string(dr_mode); } EXPORT_SYMBOL_GPL(usb_get_dr_mode); /** * usb_get_role_switch_default_mode - Get default mode for given device * @dev: Pointer to the given device * * The function gets string from property 'role-switch-default-mode', * and returns the corresponding enum usb_dr_mode. 
*/ enum usb_dr_mode usb_get_role_switch_default_mode(struct device *dev) { const char *str; int ret; ret = device_property_read_string(dev, "role-switch-default-mode", &str); if (ret < 0) return USB_DR_MODE_UNKNOWN; return usb_get_dr_mode_from_string(str); } EXPORT_SYMBOL_GPL(usb_get_role_switch_default_mode); /** * usb_decode_interval - Decode bInterval into the time expressed in 1us unit * @epd: The descriptor of the endpoint * @speed: The speed that the endpoint works as * * Function returns the interval expressed in 1us unit for servicing * endpoint for data transfers. */ unsigned int usb_decode_interval(const struct usb_endpoint_descriptor *epd, enum usb_device_speed speed) { unsigned int interval = 0; switch (usb_endpoint_type(epd)) { case USB_ENDPOINT_XFER_CONTROL: /* uframes per NAK */ if (speed == USB_SPEED_HIGH) interval = epd->bInterval; break; case USB_ENDPOINT_XFER_ISOC: interval = 1 << (epd->bInterval - 1); break; case USB_ENDPOINT_XFER_BULK: /* uframes per NAK */ if (speed == USB_SPEED_HIGH && usb_endpoint_dir_out(epd)) interval = epd->bInterval; break; case USB_ENDPOINT_XFER_INT: if (speed >= USB_SPEED_HIGH) interval = 1 << (epd->bInterval - 1); else interval = epd->bInterval; break; } interval *= (speed >= USB_SPEED_HIGH) ? 125 : 1000; return interval; } EXPORT_SYMBOL_GPL(usb_decode_interval); #ifdef CONFIG_OF /** * of_usb_get_dr_mode_by_phy - Get dual role mode for the controller device * which is associated with the given phy device_node * @np: Pointer to the given phy device_node * @arg0: phandle args[0] for phy's with #phy-cells >= 1, or -1 for * phys which do not have phy-cells * * In dts a usb controller associates with phy devices. The function gets * the string from property 'dr_mode' of the controller associated with the * given phy device node, and returns the correspondig enum usb_dr_mode. 
*/
enum usb_dr_mode of_usb_get_dr_mode_by_phy(struct device_node *np, int arg0)
{
	struct device_node *controller;
	struct of_phandle_args args;
	const char *dr_mode;
	int index;
	int err;

	/* Scan every available node that has a "phys" property. */
	for_each_node_with_property(controller, "phys") {
		if (!of_device_is_available(controller))
			continue;
		index = 0;
		/* Walk the entries of this controller's "phys" list. */
		do {
			if (arg0 == -1) {
				/* Phy without cells: plain phandle lookup. */
				args.np = of_parse_phandle(controller, "phys",
							index);
				args.args_count = 0;
			} else {
				err = of_parse_phandle_with_args(controller,
							"phys", "#phy-cells",
							index, &args);
				if (err)
					break;
			}

			/*
			 * The reference is dropped right away; from here on
			 * args.np is only compared, never dereferenced.
			 */
			of_node_put(args.np);
			if (args.np == np && (args.args_count == 0 ||
					      args.args[0] == arg0))
				goto finish;
			index++;
		} while (args.np);
	}

finish:
	/*
	 * NOTE(review): when no controller matched, this is reached by
	 * falling out of the loop; presumably `controller` is NULL at that
	 * point and the read below fails - confirm against the iterator.
	 */
	err = of_property_read_string(controller, "dr_mode", &dr_mode);
	of_node_put(controller);

	if (err < 0)
		return USB_DR_MODE_UNKNOWN;

	return usb_get_dr_mode_from_string(dr_mode);
}
EXPORT_SYMBOL_GPL(of_usb_get_dr_mode_by_phy);

/**
 * of_usb_host_tpl_support - to get if Targeted Peripheral List is supported
 * for given targeted hosts (non-PC hosts)
 * @np: Pointer to the given device_node
 *
 * The function gets if the targeted hosts support TPL or not
 *
 * Return: true when @np carries the boolean "tpl-support" property.
 */
bool of_usb_host_tpl_support(struct device_node *np)
{
	return of_property_read_bool(np, "tpl-support");
}
EXPORT_SYMBOL_GPL(of_usb_host_tpl_support);

/**
 * of_usb_update_otg_caps - to update usb otg capabilities according to
 * the passed properties in DT.
* @np: Pointer to the given device_node * @otg_caps: Pointer to the target usb_otg_caps to be set * * The function updates the otg capabilities */ int of_usb_update_otg_caps(struct device_node *np, struct usb_otg_caps *otg_caps) { u32 otg_rev; if (!otg_caps) return -EINVAL; if (!of_property_read_u32(np, "otg-rev", &otg_rev)) { switch (otg_rev) { case 0x0100: case 0x0120: case 0x0130: case 0x0200: /* Choose the lesser one if it's already been set */ if (otg_caps->otg_rev) otg_caps->otg_rev = min_t(u16, otg_rev, otg_caps->otg_rev); else otg_caps->otg_rev = otg_rev; break; default: pr_err("%pOF: unsupported otg-rev: 0x%x\n", np, otg_rev); return -EINVAL; } } else { /* * otg-rev is mandatory for otg properties, if not passed * we set it to be 0 and assume it's a legacy otg device. * Non-dt platform can set it afterwards. */ otg_caps->otg_rev = 0; } if (of_property_read_bool(np, "hnp-disable")) otg_caps->hnp_support = false; if (of_property_read_bool(np, "srp-disable")) otg_caps->srp_support = false; if (of_property_read_bool(np, "adp-disable") || (otg_caps->otg_rev < 0x0200)) otg_caps->adp_support = false; return 0; } EXPORT_SYMBOL_GPL(of_usb_update_otg_caps); /** * usb_of_get_companion_dev - Find the companion device * @dev: the device pointer to find a companion * * Find the companion device from platform bus. * * Takes a reference to the returned struct device which needs to be dropped * after use. * * Return: On success, a pointer to the companion device, %NULL on failure. */ struct device *usb_of_get_companion_dev(struct device *dev) { struct device_node *node; struct platform_device *pdev = NULL; node = of_parse_phandle(dev->of_node, "companion", 0); if (node) pdev = of_find_device_by_node(node); of_node_put(node); return pdev ? 
&pdev->dev : NULL; } EXPORT_SYMBOL_GPL(usb_of_get_companion_dev); #endif struct dentry *usb_debug_root; EXPORT_SYMBOL_GPL(usb_debug_root); DEFINE_MUTEX(usb_dynids_lock); EXPORT_SYMBOL_GPL(usb_dynids_lock); static int __init usb_common_init(void) { usb_debug_root = debugfs_create_dir("usb", NULL); ledtrig_usb_init(); return 0; } static void __exit usb_common_exit(void) { ledtrig_usb_exit(); debugfs_remove_recursive(usb_debug_root); } subsys_initcall(usb_common_init); module_exit(usb_common_exit); MODULE_DESCRIPTION("Common code for host and device side USB"); MODULE_LICENSE("GPL");
1 1 1 1 1 1 1 1 6 6 6 6 1 5 5 3 3 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 // SPDX-License-Identifier: GPL-2.0 /* * Implement the manual drop-all-pagecache function */ #include <linux/pagemap.h> #include <linux/kernel.h> #include <linux/mm.h> #include <linux/fs.h> #include <linux/writeback.h> #include <linux/sysctl.h> #include <linux/gfp.h> #include <linux/swap.h> #include "internal.h" /* A global variable is a bit ugly, but it keeps the code simple */ static int sysctl_drop_caches; static void drop_pagecache_sb(struct super_block *sb, void *unused) { struct inode *inode, *toput_inode = NULL; spin_lock(&sb->s_inode_list_lock); list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { spin_lock(&inode->i_lock); /* * We must skip inodes in unusual state. We may also skip * inodes without pages but we deliberately won't in case * we need to reschedule to avoid softlockups. 
*/ if ((inode_state_read(inode) & (I_FREEING | I_WILL_FREE | I_NEW)) || (mapping_empty(inode->i_mapping) && !need_resched())) { spin_unlock(&inode->i_lock); continue; } __iget(inode); spin_unlock(&inode->i_lock); spin_unlock(&sb->s_inode_list_lock); invalidate_mapping_pages(inode->i_mapping, 0, -1); iput(toput_inode); toput_inode = inode; cond_resched(); spin_lock(&sb->s_inode_list_lock); } spin_unlock(&sb->s_inode_list_lock); iput(toput_inode); } static int drop_caches_sysctl_handler(const struct ctl_table *table, int write, void *buffer, size_t *length, loff_t *ppos) { int ret; ret = proc_dointvec_minmax(table, write, buffer, length, ppos); if (ret) return ret; if (write) { static int stfu; if (sysctl_drop_caches & 1) { lru_add_drain_all(); iterate_supers(drop_pagecache_sb, NULL); count_vm_event(DROP_PAGECACHE); } if (sysctl_drop_caches & 2) { drop_slab(); count_vm_event(DROP_SLAB); } if (!stfu) { pr_info("%s (%d): drop_caches: %d\n", current->comm, task_pid_nr(current), sysctl_drop_caches); } stfu |= sysctl_drop_caches & 4; } return 0; } static const struct ctl_table drop_caches_table[] = { { .procname = "drop_caches", .data = &sysctl_drop_caches, .maxlen = sizeof(int), .mode = 0200, .proc_handler = drop_caches_sysctl_handler, .extra1 = SYSCTL_ONE, .extra2 = SYSCTL_FOUR, }, }; static int __init init_vm_drop_caches_sysctls(void) { register_sysctl_init("vm", drop_caches_table); return 0; } fs_initcall(init_vm_drop_caches_sysctls);
2 2 2 9 9 9 9 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 
524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 
1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 
1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 
1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 /* * Copyright (c) 2001 The Regents of the University of Michigan. * All rights reserved. * * Kendrick Smith <kmsmith@umich.edu> * Andy Adamson <andros@umich.edu> * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its * contributors may be used to endorse or promote products derived * from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include <linux/nfs4.h> #include <linux/sunrpc/clnt.h> #include <linux/sunrpc/xprt.h> #include <linux/sunrpc/svc_xprt.h> #include <linux/slab.h> #include "nfsd.h" #include "state.h" #include "netns.h" #include "trace.h" #include "xdr4cb.h" #include "xdr4.h" #include "nfs4xdr_gen.h" #define NFSDDBG_FACILITY NFSDDBG_PROC #define NFSPROC4_CB_NULL 0 #define NFSPROC4_CB_COMPOUND 1 /* Index of predefined Linux callback client operations */ struct nfs4_cb_compound_hdr { /* args */ u32 ident; /* minorversion 0 only */ u32 nops; __be32 *nops_p; u32 minorversion; /* res */ int status; }; static __be32 *xdr_encode_empty_array(__be32 *p) { *p++ = xdr_zero; return p; } /* * Encode/decode NFSv4 CB basic data types * * Basic NFSv4 callback data types are defined in section 15 of RFC * 3530: "Network File System (NFS) version 4 Protocol" and section * 20 of RFC 5661: "Network File System (NFS) Version 4 Minor Version * 1 Protocol" */ static void encode_uint32(struct xdr_stream *xdr, u32 n) { WARN_ON_ONCE(xdr_stream_encode_u32(xdr, n) < 0); } static void encode_bitmap4(struct xdr_stream *xdr, const __u32 *bitmap, size_t len) { xdr_stream_encode_uint32_array(xdr, bitmap, len); } static int decode_cb_fattr4(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs4_cb_fattr *fattr) { fattr->ncf_cb_change = 0; fattr->ncf_cb_fsize = 0; fattr->ncf_cb_atime.tv_sec = 0; fattr->ncf_cb_atime.tv_nsec = 0; fattr->ncf_cb_mtime.tv_sec = 0; fattr->ncf_cb_mtime.tv_nsec = 0; if (bitmap[0] & FATTR4_WORD0_CHANGE) if (xdr_stream_decode_u64(xdr, &fattr->ncf_cb_change) < 0) return -EIO; if (bitmap[0] & FATTR4_WORD0_SIZE) if (xdr_stream_decode_u64(xdr, &fattr->ncf_cb_fsize) < 0) return -EIO; if (bitmap[2] & FATTR4_WORD2_TIME_DELEG_ACCESS) { fattr4_time_deleg_access access; if (!xdrgen_decode_fattr4_time_deleg_access(xdr, &access)) return -EIO; fattr->ncf_cb_atime.tv_sec = access.seconds; fattr->ncf_cb_atime.tv_nsec = access.nseconds; } if (bitmap[2] & FATTR4_WORD2_TIME_DELEG_MODIFY) { 
fattr4_time_deleg_modify modify; if (!xdrgen_decode_fattr4_time_deleg_modify(xdr, &modify)) return -EIO; fattr->ncf_cb_mtime.tv_sec = modify.seconds; fattr->ncf_cb_mtime.tv_nsec = modify.nseconds; } return 0; } static void encode_nfs_cb_opnum4(struct xdr_stream *xdr, enum nfs_cb_opnum4 op) { __be32 *p; p = xdr_reserve_space(xdr, 4); *p = cpu_to_be32(op); } /* * nfs_fh4 * * typedef opaque nfs_fh4<NFS4_FHSIZE>; */ static void encode_nfs_fh4(struct xdr_stream *xdr, const struct knfsd_fh *fh) { u32 length = fh->fh_size; __be32 *p; BUG_ON(length > NFS4_FHSIZE); p = xdr_reserve_space(xdr, 4 + length); xdr_encode_opaque(p, &fh->fh_raw, length); } /* * stateid4 * * struct stateid4 { * uint32_t seqid; * opaque other[12]; * }; */ static void encode_stateid4(struct xdr_stream *xdr, const stateid_t *sid) { __be32 *p; p = xdr_reserve_space(xdr, NFS4_STATEID_SIZE); *p++ = cpu_to_be32(sid->si_generation); xdr_encode_opaque_fixed(p, &sid->si_opaque, NFS4_STATEID_OTHER_SIZE); } /* * sessionid4 * * typedef opaque sessionid4[NFS4_SESSIONID_SIZE]; */ static void encode_sessionid4(struct xdr_stream *xdr, const struct nfsd4_session *session) { __be32 *p; p = xdr_reserve_space(xdr, NFS4_MAX_SESSIONID_LEN); xdr_encode_opaque_fixed(p, session->se_sessionid.data, NFS4_MAX_SESSIONID_LEN); } /* * nfsstat4 */ static const struct { int stat; int errno; } nfs_cb_errtbl[] = { { NFS4_OK, 0 }, { NFS4ERR_PERM, -EPERM }, { NFS4ERR_NOENT, -ENOENT }, { NFS4ERR_IO, -EIO }, { NFS4ERR_NXIO, -ENXIO }, { NFS4ERR_ACCESS, -EACCES }, { NFS4ERR_EXIST, -EEXIST }, { NFS4ERR_XDEV, -EXDEV }, { NFS4ERR_NOTDIR, -ENOTDIR }, { NFS4ERR_ISDIR, -EISDIR }, { NFS4ERR_INVAL, -EINVAL }, { NFS4ERR_FBIG, -EFBIG }, { NFS4ERR_NOSPC, -ENOSPC }, { NFS4ERR_ROFS, -EROFS }, { NFS4ERR_MLINK, -EMLINK }, { NFS4ERR_NAMETOOLONG, -ENAMETOOLONG }, { NFS4ERR_NOTEMPTY, -ENOTEMPTY }, { NFS4ERR_DQUOT, -EDQUOT }, { NFS4ERR_STALE, -ESTALE }, { NFS4ERR_BADHANDLE, -EBADHANDLE }, { NFS4ERR_BAD_COOKIE, -EBADCOOKIE }, { NFS4ERR_NOTSUPP, -ENOTSUPP }, { 
NFS4ERR_TOOSMALL, -ETOOSMALL }, { NFS4ERR_SERVERFAULT, -ESERVERFAULT }, { NFS4ERR_BADTYPE, -EBADTYPE }, { NFS4ERR_LOCKED, -EAGAIN }, { NFS4ERR_RESOURCE, -EREMOTEIO }, { NFS4ERR_SYMLINK, -ELOOP }, { NFS4ERR_OP_ILLEGAL, -EOPNOTSUPP }, { NFS4ERR_DEADLOCK, -EDEADLK }, { -1, -EIO } }; /* * If we cannot translate the error, the recovery routines should * handle it. * * Note: remaining NFSv4 error codes have values > 10000, so should * not conflict with native Linux error codes. */ static int nfs_cb_stat_to_errno(int status) { int i; for (i = 0; nfs_cb_errtbl[i].stat != -1; i++) { if (nfs_cb_errtbl[i].stat == status) return nfs_cb_errtbl[i].errno; } dprintk("NFSD: Unrecognized NFS CB status value: %u\n", status); return -status; } static int decode_cb_op_status(struct xdr_stream *xdr, enum nfs_cb_opnum4 expected, int *status) { __be32 *p; u32 op; p = xdr_inline_decode(xdr, 4 + 4); if (unlikely(p == NULL)) goto out_overflow; op = be32_to_cpup(p++); if (unlikely(op != expected)) goto out_unexpected; *status = nfs_cb_stat_to_errno(be32_to_cpup(p)); return 0; out_overflow: return -EIO; out_unexpected: dprintk("NFSD: Callback server returned operation %d but " "we issued a request for %d\n", op, expected); return -EIO; } /* * CB_COMPOUND4args * * struct CB_COMPOUND4args { * utf8str_cs tag; * uint32_t minorversion; * uint32_t callback_ident; * nfs_cb_argop4 argarray<>; * }; */ static void encode_cb_compound4args(struct xdr_stream *xdr, struct nfs4_cb_compound_hdr *hdr) { __be32 * p; p = xdr_reserve_space(xdr, 4 + 4 + 4 + 4); p = xdr_encode_empty_array(p); /* empty tag */ *p++ = cpu_to_be32(hdr->minorversion); *p++ = cpu_to_be32(hdr->ident); hdr->nops_p = p; *p = cpu_to_be32(hdr->nops); /* argarray element count */ } /* * Update argarray element count */ static void encode_cb_nops(struct nfs4_cb_compound_hdr *hdr) { BUG_ON(hdr->nops > NFS4_MAX_BACK_CHANNEL_OPS); *hdr->nops_p = cpu_to_be32(hdr->nops); } /* * CB_COMPOUND4res * * struct CB_COMPOUND4res { * nfsstat4 status; * 
utf8str_cs tag; * nfs_cb_resop4 resarray<>; * }; */ static int decode_cb_compound4res(struct xdr_stream *xdr, struct nfs4_cb_compound_hdr *hdr) { u32 length; __be32 *p; p = xdr_inline_decode(xdr, XDR_UNIT); if (unlikely(p == NULL)) goto out_overflow; hdr->status = be32_to_cpup(p); /* Ignore the tag */ if (xdr_stream_decode_u32(xdr, &length) < 0) goto out_overflow; if (xdr_inline_decode(xdr, length) == NULL) goto out_overflow; if (xdr_stream_decode_u32(xdr, &hdr->nops) < 0) goto out_overflow; return 0; out_overflow: return -EIO; } /* * CB_RECALL4args * * struct CB_RECALL4args { * stateid4 stateid; * bool truncate; * nfs_fh4 fh; * }; */ static void encode_cb_recall4args(struct xdr_stream *xdr, const struct nfs4_delegation *dp, struct nfs4_cb_compound_hdr *hdr) { __be32 *p; encode_nfs_cb_opnum4(xdr, OP_CB_RECALL); encode_stateid4(xdr, &dp->dl_stid.sc_stateid); p = xdr_reserve_space(xdr, 4); *p++ = xdr_zero; /* truncate */ encode_nfs_fh4(xdr, &dp->dl_stid.sc_file->fi_fhandle); hdr->nops++; } /* * CB_RECALLANY4args * * struct CB_RECALLANY4args { * uint32_t craa_objects_to_keep; * bitmap4 craa_type_mask; * }; */ static void encode_cb_recallany4args(struct xdr_stream *xdr, struct nfs4_cb_compound_hdr *hdr, struct nfsd4_cb_recall_any *ra) { encode_nfs_cb_opnum4(xdr, OP_CB_RECALL_ANY); encode_uint32(xdr, ra->ra_keep); encode_bitmap4(xdr, ra->ra_bmval, ARRAY_SIZE(ra->ra_bmval)); hdr->nops++; } /* * CB_GETATTR4args * struct CB_GETATTR4args { * nfs_fh4 fh; * bitmap4 attr_request; * }; * * The size and change attributes are the only one * guaranteed to be serviced by the client. 
*/ static void encode_cb_getattr4args(struct xdr_stream *xdr, struct nfs4_cb_compound_hdr *hdr, struct nfs4_cb_fattr *fattr) { struct nfs4_delegation *dp = container_of(fattr, struct nfs4_delegation, dl_cb_fattr); struct knfsd_fh *fh = &dp->dl_stid.sc_file->fi_fhandle; struct nfs4_cb_fattr *ncf = &dp->dl_cb_fattr; u32 bmap_size = 1; u32 bmap[3]; bmap[0] = FATTR4_WORD0_SIZE; if (!ncf->ncf_file_modified) bmap[0] |= FATTR4_WORD0_CHANGE; if (deleg_attrs_deleg(dp->dl_type)) { bmap[1] = 0; bmap[2] = FATTR4_WORD2_TIME_DELEG_ACCESS | FATTR4_WORD2_TIME_DELEG_MODIFY; bmap_size = 3; } encode_nfs_cb_opnum4(xdr, OP_CB_GETATTR); encode_nfs_fh4(xdr, fh); encode_bitmap4(xdr, bmap, bmap_size); hdr->nops++; } static u32 highest_slotid(struct nfsd4_session *ses) { u32 idx; spin_lock(&ses->se_lock); idx = fls(~ses->se_cb_slot_avail); if (idx > 0) --idx; idx = max(idx, ses->se_cb_highest_slot); spin_unlock(&ses->se_lock); return idx; } static void encode_referring_call4(struct xdr_stream *xdr, const struct nfsd4_referring_call *rc) { encode_uint32(xdr, rc->rc_sequenceid); encode_uint32(xdr, rc->rc_slotid); } static void encode_referring_call_list4(struct xdr_stream *xdr, const struct nfsd4_referring_call_list *rcl) { struct nfsd4_referring_call *rc; __be32 *p; p = xdr_reserve_space(xdr, NFS4_MAX_SESSIONID_LEN); xdr_encode_opaque_fixed(p, rcl->rcl_sessionid.data, NFS4_MAX_SESSIONID_LEN); encode_uint32(xdr, rcl->__nr_referring_calls); list_for_each_entry(rc, &rcl->rcl_referring_calls, __list) encode_referring_call4(xdr, rc); } /* * CB_SEQUENCE4args * * struct CB_SEQUENCE4args { * sessionid4 csa_sessionid; * sequenceid4 csa_sequenceid; * slotid4 csa_slotid; * slotid4 csa_highest_slotid; * bool csa_cachethis; * referring_call_list4 csa_referring_call_lists<>; * }; */ static void encode_cb_sequence4args(struct xdr_stream *xdr, const struct nfsd4_callback *cb, struct nfs4_cb_compound_hdr *hdr) { struct nfsd4_session *session = cb->cb_clp->cl_cb_session; struct nfsd4_referring_call_list *rcl; 
__be32 *p; if (hdr->minorversion == 0) return; encode_nfs_cb_opnum4(xdr, OP_CB_SEQUENCE); encode_sessionid4(xdr, session); p = xdr_reserve_space(xdr, XDR_UNIT * 4); *p++ = cpu_to_be32(session->se_cb_seq_nr[cb->cb_held_slot]); /* csa_sequenceid */ *p++ = cpu_to_be32(cb->cb_held_slot); /* csa_slotid */ *p++ = cpu_to_be32(highest_slotid(session)); /* csa_highest_slotid */ *p++ = xdr_zero; /* csa_cachethis */ /* csa_referring_call_lists */ encode_uint32(xdr, cb->cb_nr_referring_call_list); list_for_each_entry(rcl, &cb->cb_referring_call_list, __list) encode_referring_call_list4(xdr, rcl); hdr->nops++; } static void update_cb_slot_table(struct nfsd4_session *ses, u32 target) { /* No need to do anything if nothing changed */ if (likely(target == READ_ONCE(ses->se_cb_highest_slot))) return; spin_lock(&ses->se_lock); if (target > ses->se_cb_highest_slot) { int i; target = min(target, NFSD_BC_SLOT_TABLE_SIZE - 1); /* * Growing the slot table. Reset any new sequences to 1. * * NB: There is some debate about whether the RFC requires this, * but the Linux client expects it. */ for (i = ses->se_cb_highest_slot + 1; i <= target; ++i) ses->se_cb_seq_nr[i] = 1; } ses->se_cb_highest_slot = target; spin_unlock(&ses->se_lock); } /* * CB_SEQUENCE4resok * * struct CB_SEQUENCE4resok { * sessionid4 csr_sessionid; * sequenceid4 csr_sequenceid; * slotid4 csr_slotid; * slotid4 csr_highest_slotid; * slotid4 csr_target_highest_slotid; * }; * * union CB_SEQUENCE4res switch (nfsstat4 csr_status) { * case NFS4_OK: * CB_SEQUENCE4resok csr_resok4; * default: * void; * }; * * Our current back channel implementation supports a single backchannel * with a single slot. */ static int decode_cb_sequence4resok(struct xdr_stream *xdr, struct nfsd4_callback *cb) { struct nfsd4_session *session = cb->cb_clp->cl_cb_session; int status = -ESERVERFAULT; __be32 *p; u32 seqid, slotid, target; /* * If the server returns different values for sessionID, slotID or * sequence number, the server is looney tunes. 
*/ p = xdr_inline_decode(xdr, NFS4_MAX_SESSIONID_LEN + 4 + 4 + 4 + 4); if (unlikely(p == NULL)) goto out_overflow; if (memcmp(p, session->se_sessionid.data, NFS4_MAX_SESSIONID_LEN)) { dprintk("NFS: %s Invalid session id\n", __func__); goto out; } p += XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN); seqid = be32_to_cpup(p++); if (seqid != session->se_cb_seq_nr[cb->cb_held_slot]) { dprintk("NFS: %s Invalid sequence number\n", __func__); goto out; } slotid = be32_to_cpup(p++); if (slotid != cb->cb_held_slot) { dprintk("NFS: %s Invalid slotid\n", __func__); goto out; } p++; // ignore current highest slot value target = be32_to_cpup(p++); update_cb_slot_table(session, target); status = 0; out: cb->cb_seq_status = status; return status; out_overflow: status = -EIO; goto out; } static int decode_cb_sequence4res(struct xdr_stream *xdr, struct nfsd4_callback *cb) { int status; if (cb->cb_clp->cl_minorversion == 0) return 0; status = decode_cb_op_status(xdr, OP_CB_SEQUENCE, &cb->cb_seq_status); if (unlikely(status || cb->cb_seq_status)) return status; return decode_cb_sequence4resok(xdr, cb); } /* * NFSv4.0 and NFSv4.1 XDR encode functions * * NFSv4.0 callback argument types are defined in section 15 of RFC * 3530: "Network File System (NFS) version 4 Protocol" and section 20 * of RFC 5661: "Network File System (NFS) Version 4 Minor Version 1 * Protocol". */ /* * NB: Without this zero space reservation, callbacks over krb5p fail */ static void nfs4_xdr_enc_cb_null(struct rpc_rqst *req, struct xdr_stream *xdr, const void *__unused) { xdr_reserve_space(xdr, 0); } /* * 20.1. 
Operation 3: CB_GETATTR - Get Attributes */ static void nfs4_xdr_enc_cb_getattr(struct rpc_rqst *req, struct xdr_stream *xdr, const void *data) { const struct nfsd4_callback *cb = data; struct nfs4_cb_fattr *ncf = container_of(cb, struct nfs4_cb_fattr, ncf_getattr); struct nfs4_cb_compound_hdr hdr = { .ident = cb->cb_clp->cl_cb_ident, .minorversion = cb->cb_clp->cl_minorversion, }; encode_cb_compound4args(xdr, &hdr); encode_cb_sequence4args(xdr, cb, &hdr); encode_cb_getattr4args(xdr, &hdr, ncf); encode_cb_nops(&hdr); } /* * 20.2. Operation 4: CB_RECALL - Recall a Delegation */ static void nfs4_xdr_enc_cb_recall(struct rpc_rqst *req, struct xdr_stream *xdr, const void *data) { const struct nfsd4_callback *cb = data; const struct nfs4_delegation *dp = cb_to_delegation(cb); struct nfs4_cb_compound_hdr hdr = { .ident = cb->cb_clp->cl_cb_ident, .minorversion = cb->cb_clp->cl_minorversion, }; encode_cb_compound4args(xdr, &hdr); encode_cb_sequence4args(xdr, cb, &hdr); encode_cb_recall4args(xdr, dp, &hdr); encode_cb_nops(&hdr); } /* * 20.6. Operation 8: CB_RECALL_ANY - Keep Any N Recallable Objects */ static void nfs4_xdr_enc_cb_recall_any(struct rpc_rqst *req, struct xdr_stream *xdr, const void *data) { const struct nfsd4_callback *cb = data; struct nfsd4_cb_recall_any *ra; struct nfs4_cb_compound_hdr hdr = { .ident = cb->cb_clp->cl_cb_ident, .minorversion = cb->cb_clp->cl_minorversion, }; ra = container_of(cb, struct nfsd4_cb_recall_any, ra_cb); encode_cb_compound4args(xdr, &hdr); encode_cb_sequence4args(xdr, cb, &hdr); encode_cb_recallany4args(xdr, &hdr, ra); encode_cb_nops(&hdr); } /* * NFSv4.0 and NFSv4.1 XDR decode functions * * NFSv4.0 callback result types are defined in section 15 of RFC * 3530: "Network File System (NFS) version 4 Protocol" and section 20 * of RFC 5661: "Network File System (NFS) Version 4 Minor Version 1 * Protocol". */ static int nfs4_xdr_dec_cb_null(struct rpc_rqst *req, struct xdr_stream *xdr, void *__unused) { return 0; } /* * 20.1. 
Operation 3: CB_GETATTR - Get Attributes */ static int nfs4_xdr_dec_cb_getattr(struct rpc_rqst *rqstp, struct xdr_stream *xdr, void *data) { struct nfsd4_callback *cb = data; struct nfs4_cb_compound_hdr hdr; int status; u32 bitmap[3] = {0}; u32 attrlen, maxlen; struct nfs4_cb_fattr *ncf = container_of(cb, struct nfs4_cb_fattr, ncf_getattr); status = decode_cb_compound4res(xdr, &hdr); if (unlikely(status)) return status; status = decode_cb_sequence4res(xdr, cb); if (unlikely(status || cb->cb_seq_status)) return status; status = decode_cb_op_status(xdr, OP_CB_GETATTR, &cb->cb_status); if (unlikely(status || cb->cb_status)) return status; if (xdr_stream_decode_uint32_array(xdr, bitmap, 3) < 0) return -EIO; if (xdr_stream_decode_u32(xdr, &attrlen) < 0) return -EIO; maxlen = sizeof(ncf->ncf_cb_change) + sizeof(ncf->ncf_cb_fsize); if (bitmap[2] != 0) maxlen += (sizeof(ncf->ncf_cb_mtime.tv_sec) + sizeof(ncf->ncf_cb_mtime.tv_nsec)) * 2; if (attrlen > maxlen) return -EIO; status = decode_cb_fattr4(xdr, bitmap, ncf); return status; } /* * 20.2. Operation 4: CB_RECALL - Recall a Delegation */ static int nfs4_xdr_dec_cb_recall(struct rpc_rqst *rqstp, struct xdr_stream *xdr, void *data) { struct nfsd4_callback *cb = data; struct nfs4_cb_compound_hdr hdr; int status; status = decode_cb_compound4res(xdr, &hdr); if (unlikely(status)) return status; status = decode_cb_sequence4res(xdr, cb); if (unlikely(status || cb->cb_seq_status)) return status; return decode_cb_op_status(xdr, OP_CB_RECALL, &cb->cb_status); } /* * 20.6. 
Operation 8: CB_RECALL_ANY - Keep Any N Recallable Objects */ static int nfs4_xdr_dec_cb_recall_any(struct rpc_rqst *rqstp, struct xdr_stream *xdr, void *data) { struct nfsd4_callback *cb = data; struct nfs4_cb_compound_hdr hdr; int status; status = decode_cb_compound4res(xdr, &hdr); if (unlikely(status)) return status; status = decode_cb_sequence4res(xdr, cb); if (unlikely(status || cb->cb_seq_status)) return status; status = decode_cb_op_status(xdr, OP_CB_RECALL_ANY, &cb->cb_status); return status; } #ifdef CONFIG_NFSD_PNFS /* * CB_LAYOUTRECALL4args * * struct layoutrecall_file4 { * nfs_fh4 lor_fh; * offset4 lor_offset; * length4 lor_length; * stateid4 lor_stateid; * }; * * union layoutrecall4 switch(layoutrecall_type4 lor_recalltype) { * case LAYOUTRECALL4_FILE: * layoutrecall_file4 lor_layout; * case LAYOUTRECALL4_FSID: * fsid4 lor_fsid; * case LAYOUTRECALL4_ALL: * void; * }; * * struct CB_LAYOUTRECALL4args { * layouttype4 clora_type; * layoutiomode4 clora_iomode; * bool clora_changed; * layoutrecall4 clora_recall; * }; */ static void encode_cb_layout4args(struct xdr_stream *xdr, const struct nfs4_layout_stateid *ls, struct nfs4_cb_compound_hdr *hdr) { __be32 *p; BUG_ON(hdr->minorversion == 0); p = xdr_reserve_space(xdr, 5 * 4); *p++ = cpu_to_be32(OP_CB_LAYOUTRECALL); *p++ = cpu_to_be32(ls->ls_layout_type); *p++ = cpu_to_be32(IOMODE_ANY); *p++ = cpu_to_be32(1); *p = cpu_to_be32(RETURN_FILE); encode_nfs_fh4(xdr, &ls->ls_stid.sc_file->fi_fhandle); p = xdr_reserve_space(xdr, 2 * 8); p = xdr_encode_hyper(p, 0); xdr_encode_hyper(p, NFS4_MAX_UINT64); encode_stateid4(xdr, &ls->ls_recall_sid); hdr->nops++; } static void nfs4_xdr_enc_cb_layout(struct rpc_rqst *req, struct xdr_stream *xdr, const void *data) { const struct nfsd4_callback *cb = data; const struct nfs4_layout_stateid *ls = container_of(cb, struct nfs4_layout_stateid, ls_recall); struct nfs4_cb_compound_hdr hdr = { .ident = 0, .minorversion = cb->cb_clp->cl_minorversion, }; encode_cb_compound4args(xdr, 
&hdr); encode_cb_sequence4args(xdr, cb, &hdr); encode_cb_layout4args(xdr, ls, &hdr); encode_cb_nops(&hdr); } static int nfs4_xdr_dec_cb_layout(struct rpc_rqst *rqstp, struct xdr_stream *xdr, void *data) { struct nfsd4_callback *cb = data; struct nfs4_cb_compound_hdr hdr; int status; status = decode_cb_compound4res(xdr, &hdr); if (unlikely(status)) return status; status = decode_cb_sequence4res(xdr, cb); if (unlikely(status || cb->cb_seq_status)) return status; return decode_cb_op_status(xdr, OP_CB_LAYOUTRECALL, &cb->cb_status); } #endif /* CONFIG_NFSD_PNFS */ static void encode_stateowner(struct xdr_stream *xdr, struct nfs4_stateowner *so) { __be32 *p; p = xdr_reserve_space(xdr, 8 + 4 + so->so_owner.len); p = xdr_encode_opaque_fixed(p, &so->so_client->cl_clientid, 8); xdr_encode_opaque(p, so->so_owner.data, so->so_owner.len); } static void nfs4_xdr_enc_cb_notify_lock(struct rpc_rqst *req, struct xdr_stream *xdr, const void *data) { const struct nfsd4_callback *cb = data; const struct nfsd4_blocked_lock *nbl = container_of(cb, struct nfsd4_blocked_lock, nbl_cb); struct nfs4_lockowner *lo = (struct nfs4_lockowner *)nbl->nbl_lock.c.flc_owner; struct nfs4_cb_compound_hdr hdr = { .ident = 0, .minorversion = cb->cb_clp->cl_minorversion, }; __be32 *p; BUG_ON(hdr.minorversion == 0); encode_cb_compound4args(xdr, &hdr); encode_cb_sequence4args(xdr, cb, &hdr); p = xdr_reserve_space(xdr, 4); *p = cpu_to_be32(OP_CB_NOTIFY_LOCK); encode_nfs_fh4(xdr, &nbl->nbl_fh); encode_stateowner(xdr, &lo->lo_owner); hdr.nops++; encode_cb_nops(&hdr); } static int nfs4_xdr_dec_cb_notify_lock(struct rpc_rqst *rqstp, struct xdr_stream *xdr, void *data) { struct nfsd4_callback *cb = data; struct nfs4_cb_compound_hdr hdr; int status; status = decode_cb_compound4res(xdr, &hdr); if (unlikely(status)) return status; status = decode_cb_sequence4res(xdr, cb); if (unlikely(status || cb->cb_seq_status)) return status; return decode_cb_op_status(xdr, OP_CB_NOTIFY_LOCK, &cb->cb_status); } /* * struct 
write_response4 {
 *	stateid4	wr_callback_id<1>;
 *	length4		wr_count;
 *	stable_how4	wr_committed;
 *	verifier4	wr_writeverf;
 * };
 * union offload_info4 switch (nfsstat4 coa_status) {
 *	case NFS4_OK:
 *		write_response4	coa_resok4;
 *	default:
 *		length4		coa_bytes_copied;
 * };
 * struct CB_OFFLOAD4args {
 *	nfs_fh4		coa_fh;
 *	stateid4	coa_stateid;
 *	offload_info4	coa_offload_info;
 * };
 */
/*
 * Encode the offload_info4 union: the status word from cbo->co_nfserr,
 * followed by either the write_response4 arm (status nfs_ok) or an
 * eight-byte zero byte count (any other status).
 *
 * NOTE(review): the pointers returned by xdr_reserve_space() are used
 * without a NULL check here -- confirm the send buffer is always large
 * enough.
 */
static void encode_offload_info4(struct xdr_stream *xdr,
				 const struct nfsd4_cb_offload *cbo)
{
	__be32 *p;

	p = xdr_reserve_space(xdr, 4);
	/* Stored without cpu_to_be32(); presumably co_nfserr is already __be32 */
	*p = cbo->co_nfserr;
	switch (cbo->co_nfserr) {
	case nfs_ok:
		/* empty callback-id array + byte count + stable_how + verifier */
		p = xdr_reserve_space(xdr, 4 + 8 + 4 + NFS4_VERIFIER_SIZE);
		p = xdr_encode_empty_array(p);
		p = xdr_encode_hyper(p, cbo->co_res.wr_bytes_written);
		*p++ = cpu_to_be32(cbo->co_res.wr_stable_how);
		p = xdr_encode_opaque_fixed(p, cbo->co_res.wr_verifier.data,
					    NFS4_VERIFIER_SIZE);
		break;
	default:
		p = xdr_reserve_space(xdr, 8);
		/* We always return success if bytes were written */
		p = xdr_encode_hyper(p, 0);
	}
}

/*
 * Encode one CB_OFFLOAD operation: opcode, file handle, callback stateid
 * and the offload_info4 union, then count the operation in @hdr.
 */
static void encode_cb_offload4args(struct xdr_stream *xdr,
				   const struct nfsd4_cb_offload *cbo,
				   struct nfs4_cb_compound_hdr *hdr)
{
	__be32 *p;

	p = xdr_reserve_space(xdr, 4);
	*p = cpu_to_be32(OP_CB_OFFLOAD);
	encode_nfs_fh4(xdr, &cbo->co_fh);
	encode_stateid4(xdr, &cbo->co_res.cb_stateid);
	encode_offload_info4(xdr, cbo);

	hdr->nops++;
}

/*
 * XDR encoder for the CB_OFFLOAD procedure: compound header, CB_SEQUENCE
 * arguments, CB_OFFLOAD arguments, then encode_cb_nops().
 *
 * @data is the nfsd4_callback embedded (as co_cb) in a nfsd4_cb_offload.
 */
static void nfs4_xdr_enc_cb_offload(struct rpc_rqst *req,
				    struct xdr_stream *xdr,
				    const void *data)
{
	const struct nfsd4_callback *cb = data;
	const struct nfsd4_cb_offload *cbo =
		container_of(cb, struct nfsd4_cb_offload, co_cb);
	struct nfs4_cb_compound_hdr hdr = {
		.ident = 0,
		.minorversion = cb->cb_clp->cl_minorversion,
	};

	encode_cb_compound4args(xdr, &hdr);
	encode_cb_sequence4args(xdr, cb, &hdr);
	encode_cb_offload4args(xdr, cbo, &hdr);
	encode_cb_nops(&hdr);
}

/*
 * XDR decoder for the CB_OFFLOAD reply.
 *
 * Returns the decode status of the compound header or of CB_SEQUENCE if
 * either is non-zero; also returns early (with that status) when
 * cb->cb_seq_status is non-zero.  Otherwise returns the result of
 * decoding the CB_OFFLOAD operation status into cb->cb_status.
 */
static int nfs4_xdr_dec_cb_offload(struct rpc_rqst *rqstp,
				   struct xdr_stream *xdr,
				   void *data)
{
	struct nfsd4_callback *cb = data;
	struct nfs4_cb_compound_hdr hdr;
	int status;

	status = decode_cb_compound4res(xdr, &hdr);
	if (unlikely(status))
		return status;

	status = decode_cb_sequence4res(xdr, cb);
	if (unlikely(status || cb->cb_seq_status))
		return status;

	return decode_cb_op_status(xdr, OP_CB_OFFLOAD, &cb->cb_status);
}

/*
 * RPC procedure tables
 */
/* Build one rpc_procinfo initializer, placed at index NFSPROC4_CLNT_<proc>. */
#define PROC(proc, call, argtype, restype)				\
[NFSPROC4_CLNT_##proc] = {						\
	.p_proc    = NFSPROC4_CB_##call,				\
	.p_encode  = nfs4_xdr_enc_##argtype,				\
	.p_decode  = nfs4_xdr_dec_##restype,				\
	.p_arglen  = NFS4_enc_##argtype##_sz,				\
	.p_replen  = NFS4_dec_##restype##_sz,				\
	.p_statidx = NFSPROC4_CLNT_##proc,				\
	.p_name    = #proc,						\
}

/* Callback procedures; nfsd4_init_cb() indexes this table by callback op. */
static const struct rpc_procinfo nfs4_cb_procedures[] = {
	PROC(CB_NULL,	NULL,		cb_null,	cb_null),
	PROC(CB_RECALL,	COMPOUND,	cb_recall,	cb_recall),
#ifdef CONFIG_NFSD_PNFS
	PROC(CB_LAYOUT,	COMPOUND,	cb_layout,	cb_layout),
#endif
	PROC(CB_NOTIFY_LOCK,	COMPOUND,	cb_notify_lock,	cb_notify_lock),
	PROC(CB_OFFLOAD,	COMPOUND,	cb_offload,	cb_offload),
	PROC(CB_RECALL_ANY,	COMPOUND,	cb_recall_any,	cb_recall_any),
	PROC(CB_GETATTR,	COMPOUND,	cb_getattr,	cb_getattr),
};

#define NFS4_CB_PROGRAM 0x40000000
#define NFS4_CB_VERSION 1

/*
 * Callback RPC program description kept per nfsd_net (reached through
 * nn->nfsd_cb): one version entry, the version table that points at it,
 * the program itself and its statistics.
 */
struct nfsd_net_cb {
	struct rpc_version version4;
	const struct rpc_version *versions[NFS4_CB_VERSION + 1];
	struct rpc_program program;
	struct rpc_stat stat;
};

/*
 * Callback RPC timeout for @net, in jiffies: one tenth of the lease time
 * (at least one second), or 360 seconds if the lease exceeds an hour.
 */
static int max_cb_time(struct net *net)
{
	struct nfsd_net *nn = net_generic(net, nfsd_net_id);

	/*
	 * nfsd4_lease is set to at most one hour in __nfsd4_write_time,
	 * so we can use 32-bit math on it. Warn if that assumption
	 * ever stops being true.
	 */
	if (WARN_ON_ONCE(nn->nfsd4_lease > 3600))
		return 360 * HZ;

	return max(((u32)nn->nfsd4_lease)/10, 1u) * HZ;
}

/* Queue @cb on its client's callback workqueue; returns queue_work()'s result. */
static bool nfsd4_queue_cb(struct nfsd4_callback *cb)
{
	struct nfs4_client *clp = cb->cb_clp;

	trace_nfsd_cb_queue(clp, cb);
	return queue_work(clp->cl_callback_wq, &cb->cb_work);
}

/*
 * Mark @cb for requeueing (acted on in nfsd4_cb_release()) and clear the
 * task status, unless the client is being killed.
 */
static void nfsd4_requeue_cb(struct rpc_task *task, struct nfsd4_callback *cb)
{
	struct nfs4_client *clp = cb->cb_clp;

	if (!test_bit(NFSD4_CLIENT_CB_KILL, &clp->cl_flags)) {
		trace_nfsd_cb_restart(clp, cb);
		task->tk_status = 0;
		set_bit(NFSD4_CALLBACK_REQUEUE, &cb->cb_flags);
	}
}

/* Account for one more in-flight callback on @clp. */
static void nfsd41_cb_inflight_begin(struct nfs4_client *clp)
{
	atomic_inc(&clp->cl_cb_inflight);
}

/* Drop one in-flight callback; wakes waiters when the count reaches zero. */
static void nfsd41_cb_inflight_end(struct nfs4_client *clp)
{

	atomic_dec_and_wake_up(&clp->cl_cb_inflight);
}

/* Sleep until @clp has no in-flight callbacks. */
static void nfsd41_cb_inflight_wait_complete(struct nfs4_client *clp)
{
	wait_var_event(&clp->cl_cb_inflight,
			!atomic_read(&clp->cl_cb_inflight));
}

/*
 * Obtain the credential used for callback RPCs.
 *
 * Minor version 0: set the RPC client's principal (the client's target
 * principal, or "nfs" if none) and return a reference to the machine cred.
 * Later minor versions: return a fresh kernel cred carrying the fsuid and
 * fsgid from the session's callback security parameters, or NULL if it
 * cannot be allocated.
 */
static const struct cred *get_backchannel_cred(struct nfs4_client *clp, struct rpc_clnt *client, struct nfsd4_session *ses)
{
	if (clp->cl_minorversion == 0) {
		client->cl_principal = clp->cl_cred.cr_targ_princ ?
			clp->cl_cred.cr_targ_princ : "nfs";

		return get_cred(rpc_machine_cred());
	} else {
		struct cred *kcred;

		kcred = prepare_kernel_cred(&init_task);
		if (!kcred)
			return NULL;

		kcred->fsuid = ses->se_cb_sec.uid;
		kcred->fsgid = ses->se_cb_sec.gid;
		return kcred;
	}
}

/*
 * Create the callback RPC client for @clp from the parameters in @conn
 * (and @ses, for minor versions above 0) and install it, together with
 * its credential, in @clp.
 *
 * Returns 0 on success; -EINVAL for unusable parameters, the error from
 * rpc_create(), or -ENOMEM if no credential could be obtained.
 */
static int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *conn, struct nfsd4_session *ses)
{
	struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
	int maxtime = max_cb_time(clp->net);
	struct rpc_timeout	timeparms = {
		.to_initval	= maxtime,
		.to_retries	= 0,
		.to_maxval	= maxtime,
	};
	struct rpc_create_args args = {
		.net		= clp->net,
		.address	= (struct sockaddr *) &conn->cb_addr,
		.addrsize	= conn->cb_addrlen,
		.saddress	= (struct sockaddr *) &conn->cb_saddr,
		.timeout	= &timeparms,
		.version	= NFS4_CB_VERSION,
		.flags		= (RPC_CLNT_CREATE_NOPING | RPC_CLNT_CREATE_QUIET),
		.cred		= current_cred(),
	};
	struct rpc_clnt *client;
	const struct cred *cred;

	args.program = &nn->nfsd_cb->program;
	if (clp->cl_minorversion == 0) {
		/* A GSS flavor needs a principal to call back with */
		if (!clp->cl_cred.cr_principal &&
		    (clp->cl_cred.cr_flavor >= RPC_AUTH_GSS_KRB5)) {
			trace_nfsd_cb_setup_err(clp, -EINVAL);
			return -EINVAL;
		}
		args.client_name = clp->cl_cred.cr_principal;
		args.prognumber	= conn->cb_prog;
		args.protocol = XPRT_TRANSPORT_TCP;
		args.authflavor = clp->cl_cred.cr_flavor;
		clp->cl_cb_ident = conn->cb_ident;
	} else {
		/* Backchannel: needs both a transport and a session */
		if (!conn->cb_xprt || !ses)
			return -EINVAL;
		clp->cl_cb_session = ses;
		args.bc_xprt = conn->cb_xprt;
		args.prognumber = clp->cl_cb_session->se_cb_prog;
		args.protocol = conn->cb_xprt->xpt_class->xcl_ident |
				XPRT_TRANSPORT_BC;
		args.authflavor = ses->se_cb_sec.flavor;
	}
	/* Create RPC client */
	client = rpc_create(&args);
	if (IS_ERR(client)) {
		trace_nfsd_cb_setup_err(clp, PTR_ERR(client));
		return PTR_ERR(client);
	}
	cred = get_backchannel_cred(clp, client, ses);
	if (!cred) {
		trace_nfsd_cb_setup_err(clp, -ENOMEM);
		rpc_shutdown_client(client);
		return -ENOMEM;
	}

	/* Remember the transport in use; nfsd4_process_cb_update() puts it */
	if (clp->cl_minorversion != 0)
		clp->cl_cb_conn.cb_xprt = conn->cb_xprt;
	clp->cl_cb_client = client;
	clp->cl_cb_cred = cred;
	rcu_read_lock();
	trace_nfsd_cb_setup(clp, rpc_peeraddr2str(client, RPC_DISPLAY_NETID),
			    args.authflavor);
	rcu_read_unlock();
	return 0;
}

/* Record a new callback channel state, tracing only actual changes. */
static void nfsd4_mark_cb_state(struct nfs4_client *clp, int newstate)
{
	if (clp->cl_cb_state != newstate) {
		clp->cl_cb_state = newstate;
		trace_nfsd_cb_new_state(clp);
	}
}

/* Mark the callback channel down, unless an update is already pending. */
static void nfsd4_mark_cb_down(struct nfs4_client *clp)
{
	if (test_bit(NFSD4_CLIENT_CB_UPDATE, &clp->cl_flags))
		return;
	nfsd4_mark_cb_state(clp, NFSD4_CB_DOWN);
}

/* Mark the callback channel faulty, unless an update is already pending. */
static void nfsd4_mark_cb_fault(struct nfs4_client *clp)
{
	if (test_bit(NFSD4_CLIENT_CB_UPDATE, &clp->cl_flags))
		return;
	nfsd4_mark_cb_state(clp, NFSD4_CB_FAULT);
}

/* Probe completion: channel is down on any RPC error, up otherwise. */
static void nfsd4_cb_probe_done(struct rpc_task *task, void *calldata)
{
	struct nfs4_client *clp = container_of(calldata, struct nfs4_client, cl_cb_null);

	if (task->tk_status)
		nfsd4_mark_cb_down(clp);
	else
		nfsd4_mark_cb_state(clp, NFSD4_CB_UP);
}

/* Probe release: drop the in-flight count taken in nfsd4_run_cb(). */
static void nfsd4_cb_probe_release(void *calldata)
{
	struct nfs4_client *clp = container_of(calldata, struct nfs4_client, cl_cb_null);

	nfsd41_cb_inflight_end(clp);

}

static const struct rpc_call_ops nfsd4_cb_probe_ops = {
	/* XXX: release method to ensure we set the cb channel down if
	 * necessary on early failure? */
	.rpc_call_done = nfsd4_cb_probe_done,
	.rpc_release = nfsd4_cb_probe_release,
};

/*
 * Poke the callback thread to process any updates to the callback
 * parameters, and send a null probe.
*/ void nfsd4_probe_callback(struct nfs4_client *clp) { trace_nfsd_cb_probe(clp); nfsd4_mark_cb_state(clp, NFSD4_CB_UNKNOWN); set_bit(NFSD4_CLIENT_CB_UPDATE, &clp->cl_flags); nfsd4_run_cb(&clp->cl_cb_null); } void nfsd4_probe_callback_sync(struct nfs4_client *clp) { nfsd4_probe_callback(clp); flush_workqueue(clp->cl_callback_wq); } void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *conn) { nfsd4_mark_cb_state(clp, NFSD4_CB_UNKNOWN); spin_lock(&clp->cl_lock); memcpy(&clp->cl_cb_conn, conn, sizeof(struct nfs4_cb_conn)); spin_unlock(&clp->cl_lock); } static int grab_slot(struct nfsd4_session *ses) { int idx; spin_lock(&ses->se_lock); idx = ffs(ses->se_cb_slot_avail) - 1; if (idx < 0 || idx > ses->se_cb_highest_slot) { spin_unlock(&ses->se_lock); return -1; } /* clear the bit for the slot */ ses->se_cb_slot_avail &= ~BIT(idx); spin_unlock(&ses->se_lock); return idx; } /* * There's currently a single callback channel slot. * If the slot is available, then mark it busy. Otherwise, set the * thread for sleeping on the callback RPC wait queue. 
*/
static bool nfsd41_cb_get_slot(struct nfsd4_callback *cb, struct rpc_task *task)
{
	struct nfs4_client *clp = cb->cb_clp;
	struct nfsd4_session *ses = clp->cl_cb_session;

	/* Already holding a slot: nothing to do */
	if (cb->cb_held_slot >= 0)
		return true;
	cb->cb_held_slot = grab_slot(ses);
	if (cb->cb_held_slot < 0) {
		/* Queue the task first, then retry, so that a release is not missed */
		rpc_sleep_on(&clp->cl_cb_waitq, task, NULL);
		/* Race breaker */
		cb->cb_held_slot = grab_slot(ses);
		if (cb->cb_held_slot < 0)
			return false;
		/* Got a slot after all: take the task back off the wait queue */
		rpc_wake_up_queued_task(&clp->cl_cb_waitq, task);
	}
	return true;
}

/*
 * Return the slot held by @cb (if any) to the session's availability
 * mask and wake the next task waiting for a slot.
 */
static void nfsd41_cb_release_slot(struct nfsd4_callback *cb)
{
	struct nfs4_client *clp = cb->cb_clp;
	struct nfsd4_session *ses = clp->cl_cb_session;

	if (cb->cb_held_slot >= 0) {
		spin_lock(&ses->se_lock);
		ses->se_cb_slot_avail |= BIT(cb->cb_held_slot);
		spin_unlock(&ses->se_lock);
		cb->cb_held_slot = -1;
		rpc_wake_up_next(&clp->cl_cb_waitq);
	}
}

/*
 * Finish with @cb: release its slot, clear NFSD4_CALLBACK_RUNNING (waking
 * bit waiters when NFSD4_CALLBACK_WAKE is set), call the optional
 * ->release() method and drop the client's in-flight count.
 */
static void nfsd41_destroy_cb(struct nfsd4_callback *cb)
{
	struct nfs4_client *clp = cb->cb_clp;

	trace_nfsd_cb_destroy(clp, cb);
	nfsd41_cb_release_slot(cb);
	if (test_bit(NFSD4_CALLBACK_WAKE, &cb->cb_flags))
		clear_and_wake_up_bit(NFSD4_CALLBACK_RUNNING, &cb->cb_flags);
	else
		clear_bit(NFSD4_CALLBACK_RUNNING, &cb->cb_flags);

	/* cb_ops is NULL for the probe callback (see nfsd4_run_cb_work()) */
	if (cb->cb_ops && cb->cb_ops->release)
		cb->cb_ops->release(cb);
	nfsd41_cb_inflight_end(clp);
}

/**
 * nfsd41_cb_referring_call - add a referring call to a callback operation
 * @cb: context of callback to add the rc to
 * @sessionid: referring call's session ID
 * @slotid: referring call's session slot index
 * @seqno: referring call's slot sequence number
 *
 * Caller serializes access to @cb.
 *
 * NB: If memory allocation fails, the referring call is not added.
*/
void nfsd41_cb_referring_call(struct nfsd4_callback *cb,
			      struct nfs4_sessionid *sessionid,
			      u32 slotid, u32 seqno)
{
	struct nfsd4_referring_call_list *rcl = NULL;
	struct nfsd4_referring_call_list *list_pos;
	struct nfsd4_referring_call *call_pos;
	struct nfsd4_referring_call *new_call;
	bool already_listed = false;

	might_sleep();

	/* Look for an existing per-session list ... */
	list_for_each_entry(list_pos, &cb->cb_referring_call_list, __list) {
		if (memcmp(list_pos->rcl_sessionid.data, sessionid->data,
			   NFS4_MAX_SESSIONID_LEN) == 0) {
			rcl = list_pos;
			break;
		}
	}

	/* ... and start a new one if this session has none yet. */
	if (!rcl) {
		rcl = kmalloc_obj(*rcl);
		if (!rcl)
			return;
		memcpy(rcl->rcl_sessionid.data, sessionid->data,
		       NFS4_MAX_SESSIONID_LEN);
		rcl->__nr_referring_calls = 0;
		INIT_LIST_HEAD(&rcl->rcl_referring_calls);
		list_add(&rcl->__list, &cb->cb_referring_call_list);
		cb->cb_nr_referring_call_list++;
	}

	/* Each (slot, sequence) pair is recorded at most once per session. */
	list_for_each_entry(call_pos, &rcl->rcl_referring_calls, __list) {
		if (call_pos->rc_slotid == slotid &&
		    call_pos->rc_sequenceid == seqno) {
			already_listed = true;
			break;
		}
	}

	if (!already_listed) {
		new_call = kmalloc_obj(*new_call);
		if (new_call) {
			new_call->rc_sequenceid = seqno;
			new_call->rc_slotid = slotid;
			rcl->__nr_referring_calls++;
			list_add(&new_call->__list, &rcl->rcl_referring_calls);
		}
	}

	/*
	 * A per-session list that holds no calls (its first entry could
	 * not be allocated) is not kept around.
	 */
	if (!rcl->__nr_referring_calls) {
		cb->cb_nr_referring_call_list--;
		list_del(&rcl->__list);
		kfree(rcl);
	}
}

/**
 * nfsd41_cb_destroy_referring_call_list - release referring call info
 * @cb: context of a callback that has completed
 *
 * Callers who allocate referring calls using nfsd41_cb_referring_call() must
 * release those resources by calling nfsd41_cb_destroy_referring_call_list.
 *
 * Caller serializes access to @cb.
*/
void nfsd41_cb_destroy_referring_call_list(struct nfsd4_callback *cb)
{
	struct nfsd4_referring_call_list *rcl;
	struct nfsd4_referring_call *rc;

	/* Pop and free every per-session list, emptying each one first */
	while (!list_empty(&cb->cb_referring_call_list)) {
		rcl = list_first_entry(&cb->cb_referring_call_list,
				       struct nfsd4_referring_call_list,
				       __list);

		while (!list_empty(&rcl->rcl_referring_calls)) {
			rc = list_first_entry(&rcl->rcl_referring_calls,
					      struct nfsd4_referring_call,
					      __list);
			list_del(&rc->__list);
			kfree(rc);
		}
		list_del(&rcl->__list);
		kfree(rcl);
	}
	/*
	 * NOTE(review): cb->cb_nr_referring_call_list is not reset here --
	 * confirm callers re-initialise @cb before reusing it.
	 */
}

/*
 * rpc_call_prepare method: reset the per-call status fields and start the
 * call.  For minor versions above 0 a backchannel slot must be obtained
 * first; when none is free, nfsd41_cb_get_slot() has queued the task on
 * the slot wait queue and the call is not started here.
 */
static void nfsd4_cb_prepare(struct rpc_task *task, void *calldata)
{
	struct nfsd4_callback *cb = calldata;
	struct nfs4_client *clp = cb->cb_clp;
	u32 minorversion = clp->cl_minorversion;

	/*
	 * cb_seq_status is only set in decode_cb_sequence4res,
	 * and so will remain 1 if an rpc level failure occurs.
	 */
	trace_nfsd_cb_rpc_prepare(clp);
	cb->cb_seq_status = 1;
	cb->cb_status = 0;
	if (minorversion && !nfsd41_cb_get_slot(cb, task))
		return;
	rpc_call_start(task);
}

/* Returns true if CB_COMPOUND processing should continue */
static bool nfsd4_cb_sequence_done(struct rpc_task *task, struct nfsd4_callback *cb)
{
	struct nfsd4_session *session = cb->cb_clp->cl_cb_session;
	bool ret = false;

	/* Without a held slot there is no sequence result to look at */
	if (cb->cb_held_slot < 0)
		goto requeue;

	/* This is the operation status code for CB_SEQUENCE */
	trace_nfsd_cb_seq_status(task, cb);
	switch (cb->cb_seq_status) {
	case 0:
		/*
		 * No need for lock, access serialized in nfsd4_cb_prepare
		 *
		 * RFC5661 20.9.3
		 * If CB_SEQUENCE returns an error, then the state of the slot
		 * (sequence ID, cached reply) MUST NOT change.
		 */
		++session->se_cb_seq_nr[cb->cb_held_slot];
		ret = true;
		break;
	case -ESERVERFAULT:
		/*
		 * Call succeeded, but the session, slot index, or slot
		 * sequence number in the response do not match the same
		 * in the server's call. The sequence information is thus
		 * untrustworthy.
		 */
		nfsd4_mark_cb_fault(cb->cb_clp);
		break;
	case 1:
		/*
		 * cb_seq_status remains 1 if an RPC Reply was never
		 * received. NFSD can't know if the client processed
		 * the CB_SEQUENCE operation. Ask the client to send a
		 * DESTROY_SESSION to recover.
		 */
		fallthrough;
	case -NFS4ERR_BADSESSION:
		nfsd4_mark_cb_fault(cb->cb_clp);
		goto requeue;
	case -NFS4ERR_DELAY:
		/* Retry the same call after a two second delay */
		cb->cb_seq_status = 1;
		if (RPC_SIGNALLED(task) || !rpc_restart_call(task))
			goto requeue;
		rpc_delay(task, 2 * HZ);
		return false;
	case -NFS4ERR_SEQ_MISORDERED:
	case -NFS4ERR_BADSLOT:
		/*
		 * A SEQ_MISORDERED or BADSLOT error means that the client and
		 * server are out of sync as to the backchannel parameters. Mark
		 * the backchannel faulty and restart the RPC, but leak the slot
		 * so that it's no longer used.
		 */
		nfsd4_mark_cb_fault(cb->cb_clp);
		cb->cb_held_slot = -1;
		goto retry_nowait;
	default:
		nfsd4_mark_cb_fault(cb->cb_clp);
	}
	/* Common exit for the cases that leave the switch with "break" */
	trace_nfsd_cb_free_slot(task, cb);
	nfsd41_cb_release_slot(cb);
	return ret;
retry_nowait:
	/*
	 * RPC_SIGNALLED() means that the rpc_client is being torn down and
	 * (possibly) recreated. Requeue the call in that case.
	 */
	if (!RPC_SIGNALLED(task)) {
		if (rpc_restart_call_prepare(task))
			return false;
	}
requeue:
	nfsd41_cb_release_slot(cb);
	nfsd4_requeue_cb(task, cb);
	return false;
}

/*
 * rpc_call_done method for callbacks that have cb_ops.
 *
 * Handles requeueing (minor version 0) or the CB_SEQUENCE result (later
 * minor versions), copies a non-zero operation status into the task and
 * then hands over to the callback's ->done() method.  A return of 0 from
 * ->done() restarts the call, 1 completes it; any other value is a bug.
 */
static void nfsd4_cb_done(struct rpc_task *task, void *calldata)
{
	struct nfsd4_callback *cb = calldata;
	struct nfs4_client *clp = cb->cb_clp;

	trace_nfsd_cb_rpc_done(clp);

	if (!clp->cl_minorversion) {
		/*
		 * If the backchannel connection was shut down while this
		 * task was queued, we need to resubmit it after setting up
		 * a new backchannel connection.
		 *
		 * Note that if we lost our callback connection permanently
		 * the submission code will error out, so we don't need to
		 * handle that case here.
		 */
		if (RPC_SIGNALLED(task))
			nfsd4_requeue_cb(task, cb);
	} else if (!nfsd4_cb_sequence_done(task, cb)) {
		return;
	}

	if (cb->cb_status) {
		/* An operation status together with an RPC error is unexpected */
		WARN_ONCE(task->tk_status,
			  "cb_status=%d tk_status=%d cb_opcode=%d",
			  cb->cb_status, task->tk_status, cb->cb_ops->opcode);
		task->tk_status = cb->cb_status;
	}

	switch (cb->cb_ops->done(cb, task)) {
	case 0:
		task->tk_status = 0;
		rpc_restart_call_prepare(task);
		return;
	case 1:
		/* Only these errors take the callback channel down */
		switch (task->tk_status) {
		case -EIO:
		case -ETIMEDOUT:
		case -EACCES:
			nfsd4_mark_cb_down(clp);
		}
		break;
	default:
		BUG();
	}
}

/*
 * rpc_release method: put the callback back on the workqueue if a requeue
 * was requested, otherwise finish with it.
 */
static void nfsd4_cb_release(void *calldata)
{
	struct nfsd4_callback *cb = calldata;

	trace_nfsd_cb_rpc_release(cb->cb_clp);

	if (test_bit(NFSD4_CALLBACK_REQUEUE, &cb->cb_flags))
		nfsd4_queue_cb(cb);
	else
		nfsd41_destroy_cb(cb);

}

static const struct rpc_call_ops nfsd4_cb_ops = {
	.rpc_call_prepare = nfsd4_cb_prepare,
	.rpc_call_done = nfsd4_cb_done,
	.rpc_release = nfsd4_cb_release,
};

/* must be called under the state lock */
void nfsd4_shutdown_callback(struct nfs4_client *clp)
{
	if (clp->cl_cb_state != NFSD4_CB_UNKNOWN)
		trace_nfsd_cb_shutdown(clp);

	set_bit(NFSD4_CLIENT_CB_KILL, &clp->cl_flags);
	/*
	 * Note this won't actually result in a null callback;
	 * instead, the callback work function (nfsd4_run_cb_work(), via
	 * nfsd4_process_cb_update()) will detect the killed
	 * client, destroy the rpc client, and stop:
	 */
	nfsd4_run_cb(&clp->cl_cb_null);
	flush_workqueue(clp->cl_callback_wq);
	nfsd41_cb_inflight_wait_complete(clp);
}

/*
 * Return the first connection, over all of the client's sessions, that is
 * flagged NFS4_CDFC4_BACK, or NULL if there is none.  Caller holds cl_lock.
 */
static struct nfsd4_conn * __nfsd4_find_backchannel(struct nfs4_client *clp)
{
	struct nfsd4_session *s;
	struct nfsd4_conn *c;

	lockdep_assert_held(&clp->cl_lock);

	list_for_each_entry(s, &clp->cl_sessions, se_perclnt) {
		list_for_each_entry(c, &s->se_conns, cn_persession) {
			if (c->cn_flags & NFS4_CDFC4_BACK)
				return c;
		}
	}
	return NULL;
}

/*
 * Note there isn't a lot of locking in this code; instead we depend on
 * the fact that it is run from clp->cl_callback_wq, which won't run two
 * work items at once.
So, for example, clp->cl_callback_wq handles all
 * access of cl_cb_client and all calls to rpc_create or rpc_shutdown_client.
 */
static void nfsd4_process_cb_update(struct nfsd4_callback *cb)
{
	struct nfs4_cb_conn conn;
	struct nfs4_client *clp = cb->cb_clp;
	struct nfsd4_session *ses = NULL;
	struct nfsd4_conn *c;
	int err;

	trace_nfsd_cb_bc_update(clp, cb);

	/*
	 * This is either an update, or the client dying; in either case,
	 * kill the old client:
	 */
	if (clp->cl_cb_client) {
		trace_nfsd_cb_bc_shutdown(clp, cb);
		rpc_shutdown_client(clp->cl_cb_client);
		clp->cl_cb_client = NULL;
		put_cred(clp->cl_cb_cred);
		clp->cl_cb_cred = NULL;
	}
	/* Drop the reference on the transport the old client was using */
	if (clp->cl_cb_conn.cb_xprt) {
		svc_xprt_put(clp->cl_cb_conn.cb_xprt);
		clp->cl_cb_conn.cb_xprt = NULL;
	}
	/* Client is dying: tear-down only, no new callback client */
	if (test_bit(NFSD4_CLIENT_CB_KILL, &clp->cl_flags))
		return;

	spin_lock(&clp->cl_lock);
	/*
	 * Only serialized callback code is allowed to clear these
	 * flags; main nfsd code can only set them:
	 */
	WARN_ON(!(clp->cl_flags & NFSD4_CLIENT_CB_FLAG_MASK));
	clear_bit(NFSD4_CLIENT_CB_UPDATE, &clp->cl_flags);

	/* Snapshot the connection parameters while holding cl_lock */
	memcpy(&conn, &cb->cb_clp->cl_cb_conn, sizeof(struct nfs4_cb_conn));
	c = __nfsd4_find_backchannel(clp);
	if (c) {
		/* Take a transport reference for the new callback client */
		svc_xprt_get(c->cn_xprt);
		conn.cb_xprt = c->cn_xprt;
		ses = c->cn_session;
	}
	spin_unlock(&clp->cl_lock);

	err = setup_callback_client(clp, &conn, ses);
	if (err) {
		nfsd4_mark_cb_down(clp);
		/* Setup failed: give back the reference taken above */
		if (c)
			svc_xprt_put(c->cn_xprt);
		return;
	}
}

/*
 * Work function for every queued callback: apply any pending callback
 * channel update, then submit the callback as an asynchronous RPC.
 * The callback is abandoned if there is no usable RPC client, and
 * queued again if the RPC could not be submitted.
 */
static void
nfsd4_run_cb_work(struct work_struct *work)
{
	struct nfsd4_callback *cb =
		container_of(work, struct nfsd4_callback, cb_work);
	struct nfs4_client *clp = cb->cb_clp;
	struct rpc_clnt *clnt;
	int flags, ret;

	trace_nfsd_cb_start(clp);

	if (clp->cl_flags & NFSD4_CLIENT_CB_FLAG_MASK)
		nfsd4_process_cb_update(cb);

	clnt = clp->cl_cb_client;
	if (!clnt || clp->cl_state == NFSD4_COURTESY) {
		/*
		 * Callback channel broken, client killed or
		 * nfs4_client in courtesy state; give up.
		 */
		nfsd41_destroy_cb(cb);
		return;
	}

	/*
	 * Don't send probe messages for 4.1 or later.
	 */
	if (!cb->cb_ops && clp->cl_minorversion) {
		nfsd4_mark_cb_state(clp, NFSD4_CB_UP);
		nfsd41_destroy_cb(cb);
		return;
	}

	/* ->prepare() runs on first submission only, not when requeued */
	if (!test_and_clear_bit(NFSD4_CALLBACK_REQUEUE, &cb->cb_flags)) {
		if (cb->cb_ops && cb->cb_ops->prepare)
			cb->cb_ops->prepare(cb);
	}

	cb->cb_msg.rpc_cred = clp->cl_cb_cred;
	flags = clp->cl_minorversion ? RPC_TASK_NOCONNECT : RPC_TASK_SOFTCONN;
	/* Callbacks without cb_ops are probes and use the probe call ops */
	ret = rpc_call_async(clnt, &cb->cb_msg, RPC_TASK_SOFT | flags,
			     cb->cb_ops ? &nfsd4_cb_ops : &nfsd4_cb_probe_ops, cb);
	if (ret != 0) {
		/* Submission failed: try again, skipping ->prepare() next time */
		set_bit(NFSD4_CALLBACK_REQUEUE, &cb->cb_flags);
		nfsd4_queue_cb(cb);
	}
}

/*
 * Initialise @cb for client @clp.
 *
 * @ops supplies the callback's methods (the code above treats a NULL @ops
 * as a probe); @op selects the entry of nfs4_cb_procedures[] to call.
 * The callback itself is used as both RPC argument and result.
 */
void nfsd4_init_cb(struct nfsd4_callback *cb, struct nfs4_client *clp,
		const struct nfsd4_callback_ops *ops, enum nfsd4_cb_op op)
{
	cb->cb_clp = clp;
	cb->cb_msg.rpc_proc = &nfs4_cb_procedures[op];
	cb->cb_msg.rpc_argp = cb;
	cb->cb_msg.rpc_resp = cb;
	cb->cb_flags = 0;
	cb->cb_ops = ops;
	INIT_WORK(&cb->cb_work, nfsd4_run_cb_work);
	cb->cb_status = 0;
	/* -1: no backchannel slot held yet */
	cb->cb_held_slot = -1;
	cb->cb_nr_referring_call_list = 0;
	INIT_LIST_HEAD(&cb->cb_referring_call_list);
}

/**
 * nfsd4_run_cb - queue up a callback job to run
 * @cb: callback to queue
 *
 * Kick off a callback to do its thing. Returns false if it was already
 * on a queue, true otherwise.
 */
bool nfsd4_run_cb(struct nfsd4_callback *cb)
{
	struct nfs4_client *clp = cb->cb_clp;
	bool queued;

	/* Count the callback as in flight before it can possibly run */
	nfsd41_cb_inflight_begin(clp);
	queued = nfsd4_queue_cb(cb);
	if (!queued)
		nfsd41_cb_inflight_end(clp);
	return queued;
}

/**
 * nfsd_net_cb_shutdown - release per-netns callback RPC program resources
 * @nn: NFS server network namespace
 *
 * Frees resources allocated by nfsd_net_cb_init().
 */
void nfsd_net_cb_shutdown(struct nfsd_net *nn)
{
	struct nfsd_net_cb *cb = nn->nfsd_cb;

	if (cb) {
		kfree(cb->version4.counts);
		kfree(cb);
		nn->nfsd_cb = NULL;
	}
}

/**
 * nfsd_net_cb_init - initialize per-netns callback RPC program
 * @nn: NFS server network namespace
 *
 * Sets up the callback RPC program, version table, procedure
 * counts, and statistics structure for @nn.
Caller must release
 * these resources using nfsd_net_cb_shutdown().
 *
 * Return: 0 on success, or -ENOMEM if allocation fails.
 */
int nfsd_net_cb_init(struct nfsd_net *nn)
{
	struct nfsd_net_cb *ncb;
	unsigned int *proc_counts;

	ncb = kzalloc(sizeof(*ncb), GFP_KERNEL);
	if (!ncb)
		goto out_nomem;

	/* One call counter per entry of the procedure table */
	proc_counts = kzalloc_objs(unsigned int,
				   ARRAY_SIZE(nfs4_cb_procedures),
				   GFP_KERNEL);
	if (!proc_counts)
		goto out_free;

	/*
	 * Note on the callback rpc program version number: despite language
	 * in rfc 5661 section 18.36.3 requiring servers to use 4 in this
	 * field, the official xdr descriptions for both 4.0 and 4.1 specify
	 * version 1, and in practice that appears to be what implementations
	 * use.  The section 18.36.3 language is expected to be fixed in an
	 * erratum.
	 */
	ncb->version4.number = NFS4_CB_VERSION;
	ncb->version4.nrprocs = ARRAY_SIZE(nfs4_cb_procedures);
	ncb->version4.procs = nfs4_cb_procedures;
	ncb->version4.counts = proc_counts;

	/* The version table is indexed by version number */
	ncb->versions[NFS4_CB_VERSION] = &ncb->version4;

	ncb->program.name = "nfs4_cb";
	ncb->program.number = NFS4_CB_PROGRAM;
	ncb->program.nrvers = ARRAY_SIZE(ncb->versions);
	ncb->program.version = &ncb->versions[0];
	ncb->program.pipe_dir_name = "nfsd4_cb";

	/* Program and statistics refer to each other */
	ncb->program.stats = &ncb->stat;
	ncb->stat.program = &ncb->program;

	nn->nfsd_cb = ncb;
	return 0;

out_free:
	kfree(ncb);
out_nomem:
	return -ENOMEM;
}
36 35 22 34 1 34 30 1 29 29 36 11 8 5 5 5 5 5 5 5 5 5 3 5 5 2 3 3 2 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 // SPDX-License-Identifier: GPL-2.0-only #include <linux/module.h> #include <linux/errno.h> #include <linux/socket.h> #include <linux/udp.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/in6.h> #include <net/udp.h> #include <net/udp_tunnel.h> #include <net/net_namespace.h> #include <net/netns/generic.h> #include <net/ip6_tunnel.h> #include <net/ip6_checksum.h> int udp_sock_create6(struct net *net, struct udp_port_cfg *cfg, struct socket **sockp) { struct sockaddr_in6 udp6_addr = {}; int err; struct socket *sock = NULL; err = sock_create_kern(net, AF_INET6, SOCK_DGRAM, 0, &sock); if (err < 0) goto error; if (cfg->ipv6_v6only) { err = ip6_sock_set_v6only(sock->sk); if (err < 0) goto error; } if (cfg->bind_ifindex) { err = sock_bindtoindex(sock->sk, cfg->bind_ifindex, true); if (err < 0) goto error; } udp6_addr.sin6_family = AF_INET6; memcpy(&udp6_addr.sin6_addr, &cfg->local_ip6, sizeof(udp6_addr.sin6_addr)); udp6_addr.sin6_port = cfg->local_udp_port; err = kernel_bind(sock, (struct sockaddr_unsized *)&udp6_addr, sizeof(udp6_addr)); if (err < 0) goto error; if (cfg->peer_udp_port) { memset(&udp6_addr, 0, sizeof(udp6_addr)); udp6_addr.sin6_family = AF_INET6; memcpy(&udp6_addr.sin6_addr, &cfg->peer_ip6, sizeof(udp6_addr.sin6_addr)); udp6_addr.sin6_port = 
cfg->peer_udp_port; err = kernel_connect(sock, (struct sockaddr_unsized *)&udp6_addr, sizeof(udp6_addr), 0); } if (err < 0) goto error; udp_set_no_check6_tx(sock->sk, !cfg->use_udp6_tx_checksums); udp_set_no_check6_rx(sock->sk, !cfg->use_udp6_rx_checksums); *sockp = sock; return 0; error: if (sock) { kernel_sock_shutdown(sock, SHUT_RDWR); sock_release(sock); } *sockp = NULL; return err; } EXPORT_SYMBOL_GPL(udp_sock_create6); void udp_tunnel6_xmit_skb(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb, struct net_device *dev, const struct in6_addr *saddr, const struct in6_addr *daddr, __u8 prio, __u8 ttl, __be32 label, __be16 src_port, __be16 dst_port, bool nocheck, u16 ip6cb_flags) { struct udphdr *uh; struct ipv6hdr *ip6h; __skb_push(skb, sizeof(*uh)); skb_reset_transport_header(skb); uh = udp_hdr(skb); uh->dest = dst_port; uh->source = src_port; uh->len = htons(skb->len); skb_dst_set(skb, dst); udp6_set_csum(nocheck, skb, saddr, daddr, skb->len); __skb_push(skb, sizeof(*ip6h)); skb_reset_network_header(skb); ip6h = ipv6_hdr(skb); ip6_flow_hdr(ip6h, prio, label); ip6h->payload_len = htons(skb->len); ip6h->nexthdr = IPPROTO_UDP; ip6h->hop_limit = ttl; ip6h->daddr = *daddr; ip6h->saddr = *saddr; ip6tunnel_xmit(sk, skb, dev, ip6cb_flags); } EXPORT_SYMBOL_GPL(udp_tunnel6_xmit_skb); /** * udp_tunnel6_dst_lookup - perform route lookup on UDP tunnel * @skb: Packet for which lookup is done * @dev: Tunnel device * @net: Network namespace of tunnel device * @sock: Socket which provides route info * @oif: Index of the output interface * @saddr: Memory to store the src ip address * @key: Tunnel information * @sport: UDP source port * @dport: UDP destination port * @dsfield: The traffic class field * @dst_cache: The dst cache to use for lookup * This function performs a route lookup on a UDP tunnel * * It returns a valid dst pointer and stores src address to be used in * tunnel in param saddr on success, else a pointer encoded error code. 
*/ struct dst_entry *udp_tunnel6_dst_lookup(struct sk_buff *skb, struct net_device *dev, struct net *net, struct socket *sock, int oif, struct in6_addr *saddr, const struct ip_tunnel_key *key, __be16 sport, __be16 dport, u8 dsfield, struct dst_cache *dst_cache) { struct dst_entry *dst = NULL; struct flowi6 fl6; #ifdef CONFIG_DST_CACHE if (dst_cache) { dst = dst_cache_get_ip6(dst_cache, saddr); if (dst) return dst; } #endif memset(&fl6, 0, sizeof(fl6)); fl6.flowi6_mark = skb->mark; fl6.flowi6_proto = IPPROTO_UDP; fl6.flowi6_oif = oif; fl6.daddr = key->u.ipv6.dst; fl6.saddr = key->u.ipv6.src; fl6.fl6_sport = sport; fl6.fl6_dport = dport; fl6.flowlabel = ip6_make_flowinfo(dsfield, key->label); dst = ip6_dst_lookup_flow(net, sock->sk, &fl6, NULL); if (IS_ERR(dst)) { netdev_dbg(dev, "no route to %pI6\n", &fl6.daddr); return ERR_PTR(-ENETUNREACH); } if (dst_dev(dst) == dev) { /* is this necessary? */ netdev_dbg(dev, "circular route to %pI6\n", &fl6.daddr); dst_release(dst); return ERR_PTR(-ELOOP); } #ifdef CONFIG_DST_CACHE if (dst_cache) dst_cache_set_ip6(dst_cache, dst, &fl6.saddr); #endif *saddr = fl6.saddr; return dst; } EXPORT_SYMBOL_GPL(udp_tunnel6_dst_lookup); MODULE_DESCRIPTION("IPv6 Foo over UDP tunnel driver"); MODULE_LICENSE("GPL");
1 1223 11397 995 6 23249 7 812 986 1183 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238
/* SPDX-License-Identifier: GPL-2.0 */
/* thread_info.h: common low-level thread information accessors
 *
 * Copyright (C) 2002  David Howells (dhowells@redhat.com)
 * - Incorporating suggestions made by Linus Torvalds
 */

#ifndef _LINUX_THREAD_INFO_H
#define _LINUX_THREAD_INFO_H

#include <linux/types.h>
#include <linux/limits.h>
#include <linux/bug.h>
#include <linux/restart_block.h>
#include <linux/errno.h>

#ifdef CONFIG_THREAD_INFO_IN_TASK
/*
 * For CONFIG_THREAD_INFO_IN_TASK kernels we need <asm/current.h> for the
 * definition of current, but for !CONFIG_THREAD_INFO_IN_TASK kernels,
 * including <asm/current.h> can cause a circular dependency on some platforms.
 */
#include <asm/current.h>
#define current_thread_info() ((struct thread_info *)current)
#endif

#include <linux/bitops.h>

/*
 * For per-arch arch_within_stack_frames() implementations, defined in
 * asm/thread_info.h.
 */
enum {
	BAD_STACK = -1,
	NOT_STACK = 0,
	GOOD_FRAME,
	GOOD_STACK,
};

#ifdef CONFIG_GENERIC_ENTRY
/* Bit numbers within thread_info::syscall_work (see the *_syscall_work() macros). */
enum syscall_work_bit {
	SYSCALL_WORK_BIT_SECCOMP,
	SYSCALL_WORK_BIT_SYSCALL_TRACEPOINT,
	SYSCALL_WORK_BIT_SYSCALL_TRACE,
	SYSCALL_WORK_BIT_SYSCALL_EMU,
	SYSCALL_WORK_BIT_SYSCALL_AUDIT,
	SYSCALL_WORK_BIT_SYSCALL_USER_DISPATCH,
	SYSCALL_WORK_BIT_SYSCALL_EXIT_TRAP,
	SYSCALL_WORK_BIT_SYSCALL_RSEQ_SLICE,
};

/* Mask form of each syscall_work bit */
#define SYSCALL_WORK_SECCOMP		BIT(SYSCALL_WORK_BIT_SECCOMP)
#define SYSCALL_WORK_SYSCALL_TRACEPOINT	BIT(SYSCALL_WORK_BIT_SYSCALL_TRACEPOINT)
#define SYSCALL_WORK_SYSCALL_TRACE	BIT(SYSCALL_WORK_BIT_SYSCALL_TRACE)
#define SYSCALL_WORK_SYSCALL_EMU	BIT(SYSCALL_WORK_BIT_SYSCALL_EMU)
#define SYSCALL_WORK_SYSCALL_AUDIT	BIT(SYSCALL_WORK_BIT_SYSCALL_AUDIT)
#define SYSCALL_WORK_SYSCALL_USER_DISPATCH BIT(SYSCALL_WORK_BIT_SYSCALL_USER_DISPATCH)
#define SYSCALL_WORK_SYSCALL_EXIT_TRAP	BIT(SYSCALL_WORK_BIT_SYSCALL_EXIT_TRAP)
#define SYSCALL_WORK_SYSCALL_RSEQ_SLICE	BIT(SYSCALL_WORK_BIT_SYSCALL_RSEQ_SLICE)
#endif

#include <asm/thread_info.h>

/*
 * Architectures that do not define a separate lazy-reschedule flag get it
 * aliased to TIF_NEED_RESCHED; that is only consistent when the
 * architecture does not claim CONFIG_ARCH_HAS_PREEMPT_LAZY.
 */
#ifndef TIF_NEED_RESCHED_LAZY
#ifdef CONFIG_ARCH_HAS_PREEMPT_LAZY
#error Inconsistent PREEMPT_LAZY
#endif
#define TIF_NEED_RESCHED_LAZY TIF_NEED_RESCHED
#define _TIF_NEED_RESCHED_LAZY _TIF_NEED_RESCHED
#endif

/* Without a dedicated TIF_RSEQ, fall back to the notify-resume flag */
#ifndef TIF_RSEQ
# define TIF_RSEQ	TIF_NOTIFY_RESUME
# define _TIF_RSEQ	_TIF_NOTIFY_RESUME
#endif

#ifdef __KERNEL__

#ifndef arch_set_restart_data
#define arch_set_restart_data(restart) do { } while (0)
#endif

/*
 * Install @fn as the restart function of @restart, let the architecture
 * record any extra restart data, and return -ERESTART_RESTARTBLOCK for
 * the caller to pass on.
 */
static inline long set_restart_fn(struct restart_block *restart,
					long (*fn)(struct restart_block *))
{
	restart->fn = fn;
	arch_set_restart_data(restart);
	return -ERESTART_RESTARTBLOCK;
}

#ifndef THREAD_ALIGN
#define THREAD_ALIGN	THREAD_SIZE
#endif

#define THREADINFO_GFP		(GFP_KERNEL_ACCOUNT | __GFP_ZERO)

/*
 * flag set/clear/test wrappers
 * - pass TIF_xxxx constants to these functions
 */

static inline void set_ti_thread_flag(struct thread_info *ti, int flag)
{
	set_bit(flag, (unsigned long *)&ti->flags);
}

static inline void clear_ti_thread_flag(struct thread_info *ti, int flag)
{
	clear_bit(flag, (unsigned long *)&ti->flags);
}

/* Set @flag when @value is true, clear it otherwise */
static inline void update_ti_thread_flag(struct thread_info *ti, int flag,
					 bool value)
{
	if (value)
		set_ti_thread_flag(ti, flag);
	else
		clear_ti_thread_flag(ti, flag);
}

static inline int test_and_set_ti_thread_flag(struct thread_info *ti, int flag)
{
	return test_and_set_bit(flag, (unsigned long *)&ti->flags);
}

static inline int test_and_clear_ti_thread_flag(struct thread_info *ti, int flag)
{
	return test_and_clear_bit(flag, (unsigned long *)&ti->flags);
}

static inline int test_ti_thread_flag(struct thread_info *ti, int flag)
{
	return test_bit(flag, (unsigned long *)&ti->flags);
}

/*
 * This may be used in noinstr code, and needs to be __always_inline to prevent
 * inadvertent instrumentation.
 */
static __always_inline unsigned long read_ti_thread_flags(struct thread_info *ti)
{
	return READ_ONCE(ti->flags);
}

/* The same wrappers, applied to the current thread (or to task @t) */
#define set_thread_flag(flag) \
	set_ti_thread_flag(current_thread_info(), flag)
#define clear_thread_flag(flag) \
	clear_ti_thread_flag(current_thread_info(), flag)
#define update_thread_flag(flag, value) \
	update_ti_thread_flag(current_thread_info(), flag, value)
#define test_and_set_thread_flag(flag) \
	test_and_set_ti_thread_flag(current_thread_info(), flag)
#define test_and_clear_thread_flag(flag) \
	test_and_clear_ti_thread_flag(current_thread_info(), flag)
#define test_thread_flag(flag) \
	test_ti_thread_flag(current_thread_info(), flag)
#define read_thread_flags() \
	read_ti_thread_flags(current_thread_info())

#define read_task_thread_flags(t) \
	read_ti_thread_flags(task_thread_info(t))

/*
 * Syscall work accessors.  With CONFIG_GENERIC_ENTRY they operate on
 * SYSCALL_WORK_BIT_<fl> in thread_info::syscall_work; otherwise they
 * operate on the TIF_<fl> thread flag.
 */
#ifdef CONFIG_GENERIC_ENTRY
#define set_syscall_work(fl) \
	set_bit(SYSCALL_WORK_BIT_##fl, &current_thread_info()->syscall_work)
#define test_syscall_work(fl) \
	test_bit(SYSCALL_WORK_BIT_##fl, &current_thread_info()->syscall_work)
#define clear_syscall_work(fl) \
	clear_bit(SYSCALL_WORK_BIT_##fl, &current_thread_info()->syscall_work)

#define set_task_syscall_work(t, fl) \
	set_bit(SYSCALL_WORK_BIT_##fl, &task_thread_info(t)->syscall_work)
#define test_task_syscall_work(t, fl) \
	test_bit(SYSCALL_WORK_BIT_##fl, &task_thread_info(t)->syscall_work)
#define clear_task_syscall_work(t, fl) \
	clear_bit(SYSCALL_WORK_BIT_##fl, &task_thread_info(t)->syscall_work)

#else /* CONFIG_GENERIC_ENTRY */

#define set_syscall_work(fl)						\
	set_ti_thread_flag(current_thread_info(), TIF_##fl)
#define test_syscall_work(fl) \
	test_ti_thread_flag(current_thread_info(), TIF_##fl)
#define clear_syscall_work(fl) \
	clear_ti_thread_flag(current_thread_info(), TIF_##fl)

#define set_task_syscall_work(t, fl) \
	set_ti_thread_flag(task_thread_info(t), TIF_##fl)
#define test_task_syscall_work(t, fl) \
	test_ti_thread_flag(task_thread_info(t), TIF_##fl)
#define clear_task_syscall_work(t, fl) \
	clear_ti_thread_flag(task_thread_info(t), TIF_##fl)
#endif /* !CONFIG_GENERIC_ENTRY */

/*
 * Test a flag of the current thread.  When the instrumented non-atomic
 * bitops header has been included, the arch_ variant of test_bit() is
 * used instead of test_bit() itself.
 */
#ifdef _ASM_GENERIC_BITOPS_INSTRUMENTED_NON_ATOMIC_H

static __always_inline bool tif_test_bit(int bit)
{
	return arch_test_bit(bit,
			     (unsigned long *)(&current_thread_info()->flags));
}

#else

static __always_inline bool tif_test_bit(int bit)
{
	return test_bit(bit,
			(unsigned long *)(&current_thread_info()->flags));
}

#endif /* _ASM_GENERIC_BITOPS_INSTRUMENTED_NON_ATOMIC_H */

/* True when TIF_NEED_RESCHED is set for the current thread */
static __always_inline bool tif_need_resched(void)
{
	return tif_test_bit(TIF_NEED_RESCHED);
}

#ifndef CONFIG_HAVE_ARCH_WITHIN_STACK_FRAMES
/* Fallback without an arch implementation: always 0, i.e. NOT_STACK */
static inline int arch_within_stack_frames(const void * const stack,
					   const void * const stackend,
					   const void *obj, unsigned long len)
{
	return 0;
}
#endif

#ifndef arch_setup_new_exec
/* Default: no architecture-specific work */
static inline void arch_setup_new_exec(void) { }
#endif

void arch_task_cache_init(void); /* for CONFIG_SH */
void arch_release_task_struct(struct task_struct *tsk);
int arch_dup_task_struct(struct task_struct *dst,
				struct task_struct *src);

#endif	/* __KERNEL__ */

#endif /* _LINUX_THREAD_INFO_H */
4 4 4 4 4 2 477 482 3 1 2 2 1 21 10 5 12 3 12 14 14 15 15 3 6 5 6 6 5 5 2 1 1 5 5 6 6 3 3 3 2 2 2 2 2 2 3 4 18 5 8 6 3 2 2 2 2 2 15 15 3 18 43 1 43 15 43 16 16 16 16 16 16 15 3 3 3 3 3 14 2 2 14 14 14 1 14 12 11 2 11 2 2 13 37 37 37 37 27 11 36 36 36 35 35 14 14 6 6 5 5 5 5 5 5 33 33 33 33 3 3 33 31 33 33 30 30 1 29 3 37 37 37 33 31 37 4 4 4 4 4 4 39 39 39 39 3 30 38 39 18 20 7 18 17 17 17 17 6 6 4 13 4 4 2 1 16 16 16 5 5 4 3 3 3 3 3 3 13 4 3 2 1 1 2 13 13 13 13 17 4 17 17 17 20 21 21 17 21 21 21 19 19 5 25 24 24 1 26 1 1 26 26 26 2 2 2 2 2 1 2 2 26 5 21 13 13 4 4 9 9 9 14 14 14 14 14 14 14 14 14 14 14 9 5 9 5 5 5 5 4 2 3 2 4 5 5 5 5 5 1 5 2 2 4 5 4 5 3 5 3 3 3 2 2 1 2 4 4 4 149 147 14 8 3 39 26 40 5 4 1 1 1 2 2 1 1 1 1 1 1 2 2 1 5 5 22 22 130 155 1 22 22 20 4 35 35 16 19 33 16 145 35 17 131 131 31 127 132 11 133 97 20 80 79 51 48 41 160 23 107 158 157 133 23 99 137 27 155 25 155 149 154 154 155 2 153 154 150 13 139 129 154 151 150 152 151 22 125 128 152 3 3 3 3 2 2 2 2 2 3 3 3 6 6 5 6 1 5 1 4 1 3 4 5 6 6 2 2 9 1 5 7 5 5 1 5 5 11 6 1 5 2 1 11 4 9 4 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 
261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 
761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 
1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 
1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 
2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 // SPDX-License-Identifier: GPL-2.0 /* * linux/mm/madvise.c * * Copyright (C) 1999 Linus Torvalds * Copyright (C) 2002 Christoph Hellwig */ #include <linux/mman.h> #include <linux/pagemap.h> #include <linux/syscalls.h> #include <linux/mempolicy.h> #include <linux/page-isolation.h> #include <linux/page_idle.h> #include <linux/userfaultfd_k.h> #include <linux/hugetlb.h> #include <linux/falloc.h> #include <linux/fadvise.h> #include <linux/sched.h> #include <linux/sched/mm.h> #include <linux/mm_inline.h> #include <linux/mmu_context.h> #include <linux/string.h> #include <linux/uio.h> #include <linux/ksm.h> #include <linux/fs.h> #include <linux/file.h> #include <linux/blkdev.h> #include <linux/backing-dev.h> #include <linux/pagewalk.h> #include <linux/swap.h> #include <linux/leafops.h> 
#include <linux/shmem_fs.h>
#include <linux/mmu_notifier.h>

#include <asm/tlb.h>

#include "internal.h"
#include "swap.h"

/*
 * Internal pseudo-behaviour value: madvise_update_vma() treats a behaviour
 * equal to this as a request to set the anon VMA name instead of the flags.
 */
#define __MADV_SET_ANON_VMA_NAME (-1)

/*
 * Maximum number of attempts we make to install guard pages before we give up
 * and return -ERESTARTNOINTR to have userspace try again.
 */
#define MAX_MADVISE_GUARD_RETRIES 3

/*
 * Private data handed to the cold/pageout page-table walk
 * (madvise_cold_or_pageout_pte_range()).
 */
struct madvise_walk_private {
	/* TLB gather the walk batches its invalidations into. */
	struct mmu_gather *tlb;
	/* true: isolate folios and reclaim them; false: only deactivate them. */
	bool pageout;
};

/*
 * Lock mode recorded in struct madvise_behavior::lock_mode. In this part of
 * the file it is consumed by get_walk_lock() and checked by
 * mark_mmap_lock_dropped().
 */
enum madvise_lock_mode {
	MADVISE_NO_LOCK,
	MADVISE_MMAP_READ_LOCK,
	MADVISE_MMAP_WRITE_LOCK,
	MADVISE_VMA_READ_LOCK,
};

/* Address range an advice applies to; users compute its length as end - start. */
struct madvise_behavior_range {
	unsigned long start;
	unsigned long end;
};

/* State of one madvise operation, passed to the per-behaviour helpers. */
struct madvise_behavior {
	struct mm_struct *mm;
	int behavior;
	struct mmu_gather *tlb;
	enum madvise_lock_mode lock_mode;
	struct anon_vma_name *anon_name;

	/*
	 * The range over which the behaviour is currently being applied. If
	 * traversing multiple VMAs, this is updated for each.
	 */
	struct madvise_behavior_range range;
	/* The VMA and VMA preceding it (if applicable) currently targeted. */
	struct vm_area_struct *prev;
	struct vm_area_struct *vma;
	/* Set to true by mark_mmap_lock_dropped(). */
	bool lock_dropped;
};

#ifdef CONFIG_ANON_VMA_NAME
static int madvise_walk_vmas(struct madvise_behavior *madv_behavior);

/*
 * Allocate an anon_vma_name holding a copy of @name (NUL terminator included)
 * and initialise its kref.
 *
 * Returns the new object, or NULL if the allocation failed.
 */
struct anon_vma_name *anon_vma_name_alloc(const char *name)
{
	struct anon_vma_name *anon_name;
	size_t count;

	/* Add 1 for NUL terminator at the end of the anon_name->name */
	count = strlen(name) + 1;
	anon_name = kmalloc_flex(*anon_name, name, count);
	if (anon_name) {
		kref_init(&anon_name->kref);
		memcpy(anon_name->name, name, count);
	}

	return anon_name;
}

/*
 * Free the anon_vma_name that embeds @kref.
 * NOTE(review): the signature matches a kref release function; presumably it
 * is used as one - the callers are not visible here.
 */
void anon_vma_name_free(struct kref *kref)
{
	struct anon_vma_name *anon_name =
			container_of(kref, struct anon_vma_name, kref);
	kfree(anon_name);
}

/* Return @vma->anon_name after asserting the VMA via vma_assert_stabilised(). */
struct anon_vma_name *anon_vma_name(struct vm_area_struct *vma)
{
	vma_assert_stabilised(vma);
	return vma->anon_name;
}

/*
 * Replace the name of @vma with @anon_name and drop the reference held on the
 * previous name. A NULL @anon_name clears the name; a name equal to the
 * current one (per anon_vma_name_eq()) leaves the VMA untouched.
 * Always returns 0.
 *
 * mmap_lock should be write-locked
 */
static int replace_anon_vma_name(struct vm_area_struct *vma,
				 struct anon_vma_name *anon_name)
{
	struct anon_vma_name *orig_name = anon_vma_name(vma);

	if (!anon_name) {
		vma->anon_name = NULL;
		anon_vma_name_put(orig_name);
		return 0;
	}

	if (anon_vma_name_eq(orig_name, anon_name))
		return 0;

	vma->anon_name = anon_vma_name_reuse(anon_name);
	anon_vma_name_put(orig_name);

	return 0;
}
#else /* CONFIG_ANON_VMA_NAME */
/*
 * Without CONFIG_ANON_VMA_NAME no name can be set: returns -EINVAL when a
 * name is supplied and 0 when @anon_name is NULL.
 */
static int replace_anon_vma_name(struct vm_area_struct *vma,
				 struct anon_vma_name *anon_name)
{
	if (anon_name)
		return -EINVAL;

	return 0;
}
#endif /* CONFIG_ANON_VMA_NAME */
/*
 * Update the vm_flags or anon_name on region of a vma, splitting it or merging
 * it as necessary. Must be called with mmap_lock held for writing.
 *
 * The name is updated when madv_behavior->behavior is
 * __MADV_SET_ANON_VMA_NAME, the flags otherwise. On success madv_behavior->vma
 * is replaced by the VMA returned from vma_modify_name()/vma_modify_flags().
 *
 * Returns 0 when nothing needs changing or on success, the PTR_ERR() of a
 * failed vma_modify_*() call, or the result of replace_anon_vma_name().
 */
static int madvise_update_vma(vm_flags_t new_flags,
		struct madvise_behavior *madv_behavior)
{
	struct vm_area_struct *vma = madv_behavior->vma;
	vma_flags_t new_vma_flags = legacy_to_vma_flags(new_flags);
	struct madvise_behavior_range *range = &madv_behavior->range;
	struct anon_vma_name *anon_name = madv_behavior->anon_name;
	bool set_new_anon_name = madv_behavior->behavior == __MADV_SET_ANON_VMA_NAME;
	VMA_ITERATOR(vmi, madv_behavior->mm, range->start);

	/* Nothing to do if the flags (and, when relevant, the name) already match. */
	if (vma_flags_same_mask(&vma->flags, new_vma_flags) &&
	    (!set_new_anon_name || anon_vma_name_eq(anon_vma_name(vma), anon_name)))
		return 0;

	if (set_new_anon_name)
		vma = vma_modify_name(&vmi, madv_behavior->prev, vma,
			range->start, range->end, anon_name);
	else
		vma = vma_modify_flags(&vmi, madv_behavior->prev, vma,
			range->start, range->end, &new_vma_flags);

	if (IS_ERR(vma))
		return PTR_ERR(vma);

	madv_behavior->vma = vma;

	/* vm_flags is protected by the mmap_lock held in write mode. */
	vma_start_write(vma);
	vma->flags = new_vma_flags;
	if (set_new_anon_name)
		return replace_anon_vma_name(vma, anon_name);

	return 0;
}

#ifdef CONFIG_SWAP
/*
 * pmd_entry callback of swapin_walk_ops; walk->private is the VMA being
 * walked. For every PTE in [start, end) that holds a swap entry, the PTE lock
 * is dropped and read_swap_cache_async() is called for that entry; the folio
 * reference it may return is released immediately. Always returns 0.
 */
static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start,
		unsigned long end, struct mm_walk *walk)
{
	struct vm_area_struct *vma = walk->private;
	struct swap_iocb *splug = NULL;
	pte_t *ptep = NULL;
	spinlock_t *ptl;
	unsigned long addr;

	for (addr = start; addr < end; addr += PAGE_SIZE) {
		pte_t pte;
		softleaf_t entry;
		struct folio *folio;

		/*
		 * ptep is NULL on the first iteration and after every swap-in
		 * (the lock was dropped): map and lock the PTE for addr again.
		 * Otherwise the post-increment just advances to the next PTE.
		 */
		if (!ptep++) {
			ptep = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
			if (!ptep)
				break;
		}

		pte = ptep_get(ptep);
		entry = softleaf_from_pte(pte);
		if (unlikely(!softleaf_is_swap(entry)))
			continue;

		/* Drop the PTE lock before calling read_swap_cache_async(). */
		pte_unmap_unlock(ptep, ptl);
		ptep = NULL;

		folio = read_swap_cache_async(entry, GFP_HIGHUSER_MOVABLE,
					      vma, addr, &splug);
		if (folio)
			folio_put(folio);
	}

	if (ptep)
		pte_unmap_unlock(ptep, ptl);
	swap_read_unplug(splug);
	cond_resched();

	return 0;
}

/* Walk operations used by madvise_willneed() for VMAs without a file. */
static const struct mm_walk_ops swapin_walk_ops = {
	.pmd_entry		= swapin_walk_pmd_entry,
	.walk_lock		= PGWALK_RDLOCK,
};

/*
 * Iterate @mapping->i_pages over the indices that correspond to
 * [start, end) of @vma and call read_swap_cache_async() for every value entry
 * that decodes to a swap entry. The RCU read lock is released around each
 * read (after xas_pause()) and re-taken afterwards.
 */
static void shmem_swapin_range(struct vm_area_struct *vma,
		unsigned long start, unsigned long end,
		struct address_space *mapping)
{
	XA_STATE(xas, &mapping->i_pages, linear_page_index(vma, start));
	pgoff_t end_index = linear_page_index(vma, end) - 1;
	struct folio *folio;
	struct swap_iocb *splug = NULL;

	rcu_read_lock();
	xas_for_each(&xas, folio, end_index) {
		unsigned long addr;
		swp_entry_t entry;

		if (!xa_is_value(folio))
			continue;
		entry = radix_to_swp_entry(folio);
		/* There might be swapin error entries in shmem mapping. */
		if (!softleaf_is_swap(entry))
			continue;

		/* Translate the xarray index back into an address within @vma. */
		addr = vma->vm_start +
			((xas.xa_index - vma->vm_pgoff) << PAGE_SHIFT);
		xas_pause(&xas);
		rcu_read_unlock();

		folio = read_swap_cache_async(entry, mapping_gfp_mask(mapping),
					      vma, addr, &splug);
		if (folio)
			folio_put(folio);

		rcu_read_lock();
	}
	rcu_read_unlock();
	swap_read_unplug(splug);
}
#endif		/* CONFIG_SWAP */

/*
 * Record in @madv_behavior that the lock was dropped (lock_dropped = true).
 * Warns once if the operation runs in MADVISE_VMA_READ_LOCK mode.
 */
static void mark_mmap_lock_dropped(struct madvise_behavior *madv_behavior)
{
	VM_WARN_ON_ONCE(madv_behavior->lock_mode == MADVISE_VMA_READ_LOCK);
	madv_behavior->lock_dropped = true;
}

/*
 * Schedule all required I/O operations.  Do not wait for completion.
 *
 * With CONFIG_SWAP, a VMA without a file is handled by walking its page
 * tables with swapin_walk_ops and a shmem mapping by shmem_swapin_range().
 * DAX files are ignored. For any other file the mmap read lock is dropped
 * around a vfs_fadvise(POSIX_FADV_WILLNEED) call and re-taken afterwards.
 *
 * Returns 0, or -EBADF when CONFIG_SWAP is off and the VMA has no file.
 */
static long madvise_willneed(struct madvise_behavior *madv_behavior)
{
	struct vm_area_struct *vma = madv_behavior->vma;
	struct mm_struct *mm = madv_behavior->mm;
	struct file *file = vma->vm_file;
	unsigned long start = madv_behavior->range.start;
	unsigned long end = madv_behavior->range.end;
	loff_t offset;

#ifdef CONFIG_SWAP
	if (!file) {
		walk_page_range_vma(vma, start, end, &swapin_walk_ops, vma);
		lru_add_drain(); /* Push any new pages onto the LRU now */
		return 0;
	}

	if (shmem_mapping(file->f_mapping)) {
		shmem_swapin_range(vma, start, end, file->f_mapping);
		lru_add_drain(); /* Push any new pages onto the LRU now */
		return 0;
	}
#else
	if (!file)
		return -EBADF;
#endif

	if (IS_DAX(file_inode(file))) {
		/* no bad return value, but ignore advice */
		return 0;
	}

	/*
	 * Filesystem's fadvise may need to take various locks.  We need to
	 * explicitly grab a reference because the vma (and hence the
	 * vma's reference to the file) can go away as soon as we drop
	 * mmap_lock.
	 */
	mark_mmap_lock_dropped(madv_behavior);
	get_file(file);
	/* File offset of 'start': offset within the VMA plus the VMA's file offset. */
	offset = (loff_t)(start - vma->vm_start)
			+ ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
	mmap_read_unlock(mm);
	vfs_fadvise(file, offset, end - start, POSIX_FADV_WILLNEED);
	fput(file);
	mmap_read_lock(mm);
	return 0;
}

/*
 * Return true if @vma is file-backed and the caller either owns (or is
 * capable over) the file's inode or has write permission on the file.
 */
static inline bool can_do_file_pageout(struct vm_area_struct *vma)
{
	if (!vma->vm_file)
		return false;
	/*
	 * paging out pagecache only for non-anonymous mappings that correspond
	 * to the files the calling process could (if tried) open for writing;
	 * otherwise we'd be including shared non-exclusive mappings, which
	 * opens a side channel.
	 */
	return inode_owner_or_capable(&nop_mnt_idmap,
				      file_inode(vma->vm_file)) ||
	       file_permission(vma->vm_file, MAY_WRITE) == 0;
}

/*
 * Wrapper around folio_pte_batch_flags() with FPB_MERGE_YOUNG_DIRTY, limited
 * to the number of pages between @addr and @end. Returns its result.
 */
static inline int madvise_folio_pte_batch(unsigned long addr, unsigned long end,
					  struct folio *folio, pte_t *ptep,
					  pte_t *ptentp)
{
	int max_nr = (end - addr) / PAGE_SIZE;

	return folio_pte_batch_flags(folio, NULL, ptep, ptentp, max_nr,
				     FPB_MERGE_YOUNG_DIRTY);
}

/*
 * pmd_entry callback of cold_walk_ops, shared by the cold and pageout paths;
 * walk->private is a struct madvise_walk_private.
 *
 * For each eligible folio mapped in [addr, end) the referenced/young state is
 * cleared and the folio is either deactivated (!pageout) or isolated from the
 * LRU and passed to reclaim_pages() (pageout). Folios that may be mapped
 * elsewhere, are not on the LRU, or (for file mappings the caller may not
 * page out) are not anonymous are skipped.
 *
 * Returns -EINTR if a fatal signal is pending for current, 0 otherwise.
 */
static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
				unsigned long addr, unsigned long end,
				struct mm_walk *walk)
{
	struct madvise_walk_private *private = walk->private;
	struct mmu_gather *tlb = private->tlb;
	bool pageout = private->pageout;
	struct mm_struct *mm = tlb->mm;
	struct vm_area_struct *vma = walk->vma;
	pte_t *start_pte, *pte, ptent;
	spinlock_t *ptl;
	struct folio *folio = NULL;
	LIST_HEAD(folio_list);
	bool pageout_anon_only_filter;
	unsigned int batch_count = 0;
	int nr;

	if (fatal_signal_pending(current))
		return -EINTR;

	/*
	 * Paging out a file mapping the caller may not page out: restrict the
	 * operation to anonymous folios.
	 */
	pageout_anon_only_filter = pageout && !vma_is_anonymous(vma) &&
					!can_do_file_pageout(vma);

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	if (pmd_trans_huge(*pmd)) {
		pmd_t orig_pmd;
		unsigned long next = pmd_addr_end(addr, end);

		tlb_change_page_size(tlb, HPAGE_PMD_SIZE);
		ptl = pmd_trans_huge_lock(pmd, vma);
		if (!ptl)
			return 0;

		orig_pmd = *pmd;
		if (is_huge_zero_pmd(orig_pmd))
			goto huge_unlock;

		if (unlikely(!pmd_present(orig_pmd))) {
			VM_BUG_ON(thp_migration_supported() &&
					!pmd_is_migration_entry(orig_pmd));
			goto huge_unlock;
		}

		folio = pmd_folio(orig_pmd);

		/* Do not interfere with other mappings of this folio */
		if (folio_maybe_mapped_shared(folio))
			goto huge_unlock;

		if (pageout_anon_only_filter && !folio_test_anon(folio))
			goto huge_unlock;

		/*
		 * The range covers only part of the huge PMD: split the folio
		 * and, if that worked, continue on the PTE path below.
		 */
		if (next - addr != HPAGE_PMD_SIZE) {
			int err;

			folio_get(folio);
			spin_unlock(ptl);
			folio_lock(folio);
			err = split_folio(folio);
			folio_unlock(folio);
			folio_put(folio);
			if (!err)
				goto regular_folio;
			return 0;
		}

		if (!pageout && pmd_young(orig_pmd)) {
			pmdp_invalidate(vma, addr, pmd);
			orig_pmd = pmd_mkold(orig_pmd);

			set_pmd_at(mm, addr, pmd, orig_pmd);
			tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
		}

		folio_clear_referenced(folio);
		folio_test_clear_young(folio);
		if (folio_test_active(folio))
			folio_set_workingset(folio);
		if (pageout) {
			if (folio_isolate_lru(folio)) {
				if (folio_test_unevictable(folio))
					folio_putback_lru(folio);
				else
					list_add(&folio->lru, &folio_list);
			}
		} else
			folio_deactivate(folio);
huge_unlock:
		spin_unlock(ptl);
		if (pageout)
			reclaim_pages(&folio_list);
		return 0;
	}

regular_folio:
#endif
	tlb_change_page_size(tlb, PAGE_SIZE);
restart:
	start_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
	if (!start_pte)
		return 0;
	flush_tlb_batched_pending(mm);
	lazy_mmu_mode_enable();
	for (; addr < end; pte += nr, addr += nr * PAGE_SIZE) {
		nr = 1;
		ptent = ptep_get(pte);

		/*
		 * Every SWAP_CLUSTER_MAX iterations, give up the PTE lock and
		 * reschedule if needed, then resume from the current addr.
		 */
		if (++batch_count == SWAP_CLUSTER_MAX) {
			batch_count = 0;
			if (need_resched()) {
				lazy_mmu_mode_disable();
				pte_unmap_unlock(start_pte, ptl);
				cond_resched();
				goto restart;
			}
		}

		if (pte_none(ptent))
			continue;

		if (!pte_present(ptent))
			continue;

		folio = vm_normal_folio(vma, addr, ptent);
		if (!folio || folio_is_zone_device(folio))
			continue;

		/*
		 * If we encounter a large folio, only split it if it is not
		 * fully mapped within the range we are operating on. Otherwise
		 * leave it as is so that it can be swapped out whole. If we
		 * fail to split a folio, leave it in place and advance to the
		 * next pte in the range.
		 */
		if (folio_test_large(folio)) {
			nr = madvise_folio_pte_batch(addr, end, folio, pte, &ptent);
			if (nr < folio_nr_pages(folio)) {
				int err;

				if (folio_maybe_mapped_shared(folio))
					continue;
				if (pageout_anon_only_filter && !folio_test_anon(folio))
					continue;
				if (!folio_trylock(folio))
					continue;
				folio_get(folio);
				lazy_mmu_mode_disable();
				pte_unmap_unlock(start_pte, ptl);
				start_pte = NULL;
				err = split_folio(folio);
				folio_unlock(folio);
				folio_put(folio);
				start_pte = pte =
					pte_offset_map_lock(mm, pmd, addr, &ptl);
				if (!start_pte)
					break;
				flush_tlb_batched_pending(mm);
				lazy_mmu_mode_enable();
				/*
				 * After a successful split do not advance, so
				 * the same addr is examined again.
				 */
				if (!err)
					nr = 0;
				continue;
			}
		}

		/*
		 * Do not interfere with other mappings of this folio and
		 * non-LRU folio. If we have a large folio at this point, we
		 * know it is fully mapped so if its mapcount is the same as its
		 * number of pages, it must be exclusive.
		 */
		if (!folio_test_lru(folio) ||
		    folio_mapcount(folio) != folio_nr_pages(folio))
			continue;

		if (pageout_anon_only_filter && !folio_test_anon(folio))
			continue;

		if (!pageout && pte_young(ptent)) {
			clear_young_dirty_ptes(vma, addr, pte, nr,
					       CYDP_CLEAR_YOUNG);
			tlb_remove_tlb_entries(tlb, pte, nr, addr);
		}

		/*
		 * We are deactivating a folio for accelerating reclaiming.
		 * VM couldn't reclaim the folio unless we clear PG_young.
		 * As a side effect, it makes confuse idle-page tracking
		 * because they will miss recent referenced history.
		 */
		folio_clear_referenced(folio);
		folio_test_clear_young(folio);
		if (folio_test_active(folio))
			folio_set_workingset(folio);
		if (pageout) {
			if (folio_isolate_lru(folio)) {
				if (folio_test_unevictable(folio))
					folio_putback_lru(folio);
				else
					list_add(&folio->lru, &folio_list);
			}
		} else
			folio_deactivate(folio);
	}

	/* start_pte is NULL if re-taking the PTE lock after a split failed. */
	if (start_pte) {
		lazy_mmu_mode_disable();
		pte_unmap_unlock(start_pte, ptl);
	}
	if (pageout)
		reclaim_pages(&folio_list);
	cond_resched();

	return 0;
}

/* Walk operations shared by madvise_cold() and madvise_pageout(). */
static const struct mm_walk_ops cold_walk_ops = {
	.pmd_entry = madvise_cold_or_pageout_pte_range,
	.walk_lock = PGWALK_RDLOCK,
};

/*
 * Walk the current range of madv_behavior->vma with cold_walk_ops in
 * "deactivate only" mode (pageout = false), inside tlb_start_vma() /
 * tlb_end_vma().
 */
static void madvise_cold_page_range(struct mmu_gather *tlb,
		struct madvise_behavior *madv_behavior)

{
	struct vm_area_struct *vma = madv_behavior->vma;
	struct madvise_behavior_range *range = &madv_behavior->range;
	struct madvise_walk_private walk_private = {
		.pageout = false,
		.tlb = tlb,
	};

	tlb_start_vma(tlb, vma);
	walk_page_range_vma(vma, range->start, range->end, &cold_walk_ops,
			&walk_private);
	tlb_end_vma(tlb, vma);
}

/* Return false for VMAs that have any of VM_LOCKED, VM_PFNMAP or VM_HUGETLB set. */
static inline bool can_madv_lru_vma(struct vm_area_struct *vma)
{
	return !(vma->vm_flags & (VM_LOCKED|VM_PFNMAP|VM_HUGETLB));
}

/*
 * Deactivate the folios mapped in the current range of madv_behavior->vma.
 * Returns -EINVAL if the VMA fails can_madv_lru_vma(), 0 otherwise.
 */
static long madvise_cold(struct madvise_behavior *madv_behavior)
{
	struct vm_area_struct *vma = madv_behavior->vma;
	struct mmu_gather tlb;

	if (!can_madv_lru_vma(vma))
		return -EINVAL;

	lru_add_drain();
	tlb_gather_mmu(&tlb, madv_behavior->mm);
	madvise_cold_page_range(&tlb, madv_behavior);
	tlb_finish_mmu(&tlb);

	return 0;
}

/*
 * Walk @range of @vma with cold_walk_ops in pageout mode (pageout = true),
 * inside tlb_start_vma() / tlb_end_vma().
 */
static void madvise_pageout_page_range(struct mmu_gather *tlb,
		struct vm_area_struct *vma,
		struct madvise_behavior_range *range)
{
	struct madvise_walk_private walk_private = {
		.pageout = true,
		.tlb = tlb,
	};

	tlb_start_vma(tlb, vma);
	walk_page_range_vma(vma, range->start, range->end, &cold_walk_ops,
			    &walk_private);
	tlb_end_vma(tlb, vma);
}

/*
 * Page out the folios mapped in the current range of madv_behavior->vma.
 * Returns -EINVAL if the VMA fails can_madv_lru_vma(); returns 0 without doing
 * anything for VM_MAYSHARE file mappings the caller may not page out; returns
 * 0 after the walk otherwise.
 */
static long madvise_pageout(struct madvise_behavior *madv_behavior)
{
	struct mmu_gather tlb;
	struct vm_area_struct *vma = madv_behavior->vma;

	if (!can_madv_lru_vma(vma))
		return -EINVAL;

	/*
	 * If the VMA belongs to a private file mapping, there can be private
	 * dirty pages which can be paged out if even this process is neither
	 * owner nor write capable of the file. We allow private file mappings
	 * further to pageout dirty anon pages.
	 */
	if (!vma_is_anonymous(vma) && (!can_do_file_pageout(vma) &&
				(vma->vm_flags & VM_MAYSHARE)))
		return 0;

	lru_add_drain();
	tlb_gather_mmu(&tlb, madv_behavior->mm);
	madvise_pageout_page_range(&tlb, vma, &madv_behavior->range);
	tlb_finish_mmu(&tlb);

	return 0;
}

/*
 * pmd_entry callback used by madvise_free_single_vma(); walk->private is the
 * mmu_gather.
 *
 * Within [addr, end): swap entries are released and their PTEs cleared,
 * hwpoison and poison-marker entries are cleared, and present folios that pass
 * the checks below get the young/dirty bits of their PTEs cleared and are
 * marked lazyfree. Always returns 0.
 */
static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
				unsigned long end, struct mm_walk *walk)

{
	const cydp_t cydp_flags = CYDP_CLEAR_YOUNG | CYDP_CLEAR_DIRTY;
	struct mmu_gather *tlb = walk->private;
	struct mm_struct *mm = tlb->mm;
	struct vm_area_struct *vma = walk->vma;
	spinlock_t *ptl;
	pte_t *start_pte, *pte, ptent;
	struct folio *folio;
	int nr_swap = 0;
	unsigned long next;
	int nr, max_nr;

	next = pmd_addr_end(addr, end);
	if (pmd_trans_huge(*pmd))
		if (madvise_free_huge_pmd(tlb, vma, pmd, addr, next))
			return 0;

	tlb_change_page_size(tlb, PAGE_SIZE);
	start_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
	if (!start_pte)
		return 0;
	flush_tlb_batched_pending(mm);
	lazy_mmu_mode_enable();
	for (; addr != end; pte += nr, addr += PAGE_SIZE * nr) {
		nr = 1;
		ptent = ptep_get(pte);

		if (pte_none(ptent))
			continue;
		/*
		 * If the pte has swp_entry, just clear page table to
		 * prevent swap-in which is more expensive rather than
		 * (page allocation + zeroing).
		 */
		if (!pte_present(ptent)) {
			softleaf_t entry = softleaf_from_pte(ptent);

			if (softleaf_is_swap(entry)) {
				max_nr = (end - addr) / PAGE_SIZE;
				nr = swap_pte_batch(pte, max_nr, ptent);
				/*
				 * Kept as a negative count; it is added to
				 * MM_SWAPENTS once the loop is done.
				 */
				nr_swap -= nr;
				swap_put_entries_direct(entry, nr);
				clear_not_present_full_ptes(mm, addr, pte, nr, tlb->fullmm);
			} else if (softleaf_is_hwpoison(entry) ||
				   softleaf_is_poison_marker(entry)) {
				pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
			}
			continue;
		}

		folio = vm_normal_folio(vma, addr, ptent);
		if (!folio || folio_is_zone_device(folio))
			continue;

		/*
		 * If we encounter a large folio, only split it if it is not
		 * fully mapped within the range we are operating on. Otherwise
		 * leave it as is so that it can be marked as lazyfree. If we
		 * fail to split a folio, leave it in place and advance to the
		 * next pte in the range.
		 */
		if (folio_test_large(folio)) {
			nr = madvise_folio_pte_batch(addr, end, folio, pte, &ptent);
			if (nr < folio_nr_pages(folio)) {
				int err;

				if (folio_maybe_mapped_shared(folio))
					continue;
				if (!folio_trylock(folio))
					continue;
				folio_get(folio);
				lazy_mmu_mode_disable();
				pte_unmap_unlock(start_pte, ptl);
				start_pte = NULL;
				err = split_folio(folio);
				folio_unlock(folio);
				folio_put(folio);
				pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
				start_pte = pte;
				if (!start_pte)
					break;
				flush_tlb_batched_pending(mm);
				lazy_mmu_mode_enable();
				/*
				 * After a successful split do not advance, so
				 * the same addr is examined again.
				 */
				if (!err)
					nr = 0;
				continue;
			}
		}

		if (folio_test_swapcache(folio) || folio_test_dirty(folio)) {
			if (!folio_trylock(folio))
				continue;
			/*
			 * If we have a large folio at this point, we know it is
			 * fully mapped so if its mapcount is the same as its
			 * number of pages, it must be exclusive.
			 */
			if (folio_mapcount(folio) != folio_nr_pages(folio)) {
				folio_unlock(folio);
				continue;
			}

			if (folio_test_swapcache(folio) &&
			    !folio_free_swap(folio)) {
				folio_unlock(folio);
				continue;
			}

			folio_clear_dirty(folio);
			folio_unlock(folio);
		}

		if (pte_young(ptent) || pte_dirty(ptent)) {
			clear_young_dirty_ptes(vma, addr, pte, nr, cydp_flags);
			tlb_remove_tlb_entries(tlb, pte, nr, addr);
		}
		folio_mark_lazyfree(folio);
	}

	if (nr_swap)
		add_mm_counter(mm, MM_SWAPENTS, nr_swap);
	/* start_pte is NULL if re-taking the PTE lock after a split failed. */
	if (start_pte) {
		lazy_mmu_mode_disable();
		pte_unmap_unlock(start_pte, ptl);
	}
	cond_resched();

	return 0;
}

/*
 * Translate a madvise lock mode into the page-walk lock mode to use:
 * MADVISE_VMA_READ_LOCK -> PGWALK_VMA_RDLOCK_VERIFY,
 * MADVISE_MMAP_READ_LOCK -> PGWALK_RDLOCK. Any other mode triggers a one-time
 * warning and yields PGWALK_RDLOCK.
 */
static inline enum page_walk_lock get_walk_lock(enum madvise_lock_mode mode)
{
	switch (mode) {
	case MADVISE_VMA_READ_LOCK:
		return PGWALK_VMA_RDLOCK_VERIFY;
	case MADVISE_MMAP_READ_LOCK:
		return PGWALK_RDLOCK;
	default:
		/* Other modes don't require fixing up the walk_lock */
		WARN_ON_ONCE(1);
		return PGWALK_RDLOCK;
	}
}

/*
 * Apply madvise_free_pte_range() to the current range of madv_behavior->vma,
 * bracketed by MMU-notifier invalidation and tlb_start_vma()/tlb_end_vma().
 * The walk lock mode is derived from madv_behavior->lock_mode.
 *
 * Returns -EINVAL for a VMA that is not anonymous, 0 otherwise.
 */
static int madvise_free_single_vma(struct madvise_behavior *madv_behavior)
{
	struct mm_struct *mm = madv_behavior->mm;
	struct vm_area_struct *vma = madv_behavior->vma;
	struct mmu_notifier_range range = {
		.start = madv_behavior->range.start,
		.end = madv_behavior->range.end,
	};
	struct mmu_gather *tlb = madv_behavior->tlb;
	struct mm_walk_ops walk_ops = {
		.pmd_entry		= madvise_free_pte_range,
	};

	/* MADV_FREE works for only anon vma at the moment */
	if (!vma_is_anonymous(vma))
		return -EINVAL;

	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm,
				range.start, range.end);

	lru_add_drain();
	update_hiwater_rss(mm);

	mmu_notifier_invalidate_range_start(&range);
	tlb_start_vma(tlb, vma);
	walk_ops.walk_lock = get_walk_lock(madv_behavior->lock_mode);
	walk_page_range_vma(vma, range.start, range.end,
			&walk_ops, tlb);
	tlb_end_vma(tlb, vma);
	mmu_notifier_invalidate_range_end(&range);
	return 0;
}

/*
 * Application no longer needs these pages.  If the pages are dirty,
 * it's OK to just throw them away.  The app will be more careful about
 * data it wants to keep.  Be sure to free swap resources too.
The
 * zap_vma_range call sets things up for shrink_active_list to actually
 * free these pages later if no one else has touched them in the meantime,
 * although we could add these pages to a global reuse list for
 * shrink_active_list to pick up before reclaiming other pages.
 *
 * NB: This interface discards data rather than pushes it out to swap,
 * as some implementations do. This has performance implications for
 * applications like large transactional databases which want to discard
 * pages in anonymous maps after committing to backing store the data
 * that was kept in them. There is no reason to write this data out to
 * the swap area if the application is discarding it.
 *
 * An interface that causes the system to free clean pages and flush
 * dirty pages is already available as msync(MS_INVALIDATE).
 */
static long madvise_dontneed_single_vma(struct madvise_behavior *madv_behavior)
{
	struct madvise_behavior_range *range = &madv_behavior->range;
	/* The only zap option requested here is .reclaim_pt. */
	struct zap_details details = {
		.reclaim_pt = true,
	};

	/*
	 * Zap range->end - range->start bytes starting at range->start,
	 * using the mmu_gather supplied by the caller. Always reports success.
	 */
	zap_vma_range_batched(madv_behavior->tlb, madv_behavior->vma,
			      range->start, range->end - range->start, &details);
	return 0;
}

/*
 * Decide whether a DONTNEED/FREE style operation may be applied to
 * madv_behavior->vma.
 *
 * Non-hugetlb VMAs are rejected when VM_PFNMAP is set, or when VM_LOCKED is
 * set and the behavior is anything other than MADV_DONTNEED_LOCKED.
 *
 * hugetlb VMAs are accepted only for MADV_DONTNEED and MADV_DONTNEED_LOCKED
 * and only when range->start is huge page aligned. As a side effect
 * range->end is rounded DOWN to a huge page boundary, so the range may be
 * empty on return.
 *
 * Returns true if the operation may proceed.
 */
static bool madvise_dontneed_free_valid_vma(struct madvise_behavior *madv_behavior)
{
	struct vm_area_struct *vma = madv_behavior->vma;
	int behavior = madv_behavior->behavior;
	struct madvise_behavior_range *range = &madv_behavior->range;

	if (!is_vm_hugetlb_page(vma)) {
		unsigned int forbidden = VM_PFNMAP;

		/* Only the _LOCKED variant is allowed to touch VM_LOCKED VMAs. */
		if (behavior != MADV_DONTNEED_LOCKED)
			forbidden |= VM_LOCKED;

		return !(vma->vm_flags & forbidden);
	}

	/* From here on the VMA is hugetlb: this excludes MADV_FREE. */
	if (behavior != MADV_DONTNEED && behavior != MADV_DONTNEED_LOCKED)
		return false;
	if (range->start & ~huge_page_mask(hstate_vma(vma)))
		return false;

	/*
	 * Madvise callers expect the length to be rounded up to PAGE_SIZE
	 * boundaries, and may be unaware that this VMA uses huge pages.
	 * Avoid unexpected data loss by rounding down the number of
	 * huge pages freed.
	 */
	range->end = ALIGN_DOWN(range->end, huge_page_size(hstate_vma(vma)));

	return true;
}

/*
 * Common handler for MADV_DONTNEED, MADV_DONTNEED_LOCKED and MADV_FREE on the
 * VMA in madv_behavior->vma.
 *
 * Returns 0 on success or when the (possibly adjusted) range is empty,
 * -EINVAL if the VMA or behavior is unsuitable, -ENOMEM if no VMA contains
 * range->start after the lock had to be retaken, otherwise whatever the
 * per-behavior helper returns.
 */
static long madvise_dontneed_free(struct madvise_behavior *madv_behavior)
{
	struct mm_struct *mm = madv_behavior->mm;
	struct madvise_behavior_range *range = &madv_behavior->range;
	int behavior = madv_behavior->behavior;

	if (!madvise_dontneed_free_valid_vma(madv_behavior))
		return -EINVAL;

	/* The hugetlb rounding above may have emptied the range. */
	if (range->start == range->end)
		return 0;

	/*
	 * A false return is handled as "mmap_lock is no longer held": the drop
	 * is recorded for the caller, the lock is retaken for reading and the
	 * VMA is looked up and validated again from scratch.
	 */
	if (!userfaultfd_remove(madv_behavior->vma, range->start, range->end)) {
		struct vm_area_struct *vma;

		mark_mmap_lock_dropped(madv_behavior);
		mmap_read_lock(mm);
		madv_behavior->vma = vma = vma_lookup(mm, range->start);
		if (!vma)
			return -ENOMEM;
		/*
		 * Potential end adjustment for hugetlb vma is OK as
		 * the check below keeps end within vma.
		 */
		if (!madvise_dontneed_free_valid_vma(madv_behavior))
			return -EINVAL;
		if (range->end > vma->vm_end) {
			/*
			 * Don't fail if end > vma->vm_end. If the old
			 * vma was split while the mmap_lock was
			 * released the effect of the concurrent
			 * operation may not cause madvise() to
			 * have an undefined result. There may be an
			 * adjacent next vma that we'll walk
			 * next. userfaultfd_remove() will generate an
			 * UFFD_EVENT_REMOVE repetition on the
			 * end-vma->vm_end range, but the manager can
			 * handle a repetition fine.
			 */
			range->end = vma->vm_end;
		}
		/*
		 * If the memory region between start and end was
		 * originally backed by 4kB pages and then remapped to
		 * be backed by hugepages while mmap_lock was dropped,
		 * the adjustment for hugetlb vma above may have rounded
		 * end down to the start address.
		 */
		if (range->start == range->end)
			return 0;
		VM_WARN_ON(range->start > range->end);
	}

	/* Dispatch to the zap (DONTNEED*) or lazy-free (FREE) implementation. */
	if (behavior == MADV_DONTNEED || behavior == MADV_DONTNEED_LOCKED)
		return madvise_dontneed_single_vma(madv_behavior);
	else if (behavior == MADV_FREE)
		return madvise_free_single_vma(madv_behavior);
	else
		return -EINVAL;
}

/*
 * MADV_POPULATE_READ / MADV_POPULATE_WRITE: fault in the whole requested
 * range; write faults are requested only for MADV_POPULATE_WRITE.
 *
 * Returns 0 once the whole range has been processed, or a negative errno
 * derived from faultin_page_range(). Any error value that is not explicitly
 * listed is logged once and reported as -ENOMEM.
 */
static long madvise_populate(struct madvise_behavior *madv_behavior)
{
	struct mm_struct *mm = madv_behavior->mm;
	const bool write = madv_behavior->behavior == MADV_POPULATE_WRITE;
	int locked = 1;
	unsigned long start = madv_behavior->range.start;
	unsigned long end = madv_behavior->range.end;
	long pages;

	while (start < end) {
		/* Populate (prefault) page tables readable/writable. */
		pages = faultin_page_range(mm, start, end, write, &locked);
		/*
		 * If the helper reports the lock as no longer held, retake it
		 * so that every exit path below leaves mmap_lock read-held.
		 */
		if (!locked) {
			mmap_read_lock(mm);
			locked = 1;
		}
		if (pages < 0) {
			switch (pages) {
			case -EINTR:
				return -EINTR;
			case -EINVAL: /* Incompatible mappings / permissions. */
				return -EINVAL;
			case -EHWPOISON:
				return -EHWPOISON;
			case -EFAULT: /* VM_FAULT_SIGBUS or VM_FAULT_SIGSEGV */
				return -EFAULT;
			default:
				pr_warn_once("%s: unhandled return value: %ld\n",
					     __func__, pages);
				fallthrough;
			case -ENOMEM: /* No VMA or out of memory. */
				return -ENOMEM;
			}
		}
		/* Advance past the pages that were just handled. */
		start += pages * PAGE_SIZE;
	}
	return 0;
}

/*
 * Application wants to free up the pages and associated backing store.
 * This is effectively punching a hole into the middle of a file.
*/ static long madvise_remove(struct madvise_behavior *madv_behavior) { loff_t offset; int error; struct file *f; struct mm_struct *mm = madv_behavior->mm; struct vm_area_struct *vma = madv_behavior->vma; unsigned long start = madv_behavior->range.start; unsigned long end = madv_behavior->range.end; mark_mmap_lock_dropped(madv_behavior); if (vma->vm_flags & VM_LOCKED) return -EINVAL; f = vma->vm_file; if (!f || !f->f_mapping || !f->f_mapping->host) { return -EINVAL; } if (!vma_is_shared_maywrite(vma)) return -EACCES; offset = (loff_t)(start - vma->vm_start) + ((loff_t)vma->vm_pgoff << PAGE_SHIFT); /* * Filesystem's fallocate may need to take i_rwsem. We need to * explicitly grab a reference because the vma (and hence the * vma's reference to the file) can go away as soon as we drop * mmap_lock. */ get_file(f); if (userfaultfd_remove(vma, start, end)) { /* mmap_lock was not released by userfaultfd_remove() */ mmap_read_unlock(mm); } error = vfs_fallocate(f, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, offset, end - start); fput(f); mmap_read_lock(mm); return error; } static bool is_valid_guard_vma(struct vm_area_struct *vma, bool allow_locked) { vm_flags_t disallowed = VM_SPECIAL | VM_HUGETLB; /* * A user could lock after setting a guard range but that's fine, as * they'd not be able to fault in. The issue arises when we try to zap * existing locked VMAs. We don't want to do that. */ if (!allow_locked) disallowed |= VM_LOCKED; return !(vma->vm_flags & disallowed); } static bool is_guard_pte_marker(pte_t ptent) { const softleaf_t entry = softleaf_from_pte(ptent); return softleaf_is_guard_marker(entry); } static int guard_install_pud_entry(pud_t *pud, unsigned long addr, unsigned long next, struct mm_walk *walk) { pud_t pudval = pudp_get(pud); /* If huge return >0 so we abort the operation + zap. 
*/ return pud_trans_huge(pudval); } static int guard_install_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long next, struct mm_walk *walk) { pmd_t pmdval = pmdp_get(pmd); /* If huge return >0 so we abort the operation + zap. */ return pmd_trans_huge(pmdval); } static int guard_install_pte_entry(pte_t *pte, unsigned long addr, unsigned long next, struct mm_walk *walk) { pte_t pteval = ptep_get(pte); unsigned long *nr_pages = (unsigned long *)walk->private; /* If there is already a guard page marker, we have nothing to do. */ if (is_guard_pte_marker(pteval)) { (*nr_pages)++; return 0; } /* If populated return >0 so we abort the operation + zap. */ return 1; } static int guard_install_set_pte(unsigned long addr, unsigned long next, pte_t *ptep, struct mm_walk *walk) { unsigned long *nr_pages = (unsigned long *)walk->private; /* Simply install a PTE marker, this causes segfault on access. */ *ptep = make_pte_marker(PTE_MARKER_GUARD); (*nr_pages)++; return 0; } static long madvise_guard_install(struct madvise_behavior *madv_behavior) { struct vm_area_struct *vma = madv_behavior->vma; struct madvise_behavior_range *range = &madv_behavior->range; struct mm_walk_ops walk_ops = { .pud_entry = guard_install_pud_entry, .pmd_entry = guard_install_pmd_entry, .pte_entry = guard_install_pte_entry, .install_pte = guard_install_set_pte, .walk_lock = get_walk_lock(madv_behavior->lock_mode), }; long err; int i; if (!is_valid_guard_vma(vma, /* allow_locked = */false)) return -EINVAL; /* * Set atomically under read lock. All pertinent readers will need to * acquire an mmap/VMA write lock to read it. All remaining readers may * or may not see the flag set, but we don't care. */ vma_set_atomic_flag(vma, VMA_MAYBE_GUARD_BIT); /* * If anonymous and we are establishing page tables the VMA ought to * have an anon_vma associated with it. * * We will hold an mmap read lock if this is necessary, this is checked * as part of the VMA lock logic. 
*/ if (vma_is_anonymous(vma)) { VM_WARN_ON_ONCE(!vma->anon_vma && madv_behavior->lock_mode != MADVISE_MMAP_READ_LOCK); err = anon_vma_prepare(vma); if (err) return err; } /* * Optimistically try to install the guard marker pages first. If any * non-guard pages or THP huge pages are encountered, give up and zap * the range before trying again. * * We try a few times before giving up and releasing back to userland to * loop around, releasing locks in the process to avoid contention. * * This would only happen due to races with e.g. page faults or * khugepaged. * * In most cases we should simply install the guard markers immediately * with no zap or looping. */ for (i = 0; i < MAX_MADVISE_GUARD_RETRIES; i++) { unsigned long nr_pages = 0; /* Returns < 0 on error, == 0 if success, > 0 if zap needed. */ if (madv_behavior->lock_mode == MADVISE_VMA_READ_LOCK) err = walk_page_range_vma_unsafe(madv_behavior->vma, range->start, range->end, &walk_ops, &nr_pages); else err = walk_page_range_mm_unsafe(vma->vm_mm, range->start, range->end, &walk_ops, &nr_pages); if (err < 0) return err; if (err == 0) { unsigned long nr_expected_pages = PHYS_PFN(range->end - range->start); VM_WARN_ON(nr_pages != nr_expected_pages); return 0; } /* * OK some of the range have non-guard pages mapped, zap * them. This leaves existing guard pages in place. */ zap_vma_range(vma, range->start, range->end - range->start); } /* * We were unable to install the guard pages, return to userspace and * immediately retry, relieving lock contention. */ return restart_syscall(); } static int guard_remove_pud_entry(pud_t *pud, unsigned long addr, unsigned long next, struct mm_walk *walk) { pud_t pudval = pudp_get(pud); /* If huge, cannot have guard pages present, so no-op - skip. 
*/ if (pud_trans_huge(pudval)) walk->action = ACTION_CONTINUE; return 0; } static int guard_remove_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long next, struct mm_walk *walk) { pmd_t pmdval = pmdp_get(pmd); /* If huge, cannot have guard pages present, so no-op - skip. */ if (pmd_trans_huge(pmdval)) walk->action = ACTION_CONTINUE; return 0; } static int guard_remove_pte_entry(pte_t *pte, unsigned long addr, unsigned long next, struct mm_walk *walk) { pte_t ptent = ptep_get(pte); if (is_guard_pte_marker(ptent)) { /* Simply clear the PTE marker. */ pte_clear_not_present_full(walk->mm, addr, pte, false); update_mmu_cache(walk->vma, addr, pte); } return 0; } static long madvise_guard_remove(struct madvise_behavior *madv_behavior) { struct vm_area_struct *vma = madv_behavior->vma; struct madvise_behavior_range *range = &madv_behavior->range; struct mm_walk_ops wallk_ops = { .pud_entry = guard_remove_pud_entry, .pmd_entry = guard_remove_pmd_entry, .pte_entry = guard_remove_pte_entry, .walk_lock = get_walk_lock(madv_behavior->lock_mode), }; /* * We're ok with removing guards in mlock()'d ranges, as this is a * non-destructive action. */ if (!is_valid_guard_vma(vma, /* allow_locked = */true)) return -EINVAL; return walk_page_range_vma(vma, range->start, range->end, &wallk_ops, NULL); } #ifdef CONFIG_64BIT /* Does the madvise operation result in discarding of mapped data? */ static bool is_discard(int behavior) { switch (behavior) { case MADV_FREE: case MADV_DONTNEED: case MADV_DONTNEED_LOCKED: case MADV_REMOVE: case MADV_DONTFORK: case MADV_WIPEONFORK: case MADV_GUARD_INSTALL: return true; } return false; } /* * We are restricted from madvise()'ing mseal()'d VMAs only in very particular * circumstances - discarding of data from read-only anonymous SEALED mappings. * * This is because users cannot trivally discard data from these VMAs, and may * only do so via an appropriate madvise() call. 
*/ static bool can_madvise_modify(struct madvise_behavior *madv_behavior) { struct vm_area_struct *vma = madv_behavior->vma; /* If the VMA isn't sealed we're good. */ if (!vma_is_sealed(vma)) return true; /* For a sealed VMA, we only care about discard operations. */ if (!is_discard(madv_behavior->behavior)) return true; /* * We explicitly permit all file-backed mappings, whether MAP_SHARED or * MAP_PRIVATE. * * The latter causes some complications. Because now, one can mmap() * read/write a MAP_PRIVATE mapping, write to it, then mprotect() * read-only, mseal() and a discard will be permitted. * * However, in order to avoid issues with potential use of madvise(..., * MADV_DONTNEED) of mseal()'d .text mappings we, for the time being, * permit this. */ if (!vma_is_anonymous(vma)) return true; /* If the user could write to the mapping anyway, then this is fine. */ if ((vma->vm_flags & VM_WRITE) && arch_vma_access_permitted(vma, /* write= */ true, /* execute= */ false, /* foreign= */ false)) return true; /* Otherwise, we are not permitted to perform this operation. */ return false; } #else static bool can_madvise_modify(struct madvise_behavior *madv_behavior) { return true; } #endif /* * Apply an madvise behavior to a region of a vma. madvise_update_vma * will handle splitting a vm area into separate areas, each area with its own * behavior. 
*/
static int madvise_vma_behavior(struct madvise_behavior *madv_behavior)
{
	int behavior = madv_behavior->behavior;
	struct vm_area_struct *vma = madv_behavior->vma;
	/* Starts as the current flags; the flag-updating cases below edit it. */
	vm_flags_t new_flags = vma->vm_flags;
	struct madvise_behavior_range *range = &madv_behavior->range;
	int error;

	/* The sealed-VMA policy is checked before anything else. */
	if (unlikely(!can_madvise_modify(madv_behavior)))
		return -EPERM;

	switch (behavior) {
	/*
	 * These behaviours are handled entirely by a dedicated helper and
	 * return its result directly, i.e. without passing through the
	 * -ENOMEM to -EAGAIN translation at the "out" label.
	 */
	case MADV_REMOVE:
		return madvise_remove(madv_behavior);
	case MADV_WILLNEED:
		return madvise_willneed(madv_behavior);
	case MADV_COLD:
		return madvise_cold(madv_behavior);
	case MADV_PAGEOUT:
		return madvise_pageout(madv_behavior);
	case MADV_FREE:
	case MADV_DONTNEED:
	case MADV_DONTNEED_LOCKED:
		return madvise_dontneed_free(madv_behavior);
	case MADV_COLLAPSE:
		return madvise_collapse(vma, range->start, range->end,
			&madv_behavior->lock_dropped);
	case MADV_GUARD_INSTALL:
		return madvise_guard_install(madv_behavior);
	case MADV_GUARD_REMOVE:
		return madvise_guard_remove(madv_behavior);

	/* The below behaviours update VMAs via madvise_update_vma(). */

	case MADV_NORMAL:
		new_flags = new_flags & ~VM_RAND_READ & ~VM_SEQ_READ;
		break;
	case MADV_SEQUENTIAL:
		new_flags = (new_flags & ~VM_RAND_READ) | VM_SEQ_READ;
		break;
	case MADV_RANDOM:
		new_flags = (new_flags & ~VM_SEQ_READ) | VM_RAND_READ;
		break;
	case MADV_DONTFORK:
		new_flags |= VM_DONTCOPY;
		break;
	case MADV_DOFORK:
		if (new_flags & VM_SPECIAL)
			return -EINVAL;
		new_flags &= ~VM_DONTCOPY;
		break;
	case MADV_WIPEONFORK:
		/* MADV_WIPEONFORK is only supported on anonymous memory. */
		if (vma->vm_file || new_flags & VM_SHARED)
			return -EINVAL;
		new_flags |= VM_WIPEONFORK;
		break;
	case MADV_KEEPONFORK:
		if (new_flags & VM_DROPPABLE)
			return -EINVAL;
		new_flags &= ~VM_WIPEONFORK;
		break;
	case MADV_DONTDUMP:
		new_flags |= VM_DONTDUMP;
		break;
	case MADV_DODUMP:
		/* VM_SPECIAL blocks this only for non-hugetlb VMAs. */
		if ((!is_vm_hugetlb_page(vma) && (new_flags & VM_SPECIAL)) ||
		    (new_flags & VM_DROPPABLE))
			return -EINVAL;
		new_flags &= ~VM_DONTDUMP;
		break;
	case MADV_MERGEABLE:
	case MADV_UNMERGEABLE:
		/* The helper edits new_flags; its errors go through "out". */
		error = ksm_madvise(vma, range->start, range->end,
				behavior, &new_flags);
		if (error)
			goto out;
		break;
	case MADV_HUGEPAGE:
	case MADV_NOHUGEPAGE:
		/* The helper edits new_flags; its errors go through "out". */
		error = hugepage_madvise(vma, &new_flags, behavior);
		if (error)
			goto out;
		break;
	case __MADV_SET_ANON_VMA_NAME:
		/* Only anonymous mappings can be named */
		if (vma->vm_file && !vma_is_anon_shmem(vma))
			return -EBADF;
		break;
	}

	/* This is a write operation.*/
	VM_WARN_ON_ONCE(madv_behavior->lock_mode != MADVISE_MMAP_WRITE_LOCK);

	error = madvise_update_vma(new_flags, madv_behavior);
out:
	/*
	 * madvise() returns EAGAIN if kernel resources, such as
	 * slab, are temporarily unavailable.
	 */
	if (error == -ENOMEM)
		error = -EAGAIN;
	return error;
}

#ifdef CONFIG_MEMORY_FAILURE
/*
 * Error injection support for memory error handling.
 */
static int madvise_inject_error(struct madvise_behavior *madv_behavior)
{
	/* Step of the loop below; set on every iteration before it is used. */
	unsigned long size;
	unsigned long start = madv_behavior->range.start;
	unsigned long end = madv_behavior->range.end;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	for (; start < end; start += size) {
		unsigned long pfn;
		struct page *page;
		int ret;

		/* Pin exactly one page; any result other than 1 ends the call. */
		ret = get_user_pages_fast(start, 1, 0, &page);
		if (ret != 1)
			return ret;
		pfn = page_to_pfn(page);

		/*
		 * When soft offlining hugepages, after migrating the page
		 * we dissolve it, therefore in the second loop "page" will
		 * no longer be a compound page.
		 */
		size = page_size(compound_head(page));

		if (madv_behavior->behavior == MADV_SOFT_OFFLINE) {
			pr_info("Soft offlining pfn %#lx at process virtual address %#lx\n",
				 pfn, start);
			ret = soft_offline_page(pfn, MF_COUNT_INCREASED);
		} else {
			pr_info("Injecting memory failure for pfn %#lx at process virtual address %#lx\n",
				 pfn, start);
			ret = memory_failure(pfn, MF_ACTION_REQUIRED | MF_COUNT_INCREASED | MF_SW_SIMULATED);
			/* -EOPNOTSUPP from memory_failure() is treated as success. */
			if (ret == -EOPNOTSUPP)
				ret = 0;
		}

		/* The first failing page aborts the remainder of the range. */
		if (ret)
			return ret;
	}

	return 0;
}

/* True for the behaviours that are served by madvise_inject_error(). */
static bool is_memory_failure(struct madvise_behavior *madv_behavior)
{
	switch (madv_behavior->behavior) {
	case MADV_HWPOISON:
	case MADV_SOFT_OFFLINE:
		return true;
	default:
		return false;
	}
}

#else

/* !CONFIG_MEMORY_FAILURE stub: nothing is injected, success is reported. */
static int madvise_inject_error(struct madvise_behavior *madv_behavior)
{
	return 0;
}

/* !CONFIG_MEMORY_FAILURE stub: no behaviour counts as a memory failure. */
static bool is_memory_failure(struct madvise_behavior *madv_behavior)
{
	return false;
}

#endif	/* CONFIG_MEMORY_FAILURE */

/*
 * Is @behavior a value this kernel build accepts? Entries inside an #ifdef
 * are only accepted when the corresponding config option is enabled; any
 * value not listed is rejected.
 */
static bool
madvise_behavior_valid(int behavior)
{
	switch (behavior) {
	case MADV_DOFORK:
	case MADV_DONTFORK:
	case MADV_NORMAL:
	case MADV_SEQUENTIAL:
	case MADV_RANDOM:
	case MADV_REMOVE:
	case MADV_WILLNEED:
	case MADV_DONTNEED:
	case MADV_DONTNEED_LOCKED:
	case MADV_FREE:
	case MADV_COLD:
	case MADV_PAGEOUT:
	case MADV_POPULATE_READ:
	case MADV_POPULATE_WRITE:
#ifdef CONFIG_KSM
	case MADV_MERGEABLE:
	case MADV_UNMERGEABLE:
#endif
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	case MADV_HUGEPAGE:
	case MADV_NOHUGEPAGE:
	case MADV_COLLAPSE:
#endif
	case MADV_DONTDUMP:
	case MADV_DODUMP:
	case MADV_WIPEONFORK:
	case MADV_KEEPONFORK:
	case MADV_GUARD_INSTALL:
	case MADV_GUARD_REMOVE:
#ifdef CONFIG_MEMORY_FAILURE
	case MADV_SOFT_OFFLINE:
	case MADV_HWPOISON:
#endif
		return true;

	default:
		return false;
	}
}

/* Can we invoke process_madvise() on a remote mm for the specified behavior? */
static bool process_madvise_remote_valid(int behavior)
{
	switch (behavior) {
	case MADV_COLD:
	case MADV_PAGEOUT:
	case MADV_WILLNEED:
	case MADV_COLLAPSE:
		return true;
	default:
		return false;
	}
}

/* Does this operation invoke anon_vma_prepare()?
*/ static bool prepares_anon_vma(int behavior) { switch (behavior) { case MADV_GUARD_INSTALL: return true; default: return false; } } /* * We have acquired a VMA read lock, is the VMA valid to be madvise'd under VMA * read lock only now we have a VMA to examine? */ static bool is_vma_lock_sufficient(struct vm_area_struct *vma, struct madvise_behavior *madv_behavior) { /* Must span only a single VMA.*/ if (madv_behavior->range.end > vma->vm_end) return false; /* Remote processes unsupported. */ if (current->mm != vma->vm_mm) return false; /* Userfaultfd unsupported. */ if (userfaultfd_armed(vma)) return false; /* * anon_vma_prepare() explicitly requires an mmap lock for * serialisation, so we cannot use a VMA lock in this case. * * Note we might race with anon_vma being set, however this makes this * check overly paranoid which is safe. */ if (vma_is_anonymous(vma) && prepares_anon_vma(madv_behavior->behavior) && !vma->anon_vma) return false; return true; } /* * Try to acquire a VMA read lock if possible. * * We only support this lock over a single VMA, which the input range must * span either partially or fully. * * This function always returns with an appropriate lock held. If a VMA read * lock could be acquired, we return true and set madv_behavior state * accordingly. * * If a VMA read lock could not be acquired, we return false and expect caller to * fallback to mmap lock behaviour. */ static bool try_vma_read_lock(struct madvise_behavior *madv_behavior) { struct mm_struct *mm = madv_behavior->mm; struct vm_area_struct *vma; vma = lock_vma_under_rcu(mm, madv_behavior->range.start); if (!vma) goto take_mmap_read_lock; if (!is_vma_lock_sufficient(vma, madv_behavior)) { vma_end_read(vma); goto take_mmap_read_lock; } madv_behavior->vma = vma; return true; take_mmap_read_lock: mmap_read_lock(mm); madv_behavior->lock_mode = MADVISE_MMAP_READ_LOCK; return false; } /* * Walk the vmas in range [start,end), and call the madvise_vma_behavior * function on each one. 
The function will get start and end parameters that
 * cover the overlap between the current vma and the original range. Any
 * unmapped regions in the original range will result in this function returning
 * -ENOMEM while still calling the madvise_vma_behavior function on all of the
 * existing vmas in the range. Must be called with the mmap_lock held for
 * reading or writing.
 */
static
int madvise_walk_vmas(struct madvise_behavior *madv_behavior)
{
	struct mm_struct *mm = madv_behavior->mm;
	struct madvise_behavior_range *range = &madv_behavior->range;
	/* range is updated to span each VMA, so store end of entire range. */
	unsigned long last_end = range->end;
	/* Sticky: set once a hole is seen, returned only if nothing else fails. */
	int unmapped_error = 0;
	int error;
	struct vm_area_struct *prev, *vma;

	/*
	 * If VMA read lock is supported, apply madvise to a single VMA
	 * tentatively, avoiding walking VMAs.
	 */
	if (madv_behavior->lock_mode == MADVISE_VMA_READ_LOCK &&
	    try_vma_read_lock(madv_behavior)) {
		error = madvise_vma_behavior(madv_behavior);
		vma_end_read(madv_behavior->vma);
		return error;
	}

	/*
	 * Either the mode was never the VMA read lock, or try_vma_read_lock()
	 * failed and left us holding the mmap read lock instead.
	 */
	vma = find_vma_prev(mm, range->start, &prev);
	/* When start lies strictly inside vma, vma itself is used as "prev". */
	if (vma && range->start > vma->vm_start)
		prev = vma;

	for (;;) {
		/* Still start < end. */
		if (!vma)
			return -ENOMEM;

		/* Here start < (last_end|vma->vm_end). */
		if (range->start < vma->vm_start) {
			/*
			 * This indicates a gap between VMAs in the input
			 * range. This does not cause the operation to abort,
			 * rather we simply return -ENOMEM to indicate that this
			 * has happened, but carry on.
			 */
			unmapped_error = -ENOMEM;
			range->start = vma->vm_start;
			if (range->start >= last_end)
				break;
		}

		/* Here vma->vm_start <= range->start < (last_end|vma->vm_end) */
		range->end = min(vma->vm_end, last_end);

		/* Here vma->vm_start <= range->start < range->end <= (last_end|vma->vm_end). */
		madv_behavior->prev = prev;
		madv_behavior->vma = vma;
		error = madvise_vma_behavior(madv_behavior);
		/* A real error takes precedence over a recorded hole. */
		if (error)
			return error;
		if (madv_behavior->lock_dropped) {
			/* We dropped the mmap lock, we can't ref the VMA. */
			prev = NULL;
			vma = NULL;
			madv_behavior->lock_dropped = false;
		} else {
			/* The callee may have replaced madv_behavior->vma; use its value. */
			vma = madv_behavior->vma;
			prev = vma;
		}

		if (vma && range->end < vma->vm_end)
			range->end = vma->vm_end;
		if (range->end >= last_end)
			break;

		/* Without a usable VMA pointer, search again from range->end. */
		vma = find_vma(mm, vma ? vma->vm_end : range->end);
		range->start = range->end;
	}

	return unmapped_error;
}

/*
 * Any behaviour which results in changes to the vma->vm_flags needs to
 * take mmap_lock for writing. Others, which simply traverse vmas, need
 * to only take it for reading.
 */
static enum madvise_lock_mode get_lock_mode(struct madvise_behavior *madv_behavior)
{
	/* Memory failure injection takes none of the locks managed here. */
	if (is_memory_failure(madv_behavior))
		return MADVISE_NO_LOCK;

	switch (madv_behavior->behavior) {
	case MADV_REMOVE:
	case MADV_WILLNEED:
	case MADV_COLD:
	case MADV_PAGEOUT:
	case MADV_POPULATE_READ:
	case MADV_POPULATE_WRITE:
	case MADV_COLLAPSE:
		return MADVISE_MMAP_READ_LOCK;
	case MADV_GUARD_INSTALL:
	case MADV_GUARD_REMOVE:
	case MADV_DONTNEED:
	case MADV_DONTNEED_LOCKED:
	case MADV_FREE:
		return MADVISE_VMA_READ_LOCK;
	default:
		return MADVISE_MMAP_WRITE_LOCK;
	}
}

/*
 * Take whatever lock get_lock_mode() selects and record the mode in
 * madv_behavior->lock_mode.
 *
 * Returns 0 on success, or -EINTR when the killable write lock attempt
 * fails; in the latter case lock_mode is left unmodified.
 */
static int madvise_lock(struct madvise_behavior *madv_behavior)
{
	struct mm_struct *mm = madv_behavior->mm;
	enum madvise_lock_mode lock_mode = get_lock_mode(madv_behavior);

	switch (lock_mode) {
	case MADVISE_NO_LOCK:
		break;
	case MADVISE_MMAP_WRITE_LOCK:
		if (mmap_write_lock_killable(mm))
			return -EINTR;
		break;
	case MADVISE_MMAP_READ_LOCK:
		mmap_read_lock(mm);
		break;
	case MADVISE_VMA_READ_LOCK:
		/* We will acquire the lock per-VMA in madvise_walk_vmas(). */
		break;
	}

	madv_behavior->lock_mode = lock_mode;
	return 0;
}

/*
 * Release the lock described by madv_behavior->lock_mode and reset the mode
 * to MADVISE_NO_LOCK. The mode may differ from the one madvise_lock() chose:
 * try_vma_read_lock() switches it to MADVISE_MMAP_READ_LOCK on fallback.
 */
static void madvise_unlock(struct madvise_behavior *madv_behavior)
{
	struct mm_struct *mm = madv_behavior->mm;

	switch (madv_behavior->lock_mode) {
	case  MADVISE_NO_LOCK:
		return;
	case MADVISE_MMAP_WRITE_LOCK:
		mmap_write_unlock(mm);
		break;
	case MADVISE_MMAP_READ_LOCK:
		mmap_read_unlock(mm);
		break;
	case MADVISE_VMA_READ_LOCK:
		/* We will drop the lock per-VMA in madvise_walk_vmas(). */
		break;
	}

	madv_behavior->lock_mode = MADVISE_NO_LOCK;
}

/* Behaviours for which an mmu_gather is set up and torn down around the call. */
static bool madvise_batch_tlb_flush(int behavior)
{
	switch (behavior) {
	case MADV_DONTNEED:
	case MADV_DONTNEED_LOCKED:
	case MADV_FREE:
		return true;
	default:
		return false;
	}
}

/* Initialise madv_behavior->tlb, but only for the batching behaviours. */
static void madvise_init_tlb(struct madvise_behavior *madv_behavior)
{
	if (madvise_batch_tlb_flush(madv_behavior->behavior))
		tlb_gather_mmu(madv_behavior->tlb, madv_behavior->mm);
}

/* Counterpart of madvise_init_tlb(): finish the gather when one was set up. */
static void madvise_finish_tlb(struct madvise_behavior *madv_behavior)
{
	if (madvise_batch_tlb_flush(madv_behavior->behavior))
		tlb_finish_mmu(madv_behavior->tlb);
}

/*
 * Validate the raw madvise() arguments: the behaviour must be known, @start
 * page aligned, and neither rounding @len_in up to a page multiple nor adding
 * it to @start may wrap around.
 */
static bool is_valid_madvise(unsigned long start, size_t len_in, int behavior)
{
	size_t len;

	if (!madvise_behavior_valid(behavior))
		return false;

	if (!PAGE_ALIGNED(start))
		return false;
	len = PAGE_ALIGN(len_in);

	/* Check to see whether len was rounded up from small -ve to zero */
	if (len_in && !len)
		return false;

	/* Reject ranges whose end wraps around the address space. */
	if (start + len < start)
		return false;

	return true;
}

/*
 * madvise_should_skip() - Return if the request is invalid or nothing.
 * @start:	Start address of madvise-requested address range.
 * @len_in:	Length of madvise-requested address range.
 * @behavior:	Requested madvise behavior.
 * @err:	Pointer to store an error code from the check.
 *
 * If the specified behaviour is invalid or nothing would occur, we skip the
 * operation. This function returns true in the cases, otherwise false. In
 * the former case we store an error on @err.
 */
static bool madvise_should_skip(unsigned long start, size_t len_in,
		int behavior, int *err)
{
	if (!is_valid_madvise(start, len_in, behavior)) {
		*err = -EINVAL;
		return true;
	}
	/* A zero-length request is a successful no-op. */
	if (start + PAGE_ALIGN(len_in) == start) {
		*err = 0;
		return true;
	}
	return false;
}

/* True for the two MADV_POPULATE_* behaviours. */
static bool is_madvise_populate(struct madvise_behavior *madv_behavior)
{
	switch (madv_behavior->behavior) {
	case MADV_POPULATE_READ:
	case MADV_POPULATE_WRITE:
		return true;
	default:
		return false;
	}
}

/*
 * untagged_addr_remote() assumes mmap_lock is already held.
On * architectures like x86 and RISC-V, tagging is tricky because each * mm may have a different tagging mask. However, we might only hold * the per-VMA lock (currently only local processes are supported), * so untagged_addr is used to avoid the mmap_lock assertion for * local processes. */ static inline unsigned long get_untagged_addr(struct mm_struct *mm, unsigned long start) { return current->mm == mm ? untagged_addr(start) : untagged_addr_remote(mm, start); } static int madvise_do_behavior(unsigned long start, size_t len_in, struct madvise_behavior *madv_behavior) { struct blk_plug plug; int error; struct madvise_behavior_range *range = &madv_behavior->range; if (is_memory_failure(madv_behavior)) { range->start = start; range->end = start + len_in; return madvise_inject_error(madv_behavior); } range->start = get_untagged_addr(madv_behavior->mm, start); range->end = range->start + PAGE_ALIGN(len_in); blk_start_plug(&plug); if (is_madvise_populate(madv_behavior)) error = madvise_populate(madv_behavior); else error = madvise_walk_vmas(madv_behavior); blk_finish_plug(&plug); return error; } /* * The madvise(2) system call. * * Applications can use madvise() to advise the kernel how it should * handle paging I/O in this VM area. The idea is to help the kernel * use appropriate read-ahead and caching techniques. The information * provided is advisory only, and can be safely disregarded by the * kernel without affecting the correct operation of the application. * * behavior values: * MADV_NORMAL - the default behavior is to read clusters. This * results in some read-ahead and read-behind. * MADV_RANDOM - the system should read the minimum amount of data * on any access, since it is unlikely that the appli- * cation will need more than what it asks for. * MADV_SEQUENTIAL - pages in the given range will probably be accessed * once, so they can be aggressively read ahead, and * can be freed soon after they are accessed. 
* MADV_WILLNEED - the application is notifying the system to read * some pages ahead. * MADV_DONTNEED - the application is finished with the given range, * so the kernel can free resources associated with it. * MADV_FREE - the application marks pages in the given range as lazy free, * where actual purges are postponed until memory pressure happens. * MADV_REMOVE - the application wants to free up the given range of * pages and associated backing store. * MADV_DONTFORK - omit this area from child's address space when forking: * typically, to avoid COWing pages pinned by get_user_pages(). * MADV_DOFORK - cancel MADV_DONTFORK: no longer omit this area when forking. * MADV_WIPEONFORK - present the child process with zero-filled memory in this * range after a fork. * MADV_KEEPONFORK - undo the effect of MADV_WIPEONFORK * MADV_HWPOISON - trigger memory error handler as if the given memory range * were corrupted by unrecoverable hardware memory failure. * MADV_SOFT_OFFLINE - try to soft-offline the given range of memory. * MADV_MERGEABLE - the application recommends that KSM try to merge pages in * this area with pages of identical content from other such areas. * MADV_UNMERGEABLE- cancel MADV_MERGEABLE: no longer merge pages with others. * MADV_HUGEPAGE - the application wants to back the given range by transparent * huge pages in the future. Existing pages might be coalesced and * new pages might be allocated as THP. * MADV_NOHUGEPAGE - mark the given range as not worth being backed by * transparent huge pages so the existing pages will not be * coalesced into THP and new pages will not be allocated as THP. * MADV_COLLAPSE - synchronously coalesce pages into new THP. * MADV_DONTDUMP - the application wants to prevent pages in the given range * from being included in its core dump. * MADV_DODUMP - cancel MADV_DONTDUMP: no longer exclude from core dump. 
* MADV_COLD - the application is not expected to use this memory soon, * deactivate pages in this range so that they can be reclaimed * easily if memory pressure happens. * MADV_PAGEOUT - the application is not expected to use this memory soon, * page out the pages in this range immediately. * MADV_POPULATE_READ - populate (prefault) page tables readable by * triggering read faults if required * MADV_POPULATE_WRITE - populate (prefault) page tables writable by * triggering write faults if required * * return values: * zero - success * -EINVAL - start + len < 0, start is not page-aligned, * "behavior" is not a valid value, or application * is attempting to release locked or shared pages, * or the specified address range includes file, Huge TLB, * MAP_SHARED or VMPFNMAP range. * -ENOMEM - addresses in the specified range are not currently * mapped, or are outside the AS of the process. * -EIO - an I/O error occurred while paging in data. * -EBADF - map exists, but area maps something that isn't a file. * -EAGAIN - a kernel resource was temporarily unavailable. * -EPERM - memory is sealed. */ int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int behavior) { int error; struct mmu_gather tlb; struct madvise_behavior madv_behavior = { .mm = mm, .behavior = behavior, .tlb = &tlb, }; if (madvise_should_skip(start, len_in, behavior, &error)) return error; error = madvise_lock(&madv_behavior); if (error) return error; madvise_init_tlb(&madv_behavior); error = madvise_do_behavior(start, len_in, &madv_behavior); madvise_finish_tlb(&madv_behavior); madvise_unlock(&madv_behavior); return error; } SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior) { return do_madvise(current->mm, start, len_in, behavior); } /* Perform an madvise operation over a vector of addresses and lengths. 
*/
static ssize_t vector_madvise(struct mm_struct *mm, struct iov_iter *iter,
			      int behavior)
{
	ssize_t ret = 0;
	/* Byte count of the whole vector, sampled before anything is consumed. */
	size_t total_len;
	struct mmu_gather tlb;
	struct madvise_behavior madv_behavior = {
		.mm = mm,
		.behavior = behavior,
		.tlb = &tlb,
	};

	total_len = iov_iter_count(iter);

	ret = madvise_lock(&madv_behavior);
	if (ret)
		return ret;
	madvise_init_tlb(&madv_behavior);

	while (iov_iter_count(iter)) {
		unsigned long start = (unsigned long)iter_iov_addr(iter);
		size_t len_in = iter_iov_len(iter);
		int error;

		/* Invalid entries fail the loop; empty ones are stepped over. */
		if (madvise_should_skip(start, len_in, behavior, &error))
			ret = error;
		else
			ret = madvise_do_behavior(start, len_in, &madv_behavior);
		/*
		 * An madvise operation is attempting to restart the syscall,
		 * but we cannot proceed as it would not be correct to repeat
		 * the operation in aggregate, and would be surprising to the
		 * user.
		 *
		 * We drop and reacquire locks so it is safe to just loop and
		 * try again. We check for fatal signals in case we need exit
		 * early anyway.
		 */
		if (ret == -ERESTARTNOINTR) {
			if (fatal_signal_pending(current)) {
				ret = -EINTR;
				break;
			}

			/* Drop and reacquire lock to unwind race. */
			madvise_finish_tlb(&madv_behavior);
			madvise_unlock(&madv_behavior);
			ret = madvise_lock(&madv_behavior);
			/*
			 * Relocking failed: neither the lock nor the gather is
			 * held any more, so skip the teardown below.
			 */
			if (ret)
				goto out;
			madvise_init_tlb(&madv_behavior);
			/* The iterator was not advanced: retry the same entry. */
			continue;
		}
		if (ret < 0)
			break;
		iov_iter_advance(iter, iter_iov_len(iter));
	}
	madvise_finish_tlb(&madv_behavior);
	madvise_unlock(&madv_behavior);

out:
	/*
	 * Report the number of bytes consumed from the vector when that is
	 * non-zero; only otherwise is the last status (possibly an error)
	 * returned.
	 */
	ret = (total_len - iov_iter_count(iter)) ? : ret;

	return ret;
}

/*
 * process_madvise(2): apply @behavior to the address ranges described by the
 * user iovec array @vec (@vlen entries) in the process referred to by @pidfd.
 * @flags must be zero.
 */
SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec,
		size_t, vlen, int, behavior, unsigned int, flags)
{
	ssize_t ret;
	struct iovec iovstack[UIO_FASTIOV];
	/* import_iovec() is given &iov and may change it; kfree()d on exit. */
	struct iovec *iov = iovstack;
	struct iov_iter iter;
	struct task_struct *task;
	struct mm_struct *mm;
	unsigned int f_flags;

	if (flags != 0) {
		ret = -EINVAL;
		goto out;
	}

	ret = import_iovec(ITER_DEST, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter);
	if (ret < 0)
		goto out;

	task = pidfd_get_task(pidfd, &f_flags);
	if (IS_ERR(task)) {
		ret = PTR_ERR(task);
		goto free_iov;
	}

	/* Require PTRACE_MODE_READ to avoid leaking ASLR metadata. */
	mm = mm_access(task, PTRACE_MODE_READ_FSCREDS);
	if (IS_ERR(mm)) {
		ret = PTR_ERR(mm);
		goto release_task;
	}

	/*
	 * We need only perform this check if we are attempting to manipulate a
	 * remote process's address space.
	 */
	if (mm != current->mm && !process_madvise_remote_valid(behavior)) {
		ret = -EINVAL;
		goto release_mm;
	}

	/*
	 * Require CAP_SYS_NICE for influencing process performance. Note that
	 * only non-destructive hints are currently supported for remote
	 * processes.
	 */
	if (mm != current->mm && !capable(CAP_SYS_NICE)) {
		ret = -EPERM;
		goto release_mm;
	}

	ret = vector_madvise(mm, &iter, behavior);

	/* Unwind in reverse order of acquisition. */
release_mm:
	mmput(mm);
release_task:
	put_task_struct(task);
free_iov:
	kfree(iov);
out:
	return ret;
}

#ifdef CONFIG_ANON_VMA_NAME

/* Limit passed to strndup_user() when copying the name from userspace. */
#define ANON_VMA_NAME_MAX_LEN		80
/* Printable characters that are nevertheless refused in a name. */
#define ANON_VMA_NAME_INVALID_CHARS	"\\`$[]"

/* Is @ch acceptable inside an anonymous VMA name? */
static inline bool is_valid_name_char(char ch)
{
	/* printable ascii characters, excluding ANON_VMA_NAME_INVALID_CHARS */
	return ch > 0x1f && ch < 0x7f &&
		!strchr(ANON_VMA_NAME_INVALID_CHARS, ch);
}

/*
 * Apply @anon_name to [start, start + len_in) of @mm through the generic
 * madvise VMA walk, using the internal __MADV_SET_ANON_VMA_NAME behaviour.
 *
 * Returns -EINVAL for an unaligned start or a wrapping range, 0 for an empty
 * range, an error from madvise_lock(), otherwise the result of the walk.
 */
static int madvise_set_anon_name(struct mm_struct *mm, unsigned long start,
		unsigned long len_in, struct anon_vma_name *anon_name)
{
	unsigned long end;
	unsigned long len;
	int error;
	struct madvise_behavior madv_behavior = {
		.mm = mm,
		.behavior = __MADV_SET_ANON_VMA_NAME,
		.anon_name = anon_name,
	};

	if (start & ~PAGE_MASK)
		return -EINVAL;
	/* Round the length up to a whole number of pages. */
	len = (len_in + ~PAGE_MASK) & PAGE_MASK;

	/* Check to see whether len was rounded up from small -ve to zero */
	if (len_in && !len)
		return -EINVAL;

	end = start + len;
	if (end < start)
		return -EINVAL;

	if (end == start)
		return 0;

	madv_behavior.range.start = start;
	madv_behavior.range.end = end;

	error = madvise_lock(&madv_behavior);
	if (error)
		return error;
	error = madvise_walk_vmas(&madv_behavior);
	madvise_unlock(&madv_behavior);

	return error;
}

/*
 * Name the anonymous memory in [addr, addr + size) of the current process
 * after the user-supplied string @uname.
 *
 * With a NULL @uname no name object is allocated and NULL is passed down
 * (presumably clearing any existing name; confirm against
 * madvise_update_vma()).
 *
 * Returns 0 on success, -EINVAL if the name contains a character rejected by
 * is_valid_name_char(), -ENOMEM if the name object cannot be allocated, the
 * error from strndup_user(), or the result of madvise_set_anon_name().
 */
int set_anon_vma_name(unsigned long addr, unsigned long size,
		      const char __user *uname)
{
	struct anon_vma_name *anon_name = NULL;
	struct mm_struct *mm = current->mm;
	int error;

	if (uname) {
		char *name, *pch;

		name = strndup_user(uname, ANON_VMA_NAME_MAX_LEN);
		if (IS_ERR(name))
			return PTR_ERR(name);

		/* Every character of the copied name must be acceptable. */
		for (pch = name; *pch != '\0'; pch++) {
			if (!is_valid_name_char(*pch)) {
				kfree(name);
				return -EINVAL;
			}
		}
		/* anon_vma has its own copy */
		anon_name = anon_vma_name_alloc(name);
		kfree(name);
		if (!anon_name)
			return -ENOMEM;
	}

	error = madvise_set_anon_name(mm, addr, size, anon_name);
	/* Drop the reference taken by the allocation above. */
	anon_vma_name_put(anon_name);

	return error;
}
#endif
66 1 65 5 66 61 58 4 1 3 2 59 4 4 4 1 4 4 3 4 4 4 4 1 4 4 4 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds */ #include <linux/types.h> #include <linux/kernel.h> #include <linux/termios.h> #include <linux/tty.h> #include <linux/export.h> #include "tty.h" /* * Routine which returns the baud rate of the tty * * Note that the baud_table needs to be kept in sync with the * include/asm/termbits.h file. 
*/ static const speed_t baud_table[] = { 0, 50, 75, 110, 134, 150, 200, 300, 600, 1200, 1800, 2400, 4800, 9600, 19200, 38400, 57600, 115200, 230400, 460800, #ifdef __sparc__ 76800, 153600, 307200, 614400, 921600, 500000, 576000, 1000000, 1152000, 1500000, 2000000 #else 500000, 576000, 921600, 1000000, 1152000, 1500000, 2000000, 2500000, 3000000, 3500000, 4000000 #endif }; static const tcflag_t baud_bits[] = { B0, B50, B75, B110, B134, B150, B200, B300, B600, B1200, B1800, B2400, B4800, B9600, B19200, B38400, B57600, B115200, B230400, B460800, #ifdef __sparc__ B76800, B153600, B307200, B614400, B921600, B500000, B576000, B1000000, B1152000, B1500000, B2000000 #else B500000, B576000, B921600, B1000000, B1152000, B1500000, B2000000, B2500000, B3000000, B3500000, B4000000 #endif }; static int n_baud_table = ARRAY_SIZE(baud_table); /** * tty_termios_baud_rate * @termios: termios structure * * Convert termios baud rate data into a speed. This should be called * with the termios lock held if this termios is a terminal termios * structure. Device drivers can call this function but should use * ->c_[io]speed directly as they are updated. * * Locking: none */ speed_t tty_termios_baud_rate(const struct ktermios *termios) { unsigned int cbaud; cbaud = termios->c_cflag & CBAUD; /* Magic token for arbitrary speed via c_ispeed/c_ospeed */ if (cbaud == BOTHER) return termios->c_ospeed; if (cbaud & CBAUDEX) { cbaud &= ~CBAUDEX; cbaud += 15; } return cbaud >= n_baud_table ? 0 : baud_table[cbaud]; } EXPORT_SYMBOL(tty_termios_baud_rate); /** * tty_termios_input_baud_rate * @termios: termios structure * * Convert termios baud rate data into a speed. This should be called * with the termios lock held if this termios is a terminal termios * structure. Device drivers can call this function but should use * ->c_[io]speed directly as they are updated. 
* * Locking: none */ speed_t tty_termios_input_baud_rate(const struct ktermios *termios) { unsigned int cbaud = (termios->c_cflag >> IBSHIFT) & CBAUD; if (cbaud == B0) return tty_termios_baud_rate(termios); /* Magic token for arbitrary speed via c_ispeed */ if (cbaud == BOTHER) return termios->c_ispeed; if (cbaud & CBAUDEX) { cbaud &= ~CBAUDEX; cbaud += 15; } return cbaud >= n_baud_table ? 0 : baud_table[cbaud]; } EXPORT_SYMBOL(tty_termios_input_baud_rate); /** * tty_termios_encode_baud_rate * @termios: ktermios structure holding user requested state * @ibaud: input speed * @obaud: output speed * * Encode the speeds set into the passed termios structure. This is * used as a library helper for drivers so that they can report back * the actual speed selected when it differs from the speed requested * * For maximal back compatibility with legacy SYS5/POSIX *nix behaviour * we need to carefully set the bits when the user does not get the * desired speed. We allow small margins and preserve as much of possible * of the input intent to keep compatibility. * * Locking: Caller should hold termios lock. This is already held * when calling this function from the driver termios handler. * * The ifdefs deal with platforms whose owners have yet to update them * and will all go away once this is done. */ void tty_termios_encode_baud_rate(struct ktermios *termios, speed_t ibaud, speed_t obaud) { int i = 0; int ifound = -1, ofound = -1; int iclose = ibaud/50, oclose = obaud/50; int ibinput = 0; if (obaud == 0) /* CD dropped */ ibaud = 0; /* Clear ibaud to be sure */ termios->c_ispeed = ibaud; termios->c_ospeed = obaud; if (((termios->c_cflag >> IBSHIFT) & CBAUD) != B0) ibinput = 1; /* An input speed was specified */ /* If the user asked for a precise weird speed give a precise weird * answer. If they asked for a Bfoo speed they may have problems * digesting non-exact replies so fuzz a bit. 
*/ if ((termios->c_cflag & CBAUD) == BOTHER) { oclose = 0; if (!ibinput) iclose = 0; } if (((termios->c_cflag >> IBSHIFT) & CBAUD) == BOTHER) iclose = 0; termios->c_cflag &= ~CBAUD; termios->c_cflag &= ~(CBAUD << IBSHIFT); /* * Our goal is to find a close match to the standard baud rate * returned. Walk the baud rate table and if we get a very close * match then report back the speed as a POSIX Bxxxx value by * preference */ do { if (obaud - oclose <= baud_table[i] && obaud + oclose >= baud_table[i]) { termios->c_cflag |= baud_bits[i]; ofound = i; } if (ibaud - iclose <= baud_table[i] && ibaud + iclose >= baud_table[i]) { /* For the case input == output don't set IBAUD bits * if the user didn't do so. */ if (ofound == i && !ibinput) { ifound = i; } else { ifound = i; termios->c_cflag |= (baud_bits[i] << IBSHIFT); } } } while (++i < n_baud_table); /* If we found no match then use BOTHER. */ if (ofound == -1) termios->c_cflag |= BOTHER; /* Set exact input bits only if the input and output differ or the * user already did. */ if (ifound == -1 && (ibaud != obaud || ibinput)) termios->c_cflag |= (BOTHER << IBSHIFT); } EXPORT_SYMBOL_GPL(tty_termios_encode_baud_rate); /** * tty_encode_baud_rate - set baud rate of the tty * @tty: terminal device * @ibaud: input baud rate * @obaud: output baud rate * * Update the current termios data for the tty with the new speed * settings. The caller must hold the termios_rwsem for the tty in * question. */ void tty_encode_baud_rate(struct tty_struct *tty, speed_t ibaud, speed_t obaud) { tty_termios_encode_baud_rate(&tty->termios, ibaud, obaud); } EXPORT_SYMBOL_GPL(tty_encode_baud_rate);
18 20 20 4 4 4 4 4 2 2 1 8 8 4 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2016 Anders K. Pedersen <akp@cohaesio.com> */ #include <linux/kernel.h> #include <linux/netlink.h> #include <linux/netfilter.h> #include <linux/netfilter/nf_tables.h> #include <net/dst.h> #include <net/ip6_route.h> #include <net/route.h> #include <net/netfilter/nf_tables.h> #include <net/netfilter/nf_tables_core.h> struct nft_rt { enum nft_rt_keys key:8; u8 dreg; }; static u16 get_tcpmss(const struct nft_pktinfo *pkt, const struct dst_entry *skbdst) { u32 minlen = sizeof(struct ipv6hdr), mtu = dst_mtu(skbdst); const struct sk_buff *skb = pkt->skb; struct dst_entry *dst = NULL; struct flowi fl; memset(&fl, 0, sizeof(fl)); switch (nft_pf(pkt)) { case NFPROTO_IPV4: fl.u.ip4.daddr = ip_hdr(skb)->saddr; minlen = sizeof(struct iphdr) + sizeof(struct tcphdr); break; case NFPROTO_IPV6: fl.u.ip6.daddr = ipv6_hdr(skb)->saddr; minlen = sizeof(struct ipv6hdr) + sizeof(struct tcphdr); break; } nf_route(nft_net(pkt), &dst, &fl, false, nft_pf(pkt)); if (dst) { mtu = min(mtu, dst_mtu(dst)); dst_release(dst); } if (mtu <= minlen || mtu > 0xffff) return TCP_MSS_DEFAULT; return mtu - minlen; } void nft_rt_get_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct 
nft_pktinfo *pkt) { const struct nft_rt *priv = nft_expr_priv(expr); const struct sk_buff *skb = pkt->skb; u32 *dest = &regs->data[priv->dreg]; const struct dst_entry *dst; dst = skb_dst(skb); if (!dst) goto err; switch (priv->key) { #ifdef CONFIG_IP_ROUTE_CLASSID case NFT_RT_CLASSID: *dest = dst->tclassid; break; #endif case NFT_RT_NEXTHOP4: if (nft_pf(pkt) != NFPROTO_IPV4) goto err; *dest = (__force u32)rt_nexthop(dst_rtable(dst), ip_hdr(skb)->daddr); break; case NFT_RT_NEXTHOP6: if (nft_pf(pkt) != NFPROTO_IPV6) goto err; memcpy(dest, rt6_nexthop(dst_rt6_info(dst), &ipv6_hdr(skb)->daddr), sizeof(struct in6_addr)); break; case NFT_RT_TCPMSS: nft_reg_store16(dest, get_tcpmss(pkt, dst)); break; #ifdef CONFIG_XFRM case NFT_RT_XFRM: nft_reg_store8(dest, !!dst->xfrm); break; #endif default: WARN_ON(1); goto err; } return; err: regs->verdict.code = NFT_BREAK; } static const struct nla_policy nft_rt_policy[NFTA_RT_MAX + 1] = { [NFTA_RT_DREG] = NLA_POLICY_MAX(NLA_BE32, NFT_REG32_MAX), [NFTA_RT_KEY] = NLA_POLICY_MAX(NLA_BE32, 255), }; static int nft_rt_get_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) { struct nft_rt *priv = nft_expr_priv(expr); unsigned int len; if (tb[NFTA_RT_KEY] == NULL || tb[NFTA_RT_DREG] == NULL) return -EINVAL; priv->key = ntohl(nla_get_be32(tb[NFTA_RT_KEY])); switch (priv->key) { #ifdef CONFIG_IP_ROUTE_CLASSID case NFT_RT_CLASSID: #endif case NFT_RT_NEXTHOP4: len = sizeof(u32); break; case NFT_RT_NEXTHOP6: len = sizeof(struct in6_addr); break; case NFT_RT_TCPMSS: len = sizeof(u16); break; #ifdef CONFIG_XFRM case NFT_RT_XFRM: len = sizeof(u8); break; #endif default: return -EOPNOTSUPP; } return nft_parse_register_store(ctx, tb[NFTA_RT_DREG], &priv->dreg, NULL, NFT_DATA_VALUE, len); } static int nft_rt_get_dump(struct sk_buff *skb, const struct nft_expr *expr, bool reset) { const struct nft_rt *priv = nft_expr_priv(expr); if (nla_put_be32(skb, NFTA_RT_KEY, htonl(priv->key))) goto nla_put_failure; if 
(nft_dump_register(skb, NFTA_RT_DREG, priv->dreg)) goto nla_put_failure; return 0; nla_put_failure: return -1; } static int nft_rt_validate(const struct nft_ctx *ctx, const struct nft_expr *expr) { const struct nft_rt *priv = nft_expr_priv(expr); unsigned int hooks; if (ctx->family != NFPROTO_IPV4 && ctx->family != NFPROTO_IPV6 && ctx->family != NFPROTO_INET) return -EOPNOTSUPP; switch (priv->key) { case NFT_RT_NEXTHOP4: case NFT_RT_NEXTHOP6: case NFT_RT_CLASSID: case NFT_RT_XFRM: return 0; case NFT_RT_TCPMSS: hooks = (1 << NF_INET_FORWARD) | (1 << NF_INET_LOCAL_OUT) | (1 << NF_INET_POST_ROUTING); break; default: return -EINVAL; } return nft_chain_validate_hooks(ctx->chain, hooks); } static const struct nft_expr_ops nft_rt_get_ops = { .type = &nft_rt_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_rt)), .eval = nft_rt_get_eval, .init = nft_rt_get_init, .dump = nft_rt_get_dump, .validate = nft_rt_validate, }; struct nft_expr_type nft_rt_type __read_mostly = { .name = "rt", .ops = &nft_rt_get_ops, .policy = nft_rt_policy, .maxattr = NFTA_RT_MAX, .owner = THIS_MODULE, };
1 1 1 38 36 37 37 1 1 1 1 1 1 1 1 2 2 2 21 22 17 20 22 22 22 22 4 3 3 19 19 13 11 18 17 7 3 3 3 7 7 11 1 1 1 1 10 9 12 8 1 22 4 4 4 4 38 38 38 36 38 6 6 6 6 6 6 6 6 4 4 4 3 6 6 6 3 1 3 3 3 1 1 3 3 3 3 3 2 3 3 9 7 7 7 9 2 2 2 2 2 1 1 1 1 1 1 3 2 3 3 3 3 3 1 3 3 1 3 3 1 3 3 3 3 806 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 
457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 
957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 // SPDX-License-Identifier: GPL-2.0 /* Copyright (C) B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich */ #include "mesh-interface.h" #include "main.h" #include <linux/atomic.h> #include <linux/byteorder/generic.h> #include <linux/cache.h> #include <linux/compiler.h> #include <linux/container_of.h> #include <linux/cpumask.h> #include <linux/errno.h> #include <linux/etherdevice.h> #include <linux/ethtool.h> #include <linux/gfp.h> #include <linux/if_ether.h> #include <linux/if_vlan.h> #include <linux/jiffies.h> #include <linux/kref.h> #include <linux/list.h> #include <linux/lockdep.h> #include <linux/netdevice.h> #include <linux/netlink.h> #include <linux/percpu.h> #include <linux/random.h> #include <linux/rculist.h> #include <linux/rcupdate.h> #include <linux/skbuff.h> #include <linux/slab.h> #include <linux/socket.h> #include <linux/spinlock.h> #include <linux/stddef.h> #include <linux/string.h> #include <linux/types.h> #include <net/netlink.h> #include <net/rtnetlink.h> #include <uapi/linux/batadv_packet.h> #include <uapi/linux/batman_adv.h> #include "bat_algo.h" #include "bridge_loop_avoidance.h" #include 
"distributed-arp-table.h" #include "gateway_client.h" #include "hard-interface.h" #include "multicast.h" #include "send.h" #include "translation-table.h" /** * batadv_skb_head_push() - Increase header size and move (push) head pointer * @skb: packet buffer which should be modified * @len: number of bytes to add * * Return: 0 on success or negative error number in case of failure */ int batadv_skb_head_push(struct sk_buff *skb, unsigned int len) { int result; /* TODO: We must check if we can release all references to non-payload * data using __skb_header_release in our skbs to allow skb_cow_header * to work optimally. This means that those skbs are not allowed to read * or write any data which is before the current position of skb->data * after that call and thus allow other skbs with the same data buffer * to write freely in that area. */ result = skb_cow_head(skb, len); if (result < 0) return result; skb_push(skb, len); return 0; } /** * batadv_sum_counter() - Sum the cpu-local counters for index 'idx' * @bat_priv: the bat priv with all the mesh interface information * @idx: index of counter to sum up * * Return: sum of all cpu-local counters */ static u64 batadv_sum_counter(struct batadv_priv *bat_priv, size_t idx) { u64 *counters, sum = 0; int cpu; for_each_possible_cpu(cpu) { counters = per_cpu_ptr(bat_priv->bat_counters, cpu); sum += counters[idx]; } return sum; } static struct net_device_stats *batadv_interface_stats(struct net_device *dev) { struct batadv_priv *bat_priv = netdev_priv(dev); struct net_device_stats *stats = &dev->stats; stats->tx_packets = batadv_sum_counter(bat_priv, BATADV_CNT_TX); stats->tx_bytes = batadv_sum_counter(bat_priv, BATADV_CNT_TX_BYTES); stats->tx_dropped = batadv_sum_counter(bat_priv, BATADV_CNT_TX_DROPPED); stats->rx_packets = batadv_sum_counter(bat_priv, BATADV_CNT_RX); stats->rx_bytes = batadv_sum_counter(bat_priv, BATADV_CNT_RX_BYTES); return stats; } static int batadv_interface_set_mac_addr(struct net_device *dev, void *p) 
{ struct batadv_priv *bat_priv = netdev_priv(dev); struct batadv_meshif_vlan *vlan; struct sockaddr *addr = p; u8 old_addr[ETH_ALEN]; if (!is_valid_ether_addr(addr->sa_data)) return -EADDRNOTAVAIL; ether_addr_copy(old_addr, dev->dev_addr); eth_hw_addr_set(dev, addr->sa_data); /* only modify transtable if it has been initialized before */ if (atomic_read(&bat_priv->mesh_state) != BATADV_MESH_ACTIVE) return 0; rcu_read_lock(); hlist_for_each_entry_rcu(vlan, &bat_priv->meshif_vlan_list, list) { batadv_tt_local_remove(bat_priv, old_addr, vlan->vid, "mac address changed", false); batadv_tt_local_add(dev, addr->sa_data, vlan->vid, BATADV_NULL_IFINDEX, BATADV_NO_MARK); } rcu_read_unlock(); return 0; } static int batadv_interface_change_mtu(struct net_device *dev, int new_mtu) { struct batadv_priv *bat_priv = netdev_priv(dev); /* check ranges */ if (new_mtu < ETH_MIN_MTU || new_mtu > batadv_hardif_min_mtu(dev)) return -EINVAL; WRITE_ONCE(dev->mtu, new_mtu); bat_priv->mtu_set_by_user = new_mtu; return 0; } /** * batadv_interface_set_rx_mode() - set the rx mode of a device * @dev: registered network device to modify * * We do not actually need to set any rx filters for the virtual batman * mesh interface. However a dummy handler enables a user to set static * multicast listeners for instance. 
*/ static void batadv_interface_set_rx_mode(struct net_device *dev) { } static netdev_tx_t batadv_interface_tx(struct sk_buff *skb, struct net_device *mesh_iface) { struct ethhdr *ethhdr; struct batadv_priv *bat_priv = netdev_priv(mesh_iface); struct batadv_hard_iface *primary_if = NULL; struct batadv_bcast_packet *bcast_packet; static const u8 stp_addr[ETH_ALEN] = {0x01, 0x80, 0xC2, 0x00, 0x00, 0x00}; static const u8 ectp_addr[ETH_ALEN] = {0xCF, 0x00, 0x00, 0x00, 0x00, 0x00}; enum batadv_dhcp_recipient dhcp_rcp = BATADV_DHCP_NO; u8 *dst_hint = NULL, chaddr[ETH_ALEN]; struct vlan_ethhdr *vhdr; unsigned int header_len = 0; int data_len = skb->len, ret; unsigned long brd_delay = 0; bool do_bcast = false, client_added; unsigned short vid; u32 seqno; int gw_mode; enum batadv_forw_mode forw_mode = BATADV_FORW_BCAST; int mcast_is_routable = 0; int network_offset = ETH_HLEN; __be16 proto; if (atomic_read(&bat_priv->mesh_state) != BATADV_MESH_ACTIVE) goto dropped; /* reset control block to avoid left overs from previous users */ memset(skb->cb, 0, sizeof(struct batadv_skb_cb)); netif_trans_update(mesh_iface); vid = batadv_get_vid(skb, 0); skb_reset_mac_header(skb); ethhdr = eth_hdr(skb); proto = ethhdr->h_proto; switch (ntohs(proto)) { case ETH_P_8021Q: if (!pskb_may_pull(skb, sizeof(*vhdr))) goto dropped; vhdr = vlan_eth_hdr(skb); proto = vhdr->h_vlan_encapsulated_proto; /* drop batman-in-batman packets to prevent loops */ if (proto != htons(ETH_P_BATMAN)) { network_offset += VLAN_HLEN; break; } fallthrough; case ETH_P_BATMAN: goto dropped; } skb_set_network_header(skb, network_offset); if (batadv_bla_tx(bat_priv, skb, vid)) goto dropped; /* skb->data might have been reallocated by batadv_bla_tx() */ ethhdr = eth_hdr(skb); /* Register the client MAC in the transtable */ if (!is_multicast_ether_addr(ethhdr->h_source) && !batadv_bla_is_loopdetect_mac(ethhdr->h_source)) { client_added = batadv_tt_local_add(mesh_iface, ethhdr->h_source, vid, skb->skb_iif, skb->mark); if 
(!client_added) goto dropped; } /* Snoop address candidates from DHCPACKs for early DAT filling */ batadv_dat_snoop_outgoing_dhcp_ack(bat_priv, skb, proto, vid); /* don't accept stp packets. STP does not help in meshes. * better use the bridge loop avoidance ... * * The same goes for ECTP sent at least by some Cisco Switches, * it might confuse the mesh when used with bridge loop avoidance. */ if (batadv_compare_eth(ethhdr->h_dest, stp_addr)) goto dropped; if (batadv_compare_eth(ethhdr->h_dest, ectp_addr)) goto dropped; gw_mode = atomic_read(&bat_priv->gw.mode); if (is_multicast_ether_addr(ethhdr->h_dest)) { /* if gw mode is off, broadcast every packet */ if (gw_mode == BATADV_GW_MODE_OFF) { do_bcast = true; goto send; } dhcp_rcp = batadv_gw_dhcp_recipient_get(skb, &header_len, chaddr); /* skb->data may have been modified by * batadv_gw_dhcp_recipient_get() */ ethhdr = eth_hdr(skb); /* if gw_mode is on, broadcast any non-DHCP message. * All the DHCP packets are going to be sent as unicast */ if (dhcp_rcp == BATADV_DHCP_NO) { do_bcast = true; goto send; } if (dhcp_rcp == BATADV_DHCP_TO_CLIENT) dst_hint = chaddr; else if ((gw_mode == BATADV_GW_MODE_SERVER) && (dhcp_rcp == BATADV_DHCP_TO_SERVER)) /* gateways should not forward any DHCP message if * directed to a DHCP server */ goto dropped; send: if (do_bcast && !is_broadcast_ether_addr(ethhdr->h_dest)) { forw_mode = batadv_mcast_forw_mode(bat_priv, skb, vid, &mcast_is_routable); switch (forw_mode) { case BATADV_FORW_BCAST: break; case BATADV_FORW_UCASTS: case BATADV_FORW_MCAST: do_bcast = false; break; case BATADV_FORW_NONE: fallthrough; default: goto dropped; } } } batadv_skb_set_priority(skb, 0); /* ethernet packet should be broadcasted */ if (do_bcast) { primary_if = batadv_primary_if_get_selected(bat_priv); if (!primary_if) goto dropped; /* in case of ARP request, we do not immediately broadcasti the * packet, instead we first wait for DAT to try to retrieve the * correct ARP entry */ if 
(batadv_dat_snoop_outgoing_arp_request(bat_priv, skb)) brd_delay = msecs_to_jiffies(ARP_REQ_DELAY); if (batadv_skb_head_push(skb, sizeof(*bcast_packet)) < 0) goto dropped; bcast_packet = (struct batadv_bcast_packet *)skb->data; bcast_packet->version = BATADV_COMPAT_VERSION; bcast_packet->ttl = BATADV_TTL - 1; /* batman packet type: broadcast */ bcast_packet->packet_type = BATADV_BCAST; bcast_packet->reserved = 0; /* hw address of first interface is the orig mac because only * this mac is known throughout the mesh */ ether_addr_copy(bcast_packet->orig, primary_if->net_dev->dev_addr); /* set broadcast sequence number */ seqno = atomic_inc_return(&bat_priv->bcast_seqno); bcast_packet->seqno = htonl(seqno); batadv_send_bcast_packet(bat_priv, skb, brd_delay, true); /* unicast packet */ } else { /* DHCP packets going to a server will use the GW feature */ if (dhcp_rcp == BATADV_DHCP_TO_SERVER) { ret = batadv_gw_out_of_range(bat_priv, skb); if (ret) goto dropped; ret = batadv_send_skb_via_gw(bat_priv, skb, vid); } else if (forw_mode == BATADV_FORW_UCASTS) { ret = batadv_mcast_forw_send(bat_priv, skb, vid, mcast_is_routable); } else if (forw_mode == BATADV_FORW_MCAST) { ret = batadv_mcast_forw_mcsend(bat_priv, skb); } else { if (batadv_dat_snoop_outgoing_arp_request(bat_priv, skb)) goto dropped; batadv_dat_snoop_outgoing_arp_reply(bat_priv, skb); ret = batadv_send_skb_via_tt(bat_priv, skb, dst_hint, vid); } if (ret != NET_XMIT_SUCCESS) goto dropped_freed; } batadv_inc_counter(bat_priv, BATADV_CNT_TX); batadv_add_counter(bat_priv, BATADV_CNT_TX_BYTES, data_len); goto end; dropped: kfree_skb(skb); dropped_freed: batadv_inc_counter(bat_priv, BATADV_CNT_TX_DROPPED); end: batadv_hardif_put(primary_if); return NETDEV_TX_OK; } /** * batadv_interface_rx() - receive ethernet frame on local batman-adv interface * @mesh_iface: local interface which will receive the ethernet frame * @skb: ethernet frame for @mesh_iface * @hdr_size: size of already parsed batman-adv header * 
@orig_node: originator from which the batman-adv packet was sent
 *
 * Sends an ethernet frame to the receive path of the local @mesh_iface.
 * skb->data has still point to the batman-adv header with the size @hdr_size.
 * The caller has to have parsed this header already and made sure that at least
 * @hdr_size bytes are still available for pull in @skb.
 *
 * The packet may still get dropped. This can happen when the encapsulated
 * ethernet frame is invalid or contains again an batman-adv packet. Also
 * unicast packets will be dropped directly when it was sent between two
 * isolated clients.
 */
void batadv_interface_rx(struct net_device *mesh_iface,
			 struct sk_buff *skb, int hdr_size,
			 struct batadv_orig_node *orig_node)
{
	struct batadv_bcast_packet *batadv_bcast_packet;
	struct batadv_priv *bat_priv = netdev_priv(mesh_iface);
	struct vlan_ethhdr *vhdr;
	struct ethhdr *ethhdr;
	unsigned short vid;
	int packet_type;

	/* skb->data still points to the batman-adv header here; remember its
	 * packet type before the header is pulled, because it is handed to
	 * batadv_bla_rx() further down
	 */
	batadv_bcast_packet = (struct batadv_bcast_packet *)skb->data;
	packet_type = batadv_bcast_packet->packet_type;

	/* strip the batman-adv header; the encapsulated ethernet frame now
	 * starts at skb->data
	 */
	skb_pull_rcsum(skb, hdr_size);
	skb_reset_mac_header(skb);

	/* clean the netfilter state now that the batman-adv header has been
	 * removed
	 */
	nf_reset_ct(skb);

	if (unlikely(!pskb_may_pull(skb, ETH_HLEN)))
		goto dropped;

	vid = batadv_get_vid(skb, 0);
	ethhdr = eth_hdr(skb);

	switch (ntohs(ethhdr->h_proto)) {
	case ETH_P_8021Q:
		if (!pskb_may_pull(skb, VLAN_ETH_HLEN))
			goto dropped;

		vhdr = skb_vlan_eth_hdr(skb);

		/* drop batman-in-batman packets to prevent loops */
		if (vhdr->h_vlan_encapsulated_proto != htons(ETH_P_BATMAN))
			break;

		fallthrough;
	case ETH_P_BATMAN:
		goto dropped;
	}

	/* skb->dev & skb->pkt_type are set here */
	skb->protocol = eth_type_trans(skb, mesh_iface);
	skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);

	/* the byte counter includes the ethernet header, hence ETH_HLEN is
	 * added on top of skb->len
	 */
	batadv_inc_counter(bat_priv, BATADV_CNT_RX);
	batadv_add_counter(bat_priv, BATADV_CNT_RX_BYTES,
			   skb->len + ETH_HLEN);

	/* Let the bridge loop avoidance check the packet. If it will
	 * not handle it, we can safely push it up.
	 */
	if (batadv_bla_rx(bat_priv, skb, vid, packet_type))
		goto out;

	if (orig_node)
		batadv_tt_add_temporary_global_entry(bat_priv, orig_node,
						     ethhdr->h_source, vid);

	if (is_multicast_ether_addr(ethhdr->h_dest)) {
		/* set the mark on broadcast packets if AP isolation is ON and
		 * the packet is coming from an "isolated" client
		 */
		if (batadv_vlan_ap_isola_get(bat_priv, vid) &&
		    batadv_tt_global_is_isolated(bat_priv, ethhdr->h_source,
						 vid)) {
			/* save bits in skb->mark not covered by the mask and
			 * apply the mark on the rest
			 */
			skb->mark &= ~bat_priv->isolation_mark_mask;
			skb->mark |= bat_priv->isolation_mark;
		}
	} else if (batadv_is_ap_isolated(bat_priv, ethhdr->h_source,
					 ethhdr->h_dest, vid)) {
		/* unicast between two isolated clients is not delivered */
		goto dropped;
	}

	/* hand the decapsulated frame to the network stack */
	netif_rx(skb);
	goto out;

dropped:
	kfree_skb(skb);
out:
	return;
}

/**
 * batadv_meshif_vlan_release() - release vlan from lists and queue for free
 *  after rcu grace period
 * @ref: kref pointer of the vlan object
 *
 * The object is unlinked from the vlan list under meshif_vlan_list_lock and
 * freed via kfree_rcu(), so readers walking the list under RCU can still
 * access it until they are done.
 */
void batadv_meshif_vlan_release(struct kref *ref)
{
	struct batadv_meshif_vlan *vlan;

	vlan = container_of(ref, struct batadv_meshif_vlan, refcount);

	spin_lock_bh(&vlan->bat_priv->meshif_vlan_list_lock);
	hlist_del_rcu(&vlan->list);
	spin_unlock_bh(&vlan->bat_priv->meshif_vlan_list_lock);

	kfree_rcu(vlan, rcu);
}

/**
 * batadv_meshif_vlan_get() - get the vlan object for a specific vid
 * @bat_priv: the bat priv with all the mesh interface information
 * @vid: the identifier of the vlan object to retrieve
 *
 * Return: the private data of the vlan matching the vid passed as argument or
 * NULL otherwise. The refcounter of the returned object is incremented by 1.
*/
struct batadv_meshif_vlan *batadv_meshif_vlan_get(struct batadv_priv *bat_priv,
						  unsigned short vid)
{
	struct batadv_meshif_vlan *vlan_tmp, *vlan = NULL;

	rcu_read_lock();
	hlist_for_each_entry_rcu(vlan_tmp, &bat_priv->meshif_vlan_list, list) {
		if (vlan_tmp->vid != vid)
			continue;

		/* an entry whose refcount already reached zero cannot be
		 * acquired anymore; skip it and keep searching
		 */
		if (!kref_get_unless_zero(&vlan_tmp->refcount))
			continue;

		vlan = vlan_tmp;
		break;
	}
	rcu_read_unlock();

	return vlan;
}

/**
 * batadv_meshif_create_vlan() - allocate the needed resources for a new vlan
 * @bat_priv: the bat priv with all the mesh interface information
 * @vid: the VLAN identifier
 *
 * The lookup for an existing entry and the insertion of the new one happen
 * under meshif_vlan_list_lock.
 *
 * Return: 0 on success, a negative error otherwise (-EEXIST when a vlan object
 * with this @vid is already present, -ENOMEM when the allocation fails).
 */
int batadv_meshif_create_vlan(struct batadv_priv *bat_priv, unsigned short vid)
{
	struct batadv_meshif_vlan *vlan;

	spin_lock_bh(&bat_priv->meshif_vlan_list_lock);

	vlan = batadv_meshif_vlan_get(bat_priv, vid);
	if (vlan) {
		/* drop the reference taken by the lookup above */
		batadv_meshif_vlan_put(vlan);
		spin_unlock_bh(&bat_priv->meshif_vlan_list_lock);
		return -EEXIST;
	}

	/* GFP_ATOMIC because the spinlock is held */
	vlan = kzalloc_obj(*vlan, GFP_ATOMIC);
	if (!vlan) {
		spin_unlock_bh(&bat_priv->meshif_vlan_list_lock);
		return -ENOMEM;
	}

	vlan->bat_priv = bat_priv;
	vlan->vid = vid;
	kref_init(&vlan->refcount);

	atomic_set(&vlan->ap_isolation, 0);

	/* extra reference for the remainder of this function; it is dropped
	 * again by the batadv_meshif_vlan_put() at the end
	 */
	kref_get(&vlan->refcount);
	hlist_add_head_rcu(&vlan->list, &bat_priv->meshif_vlan_list);
	spin_unlock_bh(&bat_priv->meshif_vlan_list_lock);

	/* add a new TT local entry. This one will be marked with the NOPURGE
	 * flag
	 */
	batadv_tt_local_add(bat_priv->mesh_iface,
			    bat_priv->mesh_iface->dev_addr, vid,
			    BATADV_NULL_IFINDEX, BATADV_NO_MARK);

	/* don't return reference to new meshif_vlan */
	batadv_meshif_vlan_put(vlan);

	return 0;
}

/**
 * batadv_meshif_destroy_vlan() - remove and destroy a meshif_vlan object
 * @bat_priv: the bat priv with all the mesh interface information
 * @vlan: the object to remove
 *
 * Removes the TT local entry belonging to the vlan and drops one reference on
 * @vlan. Callers in this file that obtained @vlan through
 * batadv_meshif_vlan_get() drop that lookup reference separately afterwards.
 */
static void batadv_meshif_destroy_vlan(struct batadv_priv *bat_priv,
				       struct batadv_meshif_vlan *vlan)
{
	/* explicitly remove the associated TT local entry because it is marked
	 * with the NOPURGE flag
	 */
	batadv_tt_local_remove(bat_priv, bat_priv->mesh_iface->dev_addr,
			       vlan->vid, "vlan interface destroyed", false);

	batadv_meshif_vlan_put(vlan);
}

/**
 * batadv_interface_add_vid() - ndo_add_vid API implementation
 * @dev: the netdev of the mesh interface
 * @proto: protocol of the vlan id
 * @vid: identifier of the new vlan
 *
 * Set up all the internal structures for handling the new vlan on top of the
 * mesh interface
 *
 * Return: 0 on success or a negative error code in case of failure.
 */
static int batadv_interface_add_vid(struct net_device *dev, __be16 proto,
				    unsigned short vid)
{
	struct batadv_priv *bat_priv = netdev_priv(dev);
	struct batadv_meshif_vlan *vlan;

	/* only 802.1Q vlans are supported.
	 * batman-adv does not know how to handle other types
	 */
	if (proto != htons(ETH_P_8021Q))
		return -EINVAL;

	/* VID 0 is only used to indicate "priority tag" frames which only
	 * contain priority information and no VID. No management structures
	 * should be created for this VID and it should be handled like an
	 * untagged frame.
	 */
	if (vid == 0)
		return 0;

	/* internally, tagged vlans carry the BATADV_VLAN_HAS_TAG flag in
	 * their vid
	 */
	vid |= BATADV_VLAN_HAS_TAG;

	/* if a new vlan is getting created and it already exists, it means that
	 * it was not deleted yet. batadv_meshif_vlan_get() increases the
	 * refcount in order to revive the object.
	 *
	 * if it does not exist then create it.
	 */
	vlan = batadv_meshif_vlan_get(bat_priv, vid);
	if (!vlan)
		return batadv_meshif_create_vlan(bat_priv, vid);

	/* add a new TT local entry. This one will be marked with the NOPURGE
	 * flag. This must be added again, even if the vlan object already
	 * exists, because the entry was deleted by kill_vid()
	 */
	batadv_tt_local_add(bat_priv->mesh_iface,
			    bat_priv->mesh_iface->dev_addr, vid,
			    BATADV_NULL_IFINDEX, BATADV_NO_MARK);

	return 0;
}

/**
 * batadv_interface_kill_vid() - ndo_kill_vid API implementation
 * @dev: the netdev of the mesh interface
 * @proto: protocol of the vlan id
 * @vid: identifier of the deleted vlan
 *
 * Destroy all the internal structures used to handle the vlan identified by vid
 * on top of the mesh interface
 *
 * Return: 0 on success, -EINVAL if the specified prototype is not ETH_P_8021Q
 * or -ENOENT if the specified vlan id wasn't registered.
 */
static int batadv_interface_kill_vid(struct net_device *dev, __be16 proto,
				     unsigned short vid)
{
	struct batadv_priv *bat_priv = netdev_priv(dev);
	struct batadv_meshif_vlan *vlan;

	/* only 802.1Q vlans are supported. batman-adv does not know how to
	 * handle other types
	 */
	if (proto != htons(ETH_P_8021Q))
		return -EINVAL;

	/* "priority tag" frames are handled like "untagged" frames
	 * and no meshif_vlan needs to be destroyed
	 */
	if (vid == 0)
		return 0;

	vlan = batadv_meshif_vlan_get(bat_priv, vid | BATADV_VLAN_HAS_TAG);
	if (!vlan)
		return -ENOENT;

	/* drops one reference and removes the TT local entry */
	batadv_meshif_destroy_vlan(bat_priv, vlan);

	/* finally free the vlan object */
	batadv_meshif_vlan_put(vlan);

	return 0;
}

/* batman-adv network devices have devices nesting below it and are a special
 * "super class" of normal network devices; split their locks off into a
 * separate class since they always nest.
*/
static struct lock_class_key batadv_netdev_xmit_lock_key;
static struct lock_class_key batadv_netdev_addr_lock_key;

/**
 * batadv_set_lockdep_class_one() - Set lockdep class for a single tx queue
 * @dev: device which owns the tx queue
 * @txq: tx queue to modify
 * @_unused: always NULL
 *
 * Callback passed to netdev_for_each_tx_queue() by batadv_set_lockdep_class().
 */
static void batadv_set_lockdep_class_one(struct net_device *dev,
					 struct netdev_queue *txq,
					 void *_unused)
{
	lockdep_set_class(&txq->_xmit_lock, &batadv_netdev_xmit_lock_key);
}

/**
 * batadv_set_lockdep_class() - Set txq and addr_list lockdep class
 * @dev: network device to modify
 */
static void batadv_set_lockdep_class(struct net_device *dev)
{
	lockdep_set_class(&dev->addr_list_lock, &batadv_netdev_addr_lock_key);
	netdev_for_each_tx_queue(dev, batadv_set_lockdep_class_one, NULL);
}

/**
 * batadv_meshif_init_late() - late stage initialization of mesh interface
 * @dev: registered network device to modify
 *
 * Allocates the per-cpu counters, sets the default values of the mesh
 * settings, selects a routing algorithm if none was chosen yet and calls
 * batadv_mesh_init().
 *
 * Return: 0 on success, -ENOMEM if the per-cpu counters cannot be allocated or
 * the negative error code returned by batadv_algo_select() or
 * batadv_mesh_init(). The counters are freed again on these failures.
 */
static int batadv_meshif_init_late(struct net_device *dev)
{
	struct batadv_priv *bat_priv;
	u32 random_seqno;
	int ret;
	/* one u64 slot per counter index */
	size_t cnt_len = sizeof(u64) * BATADV_CNT_NUM;

	batadv_set_lockdep_class(dev);

	bat_priv = netdev_priv(dev);
	bat_priv->mesh_iface = dev;

	/* batadv_interface_stats() needs to be available as soon as
	 * register_netdevice() has been called
	 */
	bat_priv->bat_counters = __alloc_percpu(cnt_len, __alignof__(u64));
	if (!bat_priv->bat_counters)
		return -ENOMEM;

	/* default values of the mesh settings */
	atomic_set(&bat_priv->aggregated_ogms, 1);
	atomic_set(&bat_priv->bonding, 0);
#ifdef CONFIG_BATMAN_ADV_BLA
	atomic_set(&bat_priv->bridge_loop_avoidance, 1);
#endif
#ifdef CONFIG_BATMAN_ADV_DAT
	atomic_set(&bat_priv->distributed_arp_table, 1);
#endif
#ifdef CONFIG_BATMAN_ADV_MCAST
	atomic_set(&bat_priv->multicast_mode, 1);
	atomic_set(&bat_priv->multicast_fanout, 16);
	atomic_set(&bat_priv->mcast.num_want_all_unsnoopables, 0);
	atomic_set(&bat_priv->mcast.num_want_all_ipv4, 0);
	atomic_set(&bat_priv->mcast.num_want_all_ipv6, 0);
	atomic_set(&bat_priv->mcast.num_no_mc_ptype_capa, 0);
#endif
	atomic_set(&bat_priv->gw.mode, BATADV_GW_MODE_OFF);
	atomic_set(&bat_priv->gw.bandwidth_down, 100);
	atomic_set(&bat_priv->gw.bandwidth_up, 20);
	atomic_set(&bat_priv->orig_interval, 1000);
	atomic_set(&bat_priv->hop_penalty, 30);
#ifdef CONFIG_BATMAN_ADV_DEBUG
	atomic_set(&bat_priv->log_level, 0);
#endif
	atomic_set(&bat_priv->fragmentation, 1);
	atomic_set(&bat_priv->packet_size_max, BATADV_MAX_MTU);
	atomic_set(&bat_priv->bcast_queue_left, BATADV_BCAST_QUEUE_LEN);
	atomic_set(&bat_priv->batman_queue_left, BATADV_BATMAN_QUEUE_LEN);

	atomic_set(&bat_priv->mesh_state, BATADV_MESH_INACTIVE);
	atomic_set(&bat_priv->bcast_seqno, 1);
	atomic_set(&bat_priv->tt.vn, 0);
	atomic_set(&bat_priv->tt.ogm_append_cnt, 0);
#ifdef CONFIG_BATMAN_ADV_BLA
	atomic_set(&bat_priv->bla.num_requests, 0);
	spin_lock_init(&bat_priv->bla.num_requests_lock);
#endif
	atomic_set(&bat_priv->tp_num, 0);

	WRITE_ONCE(bat_priv->tt.local_changes, 0);
	bat_priv->tt.last_changeset = NULL;
	bat_priv->tt.last_changeset_len = 0;
	bat_priv->isolation_mark = 0;
	bat_priv->isolation_mark_mask = 0;

	/* randomize initial seqno to avoid collision */
	get_random_bytes(&random_seqno, sizeof(random_seqno));
	atomic_set(&bat_priv->frag_seqno, random_seqno);

	bat_priv->primary_if = NULL;

	/* algo_ops may already be set, e.g. by batadv_meshif_newlink() when
	 * IFLA_BATADV_ALGO_NAME was given; otherwise fall back to
	 * batadv_routing_algo
	 */
	if (!bat_priv->algo_ops) {
		ret = batadv_algo_select(bat_priv, batadv_routing_algo);
		if (ret < 0)
			goto free_bat_counters;
	}

	ret = batadv_mesh_init(dev);
	if (ret < 0)
		goto free_bat_counters;

	return 0;

free_bat_counters:
	free_percpu(bat_priv->bat_counters);
	/* clear the pointer so it does not dangle after the free */
	bat_priv->bat_counters = NULL;

	return ret;
}

/**
 * batadv_meshif_slave_add() - Add a slave interface to a batadv_mesh_interface
 * @dev: batadv_mesh_interface used as master interface
 * @slave_dev: net_device which should become the slave interface
 * @extack: extended ACK report struct
 *
 * Return: 0 if successful or error otherwise.
*/
static int batadv_meshif_slave_add(struct net_device *dev,
				   struct net_device *slave_dev,
				   struct netlink_ext_ack *extack)
{
	struct batadv_hard_iface *hard_iface;
	int ret = -EINVAL;

	/* -EINVAL when @slave_dev is unknown or already has a mesh_iface set */
	hard_iface = batadv_hardif_get_by_netdev(slave_dev);
	if (!hard_iface || hard_iface->mesh_iface)
		goto out;

	ret = batadv_hardif_enable_interface(hard_iface, dev);

out:
	/* NOTE(review): also reached with hard_iface == NULL; presumably
	 * batadv_hardif_put() tolerates NULL - confirm
	 */
	batadv_hardif_put(hard_iface);
	return ret;
}

/**
 * batadv_meshif_slave_del() - Delete a slave iface from a batadv_mesh_interface
 * @dev: batadv_mesh_interface used as master interface
 * @slave_dev: net_device which should be removed from the master interface
 *
 * Return: 0 if successful or error otherwise (-EINVAL when @slave_dev is
 * unknown or its mesh_iface is not @dev).
 */
static int batadv_meshif_slave_del(struct net_device *dev,
				   struct net_device *slave_dev)
{
	struct batadv_hard_iface *hard_iface;
	int ret = -EINVAL;

	hard_iface = batadv_hardif_get_by_netdev(slave_dev);

	if (!hard_iface || hard_iface->mesh_iface != dev)
		goto out;

	batadv_hardif_disable_interface(hard_iface);
	ret = 0;

out:
	batadv_hardif_put(hard_iface);
	return ret;
}

/* net_device callbacks of the mesh interface */
static const struct net_device_ops batadv_netdev_ops = {
	.ndo_init = batadv_meshif_init_late,
	.ndo_get_stats = batadv_interface_stats,
	.ndo_vlan_rx_add_vid = batadv_interface_add_vid,
	.ndo_vlan_rx_kill_vid = batadv_interface_kill_vid,
	.ndo_set_mac_address = batadv_interface_set_mac_addr,
	.ndo_change_mtu = batadv_interface_change_mtu,
	.ndo_set_rx_mode = batadv_interface_set_rx_mode,
	.ndo_start_xmit = batadv_interface_tx,
	.ndo_validate_addr = eth_validate_addr,
	.ndo_add_slave = batadv_meshif_slave_add,
	.ndo_del_slave = batadv_meshif_slave_del,
};

/* ethtool get_drvinfo callback: fill in fixed driver identification strings */
static void batadv_get_drvinfo(struct net_device *dev,
			       struct ethtool_drvinfo *info)
{
	strscpy(info->driver, "B.A.T.M.A.N. advanced", sizeof(info->driver));
	strscpy(info->version, BATADV_SOURCE_VERSION, sizeof(info->version));
	strscpy(info->fw_version, "N/A", sizeof(info->fw_version));
	strscpy(info->bus_info, "batman", sizeof(info->bus_info));
}

/* Inspired by drivers/net/ethernet/dlink/sundance.c:1702
 * Declare each description string in struct.name[] to get fixed sized buffer
 * and compile time checking for strings longer than ETH_GSTRING_LEN.
 *
 * NOTE(review): batadv_get_ethtool_stats() reports one value per counter
 * index, so the order here presumably has to match the counter indices -
 * confirm against the counter enum.
 */
static const struct {
	const char name[ETH_GSTRING_LEN];
} batadv_counters_strings[] = {
	{ "tx" },
	{ "tx_bytes" },
	{ "tx_dropped" },
	{ "rx" },
	{ "rx_bytes" },
	{ "forward" },
	{ "forward_bytes" },
	{ "mgmt_tx" },
	{ "mgmt_tx_bytes" },
	{ "mgmt_rx" },
	{ "mgmt_rx_bytes" },
	{ "frag_tx" },
	{ "frag_tx_bytes" },
	{ "frag_rx" },
	{ "frag_rx_bytes" },
	{ "frag_fwd" },
	{ "frag_fwd_bytes" },
	{ "tt_request_tx" },
	{ "tt_request_rx" },
	{ "tt_response_tx" },
	{ "tt_response_rx" },
	{ "tt_roam_adv_tx" },
	{ "tt_roam_adv_rx" },
#ifdef CONFIG_BATMAN_ADV_MCAST
	{ "mcast_tx" },
	{ "mcast_tx_bytes" },
	{ "mcast_tx_local" },
	{ "mcast_tx_local_bytes" },
	{ "mcast_rx" },
	{ "mcast_rx_bytes" },
	{ "mcast_rx_local" },
	{ "mcast_rx_local_bytes" },
	{ "mcast_fwd" },
	{ "mcast_fwd_bytes" },
#endif
#ifdef CONFIG_BATMAN_ADV_DAT
	{ "dat_get_tx" },
	{ "dat_get_rx" },
	{ "dat_put_tx" },
	{ "dat_put_rx" },
	{ "dat_cached_reply_tx" },
#endif
};

/* ethtool get_strings callback: copy the counter names for ETH_SS_STATS and
 * leave @data untouched for any other string set
 */
static void batadv_get_strings(struct net_device *dev, u32 stringset, u8 *data)
{
	if (stringset == ETH_SS_STATS)
		memcpy(data, batadv_counters_strings,
		       sizeof(batadv_counters_strings));
}

/* ethtool get_ethtool_stats callback: store the result of
 * batadv_sum_counter() for every counter index in @data
 */
static void batadv_get_ethtool_stats(struct net_device *dev,
				     struct ethtool_stats *stats, u64 *data)
{
	struct batadv_priv *bat_priv = netdev_priv(dev);
	int i;

	for (i = 0; i < BATADV_CNT_NUM; i++)
		data[i] = batadv_sum_counter(bat_priv, i);
}

/* ethtool get_sset_count callback: BATADV_CNT_NUM entries for ETH_SS_STATS,
 * -EOPNOTSUPP for every other string set
 */
static int batadv_get_sset_count(struct net_device *dev, int stringset)
{
	if (stringset == ETH_SS_STATS)
		return BATADV_CNT_NUM;

	return -EOPNOTSUPP;
}

/* ethtool callbacks of the mesh interface */
static const struct ethtool_ops batadv_ethtool_ops = {
	.get_drvinfo = batadv_get_drvinfo,
	.get_link = ethtool_op_get_link,
	.get_strings = batadv_get_strings,
	.get_ethtool_stats = batadv_get_ethtool_stats,
	.get_sset_count = batadv_get_sset_count,
};

/**
 * batadv_meshif_free() - Deconstructor of batadv_mesh_interface
 * @dev: Device to cleanup and remove
 *
 * Installed as dev->priv_destructor by batadv_meshif_init_early().
 */
static void batadv_meshif_free(struct net_device *dev)
{
	batadv_mesh_free(dev);

	/* some scheduled RCU callbacks need the bat_priv struct to accomplish
	 * their tasks. Wait for them all to be finished before freeing the
	 * netdev and its private data (bat_priv)
	 */
	rcu_barrier();
}

/**
 * batadv_meshif_init_early() - early stage initialization of mesh interface
 * @dev: registered network device to modify
 *
 * Used as the rtnl_link_ops setup callback of the "batadv" link kind.
 */
static void batadv_meshif_init_early(struct net_device *dev)
{
	ether_setup(dev);

	dev->netdev_ops = &batadv_netdev_ops;
	dev->needs_free_netdev = true;
	dev->priv_destructor = batadv_meshif_free;
	dev->features |= NETIF_F_HW_VLAN_CTAG_FILTER;
	dev->priv_flags |= IFF_NO_QUEUE;
	dev->lltx = true;
	dev->netns_immutable = true;

	/* can't call min_mtu, because the needed variables
	 * have not been initialized yet
	 */
	dev->mtu = ETH_DATA_LEN;
	dev->max_mtu = BATADV_MAX_MTU;

	/* generate random address */
	eth_hw_addr_random(dev);

	dev->ethtool_ops = &batadv_ethtool_ops;
}

/**
 * batadv_meshif_validate() - validate configuration of new batadv link
 * @tb: IFLA_INFO_DATA netlink attributes
 * @data: enum batadv_ifla_attrs attributes
 * @extack: extended ACK report struct
 *
 * Return: 0 if successful or error otherwise.
*/
static int batadv_meshif_validate(struct nlattr *tb[], struct nlattr *data[],
				  struct netlink_ext_ack *extack)
{
	struct batadv_algo_ops *algo_ops;

	/* nothing to validate without link specific attributes */
	if (!data)
		return 0;

	/* a requested algorithm name must be known to batadv_algo_get() */
	if (data[IFLA_BATADV_ALGO_NAME]) {
		algo_ops = batadv_algo_get(nla_data(data[IFLA_BATADV_ALGO_NAME]));
		if (!algo_ops)
			return -EINVAL;
	}

	return 0;
}

/**
 * batadv_meshif_newlink() - pre-initialize and register new batadv link
 * @dev: network device to register
 * @params: rtnl newlink parameters
 * @extack: extended ACK report struct
 *
 * Return: 0 if successful or error otherwise. Any failure of
 * batadv_algo_select() is reported as -EINVAL; otherwise the result of
 * register_netdevice() is returned.
 */
static int batadv_meshif_newlink(struct net_device *dev,
				 struct rtnl_newlink_params *params,
				 struct netlink_ext_ack *extack)
{
	struct batadv_priv *bat_priv = netdev_priv(dev);
	struct nlattr **data = params->data;
	const char *algo_name;
	int err;

	/* select the requested routing algorithm before the device gets
	 * registered; batadv_meshif_init_late() only picks the default one
	 * when bat_priv->algo_ops is still unset
	 */
	if (data && data[IFLA_BATADV_ALGO_NAME]) {
		algo_name = nla_data(data[IFLA_BATADV_ALGO_NAME]);
		err = batadv_algo_select(bat_priv, algo_name);
		if (err)
			return -EINVAL;
	}

	return register_netdevice(dev);
}

/**
 * batadv_meshif_destroy_netlink() - deletion of batadv_mesh_interface via
 *  netlink
 * @mesh_iface: the to-be-removed batman-adv interface
 * @head: list pointer
 *
 * Disables every lower (hard) interface, destroys the untagged vlan object
 * and queues @mesh_iface for unregistration on @head.
 */
static void batadv_meshif_destroy_netlink(struct net_device *mesh_iface,
					  struct list_head *head)
{
	struct batadv_priv *bat_priv = netdev_priv(mesh_iface);
	struct batadv_hard_iface *hard_iface;
	struct batadv_meshif_vlan *vlan;

	/* NOTE(review): the loop only terminates if disabling a hard interface
	 * removes it from adj_list.lower - presumably it does; confirm
	 */
	while (!list_empty(&mesh_iface->adj_list.lower)) {
		hard_iface = netdev_adjacent_get_private(mesh_iface->adj_list.lower.next);
		batadv_hardif_disable_interface(hard_iface);
	}

	/* destroy the "untagged" VLAN */
	vlan = batadv_meshif_vlan_get(bat_priv, BATADV_NO_FLAGS);
	if (vlan) {
		batadv_meshif_destroy_vlan(bat_priv, vlan);
		/* balance the reference taken by the lookup above */
		batadv_meshif_vlan_put(vlan);
	}

	unregister_netdevice_queue(mesh_iface, head);
}

/**
 * batadv_meshif_is_valid() - Check whether device is a batadv mesh interface
 * @net_dev: device which should be checked
 *
 * Return: true when net_dev is a batman-adv interface, false otherwise
 */
bool
batadv_meshif_is_valid(const struct net_device *net_dev)
{
	/* a mesh interface is recognized by its transmit callback */
	if (net_dev->netdev_ops->ndo_start_xmit == batadv_interface_tx)
		return true;

	return false;
}

/* netlink policy for the IFLA_BATADV_* link attributes */
static const struct nla_policy batadv_ifla_policy[IFLA_BATADV_MAX + 1] = {
	[IFLA_BATADV_ALGO_NAME]	= { .type = NLA_NUL_STRING },
};

/* rtnl link operations for links of kind "batadv" */
struct rtnl_link_ops batadv_link_ops __read_mostly = {
	.kind		= "batadv",
	.priv_size	= sizeof(struct batadv_priv),
	.setup		= batadv_meshif_init_early,
	.maxtype	= IFLA_BATADV_MAX,
	.policy		= batadv_ifla_policy,
	.validate	= batadv_meshif_validate,
	.newlink	= batadv_meshif_newlink,
	.dellink	= batadv_meshif_destroy_netlink,
};
21 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 // SPDX-License-Identifier: GPL-2.0-only /* * Software WEP encryption implementation * Copyright 2002, Jouni Malinen <jkmaline@cc.hut.fi> * Copyright 2003, Instant802 Networks, Inc. 
* Copyright (C) 2023 Intel Corporation
 */

#include <linux/netdevice.h>
#include <linux/types.h>
#include <linux/random.h>
#include <linux/compiler.h>
#include <linux/crc32.h>
#include <linux/crypto.h>
#include <linux/err.h>
#include <linux/mm.h>
#include <linux/scatterlist.h>
#include <linux/slab.h>
#include <linux/unaligned.h>

#include <net/mac80211.h>
#include "ieee80211_i.h"
#include "wep.h"

/* Seed the IV counter in @local with random bytes. */
void ieee80211_wep_init(struct ieee80211_local *local)
{
	/* start WEP IV from a random value */
	get_random_bytes(&local->wep_iv, IEEE80211_WEP_IV_LEN);
}

/* Return true when @iv is one of the weak IVs described below for a key of
 * @keylen bytes. Bits 16-23 of @iv are the first IV byte and bits 8-15 the
 * second one (see ieee80211_wep_get_iv()).
 */
static inline bool ieee80211_wep_weak_iv(u32 iv, int keylen)
{
	/*
	 * Fluhrer, Mantin, and Shamir have reported weaknesses in the
	 * key scheduling algorithm of RC4. At least IVs (KeyByte + 3,
	 * 0xff, N) can be used to speedup attacks, so avoid using them.
	 */
	if ((iv & 0xff00) == 0xff00) {
		u8 B = (iv >> 16) & 0xff;

		if (B >= 3 && B < 3 + keylen)
			return true;
	}
	return false;
}

/* Advance the IV counter in @local, skipping a weak IV, and - when @iv is
 * not NULL - write the 3 IV bytes followed by the key index byte to @iv.
 */
static void ieee80211_wep_get_iv(struct ieee80211_local *local,
				 int keylen, int keyidx, u8 *iv)
{
	local->wep_iv++;
	/* adding 0x0100 changes the second IV byte away from 0xff */
	if (ieee80211_wep_weak_iv(local->wep_iv, keylen))
		local->wep_iv += 0x0100;

	if (!iv)
		return;

	*iv++ = (local->wep_iv >> 16) & 0xff;
	*iv++ = (local->wep_iv >> 8) & 0xff;
	*iv++ = local->wep_iv & 0xff;
	/* key index goes into the two top bits of the fourth byte */
	*iv++ = keyidx << 6;
}

/* Mark the frame as protected and make room for the IV right behind the
 * 802.11 header by moving the header IEEE80211_WEP_IV_LEN bytes towards the
 * head of @skb. The IV itself is only filled in when the hardware key does
 * not have IEEE80211_KEY_FLAG_PUT_IV_SPACE set.
 *
 * Returns a pointer to the IV position, or NULL when the headroom is too
 * small.
 */
static u8 *ieee80211_wep_add_iv(struct ieee80211_local *local,
				struct sk_buff *skb,
				int keylen, int keyidx)
{
	struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)skb->data;
	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
	unsigned int hdrlen;
	u8 *newhdr;

	/* note: the protected bit is set before the headroom check below */
	hdr->frame_control |= cpu_to_le16(IEEE80211_FCTL_PROTECTED);

	if (WARN_ON(skb_headroom(skb) < IEEE80211_WEP_IV_LEN))
		return NULL;

	hdrlen = ieee80211_hdrlen(hdr->frame_control);
	newhdr = skb_push(skb, IEEE80211_WEP_IV_LEN);
	memmove(newhdr, newhdr + IEEE80211_WEP_IV_LEN, hdrlen);

	/* the HW only needs room for the IV, but not the actual IV */
	if (info->control.hw_key &&
	    (info->control.hw_key->flags & IEEE80211_KEY_FLAG_PUT_IV_SPACE))
		return newhdr + hdrlen;

	ieee80211_wep_get_iv(local, keylen, keyidx, newhdr + hdrlen);
	return newhdr + hdrlen;
}

/* Remove the IV that follows the 802.11 header: the header is moved
 * IEEE80211_WEP_IV_LEN bytes towards the payload and the freed bytes at the
 * front are pulled off @skb. @local and @key are not used.
 */
static void ieee80211_wep_remove_iv(struct ieee80211_local *local,
				    struct sk_buff *skb,
				    struct ieee80211_key *key)
{
	struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)skb->data;
	unsigned int hdrlen;

	hdrlen = ieee80211_hdrlen(hdr->frame_control);
	memmove(skb->data + IEEE80211_WEP_IV_LEN, skb->data, hdrlen);
	skb_pull(skb, IEEE80211_WEP_IV_LEN);
}

/* Perform WEP encryption using given key. data buffer must have tailroom
 * for 4-byte ICV. data_len must not include this ICV. Note: this function
 * does _not_ add IV. data = RC4(data | CRC32(data))
 *
 * Always returns 0.
 */
int ieee80211_wep_encrypt_data(struct arc4_ctx *ctx, u8 *rc4key,
			       size_t klen, u8 *data, size_t data_len)
{
	__le32 icv;

	/* append the little-endian CRC32 of the plaintext as ICV */
	icv = cpu_to_le32(~crc32_le(~0, data, data_len));
	put_unaligned(icv, (__le32 *)(data + data_len));

	/* encrypt payload and ICV in place */
	arc4_setkey(ctx, rc4key, klen);
	arc4_crypt(ctx, data, data, data_len + IEEE80211_WEP_ICV_LEN);
	/* do not leave the cipher state behind */
	memzero_explicit(ctx, sizeof(*ctx));

	return 0;
}

/* Perform WEP encryption on given skb. 4 bytes of extra space (IV) in the
 * beginning of the buffer 4 bytes of extra space (ICV) in the end of the
 * buffer will be added. Both IV and ICV will be transmitted, so the
 * payload length increases with 8 bytes.
*
 * WEP frame payload: IV + TX key idx, RC4(data), ICV = RC4(CRC32(data))
 */
int ieee80211_wep_encrypt(struct ieee80211_local *local,
			  struct sk_buff *skb,
			  const u8 *key, int keylen, int keyidx)
{
	u8 *iv;
	size_t len;
	/* NOTE(review): sized for a WEP104 key plus the 3 IV bytes; @keylen is
	 * not checked against it here - presumably validated when the key is
	 * installed. The buffer is not wiped before returning.
	 */
	u8 rc4key[3 + WLAN_KEY_LEN_WEP104];

	if (WARN_ON(skb_tailroom(skb) < IEEE80211_WEP_ICV_LEN))
		return -1;

	iv = ieee80211_wep_add_iv(local, skb, keylen, keyidx);
	if (!iv)
		return -1;

	/* number of bytes that follow the IV, i.e. the data to encrypt */
	len = skb->len - (iv + IEEE80211_WEP_IV_LEN - skb->data);

	/* Prepend 24-bit IV to RC4 key */
	memcpy(rc4key, iv, 3);

	/* Copy rest of the WEP key (the secret part) */
	memcpy(rc4key + 3, key, keylen);

	/* Add room for ICV */
	skb_put(skb, IEEE80211_WEP_ICV_LEN);

	return ieee80211_wep_encrypt_data(&local->wep_tx_ctx, rc4key, keylen + 3,
					  iv + IEEE80211_WEP_IV_LEN, len);
}

/* Perform WEP decryption using given key. data buffer includes encrypted
 * payload, including 4-byte ICV, but _not_ IV. data_len must not include ICV.
 * Return 0 on success and -1 on ICV mismatch.
 */
int ieee80211_wep_decrypt_data(struct arc4_ctx *ctx, u8 *rc4key,
			       size_t klen, u8 *data, size_t data_len)
{
	__le32 crc;

	/* decrypt payload and ICV in place, then drop the cipher state */
	arc4_setkey(ctx, rc4key, klen);
	arc4_crypt(ctx, data, data, data_len + IEEE80211_WEP_ICV_LEN);
	memzero_explicit(ctx, sizeof(*ctx));

	/* recompute the CRC32 over the plaintext and compare it with the
	 * decrypted ICV that follows the payload
	 */
	crc = cpu_to_le32(~crc32_le(~0, data, data_len));
	if (memcmp(&crc, data + data_len, IEEE80211_WEP_ICV_LEN) != 0)
		/* ICV mismatch */
		return -1;

	return 0;
}

/* Perform WEP decryption on given skb. Buffer includes whole WEP part of
 * the frame: IV (4 bytes), encrypted payload (including SNAP header),
 * ICV (4 bytes). skb->len includes both IV and ICV.
 *
 * Returns 0 if frame was decrypted successfully and ICV was correct and -1 on
 * failure. If frame is OK, IV and ICV will be removed, i.e., decrypted payload
 * is moved to the beginning of the skb and skb length will be reduced.
*/
static int ieee80211_wep_decrypt(struct ieee80211_local *local,
				 struct sk_buff *skb,
				 struct ieee80211_key *key)
{
	u32 klen;
	/* NOTE(review): holds the IV plus the secret key on the stack and is
	 * not wiped before returning
	 */
	u8 rc4key[3 + WLAN_KEY_LEN_WEP104];
	u8 keyidx;
	struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)skb->data;
	unsigned int hdrlen;
	size_t len;
	int ret = 0;

	if (!ieee80211_has_protected(hdr->frame_control))
		return -1;

	/* the frame must at least hold the header, the IV and the ICV */
	hdrlen = ieee80211_hdrlen(hdr->frame_control);
	if (skb->len < hdrlen + IEEE80211_WEP_IV_LEN + IEEE80211_WEP_ICV_LEN)
		return -1;

	len = skb->len - hdrlen - IEEE80211_WEP_IV_LEN - IEEE80211_WEP_ICV_LEN;

	/* the key index is stored in the two top bits of the fourth IV byte */
	keyidx = skb->data[hdrlen + 3] >> 6;

	if (!key || keyidx != key->conf.keyidx)
		return -1;

	klen = 3 + key->conf.keylen;

	/* Prepend 24-bit IV to RC4 key */
	memcpy(rc4key, skb->data + hdrlen, 3);

	/* Copy rest of the WEP key (the secret part) */
	memcpy(rc4key + 3, key->conf.key, key->conf.keylen);

	if (ieee80211_wep_decrypt_data(&local->wep_rx_ctx, rc4key, klen,
				       skb->data + hdrlen +
				       IEEE80211_WEP_IV_LEN, len))
		ret = -1;

	/* ICV and IV are stripped below even when the ICV check failed */

	/* Trim ICV */
	skb_trim(skb, skb->len - IEEE80211_WEP_ICV_LEN);

	/* Remove IV */
	memmove(skb->data + IEEE80211_WEP_IV_LEN, skb->data, hdrlen);
	skb_pull(skb, IEEE80211_WEP_IV_LEN);

	return ret;
}

/* RX handler for WEP protected frames. Frames that are neither data nor
 * authentication frames are passed on untouched. Frames not yet decrypted
 * (RX_FLAG_DECRYPTED unset) are linearized and decrypted in software;
 * otherwise only the IV and - unless already stripped - the ICV are removed.
 *
 * Returns RX_CONTINUE on success or one of the RX_DROP_U_* results.
 */
ieee80211_rx_result
ieee80211_crypto_wep_decrypt(struct ieee80211_rx_data *rx)
{
	struct sk_buff *skb = rx->skb;
	struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb);
	struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)skb->data;
	__le16 fc = hdr->frame_control;

	if (!ieee80211_is_data(fc) && !ieee80211_is_auth(fc))
		return RX_CONTINUE;

	if (!(status->flag & RX_FLAG_DECRYPTED)) {
		/* software decryption works on skb->data directly */
		if (skb_linearize(rx->skb))
			return RX_DROP_U_OOM;
		if (ieee80211_wep_decrypt(rx->local, rx->skb, rx->key))
			return RX_DROP_U_WEP_DEC_FAIL;
	} else if (!(status->flag & RX_FLAG_IV_STRIPPED)) {
		/* header and IV have to be in the linear part before the
		 * header is moved over the IV
		 */
		if (!pskb_may_pull(rx->skb, ieee80211_hdrlen(fc) +
					    IEEE80211_WEP_IV_LEN))
			return RX_DROP_U_NO_IV;
		ieee80211_wep_remove_iv(rx->local, rx->skb, rx->key);
		/* remove ICV */
		if (!(status->flag & RX_FLAG_ICV_STRIPPED) &&
		    pskb_trim(rx->skb, rx->skb->len - IEEE80211_WEP_ICV_LEN))
			return RX_DROP_U_NO_ICV;
	}

	return RX_CONTINUE;
}

/* Prepare one frame for WEP transmission: encrypt in software when no
 * hardware key is attached, otherwise only add the IV (or room for it) when
 * the hardware key asks for it through its flags.
 *
 * Returns 0 on success and -1 on failure.
 */
static int wep_encrypt_skb(struct ieee80211_tx_data *tx, struct sk_buff *skb)
{
	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
	struct ieee80211_key_conf *hw_key = info->control.hw_key;

	if (!hw_key) {
		if (ieee80211_wep_encrypt(tx->local, skb, tx->key->conf.key,
					  tx->key->conf.keylen,
					  tx->key->conf.keyidx))
			return -1;
	} else if ((hw_key->flags & IEEE80211_KEY_FLAG_GENERATE_IV) ||
		   (hw_key->flags & IEEE80211_KEY_FLAG_PUT_IV_SPACE)) {
		if (!ieee80211_wep_add_iv(tx->local, skb,
					  tx->key->conf.keylen,
					  tx->key->conf.keyidx))
			return -1;
	}

	return 0;
}

/* TX handler: run wep_encrypt_skb() on every skb queued in tx->skbs.
 * Returns TX_DROP (after bumping the tx_handlers_drop_wep debug counter) as
 * soon as one frame fails, TX_CONTINUE otherwise.
 */
ieee80211_tx_result
ieee80211_crypto_wep_encrypt(struct ieee80211_tx_data *tx)
{
	struct sk_buff *skb;

	ieee80211_tx_set_protected(tx);

	skb_queue_walk(&tx->skbs, skb) {
		if (wep_encrypt_skb(tx, skb) < 0) {
			I802_DEBUG_INC(tx->local->tx_handlers_drop_wep);
			return TX_DROP;
		}
	}

	return TX_CONTINUE;
}
1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright(c) 2013-2015 Intel Corporation. All rights reserved. 
*/
#include <linux/blkdev.h>
#include <linux/device.h>
#include <linux/sizes.h>
#include <linux/slab.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include "nd-core.h"
#include "btt.h"
#include "nd.h"

/* Device release callback: detach the namespace, return the id to the parent
 * region's btt_ida and free the uuid and the nd_btt object itself.
 */
static void nd_btt_release(struct device *dev)
{
	struct nd_region *nd_region = to_nd_region(dev->parent);
	struct nd_btt *nd_btt = to_nd_btt(dev);

	dev_dbg(dev, "trace\n");
	nd_detach_ndns(&nd_btt->dev, &nd_btt->ndns);
	ida_free(&nd_region->btt_ida, nd_btt->id);
	kfree(nd_btt->uuid);
	kfree(nd_btt);
}

/* Convert @dev to its containing nd_btt; warns when @dev is not a btt device
 * but still returns the container_of() result.
 */
struct nd_btt *to_nd_btt(struct device *dev)
{
	struct nd_btt *nd_btt = container_of(dev, struct nd_btt, dev);

	WARN_ON(!is_nd_btt(dev));
	return nd_btt;
}
EXPORT_SYMBOL(to_nd_btt);

/* sector sizes offered through the sector_size attribute; 0 ends the list */
static const unsigned long btt_lbasize_supported[] = { 512, 520, 528,
	4096, 4104, 4160, 4224, 0 };

/* sysfs show for "sector_size" */
static ssize_t sector_size_show(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct nd_btt *nd_btt = to_nd_btt(dev);

	return nd_size_select_show(nd_btt->lbasize, btt_lbasize_supported, buf);
}

/* sysfs store for "sector_size": returns @len when nd_size_select_store()
 * returns 0, its return value otherwise
 */
static ssize_t sector_size_store(struct device *dev,
		struct device_attribute *attr, const char *buf, size_t len)
{
	struct nd_btt *nd_btt = to_nd_btt(dev);
	ssize_t rc;

	/* both guards are released automatically on return */
	guard(device)(dev);
	guard(nvdimm_bus)(dev);
	rc = nd_size_select_store(dev, buf, &nd_btt->lbasize,
			btt_lbasize_supported);
	/* append a newline to the debug output only if the input had none */
	dev_dbg(dev, "result: %zd wrote: %s%s", rc, buf,
			buf[len - 1] == '\n' ? "" : "\n");

	return rc ? rc : len;
}
static DEVICE_ATTR_RW(sector_size);

/* sysfs show for "uuid": prints the uuid, or an empty line when none is set */
static ssize_t uuid_show(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct nd_btt *nd_btt = to_nd_btt(dev);

	if (nd_btt->uuid)
		return sprintf(buf, "%pUb\n", nd_btt->uuid);
	return sprintf(buf, "\n");
}

/* sysfs store for "uuid": returns @len when nd_uuid_store() returns 0, its
 * return value otherwise
 */
static ssize_t uuid_store(struct device *dev,
		struct device_attribute *attr, const char *buf, size_t len)
{
	struct nd_btt *nd_btt = to_nd_btt(dev);
	ssize_t rc;

	device_lock(dev);
	rc = nd_uuid_store(dev, &nd_btt->uuid, buf, len);
	dev_dbg(dev, "result: %zd wrote: %s%s", rc, buf,
			buf[len - 1] == '\n' ? "" : "\n");
	device_unlock(dev);

	return rc ? rc : len;
}
static DEVICE_ATTR_RW(uuid);

/* sysfs show for "namespace": name of the attached namespace device, or an
 * empty line when none is attached
 */
static ssize_t namespace_show(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct nd_btt *nd_btt = to_nd_btt(dev);

	guard(nvdimm_bus)(dev);
	return sprintf(buf, "%s\n", nd_btt->ndns
			? dev_name(&nd_btt->ndns->dev) : "");
}

/* sysfs store for "namespace": unlike the other store callbacks here, the
 * result of nd_namespace_store() is returned unmodified
 */
static ssize_t namespace_store(struct device *dev,
		struct device_attribute *attr, const char *buf, size_t len)
{
	struct nd_btt *nd_btt = to_nd_btt(dev);
	ssize_t rc;

	guard(device)(dev);
	guard(nvdimm_bus)(dev);
	rc = nd_namespace_store(dev, &nd_btt->ndns, buf, len);
	dev_dbg(dev, "result: %zd wrote: %s%s", rc, buf,
			buf[len - 1] == '\n' ? "" : "\n");

	return rc;
}
static DEVICE_ATTR_RW(namespace);

/* sysfs show for "size": -ENXIO while no driver is bound to @dev */
static ssize_t size_show(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct nd_btt *nd_btt = to_nd_btt(dev);
	ssize_t rc;

	/* dev->driver is checked under the device lock */
	device_lock(dev);
	if (dev->driver)
		rc = sprintf(buf, "%llu\n", nd_btt->size);
	else {
		/* no size to convey if the btt instance is disabled */
		rc = -ENXIO;
	}
	device_unlock(dev);

	return rc;
}
static DEVICE_ATTR_RO(size);

/* sysfs show for "log_zero_flags": always reports "Y" */
static ssize_t log_zero_flags_show(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	return sprintf(buf, "Y\n");
}
static DEVICE_ATTR_RO(log_zero_flags);

/* attributes specific to btt devices */
static struct attribute *nd_btt_attributes[] = {
	&dev_attr_sector_size.attr,
	&dev_attr_namespace.attr,
	&dev_attr_uuid.attr,
	&dev_attr_size.attr,
	&dev_attr_log_zero_flags.attr,
	NULL,
};

static struct attribute_group nd_btt_attribute_group = {
	.attrs = nd_btt_attributes,
};

/* btt specific attributes plus the nd_device and nd_numa groups */
static const struct attribute_group *nd_btt_attribute_groups[] = {
	&nd_btt_attribute_group,
	&nd_device_attribute_group,
	&nd_numa_attribute_group,
	NULL,
};

static const struct device_type nd_btt_device_type = {
	.name = "nd_btt",
	.release = nd_btt_release,
	.groups = nd_btt_attribute_groups,
};

/* A device is a btt device when it carries nd_btt_device_type. */
bool is_nd_btt(struct device *dev)
{
	return dev->type == &nd_btt_device_type;
}
EXPORT_SYMBOL(is_nd_btt);

static struct lock_class_key nvdimm_btt_key;

static struct device *__nd_btt_create(struct nd_region *nd_region,
				      unsigned long 
lbasize, uuid_t *uuid, struct nd_namespace_common *ndns) { struct nd_btt *nd_btt; struct device *dev; nd_btt = kzalloc_obj(*nd_btt); if (!nd_btt) return NULL; nd_btt->id = ida_alloc(&nd_region->btt_ida, GFP_KERNEL); if (nd_btt->id < 0) goto out_nd_btt; nd_btt->lbasize = lbasize; if (uuid) { uuid = kmemdup(uuid, 16, GFP_KERNEL); if (!uuid) goto out_put_id; } nd_btt->uuid = uuid; dev = &nd_btt->dev; dev_set_name(dev, "btt%d.%d", nd_region->id, nd_btt->id); dev->parent = &nd_region->dev; dev->type = &nd_btt_device_type; device_initialize(&nd_btt->dev); lockdep_set_class(&nd_btt->dev.mutex, &nvdimm_btt_key); if (ndns && !__nd_attach_ndns(&nd_btt->dev, ndns, &nd_btt->ndns)) { dev_dbg(&ndns->dev, "failed, already claimed by %s\n", dev_name(ndns->claim)); put_device(dev); return NULL; } return dev; out_put_id: ida_free(&nd_region->btt_ida, nd_btt->id); out_nd_btt: kfree(nd_btt); return NULL; } struct device *nd_btt_create(struct nd_region *nd_region) { struct device *dev = __nd_btt_create(nd_region, 0, NULL, NULL); nd_device_register(dev); return dev; } /** * nd_btt_arena_is_valid - check if the metadata layout is valid * @nd_btt: device with BTT geometry and backing device info * @super: pointer to the arena's info block being tested * * Check consistency of the btt info block with itself by validating * the checksum, and with the parent namespace by verifying the * parent_uuid contained in the info block with the one supplied in. 
* * Returns: * false for an invalid info block, true for a valid one */ bool nd_btt_arena_is_valid(struct nd_btt *nd_btt, struct btt_sb *super) { const uuid_t *ns_uuid = nd_dev_to_uuid(&nd_btt->ndns->dev); uuid_t parent_uuid; u64 checksum; if (memcmp(super->signature, BTT_SIG, BTT_SIG_LEN) != 0) return false; import_uuid(&parent_uuid, super->parent_uuid); if (!uuid_is_null(&parent_uuid)) if (!uuid_equal(&parent_uuid, ns_uuid)) return false; checksum = le64_to_cpu(super->checksum); super->checksum = 0; if (checksum != nd_sb_checksum((struct nd_gen_sb *) super)) return false; super->checksum = cpu_to_le64(checksum); /* TODO: figure out action for this */ if ((le32_to_cpu(super->flags) & IB_FLAG_ERROR_MASK) != 0) dev_info(&nd_btt->dev, "Found arena with an error flag\n"); return true; } EXPORT_SYMBOL(nd_btt_arena_is_valid); int nd_btt_version(struct nd_btt *nd_btt, struct nd_namespace_common *ndns, struct btt_sb *btt_sb) { if (ndns->claim_class == NVDIMM_CCLASS_BTT2) { /* Probe/setup for BTT v2.0 */ nd_btt->initial_offset = 0; nd_btt->version_major = 2; nd_btt->version_minor = 0; if (nvdimm_read_bytes(ndns, 0, btt_sb, sizeof(*btt_sb), 0)) return -ENXIO; if (!nd_btt_arena_is_valid(nd_btt, btt_sb)) return -ENODEV; if ((le16_to_cpu(btt_sb->version_major) != 2) || (le16_to_cpu(btt_sb->version_minor) != 0)) return -ENODEV; } else { /* * Probe/setup for BTT v1.1 (NVDIMM_CCLASS_NONE or * NVDIMM_CCLASS_BTT) */ nd_btt->initial_offset = SZ_4K; nd_btt->version_major = 1; nd_btt->version_minor = 1; if (nvdimm_read_bytes(ndns, SZ_4K, btt_sb, sizeof(*btt_sb), 0)) return -ENXIO; if (!nd_btt_arena_is_valid(nd_btt, btt_sb)) return -ENODEV; if ((le16_to_cpu(btt_sb->version_major) != 1) || (le16_to_cpu(btt_sb->version_minor) != 1)) return -ENODEV; } return 0; } EXPORT_SYMBOL(nd_btt_version); static int __nd_btt_probe(struct nd_btt *nd_btt, struct nd_namespace_common *ndns, struct btt_sb *btt_sb) { int rc; if (!btt_sb || !ndns || !nd_btt) return -ENODEV; if 
(nvdimm_namespace_capacity(ndns) < SZ_16M) return -ENXIO; rc = nd_btt_version(nd_btt, ndns, btt_sb); if (rc < 0) return rc; nd_btt->lbasize = le32_to_cpu(btt_sb->external_lbasize); nd_btt->uuid = kmemdup(&btt_sb->uuid, sizeof(uuid_t), GFP_KERNEL); if (!nd_btt->uuid) return -ENOMEM; nd_device_register(&nd_btt->dev); return 0; } int nd_btt_probe(struct device *dev, struct nd_namespace_common *ndns) { int rc; struct device *btt_dev; struct btt_sb *btt_sb; struct nd_region *nd_region = to_nd_region(ndns->dev.parent); if (ndns->force_raw) return -ENODEV; switch (ndns->claim_class) { case NVDIMM_CCLASS_NONE: case NVDIMM_CCLASS_BTT: case NVDIMM_CCLASS_BTT2: break; default: return -ENODEV; } scoped_guard(nvdimm_bus, &ndns->dev) btt_dev = __nd_btt_create(nd_region, 0, NULL, ndns); if (!btt_dev) return -ENOMEM; btt_sb = devm_kzalloc(dev, sizeof(*btt_sb), GFP_KERNEL); rc = __nd_btt_probe(to_nd_btt(btt_dev), ndns, btt_sb); dev_dbg(dev, "btt: %s\n", rc == 0 ? dev_name(btt_dev) : "<none>"); if (rc < 0) { struct nd_btt *nd_btt = to_nd_btt(btt_dev); nd_detach_ndns(btt_dev, &nd_btt->ndns); put_device(btt_dev); } return rc; } EXPORT_SYMBOL(nd_btt_probe);
6 6 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 
527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 
1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. * Copyright (C) 2004-2007 Red Hat, Inc. All rights reserved. 
*/ #include <linux/sched.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/completion.h> #include <linux/buffer_head.h> #include <linux/gfs2_ondisk.h> #include <linux/crc32.h> #include <linux/crc32c.h> #include <linux/delay.h> #include <linux/kthread.h> #include <linux/freezer.h> #include <linux/bio.h> #include <linux/blkdev.h> #include <linux/writeback.h> #include <linux/list_sort.h> #include "gfs2.h" #include "incore.h" #include "bmap.h" #include "glock.h" #include "log.h" #include "lops.h" #include "meta_io.h" #include "util.h" #include "dir.h" #include "trace_gfs2.h" #include "trans.h" #include "aops.h" static void gfs2_log_shutdown(struct gfs2_sbd *sdp); /** * gfs2_struct2blk - compute stuff * @sdp: the filesystem * @nstruct: the number of structures * * Compute the number of log descriptor blocks needed to hold a certain number * of structures of a certain size. * * Returns: the number of blocks needed (minimum is always 1) */ unsigned int gfs2_struct2blk(struct gfs2_sbd *sdp, unsigned int nstruct) { unsigned int blks; unsigned int first, second; /* The initial struct gfs2_log_descriptor block */ blks = 1; first = sdp->sd_ldptrs; if (nstruct > first) { /* Subsequent struct gfs2_meta_header blocks */ second = sdp->sd_inptrs; blks += DIV_ROUND_UP(nstruct - first, second); } return blks; } /** * gfs2_remove_from_ail - Remove an entry from the ail lists, updating counters * @bd: The gfs2_bufdata to remove * * The ail lock _must_ be held when calling this function * */ static void gfs2_remove_from_ail(struct gfs2_bufdata *bd) { bd->bd_tr = NULL; list_del_init(&bd->bd_ail_st_list); list_del_init(&bd->bd_ail_gl_list); atomic_dec(&bd->bd_gl->gl_ail_count); brelse(bd->bd_bh); } /** * gfs2_ail1_start_one - Start I/O on a transaction * @sdp: The superblock * @wbc: The writeback control structure * @tr: The transaction to start I/O on * @plug: The block plug currently active */ static int gfs2_ail1_start_one(struct gfs2_sbd *sdp, struct 
writeback_control *wbc, struct gfs2_trans *tr, struct blk_plug *plug) __releases(&sdp->sd_ail_lock) __acquires(&sdp->sd_ail_lock) { struct gfs2_glock *gl = NULL; struct address_space *mapping; struct gfs2_bufdata *bd, *s; struct buffer_head *bh; int ret = 0; list_for_each_entry_safe_reverse(bd, s, &tr->tr_ail1_list, bd_ail_st_list) { bh = bd->bd_bh; gfs2_assert(sdp, bd->bd_tr == tr); if (!buffer_busy(bh)) { if (buffer_uptodate(bh)) { list_move(&bd->bd_ail_st_list, &tr->tr_ail2_list); continue; } if (!cmpxchg(&sdp->sd_log_error, 0, -EIO)) gfs2_io_error_bh(sdp, bh); } if (gfs2_withdrawn(sdp)) { gfs2_remove_from_ail(bd); continue; } if (!buffer_dirty(bh)) continue; if (gl == bd->bd_gl) continue; gl = bd->bd_gl; list_move(&bd->bd_ail_st_list, &tr->tr_ail1_list); mapping = bh->b_folio->mapping; if (!mapping) continue; spin_unlock(&sdp->sd_ail_lock); BUG_ON(GFS2_SB(mapping->host) != sdp); if (gfs2_is_jdata(GFS2_I(mapping->host))) ret = gfs2_jdata_writeback(mapping, wbc); else ret = mapping->a_ops->writepages(mapping, wbc); if (need_resched()) { blk_finish_plug(plug); cond_resched(); blk_start_plug(plug); } spin_lock(&sdp->sd_ail_lock); if (ret == -ENODATA) /* if a jdata write into a new hole */ ret = 0; /* ignore it */ mapping_set_error(mapping, ret); if (ret || wbc->nr_to_write <= 0) break; return -EBUSY; } return ret; } static void dump_ail_list(struct gfs2_sbd *sdp) { struct gfs2_trans *tr; struct gfs2_bufdata *bd; struct buffer_head *bh; list_for_each_entry_reverse(tr, &sdp->sd_ail1_list, tr_list) { list_for_each_entry_reverse(bd, &tr->tr_ail1_list, bd_ail_st_list) { bh = bd->bd_bh; fs_err(sdp, "bd %p: blk:0x%llx bh=%p ", bd, (unsigned long long)bd->bd_blkno, bh); if (!bh) { fs_err(sdp, "\n"); continue; } fs_err(sdp, "0x%llx up2:%d dirt:%d lkd:%d req:%d " "map:%d new:%d ar:%d aw:%d delay:%d " "io err:%d unwritten:%d dfr:%d pin:%d esc:%d\n", (unsigned long long)bh->b_blocknr, buffer_uptodate(bh), buffer_dirty(bh), buffer_locked(bh), buffer_req(bh), buffer_mapped(bh), 
buffer_new(bh), buffer_async_read(bh), buffer_async_write(bh), buffer_delay(bh), buffer_write_io_error(bh), buffer_unwritten(bh), buffer_defer_completion(bh), buffer_pinned(bh), buffer_escaped(bh)); } } } /** * gfs2_ail1_flush - start writeback of some ail1 entries * @sdp: The super block * @wbc: The writeback control structure * * Writes back some ail1 entries, according to the limits in the * writeback control structure */ void gfs2_ail1_flush(struct gfs2_sbd *sdp, struct writeback_control *wbc) { struct list_head *head = &sdp->sd_ail1_list; struct gfs2_trans *tr; struct blk_plug plug; int ret; unsigned long flush_start = jiffies; trace_gfs2_ail_flush(sdp, wbc, 1); blk_start_plug(&plug); spin_lock(&sdp->sd_ail_lock); restart: ret = 0; if (time_after(jiffies, flush_start + (HZ * 600))) { fs_err(sdp, "Error: In %s for ten minutes! t=%d\n", __func__, current->journal_info ? 1 : 0); dump_ail_list(sdp); goto out; } list_for_each_entry_reverse(tr, head, tr_list) { if (wbc->nr_to_write <= 0) break; ret = gfs2_ail1_start_one(sdp, wbc, tr, &plug); if (ret) { if (ret == -EBUSY) goto restart; break; } } out: spin_unlock(&sdp->sd_ail_lock); blk_finish_plug(&plug); if (ret) { gfs2_lm(sdp, "gfs2_ail1_start_one returned: %d\n", ret); gfs2_withdraw(sdp); } trace_gfs2_ail_flush(sdp, wbc, 0); } /** * gfs2_ail1_start - start writeback of all ail1 entries * @sdp: The superblock */ static void gfs2_ail1_start(struct gfs2_sbd *sdp) { struct writeback_control wbc = { .sync_mode = WB_SYNC_NONE, .nr_to_write = LONG_MAX, .range_start = 0, .range_end = LLONG_MAX, }; return gfs2_ail1_flush(sdp, &wbc); } static void gfs2_log_update_flush_tail(struct gfs2_sbd *sdp) { unsigned int new_flush_tail = sdp->sd_log_head; struct gfs2_trans *tr; if (!list_empty(&sdp->sd_ail1_list)) { tr = list_last_entry(&sdp->sd_ail1_list, struct gfs2_trans, tr_list); new_flush_tail = tr->tr_first; } sdp->sd_log_flush_tail = new_flush_tail; } static void gfs2_log_update_head(struct gfs2_sbd *sdp) { unsigned int 
new_head = sdp->sd_log_flush_head; if (sdp->sd_log_flush_tail == sdp->sd_log_head) sdp->sd_log_flush_tail = new_head; sdp->sd_log_head = new_head; } /* * gfs2_ail_empty_tr - empty one of the ail lists of a transaction */ static void gfs2_ail_empty_tr(struct gfs2_sbd *sdp, struct gfs2_trans *tr, struct list_head *head) { struct gfs2_bufdata *bd; while (!list_empty(head)) { bd = list_first_entry(head, struct gfs2_bufdata, bd_ail_st_list); gfs2_assert(sdp, bd->bd_tr == tr); gfs2_remove_from_ail(bd); } } /** * gfs2_ail1_empty_one - Check whether or not a trans in the AIL has been synced * @sdp: the filesystem * @tr: the transaction * @max_revokes: If nonzero, issue revokes for the bd items for written buffers * * returns: the transaction's count of remaining active items */ static int gfs2_ail1_empty_one(struct gfs2_sbd *sdp, struct gfs2_trans *tr, int *max_revokes) { struct gfs2_bufdata *bd, *s; struct buffer_head *bh; int active_count = 0; list_for_each_entry_safe_reverse(bd, s, &tr->tr_ail1_list, bd_ail_st_list) { bh = bd->bd_bh; gfs2_assert(sdp, bd->bd_tr == tr); /* * If another process flagged an io error, e.g. writing to the * journal, error all other bhs and move them off the ail1 to * prevent a tight loop when unmount tries to flush ail1, * regardless of whether they're still busy. If no outside * errors were found and the buffer is busy, move to the next. * If the ail buffer is not busy and caught an error, flag it * for others. */ if (!sdp->sd_log_error && buffer_busy(bh)) { active_count++; continue; } if (!buffer_uptodate(bh) && !cmpxchg(&sdp->sd_log_error, 0, -EIO)) gfs2_io_error_bh(sdp, bh); /* * If we have space for revokes and the bd is no longer on any * buf list, we can just add a revoke for it immediately and * avoid having to put it on the ail2 list, where it would need * to be revoked later. 
*/ if (*max_revokes && list_empty(&bd->bd_list)) { gfs2_add_revoke(sdp, bd); (*max_revokes)--; continue; } list_move(&bd->bd_ail_st_list, &tr->tr_ail2_list); } return active_count; } /** * gfs2_ail1_empty - Try to empty the ail1 lists * @sdp: The superblock * @max_revokes: If non-zero, add revokes where appropriate * * Tries to empty the ail1 lists, starting with the oldest first. * Returns %true if the ail1 list is now empty. */ static bool gfs2_ail1_empty(struct gfs2_sbd *sdp, int max_revokes) { struct gfs2_trans *tr, *s; int oldest_tr = 1; bool empty; spin_lock(&sdp->sd_ail_lock); list_for_each_entry_safe_reverse(tr, s, &sdp->sd_ail1_list, tr_list) { if (!gfs2_ail1_empty_one(sdp, tr, &max_revokes) && oldest_tr) list_move(&tr->tr_list, &sdp->sd_ail2_list); else oldest_tr = 0; } gfs2_log_update_flush_tail(sdp); empty = list_empty(&sdp->sd_ail1_list); spin_unlock(&sdp->sd_ail_lock); return empty; } static void gfs2_ail1_wait(struct gfs2_sbd *sdp) { struct gfs2_trans *tr; struct gfs2_bufdata *bd; struct buffer_head *bh; spin_lock(&sdp->sd_ail_lock); list_for_each_entry_reverse(tr, &sdp->sd_ail1_list, tr_list) { list_for_each_entry(bd, &tr->tr_ail1_list, bd_ail_st_list) { bh = bd->bd_bh; if (!buffer_locked(bh)) continue; get_bh(bh); spin_unlock(&sdp->sd_ail_lock); wait_on_buffer(bh); brelse(bh); return; } } spin_unlock(&sdp->sd_ail_lock); } static void __ail2_empty(struct gfs2_sbd *sdp, struct gfs2_trans *tr) { gfs2_ail_empty_tr(sdp, tr, &tr->tr_ail2_list); list_del(&tr->tr_list); gfs2_assert_warn(sdp, list_empty(&tr->tr_ail1_list)); gfs2_assert_warn(sdp, list_empty(&tr->tr_ail2_list)); gfs2_trans_free(sdp, tr); } static void ail2_empty(struct gfs2_sbd *sdp, unsigned int new_tail) { struct list_head *ail2_list = &sdp->sd_ail2_list; unsigned int old_tail = sdp->sd_log_tail; struct gfs2_trans *tr, *safe; spin_lock(&sdp->sd_ail_lock); if (old_tail <= new_tail) { list_for_each_entry_safe(tr, safe, ail2_list, tr_list) { if (old_tail <= tr->tr_first && tr->tr_first < 
new_tail) __ail2_empty(sdp, tr); } } else { list_for_each_entry_safe(tr, safe, ail2_list, tr_list) { if (old_tail <= tr->tr_first || tr->tr_first < new_tail) __ail2_empty(sdp, tr); } } spin_unlock(&sdp->sd_ail_lock); } /** * gfs2_log_is_empty - Check if the log is empty * @sdp: The GFS2 superblock */ bool gfs2_log_is_empty(struct gfs2_sbd *sdp) { return atomic_read(&sdp->sd_log_blks_free) == sdp->sd_jdesc->jd_blocks; } static bool __gfs2_log_try_reserve_revokes(struct gfs2_sbd *sdp, unsigned int revokes) { unsigned int available; available = atomic_read(&sdp->sd_log_revokes_available); while (available >= revokes) { if (atomic_try_cmpxchg(&sdp->sd_log_revokes_available, &available, available - revokes)) return true; } return false; } /** * gfs2_log_release_revokes - Release a given number of revokes * @sdp: The GFS2 superblock * @revokes: The number of revokes to release * * sdp->sd_log_flush_lock must be held. */ void gfs2_log_release_revokes(struct gfs2_sbd *sdp, unsigned int revokes) { if (revokes) atomic_add(revokes, &sdp->sd_log_revokes_available); } /** * gfs2_log_release - Release a given number of log blocks * @sdp: The GFS2 superblock * @blks: The number of blocks * */ void gfs2_log_release(struct gfs2_sbd *sdp, unsigned int blks) { atomic_add(blks, &sdp->sd_log_blks_free); trace_gfs2_log_blocks(sdp, blks); gfs2_assert_withdraw(sdp, !sdp->sd_jdesc || atomic_read(&sdp->sd_log_blks_free) <= sdp->sd_jdesc->jd_blocks); if (atomic_read(&sdp->sd_log_blks_needed)) wake_up(&sdp->sd_log_waitq); } /** * __gfs2_log_try_reserve - Try to make a log reservation * @sdp: The GFS2 superblock * @blks: The number of blocks to reserve * @taboo_blks: The number of blocks to leave free * * Try to do the same as __gfs2_log_reserve(), but fail if no more log * space is immediately available. 
*/ static bool __gfs2_log_try_reserve(struct gfs2_sbd *sdp, unsigned int blks, unsigned int taboo_blks) { unsigned wanted = blks + taboo_blks; unsigned int free_blocks; free_blocks = atomic_read(&sdp->sd_log_blks_free); while (free_blocks >= wanted) { if (atomic_try_cmpxchg(&sdp->sd_log_blks_free, &free_blocks, free_blocks - blks)) { trace_gfs2_log_blocks(sdp, -blks); return true; } } return false; } /** * __gfs2_log_reserve - Make a log reservation * @sdp: The GFS2 superblock * @blks: The number of blocks to reserve * @taboo_blks: The number of blocks to leave free * * @taboo_blks is set to 0 for logd, and to GFS2_LOG_FLUSH_MIN_BLOCKS * for all other processes. This ensures that when the log is almost full, * logd will still be able to call gfs2_log_flush one more time without * blocking, which will advance the tail and make some more log space * available. * * We no longer flush the log here, instead we wake up logd to do that * for us. To avoid the thundering herd and to ensure that we deal fairly * with queued waiters, we use an exclusive wait. This means that when we * get woken with enough journal space to get our reservation, we need to * wake the next waiter on the list. 
*/ static void __gfs2_log_reserve(struct gfs2_sbd *sdp, unsigned int blks, unsigned int taboo_blks) { unsigned wanted = blks + taboo_blks; unsigned int free_blocks; atomic_add(blks, &sdp->sd_log_blks_needed); for (;;) { if (current != sdp->sd_logd_process) wake_up(&sdp->sd_logd_waitq); io_wait_event(sdp->sd_log_waitq, (free_blocks = atomic_read(&sdp->sd_log_blks_free), free_blocks >= wanted)); do { if (atomic_try_cmpxchg(&sdp->sd_log_blks_free, &free_blocks, free_blocks - blks)) goto reserved; } while (free_blocks >= wanted); } reserved: trace_gfs2_log_blocks(sdp, -blks); if (atomic_sub_return(blks, &sdp->sd_log_blks_needed)) wake_up(&sdp->sd_log_waitq); } /** * gfs2_log_try_reserve - Try to make a log reservation * @sdp: The GFS2 superblock * @tr: The transaction * @extra_revokes: The number of additional revokes reserved (output) * * This is similar to gfs2_log_reserve, but sdp->sd_log_flush_lock must be * held for correct revoke accounting. */ bool gfs2_log_try_reserve(struct gfs2_sbd *sdp, struct gfs2_trans *tr, unsigned int *extra_revokes) { unsigned int blks = tr->tr_reserved; unsigned int revokes = tr->tr_revokes; unsigned int revoke_blks = 0; *extra_revokes = 0; if (revokes && !__gfs2_log_try_reserve_revokes(sdp, revokes)) { revoke_blks = DIV_ROUND_UP(revokes, sdp->sd_inptrs); *extra_revokes = revoke_blks * sdp->sd_inptrs - revokes; blks += revoke_blks; } if (!blks) return true; if (__gfs2_log_try_reserve(sdp, blks, GFS2_LOG_FLUSH_MIN_BLOCKS)) return true; if (!revoke_blks) gfs2_log_release_revokes(sdp, revokes); return false; } /** * gfs2_log_reserve - Make a log reservation * @sdp: The GFS2 superblock * @tr: The transaction * @extra_revokes: The number of additional revokes reserved (output) * * sdp->sd_log_flush_lock must not be held. 
*/ void gfs2_log_reserve(struct gfs2_sbd *sdp, struct gfs2_trans *tr, unsigned int *extra_revokes) { unsigned int blks = tr->tr_reserved; unsigned int revokes = tr->tr_revokes; unsigned int revoke_blks; *extra_revokes = 0; if (revokes) { revoke_blks = DIV_ROUND_UP(revokes, sdp->sd_inptrs); *extra_revokes = revoke_blks * sdp->sd_inptrs - revokes; blks += revoke_blks; } __gfs2_log_reserve(sdp, blks, GFS2_LOG_FLUSH_MIN_BLOCKS); } /** * log_distance - Compute distance between two journal blocks * @sdp: The GFS2 superblock * @newer: The most recent journal block of the pair * @older: The older journal block of the pair * * Compute the distance (in the journal direction) between two * blocks in the journal * * Returns: the distance in blocks */ static inline unsigned int log_distance(struct gfs2_sbd *sdp, unsigned int newer, unsigned int older) { int dist; dist = newer - older; if (dist < 0) dist += sdp->sd_jdesc->jd_blocks; return dist; } /** * calc_reserved - Calculate the number of blocks to keep reserved * @sdp: The GFS2 superblock * * This is complex. We need to reserve room for all our currently used * metadata blocks (e.g. normal file I/O rewriting file time stamps) and * all our journaled data blocks for journaled files (e.g. files in the * meta_fs like rindex, or files for which chattr +j was done.) * If we don't reserve enough space, corruption will follow. * * We can have metadata blocks and jdata blocks in the same journal. Each * type gets its own log descriptor, for which we need to reserve a block. * In fact, each type has the potential for needing more than one log descriptor * in cases where we have more blocks than will fit in a log descriptor. * Metadata journal entries take up half the space of journaled buffer entries. * * Also, we need to reserve blocks for revoke journal entries and one for an * overall header for the lot. 
* * Returns: the number of blocks reserved */ static unsigned int calc_reserved(struct gfs2_sbd *sdp) { unsigned int reserved = GFS2_LOG_FLUSH_MIN_BLOCKS; unsigned int blocks; struct gfs2_trans *tr = sdp->sd_log_tr; if (tr) { blocks = tr->tr_num_buf_new - tr->tr_num_buf_rm; reserved += blocks + DIV_ROUND_UP(blocks, buf_limit(sdp)); blocks = tr->tr_num_databuf_new - tr->tr_num_databuf_rm; reserved += blocks + DIV_ROUND_UP(blocks, databuf_limit(sdp)); } return reserved; } static void log_pull_tail(struct gfs2_sbd *sdp) { unsigned int new_tail = sdp->sd_log_flush_tail; unsigned int dist; if (new_tail == sdp->sd_log_tail) return; dist = log_distance(sdp, new_tail, sdp->sd_log_tail); ail2_empty(sdp, new_tail); gfs2_log_release(sdp, dist); sdp->sd_log_tail = new_tail; } void log_flush_wait(struct gfs2_sbd *sdp) { DEFINE_WAIT(wait); if (atomic_read(&sdp->sd_log_in_flight)) { do { prepare_to_wait(&sdp->sd_log_flush_wait, &wait, TASK_UNINTERRUPTIBLE); if (atomic_read(&sdp->sd_log_in_flight)) io_schedule(); } while(atomic_read(&sdp->sd_log_in_flight)); finish_wait(&sdp->sd_log_flush_wait, &wait); } } static int ip_cmp(void *priv, const struct list_head *a, const struct list_head *b) { struct gfs2_inode *ipa, *ipb; ipa = list_entry(a, struct gfs2_inode, i_ordered); ipb = list_entry(b, struct gfs2_inode, i_ordered); if (ipa->i_no_addr < ipb->i_no_addr) return -1; if (ipa->i_no_addr > ipb->i_no_addr) return 1; return 0; } static void __ordered_del_inode(struct gfs2_inode *ip) { if (!list_empty(&ip->i_ordered)) list_del_init(&ip->i_ordered); } static void gfs2_ordered_write(struct gfs2_sbd *sdp) { struct gfs2_inode *ip; LIST_HEAD(written); spin_lock(&sdp->sd_ordered_lock); list_sort(NULL, &sdp->sd_log_ordered, &ip_cmp); while (!list_empty(&sdp->sd_log_ordered)) { ip = list_first_entry(&sdp->sd_log_ordered, struct gfs2_inode, i_ordered); if (ip->i_inode.i_mapping->nrpages == 0) { __ordered_del_inode(ip); continue; } list_move(&ip->i_ordered, &written); 
spin_unlock(&sdp->sd_ordered_lock); filemap_fdatawrite(ip->i_inode.i_mapping); spin_lock(&sdp->sd_ordered_lock); } list_splice(&written, &sdp->sd_log_ordered); spin_unlock(&sdp->sd_ordered_lock); } static void gfs2_ordered_wait(struct gfs2_sbd *sdp) { struct gfs2_inode *ip; spin_lock(&sdp->sd_ordered_lock); while (!list_empty(&sdp->sd_log_ordered)) { ip = list_first_entry(&sdp->sd_log_ordered, struct gfs2_inode, i_ordered); __ordered_del_inode(ip); if (ip->i_inode.i_mapping->nrpages == 0) continue; spin_unlock(&sdp->sd_ordered_lock); filemap_fdatawait(ip->i_inode.i_mapping); spin_lock(&sdp->sd_ordered_lock); } spin_unlock(&sdp->sd_ordered_lock); } void gfs2_ordered_del_inode(struct gfs2_inode *ip) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); spin_lock(&sdp->sd_ordered_lock); __ordered_del_inode(ip); spin_unlock(&sdp->sd_ordered_lock); } void gfs2_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd) { struct buffer_head *bh = bd->bd_bh; struct gfs2_glock *gl = bd->bd_gl; sdp->sd_log_num_revoke++; if (atomic_inc_return(&gl->gl_revokes) == 1) gfs2_glock_hold(gl); bh->b_private = NULL; bd->bd_blkno = bh->b_blocknr; gfs2_remove_from_ail(bd); /* drops ref on bh */ bd->bd_bh = NULL; set_bit(GLF_LFLUSH, &gl->gl_flags); list_add(&bd->bd_list, &sdp->sd_log_revokes); } void gfs2_glock_remove_revoke(struct gfs2_glock *gl) { if (atomic_dec_return(&gl->gl_revokes) == 0) { clear_bit(GLF_LFLUSH, &gl->gl_flags); gfs2_glock_put_async(gl); } } /** * gfs2_flush_revokes - Add as many revokes to the system transaction as we can * @sdp: The GFS2 superblock * * Our usual strategy is to defer writing revokes as much as we can in the hope * that we'll eventually overwrite the journal, which will make those revokes * go away. This changes when we flush the log: at that point, there will * likely be some left-over space in the last revoke block of that transaction. * We can fill that space with additional revokes for blocks that have already * been written back. 
This will basically come at no cost now, and will save * us from having to keep track of those blocks on the AIL2 list later. */ void gfs2_flush_revokes(struct gfs2_sbd *sdp) { /* number of revokes we still have room for */ unsigned int max_revokes = atomic_read(&sdp->sd_log_revokes_available); spin_lock(&sdp->sd_log_lock); gfs2_ail1_empty(sdp, max_revokes); spin_unlock(&sdp->sd_log_lock); } /** * gfs2_write_log_header - Write a journal log header buffer at lblock * @sdp: The GFS2 superblock * @jd: journal descriptor of the journal to which we are writing * @seq: sequence number * @tail: tail of the log * @lblock: value for lh_blkno (block number relative to start of journal) * @flags: log header flags GFS2_LOG_HEAD_* * @op_flags: flags to pass to the bio * * Returns: the initialized log buffer descriptor */ void gfs2_write_log_header(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd, u64 seq, u32 tail, u32 lblock, u32 flags, blk_opf_t op_flags) { struct gfs2_log_header *lh; u32 hash, crc; struct page *page; struct gfs2_statfs_change_host *l_sc = &sdp->sd_statfs_local; struct timespec64 tv; struct super_block *sb = sdp->sd_vfs; u64 dblock; if (gfs2_withdrawn(sdp)) return; page = mempool_alloc(gfs2_page_pool, GFP_NOIO); lh = page_address(page); clear_page(lh); lh->lh_header.mh_magic = cpu_to_be32(GFS2_MAGIC); lh->lh_header.mh_type = cpu_to_be32(GFS2_METATYPE_LH); lh->lh_header.__pad0 = cpu_to_be64(0); lh->lh_header.mh_format = cpu_to_be32(GFS2_FORMAT_LH); lh->lh_header.mh_jid = cpu_to_be32(sdp->sd_jdesc->jd_jid); lh->lh_sequence = cpu_to_be64(seq); lh->lh_flags = cpu_to_be32(flags); lh->lh_tail = cpu_to_be32(tail); lh->lh_blkno = cpu_to_be32(lblock); hash = ~crc32(~0, lh, LH_V1_SIZE); lh->lh_hash = cpu_to_be32(hash); ktime_get_coarse_real_ts64(&tv); lh->lh_nsec = cpu_to_be32(tv.tv_nsec); lh->lh_sec = cpu_to_be64(tv.tv_sec); if (!list_empty(&jd->extent_list)) dblock = gfs2_log_bmap(jd, lblock); else { unsigned int extlen; int ret; extlen = 1; ret = 
gfs2_get_extent(jd->jd_inode, lblock, &dblock, &extlen); if (gfs2_assert_withdraw(sdp, ret == 0)) return; } lh->lh_addr = cpu_to_be64(dblock); lh->lh_jinode = cpu_to_be64(GFS2_I(jd->jd_inode)->i_no_addr); /* We may only write local statfs, quota, etc., when writing to our own journal. The values are left 0 when recovering a journal different from our own. */ if (!(flags & GFS2_LOG_HEAD_RECOVERY)) { lh->lh_statfs_addr = cpu_to_be64(GFS2_I(sdp->sd_sc_inode)->i_no_addr); lh->lh_quota_addr = cpu_to_be64(GFS2_I(sdp->sd_qc_inode)->i_no_addr); spin_lock(&sdp->sd_statfs_spin); lh->lh_local_total = cpu_to_be64(l_sc->sc_total); lh->lh_local_free = cpu_to_be64(l_sc->sc_free); lh->lh_local_dinodes = cpu_to_be64(l_sc->sc_dinodes); spin_unlock(&sdp->sd_statfs_spin); } BUILD_BUG_ON(offsetof(struct gfs2_log_header, lh_crc) != LH_V1_SIZE); crc = crc32c(~0, (void *)lh + LH_V1_SIZE + 4, sb->s_blocksize - LH_V1_SIZE - 4); lh->lh_crc = cpu_to_be32(crc); gfs2_log_write(sdp, jd, page, sb->s_blocksize, 0, dblock, REQ_OP_WRITE | op_flags); gfs2_log_submit_write(&jd->jd_log_bio); } /** * log_write_header - Get and initialize a journal header buffer * @sdp: The GFS2 superblock * @flags: The log header flags, including log header origin * * Returns: the initialized log buffer descriptor */ static void log_write_header(struct gfs2_sbd *sdp, u32 flags) { blk_opf_t op_flags = REQ_PREFLUSH | REQ_FUA | REQ_META | REQ_SYNC; struct super_block *sb = sdp->sd_vfs; gfs2_assert_withdraw(sdp, sb->s_writers.frozen != SB_FREEZE_COMPLETE); if (test_bit(SDF_NOBARRIERS, &sdp->sd_flags)) { gfs2_ordered_wait(sdp); log_flush_wait(sdp); op_flags = REQ_SYNC | REQ_META | REQ_PRIO; } sdp->sd_log_idle = (sdp->sd_log_flush_tail == sdp->sd_log_flush_head); gfs2_write_log_header(sdp, sdp->sd_jdesc, sdp->sd_log_sequence++, sdp->sd_log_flush_tail, sdp->sd_log_flush_head, flags, op_flags); gfs2_log_incr_head(sdp); log_flush_wait(sdp); log_pull_tail(sdp); gfs2_log_update_head(sdp); } /** * gfs2_ail_drain - drain the ail 
lists after a withdraw * @sdp: Pointer to GFS2 superblock */ void gfs2_ail_drain(struct gfs2_sbd *sdp) { struct gfs2_trans *tr; spin_lock(&sdp->sd_ail_lock); /* * For transactions on the sd_ail1_list we need to drain both the * ail1 and ail2 lists. That's because function gfs2_ail1_start_one * (temporarily) moves items from its tr_ail1 list to tr_ail2 list * before revokes are sent for that block. Items on the sd_ail2_list * should have already gotten beyond that point, so no need. */ while (!list_empty(&sdp->sd_ail1_list)) { tr = list_first_entry(&sdp->sd_ail1_list, struct gfs2_trans, tr_list); gfs2_ail_empty_tr(sdp, tr, &tr->tr_ail1_list); gfs2_ail_empty_tr(sdp, tr, &tr->tr_ail2_list); list_del(&tr->tr_list); gfs2_trans_free(sdp, tr); } while (!list_empty(&sdp->sd_ail2_list)) { tr = list_first_entry(&sdp->sd_ail2_list, struct gfs2_trans, tr_list); gfs2_ail_empty_tr(sdp, tr, &tr->tr_ail2_list); list_del(&tr->tr_list); gfs2_trans_free(sdp, tr); } gfs2_drain_revokes(sdp); spin_unlock(&sdp->sd_ail_lock); } /** * empty_ail1_list - try to start IO and empty the ail1 list * @sdp: Pointer to GFS2 superblock */ static void empty_ail1_list(struct gfs2_sbd *sdp) { unsigned long start = jiffies; bool empty = false; while (!empty) { if (time_after(jiffies, start + (HZ * 600))) { fs_err(sdp, "Error: In %s for 10 minutes! t=%d\n", __func__, current->journal_info ? 
1 : 0); dump_ail_list(sdp); return; } gfs2_ail1_start(sdp); gfs2_ail1_wait(sdp); empty = gfs2_ail1_empty(sdp, 0); if (gfs2_withdrawn(sdp)) break; } } static void gfs2_trans_drain_list(struct gfs2_sbd *sdp, struct list_head *list) { struct gfs2_bufdata *bd; while (!list_empty(list)) { bd = list_first_entry(list, struct gfs2_bufdata, bd_list); struct buffer_head *bh = bd->bd_bh; WARN_ON_ONCE(!buffer_pinned(bh)); clear_buffer_pinned(bh); trace_gfs2_pin(bd, 0); atomic_dec(&sdp->sd_log_pinned); list_del_init(&bd->bd_list); brelse(bh); } } /** * gfs2_trans_drain - drain the buf and databuf queue for a failed transaction * @sdp: the filesystem * @tr: the transaction to drain * * When this is called, we're taking an error exit for a log write that failed * but since we bypassed the after_commit functions, we need to remove the * items from the buf and databuf queue. */ static void gfs2_trans_drain(struct gfs2_sbd *sdp, struct gfs2_trans *tr) { if (!tr) return; gfs2_trans_drain_list(sdp, &tr->tr_buf); gfs2_trans_drain_list(sdp, &tr->tr_databuf); } void gfs2_remove_from_journal(struct buffer_head *bh, int meta) { struct address_space *mapping = bh->b_folio->mapping; struct gfs2_sbd *sdp = gfs2_mapping2sbd(mapping); struct gfs2_bufdata *bd = bh->b_private; struct gfs2_trans *tr = current->journal_info; int was_pinned = 0; if (test_clear_buffer_pinned(bh)) { trace_gfs2_pin(bd, 0); atomic_dec(&sdp->sd_log_pinned); list_del_init(&bd->bd_list); if (tr) { if (meta == REMOVE_META) tr->tr_num_buf_rm++; else tr->tr_num_databuf_rm++; set_bit(TR_TOUCHED, &tr->tr_flags); } was_pinned = 1; brelse(bh); } if (bd) { if (bd->bd_tr) { if (tr) gfs2_trans_add_revoke(sdp, bd); else gfs2_remove_from_ail(bd); } else if (was_pinned) { bh->b_private = NULL; kmem_cache_free(gfs2_bufdata_cachep, bd); } else if (!list_empty(&bd->bd_ail_st_list) && !list_empty(&bd->bd_ail_gl_list)) { gfs2_remove_from_ail(bd); } } clear_buffer_dirty(bh); clear_buffer_uptodate(bh); } /** * __gfs2_log_flush - flush incore 
transaction(s)
 * @sdp: The filesystem
 * @gl: The glock structure to flush.  If NULL, flush the whole incore log
 * @flags: The log header flags: GFS2_LOG_HEAD_FLUSH_* and debug flags
 *
 * Called with sd_log_flush_lock held for writing.  The lock is dropped and
 * retaken while waiting for log space, after which all checks are redone.
 */
static void __gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl,
			     u32 flags)
{
	struct gfs2_trans *tr = NULL;
	unsigned int reserved_blocks = 0, used_blocks = 0;
	bool frozen = test_bit(SDF_FROZEN, &sdp->sd_flags);
	unsigned int first_log_head;
	unsigned int reserved_revokes = 0;

	trace_gfs2_log_flush(sdp, 1, flags);

repeat:
	/*
	 * Do this check while holding the log_flush_lock to prevent new
	 * buffers from being added to the ail via gfs2_pin()
	 */
	if (gfs2_withdrawn(sdp) || !test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags))
		goto out;

	/* Log might have been flushed while we waited for the flush lock */
	if (gl && !test_bit(GLF_LFLUSH, &gl->gl_flags))
		goto out;

	first_log_head = sdp->sd_log_head;
	sdp->sd_log_flush_head = first_log_head;

	tr = sdp->sd_log_tr;
	if (tr || sdp->sd_log_num_revoke) {
		/*
		 * There is something to write: give back anything reserved on
		 * an earlier pass and take over the reservation that was made
		 * for the pending transaction and revokes.
		 */
		if (reserved_blocks)
			gfs2_log_release(sdp, reserved_blocks);
		reserved_blocks = sdp->sd_log_blks_reserved;
		reserved_revokes = sdp->sd_log_num_revoke;
		if (tr) {
			sdp->sd_log_tr = NULL;
			tr->tr_first = first_log_head;
			if (unlikely(frozen)) {
				if (gfs2_assert_withdraw(sdp,
				       !tr->tr_num_buf_new && !tr->tr_num_databuf_new))
					goto out_withdraw;
			}
		}
	} else if (!reserved_blocks) {
		/* Nothing pending: reserve the minimum needed for a header. */
		unsigned int taboo_blocks = GFS2_LOG_FLUSH_MIN_BLOCKS;

		reserved_blocks = GFS2_LOG_FLUSH_MIN_BLOCKS;
		if (current == sdp->sd_logd_process)
			taboo_blocks = 0;

		if (!__gfs2_log_try_reserve(sdp, reserved_blocks, taboo_blocks)) {
			/* Wait for space without the flush lock, then recheck. */
			up_write(&sdp->sd_log_flush_lock);
			__gfs2_log_reserve(sdp, reserved_blocks, taboo_blocks);
			down_write(&sdp->sd_log_flush_lock);
			goto repeat;
		}
		BUG_ON(sdp->sd_log_num_revoke);
	}

	if (flags & GFS2_LOG_HEAD_FLUSH_SHUTDOWN)
		clear_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags);

	if (unlikely(frozen))
		if (gfs2_assert_withdraw(sdp, !reserved_revokes))
			goto out_withdraw;

	/* Every step below may withdraw the filesystem; bail out if so. */
	gfs2_ordered_write(sdp);
	if (gfs2_withdrawn(sdp))
		goto out_withdraw;
	lops_before_commit(sdp, tr);
	if (gfs2_withdrawn(sdp))
		goto out_withdraw;
	if (sdp->sd_jdesc)
		gfs2_log_submit_write(&sdp->sd_jdesc->jd_log_bio);
	if (gfs2_withdrawn(sdp))
		goto out_withdraw;

	/*
	 * Write a header if log blocks were written, or if only the tail
	 * moved and the log is not idle.
	 */
	if (sdp->sd_log_head != sdp->sd_log_flush_head) {
		log_write_header(sdp, flags);
	} else if (sdp->sd_log_tail != sdp->sd_log_flush_tail && !sdp->sd_log_idle) {
		log_write_header(sdp, flags);
	}
	if (gfs2_withdrawn(sdp))
		goto out_withdraw;
	lops_after_commit(sdp, tr);

	spin_lock(&sdp->sd_log_lock);
	sdp->sd_log_blks_reserved = 0;

	spin_lock(&sdp->sd_ail_lock);
	if (tr && !list_empty(&tr->tr_ail1_list)) {
		/* Ownership of tr passes to the ail1 list; don't free it below. */
		list_add(&tr->tr_list, &sdp->sd_ail1_list);
		tr = NULL;
	}
	spin_unlock(&sdp->sd_ail_lock);
	spin_unlock(&sdp->sd_log_lock);

	if (!(flags & GFS2_LOG_HEAD_FLUSH_NORMAL)) {
		if (!sdp->sd_log_idle) {
			empty_ail1_list(sdp);
			if (gfs2_withdrawn(sdp))
				goto out_withdraw;
			log_write_header(sdp, flags);
		}
		if (flags & (GFS2_LOG_HEAD_FLUSH_SHUTDOWN |
			     GFS2_LOG_HEAD_FLUSH_FREEZE))
			gfs2_log_shutdown(sdp);
	}

out_end:
	/*
	 * Work out how many blocks were actually used and convert the unused
	 * revoke reservation back into whole blocks.
	 */
	used_blocks = log_distance(sdp, sdp->sd_log_flush_head, first_log_head);
	reserved_revokes += atomic_read(&sdp->sd_log_revokes_available);
	atomic_set(&sdp->sd_log_revokes_available, sdp->sd_ldptrs);
	gfs2_assert_withdraw(sdp, reserved_revokes % sdp->sd_inptrs == sdp->sd_ldptrs);
	if (reserved_revokes > sdp->sd_ldptrs)
		reserved_blocks += (reserved_revokes - sdp->sd_ldptrs) / sdp->sd_inptrs;
out:
	/* Return whatever part of the reservation was not consumed. */
	if (used_blocks != reserved_blocks) {
		gfs2_assert_withdraw(sdp, used_blocks < reserved_blocks);
		gfs2_log_release(sdp, reserved_blocks - used_blocks);
	}
	gfs2_trans_free(sdp, tr);
	trace_gfs2_log_flush(sdp, 0, flags);
	return;

out_withdraw:
	/* Fail any journal bio that was built but not yet submitted. */
	if (sdp->sd_jdesc->jd_log_bio) {
		bio_io_error(sdp->sd_jdesc->jd_log_bio);
		sdp->sd_jdesc->jd_log_bio = NULL;
	}
	gfs2_trans_drain(sdp, tr);
	/*
	 * If the tr_list is empty, we're withdrawing during a log
	 * flush that targets a transaction, but the transaction was
	 * never queued onto any of the ail lists. Here we add it to
	 * ail1 just so that ail_drain() will find and free it.
	 */
	spin_lock(&sdp->sd_ail_lock);
	if (tr && list_empty(&tr->tr_list))
		list_add(&tr->tr_list, &sdp->sd_ail1_list);
	spin_unlock(&sdp->sd_ail_lock);
	tr = NULL;
	goto out_end;
}

/*
 * gfs2_log_flush - take sd_log_flush_lock for writing and flush the log.
 * See __gfs2_log_flush() for the meaning of @gl and @flags.
 */
void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl, u32 flags)
{
	down_write(&sdp->sd_log_flush_lock);
	__gfs2_log_flush(sdp, gl, flags);
	up_write(&sdp->sd_log_flush_lock);
}

/**
 * gfs2_merge_trans - Merge a new transaction into a cached transaction
 * @sdp: the filesystem
 * @new: New transaction to be merged
 *
 * Adds the counters of @new to the transaction in sdp->sd_log_tr and moves
 * its buffer and AIL lists onto the tail of the cached transaction's lists.
 * The caller must ensure sdp->sd_log_tr is not NULL.
 */
static void gfs2_merge_trans(struct gfs2_sbd *sdp, struct gfs2_trans *new)
{
	struct gfs2_trans *old = sdp->sd_log_tr;

	WARN_ON_ONCE(!test_bit(TR_ATTACHED, &old->tr_flags));

	old->tr_num_buf_new	+= new->tr_num_buf_new;
	old->tr_num_databuf_new	+= new->tr_num_databuf_new;
	old->tr_num_buf_rm	+= new->tr_num_buf_rm;
	old->tr_num_databuf_rm	+= new->tr_num_databuf_rm;
	old->tr_revokes		+= new->tr_revokes;
	old->tr_num_revoke	+= new->tr_num_revoke;

	list_splice_tail_init(&new->tr_databuf, &old->tr_databuf);
	list_splice_tail_init(&new->tr_buf, &old->tr_buf);

	/* The AIL lists are only touched under sd_ail_lock. */
	spin_lock(&sdp->sd_ail_lock);
	list_splice_tail_init(&new->tr_ail1_list, &old->tr_ail1_list);
	list_splice_tail_init(&new->tr_ail2_list, &old->tr_ail2_list);
	spin_unlock(&sdp->sd_ail_lock);
}

/*
 * log_refund - fold @tr into the cached transaction and return log blocks
 * that were reserved but turned out not to be needed.
 *
 * Under sd_log_lock: @tr is merged into sd_log_tr if one exists, otherwise it
 * becomes the cached transaction when it added any buffers.  The reservation
 * is then recomputed with calc_reserved() and the difference to what was held
 * (sd_log_blks_reserved + tr->tr_reserved) is released.
 */
static void log_refund(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
{
	unsigned int reserved;
	unsigned int unused;
	unsigned int maxres;

	spin_lock(&sdp->sd_log_lock);

	if (sdp->sd_log_tr) {
		gfs2_merge_trans(sdp, tr);
	} else if (tr->tr_num_buf_new || tr->tr_num_databuf_new) {
		/* An on-stack transaction must not outlive its caller. */
		gfs2_assert_withdraw(sdp, !test_bit(TR_ONSTACK, &tr->tr_flags));
		sdp->sd_log_tr = tr;
		set_bit(TR_ATTACHED, &tr->tr_flags);
	}

	reserved = calc_reserved(sdp);
	maxres = sdp->sd_log_blks_reserved + tr->tr_reserved;
	gfs2_assert_withdraw(sdp, maxres >= reserved);
	unused = maxres - reserved;
	if (unused)
		gfs2_log_release(sdp, unused);
	sdp->sd_log_blks_reserved = reserved;

	spin_unlock(&sdp->sd_log_lock);
}

/* True when pinned blocks plus blocks needed have reached sd_log_thresh1. */
static inline int gfs2_jrnl_flush_reqd(struct gfs2_sbd *sdp)
{
	return atomic_read(&sdp->sd_log_pinned) +
	       atomic_read(&sdp->sd_log_blks_needed) >=
	       atomic_read(&sdp->sd_log_thresh1);
}

/*
 * True when the blocks in use (journal size minus free blocks) plus blocks
 * needed have reached sd_log_thresh2.
 */
static inline int gfs2_ail_flush_reqd(struct gfs2_sbd *sdp)
{
	return sdp->sd_jdesc->jd_blocks -
	       atomic_read(&sdp->sd_log_blks_free) +
	       atomic_read(&sdp->sd_log_blks_needed) >=
	       atomic_read(&sdp->sd_log_thresh2);
}

/**
 * gfs2_log_commit - Commit a transaction to the log
 * @sdp: the filesystem
 * @tr: the transaction
 *
 * We wake up gfs2_logd if the number of pinned blocks exceed thresh1
 * or the total number of used blocks (pinned blocks plus AIL blocks)
 * is greater than thresh2.
 *
 * At mount time thresh1 is 2/5ths of journal size, thresh2 is 4/5ths of
 * journal size.
 */
void gfs2_log_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
{
	log_refund(sdp, tr);

	if (gfs2_ail_flush_reqd(sdp) || gfs2_jrnl_flush_reqd(sdp))
		wake_up(&sdp->sd_logd_waitq);
}

/**
 * gfs2_log_shutdown - write a shutdown header into a journal
 * @sdp: the filesystem
 *
 * Expects nothing to be reserved, no pending revokes and an empty ail1
 * list; warns afterwards if head and tail differ or ail2 is not empty.
 */
static void gfs2_log_shutdown(struct gfs2_sbd *sdp)
{
	gfs2_assert_withdraw(sdp, !sdp->sd_log_blks_reserved);
	gfs2_assert_withdraw(sdp, !sdp->sd_log_num_revoke);
	gfs2_assert_withdraw(sdp, list_empty(&sdp->sd_ail1_list));

	log_write_header(sdp, GFS2_LOG_HEAD_UNMOUNT | GFS2_LFC_SHUTDOWN);
	log_pull_tail(sdp);

	gfs2_assert_warn(sdp, sdp->sd_log_head == sdp->sd_log_tail);
	gfs2_assert_warn(sdp, list_empty(&sdp->sd_ail2_list));
}

/**
 * gfs2_logd - Update log tail as Active Items get flushed to in-place blocks
 * @data: Pointer to GFS2 superblock
 *
 * Also, periodically check to make sure that we're using the most recent
 * journal index.
*/
int gfs2_logd(void *data)
{
	struct gfs2_sbd *sdp = data;
	/* Jiffies left from the last wait; zero means the wait timed out. */
	unsigned long timeo = 1;

	set_freezable();

	/* Run until asked to stop or until the filesystem is withdrawn. */
	while (!kthread_should_stop() && !gfs2_withdrawn(sdp)) {
		/* Journal flush: pinned-block threshold hit, or timer expired. */
		if (gfs2_jrnl_flush_reqd(sdp) || !timeo) {
			down_write(&sdp->sd_log_flush_lock);
			gfs2_ail1_empty(sdp, 0);
			__gfs2_log_flush(sdp, NULL,
					 GFS2_LOG_HEAD_FLUSH_NORMAL |
					 GFS2_LFC_LOGD_JFLUSH_REQD);
			up_write(&sdp->sd_log_flush_lock);
		}

		/* AIL flush: explicitly forced, or used-block threshold hit. */
		if (test_bit(SDF_FORCE_AIL_FLUSH, &sdp->sd_flags) ||
		    gfs2_ail_flush_reqd(sdp)) {
			clear_bit(SDF_FORCE_AIL_FLUSH, &sdp->sd_flags);
			down_write(&sdp->sd_log_flush_lock);
			gfs2_ail1_start(sdp);
			gfs2_ail1_wait(sdp);
			gfs2_ail1_empty(sdp, 0);
			__gfs2_log_flush(sdp, NULL,
					 GFS2_LOG_HEAD_FLUSH_NORMAL |
					 GFS2_LFC_LOGD_AIL_FLUSH_REQD);
			up_write(&sdp->sd_log_flush_lock);
		}

		/* Sleep for gt_logd_secs unless there is work to do sooner. */
		timeo = gfs2_tune_get(sdp, gt_logd_secs) * HZ;
		timeo = wait_event_freezable_timeout(sdp->sd_logd_waitq,
				test_bit(SDF_FORCE_AIL_FLUSH, &sdp->sd_flags) ||
				gfs2_ail_flush_reqd(sdp) ||
				gfs2_jrnl_flush_reqd(sdp) ||
				gfs2_withdrawn(sdp) ||
				kthread_should_stop(),
				timeo);
	}

	return 0;
}
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 
528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 /* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ /* * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved. * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved. 
*/

#ifndef RXE_HDR_H
#define RXE_HDR_H

/*
 * Information extracted from a packet. It is stored in the control block
 * (cb array) of the sk_buff for received packets, so it must fit there:
 * at most 48 bytes. SKB_TO_PKT() checks this at build time.
 */
struct rxe_pkt_info {
	struct rxe_dev		*rxe;		/* device that owns packet */
	struct rxe_qp		*qp;		/* qp that owns packet */
	struct rxe_send_wqe	*wqe;		/* send wqe */
	u8			*hdr;		/* points to bth */
	u32			mask;		/* useful info about pkt */
	u32			psn;		/* bth psn of packet */
	u16			pkey_index;	/* partition of pkt */
	u16			paylen;		/* length of bth - icrc */
	u8			port_num;	/* port pkt received on */
	u8			opcode;		/* bth opcode of packet */
};

/*
 * Conversions between an skb and the packet info kept in its control block.
 * These should be used only for received skbs.
 */

/* Return the packet info stored in @skb's control block. */
static inline struct rxe_pkt_info *SKB_TO_PKT(struct sk_buff *skb)
{
	/* Fail the build if the packet info outgrows skb->cb. */
	BUILD_BUG_ON(sizeof(struct rxe_pkt_info) > sizeof(skb->cb));
	return (void *)skb->cb;
}

/* Return the skb whose control block holds @pkt. */
static inline struct sk_buff *PKT_TO_SKB(struct rxe_pkt_info *pkt)
{
	return container_of((void *)pkt, struct sk_buff, cb);
}

/*
 * IBA header types and methods
 *
 * Some of these are for reference and completeness only since
 * rxe does not currently support RD transport
 * most of this could be moved into IB core. ib_pack.h has
 * part of this but is incomplete
 *
 * Header specific routines to insert/extract values to/from headers
 * the routines that are named __hhh_(set_)fff() take a pointer to a
 * hhh header and get(set) the fff field. The routines named
 * hhh_(set_)fff take a packet info struct and find the
 * header and field based on the opcode in the packet.
 * Conversion to/from network byte order from cpu order is also done.
*/ #define RXE_ICRC_SIZE (4) #define RXE_MAX_HDR_LENGTH (80) /****************************************************************************** * Base Transport Header ******************************************************************************/ struct rxe_bth { u8 opcode; u8 flags; __be16 pkey; __be32 qpn; __be32 apsn; }; #define BTH_TVER (0) #define BTH_DEF_PKEY (0xffff) #define BTH_SE_MASK (0x80) #define BTH_MIG_MASK (0x40) #define BTH_PAD_MASK (0x30) #define BTH_TVER_MASK (0x0f) #define BTH_FECN_MASK (0x80000000) #define BTH_BECN_MASK (0x40000000) #define BTH_RESV6A_MASK (0x3f000000) #define BTH_QPN_MASK (0x00ffffff) #define BTH_ACK_MASK (0x80000000) #define BTH_RESV7_MASK (0x7f000000) #define BTH_PSN_MASK (0x00ffffff) static inline u8 __bth_opcode(void *arg) { struct rxe_bth *bth = arg; return bth->opcode; } static inline void __bth_set_opcode(void *arg, u8 opcode) { struct rxe_bth *bth = arg; bth->opcode = opcode; } static inline u8 __bth_se(void *arg) { struct rxe_bth *bth = arg; return 0 != (BTH_SE_MASK & bth->flags); } static inline void __bth_set_se(void *arg, int se) { struct rxe_bth *bth = arg; if (se) bth->flags |= BTH_SE_MASK; else bth->flags &= ~BTH_SE_MASK; } static inline u8 __bth_mig(void *arg) { struct rxe_bth *bth = arg; return 0 != (BTH_MIG_MASK & bth->flags); } static inline void __bth_set_mig(void *arg, u8 mig) { struct rxe_bth *bth = arg; if (mig) bth->flags |= BTH_MIG_MASK; else bth->flags &= ~BTH_MIG_MASK; } static inline u8 __bth_pad(void *arg) { struct rxe_bth *bth = arg; return (BTH_PAD_MASK & bth->flags) >> 4; } static inline void __bth_set_pad(void *arg, u8 pad) { struct rxe_bth *bth = arg; bth->flags = (BTH_PAD_MASK & (pad << 4)) | (~BTH_PAD_MASK & bth->flags); } static inline u8 __bth_tver(void *arg) { struct rxe_bth *bth = arg; return BTH_TVER_MASK & bth->flags; } static inline void __bth_set_tver(void *arg, u8 tver) { struct rxe_bth *bth = arg; bth->flags = (BTH_TVER_MASK & tver) | (~BTH_TVER_MASK & bth->flags); } static inline u16 
__bth_pkey(void *arg) { struct rxe_bth *bth = arg; return be16_to_cpu(bth->pkey); } static inline void __bth_set_pkey(void *arg, u16 pkey) { struct rxe_bth *bth = arg; bth->pkey = cpu_to_be16(pkey); } static inline u32 __bth_qpn(void *arg) { struct rxe_bth *bth = arg; return BTH_QPN_MASK & be32_to_cpu(bth->qpn); } static inline void __bth_set_qpn(void *arg, u32 qpn) { struct rxe_bth *bth = arg; u32 resvqpn = be32_to_cpu(bth->qpn); bth->qpn = cpu_to_be32((BTH_QPN_MASK & qpn) | (~BTH_QPN_MASK & resvqpn)); } static inline int __bth_fecn(void *arg) { struct rxe_bth *bth = arg; return 0 != (cpu_to_be32(BTH_FECN_MASK) & bth->qpn); } static inline void __bth_set_fecn(void *arg, int fecn) { struct rxe_bth *bth = arg; if (fecn) bth->qpn |= cpu_to_be32(BTH_FECN_MASK); else bth->qpn &= ~cpu_to_be32(BTH_FECN_MASK); } static inline int __bth_becn(void *arg) { struct rxe_bth *bth = arg; return 0 != (cpu_to_be32(BTH_BECN_MASK) & bth->qpn); } static inline void __bth_set_becn(void *arg, int becn) { struct rxe_bth *bth = arg; if (becn) bth->qpn |= cpu_to_be32(BTH_BECN_MASK); else bth->qpn &= ~cpu_to_be32(BTH_BECN_MASK); } static inline u8 __bth_resv6a(void *arg) { struct rxe_bth *bth = arg; return (BTH_RESV6A_MASK & be32_to_cpu(bth->qpn)) >> 24; } static inline void __bth_set_resv6a(void *arg) { struct rxe_bth *bth = arg; bth->qpn &= cpu_to_be32(~BTH_RESV6A_MASK); } static inline int __bth_ack(void *arg) { struct rxe_bth *bth = arg; return 0 != (cpu_to_be32(BTH_ACK_MASK) & bth->apsn); } static inline void __bth_set_ack(void *arg, int ack) { struct rxe_bth *bth = arg; if (ack) bth->apsn |= cpu_to_be32(BTH_ACK_MASK); else bth->apsn &= ~cpu_to_be32(BTH_ACK_MASK); } static inline void __bth_set_resv7(void *arg) { struct rxe_bth *bth = arg; bth->apsn &= ~cpu_to_be32(BTH_RESV7_MASK); } static inline u32 __bth_psn(void *arg) { struct rxe_bth *bth = arg; return BTH_PSN_MASK & be32_to_cpu(bth->apsn); } static inline void __bth_set_psn(void *arg, u32 psn) { struct rxe_bth *bth = arg; u32 
apsn = be32_to_cpu(bth->apsn); bth->apsn = cpu_to_be32((BTH_PSN_MASK & psn) | (~BTH_PSN_MASK & apsn)); } static inline u8 bth_opcode(struct rxe_pkt_info *pkt) { return __bth_opcode(pkt->hdr); } static inline void bth_set_opcode(struct rxe_pkt_info *pkt, u8 opcode) { __bth_set_opcode(pkt->hdr, opcode); } static inline u8 bth_se(struct rxe_pkt_info *pkt) { return __bth_se(pkt->hdr); } static inline void bth_set_se(struct rxe_pkt_info *pkt, int se) { __bth_set_se(pkt->hdr, se); } static inline u8 bth_mig(struct rxe_pkt_info *pkt) { return __bth_mig(pkt->hdr); } static inline void bth_set_mig(struct rxe_pkt_info *pkt, u8 mig) { __bth_set_mig(pkt->hdr, mig); } static inline u8 bth_pad(struct rxe_pkt_info *pkt) { return __bth_pad(pkt->hdr); } static inline void bth_set_pad(struct rxe_pkt_info *pkt, u8 pad) { __bth_set_pad(pkt->hdr, pad); } static inline u8 bth_tver(struct rxe_pkt_info *pkt) { return __bth_tver(pkt->hdr); } static inline void bth_set_tver(struct rxe_pkt_info *pkt, u8 tver) { __bth_set_tver(pkt->hdr, tver); } static inline u16 bth_pkey(struct rxe_pkt_info *pkt) { return __bth_pkey(pkt->hdr); } static inline void bth_set_pkey(struct rxe_pkt_info *pkt, u16 pkey) { __bth_set_pkey(pkt->hdr, pkey); } static inline u32 bth_qpn(struct rxe_pkt_info *pkt) { return __bth_qpn(pkt->hdr); } static inline void bth_set_qpn(struct rxe_pkt_info *pkt, u32 qpn) { __bth_set_qpn(pkt->hdr, qpn); } static inline int bth_fecn(struct rxe_pkt_info *pkt) { return __bth_fecn(pkt->hdr); } static inline void bth_set_fecn(struct rxe_pkt_info *pkt, int fecn) { __bth_set_fecn(pkt->hdr, fecn); } static inline int bth_becn(struct rxe_pkt_info *pkt) { return __bth_becn(pkt->hdr); } static inline void bth_set_becn(struct rxe_pkt_info *pkt, int becn) { __bth_set_becn(pkt->hdr, becn); } static inline u8 bth_resv6a(struct rxe_pkt_info *pkt) { return __bth_resv6a(pkt->hdr); } static inline void bth_set_resv6a(struct rxe_pkt_info *pkt) { __bth_set_resv6a(pkt->hdr); } static inline int 
bth_ack(struct rxe_pkt_info *pkt) { return __bth_ack(pkt->hdr); } static inline void bth_set_ack(struct rxe_pkt_info *pkt, int ack) { __bth_set_ack(pkt->hdr, ack); } static inline void bth_set_resv7(struct rxe_pkt_info *pkt) { __bth_set_resv7(pkt->hdr); } static inline u32 bth_psn(struct rxe_pkt_info *pkt) { return __bth_psn(pkt->hdr); } static inline void bth_set_psn(struct rxe_pkt_info *pkt, u32 psn) { __bth_set_psn(pkt->hdr, psn); } static inline void bth_init(struct rxe_pkt_info *pkt, u8 opcode, int se, int mig, int pad, u16 pkey, u32 qpn, int ack_req, u32 psn) { struct rxe_bth *bth = (struct rxe_bth *)(pkt->hdr); bth->opcode = opcode; bth->flags = (pad << 4) & BTH_PAD_MASK; if (se) bth->flags |= BTH_SE_MASK; if (mig) bth->flags |= BTH_MIG_MASK; bth->pkey = cpu_to_be16(pkey); bth->qpn = cpu_to_be32(qpn & BTH_QPN_MASK); psn &= BTH_PSN_MASK; if (ack_req) psn |= BTH_ACK_MASK; bth->apsn = cpu_to_be32(psn); } /****************************************************************************** * Reliable Datagram Extended Transport Header ******************************************************************************/ struct rxe_rdeth { __be32 een; }; #define RDETH_EEN_MASK (0x00ffffff) static inline u8 __rdeth_een(void *arg) { struct rxe_rdeth *rdeth = arg; return RDETH_EEN_MASK & be32_to_cpu(rdeth->een); } static inline void __rdeth_set_een(void *arg, u32 een) { struct rxe_rdeth *rdeth = arg; rdeth->een = cpu_to_be32(RDETH_EEN_MASK & een); } static inline u8 rdeth_een(struct rxe_pkt_info *pkt) { return __rdeth_een(pkt->hdr + rxe_opcode[pkt->opcode].offset[RXE_RDETH]); } static inline void rdeth_set_een(struct rxe_pkt_info *pkt, u32 een) { __rdeth_set_een(pkt->hdr + rxe_opcode[pkt->opcode].offset[RXE_RDETH], een); } /****************************************************************************** * Datagram Extended Transport Header ******************************************************************************/ struct rxe_deth { __be32 qkey; __be32 sqp; }; #define GSI_QKEY 
(0x80010000) #define DETH_SQP_MASK (0x00ffffff) static inline u32 __deth_qkey(void *arg) { struct rxe_deth *deth = arg; return be32_to_cpu(deth->qkey); } static inline void __deth_set_qkey(void *arg, u32 qkey) { struct rxe_deth *deth = arg; deth->qkey = cpu_to_be32(qkey); } static inline u32 __deth_sqp(void *arg) { struct rxe_deth *deth = arg; return DETH_SQP_MASK & be32_to_cpu(deth->sqp); } static inline void __deth_set_sqp(void *arg, u32 sqp) { struct rxe_deth *deth = arg; deth->sqp = cpu_to_be32(DETH_SQP_MASK & sqp); } static inline u32 deth_qkey(struct rxe_pkt_info *pkt) { return __deth_qkey(pkt->hdr + rxe_opcode[pkt->opcode].offset[RXE_DETH]); } static inline void deth_set_qkey(struct rxe_pkt_info *pkt, u32 qkey) { __deth_set_qkey(pkt->hdr + rxe_opcode[pkt->opcode].offset[RXE_DETH], qkey); } static inline u32 deth_sqp(struct rxe_pkt_info *pkt) { return __deth_sqp(pkt->hdr + rxe_opcode[pkt->opcode].offset[RXE_DETH]); } static inline void deth_set_sqp(struct rxe_pkt_info *pkt, u32 sqp) { __deth_set_sqp(pkt->hdr + rxe_opcode[pkt->opcode].offset[RXE_DETH], sqp); } /****************************************************************************** * RDMA Extended Transport Header ******************************************************************************/ struct rxe_reth { __be64 va; __be32 rkey; __be32 len; }; static inline u64 __reth_va(void *arg) { struct rxe_reth *reth = arg; return be64_to_cpu(reth->va); } static inline void __reth_set_va(void *arg, u64 va) { struct rxe_reth *reth = arg; reth->va = cpu_to_be64(va); } static inline u32 __reth_rkey(void *arg) { struct rxe_reth *reth = arg; return be32_to_cpu(reth->rkey); } static inline void __reth_set_rkey(void *arg, u32 rkey) { struct rxe_reth *reth = arg; reth->rkey = cpu_to_be32(rkey); } static inline u32 __reth_len(void *arg) { struct rxe_reth *reth = arg; return be32_to_cpu(reth->len); } static inline void __reth_set_len(void *arg, u32 len) { struct rxe_reth *reth = arg; reth->len = cpu_to_be32(len); } 
static inline u64 reth_va(struct rxe_pkt_info *pkt) { return __reth_va(pkt->hdr + rxe_opcode[pkt->opcode].offset[RXE_RETH]); } static inline void reth_set_va(struct rxe_pkt_info *pkt, u64 va) { __reth_set_va(pkt->hdr + rxe_opcode[pkt->opcode].offset[RXE_RETH], va); } static inline u32 reth_rkey(struct rxe_pkt_info *pkt) { return __reth_rkey(pkt->hdr + rxe_opcode[pkt->opcode].offset[RXE_RETH]); } static inline void reth_set_rkey(struct rxe_pkt_info *pkt, u32 rkey) { __reth_set_rkey(pkt->hdr + rxe_opcode[pkt->opcode].offset[RXE_RETH], rkey); } static inline u32 reth_len(struct rxe_pkt_info *pkt) { return __reth_len(pkt->hdr + rxe_opcode[pkt->opcode].offset[RXE_RETH]); } static inline void reth_set_len(struct rxe_pkt_info *pkt, u32 len) { __reth_set_len(pkt->hdr + rxe_opcode[pkt->opcode].offset[RXE_RETH], len); } /****************************************************************************** * FLUSH Extended Transport Header ******************************************************************************/ struct rxe_feth { __be32 bits; }; #define FETH_PLT_MASK (0x0000000f) /* bits 3-0 */ #define FETH_SEL_MASK (0x00000030) /* bits 5-4 */ #define FETH_SEL_SHIFT (4U) static inline u32 __feth_plt(void *arg) { struct rxe_feth *feth = arg; return be32_to_cpu(feth->bits) & FETH_PLT_MASK; } static inline u32 __feth_sel(void *arg) { struct rxe_feth *feth = arg; return (be32_to_cpu(feth->bits) & FETH_SEL_MASK) >> FETH_SEL_SHIFT; } static inline u32 feth_plt(struct rxe_pkt_info *pkt) { return __feth_plt(pkt->hdr + rxe_opcode[pkt->opcode].offset[RXE_FETH]); } static inline u32 feth_sel(struct rxe_pkt_info *pkt) { return __feth_sel(pkt->hdr + rxe_opcode[pkt->opcode].offset[RXE_FETH]); } static inline void feth_init(struct rxe_pkt_info *pkt, u8 type, u8 level) { struct rxe_feth *feth = (struct rxe_feth *) (pkt->hdr + rxe_opcode[pkt->opcode].offset[RXE_FETH]); u32 bits = ((level << FETH_SEL_SHIFT) & FETH_SEL_MASK) | (type & FETH_PLT_MASK); feth->bits = cpu_to_be32(bits); } 
/****************************************************************************** * Atomic Extended Transport Header ******************************************************************************/ struct rxe_atmeth { __be64 va; __be32 rkey; __be64 swap_add; __be64 comp; } __packed; static inline u64 __atmeth_va(void *arg) { struct rxe_atmeth *atmeth = arg; return be64_to_cpu(atmeth->va); } static inline void __atmeth_set_va(void *arg, u64 va) { struct rxe_atmeth *atmeth = arg; atmeth->va = cpu_to_be64(va); } static inline u32 __atmeth_rkey(void *arg) { struct rxe_atmeth *atmeth = arg; return be32_to_cpu(atmeth->rkey); } static inline void __atmeth_set_rkey(void *arg, u32 rkey) { struct rxe_atmeth *atmeth = arg; atmeth->rkey = cpu_to_be32(rkey); } static inline u64 __atmeth_swap_add(void *arg) { struct rxe_atmeth *atmeth = arg; return be64_to_cpu(atmeth->swap_add); } static inline void __atmeth_set_swap_add(void *arg, u64 swap_add) { struct rxe_atmeth *atmeth = arg; atmeth->swap_add = cpu_to_be64(swap_add); } static inline u64 __atmeth_comp(void *arg) { struct rxe_atmeth *atmeth = arg; return be64_to_cpu(atmeth->comp); } static inline void __atmeth_set_comp(void *arg, u64 comp) { struct rxe_atmeth *atmeth = arg; atmeth->comp = cpu_to_be64(comp); } static inline u64 atmeth_va(struct rxe_pkt_info *pkt) { return __atmeth_va(pkt->hdr + rxe_opcode[pkt->opcode].offset[RXE_ATMETH]); } static inline void atmeth_set_va(struct rxe_pkt_info *pkt, u64 va) { __atmeth_set_va(pkt->hdr + rxe_opcode[pkt->opcode].offset[RXE_ATMETH], va); } static inline u32 atmeth_rkey(struct rxe_pkt_info *pkt) { return __atmeth_rkey(pkt->hdr + rxe_opcode[pkt->opcode].offset[RXE_ATMETH]); } static inline void atmeth_set_rkey(struct rxe_pkt_info *pkt, u32 rkey) { __atmeth_set_rkey(pkt->hdr + rxe_opcode[pkt->opcode].offset[RXE_ATMETH], rkey); } static inline u64 atmeth_swap_add(struct rxe_pkt_info *pkt) { return __atmeth_swap_add(pkt->hdr + rxe_opcode[pkt->opcode].offset[RXE_ATMETH]); } static inline 
void atmeth_set_swap_add(struct rxe_pkt_info *pkt, u64 swap_add) { __atmeth_set_swap_add(pkt->hdr + rxe_opcode[pkt->opcode].offset[RXE_ATMETH], swap_add); } static inline u64 atmeth_comp(struct rxe_pkt_info *pkt) { return __atmeth_comp(pkt->hdr + rxe_opcode[pkt->opcode].offset[RXE_ATMETH]); } static inline void atmeth_set_comp(struct rxe_pkt_info *pkt, u64 comp) { __atmeth_set_comp(pkt->hdr + rxe_opcode[pkt->opcode].offset[RXE_ATMETH], comp); } /****************************************************************************** * Ack Extended Transport Header ******************************************************************************/ struct rxe_aeth { __be32 smsn; }; #define AETH_SYN_MASK (0xff000000) #define AETH_MSN_MASK (0x00ffffff) enum aeth_syndrome { AETH_TYPE_MASK = 0xe0, AETH_ACK = 0x00, AETH_RNR_NAK = 0x20, AETH_RSVD = 0x40, AETH_NAK = 0x60, AETH_ACK_UNLIMITED = 0x1f, AETH_NAK_PSN_SEQ_ERROR = 0x60, AETH_NAK_INVALID_REQ = 0x61, AETH_NAK_REM_ACC_ERR = 0x62, AETH_NAK_REM_OP_ERR = 0x63, }; static inline u8 __aeth_syn(void *arg) { struct rxe_aeth *aeth = arg; return (AETH_SYN_MASK & be32_to_cpu(aeth->smsn)) >> 24; } static inline void __aeth_set_syn(void *arg, u8 syn) { struct rxe_aeth *aeth = arg; u32 smsn = be32_to_cpu(aeth->smsn); aeth->smsn = cpu_to_be32((AETH_SYN_MASK & (syn << 24)) | (~AETH_SYN_MASK & smsn)); } static inline u32 __aeth_msn(void *arg) { struct rxe_aeth *aeth = arg; return AETH_MSN_MASK & be32_to_cpu(aeth->smsn); } static inline void __aeth_set_msn(void *arg, u32 msn) { struct rxe_aeth *aeth = arg; u32 smsn = be32_to_cpu(aeth->smsn); aeth->smsn = cpu_to_be32((AETH_MSN_MASK & msn) | (~AETH_MSN_MASK & smsn)); } static inline u8 aeth_syn(struct rxe_pkt_info *pkt) { return __aeth_syn(pkt->hdr + rxe_opcode[pkt->opcode].offset[RXE_AETH]); } static inline void aeth_set_syn(struct rxe_pkt_info *pkt, u8 syn) { __aeth_set_syn(pkt->hdr + rxe_opcode[pkt->opcode].offset[RXE_AETH], syn); } static inline u32 aeth_msn(struct rxe_pkt_info *pkt) { return 
__aeth_msn(pkt->hdr + rxe_opcode[pkt->opcode].offset[RXE_AETH]); } static inline void aeth_set_msn(struct rxe_pkt_info *pkt, u32 msn) { __aeth_set_msn(pkt->hdr + rxe_opcode[pkt->opcode].offset[RXE_AETH], msn); } /****************************************************************************** * Atomic Ack Extended Transport Header ******************************************************************************/ struct rxe_atmack { __be64 orig; }; static inline u64 __atmack_orig(void *arg) { struct rxe_atmack *atmack = arg; return be64_to_cpu(atmack->orig); } static inline void __atmack_set_orig(void *arg, u64 orig) { struct rxe_atmack *atmack = arg; atmack->orig = cpu_to_be64(orig); } static inline u64 atmack_orig(struct rxe_pkt_info *pkt) { return __atmack_orig(pkt->hdr + rxe_opcode[pkt->opcode].offset[RXE_ATMACK]); } static inline void atmack_set_orig(struct rxe_pkt_info *pkt, u64 orig) { __atmack_set_orig(pkt->hdr + rxe_opcode[pkt->opcode].offset[RXE_ATMACK], orig); } /****************************************************************************** * Immediate Extended Transport Header ******************************************************************************/ struct rxe_immdt { __be32 imm; }; static inline __be32 __immdt_imm(void *arg) { struct rxe_immdt *immdt = arg; return immdt->imm; } static inline void __immdt_set_imm(void *arg, __be32 imm) { struct rxe_immdt *immdt = arg; immdt->imm = imm; } static inline __be32 immdt_imm(struct rxe_pkt_info *pkt) { return __immdt_imm(pkt->hdr + rxe_opcode[pkt->opcode].offset[RXE_IMMDT]); } static inline void immdt_set_imm(struct rxe_pkt_info *pkt, __be32 imm) { __immdt_set_imm(pkt->hdr + rxe_opcode[pkt->opcode].offset[RXE_IMMDT], imm); } /****************************************************************************** * Invalidate Extended Transport Header ******************************************************************************/ struct rxe_ieth { __be32 rkey; }; static inline u32 __ieth_rkey(void *arg) { struct 
rxe_ieth *ieth = arg; return be32_to_cpu(ieth->rkey); } static inline void __ieth_set_rkey(void *arg, u32 rkey) { struct rxe_ieth *ieth = arg; ieth->rkey = cpu_to_be32(rkey); } static inline u32 ieth_rkey(struct rxe_pkt_info *pkt) { return __ieth_rkey(pkt->hdr + rxe_opcode[pkt->opcode].offset[RXE_IETH]); } static inline void ieth_set_rkey(struct rxe_pkt_info *pkt, u32 rkey) { __ieth_set_rkey(pkt->hdr + rxe_opcode[pkt->opcode].offset[RXE_IETH], rkey); } enum rxe_hdr_length { RXE_BTH_BYTES = sizeof(struct rxe_bth), RXE_DETH_BYTES = sizeof(struct rxe_deth), RXE_IMMDT_BYTES = sizeof(struct rxe_immdt), RXE_RETH_BYTES = sizeof(struct rxe_reth), RXE_AETH_BYTES = sizeof(struct rxe_aeth), RXE_ATMACK_BYTES = sizeof(struct rxe_atmack), RXE_ATMETH_BYTES = sizeof(struct rxe_atmeth), RXE_IETH_BYTES = sizeof(struct rxe_ieth), RXE_RDETH_BYTES = sizeof(struct rxe_rdeth), RXE_FETH_BYTES = sizeof(struct rxe_feth), }; static inline size_t header_size(struct rxe_pkt_info *pkt) { return rxe_opcode[pkt->opcode].length; } static inline void *payload_addr(struct rxe_pkt_info *pkt) { return pkt->hdr + rxe_opcode[pkt->opcode].offset[RXE_PAYLOAD]; } static inline size_t payload_size(struct rxe_pkt_info *pkt) { return pkt->paylen - rxe_opcode[pkt->opcode].offset[RXE_PAYLOAD] - bth_pad(pkt) - RXE_ICRC_SIZE; } #endif /* RXE_HDR_H */
8 8 3 2 10 11 11 11 29 3 10 27 4 4 4 3 3 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 1 1 1 20 3 18 18 18 18 18 18 18 18 5 5 5 5 5 18 17 17 18 18 20 6 6 3 3 3 3 6 6 6 5 4 1 1 1 4 4 4 4 4 4 4 4 5 2 2 2 2 2 2 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 // SPDX-License-Identifier: GPL-2.0 /* Linux multicast routing support * Common logic shared by IPv4 
[ipmr] and IPv6 [ip6mr] implementation */

#include <linux/rhashtable.h>
#include <linux/mroute_base.h>

/* Sets everything common except 'dev', since that is done under locking
 *
 * v->dev is initialised to NULL, the byte/packet counters are zeroed, and
 * rate_limit, flags and threshold are copied from the arguments.  v->link is
 * taken from dev_get_iflink(dev) when any bit of @get_iflink_mask is set in
 * @flags, otherwise from dev->ifindex.
 */
void vif_device_init(struct vif_device *v,
		     struct net_device *dev,
		     unsigned long rate_limit,
		     unsigned char threshold,
		     unsigned short flags,
		     unsigned short get_iflink_mask)
{
	RCU_INIT_POINTER(v->dev, NULL);
	v->bytes_in = 0;
	v->bytes_out = 0;
	v->pkt_in = 0;
	v->pkt_out = 0;
	v->rate_limit = rate_limit;
	v->flags = flags;
	v->threshold = threshold;
	if (v->flags & get_iflink_mask)
		v->link = dev_get_iflink(dev);
	else
		v->link = dev->ifindex;
}

/* Work handler installed by mr_table_alloc(): destroys the table's MFC hash
 * table and frees the mr_table that embeds @work.
 */
static void __mr_free_table(struct work_struct *work)
{
	struct mr_table *mrt = container_of(to_rcu_work(work),
					    struct mr_table, work);

	rhltable_destroy(&mrt->mfc_hash);
	kfree(mrt);
}

/* Release a table by queueing its rcu_work on system_unbound_wq; the actual
 * teardown happens later in __mr_free_table(), not in the caller's context.
 */
void mr_table_free(struct mr_table *mrt)
{
	queue_rcu_work(system_unbound_wq, &mrt->work);
}

/* Allocate and initialise a multicast routing table.
 *
 * @net:         namespace stored in the table via write_pnet()
 * @id:          stored in mrt->id
 * @ops:         copied by value into mrt->ops; its rht_params configure the
 *               MFC hash table
 * @expire_func: callback for mrt->ipmr_expire_timer
 * @table_set:   called last with the fully initialised table
 *
 * Returns the new table, ERR_PTR(-ENOMEM) if allocation fails, or the
 * ERR_PTR-wrapped error from rhltable_init() (the table is freed first).
 */
struct mr_table *
mr_table_alloc(struct net *net, u32 id,
	       struct mr_table_ops *ops,
	       void (*expire_func)(struct timer_list *t),
	       void (*table_set)(struct mr_table *mrt,
				 struct net *net))
{
	struct mr_table *mrt;
	int err;

	mrt = kzalloc_obj(*mrt);
	if (!mrt)
		return ERR_PTR(-ENOMEM);
	mrt->id = id;
	write_pnet(&mrt->net, net);

	mrt->ops = *ops;
	err = rhltable_init(&mrt->mfc_hash, mrt->ops.rht_params);
	if (err) {
		kfree(mrt);
		return ERR_PTR(err);
	}
	INIT_RCU_WORK(&mrt->work, __mr_free_table);
	INIT_LIST_HEAD(&mrt->mfc_cache_list);
	INIT_LIST_HEAD(&mrt->mfc_unres_queue);

	timer_setup(&mrt->ipmr_expire_timer, expire_func, 0);

	/* -1 marks "no register vif" until one is configured elsewhere */
	mrt->mroute_reg_vif_num = -1;
	table_set(mrt, net);
	return mrt;
}

/* Look up @hasharg in the MFC hash and return the first entry whose
 * mfc_parent equals @parent; @parent == -1 matches any entry.
 * Returns NULL when nothing matches.
 */
void *mr_mfc_find_parent(struct mr_table *mrt, void *hasharg, int parent)
{
	struct rhlist_head *tmp, *list;
	struct mr_mfc *c;

	list = rhltable_lookup(&mrt->mfc_hash, hasharg, *mrt->ops.rht_params);
	rhl_for_each_entry_rcu(c, tmp, list, mnode)
		if (parent == -1 || parent == c->mfc_parent)
			return c;

	return NULL;
}

/* Look up the table's wildcard key (ops.cmparg_any) and return the first
 * entry whose TTL for @vifi is below 255, or NULL.
 */
void *mr_mfc_find_any_parent(struct mr_table *mrt, int vifi)
{
	struct rhlist_head *tmp, *list;
	struct mr_mfc *c;

	list = rhltable_lookup(&mrt->mfc_hash, mrt->ops.cmparg_any,
			       *mrt->ops.rht_params);
	rhl_for_each_entry_rcu(c, tmp, list, mnode)
		if (c->mfc_un.res.ttls[vifi] < 255)
			return c;

	return NULL;
}

/* Look up @hasharg and return the first entry usable for @vifi: either the
 * entry's own TTL for @vifi is below 255, or the wildcard entry found for
 * the entry's parent vif has such a TTL.  If no entry qualifies, fall back
 * to the wildcard lookup for @vifi itself.
 */
void *mr_mfc_find_any(struct mr_table *mrt, int vifi, void *hasharg)
{
	struct rhlist_head *tmp, *list;
	struct mr_mfc *c, *proxy;

	list = rhltable_lookup(&mrt->mfc_hash, hasharg, *mrt->ops.rht_params);
	rhl_for_each_entry_rcu(c, tmp, list, mnode) {
		if (c->mfc_un.res.ttls[vifi] < 255)
			return c;

		/* It's ok if the vifi is part of the static tree */
		proxy = mr_mfc_find_any_parent(mrt, c->mfc_parent);
		if (proxy && proxy->mfc_un.res.ttls[vifi] < 255)
			return c;
	}

	return mr_mfc_find_any_parent(mrt, vifi);
}

#ifdef CONFIG_PROC_FS
/* Return the @pos-th existing vif of the iterator's table (counting only
 * slots for which VIF_EXISTS() is true), leaving iter->ct at its index.
 * Returns NULL if there are fewer than @pos + 1 existing vifs.
 * @net is not used in this function.
 */
void *mr_vif_seq_idx(struct net *net, struct mr_vif_iter *iter, loff_t pos)
{
	struct mr_table *mrt = iter->mrt;

	for (iter->ct = 0; iter->ct < mrt->maxvif; ++iter->ct) {
		if (!VIF_EXISTS(mrt, iter->ct))
			continue;
		if (pos-- == 0)
			return &mrt->vif_table[iter->ct];
	}
	return NULL;
}

/* seq_file ->next helper for the vif table: bumps *pos, then returns the
 * first existing vif (after the start token) or the next existing vif past
 * iter->ct, or NULL at the end of the table.
 */
void *mr_vif_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct mr_vif_iter *iter = seq->private;
	struct net *net = seq_file_net(seq);
	struct mr_table *mrt = iter->mrt;

	++*pos;
	if (v == SEQ_START_TOKEN)
		return mr_vif_seq_idx(net, iter, 0);

	while (++iter->ct < mrt->maxvif) {
		if (!VIF_EXISTS(mrt, iter->ct))
			continue;
		return &mrt->vif_table[iter->ct];
	}
	return NULL;
}

/* Return the @pos-th MFC entry, counting first through mfc_cache_list and
 * then through mfc_unres_queue.
 *
 * Locking is intentionally asymmetric: when an entry is returned, the lock
 * protecting its list is still held (rcu_read_lock() for the resolved list,
 * it->lock for the unresolved queue) and it->cache records which list that
 * is.  When NULL is returned, neither is held and it->cache is NULL.
 * NOTE(review): the matching release is presumably done by the seq_file
 * stop callback, which is not part of this file - confirm.
 */
void *mr_mfc_seq_idx(struct net *net,
		     struct mr_mfc_iter *it, loff_t pos)
{
	struct mr_table *mrt = it->mrt;
	struct mr_mfc *mfc;

	rcu_read_lock();
	it->cache = &mrt->mfc_cache_list;
	list_for_each_entry_rcu(mfc, &mrt->mfc_cache_list, list)
		if (pos-- == 0)
			return mfc;
	rcu_read_unlock();

	spin_lock_bh(it->lock);
	it->cache = &mrt->mfc_unres_queue;
	list_for_each_entry(mfc, it->cache, list)
		if (pos-- == 0)
			return mfc;
	spin_unlock_bh(it->lock);

	it->cache = NULL;
	return NULL;
}

/* seq_file ->next helper for MFC entries.  Advances within the current list
 * (it->cache); when the resolved list is exhausted it drops the RCU read
 * lock, takes it->lock and continues with the unresolved queue.  At the end
 * of the unresolved queue it->lock is released and it->cache is cleared.
 */
void *mr_mfc_seq_next(struct seq_file *seq, void *v,
		      loff_t *pos)
{
	struct mr_mfc_iter *it = seq->private;
	struct net *net = seq_file_net(seq);
	struct mr_table *mrt = it->mrt;
	struct mr_mfc *c = v;

	++*pos;

	if (v == SEQ_START_TOKEN)
		return mr_mfc_seq_idx(net, seq->private, 0);

	/* Not yet back at the list head: simply step to the next entry */
	if (c->list.next != it->cache)
		return list_entry(c->list.next, struct mr_mfc, list);

	if (it->cache == &mrt->mfc_unres_queue)
		goto end_of_list;

	/* exhausted cache_array, show unresolved */
	rcu_read_unlock();
	it->cache = &mrt->mfc_unres_queue;

	spin_lock_bh(it->lock);
	if (!list_empty(it->cache))
		return list_first_entry(it->cache, struct mr_mfc, list);

end_of_list:
	spin_unlock_bh(it->lock);
	it->cache = NULL;

	return NULL;
}
#endif

/* Append the netlink attributes describing MFC entry @c to @skb.
 *
 * Emits RTA_IIF (when the parent vif has a device), a RTA_MULTIPATH nest
 * with one rtnexthop per output vif whose TTL is below 255, RTA_MFC_STATS
 * and RTA_EXPIRES, and sets rtm->rtm_type to RTN_MULTICAST.
 *
 * Returns 1 on success, -ENOENT (after setting RTNH_F_UNRESOLVED in
 * rtm->rtm_flags) when c->mfc_parent >= MAXVIFS, or -EMSGSIZE when @skb
 * has no room for an attribute.
 */
int mr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
		   struct mr_mfc *c, struct rtmsg *rtm)
{
	struct net_device *vif_dev;
	struct rta_mfc_stats mfcs;
	struct nlattr *mp_attr;
	struct rtnexthop *nhp;
	unsigned long lastuse;
	int ct;

	/* If cache is unresolved, don't try to parse IIF and OIF */
	if (c->mfc_parent >= MAXVIFS) {
		rtm->rtm_flags |= RTNH_F_UNRESOLVED;
		return -ENOENT;
	}

	rcu_read_lock();
	vif_dev = rcu_dereference(mrt->vif_table[c->mfc_parent].dev);
	if (vif_dev && nla_put_u32(skb, RTA_IIF, READ_ONCE(vif_dev->ifindex)) < 0) {
		rcu_read_unlock();
		return -EMSGSIZE;
	}
	rcu_read_unlock();

	if (c->mfc_flags & MFC_OFFLOAD)
		rtm->rtm_flags |= RTNH_F_OFFLOAD;

	mp_attr = nla_nest_start_noflag(skb, RTA_MULTIPATH);
	if (!mp_attr)
		return -EMSGSIZE;

	rcu_read_lock();
	for (ct = c->mfc_un.res.minvif; ct < c->mfc_un.res.maxvif; ct++) {
		struct vif_device *vif = &mrt->vif_table[ct];

		vif_dev = rcu_dereference(vif->dev);
		if (vif_dev && c->mfc_un.res.ttls[ct] < 255) {

			nhp = nla_reserve_nohdr(skb, sizeof(*nhp));
			if (!nhp) {
				/* undo the partially built nest before bailing */
				rcu_read_unlock();
				nla_nest_cancel(skb, mp_attr);
				return -EMSGSIZE;
			}

			nhp->rtnh_flags = 0;
			nhp->rtnh_hops = c->mfc_un.res.ttls[ct];
			nhp->rtnh_ifindex = READ_ONCE(vif_dev->ifindex);
			nhp->rtnh_len = sizeof(*nhp);
		}
	}
	rcu_read_unlock();

	nla_nest_end(skb, mp_attr);

	/* Report elapsed jiffies since last use, clamped to 0 if lastuse
	 * is ahead of jiffies.
	 */
	lastuse = READ_ONCE(c->mfc_un.res.lastuse);
	lastuse = time_after_eq(jiffies, lastuse) ? jiffies - lastuse : 0;

	mfcs.mfcs_packets = atomic_long_read(&c->mfc_un.res.pkt);
	mfcs.mfcs_bytes = atomic_long_read(&c->mfc_un.res.bytes);
	mfcs.mfcs_wrong_if = atomic_long_read(&c->mfc_un.res.wrong_if);
	if (nla_put_64bit(skb, RTA_MFC_STATS, sizeof(mfcs), &mfcs, RTA_PAD) ||
	    nla_put_u64_64bit(skb, RTA_EXPIRES, jiffies_to_clock_t(lastuse),
			      RTA_PAD))
		return -EMSGSIZE;

	rtm->rtm_type = RTN_MULTICAST;
	return 1;
}

/* Return true if @dev is the device of any output vif of @c, i.e. a vif in
 * [minvif, maxvif) whose TTL is below 255.  The device pointer is only
 * compared, never dereferenced, hence rcu_access_pointer().
 */
static bool mr_mfc_uses_dev(const struct mr_table *mrt,
			    const struct mr_mfc *c,
			    const struct net_device *dev)
{
	int ct;

	for (ct = c->mfc_un.res.minvif; ct < c->mfc_un.res.maxvif; ct++) {
		const struct net_device *vif_dev;
		const struct vif_device *vif;

		vif = &mrt->vif_table[ct];
		vif_dev = rcu_access_pointer(vif->dev);
		if (vif_dev && c->mfc_un.res.ttls[ct] < 255 &&
		    vif_dev == dev)
			return true;
	}
	return false;
}

/* Dump the entries of one table through @fill, resolved entries first and
 * then the unresolved queue (the latter under @lock).
 *
 * cb->args[1] holds the index of the first entry to emit on entry and is
 * updated to the index reached on exit, so a dump can be resumed.  The
 * filter->dev check is applied to resolved entries only.
 *
 * Returns 0 when both lists were walked completely, or the negative value
 * returned by @fill.
 */
int mr_table_dump(struct mr_table *mrt, struct sk_buff *skb,
		  struct netlink_callback *cb,
		  int (*fill)(struct mr_table *mrt, struct sk_buff *skb,
			      u32 portid, u32 seq, struct mr_mfc *c,
			      int cmd, int flags),
		  spinlock_t *lock, struct fib_dump_filter *filter)
{
	unsigned int e = 0, s_e = cb->args[1];
	unsigned int flags = NLM_F_MULTI;
	struct mr_mfc *mfc;
	int err;

	if (filter->filter_set)
		flags |= NLM_F_DUMP_FILTERED;

	list_for_each_entry_rcu(mfc, &mrt->mfc_cache_list, list,
				lockdep_rtnl_is_held()) {
		if (e < s_e)
			goto next_entry;
		if (filter->dev &&
		    !mr_mfc_uses_dev(mrt, mfc, filter->dev))
			goto next_entry;

		err = fill(mrt, skb, NETLINK_CB(cb->skb).portid,
			   cb->nlh->nlmsg_seq, mfc, RTM_NEWROUTE, flags);
		if (err < 0)
			goto out;
next_entry:
		e++;
	}

	spin_lock_bh(lock);
	list_for_each_entry(mfc, &mrt->mfc_unres_queue, list) {
		if (e < s_e)
			goto next_entry2;

		err = fill(mrt, skb, NETLINK_CB(cb->skb).portid,
			   cb->nlh->nlmsg_seq, mfc, RTM_NEWROUTE, flags);
		if (err < 0) {
			spin_unlock_bh(lock);
			goto out;
		}
next_entry2:
		e++;
	}
	spin_unlock_bh(lock);
	err = 0;
out:
	cb->args[1] = e;
	return err;
}

/* Netlink dump entry point: walk every table returned by @iter and dump it
 * with mr_table_dump().
 *
 * cb->args[0] is the table index to resume from; cb->args[1] (the entry
 * index inside a table) is reset to 0 after each table that was dumped
 * completely.  A negative result from mr_table_dump() stops the walk but is
 * not propagated: the function always returns skb->len.
 */
int mr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb,
		     struct mr_table *(*iter)(struct net *net,
					      struct mr_table *mrt),
		     int (*fill)(struct mr_table *mrt,
				 struct sk_buff *skb,
				 u32 portid, u32 seq, struct mr_mfc *c,
				 int cmd, int flags),
		     spinlock_t *lock, struct fib_dump_filter *filter)
{
	unsigned int t = 0, s_t = cb->args[0];
	struct net *net = sock_net(skb->sk);
	struct mr_table *mrt;
	int err;

	/* multicast does not track protocol or have route type other
	 * than RTN_MULTICAST
	 */
	if (filter->filter_set) {
		if (filter->protocol || filter->flags ||
		    (filter->rt_type && filter->rt_type != RTN_MULTICAST))
			return skb->len;
	}

	rcu_read_lock();
	for (mrt = iter(net, NULL); mrt; mrt = iter(net, mrt)) {
		if (t < s_t)
			goto next_table;

		err = mr_table_dump(mrt, skb, cb, fill, lock, filter);
		if (err < 0)
			break;
		cb->args[1] = 0;
next_table:
		t++;
	}
	rcu_read_unlock();

	cb->args[0] = t;

	return skb->len;
}

/* Replay the current multicast routing state to notifier @nb.
 *
 * First calls @rules_dump; then, for every table returned by @mr_iter,
 * sends FIB_EVENT_VIF_ADD for each vif that has a device followed by
 * FIB_EVENT_ENTRY_ADD for each entry on mfc_cache_list.
 *
 * Returns 0 on success or the first non-zero value returned by
 * @rules_dump or a notifier call.
 */
int mr_dump(struct net *net, struct notifier_block *nb, unsigned short family,
	    int (*rules_dump)(struct net *net,
			      struct notifier_block *nb,
			      struct netlink_ext_ack *extack),
	    struct mr_table *(*mr_iter)(struct net *net,
					struct mr_table *mrt),
	    struct netlink_ext_ack *extack)
{
	struct mr_table *mrt;
	int err;

	err = rules_dump(net, nb, extack);
	if (err)
		return err;

	for (mrt = mr_iter(net, NULL); mrt; mrt = mr_iter(net, mrt)) {
		struct vif_device *v = &mrt->vif_table[0];
		struct net_device *vif_dev;
		struct mr_mfc *mfc;
		int vifi;

		/* Notify on table VIF entries */
		rcu_read_lock();
		for (vifi = 0; vifi < mrt->maxvif; vifi++, v++) {
			vif_dev = rcu_dereference(v->dev);
			if (!vif_dev)
				continue;

			err = mr_call_vif_notifier(nb, family,
						   FIB_EVENT_VIF_ADD, v,
						   vif_dev, vifi,
						   mrt->id, extack);
			if (err)
				break;
		}
		rcu_read_unlock();

		/* err is still 0 here if no vif notifier failed */
		if (err)
			return err;

		/* Notify on table MFC entries */
		list_for_each_entry_rcu(mfc, &mrt->mfc_cache_list, list) {
			err = mr_call_mfc_notifier(nb, family,
						   FIB_EVENT_ENTRY_ADD,
						   mfc, mrt->id, extack);
			if (err)
				return err;
		}
	}

	return 0;
}
1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 
514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 
1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 
1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 
1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 
2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 
2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 
3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 
3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 
3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 
4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 /* * Copyright (c) 2005 Cisco Systems. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/hex.h> #include <linux/init.h> #include <linux/slab.h> #include <linux/err.h> #include <linux/string.h> #include <linux/parser.h> #include <linux/random.h> #include <linux/jiffies.h> #include <linux/lockdep.h> #include <linux/inet.h> #include <net/net_namespace.h> #include <rdma/ib_cache.h> #include <linux/atomic.h> #include <scsi/scsi.h> #include <scsi/scsi_device.h> #include <scsi/scsi_dbg.h> #include <scsi/scsi_tcq.h> #include <scsi/srp.h> #include <scsi/scsi_transport_srp.h> #include "ib_srp.h" #define DRV_NAME "ib_srp" #define PFX DRV_NAME ": " MODULE_AUTHOR("Roland Dreier"); MODULE_DESCRIPTION("InfiniBand SCSI RDMA Protocol initiator"); MODULE_LICENSE("Dual BSD/GPL"); static unsigned int srp_sg_tablesize; static unsigned int cmd_sg_entries; static unsigned int indirect_sg_entries; static bool allow_ext_sg; static bool register_always = true; static bool never_register; static int topspin_workarounds = 1; module_param(srp_sg_tablesize, uint, 0444); MODULE_PARM_DESC(srp_sg_tablesize, "Deprecated name for cmd_sg_entries"); module_param(cmd_sg_entries, uint, 0444); MODULE_PARM_DESC(cmd_sg_entries, "Default number of gather/scatter entries in the SRP command (default is 12, max 255)"); module_param(indirect_sg_entries, uint, 0444); MODULE_PARM_DESC(indirect_sg_entries, "Default max number of gather/scatter entries (default is 12, max is " __stringify(SG_MAX_SEGMENTS) ")"); module_param(allow_ext_sg, bool, 0444); MODULE_PARM_DESC(allow_ext_sg, "Default behavior when there are more than cmd_sg_entries S/G entries after mapping; fails the request when false (default false)"); module_param(topspin_workarounds, int, 0444); MODULE_PARM_DESC(topspin_workarounds, "Enable workarounds for Topspin/Cisco SRP target bugs if != 0"); module_param(register_always, bool, 0444); MODULE_PARM_DESC(register_always, "Use memory registration even for contiguous memory regions"); 
module_param(never_register, bool, 0444); MODULE_PARM_DESC(never_register, "Never register memory"); static const struct kernel_param_ops srp_tmo_ops; static int srp_reconnect_delay = 10; module_param_cb(reconnect_delay, &srp_tmo_ops, &srp_reconnect_delay, S_IRUGO | S_IWUSR); MODULE_PARM_DESC(reconnect_delay, "Time between successive reconnect attempts"); static int srp_fast_io_fail_tmo = 15; module_param_cb(fast_io_fail_tmo, &srp_tmo_ops, &srp_fast_io_fail_tmo, S_IRUGO | S_IWUSR); MODULE_PARM_DESC(fast_io_fail_tmo, "Number of seconds between the observation of a transport" " layer error and failing all I/O. \"off\" means that this" " functionality is disabled."); static int srp_dev_loss_tmo = 600; module_param_cb(dev_loss_tmo, &srp_tmo_ops, &srp_dev_loss_tmo, S_IRUGO | S_IWUSR); MODULE_PARM_DESC(dev_loss_tmo, "Maximum number of seconds that the SRP transport should" " insulate transport layer errors. After this time has been" " exceeded the SCSI host is removed. Should be" " between 1 and " __stringify(SCSI_DEVICE_BLOCK_MAX_TIMEOUT) " if fast_io_fail_tmo has not been set. \"off\" means that" " this functionality is disabled."); static bool srp_use_imm_data = true; module_param_named(use_imm_data, srp_use_imm_data, bool, 0644); MODULE_PARM_DESC(use_imm_data, "Whether or not to request permission to use immediate data during SRP login."); static unsigned int srp_max_imm_data = 8 * 1024; module_param_named(max_imm_data, srp_max_imm_data, uint, 0644); MODULE_PARM_DESC(max_imm_data, "Maximum immediate data size."); static unsigned ch_count; module_param(ch_count, uint, 0444); MODULE_PARM_DESC(ch_count, "Number of RDMA channels to use for communication with an SRP target. Using more than one channel improves performance if the HCA supports multiple completion vectors. 
The default value is the minimum of four times the number of online CPU sockets and the number of completion vectors supported by the HCA."); static int srp_add_one(struct ib_device *device); static void srp_remove_one(struct ib_device *device, void *client_data); static void srp_rename_dev(struct ib_device *device, void *client_data); static void srp_recv_done(struct ib_cq *cq, struct ib_wc *wc); static void srp_handle_qp_err(struct ib_cq *cq, struct ib_wc *wc, const char *opname); static int srp_ib_cm_handler(struct ib_cm_id *cm_id, const struct ib_cm_event *event); static int srp_rdma_cm_handler(struct rdma_cm_id *cm_id, struct rdma_cm_event *event); static struct scsi_transport_template *ib_srp_transport_template; static struct workqueue_struct *srp_remove_wq; static struct ib_client srp_client = { .name = "srp", .add = srp_add_one, .remove = srp_remove_one, .rename = srp_rename_dev }; static struct ib_sa_client srp_sa_client; static int srp_tmo_get(char *buffer, const struct kernel_param *kp) { int tmo = *(int *)kp->arg; if (tmo >= 0) return sysfs_emit(buffer, "%d\n", tmo); else return sysfs_emit(buffer, "off\n"); } static int srp_tmo_set(const char *val, const struct kernel_param *kp) { int tmo, res; res = srp_parse_tmo(&tmo, val); if (res) goto out; if (kp->arg == &srp_reconnect_delay) res = srp_tmo_valid(tmo, srp_fast_io_fail_tmo, srp_dev_loss_tmo); else if (kp->arg == &srp_fast_io_fail_tmo) res = srp_tmo_valid(srp_reconnect_delay, tmo, srp_dev_loss_tmo); else res = srp_tmo_valid(srp_reconnect_delay, srp_fast_io_fail_tmo, tmo); if (res) goto out; *(int *)kp->arg = tmo; out: return res; } static const struct kernel_param_ops srp_tmo_ops = { .get = srp_tmo_get, .set = srp_tmo_set, }; static inline struct srp_target_port *host_to_target(struct Scsi_Host *host) { return (struct srp_target_port *) host->hostdata; } static const char *srp_target_info(struct Scsi_Host *host) { return host_to_target(host)->target_name; } static int srp_target_is_topspin(struct 
srp_target_port *target) { static const u8 topspin_oui[3] = { 0x00, 0x05, 0xad }; static const u8 cisco_oui[3] = { 0x00, 0x1b, 0x0d }; return topspin_workarounds && (!memcmp(&target->ioc_guid, topspin_oui, sizeof topspin_oui) || !memcmp(&target->ioc_guid, cisco_oui, sizeof cisco_oui)); } static struct srp_iu *srp_alloc_iu(struct srp_host *host, size_t size, gfp_t gfp_mask, enum dma_data_direction direction) { struct srp_iu *iu; iu = kmalloc_obj(*iu, gfp_mask); if (!iu) goto out; iu->buf = kzalloc(size, gfp_mask); if (!iu->buf) goto out_free_iu; iu->dma = ib_dma_map_single(host->srp_dev->dev, iu->buf, size, direction); if (ib_dma_mapping_error(host->srp_dev->dev, iu->dma)) goto out_free_buf; iu->size = size; iu->direction = direction; return iu; out_free_buf: kfree(iu->buf); out_free_iu: kfree(iu); out: return NULL; } static void srp_free_iu(struct srp_host *host, struct srp_iu *iu) { if (!iu) return; ib_dma_unmap_single(host->srp_dev->dev, iu->dma, iu->size, iu->direction); kfree(iu->buf); kfree(iu); } static void srp_qp_event(struct ib_event *event, void *context) { pr_debug("QP event %s (%d)\n", ib_event_msg(event->event), event->event); } static int srp_init_ib_qp(struct srp_target_port *target, struct ib_qp *qp) { struct ib_qp_attr *attr; int ret; attr = kmalloc_obj(*attr); if (!attr) return -ENOMEM; ret = ib_find_cached_pkey(target->srp_host->srp_dev->dev, target->srp_host->port, be16_to_cpu(target->ib_cm.pkey), &attr->pkey_index); if (ret) goto out; attr->qp_state = IB_QPS_INIT; attr->qp_access_flags = (IB_ACCESS_REMOTE_READ | IB_ACCESS_REMOTE_WRITE); attr->port_num = target->srp_host->port; ret = ib_modify_qp(qp, attr, IB_QP_STATE | IB_QP_PKEY_INDEX | IB_QP_ACCESS_FLAGS | IB_QP_PORT); out: kfree(attr); return ret; } static int srp_new_ib_cm_id(struct srp_rdma_ch *ch) { struct srp_target_port *target = ch->target; struct ib_cm_id *new_cm_id; new_cm_id = ib_create_cm_id(target->srp_host->srp_dev->dev, srp_ib_cm_handler, ch); if (IS_ERR(new_cm_id)) return 
PTR_ERR(new_cm_id); if (ch->ib_cm.cm_id) ib_destroy_cm_id(ch->ib_cm.cm_id); ch->ib_cm.cm_id = new_cm_id; if (rdma_cap_opa_ah(target->srp_host->srp_dev->dev, target->srp_host->port)) ch->ib_cm.path.rec_type = SA_PATH_REC_TYPE_OPA; else ch->ib_cm.path.rec_type = SA_PATH_REC_TYPE_IB; ch->ib_cm.path.sgid = target->sgid; ch->ib_cm.path.dgid = target->ib_cm.orig_dgid; ch->ib_cm.path.pkey = target->ib_cm.pkey; ch->ib_cm.path.service_id = target->ib_cm.service_id; return 0; } static int srp_new_rdma_cm_id(struct srp_rdma_ch *ch) { struct srp_target_port *target = ch->target; struct rdma_cm_id *new_cm_id; int ret; new_cm_id = rdma_create_id(target->net, srp_rdma_cm_handler, ch, RDMA_PS_TCP, IB_QPT_RC); if (IS_ERR(new_cm_id)) { ret = PTR_ERR(new_cm_id); new_cm_id = NULL; goto out; } init_completion(&ch->done); ret = rdma_resolve_addr(new_cm_id, target->rdma_cm.src_specified ? &target->rdma_cm.src.sa : NULL, &target->rdma_cm.dst.sa, SRP_PATH_REC_TIMEOUT_MS); if (ret) { pr_err("No route available from %pISpsc to %pISpsc (%d)\n", &target->rdma_cm.src, &target->rdma_cm.dst, ret); goto out; } ret = wait_for_completion_interruptible(&ch->done); if (ret < 0) goto out; ret = ch->status; if (ret) { pr_err("Resolving address %pISpsc failed (%d)\n", &target->rdma_cm.dst, ret); goto out; } swap(ch->rdma_cm.cm_id, new_cm_id); out: if (new_cm_id) rdma_destroy_id(new_cm_id); return ret; } static int srp_new_cm_id(struct srp_rdma_ch *ch) { struct srp_target_port *target = ch->target; return target->using_rdma_cm ? srp_new_rdma_cm_id(ch) : srp_new_ib_cm_id(ch); } /** * srp_destroy_fr_pool() - free the resources owned by a pool * @pool: Fast registration pool to be destroyed. 
*/ static void srp_destroy_fr_pool(struct srp_fr_pool *pool) { int i; struct srp_fr_desc *d; if (!pool) return; for (i = 0, d = &pool->desc[0]; i < pool->size; i++, d++) { if (d->mr) ib_dereg_mr(d->mr); } kfree(pool); } /** * srp_create_fr_pool() - allocate and initialize a pool for fast registration * @device: IB device to allocate fast registration descriptors for. * @pd: Protection domain associated with the FR descriptors. * @pool_size: Number of descriptors to allocate. * @max_page_list_len: Maximum fast registration work request page list length. */ static struct srp_fr_pool *srp_create_fr_pool(struct ib_device *device, struct ib_pd *pd, int pool_size, int max_page_list_len) { struct srp_fr_pool *pool; struct srp_fr_desc *d; struct ib_mr *mr; int i, ret = -EINVAL; enum ib_mr_type mr_type; if (pool_size <= 0) goto err; ret = -ENOMEM; pool = kzalloc_flex(*pool, desc, pool_size); if (!pool) goto err; pool->size = pool_size; pool->max_page_list_len = max_page_list_len; spin_lock_init(&pool->lock); INIT_LIST_HEAD(&pool->free_list); if (device->attrs.kernel_cap_flags & IBK_SG_GAPS_REG) mr_type = IB_MR_TYPE_SG_GAPS; else mr_type = IB_MR_TYPE_MEM_REG; for (i = 0, d = &pool->desc[0]; i < pool->size; i++, d++) { mr = ib_alloc_mr(pd, mr_type, max_page_list_len); if (IS_ERR(mr)) { ret = PTR_ERR(mr); if (ret == -ENOMEM) pr_info("%s: ib_alloc_mr() failed. Try to reduce max_cmd_per_lun, max_sect or ch_count\n", dev_name(&device->dev)); goto destroy_pool; } d->mr = mr; list_add_tail(&d->entry, &pool->free_list); } out: return pool; destroy_pool: srp_destroy_fr_pool(pool); err: pool = ERR_PTR(ret); goto out; } /** * srp_fr_pool_get() - obtain a descriptor suitable for fast registration * @pool: Pool to obtain descriptor from. 
*/ static struct srp_fr_desc *srp_fr_pool_get(struct srp_fr_pool *pool) { struct srp_fr_desc *d = NULL; unsigned long flags; spin_lock_irqsave(&pool->lock, flags); if (!list_empty(&pool->free_list)) { d = list_first_entry(&pool->free_list, typeof(*d), entry); list_del(&d->entry); } spin_unlock_irqrestore(&pool->lock, flags); return d; } /** * srp_fr_pool_put() - put an FR descriptor back in the free list * @pool: Pool the descriptor was allocated from. * @desc: Pointer to an array of fast registration descriptor pointers. * @n: Number of descriptors to put back. * * Note: The caller must already have queued an invalidation request for * desc->mr->rkey before calling this function. */ static void srp_fr_pool_put(struct srp_fr_pool *pool, struct srp_fr_desc **desc, int n) { unsigned long flags; int i; spin_lock_irqsave(&pool->lock, flags); for (i = 0; i < n; i++) list_add(&desc[i]->entry, &pool->free_list); spin_unlock_irqrestore(&pool->lock, flags); } static struct srp_fr_pool *srp_alloc_fr_pool(struct srp_target_port *target) { struct srp_device *dev = target->srp_host->srp_dev; return srp_create_fr_pool(dev->dev, dev->pd, target->mr_pool_size, dev->max_pages_per_mr); } /** * srp_destroy_qp() - destroy an RDMA queue pair * @ch: SRP RDMA channel. * * Drain the qp before destroying it. This avoids that the receive * completion handler can access the queue pair while it is * being destroyed. 
*/ static void srp_destroy_qp(struct srp_rdma_ch *ch) { spin_lock_irq(&ch->lock); ib_process_cq_direct(ch->send_cq, -1); spin_unlock_irq(&ch->lock); ib_drain_qp(ch->qp); ib_destroy_qp(ch->qp); } static int srp_create_ch_ib(struct srp_rdma_ch *ch) { struct srp_target_port *target = ch->target; struct srp_device *dev = target->srp_host->srp_dev; const struct ib_device_attr *attr = &dev->dev->attrs; struct ib_qp_init_attr *init_attr; struct ib_cq *recv_cq, *send_cq; struct ib_qp *qp; struct srp_fr_pool *fr_pool = NULL; const int m = 1 + dev->use_fast_reg * target->mr_per_cmd * 2; int ret; init_attr = kzalloc_obj(*init_attr); if (!init_attr) return -ENOMEM; /* queue_size + 1 for ib_drain_rq() */ recv_cq = ib_alloc_cq(dev->dev, ch, target->queue_size + 1, ch->comp_vector, IB_POLL_SOFTIRQ); if (IS_ERR(recv_cq)) { ret = PTR_ERR(recv_cq); goto err; } send_cq = ib_alloc_cq(dev->dev, ch, m * target->queue_size, ch->comp_vector, IB_POLL_DIRECT); if (IS_ERR(send_cq)) { ret = PTR_ERR(send_cq); goto err_recv_cq; } init_attr->event_handler = srp_qp_event; init_attr->cap.max_send_wr = m * target->queue_size; init_attr->cap.max_recv_wr = target->queue_size + 1; init_attr->cap.max_recv_sge = 1; init_attr->cap.max_send_sge = min(SRP_MAX_SGE, attr->max_send_sge); init_attr->sq_sig_type = IB_SIGNAL_REQ_WR; init_attr->qp_type = IB_QPT_RC; init_attr->send_cq = send_cq; init_attr->recv_cq = recv_cq; ch->max_imm_sge = min(init_attr->cap.max_send_sge - 1U, 255U); if (target->using_rdma_cm) { ret = rdma_create_qp(ch->rdma_cm.cm_id, dev->pd, init_attr); qp = ch->rdma_cm.cm_id->qp; } else { qp = ib_create_qp(dev->pd, init_attr); if (!IS_ERR(qp)) { ret = srp_init_ib_qp(target, qp); if (ret) ib_destroy_qp(qp); } else { ret = PTR_ERR(qp); } } if (ret) { pr_err("QP creation failed for dev %s: %d\n", dev_name(&dev->dev->dev), ret); goto err_send_cq; } if (dev->use_fast_reg) { fr_pool = srp_alloc_fr_pool(target); if (IS_ERR(fr_pool)) { ret = PTR_ERR(fr_pool); shost_printk(KERN_WARNING, 
target->scsi_host, PFX "FR pool allocation failed (%d)\n", ret); goto err_qp; } } if (ch->qp) srp_destroy_qp(ch); if (ch->recv_cq) ib_free_cq(ch->recv_cq); if (ch->send_cq) ib_free_cq(ch->send_cq); ch->qp = qp; ch->recv_cq = recv_cq; ch->send_cq = send_cq; if (dev->use_fast_reg) { if (ch->fr_pool) srp_destroy_fr_pool(ch->fr_pool); ch->fr_pool = fr_pool; } kfree(init_attr); return 0; err_qp: if (target->using_rdma_cm) rdma_destroy_qp(ch->rdma_cm.cm_id); else ib_destroy_qp(qp); err_send_cq: ib_free_cq(send_cq); err_recv_cq: ib_free_cq(recv_cq); err: kfree(init_attr); return ret; } /* * Note: this function may be called without srp_alloc_iu_bufs() having been * invoked. Hence the ch->[rt]x_ring checks. */ static void srp_free_ch_ib(struct srp_target_port *target, struct srp_rdma_ch *ch) { struct srp_device *dev = target->srp_host->srp_dev; int i; if (!ch->target) return; if (target->using_rdma_cm) { if (ch->rdma_cm.cm_id) { rdma_destroy_id(ch->rdma_cm.cm_id); ch->rdma_cm.cm_id = NULL; } } else { if (ch->ib_cm.cm_id) { ib_destroy_cm_id(ch->ib_cm.cm_id); ch->ib_cm.cm_id = NULL; } } /* If srp_new_cm_id() succeeded but srp_create_ch_ib() not, return. */ if (!ch->qp) return; if (dev->use_fast_reg) { if (ch->fr_pool) srp_destroy_fr_pool(ch->fr_pool); } srp_destroy_qp(ch); ib_free_cq(ch->send_cq); ib_free_cq(ch->recv_cq); /* * Avoid that the SCSI error handler tries to use this channel after * it has been freed. The SCSI error handler can namely continue * trying to perform recovery actions after scsi_remove_host() * returned. 
*/ ch->target = NULL; ch->qp = NULL; ch->send_cq = ch->recv_cq = NULL; if (ch->rx_ring) { for (i = 0; i < target->queue_size; ++i) srp_free_iu(target->srp_host, ch->rx_ring[i]); kfree(ch->rx_ring); ch->rx_ring = NULL; } if (ch->tx_ring) { for (i = 0; i < target->queue_size; ++i) srp_free_iu(target->srp_host, ch->tx_ring[i]); kfree(ch->tx_ring); ch->tx_ring = NULL; } } static void srp_path_rec_completion(int status, struct sa_path_rec *pathrec, unsigned int num_paths, void *ch_ptr) { struct srp_rdma_ch *ch = ch_ptr; struct srp_target_port *target = ch->target; ch->status = status; if (status) shost_printk(KERN_ERR, target->scsi_host, PFX "Got failed path rec status %d\n", status); else ch->ib_cm.path = *pathrec; complete(&ch->done); } static int srp_ib_lookup_path(struct srp_rdma_ch *ch) { struct srp_target_port *target = ch->target; int ret; ch->ib_cm.path.numb_path = 1; init_completion(&ch->done); ch->ib_cm.path_query_id = ib_sa_path_rec_get(&srp_sa_client, target->srp_host->srp_dev->dev, target->srp_host->port, &ch->ib_cm.path, IB_SA_PATH_REC_SERVICE_ID | IB_SA_PATH_REC_DGID | IB_SA_PATH_REC_SGID | IB_SA_PATH_REC_NUMB_PATH | IB_SA_PATH_REC_PKEY, SRP_PATH_REC_TIMEOUT_MS, GFP_KERNEL, srp_path_rec_completion, ch, &ch->ib_cm.path_query); if (ch->ib_cm.path_query_id < 0) return ch->ib_cm.path_query_id; ret = wait_for_completion_interruptible(&ch->done); if (ret < 0) return ret; if (ch->status < 0) shost_printk(KERN_WARNING, target->scsi_host, PFX "Path record query failed: sgid %pI6, dgid %pI6, pkey %#04x, service_id %#16llx\n", ch->ib_cm.path.sgid.raw, ch->ib_cm.path.dgid.raw, be16_to_cpu(target->ib_cm.pkey), be64_to_cpu(target->ib_cm.service_id)); return ch->status; } static int srp_rdma_lookup_path(struct srp_rdma_ch *ch) { struct srp_target_port *target = ch->target; int ret; init_completion(&ch->done); ret = rdma_resolve_route(ch->rdma_cm.cm_id, SRP_PATH_REC_TIMEOUT_MS); if (ret) return ret; wait_for_completion_interruptible(&ch->done); if (ch->status != 0) 
shost_printk(KERN_WARNING, target->scsi_host, PFX "Path resolution failed\n"); return ch->status; } static int srp_lookup_path(struct srp_rdma_ch *ch) { struct srp_target_port *target = ch->target; return target->using_rdma_cm ? srp_rdma_lookup_path(ch) : srp_ib_lookup_path(ch); } static u8 srp_get_subnet_timeout(struct srp_host *host) { struct ib_port_attr attr; int ret; u8 subnet_timeout = 18; ret = ib_query_port(host->srp_dev->dev, host->port, &attr); if (ret == 0) subnet_timeout = attr.subnet_timeout; if (unlikely(subnet_timeout < 15)) pr_warn("%s: subnet timeout %d may cause SRP login to fail.\n", dev_name(&host->srp_dev->dev->dev), subnet_timeout); return subnet_timeout; } static int srp_send_req(struct srp_rdma_ch *ch, uint32_t max_iu_len, bool multich) { struct srp_target_port *target = ch->target; struct { struct rdma_conn_param rdma_param; struct srp_login_req_rdma rdma_req; struct ib_cm_req_param ib_param; struct srp_login_req ib_req; } *req = NULL; char *ipi, *tpi; int status; req = kzalloc_obj(*req); if (!req) return -ENOMEM; req->ib_param.flow_control = 1; req->ib_param.retry_count = target->tl_retry_count; /* * Pick some arbitrary defaults here; we could make these * module parameters if anyone cared about setting them. */ req->ib_param.responder_resources = 4; req->ib_param.rnr_retry_count = 7; req->ib_param.max_cm_retries = 15; req->ib_req.opcode = SRP_LOGIN_REQ; req->ib_req.tag = 0; req->ib_req.req_it_iu_len = cpu_to_be32(max_iu_len); req->ib_req.req_buf_fmt = cpu_to_be16(SRP_BUF_FORMAT_DIRECT | SRP_BUF_FORMAT_INDIRECT); req->ib_req.req_flags = (multich ? 
SRP_MULTICHAN_MULTI : SRP_MULTICHAN_SINGLE); if (srp_use_imm_data) { req->ib_req.req_flags |= SRP_IMMED_REQUESTED; req->ib_req.imm_data_offset = cpu_to_be16(SRP_IMM_DATA_OFFSET); } if (target->using_rdma_cm) { req->rdma_param.flow_control = req->ib_param.flow_control; req->rdma_param.responder_resources = req->ib_param.responder_resources; req->rdma_param.initiator_depth = req->ib_param.initiator_depth; req->rdma_param.retry_count = req->ib_param.retry_count; req->rdma_param.rnr_retry_count = req->ib_param.rnr_retry_count; req->rdma_param.private_data = &req->rdma_req; req->rdma_param.private_data_len = sizeof(req->rdma_req); req->rdma_req.opcode = req->ib_req.opcode; req->rdma_req.tag = req->ib_req.tag; req->rdma_req.req_it_iu_len = req->ib_req.req_it_iu_len; req->rdma_req.req_buf_fmt = req->ib_req.req_buf_fmt; req->rdma_req.req_flags = req->ib_req.req_flags; req->rdma_req.imm_data_offset = req->ib_req.imm_data_offset; ipi = req->rdma_req.initiator_port_id; tpi = req->rdma_req.target_port_id; } else { u8 subnet_timeout; subnet_timeout = srp_get_subnet_timeout(target->srp_host); req->ib_param.primary_path = &ch->ib_cm.path; req->ib_param.alternate_path = NULL; req->ib_param.service_id = target->ib_cm.service_id; get_random_bytes(&req->ib_param.starting_psn, 4); req->ib_param.starting_psn &= 0xffffff; req->ib_param.qp_num = ch->qp->qp_num; req->ib_param.qp_type = ch->qp->qp_type; req->ib_param.local_cm_response_timeout = subnet_timeout + 2; req->ib_param.remote_cm_response_timeout = subnet_timeout + 2; req->ib_param.private_data = &req->ib_req; req->ib_param.private_data_len = sizeof(req->ib_req); ipi = req->ib_req.initiator_port_id; tpi = req->ib_req.target_port_id; } /* * In the published SRP specification (draft rev. 16a), the * port identifier format is 8 bytes of ID extension followed * by 8 bytes of GUID. Older drafts put the two halves in the * opposite order, so that the GUID comes first. 
* * Targets conforming to these obsolete drafts can be * recognized by the I/O Class they report. */ if (target->io_class == SRP_REV10_IB_IO_CLASS) { memcpy(ipi, &target->sgid.global.interface_id, 8); memcpy(ipi + 8, &target->initiator_ext, 8); memcpy(tpi, &target->ioc_guid, 8); memcpy(tpi + 8, &target->id_ext, 8); } else { memcpy(ipi, &target->initiator_ext, 8); memcpy(ipi + 8, &target->sgid.global.interface_id, 8); memcpy(tpi, &target->id_ext, 8); memcpy(tpi + 8, &target->ioc_guid, 8); } /* * Topspin/Cisco SRP targets will reject our login unless we * zero out the first 8 bytes of our initiator port ID and set * the second 8 bytes to the local node GUID. */ if (srp_target_is_topspin(target)) { shost_printk(KERN_DEBUG, target->scsi_host, PFX "Topspin/Cisco initiator port ID workaround " "activated for target GUID %016llx\n", be64_to_cpu(target->ioc_guid)); memset(ipi, 0, 8); memcpy(ipi + 8, &target->srp_host->srp_dev->dev->node_guid, 8); } if (target->using_rdma_cm) status = rdma_connect(ch->rdma_cm.cm_id, &req->rdma_param); else status = ib_send_cm_req(ch->ib_cm.cm_id, &req->ib_param); kfree(req); return status; } static bool srp_queue_remove_work(struct srp_target_port *target) { bool changed = false; spin_lock_irq(&target->lock); if (target->state != SRP_TARGET_REMOVED) { target->state = SRP_TARGET_REMOVED; changed = true; } spin_unlock_irq(&target->lock); if (changed) queue_work(srp_remove_wq, &target->remove_work); return changed; } static void srp_disconnect_target(struct srp_target_port *target) { struct srp_rdma_ch *ch; int i, ret; /* XXX should send SRP_I_LOGOUT request */ for (i = 0; i < target->ch_count; i++) { ch = &target->ch[i]; ch->connected = false; ret = 0; if (target->using_rdma_cm) { if (ch->rdma_cm.cm_id) rdma_disconnect(ch->rdma_cm.cm_id); } else { if (ch->ib_cm.cm_id) ret = ib_send_cm_dreq(ch->ib_cm.cm_id, NULL, 0); } if (ret < 0) { shost_printk(KERN_DEBUG, target->scsi_host, PFX "Sending CM DREQ failed\n"); } } } static int 
srp_exit_cmd_priv(struct Scsi_Host *shost, struct scsi_cmnd *cmd) { struct srp_target_port *target = host_to_target(shost); struct srp_device *dev = target->srp_host->srp_dev; struct ib_device *ibdev = dev->dev; struct srp_request *req = scsi_cmd_priv(cmd); kfree(req->fr_list); if (req->indirect_dma_addr) { ib_dma_unmap_single(ibdev, req->indirect_dma_addr, target->indirect_size, DMA_TO_DEVICE); } kfree(req->indirect_desc); return 0; } static int srp_init_cmd_priv(struct Scsi_Host *shost, struct scsi_cmnd *cmd) { struct srp_target_port *target = host_to_target(shost); struct srp_device *srp_dev = target->srp_host->srp_dev; struct ib_device *ibdev = srp_dev->dev; struct srp_request *req = scsi_cmd_priv(cmd); dma_addr_t dma_addr; int ret = -ENOMEM; if (srp_dev->use_fast_reg) { req->fr_list = kmalloc_array(target->mr_per_cmd, sizeof(void *), GFP_KERNEL); if (!req->fr_list) goto out; } req->indirect_desc = kmalloc(target->indirect_size, GFP_KERNEL); if (!req->indirect_desc) goto out; dma_addr = ib_dma_map_single(ibdev, req->indirect_desc, target->indirect_size, DMA_TO_DEVICE); if (ib_dma_mapping_error(ibdev, dma_addr)) { srp_exit_cmd_priv(shost, cmd); goto out; } req->indirect_dma_addr = dma_addr; ret = 0; out: return ret; } /** * srp_del_scsi_host_attr() - Remove attributes defined in the host template. * @shost: SCSI host whose attributes to remove from sysfs. * * Note: Any attributes defined in the host template and that did not exist * before invocation of this function will be ignored. 
*/
static void srp_del_scsi_host_attr(struct Scsi_Host *shost)
{
	const struct attribute_group **g;
	struct attribute **attr;

	/* Walk every NULL-terminated group, then every attribute in it. */
	for (g = shost->hostt->shost_groups; *g; ++g) {
		for (attr = (*g)->attrs; *attr; ++attr) {
			struct device_attribute *dev_attr =
				container_of(*attr, typeof(*dev_attr), attr);

			device_remove_file(&shost->shost_dev, dev_attr);
		}
	}
}

/*
 * Tear down a target port that has already been moved to
 * SRP_TARGET_REMOVED: remove its sysfs attributes and SCSI host, disconnect
 * and free all channels, unlink it from the host's target list and drop the
 * SCSI host reference.
 */
static void srp_remove_target(struct srp_target_port *target)
{
	struct srp_rdma_ch *ch;
	int i;

	WARN_ON_ONCE(target->state != SRP_TARGET_REMOVED);

	srp_del_scsi_host_attr(target->scsi_host);
	/* Hold an rport reference until srp_rport_put() below. */
	srp_rport_get(target->rport);
	srp_remove_host(target->scsi_host);
	scsi_remove_host(target->scsi_host);
	srp_stop_rport_timers(target->rport);
	srp_disconnect_target(target);
	kobj_ns_drop(KOBJ_NS_TYPE_NET, to_ns_common(target->net));
	for (i = 0; i < target->ch_count; i++) {
		ch = &target->ch[i];
		srp_free_ch_ib(target, ch);
	}
	/* Wait for any transport-layer error work before freeing channels. */
	cancel_work_sync(&target->tl_err_work);
	srp_rport_put(target->rport);
	kfree(target->ch);
	target->ch = NULL;

	spin_lock(&target->srp_host->target_lock);
	list_del(&target->list);
	spin_unlock(&target->srp_host->target_lock);

	scsi_host_put(target->scsi_host);
}

/* Work function for target->remove_work: removes the owning target. */
static void srp_remove_work(struct work_struct *work)
{
	struct srp_target_port *target =
		container_of(work, struct srp_target_port, remove_work);

	WARN_ON_ONCE(target->state != SRP_TARGET_REMOVED);

	srp_remove_target(target);
}

/* Queue removal of the target port stored in @rport->lld_data. */
static void srp_rport_delete(struct srp_rport *rport)
{
	struct srp_target_port *target = rport->lld_data;

	srp_queue_remove_work(target);
}

/**
 * srp_connected_ch() - number of connected channels
 * @target: SRP target port.
*/
static int srp_connected_ch(struct srp_target_port *target)
{
	int i, c = 0;

	/* ch->connected is a bool, so each connected channel adds one. */
	for (i = 0; i < target->ch_count; i++)
		c += target->ch[i].connected;

	return c;
}

/*
 * Log in on channel @ch: look up a path, send the login request and wait for
 * the CM event handler to report the outcome in ch->status. Redirect
 * rejections cause the login to be retried.
 *
 * Returns 0 on success (ch->connected is then set), a negative value on
 * failure; any positive status left in ret is reported as -ENODEV.
 */
static int srp_connect_ch(struct srp_rdma_ch *ch, uint32_t max_iu_len,
			  bool multich)
{
	struct srp_target_port *target = ch->target;
	int ret;

	/* A non-multichannel login is only expected with no channel up. */
	WARN_ON_ONCE(!multich && srp_connected_ch(target) > 0);

	ret = srp_lookup_path(ch);
	if (ret)
		goto out;

	while (1) {
		init_completion(&ch->done);
		ret = srp_send_req(ch, max_iu_len, multich);
		if (ret)
			goto out;
		ret = wait_for_completion_interruptible(&ch->done);
		if (ret < 0)
			goto out;

		/*
		 * The CM event handling code will set status to
		 * SRP_PORT_REDIRECT if we get a port redirect REJ
		 * back, or SRP_DLID_REDIRECT if we get a lid/qp
		 * redirect REJ back.
		 */
		ret = ch->status;
		switch (ret) {
		case 0:
			ch->connected = true;
			goto out;

		case SRP_PORT_REDIRECT:
			/* Look up the path again, then retry the login. */
			ret = srp_lookup_path(ch);
			if (ret)
				goto out;
			break;

		case SRP_DLID_REDIRECT:
			/* Retry the login without a new path lookup. */
			break;

		case SRP_STALE_CONN:
			shost_printk(KERN_ERR, target->scsi_host, PFX
				     "giving up on stale connection\n");
			ret = -ECONNRESET;
			goto out;

		default:
			goto out;
		}
	}

out:
	return ret <= 0 ? ret : -ENODEV;
}

/* Completion handler for local-invalidate work requests. */
static void srp_inv_rkey_err_done(struct ib_cq *cq, struct ib_wc *wc)
{
	srp_handle_qp_err(cq, wc, "INV RKEY");
}

/*
 * Post an IB_WR_LOCAL_INV work request for @rkey on the QP of @ch, using
 * @req->reg_cqe as completion entry. Returns the ib_post_send() result.
 */
static int srp_inv_rkey(struct srp_request *req, struct srp_rdma_ch *ch,
		u32 rkey)
{
	struct ib_send_wr wr = {
		.opcode		    = IB_WR_LOCAL_INV,
		.next		    = NULL,
		.num_sge	    = 0,
		.send_flags	    = 0,
		.ex.invalidate_rkey = rkey,
	};

	wr.wr_cqe = &req->reg_cqe;
	req->reg_cqe.done = srp_inv_rkey_err_done;
	return ib_post_send(ch->qp, &wr, NULL);
}

/*
 * Undo the data mapping of @scmnd: with fast registration, queue an
 * invalidate for every memory region of @req and return the descriptors to
 * the channel's pool; then DMA-unmap the scatterlist.
 *
 * Does nothing for commands without a scatterlist or with a data direction
 * other than DMA_TO_DEVICE / DMA_FROM_DEVICE.
 */
static void srp_unmap_data(struct scsi_cmnd *scmnd,
			   struct srp_rdma_ch *ch,
			   struct srp_request *req)
{
	struct srp_target_port *target = ch->target;
	struct srp_device *dev = target->srp_host->srp_dev;
	struct ib_device *ibdev = dev->dev;
	int i, res;

	if (!scsi_sglist(scmnd) ||
	    (scmnd->sc_data_direction != DMA_TO_DEVICE &&
	     scmnd->sc_data_direction != DMA_FROM_DEVICE))
		return;

	if (dev->use_fast_reg) {
		struct srp_fr_desc **pfr;

		for (i = req->nmdesc, pfr = req->fr_list; i > 0; i--, pfr++) {
			res = srp_inv_rkey(req, ch, (*pfr)->mr->rkey);
			if (res < 0) {
				/* Log, schedule error work and keep going. */
				shost_printk(KERN_ERR, target->scsi_host, PFX
				  "Queueing INV WR for rkey %#x failed (%d)\n",
				  (*pfr)->mr->rkey, res);
				queue_work(system_long_wq,
					   &target->tl_err_work);
			}
		}
		if (req->nmdesc)
			srp_fr_pool_put(ch->fr_pool, req->fr_list,
					req->nmdesc);
	}

	ib_dma_unmap_sg(ibdev, scsi_sglist(scmnd), scsi_sg_count(scmnd),
			scmnd->sc_data_direction);
}

/**
 * srp_claim_req - Take ownership of the scmnd associated with a request.
 * @ch: SRP RDMA channel.
 * @req: SRP request.
 * @sdev: If not NULL, only take ownership for this SCSI device.
 * @scmnd: If NULL, take ownership of @req->scmnd. If not NULL, only take
 *         ownership of @req->scmnd if it equals @scmnd.
 *
 * Return value:
 * Either NULL or a pointer to the SCSI command the caller became owner of.
*/ static struct scsi_cmnd *srp_claim_req(struct srp_rdma_ch *ch, struct srp_request *req, struct scsi_device *sdev, struct scsi_cmnd *scmnd) { unsigned long flags; spin_lock_irqsave(&ch->lock, flags); if (req->scmnd && (!sdev || req->scmnd->device == sdev) && (!scmnd || req->scmnd == scmnd)) { scmnd = req->scmnd; req->scmnd = NULL; } else { scmnd = NULL; } spin_unlock_irqrestore(&ch->lock, flags); return scmnd; } /** * srp_free_req() - Unmap data and adjust ch->req_lim. * @ch: SRP RDMA channel. * @req: Request to be freed. * @scmnd: SCSI command associated with @req. * @req_lim_delta: Amount to be added to @target->req_lim. */ static void srp_free_req(struct srp_rdma_ch *ch, struct srp_request *req, struct scsi_cmnd *scmnd, s32 req_lim_delta) { unsigned long flags; srp_unmap_data(scmnd, ch, req); spin_lock_irqsave(&ch->lock, flags); ch->req_lim += req_lim_delta; spin_unlock_irqrestore(&ch->lock, flags); } static void srp_finish_req(struct srp_rdma_ch *ch, struct srp_request *req, struct scsi_device *sdev, int result) { struct scsi_cmnd *scmnd = srp_claim_req(ch, req, sdev, NULL); if (scmnd) { srp_free_req(ch, req, scmnd, 0); scmnd->result = result; scsi_done(scmnd); } } struct srp_terminate_context { struct srp_target_port *srp_target; int scsi_result; }; static bool srp_terminate_cmd(struct scsi_cmnd *scmnd, void *context_ptr) { struct srp_terminate_context *context = context_ptr; struct srp_target_port *target = context->srp_target; u32 tag = blk_mq_unique_tag(scsi_cmd_to_rq(scmnd)); struct srp_rdma_ch *ch = &target->ch[blk_mq_unique_tag_to_hwq(tag)]; struct srp_request *req = scsi_cmd_priv(scmnd); srp_finish_req(ch, req, NULL, context->scsi_result); return true; } static void srp_terminate_io(struct srp_rport *rport) { struct srp_target_port *target = rport->lld_data; struct srp_terminate_context context = { .srp_target = target, .scsi_result = DID_TRANSPORT_FAILFAST << 16 }; scsi_host_busy_iter(target->scsi_host, srp_terminate_cmd, &context); } /* Calculate 
maximum initiator to target information unit length. */ static uint32_t srp_max_it_iu_len(int cmd_sg_cnt, bool use_imm_data, uint32_t max_it_iu_size) { uint32_t max_iu_len = sizeof(struct srp_cmd) + SRP_MAX_ADD_CDB_LEN + sizeof(struct srp_indirect_buf) + cmd_sg_cnt * sizeof(struct srp_direct_buf); if (use_imm_data) max_iu_len = max(max_iu_len, SRP_IMM_DATA_OFFSET + srp_max_imm_data); if (max_it_iu_size) max_iu_len = min(max_iu_len, max_it_iu_size); pr_debug("max_iu_len = %d\n", max_iu_len); return max_iu_len; } /* * It is up to the caller to ensure that srp_rport_reconnect() calls are * serialized and that no concurrent srp_queuecommand(), srp_abort(), * srp_reset_device() or srp_reset_host() calls will occur while this function * is in progress. One way to realize that is not to call this function * directly but to call srp_reconnect_rport() instead since that last function * serializes calls of this function via rport->mutex and also blocks * srp_queuecommand() calls before invoking this function. */ static int srp_rport_reconnect(struct srp_rport *rport) { struct srp_target_port *target = rport->lld_data; struct srp_rdma_ch *ch; uint32_t max_iu_len = srp_max_it_iu_len(target->cmd_sg_cnt, srp_use_imm_data, target->max_it_iu_size); int i, j, ret = 0; bool multich = false; srp_disconnect_target(target); if (target->state == SRP_TARGET_SCANNING) return -ENODEV; /* * Now get a new local CM ID so that we avoid confusing the target in * case things are really fouled up. Doing so also ensures that all CM * callbacks will have finished before a new QP is allocated. */ for (i = 0; i < target->ch_count; i++) { ch = &target->ch[i]; ret += srp_new_cm_id(ch); } { struct srp_terminate_context context = { .srp_target = target, .scsi_result = DID_RESET << 16}; scsi_host_busy_iter(target->scsi_host, srp_terminate_cmd, &context); } for (i = 0; i < target->ch_count; i++) { ch = &target->ch[i]; /* * Whether or not creating a new CM ID succeeded, create a new * QP. 
This guarantees that all completion callback function * invocations have finished before request resetting starts. */ ret += srp_create_ch_ib(ch); INIT_LIST_HEAD(&ch->free_tx); for (j = 0; j < target->queue_size; ++j) list_add(&ch->tx_ring[j]->list, &ch->free_tx); } target->qp_in_error = false; for (i = 0; i < target->ch_count; i++) { ch = &target->ch[i]; if (ret) break; ret = srp_connect_ch(ch, max_iu_len, multich); multich = true; } if (ret == 0) shost_printk(KERN_INFO, target->scsi_host, PFX "reconnect succeeded\n"); return ret; } static void srp_map_desc(struct srp_map_state *state, dma_addr_t dma_addr, unsigned int dma_len, u32 rkey) { struct srp_direct_buf *desc = state->desc; WARN_ON_ONCE(!dma_len); desc->va = cpu_to_be64(dma_addr); desc->key = cpu_to_be32(rkey); desc->len = cpu_to_be32(dma_len); state->total_len += dma_len; state->desc++; state->ndesc++; } static void srp_reg_mr_err_done(struct ib_cq *cq, struct ib_wc *wc) { srp_handle_qp_err(cq, wc, "FAST REG"); } /* * Map up to sg_nents elements of state->sg where *sg_offset_p is the offset * where to start in the first element. If sg_offset_p != NULL then * *sg_offset_p is updated to the offset in state->sg[retval] of the first * byte that has not yet been mapped. */ static int srp_map_finish_fr(struct srp_map_state *state, struct srp_request *req, struct srp_rdma_ch *ch, int sg_nents, unsigned int *sg_offset_p) { struct srp_target_port *target = ch->target; struct srp_device *dev = target->srp_host->srp_dev; struct ib_reg_wr wr; struct srp_fr_desc *desc; u32 rkey; int n, err; if (state->fr.next >= state->fr.end) { shost_printk(KERN_ERR, ch->target->scsi_host, PFX "Out of MRs (mr_per_cmd = %d)\n", ch->target->mr_per_cmd); return -ENOMEM; } WARN_ON_ONCE(!dev->use_fast_reg); if (sg_nents == 1 && target->global_rkey) { unsigned int sg_offset = sg_offset_p ? 
*sg_offset_p : 0;

		srp_map_desc(state, sg_dma_address(state->sg) + sg_offset,
			     sg_dma_len(state->sg) - sg_offset,
			     target->global_rkey);
		if (sg_offset_p)
			*sg_offset_p = 0;
		return 1;
	}

	desc = srp_fr_pool_get(ch->fr_pool);
	if (!desc)
		return -ENOMEM;

	/* Give the MR a new key before it is registered again. */
	rkey = ib_inc_rkey(desc->mr->rkey);
	ib_update_fast_reg_key(desc->mr, rkey);

	n = ib_map_mr_sg(desc->mr, state->sg, sg_nents, sg_offset_p,
			 dev->mr_page_size);
	if (unlikely(n < 0)) {
		/* Mapping failed: hand the descriptor back to the pool. */
		srp_fr_pool_put(ch->fr_pool, &desc, 1);
		pr_debug("%s: ib_map_mr_sg(%d, %d) returned %d.\n",
			 dev_name(&req->scmnd->device->sdev_gendev), sg_nents,
			 sg_offset_p ? *sg_offset_p : -1, n);
		return n;
	}

	WARN_ON_ONCE(desc->mr->length == 0);

	req->reg_cqe.done = srp_reg_mr_err_done;

	wr.wr.next = NULL;
	wr.wr.opcode = IB_WR_REG_MR;
	wr.wr.wr_cqe = &req->reg_cqe;
	wr.wr.num_sge = 0;
	wr.wr.send_flags = 0;
	wr.mr = desc->mr;
	wr.key = desc->mr->rkey;
	wr.access = (IB_ACCESS_LOCAL_WRITE |
		     IB_ACCESS_REMOTE_READ |
		     IB_ACCESS_REMOTE_WRITE);

	/*
	 * Record the descriptor in the request before posting, so that it is
	 * accounted for in state->nmdesc even if ib_post_send() fails.
	 */
	*state->fr.next++ = desc;
	state->nmdesc++;

	srp_map_desc(state, desc->mr->iova,
		     desc->mr->length, desc->mr->rkey);

	err = ib_post_send(ch->qp, &wr.wr, NULL);
	if (unlikely(err)) {
		WARN_ON_ONCE(err == -ENOMEM);
		return err;
	}

	/* Number of scatterlist elements consumed by this registration. */
	return n;
}

/*
 * Map @count elements of @scat using fast registration, calling
 * srp_map_finish_fr() repeatedly until all elements have been consumed.
 *
 * Returns 0 on success or the negative value returned by
 * srp_map_finish_fr().
 */
static int srp_map_sg_fr(struct srp_map_state *state, struct srp_rdma_ch *ch,
			 struct srp_request *req, struct scatterlist *scat,
			 int count)
{
	unsigned int sg_offset = 0;

	state->fr.next = req->fr_list;
	state->fr.end = req->fr_list + ch->target->mr_per_cmd;
	state->sg = scat;

	if (count == 0)
		return 0;

	while (count) {
		int i, n;

		n = srp_map_finish_fr(state, req, ch, count, &sg_offset);
		if (unlikely(n < 0))
			return n;

		/* Skip the n elements that were just mapped. */
		count -= n;
		for (i = 0; i < n; i++)
			state->sg = sg_next(state->sg);
	}

	return 0;
}

/*
 * Map @count elements of @scat without memory registration: one direct
 * descriptor per element, all using target->global_rkey. Always returns 0.
 */
static int srp_map_sg_dma(struct srp_map_state *state, struct srp_rdma_ch *ch,
			  struct srp_request *req, struct scatterlist *scat,
			  int count)
{
	struct srp_target_port *target = ch->target;
	struct scatterlist *sg;
	int i;

	for_each_sg(scat, sg, count, i) {
		srp_map_desc(state, sg_dma_address(sg), sg_dma_len(sg),
			     target->global_rkey);
	}

	return 0;
}

/*
 * Register the indirect data buffer descriptor with the HCA.
 *
 * Note: since the indirect data buffer descriptor has been allocated with
 * kmalloc() it is guaranteed that this buffer is a physically contiguous
 * memory buffer.
 *
 * On success stores the resulting (big-endian) key in *@idb_rkey and returns
 * 0. Returns -EINVAL if the device does not use fast registration, or the
 * negative value returned by srp_map_finish_fr().
 */
static int srp_map_idb(struct srp_rdma_ch *ch, struct srp_request *req,
		       void **next_mr, void **end_mr, u32 idb_len,
		       __be32 *idb_rkey)
{
	struct srp_target_port *target = ch->target;
	struct srp_device *dev = target->srp_host->srp_dev;
	struct srp_map_state state;
	struct srp_direct_buf idb_desc;
	struct scatterlist idb_sg[1];
	int ret;

	memset(&state, 0, sizeof(state));
	memset(&idb_desc, 0, sizeof(idb_desc));
	state.gen.next = next_mr;
	state.gen.end = end_mr;
	/* The single resulting descriptor is written to idb_desc. */
	state.desc = &idb_desc;
	state.base_dma_addr = req->indirect_dma_addr;
	state.dma_len = idb_len;

	if (dev->use_fast_reg) {
		state.sg = idb_sg;
		sg_init_one(idb_sg, req->indirect_desc, idb_len);
		idb_sg->dma_address = req->indirect_dma_addr; /* hack! */
#ifdef CONFIG_NEED_SG_DMA_LENGTH
		idb_sg->dma_length = idb_sg->length;	      /* hack^2 */
#endif
		ret = srp_map_finish_fr(&state, req, ch, 1, NULL);
		if (ret < 0)
			return ret;
		WARN_ON_ONCE(ret < 1);
	} else {
		return -EINVAL;
	}

	*idb_rkey = idb_desc.key;

	return 0;
}

/*
 * Debug aid: compare the SCSI buffer length of @req->scmnd with the summed
 * descriptor lengths and, for fast registration, the summed MR lengths, and
 * log an error if they are inconsistent.
 */
static void srp_check_mapping(struct srp_map_state *state,
			      struct srp_rdma_ch *ch, struct srp_request *req,
			      struct scatterlist *scat, int count)
{
	struct srp_device *dev = ch->target->srp_host->srp_dev;
	struct srp_fr_desc **pfr;
	u64 desc_len = 0, mr_len = 0;
	int i;

	for (i = 0; i < state->ndesc; i++)
		desc_len += be32_to_cpu(req->indirect_desc[i].len);
	if (dev->use_fast_reg)
		for (i = 0, pfr = req->fr_list; i < state->nmdesc; i++, pfr++)
			mr_len += (*pfr)->mr->length;
	/* Descriptors must cover the buffer exactly; MRs may cover less. */
	if (desc_len != scsi_bufflen(req->scmnd) ||
	    mr_len > scsi_bufflen(req->scmnd))
		pr_err("Inconsistent: scsi len %d <> desc len %lld <> mr len %lld; ndesc %d; nmdesc = %d\n",
		       scsi_bufflen(req->scmnd), desc_len, mr_len,
		       state->ndesc, state->nmdesc);
}

/**
 * srp_map_data() - map SCSI data buffer onto an SRP
request
 * @scmnd: SCSI command to map
 * @ch: SRP RDMA channel
 * @req: SRP request
 *
 * Returns the length in bytes of the SRP_CMD IU or a negative value if
 * mapping failed. The size of any immediate data is not included in the
 * return value.
 */
static int srp_map_data(struct scsi_cmnd *scmnd, struct srp_rdma_ch *ch,
			struct srp_request *req)
{
	struct srp_target_port *target = ch->target;
	struct scatterlist *scat, *sg;
	struct srp_cmd *cmd = req->cmd->buf;
	int i, len, nents, count, ret;
	struct srp_device *dev;
	struct ib_device *ibdev;
	struct srp_map_state state;
	struct srp_indirect_buf *indirect_hdr;
	u64 data_len;
	u32 idb_len, table_len;
	__be32 idb_rkey;
	u8 fmt;

	/* The IU itself always occupies the first SGE. */
	req->cmd->num_sge = 1;

	/* No data to transfer: the IU is just the command header. */
	if (!scsi_sglist(scmnd) || scmnd->sc_data_direction == DMA_NONE)
		return sizeof(struct srp_cmd) + cmd->add_cdb_len;

	if (scmnd->sc_data_direction != DMA_FROM_DEVICE &&
	    scmnd->sc_data_direction != DMA_TO_DEVICE) {
		shost_printk(KERN_WARNING, target->scsi_host,
			     PFX "Unhandled data direction %d\n",
			     scmnd->sc_data_direction);
		return -EINVAL;
	}

	nents = scsi_sg_count(scmnd);
	scat = scsi_sglist(scmnd);
	data_len = scsi_bufflen(scmnd);

	dev = target->srp_host->srp_dev;
	ibdev = dev->dev;

	count = ib_dma_map_sg(ibdev, scat, nents, scmnd->sc_data_direction);
	if (unlikely(count == 0))
		return -EIO;

	/*
	 * Immediate data: only for writes, when the channel negotiated it,
	 * and when both the SGE count and the total length fit.
	 */
	if (ch->use_imm_data &&
	    count <= ch->max_imm_sge &&
	    SRP_IMM_DATA_OFFSET + data_len <= ch->max_it_iu_len &&
	    scmnd->sc_data_direction == DMA_TO_DEVICE) {
		struct srp_imm_buf *buf;
		struct ib_sge *sge = &req->cmd->sge[1];

		fmt = SRP_DATA_DESC_IMM;
		len = SRP_IMM_DATA_OFFSET;
		req->nmdesc = 0;
		buf = (void *)cmd->add_data + cmd->add_cdb_len;
		buf->len = cpu_to_be32(data_len);
		WARN_ON_ONCE((void *)(buf + 1) > (void *)cmd + len);
		for_each_sg(scat, sg, count, i) {
			sge[i].addr   = sg_dma_address(sg);
			sge[i].length = sg_dma_len(sg);
			sge[i].lkey   = target->lkey;
		}
		req->cmd->num_sge += count;
		goto map_complete;
	}

	/* Default to a direct descriptor; changed below if one is not enough. */
	fmt = SRP_DATA_DESC_DIRECT;
	len = sizeof(struct srp_cmd) + cmd->add_cdb_len +
		sizeof(struct srp_direct_buf);

	if (count == 1 && target->global_rkey) {
		/*
		 * The midlayer only generated a single gather/scatter
		 * entry, or DMA mapping coalesced everything to a
		 * single entry.  So a direct descriptor along with
		 * the DMA MR suffices.
		 */
		struct srp_direct_buf *buf;

		buf = (void *)cmd->add_data + cmd->add_cdb_len;
		buf->va  = cpu_to_be64(sg_dma_address(scat));
		buf->key = cpu_to_be32(target->global_rkey);
		buf->len = cpu_to_be32(sg_dma_len(scat));

		req->nmdesc = 0;
		goto map_complete;
	}

	/*
	 * We have more than one scatter/gather entry, so build our indirect
	 * descriptor table, trying to merge as many entries as we can.
	 */
	indirect_hdr = (void *)cmd->add_data + cmd->add_cdb_len;

	ib_dma_sync_single_for_cpu(ibdev, req->indirect_dma_addr,
				   target->indirect_size, DMA_TO_DEVICE);

	memset(&state, 0, sizeof(state));
	state.desc = req->indirect_desc;
	if (dev->use_fast_reg)
		ret = srp_map_sg_fr(&state, ch, req, scat, count);
	else
		ret = srp_map_sg_dma(&state, ch, req, scat, count);
	/* Record the MR count before the error check so unmap sees it. */
	req->nmdesc = state.nmdesc;
	if (ret < 0)
		goto unmap;

	{
		DEFINE_DYNAMIC_DEBUG_METADATA(ddm,
			"Memory mapping consistency check");
		if (DYNAMIC_DEBUG_BRANCH(ddm))
			srp_check_mapping(&state, ch, req, scat, count);
	}

	/* We've mapped the request, now pull as much of the indirect
	 * descriptor table as we can into the command buffer. If this
	 * target is not using an external indirect table, we are
	 * guaranteed to fit into the command, as the SCSI layer won't
	 * give us more S/G entries than we allow.
	 */
	if (state.ndesc == 1) {
		/*
		 * Memory registration collapsed the sg-list into one entry,
		 * so use a direct descriptor.
		 */
		struct srp_direct_buf *buf;

		buf = (void *)cmd->add_data + cmd->add_cdb_len;
		*buf = req->indirect_desc[0];
		goto map_complete;
	}

	if (unlikely(target->cmd_sg_cnt < state.ndesc &&
						!target->allow_ext_sg)) {
		shost_printk(KERN_ERR, target->scsi_host,
			     "Could not fit S/G list into SRP_CMD\n");
		ret = -EIO;
		goto unmap;
	}

	/* count: descriptors copied into the IU; table_len: the full table. */
	count = min(state.ndesc, target->cmd_sg_cnt);
	table_len = state.ndesc * sizeof (struct srp_direct_buf);
	idb_len = sizeof(struct srp_indirect_buf) + table_len;

	fmt = SRP_DATA_DESC_INDIRECT;
	len = sizeof(struct srp_cmd) + cmd->add_cdb_len +
		sizeof(struct srp_indirect_buf);
	len += count * sizeof (struct srp_direct_buf);

	memcpy(indirect_hdr->desc_list, req->indirect_desc,
	       count * sizeof (struct srp_direct_buf));

	/* Without a global rkey the table itself has to be registered. */
	if (!target->global_rkey) {
		ret = srp_map_idb(ch, req, state.gen.next, state.gen.end,
				  idb_len, &idb_rkey);
		if (ret < 0)
			goto unmap;
		req->nmdesc++;
	} else {
		idb_rkey = cpu_to_be32(target->global_rkey);
	}

	indirect_hdr->table_desc.va = cpu_to_be64(req->indirect_dma_addr);
	indirect_hdr->table_desc.key = idb_rkey;
	indirect_hdr->table_desc.len = cpu_to_be32(table_len);
	indirect_hdr->len = cpu_to_be32(state.total_len);

	if (scmnd->sc_data_direction == DMA_TO_DEVICE)
		cmd->data_out_desc_cnt = count;
	else
		cmd->data_in_desc_cnt = count;

	ib_dma_sync_single_for_device(ibdev, req->indirect_dma_addr, table_len,
				      DMA_TO_DEVICE);

map_complete:
	/* Data-out format goes in the upper nibble, data-in in the lower. */
	if (scmnd->sc_data_direction == DMA_TO_DEVICE)
		cmd->buf_fmt = fmt << 4;
	else
		cmd->buf_fmt = fmt;

	return len;

unmap:
	srp_unmap_data(scmnd, ch, req);
	/* Report -E2BIG instead of -ENOMEM once the MR pool size is reached. */
	if (ret == -ENOMEM && req->nmdesc >= target->mr_pool_size)
		ret = -E2BIG;
	return ret;
}

/*
 * Return an IU and possible credit to the free pool
 */
static void srp_put_tx_iu(struct srp_rdma_ch *ch, struct srp_iu *iu,
			  enum srp_iu_type iu_type)
{
	unsigned long flags;

	spin_lock_irqsave(&ch->lock, flags);
	list_add(&iu->list, &ch->free_tx);
	/* Responses never consumed a credit, so none is given back. */
	if (iu_type != SRP_IU_RSP)
		++ch->req_lim;
	spin_unlock_irqrestore(&ch->lock, flags);
}

/*
 * Must be called with ch->lock held to protect req_lim and free_tx.
* If IU is not sent, it must be returned using srp_put_tx_iu().
 *
 * Note:
 * An upper limit for the number of allocated information units for each
 * request type is:
 * - SRP_IU_CMD: SRP_CMD_SQ_SIZE, since the SCSI mid-layer never queues
 *   more than Scsi_Host.can_queue requests.
 * - SRP_IU_TSK_MGMT: SRP_TSK_MGMT_SQ_SIZE.
 * - SRP_IU_RSP: 1, since a conforming SRP target never sends more than
 *   one unanswered SRP request to an initiator.
 *
 * Returns NULL if no free IU is available or if the remaining credits do not
 * exceed the reserve for this IU type.
 */
static struct srp_iu *__srp_get_tx_iu(struct srp_rdma_ch *ch,
				      enum srp_iu_type iu_type)
{
	struct srp_target_port *target = ch->target;
	/* Every type except task management leaves a reserve of credits. */
	s32 rsv = (iu_type == SRP_IU_TSK_MGMT) ? 0 : SRP_TSK_MGMT_SQ_SIZE;
	struct srp_iu *iu;

	lockdep_assert_held(&ch->lock);

	/* Process pending send completions before looking at free_tx. */
	ib_process_cq_direct(ch->send_cq, -1);

	if (list_empty(&ch->free_tx))
		return NULL;

	/* Initiator responses to target requests do not consume credits */
	if (iu_type != SRP_IU_RSP) {
		if (ch->req_lim <= rsv) {
			++target->zero_req_lim;
			return NULL;
		}

		--ch->req_lim;
	}

	iu = list_first_entry(&ch->free_tx, struct srp_iu, list);
	list_del(&iu->list);
	return iu;
}

/*
 * Note: if this function is called from inside ib_drain_sq() then it will
 * be called without ch->lock being held. If ib_drain_sq() dequeues a WQE
 * with status IB_WC_SUCCESS then that's a bug.
 */
static void srp_send_done(struct ib_cq *cq, struct ib_wc *wc)
{
	struct srp_iu *iu = container_of(wc->wr_cqe, struct srp_iu, cqe);
	struct srp_rdma_ch *ch = cq->cq_context;

	if (unlikely(wc->status != IB_WC_SUCCESS)) {
		srp_handle_qp_err(cq, wc, "SEND");
		return;
	}

	lockdep_assert_held(&ch->lock);

	/* The send completed: the IU can be reused. */
	list_add(&iu->list, &ch->free_tx);
}

/**
 * srp_post_send() - send an SRP information unit
 * @ch: RDMA channel over which to send the information unit.
 * @iu: Information unit to send.
 * @len: Length of the information unit excluding immediate data.
*/ static int srp_post_send(struct srp_rdma_ch *ch, struct srp_iu *iu, int len) { struct srp_target_port *target = ch->target; struct ib_send_wr wr; if (WARN_ON_ONCE(iu->num_sge > SRP_MAX_SGE)) return -EINVAL; iu->sge[0].addr = iu->dma; iu->sge[0].length = len; iu->sge[0].lkey = target->lkey; iu->cqe.done = srp_send_done; wr.next = NULL; wr.wr_cqe = &iu->cqe; wr.sg_list = &iu->sge[0]; wr.num_sge = iu->num_sge; wr.opcode = IB_WR_SEND; wr.send_flags = IB_SEND_SIGNALED; return ib_post_send(ch->qp, &wr, NULL); } static int srp_post_recv(struct srp_rdma_ch *ch, struct srp_iu *iu) { struct srp_target_port *target = ch->target; struct ib_recv_wr wr; struct ib_sge list; list.addr = iu->dma; list.length = iu->size; list.lkey = target->lkey; iu->cqe.done = srp_recv_done; wr.next = NULL; wr.wr_cqe = &iu->cqe; wr.sg_list = &list; wr.num_sge = 1; return ib_post_recv(ch->qp, &wr, NULL); } static void srp_process_rsp(struct srp_rdma_ch *ch, struct srp_rsp *rsp) { struct srp_target_port *target = ch->target; struct srp_request *req; struct scsi_cmnd *scmnd; unsigned long flags; if (unlikely(rsp->tag & SRP_TAG_TSK_MGMT)) { spin_lock_irqsave(&ch->lock, flags); ch->req_lim += be32_to_cpu(rsp->req_lim_delta); if (rsp->tag == ch->tsk_mgmt_tag) { ch->tsk_mgmt_status = -1; if (be32_to_cpu(rsp->resp_data_len) >= 4) ch->tsk_mgmt_status = rsp->data[3]; complete(&ch->tsk_mgmt_done); } else { shost_printk(KERN_ERR, target->scsi_host, "Received tsk mgmt response too late for tag %#llx\n", rsp->tag); } spin_unlock_irqrestore(&ch->lock, flags); } else { scmnd = scsi_host_find_tag(target->scsi_host, rsp->tag); if (scmnd) { req = scsi_cmd_priv(scmnd); scmnd = srp_claim_req(ch, req, NULL, scmnd); } if (!scmnd) { shost_printk(KERN_ERR, target->scsi_host, "Null scmnd for RSP w/tag %#016llx received on ch %td / QP %#x\n", rsp->tag, ch - target->ch, ch->qp->qp_num); spin_lock_irqsave(&ch->lock, flags); ch->req_lim += be32_to_cpu(rsp->req_lim_delta); spin_unlock_irqrestore(&ch->lock, flags); return; } 
scmnd->result = rsp->status; if (rsp->flags & SRP_RSP_FLAG_SNSVALID) { memcpy(scmnd->sense_buffer, rsp->data + be32_to_cpu(rsp->resp_data_len), min_t(int, be32_to_cpu(rsp->sense_data_len), SCSI_SENSE_BUFFERSIZE)); } if (unlikely(rsp->flags & SRP_RSP_FLAG_DIUNDER)) scsi_set_resid(scmnd, be32_to_cpu(rsp->data_in_res_cnt)); else if (unlikely(rsp->flags & SRP_RSP_FLAG_DOUNDER)) scsi_set_resid(scmnd, be32_to_cpu(rsp->data_out_res_cnt)); srp_free_req(ch, req, scmnd, be32_to_cpu(rsp->req_lim_delta)); scsi_done(scmnd); } } static int srp_response_common(struct srp_rdma_ch *ch, s32 req_delta, void *rsp, int len) { struct srp_target_port *target = ch->target; struct ib_device *dev = target->srp_host->srp_dev->dev; unsigned long flags; struct srp_iu *iu; int err; spin_lock_irqsave(&ch->lock, flags); ch->req_lim += req_delta; iu = __srp_get_tx_iu(ch, SRP_IU_RSP); spin_unlock_irqrestore(&ch->lock, flags); if (!iu) { shost_printk(KERN_ERR, target->scsi_host, PFX "no IU available to send response\n"); return 1; } iu->num_sge = 1; ib_dma_sync_single_for_cpu(dev, iu->dma, len, DMA_TO_DEVICE); memcpy(iu->buf, rsp, len); ib_dma_sync_single_for_device(dev, iu->dma, len, DMA_TO_DEVICE); err = srp_post_send(ch, iu, len); if (err) { shost_printk(KERN_ERR, target->scsi_host, PFX "unable to post response: %d\n", err); srp_put_tx_iu(ch, iu, SRP_IU_RSP); } return err; } static void srp_process_cred_req(struct srp_rdma_ch *ch, struct srp_cred_req *req) { struct srp_cred_rsp rsp = { .opcode = SRP_CRED_RSP, .tag = req->tag, }; s32 delta = be32_to_cpu(req->req_lim_delta); if (srp_response_common(ch, delta, &rsp, sizeof(rsp))) shost_printk(KERN_ERR, ch->target->scsi_host, PFX "problems processing SRP_CRED_REQ\n"); } static void srp_process_aer_req(struct srp_rdma_ch *ch, struct srp_aer_req *req) { struct srp_target_port *target = ch->target; struct srp_aer_rsp rsp = { .opcode = SRP_AER_RSP, .tag = req->tag, }; s32 delta = be32_to_cpu(req->req_lim_delta); shost_printk(KERN_ERR, target->scsi_host, 
PFX "ignoring AER for LUN %llu\n", scsilun_to_int(&req->lun));

	if (srp_response_common(ch, delta, &rsp, sizeof(rsp)))
		shost_printk(KERN_ERR, target->scsi_host, PFX
			     "problems processing SRP_AER_REQ\n");
}

/*
 * Receive completion handler: dispatch the received information unit by its
 * SRP opcode (the first byte of the buffer) and post the IU again as a
 * receive buffer.
 */
static void srp_recv_done(struct ib_cq *cq, struct ib_wc *wc)
{
	struct srp_iu *iu = container_of(wc->wr_cqe, struct srp_iu, cqe);
	struct srp_rdma_ch *ch = cq->cq_context;
	struct srp_target_port *target = ch->target;
	struct ib_device *dev = target->srp_host->srp_dev->dev;
	int res;
	u8 opcode;

	if (unlikely(wc->status != IB_WC_SUCCESS)) {
		srp_handle_qp_err(cq, wc, "RECV");
		return;
	}

	ib_dma_sync_single_for_cpu(dev, iu->dma, ch->max_ti_iu_len,
				   DMA_FROM_DEVICE);

	opcode = *(u8 *) iu->buf;

	/* Compiled-out debugging aid: dump every received IU. */
	if (0) {
		shost_printk(KERN_ERR, target->scsi_host,
			     PFX "recv completion, opcode 0x%02x\n", opcode);
		print_hex_dump(KERN_ERR, "", DUMP_PREFIX_OFFSET, 8, 1,
			       iu->buf, wc->byte_len, true);
	}

	switch (opcode) {
	case SRP_RSP:
		srp_process_rsp(ch, iu->buf);
		break;

	case SRP_CRED_REQ:
		srp_process_cred_req(ch, iu->buf);
		break;

	case SRP_AER_REQ:
		srp_process_aer_req(ch, iu->buf);
		break;

	case SRP_T_LOGOUT:
		/* XXX Handle target logout */
		shost_printk(KERN_WARNING, target->scsi_host,
			     PFX "Got target logout request\n");
		break;

	default:
		shost_printk(KERN_WARNING, target->scsi_host,
			     PFX "Unhandled SRP opcode 0x%02x\n", opcode);
		break;
	}

	ib_dma_sync_single_for_device(dev, iu->dma, ch->max_ti_iu_len,
				      DMA_FROM_DEVICE);

	/* Give the buffer back to the receive queue; a failure is only logged. */
	res = srp_post_recv(ch, iu);
	if (res != 0)
		shost_printk(KERN_ERR, target->scsi_host,
			     PFX "Recv failed with error code %d\n", res);
}

/**
 * srp_tl_err_work() - handle a transport layer error
 * @work: Work structure embedded in an SRP target port.
 *
 * Note: This function may get invoked before the rport has been created,
 * hence the target->rport test.
*/
static void srp_tl_err_work(struct work_struct *work)
{
	struct srp_target_port *target;

	target = container_of(work, struct srp_target_port, tl_err_work);
	if (target->rport)
		srp_start_tl_fail_timers(target->rport);
}

/*
 * Common handler for failed work completions. The first failure seen on a
 * connected channel is logged and schedules target->tl_err_work; the target
 * is then flagged as having a QP in error, which suppresses further reports.
 */
static void srp_handle_qp_err(struct ib_cq *cq, struct ib_wc *wc,
		const char *opname)
{
	struct srp_rdma_ch *ch = cq->cq_context;
	struct srp_target_port *target = ch->target;

	if (ch->connected && !target->qp_in_error) {
		shost_printk(KERN_ERR, target->scsi_host,
			     PFX "failed %s status %s (%d) for CQE %p\n",
			     opname, ib_wc_status_msg(wc->status), wc->status,
			     wc->wr_cqe);
		queue_work(system_long_wq, &target->tl_err_work);
	}
	target->qp_in_error = true;
}

/*
 * Queue a SCSI command: obtain a transmit IU on the channel selected by the
 * block-layer tag, build the SRP_CMD, map the data buffer and post the send.
 *
 * Returns 0 if the command was sent or was completed with an error result,
 * and SCSI_MLQUEUE_HOST_BUSY if it failed without a result having been set.
 */
static enum scsi_qc_status srp_queuecommand(struct Scsi_Host *shost,
					    struct scsi_cmnd *scmnd)
{
	struct request *rq = scsi_cmd_to_rq(scmnd);
	struct srp_target_port *target = host_to_target(shost);
	struct srp_rdma_ch *ch;
	struct srp_request *req = scsi_cmd_priv(scmnd);
	struct srp_iu *iu;
	struct srp_cmd *cmd;
	struct ib_device *dev;
	unsigned long flags;
	u32 tag;
	int len, ret;

	scmnd->result = srp_chkready(target->rport);
	if (unlikely(scmnd->result))
		goto err;

	WARN_ON_ONCE(rq->tag < 0);
	tag = blk_mq_unique_tag(rq);
	ch = &target->ch[blk_mq_unique_tag_to_hwq(tag)];

	spin_lock_irqsave(&ch->lock, flags);
	iu = __srp_get_tx_iu(ch, SRP_IU_CMD);
	spin_unlock_irqrestore(&ch->lock, flags);

	/* scmnd->result is still 0 here, so this ends up as HOST_BUSY. */
	if (!iu)
		goto err;

	dev = target->srp_host->srp_dev->dev;
	ib_dma_sync_single_for_cpu(dev, iu->dma, ch->max_it_iu_len,
				   DMA_TO_DEVICE);

	cmd = iu->buf;
	memset(cmd, 0, sizeof *cmd);

	cmd->opcode = SRP_CMD;
	int_to_scsilun(scmnd->device->lun, &cmd->lun);
	cmd->tag    = tag;
	memcpy(cmd->cdb, scmnd->cmnd, scmnd->cmd_len);
	if (unlikely(scmnd->cmd_len > sizeof(cmd->cdb))) {
		/* Length of the CDB part beyond cmd->cdb, rounded up to 4. */
		cmd->add_cdb_len = round_up(scmnd->cmd_len - sizeof(cmd->cdb),
					    4);
		if (WARN_ON_ONCE(cmd->add_cdb_len > SRP_MAX_ADD_CDB_LEN))
			goto err_iu;
	}

	req->scmnd    = scmnd;
	req->cmd      = iu;

	len = srp_map_data(scmnd, ch, req);
	if (len < 0) {
		shost_printk(KERN_ERR, target->scsi_host,
			     PFX "Failed to map data (%d)\n", len);
		/*
		 * If we ran out of memory descriptors (-ENOMEM) because an
		 * application is queuing many requests with more than
		 * max_pages_per_mr sg-list elements, tell the SCSI mid-layer
		 * to reduce queue depth temporarily.
		 */
		scmnd->result = len == -ENOMEM ?
			DID_OK << 16 | SAM_STAT_TASK_SET_FULL : DID_ERROR << 16;
		goto err_iu;
	}

	ib_dma_sync_single_for_device(dev, iu->dma, ch->max_it_iu_len,
				      DMA_TO_DEVICE);

	if (srp_post_send(ch, iu, len)) {
		shost_printk(KERN_ERR, target->scsi_host, PFX "Send failed\n");
		scmnd->result = DID_ERROR << 16;
		goto err_unmap;
	}

	return 0;

err_unmap:
	srp_unmap_data(scmnd, ch, req);

err_iu:
	srp_put_tx_iu(ch, iu, SRP_IU_CMD);

	/*
	 * Avoid that the loops that iterate over the request ring can
	 * encounter a dangling SCSI command pointer.
	 */
	req->scmnd = NULL;

err:
	/* A non-zero result means the command is completed right here. */
	if (scmnd->result) {
		scsi_done(scmnd);
		ret = 0;
	} else {
		ret = SCSI_MLQUEUE_HOST_BUSY;
	}

	return ret;
}

/*
 * Note: the resources allocated in this function are freed in
 * srp_free_ch_ib().
 *
 * Allocates target->queue_size receive and transmit IUs for @ch and puts the
 * transmit IUs on ch->free_tx. Returns 0 on success or -ENOMEM; on failure
 * all IUs and both ring arrays are freed and the ring pointers reset to NULL.
 */
static int srp_alloc_iu_bufs(struct srp_rdma_ch *ch)
{
	struct srp_target_port *target = ch->target;
	int i;

	ch->rx_ring = kzalloc_objs(*ch->rx_ring, target->queue_size);
	if (!ch->rx_ring)
		goto err_no_ring;
	ch->tx_ring = kzalloc_objs(*ch->tx_ring, target->queue_size);
	if (!ch->tx_ring)
		goto err_no_ring;

	for (i = 0; i < target->queue_size; ++i) {
		ch->rx_ring[i] = srp_alloc_iu(target->srp_host,
					      ch->max_ti_iu_len,
					      GFP_KERNEL, DMA_FROM_DEVICE);
		if (!ch->rx_ring[i])
			goto err;
	}

	for (i = 0; i < target->queue_size; ++i) {
		ch->tx_ring[i] = srp_alloc_iu(target->srp_host,
					      ch->max_it_iu_len,
					      GFP_KERNEL, DMA_TO_DEVICE);
		if (!ch->tx_ring[i])
			goto err;

		list_add(&ch->tx_ring[i]->list, &ch->free_tx);
	}

	return 0;

err:
	/*
	 * Frees every slot of both rings, including slots that were never
	 * filled. NOTE(review): this presumes the rings are zero-filled and
	 * that srp_free_iu() accepts NULL - confirm against their definitions.
	 */
	for (i = 0; i < target->queue_size; ++i) {
		srp_free_iu(target->srp_host, ch->rx_ring[i]);
		srp_free_iu(target->srp_host, ch->tx_ring[i]);
	}


err_no_ring:
	kfree(ch->tx_ring);
	ch->tx_ring = NULL;
	kfree(ch->rx_ring);
	ch->rx_ring = NULL;

	return -ENOMEM;
}

/*
 * Compute, from the timeout and retry count in the QP attributes, how many
 * jiffies to wait before giving up on a request (see comments in the body).
 */
static uint32_t srp_compute_rq_tmo(struct
ib_qp_attr *qp_attr, int attr_mask) { uint64_t T_tr_ns, max_compl_time_ms; uint32_t rq_tmo_jiffies; /* * According to section 11.2.4.2 in the IBTA spec (Modify Queue Pair, * table 91), both the QP timeout and the retry count have to be set * for RC QP's during the RTR to RTS transition. */ WARN_ON_ONCE((attr_mask & (IB_QP_TIMEOUT | IB_QP_RETRY_CNT)) != (IB_QP_TIMEOUT | IB_QP_RETRY_CNT)); /* * Set target->rq_tmo_jiffies to one second more than the largest time * it can take before an error completion is generated. See also * C9-140..142 in the IBTA spec for more information about how to * convert the QP Local ACK Timeout value to nanoseconds. */ T_tr_ns = 4096 * (1ULL << qp_attr->timeout); max_compl_time_ms = qp_attr->retry_cnt * 4 * T_tr_ns; do_div(max_compl_time_ms, NSEC_PER_MSEC); rq_tmo_jiffies = msecs_to_jiffies(max_compl_time_ms + 1000); return rq_tmo_jiffies; } static void srp_cm_rep_handler(struct ib_cm_id *cm_id, const struct srp_login_rsp *lrsp, struct srp_rdma_ch *ch) { struct srp_target_port *target = ch->target; struct ib_qp_attr *qp_attr = NULL; int attr_mask = 0; int ret = 0; int i; if (lrsp->opcode == SRP_LOGIN_RSP) { ch->max_ti_iu_len = be32_to_cpu(lrsp->max_ti_iu_len); ch->req_lim = be32_to_cpu(lrsp->req_lim_delta); ch->use_imm_data = srp_use_imm_data && (lrsp->rsp_flags & SRP_LOGIN_RSP_IMMED_SUPP); ch->max_it_iu_len = srp_max_it_iu_len(target->cmd_sg_cnt, ch->use_imm_data, target->max_it_iu_size); WARN_ON_ONCE(ch->max_it_iu_len > be32_to_cpu(lrsp->max_it_iu_len)); if (ch->use_imm_data) shost_printk(KERN_DEBUG, target->scsi_host, PFX "using immediate data\n"); /* * Reserve credits for task management so we don't * bounce requests back to the SCSI mid-layer. 
*/ target->scsi_host->can_queue = min(ch->req_lim - SRP_TSK_MGMT_SQ_SIZE, target->scsi_host->can_queue); target->scsi_host->cmd_per_lun = min_t(int, target->scsi_host->can_queue, target->scsi_host->cmd_per_lun); } else { shost_printk(KERN_WARNING, target->scsi_host, PFX "Unhandled RSP opcode %#x\n", lrsp->opcode); ret = -ECONNRESET; goto error; } if (!ch->rx_ring) { ret = srp_alloc_iu_bufs(ch); if (ret) goto error; } for (i = 0; i < target->queue_size; i++) { struct srp_iu *iu = ch->rx_ring[i]; ret = srp_post_recv(ch, iu); if (ret) goto error; } if (!target->using_rdma_cm) { ret = -ENOMEM; qp_attr = kmalloc_obj(*qp_attr); if (!qp_attr) goto error; qp_attr->qp_state = IB_QPS_RTR; ret = ib_cm_init_qp_attr(cm_id, qp_attr, &attr_mask); if (ret) goto error_free; ret = ib_modify_qp(ch->qp, qp_attr, attr_mask); if (ret) goto error_free; qp_attr->qp_state = IB_QPS_RTS; ret = ib_cm_init_qp_attr(cm_id, qp_attr, &attr_mask); if (ret) goto error_free; target->rq_tmo_jiffies = srp_compute_rq_tmo(qp_attr, attr_mask); ret = ib_modify_qp(ch->qp, qp_attr, attr_mask); if (ret) goto error_free; ret = ib_send_cm_rtu(cm_id, NULL, 0); } error_free: kfree(qp_attr); error: ch->status = ret; } static void srp_ib_cm_rej_handler(struct ib_cm_id *cm_id, const struct ib_cm_event *event, struct srp_rdma_ch *ch) { struct srp_target_port *target = ch->target; struct Scsi_Host *shost = target->scsi_host; struct ib_class_port_info *cpi; int opcode; u16 dlid; switch (event->param.rej_rcvd.reason) { case IB_CM_REJ_PORT_CM_REDIRECT: cpi = event->param.rej_rcvd.ari; dlid = be16_to_cpu(cpi->redirect_lid); sa_path_set_dlid(&ch->ib_cm.path, dlid); ch->ib_cm.path.pkey = cpi->redirect_pkey; cm_id->remote_cm_qpn = be32_to_cpu(cpi->redirect_qp) & 0x00ffffff; memcpy(ch->ib_cm.path.dgid.raw, cpi->redirect_gid, 16); ch->status = dlid ? 
SRP_DLID_REDIRECT : SRP_PORT_REDIRECT; break; case IB_CM_REJ_PORT_REDIRECT: if (srp_target_is_topspin(target)) { union ib_gid *dgid = &ch->ib_cm.path.dgid; /* * Topspin/Cisco SRP gateways incorrectly send * reject reason code 25 when they mean 24 * (port redirect). */ memcpy(dgid->raw, event->param.rej_rcvd.ari, 16); shost_printk(KERN_DEBUG, shost, PFX "Topspin/Cisco redirect to target port GID %016llx%016llx\n", be64_to_cpu(dgid->global.subnet_prefix), be64_to_cpu(dgid->global.interface_id)); ch->status = SRP_PORT_REDIRECT; } else { shost_printk(KERN_WARNING, shost, " REJ reason: IB_CM_REJ_PORT_REDIRECT\n"); ch->status = -ECONNRESET; } break; case IB_CM_REJ_DUPLICATE_LOCAL_COMM_ID: shost_printk(KERN_WARNING, shost, " REJ reason: IB_CM_REJ_DUPLICATE_LOCAL_COMM_ID\n"); ch->status = -ECONNRESET; break; case IB_CM_REJ_CONSUMER_DEFINED: opcode = *(u8 *) event->private_data; if (opcode == SRP_LOGIN_REJ) { struct srp_login_rej *rej = event->private_data; u32 reason = be32_to_cpu(rej->reason); if (reason == SRP_LOGIN_REJ_REQ_IT_IU_LENGTH_TOO_LARGE) shost_printk(KERN_WARNING, shost, PFX "SRP_LOGIN_REJ: requested max_it_iu_len too large\n"); else shost_printk(KERN_WARNING, shost, PFX "SRP LOGIN from %pI6 to %pI6 REJECTED, reason 0x%08x\n", target->sgid.raw, target->ib_cm.orig_dgid.raw, reason); } else shost_printk(KERN_WARNING, shost, " REJ reason: IB_CM_REJ_CONSUMER_DEFINED," " opcode 0x%02x\n", opcode); ch->status = -ECONNRESET; break; case IB_CM_REJ_STALE_CONN: shost_printk(KERN_WARNING, shost, " REJ reason: stale connection\n"); ch->status = SRP_STALE_CONN; break; default: shost_printk(KERN_WARNING, shost, " REJ reason 0x%x\n", event->param.rej_rcvd.reason); ch->status = -ECONNRESET; } } static int srp_ib_cm_handler(struct ib_cm_id *cm_id, const struct ib_cm_event *event) { struct srp_rdma_ch *ch = cm_id->context; struct srp_target_port *target = ch->target; int comp = 0; switch (event->event) { case IB_CM_REQ_ERROR: shost_printk(KERN_DEBUG, target->scsi_host, PFX 
"Sending CM REQ failed\n"); comp = 1; ch->status = -ECONNRESET; break; case IB_CM_REP_RECEIVED: comp = 1; srp_cm_rep_handler(cm_id, event->private_data, ch); break; case IB_CM_REJ_RECEIVED: shost_printk(KERN_DEBUG, target->scsi_host, PFX "REJ received\n"); comp = 1; srp_ib_cm_rej_handler(cm_id, event, ch); break; case IB_CM_DREQ_RECEIVED: shost_printk(KERN_WARNING, target->scsi_host, PFX "DREQ received - connection closed\n"); ch->connected = false; if (ib_send_cm_drep(cm_id, NULL, 0)) shost_printk(KERN_ERR, target->scsi_host, PFX "Sending CM DREP failed\n"); queue_work(system_long_wq, &target->tl_err_work); break; case IB_CM_TIMEWAIT_EXIT: shost_printk(KERN_ERR, target->scsi_host, PFX "connection closed\n"); comp = 1; ch->status = 0; break; case IB_CM_MRA_RECEIVED: case IB_CM_DREQ_ERROR: case IB_CM_DREP_RECEIVED: break; default: shost_printk(KERN_WARNING, target->scsi_host, PFX "Unhandled CM event %d\n", event->event); break; } if (comp) complete(&ch->done); return 0; } static void srp_rdma_cm_rej_handler(struct srp_rdma_ch *ch, struct rdma_cm_event *event) { struct srp_target_port *target = ch->target; struct Scsi_Host *shost = target->scsi_host; int opcode; switch (event->status) { case IB_CM_REJ_DUPLICATE_LOCAL_COMM_ID: shost_printk(KERN_WARNING, shost, " REJ reason: IB_CM_REJ_DUPLICATE_LOCAL_COMM_ID\n"); ch->status = -ECONNRESET; break; case IB_CM_REJ_CONSUMER_DEFINED: opcode = *(u8 *) event->param.conn.private_data; if (opcode == SRP_LOGIN_REJ) { struct srp_login_rej *rej = (struct srp_login_rej *) event->param.conn.private_data; u32 reason = be32_to_cpu(rej->reason); if (reason == SRP_LOGIN_REJ_REQ_IT_IU_LENGTH_TOO_LARGE) shost_printk(KERN_WARNING, shost, PFX "SRP_LOGIN_REJ: requested max_it_iu_len too large\n"); else shost_printk(KERN_WARNING, shost, PFX "SRP LOGIN REJECTED, reason 0x%08x\n", reason); } else { shost_printk(KERN_WARNING, shost, " REJ reason: IB_CM_REJ_CONSUMER_DEFINED, opcode 0x%02x\n", opcode); } ch->status = -ECONNRESET; break; case 
IB_CM_REJ_STALE_CONN: shost_printk(KERN_WARNING, shost, " REJ reason: stale connection\n"); ch->status = SRP_STALE_CONN; break; default: shost_printk(KERN_WARNING, shost, " REJ reason 0x%x\n", event->status); ch->status = -ECONNRESET; break; } } static int srp_rdma_cm_handler(struct rdma_cm_id *cm_id, struct rdma_cm_event *event) { struct srp_rdma_ch *ch = cm_id->context; struct srp_target_port *target = ch->target; int comp = 0; switch (event->event) { case RDMA_CM_EVENT_ADDR_RESOLVED: ch->status = 0; comp = 1; break; case RDMA_CM_EVENT_ADDR_ERROR: ch->status = -ENXIO; comp = 1; break; case RDMA_CM_EVENT_ROUTE_RESOLVED: ch->status = 0; comp = 1; break; case RDMA_CM_EVENT_ROUTE_ERROR: case RDMA_CM_EVENT_UNREACHABLE: ch->status = -EHOSTUNREACH; comp = 1; break; case RDMA_CM_EVENT_CONNECT_ERROR: shost_printk(KERN_DEBUG, target->scsi_host, PFX "Sending CM REQ failed\n"); comp = 1; ch->status = -ECONNRESET; break; case RDMA_CM_EVENT_ESTABLISHED: comp = 1; srp_cm_rep_handler(NULL, event->param.conn.private_data, ch); break; case RDMA_CM_EVENT_REJECTED: shost_printk(KERN_DEBUG, target->scsi_host, PFX "REJ received\n"); comp = 1; srp_rdma_cm_rej_handler(ch, event); break; case RDMA_CM_EVENT_DISCONNECTED: if (ch->connected) { shost_printk(KERN_WARNING, target->scsi_host, PFX "received DREQ\n"); rdma_disconnect(ch->rdma_cm.cm_id); comp = 1; ch->status = 0; queue_work(system_long_wq, &target->tl_err_work); } break; case RDMA_CM_EVENT_TIMEWAIT_EXIT: shost_printk(KERN_ERR, target->scsi_host, PFX "connection closed\n"); comp = 1; ch->status = 0; break; default: shost_printk(KERN_WARNING, target->scsi_host, PFX "Unhandled CM event %d\n", event->event); break; } if (comp) complete(&ch->done); return 0; } /** * srp_change_queue_depth - setting device queue depth * @sdev: scsi device struct * @qdepth: requested queue depth * * Returns queue depth. 
*/ static int srp_change_queue_depth(struct scsi_device *sdev, int qdepth) { if (!sdev->tagged_supported) qdepth = 1; return scsi_change_queue_depth(sdev, qdepth); } static int srp_send_tsk_mgmt(struct srp_rdma_ch *ch, u64 req_tag, u64 lun, u8 func, u8 *status) { struct srp_target_port *target = ch->target; struct srp_rport *rport = target->rport; struct ib_device *dev = target->srp_host->srp_dev->dev; struct srp_iu *iu; struct srp_tsk_mgmt *tsk_mgmt; int res; if (!ch->connected || target->qp_in_error) return -1; /* * Lock the rport mutex to avoid that srp_create_ch_ib() is * invoked while a task management function is being sent. */ mutex_lock(&rport->mutex); spin_lock_irq(&ch->lock); iu = __srp_get_tx_iu(ch, SRP_IU_TSK_MGMT); spin_unlock_irq(&ch->lock); if (!iu) { mutex_unlock(&rport->mutex); return -1; } iu->num_sge = 1; ib_dma_sync_single_for_cpu(dev, iu->dma, sizeof *tsk_mgmt, DMA_TO_DEVICE); tsk_mgmt = iu->buf; memset(tsk_mgmt, 0, sizeof *tsk_mgmt); tsk_mgmt->opcode = SRP_TSK_MGMT; int_to_scsilun(lun, &tsk_mgmt->lun); tsk_mgmt->tsk_mgmt_func = func; tsk_mgmt->task_tag = req_tag; spin_lock_irq(&ch->lock); ch->tsk_mgmt_tag = (ch->tsk_mgmt_tag + 1) | SRP_TAG_TSK_MGMT; tsk_mgmt->tag = ch->tsk_mgmt_tag; spin_unlock_irq(&ch->lock); init_completion(&ch->tsk_mgmt_done); ib_dma_sync_single_for_device(dev, iu->dma, sizeof *tsk_mgmt, DMA_TO_DEVICE); if (srp_post_send(ch, iu, sizeof(*tsk_mgmt))) { srp_put_tx_iu(ch, iu, SRP_IU_TSK_MGMT); mutex_unlock(&rport->mutex); return -1; } res = wait_for_completion_timeout(&ch->tsk_mgmt_done, msecs_to_jiffies(SRP_ABORT_TIMEOUT_MS)); if (res > 0 && status) *status = ch->tsk_mgmt_status; mutex_unlock(&rport->mutex); WARN_ON_ONCE(res < 0); return res > 0 ? 
0 : -1; } static int srp_abort(struct scsi_cmnd *scmnd) { struct srp_target_port *target = host_to_target(scmnd->device->host); struct srp_request *req = scsi_cmd_priv(scmnd); u32 tag; u16 ch_idx; struct srp_rdma_ch *ch; shost_printk(KERN_ERR, target->scsi_host, "SRP abort called\n"); tag = blk_mq_unique_tag(scsi_cmd_to_rq(scmnd)); ch_idx = blk_mq_unique_tag_to_hwq(tag); if (WARN_ON_ONCE(ch_idx >= target->ch_count)) return SUCCESS; ch = &target->ch[ch_idx]; if (!srp_claim_req(ch, req, NULL, scmnd)) return SUCCESS; shost_printk(KERN_ERR, target->scsi_host, "Sending SRP abort for tag %#x\n", tag); if (srp_send_tsk_mgmt(ch, tag, scmnd->device->lun, SRP_TSK_ABORT_TASK, NULL) == 0) { srp_free_req(ch, req, scmnd, 0); return SUCCESS; } if (target->rport->state == SRP_RPORT_LOST) return FAST_IO_FAIL; return FAILED; } static int srp_reset_device(struct scsi_cmnd *scmnd) { struct srp_target_port *target = host_to_target(scmnd->device->host); struct srp_rdma_ch *ch; u8 status; shost_printk(KERN_ERR, target->scsi_host, "SRP reset_device called\n"); ch = &target->ch[0]; if (srp_send_tsk_mgmt(ch, SRP_TAG_NO_REQ, scmnd->device->lun, SRP_TSK_LUN_RESET, &status)) return FAILED; if (status) return FAILED; return SUCCESS; } static int srp_reset_host(struct scsi_cmnd *scmnd) { struct srp_target_port *target = host_to_target(scmnd->device->host); shost_printk(KERN_ERR, target->scsi_host, PFX "SRP reset_host called\n"); return srp_reconnect_rport(target->rport) == 0 ? 
SUCCESS : FAILED; } static int srp_target_alloc(struct scsi_target *starget) { struct Scsi_Host *shost = dev_to_shost(starget->dev.parent); struct srp_target_port *target = host_to_target(shost); if (target->target_can_queue) starget->can_queue = target->target_can_queue; return 0; } static int srp_sdev_configure(struct scsi_device *sdev, struct queue_limits *lim) { struct Scsi_Host *shost = sdev->host; struct srp_target_port *target = host_to_target(shost); struct request_queue *q = sdev->request_queue; unsigned long timeout; if (sdev->type == TYPE_DISK) { timeout = max_t(unsigned, 30 * HZ, target->rq_tmo_jiffies); blk_queue_rq_timeout(q, timeout); } return 0; } static ssize_t id_ext_show(struct device *dev, struct device_attribute *attr, char *buf) { struct srp_target_port *target = host_to_target(class_to_shost(dev)); return sysfs_emit(buf, "0x%016llx\n", be64_to_cpu(target->id_ext)); } static DEVICE_ATTR_RO(id_ext); static ssize_t ioc_guid_show(struct device *dev, struct device_attribute *attr, char *buf) { struct srp_target_port *target = host_to_target(class_to_shost(dev)); return sysfs_emit(buf, "0x%016llx\n", be64_to_cpu(target->ioc_guid)); } static DEVICE_ATTR_RO(ioc_guid); static ssize_t service_id_show(struct device *dev, struct device_attribute *attr, char *buf) { struct srp_target_port *target = host_to_target(class_to_shost(dev)); if (target->using_rdma_cm) return -ENOENT; return sysfs_emit(buf, "0x%016llx\n", be64_to_cpu(target->ib_cm.service_id)); } static DEVICE_ATTR_RO(service_id); static ssize_t pkey_show(struct device *dev, struct device_attribute *attr, char *buf) { struct srp_target_port *target = host_to_target(class_to_shost(dev)); if (target->using_rdma_cm) return -ENOENT; return sysfs_emit(buf, "0x%04x\n", be16_to_cpu(target->ib_cm.pkey)); } static DEVICE_ATTR_RO(pkey); static ssize_t sgid_show(struct device *dev, struct device_attribute *attr, char *buf) { struct srp_target_port *target = host_to_target(class_to_shost(dev)); return 
sysfs_emit(buf, "%pI6\n", target->sgid.raw); } static DEVICE_ATTR_RO(sgid); static ssize_t dgid_show(struct device *dev, struct device_attribute *attr, char *buf) { struct srp_target_port *target = host_to_target(class_to_shost(dev)); struct srp_rdma_ch *ch = &target->ch[0]; if (target->using_rdma_cm) return -ENOENT; return sysfs_emit(buf, "%pI6\n", ch->ib_cm.path.dgid.raw); } static DEVICE_ATTR_RO(dgid); static ssize_t orig_dgid_show(struct device *dev, struct device_attribute *attr, char *buf) { struct srp_target_port *target = host_to_target(class_to_shost(dev)); if (target->using_rdma_cm) return -ENOENT; return sysfs_emit(buf, "%pI6\n", target->ib_cm.orig_dgid.raw); } static DEVICE_ATTR_RO(orig_dgid); static ssize_t req_lim_show(struct device *dev, struct device_attribute *attr, char *buf) { struct srp_target_port *target = host_to_target(class_to_shost(dev)); struct srp_rdma_ch *ch; int i, req_lim = INT_MAX; for (i = 0; i < target->ch_count; i++) { ch = &target->ch[i]; req_lim = min(req_lim, ch->req_lim); } return sysfs_emit(buf, "%d\n", req_lim); } static DEVICE_ATTR_RO(req_lim); static ssize_t zero_req_lim_show(struct device *dev, struct device_attribute *attr, char *buf) { struct srp_target_port *target = host_to_target(class_to_shost(dev)); return sysfs_emit(buf, "%d\n", target->zero_req_lim); } static DEVICE_ATTR_RO(zero_req_lim); static ssize_t local_ib_port_show(struct device *dev, struct device_attribute *attr, char *buf) { struct srp_target_port *target = host_to_target(class_to_shost(dev)); return sysfs_emit(buf, "%u\n", target->srp_host->port); } static DEVICE_ATTR_RO(local_ib_port); static ssize_t local_ib_device_show(struct device *dev, struct device_attribute *attr, char *buf) { struct srp_target_port *target = host_to_target(class_to_shost(dev)); return sysfs_emit(buf, "%s\n", dev_name(&target->srp_host->srp_dev->dev->dev)); } static DEVICE_ATTR_RO(local_ib_device); static ssize_t ch_count_show(struct device *dev, struct device_attribute *attr, 
char *buf) { struct srp_target_port *target = host_to_target(class_to_shost(dev)); return sysfs_emit(buf, "%d\n", target->ch_count); } static DEVICE_ATTR_RO(ch_count); static ssize_t comp_vector_show(struct device *dev, struct device_attribute *attr, char *buf) { struct srp_target_port *target = host_to_target(class_to_shost(dev)); return sysfs_emit(buf, "%d\n", target->comp_vector); } static DEVICE_ATTR_RO(comp_vector); static ssize_t tl_retry_count_show(struct device *dev, struct device_attribute *attr, char *buf) { struct srp_target_port *target = host_to_target(class_to_shost(dev)); return sysfs_emit(buf, "%d\n", target->tl_retry_count); } static DEVICE_ATTR_RO(tl_retry_count); static ssize_t cmd_sg_entries_show(struct device *dev, struct device_attribute *attr, char *buf) { struct srp_target_port *target = host_to_target(class_to_shost(dev)); return sysfs_emit(buf, "%u\n", target->cmd_sg_cnt); } static DEVICE_ATTR_RO(cmd_sg_entries); static ssize_t allow_ext_sg_show(struct device *dev, struct device_attribute *attr, char *buf) { struct srp_target_port *target = host_to_target(class_to_shost(dev)); return sysfs_emit(buf, "%s\n", target->allow_ext_sg ? 
"true" : "false"); } static DEVICE_ATTR_RO(allow_ext_sg); static struct attribute *srp_host_attrs[] = { &dev_attr_id_ext.attr, &dev_attr_ioc_guid.attr, &dev_attr_service_id.attr, &dev_attr_pkey.attr, &dev_attr_sgid.attr, &dev_attr_dgid.attr, &dev_attr_orig_dgid.attr, &dev_attr_req_lim.attr, &dev_attr_zero_req_lim.attr, &dev_attr_local_ib_port.attr, &dev_attr_local_ib_device.attr, &dev_attr_ch_count.attr, &dev_attr_comp_vector.attr, &dev_attr_tl_retry_count.attr, &dev_attr_cmd_sg_entries.attr, &dev_attr_allow_ext_sg.attr, NULL }; ATTRIBUTE_GROUPS(srp_host); static const struct scsi_host_template srp_template = { .module = THIS_MODULE, .name = "InfiniBand SRP initiator", .proc_name = DRV_NAME, .target_alloc = srp_target_alloc, .sdev_configure = srp_sdev_configure, .info = srp_target_info, .init_cmd_priv = srp_init_cmd_priv, .exit_cmd_priv = srp_exit_cmd_priv, .queuecommand = srp_queuecommand, .change_queue_depth = srp_change_queue_depth, .eh_timed_out = srp_timed_out, .eh_abort_handler = srp_abort, .eh_device_reset_handler = srp_reset_device, .eh_host_reset_handler = srp_reset_host, .skip_settle_delay = true, .sg_tablesize = SRP_DEF_SG_TABLESIZE, .can_queue = SRP_DEFAULT_CMD_SQ_SIZE, .this_id = -1, .cmd_per_lun = SRP_DEFAULT_CMD_SQ_SIZE, .shost_groups = srp_host_groups, .track_queue_depth = 1, .cmd_size = sizeof(struct srp_request), }; static int srp_sdev_count(struct Scsi_Host *host) { struct scsi_device *sdev; int c = 0; shost_for_each_device(sdev, host) c++; return c; } /* * Return values: * < 0 upon failure. Caller is responsible for SRP target port cleanup. * 0 and target->state == SRP_TARGET_REMOVED if asynchronous target port * removal has been scheduled. * 0 and target->state != SRP_TARGET_REMOVED upon success. 
*/ static int srp_add_target(struct srp_host *host, struct srp_target_port *target) { struct srp_rport_identifiers ids; struct srp_rport *rport; target->state = SRP_TARGET_SCANNING; sprintf(target->target_name, "SRP.T10:%016llX", be64_to_cpu(target->id_ext)); if (scsi_add_host(target->scsi_host, host->srp_dev->dev->dev.parent)) return -ENODEV; memcpy(ids.port_id, &target->id_ext, 8); memcpy(ids.port_id + 8, &target->ioc_guid, 8); ids.roles = SRP_RPORT_ROLE_TARGET; rport = srp_rport_add(target->scsi_host, &ids); if (IS_ERR(rport)) { scsi_remove_host(target->scsi_host); return PTR_ERR(rport); } rport->lld_data = target; target->rport = rport; spin_lock(&host->target_lock); list_add_tail(&target->list, &host->target_list); spin_unlock(&host->target_lock); scsi_scan_target(&target->scsi_host->shost_gendev, 0, target->scsi_id, SCAN_WILD_CARD, SCSI_SCAN_INITIAL); if (srp_connected_ch(target) < target->ch_count || target->qp_in_error) { shost_printk(KERN_INFO, target->scsi_host, PFX "SCSI scan failed - removing SCSI host\n"); srp_queue_remove_work(target); goto out; } pr_debug("%s: SCSI scan succeeded - detected %d LUNs\n", dev_name(&target->scsi_host->shost_gendev), srp_sdev_count(target->scsi_host)); spin_lock_irq(&target->lock); if (target->state == SRP_TARGET_SCANNING) target->state = SRP_TARGET_LIVE; spin_unlock_irq(&target->lock); out: return 0; } static void srp_release_dev(struct device *dev) { struct srp_host *host = container_of(dev, struct srp_host, dev); kfree(host); } static struct attribute *srp_class_attrs[]; ATTRIBUTE_GROUPS(srp_class); static struct class srp_class = { .name = "infiniband_srp", .dev_groups = srp_class_groups, .dev_release = srp_release_dev }; /** * srp_conn_unique() - check whether the connection to a target is unique * @host: SRP host. * @target: SRP target port. 
*/
static bool srp_conn_unique(struct srp_host *host,
			    struct srp_target_port *target)
{
	struct srp_target_port *t;
	bool ret = false;

	/* A target port that has been removed is never reported as unique. */
	if (target->state == SRP_TARGET_REMOVED)
		goto out;

	ret = true;

	spin_lock(&host->target_lock);
	list_for_each_entry(t, &host->target_list, list) {
		if (t != target &&
		    target->id_ext == t->id_ext &&
		    target->ioc_guid == t->ioc_guid &&
		    target->initiator_ext == t->initiator_ext) {
			ret = false;
			break;
		}
	}
	spin_unlock(&host->target_lock);

out:
	return ret;
}

/*
 * Target ports are added by writing
 *
 *     id_ext=<SRP ID ext>,ioc_guid=<SRP IOC GUID>,dgid=<dest GID>,
 *     pkey=<P_Key>,service_id=<service ID>
 * or
 *     id_ext=<SRP ID ext>,ioc_guid=<SRP IOC GUID>,
 *     [src=<IPv4 address>,]dest=<IPv4 address>:<port number>
 *
 * to the add_target sysfs attribute.
 */
enum {
	SRP_OPT_ERR		= 0,
	SRP_OPT_ID_EXT		= 1 << 0,
	SRP_OPT_IOC_GUID	= 1 << 1,
	SRP_OPT_DGID		= 1 << 2,
	SRP_OPT_PKEY		= 1 << 3,
	SRP_OPT_SERVICE_ID	= 1 << 4,
	SRP_OPT_MAX_SECT	= 1 << 5,
	SRP_OPT_MAX_CMD_PER_LUN	= 1 << 6,
	SRP_OPT_IO_CLASS	= 1 << 7,
	SRP_OPT_INITIATOR_EXT	= 1 << 8,
	SRP_OPT_CMD_SG_ENTRIES	= 1 << 9,
	SRP_OPT_ALLOW_EXT_SG	= 1 << 10,
	SRP_OPT_SG_TABLESIZE	= 1 << 11,
	SRP_OPT_COMP_VECTOR	= 1 << 12,
	SRP_OPT_TL_RETRY_COUNT	= 1 << 13,
	SRP_OPT_QUEUE_SIZE	= 1 << 14,
	SRP_OPT_IP_SRC		= 1 << 15,
	SRP_OPT_IP_DEST		= 1 << 16,
	SRP_OPT_TARGET_CAN_QUEUE= 1 << 17,
	SRP_OPT_MAX_IT_IU_SIZE  = 1 << 18,
	SRP_OPT_CH_COUNT	= 1 << 19,
};

/*
 * Each entry is one complete set of mandatory options; a request is accepted
 * if it contains all options of at least one entry.
 */
static unsigned int srp_opt_mandatory[] = {
	SRP_OPT_ID_EXT		|
	SRP_OPT_IOC_GUID	|
	SRP_OPT_DGID		|
	SRP_OPT_PKEY		|
	SRP_OPT_SERVICE_ID,
	SRP_OPT_ID_EXT		|
	SRP_OPT_IOC_GUID	|
	SRP_OPT_IP_DEST,
};

static const match_table_t srp_opt_tokens = {
	{ SRP_OPT_ID_EXT,		"id_ext=%s" 		},
	{ SRP_OPT_IOC_GUID,		"ioc_guid=%s" 		},
	{ SRP_OPT_DGID,			"dgid=%s" 		},
	{ SRP_OPT_PKEY,			"pkey=%x" 		},
	{ SRP_OPT_SERVICE_ID,		"service_id=%s"		},
	{ SRP_OPT_MAX_SECT,		"max_sect=%d" 		},
	{ SRP_OPT_MAX_CMD_PER_LUN,	"max_cmd_per_lun=%d" 	},
	{ SRP_OPT_TARGET_CAN_QUEUE,	"target_can_queue=%d"	},
	{ SRP_OPT_IO_CLASS,		"io_class=%x"		},
	{ SRP_OPT_INITIATOR_EXT,	"initiator_ext=%s"	},
	{ SRP_OPT_CMD_SG_ENTRIES,	"cmd_sg_entries=%u"	},
	{ SRP_OPT_ALLOW_EXT_SG,		"allow_ext_sg=%u"	},
	{ SRP_OPT_SG_TABLESIZE,		"sg_tablesize=%u"	},
	{ SRP_OPT_COMP_VECTOR,		"comp_vector=%u"	},
	{ SRP_OPT_TL_RETRY_COUNT,	"tl_retry_count=%u"	},
	{ SRP_OPT_QUEUE_SIZE,		"queue_size=%d"		},
	{ SRP_OPT_IP_SRC,		"src=%s"		},
	{ SRP_OPT_IP_DEST,		"dest=%s"		},
	{ SRP_OPT_MAX_IT_IU_SIZE,	"max_it_iu_size=%d"	},
	{ SRP_OPT_CH_COUNT,		"ch_count=%u",		},
	{ SRP_OPT_ERR,			NULL 			}
};

/**
 * srp_parse_in - parse an IP address and port number combination
 * @net:	   [in]  Network namespace.
 * @sa:		   [out] Address family, IP address and port number.
 * @addr_port_str: [in]  IP address and port number.
 * @has_port:	   [out] Whether or not @addr_port_str includes a port number.
 *			 Not written if the function fails with -ENOMEM.
 *
 * Parse the following address formats:
 * - IPv4: <ip_address>:<port>, e.g. 1.2.3.4:5.
 * - IPv6: \[<ipv6_address>\]:<port>, e.g. [1::2:3%4]:5.
 *
 * Return: 0 upon success, -ENOMEM if the working copy of @addr_port_str could
 * not be allocated, otherwise the value returned by inet_pton_with_scope().
 */
static int srp_parse_in(struct net *net, struct sockaddr_storage *sa,
			const char *addr_port_str, bool *has_port)
{
	char *addr_end, *addr = kstrdup(addr_port_str, GFP_KERNEL);
	char *port_str;
	int ret;

	if (!addr)
		return -ENOMEM;

	/* A ':' followed by ']' belongs to a bracketed IPv6 address. */
	port_str = strrchr(addr, ':');
	if (port_str && strchr(port_str, ']'))
		port_str = NULL;
	if (port_str)
		*port_str++ = '\0';
	if (has_port)
		*has_port = port_str != NULL;

	/* Try IPv4 first and fall back to a bracketed IPv6 address. */
	ret = inet_pton_with_scope(net, AF_INET, addr, port_str, sa);
	if (ret && addr[0]) {
		addr_end = addr + strlen(addr) - 1;
		if (addr[0] == '[' && *addr_end == ']') {
			*addr_end = '\0';
			ret = inet_pton_with_scope(net, AF_INET6, addr + 1,
						   port_str, sa);
		}
	}
	kfree(addr);
	pr_debug("%s -> %pISpfsc\n", addr_port_str, sa);
	return ret;
}

/*
 * srp_parse_options - parse a target creation request
 * @net:    Network namespace used for resolving src= and dest= addresses.
 * @buf:    Comma or newline separated list of <name>=<value> options.
 * @target: Target port whose fields are filled in from the options.
 *
 * Return: 0 if all options were parsed and at least one complete set of
 * mandatory options (see srp_opt_mandatory) is present; -ENOMEM if a
 * temporary copy could not be allocated; another negative error code,
 * -EINVAL in most cases, for unknown, malformed or out-of-range options or
 * for a missing mandatory option.
 */
static int srp_parse_options(struct net *net, const char *buf,
			     struct srp_target_port *target)
{
	char *options, *sep_opt;
	char *p;
	substring_t args[MAX_OPT_ARGS];
	unsigned long long ull;
	/* Initialized because srp_parse_in() may fail before writing it. */
	bool has_port = false;
	int opt_mask = 0;
	int token;
	int ret = -EINVAL;
	int i;

	options = kstrdup(buf, GFP_KERNEL);
	if (!options)
		return -ENOMEM;

	sep_opt = options;
	while ((p = strsep(&sep_opt, ",\n")) != NULL) {
		if (!*p)
			continue;

		token = match_token(p, srp_opt_tokens, args);
		opt_mask |= token;

		switch (token) {
		case SRP_OPT_ID_EXT:
			p = match_strdup(args);
			if (!p) {
				ret = -ENOMEM;
				goto out;
			}
			ret = kstrtoull(p, 16, &ull);
			if (ret) {
				pr_warn("invalid id_ext parameter '%s'\n", p);
				kfree(p);
				goto out;
			}
			target->id_ext = cpu_to_be64(ull);
			kfree(p);
			break;

		case SRP_OPT_IOC_GUID:
			p = match_strdup(args);
			if (!p) {
				ret = -ENOMEM;
				goto out;
			}
			ret = kstrtoull(p, 16, &ull);
			if (ret) {
				pr_warn("invalid ioc_guid parameter '%s'\n", p);
				kfree(p);
				goto out;
			}
			target->ioc_guid = cpu_to_be64(ull);
			kfree(p);
			break;

		case SRP_OPT_DGID:
			p = match_strdup(args);
			if (!p) {
				ret = -ENOMEM;
				goto out;
			}
			if (strlen(p) != 32) {
				pr_warn("bad dest GID parameter '%s'\n", p);
				kfree(p);
				/*
				 * ret may still hold 0 from an earlier
				 * option; report the failure explicitly.
				 */
				ret = -EINVAL;
				goto out;
			}

			ret = hex2bin(target->ib_cm.orig_dgid.raw, p, 16);
			kfree(p);
			if (ret < 0)
				goto out;
			break;

		case SRP_OPT_PKEY:
			ret = match_hex(args, &token);
			if (ret) {
				pr_warn("bad P_Key parameter '%s'\n", p);
				goto out;
			}
			target->ib_cm.pkey = cpu_to_be16(token);
			break;

		case SRP_OPT_SERVICE_ID:
			p = match_strdup(args);
			if (!p) {
				ret = -ENOMEM;
				goto out;
			}
			ret = kstrtoull(p, 16, &ull);
			if (ret) {
				pr_warn("bad service_id parameter '%s'\n", p);
				kfree(p);
				goto out;
			}
			target->ib_cm.service_id = cpu_to_be64(ull);
			kfree(p);
			break;

		case SRP_OPT_IP_SRC:
			p = match_strdup(args);
			if (!p) {
				ret = -ENOMEM;
				goto out;
			}
			ret = srp_parse_in(net, &target->rdma_cm.src.ss, p,
					   NULL);
			if (ret < 0) {
				pr_warn("bad source parameter '%s'\n", p);
				kfree(p);
				goto out;
			}
			target->rdma_cm.src_specified = true;
			kfree(p);
			break;

		case SRP_OPT_IP_DEST:
			p = match_strdup(args);
			if (!p) {
				ret = -ENOMEM;
				goto out;
			}
			has_port = false;
			ret = srp_parse_in(net, &target->rdma_cm.dst.ss, p,
					   &has_port);
			/* A destination without a port number is rejected. */
			if (!ret && !has_port)
				ret = -EINVAL;
			if (ret < 0) {
				pr_warn("bad dest parameter '%s'\n", p);
				kfree(p);
				goto out;
			}
			target->using_rdma_cm = true;
			kfree(p);
			break;

		case SRP_OPT_MAX_SECT:
			ret = match_int(args, &token);
			if (ret) {
				pr_warn("bad max sect parameter '%s'\n", p);
				goto out;
			}
			target->scsi_host->max_sectors = token;
			break;

		case SRP_OPT_QUEUE_SIZE:
			ret = match_int(args, &token);
			if (ret) {
				pr_warn("match_int() failed for queue_size parameter '%s', Error %d\n",
					p, ret);
				goto out;
			}
			if (token < 1) {
				pr_warn("bad queue_size parameter '%s'\n", p);
				ret = -EINVAL;
				goto out;
			}
			target->scsi_host->can_queue = token;
			target->queue_size = token + SRP_RSP_SQ_SIZE +
					     SRP_TSK_MGMT_SQ_SIZE;
			/* An earlier max_cmd_per_lun option takes precedence. */
			if (!(opt_mask & SRP_OPT_MAX_CMD_PER_LUN))
				target->scsi_host->cmd_per_lun = token;
			break;

		case SRP_OPT_MAX_CMD_PER_LUN:
			ret = match_int(args, &token);
			if (ret) {
				pr_warn("match_int() failed for max cmd_per_lun parameter '%s', Error %d\n",
					p, ret);
				goto out;
			}
			if (token < 1) {
				pr_warn("bad max cmd_per_lun parameter '%s'\n",
					p);
				ret = -EINVAL;
				goto out;
			}
			target->scsi_host->cmd_per_lun = token;
			break;

		case SRP_OPT_TARGET_CAN_QUEUE:
			ret = match_int(args, &token);
			if (ret) {
				pr_warn("match_int() failed for max target_can_queue parameter '%s', Error %d\n",
					p, ret);
				goto out;
			}
			if (token < 1) {
				pr_warn("bad max target_can_queue parameter '%s'\n",
					p);
				ret = -EINVAL;
				goto out;
			}
			target->target_can_queue = token;
			break;

		case SRP_OPT_IO_CLASS:
			ret = match_hex(args, &token);
			if (ret) {
				pr_warn("bad IO class parameter '%s'\n", p);
				goto out;
			}
			if (token != SRP_REV10_IB_IO_CLASS &&
			    token != SRP_REV16A_IB_IO_CLASS) {
				pr_warn("unknown IO class parameter value %x specified (use %x or %x).\n",
					token, SRP_REV10_IB_IO_CLASS,
					SRP_REV16A_IB_IO_CLASS);
				ret = -EINVAL;
				goto out;
			}
			target->io_class = token;
			break;

		case SRP_OPT_INITIATOR_EXT:
			p = match_strdup(args);
			if (!p) {
				ret = -ENOMEM;
				goto out;
			}
			ret = kstrtoull(p, 16, &ull);
			if (ret) {
				pr_warn("bad initiator_ext value '%s'\n", p);
				kfree(p);
				goto out;
			}
			target->initiator_ext = cpu_to_be64(ull);
			kfree(p);
			break;

		case SRP_OPT_CMD_SG_ENTRIES:
			ret = match_int(args, &token);
			if (ret) {
				pr_warn("match_int() failed for max cmd_sg_entries parameter '%s', Error %d\n",
					p, ret);
				goto out;
			}
			if (token < 1 || token > 255) {
				pr_warn("bad max cmd_sg_entries parameter '%s'\n",
					p);
				ret = -EINVAL;
				goto out;
			}
			target->cmd_sg_cnt = token;
			break;

		case SRP_OPT_ALLOW_EXT_SG:
			ret = match_int(args, &token);
			if (ret) {
				pr_warn("bad allow_ext_sg parameter '%s'\n", p);
				goto out;
			}
			target->allow_ext_sg = !!token;
			break;

		case SRP_OPT_SG_TABLESIZE:
			ret = match_int(args, &token);
			if (ret) {
				pr_warn("match_int() failed for max sg_tablesize parameter '%s', Error %d\n",
					p, ret);
				goto out;
			}
			if (token < 1 || token > SG_MAX_SEGMENTS) {
				pr_warn("bad max sg_tablesize parameter '%s'\n",
					p);
				ret = -EINVAL;
				goto out;
			}
			target->sg_tablesize = token;
			break;

		case SRP_OPT_COMP_VECTOR:
			ret = match_int(args, &token);
			if (ret) {
				pr_warn("match_int() failed for comp_vector parameter '%s', Error %d\n",
					p, ret);
				goto out;
			}
			if (token < 0) {
				pr_warn("bad comp_vector parameter '%s'\n", p);
				ret = -EINVAL;
				goto out;
			}
			target->comp_vector = token;
			break;

		case SRP_OPT_TL_RETRY_COUNT:
			ret = match_int(args, &token);
			if (ret) {
				pr_warn("match_int() failed for tl_retry_count parameter '%s', Error %d\n",
					p, ret);
				goto out;
			}
			if (token < 2 || token > 7) {
				pr_warn("bad tl_retry_count parameter '%s' (must be a number between 2 and 7)\n",
					p);
				ret = -EINVAL;
				goto out;
			}
			target->tl_retry_count = token;
			break;

		case SRP_OPT_MAX_IT_IU_SIZE:
			ret = match_int(args, &token);
			if (ret) {
				pr_warn("match_int() failed for max it_iu_size parameter '%s', Error %d\n",
					p, ret);
				goto out;
			}
			if (token < 0) {
				pr_warn("bad maximum initiator to target IU size '%s'\n", p);
				ret = -EINVAL;
				goto out;
			}
			target->max_it_iu_size = token;
			break;

		case SRP_OPT_CH_COUNT:
			ret = match_int(args, &token);
			if (ret) {
				pr_warn("match_int() failed for channel count parameter '%s', Error %d\n",
					p, ret);
				goto out;
			}
			if (token < 1) {
				pr_warn("bad channel count %s\n", p);
				ret = -EINVAL;
				goto out;
			}
			target->ch_count = token;
			break;

		default:
			pr_warn("unknown parameter or missing value '%s' in target creation request\n",
				p);
			ret = -EINVAL;
			goto out;
		}
	}

	/*
	 * Every successfully parsed option leaves ret at 0, so reset it here.
	 * Otherwise a request lacking mandatory options would be accepted.
	 */
	ret = -EINVAL;
	for (i = 0; i < ARRAY_SIZE(srp_opt_mandatory); i++) {
		if ((opt_mask & srp_opt_mandatory[i]) == srp_opt_mandatory[i]) {
			ret = 0;
			break;
		}
	}
	if (ret)
		pr_warn("target creation request is missing one or more parameters\n");

	if (target->scsi_host->cmd_per_lun > target->scsi_host->can_queue
	    && (opt_mask & SRP_OPT_MAX_CMD_PER_LUN))
		pr_warn("cmd_per_lun = %d > queue_size = %d\n",
			target->scsi_host->cmd_per_lun,
			target->scsi_host->can_queue);

out:
	kfree(options);
	return ret;
}

static ssize_t add_target_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct srp_host *host = container_of(dev, struct srp_host, dev); struct Scsi_Host *target_host; struct srp_target_port *target; struct srp_rdma_ch *ch; struct srp_device *srp_dev = host->srp_dev; struct ib_device *ibdev = srp_dev->dev; int ret, i, ch_idx; unsigned int max_sectors_per_mr, mr_per_cmd = 0; bool multich = false; uint32_t max_iu_len; target_host = scsi_host_alloc(&srp_template, sizeof (struct srp_target_port)); if (!target_host) return -ENOMEM; target_host->transportt = ib_srp_transport_template; target_host->max_channel = 0; target_host->max_id = 1; target_host->max_lun = -1LL; target_host->max_cmd_len = sizeof ((struct srp_cmd *) (void *) 0L)->cdb; if (ibdev->attrs.kernel_cap_flags & IBK_SG_GAPS_REG) target_host->max_segment_size = ib_dma_max_seg_size(ibdev); else target_host->virt_boundary_mask = ~srp_dev->mr_page_mask; target = host_to_target(target_host); target->net = to_net_ns(kobj_ns_grab_current(KOBJ_NS_TYPE_NET)); target->io_class = SRP_REV16A_IB_IO_CLASS; target->scsi_host = target_host; target->srp_host = host; target->lkey = host->srp_dev->pd->local_dma_lkey; target->global_rkey = host->srp_dev->global_rkey; target->cmd_sg_cnt = cmd_sg_entries; target->sg_tablesize = indirect_sg_entries ?
: cmd_sg_entries; target->allow_ext_sg = allow_ext_sg; target->tl_retry_count = 7; target->queue_size = SRP_DEFAULT_QUEUE_SIZE; /* * Avoid that the SCSI host can be removed by srp_remove_target() * before this function returns. */ scsi_host_get(target->scsi_host); ret = mutex_lock_interruptible(&host->add_target_mutex); if (ret < 0) goto put; ret = srp_parse_options(target->net, buf, target); if (ret) goto out; if (!srp_conn_unique(target->srp_host, target)) { if (target->using_rdma_cm) { shost_printk(KERN_INFO, target->scsi_host, PFX "Already connected to target port with id_ext=%016llx;ioc_guid=%016llx;dest=%pIS\n", be64_to_cpu(target->id_ext), be64_to_cpu(target->ioc_guid), &target->rdma_cm.dst); } else { shost_printk(KERN_INFO, target->scsi_host, PFX "Already connected to target port with id_ext=%016llx;ioc_guid=%016llx;initiator_ext=%016llx\n", be64_to_cpu(target->id_ext), be64_to_cpu(target->ioc_guid), be64_to_cpu(target->initiator_ext)); } ret = -EEXIST; goto out; } if (!srp_dev->has_fr && !target->allow_ext_sg && target->cmd_sg_cnt < target->sg_tablesize) { pr_warn("No MR pool and no external indirect descriptors, limiting sg_tablesize to cmd_sg_cnt\n"); target->sg_tablesize = target->cmd_sg_cnt; } if (srp_dev->use_fast_reg) { bool gaps_reg = ibdev->attrs.kernel_cap_flags & IBK_SG_GAPS_REG; max_sectors_per_mr = srp_dev->max_pages_per_mr << (ilog2(srp_dev->mr_page_size) - 9); if (!gaps_reg) { /* * FR can only map one HCA page per entry. If the start * address is not aligned on a HCA page boundary two * entries will be used for the head and the tail * although these two entries combined contain at most * one HCA page of data. Hence the "+ 1" in the * calculation below. * * The indirect data buffer descriptor is contiguous * so the memory for that buffer will only be * registered if register_always is true. Hence add * one to mr_per_cmd if register_always has been set. 
*/ mr_per_cmd = register_always + (target->scsi_host->max_sectors + 1 + max_sectors_per_mr - 1) / max_sectors_per_mr; } else { mr_per_cmd = register_always + (target->sg_tablesize + srp_dev->max_pages_per_mr - 1) / srp_dev->max_pages_per_mr; } pr_debug("max_sectors = %u; max_pages_per_mr = %u; mr_page_size = %u; max_sectors_per_mr = %u; mr_per_cmd = %u\n", target->scsi_host->max_sectors, srp_dev->max_pages_per_mr, srp_dev->mr_page_size, max_sectors_per_mr, mr_per_cmd); } target_host->sg_tablesize = target->sg_tablesize; target->mr_pool_size = target->scsi_host->can_queue * mr_per_cmd; target->mr_per_cmd = mr_per_cmd; target->indirect_size = target->sg_tablesize * sizeof (struct srp_direct_buf); max_iu_len = srp_max_it_iu_len(target->cmd_sg_cnt, srp_use_imm_data, target->max_it_iu_size); INIT_WORK(&target->tl_err_work, srp_tl_err_work); INIT_WORK(&target->remove_work, srp_remove_work); spin_lock_init(&target->lock); ret = rdma_query_gid(ibdev, host->port, 0, &target->sgid); if (ret) goto out; ret = -ENOMEM; if (target->ch_count == 0) { target->ch_count = min(ch_count ?: max(4 * num_online_nodes(), ibdev->num_comp_vectors), num_online_cpus()); } target->ch = kzalloc_objs(*target->ch, target->ch_count); if (!target->ch) goto out; for (ch_idx = 0; ch_idx < target->ch_count; ++ch_idx) { ch = &target->ch[ch_idx]; ch->target = target; ch->comp_vector = ch_idx % ibdev->num_comp_vectors; spin_lock_init(&ch->lock); INIT_LIST_HEAD(&ch->free_tx); ret = srp_new_cm_id(ch); if (ret) goto err_disconnect; ret = srp_create_ch_ib(ch); if (ret) goto err_disconnect; ret = srp_connect_ch(ch, max_iu_len, multich); if (ret) { char dst[64]; if (target->using_rdma_cm) snprintf(dst, sizeof(dst), "%pIS", &target->rdma_cm.dst); else snprintf(dst, sizeof(dst), "%pI6", target->ib_cm.orig_dgid.raw); shost_printk(KERN_ERR, target->scsi_host, PFX "Connection %d/%d to %s failed\n", ch_idx, target->ch_count, dst); if (ch_idx == 0) { goto free_ch; } else { srp_free_ch_ib(target, ch); target->ch_count 
= ch - target->ch; goto connected; } } multich = true; } connected: target->scsi_host->nr_hw_queues = target->ch_count; ret = srp_add_target(host, target); if (ret) goto err_disconnect; if (target->state != SRP_TARGET_REMOVED) { if (target->using_rdma_cm) { shost_printk(KERN_DEBUG, target->scsi_host, PFX "new target: id_ext %016llx ioc_guid %016llx sgid %pI6 dest %pIS\n", be64_to_cpu(target->id_ext), be64_to_cpu(target->ioc_guid), target->sgid.raw, &target->rdma_cm.dst); } else { shost_printk(KERN_DEBUG, target->scsi_host, PFX "new target: id_ext %016llx ioc_guid %016llx pkey %04x service_id %016llx sgid %pI6 dgid %pI6\n", be64_to_cpu(target->id_ext), be64_to_cpu(target->ioc_guid), be16_to_cpu(target->ib_cm.pkey), be64_to_cpu(target->ib_cm.service_id), target->sgid.raw, target->ib_cm.orig_dgid.raw); } } ret = count; out: mutex_unlock(&host->add_target_mutex); put: scsi_host_put(target->scsi_host); if (ret < 0) { /* * If a call to srp_remove_target() has not been scheduled, * drop the network namespace reference now that was obtained * earlier in this function. 
*/ if (target->state != SRP_TARGET_REMOVED) kobj_ns_drop(KOBJ_NS_TYPE_NET, to_ns_common(target->net)); scsi_host_put(target->scsi_host); } return ret; err_disconnect: srp_disconnect_target(target); free_ch: for (i = 0; i < target->ch_count; i++) { ch = &target->ch[i]; srp_free_ch_ib(target, ch); } kfree(target->ch); goto out; } static DEVICE_ATTR_WO(add_target); static ssize_t ibdev_show(struct device *dev, struct device_attribute *attr, char *buf) { struct srp_host *host = container_of(dev, struct srp_host, dev); return sysfs_emit(buf, "%s\n", dev_name(&host->srp_dev->dev->dev)); } static DEVICE_ATTR_RO(ibdev); static ssize_t port_show(struct device *dev, struct device_attribute *attr, char *buf) { struct srp_host *host = container_of(dev, struct srp_host, dev); return sysfs_emit(buf, "%u\n", host->port); } static DEVICE_ATTR_RO(port); static struct attribute *srp_class_attrs[] = { &dev_attr_add_target.attr, &dev_attr_ibdev.attr, &dev_attr_port.attr, NULL }; static struct srp_host *srp_add_port(struct srp_device *device, u32 port) { struct srp_host *host; host = kzalloc_obj(*host); if (!host) return NULL; INIT_LIST_HEAD(&host->target_list); spin_lock_init(&host->target_lock); mutex_init(&host->add_target_mutex); host->srp_dev = device; host->port = port; device_initialize(&host->dev); host->dev.class = &srp_class; host->dev.parent = device->dev->dev.parent; if (dev_set_name(&host->dev, "srp-%s-%u", dev_name(&device->dev->dev), port)) goto put_host; if (device_add(&host->dev)) goto put_host; return host; put_host: put_device(&host->dev); return NULL; } static void srp_rename_dev(struct ib_device *device, void *client_data) { struct srp_device *srp_dev = client_data; struct srp_host *host, *tmp_host; list_for_each_entry_safe(host, tmp_host, &srp_dev->dev_list, list) { char name[IB_DEVICE_NAME_MAX + 8]; snprintf(name, sizeof(name), "srp-%s-%u", dev_name(&device->dev), host->port); device_rename(&host->dev, name); } } static int srp_add_one(struct ib_device *device) { 
struct srp_device *srp_dev; struct ib_device_attr *attr = &device->attrs; struct srp_host *host; int mr_page_shift; u32 p; u64 max_pages_per_mr; unsigned int flags = 0; srp_dev = kzalloc_obj(*srp_dev); if (!srp_dev) return -ENOMEM; /* * Use the smallest page size supported by the HCA, down to a * minimum of 4096 bytes. We're unlikely to build large sglists * out of smaller entries. */ mr_page_shift = max(12, ffs(attr->page_size_cap) - 1); srp_dev->mr_page_size = 1 << mr_page_shift; srp_dev->mr_page_mask = ~((u64) srp_dev->mr_page_size - 1); max_pages_per_mr = attr->max_mr_size; do_div(max_pages_per_mr, srp_dev->mr_page_size); pr_debug("%s: %llu / %u = %llu <> %u\n", __func__, attr->max_mr_size, srp_dev->mr_page_size, max_pages_per_mr, SRP_MAX_PAGES_PER_MR); srp_dev->max_pages_per_mr = min_t(u64, SRP_MAX_PAGES_PER_MR, max_pages_per_mr); srp_dev->has_fr = (attr->device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS); if (!never_register && !srp_dev->has_fr) dev_warn(&device->dev, "FR is not supported\n"); else if (!never_register && attr->max_mr_size >= 2 * srp_dev->mr_page_size) srp_dev->use_fast_reg = srp_dev->has_fr; if (never_register || !register_always || !srp_dev->has_fr) flags |= IB_PD_UNSAFE_GLOBAL_RKEY; if (srp_dev->use_fast_reg) { srp_dev->max_pages_per_mr = min_t(u32, srp_dev->max_pages_per_mr, attr->max_fast_reg_page_list_len); } srp_dev->mr_max_size = srp_dev->mr_page_size * srp_dev->max_pages_per_mr; pr_debug("%s: mr_page_shift = %d, device->max_mr_size = %#llx, device->max_fast_reg_page_list_len = %u, max_pages_per_mr = %d, mr_max_size = %#x\n", dev_name(&device->dev), mr_page_shift, attr->max_mr_size, attr->max_fast_reg_page_list_len, srp_dev->max_pages_per_mr, srp_dev->mr_max_size); INIT_LIST_HEAD(&srp_dev->dev_list); srp_dev->dev = device; srp_dev->pd = ib_alloc_pd(device, flags); if (IS_ERR(srp_dev->pd)) { int ret = PTR_ERR(srp_dev->pd); kfree(srp_dev); return ret; } if (flags & IB_PD_UNSAFE_GLOBAL_RKEY) { srp_dev->global_rkey = 
srp_dev->pd->unsafe_global_rkey; WARN_ON_ONCE(srp_dev->global_rkey == 0); } rdma_for_each_port (device, p) { host = srp_add_port(srp_dev, p); if (host) list_add_tail(&host->list, &srp_dev->dev_list); } ib_set_client_data(device, &srp_client, srp_dev); return 0; } static void srp_remove_one(struct ib_device *device, void *client_data) { struct srp_device *srp_dev; struct srp_host *host, *tmp_host; struct srp_target_port *target; srp_dev = client_data; list_for_each_entry_safe(host, tmp_host, &srp_dev->dev_list, list) { /* * Remove the add_target sysfs entry so that no new target ports * can be created. */ device_del(&host->dev); /* * Remove all target ports. */ spin_lock(&host->target_lock); list_for_each_entry(target, &host->target_list, list) srp_queue_remove_work(target); spin_unlock(&host->target_lock); /* * srp_queue_remove_work() queues a call to * srp_remove_target(). The latter function cancels * target->tl_err_work so waiting for the remove works to * finish is sufficient. */ flush_workqueue(srp_remove_wq); put_device(&host->dev); } ib_dealloc_pd(srp_dev->pd); kfree(srp_dev); } static struct srp_function_template ib_srp_transport_functions = { .has_rport_state = true, .reset_timer_if_blocked = true, .reconnect_delay = &srp_reconnect_delay, .fast_io_fail_tmo = &srp_fast_io_fail_tmo, .dev_loss_tmo = &srp_dev_loss_tmo, .reconnect = srp_rport_reconnect, .rport_delete = srp_rport_delete, .terminate_rport_io = srp_terminate_io, }; static int __init srp_init_module(void) { int ret; BUILD_BUG_ON(sizeof(struct srp_aer_req) != 36); BUILD_BUG_ON(sizeof(struct srp_cmd) != 48); BUILD_BUG_ON(sizeof(struct srp_imm_buf) != 4); BUILD_BUG_ON(sizeof(struct srp_indirect_buf) != 20); BUILD_BUG_ON(sizeof(struct srp_login_req) != 64); BUILD_BUG_ON(sizeof(struct srp_login_req_rdma) != 56); BUILD_BUG_ON(sizeof(struct srp_rsp) != 36); if (srp_sg_tablesize) { pr_warn("srp_sg_tablesize is deprecated, please use cmd_sg_entries\n"); if (!cmd_sg_entries) cmd_sg_entries = 
srp_sg_tablesize; } if (!cmd_sg_entries) cmd_sg_entries = SRP_DEF_SG_TABLESIZE; if (cmd_sg_entries > 255) { pr_warn("Clamping cmd_sg_entries to 255\n"); cmd_sg_entries = 255; } if (!indirect_sg_entries) indirect_sg_entries = cmd_sg_entries; else if (indirect_sg_entries < cmd_sg_entries) { pr_warn("Bumping up indirect_sg_entries to match cmd_sg_entries (%u)\n", cmd_sg_entries); indirect_sg_entries = cmd_sg_entries; } if (indirect_sg_entries > SG_MAX_SEGMENTS) { pr_warn("Clamping indirect_sg_entries to %u\n", SG_MAX_SEGMENTS); indirect_sg_entries = SG_MAX_SEGMENTS; } srp_remove_wq = create_workqueue("srp_remove"); if (!srp_remove_wq) { ret = -ENOMEM; goto out; } ret = -ENOMEM; ib_srp_transport_template = srp_attach_transport(&ib_srp_transport_functions); if (!ib_srp_transport_template) goto destroy_wq; ret = class_register(&srp_class); if (ret) { pr_err("couldn't register class infiniband_srp\n"); goto release_tr; } ib_sa_register_client(&srp_sa_client); ret = ib_register_client(&srp_client); if (ret) { pr_err("couldn't register IB client\n"); goto unreg_sa; } out: return ret; unreg_sa: ib_sa_unregister_client(&srp_sa_client); class_unregister(&srp_class); release_tr: srp_release_transport(ib_srp_transport_template); destroy_wq: destroy_workqueue(srp_remove_wq); goto out; } static void __exit srp_cleanup_module(void) { ib_unregister_client(&srp_client); ib_sa_unregister_client(&srp_sa_client); class_unregister(&srp_class); srp_release_transport(ib_srp_transport_template); destroy_workqueue(srp_remove_wq); } module_init(srp_init_module); module_exit(srp_cleanup_module);
1 1 1 2 6 6 2 2 2 1 2 1 1 1 1 1 1 1 1 1 5 1 4 5 5 5 5 1 5 5 1 4 1 1 5 5 5 4 5 4 1 1 1 1 1 1 1 1 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 
503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 // SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (c) 2016 Mellanox Technologies. All rights reserved. 
* Copyright (c) 2016 Jiri Pirko <jiri@mellanox.com> */ #include "devl_internal.h" static inline bool devlink_rate_is_leaf(struct devlink_rate *devlink_rate) { return devlink_rate->type == DEVLINK_RATE_TYPE_LEAF; } bool devlink_rate_is_node(const struct devlink_rate *devlink_rate) { return devlink_rate->type == DEVLINK_RATE_TYPE_NODE; } static struct devlink_rate * devlink_rate_leaf_get_from_info(struct devlink *devlink, struct genl_info *info) { struct devlink_rate *devlink_rate; struct devlink_port *devlink_port; devlink_port = devlink_port_get_from_attrs(devlink, info->attrs); if (IS_ERR(devlink_port)) return ERR_CAST(devlink_port); devlink_rate = devlink_port->devlink_rate; return devlink_rate ?: ERR_PTR(-ENODEV); } static struct devlink_rate * devlink_rate_node_get_by_name(struct devlink *devlink, const char *node_name) { struct devlink_rate *devlink_rate; list_for_each_entry(devlink_rate, &devlink->rate_list, list) { if (devlink_rate_is_node(devlink_rate) && !strcmp(node_name, devlink_rate->name)) return devlink_rate; } return ERR_PTR(-ENODEV); } static struct devlink_rate * devlink_rate_node_get_from_attrs(struct devlink *devlink, struct nlattr **attrs) { const char *rate_node_name; size_t len; if (!attrs[DEVLINK_ATTR_RATE_NODE_NAME]) return ERR_PTR(-EINVAL); rate_node_name = nla_data(attrs[DEVLINK_ATTR_RATE_NODE_NAME]); len = strlen(rate_node_name); /* Name cannot be empty or decimal number */ if (!len || strspn(rate_node_name, "0123456789") == len) return ERR_PTR(-EINVAL); return devlink_rate_node_get_by_name(devlink, rate_node_name); } static struct devlink_rate * devlink_rate_node_get_from_info(struct devlink *devlink, struct genl_info *info) { return devlink_rate_node_get_from_attrs(devlink, info->attrs); } static struct devlink_rate * devlink_rate_get_from_info(struct devlink *devlink, struct genl_info *info) { struct nlattr **attrs = info->attrs; if (attrs[DEVLINK_ATTR_PORT_INDEX]) return devlink_rate_leaf_get_from_info(devlink, info); else if 
(attrs[DEVLINK_ATTR_RATE_NODE_NAME]) return devlink_rate_node_get_from_info(devlink, info); else return ERR_PTR(-EINVAL); } static int devlink_rate_put_tc_bws(struct sk_buff *msg, u32 *tc_bw) { struct nlattr *nla_tc_bw; int i; for (i = 0; i < DEVLINK_RATE_TCS_MAX; i++) { nla_tc_bw = nla_nest_start(msg, DEVLINK_ATTR_RATE_TC_BWS); if (!nla_tc_bw) return -EMSGSIZE; if (nla_put_u8(msg, DEVLINK_RATE_TC_ATTR_INDEX, i) || nla_put_u32(msg, DEVLINK_RATE_TC_ATTR_BW, tc_bw[i])) goto nla_put_failure; nla_nest_end(msg, nla_tc_bw); } return 0; nla_put_failure: nla_nest_cancel(msg, nla_tc_bw); return -EMSGSIZE; } static int devlink_nl_rate_fill(struct sk_buff *msg, struct devlink_rate *devlink_rate, enum devlink_command cmd, u32 portid, u32 seq, int flags, struct netlink_ext_ack *extack) { struct devlink *devlink = devlink_rate->devlink; void *hdr; hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd); if (!hdr) return -EMSGSIZE; if (devlink_nl_put_handle(msg, devlink)) goto nla_put_failure; if (nla_put_u16(msg, DEVLINK_ATTR_RATE_TYPE, devlink_rate->type)) goto nla_put_failure; if (devlink_rate_is_leaf(devlink_rate)) { if (nla_put_u32(msg, DEVLINK_ATTR_PORT_INDEX, devlink_rate->devlink_port->index)) goto nla_put_failure; } else if (devlink_rate_is_node(devlink_rate)) { if (nla_put_string(msg, DEVLINK_ATTR_RATE_NODE_NAME, devlink_rate->name)) goto nla_put_failure; } if (devlink_nl_put_u64(msg, DEVLINK_ATTR_RATE_TX_SHARE, devlink_rate->tx_share)) goto nla_put_failure; if (devlink_nl_put_u64(msg, DEVLINK_ATTR_RATE_TX_MAX, devlink_rate->tx_max)) goto nla_put_failure; if (nla_put_u32(msg, DEVLINK_ATTR_RATE_TX_PRIORITY, devlink_rate->tx_priority)) goto nla_put_failure; if (nla_put_u32(msg, DEVLINK_ATTR_RATE_TX_WEIGHT, devlink_rate->tx_weight)) goto nla_put_failure; if (devlink_rate->parent) if (nla_put_string(msg, DEVLINK_ATTR_RATE_PARENT_NODE_NAME, devlink_rate->parent->name)) goto nla_put_failure; if (devlink_rate_put_tc_bws(msg, devlink_rate->tc_bw)) goto 
nla_put_failure; genlmsg_end(msg, hdr); return 0; nla_put_failure: genlmsg_cancel(msg, hdr); return -EMSGSIZE; } static void devlink_rate_notify(struct devlink_rate *devlink_rate, enum devlink_command cmd) { struct devlink *devlink = devlink_rate->devlink; struct sk_buff *msg; int err; WARN_ON(cmd != DEVLINK_CMD_RATE_NEW && cmd != DEVLINK_CMD_RATE_DEL); if (!devl_is_registered(devlink) || !devlink_nl_notify_need(devlink)) return; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return; err = devlink_nl_rate_fill(msg, devlink_rate, cmd, 0, 0, 0, NULL); if (err) { nlmsg_free(msg); return; } devlink_nl_notify_send(devlink, msg); } void devlink_rates_notify_register(struct devlink *devlink) { struct devlink_rate *rate_node; list_for_each_entry(rate_node, &devlink->rate_list, list) devlink_rate_notify(rate_node, DEVLINK_CMD_RATE_NEW); } void devlink_rates_notify_unregister(struct devlink *devlink) { struct devlink_rate *rate_node; list_for_each_entry_reverse(rate_node, &devlink->rate_list, list) devlink_rate_notify(rate_node, DEVLINK_CMD_RATE_DEL); } static int devlink_nl_rate_get_dump_one(struct sk_buff *msg, struct devlink *devlink, struct netlink_callback *cb, int flags) { struct devlink_nl_dump_state *state = devlink_dump_state(cb); struct devlink_rate *devlink_rate; int idx = 0; int err = 0; list_for_each_entry(devlink_rate, &devlink->rate_list, list) { enum devlink_command cmd = DEVLINK_CMD_RATE_NEW; u32 id = NETLINK_CB(cb->skb).portid; if (idx < state->idx) { idx++; continue; } err = devlink_nl_rate_fill(msg, devlink_rate, cmd, id, cb->nlh->nlmsg_seq, flags, NULL); if (err) { state->idx = idx; break; } idx++; } return err; } int devlink_nl_rate_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { return devlink_nl_dumpit(skb, cb, devlink_nl_rate_get_dump_one); } int devlink_nl_rate_get_doit(struct sk_buff *skb, struct genl_info *info) { struct devlink *devlink = info->user_ptr[0]; struct devlink_rate *devlink_rate; struct sk_buff *msg; int 
err; devlink_rate = devlink_rate_get_from_info(devlink, info); if (IS_ERR(devlink_rate)) return PTR_ERR(devlink_rate); msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; err = devlink_nl_rate_fill(msg, devlink_rate, DEVLINK_CMD_RATE_NEW, info->snd_portid, info->snd_seq, 0, info->extack); if (err) { nlmsg_free(msg); return err; } return genlmsg_reply(msg, info); } static bool devlink_rate_is_parent_node(struct devlink_rate *devlink_rate, struct devlink_rate *parent) { while (parent) { if (parent == devlink_rate) return true; parent = parent->parent; } return false; } static int devlink_nl_rate_parent_node_set(struct devlink_rate *devlink_rate, struct genl_info *info, struct nlattr *nla_parent) { struct devlink *devlink = devlink_rate->devlink; const char *parent_name = nla_data(nla_parent); const struct devlink_ops *ops = devlink->ops; size_t len = strlen(parent_name); struct devlink_rate *parent; int err = -EOPNOTSUPP; parent = devlink_rate->parent; if (parent && !len) { if (devlink_rate_is_leaf(devlink_rate)) err = ops->rate_leaf_parent_set(devlink_rate, NULL, devlink_rate->priv, NULL, info->extack); else if (devlink_rate_is_node(devlink_rate)) err = ops->rate_node_parent_set(devlink_rate, NULL, devlink_rate->priv, NULL, info->extack); if (err) return err; refcount_dec(&parent->refcnt); devlink_rate->parent = NULL; } else if (len) { parent = devlink_rate_node_get_by_name(devlink, parent_name); if (IS_ERR(parent)) return -ENODEV; if (parent == devlink_rate) { NL_SET_ERR_MSG(info->extack, "Parent to self is not allowed"); return -EINVAL; } if (devlink_rate_is_node(devlink_rate) && devlink_rate_is_parent_node(devlink_rate, parent->parent)) { NL_SET_ERR_MSG(info->extack, "Node is already a parent of parent node."); return -EEXIST; } if (devlink_rate_is_leaf(devlink_rate)) err = ops->rate_leaf_parent_set(devlink_rate, parent, devlink_rate->priv, parent->priv, info->extack); else if (devlink_rate_is_node(devlink_rate)) err = 
ops->rate_node_parent_set(devlink_rate, parent, devlink_rate->priv, parent->priv, info->extack); if (err) return err; if (devlink_rate->parent) /* we're reassigning to other parent in this case */ refcount_dec(&devlink_rate->parent->refcnt); refcount_inc(&parent->refcnt); devlink_rate->parent = parent; } return 0; } static int devlink_nl_rate_tc_bw_parse(struct nlattr *parent_nest, u32 *tc_bw, unsigned long *bitmap, struct netlink_ext_ack *extack) { struct nlattr *tb[DEVLINK_RATE_TC_ATTR_MAX + 1]; u8 tc_index; int err; err = nla_parse_nested(tb, DEVLINK_RATE_TC_ATTR_MAX, parent_nest, devlink_dl_rate_tc_bws_nl_policy, extack); if (err) return err; if (!tb[DEVLINK_RATE_TC_ATTR_INDEX]) { NL_SET_ERR_ATTR_MISS(extack, parent_nest, DEVLINK_RATE_TC_ATTR_INDEX); return -EINVAL; } tc_index = nla_get_u8(tb[DEVLINK_RATE_TC_ATTR_INDEX]); if (!tb[DEVLINK_RATE_TC_ATTR_BW]) { NL_SET_ERR_ATTR_MISS(extack, parent_nest, DEVLINK_RATE_TC_ATTR_BW); return -EINVAL; } if (test_and_set_bit(tc_index, bitmap)) { NL_SET_ERR_MSG_FMT(extack, "Duplicate traffic class index specified (%u)", tc_index); return -EINVAL; } tc_bw[tc_index] = nla_get_u32(tb[DEVLINK_RATE_TC_ATTR_BW]); return 0; } static int devlink_nl_rate_tc_bw_set(struct devlink_rate *devlink_rate, struct genl_info *info) { DECLARE_BITMAP(bitmap, DEVLINK_RATE_TCS_MAX) = {}; struct devlink *devlink = devlink_rate->devlink; const struct devlink_ops *ops = devlink->ops; u32 tc_bw[DEVLINK_RATE_TCS_MAX] = {}; int rem, err = -EOPNOTSUPP, i; struct nlattr *attr; nlmsg_for_each_attr_type(attr, DEVLINK_ATTR_RATE_TC_BWS, info->nlhdr, GENL_HDRLEN, rem) { err = devlink_nl_rate_tc_bw_parse(attr, tc_bw, bitmap, info->extack); if (err) return err; } for (i = 0; i < DEVLINK_RATE_TCS_MAX; i++) { if (!test_bit(i, bitmap)) { NL_SET_ERR_MSG_FMT(info->extack, "Bandwidth values must be specified for all %u traffic classes", DEVLINK_RATE_TCS_MAX); return -EINVAL; } } if (devlink_rate_is_leaf(devlink_rate)) err = ops->rate_leaf_tc_bw_set(devlink_rate, 
devlink_rate->priv, tc_bw, info->extack); else if (devlink_rate_is_node(devlink_rate)) err = ops->rate_node_tc_bw_set(devlink_rate, devlink_rate->priv, tc_bw, info->extack); if (err) return err; memcpy(devlink_rate->tc_bw, tc_bw, sizeof(tc_bw)); return 0; } static int devlink_nl_rate_set(struct devlink_rate *devlink_rate, const struct devlink_ops *ops, struct genl_info *info) { struct nlattr *nla_parent, **attrs = info->attrs; int err = -EOPNOTSUPP; u32 priority; u32 weight; u64 rate; if (attrs[DEVLINK_ATTR_RATE_TX_SHARE]) { rate = nla_get_u64(attrs[DEVLINK_ATTR_RATE_TX_SHARE]); if (devlink_rate_is_leaf(devlink_rate)) err = ops->rate_leaf_tx_share_set(devlink_rate, devlink_rate->priv, rate, info->extack); else if (devlink_rate_is_node(devlink_rate)) err = ops->rate_node_tx_share_set(devlink_rate, devlink_rate->priv, rate, info->extack); if (err) return err; devlink_rate->tx_share = rate; } if (attrs[DEVLINK_ATTR_RATE_TX_MAX]) { rate = nla_get_u64(attrs[DEVLINK_ATTR_RATE_TX_MAX]); if (devlink_rate_is_leaf(devlink_rate)) err = ops->rate_leaf_tx_max_set(devlink_rate, devlink_rate->priv, rate, info->extack); else if (devlink_rate_is_node(devlink_rate)) err = ops->rate_node_tx_max_set(devlink_rate, devlink_rate->priv, rate, info->extack); if (err) return err; devlink_rate->tx_max = rate; } if (attrs[DEVLINK_ATTR_RATE_TX_PRIORITY]) { priority = nla_get_u32(attrs[DEVLINK_ATTR_RATE_TX_PRIORITY]); if (devlink_rate_is_leaf(devlink_rate)) err = ops->rate_leaf_tx_priority_set(devlink_rate, devlink_rate->priv, priority, info->extack); else if (devlink_rate_is_node(devlink_rate)) err = ops->rate_node_tx_priority_set(devlink_rate, devlink_rate->priv, priority, info->extack); if (err) return err; devlink_rate->tx_priority = priority; } if (attrs[DEVLINK_ATTR_RATE_TX_WEIGHT]) { weight = nla_get_u32(attrs[DEVLINK_ATTR_RATE_TX_WEIGHT]); if (devlink_rate_is_leaf(devlink_rate)) err = ops->rate_leaf_tx_weight_set(devlink_rate, devlink_rate->priv, weight, info->extack); else if 
(devlink_rate_is_node(devlink_rate)) err = ops->rate_node_tx_weight_set(devlink_rate, devlink_rate->priv, weight, info->extack); if (err) return err; devlink_rate->tx_weight = weight; } nla_parent = attrs[DEVLINK_ATTR_RATE_PARENT_NODE_NAME]; if (nla_parent) { err = devlink_nl_rate_parent_node_set(devlink_rate, info, nla_parent); if (err) return err; } if (attrs[DEVLINK_ATTR_RATE_TC_BWS]) { err = devlink_nl_rate_tc_bw_set(devlink_rate, info); if (err) return err; } return 0; } static bool devlink_rate_set_ops_supported(const struct devlink_ops *ops, struct genl_info *info, enum devlink_rate_type type) { struct nlattr **attrs = info->attrs; if (type == DEVLINK_RATE_TYPE_LEAF) { if (attrs[DEVLINK_ATTR_RATE_TX_SHARE] && !ops->rate_leaf_tx_share_set) { NL_SET_ERR_MSG(info->extack, "TX share set isn't supported for the leafs"); return false; } if (attrs[DEVLINK_ATTR_RATE_TX_MAX] && !ops->rate_leaf_tx_max_set) { NL_SET_ERR_MSG(info->extack, "TX max set isn't supported for the leafs"); return false; } if (attrs[DEVLINK_ATTR_RATE_PARENT_NODE_NAME] && !ops->rate_leaf_parent_set) { NL_SET_ERR_MSG(info->extack, "Parent set isn't supported for the leafs"); return false; } if (attrs[DEVLINK_ATTR_RATE_TX_PRIORITY] && !ops->rate_leaf_tx_priority_set) { NL_SET_ERR_MSG_ATTR(info->extack, attrs[DEVLINK_ATTR_RATE_TX_PRIORITY], "TX priority set isn't supported for the leafs"); return false; } if (attrs[DEVLINK_ATTR_RATE_TX_WEIGHT] && !ops->rate_leaf_tx_weight_set) { NL_SET_ERR_MSG_ATTR(info->extack, attrs[DEVLINK_ATTR_RATE_TX_WEIGHT], "TX weight set isn't supported for the leafs"); return false; } if (attrs[DEVLINK_ATTR_RATE_TC_BWS] && !ops->rate_leaf_tc_bw_set) { NL_SET_ERR_MSG_ATTR(info->extack, attrs[DEVLINK_ATTR_RATE_TC_BWS], "TC bandwidth set isn't supported for the leafs"); return false; } } else if (type == DEVLINK_RATE_TYPE_NODE) { if (attrs[DEVLINK_ATTR_RATE_TX_SHARE] && !ops->rate_node_tx_share_set) { NL_SET_ERR_MSG(info->extack, "TX share set isn't supported for the nodes"); 
return false; } if (attrs[DEVLINK_ATTR_RATE_TX_MAX] && !ops->rate_node_tx_max_set) { NL_SET_ERR_MSG(info->extack, "TX max set isn't supported for the nodes"); return false; } if (attrs[DEVLINK_ATTR_RATE_PARENT_NODE_NAME] && !ops->rate_node_parent_set) { NL_SET_ERR_MSG(info->extack, "Parent set isn't supported for the nodes"); return false; } if (attrs[DEVLINK_ATTR_RATE_TX_PRIORITY] && !ops->rate_node_tx_priority_set) { NL_SET_ERR_MSG_ATTR(info->extack, attrs[DEVLINK_ATTR_RATE_TX_PRIORITY], "TX priority set isn't supported for the nodes"); return false; } if (attrs[DEVLINK_ATTR_RATE_TX_WEIGHT] && !ops->rate_node_tx_weight_set) { NL_SET_ERR_MSG_ATTR(info->extack, attrs[DEVLINK_ATTR_RATE_TX_WEIGHT], "TX weight set isn't supported for the nodes"); return false; } if (attrs[DEVLINK_ATTR_RATE_TC_BWS] && !ops->rate_node_tc_bw_set) { NL_SET_ERR_MSG_ATTR(info->extack, attrs[DEVLINK_ATTR_RATE_TC_BWS], "TC bandwidth set isn't supported for the nodes"); return false; } } else { WARN(1, "Unknown type of rate object"); return false; } return true; } int devlink_nl_rate_set_doit(struct sk_buff *skb, struct genl_info *info) { struct devlink *devlink = info->user_ptr[0]; struct devlink_rate *devlink_rate; const struct devlink_ops *ops; int err; devlink_rate = devlink_rate_get_from_info(devlink, info); if (IS_ERR(devlink_rate)) return PTR_ERR(devlink_rate); ops = devlink->ops; if (!ops || !devlink_rate_set_ops_supported(ops, info, devlink_rate->type)) return -EOPNOTSUPP; err = devlink_nl_rate_set(devlink_rate, ops, info); if (!err) devlink_rate_notify(devlink_rate, DEVLINK_CMD_RATE_NEW); return err; } int devlink_nl_rate_new_doit(struct sk_buff *skb, struct genl_info *info) { struct devlink *devlink = info->user_ptr[0]; struct devlink_rate *rate_node; const struct devlink_ops *ops; int err; ops = devlink->ops; if (!ops || !ops->rate_node_new || !ops->rate_node_del) { NL_SET_ERR_MSG(info->extack, "Rate nodes aren't supported"); return -EOPNOTSUPP; } if 
(!devlink_rate_set_ops_supported(ops, info, DEVLINK_RATE_TYPE_NODE)) return -EOPNOTSUPP; rate_node = devlink_rate_node_get_from_attrs(devlink, info->attrs); if (!IS_ERR(rate_node)) return -EEXIST; else if (rate_node == ERR_PTR(-EINVAL)) return -EINVAL; rate_node = kzalloc_obj(*rate_node); if (!rate_node) return -ENOMEM; rate_node->devlink = devlink; rate_node->type = DEVLINK_RATE_TYPE_NODE; rate_node->name = nla_strdup(info->attrs[DEVLINK_ATTR_RATE_NODE_NAME], GFP_KERNEL); if (!rate_node->name) { err = -ENOMEM; goto err_strdup; } err = ops->rate_node_new(rate_node, &rate_node->priv, info->extack); if (err) goto err_node_new; err = devlink_nl_rate_set(rate_node, ops, info); if (err) goto err_rate_set; refcount_set(&rate_node->refcnt, 1); list_add(&rate_node->list, &devlink->rate_list); devlink_rate_notify(rate_node, DEVLINK_CMD_RATE_NEW); return 0; err_rate_set: ops->rate_node_del(rate_node, rate_node->priv, info->extack); err_node_new: kfree(rate_node->name); err_strdup: kfree(rate_node); return err; } int devlink_nl_rate_del_doit(struct sk_buff *skb, struct genl_info *info) { struct devlink *devlink = info->user_ptr[0]; struct devlink_rate *rate_node; int err; rate_node = devlink_rate_node_get_from_info(devlink, info); if (IS_ERR(rate_node)) return PTR_ERR(rate_node); if (refcount_read(&rate_node->refcnt) > 1) { NL_SET_ERR_MSG(info->extack, "Node has children. 
Cannot delete node."); return -EBUSY; } devlink_rate_notify(rate_node, DEVLINK_CMD_RATE_DEL); err = devlink->ops->rate_node_del(rate_node, rate_node->priv, info->extack); if (rate_node->parent) refcount_dec(&rate_node->parent->refcnt); list_del(&rate_node->list); kfree(rate_node->name); kfree(rate_node); return err; } int devlink_rates_check(struct devlink *devlink, bool (*rate_filter)(const struct devlink_rate *), struct netlink_ext_ack *extack) { struct devlink_rate *devlink_rate; list_for_each_entry(devlink_rate, &devlink->rate_list, list) if (!rate_filter || rate_filter(devlink_rate)) { if (extack) NL_SET_ERR_MSG(extack, "Rate node(s) exists."); return -EBUSY; } return 0; } /** * devl_rate_node_create - create devlink rate node * @devlink: devlink instance * @priv: driver private data * @node_name: name of the resulting node * @parent: parent devlink_rate struct * * Create devlink rate object of type node */ struct devlink_rate * devl_rate_node_create(struct devlink *devlink, void *priv, char *node_name, struct devlink_rate *parent) { struct devlink_rate *rate_node; rate_node = devlink_rate_node_get_by_name(devlink, node_name); if (!IS_ERR(rate_node)) return ERR_PTR(-EEXIST); rate_node = kzalloc_obj(*rate_node); if (!rate_node) return ERR_PTR(-ENOMEM); if (parent) { rate_node->parent = parent; refcount_inc(&rate_node->parent->refcnt); } rate_node->type = DEVLINK_RATE_TYPE_NODE; rate_node->devlink = devlink; rate_node->priv = priv; rate_node->name = kstrdup(node_name, GFP_KERNEL); if (!rate_node->name) { kfree(rate_node); return ERR_PTR(-ENOMEM); } refcount_set(&rate_node->refcnt, 1); list_add(&rate_node->list, &devlink->rate_list); devlink_rate_notify(rate_node, DEVLINK_CMD_RATE_NEW); return rate_node; } EXPORT_SYMBOL_GPL(devl_rate_node_create); /** * devl_rate_leaf_create - create devlink rate leaf * @devlink_port: devlink port object to create rate object on * @priv: driver private data * @parent: parent devlink_rate struct * * Create devlink rate object of 
type leaf on provided @devlink_port. */ int devl_rate_leaf_create(struct devlink_port *devlink_port, void *priv, struct devlink_rate *parent) { struct devlink *devlink = devlink_port->devlink; struct devlink_rate *devlink_rate; devl_assert_locked(devlink_port->devlink); if (WARN_ON(devlink_port->devlink_rate)) return -EBUSY; devlink_rate = kzalloc_obj(*devlink_rate); if (!devlink_rate) return -ENOMEM; if (parent) { devlink_rate->parent = parent; refcount_inc(&devlink_rate->parent->refcnt); } devlink_rate->type = DEVLINK_RATE_TYPE_LEAF; devlink_rate->devlink = devlink; devlink_rate->devlink_port = devlink_port; devlink_rate->priv = priv; list_add_tail(&devlink_rate->list, &devlink->rate_list); devlink_port->devlink_rate = devlink_rate; devlink_rate_notify(devlink_rate, DEVLINK_CMD_RATE_NEW); return 0; } EXPORT_SYMBOL_GPL(devl_rate_leaf_create); /** * devl_rate_leaf_destroy - destroy devlink rate leaf * * @devlink_port: devlink port linked to the rate object * * Destroy the devlink rate object of type leaf on provided @devlink_port. */ void devl_rate_leaf_destroy(struct devlink_port *devlink_port) { struct devlink_rate *devlink_rate = devlink_port->devlink_rate; devl_assert_locked(devlink_port->devlink); if (!devlink_rate) return; devlink_rate_notify(devlink_rate, DEVLINK_CMD_RATE_DEL); if (devlink_rate->parent) refcount_dec(&devlink_rate->parent->refcnt); list_del(&devlink_rate->list); devlink_port->devlink_rate = NULL; kfree(devlink_rate); } EXPORT_SYMBOL_GPL(devl_rate_leaf_destroy); /** * devl_rate_nodes_destroy - destroy all devlink rate nodes on device * @devlink: devlink instance * * Unset parent for all rate objects and destroy all rate nodes * on specified device. 
*/ void devl_rate_nodes_destroy(struct devlink *devlink) { const struct devlink_ops *ops = devlink->ops; struct devlink_rate *devlink_rate, *tmp; devl_assert_locked(devlink); list_for_each_entry(devlink_rate, &devlink->rate_list, list) { if (!devlink_rate->parent) continue; if (devlink_rate_is_leaf(devlink_rate)) ops->rate_leaf_parent_set(devlink_rate, NULL, devlink_rate->priv, NULL, NULL); else if (devlink_rate_is_node(devlink_rate)) ops->rate_node_parent_set(devlink_rate, NULL, devlink_rate->priv, NULL, NULL); refcount_dec(&devlink_rate->parent->refcnt); devlink_rate->parent = NULL; } list_for_each_entry_safe(devlink_rate, tmp, &devlink->rate_list, list) { if (devlink_rate_is_node(devlink_rate)) { ops->rate_node_del(devlink_rate, devlink_rate->priv, NULL); list_del(&devlink_rate->list); kfree(devlink_rate->name); kfree(devlink_rate); } } } EXPORT_SYMBOL_GPL(devl_rate_nodes_destroy);
543 541 543 543 444 534 534 438 535 117 24 24 3 3 536 541 532 443 1 443 109 2 106 359 537 21 1 1 1 1 1 113 2 111 112 111 1 4 1 3 4 3 1 3 1 2 3 2 1 43 1 43 43 43 43 42 254 258 45 251 21 18 16 17 3 21 44 43 1 34 1 33 1 32 34 32 1 31 1 30 31 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 // SPDX-License-Identifier: GPL-2.0-or-later /* Filesystem parameter parser. * * Copyright (C) 2018 Red Hat, Inc. All Rights Reserved. 
* Written by David Howells (dhowells@redhat.com) */ #include <linux/export.h> #include <linux/fs_context.h> #include <linux/fs_parser.h> #include <linux/slab.h> #include <linux/security.h> #include <linux/namei.h> #include "internal.h" static const struct constant_table bool_names[] = { { "0", false }, { "1", true }, { "false", false }, { "no", false }, { "true", true }, { "yes", true }, { }, }; static const struct constant_table * __lookup_constant(const struct constant_table *tbl, const char *name) { for ( ; tbl->name; tbl++) if (strcmp(name, tbl->name) == 0) return tbl; return NULL; } /** * lookup_constant - Look up a constant by name in an ordered table * @tbl: The table of constants to search. * @name: The name to look up. * @not_found: The value to return if the name is not found. */ int lookup_constant(const struct constant_table *tbl, const char *name, int not_found) { const struct constant_table *p = __lookup_constant(tbl, name); return p ? p->value : not_found; } EXPORT_SYMBOL(lookup_constant); static inline bool is_flag(const struct fs_parameter_spec *p) { return p->type == NULL; } static const struct fs_parameter_spec *fs_lookup_key( const struct fs_parameter_spec *desc, struct fs_parameter *param, bool *negated) { const struct fs_parameter_spec *p, *other = NULL; const char *name = param->key; bool want_flag = param->type == fs_value_is_flag; *negated = false; for (p = desc; p->name; p++) { if (strcmp(p->name, name) != 0) continue; if (likely(is_flag(p) == want_flag)) return p; other = p; } if (want_flag) { if (name[0] == 'n' && name[1] == 'o' && name[2]) { for (p = desc; p->name; p++) { if (strcmp(p->name, name + 2) != 0) continue; if (!(p->flags & fs_param_neg_with_no)) continue; *negated = true; return p; } } } return other; } /* * __fs_parse - Parse a filesystem configuration parameter * @log: The filesystem context to log errors through. * @desc: The parameter description to use. * @param: The parameter. 
* @result: Where to place the result of the parse * * Parse a filesystem configuration parameter and attempt a conversion for a * simple parameter for which this is requested. If successful, the determined * parameter ID is placed into @result->key, the desired type is indicated in * @result->t and any converted value is placed into an appropriate member of * the union in @result. * * The function returns the parameter number if the parameter was matched, * -ENOPARAM if it wasn't matched and @desc->ignore_unknown indicated that * unknown parameters are okay and -EINVAL if there was a conversion issue or * the parameter wasn't recognised and unknowns aren't okay. */ int __fs_parse(struct p_log *log, const struct fs_parameter_spec *desc, struct fs_parameter *param, struct fs_parse_result *result) { const struct fs_parameter_spec *p; result->uint_64 = 0; p = fs_lookup_key(desc, param, &result->negated); if (!p) return -ENOPARAM; if (p->flags & fs_param_deprecated) warn_plog(log, "Deprecated parameter '%s'", param->key); /* Try to turn the type we were given into the type desired by the * parameter and give an error if we can't. */ if (is_flag(p)) { if (param->type != fs_value_is_flag) return inval_plog(log, "Unexpected value for '%s'", param->key); result->boolean = !result->negated; } else { int ret = p->type(log, p, param, result); if (ret) return ret; } return p->opt; } EXPORT_SYMBOL(__fs_parse); /** * fs_lookup_param - Look up a path referred to by a parameter * @fc: The filesystem context to log errors through. * @param: The parameter. 
* @want_bdev: T if want a blockdev * @flags: Pathwalk flags passed to filename_lookup() * @_path: The result of the lookup */ int fs_lookup_param(struct fs_context *fc, struct fs_parameter *param, bool want_bdev, unsigned int flags, struct path *_path) { struct filename *f; bool put_f; int ret; switch (param->type) { case fs_value_is_string: f = getname_kernel(param->string); if (IS_ERR(f)) return PTR_ERR(f); param->dirfd = AT_FDCWD; put_f = true; break; case fs_value_is_filename: f = param->name; put_f = false; break; default: return invalf(fc, "%s: not usable as path", param->key); } ret = filename_lookup(param->dirfd, f, flags, _path, NULL); if (ret < 0) { errorf(fc, "%s: Lookup failure for '%s'", param->key, f->name); goto out; } if (want_bdev && !S_ISBLK(d_backing_inode(_path->dentry)->i_mode)) { path_put(_path); _path->dentry = NULL; _path->mnt = NULL; errorf(fc, "%s: Non-blockdev passed as '%s'", param->key, f->name); ret = -ENOTBLK; } out: if (put_f) putname(f); return ret; } EXPORT_SYMBOL(fs_lookup_param); static int fs_param_bad_value(struct p_log *log, struct fs_parameter *param) { return inval_plog(log, "Bad value for '%s'", param->key); } int fs_param_is_bool(struct p_log *log, const struct fs_parameter_spec *p, struct fs_parameter *param, struct fs_parse_result *result) { int b; if (param->type != fs_value_is_string) return fs_param_bad_value(log, param); if (!*param->string && (p->flags & fs_param_can_be_empty)) return 0; b = lookup_constant(bool_names, param->string, -1); if (b == -1) return fs_param_bad_value(log, param); result->boolean = b; return 0; } EXPORT_SYMBOL(fs_param_is_bool); int fs_param_is_u32(struct p_log *log, const struct fs_parameter_spec *p, struct fs_parameter *param, struct fs_parse_result *result) { int base = (unsigned long)p->data; if (param->type != fs_value_is_string) return fs_param_bad_value(log, param); if (!*param->string && (p->flags & fs_param_can_be_empty)) return 0; if (kstrtouint(param->string, base, 
&result->uint_32) < 0) return fs_param_bad_value(log, param); return 0; } EXPORT_SYMBOL(fs_param_is_u32); int fs_param_is_s32(struct p_log *log, const struct fs_parameter_spec *p, struct fs_parameter *param, struct fs_parse_result *result) { if (param->type != fs_value_is_string) return fs_param_bad_value(log, param); if (!*param->string && (p->flags & fs_param_can_be_empty)) return 0; if (kstrtoint(param->string, 0, &result->int_32) < 0) return fs_param_bad_value(log, param); return 0; } EXPORT_SYMBOL(fs_param_is_s32); int fs_param_is_u64(struct p_log *log, const struct fs_parameter_spec *p, struct fs_parameter *param, struct fs_parse_result *result) { if (param->type != fs_value_is_string) return fs_param_bad_value(log, param); if (!*param->string && (p->flags & fs_param_can_be_empty)) return 0; if (kstrtoull(param->string, 0, &result->uint_64) < 0) return fs_param_bad_value(log, param); return 0; } EXPORT_SYMBOL(fs_param_is_u64); int fs_param_is_enum(struct p_log *log, const struct fs_parameter_spec *p, struct fs_parameter *param, struct fs_parse_result *result) { const struct constant_table *c; if (param->type != fs_value_is_string) return fs_param_bad_value(log, param); if (!*param->string && (p->flags & fs_param_can_be_empty)) return 0; c = __lookup_constant(p->data, param->string); if (!c) return fs_param_bad_value(log, param); result->uint_32 = c->value; return 0; } EXPORT_SYMBOL(fs_param_is_enum); int fs_param_is_string(struct p_log *log, const struct fs_parameter_spec *p, struct fs_parameter *param, struct fs_parse_result *result) { if (param->type != fs_value_is_string || (!*param->string && !(p->flags & fs_param_can_be_empty))) return fs_param_bad_value(log, param); return 0; } EXPORT_SYMBOL(fs_param_is_string); int fs_param_is_fd(struct p_log *log, const struct fs_parameter_spec *p, struct fs_parameter *param, struct fs_parse_result *result) { switch (param->type) { case fs_value_is_string: if ((!*param->string && !(p->flags & fs_param_can_be_empty)) 
|| kstrtouint(param->string, 0, &result->uint_32) < 0) break; if (result->uint_32 <= INT_MAX) return 0; break; case fs_value_is_file: result->uint_32 = param->dirfd; if (result->uint_32 <= INT_MAX) return 0; break; default: break; } return fs_param_bad_value(log, param); } EXPORT_SYMBOL(fs_param_is_fd); int fs_param_is_file_or_string(struct p_log *log, const struct fs_parameter_spec *p, struct fs_parameter *param, struct fs_parse_result *result) { switch (param->type) { case fs_value_is_string: return fs_param_is_string(log, p, param, result); case fs_value_is_file: result->uint_32 = param->dirfd; if (result->uint_32 <= INT_MAX) return 0; break; default: break; } return fs_param_bad_value(log, param); } EXPORT_SYMBOL(fs_param_is_file_or_string); int fs_param_is_uid(struct p_log *log, const struct fs_parameter_spec *p, struct fs_parameter *param, struct fs_parse_result *result) { kuid_t uid; if (fs_param_is_u32(log, p, param, result) != 0) return fs_param_bad_value(log, param); uid = make_kuid(current_user_ns(), result->uint_32); if (!uid_valid(uid)) return inval_plog(log, "Invalid uid '%s'", param->string); result->uid = uid; return 0; } EXPORT_SYMBOL(fs_param_is_uid); int fs_param_is_gid(struct p_log *log, const struct fs_parameter_spec *p, struct fs_parameter *param, struct fs_parse_result *result) { kgid_t gid; if (fs_param_is_u32(log, p, param, result) != 0) return fs_param_bad_value(log, param); gid = make_kgid(current_user_ns(), result->uint_32); if (!gid_valid(gid)) return inval_plog(log, "Invalid gid '%s'", param->string); result->gid = gid; return 0; } EXPORT_SYMBOL(fs_param_is_gid); int fs_param_is_blockdev(struct p_log *log, const struct fs_parameter_spec *p, struct fs_parameter *param, struct fs_parse_result *result) { return 0; } EXPORT_SYMBOL(fs_param_is_blockdev); #ifdef CONFIG_VALIDATE_FS_PARSER /** * fs_validate_description - Validate a parameter specification array * @name: Owner name of the parameter specification array * @desc: The parameter 
specification array to validate. */ bool fs_validate_description(const char *name, const struct fs_parameter_spec *desc) { const struct fs_parameter_spec *param, *p2; bool good = true; for (param = desc; param->name; param++) { /* Check for duplicate parameter names */ for (p2 = desc; p2 < param; p2++) { if (strcmp(param->name, p2->name) == 0) { if (is_flag(param) != is_flag(p2)) continue; pr_err("VALIDATE %s: PARAM[%s]: Duplicate\n", name, param->name); good = false; } } } return good; } #endif /* CONFIG_VALIDATE_FS_PARSER */
1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 /* SPDX-License-Identifier: GPL-2.0-only */ /* * Trace point definitions for core RDMA functions. * * Author: Chuck Lever <chuck.lever@oracle.com> * * Copyright (c) 2019, Oracle and/or its affiliates. All rights reserved. 
*/
#undef TRACE_SYSTEM
#define TRACE_SYSTEM rdma_core

#if !defined(_TRACE_RDMA_CORE_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_RDMA_CORE_H

#include <linux/tracepoint.h>
#include <rdma/ib_verbs.h>

/*
 * enum ib_poll_context, from include/rdma/ib_verbs.h
 *
 * The list below is expanded twice: first with ib_poll_ctx() defined as
 * TRACE_DEFINE_ENUM(), then with it defined as a { value, "name" } pair for
 * __print_symbolic().
 */
#define IB_POLL_CTX_LIST			\
	ib_poll_ctx(DIRECT)			\
	ib_poll_ctx(SOFTIRQ)			\
	ib_poll_ctx(WORKQUEUE)			\
	ib_poll_ctx_end(UNBOUND_WORKQUEUE)

#undef ib_poll_ctx
#undef ib_poll_ctx_end

#define ib_poll_ctx(x)		TRACE_DEFINE_ENUM(IB_POLL_##x);
#define ib_poll_ctx_end(x)	TRACE_DEFINE_ENUM(IB_POLL_##x);

IB_POLL_CTX_LIST

#undef ib_poll_ctx
#undef ib_poll_ctx_end

#define ib_poll_ctx(x)		{ IB_POLL_##x, #x },
#define ib_poll_ctx_end(x)	{ IB_POLL_##x, #x }

/* Render a poll context value as its symbolic name. */
#define rdma_show_ib_poll_ctx(x) \
		__print_symbolic(x, IB_POLL_CTX_LIST)

/**
 ** Completion Queue events
 **/

/*
 * cq_schedule - records the CQ's resource id.
 *
 * NOTE: the assign block also writes to the CQ itself: it stores
 * ktime_get() in cq->timestamp and sets cq->interrupt to true.
 */
TRACE_EVENT(cq_schedule,
	TP_PROTO(
		struct ib_cq *cq
	),

	TP_ARGS(cq),

	TP_STRUCT__entry(
		__field(u32, cq_id)
	),

	TP_fast_assign(
		cq->timestamp = ktime_get();
		cq->interrupt = true;

		__entry->cq_id = cq->res.id;
	),

	TP_printk("cq.id=%u", __entry->cq_id)
);

/*
 * cq_reschedule - same as cq_schedule, except that cq->interrupt is set to
 * false.
 */
TRACE_EVENT(cq_reschedule,
	TP_PROTO(
		struct ib_cq *cq
	),

	TP_ARGS(cq),

	TP_STRUCT__entry(
		__field(u32, cq_id)
	),

	TP_fast_assign(
		cq->timestamp = ktime_get();
		cq->interrupt = false;

		__entry->cq_id = cq->res.id;
	),

	TP_printk("cq.id=%u", __entry->cq_id)
);

/*
 * cq_process - records the time elapsed since cq->timestamp, in
 * microseconds, and whether cq->interrupt was set.
 */
TRACE_EVENT(cq_process,
	TP_PROTO(
		const struct ib_cq *cq
	),

	TP_ARGS(cq),

	TP_STRUCT__entry(
		__field(u32, cq_id)
		__field(bool, interrupt)
		__field(s64, latency)
	),

	TP_fast_assign(
		ktime_t latency = ktime_sub(ktime_get(), cq->timestamp);

		__entry->cq_id = cq->res.id;
		__entry->latency = ktime_to_us(latency);
		__entry->interrupt = cq->interrupt;
	),

	TP_printk("cq.id=%u wake-up took %lld [us] from %s",
		__entry->cq_id, __entry->latency,
		__entry->interrupt ? "interrupt" : "reschedule"
	)
);

/* cq_poll - records the requested count and the value that came back. */
TRACE_EVENT(cq_poll,
	TP_PROTO(
		const struct ib_cq *cq,
		int requested,
		int rc
	),

	TP_ARGS(cq, requested, rc),

	TP_STRUCT__entry(
		__field(u32, cq_id)
		__field(int, requested)
		__field(int, rc)
	),

	TP_fast_assign(
		__entry->cq_id = cq->res.id;
		__entry->requested = requested;
		__entry->rc = rc;
	),

	TP_printk("cq.id=%u requested %d, returned %d",
		__entry->cq_id, __entry->requested, __entry->rc
	)
);

/* cq_drain_complete - records only the CQ's resource id. */
TRACE_EVENT(cq_drain_complete,
	TP_PROTO(
		const struct ib_cq *cq
	),

	TP_ARGS(cq),

	TP_STRUCT__entry(
		__field(u32, cq_id)
	),

	TP_fast_assign(
		__entry->cq_id = cq->res.id;
	),

	TP_printk("cq.id=%u",
		__entry->cq_id
	)
);

/* cq_modify - records the comps and usec arguments alongside the CQ id. */
TRACE_EVENT(cq_modify,
	TP_PROTO(
		const struct ib_cq *cq,
		u16 comps,
		u16 usec
	),

	TP_ARGS(cq, comps, usec),

	TP_STRUCT__entry(
		__field(u32, cq_id)
		__field(unsigned int, comps)
		__field(unsigned int, usec)
	),

	TP_fast_assign(
		__entry->cq_id = cq->res.id;
		__entry->comps = comps;
		__entry->usec = usec;
	),

	TP_printk("cq.id=%u comps=%u usec=%u",
		__entry->cq_id, __entry->comps, __entry->usec
	)
);

/*
 * cq_alloc - records the CQ id together with nr_cqe, comp_vector and the
 * poll context, which is printed symbolically.
 */
TRACE_EVENT(cq_alloc,
	TP_PROTO(
		const struct ib_cq *cq,
		int nr_cqe,
		int comp_vector,
		enum ib_poll_context poll_ctx
	),

	TP_ARGS(cq, nr_cqe, comp_vector, poll_ctx),

	TP_STRUCT__entry(
		__field(u32, cq_id)
		__field(int, nr_cqe)
		__field(int, comp_vector)
		__field(unsigned long, poll_ctx)
	),

	TP_fast_assign(
		__entry->cq_id = cq->res.id;
		__entry->nr_cqe = nr_cqe;
		__entry->comp_vector = comp_vector;
		__entry->poll_ctx = poll_ctx;
	),

	TP_printk("cq.id=%u nr_cqe=%d comp_vector=%d poll_ctx=%s",
		__entry->cq_id, __entry->nr_cqe, __entry->comp_vector,
		rdma_show_ib_poll_ctx(__entry->poll_ctx)
	)
);

/*
 * cq_alloc_error - like cq_alloc, but takes no CQ and records the rc
 * argument instead.
 */
TRACE_EVENT(cq_alloc_error,
	TP_PROTO(
		int nr_cqe,
		int comp_vector,
		enum ib_poll_context poll_ctx,
		int rc
	),

	TP_ARGS(nr_cqe, comp_vector, poll_ctx, rc),

	TP_STRUCT__entry(
		__field(int, rc)
		__field(int, nr_cqe)
		__field(int, comp_vector)
		__field(unsigned long, poll_ctx)
	),

	TP_fast_assign(
		__entry->rc = rc;
		__entry->nr_cqe = nr_cqe;
		__entry->comp_vector = comp_vector;
		__entry->poll_ctx = poll_ctx;
	),

	TP_printk("nr_cqe=%d comp_vector=%d poll_ctx=%s rc=%d",
		__entry->nr_cqe, __entry->comp_vector,
		rdma_show_ib_poll_ctx(__entry->poll_ctx), __entry->rc
	)
);

/* cq_free - records only the CQ's resource id. */
TRACE_EVENT(cq_free,
	TP_PROTO(
		const struct ib_cq *cq
	),

	TP_ARGS(cq),

	TP_STRUCT__entry(
		__field(u32, cq_id)
	),

	TP_fast_assign(
		__entry->cq_id = cq->res.id;
	),

	TP_printk("cq.id=%u", __entry->cq_id)
);

/**
 ** Memory Region events
 **/

/*
 * enum ib_mr_type, from include/rdma/ib_verbs.h
 *
 * Expanded twice, in the same way as IB_POLL_CTX_LIST above.
 */
#define IB_MR_TYPE_LIST				\
	ib_mr_type_item(MEM_REG)		\
	ib_mr_type_item(SG_GAPS)		\
	ib_mr_type_item(DM)			\
	ib_mr_type_item(USER)			\
	ib_mr_type_item(DMA)			\
	ib_mr_type_end(INTEGRITY)

#undef ib_mr_type_item
#undef ib_mr_type_end

#define ib_mr_type_item(x)	TRACE_DEFINE_ENUM(IB_MR_TYPE_##x);
#define ib_mr_type_end(x)	TRACE_DEFINE_ENUM(IB_MR_TYPE_##x);

IB_MR_TYPE_LIST

#undef ib_mr_type_item
#undef ib_mr_type_end

#define ib_mr_type_item(x)	{ IB_MR_TYPE_##x, #x },
#define ib_mr_type_end(x)	{ IB_MR_TYPE_##x, #x }

/* Render an MR type value as its symbolic name. */
#define rdma_show_ib_mr_type(x) \
		__print_symbolic(x, IB_MR_TYPE_LIST)

/*
 * mr_alloc - records the PD id, MR type and max_num_sg.
 *
 * @mr may be an error pointer: in that case mr_id is recorded as 0 and rc
 * as PTR_ERR(mr); otherwise rc is 0.
 */
TRACE_EVENT(mr_alloc,
	TP_PROTO(
		const struct ib_pd *pd,
		enum ib_mr_type mr_type,
		u32 max_num_sg,
		const struct ib_mr *mr
	),

	TP_ARGS(pd, mr_type, max_num_sg, mr),

	TP_STRUCT__entry(
		__field(u32, pd_id)
		__field(u32, mr_id)
		__field(u32, max_num_sg)
		__field(int, rc)
		__field(unsigned long, mr_type)
	),

	TP_fast_assign(
		__entry->pd_id = pd->res.id;
		if (IS_ERR(mr)) {
			__entry->mr_id = 0;
			__entry->rc = PTR_ERR(mr);
		} else {
			__entry->mr_id = mr->res.id;
			__entry->rc = 0;
		}
		__entry->max_num_sg = max_num_sg;
		__entry->mr_type = mr_type;
	),

	TP_printk("pd.id=%u mr.id=%u type=%s max_num_sg=%u rc=%d",
		__entry->pd_id, __entry->mr_id,
		rdma_show_ib_mr_type(__entry->mr_type),
		__entry->max_num_sg, __entry->rc)
);

/*
 * mr_integ_alloc - records the PD id and the data/metadata SG limits.
 * An error-pointer @mr is handled as in mr_alloc.
 */
TRACE_EVENT(mr_integ_alloc,
	TP_PROTO(
		const struct ib_pd *pd,
		u32 max_num_data_sg,
		u32 max_num_meta_sg,
		const struct ib_mr *mr
	),

	TP_ARGS(pd, max_num_data_sg, max_num_meta_sg, mr),

	TP_STRUCT__entry(
		__field(u32, pd_id)
		__field(u32, mr_id)
		__field(u32, max_num_data_sg)
		__field(u32, max_num_meta_sg)
		__field(int, rc)
	),

	TP_fast_assign(
		__entry->pd_id = pd->res.id;
		if (IS_ERR(mr)) {
			__entry->mr_id = 0;
			__entry->rc = PTR_ERR(mr);
		} else {
			__entry->mr_id = mr->res.id;
			__entry->rc = 0;
		}
		__entry->max_num_data_sg = max_num_data_sg;
		__entry->max_num_meta_sg = max_num_meta_sg;
	),

	TP_printk("pd.id=%u mr.id=%u max_num_data_sg=%u max_num_meta_sg=%u rc=%d",
		__entry->pd_id, __entry->mr_id, __entry->max_num_data_sg,
		__entry->max_num_meta_sg, __entry->rc)
);

/* mr_dereg - records only the MR's resource id. */
TRACE_EVENT(mr_dereg,
	TP_PROTO(
		const struct ib_mr *mr
	),

	TP_ARGS(mr),

	TP_STRUCT__entry(
		__field(u32, id)
	),

	TP_fast_assign(
		__entry->id = mr->res.id;
	),

	TP_printk("mr.id=%u", __entry->id)
);

#endif /* _TRACE_RDMA_CORE_H */

#include <trace/define_trace.h>
58 23 23 20 25 21 21 79 79 75 63 38 59 102 102 100 33 34 34 34 82 4 4 4 4 44 43 44 28 28 28 28 13 13 13 6 13 13 13 13 25 99 1 155 3 27 26 4 5 5 27 232 230 94 233 232 22 22 1 22 23 23 23 23 14 14 14 14 22 23 22 20 11 6 6 68 69 11 11 59 9 59 16 16 6 6 5 5 4 4 4 3 3 4 3 2 5 3 2 3 2 5 6 5 5 1 4 1 3 3 3 3 3 1 2 4 6 5 5 6 2 2 6 11 6 7 7 7 7 6 6 5 6 6 6 6 6 6 6 11 11 11 7 7 4 5 4 10 11 6 6 7 4 4 11 3 2 2 2 2 23 23 23 23 4 1 10 10 8 3 4 23 23 23 23 6 5 4 22 18 18 2 25 22 25 24 14 25 12 13 1 12 257 193 179 179 171 195 23 10 10 10 6 6 6 6 1 5 3 5 23 22 23 3 23 23 5 22 2 20 21 1 22 22 22 22 1 22 23 6 7 7 4 4 4 4 3 7 5 5 3 5 5 5 3 3 3 3 2 2 3 7 3 4 2 6 6 4 4 3 2 2 2 1 1 1 2 2 2 2 2 2 113 114 113 114 5 2 5 2 2 2 5 2 5 199 114 200 113 111 114 200 199 8 8 8 8 8 7 6 5 7 2 5 6 5 8 8 8 7 2 2 7 84 84 84 85 84 84 162 162 56 63 62 8 159 48 159 161 1 1 160 162 235 238 5 5 5 5 8 156 157 84 85 45 41 1 1 157 157 41 232 231 233 233 233 233 233 1 1 1 1 231 156 157 156 41 229 231 233 231 82 216 17 17 17 17 16 17 17 16 17 17 22 18 18 18 26 22 26 26 26 26 13 13 7 7 13 12 1 6 6 6 5 3 5 5 3 1 2 1 4 5 1 1 1 1 1 1 1 1 1 1 1 1 4 4 33 33 33 33 30 13 13 4 4 3 1 30 14 13 2 12 12 12 22 15 28 21 21 33 33 33 11 12 12 21 27 27 8 19 27 27 27 2 2 2 2 2 2 2 2 2 1 1 1 2 2 2 2 2 2 21 2 2 2 2 2 2 2 2 2 2 171 135 174 42 41 34 1 43 42 9 33 33 31 21 4 33 2 2 2 25 24 154 157 156 26 17 3 16 16 25 1 25 25 25 3 3 14 14 595 489 481 16 17 17 16 15 5 11 9 2 24 589 440 157 3 3 24 25 25 13 14 13 23 3 23 23 22 9 601 14 2 12 582 581 2 571 343 568 61 8 41 62 109 14 549 14 558 28 1 3 52 52 1 3 52 96 38 98 67 11 15 99 150 150 52 99 147 151 116 105 3 82 81 82 82 82 81 82 80 82 82 45 45 82 3 79 75 76 8 7 3 3 3 7 7 6 7 3 1 2 6 1 6 3 5 2 5 5 3 2 3 3 3 4 1 1 1 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 9 9 9 9 9 9 9 9 2 2 2 2 2 2 2 9 9 9 9 9 9 2 23 23 6 6 11 11 1 1 9 9 10 10 10 5 5 4 2 2 2 8 8 5 5 4 2 2 8 14 8 1 13 3 3 2 1 1 1 14 14 12 1 14 11 9 3 1 8 8 8 4 4 4 4 4 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 
22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 
541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 
1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 
1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 
1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 
2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 
2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 
3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 
3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 
3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 
4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 
4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 // SPDX-License-Identifier: GPL-2.0-only /* * xfrm_policy.c * * Changes: * Mitsuru KANDA @USAGI * Kazunori MIYAZAWA @USAGI * Kunihiro Ishiguro <kunihiro@ipinfusion.com> * IPv6 support * Kazunori MIYAZAWA @USAGI * YOSHIFUJI Hideaki * Split up af-specific portion * Derek Atkins <derek@ihtfp.com> Add the post_input processor * */ #include <linux/err.h> #include <linux/slab.h> #include <linux/kmod.h> #include <linux/list.h> #include <linux/spinlock.h> #include <linux/workqueue.h> #include <linux/notifier.h> #include <linux/netdevice.h> #include <linux/netfilter.h> #include <linux/module.h> #include <linux/cache.h> #include <linux/cpu.h> #include <linux/audit.h> #include <linux/rhashtable.h> #include <linux/if_tunnel.h> #include <linux/icmp.h> #include <net/dst.h> #include <net/flow.h> #include <net/inet_ecn.h> #include <net/xfrm.h> #include <net/ip.h> #include <net/gre.h> #if IS_ENABLED(CONFIG_IPV6_MIP6) #include <net/mip6.h> #endif #ifdef CONFIG_XFRM_STATISTICS #include <net/snmp.h> #endif #ifdef CONFIG_XFRM_ESPINTCP #include <net/espintcp.h> #endif #include <net/inet_dscp.h> #include "xfrm_hash.h" #define XFRM_QUEUE_TMO_MIN ((unsigned)(HZ/10)) #define XFRM_QUEUE_TMO_MAX ((unsigned)(60*HZ)) #define XFRM_MAX_QUEUE_LEN 100 struct xfrm_flo { struct dst_entry *dst_orig; u8 flags; }; /* prefixes smaller than this are stored in lists, not trees. 
*/
#define INEXACT_PREFIXLEN_IPV4	16
#define INEXACT_PREFIXLEN_IPV6	48

/* One node of an inexact-policy search tree; it carries an address,
 * a prefix length, a nested tree root and a list of policies.
 */
struct xfrm_pol_inexact_node {
	struct rb_node node;
	union {
		/* addr and rcu share the same storage */
		xfrm_address_t addr;
		struct rcu_head rcu;
	};
	u8 prefixlen;

	/* nested tree root */
	struct rb_root root;

	/* the policies matching this node, can be empty list */
	struct hlist_head hhead;
};

/* xfrm inexact policy search tree:
 * xfrm_pol_inexact_bin = hash(dir,type,family,if_id);
 *  |
 * +---- root_d: sorted by daddr:prefix
 * |                 |
 * |        xfrm_pol_inexact_node
 * |                 |
 * |                 +- root: sorted by saddr/prefix
 * |                 |              |
 * |                 |         xfrm_pol_inexact_node
 * |                 |              |
 * |                 |              + root: unused
 * |                 |              |
 * |                 |              + hhead: saddr:daddr policies
 * |                 |
 * |                 +- coarse policies and all any:daddr policies
 * |
 * +---- root_s: sorted by saddr:prefix
 * |                 |
 * |        xfrm_pol_inexact_node
 * |                 |
 * |                 + root: unused
 * |                 |
 * |                 + hhead: saddr:any policies
 * |
 * +---- coarse policies and all any:any policies
 *
 * Lookups return four candidate lists:
 * 1. any:any list from top-level xfrm_pol_inexact_bin
 * 2. any:daddr list from daddr tree
 * 3. saddr:daddr list from 2nd level daddr tree
 * 4. saddr:any list from saddr tree
 *
 * This result set then needs to be searched for the policy with
 * the lowest priority.  If two candidates have the same priority, the
 * struct xfrm_policy pos member with the lower number is used.
 *
 * This replicates previous single-list-search algorithm which would
 * return first matching policy in the (ordered-by-priority) list.
*/

/* Key of one inexact-policy bin (stored as member 'k' of the bin). */
struct xfrm_pol_inexact_key {
	possible_net_t net;
	u32 if_id;
	u16 family;
	u8 dir, type;
};

/* Container for the inexact policies that share one key: a plain list
 * plus one tree sorted by destination and one sorted by source.
 */
struct xfrm_pol_inexact_bin {
	struct xfrm_pol_inexact_key k;
	struct rhash_head head;
	/* list containing '*:*' policies */
	struct hlist_head hhead;

	seqcount_spinlock_t count;
	/* tree sorted by daddr/prefix */
	struct rb_root root_d;

	/* tree sorted by saddr/prefix */
	struct rb_root root_s;

	/* slow path below */
	struct list_head inexact_bins;
	struct rcu_head rcu;
};

/* Slot indices for xfrm_pol_inexact_candidates.res[];
 * XFRM_POL_CAND_MAX is the number of slots.
 */
enum xfrm_pol_inexact_candidate_type {
	XFRM_POL_CAND_BOTH,
	XFRM_POL_CAND_SADDR,
	XFRM_POL_CAND_DADDR,
	XFRM_POL_CAND_ANY,

	XFRM_POL_CAND_MAX,
};

/* One candidate policy list per xfrm_pol_inexact_candidate_type. */
struct xfrm_pol_inexact_candidates {
	struct hlist_head *res[XFRM_POL_CAND_MAX];
};

/* Aggregate of flow-dissector key structures.
 * NOTE(review): presumably the target container used with
 * xfrm_session_dissector below -- confirm against its users.
 */
struct xfrm_flow_keys {
	struct flow_dissector_key_basic basic;
	struct flow_dissector_key_control control;
	union {
		struct flow_dissector_key_ipv4_addrs ipv4;
		struct flow_dissector_key_ipv6_addrs ipv6;
	} addrs;
	struct flow_dissector_key_ip ip;
	struct flow_dissector_key_icmp icmp;
	struct flow_dissector_key_ports ports;
	struct flow_dissector_key_keyid gre;
};

static struct flow_dissector xfrm_session_dissector __ro_after_init;

/* RCU-managed interface callback pointer and its spinlock */
static DEFINE_SPINLOCK(xfrm_if_cb_lock);
static struct xfrm_if_cb const __rcu *xfrm_if_cb __read_mostly;

/* RCU-managed per-family afinfo table (indexed by address family) and its spinlock */
static DEFINE_SPINLOCK(xfrm_policy_afinfo_lock);
static struct xfrm_policy_afinfo const __rcu *xfrm_policy_afinfo[AF_INET6 + 1]
						__read_mostly;

static struct kmem_cache *xfrm_dst_cache __ro_after_init;

/* hash table of xfrm_pol_inexact_bin objects and its parameters */
static struct rhashtable xfrm_policy_inexact_table;
static const struct rhashtable_params xfrm_pol_inexact_params;

/* forward declarations of static helpers */
static void xfrm_init_pmtu(struct xfrm_dst **bundle, int nr);
static int stale_bundle(struct dst_entry *dst);
static int xfrm_bundle_ok(struct xfrm_dst *xdst);
static void xfrm_policy_queue_process(struct timer_list *t);

static void __xfrm_policy_link(struct xfrm_policy *pol, int dir);
static struct xfrm_policy *__xfrm_policy_unlink(struct xfrm_policy *pol,
						int dir);

static struct xfrm_pol_inexact_bin *
xfrm_policy_inexact_lookup(struct net *net, u8 type, u16
family, u8 dir,
			   u32 if_id);

static struct xfrm_pol_inexact_bin *
xfrm_policy_inexact_lookup_rcu(struct net *net,
			       u8 type, u16 family, u8 dir, u32 if_id);
static struct xfrm_policy *
xfrm_policy_insert_list(struct hlist_head *chain, struct xfrm_policy *policy,
			bool excl);

static bool
xfrm_policy_find_inexact_candidates(struct xfrm_pol_inexact_candidates *cand,
				    struct xfrm_pol_inexact_bin *b,
				    const xfrm_address_t *saddr,
				    const xfrm_address_t *daddr);

/* Try to take a reference on @policy; returns the result of
 * refcount_inc_not_zero() on its refcnt, i.e. false when the count
 * had already reached zero.
 */
static inline bool xfrm_pol_hold_rcu(struct xfrm_policy *policy)
{
	return refcount_inc_not_zero(&policy->refcnt);
}

/* IPv4 selector match.  True only if all of the following hold:
 *  - flow daddr/saddr match the selector addresses under their prefix lengths,
 *  - flow dport/sport equal the selector ports under the selector port masks,
 *  - selector proto is zero or equals the flow protocol,
 *  - selector ifindex is zero or equals the flow oif.
 */
static inline bool
__xfrm4_selector_match(const struct xfrm_selector *sel, const struct flowi *fl)
{
	const struct flowi4 *fl4 = &fl->u.ip4;

	return  addr4_match(fl4->daddr, sel->daddr.a4, sel->prefixlen_d) &&
		addr4_match(fl4->saddr, sel->saddr.a4, sel->prefixlen_s) &&
		!((xfrm_flowi_dport(fl, &fl4->uli) ^ sel->dport) & sel->dport_mask) &&
		!((xfrm_flowi_sport(fl, &fl4->uli) ^ sel->sport) & sel->sport_mask) &&
		(fl4->flowi4_proto == sel->proto || !sel->proto) &&
		(fl4->flowi4_oif == sel->ifindex || !sel->ifindex);
}

/* IPv6 selector match; same conditions as the IPv4 variant, using
 * addr_match() on the IPv6 flow fields.
 */
static inline bool
__xfrm6_selector_match(const struct xfrm_selector *sel, const struct flowi *fl)
{
	const struct flowi6 *fl6 = &fl->u.ip6;

	return  addr_match(&fl6->daddr, &sel->daddr, sel->prefixlen_d) &&
		addr_match(&fl6->saddr, &sel->saddr, sel->prefixlen_s) &&
		!((xfrm_flowi_dport(fl, &fl6->uli) ^ sel->dport) & sel->dport_mask) &&
		!((xfrm_flowi_sport(fl, &fl6->uli) ^ sel->sport) & sel->sport_mask) &&
		(fl6->flowi6_proto == sel->proto || !sel->proto) &&
		(fl6->flowi6_oif == sel->ifindex || !sel->ifindex);
}

/* Match flow @fl against selector @sel for the given address family.
 * Returns false for any family other than AF_INET / AF_INET6.
 */
bool xfrm_selector_match(const struct xfrm_selector *sel, const struct flowi *fl,
			 unsigned short family)
{
	switch (family) {
	case AF_INET:
		return __xfrm4_selector_match(sel, fl);
	case AF_INET6:
		return __xfrm6_selector_match(sel, fl);
	}
	return false;
}

/* Return the afinfo registered for @family, or NULL if @family is out of
 * range or nothing is registered.  On a non-NULL return the RCU read lock
 * taken here is still held and the caller has to rcu_read_unlock().
 */
static const struct xfrm_policy_afinfo *xfrm_policy_get_afinfo(unsigned short family)
{
	const struct xfrm_policy_afinfo *afinfo;

	if (unlikely(family >=
ARRAY_SIZE(xfrm_policy_afinfo)))
		return NULL;
	rcu_read_lock();
	afinfo = rcu_dereference(xfrm_policy_afinfo[family]);
	/* only the failure path drops the RCU read lock here */
	if (unlikely(!afinfo))
		rcu_read_unlock();
	return afinfo;
}

/* Called with rcu_read_lock(). */
static const struct xfrm_if_cb *xfrm_if_get_cb(void)
{
	return rcu_dereference(xfrm_if_cb);
}

/* Run the per-family dst_lookup callback with @params.
 * Returns ERR_PTR(-EAFNOSUPPORT) when no afinfo is available for @family,
 * otherwise whatever the callback returned.
 */
struct dst_entry *__xfrm_dst_lookup(int family,
				    const struct xfrm_dst_lookup_params *params)
{
	const struct xfrm_policy_afinfo *afinfo;
	struct dst_entry *dst;

	afinfo = xfrm_policy_get_afinfo(family);
	if (unlikely(afinfo == NULL))
		return ERR_PTR(-EAFNOSUPPORT);

	dst = afinfo->dst_lookup(params);

	/* pairs with the rcu_read_lock() left held by xfrm_policy_get_afinfo() */
	rcu_read_unlock();

	return dst;
}
EXPORT_SYMBOL(__xfrm_dst_lookup);

/* Build lookup parameters from state @x and perform the dst lookup.
 *
 * The source/destination default to the state's props.saddr / id.daddr and
 * are replaced by the state's coaddr and the caller's previous address when
 * the state type carries the LOCAL/REMOTE_COADDR flags.  On success the
 * addresses actually used are copied back into @prev_saddr / @prev_daddr.
 */
static inline struct dst_entry *xfrm_dst_lookup(struct xfrm_state *x,
						dscp_t dscp, int oif,
						xfrm_address_t *prev_saddr,
						xfrm_address_t *prev_daddr,
						int family, u32 mark)
{
	/* NOTE(review): params is not zero-initialised and params.uli is only
	 * written on the encap paths below -- confirm that the dst_lookup
	 * implementations do not read it otherwise.
	 */
	struct xfrm_dst_lookup_params params;
	struct net *net = xs_net(x);
	xfrm_address_t *saddr = &x->props.saddr;
	xfrm_address_t *daddr = &x->id.daddr;
	struct dst_entry *dst;

	if (x->type->flags & XFRM_TYPE_LOCAL_COADDR) {
		saddr = x->coaddr;
		daddr = prev_daddr;
	}
	if (x->type->flags & XFRM_TYPE_REMOTE_COADDR) {
		saddr = prev_saddr;
		daddr = x->coaddr;
	}

	params.net = net;
	params.saddr = saddr;
	params.daddr = daddr;
	params.dscp = dscp;
	params.oif = oif;
	params.mark = mark;
	params.ipproto = x->id.proto;
	/* with encapsulation, look up using the outer protocol and ports */
	if (x->encap) {
		switch (x->encap->encap_type) {
		case UDP_ENCAP_ESPINUDP:
			params.ipproto = IPPROTO_UDP;
			params.uli.ports.sport = x->encap->encap_sport;
			params.uli.ports.dport = x->encap->encap_dport;
			break;
		case TCP_ENCAP_ESPINTCP:
			params.ipproto = IPPROTO_TCP;
			params.uli.ports.sport = x->encap->encap_sport;
			params.uli.ports.dport = x->encap->encap_dport;
			break;
		}
	}

	dst = __xfrm_dst_lookup(family, &params);

	if (!IS_ERR(dst)) {
		/* skip the copy when source and destination are the same object */
		if (prev_saddr != saddr)
			memcpy(prev_saddr, saddr, sizeof(*prev_saddr));
		if (prev_daddr != daddr)
			memcpy(prev_daddr, daddr, sizeof(*prev_daddr));
	}

	return dst;
}

/* Convert @secs to jiffies, capped at MAX_SCHEDULE_TIMEOUT-1. */
static inline unsigned long make_jiffies(long secs)
{
	if (secs >=
(MAX_SCHEDULE_TIMEOUT-1)/HZ)
		return MAX_SCHEDULE_TIMEOUT-1;
	else
		return secs*HZ;
}

/* Lifetime timer callback of a policy.
 *
 * Evaluates the four configured lifetimes (hard/soft x add/use).  A hard
 * limit that has passed deletes the policy and reports a hard expiry; a
 * soft limit that has passed reports a soft expiry and re-checks after
 * XFRM_KM_TIMEOUT.  Otherwise the timer is re-armed for the nearest limit.
 * The "use" limits count from curlft.use_time, or from curlft.add_time
 * while use_time is still zero.
 */
static void xfrm_policy_timer(struct timer_list *t)
{
	struct xfrm_policy *xp = timer_container_of(xp, t, timer);
	time64_t now = ktime_get_real_seconds();
	time64_t next = TIME64_MAX;
	int warn = 0;
	int dir;

	read_lock(&xp->lock);

	/* policy already marked dead: nothing to evaluate */
	if (unlikely(xp->walk.dead))
		goto out;

	dir = xfrm_policy_id2dir(xp->index);

	if (xp->lft.hard_add_expires_seconds) {
		time64_t tmo = xp->lft.hard_add_expires_seconds +
			xp->curlft.add_time - now;
		if (tmo <= 0)
			goto expired;
		if (tmo < next)
			next = tmo;
	}
	if (xp->lft.hard_use_expires_seconds) {
		time64_t tmo = xp->lft.hard_use_expires_seconds +
			(READ_ONCE(xp->curlft.use_time) ? : xp->curlft.add_time) - now;
		if (tmo <= 0)
			goto expired;
		if (tmo < next)
			next = tmo;
	}
	if (xp->lft.soft_add_expires_seconds) {
		time64_t tmo = xp->lft.soft_add_expires_seconds +
			xp->curlft.add_time - now;
		if (tmo <= 0) {
			warn = 1;
			tmo = XFRM_KM_TIMEOUT;
		}
		if (tmo < next)
			next = tmo;
	}
	if (xp->lft.soft_use_expires_seconds) {
		time64_t tmo = xp->lft.soft_use_expires_seconds +
			(READ_ONCE(xp->curlft.use_time) ? : xp->curlft.add_time) - now;
		if (tmo <= 0) {
			warn = 1;
			tmo = XFRM_KM_TIMEOUT;
		}
		if (tmo < next)
			next = tmo;
	}

	/* soft expiry notification (third argument 0) */
	if (warn)
		km_policy_expired(xp, dir, 0, 0);
	/* take a reference when mod_timer() reports the timer was not pending */
	if (next != TIME64_MAX &&
	    !mod_timer(&xp->timer, jiffies + make_jiffies(next)))
		xfrm_pol_hold(xp);

out:
	read_unlock(&xp->lock);
	/* presumably drops the reference held for the timer that just fired */
	xfrm_pol_put(xp);
	return;

expired:
	read_unlock(&xp->lock);
	/* hard expiry notification (third argument 1), only if the delete succeeded */
	if (!xfrm_policy_delete(xp, dir))
		km_policy_expired(xp, dir, 1, 0);
	xfrm_pol_put(xp);
}

/* Allocate xfrm_policy. Not used here, it is supposed to be used by pfkeyv2
 * SPD calls.
*/

struct xfrm_policy *xfrm_policy_alloc(struct net *net, gfp_t gfp)
{
	struct xfrm_policy *policy;

	policy = kzalloc_obj(struct xfrm_policy, gfp);

	/* on allocation failure NULL is returned unchanged */
	if (policy) {
		write_pnet(&policy->xp_net, net);
		INIT_LIST_HEAD(&policy->walk.all);
		INIT_HLIST_HEAD(&policy->state_cache_list);
		INIT_HLIST_NODE(&policy->bydst);
		INIT_HLIST_NODE(&policy->byidx);
		rwlock_init(&policy->lock);
		/* the caller receives the initial reference */
		refcount_set(&policy->refcnt, 1);
		skb_queue_head_init(&policy->polq.hold_queue);
		/* timers are only set up here, not armed */
		timer_setup(&policy->timer, xfrm_policy_timer, 0);
		timer_setup(&policy->polq.hold_timer,
			    xfrm_policy_queue_process, 0);
	}
	return policy;
}
EXPORT_SYMBOL(xfrm_policy_alloc);

/* RCU callback: release the security context and free the policy. */
static void xfrm_policy_destroy_rcu(struct rcu_head *head)
{
	struct xfrm_policy *policy = container_of(head, struct xfrm_policy, rcu);

	security_xfrm_policy_free(policy->security);
	kfree(policy);
}

/* Destroy xfrm_policy: descendant resources must be released to this moment.
 * BUGs if the policy has not been marked dead or if either of its timers
 * was still pending; the memory itself is freed from an RCU callback.
 */

void xfrm_policy_destroy(struct xfrm_policy *policy)
{
	BUG_ON(!policy->walk.dead);

	if (timer_delete(&policy->timer) || timer_delete(&policy->polq.hold_timer))
		BUG();

	xfrm_dev_policy_free(policy);
	call_rcu(&policy->rcu, xfrm_policy_destroy_rcu);
}
EXPORT_SYMBOL(xfrm_policy_destroy);

/* Rule must be locked. Release descendant resources, announce
 * entry dead. The rule must be unlinked from lists to the moment.
*/

static void xfrm_policy_kill(struct xfrm_policy *policy)
{
	struct net *net = xp_net(policy);
	struct xfrm_state *x;

	xfrm_dev_policy_delete(policy);

	/* mark the policy dead under its write lock */
	write_lock_bh(&policy->lock);
	policy->walk.dead = 1;
	write_unlock_bh(&policy->lock);

	atomic_inc(&policy->genid);

	/* a pending timer that gets cancelled gives up one reference
	 * (presumably the one taken when it was armed)
	 */
	if (timer_delete(&policy->polq.hold_timer))
		xfrm_pol_put(policy);
	skb_queue_purge(&policy->polq.hold_queue);

	if (timer_delete(&policy->timer))
		xfrm_pol_put(policy);

	/* XXX: Flush state cache */
	/* unlink every state from this policy's state_cache_list */
	spin_lock_bh(&net->xfrm.xfrm_state_lock);
	hlist_for_each_entry_rcu(x, &policy->state_cache_list, state_cache) {
		hlist_del_init_rcu(&x->state_cache);
	}
	spin_unlock_bh(&net->xfrm.xfrm_state_lock);

	xfrm_pol_put(policy);
}

/* upper bound on hash table size used by the *_should_resize() checks */
static unsigned int xfrm_policy_hashmax __read_mostly = 1 * 1024 * 1024;

/* Hash a policy index with the namespace's current by-index hash mask. */
static inline unsigned int idx_hash(struct net *net, u32 index)
{
	return __idx_hash(index, net->xfrm.policy_idx_hmask);
}

/* calculate policy hash thresholds */
/* Writes the per-direction dbits/sbits for AF_INET or AF_INET6 into
 * *dbits / *sbits; both are set to 0 for any other family.
 */
static void __get_hash_thresh(struct net *net,
			      unsigned short family, int dir,
			      u8 *dbits, u8 *sbits)
{
	switch (family) {
	case AF_INET:
		*dbits = net->xfrm.policy_bydst[dir].dbits4;
		*sbits = net->xfrm.policy_bydst[dir].sbits4;
		break;

	case AF_INET6:
		*dbits = net->xfrm.policy_bydst[dir].dbits6;
		*sbits = net->xfrm.policy_bydst[dir].sbits6;
		break;

	default:
		*dbits = 0;
		*sbits = 0;
	}
}

/* Return the by-destination hash bucket for selector @sel, or NULL when
 * __sel_hash() returns hmask + 1.
 * NOTE(review): hmask + 1 presumably marks a selector that cannot be
 * hashed exactly -- confirm in xfrm_hash.h.
 */
static struct hlist_head *policy_hash_bysel(struct net *net,
					    const struct xfrm_selector *sel,
					    unsigned short family, int dir)
{
	unsigned int hmask = net->xfrm.policy_bydst[dir].hmask;
	unsigned int hash;
	u8 dbits;
	u8 sbits;

	__get_hash_thresh(net, family, dir, &dbits, &sbits);
	hash = __sel_hash(sel, family, hmask, dbits, sbits);

	if (hash == hmask + 1)
		return NULL;

	return rcu_dereference_check(net->xfrm.policy_bydst[dir].table,
		     lockdep_is_held(&net->xfrm.xfrm_policy_lock)) + hash;
}

/* Return the by-destination hash bucket for an explicit daddr/saddr pair. */
static struct hlist_head *policy_hash_direct(struct net *net,
					     const xfrm_address_t *daddr,
					     const xfrm_address_t *saddr,
					     unsigned short family, int dir)
{
	unsigned int hmask = net->xfrm.policy_bydst[dir].hmask;
	unsigned int
hash;
	u8 dbits;
	u8 sbits;

	__get_hash_thresh(net, family, dir, &dbits, &sbits);
	hash = __addr_hash(daddr, saddr, family, hmask, dbits, sbits);

	return rcu_dereference_check(net->xfrm.policy_bydst[dir].table,
		     lockdep_is_held(&net->xfrm.xfrm_policy_lock)) + hash;
}

/* Move every policy of old bucket @list into the new table @ndsttable.
 *
 * Each pass over @list fixes a target hash h0 from the first entry it moves:
 * that entry goes to the head of its new bucket, later entries with the same
 * hash are linked right behind the previously moved one, and entries hashing
 * elsewhere are skipped and handled by a following pass.  Entries of type
 * XFRM_DEV_OFFLOAD_PACKET are always placed at the head of their bucket.
 */
static void xfrm_dst_hash_transfer(struct net *net,
				   struct hlist_head *list,
				   struct hlist_head *ndsttable,
				   unsigned int nhashmask,
				   int dir)
{
	struct hlist_node *tmp, *entry0 = NULL;
	struct xfrm_policy *pol;
	unsigned int h0 = 0;
	u8 dbits;
	u8 sbits;

redo:
	hlist_for_each_entry_safe(pol, tmp, list, bydst) {
		unsigned int h;

		__get_hash_thresh(net, pol->family, dir, &dbits, &sbits);
		h = __addr_hash(&pol->selector.daddr, &pol->selector.saddr,
				pol->family, nhashmask, dbits, sbits);
		if (!entry0 || pol->xdo.type == XFRM_DEV_OFFLOAD_PACKET) {
			hlist_del_rcu(&pol->bydst);
			hlist_add_head_rcu(&pol->bydst, ndsttable + h);
			h0 = h;
		} else {
			/* different target bucket: leave for a later pass */
			if (h != h0)
				continue;
			hlist_del_rcu(&pol->bydst);
			hlist_add_behind_rcu(&pol->bydst, entry0);
		}
		entry0 = &pol->bydst;
	}
	/* entries skipped above are still on the old list: start another pass */
	if (!hlist_empty(list)) {
		entry0 = NULL;
		goto redo;
	}
}

/* Re-hash every policy of old by-index bucket @list into @nidxtable.
 * Entries are added to the new table without being unlinked from the old
 * bucket first; xfrm_byidx_resize() frees the old table after the transfer.
 */
static void xfrm_idx_hash_transfer(struct hlist_head *list,
				   struct hlist_head *nidxtable,
				   unsigned int nhashmask)
{
	struct hlist_node *tmp;
	struct xfrm_policy *pol;

	hlist_for_each_entry_safe(pol, tmp, list, byidx) {
		unsigned int h;

		h = __idx_hash(pol->index, nhashmask);
		hlist_add_head(&pol->byidx, nidxtable+h);
	}
}

/* New hash mask for a table with twice as many buckets as @old_hmask + 1. */
static unsigned long xfrm_new_hash_mask(unsigned int old_hmask)
{
	return ((old_hmask + 1) << 1) - 1;
}

/* Double the by-destination hash table of direction @dir.
 * Silently does nothing when the new table cannot be allocated.
 */
static void xfrm_bydst_resize(struct net *net, int dir)
{
	unsigned int hmask = net->xfrm.policy_bydst[dir].hmask;
	unsigned int nhashmask = xfrm_new_hash_mask(hmask);
	unsigned int nsize = (nhashmask + 1) * sizeof(struct hlist_head);
	struct hlist_head *ndst = xfrm_hash_alloc(nsize);
	struct hlist_head *odst;
	int i;

	if (!ndst)
		return;

	/* transfer and table switch happen under the policy lock and inside
	 * a write section of xfrm_policy_hash_generation
	 */
	spin_lock_bh(&net->xfrm.xfrm_policy_lock);
	write_seqcount_begin(&net->xfrm.xfrm_policy_hash_generation);

	odst = rcu_dereference_protected(net->xfrm.policy_bydst[dir].table,
lockdep_is_held(&net->xfrm.xfrm_policy_lock));

	for (i = hmask; i >= 0; i--)
		xfrm_dst_hash_transfer(net, odst + i, ndst, nhashmask, dir);

	rcu_assign_pointer(net->xfrm.policy_bydst[dir].table, ndst);
	net->xfrm.policy_bydst[dir].hmask = nhashmask;

	write_seqcount_end(&net->xfrm.xfrm_policy_hash_generation);
	spin_unlock_bh(&net->xfrm.xfrm_policy_lock);

	/* wait for an RCU grace period before freeing the old table */
	synchronize_rcu();

	xfrm_hash_free(odst, (hmask + 1) * sizeof(struct hlist_head));
}

/* Double the by-index hash table.
 * Silently does nothing when the new table cannot be allocated.
 */
static void xfrm_byidx_resize(struct net *net)
{
	unsigned int hmask = net->xfrm.policy_idx_hmask;
	unsigned int nhashmask = xfrm_new_hash_mask(hmask);
	unsigned int nsize = (nhashmask + 1) * sizeof(struct hlist_head);
	struct hlist_head *oidx = net->xfrm.policy_byidx;
	struct hlist_head *nidx = xfrm_hash_alloc(nsize);
	int i;

	if (!nidx)
		return;

	spin_lock_bh(&net->xfrm.xfrm_policy_lock);

	for (i = hmask; i >= 0; i--)
		xfrm_idx_hash_transfer(oidx + i, nidx, nhashmask);

	net->xfrm.policy_byidx = nidx;
	net->xfrm.policy_idx_hmask = nhashmask;

	spin_unlock_bh(&net->xfrm.xfrm_policy_lock);

	/* unlike the by-destination table, the old table is freed right away */
	xfrm_hash_free(oidx, (hmask + 1) * sizeof(struct hlist_head));
}

/* Returns 1 when the by-destination table of @dir is below
 * xfrm_policy_hashmax and holds more policies than its hash mask.
 * If @total is non-NULL, the policy count of @dir is added to *@total.
 */
static inline int xfrm_bydst_should_resize(struct net *net, int dir, int *total)
{
	unsigned int cnt = net->xfrm.policy_count[dir];
	unsigned int hmask = net->xfrm.policy_bydst[dir].hmask;

	if (total)
		*total += cnt;

	if ((hmask + 1) < xfrm_policy_hashmax &&
	    cnt > hmask)
		return 1;

	return 0;
}

/* Returns 1 when the by-index table is below xfrm_policy_hashmax and
 * @total exceeds its hash mask.
 */
static inline int xfrm_byidx_should_resize(struct net *net, int total)
{
	unsigned int hmask = net->xfrm.policy_idx_hmask;

	if ((hmask + 1) < xfrm_policy_hashmax &&
	    total > hmask)
		return 1;

	return 0;
}

/* Fill @si with the per-direction policy counters and hash table figures.
 * The counters at index dir + XFRM_POLICY_MAX feed the in/out/fwd "s"
 * fields of @si.
 */
void xfrm_spd_getinfo(struct net *net, struct xfrmk_spdinfo *si)
{
	si->incnt = net->xfrm.policy_count[XFRM_POLICY_IN];
	si->outcnt = net->xfrm.policy_count[XFRM_POLICY_OUT];
	si->fwdcnt = net->xfrm.policy_count[XFRM_POLICY_FWD];
	si->inscnt = net->xfrm.policy_count[XFRM_POLICY_IN+XFRM_POLICY_MAX];
	si->outscnt = net->xfrm.policy_count[XFRM_POLICY_OUT+XFRM_POLICY_MAX];
	si->fwdscnt =
net->xfrm.policy_count[XFRM_POLICY_FWD+XFRM_POLICY_MAX];
	si->spdhcnt = net->xfrm.policy_idx_hmask;
	si->spdhmcnt = xfrm_policy_hashmax;
}
EXPORT_SYMBOL(xfrm_spd_getinfo);

/* Held by xfrm_hash_resize() and xfrm_hash_rebuild() for their whole run. */
static DEFINE_MUTEX(hash_resize_mutex);

/* Work handler for net->xfrm.policy_hash_work.
 *
 * Resizes the bydst table of every direction below XFRM_POLICY_MAX that
 * xfrm_bydst_should_resize() reports, summing the per-direction policy
 * counts into total, then resizes the byidx table if
 * xfrm_byidx_should_resize() says so for that total.
 */
static void xfrm_hash_resize(struct work_struct *work)
{
	struct net *net = container_of(work, struct net, xfrm.policy_hash_work);
	int dir, total;

	mutex_lock(&hash_resize_mutex);

	total = 0;
	for (dir = 0; dir < XFRM_POLICY_MAX; dir++) {
		if (xfrm_bydst_should_resize(net, dir, &total))
			xfrm_bydst_resize(net, dir);
	}
	if (xfrm_byidx_should_resize(net, total))
		xfrm_byidx_resize(net);

	mutex_unlock(&hash_resize_mutex);
}

/* Make sure *pol can be inserted into fastbin.
 * Useful to check that later insert requests will be successful
 * (provided xfrm_policy_lock is held throughout).
 *
 * Looks up the bin keyed by (net, family, type, dir, if_id) of @pol and
 * returns it if present. Otherwise a new bin is allocated with GFP_ATOMIC,
 * initialised, inserted into xfrm_policy_inexact_table and linked onto
 * net->xfrm.inexact_bins.
 *
 * Returns the bin, or NULL if allocation or insertion fails.
 */
static struct xfrm_pol_inexact_bin *
xfrm_policy_inexact_alloc_bin(const struct xfrm_policy *pol, u8 dir)
{
	struct xfrm_pol_inexact_bin *bin, *prev;
	struct xfrm_pol_inexact_key k = {
		.family = pol->family,
		.type = pol->type,
		.dir = dir,
		.if_id = pol->if_id,
	};
	struct net *net = xp_net(pol);

	lockdep_assert_held(&net->xfrm.xfrm_policy_lock);

	write_pnet(&k.net, net);
	bin = rhashtable_lookup_fast(&xfrm_policy_inexact_table, &k,
				     xfrm_pol_inexact_params);
	if (bin)
		return bin;

	bin = kzalloc_obj(*bin, GFP_ATOMIC);
	if (!bin)
		return NULL;

	bin->k = k;
	INIT_HLIST_HEAD(&bin->hhead);
	bin->root_d = RB_ROOT;
	bin->root_s = RB_ROOT;
	seqcount_spinlock_init(&bin->count, &net->xfrm.xfrm_policy_lock);

	prev = rhashtable_lookup_get_insert_key(&xfrm_policy_inexact_table,
						&bin->k, &bin->head,
						xfrm_pol_inexact_params);
	if (!prev) {
		/* our new bin went in */
		list_add(&bin->inexact_bins, &net->xfrm.inexact_bins);
		return bin;
	}

	/* insertion did not take our bin: drop it, hand back prev unless
	 * prev is an error pointer
	 */
	kfree(bin);

	return IS_ERR(prev) ?
NULL : prev;
}

/* Return true if @addr/@prefixlen is treated as "any" for inexact storage:
 * either xfrm_addr_any() says so, or @prefixlen is below the per-family
 * INEXACT_PREFIXLEN_IPV4 / INEXACT_PREFIXLEN_IPV6 limit.
 */
static bool xfrm_pol_inexact_addr_use_any_list(const xfrm_address_t *addr,
					       int family, u8 prefixlen)
{
	if (xfrm_addr_any(addr, family))
		return true;

	if (family == AF_INET6 && prefixlen < INEXACT_PREFIXLEN_IPV6)
		return true;

	if (family == AF_INET && prefixlen < INEXACT_PREFIXLEN_IPV4)
		return true;

	return false;
}

/* Return true only if both the source and the destination selector address
 * of @policy qualify as "any" per xfrm_pol_inexact_addr_use_any_list().
 */
static bool
xfrm_policy_inexact_insert_use_any_list(const struct xfrm_policy *policy)
{
	const xfrm_address_t *addr;
	bool saddr_any, daddr_any;
	u8 prefixlen;

	addr = &policy->selector.saddr;
	prefixlen = policy->selector.prefixlen_s;

	saddr_any = xfrm_pol_inexact_addr_use_any_list(addr,
						       policy->family,
						       prefixlen);
	addr = &policy->selector.daddr;
	prefixlen = policy->selector.prefixlen_d;
	daddr_any = xfrm_pol_inexact_addr_use_any_list(addr,
						       policy->family,
						       prefixlen);
	return saddr_any && daddr_any;
}

/* Store @addr (by value) and @prefixlen in @node; no other field is touched. */
static void xfrm_pol_inexact_node_init(struct xfrm_pol_inexact_node *node,
				       const xfrm_address_t *addr, u8 prefixlen)
{
	node->addr = *addr;
	node->prefixlen = prefixlen;
}

/* Allocate a zeroed node with GFP_ATOMIC and set its address and prefix
 * length. Returns NULL if the allocation fails.
 */
static struct xfrm_pol_inexact_node *
xfrm_pol_inexact_node_alloc(const xfrm_address_t *addr, u8 prefixlen)
{
	struct xfrm_pol_inexact_node *node;

	node = kzalloc_obj(*node, GFP_ATOMIC);
	if (node)
		xfrm_pol_inexact_node_init(node, addr, prefixlen);

	return node;
}

/* Compare the leading @prefixlen bits of @a and @b.
 *
 * Returns 0 if they are equal within the prefix, a negative value if @a
 * sorts before @b and a positive value otherwise.
 *
 * AF_INET: a zero @prefixlen always compares equal; otherwise the masked
 * addresses are compared in host byte order.
 * AF_INET6: the full 32-bit words covered by the prefix are compared with
 * memcmp() (its result is returned as-is when non-zero), then the remaining
 * bits of the next word are compared masked, in host byte order.
 * Any other @family yields 0.
 */
static int xfrm_policy_addr_delta(const xfrm_address_t *a,
				  const xfrm_address_t *b,
				  u8 prefixlen, u16 family)
{
	u32 ma, mb, mask;
	unsigned int pdw, pbi;
	int delta = 0;

	switch (family) {
	case AF_INET:
		if (prefixlen == 0)
			return 0;
		mask = ~0U << (32 - prefixlen);
		ma = ntohl(a->a4) & mask;
		mb = ntohl(b->a4) & mask;
		if (ma < mb)
			delta = -1;
		else if (ma > mb)
			delta = 1;
		break;
	case AF_INET6:
		/* pdw: number of whole 32-bit words, pbi: leftover bits */
		pdw = prefixlen >> 5;
		pbi = prefixlen & 0x1f;

		if (pdw) {
			delta = memcmp(a->a6, b->a6, pdw << 2);
			if (delta)
				return delta;
		}
		if (pbi) {
			mask = ~0U << (32 - pbi);
			ma = ntohl(a->a6[pdw]) & mask;
			mb = ntohl(b->a6[pdw]) & mask;
			if (ma < mb)
				delta = -1;
			else if (ma > mb)
				delta = 1;
		}
		break;
	default:
		break;
	}

	return delta;
}

/* Re-add every live policy of @net that has bydst_reinsert set to the
 * policy list of node @n, ordered by priority and then by pos, clearing the
 * flag as it goes. Packet offload policies are added at the list head.
 */
static void
xfrm_policy_inexact_list_reinsert(struct net *net,
				  struct xfrm_pol_inexact_node *n,
				  u16 family)
{
	unsigned int matched_s, matched_d;
	struct xfrm_policy *policy, *p;

	matched_s = 0;
	matched_d = 0;

	/* policy_all is walked from its tail towards its head */
	list_for_each_entry_reverse(policy, &net->xfrm.policy_all, walk.all) {
		struct hlist_node *newpos = NULL;
		bool matches_s, matches_d;

		/* only policies flagged for reinsertion are handled */
		if (policy->walk.dead || !policy->bydst_reinsert)
			continue;

		WARN_ON_ONCE(policy->family != family);

		policy->bydst_reinsert = false;
		/* find the last entry that sorts before @policy: lower
		 * priority value first, equal priority ordered by pos
		 */
		hlist_for_each_entry(p, &n->hhead, bydst) {
			if (policy->priority > p->priority)
				newpos = &p->bydst;
			else if (policy->priority == p->priority &&
				 policy->pos > p->pos)
				newpos = &p->bydst;
			else
				break;
		}

		if (newpos && policy->xdo.type != XFRM_DEV_OFFLOAD_PACKET)
			hlist_add_behind_rcu(&policy->bydst, newpos);
		else
			hlist_add_head_rcu(&policy->bydst, &n->hhead);

		/* paranoia checks follow.
		 * Check that the reinserted policy matches at least
		 * saddr or daddr for current node prefix.
		 *
		 * Matching both is fine, matching saddr in one policy
		 * (but not daddr) and then matching only daddr in another
		 * is a bug.
*/
		matches_s = xfrm_policy_addr_delta(&policy->selector.saddr,
						   &n->addr,
						   n->prefixlen,
						   family) == 0;
		matches_d = xfrm_policy_addr_delta(&policy->selector.daddr,
						   &n->addr,
						   n->prefixlen,
						   family) == 0;
		if (matches_s && matches_d)
			continue;

		WARN_ON_ONCE(!matches_s && !matches_d);
		/* count saddr-only and daddr-only matches across all policies;
		 * seeing both kinds on one node triggers the warning below
		 */
		if (matches_s)
			matched_s++;
		if (matches_d)
			matched_d++;
		WARN_ON_ONCE(matched_s && matched_d);
	}
}

/* Insert node @n into the rb-tree @new.
 *
 * Tree nodes are compared with @n using the shorter of the two prefix
 * lengths. If no node compares equal, @n is linked into the tree.
 * If a node compares equal, the policies of @n are moved onto that node,
 * the node takes the shorter prefix length and @n is freed with
 * kfree_rcu(). When the prefix lengths differed, the surviving node is
 * erased from the tree and the insertion restarts with it, since its
 * prefix length has changed.
 */
static void xfrm_policy_inexact_node_reinsert(struct net *net,
					      struct xfrm_pol_inexact_node *n,
					      struct rb_root *new,
					      u16 family)
{
	struct xfrm_pol_inexact_node *node;
	struct rb_node **p, *parent;

	/* we should not have another subtree here */
	WARN_ON_ONCE(!RB_EMPTY_ROOT(&n->root));
restart:
	parent = NULL;
	p = &new->rb_node;
	while (*p) {
		u8 prefixlen;
		int delta;

		parent = *p;
		node = rb_entry(*p, struct xfrm_pol_inexact_node, node);

		prefixlen = min(node->prefixlen, n->prefixlen);

		delta = xfrm_policy_addr_delta(&n->addr, &node->addr,
					       prefixlen, family);
		if (delta < 0) {
			p = &parent->rb_left;
		} else if (delta > 0) {
			p = &parent->rb_right;
		} else {
			bool same_prefixlen = node->prefixlen == n->prefixlen;
			struct xfrm_policy *tmp;

			/* flag and unlink all policies of @n; the call to
			 * xfrm_policy_inexact_list_reinsert() below re-adds
			 * flagged policies to 'node'
			 */
			hlist_for_each_entry(tmp, &n->hhead, bydst) {
				tmp->bydst_reinsert = true;
				hlist_del_rcu(&tmp->bydst);
			}

			node->prefixlen = prefixlen;

			xfrm_policy_inexact_list_reinsert(net, node, family);

			if (same_prefixlen) {
				kfree_rcu(n, rcu);
				return;
			}

			/* prefix length of 'node' changed: take it out of the
			 * tree and insert it again from the root
			 */
			rb_erase(*p, new);
			kfree_rcu(n, rcu);
			n = node;
			goto restart;
		}
	}

	rb_link_node_rcu(&n->node, parent, p);
	rb_insert_color(&n->node, new);
}

/* merge nodes v and n
 *
 * Every node of v's subtree is moved into n->root and every policy on v's
 * list is flagged, unlinked and re-added to n. v itself is not freed here.
 */
static void xfrm_policy_inexact_node_merge(struct net *net,
					   struct xfrm_pol_inexact_node *v,
					   struct xfrm_pol_inexact_node *n,
					   u16 family)
{
	struct xfrm_pol_inexact_node *node;
	struct xfrm_policy *tmp;
	struct rb_node *rnode;

	/* To-be-merged node v has a subtree.
	 *
	 * Dismantle it and insert its nodes to n->root.
*/
	while ((rnode = rb_first(&v->root)) != NULL) {
		node = rb_entry(rnode, struct xfrm_pol_inexact_node, node);
		rb_erase(&node->node, &v->root);
		xfrm_policy_inexact_node_reinsert(net, node, &n->root,
						  family);
	}

	/* flag and unlink v's own policies, then re-add them onto n */
	hlist_for_each_entry(tmp, &v->hhead, bydst) {
		tmp->bydst_reinsert = true;
		hlist_del_rcu(&tmp->bydst);
	}

	xfrm_policy_inexact_list_reinsert(net, n, family);
}

/* Find or create the node for @addr/@prefixlen in the rb-tree @root.
 *
 * An existing node is returned when @addr matches it under the node's own
 * prefix length and @prefixlen is not shorter than that.
 * When @prefixlen is shorter than the prefix of a node it covers, that node
 * is erased from the tree: the first such node is re-initialised with
 * @addr/@prefixlen and kept as 'cached', any further one is merged into
 * 'cached' and freed with kfree_rcu(). The search restarts from the root
 * after each removal.
 * Finally 'cached', or a freshly allocated node if there is none, is linked
 * into the tree.
 *
 * @dir is not referenced in this function.
 * Returns the node, or NULL if a new node could not be allocated.
 */
static struct xfrm_pol_inexact_node *
xfrm_policy_inexact_insert_node(struct net *net,
				struct rb_root *root,
				xfrm_address_t *addr,
				u16 family, u8 prefixlen, u8 dir)
{
	struct xfrm_pol_inexact_node *cached = NULL;
	struct rb_node **p, *parent = NULL;
	struct xfrm_pol_inexact_node *node;

	p = &root->rb_node;
	while (*p) {
		int delta;

		parent = *p;
		node = rb_entry(*p, struct xfrm_pol_inexact_node, node);

		delta = xfrm_policy_addr_delta(addr, &node->addr,
					       node->prefixlen,
					       family);
		if (delta == 0 && prefixlen >= node->prefixlen) {
			WARN_ON_ONCE(cached); /* ipsec policies got lost */
			return node;
		}

		if (delta < 0)
			p = &parent->rb_left;
		else
			p = &parent->rb_right;

		if (prefixlen < node->prefixlen) {
			/* compare again, this time under the new shorter prefix */
			delta = xfrm_policy_addr_delta(addr, &node->addr,
						       prefixlen,
						       family);
			if (delta)
				continue;

			/* This node is a subnet of the new prefix. It needs
			 * to be removed and re-inserted with the smaller
			 * prefix and all nodes that are now also covered
			 * by the reduced prefixlen.
			 */
			rb_erase(&node->node, root);

			if (!cached) {
				xfrm_pol_inexact_node_init(node, addr,
							   prefixlen);
				cached = node;
			} else {
				/* This node also falls within the new
				 * prefixlen. Merge the to-be-reinserted
				 * node and this one.
*/
				xfrm_policy_inexact_node_merge(net, node,
							       cached, family);
				kfree_rcu(node, rcu);
			}

			/* restart */
			p = &root->rb_node;
			parent = NULL;
		}
	}

	/* reuse the node taken out of the tree above, else allocate one */
	node = cached;
	if (!node) {
		node = xfrm_pol_inexact_node_alloc(addr, prefixlen);
		if (!node)
			return NULL;
	}

	rb_link_node_rcu(&node->node, parent, p);
	rb_insert_color(&node->node, root);

	return node;
}

/* Recursively remove empty nodes from the rb-tree @r.
 *
 * A node is erased and freed with kfree_rcu() when, after its own subtree
 * has been processed, both its policy list and its subtree are empty.
 * If @rm is true a warning is raised for every node that is kept.
 */
static void xfrm_policy_inexact_gc_tree(struct rb_root *r, bool rm)
{
	struct xfrm_pol_inexact_node *node;
	struct rb_node *rn = rb_first(r);

	while (rn) {
		node = rb_entry(rn, struct xfrm_pol_inexact_node, node);

		xfrm_policy_inexact_gc_tree(&node->root, rm);
		/* advance before 'node' may be erased below */
		rn = rb_next(rn);

		if (!hlist_empty(&node->hhead) || !RB_EMPTY_ROOT(&node->root)) {
			WARN_ON_ONCE(rm);
			continue;
		}

		rb_erase(&node->node, r);
		kfree_rcu(node, rcu);
	}
}

/* Garbage-collect both trees of bin @b inside a write section of b->count
 * and, if the bin is then completely empty (both trees and its own policy
 * list), remove it from xfrm_policy_inexact_table. If that removal returns
 * 0 the bin is unlinked from the inexact_bins list and freed with
 * kfree_rcu().
 *
 * @net_exit: when true, warn if anything is still left in the bin.
 */
static void __xfrm_policy_inexact_prune_bin(struct xfrm_pol_inexact_bin *b, bool net_exit)
{
	write_seqcount_begin(&b->count);
	xfrm_policy_inexact_gc_tree(&b->root_d, net_exit);
	xfrm_policy_inexact_gc_tree(&b->root_s, net_exit);
	write_seqcount_end(&b->count);

	if (!RB_EMPTY_ROOT(&b->root_d) || !RB_EMPTY_ROOT(&b->root_s) ||
	    !hlist_empty(&b->hhead)) {
		WARN_ON_ONCE(net_exit);
		return;
	}

	if (rhashtable_remove_fast(&xfrm_policy_inexact_table, &b->head,
				   xfrm_pol_inexact_params) == 0) {
		list_del(&b->inexact_bins);
		kfree_rcu(b, rcu);
	}
}

/* Prune bin @b while holding the xfrm_policy_lock of the bin's netns. */
static void xfrm_policy_inexact_prune_bin(struct xfrm_pol_inexact_bin *b)
{
	struct net *net = read_pnet(&b->k.net);

	spin_lock_bh(&net->xfrm.xfrm_policy_lock);
	__xfrm_policy_inexact_prune_bin(b, false);
	spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
}

/* Prune every bin on net->xfrm.inexact_bins; xfrm_policy_lock must be held. */
static void __xfrm_policy_inexact_flush(struct net *net)
{
	struct xfrm_pol_inexact_bin *bin, *t;

	lockdep_assert_held(&net->xfrm.xfrm_policy_lock);

	list_for_each_entry_safe(bin, t, &net->xfrm.inexact_bins, inexact_bins)
		__xfrm_policy_inexact_prune_bin(bin, false);
}

/* Select the policy list inside @bin that @policy belongs on, creating
 * tree nodes as needed:
 *  - source and destination both "any": the bin's own list,
 *  - destination "any": a node of bin->root_s keyed by the source address,
 *  - otherwise a node of bin->root_d keyed by the destination address and,
 *    unless the source is "any", a node in that node's subtree keyed by the
 *    source address.
 * Every tree insertion runs inside a write section of bin->count.
 *
 * Returns the list head, or NULL if a node could not be allocated.
 */
static struct hlist_head *
xfrm_policy_inexact_alloc_chain(struct xfrm_pol_inexact_bin *bin,
				struct xfrm_policy *policy, u8 dir)
{
	struct xfrm_pol_inexact_node *n;
	struct net *net;

	net =
xp_net(policy);
	lockdep_assert_held(&net->xfrm.xfrm_policy_lock);

	/* both addresses are wildcards: use the bin's own list */
	if (xfrm_policy_inexact_insert_use_any_list(policy))
		return &bin->hhead;

	/* daddr is wildcard: index by saddr only */
	if (xfrm_pol_inexact_addr_use_any_list(&policy->selector.daddr,
					       policy->family,
					       policy->selector.prefixlen_d)) {
		write_seqcount_begin(&bin->count);
		n = xfrm_policy_inexact_insert_node(net,
						    &bin->root_s,
						    &policy->selector.saddr,
						    policy->family,
						    policy->selector.prefixlen_s,
						    dir);
		write_seqcount_end(&bin->count);
		if (!n)
			return NULL;

		return &n->hhead;
	}

	/* daddr is fixed */
	write_seqcount_begin(&bin->count);
	n = xfrm_policy_inexact_insert_node(net,
					    &bin->root_d,
					    &policy->selector.daddr,
					    policy->family,
					    policy->selector.prefixlen_d, dir);
	write_seqcount_end(&bin->count);
	if (!n)
		return NULL;

	/* saddr is wildcard */
	if (xfrm_pol_inexact_addr_use_any_list(&policy->selector.saddr,
					       policy->family,
					       policy->selector.prefixlen_s))
		return &n->hhead;

	/* both addresses are fixed: second level, keyed by saddr */
	write_seqcount_begin(&bin->count);
	n = xfrm_policy_inexact_insert_node(net,
					    &n->root,
					    &policy->selector.saddr,
					    policy->family,
					    policy->selector.prefixlen_s, dir);
	write_seqcount_end(&bin->count);
	if (!n)
		return NULL;

	return &n->hhead;
}

/* Insert @policy into the inexact storage for direction @dir.
 *
 * Returns NULL if nothing was replaced, the existing policy that the new
 * one replaces, ERR_PTR(-ENOMEM) if the bin or chain could not be set up,
 * or ERR_PTR(-EEXIST) if @excl is set and xfrm_policy_insert_list() returned
 * non-NULL. The bin is pruned on the chain allocation failure path and
 * whenever xfrm_policy_insert_list() returned non-NULL.
 */
static struct xfrm_policy *
xfrm_policy_inexact_insert(struct xfrm_policy *policy, u8 dir, int excl)
{
	struct xfrm_pol_inexact_bin *bin;
	struct xfrm_policy *delpol;
	struct hlist_head *chain;
	struct net *net;

	bin = xfrm_policy_inexact_alloc_bin(policy, dir);
	if (!bin)
		return ERR_PTR(-ENOMEM);

	net = xp_net(policy);
	lockdep_assert_held(&net->xfrm.xfrm_policy_lock);

	chain = xfrm_policy_inexact_alloc_chain(bin, policy, dir);
	if (!chain) {
		__xfrm_policy_inexact_prune_bin(bin, false);
		return ERR_PTR(-ENOMEM);
	}

	delpol = xfrm_policy_insert_list(chain, policy, excl);
	if (delpol && excl) {
		__xfrm_policy_inexact_prune_bin(bin, false);
		return ERR_PTR(-EEXIST);
	}

	if (delpol)
		__xfrm_policy_inexact_prune_bin(bin, false);

	return delpol;
}

/* Return true if @policy is marked dead in its walk entry, or if the
 * direction encoded in its index is XFRM_POLICY_MAX or above.
 */
static bool xfrm_policy_is_dead_or_sk(const struct xfrm_policy *policy)
{
	int dir;

	if (policy->walk.dead)
		return true;

	dir =
xfrm_policy_id2dir(policy->index);
	return dir >= XFRM_POLICY_MAX;
}

/* Work handler for net->xfrm.policy_hthresh.work: apply the current
 * selector prefix length thresholds and re-insert all policies.
 *
 * Runs under hash_resize_mutex; the rebuild itself is done with
 * xfrm_policy_lock held and inside a write section of
 * xfrm_policy_hash_generation.
 */
static void xfrm_hash_rebuild(struct work_struct *work)
{
	struct net *net = container_of(work, struct net,
				       xfrm.policy_hthresh.work);
	struct xfrm_policy *pol;
	struct xfrm_policy *policy;
	struct hlist_head *chain;
	struct hlist_node *newpos;
	int dir;
	unsigned seq;
	u8 lbits4, rbits4, lbits6, rbits6;

	mutex_lock(&hash_resize_mutex);

	/* read selector prefixlen thresholds */
	do {
		seq = read_seqbegin(&net->xfrm.policy_hthresh.lock);

		lbits4 = net->xfrm.policy_hthresh.lbits4;
		rbits4 = net->xfrm.policy_hthresh.rbits4;
		lbits6 = net->xfrm.policy_hthresh.lbits6;
		rbits6 = net->xfrm.policy_hthresh.rbits6;
	} while (read_seqretry(&net->xfrm.policy_hthresh.lock, seq));

	spin_lock_bh(&net->xfrm.xfrm_policy_lock);
	write_seqcount_begin(&net->xfrm.xfrm_policy_hash_generation);

	/* make sure that we can insert the indirect policies again before
	 * we start with destructive action.
	 */
	list_for_each_entry(policy, &net->xfrm.policy_all, walk.all) {
		struct xfrm_pol_inexact_bin *bin;
		u8 dbits, sbits;

		if (xfrm_policy_is_dead_or_sk(policy))
			continue;

		/* pick the thresholds for this policy: for direction out the
		 * destination uses the remote bits and the source the local
		 * bits, for the other directions it is the other way round
		 */
		dir = xfrm_policy_id2dir(policy->index);
		if ((dir & XFRM_POLICY_MASK) == XFRM_POLICY_OUT) {
			if (policy->family == AF_INET) {
				dbits = rbits4;
				sbits = lbits4;
			} else {
				dbits = rbits6;
				sbits = lbits6;
			}
		} else {
			if (policy->family == AF_INET) {
				dbits = lbits4;
				sbits = rbits4;
			} else {
				dbits = lbits6;
				sbits = rbits6;
			}
		}

		/* policies with a selector prefix length below the new
		 * threshold are skipped by this pre-allocation pass
		 */
		if (policy->selector.prefixlen_d < dbits ||
		    policy->selector.prefixlen_s < sbits)
			continue;

		/* on allocation failure give up before anything was changed */
		bin = xfrm_policy_inexact_alloc_bin(policy, dir);
		if (!bin)
			goto out_unlock;

		if (!xfrm_policy_inexact_alloc_chain(bin, policy, dir))
			goto out_unlock;
	}

	/* store the new thresholds for every direction */
	for (dir = 0; dir < XFRM_POLICY_MAX; dir++) {
		if ((dir & XFRM_POLICY_MASK) == XFRM_POLICY_OUT) {
			/* dir out => dst = remote, src = local */
			net->xfrm.policy_bydst[dir].dbits4 = rbits4;
			net->xfrm.policy_bydst[dir].sbits4 = lbits4;
			net->xfrm.policy_bydst[dir].dbits6 = rbits6;
			net->xfrm.policy_bydst[dir].sbits6 = lbits6;
		} else {
			/* dir
in/fwd => dst = local, src = remote */
			net->xfrm.policy_bydst[dir].dbits4 = lbits4;
			net->xfrm.policy_bydst[dir].sbits4 = rbits4;
			net->xfrm.policy_bydst[dir].dbits6 = lbits6;
			net->xfrm.policy_bydst[dir].sbits6 = rbits6;
		}
	}

	/* re-insert all policies by order of creation */
	list_for_each_entry_reverse(policy, &net->xfrm.policy_all, walk.all) {
		if (xfrm_policy_is_dead_or_sk(policy))
			continue;

		hlist_del_rcu(&policy->bydst);

		newpos = NULL;
		dir = xfrm_policy_id2dir(policy->index);
		chain = policy_hash_bysel(net, &policy->selector,
					  policy->family, dir);

		/* no direct hash chain for this selector: use inexact storage */
		if (!chain) {
			void *p = xfrm_policy_inexact_insert(policy, dir, 0);

			WARN_ONCE(IS_ERR(p), "reinsert: %ld\n", PTR_ERR(p));
			continue;
		}

		/* insert behind the last entry whose priority value is not
		 * larger than ours
		 */
		hlist_for_each_entry(pol, chain, bydst) {
			if (policy->priority >= pol->priority)
				newpos = &pol->bydst;
			else
				break;
		}
		if (newpos && policy->xdo.type != XFRM_DEV_OFFLOAD_PACKET)
			hlist_add_behind_rcu(&policy->bydst, newpos);
		else
			hlist_add_head_rcu(&policy->bydst, chain);
	}

out_unlock:
	/* prune all inexact bins, on success and on early exit alike */
	__xfrm_policy_inexact_flush(net);
	write_seqcount_end(&net->xfrm.xfrm_policy_hash_generation);
	spin_unlock_bh(&net->xfrm.xfrm_policy_lock);

	mutex_unlock(&hash_resize_mutex);
}

/* Schedule the hash rebuild work (net->xfrm.policy_hthresh.work) of @net. */
void xfrm_policy_hash_rebuild(struct net *net)
{
	schedule_work(&net->xfrm.policy_hthresh.work);
}
EXPORT_SYMBOL(xfrm_policy_hash_rebuild);

/* Generate new index... KAME seems to generate them ordered by cost
 * of an absolute unpredictability of ordering of rules. This will not pass.
*/
static u32 xfrm_gen_index(struct net *net, int dir, u32 index)
{
	for (;;) {
		struct hlist_head *list;
		struct xfrm_policy *p;
		u32 idx;
		int found;

		/* a non-zero @index is tried once; afterwards candidates come
		 * from idx_generator, which advances in steps of 8 and has
		 * @dir OR-ed in
		 */
		if (!index) {
			idx = (net->xfrm.idx_generator | dir);
			net->xfrm.idx_generator += 8;
		} else {
			idx = index;
			index = 0;
		}

		/* 0 is never handed out */
		if (idx == 0)
			idx = 8;
		/* accept the candidate only if no policy on the byidx chain
		 * already uses it
		 */
		list = net->xfrm.policy_byidx + idx_hash(net, idx);
		found = 0;
		hlist_for_each_entry(p, list, byidx) {
			if (p->index == idx) {
				found = 1;
				break;
			}
		}
		if (!found)
			return idx;
	}
}

/* Compare two selectors as arrays of u32, over
 * sizeof(struct xfrm_selector) / sizeof(u32) words.
 * Returns 0 if all words are equal, 1 otherwise.
 */
static inline int selector_cmp(struct xfrm_selector *s1, struct xfrm_selector *s2)
{
	u32 *p1 = (u32 *) s1;
	u32 *p2 = (u32 *) s2;
	int len = sizeof(struct xfrm_selector) / sizeof(u32);
	int i;

	for (i = 0; i < len; i++) {
		if (p1[i] != p2[i])
			return 1;
	}

	return 0;
}

/* Move the packets held on @old's hold queue over to @new.
 *
 * Does nothing if @old's hold queue is empty. Otherwise the packets are
 * spliced onto a local list under @old's queue lock and then onto @new's
 * hold queue under @new's queue lock, with @new's timeout reset to
 * XFRM_QUEUE_TMO_MIN and its hold timer set to fire at the current jiffies.
 */
static void xfrm_policy_requeue(struct xfrm_policy *old,
				struct xfrm_policy *new)
{
	struct xfrm_policy_queue *pq = &old->polq;
	struct sk_buff_head list;

	if (skb_queue_empty(&pq->hold_queue))
		return;

	__skb_queue_head_init(&list);

	spin_lock_bh(&pq->hold_queue.lock);
	skb_queue_splice_init(&pq->hold_queue, &list);
	/* drop a reference on @old when timer_delete() returns non-zero */
	if (timer_delete(&pq->hold_timer))
		xfrm_pol_put(old);
	spin_unlock_bh(&pq->hold_queue.lock);

	pq = &new->polq;

	spin_lock_bh(&pq->hold_queue.lock);
	skb_queue_splice(&list, &pq->hold_queue);
	pq->timeout = XFRM_QUEUE_TMO_MIN;
	/* take a reference on @new when mod_timer() returns zero */
	if (!mod_timer(&pq->hold_timer, jiffies))
		xfrm_pol_hold(new);
	spin_unlock_bh(&pq->hold_queue.lock);
}

/* Return true if both the value and the mask of @mark equal those of @pol. */
static inline bool xfrm_policy_mark_match(const struct xfrm_mark *mark,
					  struct xfrm_policy *pol)
{
	return mark->v == pol->mark.v && mark->m == pol->mark.m;
}

/* rhashtable hash function for an inexact-bin key: hashes the packed
 * type/dir/family word, if_id and the netns hash mix. @len is not used.
 */
static u32 xfrm_pol_bin_key(const void *data, u32 len, u32 seed)
{
	const struct xfrm_pol_inexact_key *k = data;
	u32 a = k->type << 24 | k->dir << 16 | k->family;

	return jhash_3words(a, k->if_id, net_hash_mix(read_pnet(&k->net)),
			    seed);
}

/* rhashtable object hash function: hash of the key embedded in the bin. */
static u32 xfrm_pol_bin_obj(const void *data, u32 len, u32 seed)
{
	const struct xfrm_pol_inexact_bin *b = data;

	return xfrm_pol_bin_key(&b->k, 0, seed);
}

/* rhashtable compare function: returns 0 if the key of the bin at @ptr
 * equals the lookup key in @arg (net, dir, type, family and if_id all
 * match), non-zero otherwise.
 */
static int xfrm_pol_bin_cmp(struct rhashtable_compare_arg *arg,
			    const void *ptr)
{
	const
struct xfrm_pol_inexact_key *key = arg->key;
	const struct xfrm_pol_inexact_bin *b = ptr;
	int ret;

	if (!net_eq(read_pnet(&b->k.net), read_pnet(&key->net)))
		return -1;

	/* each XOR is zero only when the two fields are equal */
	ret = b->k.dir ^ key->dir;
	if (ret)
		return ret;

	ret = b->k.type ^ key->type;
	if (ret)
		return ret;

	ret = b->k.family ^ key->family;
	if (ret)
		return ret;

	return b->k.if_id ^ key->if_id;
}

/* rhashtable parameters of the table holding the inexact bins */
static const struct rhashtable_params xfrm_pol_inexact_params = {
	.head_offset		= offsetof(struct xfrm_pol_inexact_bin, head),
	.hashfn			= xfrm_pol_bin_key,
	.obj_hashfn		= xfrm_pol_bin_obj,
	.obj_cmpfn		= xfrm_pol_bin_cmp,
	.automatic_shrinking	= true,
};

/* Link @policy into @chain, which is kept sorted by ascending priority value.
 *
 * An entry with the same type, if_id, selector, mark and security context
 * is treated as the one being replaced: with @excl set ERR_PTR(-EEXIST) is
 * returned immediately, otherwise it is remembered and returned to the
 * caller after the insertion. It is not unlinked here.
 * Returns NULL if no such entry exists.
 */
static struct xfrm_policy *xfrm_policy_insert_list(struct hlist_head *chain,
						   struct xfrm_policy *policy,
						   bool excl)
{
	struct xfrm_policy *pol, *newpos = NULL, *delpol = NULL;

	hlist_for_each_entry(pol, chain, bydst) {
		if (pol->type == policy->type &&
		    pol->if_id == policy->if_id &&
		    !selector_cmp(&pol->selector, &policy->selector) &&
		    xfrm_policy_mark_match(&policy->mark, pol) &&
		    xfrm_sec_ctx_match(pol->security, policy->security) &&
		    !WARN_ON(delpol)) {
			if (excl)
				return ERR_PTR(-EEXIST);
			delpol = pol;
			/* keep scanning for the insert position if we sort
			 * after the entry being replaced
			 */
			if (policy->priority > pol->priority)
				continue;
		} else if (policy->priority >= pol->priority) {
			/* still not past our slot: remember the last entry
			 * that sorts before us
			 */
			newpos = pol;
			continue;
		}
		if (delpol)
			break;
	}

	if (newpos && policy->xdo.type != XFRM_DEV_OFFLOAD_PACKET)
		hlist_add_behind_rcu(&policy->bydst, &newpos->bydst);
	else
		/* Packet offload policies enter to the head
		 * to speed-up lookups.
*/
		hlist_add_head_rcu(&policy->bydst, chain);

	return delpol;
}

/* Insert @policy for direction @dir, replacing an equivalent existing policy
 * unless @excl is set.
 *
 * Under xfrm_policy_lock the policy is added to its bydst chain (direct
 * hash chain if policy_hash_bysel() returns one, inexact storage
 * otherwise), linked, given an index and added to the byidx table, and its
 * timer is armed for one HZ from now. A replaced policy hands over its held
 * packets and its index and is unlinked; it is killed after the lock has
 * been dropped. If nothing was replaced, a bydst resize is scheduled when
 * xfrm_bydst_should_resize() asks for it.
 *
 * Returns 0 on success or the negative error carried by the insert helper.
 */
int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl)
{
	struct net *net = xp_net(policy);
	struct xfrm_policy *delpol;
	struct hlist_head *chain;

	/* Sanitize mark before store */
	policy->mark.v &= policy->mark.m;

	spin_lock_bh(&net->xfrm.xfrm_policy_lock);
	chain = policy_hash_bysel(net, &policy->selector, policy->family, dir);
	if (chain)
		delpol = xfrm_policy_insert_list(chain, policy, excl);
	else
		delpol = xfrm_policy_inexact_insert(policy, dir, excl);

	if (IS_ERR(delpol)) {
		spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
		return PTR_ERR(delpol);
	}

	__xfrm_policy_link(policy, dir);

	/* After previous checking, family can either be AF_INET or AF_INET6 */
	if (policy->family == AF_INET)
		rt_genid_bump_ipv4(net);
	else
		rt_genid_bump_ipv6(net);

	if (delpol) {
		xfrm_policy_requeue(delpol, policy);
		__xfrm_policy_unlink(delpol, dir);
	}
	/* a replacement inherits the index of the policy it replaces */
	policy->index = delpol ? delpol->index : xfrm_gen_index(net, dir, policy->index);
	hlist_add_head(&policy->byidx, net->xfrm.policy_byidx+idx_hash(net, policy->index));
	policy->curlft.add_time = ktime_get_real_seconds();
	policy->curlft.use_time = 0;
	/* take a reference when mod_timer() returns zero */
	if (!mod_timer(&policy->timer, jiffies + HZ))
		xfrm_pol_hold(policy);
	spin_unlock_bh(&net->xfrm.xfrm_policy_lock);

	if (delpol)
		xfrm_policy_kill(delpol);
	else if (xfrm_bydst_should_resize(net, dir, NULL))
		schedule_work(&net->xfrm.policy_hash_work);

	return 0;
}
EXPORT_SYMBOL(xfrm_policy_insert);

/* Return the first policy on @chain whose type, if_id, mark, selector and
 * security context match the arguments, or NULL if @chain is NULL or holds
 * no such policy. @dir is not referenced in this function.
 */
static struct xfrm_policy *
__xfrm_policy_bysel_ctx(struct hlist_head *chain, const struct xfrm_mark *mark,
			u32 if_id, u8 type, int dir, struct xfrm_selector *sel,
			struct xfrm_sec_ctx *ctx)
{
	struct xfrm_policy *pol;

	if (!chain)
		return NULL;

	hlist_for_each_entry(pol, chain, bydst) {
		if (pol->type == type &&
		    pol->if_id == if_id &&
		    xfrm_policy_mark_match(mark, pol) &&
		    !selector_cmp(sel, &pol->selector) &&
		    xfrm_sec_ctx_match(ctx, pol->security))
			return pol;
	}

	return NULL;
}

/* Look up the policy matching @sel, @ctx, @mark, @if_id and @type for
 * direction @dir and optionally delete it.
 *
 * The returned policy has had xfrm_pol_hold() called on it. *@err is 0
 * unless @delete is set and security_xfrm_policy_delete() fails; in that
 * case the policy is returned without being unlinked.
 * Returns NULL if no policy matches.
 */
struct xfrm_policy *
xfrm_policy_bysel_ctx(struct net *net, const struct
xfrm_mark *mark, u32 if_id,
		      u8 type, int dir, struct xfrm_selector *sel,
		      struct xfrm_sec_ctx *ctx, int delete, int *err)
{
	struct xfrm_pol_inexact_bin *bin = NULL;
	struct xfrm_policy *pol, *ret = NULL;
	struct hlist_head *chain;

	*err = 0;
	spin_lock_bh(&net->xfrm.xfrm_policy_lock);
	chain = policy_hash_bysel(net, sel, sel->family, dir);
	if (!chain) {
		/* no direct hash chain: search the inexact candidate lists */
		struct xfrm_pol_inexact_candidates cand;
		int i;

		bin = xfrm_policy_inexact_lookup(net, type,
						 sel->family, dir, if_id);
		if (!bin) {
			spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
			return NULL;
		}

		if (!xfrm_policy_find_inexact_candidates(&cand, bin,
							 &sel->saddr,
							 &sel->daddr)) {
			spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
			return NULL;
		}

		/* of all matching candidates keep the one with the lowest pos */
		pol = NULL;
		for (i = 0; i < ARRAY_SIZE(cand.res); i++) {
			struct xfrm_policy *tmp;

			tmp = __xfrm_policy_bysel_ctx(cand.res[i], mark,
						      if_id, type, dir,
						      sel, ctx);
			if (!tmp)
				continue;

			if (!pol || tmp->pos < pol->pos)
				pol = tmp;
		}
	} else {
		pol = __xfrm_policy_bysel_ctx(chain, mark, if_id, type, dir,
					      sel, ctx);
	}

	if (pol) {
		xfrm_pol_hold(pol);
		if (delete) {
			*err = security_xfrm_policy_delete(pol->security);
			if (*err) {
				/* deletion refused: return the held policy,
				 * still linked
				 */
				spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
				return pol;
			}
			__xfrm_policy_unlink(pol, dir);
		}
		ret = pol;
	}
	spin_unlock_bh(&net->xfrm.xfrm_policy_lock);

	if (ret && delete)
		xfrm_policy_kill(ret);
	/* a delete request that used the inexact path prunes that bin */
	if (bin && delete)
		xfrm_policy_inexact_prune_bin(bin);
	return ret;
}
EXPORT_SYMBOL(xfrm_policy_bysel_ctx);

/* Look up the policy with index @id (also matching @type, @if_id and @mark)
 * and optionally delete it.
 *
 * If the direction encoded in @id differs from @dir, NULL is returned with
 * *@err set to -ENOENT. Otherwise *@err is 0 unless @delete is set and
 * security_xfrm_policy_delete() fails, in which case the policy is returned
 * without being unlinked. The returned policy has had xfrm_pol_hold()
 * called on it. Returns NULL if no policy matches.
 */
struct xfrm_policy *
xfrm_policy_byid(struct net *net, const struct xfrm_mark *mark, u32 if_id,
		 u8 type, int dir, u32 id, int delete, int *err)
{
	struct xfrm_policy *pol, *ret;
	struct hlist_head *chain;

	*err = -ENOENT;
	if (xfrm_policy_id2dir(id) != dir)
		return NULL;

	*err = 0;
	spin_lock_bh(&net->xfrm.xfrm_policy_lock);
	chain = net->xfrm.policy_byidx + idx_hash(net, id);
	ret = NULL;
	hlist_for_each_entry(pol, chain, byidx) {
		if (pol->type == type && pol->index == id &&
		    pol->if_id == if_id && xfrm_policy_mark_match(mark, pol)) {
			xfrm_pol_hold(pol);
			if (delete) {
				*err = security_xfrm_policy_delete(
pol->security);
				if (*err) {
					/* deletion refused: return the held
					 * policy, still linked
					 */
					spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
					return pol;
				}
				__xfrm_policy_unlink(pol, dir);
			}
			ret = pol;
			break;
		}
	}
	spin_unlock_bh(&net->xfrm.xfrm_policy_lock);

	if (ret && delete)
		xfrm_policy_kill(ret);
	return ret;
}
EXPORT_SYMBOL(xfrm_policy_byid);

#ifdef CONFIG_SECURITY_NETWORK_XFRM
/* Ask security_xfrm_policy_delete() about every live policy of @type whose
 * index encodes a direction below XFRM_POLICY_MAX.
 * On the first refusal an audit record with result 0 is written and the
 * error is returned; returns 0 if every policy may be deleted.
 */
static inline int
xfrm_policy_flush_secctx_check(struct net *net, u8 type, bool task_valid)
{
	struct xfrm_policy *pol;
	int err = 0;

	list_for_each_entry(pol, &net->xfrm.policy_all, walk.all) {
		if (pol->walk.dead ||
		    xfrm_policy_id2dir(pol->index) >= XFRM_POLICY_MAX ||
		    pol->type != type)
			continue;

		err = security_xfrm_policy_delete(pol->security);
		if (err) {
			xfrm_audit_policy_delete(pol, 0, task_valid);
			return err;
		}
	}
	return err;
}

/* Same check as xfrm_policy_flush_secctx_check(), but selecting policies by
 * their offload device (pol->xdo.dev == @dev) instead of by type.
 */
static inline int xfrm_dev_policy_flush_secctx_check(struct net *net,
						     struct net_device *dev,
						     bool task_valid)
{
	struct xfrm_policy *pol;
	int err = 0;

	list_for_each_entry(pol, &net->xfrm.policy_all, walk.all) {
		if (pol->walk.dead ||
		    xfrm_policy_id2dir(pol->index) >= XFRM_POLICY_MAX ||
		    pol->xdo.dev != dev)
			continue;

		err = security_xfrm_policy_delete(pol->security);
		if (err) {
			xfrm_audit_policy_delete(pol, 0, task_valid);
			return err;
		}
	}
	return err;
}
#else
/* Without CONFIG_SECURITY_NETWORK_XFRM both checks always succeed. */
static inline int
xfrm_policy_flush_secctx_check(struct net *net, u8 type, bool task_valid)
{
	return 0;
}

static inline int xfrm_dev_policy_flush_secctx_check(struct net *net,
						     struct net_device *dev,
						     bool task_valid)
{
	return 0;
}
#endif

/* Delete every live policy of @type whose index encodes a direction below
 * XFRM_POLICY_MAX.
 *
 * The security check is run over all candidates first; if it fails nothing
 * is deleted and its error is returned. Each policy is unlinked under
 * xfrm_policy_lock, then audited and killed with the lock dropped, after
 * which the scan starts over from the head of the list.
 *
 * Returns 0 if at least one policy was deleted, -ESRCH if none was.
 */
int xfrm_policy_flush(struct net *net, u8 type, bool task_valid)
{
	int dir, err = 0, cnt = 0;
	struct xfrm_policy *pol;

	spin_lock_bh(&net->xfrm.xfrm_policy_lock);

	err = xfrm_policy_flush_secctx_check(net, type, task_valid);
	if (err)
		goto out;

again:
	list_for_each_entry(pol, &net->xfrm.policy_all, walk.all) {
		if (pol->walk.dead)
			continue;

		dir = xfrm_policy_id2dir(pol->index);
		if (dir >= XFRM_POLICY_MAX ||
		    pol->type != type)
			continue;

		__xfrm_policy_unlink(pol, dir);
		spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
		cnt++;
		xfrm_audit_policy_delete(pol, 1,
task_valid);
		xfrm_policy_kill(pol);
		spin_lock_bh(&net->xfrm.xfrm_policy_lock);
		/* the lock was dropped: restart the scan from the list head */
		goto again;
	}
	if (cnt)
		__xfrm_policy_inexact_flush(net);
	else
		err = -ESRCH;
out:
	spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
	return err;
}
EXPORT_SYMBOL(xfrm_policy_flush);

/* Delete every live policy bound to offload device @dev (pol->xdo.dev)
 * whose index encodes a direction below XFRM_POLICY_MAX.
 *
 * Follows the same scheme as xfrm_policy_flush(): security check first,
 * then unlink under the lock, audit and kill with the lock dropped, and
 * restart the scan.
 *
 * Returns 0 if at least one policy was deleted, -ESRCH if none was, or the
 * error from the security check.
 */
int xfrm_dev_policy_flush(struct net *net, struct net_device *dev,
			  bool task_valid)
{
	int dir, err = 0, cnt = 0;
	struct xfrm_policy *pol;

	spin_lock_bh(&net->xfrm.xfrm_policy_lock);

	err = xfrm_dev_policy_flush_secctx_check(net, dev, task_valid);
	if (err)
		goto out;

again:
	list_for_each_entry(pol, &net->xfrm.policy_all, walk.all) {
		if (pol->walk.dead)
			continue;

		dir = xfrm_policy_id2dir(pol->index);
		if (dir >= XFRM_POLICY_MAX ||
		    pol->xdo.dev != dev)
			continue;

		__xfrm_policy_unlink(pol, dir);
		spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
		cnt++;
		xfrm_audit_policy_delete(pol, 1, task_valid);
		xfrm_policy_kill(pol);
		spin_lock_bh(&net->xfrm.xfrm_policy_lock);
		/* the lock was dropped: restart the scan from the list head */
		goto again;
	}
	if (cnt)
		__xfrm_policy_inexact_flush(net);
	else
		err = -ESRCH;
out:
	spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
	return err;
}
EXPORT_SYMBOL(xfrm_dev_policy_flush);

/* Call @func for every live policy of @net that matches walk->type,
 * resuming where a previous call on the same @walk stopped.
 *
 * @func receives the policy, the direction encoded in its index, the
 * running sequence number walk->seq and @data. If it returns non-zero the
 * walk stops, the walker entry is parked in the policy list just before the
 * current entry, and that value is returned.
 *
 * Returns -EINVAL for an invalid walk->type, -ENOENT if the walk finishes
 * without having visited any policy, 0 when the walk finishes otherwise or
 * has already finished (walker unlinked and seq non-zero).
 */
int xfrm_policy_walk(struct net *net, struct xfrm_policy_walk *walk,
		     int (*func)(struct xfrm_policy *, int, int, void*),
		     void *data)
{
	struct xfrm_policy *pol;
	struct xfrm_policy_walk_entry *x;
	int error = 0;

	if (walk->type >= XFRM_POLICY_TYPE_MAX &&
	    walk->type != XFRM_POLICY_TYPE_ANY)
		return -EINVAL;

	if (list_empty(&walk->walk.all) && walk->seq != 0)
		return 0;

	spin_lock_bh(&net->xfrm.xfrm_policy_lock);
	/* start at the list head, or right after the parked walker entry */
	if (list_empty(&walk->walk.all))
		x = list_first_entry(&net->xfrm.policy_all, struct xfrm_policy_walk_entry, all);
	else
		x = list_first_entry(&walk->walk.all,
				     struct xfrm_policy_walk_entry, all);

	list_for_each_entry_from(x, &net->xfrm.policy_all, all) {
		if (x->dead)
			continue;
		pol = container_of(x, struct xfrm_policy, walk);
		if (walk->type != XFRM_POLICY_TYPE_ANY &&
		    walk->type != pol->type)
			continue;
		error = func(pol, xfrm_policy_id2dir(pol->index),
			     walk->seq, data);
		if (error) {
list_move_tail(&walk->walk.all, &x->all);
			goto out;
		}
		walk->seq++;
	}
	if (walk->seq == 0) {
		error = -ENOENT;
		goto out;
	}
	/* walk complete: take the walker entry off the list */
	list_del_init(&walk->walk.all);
out:
	spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
	return error;
}
EXPORT_SYMBOL(xfrm_policy_walk);

/* Prepare @walk for xfrm_policy_walk(): the walker entry starts unlinked
 * and is marked dead, walk->type is set to @type and the sequence number
 * starts at 0.
 */
void xfrm_policy_walk_init(struct xfrm_policy_walk *walk, u8 type)
{
	INIT_LIST_HEAD(&walk->walk.all);
	walk->walk.dead = 1;
	walk->type = type;
	walk->seq = 0;
}
EXPORT_SYMBOL(xfrm_policy_walk_init);

/* Finish a walk: if the walker entry is still linked into a list, remove
 * it under the xfrm_policy_lock of @net.
 */
void xfrm_policy_walk_done(struct xfrm_policy_walk *walk, struct net *net)
{
	if (list_empty(&walk->walk.all))
		return;

	spin_lock_bh(&net->xfrm.xfrm_policy_lock); /*FIXME where is net? */
	list_del(&walk->walk.all);
	spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
}
EXPORT_SYMBOL(xfrm_policy_walk_done);

/*
 * Find policy to apply to this flow.
 *
 * Returns 0 if policy found, else an -errno.
 *
 * -ESRCH is returned when family, if_id, mark or type do not match, or when
 * the selector does not match the flow. Otherwise the result of
 * security_xfrm_policy_lookup() is returned.
 */
static int xfrm_policy_match(const struct xfrm_policy *pol,
			     const struct flowi *fl,
			     u8 type, u16 family, u32 if_id)
{
	const struct xfrm_selector *sel = &pol->selector;
	int ret = -ESRCH;
	bool match;

	if (pol->family != family ||
	    pol->if_id != if_id ||
	    (fl->flowi_mark & pol->mark.m) != pol->mark.v ||
	    pol->type != type)
		return ret;

	match = xfrm_selector_match(sel, fl, family);
	if (match)
		ret = security_xfrm_policy_lookup(pol->security, fl->flowi_secid);
	return ret;
}

/* Search the rb-tree @r for a node whose address matches @addr under the
 * node's own prefix length.
 *
 * The tree is descended without taking a lock. If no node is found and
 * @count changed since the search began, the search is repeated.
 * Returns the node or NULL.
 */
static struct xfrm_pol_inexact_node *
xfrm_policy_lookup_inexact_addr(const struct rb_root *r,
				seqcount_spinlock_t *count,
				const xfrm_address_t *addr, u16 family)
{
	const struct rb_node *parent;
	int seq;

again:
	seq = read_seqcount_begin(count);

	parent = rcu_dereference_raw(r->rb_node);
	while (parent) {
		struct xfrm_pol_inexact_node *node;
		int delta;

		node = rb_entry(parent, struct xfrm_pol_inexact_node, node);

		delta = xfrm_policy_addr_delta(addr, &node->addr,
					       node->prefixlen, family);
		if (delta < 0) {
			parent = rcu_dereference_raw(parent->rb_left);
			continue;
		} else if (delta > 0) {
			parent = rcu_dereference_raw(parent->rb_right);
			continue;
		}

		return node;
	}

	if (read_seqcount_retry(count,
seq))
		goto again;

	return NULL;
}

/* Collect the policy lists of bin @b that can hold a policy for the
 * address pair @saddr/@daddr.
 *
 * @cand is zeroed, then filled with the bin's own list (ANY), the list of
 * the destination node (DADDR), the list of the source node below that
 * destination node (BOTH) and the list of the source node in the
 * source-only tree (SADDR), each only if such a node is found.
 *
 * Returns false if @b is NULL, true otherwise.
 */
static bool
xfrm_policy_find_inexact_candidates(struct xfrm_pol_inexact_candidates *cand,
				    struct xfrm_pol_inexact_bin *b,
				    const xfrm_address_t *saddr,
				    const xfrm_address_t *daddr)
{
	struct xfrm_pol_inexact_node *n;
	u16 family;

	if (!b)
		return false;

	family = b->k.family;
	memset(cand, 0, sizeof(*cand));
	cand->res[XFRM_POL_CAND_ANY] = &b->hhead;

	n = xfrm_policy_lookup_inexact_addr(&b->root_d, &b->count, daddr,
					    family);
	if (n) {
		cand->res[XFRM_POL_CAND_DADDR] = &n->hhead;
		n = xfrm_policy_lookup_inexact_addr(&n->root, &b->count, saddr,
						    family);
		if (n)
			cand->res[XFRM_POL_CAND_BOTH] = &n->hhead;
	}

	n = xfrm_policy_lookup_inexact_addr(&b->root_s, &b->count, saddr,
					    family);
	if (n)
		cand->res[XFRM_POL_CAND_SADDR] = &n->hhead;

	return true;
}

/* Look up the inexact bin keyed by (@net, @family, @type, @dir, @if_id) in
 * xfrm_policy_inexact_table with rhashtable_lookup(). This function does
 * not take rcu_read_lock() itself.
 */
static struct xfrm_pol_inexact_bin *
xfrm_policy_inexact_lookup_rcu(struct net *net, u8 type, u16 family,
			       u8 dir, u32 if_id)
{
	struct xfrm_pol_inexact_key k = {
		.family = family,
		.type = type,
		.dir = dir,
		.if_id = if_id,
	};

	write_pnet(&k.net, net);

	return rhashtable_lookup(&xfrm_policy_inexact_table, &k,
				 xfrm_pol_inexact_params);
}

/* Same lookup for callers that hold xfrm_policy_lock: wraps the call in
 * rcu_read_lock()/rcu_read_unlock().
 */
static struct xfrm_pol_inexact_bin *
xfrm_policy_inexact_lookup(struct net *net, u8 type, u16 family,
			   u8 dir, u32 if_id)
{
	struct xfrm_pol_inexact_bin *bin;

	lockdep_assert_held(&net->xfrm.xfrm_policy_lock);

	rcu_read_lock();
	bin = xfrm_policy_inexact_lookup_rcu(net, type, family, dir, if_id);
	rcu_read_unlock();

	return bin;
}

/* Scan @chain for the first policy matching the flow @fl.
 *
 * The scan stops at the first entry whose priority value is larger than
 * that of @prefer (no limit if @prefer is NULL). On a match with equal
 * priority, @prefer is returned instead if its pos is lower.
 *
 * Returns the chosen policy, NULL if @chain is NULL or nothing matched, or
 * an ERR_PTR() for a match error other than -ESRCH.
 */
static struct xfrm_policy *
__xfrm_policy_eval_candidates(struct hlist_head *chain,
			      struct xfrm_policy *prefer,
			      const struct flowi *fl,
			      u8 type, u16 family, u32 if_id)
{
	u32 priority = prefer ? prefer->priority : ~0u;
	struct xfrm_policy *pol;

	if (!chain)
		return NULL;

	hlist_for_each_entry_rcu(pol, chain, bydst) {
		int err;

		if (pol->priority > priority)
			break;

		err = xfrm_policy_match(pol, fl, type, family, if_id);
		if (err) {
			if (err != -ESRCH)
				return ERR_PTR(err);

			continue;
		}

		if (prefer) {
			/* matches.  Is it older than *prefer?
*/
			if (pol->priority == priority &&
			    prefer->pos < pol->pos)
				return prefer;
		}

		return pol;
	}

	return NULL;
}

/* Evaluate every candidate list in @cand in array order, feeding the best
 * policy found so far back in as @prefer.
 *
 * Returns the final preferred policy (possibly the @prefer passed in, or
 * NULL), or the first ERR_PTR() reported by a candidate list.
 */
static struct xfrm_policy *
xfrm_policy_eval_candidates(struct xfrm_pol_inexact_candidates *cand,
			    struct xfrm_policy *prefer,
			    const struct flowi *fl,
			    u8 type, u16 family, u32 if_id)
{
	struct xfrm_policy *tmp;
	int i;

	for (i = 0; i < ARRAY_SIZE(cand->res); i++) {
		tmp = __xfrm_policy_eval_candidates(cand->res[i],
						    prefer,
						    fl, type, family, if_id);
		if (!tmp)
			continue;

		if (IS_ERR(tmp))
			return tmp;
		prefer = tmp;
	}

	return prefer;
}

/* Find the policy of @type that applies to flow @fl in direction @dir.
 *
 * The direct hash chain is searched first; unless that yields a packet
 * offload policy, the inexact candidates are evaluated as well, with the
 * direct result as the preferred policy. The whole lookup is repeated if
 * xfrm_policy_hash_generation changed meanwhile or if
 * xfrm_pol_hold_rcu() fails on the result.
 *
 * Returns the policy after a successful xfrm_pol_hold_rcu(), NULL if no
 * policy matches or the flow has no source/destination address, or an
 * ERR_PTR() from the match.
 */
static struct xfrm_policy *xfrm_policy_lookup_bytype(struct net *net, u8 type,
						     const struct flowi *fl,
						     u16 family, u8 dir,
						     u32 if_id)
{
	struct xfrm_pol_inexact_candidates cand;
	const xfrm_address_t *daddr, *saddr;
	struct xfrm_pol_inexact_bin *bin;
	struct xfrm_policy *pol, *ret;
	struct hlist_head *chain;
	unsigned int sequence;
	int err;

	daddr = xfrm_flowi_daddr(fl, family);
	saddr = xfrm_flowi_saddr(fl, family);
	if (unlikely(!daddr || !saddr))
		return NULL;

	rcu_read_lock();
 retry:
	/* pick the hash chain; repeat if the generation changed meanwhile */
	do {
		sequence = read_seqcount_begin(&net->xfrm.xfrm_policy_hash_generation);
		chain = policy_hash_direct(net, daddr, saddr, family, dir);
	} while (read_seqcount_retry(&net->xfrm.xfrm_policy_hash_generation, sequence));

	ret = NULL;
	hlist_for_each_entry_rcu(pol, chain, bydst) {
		err = xfrm_policy_match(pol, fl, type, family, if_id);
		if (err) {
			if (err == -ESRCH)
				continue;
			else {
				ret = ERR_PTR(err);
				goto fail;
			}
		} else {
			ret = pol;
			break;
		}
	}
	/* a packet offload match is final, inexact policies are not consulted */
	if (ret && ret->xdo.type == XFRM_DEV_OFFLOAD_PACKET)
		goto skip_inexact;

	bin = xfrm_policy_inexact_lookup_rcu(net, type, family, dir, if_id);
	if (!bin || !xfrm_policy_find_inexact_candidates(&cand, bin, saddr,
							 daddr))
		goto skip_inexact;

	pol = xfrm_policy_eval_candidates(&cand, ret, fl, type,
					  family, if_id);
	if (pol) {
		ret = pol;
		if (IS_ERR(pol))
			goto fail;
	}

skip_inexact:
	if (read_seqcount_retry(&net->xfrm.xfrm_policy_hash_generation, sequence))
		goto retry;

	if (ret && !xfrm_pol_hold_rcu(ret))
		goto retry;
fail:
	rcu_read_unlock();

	return
ret; } static struct xfrm_policy *xfrm_policy_lookup(struct net *net, const struct flowi *fl, u16 family, u8 dir, u32 if_id) { #ifdef CONFIG_XFRM_SUB_POLICY struct xfrm_policy *pol; pol = xfrm_policy_lookup_bytype(net, XFRM_POLICY_TYPE_SUB, fl, family, dir, if_id); if (pol != NULL) return pol; #endif return xfrm_policy_lookup_bytype(net, XFRM_POLICY_TYPE_MAIN, fl, family, dir, if_id); } static struct xfrm_policy *xfrm_sk_policy_lookup(const struct sock *sk, int dir, const struct flowi *fl, u16 family, u32 if_id) { struct xfrm_policy *pol; rcu_read_lock(); again: pol = rcu_dereference(sk->sk_policy[dir]); if (pol != NULL) { bool match; int err = 0; if (pol->family != family) { pol = NULL; goto out; } match = xfrm_selector_match(&pol->selector, fl, family); if (match) { if ((READ_ONCE(sk->sk_mark) & pol->mark.m) != pol->mark.v || pol->if_id != if_id) { pol = NULL; goto out; } err = security_xfrm_policy_lookup(pol->security, fl->flowi_secid); if (!err) { if (!xfrm_pol_hold_rcu(pol)) goto again; } else if (err == -ESRCH) { pol = NULL; } else { pol = ERR_PTR(err); } } else pol = NULL; } out: rcu_read_unlock(); return pol; } static u32 xfrm_gen_pos_slow(struct net *net) { struct xfrm_policy *policy; u32 i = 0; /* oldest entry is last in list */ list_for_each_entry_reverse(policy, &net->xfrm.policy_all, walk.all) { if (!xfrm_policy_is_dead_or_sk(policy)) policy->pos = ++i; } return i; } static u32 xfrm_gen_pos(struct net *net) { const struct xfrm_policy *policy; u32 i = 0; /* most recently added policy is at the head of the list */ list_for_each_entry(policy, &net->xfrm.policy_all, walk.all) { if (xfrm_policy_is_dead_or_sk(policy)) continue; if (policy->pos == UINT_MAX) return xfrm_gen_pos_slow(net); i = policy->pos + 1; break; } return i; } static void __xfrm_policy_link(struct xfrm_policy *pol, int dir) { struct net *net = xp_net(pol); switch (dir) { case XFRM_POLICY_IN: case XFRM_POLICY_FWD: case XFRM_POLICY_OUT: pol->pos = xfrm_gen_pos(net); break; } 
list_add(&pol->walk.all, &net->xfrm.policy_all); net->xfrm.policy_count[dir]++; xfrm_pol_hold(pol); } static struct xfrm_policy *__xfrm_policy_unlink(struct xfrm_policy *pol, int dir) { struct net *net = xp_net(pol); if (list_empty(&pol->walk.all)) return NULL; /* Socket policies are not hashed. */ if (!hlist_unhashed(&pol->bydst)) { hlist_del_rcu(&pol->bydst); hlist_del(&pol->byidx); } list_del_init(&pol->walk.all); net->xfrm.policy_count[dir]--; return pol; } static void xfrm_sk_policy_link(struct xfrm_policy *pol, int dir) { __xfrm_policy_link(pol, XFRM_POLICY_MAX + dir); } static void xfrm_sk_policy_unlink(struct xfrm_policy *pol, int dir) { __xfrm_policy_unlink(pol, XFRM_POLICY_MAX + dir); } int xfrm_policy_delete(struct xfrm_policy *pol, int dir) { struct net *net = xp_net(pol); spin_lock_bh(&net->xfrm.xfrm_policy_lock); pol = __xfrm_policy_unlink(pol, dir); spin_unlock_bh(&net->xfrm.xfrm_policy_lock); if (pol) { xfrm_policy_kill(pol); return 0; } return -ENOENT; } EXPORT_SYMBOL(xfrm_policy_delete); int xfrm_sk_policy_insert(struct sock *sk, int dir, struct xfrm_policy *pol) { struct net *net = sock_net(sk); struct xfrm_policy *old_pol; #ifdef CONFIG_XFRM_SUB_POLICY if (pol && pol->type != XFRM_POLICY_TYPE_MAIN) return -EINVAL; #endif spin_lock_bh(&net->xfrm.xfrm_policy_lock); old_pol = rcu_dereference_protected(sk->sk_policy[dir], lockdep_is_held(&net->xfrm.xfrm_policy_lock)); if (pol) { pol->curlft.add_time = ktime_get_real_seconds(); pol->index = xfrm_gen_index(net, XFRM_POLICY_MAX+dir, 0); xfrm_sk_policy_link(pol, dir); } rcu_assign_pointer(sk->sk_policy[dir], pol); if (old_pol) { if (pol) xfrm_policy_requeue(old_pol, pol); /* Unlinking succeeds always. This is the only function * allowed to delete or replace socket policy. 
*/ xfrm_sk_policy_unlink(old_pol, dir); } spin_unlock_bh(&net->xfrm.xfrm_policy_lock); if (old_pol) { xfrm_policy_kill(old_pol); } return 0; } static struct xfrm_policy *clone_policy(const struct xfrm_policy *old, int dir) { struct xfrm_policy *newp = xfrm_policy_alloc(xp_net(old), GFP_ATOMIC); struct net *net = xp_net(old); if (newp) { newp->selector = old->selector; if (security_xfrm_policy_clone(old->security, &newp->security)) { kfree(newp); return NULL; /* ENOMEM */ } newp->lft = old->lft; newp->curlft = old->curlft; newp->mark = old->mark; newp->if_id = old->if_id; newp->action = old->action; newp->flags = old->flags; newp->xfrm_nr = old->xfrm_nr; newp->index = old->index; newp->type = old->type; newp->family = old->family; memcpy(newp->xfrm_vec, old->xfrm_vec, newp->xfrm_nr*sizeof(struct xfrm_tmpl)); spin_lock_bh(&net->xfrm.xfrm_policy_lock); xfrm_sk_policy_link(newp, dir); spin_unlock_bh(&net->xfrm.xfrm_policy_lock); xfrm_pol_put(newp); } return newp; } int __xfrm_sk_clone_policy(struct sock *sk, const struct sock *osk) { const struct xfrm_policy *p; struct xfrm_policy *np; int i, ret = 0; rcu_read_lock(); for (i = 0; i < 2; i++) { p = rcu_dereference(osk->sk_policy[i]); if (p) { np = clone_policy(p, i); if (unlikely(!np)) { ret = -ENOMEM; break; } rcu_assign_pointer(sk->sk_policy[i], np); } } rcu_read_unlock(); return ret; } static int xfrm_get_saddr(unsigned short family, xfrm_address_t *saddr, const struct xfrm_dst_lookup_params *params) { int err; const struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family); if (unlikely(afinfo == NULL)) return -EINVAL; err = afinfo->get_saddr(saddr, params); rcu_read_unlock(); return err; } /* Resolve list of templates for the flow, given policy. 
*/

/*
 * For every template of @policy a state is requested from
 * xfrm_state_find().  Templates in tunnel, IPTFS or BEET mode use the
 * addresses stored in the template; when the template local address is
 * unspecified (xfrm_addr_any()), xfrm_get_saddr() chooses one.
 *
 * States in XFRM_STATE_VALID are stored in @xfrm, and the addresses
 * used for them become the flow addresses for the following templates.
 * A missing or unusable state is tolerated only for optional templates.
 *
 * Returns the number of states stored in @xfrm, or a negative error
 * after putting every state stored so far:
 *   -EINVAL  a state has a direction other than XFRM_SA_DIR_OUT, or is
 *            in XFRM_STATE_ERROR
 *   -EAGAIN  a state exists but is not valid, or the lookup said -ESRCH
 *   other    whatever xfrm_get_saddr() or xfrm_state_find() reported
 */
static int
xfrm_tmpl_resolve_one(struct xfrm_policy *policy, const struct flowi *fl,
		      struct xfrm_state **xfrm, unsigned short family)
{
	struct net *net = xp_net(policy);
	int nx;
	int i, error;
	xfrm_address_t *daddr = xfrm_flowi_daddr(fl, family);
	xfrm_address_t *saddr = xfrm_flowi_saddr(fl, family);
	xfrm_address_t tmp;

	for (nx = 0, i = 0; i < policy->xfrm_nr; i++) {
		struct xfrm_state *x;
		xfrm_address_t *remote = daddr;
		xfrm_address_t *local  = saddr;
		struct xfrm_tmpl *tmpl = &policy->xfrm_vec[i];

		if (tmpl->mode == XFRM_MODE_TUNNEL ||
		    tmpl->mode == XFRM_MODE_IPTFS ||
		    tmpl->mode == XFRM_MODE_BEET) {
			remote = &tmpl->id.daddr;
			local = &tmpl->saddr;
			if (xfrm_addr_any(local, tmpl->encap_family)) {
				struct xfrm_dst_lookup_params params;

				memset(&params, 0, sizeof(params));
				params.net = net;
				params.oif = fl->flowi_oif;
				params.daddr = remote;
				error = xfrm_get_saddr(tmpl->encap_family, &tmp,
						       &params);
				if (error)
					goto fail;
				local = &tmp;
			}
		}

		x = xfrm_state_find(remote, local, fl, tmpl, policy, &error,
				    family, policy->if_id);
		/* A state with a direction set must be an outbound one. */
		if (x && x->dir && x->dir != XFRM_SA_DIR_OUT) {
			XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTSTATEDIRERROR);
			xfrm_state_put(x);
			error = -EINVAL;
			goto fail;
		}

		if (x && x->km.state == XFRM_STATE_VALID) {
			xfrm[nx++] = x;
			/* Later templates are resolved relative to the
			 * addresses used by this state.
			 */
			daddr = remote;
			saddr = local;
			continue;
		}
		if (x) {
			error = (x->km.state == XFRM_STATE_ERROR ?
				 -EINVAL : -EAGAIN);
			xfrm_state_put(x);
		} else if (error == -ESRCH) {
			error = -EAGAIN;
		}

		if (!tmpl->optional)
			goto fail;
	}
	return nx;

fail:
	/* Drop the states collected before the failure. */
	for (nx--; nx >= 0; nx--)
		xfrm_state_put(xfrm[nx]);
	return error;
}

/*
 * Resolve the templates of all @npols policies into @xfrm.
 *
 * With a single policy the states are written straight into @xfrm.
 * With several policies they are gathered in the local array tp and
 * then placed into @xfrm by xfrm_state_sort().
 *
 * Returns the total number of states, -ENOBUFS when the running count
 * plus the template count of the next policy reaches XFRM_MAX_DEPTH, or
 * the error from xfrm_tmpl_resolve_one().  On error every state
 * gathered so far is put.
 */
static int
xfrm_tmpl_resolve(struct xfrm_policy **pols, int npols, const struct flowi *fl,
		  struct xfrm_state **xfrm, unsigned short family)
{
	struct xfrm_state *tp[XFRM_MAX_DEPTH];
	struct xfrm_state **tpp = (npols > 1) ? tp : xfrm;
	int cnx = 0;
	int error;
	int ret;
	int i;

	for (i = 0; i < npols; i++) {
		if (cnx + pols[i]->xfrm_nr >= XFRM_MAX_DEPTH) {
			error = -ENOBUFS;
			goto fail;
		}

		ret = xfrm_tmpl_resolve_one(pols[i], fl, &tpp[cnx], family);
		if (ret < 0) {
			error = ret;
			goto fail;
		} else
			cnx += ret;
	}

	/* found states are sorted for outbound processing */
	if (npols > 1)
		xfrm_state_sort(xfrm, tpp, cnx, family);

	return cnx;

 fail:
	for (cnx--; cnx >= 0; cnx--)
		xfrm_state_put(tpp[cnx]);
	return error;

}

/* DSCP of the flow: taken from the IPv4 flow key, 0 for other families. */
static dscp_t xfrm_get_dscp(const struct flowi *fl, int family)
{
	if (family == AF_INET)
		return fl->u.ip4.flowi4_dscp;

	return 0;
}

/*
 * Allocate an xfrm_dst from the per-netns dst_ops of @family and zero
 * everything located after its u.dst member.
 *
 * Returns the new entry, ERR_PTR(-EINVAL) when no policy afinfo exists
 * for @family, or ERR_PTR(-ENOBUFS) when dst_alloc() fails.  A family
 * other than AF_INET (or AF_INET6 with IPv6 enabled) hits BUG().
 *
 * The afinfo pointer is only used as an existence check; the
 * rcu_read_unlock() presumably pairs with a lock left held by
 * xfrm_policy_get_afinfo() on success (confirm against its definition).
 */
static inline struct xfrm_dst *xfrm_alloc_dst(struct net *net, int family)
{
	const struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family);
	struct dst_ops *dst_ops;
	struct xfrm_dst *xdst;

	if (!afinfo)
		return ERR_PTR(-EINVAL);

	switch (family) {
	case AF_INET:
		dst_ops = &net->xfrm.xfrm4_dst_ops;
		break;
#if IS_ENABLED(CONFIG_IPV6)
	case AF_INET6:
		dst_ops = &net->xfrm.xfrm6_dst_ops;
		break;
#endif
	default:
		BUG();
	}
	xdst = dst_alloc(dst_ops, NULL, DST_OBSOLETE_NONE, 0);

	if (likely(xdst)) {
		memset_after(xdst, 0, u.dst);
	} else
		xdst = ERR_PTR(-ENOBUFS);

	rcu_read_unlock();

	return xdst;
}

/*
 * For an IPv6 route @dst, record its route cookie and the non-fragment
 * header length in @path.  Nothing is done for other families.
 */
static void xfrm_init_path(struct xfrm_dst *path, struct dst_entry *dst,
			   int nfheader_len)
{
	if (dst->ops->family == AF_INET6) {
		path->path_cookie = rt6_get_cookie(dst_rt6_info(dst));
		path->u.rt6.rt6i_nfheader_len = nfheader_len;
	}
}

/*
 * Let the address family backend of @xdst fill in its family specific
 * fields for device @dev and flow @fl.
 *
 * Returns the result of afinfo->fill_dst(), or -EINVAL when no afinfo
 * exists for the family of @xdst.  The RCU unlock follows the same
 * pattern as in xfrm_alloc_dst().
 */
static inline int xfrm_fill_dst(struct xfrm_dst *xdst, struct net_device *dev,
				const struct flowi *fl)
{
	const struct xfrm_policy_afinfo *afinfo =
		xfrm_policy_get_afinfo(xdst->u.dst.ops->family);
	int err;

	if (!afinfo)
		return -EINVAL;

	err = afinfo->fill_dst(xdst, dev, fl);

	rcu_read_unlock();

	return err;
}


/* Allocate chain of dst_entry's, attach known xfrm's, calculate
 * all the metrics... Shortly, bundle a bundle.
*/

/*
 * @policy:  policy whose selector family seeds the lookups
 * @xfrm:    the @nx resolved states, outermost processing order
 * @bundle:  receives the @nx allocated xfrm_dst entries
 * @fl:      flow the bundle is built for
 * @dst:     original route the bundle is stacked on
 *
 * One xfrm_dst is allocated per state and chained to the previous one
 * with xfrm_dst_set_child().  For every state that is not in transport
 * mode a new route is looked up with xfrm_dst_lookup(); for transport
 * mode an extra reference on the current route is taken instead.  The
 * last route becomes the child of the last xfrm_dst and the path of the
 * first one.  A second pass lets xfrm_fill_dst() finish each entry and
 * assigns the remaining header and trailer lengths.
 *
 * Returns the dst of the first entry, or an ERR_PTR().  On an error
 * inside the first loop the states from index i on are put; states
 * already attached to an entry are presumably released together with
 * that entry (confirm in the dst destroy path).
 */
static struct dst_entry *xfrm_bundle_create(struct xfrm_policy *policy,
					    struct xfrm_state **xfrm,
					    struct xfrm_dst **bundle,
					    int nx,
					    const struct flowi *fl,
					    struct dst_entry *dst)
{
	const struct xfrm_state_afinfo *afinfo;
	const struct xfrm_mode *inner_mode;
	struct net *net = xp_net(policy);
	unsigned long now = jiffies;
	struct net_device *dev;
	struct xfrm_dst *xdst_prev = NULL;
	struct xfrm_dst *xdst0 = NULL;
	int i = 0;
	int err;
	int header_len = 0;
	int nfheader_len = 0;
	int trailer_len = 0;
	int family = policy->selector.family;
	xfrm_address_t saddr, daddr;
	dscp_t dscp;

	xfrm_flowi_addr_get(fl, &saddr, &daddr, family);

	dscp = xfrm_get_dscp(fl, family);

	dst_hold(dst);

	for (; i < nx; i++) {
		struct xfrm_dst *xdst = xfrm_alloc_dst(net, family);
		struct dst_entry *dst1 = &xdst->u.dst;

		err = PTR_ERR(xdst);
		if (IS_ERR(xdst)) {
			dst_release(dst);
			goto put_states;
		}

		bundle[i] = xdst;
		if (!xdst_prev)
			xdst0 = xdst;
		else
			/* Ref count is taken during xfrm_alloc_dst()
			 * No need to do dst_clone() on dst1
			 */
			xfrm_dst_set_child(xdst_prev, &xdst->u.dst);

		/* A state without a selector family derives its inner
		 * mode from the current family.
		 */
		if (xfrm[i]->sel.family == AF_UNSPEC) {
			inner_mode = xfrm_ip2inner_mode(xfrm[i],
							xfrm_af2proto(family));
			if (!inner_mode) {
				err = -EAFNOSUPPORT;
				dst_release(dst);
				goto put_states;
			}
		} else
			inner_mode = &xfrm[i]->inner_mode;

		xdst->route = dst;
		dst_copy_metrics(dst1, dst);

		if (xfrm[i]->props.mode != XFRM_MODE_TRANSPORT) {
			__u32 mark = 0;
			int oif;

			if (xfrm[i]->props.smark.v || xfrm[i]->props.smark.m)
				mark = xfrm_smark_get(fl->flowi_mark, xfrm[i]);

			if (xfrm[i]->xso.type != XFRM_DEV_OFFLOAD_PACKET)
				family = xfrm[i]->props.family;

			/* Use the l3mdev index when the flow has no oif. */
			oif = fl->flowi_oif ? : fl->flowi_l3mdev;
			dst = xfrm_dst_lookup(xfrm[i], dscp, oif, &saddr,
					      &daddr, family, mark);
			err = PTR_ERR(dst);
			if (IS_ERR(dst))
				goto put_states;
		} else
			dst_hold(dst);

		dst1->xfrm = xfrm[i];
		xdst->xfrm_genid = xfrm[i]->genid;

		dst1->obsolete = DST_OBSOLETE_FORCE_CHK;
		dst1->lastuse = now;

		dst1->input = dst_discard;

		/* Prefer the output handler of the mode callbacks, then the
		 * one of the inner family, else discard.
		 */
		if (xfrm[i]->mode_cbs && xfrm[i]->mode_cbs->output) {
			dst1->output = xfrm[i]->mode_cbs->output;
		} else {
			rcu_read_lock();
			afinfo = xfrm_state_afinfo_get_rcu(inner_mode->family);
			if (likely(afinfo))
				dst1->output = afinfo->output;
			else
				dst1->output = dst_discard_out;
			rcu_read_unlock();
		}

		xdst_prev = xdst;

		header_len += xfrm[i]->props.header_len;
		if (xfrm[i]->type->flags & XFRM_TYPE_NON_FRAGMENT)
			nfheader_len += xfrm[i]->props.header_len;
		trailer_len += xfrm[i]->props.trailer_len;
	}

	xfrm_dst_set_child(xdst_prev, dst);
	xdst0->path = dst;

	err = -ENODEV;
	dev = dst->dev;
	if (!dev)
		goto free_dst;

	xfrm_init_path(xdst0, dst, nfheader_len);
	xfrm_init_pmtu(bundle, nx);

	/* Walk the chain down to the final route; each entry gets the
	 * header and trailer room still needed from its level on.
	 */
	for (xdst_prev = xdst0; xdst_prev != (struct xfrm_dst *)dst;
	     xdst_prev = (struct xfrm_dst *) xfrm_dst_child(&xdst_prev->u.dst)) {
		err = xfrm_fill_dst(xdst_prev, dev, fl);
		if (err)
			goto free_dst;

		xdst_prev->u.dst.header_len = header_len;
		xdst_prev->u.dst.trailer_len = trailer_len;
		header_len -= xdst_prev->u.dst.xfrm->props.header_len;
		trailer_len -= xdst_prev->u.dst.xfrm->props.trailer_len;
	}

	return &xdst0->u.dst;

put_states:
	for (; i < nx; i++)
		xfrm_state_put(xfrm[i]);
free_dst:
	if (xdst0)
		dst_release_immediate(&xdst0->u.dst);

	return ERR_PTR(err);
}

/*
 * Turn the lookup result in pols[0] into a policy array plus counts.
 *
 * On entry *num_pols tells whether pols[0] is populated.  On return:
 *   *num_pols   number of valid entries in @pols (0 when there is no
 *               policy or pols[0] was an error)
 *   *num_xfrms  sum of the template counts, or -1 when one of the
 *               policies has an action other than XFRM_POLICY_ALLOW
 *
 * With CONFIG_XFRM_SUB_POLICY an allowing non-main pols[0] triggers an
 * extra main policy lookup whose result goes to pols[1].
 *
 * Returns 0, or the error encoded in pols[0] or pols[1]; in the latter
 * case the policies are put first.
 */
static int xfrm_expand_policies(const struct flowi *fl, u16 family,
				struct xfrm_policy **pols,
				int *num_pols, int *num_xfrms)
{
	int i;

	if (*num_pols == 0 || !pols[0]) {
		*num_pols = 0;
		*num_xfrms = 0;
		return 0;
	}
	if (IS_ERR(pols[0])) {
		*num_pols = 0;
		return PTR_ERR(pols[0]);
	}

	*num_xfrms = pols[0]->xfrm_nr;

#ifdef CONFIG_XFRM_SUB_POLICY
	if (pols[0]->action == XFRM_POLICY_ALLOW &&
	    pols[0]->type != XFRM_POLICY_TYPE_MAIN) {
		pols[1] = xfrm_policy_lookup_bytype(xp_net(pols[0]),
						    XFRM_POLICY_TYPE_MAIN,
						    fl, family,
						    XFRM_POLICY_OUT,
						    pols[0]->if_id);
		if (pols[1]) {
			if (IS_ERR(pols[1])) {
				xfrm_pols_put(pols, *num_pols);
				*num_pols = 0;
				return PTR_ERR(pols[1]);
			}
			(*num_pols)++;
			(*num_xfrms) += pols[1]->xfrm_nr;
		}
	}
#endif
	/* A single non-allowing policy blocks the flow as a whole. */
	for (i = 0; i < *num_pols; i++) {
		if (pols[i]->action != XFRM_POLICY_ALLOW) {
			*num_xfrms = -1;
			break;
		}
	}

	return 0;

}

/*
 * Resolve the templates of @pols into states and build a bundle on top
 * of @dst_orig.
 *
 * Returns the bundle with num_xfrms, num_pols, the policy array and the
 * generation id of pols[0] recorded in it, NULL when no state was
 * resolved (xfrm_tmpl_resolve() returned 0), or an ERR_PTR().  Errors
 * other than -EAGAIN from the resolve step and all bundle creation
 * errors bump a statistics counter.
 */
static struct xfrm_dst *
xfrm_resolve_and_create_bundle(struct xfrm_policy **pols, int num_pols,
			       const struct flowi *fl, u16 family,
			       struct dst_entry *dst_orig)
{
	struct net *net = xp_net(pols[0]);
	struct xfrm_state *xfrm[XFRM_MAX_DEPTH];
	struct xfrm_dst *bundle[XFRM_MAX_DEPTH];
	struct xfrm_dst *xdst;
	struct dst_entry *dst;
	int err;

	/* Try to instantiate a bundle */
	err = xfrm_tmpl_resolve(pols, num_pols, fl, xfrm, family);
	if (err <= 0) {
		if (err == 0)
			return NULL;

		if (err != -EAGAIN)
			XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTPOLERROR);
		return ERR_PTR(err);
	}

	/* From here on err holds the number of resolved states. */
	dst = xfrm_bundle_create(pols[0], xfrm, bundle, err, fl, dst_orig);
	if (IS_ERR(dst)) {
		XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTBUNDLEGENERROR);
		return ERR_CAST(dst);
	}

	xdst = (struct xfrm_dst *)dst;
	xdst->num_xfrms = err;
	xdst->num_pols = num_pols;
	memcpy(xdst->pols, pols, sizeof(struct xfrm_policy *) * num_pols);
	xdst->policy_genid = atomic_read(&pols[0]->genid);

	return xdst;
}

/*
 * Timer callback of a policy hold queue (pol->polq.hold_timer).
 *
 * The first queued packet is used to redo the lookup.  If the result is
 * an error the queue is purged.  If it is still a queueing bundle
 * (DST_XFRM_QUEUE) the timeout is doubled and the timer re-armed, or
 * the queue is purged once the timeout has reached XFRM_QUEUE_TMO_MAX.
 * Otherwise every queued packet gets its own lookup and is sent with
 * dst_output(); packets whose lookup fails are freed.
 *
 * Every exit path ends with xfrm_pol_put(pol), and a reference is taken
 * when mod_timer() activates a timer that was not pending.
 */
static void xfrm_policy_queue_process(struct timer_list *t)
{
	struct sk_buff *skb;
	struct sock *sk;
	struct dst_entry *dst;
	struct xfrm_policy *pol = timer_container_of(pol, t, polq.hold_timer);
	struct net *net = xp_net(pol);
	struct xfrm_policy_queue *pq = &pol->polq;
	struct flowi fl;
	struct sk_buff_head list;
	__u32 skb_mark;

	spin_lock(&pq->hold_queue.lock);
	skb = skb_peek(&pq->hold_queue);
	if (!skb) {
		spin_unlock(&pq->hold_queue.lock);
		goto out;
	}
	dst = skb_dst(skb);
	sk = skb->sk;

	/* Fixup the mark to support VTI: decode the session with the mark
	 * of the policy, then restore the mark of the packet.
	 */
	skb_mark = skb->mark;
	skb->mark = pol->mark.v;
	xfrm_decode_session(net, skb, &fl, dst->ops->family);
	skb->mark = skb_mark;
	spin_unlock(&pq->hold_queue.lock);

	dst_hold(xfrm_dst_path(dst));
	dst = xfrm_lookup(net, xfrm_dst_path(dst), &fl, sk, XFRM_LOOKUP_QUEUE);
	if (IS_ERR(dst))
		goto purge_queue;

	if (dst->flags & DST_XFRM_QUEUE) {
		dst_release(dst);

		if (pq->timeout >= XFRM_QUEUE_TMO_MAX)
			goto purge_queue;

		pq->timeout = pq->timeout << 1;
		if (!mod_timer(&pq->hold_timer, jiffies + pq->timeout))
			xfrm_pol_hold(pol);
		goto out;
	}

	dst_release(dst);

	/* Detach the whole queue under the lock, then transmit without
	 * holding it.
	 */
	__skb_queue_head_init(&list);

	spin_lock(&pq->hold_queue.lock);
	pq->timeout = 0;
	skb_queue_splice_init(&pq->hold_queue, &list);
	spin_unlock(&pq->hold_queue.lock);

	while (!skb_queue_empty(&list)) {
		skb = __skb_dequeue(&list);

		/* Fixup the mark to support VTI. */
		skb_mark = skb->mark;
		skb->mark = pol->mark.v;
		xfrm_decode_session(net, skb, &fl, skb_dst(skb)->ops->family);
		skb->mark = skb_mark;

		dst_hold(xfrm_dst_path(skb_dst(skb)));
		dst = xfrm_lookup(net, xfrm_dst_path(skb_dst(skb)), &fl, skb->sk, 0);
		if (IS_ERR(dst)) {
			kfree_skb(skb);
			continue;
		}

		nf_reset_ct(skb);
		skb_dst_drop(skb);
		skb_dst_set(skb, dst);

		dst_output(net, skb_to_full_sk(skb), skb);
	}

out:
	xfrm_pol_put(pol);
	return;

purge_queue:
	pq->timeout = 0;
	skb_queue_purge(&pq->hold_queue);
	xfrm_pol_put(pol);
}

/*
 * Output handler of a queueing bundle: park @skb on the hold queue of
 * the first policy of the bundle and make sure the hold timer runs.
 *
 * Returns 0 when the packet was queued, or dropped because
 * skb_fclone_busy() reported it busy, and -EAGAIN when it was dropped
 * because the queue already holds more than XFRM_MAX_QUEUE_LEN packets.
 */
static int xdst_queue_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	unsigned long sched_next;
	struct dst_entry *dst = skb_dst(skb);
	struct xfrm_dst *xdst = (struct xfrm_dst *) dst;
	struct xfrm_policy *pol = xdst->pols[0];
	struct xfrm_policy_queue *pq = &pol->polq;

	if (unlikely(skb_fclone_busy(sk, skb))) {
		kfree_skb(skb);
		return 0;
	}

	/* Queue length is checked before the queue lock is taken. */
	if (pq->hold_queue.qlen > XFRM_MAX_QUEUE_LEN) {
		kfree_skb(skb);
		return -EAGAIN;
	}

	skb_dst_force(skb);

	spin_lock_bh(&pq->hold_queue.lock);

	if (!pq->timeout)
		pq->timeout = XFRM_QUEUE_TMO_MIN;

	sched_next = jiffies + pq->timeout;

	/* A pending timer is cancelled and its reference dropped; an
	 * earlier expiry of that timer is kept as the new expiry.
	 */
	if (timer_delete(&pq->hold_timer)) {
		if (time_before(pq->hold_timer.expires, sched_next))
			sched_next = pq->hold_timer.expires;
		xfrm_pol_put(pol);
	}

	__skb_queue_tail(&pq->hold_queue, skb);
	if (!mod_timer(&pq->hold_timer, sched_next))
		xfrm_pol_hold(pol);

	spin_unlock_bh(&pq->hold_queue.lock);

	return 0;
}

/*
 * Create a bundle that carries policies but no transformations.
 *
 * A bare, zeroed xfrm_dst is returned as is unless all of these hold:
 * the caller asked for queueing (XFRM_LOOKUP_QUEUE), the larval drop
 * sysctl is off and @num_xfrms is positive.  In that case the entry is
 * set up as a queueing bundle on top of xflo->dst_orig with
 * xdst_queue_output() as output handler.
 *
 * Returns the bundle or an ERR_PTR() (allocation failure, -ENODEV when
 * the route has no device, or the error from xfrm_fill_dst()).
 */
static struct xfrm_dst *xfrm_create_dummy_bundle(struct net *net,
						 struct xfrm_flo *xflo,
						 const struct flowi *fl,
						 int num_xfrms,
						 u16 family)
{
	int err;
	struct net_device *dev;
	struct dst_entry *dst;
	struct dst_entry *dst1;
	struct xfrm_dst *xdst;

	xdst = xfrm_alloc_dst(net, family);
	if (IS_ERR(xdst))
		return xdst;

	if (!(xflo->flags & XFRM_LOOKUP_QUEUE) ||
	    net->xfrm.sysctl_larval_drop ||
	    num_xfrms <= 0)
		return xdst;

	dst = xflo->dst_orig;
	dst1 = &xdst->u.dst;
	/* One reference for xdst->route ... */
	dst_hold(dst);
	xdst->route = dst;

	dst_copy_metrics(dst1, dst);

	dst1->obsolete = DST_OBSOLETE_FORCE_CHK;
	dst1->flags |= DST_XFRM_QUEUE;
	dst1->lastuse = jiffies;

	dst1->input = dst_discard;
	dst1->output = xdst_queue_output;

	/* ... and a second one taken before dst becomes child and path. */
	dst_hold(dst);
	xfrm_dst_set_child(xdst, dst);
	xdst->path = dst;

	xfrm_init_path((struct xfrm_dst *)dst1, dst, 0);

	err = -ENODEV;
	dev = dst->dev;
	if (!dev)
		goto free_dst;

	err = xfrm_fill_dst(xdst, dev, fl);
	if (err)
		goto free_dst;

out:
	return xdst;

free_dst:
	dst_release(dst1);
	xdst = ERR_PTR(err);
	goto out;
}

/*
 * Look up the policies for @fl and produce a bundle for them.
 *
 * Returns NULL when no policy applies, or when bundle creation reports
 * -EREMOTE (the policies are put in that case); a real bundle when the
 * states could be resolved; a dummy bundle when the policy blocks, has
 * no transformations, or the states are not available yet (-EAGAIN);
 * or an ERR_PTR().  The returned bundle carries the policy array.
 */
static struct xfrm_dst *xfrm_bundle_lookup(struct net *net,
					   const struct flowi *fl,
					   u16 family, u8 dir,
					   struct xfrm_flo *xflo, u32 if_id)
{
	struct xfrm_policy *pols[XFRM_POLICY_TYPE_MAX];
	int num_pols = 0, num_xfrms = 0, err;
	struct xfrm_dst *xdst;

	/* Resolve policies to use if we couldn't get them from
	 * previous cache entry */
	num_pols = 1;
	pols[0] = xfrm_policy_lookup(net, fl, family, dir, if_id);
	err = xfrm_expand_policies(fl, family, pols,
					   &num_pols, &num_xfrms);
	if (err < 0)
		goto inc_error;
	if (num_pols == 0)
		return NULL;
	if (num_xfrms <= 0)
		goto make_dummy_bundle;

	xdst = xfrm_resolve_and_create_bundle(pols, num_pols, fl, family,
					      xflo->dst_orig);
	if (IS_ERR(xdst)) {
		err = PTR_ERR(xdst);
		if (err == -EREMOTE) {
			xfrm_pols_put(pols, num_pols);
			return NULL;
		}

		if (err != -EAGAIN)
			goto error;
		goto make_dummy_bundle;
	} else if (xdst == NULL) {
		num_xfrms = 0;
		goto make_dummy_bundle;
	}

	return xdst;

make_dummy_bundle:
	/* We found policies, but there are no bundles to instantiate:
	 * either because the policy blocks, has no transformations or
	 * we could not build template (no xfrm_states).*/
	xdst = xfrm_create_dummy_bundle(net, xflo, fl, num_xfrms, family);
	if (IS_ERR(xdst)) {
		xfrm_pols_put(pols, num_pols);
		return ERR_CAST(xdst);
	}
	xdst->num_pols = num_pols;
	xdst->num_xfrms = num_xfrms;
	memcpy(xdst->pols, pols, sizeof(struct xfrm_policy *) * num_pols);

	return xdst;

inc_error:
	XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTPOLERROR);
error:
	xfrm_pols_put(pols, num_pols);
	return ERR_PTR(err);
}

/*
 * Replace @dst_orig by the blackhole route of the address family
 * backend.  When no afinfo exists for @family, @dst_orig is released
 * and ERR_PTR(-EINVAL) is returned.  The RCU unlock presumably pairs
 * with a lock left held by xfrm_policy_get_afinfo() on success
 * (confirm against its definition).
 */
static struct dst_entry *make_blackhole(struct net *net, u16 family,
					struct dst_entry *dst_orig)
{
	const struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family);
	struct dst_entry *ret;

	if (!afinfo) {
		dst_release(dst_orig);
		return ERR_PTR(-EINVAL);
	} else {
		ret = afinfo->blackhole_route(net, dst_orig);
	}
	rcu_read_unlock();

	return ret;
}

/* Finds/creates a bundle for given flow and if_id
 *
 * At the moment we eat a raw IP route. Mostly to speed up lookups
 * on interfaces with disabled IPsec.
*
 * xfrm_lookup uses an if_id of 0 by default, and is provided for
 * compatibility
 */
/*
 * Outcome summary for xfrm_lookup_with_ifid(), as implemented below:
 *
 *   - a bundle dst when the flow has to be transformed; @dst_orig is
 *     released in that case
 *   - @dst_orig itself when no policy applies (and the default policy
 *     does not block) or the policy has no transformations
 *   - ERR_PTR(-EPERM)   the policy, or the blocking default, forbids it
 *   - ERR_PTR(-ENOENT)  XFRM_LOOKUP_ICMP was requested and no policy
 *                       carrying XFRM_POLICY_ICMP applies
 *   - ERR_PTR(-EREMOTE) or ERR_PTR(-EAGAIN) when the states are not
 *     available yet, depending on the larval drop sysctl
 *   - any error from the policy or bundle lookup
 *
 * On every error @dst_orig is released unless the caller passed
 * XFRM_LOOKUP_KEEP_DST_REF.
 *
 * The socket policy of @sk, when present, is tried before the global
 * policies.  drop_pols counts the policy references this function has
 * to put itself because they did not end up stored in a bundle.
 */
struct dst_entry *xfrm_lookup_with_ifid(struct net *net,
					struct dst_entry *dst_orig,
					const struct flowi *fl,
					const struct sock *sk,
					int flags, u32 if_id)
{
	struct xfrm_policy *pols[XFRM_POLICY_TYPE_MAX];
	struct xfrm_dst *xdst;
	struct dst_entry *dst, *route;
	u16 family = dst_orig->ops->family;
	u8 dir = XFRM_POLICY_OUT;
	int i, err, num_pols, num_xfrms = 0, drop_pols = 0;

	dst = NULL;
	xdst = NULL;
	route = NULL;

	sk = sk_const_to_full_sk(sk);
	if (sk && sk->sk_policy[XFRM_POLICY_OUT]) {
		num_pols = 1;
		pols[0] = xfrm_sk_policy_lookup(sk, XFRM_POLICY_OUT, fl, family,
						if_id);
		err = xfrm_expand_policies(fl, family, pols,
					   &num_pols, &num_xfrms);
		if (err < 0)
			goto dropdst;

		if (num_pols) {
			if (num_xfrms <= 0) {
				drop_pols = num_pols;
				goto no_transform;
			}

			xdst = xfrm_resolve_and_create_bundle(
					pols, num_pols, fl,
					family, dst_orig);

			if (IS_ERR(xdst)) {
				xfrm_pols_put(pols, num_pols);
				err = PTR_ERR(xdst);
				if (err == -EREMOTE)
					goto nopol;

				goto dropdst;
			} else if (xdst == NULL) {
				num_xfrms = 0;
				drop_pols = num_pols;
				goto no_transform;
			}

			route = xdst->route;
		}
	}

	/* No bundle from a socket policy: consult the global policies. */
	if (xdst == NULL) {
		struct xfrm_flo xflo;

		xflo.dst_orig = dst_orig;
		xflo.flags = flags;

		/* To accelerate a bit...  */
		if (!if_id && ((dst_orig->flags & DST_NOXFRM) ||
			       !net->xfrm.policy_count[XFRM_POLICY_OUT]))
			goto nopol;

		xdst = xfrm_bundle_lookup(net, fl, family, dir, &xflo, if_id);
		if (xdst == NULL)
			goto nopol;
		if (IS_ERR(xdst)) {
			err = PTR_ERR(xdst);
			goto dropdst;
		}

		num_pols = xdst->num_pols;
		num_xfrms = xdst->num_xfrms;
		memcpy(pols, xdst->pols, sizeof(struct xfrm_policy *) * num_pols);
		route = xdst->route;
	}

	dst = &xdst->u.dst;
	if (route == NULL && num_xfrms > 0) {
		/* The only case when xfrm_bundle_lookup() returns a
		 * bundle with null route, is when the template could
		 * not be resolved. It means policies are there, but
		 * bundle could not be created, since we don't yet
		 * have the xfrm_state's. We need to wait for KM to
		 * negotiate new SA's or bail out with error.*/
		if (net->xfrm.sysctl_larval_drop) {
			XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTNOSTATES);
			err = -EREMOTE;
			goto error;
		}

		err = -EAGAIN;

		XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTNOSTATES);
		goto error;
	}

no_transform:
	if (num_pols == 0)
		goto nopol;

	if ((flags & XFRM_LOOKUP_ICMP) &&
	    !(pols[0]->flags & XFRM_POLICY_ICMP)) {
		err = -ENOENT;
		goto error;
	}

	for (i = 0; i < num_pols; i++)
		WRITE_ONCE(pols[i]->curlft.use_time, ktime_get_real_seconds());

	if (num_xfrms < 0) {
		/* Prohibit the flow */
		XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTPOLBLOCK);
		err = -EPERM;
		goto error;
	} else if (num_xfrms > 0) {
		/* Flow transformed */
		dst_release(dst_orig);
	} else {
		/* Flow passes untransformed */
		dst_release(dst);
		dst = dst_orig;
	}
ok:
	xfrm_pols_put(pols, drop_pols);
	if (dst->xfrm &&
	    (dst->xfrm->props.mode == XFRM_MODE_TUNNEL ||
	     dst->xfrm->props.mode == XFRM_MODE_IPTFS))
		dst->flags |= DST_XFRM_TUNNEL;
	return dst;

nopol:
	/* The blocking default policy is not applied to routes whose
	 * device is a loopback device.
	 */
	if ((!dst_orig->dev || !(dst_orig->dev->flags & IFF_LOOPBACK)) &&
	    net->xfrm.policy_default[dir] == XFRM_USERPOLICY_BLOCK) {
		err = -EPERM;
		goto error;
	}
	if (!(flags & XFRM_LOOKUP_ICMP)) {
		dst = dst_orig;
		goto ok;
	}
	err = -ENOENT;
error:
	dst_release(dst);
dropdst:
	if (!(flags & XFRM_LOOKUP_KEEP_DST_REF))
		dst_release(dst_orig);
	xfrm_pols_put(pols, drop_pols);
	return ERR_PTR(err);
}
EXPORT_SYMBOL(xfrm_lookup_with_ifid);

/* Main function: finds/creates a bundle for given flow.
 *
 * At the moment we eat a raw IP route. Mostly to speed up lookups
 * on interfaces with disabled IPsec.
 *
 * This is xfrm_lookup_with_ifid() called with an if_id of 0.
 */
struct dst_entry *xfrm_lookup(struct net *net, struct dst_entry *dst_orig,
			      const struct flowi *fl, const struct sock *sk,
			      int flags)
{
	return xfrm_lookup_with_ifid(net, dst_orig, fl, sk, flags, 0);
}
EXPORT_SYMBOL(xfrm_lookup);

/* Callers of xfrm_lookup_route() must ensure a call to dst_output().
 * Otherwise we may send out blackholed packets.
*/ struct dst_entry *xfrm_lookup_route(struct net *net, struct dst_entry *dst_orig, const struct flowi *fl, const struct sock *sk, int flags) { struct dst_entry *dst = xfrm_lookup(net, dst_orig, fl, sk, flags | XFRM_LOOKUP_QUEUE | XFRM_LOOKUP_KEEP_DST_REF); if (PTR_ERR(dst) == -EREMOTE) return make_blackhole(net, dst_orig->ops->family, dst_orig); if (IS_ERR(dst)) dst_release(dst_orig); return dst; } EXPORT_SYMBOL(xfrm_lookup_route); static inline int xfrm_secpath_reject(int idx, struct sk_buff *skb, const struct flowi *fl) { struct sec_path *sp = skb_sec_path(skb); struct xfrm_state *x; if (!sp || idx < 0 || idx >= sp->len) return 0; x = sp->xvec[idx]; if (!x->type->reject) return 0; return x->type->reject(x, skb, fl); } /* When skb is transformed back to its "native" form, we have to * check policy restrictions. At the moment we make this in maximally * stupid way. Shame on me. :-) Of course, connected sockets must * have policy cached at them. */ static inline int xfrm_state_ok(const struct xfrm_tmpl *tmpl, const struct xfrm_state *x, unsigned short family, u32 if_id) { if (xfrm_state_kern(x)) return tmpl->optional && !xfrm_state_addr_cmp(tmpl, x, tmpl->encap_family); return x->id.proto == tmpl->id.proto && (x->id.spi == tmpl->id.spi || !tmpl->id.spi) && (x->props.reqid == tmpl->reqid || !tmpl->reqid) && x->props.mode == tmpl->mode && (tmpl->allalgs || (tmpl->aalgos & (1<<x->props.aalgo)) || !(xfrm_id_proto_match(tmpl->id.proto, IPSEC_PROTO_ANY))) && !(x->props.mode != XFRM_MODE_TRANSPORT && xfrm_state_addr_cmp(tmpl, x, family)) && (if_id == 0 || if_id == x->if_id); } /* * 0 or more than 0 is returned when validation is succeeded (either bypass * because of optional transport mode, or next index of the matched secpath * state with the template. * -1 is returned when no matching template is found. * Otherwise "-2 - errored_index" is returned. 
*/
/*
 * Search the secpath @sp, starting at index @start, for a state that
 * satisfies @tmpl.  An optional transport mode template is accepted
 * immediately without consuming a secpath entry.  Non-matching
 * transport mode entries are skipped; a non-matching entry of another
 * mode ends the search unless its index is below sp->verified_cnt.
 */
static inline int
xfrm_policy_ok(const struct xfrm_tmpl *tmpl, const struct sec_path *sp, int start,
	       unsigned short family, u32 if_id)
{
	int idx = start;

	if (tmpl->optional) {
		if (tmpl->mode == XFRM_MODE_TRANSPORT)
			return start;
	} else
		start = -1;
	for (; idx < sp->len; idx++) {
		if (xfrm_state_ok(tmpl, sp->xvec[idx], family, if_id))
			return ++idx;
		if (sp->xvec[idx]->props.mode != XFRM_MODE_TRANSPORT) {
			if (idx < sp->verified_cnt) {
				/* Secpath entry previously verified, consider optional and
				 * continue searching
				 */
				continue;
			}

			/* Only a mandatory template turns this into an
			 * error; for an optional one start is unchanged.
			 */
			if (start == -1)
				start = -2-idx;
			break;
		}
	}
	return start;
}

/*
 * Fill the IPv4 part of @fl from the dissected keys.  With @reverse the
 * source and destination addresses and ports are swapped.  The GRE key
 * or the ICMP type and code are copied for those protocols.
 */
static void
decode_session4(const struct xfrm_flow_keys *flkeys, struct flowi *fl, bool reverse)
{
	struct flowi4 *fl4 = &fl->u.ip4;

	memset(fl4, 0, sizeof(struct flowi4));

	if (reverse) {
		fl4->saddr = flkeys->addrs.ipv4.dst;
		fl4->daddr = flkeys->addrs.ipv4.src;
		fl4->fl4_sport = flkeys->ports.dst;
		fl4->fl4_dport = flkeys->ports.src;
	} else {
		fl4->saddr = flkeys->addrs.ipv4.src;
		fl4->daddr = flkeys->addrs.ipv4.dst;
		fl4->fl4_sport = flkeys->ports.src;
		fl4->fl4_dport = flkeys->ports.dst;
	}

	switch (flkeys->basic.ip_proto) {
	case IPPROTO_GRE:
		fl4->fl4_gre_key = flkeys->gre.keyid;
		break;
	case IPPROTO_ICMP:
		fl4->fl4_icmp_type = flkeys->icmp.type;
		fl4->fl4_icmp_code = flkeys->icmp.code;
		break;
	}

	fl4->flowi4_proto = flkeys->basic.ip_proto;
	fl4->flowi4_dscp = inet_dsfield_to_dscp(flkeys->ip.tos);
}

#if IS_ENABLED(CONFIG_IPV6)
/*
 * IPv6 counterpart of decode_session4().  No DSCP value is stored for
 * IPv6 flows here.
 */
static void
decode_session6(const struct xfrm_flow_keys *flkeys, struct flowi *fl, bool reverse)
{
	struct flowi6 *fl6 = &fl->u.ip6;

	memset(fl6, 0, sizeof(struct flowi6));

	if (reverse) {
		fl6->saddr = flkeys->addrs.ipv6.dst;
		fl6->daddr = flkeys->addrs.ipv6.src;
		fl6->fl6_sport = flkeys->ports.dst;
		fl6->fl6_dport = flkeys->ports.src;
	} else {
		fl6->saddr = flkeys->addrs.ipv6.src;
		fl6->daddr = flkeys->addrs.ipv6.dst;
		fl6->fl6_sport = flkeys->ports.src;
		fl6->fl6_dport = flkeys->ports.dst;
	}

	switch (flkeys->basic.ip_proto) {
	case IPPROTO_GRE:
		fl6->fl6_gre_key = flkeys->gre.keyid;
		break;
	case IPPROTO_ICMPV6:
		fl6->fl6_icmp_type = flkeys->icmp.type;
		fl6->fl6_icmp_code = flkeys->icmp.code;
		break;
	}

	fl6->flowi6_proto = flkeys->basic.ip_proto;
}
#endif

/*
 * Build the flow description @fl of @skb.
 *
 * The packet is dissected with the xfrm session dissector, stopping at
 * encapsulation, and the keys are converted by the decoder of @family.
 * The flow mark comes from the skb.  The flow oif is the input
 * interface of the skb when @reverse is set, else the ifindex of the
 * device of its dst (0 when there is none).
 *
 * Returns -EAFNOSUPPORT for an unsupported @family, otherwise the
 * result of security_xfrm_decode_session().
 */
int __xfrm_decode_session(struct net *net, struct sk_buff *skb, struct flowi *fl,
			  unsigned int family, int reverse)
{
	struct xfrm_flow_keys flkeys;

	memset(&flkeys, 0, sizeof(flkeys));
	__skb_flow_dissect(net, skb, &xfrm_session_dissector, &flkeys,
			   NULL, 0, 0, 0, FLOW_DISSECTOR_F_STOP_AT_ENCAP);

	switch (family) {
	case AF_INET:
		decode_session4(&flkeys, fl, reverse);
		break;
#if IS_ENABLED(CONFIG_IPV6)
	case AF_INET6:
		decode_session6(&flkeys, fl, reverse);
		break;
#endif
	default:
		return -EAFNOSUPPORT;
	}

	fl->flowi_mark = skb->mark;
	if (reverse) {
		fl->flowi_oif = skb->skb_iif;
	} else {
		int oif = 0;

		if (skb_dst(skb) && skb_dst(skb)->dev)
			oif = skb_dst(skb)->dev->ifindex;

		fl->flowi_oif = oif;
	}

	return security_xfrm_decode_session(skb, &fl->flowi_secid);
}
EXPORT_SYMBOL(__xfrm_decode_session);

/*
 * Report whether @sp contains a state that is not in transport mode at
 * index @k or later.  Returns 1 and stores the index of the first such
 * state in *@idxp, or returns 0 and leaves *@idxp untouched.
 */
static inline int secpath_has_nontransport(const struct sec_path *sp, int k, int *idxp)
{
	for (; k < sp->len; k++) {
		if (sp->xvec[k]->props.mode != XFRM_MODE_TRANSPORT) {
			*idxp = k;
			return 1;
		}
	}

	return 0;
}

/*
 * True when @fl describes one of the ICMP error types handled here:
 * destination unreachable or time exceeded for IPv4; destination
 * unreachable, packet too big or time exceeded for IPv6.
 */
static bool icmp_err_packet(const struct flowi *fl, unsigned short family)
{
	const struct flowi4 *fl4 = &fl->u.ip4;

	if (family == AF_INET &&
	    fl4->flowi4_proto == IPPROTO_ICMP &&
	    (fl4->fl4_icmp_type == ICMP_DEST_UNREACH ||
	     fl4->fl4_icmp_type == ICMP_TIME_EXCEEDED))
		return true;

#if IS_ENABLED(CONFIG_IPV6)
	if (family == AF_INET6) {
		const struct flowi6 *fl6 = &fl->u.ip6;

		if (fl6->flowi6_proto == IPPROTO_ICMPV6 &&
		    (fl6->fl6_icmp_type == ICMPV6_DEST_UNREACH ||
		    fl6->fl6_icmp_type == ICMPV6_PKT_TOOBIG ||
		    fl6->fl6_icmp_type == ICMPV6_TIME_EXCEED))
			return true;
	}
#endif
	return false;
}

/*
 * Decode the flow of the packet embedded in an ICMP error into @fl1.
 *
 * A clone of @skb is advanced past the outer IP and ICMP headers and
 * decoded in reverse direction; oif, mark and dscp are then taken over
 * from the outer flow @fl.
 *
 * Note the inverted result: true means the decode FAILED (clone, pull
 * or session decode), false means @fl1 is valid.
 */
static bool xfrm_icmp_flow_decode(struct sk_buff *skb, unsigned short family,
				  const struct flowi *fl, struct flowi *fl1)
{
	bool ret = true;
	struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
	int hl = family == AF_INET ? (sizeof(struct iphdr) +  sizeof(struct icmphdr)) :
		 (sizeof(struct ipv6hdr) + sizeof(struct icmp6hdr));

	if (!newskb)
		return true;

	if (!pskb_pull(newskb, hl))
		goto out;

	skb_reset_network_header(newskb);

	if (xfrm_decode_session_reverse(dev_net(skb->dev), newskb, fl1, family) < 0)
		goto out;

	fl1->flowi_oif = fl->flowi_oif;
	fl1->flowi_mark = fl->flowi_mark;
	fl1->flowi_dscp = fl->flowi_dscp;
	nf_nat_decode_session(newskb, fl1, family);
	ret = false;

out:
	consume_skb(newskb);
	return ret;
}

/*
 * For an ICMP error packet, match selector @sel against the flow of the
 * embedded packet.  Returns false for other packets or when the inner
 * flow cannot be decoded.
 */
static bool xfrm_selector_inner_icmp_match(struct sk_buff *skb, unsigned short family,
					   const struct xfrm_selector *sel,
					   const struct flowi *fl)
{
	bool ret = false;

	if (icmp_err_packet(fl, family)) {
		struct flowi fl1;

		if (xfrm_icmp_flow_decode(skb, family, fl, &fl1))
			return ret;

		ret = xfrm_selector_match(sel, &fl1, family);
	}

	return ret;
}

/*
 * For an ICMP error packet, look up the forward policy of the embedded
 * flow.  Returns NULL for other packets, on decode failure, when there
 * is no policy, and also when the lookup returns an error.
 */
static inline struct
xfrm_policy *xfrm_in_fwd_icmp(struct sk_buff *skb,
			      const struct flowi *fl, unsigned short family,
			      u32 if_id)
{
	struct xfrm_policy *pol = NULL;

	if (icmp_err_packet(fl, family)) {
		struct flowi fl1;
		struct net *net = dev_net(skb->dev);

		if (xfrm_icmp_flow_decode(skb, family, fl, &fl1))
			return pol;

		pol = xfrm_policy_lookup(net, &fl1, family, XFRM_POLICY_FWD, if_id);
		if (IS_ERR(pol))
			pol = NULL;
	}

	return pol;
}

/*
 * For an ICMP error packet, redo the output lookup with the flow of the
 * embedded packet.  The new dst replaces @dst only when it carries a
 * transformation; in every other case @dst is returned.
 */
static inline struct
dst_entry *xfrm_out_fwd_icmp(struct sk_buff *skb, struct flowi *fl,
			     unsigned short family, struct dst_entry *dst)
{
	if (icmp_err_packet(fl, family)) {
		struct net *net = dev_net(skb->dev);
		struct dst_entry *dst2;
		struct flowi fl1;

		if (xfrm_icmp_flow_decode(skb, family, fl, &fl1))
			return dst;

		/* Extra reference on dst for xfrm_lookup(), which is
		 * called without XFRM_LOOKUP_KEEP_DST_REF.
		 */
		dst_hold(dst);

		dst2 = xfrm_lookup(net, dst, &fl1, NULL, (XFRM_LOOKUP_QUEUE | XFRM_LOOKUP_ICMP));

		if (IS_ERR(dst2))
			return dst;

		if (dst2->xfrm) {
			dst_release(dst);
			dst = dst2;
		} else {
			dst_release(dst2);
		}
	}

	return dst;
}

/*
 * Inbound and forward policy check for @skb.
 *
 * @dir carries the policy direction in the bits of XFRM_POLICY_MASK;
 * any remaining bits request reverse session decoding.
 *
 * Steps:
 *   1. an xfrm interface callback may supply if_id and netns
 *   2. the flow is decoded; failure rejects the packet
 *   3. every state of the secpath must match the flow by selector (an
 *      XFRM_STATE_ICMP state may match the embedded flow instead)
 *   4. the policy is taken from the socket, then from the global
 *      tables, then (forward direction) from the embedded ICMP flow
 *   5. without a policy the packet passes unless the default policy
 *      blocks or the secpath holds a non transport state
 *   6. with an allowing policy the templates are checked against the
 *      secpath by xfrm_policy_ok()
 *
 * Returns 1 when the packet is accepted and 0 when it is rejected.
 */
int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb,
			unsigned short family)
{
	struct net *net = dev_net(skb->dev);
	struct xfrm_policy *pol;
	struct xfrm_policy *pols[XFRM_POLICY_TYPE_MAX];
	int npols = 0;
	int xfrm_nr;
	int pi;
	int reverse;
	struct flowi fl;
	int xerr_idx = -1;
	const struct xfrm_if_cb *ifcb;
	struct sec_path *sp;
	u32 if_id = 0;

	rcu_read_lock();
	ifcb = xfrm_if_get_cb();

	if (ifcb) {
		struct xfrm_if_decode_session_result r;

		if (ifcb->decode_session(skb, family, &r)) {
			if_id = r.if_id;
			net = r.net;
		}
	}
	rcu_read_unlock();

	reverse = dir & ~XFRM_POLICY_MASK;
	dir &= XFRM_POLICY_MASK;

	if (__xfrm_decode_session(net, skb, &fl, family, reverse) < 0) {
		XFRM_INC_STATS(net, LINUX_MIB_XFRMINHDRERROR);
		return 0;
	}

	nf_nat_decode_session(skb, &fl, family);

	/* First, check used SA against their selectors. */
	sp = skb_sec_path(skb);
	if (sp) {
		int i;

		for (i = sp->len - 1; i >= 0; i--) {
			struct xfrm_state *x = sp->xvec[i];
			int ret = 0;

			if (!xfrm_selector_match(&x->sel, &fl, family)) {
				ret = 1;
				if (x->props.flags & XFRM_STATE_ICMP &&
				    xfrm_selector_inner_icmp_match(skb, family, &x->sel, &fl))
					ret = 0;
				if (ret) {
					XFRM_INC_STATS(net, LINUX_MIB_XFRMINSTATEMISMATCH);
					return 0;
				}
			}
		}
	}

	pol = NULL;
	sk = sk_to_full_sk(sk);
	if (sk && sk->sk_policy[dir]) {
		pol = xfrm_sk_policy_lookup(sk, dir, &fl, family, if_id);
		if (IS_ERR(pol)) {
			XFRM_INC_STATS(net, LINUX_MIB_XFRMINPOLERROR);
			return 0;
		}
	}

	if (!pol)
		pol = xfrm_policy_lookup(net, &fl, family, dir, if_id);

	if (IS_ERR(pol)) {
		XFRM_INC_STATS(net, LINUX_MIB_XFRMINPOLERROR);
		return 0;
	}

	if (!pol && dir == XFRM_POLICY_FWD)
		pol = xfrm_in_fwd_icmp(skb, &fl, family, if_id);

	if (!pol) {
		const bool is_crypto_offload = sp &&
			(xfrm_input_state(skb)->xso.type == XFRM_DEV_OFFLOAD_CRYPTO);

		if (net->xfrm.policy_default[dir] == XFRM_USERPOLICY_BLOCK) {
			XFRM_INC_STATS(net, LINUX_MIB_XFRMINNOPOLS);
			return 0;
		}

		/* No policy, yet the packet went through a non transport
		 * state: reject, except for crypto offload.
		 */
		if (sp && secpath_has_nontransport(sp, 0, &xerr_idx) && !is_crypto_offload) {
			xfrm_secpath_reject(xerr_idx, skb, &fl);
			XFRM_INC_STATS(net, LINUX_MIB_XFRMINNOPOLS);
			return 0;
		}
		return 1;
	}

	/* This lockless write can happen from different cpus. */
	WRITE_ONCE(pol->curlft.use_time, ktime_get_real_seconds());

	pols[0] = pol;
	npols++;
#ifdef CONFIG_XFRM_SUB_POLICY
	if (pols[0]->type != XFRM_POLICY_TYPE_MAIN) {
		pols[1] = xfrm_policy_lookup_bytype(net, XFRM_POLICY_TYPE_MAIN,
						    &fl, family,
						    XFRM_POLICY_IN, if_id);
		if (pols[1]) {
			if (IS_ERR(pols[1])) {
				XFRM_INC_STATS(net, LINUX_MIB_XFRMINPOLERROR);
				xfrm_pol_put(pols[0]);
				return 0;
			}
			/* This write can happen from different cpus. */
			WRITE_ONCE(pols[1]->curlft.use_time,
				   ktime_get_real_seconds());
			npols++;
		}
	}
#endif

	if (pol->action == XFRM_POLICY_ALLOW) {
		/* Empty secpath used when the packet has none. */
		static struct sec_path dummy;
		struct xfrm_tmpl *tp[XFRM_MAX_DEPTH];
		struct xfrm_tmpl *stp[XFRM_MAX_DEPTH];
		struct xfrm_tmpl **tpp = tp;
		int i, k = 0;
		int ti = 0;

		sp = skb_sec_path(skb);
		if (!sp)
			sp = &dummy;

		/* Collect the templates of all policies in tpp. */
		for (pi = 0; pi < npols; pi++) {
			if (pols[pi] != pol &&
			    pols[pi]->action != XFRM_POLICY_ALLOW) {
				XFRM_INC_STATS(net, LINUX_MIB_XFRMINPOLBLOCK);
				goto reject;
			}
			if (ti + pols[pi]->xfrm_nr >= XFRM_MAX_DEPTH) {
				XFRM_INC_STATS(net, LINUX_MIB_XFRMINBUFFERERROR);
				goto reject_error;
			}
			for (i = 0; i < pols[pi]->xfrm_nr; i++)
				tpp[ti++] = &pols[pi]->xfrm_vec[i];
		}
		xfrm_nr = ti;

		if (npols > 1) {
			xfrm_tmpl_sort(stp, tpp, xfrm_nr, family);
			tpp = stp;
		}

		if (pol->xdo.type == XFRM_DEV_OFFLOAD_PACKET && sp == &dummy)
			/* This policy template was already checked by HW
			 * and secpath was removed in __xfrm_policy_check2.
			 */
			goto out;

		/* For each tunnel xfrm, find the first matching tmpl.
		 * For each tmpl before that, find corresponding xfrm.
		 * Order is _important_. Later we will implement
		 * some barriers, but at the moment barriers
		 * are implied between each two transformations.
		 * Upon success, marks secpath entries as having been
		 * verified to allow them to be skipped in future policy
		 * checks (e.g. nested tunnels).
		 */
		for (i = xfrm_nr - 1; i >= 0; i--) {
			k = xfrm_policy_ok(tpp[i], sp, k, family, if_id);
			if (k < 0) {
				if (k < -1)
					/* "-2 - errored_index" returned */
					xerr_idx = -(2+k);
				XFRM_INC_STATS(net, LINUX_MIB_XFRMINTMPLMISMATCH);
				goto reject;
			}
		}

		/* Secpath entries left over after the last template must
		 * all be transport mode states.
		 */
		if (secpath_has_nontransport(sp, k, &xerr_idx)) {
			XFRM_INC_STATS(net, LINUX_MIB_XFRMINTMPLMISMATCH);
			goto reject;
		}

out:
		xfrm_pols_put(pols, npols);
		sp->verified_cnt = k;

		return 1;
	}
	XFRM_INC_STATS(net, LINUX_MIB_XFRMINPOLBLOCK);

reject:
	xfrm_secpath_reject(xerr_idx, skb, &fl);
reject_error:
	xfrm_pols_put(pols, npols);
	return 0;
}
EXPORT_SYMBOL(__xfrm_policy_check);

/*
 * Attach the xfrm output route to a forwarded packet.
 *
 * The flow is decoded, the current dst is taken away from @skb and
 * passed to xfrm_lookup(); the result (NULL on error) is set as the new
 * dst.  A result without transformation is offered to
 * xfrm_out_fwd_icmp() first.
 *
 * Returns 1 on success and 0 when the flow cannot be decoded, the skb
 * has no dst, or the lookup fails.
 */
int __xfrm_route_forward(struct sk_buff *skb, unsigned short family)
{
	struct net *net = dev_net(skb->dev);
	struct flowi fl;
	struct dst_entry *dst;
	int res = 1;

	if (xfrm_decode_session(net, skb, &fl, family) < 0) {
		XFRM_INC_STATS(net, LINUX_MIB_XFRMFWDHDRERROR);
		return 0;
	}

	skb_dst_force(skb);
	dst = skb_dst(skb);
	if (!dst) {
		XFRM_INC_STATS(net, LINUX_MIB_XFRMFWDHDRERROR);
		return 0;
	}

	/* ignore return value from skb_dstref_steal, xfrm_lookup takes
	 * care of dropping the refcnt if needed.
	 */
	skb_dstref_steal(skb);

	dst = xfrm_lookup(net, dst, &fl, NULL, XFRM_LOOKUP_QUEUE);
	if (IS_ERR(dst)) {
		res = 0;
		dst = NULL;
	}

	if (dst && !dst->xfrm)
		dst = xfrm_out_fwd_icmp(skb, &fl, family, dst);

	skb_dst_set(skb, dst);
	return res;
}
EXPORT_SYMBOL(__xfrm_route_forward);

/* Optimize later using cookies and generation ids. */

static struct dst_entry *xfrm_dst_check(struct dst_entry *dst, u32 cookie)
{
	/* Code (such as xfrm_bundle_create()) sets dst->obsolete
	 * to DST_OBSOLETE_FORCE_CHK to force all XFRM destinations to
	 * get validated by dst_ops->check on every use. We do this
	 * because when a normal route referenced by an XFRM dst is
	 * obsoleted we do not go looking around for all parent
	 * referencing XFRM dsts so that we can invalidate them. It
	 * is just too much work. Instead we make the checks here on
	 * every use.
For example: * * XFRM dst A --> IPv4 dst X * * X is the "xdst->route" of A (X is also the "dst->path" of A * in this example). If X is marked obsolete, "A" will not * notice. That's what we are validating here via the * stale_bundle() check. * * When a dst is removed from the fib tree, DST_OBSOLETE_DEAD will * be marked on it. * This will force stale_bundle() to fail on any xdst bundle with * this dst linked in it. */ if (READ_ONCE(dst->obsolete) < 0 && !stale_bundle(dst)) return dst; return NULL; } static int stale_bundle(struct dst_entry *dst) { return !xfrm_bundle_ok((struct xfrm_dst *)dst); } void xfrm_dst_ifdown(struct dst_entry *dst, struct net_device *dev) { while ((dst = xfrm_dst_child(dst)) && dst->xfrm && dst->dev == dev) { dst->dev = blackhole_netdev; dev_hold(dst->dev); dev_put(dev); } } EXPORT_SYMBOL(xfrm_dst_ifdown); static void xfrm_link_failure(struct sk_buff *skb) { /* Impossible. Such dst must be popped before reaches point of failure. */ } static void xfrm_negative_advice(struct sock *sk, struct dst_entry *dst) { if (READ_ONCE(dst->obsolete)) sk_dst_reset(sk); } static void xfrm_init_pmtu(struct xfrm_dst **bundle, int nr) { while (nr--) { struct xfrm_dst *xdst = bundle[nr]; u32 pmtu, route_mtu_cached; struct dst_entry *dst; dst = &xdst->u.dst; pmtu = dst_mtu(xfrm_dst_child(dst)); xdst->child_mtu_cached = pmtu; pmtu = xfrm_state_mtu(dst->xfrm, pmtu); route_mtu_cached = dst_mtu(xdst->route); xdst->route_mtu_cached = route_mtu_cached; if (pmtu > route_mtu_cached) pmtu = route_mtu_cached; dst_metric_set(dst, RTAX_MTU, pmtu); } } /* Check that the bundle accepts the flow and its components are * still valid. 
*/ static int xfrm_bundle_ok(struct xfrm_dst *first) { struct xfrm_dst *bundle[XFRM_MAX_DEPTH]; struct dst_entry *dst = &first->u.dst; struct xfrm_dst *xdst; int start_from, nr; u32 mtu; if (!dst_check(xfrm_dst_path(dst), ((struct xfrm_dst *)dst)->path_cookie) || (dst->dev && !netif_running(dst->dev))) return 0; if (dst->flags & DST_XFRM_QUEUE) return 1; start_from = nr = 0; do { struct xfrm_dst *xdst = (struct xfrm_dst *)dst; if (dst->xfrm->km.state != XFRM_STATE_VALID) return 0; if (xdst->xfrm_genid != dst->xfrm->genid) return 0; if (xdst->num_pols > 0 && xdst->policy_genid != atomic_read(&xdst->pols[0]->genid)) return 0; bundle[nr++] = xdst; mtu = dst_mtu(xfrm_dst_child(dst)); if (xdst->child_mtu_cached != mtu) { start_from = nr; xdst->child_mtu_cached = mtu; } if (!dst_check(xdst->route, xdst->route_cookie)) return 0; mtu = dst_mtu(xdst->route); if (xdst->route_mtu_cached != mtu) { start_from = nr; xdst->route_mtu_cached = mtu; } dst = xfrm_dst_child(dst); } while (dst->xfrm); if (likely(!start_from)) return 1; xdst = bundle[start_from - 1]; mtu = xdst->child_mtu_cached; while (start_from--) { dst = &xdst->u.dst; mtu = xfrm_state_mtu(dst->xfrm, mtu); if (mtu > xdst->route_mtu_cached) mtu = xdst->route_mtu_cached; dst_metric_set(dst, RTAX_MTU, mtu); if (!start_from) break; xdst = bundle[start_from - 1]; xdst->child_mtu_cached = mtu; } return 1; } static unsigned int xfrm_default_advmss(const struct dst_entry *dst) { return dst_metric_advmss(xfrm_dst_path(dst)); } static unsigned int xfrm_mtu(const struct dst_entry *dst) { unsigned int mtu = dst_metric_raw(dst, RTAX_MTU); return mtu ? 
: dst_mtu(xfrm_dst_path(dst)); } static const void *xfrm_get_dst_nexthop(const struct dst_entry *dst, const void *daddr) { while (dst->xfrm) { const struct xfrm_state *xfrm = dst->xfrm; dst = xfrm_dst_child(dst); if (xfrm->props.mode == XFRM_MODE_TRANSPORT) continue; if (xfrm->type->flags & XFRM_TYPE_REMOTE_COADDR) daddr = xfrm->coaddr; else if (!(xfrm->type->flags & XFRM_TYPE_LOCAL_COADDR)) daddr = &xfrm->id.daddr; } return daddr; } static struct neighbour *xfrm_neigh_lookup(const struct dst_entry *dst, struct sk_buff *skb, const void *daddr) { const struct dst_entry *path = xfrm_dst_path(dst); if (!skb) daddr = xfrm_get_dst_nexthop(dst, daddr); return path->ops->neigh_lookup(path, skb, daddr); } static void xfrm_confirm_neigh(const struct dst_entry *dst, const void *daddr) { const struct dst_entry *path = xfrm_dst_path(dst); daddr = xfrm_get_dst_nexthop(dst, daddr); path->ops->confirm_neigh(path, daddr); } int xfrm_policy_register_afinfo(const struct xfrm_policy_afinfo *afinfo, int family) { int err = 0; if (WARN_ON(family >= ARRAY_SIZE(xfrm_policy_afinfo))) return -EAFNOSUPPORT; spin_lock(&xfrm_policy_afinfo_lock); if (unlikely(xfrm_policy_afinfo[family] != NULL)) err = -EEXIST; else { struct dst_ops *dst_ops = afinfo->dst_ops; if (likely(dst_ops->kmem_cachep == NULL)) dst_ops->kmem_cachep = xfrm_dst_cache; if (likely(dst_ops->check == NULL)) dst_ops->check = xfrm_dst_check; if (likely(dst_ops->default_advmss == NULL)) dst_ops->default_advmss = xfrm_default_advmss; if (likely(dst_ops->mtu == NULL)) dst_ops->mtu = xfrm_mtu; if (likely(dst_ops->negative_advice == NULL)) dst_ops->negative_advice = xfrm_negative_advice; if (likely(dst_ops->link_failure == NULL)) dst_ops->link_failure = xfrm_link_failure; if (likely(dst_ops->neigh_lookup == NULL)) dst_ops->neigh_lookup = xfrm_neigh_lookup; if (likely(!dst_ops->confirm_neigh)) dst_ops->confirm_neigh = xfrm_confirm_neigh; rcu_assign_pointer(xfrm_policy_afinfo[family], afinfo); } spin_unlock(&xfrm_policy_afinfo_lock); 
return err; } EXPORT_SYMBOL(xfrm_policy_register_afinfo); void xfrm_policy_unregister_afinfo(const struct xfrm_policy_afinfo *afinfo) { struct dst_ops *dst_ops = afinfo->dst_ops; int i; for (i = 0; i < ARRAY_SIZE(xfrm_policy_afinfo); i++) { if (rcu_access_pointer(xfrm_policy_afinfo[i]) != afinfo) continue; RCU_INIT_POINTER(xfrm_policy_afinfo[i], NULL); break; } synchronize_rcu(); dst_ops->kmem_cachep = NULL; dst_ops->check = NULL; dst_ops->negative_advice = NULL; dst_ops->link_failure = NULL; } EXPORT_SYMBOL(xfrm_policy_unregister_afinfo); void xfrm_if_register_cb(const struct xfrm_if_cb *ifcb) { spin_lock(&xfrm_if_cb_lock); rcu_assign_pointer(xfrm_if_cb, ifcb); spin_unlock(&xfrm_if_cb_lock); } EXPORT_SYMBOL(xfrm_if_register_cb); void xfrm_if_unregister_cb(void) { RCU_INIT_POINTER(xfrm_if_cb, NULL); synchronize_rcu(); } EXPORT_SYMBOL(xfrm_if_unregister_cb); #ifdef CONFIG_XFRM_STATISTICS static int __net_init xfrm_statistics_init(struct net *net) { int rv; net->mib.xfrm_statistics = alloc_percpu(struct linux_xfrm_mib); if (!net->mib.xfrm_statistics) return -ENOMEM; rv = xfrm_proc_init(net); if (rv < 0) free_percpu(net->mib.xfrm_statistics); return rv; } static void xfrm_statistics_fini(struct net *net) { xfrm_proc_fini(net); free_percpu(net->mib.xfrm_statistics); } #else static int __net_init xfrm_statistics_init(struct net *net) { return 0; } static void xfrm_statistics_fini(struct net *net) { } #endif static int __net_init xfrm_policy_init(struct net *net) { unsigned int hmask, sz; int dir, err; if (net_eq(net, &init_net)) { xfrm_dst_cache = KMEM_CACHE(xfrm_dst, SLAB_HWCACHE_ALIGN | SLAB_PANIC); err = rhashtable_init(&xfrm_policy_inexact_table, &xfrm_pol_inexact_params); BUG_ON(err); } hmask = 8 - 1; sz = (hmask+1) * sizeof(struct hlist_head); net->xfrm.policy_byidx = xfrm_hash_alloc(sz); if (!net->xfrm.policy_byidx) goto out_byidx; net->xfrm.policy_idx_hmask = hmask; for (dir = 0; dir < XFRM_POLICY_MAX; dir++) { struct xfrm_policy_hash *htab; 
net->xfrm.policy_count[dir] = 0; net->xfrm.policy_count[XFRM_POLICY_MAX + dir] = 0; htab = &net->xfrm.policy_bydst[dir]; rcu_assign_pointer(htab->table, xfrm_hash_alloc(sz)); if (!htab->table) goto out_bydst; htab->hmask = hmask; htab->dbits4 = 32; htab->sbits4 = 32; htab->dbits6 = 128; htab->sbits6 = 128; } net->xfrm.policy_hthresh.lbits4 = 32; net->xfrm.policy_hthresh.rbits4 = 32; net->xfrm.policy_hthresh.lbits6 = 128; net->xfrm.policy_hthresh.rbits6 = 128; seqlock_init(&net->xfrm.policy_hthresh.lock); INIT_LIST_HEAD(&net->xfrm.policy_all); INIT_LIST_HEAD(&net->xfrm.inexact_bins); INIT_WORK(&net->xfrm.policy_hash_work, xfrm_hash_resize); INIT_WORK(&net->xfrm.policy_hthresh.work, xfrm_hash_rebuild); return 0; out_bydst: for (dir--; dir >= 0; dir--) { struct xfrm_policy_hash *htab; htab = &net->xfrm.policy_bydst[dir]; xfrm_hash_free(rcu_dereference_protected(htab->table, true), sz); } xfrm_hash_free(net->xfrm.policy_byidx, sz); out_byidx: return -ENOMEM; } static void __net_exit xfrm_net_pre_exit(struct net *net) { disable_work_sync(&net->xfrm.policy_hthresh.work); flush_work(&net->xfrm.policy_hash_work); #ifdef CONFIG_XFRM_SUB_POLICY xfrm_policy_flush(net, XFRM_POLICY_TYPE_SUB, false); #endif xfrm_policy_flush(net, XFRM_POLICY_TYPE_MAIN, false); } static void xfrm_policy_fini(struct net *net) { struct xfrm_pol_inexact_bin *b, *t; unsigned int sz; int dir; WARN_ON(!list_empty(&net->xfrm.policy_all)); for (dir = 0; dir < XFRM_POLICY_MAX; dir++) { struct xfrm_policy_hash *htab; htab = &net->xfrm.policy_bydst[dir]; sz = (htab->hmask + 1) * sizeof(struct hlist_head); WARN_ON(!hlist_empty(rcu_dereference_protected(htab->table, true))); xfrm_hash_free(rcu_dereference_protected(htab->table, true), sz); } sz = (net->xfrm.policy_idx_hmask + 1) * sizeof(struct hlist_head); WARN_ON(!hlist_empty(net->xfrm.policy_byidx)); xfrm_hash_free(net->xfrm.policy_byidx, sz); spin_lock_bh(&net->xfrm.xfrm_policy_lock); list_for_each_entry_safe(b, t, &net->xfrm.inexact_bins, inexact_bins) 
__xfrm_policy_inexact_prune_bin(b, true); spin_unlock_bh(&net->xfrm.xfrm_policy_lock); } static int __net_init xfrm_net_init(struct net *net) { int rv; /* Initialize the per-net locks here */ spin_lock_init(&net->xfrm.xfrm_state_lock); spin_lock_init(&net->xfrm.xfrm_policy_lock); seqcount_spinlock_init(&net->xfrm.xfrm_policy_hash_generation, &net->xfrm.xfrm_policy_lock); mutex_init(&net->xfrm.xfrm_cfg_mutex); net->xfrm.policy_default[XFRM_POLICY_IN] = XFRM_USERPOLICY_ACCEPT; net->xfrm.policy_default[XFRM_POLICY_FWD] = XFRM_USERPOLICY_ACCEPT; net->xfrm.policy_default[XFRM_POLICY_OUT] = XFRM_USERPOLICY_ACCEPT; rv = xfrm_statistics_init(net); if (rv < 0) goto out_statistics; rv = xfrm_state_init(net); if (rv < 0) goto out_state; rv = xfrm_policy_init(net); if (rv < 0) goto out_policy; rv = xfrm_sysctl_init(net); if (rv < 0) goto out_sysctl; rv = xfrm_nat_keepalive_net_init(net); if (rv < 0) goto out_nat_keepalive; return 0; out_nat_keepalive: xfrm_sysctl_fini(net); out_sysctl: xfrm_policy_fini(net); out_policy: xfrm_state_fini(net); out_state: xfrm_statistics_fini(net); out_statistics: return rv; } static void __net_exit xfrm_net_exit(struct net *net) { xfrm_nat_keepalive_net_fini(net); xfrm_sysctl_fini(net); xfrm_policy_fini(net); xfrm_state_fini(net); xfrm_statistics_fini(net); } static struct pernet_operations __net_initdata xfrm_net_ops = { .init = xfrm_net_init, .pre_exit = xfrm_net_pre_exit, .exit = xfrm_net_exit, }; static const struct flow_dissector_key xfrm_flow_dissector_keys[] = { { .key_id = FLOW_DISSECTOR_KEY_CONTROL, .offset = offsetof(struct xfrm_flow_keys, control), }, { .key_id = FLOW_DISSECTOR_KEY_BASIC, .offset = offsetof(struct xfrm_flow_keys, basic), }, { .key_id = FLOW_DISSECTOR_KEY_IPV4_ADDRS, .offset = offsetof(struct xfrm_flow_keys, addrs.ipv4), }, { .key_id = FLOW_DISSECTOR_KEY_IPV6_ADDRS, .offset = offsetof(struct xfrm_flow_keys, addrs.ipv6), }, { .key_id = FLOW_DISSECTOR_KEY_PORTS, .offset = offsetof(struct xfrm_flow_keys, ports), }, { 
.key_id = FLOW_DISSECTOR_KEY_GRE_KEYID, .offset = offsetof(struct xfrm_flow_keys, gre), }, { .key_id = FLOW_DISSECTOR_KEY_IP, .offset = offsetof(struct xfrm_flow_keys, ip), }, { .key_id = FLOW_DISSECTOR_KEY_ICMP, .offset = offsetof(struct xfrm_flow_keys, icmp), }, }; void __init xfrm_init(void) { skb_flow_dissector_init(&xfrm_session_dissector, xfrm_flow_dissector_keys, ARRAY_SIZE(xfrm_flow_dissector_keys)); register_pernet_subsys(&xfrm_net_ops); xfrm_dev_init(); xfrm_input_init(); #ifdef CONFIG_XFRM_ESPINTCP espintcp_init(); #endif register_xfrm_state_bpf(); xfrm_nat_keepalive_init(AF_INET); } #ifdef CONFIG_AUDITSYSCALL static void xfrm_audit_common_policyinfo(struct xfrm_policy *xp, struct audit_buffer *audit_buf) { struct xfrm_sec_ctx *ctx = xp->security; struct xfrm_selector *sel = &xp->selector; if (ctx) audit_log_format(audit_buf, " sec_alg=%u sec_doi=%u sec_obj=%s", ctx->ctx_alg, ctx->ctx_doi, ctx->ctx_str); switch (sel->family) { case AF_INET: audit_log_format(audit_buf, " src=%pI4", &sel->saddr.a4); if (sel->prefixlen_s != 32) audit_log_format(audit_buf, " src_prefixlen=%d", sel->prefixlen_s); audit_log_format(audit_buf, " dst=%pI4", &sel->daddr.a4); if (sel->prefixlen_d != 32) audit_log_format(audit_buf, " dst_prefixlen=%d", sel->prefixlen_d); break; case AF_INET6: audit_log_format(audit_buf, " src=%pI6", sel->saddr.a6); if (sel->prefixlen_s != 128) audit_log_format(audit_buf, " src_prefixlen=%d", sel->prefixlen_s); audit_log_format(audit_buf, " dst=%pI6", sel->daddr.a6); if (sel->prefixlen_d != 128) audit_log_format(audit_buf, " dst_prefixlen=%d", sel->prefixlen_d); break; } } void xfrm_audit_policy_add(struct xfrm_policy *xp, int result, bool task_valid) { struct audit_buffer *audit_buf; audit_buf = xfrm_audit_start("SPD-add"); if (audit_buf == NULL) return; xfrm_audit_helper_usrinfo(task_valid, audit_buf); audit_log_format(audit_buf, " res=%u", result); xfrm_audit_common_policyinfo(xp, audit_buf); audit_log_end(audit_buf); } 
EXPORT_SYMBOL_GPL(xfrm_audit_policy_add); void xfrm_audit_policy_delete(struct xfrm_policy *xp, int result, bool task_valid) { struct audit_buffer *audit_buf; audit_buf = xfrm_audit_start("SPD-delete"); if (audit_buf == NULL) return; xfrm_audit_helper_usrinfo(task_valid, audit_buf); audit_log_format(audit_buf, " res=%u", result); xfrm_audit_common_policyinfo(xp, audit_buf); audit_log_end(audit_buf); } EXPORT_SYMBOL_GPL(xfrm_audit_policy_delete); #endif #ifdef CONFIG_XFRM_MIGRATE static struct xfrm_policy *xfrm_migrate_policy_find(const struct xfrm_selector *sel, u8 dir, u8 type, struct net *net, u32 if_id) { struct xfrm_policy *pol; struct flowi fl; memset(&fl, 0, sizeof(fl)); fl.flowi_proto = sel->proto; switch (sel->family) { case AF_INET: fl.u.ip4.saddr = sel->saddr.a4; fl.u.ip4.daddr = sel->daddr.a4; if (sel->proto == IPSEC_ULPROTO_ANY) break; fl.u.flowi4_oif = sel->ifindex; fl.u.ip4.fl4_sport = sel->sport; fl.u.ip4.fl4_dport = sel->dport; break; case AF_INET6: fl.u.ip6.saddr = sel->saddr.in6; fl.u.ip6.daddr = sel->daddr.in6; if (sel->proto == IPSEC_ULPROTO_ANY) break; fl.u.flowi6_oif = sel->ifindex; fl.u.ip6.fl4_sport = sel->sport; fl.u.ip6.fl4_dport = sel->dport; break; default: return ERR_PTR(-EAFNOSUPPORT); } rcu_read_lock(); pol = xfrm_policy_lookup_bytype(net, type, &fl, sel->family, dir, if_id); if (IS_ERR_OR_NULL(pol)) goto out_unlock; out_unlock: rcu_read_unlock(); return pol; } static int migrate_tmpl_match(const struct xfrm_migrate *m, const struct xfrm_tmpl *t) { int match = 0; if (t->mode == m->mode && t->id.proto == m->proto && (m->reqid == 0 || t->reqid == m->reqid)) { switch (t->mode) { case XFRM_MODE_TUNNEL: case XFRM_MODE_BEET: case XFRM_MODE_IPTFS: if (xfrm_addr_equal(&t->id.daddr, &m->old_daddr, m->old_family) && xfrm_addr_equal(&t->saddr, &m->old_saddr, m->old_family)) { match = 1; } break; case XFRM_MODE_TRANSPORT: /* in case of transport mode, template does not store any IP addresses, hence we just compare mode and protocol */ match = 1; 
break; default: break; } } return match; } /* update endpoint address(es) of template(s) */ static int xfrm_policy_migrate(struct xfrm_policy *pol, struct xfrm_migrate *m, int num_migrate, struct netlink_ext_ack *extack) { struct xfrm_migrate *mp; int i, j, n = 0; write_lock_bh(&pol->lock); if (unlikely(pol->walk.dead)) { /* target policy has been deleted */ NL_SET_ERR_MSG(extack, "Target policy not found"); write_unlock_bh(&pol->lock); return -ENOENT; } for (i = 0; i < pol->xfrm_nr; i++) { for (j = 0, mp = m; j < num_migrate; j++, mp++) { if (!migrate_tmpl_match(mp, &pol->xfrm_vec[i])) continue; n++; if (pol->xfrm_vec[i].mode != XFRM_MODE_TUNNEL && pol->xfrm_vec[i].mode != XFRM_MODE_BEET && pol->xfrm_vec[i].mode != XFRM_MODE_IPTFS) continue; /* update endpoints */ memcpy(&pol->xfrm_vec[i].id.daddr, &mp->new_daddr, sizeof(pol->xfrm_vec[i].id.daddr)); memcpy(&pol->xfrm_vec[i].saddr, &mp->new_saddr, sizeof(pol->xfrm_vec[i].saddr)); pol->xfrm_vec[i].encap_family = mp->new_family; /* flush bundles */ atomic_inc(&pol->genid); } } write_unlock_bh(&pol->lock); if (!n) return -ENODATA; return 0; } static int xfrm_migrate_check(const struct xfrm_migrate *m, int num_migrate, struct netlink_ext_ack *extack) { int i, j; if (num_migrate < 1 || num_migrate > XFRM_MAX_DEPTH) { NL_SET_ERR_MSG(extack, "Invalid number of SAs to migrate, must be 0 < num <= XFRM_MAX_DEPTH (6)"); return -EINVAL; } for (i = 0; i < num_migrate; i++) { if (xfrm_addr_any(&m[i].new_daddr, m[i].new_family) || xfrm_addr_any(&m[i].new_saddr, m[i].new_family)) { NL_SET_ERR_MSG(extack, "Addresses in the MIGRATE attribute's list cannot be null"); return -EINVAL; } /* check if there is any duplicated entry */ for (j = i + 1; j < num_migrate; j++) { if (!memcmp(&m[i].old_daddr, &m[j].old_daddr, sizeof(m[i].old_daddr)) && !memcmp(&m[i].old_saddr, &m[j].old_saddr, sizeof(m[i].old_saddr)) && m[i].proto == m[j].proto && m[i].mode == m[j].mode && m[i].reqid == m[j].reqid && m[i].old_family == m[j].old_family) { 
NL_SET_ERR_MSG(extack, "Entries in the MIGRATE attribute's list must be unique"); return -EINVAL; } } } return 0; } int xfrm_migrate(const struct xfrm_selector *sel, u8 dir, u8 type, struct xfrm_migrate *m, int num_migrate, struct xfrm_kmaddress *k, struct net *net, struct xfrm_encap_tmpl *encap, u32 if_id, struct netlink_ext_ack *extack, struct xfrm_user_offload *xuo) { int i, err, nx_cur = 0, nx_new = 0; struct xfrm_policy *pol = NULL; struct xfrm_state *x, *xc; struct xfrm_state *x_cur[XFRM_MAX_DEPTH]; struct xfrm_state *x_new[XFRM_MAX_DEPTH]; struct xfrm_migrate *mp; /* Stage 0 - sanity checks */ err = xfrm_migrate_check(m, num_migrate, extack); if (err < 0) goto out; if (dir >= XFRM_POLICY_MAX) { NL_SET_ERR_MSG(extack, "Invalid policy direction"); err = -EINVAL; goto out; } /* Stage 1 - find policy */ pol = xfrm_migrate_policy_find(sel, dir, type, net, if_id); if (IS_ERR_OR_NULL(pol)) { NL_SET_ERR_MSG(extack, "Target policy not found"); err = IS_ERR(pol) ? PTR_ERR(pol) : -ENOENT; goto out; } /* Stage 2 - find and update state(s) */ for (i = 0, mp = m; i < num_migrate; i++, mp++) { if ((x = xfrm_migrate_state_find(mp, net, if_id))) { x_cur[nx_cur] = x; nx_cur++; xc = xfrm_state_migrate(x, mp, encap, net, xuo, extack); if (xc) { x_new[nx_new] = xc; nx_new++; } else { err = -ENODATA; goto restore_state; } } } /* Stage 3 - update policy */ err = xfrm_policy_migrate(pol, m, num_migrate, extack); if (err < 0) goto restore_state; /* Stage 4 - delete old state(s) */ if (nx_cur) { xfrm_states_put(x_cur, nx_cur); xfrm_states_delete(x_cur, nx_cur); } /* Stage 5 - announce */ km_migrate(sel, dir, type, m, num_migrate, k, net, encap); xfrm_pol_put(pol); return 0; out: return err; restore_state: if (pol) xfrm_pol_put(pol); if (nx_cur) xfrm_states_put(x_cur, nx_cur); if (nx_new) xfrm_states_delete(x_new, nx_new); return err; } EXPORT_SYMBOL(xfrm_migrate); #endif
6 6 6 6 6 6 6 4 6 6 6 6 6 6 6 6 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 
520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 
1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 
1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. * Copyright (C) 2004-2007 Red Hat, Inc. All rights reserved. 
*/ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/bio.h> #include <linux/sched/signal.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/completion.h> #include <linux/buffer_head.h> #include <linux/statfs.h> #include <linux/seq_file.h> #include <linux/mount.h> #include <linux/kthread.h> #include <linux/delay.h> #include <linux/gfs2_ondisk.h> #include <linux/crc32.h> #include <linux/time.h> #include <linux/wait.h> #include <linux/writeback.h> #include <linux/backing-dev.h> #include <linux/kernel.h> #include "gfs2.h" #include "incore.h" #include "bmap.h" #include "dir.h" #include "glock.h" #include "glops.h" #include "inode.h" #include "log.h" #include "meta_io.h" #include "quota.h" #include "recovery.h" #include "rgrp.h" #include "super.h" #include "trans.h" #include "util.h" #include "sys.h" #include "xattr.h" #include "lops.h" enum evict_behavior { EVICT_SHOULD_DELETE, EVICT_SHOULD_SKIP_DELETE, EVICT_SHOULD_DEFER_DELETE, }; /** * gfs2_jindex_free - Clear all the journal index information * @sdp: The GFS2 superblock * */ void gfs2_jindex_free(struct gfs2_sbd *sdp) { struct list_head list; struct gfs2_jdesc *jd; spin_lock(&sdp->sd_jindex_spin); list_add(&list, &sdp->sd_jindex_list); list_del_init(&sdp->sd_jindex_list); sdp->sd_journals = 0; spin_unlock(&sdp->sd_jindex_spin); down_write(&sdp->sd_log_flush_lock); sdp->sd_jdesc = NULL; up_write(&sdp->sd_log_flush_lock); while (!list_empty(&list)) { jd = list_first_entry(&list, struct gfs2_jdesc, jd_list); BUG_ON(jd->jd_log_bio); gfs2_free_journal_extents(jd); list_del(&jd->jd_list); iput(jd->jd_inode); jd->jd_inode = NULL; kfree(jd); } } static struct gfs2_jdesc *jdesc_find_i(struct list_head *head, unsigned int jid) { struct gfs2_jdesc *jd; list_for_each_entry(jd, head, jd_list) { if (jd->jd_jid == jid) return jd; } return NULL; } struct gfs2_jdesc *gfs2_jdesc_find(struct gfs2_sbd *sdp, unsigned int jid) { struct gfs2_jdesc *jd; spin_lock(&sdp->sd_jindex_spin); jd = 
jdesc_find_i(&sdp->sd_jindex_list, jid); spin_unlock(&sdp->sd_jindex_spin); return jd; } int gfs2_jdesc_check(struct gfs2_jdesc *jd) { struct gfs2_inode *ip = GFS2_I(jd->jd_inode); struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode); u64 size = i_size_read(jd->jd_inode); if (gfs2_check_internal_file_size(jd->jd_inode, 8 << 20, BIT(30))) return -EIO; jd->jd_blocks = size >> sdp->sd_sb.sb_bsize_shift; if (gfs2_write_alloc_required(ip, 0, size)) { gfs2_consist_inode(ip); return -EIO; } return 0; } /** * gfs2_make_fs_rw - Turn a Read-Only FS into a Read-Write one * @sdp: the filesystem * * Returns: errno */ int gfs2_make_fs_rw(struct gfs2_sbd *sdp) { struct gfs2_inode *ip = GFS2_I(sdp->sd_jdesc->jd_inode); struct gfs2_glock *j_gl = ip->i_gl; int error; j_gl->gl_ops->go_inval(j_gl, DIO_METADATA); if (gfs2_withdrawn(sdp)) return -EIO; if (sdp->sd_log_sequence == 0) { fs_err(sdp, "unknown status of our own journal jid %d", sdp->sd_lockstruct.ls_jid); return -EIO; } error = gfs2_quota_init(sdp); if (!error && gfs2_withdrawn(sdp)) { gfs2_quota_cleanup(sdp); error = -EIO; } if (!error) set_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags); return error; } void gfs2_statfs_change_in(struct gfs2_statfs_change_host *sc, const void *buf) { const struct gfs2_statfs_change *str = buf; sc->sc_total = be64_to_cpu(str->sc_total); sc->sc_free = be64_to_cpu(str->sc_free); sc->sc_dinodes = be64_to_cpu(str->sc_dinodes); } void gfs2_statfs_change_out(const struct gfs2_statfs_change_host *sc, void *buf) { struct gfs2_statfs_change *str = buf; str->sc_total = cpu_to_be64(sc->sc_total); str->sc_free = cpu_to_be64(sc->sc_free); str->sc_dinodes = cpu_to_be64(sc->sc_dinodes); } int gfs2_statfs_init(struct gfs2_sbd *sdp) { struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode); struct gfs2_statfs_change_host *m_sc = &sdp->sd_statfs_master; struct gfs2_statfs_change_host *l_sc = &sdp->sd_statfs_local; struct buffer_head *m_bh; struct gfs2_holder gh; int error; error = gfs2_glock_nq_init(m_ip->i_gl, LM_ST_EXCLUSIVE, 
GL_NOCACHE, &gh); if (error) return error; error = gfs2_meta_inode_buffer(m_ip, &m_bh); if (error) goto out; if (sdp->sd_args.ar_spectator) { spin_lock(&sdp->sd_statfs_spin); gfs2_statfs_change_in(m_sc, m_bh->b_data + sizeof(struct gfs2_dinode)); spin_unlock(&sdp->sd_statfs_spin); } else { spin_lock(&sdp->sd_statfs_spin); gfs2_statfs_change_in(m_sc, m_bh->b_data + sizeof(struct gfs2_dinode)); gfs2_statfs_change_in(l_sc, sdp->sd_sc_bh->b_data + sizeof(struct gfs2_dinode)); spin_unlock(&sdp->sd_statfs_spin); } brelse(m_bh); out: gfs2_glock_dq_uninit(&gh); return 0; } void gfs2_statfs_change(struct gfs2_sbd *sdp, s64 total, s64 free, s64 dinodes) { struct gfs2_inode *l_ip = GFS2_I(sdp->sd_sc_inode); struct gfs2_statfs_change_host *l_sc = &sdp->sd_statfs_local; struct gfs2_statfs_change_host *m_sc = &sdp->sd_statfs_master; s64 x, y; int need_sync = 0; gfs2_trans_add_meta(l_ip->i_gl, sdp->sd_sc_bh); spin_lock(&sdp->sd_statfs_spin); l_sc->sc_total += total; l_sc->sc_free += free; l_sc->sc_dinodes += dinodes; gfs2_statfs_change_out(l_sc, sdp->sd_sc_bh->b_data + sizeof(struct gfs2_dinode)); if (sdp->sd_args.ar_statfs_percent) { x = 100 * l_sc->sc_free; y = m_sc->sc_free * sdp->sd_args.ar_statfs_percent; if (x >= y || x <= -y) need_sync = 1; } spin_unlock(&sdp->sd_statfs_spin); if (need_sync) gfs2_wake_up_statfs(sdp); } void update_statfs(struct gfs2_sbd *sdp, struct buffer_head *m_bh) { struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode); struct gfs2_inode *l_ip = GFS2_I(sdp->sd_sc_inode); struct gfs2_statfs_change_host *m_sc = &sdp->sd_statfs_master; struct gfs2_statfs_change_host *l_sc = &sdp->sd_statfs_local; gfs2_trans_add_meta(l_ip->i_gl, sdp->sd_sc_bh); gfs2_trans_add_meta(m_ip->i_gl, m_bh); spin_lock(&sdp->sd_statfs_spin); m_sc->sc_total += l_sc->sc_total; m_sc->sc_free += l_sc->sc_free; m_sc->sc_dinodes += l_sc->sc_dinodes; memset(l_sc, 0, sizeof(struct gfs2_statfs_change)); memset(sdp->sd_sc_bh->b_data + sizeof(struct gfs2_dinode), 0, sizeof(struct 
gfs2_statfs_change)); gfs2_statfs_change_out(m_sc, m_bh->b_data + sizeof(struct gfs2_dinode)); spin_unlock(&sdp->sd_statfs_spin); } int gfs2_statfs_sync(struct super_block *sb, int type) { struct gfs2_sbd *sdp = sb->s_fs_info; struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode); struct gfs2_statfs_change_host *m_sc = &sdp->sd_statfs_master; struct gfs2_statfs_change_host *l_sc = &sdp->sd_statfs_local; struct gfs2_holder gh; struct buffer_head *m_bh; int error; error = gfs2_glock_nq_init(m_ip->i_gl, LM_ST_EXCLUSIVE, GL_NOCACHE, &gh); if (error) goto out; error = gfs2_meta_inode_buffer(m_ip, &m_bh); if (error) goto out_unlock; spin_lock(&sdp->sd_statfs_spin); gfs2_statfs_change_in(m_sc, m_bh->b_data + sizeof(struct gfs2_dinode)); if (!l_sc->sc_total && !l_sc->sc_free && !l_sc->sc_dinodes) { spin_unlock(&sdp->sd_statfs_spin); goto out_bh; } spin_unlock(&sdp->sd_statfs_spin); error = gfs2_trans_begin(sdp, 2 * RES_DINODE, 0); if (error) goto out_bh; update_statfs(sdp, m_bh); sdp->sd_statfs_force_sync = 0; gfs2_trans_end(sdp); out_bh: brelse(m_bh); out_unlock: gfs2_glock_dq_uninit(&gh); out: return error; } struct lfcc { struct list_head list; struct gfs2_holder gh; }; /** * gfs2_lock_fs_check_clean - Stop all writes to the FS and check that all * journals are clean * @sdp: the file system * * Returns: errno */ static int gfs2_lock_fs_check_clean(struct gfs2_sbd *sdp) { struct gfs2_inode *ip; struct gfs2_jdesc *jd; struct lfcc *lfcc; LIST_HEAD(list); struct gfs2_log_header_host lh; int error, error2; /* * Grab all the journal glocks in SH mode. We are *probably* doing * that to prevent recovery. 
*/ list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) { lfcc = kmalloc_obj(struct lfcc); if (!lfcc) { error = -ENOMEM; goto out; } ip = GFS2_I(jd->jd_inode); error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, 0, &lfcc->gh); if (error) { kfree(lfcc); goto out; } list_add(&lfcc->list, &list); } gfs2_freeze_unlock(sdp); error = gfs2_glock_nq_init(sdp->sd_freeze_gl, LM_ST_EXCLUSIVE, LM_FLAG_RECOVER | GL_NOPID, &sdp->sd_freeze_gh); if (error) goto relock_shared; list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) { error = gfs2_jdesc_check(jd); if (error) break; error = gfs2_find_jhead(jd, &lh); if (error) break; if (!(lh.lh_flags & GFS2_LOG_HEAD_UNMOUNT)) { error = -EBUSY; break; } } if (!error) goto out; /* success */ gfs2_freeze_unlock(sdp); relock_shared: error2 = gfs2_freeze_lock_shared(sdp); gfs2_assert_withdraw(sdp, !error2); out: while (!list_empty(&list)) { lfcc = list_first_entry(&list, struct lfcc, list); list_del(&lfcc->list); gfs2_glock_dq_uninit(&lfcc->gh); kfree(lfcc); } return error; } void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf) { const struct inode *inode = &ip->i_inode; struct gfs2_dinode *str = buf; str->di_header.mh_magic = cpu_to_be32(GFS2_MAGIC); str->di_header.mh_type = cpu_to_be32(GFS2_METATYPE_DI); str->di_header.mh_format = cpu_to_be32(GFS2_FORMAT_DI); str->di_num.no_addr = cpu_to_be64(ip->i_no_addr); str->di_num.no_formal_ino = cpu_to_be64(ip->i_no_formal_ino); str->di_mode = cpu_to_be32(inode->i_mode); str->di_uid = cpu_to_be32(i_uid_read(inode)); str->di_gid = cpu_to_be32(i_gid_read(inode)); str->di_nlink = cpu_to_be32(inode->i_nlink); str->di_size = cpu_to_be64(i_size_read(inode)); str->di_blocks = cpu_to_be64(gfs2_get_inode_blocks(inode)); str->di_atime = cpu_to_be64(inode_get_atime_sec(inode)); str->di_mtime = cpu_to_be64(inode_get_mtime_sec(inode)); str->di_ctime = cpu_to_be64(inode_get_ctime_sec(inode)); str->di_goal_meta = cpu_to_be64(ip->i_goal); str->di_goal_data = cpu_to_be64(ip->i_goal); str->di_generation = 
cpu_to_be64(ip->i_generation); str->di_flags = cpu_to_be32(ip->i_diskflags); str->di_height = cpu_to_be16(ip->i_height); str->di_payload_format = cpu_to_be32(S_ISDIR(inode->i_mode) && !(ip->i_diskflags & GFS2_DIF_EXHASH) ? GFS2_FORMAT_DE : 0); str->di_depth = cpu_to_be16(ip->i_depth); str->di_entries = cpu_to_be32(ip->i_entries); str->di_eattr = cpu_to_be64(ip->i_eattr); str->di_atime_nsec = cpu_to_be32(inode_get_atime_nsec(inode)); str->di_mtime_nsec = cpu_to_be32(inode_get_mtime_nsec(inode)); str->di_ctime_nsec = cpu_to_be32(inode_get_ctime_nsec(inode)); } /** * gfs2_write_inode - Make sure the inode is stable on the disk * @inode: The inode * @wbc: The writeback control structure * * Returns: errno */ static int gfs2_write_inode(struct inode *inode, struct writeback_control *wbc) { struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_sbd *sdp = GFS2_SB(inode); struct address_space *metamapping = gfs2_glock2aspace(ip->i_gl); struct backing_dev_info *bdi = inode_to_bdi(metamapping->host); int ret = 0; bool flush_all = (wbc->sync_mode == WB_SYNC_ALL || gfs2_is_jdata(ip)); if (flush_all) gfs2_log_flush(GFS2_SB(inode), ip->i_gl, GFS2_LOG_HEAD_FLUSH_NORMAL | GFS2_LFC_WRITE_INODE); if (bdi_wb_dirty_exceeded(bdi)) gfs2_ail1_flush(sdp, wbc); else filemap_fdatawrite(metamapping); if (flush_all) ret = filemap_fdatawait(metamapping); if (ret) mark_inode_dirty_sync(inode); else { spin_lock(&inode->i_lock); if (!(inode->i_flags & I_DIRTY)) gfs2_ordered_del_inode(ip); spin_unlock(&inode->i_lock); } return ret; } /** * gfs2_dirty_inode - check for atime updates * @inode: The inode in question * @flags: The type of dirty * * Unfortunately it can be called under any combination of inode * glock and freeze glock, so we have to check carefully. * * At the moment this deals only with atime - it should be possible * to expand that role in future, once a review of the locking has * been carried out. 
*/
static void gfs2_dirty_inode(struct inode *inode, int flags)
{
	struct gfs2_inode *ip = GFS2_I(inode);
	struct gfs2_sbd *sdp = GFS2_SB(inode);
	struct buffer_head *bh;
	struct gfs2_holder gh;
	int need_unlock = 0;
	int need_endtrans = 0;
	int ret;

	/* This can only happen during incomplete inode creation. */
	if (unlikely(!ip->i_gl))
		return;

	if (gfs2_withdrawn(sdp))
		return;
	/* Take the inode glock ourselves unless this task already holds it. */
	if (!gfs2_glock_is_locked_by_me(ip->i_gl)) {
		ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
		if (ret) {
			fs_err(sdp, "dirty_inode: glock %d\n", ret);
			gfs2_dump_glock(NULL, ip->i_gl, true);
			return;
		}
		need_unlock = 1;
	} else if (WARN_ON_ONCE(ip->i_gl->gl_state != LM_ST_EXCLUSIVE))
		return;

	/* Start a transaction only if the caller is not already in one. */
	if (current->journal_info == NULL) {
		ret = gfs2_trans_begin(sdp, RES_DINODE, 0);
		if (ret) {
			fs_err(sdp, "dirty_inode: gfs2_trans_begin %d\n", ret);
			goto out;
		}
		need_endtrans = 1;
	}

	/* A failure to read the dinode buffer is not reported here. */
	ret = gfs2_meta_inode_buffer(ip, &bh);
	if (ret == 0) {
		gfs2_trans_add_meta(ip->i_gl, bh);
		gfs2_dinode_out(ip, bh->b_data);
		brelse(bh);
	}

	if (need_endtrans)
		gfs2_trans_end(sdp);
out:
	if (need_unlock)
		gfs2_glock_dq_uninit(&gh);
}

/**
 * gfs2_make_fs_ro - Turn a Read-Write FS into a Read-Only one
 * @sdp: the filesystem
 *
 * This function does not return a value.
 */
void gfs2_make_fs_ro(struct gfs2_sbd *sdp)
{
	/* Sampled up front, before threads are destroyed and logs flushed. */
	int log_write_allowed = test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags);

	if (!test_bit(SDF_KILL, &sdp->sd_flags))
		gfs2_flush_delete_work(sdp);

	gfs2_destroy_threads(sdp);

	if (log_write_allowed) {
		gfs2_quota_sync(sdp->sd_vfs, 0);
		gfs2_statfs_sync(sdp->sd_vfs, 0);

		/* We do two log flushes here. The first one commits dirty inodes
		 * and rgrps to the journal, but queues up revokes to the ail list.
		 * The second flush writes out and removes the revokes.
		 *
		 * The first must be done before the FLUSH_SHUTDOWN code
		 * clears the LIVE flag, otherwise it will not be able to start
		 * a transaction to write its revokes, and the error will cause
		 * a withdraw of the file system.
		 */
		gfs2_log_flush(sdp, NULL, GFS2_LFC_MAKE_FS_RO);
		gfs2_log_flush(sdp, NULL, GFS2_LOG_HEAD_FLUSH_SHUTDOWN |
			       GFS2_LFC_MAKE_FS_RO);
		/* Wait up to five seconds for the log to drain, then warn. */
		wait_event_timeout(sdp->sd_log_waitq,
				   gfs2_log_is_empty(sdp),
				   HZ * 5);
		gfs2_assert_warn(sdp, gfs2_log_is_empty(sdp));
	}
	gfs2_quota_cleanup(sdp);
}

/**
 * gfs2_put_super - Unmount the filesystem
 * @sb: The VFS superblock
 *
 */
static void gfs2_put_super(struct super_block *sb)
{
	struct gfs2_sbd *sdp = sb->s_fs_info;
	struct gfs2_jdesc *jd;

	/* No more recovery requests */
	set_bit(SDF_NORECOVERY, &sdp->sd_flags);
	smp_mb();

	/* Wait on outstanding recovery */
restart:
	spin_lock(&sdp->sd_jindex_spin);
	list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) {
		if (!test_bit(JDF_RECOVERY, &jd->jd_flags))
			continue;
		/* Drop the spinlock before sleeping, then rescan the list. */
		spin_unlock(&sdp->sd_jindex_spin);
		wait_on_bit(&jd->jd_flags, JDF_RECOVERY,
			    TASK_UNINTERRUPTIBLE);
		goto restart;
	}
	spin_unlock(&sdp->sd_jindex_spin);

	/* Wait for withdraw to complete */
	flush_work(&sdp->sd_withdraw_work);

	if (!sb_rdonly(sb))
		gfs2_make_fs_ro(sdp);
	else {
		if (gfs2_withdrawn(sdp))
			gfs2_destroy_threads(sdp);

		gfs2_quota_cleanup(sdp);
	}

	/* At this point, we're through modifying the disk */

	/* Release stuff */

	gfs2_freeze_unlock(sdp);

	iput(sdp->sd_jindex);
	iput(sdp->sd_statfs_inode);
	iput(sdp->sd_rindex);
	iput(sdp->sd_quota_inode);

	gfs2_glock_put(sdp->sd_rename_gl);
	gfs2_glock_put(sdp->sd_freeze_gl);

	if (!sdp->sd_args.ar_spectator) {
		if (gfs2_holder_initialized(&sdp->sd_journal_gh))
			gfs2_glock_dq_uninit(&sdp->sd_journal_gh);
		if (gfs2_holder_initialized(&sdp->sd_jinode_gh))
			gfs2_glock_dq_uninit(&sdp->sd_jinode_gh);
		brelse(sdp->sd_sc_bh);
		gfs2_glock_dq_uninit(&sdp->sd_sc_gh);
		gfs2_glock_dq_uninit(&sdp->sd_qc_gh);
		free_local_statfs_inodes(sdp);
		iput(sdp->sd_qc_inode);
	}

	gfs2_glock_dq_uninit(&sdp->sd_live_gh);
	gfs2_clear_rgrpd(sdp);
	gfs2_jindex_free(sdp);
	/* Take apart glock structures and buffer lists */
	gfs2_gl_hash_clear(sdp);
	iput(sdp->sd_inode);
	gfs2_delete_debugfs_file(sdp);

	gfs2_sys_fs_del(sdp);
	free_sbd(sdp);
}

/**
 * gfs2_sync_fs - sync the filesystem
 * @sb: the superblock
 * @wait: true to wait for completion
 *
 * Flushes the log to disk.
 *
 * Returns: the filesystem's recorded log error (sdp->sd_log_error)
 */
static int gfs2_sync_fs(struct super_block *sb, int wait)
{
	struct gfs2_sbd *sdp = sb->s_fs_info;

	gfs2_quota_sync(sb, -1);
	if (wait)
		gfs2_log_flush(sdp, NULL, GFS2_LOG_HEAD_FLUSH_NORMAL |
			       GFS2_LFC_SYNC_FS);
	return sdp->sd_log_error;
}

/*
 * gfs2_do_thaw - re-take the shared freeze lock and thaw the superblock
 *
 * Any failure is logged and triggers gfs2_assert_withdraw().
 * Returns 0 on success or the error from locking or thaw_super().
 */
static int gfs2_do_thaw(struct gfs2_sbd *sdp, enum freeze_holder who,
			const void *freeze_owner)
{
	struct super_block *sb = sdp->sd_vfs;
	int error;

	error = gfs2_freeze_lock_shared(sdp);
	if (error)
		goto fail;
	error = thaw_super(sb, who, freeze_owner);
	if (!error)
		return 0;

fail:
	fs_info(sdp, "GFS2: couldn't thaw filesystem: %d\n", error);
	gfs2_assert_withdraw(sdp, 0);
	return error;
}

/*
 * gfs2_freeze_func - work item that freezes and then thaws the filesystem
 *
 * Runs under sd_freeze_mutex.  Always ends with deactivate_super(), so the
 * code queueing this work presumably takes a superblock reference first
 * (not visible here).
 */
void gfs2_freeze_func(struct work_struct *work)
{
	struct gfs2_sbd *sdp = container_of(work, struct gfs2_sbd, sd_freeze_work);
	struct super_block *sb = sdp->sd_vfs;
	int error;

	mutex_lock(&sdp->sd_freeze_mutex);
	error = -EBUSY;
	if (test_bit(SDF_FROZEN, &sdp->sd_flags))
		goto freeze_failed;

	error = freeze_super(sb, FREEZE_HOLDER_USERSPACE, NULL);
	if (error)
		goto freeze_failed;

	gfs2_freeze_unlock(sdp);
	set_bit(SDF_FROZEN, &sdp->sd_flags);

	/* On thaw failure SDF_FROZEN stays set. */
	error = gfs2_do_thaw(sdp, FREEZE_HOLDER_USERSPACE, NULL);
	if (error)
		goto out;

	clear_bit(SDF_FROZEN, &sdp->sd_flags);
	goto out;

freeze_failed:
	fs_info(sdp, "GFS2: couldn't freeze filesystem: %d\n", error);

out:
	mutex_unlock(&sdp->sd_freeze_mutex);
	deactivate_super(sb);
}

/**
 * gfs2_freeze_super - prevent further writes to the filesystem
 * @sb: the VFS structure for the filesystem
 * @who: freeze flags
 * @freeze_owner: owner of the freeze
 *
 * Retries once per second until the journals are found clean, unless
 * freeze_super() fails or the journal check reports -EIO.
 *
 * Returns: 0 on success, -EBUSY if a freeze is already in progress or in
 *          effect, or another errno on failure
 */
static int gfs2_freeze_super(struct super_block *sb, enum freeze_holder who,
			     const void *freeze_owner)
{
	struct gfs2_sbd *sdp = sb->s_fs_info;
	int error;

	if (!mutex_trylock(&sdp->sd_freeze_mutex))
		return -EBUSY;
	if (test_bit(SDF_FROZEN, &sdp->sd_flags)) {
		mutex_unlock(&sdp->sd_freeze_mutex);
		return -EBUSY;
	}

	for (;;) {
		error = freeze_super(sb, who, freeze_owner);
		if (error) {
			fs_info(sdp, "GFS2: couldn't freeze filesystem: %d\n",
				error);
			goto out;
		}

		error = gfs2_lock_fs_check_clean(sdp);
		if (!error) {
			set_bit(SDF_FREEZE_INITIATOR, &sdp->sd_flags);
			set_bit(SDF_FROZEN, &sdp->sd_flags);
			break;
		}

		/* Journals not clean: undo the VFS freeze before retrying. */
		(void)gfs2_do_thaw(sdp, who, freeze_owner);

		if (error == -EBUSY)
			fs_err(sdp, "waiting for recovery before freeze\n");
		else if (error == -EIO) {
			fs_err(sdp, "Fatal IO error: cannot freeze gfs2 due "
			       "to recovery error.\n");
			goto out;
		} else {
			fs_err(sdp, "error freezing FS: %d\n", error);
		}
		fs_err(sdp, "retrying...\n");
		msleep(1000);
	}

out:
	mutex_unlock(&sdp->sd_freeze_mutex);
	return error;
}

/*
 * gfs2_freeze_fs - flush the log for a freeze if the journal is live
 *
 * Returns 0, or -EIO if the filesystem is withdrawn after the flush.
 */
static int gfs2_freeze_fs(struct super_block *sb)
{
	struct gfs2_sbd *sdp = sb->s_fs_info;

	if (test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) {
		gfs2_log_flush(sdp, NULL, GFS2_LOG_HEAD_FLUSH_FREEZE |
			       GFS2_LFC_FREEZE_GO_SYNC);
		if (gfs2_withdrawn(sdp))
			return -EIO;
	}
	return 0;
}

/**
 * gfs2_thaw_super - reallow writes to the filesystem
 * @sb: the VFS structure for the filesystem
 * @who: freeze flags
 * @freeze_owner: owner of the freeze
 *
 * Returns: 0 on success, -EBUSY if sd_freeze_mutex is contended, -EINVAL if
 *          this node did not initiate the freeze, or the thaw error
 */
static int gfs2_thaw_super(struct super_block *sb, enum freeze_holder who,
			   const void *freeze_owner)
{
	struct gfs2_sbd *sdp = sb->s_fs_info;
	int error;

	if (!mutex_trylock(&sdp->sd_freeze_mutex))
		return -EBUSY;
	if (!test_bit(SDF_FREEZE_INITIATOR, &sdp->sd_flags)) {
		mutex_unlock(&sdp->sd_freeze_mutex);
		return -EINVAL;
	}

	/* Extra active reference, dropped by deactivate_super() below. */
	atomic_inc(&sb->s_active);
	gfs2_freeze_unlock(sdp);

	error = gfs2_do_thaw(sdp, who, freeze_owner);

	if (!error) {
		clear_bit(SDF_FREEZE_INITIATOR, &sdp->sd_flags);
		clear_bit(SDF_FROZEN, &sdp->sd_flags);
	}
	mutex_unlock(&sdp->sd_freeze_mutex);
	deactivate_super(sb);
	return error;
}

/**
 * statfs_slow_fill - fill in the sg for a given RG
 * @rgd: the RG
 * @sc: the sc structure
 *
 * Returns: 0 (no error path is visible in this function)
 */
static int statfs_slow_fill(struct gfs2_rgrpd *rgd,
			    struct gfs2_statfs_change_host *sc)
{
	gfs2_rgrp_verify(rgd);
	sc->sc_total += rgd->rd_data;
	sc->sc_free += rgd->rd_free;
	sc->sc_dinodes += rgd->rd_dinodes;
	return 0;
}

/**
 * gfs2_statfs_slow - Stat a filesystem using asynchronous locking
 * @sdp: the filesystem
 * @sc: the sc info that will be returned
 *
 * Any error (other than a signal) will cause this routine to fall back
 * to the synchronous version.
 *
 * FIXME: This really shouldn't busy wait like this.
 *
 * Returns: errno
 */
static int gfs2_statfs_slow(struct gfs2_sbd *sdp,
			    struct gfs2_statfs_change_host *sc)
{
	struct gfs2_rgrpd *rgd_next;
	struct gfs2_holder *gha, *gh;
	unsigned int slots = 64;
	unsigned int x;
	int done;
	int error = 0, err;

	memset(sc, 0, sizeof(struct gfs2_statfs_change_host));
	gha = kmalloc_objs(struct gfs2_holder, slots);
	if (!gha)
		return -ENOMEM;
	for (x = 0; x < slots; x++)
		gfs2_holder_mark_uninitialized(gha + x);

	rgd_next = gfs2_rgrpd_get_first(sdp);

	for (;;) {
		done = 1;

		for (x = 0; x < slots; x++) {
			gh = gha + x;

			/* Harvest a slot whose async request has completed. */
			if (gfs2_holder_initialized(gh) && gfs2_glock_poll(gh)) {
				err = gfs2_glock_wait(gh);
				if (err) {
					gfs2_holder_uninit(gh);
					error = err;
				} else {
					if (!error) {
						struct gfs2_rgrpd *rgd =
							gfs2_glock2rgrp(gh->gh_gl);

						error = statfs_slow_fill(rgd, sc);
					}
					gfs2_glock_dq_uninit(gh);
				}
			}

			/* Still busy, or refill the free slot with the next rgrp. */
			if (gfs2_holder_initialized(gh))
				done = 0;
			else if (rgd_next && !error) {
				error = gfs2_glock_nq_init(rgd_next->rd_gl,
							   LM_ST_SHARED,
							   GL_ASYNC,
							   gh);
				rgd_next = gfs2_rgrpd_get_next(rgd_next);
				done = 0;
			}

			if (signal_pending(current))
				error = -ERESTARTSYS;
		}

		if (done)
			break;

		yield();
	}

	kfree(gha);
	return error;
}

/**
 * gfs2_statfs_i - Do a statfs
 * @sdp: the filesystem
 * @sc: the sc structure
 *
 * Combines the in-core master and local counters, then clamps the result so
 * that free is within [0, total] and dinodes is not negative.
 *
 * Returns: 0
 */
static int gfs2_statfs_i(struct gfs2_sbd *sdp, struct gfs2_statfs_change_host *sc)
{
	struct gfs2_statfs_change_host *m_sc = &sdp->sd_statfs_master;
	struct gfs2_statfs_change_host *l_sc = &sdp->sd_statfs_local;

	spin_lock(&sdp->sd_statfs_spin);

	*sc = *m_sc;
	sc->sc_total += l_sc->sc_total;
	sc->sc_free += l_sc->sc_free;
	sc->sc_dinodes += l_sc->sc_dinodes;

	spin_unlock(&sdp->sd_statfs_spin);

	if (sc->sc_free < 0)
		sc->sc_free = 0;
	if (sc->sc_free > sc->sc_total)
		sc->sc_free = sc->sc_total;
	if (sc->sc_dinodes < 0)
		sc->sc_dinodes = 0;

	return 0;
}

/**
 * gfs2_statfs - Gather and return stats about the filesystem
 * @dentry: The name of the link
 * @buf: The buffer
 *
 * Returns: 0 on success or error code
 */
static int gfs2_statfs(struct dentry *dentry, struct kstatfs *buf)
{
	struct super_block *sb = dentry->d_sb;
	struct gfs2_sbd *sdp = sb->s_fs_info;
	struct gfs2_statfs_change_host sc;
	int error;

	error = gfs2_rindex_update(sdp);
	if (error)
		return error;

	if (gfs2_tune_get(sdp, gt_statfs_slow))
		error = gfs2_statfs_slow(sdp, &sc);
	else
		error = gfs2_statfs_i(sdp, &sc);

	if (error)
		return error;

	buf->f_type = GFS2_MAGIC;
	buf->f_bsize = sdp->sd_sb.sb_bsize;
	buf->f_blocks = sc.sc_total;
	buf->f_bfree = sc.sc_free;
	buf->f_bavail = sc.sc_free;
	/* File counts are derived from the dinode count plus free blocks. */
	buf->f_files = sc.sc_dinodes + sc.sc_free;
	buf->f_ffree = sc.sc_free;
	buf->f_namelen = GFS2_FNAMESIZE;
	buf->f_fsid = uuid_to_fsid(sb->s_uuid.b);

	return 0;
}

/**
 * gfs2_drop_inode - Drop an inode (test for remote unlink)
 * @inode: The inode to drop
 *
 * If we've received a callback on an iopen lock then it's because a
 * remote node tried to deallocate the inode but failed due to this node
 * still having the inode open. Here we mark the link count zero
 * since we know that it must have reached zero if the GLF_DEMOTE flag
 * is set on the iopen glock. If we didn't do a disk read since the
 * remote node removed the final link then we might otherwise miss
 * this event. This check ensures that this node will deallocate the
 * inode's blocks, or alternatively pass the baton on to another
 * node for later deallocation.
*/ static int gfs2_drop_inode(struct inode *inode) { struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_sbd *sdp = GFS2_SB(inode); if (inode->i_nlink && gfs2_holder_initialized(&ip->i_iopen_gh)) { struct gfs2_glock *gl = ip->i_iopen_gh.gh_gl; if (glock_needs_demote(gl)) clear_nlink(inode); } /* * When under memory pressure when an inode's link count has dropped to * zero, defer deleting the inode to the delete workqueue. This avoids * calling into DLM under memory pressure, which can deadlock. */ if (!inode->i_nlink && unlikely(current->flags & PF_MEMALLOC) && gfs2_holder_initialized(&ip->i_iopen_gh)) { struct gfs2_glock *gl = ip->i_iopen_gh.gh_gl; gfs2_glock_hold(gl); if (!gfs2_queue_verify_delete(gl, true)) gfs2_glock_put_async(gl); return 0; } /* * No longer cache inodes when trying to evict them all. */ if (test_bit(SDF_EVICTING, &sdp->sd_flags)) return 1; return inode_generic_drop(inode); } /** * gfs2_show_options - Show mount options for /proc/mounts * @s: seq_file structure * @root: root of this (sub)tree * * Returns: 0 on success or error code */ static int gfs2_show_options(struct seq_file *s, struct dentry *root) { struct gfs2_sbd *sdp = root->d_sb->s_fs_info; struct gfs2_args *args = &sdp->sd_args; unsigned int logd_secs, statfs_slow, statfs_quantum, quota_quantum; spin_lock(&sdp->sd_tune.gt_spin); logd_secs = sdp->sd_tune.gt_logd_secs; quota_quantum = sdp->sd_tune.gt_quota_quantum; statfs_quantum = sdp->sd_tune.gt_statfs_quantum; statfs_slow = sdp->sd_tune.gt_statfs_slow; spin_unlock(&sdp->sd_tune.gt_spin); if (is_subdir(root, sdp->sd_master_dir)) seq_puts(s, ",meta"); if (args->ar_lockproto[0]) seq_show_option(s, "lockproto", args->ar_lockproto); if (args->ar_locktable[0]) seq_show_option(s, "locktable", args->ar_locktable); if (args->ar_hostdata[0]) seq_show_option(s, "hostdata", args->ar_hostdata); if (args->ar_spectator) seq_puts(s, ",spectator"); if (args->ar_localflocks) seq_puts(s, ",localflocks"); if (args->ar_debug) seq_puts(s, ",debug"); if 
(args->ar_posix_acl) seq_puts(s, ",acl"); if (args->ar_quota != GFS2_QUOTA_DEFAULT) { char *state; switch (args->ar_quota) { case GFS2_QUOTA_OFF: state = "off"; break; case GFS2_QUOTA_ACCOUNT: state = "account"; break; case GFS2_QUOTA_ON: state = "on"; break; case GFS2_QUOTA_QUIET: state = "quiet"; break; default: state = "unknown"; break; } seq_printf(s, ",quota=%s", state); } if (args->ar_suiddir) seq_puts(s, ",suiddir"); if (args->ar_data != GFS2_DATA_DEFAULT) { char *state; switch (args->ar_data) { case GFS2_DATA_WRITEBACK: state = "writeback"; break; case GFS2_DATA_ORDERED: state = "ordered"; break; default: state = "unknown"; break; } seq_printf(s, ",data=%s", state); } if (args->ar_discard) seq_puts(s, ",discard"); if (logd_secs != 30) seq_printf(s, ",commit=%d", logd_secs); if (statfs_quantum != 30) seq_printf(s, ",statfs_quantum=%d", statfs_quantum); else if (statfs_slow) seq_puts(s, ",statfs_quantum=0"); if (quota_quantum != 60) seq_printf(s, ",quota_quantum=%d", quota_quantum); if (args->ar_statfs_percent) seq_printf(s, ",statfs_percent=%d", args->ar_statfs_percent); if (args->ar_errors != GFS2_ERRORS_DEFAULT) { const char *state; switch (args->ar_errors) { case GFS2_ERRORS_WITHDRAW: state = "withdraw"; break; case GFS2_ERRORS_DEACTIVATE: state = "deactivate"; break; case GFS2_ERRORS_PANIC: state = "panic"; break; default: state = "unknown"; break; } seq_printf(s, ",errors=%s", state); } if (test_bit(SDF_NOBARRIERS, &sdp->sd_flags)) seq_puts(s, ",nobarrier"); if (test_bit(SDF_DEMOTE, &sdp->sd_flags)) seq_puts(s, ",demote_interface_used"); if (args->ar_rgrplvb) seq_puts(s, ",rgrplvb"); if (args->ar_loccookie) seq_puts(s, ",loccookie"); return 0; } /** * gfs2_glock_put_eventually * @gl: The glock to put * * When under memory pressure, trigger a deferred glock put to make sure we * won't call into DLM and deadlock. Otherwise, put the glock directly. 
*/
static void gfs2_glock_put_eventually(struct gfs2_glock *gl)
{
	if (current->flags & PF_MEMALLOC)
		gfs2_glock_put_async(gl);
	else
		gfs2_glock_put(gl);
}

/*
 * gfs2_upgrade_iopen_glock - try to take the iopen glock exclusively
 *
 * Returns the eviction behavior implied by the outcome of the upgrade.
 */
static enum evict_behavior gfs2_upgrade_iopen_glock(struct inode *inode)
{
	struct gfs2_inode *ip = GFS2_I(inode);
	struct gfs2_sbd *sdp = GFS2_SB(inode);
	struct gfs2_holder *gh = &ip->i_iopen_gh;
	int error;

	gh->gh_flags |= GL_NOCACHE;
	gfs2_glock_dq_wait(gh);

	/*
	 * If there are no other lock holders, we will immediately get
	 * exclusive access to the iopen glock here.
	 *
	 * Otherwise, the other nodes holding the lock will be notified about
	 * our locking request (see iopen_go_callback()).  If they do not have
	 * the inode open, they are expected to evict the cached inode and
	 * release the lock, allowing us to proceed.
	 *
	 * Otherwise, if they cannot evict the inode, they are expected to poke
	 * the inode glock (note: not the iopen glock).  We will notice that
	 * and stop waiting for the iopen glock immediately.  The other node(s)
	 * are then expected to take care of deleting the inode when they no
	 * longer use it.
	 *
	 * As a last resort, if another node keeps holding the iopen glock
	 * without showing any activity on the inode glock, we will eventually
	 * time out and fail the iopen glock upgrade.
	 */

	gfs2_holder_reinit(LM_ST_EXCLUSIVE, GL_ASYNC | GL_NOCACHE, gh);
	error = gfs2_glock_nq(gh);
	if (error)
		return EVICT_SHOULD_SKIP_DELETE;

	/* Wait at most five seconds; the wait's return value is not used. */
	wait_event_interruptible_timeout(sdp->sd_async_glock_wait,
		!test_bit(HIF_WAIT, &gh->gh_iflags) ||
		glock_needs_demote(ip->i_gl),
		5 * HZ);
	if (!test_bit(HIF_HOLDER, &gh->gh_iflags)) {
		gfs2_glock_dq(gh);
		if (glock_needs_demote(ip->i_gl))
			return EVICT_SHOULD_SKIP_DELETE;
		return EVICT_SHOULD_DEFER_DELETE;
	}
	error = gfs2_glock_holder_ready(gh);
	if (error)
		return EVICT_SHOULD_SKIP_DELETE;
	return EVICT_SHOULD_DELETE;
}

/**
 * evict_should_delete - determine whether the inode is eligible for deletion
 * @inode: The inode to evict
 * @gh: The glock holder structure
 *
 * This function determines whether the evicted inode is eligible to be deleted
 * and locks the inode glock.
 *
 * Note that the early returns before gfs2_glock_nq_init() leave @gh
 * untouched, while later returns leave the inode glock held in @gh.
 *
 * Returns: the fate of the dinode
 */
static enum evict_behavior evict_should_delete(struct inode *inode,
					       struct gfs2_holder *gh)
{
	struct gfs2_inode *ip = GFS2_I(inode);
	struct super_block *sb = inode->i_sb;
	struct gfs2_sbd *sdp = sb->s_fs_info;
	int ret;

	if (inode->i_nlink)
		return EVICT_SHOULD_SKIP_DELETE;

	if (gfs2_holder_initialized(&ip->i_iopen_gh) &&
	    test_bit(GLF_DEFER_DELETE, &ip->i_iopen_gh.gh_gl->gl_flags))
		return EVICT_SHOULD_DEFER_DELETE;

	/* Deletes should never happen under memory pressure anymore. */
	if (WARN_ON_ONCE(current->flags & PF_MEMALLOC))
		return EVICT_SHOULD_DEFER_DELETE;

	/* Must not read inode block until block type has been verified */
	ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_SKIP, gh);
	if (unlikely(ret))
		return EVICT_SHOULD_SKIP_DELETE;

	if (gfs2_inode_already_deleted(ip->i_gl, ip->i_no_formal_ino))
		return EVICT_SHOULD_SKIP_DELETE;
	ret = gfs2_check_blk_type(sdp, ip->i_no_addr, GFS2_BLKST_UNLINKED);
	if (ret)
		return EVICT_SHOULD_SKIP_DELETE;

	ret = gfs2_instantiate(gh);
	if (ret)
		return EVICT_SHOULD_SKIP_DELETE;

	/*
	 * The inode may have been recreated in the meantime.
	 */
	if (inode->i_nlink)
		return EVICT_SHOULD_SKIP_DELETE;

	if (gfs2_holder_initialized(&ip->i_iopen_gh) &&
	    test_bit(HIF_HOLDER, &ip->i_iopen_gh.gh_iflags))
		return gfs2_upgrade_iopen_glock(inode);
	return EVICT_SHOULD_DELETE;
}

/**
 * evict_unlinked_inode - delete the pieces of an unlinked evicted inode
 * @inode: The inode to evict
 * @gh: The glock holder structure
 *
 * Returns: 0 on success, or the error from the first deallocation step
 *          that fails
 */
static int evict_unlinked_inode(struct inode *inode, struct gfs2_holder *gh)
{
	struct gfs2_inode *ip = GFS2_I(inode);
	struct gfs2_glock *gl = ip->i_gl;
	int ret;

	/* The inode glock must be held exclusively and be instantiated. */
	BUG_ON(!gfs2_holder_initialized(gh) ||
	       test_bit(GLF_INSTANTIATE_NEEDED, &gl->gl_flags));

	if (S_ISDIR(inode->i_mode) &&
	    (ip->i_diskflags & GFS2_DIF_EXHASH)) {
		ret = gfs2_dir_exhash_dealloc(ip);
		if (ret)
			goto out;
	}

	if (ip->i_eattr) {
		ret = gfs2_ea_dealloc(ip, true);
		if (ret)
			goto out;
	}

	if (!gfs2_is_stuffed(ip)) {
		ret = gfs2_file_dealloc(ip);
		if (ret)
			goto out;
	}

	/*
	 * As soon as we clear the bitmap for the dinode, gfs2_create_inode()
	 * can get called to recreate it, or even gfs2_inode_lookup() if the
	 * inode was recreated on another node in the meantime.
	 *
	 * However, inserting the new inode into the inode hash table will not
	 * succeed until the old inode is removed, and that only happens after
	 * ->evict_inode() returns.  The new inode is attached to its inode and
	 *  iopen glocks after inserting it into the inode hash table, so at
	 *  that point we can be sure that both glocks are unused.
	 */

	ret = gfs2_dinode_dealloc(ip);
	if (!ret)
		gfs2_inode_remember_delete(gl, ip->i_no_formal_ino);

out:
	return ret;
}

/*
 * gfs2_truncate_inode_pages - truncate the inode's page cache
 *
 * Returns 0, or the gfs2_trans_begin() error when a transaction was needed
 * but could not be started; the pages are truncated in either case.
 */
static int gfs2_truncate_inode_pages(struct inode *inode)
{
	struct gfs2_inode *ip = GFS2_I(inode);
	struct gfs2_sbd *sdp = GFS2_SB(inode);
	struct address_space *mapping = &inode->i_data;
	bool need_trans = gfs2_is_jdata(ip) && mapping->nrpages;
	int ret = 0;

	/*
	 * Truncating a jdata inode address space may create revokes in
	 * truncate_inode_pages() -> gfs2_invalidate_folio() -> ... ->
	 * gfs2_remove_from_journal(), so we need a transaction here.
	 *
	 * During a withdraw, no new transactions can be created.  We still
	 * take the log flush lock to prevent truncate from racing with
	 * gfs2_log_flush().
	 */

	if (need_trans) {
		ret = gfs2_trans_begin(sdp, 0, sdp->sd_jdesc->jd_blocks);
		if (ret)
			down_read(&sdp->sd_log_flush_lock);
	}
	truncate_inode_pages(mapping, 0);
	if (need_trans) {
		if (ret)
			up_read(&sdp->sd_log_flush_lock);
		else
			gfs2_trans_end(sdp);
	}
	return ret;
}

/*
 * gfs2_truncate_inode_pages_final - final page cache truncation
 *
 * For jdata inodes with cached pages, the truncation runs under the log
 * flush lock (read side).
 */
static void gfs2_truncate_inode_pages_final(struct inode *inode)
{
	struct gfs2_inode *ip = GFS2_I(inode);
	struct gfs2_sbd *sdp = GFS2_SB(inode);
	struct address_space *mapping = &inode->i_data;
	bool need_lock = gfs2_is_jdata(ip) && mapping->nrpages;

	if (need_lock)
		down_read(&sdp->sd_log_flush_lock);
	truncate_inode_pages_final(mapping);
	if (need_lock)
		up_read(&sdp->sd_log_flush_lock);
}

/**
 * evict_linked_inode - evict an inode whose dinode has not been unlinked
 * @inode: The inode to evict
 * @gh: The glock holder structure
 *
 * Returns: errno
 */
static int evict_linked_inode(struct inode *inode, struct gfs2_holder *gh)
{
	struct super_block *sb = inode->i_sb;
	struct gfs2_sbd *sdp = sb->s_fs_info;
	struct gfs2_inode *ip = GFS2_I(inode);
	struct gfs2_glock *gl = ip->i_gl;
	struct address_space *metamapping = gfs2_glock2aspace(gl);
	int ret;

	/*
	 * NOTE(review): I_DIRTY is tested against inode->i_flags here;
	 * confirm this is the intended field.
	 */
	if (!(test_bit(GLF_DIRTY, &gl->gl_flags) || inode->i_flags & I_DIRTY))
		goto clean;

	/* The inode glock must be held exclusively and be instantiated. */
	if (!gfs2_holder_initialized(gh))
		ret = gfs2_glock_nq_init(gl, LM_ST_EXCLUSIVE, 0, gh);
	else
		ret = gfs2_instantiate(gh);
	if (ret)
		return ret;

	gfs2_log_flush(sdp, gl, GFS2_LOG_HEAD_FLUSH_NORMAL |
		       GFS2_LFC_EVICT_INODE);
	if (test_bit(GLF_DIRTY, &gl->gl_flags)) {
		filemap_fdatawrite(metamapping);
		filemap_fdatawait(metamapping);
	}
	write_inode_now(inode, 1);
	gfs2_ail_flush(gl, 0);

clean:
	ret = gfs2_truncate_inode_pages(inode);
	truncate_inode_pages(metamapping, 0);
	return ret;
}

/**
 * gfs2_evict_inode - Remove an inode from cache
 * @inode: The inode to evict
 *
 * There are three cases to consider:
 * 1. i_nlink == 0, we are final opener (and must deallocate)
 * 2. i_nlink == 0, we are not the final opener (and cannot deallocate)
 * 3. i_nlink > 0
 *
 * If the fs is read only, then we have to treat all cases as per #3
 * since we are unable to do any deallocation. The inode will be
 * deallocated by the next read/write node to attempt an allocation
 * in the same resource group
 *
 * We have to (at the moment) hold the inodes main lock to cover
 * the gap between unlocking the shared lock on the iopen lock and
 * taking the exclusive lock. I'd rather do a shared -> exclusive
 * conversion on the iopen lock, but we can change that later. This
 * is safe, just less efficient.
 */
static void gfs2_evict_inode(struct inode *inode)
{
	struct super_block *sb = inode->i_sb;
	struct gfs2_sbd *sdp = sb->s_fs_info;
	struct gfs2_inode *ip = GFS2_I(inode);
	struct gfs2_holder gh;
	enum evict_behavior behavior;
	int ret;

	/* Lets the common exit path tell whether the inode glock was taken. */
	gfs2_holder_mark_uninitialized(&gh);
	if (sb_rdonly(sb) || !ip->i_no_addr || !ip->i_gl)
		goto out;

	/*
	 * In case of an incomplete mount, gfs2_evict_inode() may be called for
	 * system files without having an active journal to write to.  In that
	 * case, skip the filesystem evict.
	 */
	if (!sdp->sd_jdesc)
		goto out;

	behavior = evict_should_delete(inode, &gh);
	if (behavior == EVICT_SHOULD_DEFER_DELETE &&
	    !test_bit(SDF_KILL, &sdp->sd_flags)) {
		struct gfs2_glock *io_gl = ip->i_iopen_gh.gh_gl;

		if (io_gl) {
			/* Hold the glock for the queued work; drop it if not queued. */
			gfs2_glock_hold(io_gl);
			if (!gfs2_queue_verify_delete(io_gl, true))
				gfs2_glock_put(io_gl);
			goto out;
		}
		behavior = EVICT_SHOULD_SKIP_DELETE;
	}
	if (behavior == EVICT_SHOULD_DELETE)
		ret = evict_unlinked_inode(inode, &gh);
	else
		ret = evict_linked_inode(inode, &gh);

	if (gfs2_rs_active(&ip->i_res))
		gfs2_rs_deltree(&ip->i_res);

	if (ret && !gfs2_withdrawn(sdp) && ret != -EROFS)
		fs_warn(sdp, "gfs2_evict_inode: %d\n", ret);
out:
	if (gfs2_holder_initialized(&gh))
		gfs2_glock_dq_uninit(&gh);
	gfs2_truncate_inode_pages_final(inode);
	if (ip->i_qadata)
		gfs2_assert_warn(sdp, ip->i_qadata->qa_ref == 0);
	gfs2_rs_deltree(&ip->i_res);
	gfs2_ordered_del_inode(ip);
	clear_inode(inode);
	gfs2_dir_hash_inval(ip);
	if (gfs2_holder_initialized(&ip->i_iopen_gh)) {
		struct gfs2_glock *gl = ip->i_iopen_gh.gh_gl;

		glock_clear_object(gl, ip);
		/* Keep the glock alive across the dequeue, then put it. */
		gfs2_glock_hold(gl);
		ip->i_iopen_gh.gh_flags |= GL_NOCACHE;
		gfs2_glock_dq_uninit(&ip->i_iopen_gh);
		gfs2_glock_put_eventually(gl);
	}
	if (ip->i_gl) {
		glock_clear_object(ip->i_gl, ip);
		wait_on_bit_io(&ip->i_flags, GIF_GLOP_PENDING, TASK_UNINTERRUPTIBLE);
		gfs2_glock_put_eventually(ip->i_gl);
		rcu_assign_pointer(ip->i_gl, NULL);
	}
}

/*
 * gfs2_alloc_inode - allocate a gfs2 inode and reset its per-inode fields
 *
 * Returns the embedded VFS inode, or NULL if allocation fails.
 */
static struct inode *gfs2_alloc_inode(struct super_block *sb)
{
	struct gfs2_inode *ip;

	ip = alloc_inode_sb(sb, gfs2_inode_cachep, GFP_KERNEL);
	if (!ip)
		return NULL;
	ip->i_no_addr = 0;
	ip->i_no_formal_ino = 0;
	ip->i_flags = 0;
	ip->i_gl = NULL;
	gfs2_holder_mark_uninitialized(&ip->i_iopen_gh);
	memset(&ip->i_res, 0, sizeof(ip->i_res));
	RB_CLEAR_NODE(&ip->i_res.rs_node);
	ip->i_diskflags = 0;
	ip->i_rahead = 0;
	return &ip->i_inode;
}

/* Return the gfs2 inode containing @inode to the inode cache. */
static void gfs2_free_inode(struct inode *inode)
{
	kmem_cache_free(gfs2_inode_cachep, GFS2_I(inode));
}

/*
 * free_local_statfs_inodes - release every entry on sd_sc_inodes_list
 *
 * Dereferences sdp->sd_jdesc for each entry, so it relies on sd_jdesc being
 * set whenever the list is non-empty.
 */
void free_local_statfs_inodes(struct gfs2_sbd *sdp)
{
	struct local_statfs_inode *lsi, *safe;

	/* Run through the statfs inodes list to iput and free memory */
	list_for_each_entry_safe(lsi, safe, &sdp->sd_sc_inodes_list, si_list) {
		if (lsi->si_jid == sdp->sd_jdesc->jd_jid)
			sdp->sd_sc_inode = NULL; /* belongs to this node */
		if (lsi->si_sc_inode)
			iput(lsi->si_sc_inode);
		list_del(&lsi->si_list);
		kfree(lsi);
	}
}

/*
 * find_local_statfs_inode - look up a per-node statfs inode by journal id
 *
 * Returns the matching inode, or NULL if no list entry has that id.
 */
struct inode *find_local_statfs_inode(struct gfs2_sbd *sdp,
				      unsigned int index)
{
	struct local_statfs_inode *lsi;

	/* Return the local (per node) statfs inode in the
	 * sdp->sd_sc_inodes_list corresponding to the 'index'. */
	list_for_each_entry(lsi, &sdp->sd_sc_inodes_list, si_list) {
		if (lsi->si_jid == index)
			return lsi->si_sc_inode;
	}
	return NULL;
}

/* Superblock operations table for gfs2. */
const struct super_operations gfs2_super_ops = {
	.alloc_inode		= gfs2_alloc_inode,
	.free_inode		= gfs2_free_inode,
	.write_inode		= gfs2_write_inode,
	.dirty_inode		= gfs2_dirty_inode,
	.evict_inode		= gfs2_evict_inode,
	.put_super		= gfs2_put_super,
	.sync_fs		= gfs2_sync_fs,
	.freeze_super		= gfs2_freeze_super,
	.freeze_fs		= gfs2_freeze_fs,
	.thaw_super		= gfs2_thaw_super,
	.statfs			= gfs2_statfs,
	.drop_inode		= gfs2_drop_inode,
	.show_options		= gfs2_show_options,
};
86 86 86 1 1 11 11 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 // SPDX-License-Identifier: GPL-2.0-or-later /* * Flexible mmap layout support * * Based on code by Ingo Molnar and Andi Kleen, copyrighted * as follows: * * Copyright 2003-2009 Red Hat Inc. * All Rights Reserved. * Copyright 2005 Andi Kleen, SUSE Labs. * Copyright 2007 Jiri Kosina, SUSE Labs. */ #include <linux/personality.h> #include <linux/mm.h> #include <linux/random.h> #include <linux/limits.h> #include <linux/sched/signal.h> #include <linux/sched/mm.h> #include <linux/compat.h> #include <linux/elf-randomize.h> #include <asm/elf.h> #include <asm/io.h> #include "physaddr.h" struct va_alignment __read_mostly va_align = { .flags = -1, }; unsigned long task_size_32bit(void) { return IA32_PAGE_OFFSET; } unsigned long task_size_64bit(int full_addr_space) { return full_addr_space ? 
TASK_SIZE_MAX : DEFAULT_MAP_WINDOW; } static unsigned long stack_maxrandom_size(unsigned long task_size) { unsigned long max = 0; if (current->flags & PF_RANDOMIZE) { max = (-1UL) & __STACK_RND_MASK(task_size == task_size_32bit()); max <<= PAGE_SHIFT; } return max; } #ifdef CONFIG_COMPAT # define mmap32_rnd_bits mmap_rnd_compat_bits # define mmap64_rnd_bits mmap_rnd_bits #else # define mmap32_rnd_bits mmap_rnd_bits # define mmap64_rnd_bits mmap_rnd_bits #endif #define SIZE_128M (128 * 1024 * 1024UL) static int mmap_is_legacy(void) { if (current->personality & ADDR_COMPAT_LAYOUT) return 1; return sysctl_legacy_va_layout; } static unsigned long arch_rnd(unsigned int rndbits) { if (!(current->flags & PF_RANDOMIZE)) return 0; return (get_random_long() & ((1UL << rndbits) - 1)) << PAGE_SHIFT; } unsigned long arch_mmap_rnd(void) { return arch_rnd(mmap_is_ia32() ? mmap32_rnd_bits : mmap64_rnd_bits); } static unsigned long mmap_base(unsigned long rnd, unsigned long task_size, const struct rlimit *rlim_stack) { unsigned long gap = rlim_stack->rlim_cur; unsigned long pad = stack_maxrandom_size(task_size) + stack_guard_gap; /* Values close to RLIM_INFINITY can overflow. */ if (gap + pad > gap) gap += pad; /* * Top of mmap area (just below the process stack). * Leave an at least ~128 MB hole with possible stack randomization. 
*/ gap = clamp(gap, SIZE_128M, (task_size / 6) * 5); return PAGE_ALIGN(task_size - gap - rnd); } static unsigned long mmap_legacy_base(unsigned long rnd, unsigned long task_size) { return __TASK_UNMAPPED_BASE(task_size) + rnd; } /* * This function, called very early during the creation of a new * process VM image, sets up which VM layout function to use: */ static void arch_pick_mmap_base(unsigned long *base, unsigned long *legacy_base, unsigned long random_factor, unsigned long task_size, const struct rlimit *rlim_stack) { *legacy_base = mmap_legacy_base(random_factor, task_size); if (mmap_is_legacy()) *base = *legacy_base; else *base = mmap_base(random_factor, task_size, rlim_stack); } void arch_pick_mmap_layout(struct mm_struct *mm, const struct rlimit *rlim_stack) { if (mmap_is_legacy()) mm_flags_clear(MMF_TOPDOWN, mm); else mm_flags_set(MMF_TOPDOWN, mm); arch_pick_mmap_base(&mm->mmap_base, &mm->mmap_legacy_base, arch_rnd(mmap64_rnd_bits), task_size_64bit(0), rlim_stack); #ifdef CONFIG_HAVE_ARCH_COMPAT_MMAP_BASES /* * The mmap syscall mapping base decision depends solely on the * syscall type (64-bit or compat). This applies for 64bit * applications and 32bit applications. The 64bit syscall uses * mmap_base, the compat syscall uses mmap_compat_base. */ arch_pick_mmap_base(&mm->mmap_compat_base, &mm->mmap_compat_legacy_base, arch_rnd(mmap32_rnd_bits), task_size_32bit(), rlim_stack); #endif } unsigned long get_mmap_base(int is_legacy) { struct mm_struct *mm = current->mm; #ifdef CONFIG_HAVE_ARCH_COMPAT_MMAP_BASES if (in_32bit_syscall()) { return is_legacy ? mm->mmap_compat_legacy_base : mm->mmap_compat_base; } #endif return is_legacy ? mm->mmap_legacy_base : mm->mmap_base; } /** * mmap_address_hint_valid - Validate the address hint of mmap * @addr: Address hint * @len: Mapping length * * Check whether @addr and @addr + @len result in a valid mapping. * * On 32bit this only checks whether @addr + @len is <= TASK_SIZE. 
* * On 64bit with 5-level page tables another sanity check is required * because mappings requested by mmap(@addr, 0) which cross the 47-bit * virtual address boundary can cause the following theoretical issue: * * An application calls mmap(addr, 0), i.e. without MAP_FIXED, where @addr * is below the border of the 47-bit address space and @addr + @len is * above the border. * * With 4-level paging this request succeeds, but the resulting mapping * address will always be within the 47-bit virtual address space, because * the hint address does not result in a valid mapping and is * ignored. Hence applications which are not prepared to handle virtual * addresses above 47-bit work correctly. * * With 5-level paging this request would be granted and result in a * mapping which crosses the border of the 47-bit virtual address * space. If the application cannot handle addresses above 47-bit this * will lead to misbehaviour and hard to diagnose failures. * * Therefore ignore address hints which would result in a mapping crossing * the 47-bit virtual address boundary. * * Note, that in the same scenario with MAP_FIXED the behaviour is * different. The request with @addr < 47-bit and @addr + @len > 47-bit * fails on a 4-level paging machine but succeeds on a 5-level paging * machine. It is reasonable to expect that an application does not rely on * the failure of such a fixed mapping request, so the restriction is not * applied. */ bool mmap_address_hint_valid(unsigned long addr, unsigned long len) { if (TASK_SIZE - len < addr) return false; return (addr > DEFAULT_MAP_WINDOW) == (addr + len > DEFAULT_MAP_WINDOW); } /* Can we access it for direct reading/writing? Must be RAM: */ int valid_phys_addr_range(phys_addr_t addr, size_t count) { return addr + count - 1 <= __pa(high_memory - 1); } /* Can we access it through mmap? 
Must be a valid physical address: */ int valid_mmap_phys_addr_range(unsigned long pfn, size_t count) { phys_addr_t addr = (phys_addr_t)pfn << PAGE_SHIFT; return phys_addr_valid(addr + count - 1); } /* * Only allow root to set high MMIO mappings to PROT_NONE. * This prevents an unpriv. user to set them to PROT_NONE and invert * them, then pointing to valid memory for L1TF speculation. * * Note: for locked down kernels may want to disable the root override. */ bool pfn_modify_allowed(unsigned long pfn, pgprot_t prot) { if (!boot_cpu_has_bug(X86_BUG_L1TF)) return true; if (!__pte_needs_invert(pgprot_val(prot))) return true; /* If it's real memory always allow */ if (pfn_valid(pfn)) return true; if (pfn >= l1tf_pfn_limit() && !capable(CAP_SYS_ADMIN)) return false; return true; }
253 1593 7176 1049 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 /* SPDX-License-Identifier: GPL-2.0 */ /* * This file provides wrappers with sanitizer instrumentation for bit * locking operations. * * To use this functionality, an arch's bitops.h file needs to define each of * the below bit operations with an arch_ prefix (e.g. arch_set_bit(), * arch___set_bit(), etc.). */ #ifndef _ASM_GENERIC_BITOPS_INSTRUMENTED_LOCK_H #define _ASM_GENERIC_BITOPS_INSTRUMENTED_LOCK_H #include <linux/instrumented.h> /** * clear_bit_unlock - Clear a bit in memory, for unlock * @nr: the bit to set * @addr: the address to start counting from * * This operation is atomic and provides release barrier semantics. */ static inline void clear_bit_unlock(long nr, volatile unsigned long *addr) { kcsan_release(); instrument_atomic_write(addr + BIT_WORD(nr), sizeof(long)); arch_clear_bit_unlock(nr, addr); } /** * __clear_bit_unlock - Clears a bit in memory * @nr: Bit to clear * @addr: Address to start counting from * * This is a non-atomic operation but implies a release barrier before the * memory operation. It can be used for an unlock if no other CPUs can * concurrently modify other bits in the word. */ static inline void __clear_bit_unlock(long nr, volatile unsigned long *addr) { kcsan_release(); instrument_write(addr + BIT_WORD(nr), sizeof(long)); arch___clear_bit_unlock(nr, addr); } /** * test_and_set_bit_lock - Set a bit and return its old value, for lock * @nr: Bit to set * @addr: Address to count from * * This operation is atomic and provides acquire barrier semantics if * the returned value is 0. * It can be used to implement bit locks. 
*/ static inline bool test_and_set_bit_lock(long nr, volatile unsigned long *addr) { instrument_atomic_read_write(addr + BIT_WORD(nr), sizeof(long)); return arch_test_and_set_bit_lock(nr, addr); } /** * xor_unlock_is_negative_byte - XOR a single byte in memory and test if * it is negative, for unlock. * @mask: Change the bits which are set in this mask. * @addr: The address of the word containing the byte to change. * * Changes some of bits 0-6 in the word pointed to by @addr. * This operation is atomic and provides release barrier semantics. * Used to optimise some folio operations which are commonly paired * with an unlock or end of writeback. Bit 7 is used as PG_waiters to * indicate whether anybody is waiting for the unlock. * * Return: Whether the top bit of the byte is set. */ static inline bool xor_unlock_is_negative_byte(unsigned long mask, volatile unsigned long *addr) { kcsan_release(); instrument_atomic_write(addr, sizeof(long)); return arch_xor_unlock_is_negative_byte(mask, addr); } #endif /* _ASM_GENERIC_BITOPS_INSTRUMENTED_LOCK_H */
868 49 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_TASK_WORK_H #define _LINUX_TASK_WORK_H #include <linux/list.h> #include <linux/sched.h> typedef void (*task_work_func_t)(struct callback_head *); static inline void init_task_work(struct callback_head *twork, task_work_func_t func) { twork->func = func; } enum task_work_notify_mode { TWA_NONE = 0, TWA_RESUME, TWA_SIGNAL, TWA_SIGNAL_NO_IPI, TWA_NMI_CURRENT, }; static inline bool task_work_pending(struct task_struct *task) { return READ_ONCE(task->task_works); } int task_work_add(struct task_struct *task, struct callback_head *twork, enum task_work_notify_mode mode); struct callback_head *task_work_cancel_match(struct task_struct *task, bool (*match)(struct callback_head *, void *data), void *data); struct callback_head *task_work_cancel_func(struct task_struct *, task_work_func_t); bool task_work_cancel(struct task_struct *task, struct callback_head *cb); void task_work_run(void); static inline void exit_task_work(struct task_struct *task) { task_work_run(); } #endif /* _LINUX_TASK_WORK_H */
4 4 2 2 4 2 1 1 1 1 161 773 160 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 // SPDX-License-Identifier: GPL-2.0-only #include <linux/etherdevice.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/init.h> #include <linux/netlink.h> #include <linux/netfilter.h> #include <linux/workqueue.h> #include <linux/spinlock.h> #include <linux/netfilter/nf_conntrack_common.h> #include <linux/netfilter/nf_tables.h> #include <net/ip.h> #include <net/flow.h> #include <net/netfilter/nf_tables.h> #include <net/netfilter/nf_tables_core.h> #include <net/netfilter/nf_conntrack_core.h> #include <net/netfilter/nf_conntrack_extend.h> #include <net/netfilter/nf_flow_table.h> struct nft_flow_offload { struct nft_flowtable *flowtable; }; static bool nft_flow_offload_skip(struct sk_buff *skb, int family) { if (skb_sec_path(skb)) return true; if (family == NFPROTO_IPV4) { const struct ip_options *opt; opt = &(IPCB(skb)->opt); if (unlikely(opt->optlen)) return true; } return false; } static void 
flow_offload_ct_tcp(struct nf_conn *ct) { /* conntrack will not see all packets, disable tcp window validation. */ spin_lock_bh(&ct->lock); ct->proto.tcp.seen[0].flags |= IP_CT_TCP_FLAG_BE_LIBERAL; ct->proto.tcp.seen[1].flags |= IP_CT_TCP_FLAG_BE_LIBERAL; spin_unlock_bh(&ct->lock); } static void nft_flow_offload_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) { struct nft_flow_offload *priv = nft_expr_priv(expr); struct nf_flowtable *flowtable = &priv->flowtable->data; struct tcphdr _tcph, *tcph = NULL; struct nf_flow_route route = {}; enum ip_conntrack_info ctinfo; struct flow_offload *flow; enum ip_conntrack_dir dir; struct nf_conn *ct; int ret; if (nft_flow_offload_skip(pkt->skb, nft_pf(pkt))) goto out; ct = nf_ct_get(pkt->skb, &ctinfo); if (!ct) goto out; switch (ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum) { case IPPROTO_TCP: tcph = skb_header_pointer(pkt->skb, nft_thoff(pkt), sizeof(_tcph), &_tcph); if (unlikely(!tcph || tcph->fin || tcph->rst || !nf_conntrack_tcp_established(ct))) goto out; break; case IPPROTO_UDP: break; #ifdef CONFIG_NF_CT_PROTO_GRE case IPPROTO_GRE: { struct nf_conntrack_tuple *tuple; if (ct->status & IPS_NAT_MASK) goto out; tuple = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple; /* No support for GRE v1 */ if (tuple->src.u.gre.key || tuple->dst.u.gre.key) goto out; break; } #endif default: goto out; } if (nf_ct_ext_exist(ct, NF_CT_EXT_HELPER) || ct->status & (IPS_SEQ_ADJUST | IPS_NAT_CLASH)) goto out; if (!nf_ct_is_confirmed(ct)) goto out; if (test_and_set_bit(IPS_OFFLOAD_BIT, &ct->status)) goto out; dir = CTINFO2DIR(ctinfo); if (nft_flow_route(pkt, ct, &route, dir, priv->flowtable) < 0) goto err_flow_route; flow = flow_offload_alloc(ct); if (!flow) goto err_flow_alloc; flow_offload_route_init(flow, &route); if (tcph) flow_offload_ct_tcp(ct); __set_bit(NF_FLOW_HW_BIDIRECTIONAL, &flow->flags); ret = flow_offload_add(flowtable, flow); if (ret < 0) goto err_flow_add; return; err_flow_add: 
flow_offload_free(flow); err_flow_alloc: dst_release(route.tuple[dir].dst); dst_release(route.tuple[!dir].dst); err_flow_route: clear_bit(IPS_OFFLOAD_BIT, &ct->status); out: regs->verdict.code = NFT_BREAK; } static int nft_flow_offload_validate(const struct nft_ctx *ctx, const struct nft_expr *expr) { unsigned int hook_mask = (1 << NF_INET_FORWARD); if (ctx->family != NFPROTO_IPV4 && ctx->family != NFPROTO_IPV6 && ctx->family != NFPROTO_INET) return -EOPNOTSUPP; return nft_chain_validate_hooks(ctx->chain, hook_mask); } static const struct nla_policy nft_flow_offload_policy[NFTA_FLOW_MAX + 1] = { [NFTA_FLOW_TABLE_NAME] = { .type = NLA_STRING, .len = NFT_NAME_MAXLEN - 1 }, }; static int nft_flow_offload_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) { struct nft_flow_offload *priv = nft_expr_priv(expr); u8 genmask = nft_genmask_next(ctx->net); struct nft_flowtable *flowtable; if (!tb[NFTA_FLOW_TABLE_NAME]) return -EINVAL; flowtable = nft_flowtable_lookup(ctx->net, ctx->table, tb[NFTA_FLOW_TABLE_NAME], genmask); if (IS_ERR(flowtable)) return PTR_ERR(flowtable); if (!nft_use_inc(&flowtable->use)) return -EMFILE; priv->flowtable = flowtable; return nf_ct_netns_get(ctx->net, ctx->family); } static void nft_flow_offload_deactivate(const struct nft_ctx *ctx, const struct nft_expr *expr, enum nft_trans_phase phase) { struct nft_flow_offload *priv = nft_expr_priv(expr); nf_tables_deactivate_flowtable(ctx, priv->flowtable, phase); } static void nft_flow_offload_activate(const struct nft_ctx *ctx, const struct nft_expr *expr) { struct nft_flow_offload *priv = nft_expr_priv(expr); nft_use_inc_restore(&priv->flowtable->use); } static void nft_flow_offload_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr) { nf_ct_netns_put(ctx->net, ctx->family); } static int nft_flow_offload_dump(struct sk_buff *skb, const struct nft_expr *expr, bool reset) { struct nft_flow_offload *priv = nft_expr_priv(expr); if (nla_put_string(skb, 
NFTA_FLOW_TABLE_NAME, priv->flowtable->name)) goto nla_put_failure; return 0; nla_put_failure: return -1; } static struct nft_expr_type nft_flow_offload_type; static const struct nft_expr_ops nft_flow_offload_ops = { .type = &nft_flow_offload_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_flow_offload)), .eval = nft_flow_offload_eval, .init = nft_flow_offload_init, .activate = nft_flow_offload_activate, .deactivate = nft_flow_offload_deactivate, .destroy = nft_flow_offload_destroy, .validate = nft_flow_offload_validate, .dump = nft_flow_offload_dump, }; static struct nft_expr_type nft_flow_offload_type __read_mostly = { .name = "flow_offload", .ops = &nft_flow_offload_ops, .policy = nft_flow_offload_policy, .maxattr = NFTA_FLOW_MAX, .owner = THIS_MODULE, }; static int flow_offload_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); if (event != NETDEV_DOWN) return NOTIFY_DONE; nf_flow_table_cleanup(dev); return NOTIFY_DONE; } static struct notifier_block flow_offload_netdev_notifier = { .notifier_call = flow_offload_netdev_event, }; static int __init nft_flow_offload_module_init(void) { int err; err = register_netdevice_notifier(&flow_offload_netdev_notifier); if (err) goto err; err = nft_register_expr(&nft_flow_offload_type); if (err < 0) goto register_expr; return 0; register_expr: unregister_netdevice_notifier(&flow_offload_netdev_notifier); err: return err; } static void __exit nft_flow_offload_module_exit(void) { nft_unregister_expr(&nft_flow_offload_type); unregister_netdevice_notifier(&flow_offload_netdev_notifier); } module_init(nft_flow_offload_module_init); module_exit(nft_flow_offload_module_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>"); MODULE_ALIAS_NFT_EXPR("flow_offload"); MODULE_DESCRIPTION("nftables hardware flow offload module");
3 3 3 3 3 3 2 3 3 3 3 3 3 3 1 1 1 1 1 3 2 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 
516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 // SPDX-License-Identifier: GPL-2.0-only /* Copyright (c) 2016 Thomas Graf <tgraf@tgraf.ch> */ #include <linux/filter.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/skbuff.h> #include <linux/types.h> #include <linux/bpf.h> #include <net/flow.h> #include <net/lwtunnel.h> #include <net/gre.h> #include <net/ip.h> #include <net/ip6_route.h> struct bpf_lwt_prog { struct bpf_prog *prog; char *name; }; struct bpf_lwt { struct bpf_lwt_prog in; struct bpf_lwt_prog out; struct bpf_lwt_prog xmit; int family; }; #define MAX_PROG_NAME 256 static inline struct bpf_lwt *bpf_lwt_lwtunnel(struct lwtunnel_state *lwt) { return (struct bpf_lwt *)lwt->data; } #define NO_REDIRECT false #define CAN_REDIRECT true static int run_lwt_bpf(struct sk_buff *skb, struct bpf_lwt_prog *lwt, struct dst_entry *dst, bool can_redirect) { struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx; int ret; /* Disabling BH is needed to protect per-CPU bpf_redirect_info between * BPF prog and skb_do_redirect(). */ local_bh_disable(); bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx); bpf_compute_data_pointers(skb); ret = bpf_prog_run_save_cb(lwt->prog, skb); switch (ret) { case BPF_OK: case BPF_LWT_REROUTE: break; case BPF_REDIRECT: if (unlikely(!can_redirect)) { pr_warn_once("Illegal redirect return code in prog %s\n", lwt->name ? 
: "<unknown>"); ret = BPF_OK; } else { skb_reset_mac_header(skb); skb_do_redirect(skb); ret = BPF_REDIRECT; } break; case BPF_DROP: kfree_skb(skb); ret = -EPERM; break; default: pr_warn_once("bpf-lwt: Illegal return value %u, expect packet loss\n", ret); kfree_skb(skb); ret = -EINVAL; break; } bpf_net_ctx_clear(bpf_net_ctx); local_bh_enable(); return ret; } static int bpf_lwt_input_reroute(struct sk_buff *skb) { enum skb_drop_reason reason; int err = -EINVAL; if (skb->protocol == htons(ETH_P_IP)) { struct net_device *dev = skb_dst(skb)->dev; const struct iphdr *iph = ip_hdr(skb); dev_hold(dev); skb_dst_drop(skb); reason = ip_route_input_noref(skb, iph->daddr, iph->saddr, ip4h_dscp(iph), dev); err = reason ? -EINVAL : 0; dev_put(dev); } else if (skb->protocol == htons(ETH_P_IPV6)) { skb_dst_drop(skb); if (IS_ENABLED(CONFIG_IPV6)) { ip6_route_input(skb); err = skb_dst(skb)->error; } else { err = -EAFNOSUPPORT; } } else { err = -EAFNOSUPPORT; } if (err) goto err; return dst_input(skb); err: kfree_skb(skb); return err; } static int bpf_input(struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); struct bpf_lwt *bpf; int ret; bpf = bpf_lwt_lwtunnel(dst->lwtstate); if (bpf->in.prog) { ret = run_lwt_bpf(skb, &bpf->in, dst, NO_REDIRECT); if (ret < 0) return ret; if (ret == BPF_LWT_REROUTE) return bpf_lwt_input_reroute(skb); } if (unlikely(!dst->lwtstate->orig_input)) { kfree_skb(skb); return -EINVAL; } return dst->lwtstate->orig_input(skb); } static int bpf_output(struct net *net, struct sock *sk, struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); struct bpf_lwt *bpf; int ret; bpf = bpf_lwt_lwtunnel(dst->lwtstate); if (bpf->out.prog) { ret = run_lwt_bpf(skb, &bpf->out, dst, NO_REDIRECT); if (ret < 0) return ret; } if (unlikely(!dst->lwtstate->orig_output)) { pr_warn_once("orig_output not set on dst for prog %s\n", bpf->out.name); kfree_skb(skb); return -EINVAL; } return dst->lwtstate->orig_output(net, sk, skb); } static int xmit_check_hhlen(struct sk_buff 
*skb, int hh_len) { if (skb_headroom(skb) < hh_len) { int nhead = HH_DATA_ALIGN(hh_len - skb_headroom(skb)); if (pskb_expand_head(skb, nhead, 0, GFP_ATOMIC)) return -ENOMEM; } return 0; } static int bpf_lwt_xmit_reroute(struct sk_buff *skb) { struct net_device *l3mdev = l3mdev_master_dev_rcu(skb_dst(skb)->dev); int oif = l3mdev ? l3mdev->ifindex : 0; struct dst_entry *dst = NULL; int err = -EAFNOSUPPORT; struct sock *sk; struct net *net; bool ipv4; if (skb->protocol == htons(ETH_P_IP)) ipv4 = true; else if (skb->protocol == htons(ETH_P_IPV6)) ipv4 = false; else goto err; sk = sk_to_full_sk(skb->sk); if (sk) { if (sk->sk_bound_dev_if) oif = sk->sk_bound_dev_if; net = sock_net(sk); } else { net = dev_net(skb_dst(skb)->dev); } if (ipv4) { struct iphdr *iph = ip_hdr(skb); struct flowi4 fl4 = {}; struct rtable *rt; fl4.flowi4_oif = oif; fl4.flowi4_mark = skb->mark; fl4.flowi4_uid = sock_net_uid(net, sk); fl4.flowi4_dscp = ip4h_dscp(iph); fl4.flowi4_flags = FLOWI_FLAG_ANYSRC; fl4.flowi4_proto = iph->protocol; fl4.daddr = iph->daddr; fl4.saddr = iph->saddr; rt = ip_route_output_key(net, &fl4); if (IS_ERR(rt)) { err = PTR_ERR(rt); goto err; } dst = &rt->dst; } else { struct ipv6hdr *iph6 = ipv6_hdr(skb); struct flowi6 fl6 = {}; fl6.flowi6_oif = oif; fl6.flowi6_mark = skb->mark; fl6.flowi6_uid = sock_net_uid(net, sk); fl6.flowlabel = ip6_flowinfo(iph6); fl6.flowi6_proto = iph6->nexthdr; fl6.daddr = iph6->daddr; fl6.saddr = iph6->saddr; dst = ip6_dst_lookup_flow(net, skb->sk, &fl6, NULL); if (IS_ERR(dst)) { err = PTR_ERR(dst); goto err; } } if (unlikely(dst->error)) { err = dst->error; dst_release(dst); goto err; } /* Although skb header was reserved in bpf_lwt_push_ip_encap(), it * was done for the previous dst, so we are doing it here again, in * case the new dst needs much more space. The call below is a noop * if there is enough header space in skb. 
*/ err = skb_cow_head(skb, LL_RESERVED_SPACE(dst->dev)); if (unlikely(err)) goto err; skb_dst_drop(skb); skb_dst_set(skb, dst); err = dst_output(dev_net(skb_dst(skb)->dev), skb->sk, skb); if (unlikely(err)) return net_xmit_errno(err); /* ip[6]_finish_output2 understand LWTUNNEL_XMIT_DONE */ return LWTUNNEL_XMIT_DONE; err: kfree_skb(skb); return err; } static int bpf_xmit(struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); struct bpf_lwt *bpf; bpf = bpf_lwt_lwtunnel(dst->lwtstate); if (bpf->xmit.prog) { int hh_len = dst->dev->hard_header_len; __be16 proto = skb->protocol; int ret; ret = run_lwt_bpf(skb, &bpf->xmit, dst, CAN_REDIRECT); switch (ret) { case BPF_OK: /* If the header changed, e.g. via bpf_lwt_push_encap, * BPF_LWT_REROUTE below should have been used if the * protocol was also changed. */ if (skb->protocol != proto) { kfree_skb(skb); return -EINVAL; } /* If the header was expanded, headroom might be too * small for L2 header to come, expand as needed. */ ret = xmit_check_hhlen(skb, hh_len); if (unlikely(ret)) return ret; return LWTUNNEL_XMIT_CONTINUE; case BPF_REDIRECT: return LWTUNNEL_XMIT_DONE; case BPF_LWT_REROUTE: return bpf_lwt_xmit_reroute(skb); default: return ret; } } return LWTUNNEL_XMIT_CONTINUE; } static void bpf_lwt_prog_destroy(struct bpf_lwt_prog *prog) { if (prog->prog) bpf_prog_put(prog->prog); kfree(prog->name); } static void bpf_destroy_state(struct lwtunnel_state *lwt) { struct bpf_lwt *bpf = bpf_lwt_lwtunnel(lwt); bpf_lwt_prog_destroy(&bpf->in); bpf_lwt_prog_destroy(&bpf->out); bpf_lwt_prog_destroy(&bpf->xmit); } static const struct nla_policy bpf_prog_policy[LWT_BPF_PROG_MAX + 1] = { [LWT_BPF_PROG_FD] = { .type = NLA_U32, }, [LWT_BPF_PROG_NAME] = { .type = NLA_NUL_STRING, .len = MAX_PROG_NAME }, }; static int bpf_parse_prog(struct nlattr *attr, struct bpf_lwt_prog *prog, enum bpf_prog_type type) { struct nlattr *tb[LWT_BPF_PROG_MAX + 1]; struct bpf_prog *p; int ret; u32 fd; ret = nla_parse_nested_deprecated(tb, 
LWT_BPF_PROG_MAX, attr, bpf_prog_policy, NULL); if (ret < 0) return ret; if (!tb[LWT_BPF_PROG_FD] || !tb[LWT_BPF_PROG_NAME]) return -EINVAL; prog->name = nla_memdup(tb[LWT_BPF_PROG_NAME], GFP_ATOMIC); if (!prog->name) return -ENOMEM; fd = nla_get_u32(tb[LWT_BPF_PROG_FD]); p = bpf_prog_get_type(fd, type); if (IS_ERR(p)) return PTR_ERR(p); prog->prog = p; return 0; } static const struct nla_policy bpf_nl_policy[LWT_BPF_MAX + 1] = { [LWT_BPF_IN] = { .type = NLA_NESTED, }, [LWT_BPF_OUT] = { .type = NLA_NESTED, }, [LWT_BPF_XMIT] = { .type = NLA_NESTED, }, [LWT_BPF_XMIT_HEADROOM] = { .type = NLA_U32 }, }; static int bpf_build_state(struct net *net, struct nlattr *nla, unsigned int family, const void *cfg, struct lwtunnel_state **ts, struct netlink_ext_ack *extack) { struct nlattr *tb[LWT_BPF_MAX + 1]; struct lwtunnel_state *newts; struct bpf_lwt *bpf; int ret; if (family != AF_INET && family != AF_INET6) return -EAFNOSUPPORT; ret = nla_parse_nested_deprecated(tb, LWT_BPF_MAX, nla, bpf_nl_policy, extack); if (ret < 0) return ret; if (!tb[LWT_BPF_IN] && !tb[LWT_BPF_OUT] && !tb[LWT_BPF_XMIT]) return -EINVAL; newts = lwtunnel_state_alloc(sizeof(*bpf)); if (!newts) return -ENOMEM; newts->type = LWTUNNEL_ENCAP_BPF; bpf = bpf_lwt_lwtunnel(newts); if (tb[LWT_BPF_IN]) { newts->flags |= LWTUNNEL_STATE_INPUT_REDIRECT; ret = bpf_parse_prog(tb[LWT_BPF_IN], &bpf->in, BPF_PROG_TYPE_LWT_IN); if (ret < 0) goto errout; } if (tb[LWT_BPF_OUT]) { newts->flags |= LWTUNNEL_STATE_OUTPUT_REDIRECT; ret = bpf_parse_prog(tb[LWT_BPF_OUT], &bpf->out, BPF_PROG_TYPE_LWT_OUT); if (ret < 0) goto errout; } if (tb[LWT_BPF_XMIT]) { newts->flags |= LWTUNNEL_STATE_XMIT_REDIRECT; ret = bpf_parse_prog(tb[LWT_BPF_XMIT], &bpf->xmit, BPF_PROG_TYPE_LWT_XMIT); if (ret < 0) goto errout; } if (tb[LWT_BPF_XMIT_HEADROOM]) { u32 headroom = nla_get_u32(tb[LWT_BPF_XMIT_HEADROOM]); if (headroom > LWT_BPF_MAX_HEADROOM) { ret = -ERANGE; goto errout; } newts->headroom = headroom; } bpf->family = family; *ts = newts; return 0; 
errout: bpf_destroy_state(newts); kfree(newts); return ret; } static int bpf_fill_lwt_prog(struct sk_buff *skb, int attr, struct bpf_lwt_prog *prog) { struct nlattr *nest; if (!prog->prog) return 0; nest = nla_nest_start_noflag(skb, attr); if (!nest) return -EMSGSIZE; if (prog->name && nla_put_string(skb, LWT_BPF_PROG_NAME, prog->name)) return -EMSGSIZE; return nla_nest_end(skb, nest); } static int bpf_fill_encap_info(struct sk_buff *skb, struct lwtunnel_state *lwt) { struct bpf_lwt *bpf = bpf_lwt_lwtunnel(lwt); if (bpf_fill_lwt_prog(skb, LWT_BPF_IN, &bpf->in) < 0 || bpf_fill_lwt_prog(skb, LWT_BPF_OUT, &bpf->out) < 0 || bpf_fill_lwt_prog(skb, LWT_BPF_XMIT, &bpf->xmit) < 0) return -EMSGSIZE; return 0; } static int bpf_encap_nlsize(struct lwtunnel_state *lwtstate) { int nest_len = nla_total_size(sizeof(struct nlattr)) + nla_total_size(MAX_PROG_NAME) + /* LWT_BPF_PROG_NAME */ 0; return nest_len + /* LWT_BPF_IN */ nest_len + /* LWT_BPF_OUT */ nest_len + /* LWT_BPF_XMIT */ 0; } static int bpf_lwt_prog_cmp(struct bpf_lwt_prog *a, struct bpf_lwt_prog *b) { /* FIXME: * The LWT state is currently rebuilt for delete requests which * results in a new bpf_prog instance. Comparing names for now. 
*/ if (!a->name && !b->name) return 0; if (!a->name || !b->name) return 1; return strcmp(a->name, b->name); } static int bpf_encap_cmp(struct lwtunnel_state *a, struct lwtunnel_state *b) { struct bpf_lwt *a_bpf = bpf_lwt_lwtunnel(a); struct bpf_lwt *b_bpf = bpf_lwt_lwtunnel(b); return bpf_lwt_prog_cmp(&a_bpf->in, &b_bpf->in) || bpf_lwt_prog_cmp(&a_bpf->out, &b_bpf->out) || bpf_lwt_prog_cmp(&a_bpf->xmit, &b_bpf->xmit); } static const struct lwtunnel_encap_ops bpf_encap_ops = { .build_state = bpf_build_state, .destroy_state = bpf_destroy_state, .input = bpf_input, .output = bpf_output, .xmit = bpf_xmit, .fill_encap = bpf_fill_encap_info, .get_encap_size = bpf_encap_nlsize, .cmp_encap = bpf_encap_cmp, .owner = THIS_MODULE, }; static int handle_gso_type(struct sk_buff *skb, unsigned int gso_type, int encap_len) { struct skb_shared_info *shinfo = skb_shinfo(skb); gso_type |= SKB_GSO_DODGY; shinfo->gso_type |= gso_type; skb_decrease_gso_size(shinfo, encap_len); shinfo->gso_segs = 0; return 0; } static int handle_gso_encap(struct sk_buff *skb, bool ipv4, int encap_len) { int next_hdr_offset; void *next_hdr; __u8 protocol; /* SCTP and UDP_L4 gso need more nuanced handling than what * handle_gso_type() does above: skb_decrease_gso_size() is not enough. * So at the moment only TCP GSO packets are let through. 
*/ if (!(skb_shinfo(skb)->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) return -ENOTSUPP; if (ipv4) { protocol = ip_hdr(skb)->protocol; next_hdr_offset = sizeof(struct iphdr); next_hdr = skb_network_header(skb) + next_hdr_offset; } else { protocol = ipv6_hdr(skb)->nexthdr; next_hdr_offset = sizeof(struct ipv6hdr); next_hdr = skb_network_header(skb) + next_hdr_offset; } switch (protocol) { case IPPROTO_GRE: next_hdr_offset += sizeof(struct gre_base_hdr); if (next_hdr_offset > encap_len) return -EINVAL; if (((struct gre_base_hdr *)next_hdr)->flags & GRE_CSUM) return handle_gso_type(skb, SKB_GSO_GRE_CSUM, encap_len); return handle_gso_type(skb, SKB_GSO_GRE, encap_len); case IPPROTO_UDP: next_hdr_offset += sizeof(struct udphdr); if (next_hdr_offset > encap_len) return -EINVAL; if (((struct udphdr *)next_hdr)->check) return handle_gso_type(skb, SKB_GSO_UDP_TUNNEL_CSUM, encap_len); return handle_gso_type(skb, SKB_GSO_UDP_TUNNEL, encap_len); case IPPROTO_IP: case IPPROTO_IPV6: if (ipv4) return handle_gso_type(skb, SKB_GSO_IPXIP4, encap_len); else return handle_gso_type(skb, SKB_GSO_IPXIP6, encap_len); default: return -EPROTONOSUPPORT; } } int bpf_lwt_push_ip_encap(struct sk_buff *skb, void *hdr, u32 len, bool ingress) { struct iphdr *iph; bool ipv4; int err; if (unlikely(len < sizeof(struct iphdr) || len > LWT_BPF_MAX_HEADROOM)) return -EINVAL; /* validate protocol and length */ iph = (struct iphdr *)hdr; if (iph->version == 4) { ipv4 = true; if (unlikely(len < iph->ihl * 4)) return -EINVAL; } else if (iph->version == 6) { ipv4 = false; if (unlikely(len < sizeof(struct ipv6hdr))) return -EINVAL; } else { return -EINVAL; } if (ingress) err = skb_cow_head(skb, len + skb->mac_len); else err = skb_cow_head(skb, len + LL_RESERVED_SPACE(skb_dst(skb)->dev)); if (unlikely(err)) return err; /* push the encap headers and fix pointers */ skb_reset_inner_headers(skb); skb_reset_inner_mac_header(skb); /* mac header is not yet set */ skb_set_inner_protocol(skb, skb->protocol); 
skb->encapsulation = 1; skb_push(skb, len); if (ingress) skb_postpush_rcsum(skb, iph, len); skb_reset_network_header(skb); memcpy(skb_network_header(skb), hdr, len); bpf_compute_data_pointers(skb); skb_clear_hash(skb); if (ipv4) { skb->protocol = htons(ETH_P_IP); iph = ip_hdr(skb); if (!iph->check) iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl); } else { skb->protocol = htons(ETH_P_IPV6); } if (skb_is_gso(skb)) return handle_gso_encap(skb, ipv4, len); return 0; } static int __init bpf_lwt_init(void) { return lwtunnel_encap_add_ops(&bpf_encap_ops, LWTUNNEL_ENCAP_BPF); } subsys_initcall(bpf_lwt_init)
35 35 35 35 42 44 45 45 45 45 45 45 44 40 39 39 38 40 38 40 44 45 45 44 44 45 1 1 44 44 44 44 23 44 44 44 44 1 1 44 44 44 44 43 44 44 3 44 44 44 43 44 45 45 45 45 43 42 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 
485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 // SPDX-License-Identifier: GPL-2.0 /* * fs/mpage.c * * Copyright (C) 2002, Linus Torvalds. * * Contains functions related to preparing and submitting BIOs which contain * multiple pagecache pages. * * 15May2002 Andrew Morton * Initial version * 27Jun2002 axboe@suse.de * use bio_add_page() to build bio's just the right size */ #include <linux/kernel.h> #include <linux/export.h> #include <linux/mm.h> #include <linux/kdev_t.h> #include <linux/gfp.h> #include <linux/bio.h> #include <linux/fs.h> #include <linux/buffer_head.h> #include <linux/blkdev.h> #include <linux/highmem.h> #include <linux/prefetch.h> #include <linux/mpage.h> #include <linux/mm_inline.h> #include <linux/writeback.h> #include <linux/backing-dev.h> #include "internal.h" /* * I/O completion handler for multipage BIOs. * * The mpage code never puts partial pages into a BIO (except for end-of-file). * If a page does not map to a contiguous run of blocks then it simply falls * back to block_read_full_folio(). * * Why is this? 
If a page's completion depends on a number of different BIOs * which can complete in any order (or at the same time) then determining the * status of that page is hard. See end_buffer_async_read() for the details. * There is no point in duplicating all that complexity. */ static void mpage_read_end_io(struct bio *bio) { struct folio_iter fi; int err = blk_status_to_errno(bio->bi_status); bio_for_each_folio_all(fi, bio) folio_end_read(fi.folio, err == 0); bio_put(bio); } static void mpage_write_end_io(struct bio *bio) { struct folio_iter fi; int err = blk_status_to_errno(bio->bi_status); bio_for_each_folio_all(fi, bio) { if (err) mapping_set_error(fi.folio->mapping, err); folio_end_writeback(fi.folio); } bio_put(bio); } static struct bio *mpage_bio_submit_read(struct bio *bio) { bio->bi_end_io = mpage_read_end_io; guard_bio_eod(bio); submit_bio(bio); return NULL; } static struct bio *mpage_bio_submit_write(struct bio *bio) { bio->bi_end_io = mpage_write_end_io; guard_bio_eod(bio); submit_bio(bio); return NULL; } /* * support function for mpage_readahead. The fs supplied get_block might * return an up to date buffer. This is used to map that buffer into * the page, which allows read_folio to avoid triggering a duplicate call * to get_block. * * The idea is to avoid adding buffers to pages that don't already have * them. So when the buffer is up to date and the page size == block size, * this marks the page up to date instead of adding new buffers. 
*/ static void map_buffer_to_folio(struct folio *folio, struct buffer_head *bh, int page_block) { struct inode *inode = folio->mapping->host; struct buffer_head *page_bh, *head; int block = 0; head = folio_buffers(folio); if (!head) { /* * don't make any buffers if there is only one buffer on * the folio and the folio just needs to be set up to date */ if (inode->i_blkbits == folio_shift(folio) && buffer_uptodate(bh)) { folio_mark_uptodate(folio); return; } head = create_empty_buffers(folio, i_blocksize(inode), 0); } page_bh = head; do { if (block == page_block) { page_bh->b_state = bh->b_state; page_bh->b_bdev = bh->b_bdev; page_bh->b_blocknr = bh->b_blocknr; break; } page_bh = page_bh->b_this_page; block++; } while (page_bh != head); } struct mpage_readpage_args { struct bio *bio; struct folio *folio; unsigned int nr_pages; bool is_readahead; sector_t last_block_in_bio; struct buffer_head map_bh; unsigned long first_logical_block; get_block_t *get_block; }; /* * This is the worker routine which does all the work of mapping the disk * blocks and constructs largest possible bios, submits them for IO if the * blocks are not contiguous on the disk. * * We pass a buffer_head back and forth and use its buffer_mapped() flag to * represent the validity of its disk mapping and to decide when to do the next * get_block() call. 
*/ static void do_mpage_readpage(struct mpage_readpage_args *args) { struct folio *folio = args->folio; struct inode *inode = folio->mapping->host; const unsigned blkbits = inode->i_blkbits; const unsigned blocks_per_folio = folio_size(folio) >> blkbits; const unsigned blocksize = 1 << blkbits; struct buffer_head *map_bh = &args->map_bh; sector_t block_in_file; sector_t last_block; sector_t last_block_in_file; sector_t first_block; unsigned page_block; unsigned first_hole = blocks_per_folio; struct block_device *bdev = NULL; int length; int fully_mapped = 1; blk_opf_t opf = REQ_OP_READ; unsigned nblocks; unsigned relative_block; gfp_t gfp = mapping_gfp_constraint(folio->mapping, GFP_KERNEL); if (args->is_readahead) { opf |= REQ_RAHEAD; gfp |= __GFP_NORETRY | __GFP_NOWARN; } if (folio_buffers(folio)) goto confused; block_in_file = folio_pos(folio) >> blkbits; last_block = block_in_file + ((args->nr_pages * PAGE_SIZE) >> blkbits); last_block_in_file = (i_size_read(inode) + blocksize - 1) >> blkbits; if (last_block > last_block_in_file) last_block = last_block_in_file; page_block = 0; /* * Map blocks using the result from the previous get_blocks call first. */ nblocks = map_bh->b_size >> blkbits; if (buffer_mapped(map_bh) && block_in_file > args->first_logical_block && block_in_file < (args->first_logical_block + nblocks)) { unsigned map_offset = block_in_file - args->first_logical_block; unsigned last = nblocks - map_offset; first_block = map_bh->b_blocknr + map_offset; for (relative_block = 0; ; relative_block++) { if (relative_block == last) { clear_buffer_mapped(map_bh); break; } if (page_block == blocks_per_folio) break; page_block++; block_in_file++; } bdev = map_bh->b_bdev; } /* * Then do more get_blocks calls until we are done with this folio. 
*/ map_bh->b_folio = folio; while (page_block < blocks_per_folio) { map_bh->b_state = 0; map_bh->b_size = 0; if (block_in_file < last_block) { map_bh->b_size = (last_block-block_in_file) << blkbits; if (args->get_block(inode, block_in_file, map_bh, 0)) goto confused; args->first_logical_block = block_in_file; } if (!buffer_mapped(map_bh)) { fully_mapped = 0; if (first_hole == blocks_per_folio) first_hole = page_block; page_block++; block_in_file++; continue; } /* some filesystems will copy data into the page during * the get_block call, in which case we don't want to * read it again. map_buffer_to_folio copies the data * we just collected from get_block into the folio's buffers * so read_folio doesn't have to repeat the get_block call */ if (buffer_uptodate(map_bh)) { map_buffer_to_folio(folio, map_bh, page_block); goto confused; } if (first_hole != blocks_per_folio) goto confused; /* hole -> non-hole */ /* Contiguous blocks? */ if (!page_block) first_block = map_bh->b_blocknr; else if (first_block + page_block != map_bh->b_blocknr) goto confused; nblocks = map_bh->b_size >> blkbits; for (relative_block = 0; ; relative_block++) { if (relative_block == nblocks) { clear_buffer_mapped(map_bh); break; } else if (page_block == blocks_per_folio) break; page_block++; block_in_file++; } bdev = map_bh->b_bdev; } if (first_hole != blocks_per_folio) { folio_zero_segment(folio, first_hole << blkbits, folio_size(folio)); if (first_hole == 0) { folio_mark_uptodate(folio); folio_unlock(folio); goto out; } } else if (fully_mapped) { folio_set_mappedtodisk(folio); } /* * This folio will go to BIO. Do we need to send this BIO off first? 
*/ if (args->bio && (args->last_block_in_bio != first_block - 1)) args->bio = mpage_bio_submit_read(args->bio); alloc_new: if (args->bio == NULL) { args->bio = bio_alloc(bdev, bio_max_segs(args->nr_pages), opf, gfp); if (args->bio == NULL) goto confused; args->bio->bi_iter.bi_sector = first_block << (blkbits - 9); } length = first_hole << blkbits; if (!bio_add_folio(args->bio, folio, length, 0)) { args->bio = mpage_bio_submit_read(args->bio); goto alloc_new; } relative_block = block_in_file - args->first_logical_block; nblocks = map_bh->b_size >> blkbits; if ((buffer_boundary(map_bh) && relative_block == nblocks) || (first_hole != blocks_per_folio)) args->bio = mpage_bio_submit_read(args->bio); else args->last_block_in_bio = first_block + blocks_per_folio - 1; out: return; confused: if (args->bio) args->bio = mpage_bio_submit_read(args->bio); if (!folio_test_uptodate(folio)) block_read_full_folio(folio, args->get_block); else folio_unlock(folio); goto out; } /** * mpage_readahead - start reads against pages * @rac: Describes which pages to read. * @get_block: The filesystem's block mapper function. * * This function walks the pages and the blocks within each page, building and * emitting large BIOs. * * If anything unusual happens, such as: * * - encountering a page which has buffers * - encountering a page which has a non-hole after a hole * - encountering a page with non-contiguous blocks * * then this code just gives up and calls the buffer_head-based read function. * It does handle a page which has holes at the end - that is a common case: * the end-of-file on blocksize < PAGE_SIZE setups. * * BH_Boundary explanation: * * There is a problem. The mpage read code assembles several pages, gets all * their disk mappings, and then submits them all. That's fine, but obtaining * the disk mappings may require I/O. Reads of indirect blocks, for example. 
* * So an mpage read of the first 16 blocks of an ext2 file will cause I/O to be * submitted in the following order: * * 12 0 1 2 3 4 5 6 7 8 9 10 11 13 14 15 16 * * because the indirect block has to be read to get the mappings of blocks * 13,14,15,16. Obviously, this impacts performance. * * So what we do it to allow the filesystem's get_block() function to set * BH_Boundary when it maps block 11. BH_Boundary says: mapping of the block * after this one will require I/O against a block which is probably close to * this one. So you should push what I/O you have currently accumulated. * * This all causes the disk requests to be issued in the correct order. */ void mpage_readahead(struct readahead_control *rac, get_block_t get_block) { struct folio *folio; struct mpage_readpage_args args = { .get_block = get_block, .is_readahead = true, }; while ((folio = readahead_folio(rac))) { prefetchw(&folio->flags); args.folio = folio; args.nr_pages = readahead_count(rac); do_mpage_readpage(&args); /* * If read ahead failed synchronously, it may cause by removed * device, or some filesystem metadata error. */ if (!folio_test_locked(folio) && !folio_test_uptodate(folio)) break; } if (args.bio) mpage_bio_submit_read(args.bio); } EXPORT_SYMBOL(mpage_readahead); /* * This isn't called much at all */ int mpage_read_folio(struct folio *folio, get_block_t get_block) { struct mpage_readpage_args args = { .folio = folio, .nr_pages = folio_nr_pages(folio), .get_block = get_block, }; do_mpage_readpage(&args); if (args.bio) mpage_bio_submit_read(args.bio); return 0; } EXPORT_SYMBOL(mpage_read_folio); /* * Writing is not so simple. * * If the page has buffers then they will be used for obtaining the disk * mapping. We only support pages which are fully mapped-and-dirty, with a * special case for pages which are unmapped at the end: end-of-file. * * If the page has no buffers (preferred) then the page is mapped here. 
* * If all blocks are found to be contiguous then the page can go into the * BIO. Otherwise fall back to the mapping's writepage(). * * FIXME: This code wants an estimate of how many pages are still to be * written, so it can intelligently allocate a suitably-sized BIO. For now, * just allocate full-size (16-page) BIOs. */ struct mpage_data { struct bio *bio; sector_t last_block_in_bio; get_block_t *get_block; }; /* * We have our BIO, so we can now mark the buffers clean. Make * sure to only clean buffers which we know we'll be writing. */ static void clean_buffers(struct folio *folio, unsigned first_unmapped) { unsigned buffer_counter = 0; struct buffer_head *bh, *head = folio_buffers(folio); if (!head) return; bh = head; do { if (buffer_counter++ == first_unmapped) break; clear_buffer_dirty(bh); bh = bh->b_this_page; } while (bh != head); /* * we cannot drop the bh if the page is not uptodate or a concurrent * read_folio would fail to serialize with the bh and it would read from * disk before we reach the platter. 
*/ if (buffer_heads_over_limit && folio_test_uptodate(folio)) try_to_free_buffers(folio); } static int mpage_write_folio(struct writeback_control *wbc, struct folio *folio, struct mpage_data *mpd) { struct bio *bio = mpd->bio; struct address_space *mapping = folio->mapping; struct inode *inode = mapping->host; const unsigned blkbits = inode->i_blkbits; const unsigned blocks_per_folio = folio_size(folio) >> blkbits; sector_t last_block; sector_t block_in_file; sector_t first_block; unsigned page_block; unsigned first_unmapped = blocks_per_folio; struct block_device *bdev = NULL; int boundary = 0; sector_t boundary_block = 0; struct block_device *boundary_bdev = NULL; size_t length; struct buffer_head map_bh; loff_t i_size = i_size_read(inode); int ret = 0; struct buffer_head *head = folio_buffers(folio); if (head) { struct buffer_head *bh = head; /* If they're all mapped and dirty, do it */ page_block = 0; do { BUG_ON(buffer_locked(bh)); if (!buffer_mapped(bh)) { /* * unmapped dirty buffers are created by * block_dirty_folio -> mmapped data */ if (buffer_dirty(bh)) goto confused; if (first_unmapped == blocks_per_folio) first_unmapped = page_block; continue; } if (first_unmapped != blocks_per_folio) goto confused; /* hole -> non-hole */ if (!buffer_dirty(bh) || !buffer_uptodate(bh)) goto confused; if (page_block) { if (bh->b_blocknr != first_block + page_block) goto confused; } else { first_block = bh->b_blocknr; } page_block++; boundary = buffer_boundary(bh); if (boundary) { boundary_block = bh->b_blocknr; boundary_bdev = bh->b_bdev; } bdev = bh->b_bdev; } while ((bh = bh->b_this_page) != head); if (first_unmapped) goto page_is_mapped; /* * Page has buffers, but they are all unmapped. The page was * created by pagein or read over a hole which was handled by * block_read_full_folio(). If this address_space is also * using mpage_readahead then this can rarely happen. 
*/ goto confused; } /* * The page has no buffers: map it to disk */ BUG_ON(!folio_test_uptodate(folio)); block_in_file = folio_pos(folio) >> blkbits; /* * Whole page beyond EOF? Skip allocating blocks to avoid leaking * space. */ if (block_in_file >= (i_size + (1 << blkbits) - 1) >> blkbits) goto page_is_mapped; last_block = (i_size - 1) >> blkbits; map_bh.b_folio = folio; for (page_block = 0; page_block < blocks_per_folio; ) { map_bh.b_state = 0; map_bh.b_size = 1 << blkbits; if (mpd->get_block(inode, block_in_file, &map_bh, 1)) goto confused; if (!buffer_mapped(&map_bh)) goto confused; if (buffer_new(&map_bh)) clean_bdev_bh_alias(&map_bh); if (buffer_boundary(&map_bh)) { boundary_block = map_bh.b_blocknr; boundary_bdev = map_bh.b_bdev; } if (page_block) { if (map_bh.b_blocknr != first_block + page_block) goto confused; } else { first_block = map_bh.b_blocknr; } page_block++; boundary = buffer_boundary(&map_bh); bdev = map_bh.b_bdev; if (block_in_file == last_block) break; block_in_file++; } BUG_ON(page_block == 0); first_unmapped = page_block; page_is_mapped: /* Don't bother writing beyond EOF, truncate will discard the folio */ if (folio_pos(folio) >= i_size) goto confused; length = folio_size(folio); if (folio_pos(folio) + length > i_size) { /* * The page straddles i_size. It must be zeroed out on each * and every writepage invocation because it may be mmapped. * "A file is mapped in multiples of the page size. For a file * that is not a multiple of the page size, the remaining memory * is zeroed when mapped, and writes to that region are not * written out to the file." */ length = i_size - folio_pos(folio); folio_zero_segment(folio, length, folio_size(folio)); } /* * This page will go to BIO. Do we need to send this BIO off first? 
*/ if (bio && mpd->last_block_in_bio != first_block - 1) bio = mpage_bio_submit_write(bio); alloc_new: if (bio == NULL) { bio = bio_alloc(bdev, BIO_MAX_VECS, REQ_OP_WRITE | wbc_to_write_flags(wbc), GFP_NOFS); bio->bi_iter.bi_sector = first_block << (blkbits - 9); wbc_init_bio(wbc, bio); bio->bi_write_hint = inode->i_write_hint; } /* * Must try to add the page before marking the buffer clean or * the confused fail path above (OOM) will be very confused when * it finds all bh marked clean (i.e. it will not write anything) */ wbc_account_cgroup_owner(wbc, folio, folio_size(folio)); length = first_unmapped << blkbits; if (!bio_add_folio(bio, folio, length, 0)) { bio = mpage_bio_submit_write(bio); goto alloc_new; } clean_buffers(folio, first_unmapped); BUG_ON(folio_test_writeback(folio)); folio_start_writeback(folio); folio_unlock(folio); if (boundary || (first_unmapped != blocks_per_folio)) { bio = mpage_bio_submit_write(bio); if (boundary_block) { write_boundary_block(boundary_bdev, boundary_block, 1 << blkbits); } } else { mpd->last_block_in_bio = first_block + blocks_per_folio - 1; } goto out; confused: if (bio) bio = mpage_bio_submit_write(bio); /* * The caller has a ref on the inode, so *mapping is stable */ ret = block_write_full_folio(folio, wbc, mpd->get_block); mapping_set_error(mapping, ret); out: mpd->bio = bio; return ret; } /** * __mpage_writepages - walk the list of dirty pages of the given address space * & writepage() all of them * @mapping: address space structure to write * @wbc: subtract the number of written pages from *@wbc->nr_to_write * @get_block: the filesystem's block mapper function. * @write_folio: handler to call for each folio before calling * mpage_write_folio() * * This is a library function, which implements the writepages() * address_space_operation. It calls @write_folio handler for each folio. If * the handler returns value > 0, it calls mpage_write_folio() to do the * folio writeback. 
*/ int __mpage_writepages(struct address_space *mapping, struct writeback_control *wbc, get_block_t get_block, int (*write_folio)(struct folio *folio, struct writeback_control *wbc)) { struct mpage_data mpd = { .get_block = get_block, }; struct folio *folio = NULL; struct blk_plug plug; int error; blk_start_plug(&plug); while ((folio = writeback_iter(mapping, wbc, folio, &error))) { if (write_folio) { error = write_folio(folio, wbc); /* * == 0 means folio is handled, < 0 means error. In * both cases hand back control to writeback_iter() */ if (error <= 0) continue; /* Let mpage_write_folio() handle the folio. */ } error = mpage_write_folio(wbc, folio, &mpd); } if (mpd.bio) mpage_bio_submit_write(mpd.bio); blk_finish_plug(&plug); return error; } EXPORT_SYMBOL(__mpage_writepages);
18 12 18 17 17 18 4 18 16 15 15 15 5 2 4 5 5 18 18 18 18 1 18 1 17 3 14 12 14 18 47 46 46 1 2 2 1 1 1 1 1 54 51 54 53 48 48 49 2 49 3 3 3 1 1 54 18 18 18 2 2 18 18 5 17 2 16 18 5 15 15 1 1 1 1 1 13 183 179 182 181 179 180 181 183 12 11 11 11 3 3 9 9 1 1 2 2 29 30 30 134 139 1 134 133 140 13 141 137 98 137 136 133 135 137 136 99 99 96 100 101 143 138 137 139 13 142 140 140 135 140 3 3 3 3 2 120 130 120 120 120 1 119 120 83 29 29 29 29 22 1 28 27 27 27 3 26 27 36 12 1 33 11 11 34 1 32 9 5 5 33 10 1 10 33 31 2 2 1 30 5 31 31 31 1 31 1 1 1 1 30 30 30 1 1 29 7 29 30 23 27 30 29 27 141 143 143 2 1 2 2 2 2 145 144 144 142 141 139 139 138 139 13 139 137 138 2 137 13 134 140 141 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 
358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 
858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 
1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * af_alg: User-space algorithm interface
 *
 * This file provides the user-space API for algorithms.
 *
 * Copyright (c) 2010 Herbert Xu <herbert@gondor.apana.org.au>
 */

#include <linux/atomic.h>
#include <crypto/if_alg.h>
#include <linux/crypto.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/key.h>
#include <linux/key-type.h>
#include <linux/list.h>
#include <linux/module.h>
#include <linux/net.h>
#include <linux/rwsem.h>
#include <linux/sched.h>
#include <linux/sched/signal.h>
#include <linux/security.h>
#include <linux/string.h>
#include <keys/user-type.h>
#include <keys/trusted-type.h>
#include <keys/encrypted-type.h>

/* One entry of the alg_types list; wraps a single registered af_alg_type. */
struct alg_type_list {
	const struct af_alg_type *type;
	struct list_head list;
};

/* Protocol descriptor passed to sk_alloc() for every PF_ALG sock in this file. */
static struct proto alg_proto = {
	.name			= "ALG",
	.owner			= THIS_MODULE,
	.obj_size		= sizeof(struct alg_sock),
};

/* List of registered algorithm types; walked and modified under alg_types_sem. */
static LIST_HEAD(alg_types);
static DECLARE_RWSEM(alg_types_sem);

/*
 * alg_get_type - look up a registered type by name and pin its module
 *
 * Walks alg_types under the read side of alg_types_sem. On the first entry
 * whose name matches, tries to take a module reference on the type's owner.
 *
 * Return: the type on success; ERR_PTR(-ENOENT) if no entry matches or if
 * try_module_get() fails for the matching entry.
 */
static const struct af_alg_type *alg_get_type(const char *name)
{
	const struct af_alg_type *type = ERR_PTR(-ENOENT);
	struct alg_type_list *node;

	down_read(&alg_types_sem);
	list_for_each_entry(node, &alg_types, list) {
		if (strcmp(node->type->name, name))
			continue;

		if (try_module_get(node->type->owner))
			type = node->type;
		/* Stop at the first name match, whether or not the module was pinned. */
		break;
	}
	up_read(&alg_types_sem);

	return type;
}

/*
 * af_alg_register_type - add an algorithm type to the alg_types list
 *
 * Sets the owner of type->ops (and of type->ops_nokey, when present) to
 * THIS_MODULE before linking the type into the list.
 *
 * Return: 0 on success, -EEXIST if a type with the same name is already
 * registered, -ENOMEM if the list node cannot be allocated.
 */
int af_alg_register_type(const struct af_alg_type *type)
{
	struct alg_type_list *node;
	int err = -EEXIST;

	down_write(&alg_types_sem);
	list_for_each_entry(node, &alg_types, list) {
		if (!strcmp(node->type->name, type->name))
			goto unlock;
	}

	node = kmalloc_obj(*node);
	err = -ENOMEM;
	if (!node)
		goto unlock;

	type->ops->owner = THIS_MODULE;
	if (type->ops_nokey)
		type->ops_nokey->owner = THIS_MODULE;
	node->type = type;
	list_add(&node->list, &alg_types);
	err = 0;

unlock:
	up_write(&alg_types_sem);

	return err;
}
EXPORT_SYMBOL_GPL(af_alg_register_type);

/*
 * af_alg_unregister_type - remove an algorithm type from the alg_types list
 *
 * The entry is matched by name (not by pointer); the first match is unlinked
 * and its list node freed.
 *
 * Return: 0 on success, -ENOENT if no entry with that name is registered.
 */
int af_alg_unregister_type(const struct af_alg_type *type)
{
	struct alg_type_list *node;
	int err = -ENOENT;

	down_write(&alg_types_sem);
	list_for_each_entry(node, &alg_types, list) {
		if (strcmp(node->type->name, type->name))
			continue;

		list_del(&node->list);
		kfree(node);
		err = 0;
		break;
	}
	up_write(&alg_types_sem);

	return err;
}
EXPORT_SYMBOL_GPL(af_alg_unregister_type);

/*
 * Release a type/private pair: call the type's release() hook on @private and
 * drop the module reference on the type's owner. A NULL @type is a no-op.
 */
static void alg_do_release(const struct af_alg_type *type, void *private)
{
	if (!type)
		return;

	type->release(private);
	module_put(type->owner);
}

/*
 * af_alg_release - release handler: drop the sock reference held by @sock
 *
 * Clears sock->sk after the put so a repeated call does nothing.
 * Always returns 0.
 */
int af_alg_release(struct socket *sock)
{
	if (sock->sk) {
		sock_put(sock->sk);
		sock->sk = NULL;
	}
	return 0;
}
EXPORT_SYMBOL_GPL(af_alg_release);

/*
 * af_alg_release_parent - drop a child sock's references on its parent
 *
 * @sk is the child sock. If the child's nokey_refcnt is non-zero, the
 * parent's nokey_refcnt is decremented. The parent's refcnt is then
 * decremented and, when it reaches zero, the parent sock is put.
 */
void af_alg_release_parent(struct sock *sk)
{
	struct alg_sock *ask = alg_sk(sk);
	unsigned int nokey = atomic_read(&ask->nokey_refcnt);

	/* From here on sk/ask refer to the parent, not the child. */
	sk = ask->parent;
	ask = alg_sk(sk);

	if (nokey)
		atomic_dec(&ask->nokey_refcnt);

	if (atomic_dec_and_test(&ask->refcnt))
		sock_put(sk);
}
EXPORT_SYMBOL_GPL(af_alg_release_parent);

/*
 * alg_bind - bind handler: attach an algorithm type and instance to @sock
 *
 * Rejects sockets in SS_CONNECTED state, addresses shorter than
 * sizeof(struct sockaddr_alg_new) + 1, and salg_feat/salg_mask bits other
 * than CRYPTO_ALG_KERN_DRIVER_ONLY. salg_type and salg_name are
 * NUL-terminated in place before use. If the type is not registered,
 * request_module("algif-<type>") is tried once and the lookup repeated.
 *
 * The new type/private pair is swapped into the socket only while
 * ask->refcnt is zero; otherwise -EBUSY is returned. Whichever pair is left
 * in the locals afterwards (the previous binding on success, the new one on
 * -EBUSY) is released through alg_do_release().
 *
 * Return: 0 on success, -EINVAL, -EBUSY, or the error from the type lookup
 * or from type->bind().
 */
static int alg_bind(struct socket *sock, struct sockaddr_unsized *uaddr, int addr_len)
{
	const u32 allowed = CRYPTO_ALG_KERN_DRIVER_ONLY;
	struct sock *sk = sock->sk;
	struct alg_sock *ask = alg_sk(sk);
	struct sockaddr_alg_new *sa = (void *)uaddr;
	const struct af_alg_type *type;
	void *private;
	int err;

	if (sock->state == SS_CONNECTED)
		return -EINVAL;

	/*
	 * salg_name must sit at the same offset in both address layouts, and
	 * struct sockaddr_alg_new must end exactly where salg_name begins.
	 */
	BUILD_BUG_ON(offsetof(struct sockaddr_alg_new, salg_name) !=
		     offsetof(struct sockaddr_alg, salg_name));
	BUILD_BUG_ON(offsetof(struct sockaddr_alg, salg_name) != sizeof(*sa));

	/* Need the fixed header plus at least one byte of salg_name. */
	if (addr_len < sizeof(*sa) + 1)
		return -EINVAL;

	/* If caller uses non-allowed flag, return error. */
	if ((sa->salg_feat & ~allowed) || (sa->salg_mask & ~allowed))
		return -EINVAL;

	/* Force NUL termination of both strings within the supplied length. */
	sa->salg_type[sizeof(sa->salg_type) - 1] = 0;
	sa->salg_name[addr_len - sizeof(*sa) - 1] = 0;

	type = alg_get_type(sa->salg_type);
	if (PTR_ERR(type) == -ENOENT) {
		request_module("algif-%s", sa->salg_type);
		type = alg_get_type(sa->salg_type);
	}

	if (IS_ERR(type))
		return PTR_ERR(type);

	private = type->bind(sa->salg_name, sa->salg_feat, sa->salg_mask);
	if (IS_ERR(private)) {
		module_put(type->owner);
		return PTR_ERR(private);
	}

	err = -EBUSY;
	lock_sock(sk);
	if (atomic_read(&ask->refcnt))
		goto unlock;

	swap(ask->type, type);
	swap(ask->private, private);

	err = 0;

unlock:
	release_sock(sk);

	alg_do_release(type, private);

	return err;
}

/*
 * alg_setkey - copy a key from @ukey and hand it to the bound type
 *
 * The key is copied into a buffer obtained from sock_kmalloc() and that
 * buffer is released with sock_kzfree_s() on every path after allocation.
 *
 * Return: result of type->setkey(), -ENOMEM if the buffer cannot be
 * allocated, -EFAULT if the copy from @ukey fails.
 */
static int alg_setkey(struct sock *sk, sockptr_t ukey, unsigned int keylen)
{
	struct alg_sock *ask = alg_sk(sk);
	const struct af_alg_type *type = ask->type;
	u8 *key;
	int err;

	key = sock_kmalloc(sk, keylen, GFP_KERNEL);
	if (!key)
		return -ENOMEM;

	err = -EFAULT;
	if (copy_from_sockptr(key, ukey, keylen))
		goto out;

	err = type->setkey(ask->private, key, keylen);

out:
	sock_kzfree_s(sk, key, keylen);

	return err;
}

#ifdef CONFIG_KEYS

/*
 * Return the data pointer of a "user"/"logon" key payload and store the
 * key's datalen in @datalen; ERR_PTR(-EKEYREVOKED) if the payload is an
 * error pointer or NULL. Called with key->sem held by the caller below.
 */
static const u8 *key_data_ptr_user(const struct key *key,
				   unsigned int *datalen)
{
	const struct user_key_payload *ukp;

	ukp = user_key_payload_locked(key);
	if (IS_ERR_OR_NULL(ukp))
		return ERR_PTR(-EKEYREVOKED);

	*datalen = key->datalen;

	return ukp->data;
}

/*
 * Return the decrypted data of an "encrypted" key payload and store its
 * decrypted_datalen in @datalen; ERR_PTR(-EKEYREVOKED) if the payload is an
 * error pointer or NULL.
 */
static const u8 *key_data_ptr_encrypted(const struct key *key,
					unsigned int *datalen)
{
	const struct encrypted_key_payload *ekp;

	ekp = dereference_key_locked(key);
	if (IS_ERR_OR_NULL(ekp))
		return ERR_PTR(-EKEYREVOKED);

	*datalen = ekp->decrypted_datalen;

	return ekp->decrypted_data;
}

/*
 * Return the key material of a "trusted" key payload and store its key_len
 * in @datalen; ERR_PTR(-EKEYREVOKED) if the payload is an error pointer or
 * NULL.
 */
static const u8 *key_data_ptr_trusted(const struct key *key,
				      unsigned int *datalen)
{
	const struct trusted_key_payload *tkp;

	tkp = dereference_key_locked(key);
	if (IS_ERR_OR_NULL(tkp))
		return ERR_PTR(-EKEYREVOKED);

	*datalen = tkp->key_len;

	return tkp->key;
}

/*
 * Resolve @serial with lookup_user_key(serial, 0, KEY_NEED_SEARCH) and
 * convert the resulting key reference to a struct key pointer. Lookup
 * errors are passed through as an ERR_PTR.
 */
static struct key *lookup_key(key_serial_t serial)
{
	key_ref_t key_ref;

	key_ref = lookup_user_key(serial, 0, KEY_NEED_SEARCH);
	if (IS_ERR(key_ref))
		return ERR_CAST(key_ref);

	return key_ref_to_ptr(key_ref);
}

/*
 * alg_setkey_by_key_serial - set the key from a keyring key given by serial
 *
 * @optval must hold exactly one key_serial_t. The key is looked up, its
 * payload is read under key->sem according to the key type name ("user",
 * "logon", and - when the respective config option is reachable -
 * "encrypted" and "trusted"), duplicated into a socket-accounted buffer, and
 * passed to type->setkey() after key->sem and the key reference have been
 * dropped.
 *
 * Return: result of type->setkey(); -EINVAL for a wrong @optlen; -EFAULT if
 * the serial cannot be copied; the lookup error; -ENOPROTOOPT for any other
 * key type; -EKEYREVOKED from the payload helpers; -ENOMEM if the copy
 * cannot be allocated.
 */
static int alg_setkey_by_key_serial(struct alg_sock *ask, sockptr_t optval,
				    unsigned int optlen)
{
	const struct af_alg_type *type = ask->type;
	u8 *key_data = NULL;
	unsigned int key_datalen;
	key_serial_t serial;
	struct key *key;
	const u8 *ret;
	int err;

	if (optlen != sizeof(serial))
		return -EINVAL;

	if (copy_from_sockptr(&serial, optval, optlen))
		return -EFAULT;

	key = lookup_key(serial);
	if (IS_ERR(key))
		return PTR_ERR(key);

	down_read(&key->sem);

	/* Default result for key types not handled below. */
	ret = ERR_PTR(-ENOPROTOOPT);
	if (!strcmp(key->type->name, "user") ||
	    !strcmp(key->type->name, "logon")) {
		ret = key_data_ptr_user(key, &key_datalen);
	} else if (IS_REACHABLE(CONFIG_ENCRYPTED_KEYS) &&
			   !strcmp(key->type->name, "encrypted")) {
		ret = key_data_ptr_encrypted(key, &key_datalen);
	} else if (IS_REACHABLE(CONFIG_TRUSTED_KEYS) &&
			   !strcmp(key->type->name, "trusted")) {
		ret = key_data_ptr_trusted(key, &key_datalen);
	}

	if (IS_ERR(ret)) {
		up_read(&key->sem);
		key_put(key);
		return PTR_ERR(ret);
	}

	/* Copy the payload while key->sem is still held. */
	key_data = sock_kmemdup(&ask->sk, ret, key_datalen, GFP_KERNEL);
	if (!key_data) {
		up_read(&key->sem);
		key_put(key);
		return -ENOMEM;
	}

	up_read(&key->sem);
	key_put(key);

	err = type->setkey(ask->private, key_data, key_datalen);

	sock_kzfree_s(&ask->sk, key_data, key_datalen);

	return err;
}

#else

/* Without CONFIG_KEYS, setting a key by serial is not supported. */
static inline int alg_setkey_by_key_serial(struct alg_sock *ask,
					   sockptr_t optval,
					   unsigned int optlen)
{
	return -ENOPROTOOPT;
}

#endif

/*
 * alg_setsockopt - setsockopt handler for the bound (parent) socket
 *
 * Runs under the sock lock. Fails with -EBUSY unless refcnt equals
 * nokey_refcnt. Fails with -ENOPROTOOPT if @level is not SOL_ALG, no type is
 * bound, the socket is in SS_CONNECTED state, the type lacks the required
 * hook, or @optname is not one of the cases below. Otherwise the result of
 * the selected hook is returned.
 */
static int alg_setsockopt(struct socket *sock, int level, int optname,
			  sockptr_t optval, unsigned int optlen)
{
	struct sock *sk = sock->sk;
	struct alg_sock *ask = alg_sk(sk);
	const struct af_alg_type *type;
	int err = -EBUSY;

	lock_sock(sk);
	if (atomic_read(&ask->refcnt) != atomic_read(&ask->nokey_refcnt))
		goto unlock;

	type = ask->type;

	err = -ENOPROTOOPT;
	if (level != SOL_ALG || !type)
		goto unlock;

	switch (optname) {
	case ALG_SET_KEY:
	case ALG_SET_KEY_BY_KEY_SERIAL:
		if (sock->state == SS_CONNECTED)
			goto unlock;
		if (!type->setkey)
			goto unlock;

		if (optname == ALG_SET_KEY_BY_KEY_SERIAL)
			err = alg_setkey_by_key_serial(ask, optval, optlen);
		else
			err = alg_setkey(sk, optval, optlen);
		break;
	case ALG_SET_AEAD_AUTHSIZE:
		if (sock->state == SS_CONNECTED)
			goto unlock;
		if (!type->setauthsize)
			goto unlock;
		/* The authsize is carried in optlen; optval is not read here. */
		err = type->setauthsize(ask->private, optlen);
		break;
	case ALG_SET_DRBG_ENTROPY:
		if (sock->state == SS_CONNECTED)
			goto unlock;
		if (!type->setentropy)
			goto unlock;

		err = type->setentropy(ask->private, optval, optlen);
	}

unlock:
	release_sock(sk);

	return err;
}

/*
 * af_alg_accept - create a child (operation) socket for a bound parent
 *
 * Allocates a new sock, attaches it to @newsock and calls type->accept().
 * If that returns -ENOKEY and the type provides accept_nokey(), the
 * accept_nokey() hook is tried and the child gets type->ops_nokey as its
 * ops. On success the parent's refcnt is incremented (taking a sock
 * reference on the 0 -> 1 transition) and the child records its parent and
 * type.
 *
 * Return: 0 on success, -EINVAL if no type is bound, -ENOMEM if the sock
 * cannot be allocated, or the error from the accept hook.
 */
int af_alg_accept(struct sock *sk, struct socket *newsock,
		  struct proto_accept_arg *arg)
{
	struct alg_sock *ask = alg_sk(sk);
	const struct af_alg_type *type;
	struct sock *sk2;
	unsigned int nokey;
	int err;

	lock_sock(sk);
	type = ask->type;

	err = -EINVAL;
	if (!type)
		goto unlock;

	sk2 = sk_alloc(sock_net(sk), PF_ALG, GFP_KERNEL, &alg_proto, arg->kern);
	err = -ENOMEM;
	if (!sk2)
		goto unlock;

	sock_init_data(newsock, sk2);
	security_sock_graft(sk2, newsock);
	security_sk_clone(sk, sk2);

	/*
	 * newsock->ops assigned here to allow type->accept call to override
	 * them when required.
	 */
	newsock->ops = type->ops;
	err = type->accept(ask->private, sk2);

	nokey = err == -ENOKEY;
	if (nokey && type->accept_nokey)
		err = type->accept_nokey(ask->private, sk2);

	if (err)
		goto unlock;

	/* First child pins the parent sock. */
	if (atomic_inc_return_relaxed(&ask->refcnt) == 1)
		sock_hold(sk);
	if (nokey) {
		atomic_inc(&ask->nokey_refcnt);
		atomic_set(&alg_sk(sk2)->nokey_refcnt, 1);
	}
	alg_sk(sk2)->parent = sk;
	alg_sk(sk2)->type = type;

	newsock->state = SS_CONNECTED;

	if (nokey)
		newsock->ops = type->ops_nokey;

	err = 0;

unlock:
	release_sock(sk);

	return err;
}
EXPORT_SYMBOL_GPL(af_alg_accept);

/* accept handler of the parent socket: forwards to af_alg_accept(). */
static int alg_accept(struct socket *sock, struct socket *newsock,
		      struct proto_accept_arg *arg)
{
	return af_alg_accept(sock->sk, newsock, arg);
}

/*
 * Socket operations of the parent socket installed by alg_create(): only
 * bind, release, setsockopt and accept are implemented here; the remaining
 * entries use the sock_no_*() helpers.
 */
static const struct proto_ops alg_proto_ops = {
	.family		=	PF_ALG,
	.owner		=	THIS_MODULE,

	.connect	=	sock_no_connect,
	.socketpair	=	sock_no_socketpair,
	.getname	=	sock_no_getname,
	.ioctl		=	sock_no_ioctl,
	.listen		=	sock_no_listen,
	.shutdown	=	sock_no_shutdown,
	.mmap		=	sock_no_mmap,
	.sendmsg	=	sock_no_sendmsg,
	.recvmsg	=	sock_no_recvmsg,

	.bind		=	alg_bind,
	.release	=	af_alg_release,
	.setsockopt	=	alg_setsockopt,
	.accept		=	alg_accept,
};

/* sk_destruct hook of the parent sock: release its bound type and instance. */
static void alg_sock_destruct(struct sock *sk)
{
	struct alg_sock *ask = alg_sk(sk);

	alg_do_release(ask->type, ask->private);
}

/*
 * alg_create - create handler of the PF_ALG family
 *
 * Only SOCK_SEQPACKET with protocol 0 is accepted.
 *
 * Return: 0 on success, -ESOCKTNOSUPPORT, -EPROTONOSUPPORT, or -ENOMEM if
 * the sock cannot be allocated.
 */
static int alg_create(struct net *net, struct socket *sock, int protocol,
		      int kern)
{
	struct sock *sk;
	int err;

	if (sock->type != SOCK_SEQPACKET)
		return -ESOCKTNOSUPPORT;
	if (protocol != 0)
		return -EPROTONOSUPPORT;

	err = -ENOMEM;
	sk = sk_alloc(net, PF_ALG, GFP_KERNEL, &alg_proto, kern);
	if (!sk)
		goto out;

	sock->ops = &alg_proto_ops;
	sock_init_data(sock, sk);

	sk->sk_destruct = alg_sock_destruct;

	return 0;
out:
	return err;
}

/* Family descriptor registered with sock_register() in af_alg_init(). */
static const struct net_proto_family alg_family = {
	.family	=	PF_ALG,
	.create	=	alg_create,
	.owner	=	THIS_MODULE,
};

/*
 * Chain @sgl_new behind @sgl_prev: clear the end marker on the last used
 * entry of @sgl_prev and chain through the entry following it.
 */
static void af_alg_link_sg(struct af_alg_sgl *sgl_prev,
			   struct af_alg_sgl *sgl_new)
{
	sg_unmark_end(sgl_prev->sgt.sgl + sgl_prev->sgt.nents - 1);
	sg_chain(sgl_prev->sgt.sgl, sgl_prev->sgt.nents + 1, sgl_new->sgt.sgl);
}

/*
 * af_alg_free_sg - release the pages and table of an af_alg_sgl
 *
 * Unpins every page when need_unpin is set, frees the table with kvfree()
 * when it is not the embedded sgl->sgl array, and clears sgt.sgl so a
 * second call does nothing.
 */
void af_alg_free_sg(struct af_alg_sgl *sgl)
{
	int i;

	if (sgl->sgt.sgl) {
		if (sgl->need_unpin)
			for (i = 0; i < sgl->sgt.nents; i++)
				unpin_user_page(sg_page(&sgl->sgt.sgl[i]));
		if (sgl->sgt.sgl != sgl->sgl)
			kvfree(sgl->sgt.sgl);
		sgl->sgt.sgl = NULL;
	}
}
EXPORT_SYMBOL_GPL(af_alg_free_sg);

/*
 * af_alg_cmsg_send - parse the SOL_ALG control messages of @msg into @con
 *
 * Control messages of other levels are skipped. Each recognised message is
 * length-checked before its payload is read; for ALG_SET_IV the length is
 * checked a second time against the ivlen found in the payload.
 *
 * Return: 0 on success, -EINVAL for a malformed, too short or unknown
 * SOL_ALG control message.
 */
static int af_alg_cmsg_send(struct msghdr *msg, struct af_alg_control *con)
{
	struct cmsghdr *cmsg;

	for_each_cmsghdr(cmsg, msg) {
		if (!CMSG_OK(msg, cmsg))
			return -EINVAL;
		if (cmsg->cmsg_level != SOL_ALG)
			continue;

		switch (cmsg->cmsg_type) {
		case ALG_SET_IV:
			if (cmsg->cmsg_len < CMSG_LEN(sizeof(*con->iv)))
				return -EINVAL;
			con->iv = (void *)CMSG_DATA(cmsg);
			if (cmsg->cmsg_len < CMSG_LEN(con->iv->ivlen +
						      sizeof(*con->iv)))
				return -EINVAL;
			break;

		case ALG_SET_OP:
			if (cmsg->cmsg_len < CMSG_LEN(sizeof(u32)))
				return -EINVAL;
			con->op = *(u32 *)CMSG_DATA(cmsg);
			break;

		case ALG_SET_AEAD_ASSOCLEN:
			if (cmsg->cmsg_len < CMSG_LEN(sizeof(u32)))
				return -EINVAL;
			con->aead_assoclen = *(u32 *)CMSG_DATA(cmsg);
			break;

		default:
			return -EINVAL;
		}
	}

	return 0;
}

/**
 * af_alg_alloc_tsgl - allocate the TX SGL
 *
 * A new struct af_alg_tsgl is allocated only when the list is empty or the
 * last one already holds MAX_SGL_ENTS entries; a new table is chained behind
 * the previous one.
 *
 * @sk: socket of connection to user space
 * Return: 0 upon success, < 0 upon error
 */
static int af_alg_alloc_tsgl(struct sock *sk)
{
	struct alg_sock *ask = alg_sk(sk);
	struct af_alg_ctx *ctx = ask->private;
	struct af_alg_tsgl *sgl;
	struct scatterlist *sg = NULL;

	/* sgl is only meaningful if the list is non-empty; sg stays NULL otherwise. */
	sgl = list_entry(ctx->tsgl_list.prev, struct af_alg_tsgl, list);
	if (!list_empty(&ctx->tsgl_list))
		sg = sgl->sg;

	if (!sg || sgl->cur >= MAX_SGL_ENTS) {
		/* One extra entry beyond MAX_SGL_ENTS is reserved for chaining. */
		sgl = sock_kmalloc(sk,
				   struct_size(sgl, sg, (MAX_SGL_ENTS + 1)),
				   GFP_KERNEL);
		if (!sgl)
			return -ENOMEM;

		sg_init_table(sgl->sg, MAX_SGL_ENTS + 1);
		sgl->cur = 0;

		if (sg) {
			sg_unmark_end(sg + MAX_SGL_ENTS - 1);
			sg_chain(sg, MAX_SGL_ENTS + 1, sgl->sg);
		}

		list_add_tail(&sgl->list, &ctx->tsgl_list);
	}

	return 0;
}

/**
 * af_alg_count_tsgl - Count number of TX SG entries
 *
 * The counting starts from the beginning of the SGL to @bytes.
 *
 * @sk: socket of connection to user space
 * @bytes: Count the number of SG entries holding given number of bytes.
 * Return: Number of TX SG entries found given the constraints
 */
unsigned int af_alg_count_tsgl(struct sock *sk, size_t bytes)
{
	const struct alg_sock *ask = alg_sk(sk);
	const struct af_alg_ctx *ctx = ask->private;
	const struct af_alg_tsgl *sgl;
	unsigned int i;
	unsigned int sgl_count = 0;

	if (!bytes)
		return 0;

	list_for_each_entry(sgl, &ctx->tsgl_list, list) {
		const struct scatterlist *sg = sgl->sg;

		for (i = 0; i < sgl->cur; i++) {
			sgl_count++;
			/* This entry covers the remainder: stop counting. */
			if (sg[i].length >= bytes)
				return sgl_count;

			bytes -= sg[i].length;
		}
	}

	return sgl_count;
}
EXPORT_SYMBOL_GPL(af_alg_count_tsgl);

/**
 * af_alg_pull_tsgl - Release the specified buffers from TX SGL
 *
 * If @dst is non-null, reassign the pages to @dst. The caller must release
 * the pages.
 *
 * @sk: socket of connection to user space
 * @used: Number of bytes to pull from TX SGL
 * @dst: If non-NULL, buffer is reassigned to dst SGL instead of releasing. The
 *	 caller must release the buffers in dst.
 */
void af_alg_pull_tsgl(struct sock *sk, size_t used, struct scatterlist *dst)
{
	struct alg_sock *ask = alg_sk(sk);
	struct af_alg_ctx *ctx = ask->private;
	struct af_alg_tsgl *sgl;
	struct scatterlist *sg;
	unsigned int i, j = 0;

	while (!list_empty(&ctx->tsgl_list)) {
		sgl = list_first_entry(&ctx->tsgl_list, struct af_alg_tsgl,
				       list);
		sg = sgl->sg;

		for (i = 0; i < sgl->cur; i++) {
			size_t plen = min_t(size_t, used, sg[i].length);
			struct page *page = sg_page(sg + i);

			/* Entry already released by an earlier pull. */
			if (!page)
				continue;

			/*
			 * Assumption: caller created af_alg_count_tsgl(len)
			 * SG entries in dst.
			 */
			if (dst && plen) {
				/* reassign page to dst */
				get_page(page);
				sg_set_page(dst + j, page, plen, sg[i].offset);
				j++;
			}

			sg[i].length -= plen;
			sg[i].offset += plen;

			used -= plen;
			ctx->used -= plen;

			/*
			 * Entry only partially consumed: keep it and stop.
			 * Note this return skips the ctx->merge/ctx->init
			 * update at the end of the function.
			 */
			if (sg[i].length)
				return;

			put_page(page);
			sg_assign_page(sg + i, NULL);
		}

		list_del(&sgl->list);
		sock_kfree_s(sk, sgl, struct_size(sgl, sg, MAX_SGL_ENTS + 1));
	}

	if (!ctx->used)
		ctx->merge = 0;
	ctx->init = ctx->more;
}
EXPORT_SYMBOL_GPL(af_alg_pull_tsgl);

/**
 * af_alg_free_areq_sgls - Release TX and RX SGLs of the request
 *
 * Every RX SGL is subtracted from ctx->rcvused, freed and unlinked; the
 * embedded first_rsgl is not passed to sock_kfree_s(). For the TX SGL each
 * page that is present is put before the table itself is freed.
 *
 * @areq: Request holding the TX and RX SGL
 */
static void af_alg_free_areq_sgls(struct af_alg_async_req *areq)
{
	struct sock *sk = areq->sk;
	struct alg_sock *ask = alg_sk(sk);
	struct af_alg_ctx *ctx = ask->private;
	struct af_alg_rsgl *rsgl, *tmp;
	struct scatterlist *tsgl;
	struct scatterlist *sg;
	unsigned int i;

	list_for_each_entry_safe(rsgl, tmp, &areq->rsgl_list, list) {
		atomic_sub(rsgl->sg_num_bytes, &ctx->rcvused);
		af_alg_free_sg(&rsgl->sgl);
		list_del(&rsgl->list);
		if (rsgl != &areq->first_rsgl)
			sock_kfree_s(sk, rsgl, sizeof(*rsgl));
	}

	tsgl = areq->tsgl;
	if (tsgl) {
		for_each_sg(tsgl, sg, areq->tsgl_entries, i) {
			if (!sg_page(sg))
				continue;
			put_page(sg_page(sg));
		}

		sock_kfree_s(sk, tsgl, areq->tsgl_entries * sizeof(*tsgl));
	}
}

/**
 * af_alg_wait_for_wmem - wait for availability of writable memory
 *
 * @sk: socket of connection to user space
 * @flags: If MSG_DONTWAIT is set, then only report if function would sleep
 * Return: 0 when writable memory is available, -EAGAIN if MSG_DONTWAIT is
 *	   set, -ERESTARTSYS if a signal is pending
 */
static int af_alg_wait_for_wmem(struct sock *sk, unsigned int flags)
{
	DEFINE_WAIT_FUNC(wait, woken_wake_function);
	int err = -ERESTARTSYS;
	long timeout;

	if (flags & MSG_DONTWAIT)
		return -EAGAIN;

	sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);

	add_wait_queue(sk_sleep(sk), &wait);
	for (;;) {
		if (signal_pending(current))
			break;
		timeout = MAX_SCHEDULE_TIMEOUT;
		if (sk_wait_event(sk, &timeout, af_alg_writable(sk), &wait)) {
			err = 0;
			break;
		}
	}
	remove_wait_queue(sk_sleep(sk), &wait);

	return err;
}

/**
 * af_alg_wmem_wakeup - wakeup caller when writable memory is available
 *
 * Does nothing unless af_alg_writable() reports the socket as writable.
 *
 * @sk: socket of connection to user space
 */
void af_alg_wmem_wakeup(struct sock *sk)
{
	struct socket_wq *wq;

	if (!af_alg_writable(sk))
		return;

	rcu_read_lock();
	wq = rcu_dereference(sk->sk_wq);
	if (skwq_has_sleeper(wq))
		wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN |
							   EPOLLRDNORM |
							   EPOLLRDBAND);
	sk_wake_async_rcu(sk, SOCK_WAKE_WAITD, POLL_IN);
	rcu_read_unlock();
}
EXPORT_SYMBOL_GPL(af_alg_wmem_wakeup);

/**
 * af_alg_wait_for_data - wait for availability of TX data
 *
 * The wait ends once ctx->init is set and either ctx->more is clear or
 * @min is non-zero and ctx->used has reached @min.
 *
 * @sk: socket of connection to user space
 * @flags: If MSG_DONTWAIT is set, then only report if function would sleep
 * @min: Set to minimum request size if partial requests are allowed.
 * Return: 0 when the awaited TX data condition holds, -EAGAIN if
 *	   MSG_DONTWAIT is set, -ERESTARTSYS if a signal is pending
 */
int af_alg_wait_for_data(struct sock *sk, unsigned flags, unsigned min)
{
	DEFINE_WAIT_FUNC(wait, woken_wake_function);
	struct alg_sock *ask = alg_sk(sk);
	struct af_alg_ctx *ctx = ask->private;
	long timeout;
	int err = -ERESTARTSYS;

	if (flags & MSG_DONTWAIT)
		return -EAGAIN;

	sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);

	add_wait_queue(sk_sleep(sk), &wait);
	for (;;) {
		if (signal_pending(current))
			break;
		timeout = MAX_SCHEDULE_TIMEOUT;
		if (sk_wait_event(sk, &timeout,
				  ctx->init && (!ctx->more ||
						(min && ctx->used >= min)),
				  &wait)) {
			err = 0;
			break;
		}
	}
	remove_wait_queue(sk_sleep(sk), &wait);

	sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);

	return err;
}
EXPORT_SYMBOL_GPL(af_alg_wait_for_data);

/**
 * af_alg_data_wakeup - wakeup caller when new data can be sent to kernel
 *
 * Does nothing while ctx->used is zero.
 *
 * @sk: socket of connection to user space
 */
static void af_alg_data_wakeup(struct sock *sk)
{
	struct alg_sock *ask = alg_sk(sk);
	struct af_alg_ctx *ctx = ask->private;
	struct socket_wq *wq;

	if (!ctx->used)
		return;

	rcu_read_lock();
	wq = rcu_dereference(sk->sk_wq);
	if (skwq_has_sleeper(wq))
		wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
							   EPOLLRDNORM |
							   EPOLLRDBAND);
	sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT);
	rcu_read_unlock();
}

/**
 * af_alg_sendmsg - implementation of sendmsg system call handler
 *
 * The sendmsg system call handler obtains the user data and stores it
 * in ctx->tsgl_list. This implies allocation of the required numbers of
 * struct af_alg_tsgl.
 *
 * In addition, the ctx is filled with the information sent via CMSG.
 *
 * Only one sender may be active at a time: a call that finds ctx->write set
 * fails with -EBUSY.
 *
 * @sock: socket of connection to user space
 * @msg: message from user space
 * @size: size of message from user space
 * @ivsize: the size of the IV for the cipher operation to verify that the
 *	    user-space-provided IV has the right size
 * Return: the number of copied data upon success, < 0 upon error
 */
int af_alg_sendmsg(struct socket *sock, struct msghdr *msg, size_t size,
		   unsigned int ivsize)
{
	struct sock *sk = sock->sk;
	struct alg_sock *ask = alg_sk(sk);
	struct af_alg_ctx *ctx = ask->private;
	struct af_alg_tsgl *sgl;
	struct af_alg_control con = {};
	long copied = 0;
	bool enc = false;
	bool init = false;
	int err = 0;

	/* Parse and validate control data before taking the sock lock. */
	if (msg->msg_controllen) {
		err = af_alg_cmsg_send(msg, &con);
		if (err)
			return err;

		init = true;
		switch (con.op) {
		case ALG_OP_ENCRYPT:
			enc = true;
			break;
		case ALG_OP_DECRYPT:
			enc = false;
			break;
		default:
			return -EINVAL;
		}

		if (con.iv && con.iv->ivlen != ivsize)
			return -EINVAL;
	}

	lock_sock(sk);
	if (ctx->write) {
		release_sock(sk);
		return -EBUSY;
	}
	ctx->write = true;

	/*
	 * A previous message was completed (no MSG_MORE): refuse to append
	 * while its data is still queued.
	 */
	if (ctx->init && !ctx->more) {
		if (ctx->used) {
			err = -EINVAL;
			goto unlock;
		}

		pr_info_once(
			"%s sent an empty control message without MSG_MORE.\n",
			current->comm);
	}
	ctx->init = true;

	if (init) {
		ctx->enc = enc;
		if (con.iv)
			memcpy(ctx->iv, con.iv->iv, ivsize);

		ctx->aead_assoclen = con.aead_assoclen;
	}

	while (size) {
		struct scatterlist *sg;
		size_t len = size;
		ssize_t plen;

		/* use the existing memory in an allocated page */
		if (ctx->merge && !(msg->msg_flags & MSG_SPLICE_PAGES)) {
			sgl = list_entry(ctx->tsgl_list.prev,
					 struct af_alg_tsgl, list);
			sg = sgl->sg + sgl->cur - 1;
			len = min_t(size_t, len,
				    PAGE_SIZE - sg->offset - sg->length);

			err = memcpy_from_msg(page_address(sg_page(sg)) +
					      sg->offset + sg->length,
					      msg, len);
			if (err)
				goto unlock;

			sg->length += len;
			/* Non-zero while the last page still has free space. */
			ctx->merge = (sg->offset + sg->length) &
				     (PAGE_SIZE - 1);

			ctx->used += len;
			copied += len;
			size -= len;
			continue;
		}

		ctx->merge = 0;

		if (!af_alg_writable(sk)) {
			err = af_alg_wait_for_wmem(sk, msg->msg_flags);
			if (err)
				goto unlock;
		}

		/* allocate a new page */
		len = min_t(unsigned long, len, af_alg_sndbuf(sk));

		err = af_alg_alloc_tsgl(sk);
		if (err)
			goto unlock;

		sgl = list_entry(ctx->tsgl_list.prev, struct af_alg_tsgl,
				 list);
		sg = sgl->sg;
		if (sgl->cur)
			sg_unmark_end(sg + sgl->cur - 1);

		if (msg->msg_flags & MSG_SPLICE_PAGES) {
			struct sg_table sgtable = {
				.sgl		= sg,
				.nents		= sgl->cur,
				.orig_nents	= sgl->cur,
			};

			plen = extract_iter_to_sg(&msg->msg_iter, len, &sgtable,
						  MAX_SGL_ENTS - sgl->cur, 0);
			if (plen < 0) {
				err = plen;
				goto unlock;
			}

			/* Take a reference on each page added to the table. */
			for (; sgl->cur < sgtable.nents; sgl->cur++)
				get_page(sg_page(&sg[sgl->cur]));
			len -= plen;
			ctx->used += plen;
			copied += plen;
			size -= plen;
		} else {
			do {
				struct page *pg;
				unsigned int i = sgl->cur;

				plen = min_t(size_t, len, PAGE_SIZE);

				pg = alloc_page(GFP_KERNEL);
				if (!pg) {
					err = -ENOMEM;
					goto unlock;
				}

				sg_assign_page(sg + i, pg);

				err = memcpy_from_msg(
					page_address(sg_page(sg + i)),
					msg, plen);
				if (err) {
					__free_page(sg_page(sg + i));
					sg_assign_page(sg + i, NULL);
					goto unlock;
				}

				sg[i].length = plen;
				len -= plen;
				ctx->used += plen;
				copied += plen;
				size -= plen;
				sgl->cur++;
			} while (len && sgl->cur < MAX_SGL_ENTS);

			/* Non-zero if the last page was only partially filled. */
			ctx->merge = plen & (PAGE_SIZE - 1);
		}

		if (!size)
			sg_mark_end(sg + sgl->cur - 1);
	}

	err = 0;

	ctx->more = msg->msg_flags & MSG_MORE;

unlock:
	af_alg_data_wakeup(sk);
	ctx->write = false;
	release_sock(sk);

	/* Report the bytes queued so far if any, otherwise the error. */
	return copied ?: err;
}
EXPORT_SYMBOL_GPL(af_alg_sendmsg);

/**
 * af_alg_free_resources - release resources required for crypto request
 *
 * Frees the request's SGLs and the request itself, then clears
 * ctx->inflight.
 *
 * @areq: Request holding the TX and RX SGL
 */
void af_alg_free_resources(struct af_alg_async_req *areq)
{
	struct sock *sk = areq->sk;
	struct af_alg_ctx *ctx;

	af_alg_free_areq_sgls(areq);
	sock_kfree_s(sk, areq, areq->areqlen);

	ctx = alg_sk(sk)->private;
	ctx->inflight = false;
}
EXPORT_SYMBOL_GPL(af_alg_free_resources);

/**
 * af_alg_async_cb - AIO callback handler
 * @data: async request completion data
 * @err: if non-zero, error result to be returned via ki_complete();
 *       otherwise return the AIO output length via ki_complete().
 *
 * This handler cleans up the struct af_alg_async_req upon completion of the
 * AIO operation.
 *
 * The number of bytes to be generated with the AIO operation must be set
 * in areq->outlen before the AIO callback handler is invoked.
 */
void af_alg_async_cb(void *data, int err)
{
	struct af_alg_async_req *areq = data;
	struct sock *sk = areq->sk;
	struct kiocb *iocb = areq->iocb;
	unsigned int resultlen;

	/* Buffer size written by crypto operation. */
	resultlen = areq->outlen;

	/* areq is freed here; sk, iocb and resultlen were read beforehand. */
	af_alg_free_resources(areq);
	sock_put(sk);

	iocb->ki_complete(iocb, err ? err : (int)resultlen);
}
EXPORT_SYMBOL_GPL(af_alg_async_cb);

/**
 * af_alg_poll - poll system call handler
 * @file: file pointer
 * @sock: socket to poll
 * @wait: poll_table
 *
 * Return: EPOLLIN | EPOLLRDNORM when ctx->more is clear or data is queued,
 * combined with EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND when the socket is
 * writable.
 */
__poll_t af_alg_poll(struct file *file, struct socket *sock,
			 poll_table *wait)
{
	struct sock *sk = sock->sk;
	struct alg_sock *ask = alg_sk(sk);
	struct af_alg_ctx *ctx = ask->private;
	__poll_t mask;

	sock_poll_wait(file, sock, wait);
	mask = 0;

	if (!ctx->more || ctx->used)
		mask |= EPOLLIN | EPOLLRDNORM;

	if (af_alg_writable(sk))
		mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;

	return mask;
}
EXPORT_SYMBOL_GPL(af_alg_poll);

/**
 * af_alg_alloc_areq - allocate struct af_alg_async_req
 *
 * The request is zero-filled and ctx->inflight is set; ERR_PTR(-EBUSY) is
 * returned while another request is in flight.
 *
 * @sk: socket of connection to user space
 * @areqlen: size of struct af_alg_async_req + crypto_*_reqsize
 * Return: allocated data structure or ERR_PTR upon error
 */
struct af_alg_async_req *af_alg_alloc_areq(struct sock *sk,
					   unsigned int areqlen)
{
	struct af_alg_ctx *ctx = alg_sk(sk)->private;
	struct af_alg_async_req *areq;

	/* Only one AIO request can be in flight. */
	if (ctx->inflight)
		return ERR_PTR(-EBUSY);

	areq = sock_kmalloc(sk, areqlen, GFP_KERNEL);
	if (unlikely(!areq))
		return ERR_PTR(-ENOMEM);

	memset(areq, 0, areqlen);

	ctx->inflight = true;

	areq->areqlen = areqlen;
	areq->sk = sk;
	areq->first_rsgl.sgl.sgt.sgl = areq->first_rsgl.sgl.sgl;
	INIT_LIST_HEAD(&areq->rsgl_list);

	return areq;
}
EXPORT_SYMBOL_GPL(af_alg_alloc_areq);

/**
 * af_alg_get_rsgl - create the RX SGL for the output data from the crypto
 *		     operation
 *
 * On an error return, RX SGLs already linked into @areq->rsgl_list stay on
 * that list; @outlen is written only on success.
 *
 * @sk: socket of connection to user space
 * @msg: user space message
 * @flags: flags used to invoke recvmsg with
 * @areq: instance of the cryptographic request that will hold the RX SGL
 * @maxsize: maximum number of bytes to be pulled from user space
 * @outlen: number of bytes in the RX SGL
 * Return: 0 on success, < 0 upon error
 */
int af_alg_get_rsgl(struct sock *sk, struct msghdr *msg, int flags,
		    struct af_alg_async_req *areq, size_t maxsize,
		    size_t *outlen)
{
	struct alg_sock *ask = alg_sk(sk);
	struct af_alg_ctx *ctx = ask->private;
	size_t len = 0;

	while (maxsize > len && msg_data_left(msg)) {
		struct af_alg_rsgl *rsgl;
		ssize_t err;
		size_t seglen;

		/* limit the amount of readable buffers */
		if (!af_alg_readable(sk))
			break;

		seglen = min_t(size_t, (maxsize - len),
			       msg_data_left(msg));
		/* Never pin more pages than the remaining RX accounting budget. */
		seglen = min_t(size_t, seglen, af_alg_rcvbuf(sk));

		/* The first RX SGL is embedded in the request; later ones are allocated. */
		if (list_empty(&areq->rsgl_list)) {
			rsgl = &areq->first_rsgl;
		} else {
			rsgl = sock_kmalloc(sk, sizeof(*rsgl), GFP_KERNEL);
			if (unlikely(!rsgl))
				return -ENOMEM;
		}

		rsgl->sgl.need_unpin =
			iov_iter_extract_will_pin(&msg->msg_iter);
		rsgl->sgl.sgt.sgl = rsgl->sgl.sgl;
		rsgl->sgl.sgt.nents = 0;
		rsgl->sgl.sgt.orig_nents = 0;
		list_add_tail(&rsgl->list, &areq->rsgl_list);

		sg_init_table(rsgl->sgl.sgt.sgl, ALG_MAX_PAGES);
		err = extract_iter_to_sg(&msg->msg_iter, seglen, &rsgl->sgl.sgt,
					 ALG_MAX_PAGES, 0);
		if (err < 0) {
			rsgl->sg_num_bytes = 0;
			return err;
		}

		sg_mark_end(rsgl->sgl.sgt.sgl + rsgl->sgl.sgt.nents - 1);

		/* chain the new scatterlist with previous one */
		if (areq->last_rsgl)
			af_alg_link_sg(&areq->last_rsgl->sgl, &rsgl->sgl);

		areq->last_rsgl = rsgl;
		len += err;
		atomic_add(err, &ctx->rcvused);
		rsgl->sg_num_bytes = err;
	}

	*outlen = len;
	return 0;
}
EXPORT_SYMBOL_GPL(af_alg_get_rsgl);

/*
 * Module init: register the protocol, then the PF_ALG socket family. The
 * protocol is unregistered again if the family registration fails.
 */
static int __init af_alg_init(void)
{
	int err = proto_register(&alg_proto, 0);

	if (err)
		goto out;

	err = sock_register(&alg_family);
	if (err != 0)
		goto out_unregister_proto;

out:
	return err;

out_unregister_proto:
	proto_unregister(&alg_proto);
	goto out;
}

/* Module exit: undo af_alg_init() in reverse order. */
static void __exit af_alg_exit(void)
{
	sock_unregister(PF_ALG);
	proto_unregister(&alg_proto);
}

module_init(af_alg_init);
module_exit(af_alg_exit);
MODULE_DESCRIPTION("Crypto userspace interface");
MODULE_LICENSE("GPL");
MODULE_ALIAS_NETPROTO(AF_ALG);
521 551 457 466 458 2 1 1 1 1 1 2 2 2 1 2 2 1 1 1 1 1 5 2 1 1 463 466 466 462 462 459 3 463 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295
// SPDX-License-Identifier: GPL-2.0
/*
 *  linux/fs/filesystems.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 *
 *  table of configured filesystems
 */

#include <linux/syscalls.h>
#include <linux/fs.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/kmod.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/fs_parser.h>

/*
 * Handling of filesystem drivers list.
 * Rules:
 *	Inclusion to/removals from/scanning of list are protected by spinlock.
 *	During the unload module must call unregister_filesystem().
 *	We can access the fields of list element if:
 *		1) spinlock is held or
 *		2) we hold the reference to the module.
 *	The latter can be guaranteed by call of try_module_get(); if it
 *	returned 0 we must skip the element, otherwise we got the reference.
 *	Once the reference is obtained we can drop the spinlock.
 *
 * NOTE(review): the text above says "spinlock", but file_systems_lock below
 * is declared with DEFINE_RWLOCK and taken with read_lock()/write_lock().
 */

/* Head of the singly linked list of registered filesystem types. */
static struct file_system_type *file_systems;
static DEFINE_RWLOCK(file_systems_lock);

/* WARNING: This can be used only if we _already_ own a reference */
struct file_system_type *get_filesystem(struct file_system_type *fs)
{
	__module_get(fs->owner);
	return fs;
}

/* Drop the module reference on the owner of @fs. */
void put_filesystem(struct file_system_type *fs)
{
	module_put(fs->owner);
}

/*
 * Find the list slot for the filesystem whose name is exactly the first
 * @len characters of @name.
 *
 * Returns a pointer to the link that points at the matching entry, or to
 * the terminating NULL link if there is no match, so callers can test *p
 * or append through it. Takes no lock itself; the callers in this file
 * hold file_systems_lock.
 */
static struct file_system_type **find_filesystem(const char *name, unsigned len)
{
	struct file_system_type **p;
	for (p = &file_systems; *p; p = &(*p)->next)
		/* Prefix must match and the stored name must end right after it. */
		if (strncmp((*p)->name, name, len) == 0 &&
		    !(*p)->name[len])
			break;
	return p;
}

/**
 *	register_filesystem - register a new filesystem
 *	@fs: the file system structure
 *
 *	Adds the file system passed to the list of file systems the kernel
 *	is aware of for mount and other syscalls. Returns 0 on success,
 *	or a negative errno code on an error.
 *
 *	-EINVAL is returned if @fs has a parameter description that
 *	fs_validate_description() rejects; -EBUSY if @fs->next is already
 *	set or a filesystem of the same name is registered. A name
 *	containing '.' triggers BUG_ON().
 *
 *	The &struct file_system_type that is passed is linked into the kernel 
 *	structures and must not be freed until the file system has been
 *	unregistered.
 */
int register_filesystem(struct file_system_type * fs)
{
	int res = 0;
	struct file_system_type ** p;

	if (fs->parameters &&
	    !fs_validate_description(fs->name, fs->parameters))
		return -EINVAL;

	BUG_ON(strchr(fs->name, '.'));
	if (fs->next)
		return -EBUSY;
	write_lock(&file_systems_lock);
	p = find_filesystem(fs->name, strlen(fs->name));
	if (*p)
		res = -EBUSY;
	else
		*p = fs;
	write_unlock(&file_systems_lock);
	return res;
}

EXPORT_SYMBOL(register_filesystem);

/**
 *	unregister_filesystem - unregister a file system
 *	@fs: filesystem to unregister
 *
 *	Remove a file system that was previously successfully registered
 *	with the kernel. An error is returned if the file system is not found.
 *	Zero is returned on a success.
 *
 *	The entry is matched by pointer. After unlinking it and dropping the
 *	lock, synchronize_rcu() is called before returning.
 *	
 *	Once this function has returned the &struct file_system_type structure
 *	may be freed or reused.
 */
int unregister_filesystem(struct file_system_type * fs)
{
	struct file_system_type ** tmp;

	write_lock(&file_systems_lock);
	tmp = &file_systems;
	while (*tmp) {
		if (fs == *tmp) {
			*tmp = fs->next;
			fs->next = NULL;
			write_unlock(&file_systems_lock);
			synchronize_rcu();
			return 0;
		}
		tmp = &(*tmp)->next;
	}
	write_unlock(&file_systems_lock);

	return -EINVAL;
}

EXPORT_SYMBOL(unregister_filesystem);

#ifdef CONFIG_SYSFS_SYSCALL
/*
 * sysfs(1, name): return the zero-based list position of the filesystem
 * called @__name, -EINVAL if it is not registered, or the error from
 * strndup_user(). The copied name is freed on scope exit via __free(kfree).
 */
static int fs_index(const char __user * __name)
{
	struct file_system_type * tmp;
	char *name __free(kfree) = strndup_user(__name, PATH_MAX);
	int err, index;

	if (IS_ERR(name))
		return PTR_ERR(name);

	err = -EINVAL;
	read_lock(&file_systems_lock);
	for (tmp=file_systems, index=0 ; tmp ; tmp=tmp->next, index++) {
		if (strcmp(tmp->name, name) == 0) {
			err = index;
			break;
		}
	}
	read_unlock(&file_systems_lock);
	return err;
}

/*
 * sysfs(2, index, buf): copy the NUL-terminated name of the filesystem at
 * list position @index to @buf. Returns 0 on success, -EINVAL if there is
 * no such entry or its module reference cannot be taken, -EFAULT if the
 * copy to user space fails.
 */
static int fs_name(unsigned int index, char __user * buf)
{
	struct file_system_type * tmp;
	int len, res = -EINVAL;

	read_lock(&file_systems_lock);
	for (tmp = file_systems; tmp; tmp = tmp->next, index--) {
		if (index == 0) {
			if (try_module_get(tmp->owner))
				res = 0;
			break;
		}
	}
	read_unlock(&file_systems_lock);
	if (res)
		return res;

	/* OK, we got the reference, so we can safely block */
	len = strlen(tmp->name) + 1;
	res = copy_to_user(buf, tmp->name, len) ? -EFAULT : 0;
	put_filesystem(tmp);
	return res;
}

/* sysfs(3): return the number of entries on the filesystem list. */
static int fs_maxindex(void)
{
	struct file_system_type * tmp;
	int index;

	read_lock(&file_systems_lock);
	for (tmp = file_systems, index = 0 ; tmp ; tmp = tmp->next, index++)
		;
	read_unlock(&file_systems_lock);
	return index;
}

/*
 * Whee.. Weird sysv syscall. 
 *
 * Dispatches on @option: 1 -> fs_index(), 2 -> fs_name(), 3 -> fs_maxindex();
 * any other value yields -EINVAL.
 */
SYSCALL_DEFINE3(sysfs, int, option, unsigned long, arg1, unsigned long, arg2)
{
	int retval = -EINVAL;

	switch (option) {
		case 1:
			retval = fs_index((const char __user *) arg1);
			break;

		case 2:
			retval = fs_name(arg1, (char __user *) arg2);
			break;

		case 3:
			retval = fs_maxindex();
			break;
	}
	return retval;
}
#endif

/*
 * Pack the names of all registered filesystems that have FS_REQUIRES_DEV
 * set into @buf as consecutive NUL-terminated strings.
 *
 * Stops with a warning at the first name that no longer fits into the
 * remaining @size bytes. Returns the number of names written.
 */
int __init list_bdev_fs_names(char *buf, size_t size)
{
	struct file_system_type *p;
	size_t len;
	int count = 0;

	read_lock(&file_systems_lock);
	for (p = file_systems; p; p = p->next) {
		if (!(p->fs_flags & FS_REQUIRES_DEV))
			continue;
		len = strlen(p->name) + 1;
		if (len > size) {
			pr_warn("%s: truncating file system list\n", __func__);
			break;
		}
		memcpy(buf, p->name, len);
		buf += len;
		size -= len;
		count++;
	}
	read_unlock(&file_systems_lock);
	return count;
}

#ifdef CONFIG_PROC_FS
/*
 * seq_file show routine for the "filesystems" proc entry: one line per
 * registered filesystem, prefixed with "nodev" when FS_REQUIRES_DEV is not
 * set, followed by a tab and the name.
 */
static int filesystems_proc_show(struct seq_file *m, void *v)
{
	struct file_system_type * tmp;

	read_lock(&file_systems_lock);
	tmp = file_systems;
	while (tmp) {
		seq_printf(m, "%s\t%s\n",
			(tmp->fs_flags & FS_REQUIRES_DEV) ? "" : "nodev",
			tmp->name);
		tmp = tmp->next;
	}
	read_unlock(&file_systems_lock);
	return 0;
}

/* Create the "filesystems" proc entry at init time. */
static int __init proc_filesystems_init(void)
{
	proc_create_single("filesystems", 0, NULL, filesystems_proc_show);
	return 0;
}
module_init(proc_filesystems_init);
#endif

/*
 * Look up the filesystem named by the first @len characters of @name and
 * take a module reference on its owner. Returns NULL if it is not
 * registered or the reference cannot be taken.
 */
static struct file_system_type *__get_fs_type(const char *name, int len)
{
	struct file_system_type *fs;

	read_lock(&file_systems_lock);
	fs = *(find_filesystem(name, len));
	if (fs && !try_module_get(fs->owner))
		fs = NULL;
	read_unlock(&file_systems_lock);
	return fs;
}

/*
 * get_fs_type - look up a filesystem type by name, loading its module if needed
 *
 * Only the part of @name before the first '.' is used for the lookup. If
 * the type is not registered, request_module("fs-<name>") is tried once and
 * the lookup repeated. A name with a '.' suffix is accepted only for types
 * that have FS_HAS_SUBTYPE set; otherwise the reference is dropped and NULL
 * returned.
 *
 * Returns the type with a module reference held, or NULL.
 */
struct file_system_type *get_fs_type(const char *name)
{
	struct file_system_type *fs;
	const char *dot = strchr(name, '.');
	int len = dot ? dot - name : strlen(name);

	fs = __get_fs_type(name, len);
	if (!fs && (request_module("fs-%.*s", len, name) == 0)) {
		fs = __get_fs_type(name, len);
		if (!fs)
			pr_warn_once("request_module fs-%.*s succeeded, but still no fs?\n",
				     len, name);
	}

	if (dot && fs && !(fs->fs_flags & FS_HAS_SUBTYPE)) {
		put_filesystem(fs);
		fs = NULL;
	}
	return fs;
}

EXPORT_SYMBOL(get_fs_type);
1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 
526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 /* * fs/nfs/idmap.c * * UID and GID to name mapping for clients. * * Copyright (c) 2002 The Regents of the University of Michigan. * All rights reserved. * * Marius Aamodt Eriksen <marius@umich.edu> * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its * contributors may be used to endorse or promote products derived * from this software without specific prior written permission. 
*
 *  THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
 *  WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
 *  MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 *  DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 *  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 *  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 *  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
 *  BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 *  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 *  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 *  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
#include <linux/types.h>
#include <linux/parser.h>
#include <linux/fs.h>
#include <net/net_namespace.h>
#include <linux/sunrpc/rpc_pipe_fs.h>
#include <linux/nfs_fs.h>
#include <linux/nfs_fs_sb.h>
#include <linux/key.h>
#include <linux/keyctl.h>
#include <linux/key-type.h>
#include <keys/user-type.h>
#include <keys/request_key_auth-type.h>
#include <linux/module.h>
#include <linux/user_namespace.h>

#include "internal.h"
#include "netns.h"
#include "nfs4idmap.h"
#include "nfs4trace.h"

/*
 * Buffer length used in this file for the decimal text form of a 32-bit id.
 * NOTE(review): 11 is presumably 10 digits plus the terminating NUL.
 */
#define NFS_UINT_MAXLEN 11

/*
 * Credentials used for id resolver key requests; assigned in
 * nfs_idmap_init() and released in nfs_idmap_quit().
 */
static const struct cred *id_resolver_cache;
/* Forward declaration: the legacy key type is defined further down. */
static struct key_type key_type_id_resolver_legacy;

/* State of one in-flight legacy (rpc_pipefs) upcall. */
struct idmap_legacy_upcalldata {
	struct rpc_pipe_msg pipe_msg;	/* message queued on the idmap pipe */
	struct idmap_msg idmap_msg;	/* request payload referenced by pipe_msg */
	struct key	*authkey;	/* authorisation key; a reference is held */
	struct idmap *idmap;		/* owning idmap instance */
};

/* Per-client idmapper state, stored in the nfs_client's cl_idmap. */
struct idmap {
	struct rpc_pipe_dir_object idmap_pdo;	/* rpc_pipefs directory object */
	struct rpc_pipe		*idmap_pipe;	/* the "idmap" upcall pipe */
	struct idmap_legacy_upcalldata *idmap_upcall_data; /* pending upcall or NULL */
	struct mutex		idmap_mutex;	/* held around legacy key requests */
	struct user_namespace	*user_ns;	/* namespace used for kuid/kgid conversion */
};

/*
 * Return the user namespace to use for id conversion: the one recorded in
 * @idmap, or &init_user_ns when @idmap is NULL or has no namespace set.
 */
static struct user_namespace *idmap_userns(const struct idmap *idmap)
{
	if (idmap && idmap->user_ns)
		return idmap->user_ns;
	return &init_user_ns;
}

/**
 * nfs_fattr_init_names - initialise the nfs_fattr owner_name/group_name fields
 * @fattr: fully initialised struct nfs_fattr
 * @owner_name: owner name string cache
 *
@group_name: group name string cache */ void nfs_fattr_init_names(struct nfs_fattr *fattr, struct nfs4_string *owner_name, struct nfs4_string *group_name) { fattr->owner_name = owner_name; fattr->group_name = group_name; } static void nfs_fattr_free_owner_name(struct nfs_fattr *fattr) { fattr->valid &= ~NFS_ATTR_FATTR_OWNER_NAME; kfree(fattr->owner_name->data); } static void nfs_fattr_free_group_name(struct nfs_fattr *fattr) { fattr->valid &= ~NFS_ATTR_FATTR_GROUP_NAME; kfree(fattr->group_name->data); } static bool nfs_fattr_map_owner_name(struct nfs_server *server, struct nfs_fattr *fattr) { struct nfs4_string *owner = fattr->owner_name; kuid_t uid; if (!(fattr->valid & NFS_ATTR_FATTR_OWNER_NAME)) return false; if (nfs_map_name_to_uid(server, owner->data, owner->len, &uid) == 0) { fattr->uid = uid; fattr->valid |= NFS_ATTR_FATTR_OWNER; } return true; } static bool nfs_fattr_map_group_name(struct nfs_server *server, struct nfs_fattr *fattr) { struct nfs4_string *group = fattr->group_name; kgid_t gid; if (!(fattr->valid & NFS_ATTR_FATTR_GROUP_NAME)) return false; if (nfs_map_group_to_gid(server, group->data, group->len, &gid) == 0) { fattr->gid = gid; fattr->valid |= NFS_ATTR_FATTR_GROUP; } return true; } /** * nfs_fattr_free_names - free up the NFSv4 owner and group strings * @fattr: a fully initialised nfs_fattr structure */ void nfs_fattr_free_names(struct nfs_fattr *fattr) { if (fattr->valid & NFS_ATTR_FATTR_OWNER_NAME) nfs_fattr_free_owner_name(fattr); if (fattr->valid & NFS_ATTR_FATTR_GROUP_NAME) nfs_fattr_free_group_name(fattr); } /** * nfs_fattr_map_and_free_names - map owner/group strings into uid/gid and free * @server: pointer to the filesystem nfs_server structure * @fattr: a fully initialised nfs_fattr structure * * This helper maps the cached NFSv4 owner/group strings in fattr into * their numeric uid/gid equivalents, and then frees the cached strings. 
*/ void nfs_fattr_map_and_free_names(struct nfs_server *server, struct nfs_fattr *fattr) { if (nfs_fattr_map_owner_name(server, fattr)) nfs_fattr_free_owner_name(fattr); if (nfs_fattr_map_group_name(server, fattr)) nfs_fattr_free_group_name(fattr); } int nfs_map_string_to_numeric(const char *name, size_t namelen, __u32 *res) { unsigned long val; char buf[16]; if (memchr(name, '@', namelen) != NULL || namelen >= sizeof(buf)) return 0; memcpy(buf, name, namelen); buf[namelen] = '\0'; if (kstrtoul(buf, 0, &val) != 0) return 0; *res = val; return 1; } EXPORT_SYMBOL_GPL(nfs_map_string_to_numeric); static int nfs_map_numeric_to_string(__u32 id, char *buf, size_t buflen) { return snprintf(buf, buflen, "%u", id); } static struct key_type key_type_id_resolver = { .name = "id_resolver", .preparse = user_preparse, .free_preparse = user_free_preparse, .instantiate = generic_key_instantiate, .revoke = user_revoke, .destroy = user_destroy, .describe = user_describe, .read = user_read, }; int nfs_idmap_init(void) { struct cred *cred; struct key *keyring; int ret = 0; printk(KERN_NOTICE "NFS: Registering the %s key type\n", key_type_id_resolver.name); cred = prepare_kernel_cred(&init_task); if (!cred) return -ENOMEM; keyring = keyring_alloc(".id_resolver", GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, cred, (KEY_POS_ALL & ~KEY_POS_SETATTR) | KEY_USR_VIEW | KEY_USR_READ, KEY_ALLOC_NOT_IN_QUOTA, NULL, NULL); if (IS_ERR(keyring)) { ret = PTR_ERR(keyring); goto failed_put_cred; } ret = register_key_type(&key_type_id_resolver); if (ret < 0) goto failed_put_key; ret = register_key_type(&key_type_id_resolver_legacy); if (ret < 0) goto failed_reg_legacy; set_bit(KEY_FLAG_ROOT_CAN_CLEAR, &keyring->flags); cred->thread_keyring = keyring; cred->jit_keyring = KEY_REQKEY_DEFL_THREAD_KEYRING; id_resolver_cache = cred; return 0; failed_reg_legacy: unregister_key_type(&key_type_id_resolver); failed_put_key: key_put(keyring); failed_put_cred: put_cred(cred); return ret; } void nfs_idmap_quit(void) { 
key_revoke(id_resolver_cache->thread_keyring); unregister_key_type(&key_type_id_resolver); unregister_key_type(&key_type_id_resolver_legacy); put_cred(id_resolver_cache); } /* * Assemble the description to pass to request_key() * This function will allocate a new string and update dest to point * at it. The caller is responsible for freeing dest. * * On error 0 is returned. Otherwise, the length of dest is returned. */ static ssize_t nfs_idmap_get_desc(const char *name, size_t namelen, const char *type, size_t typelen, char **desc) { char *cp; size_t desclen = typelen + namelen + 2; *desc = kmalloc(desclen, GFP_KERNEL); if (!*desc) return -ENOMEM; cp = *desc; memcpy(cp, type, typelen); cp += typelen; *cp++ = ':'; memcpy(cp, name, namelen); cp += namelen; *cp = '\0'; return desclen; } static struct key *nfs_idmap_request_key(const char *name, size_t namelen, const char *type, struct idmap *idmap) { char *desc; struct key *rkey = ERR_PTR(-EAGAIN); ssize_t ret; ret = nfs_idmap_get_desc(name, namelen, type, strlen(type), &desc); if (ret < 0) return ERR_PTR(ret); if (!idmap->user_ns || idmap->user_ns == &init_user_ns) rkey = request_key(&key_type_id_resolver, desc, ""); if (IS_ERR(rkey)) { mutex_lock(&idmap->idmap_mutex); rkey = request_key_with_auxdata(&key_type_id_resolver_legacy, desc, NULL, "", 0, idmap); mutex_unlock(&idmap->idmap_mutex); } if (!IS_ERR(rkey)) set_bit(KEY_FLAG_ROOT_CAN_INVAL, &rkey->flags); kfree(desc); return rkey; } static ssize_t nfs_idmap_get_key(const char *name, size_t namelen, const char *type, void *data, size_t data_size, struct idmap *idmap) { struct key *rkey; const struct user_key_payload *payload; ssize_t ret; scoped_with_creds(id_resolver_cache) rkey = nfs_idmap_request_key(name, namelen, type, idmap); if (IS_ERR(rkey)) { ret = PTR_ERR(rkey); goto out; } rcu_read_lock(); rkey->perm |= KEY_USR_VIEW; ret = key_validate(rkey); if (ret < 0) goto out_up; payload = user_key_payload_rcu(rkey); if (IS_ERR_OR_NULL(payload)) { ret = 
PTR_ERR(payload); goto out_up; } ret = payload->datalen; if (ret > 0 && ret <= data_size) memcpy(data, payload->data, ret); else ret = -EINVAL; out_up: rcu_read_unlock(); key_put(rkey); out: return ret; } /* ID -> Name */ static ssize_t nfs_idmap_lookup_name(__u32 id, const char *type, char *buf, size_t buflen, struct idmap *idmap) { char id_str[NFS_UINT_MAXLEN]; int id_len; ssize_t ret; id_len = nfs_map_numeric_to_string(id, id_str, sizeof(id_str)); ret = nfs_idmap_get_key(id_str, id_len, type, buf, buflen, idmap); if (ret < 0) return -EINVAL; return ret; } /* Name -> ID */ static int nfs_idmap_lookup_id(const char *name, size_t namelen, const char *type, __u32 *id, struct idmap *idmap) { char id_str[NFS_UINT_MAXLEN]; long id_long; ssize_t data_size; int ret = 0; data_size = nfs_idmap_get_key(name, namelen, type, id_str, NFS_UINT_MAXLEN, idmap); if (data_size <= 0) { ret = -EINVAL; } else { ret = kstrtol(id_str, 10, &id_long); if (!ret) *id = (__u32)id_long; } return ret; } /* idmap classic begins here */ enum { Opt_find_uid, Opt_find_gid, Opt_find_user, Opt_find_group, Opt_find_err }; static const match_table_t nfs_idmap_tokens = { { Opt_find_uid, "uid:%s" }, { Opt_find_gid, "gid:%s" }, { Opt_find_user, "user:%s" }, { Opt_find_group, "group:%s" }, { Opt_find_err, NULL } }; static int nfs_idmap_legacy_upcall(struct key *, void *); static ssize_t idmap_pipe_downcall(struct file *, const char __user *, size_t); static void idmap_release_pipe(struct inode *); static void idmap_pipe_destroy_msg(struct rpc_pipe_msg *); static const struct rpc_pipe_ops idmap_upcall_ops = { .upcall = rpc_pipe_generic_upcall, .downcall = idmap_pipe_downcall, .release_pipe = idmap_release_pipe, .destroy_msg = idmap_pipe_destroy_msg, }; static struct key_type key_type_id_resolver_legacy = { .name = "id_legacy", .preparse = user_preparse, .free_preparse = user_free_preparse, .instantiate = generic_key_instantiate, .revoke = user_revoke, .destroy = user_destroy, .describe = user_describe, 
.read = user_read, .request_key = nfs_idmap_legacy_upcall, }; static void nfs_idmap_pipe_destroy(struct dentry *dir, struct rpc_pipe_dir_object *pdo) { struct idmap *idmap = pdo->pdo_data; rpc_unlink(idmap->idmap_pipe); } static int nfs_idmap_pipe_create(struct dentry *dir, struct rpc_pipe_dir_object *pdo) { struct idmap *idmap = pdo->pdo_data; return rpc_mkpipe_dentry(dir, "idmap", idmap, idmap->idmap_pipe); } static const struct rpc_pipe_dir_object_ops nfs_idmap_pipe_dir_object_ops = { .create = nfs_idmap_pipe_create, .destroy = nfs_idmap_pipe_destroy, }; int nfs_idmap_new(struct nfs_client *clp) { struct idmap *idmap; struct rpc_pipe *pipe; int error; idmap = kzalloc_obj(*idmap); if (idmap == NULL) return -ENOMEM; mutex_init(&idmap->idmap_mutex); idmap->user_ns = get_user_ns(clp->cl_rpcclient->cl_cred->user_ns); rpc_init_pipe_dir_object(&idmap->idmap_pdo, &nfs_idmap_pipe_dir_object_ops, idmap); pipe = rpc_mkpipe_data(&idmap_upcall_ops, 0); if (IS_ERR(pipe)) { error = PTR_ERR(pipe); goto err; } idmap->idmap_pipe = pipe; error = rpc_add_pipe_dir_object(clp->cl_net, &clp->cl_rpcclient->cl_pipedir_objects, &idmap->idmap_pdo); if (error) goto err_destroy_pipe; clp->cl_idmap = idmap; return 0; err_destroy_pipe: rpc_destroy_pipe_data(idmap->idmap_pipe); err: put_user_ns(idmap->user_ns); kfree(idmap); return error; } void nfs_idmap_delete(struct nfs_client *clp) { struct idmap *idmap = clp->cl_idmap; if (!idmap) return; clp->cl_idmap = NULL; rpc_remove_pipe_dir_object(clp->cl_net, &clp->cl_rpcclient->cl_pipedir_objects, &idmap->idmap_pdo); rpc_destroy_pipe_data(idmap->idmap_pipe); put_user_ns(idmap->user_ns); kfree(idmap); } static int nfs_idmap_prepare_message(char *desc, struct idmap *idmap, struct idmap_msg *im, struct rpc_pipe_msg *msg) { substring_t substr; int token, ret; im->im_type = IDMAP_TYPE_GROUP; token = match_token(desc, nfs_idmap_tokens, &substr); switch (token) { case Opt_find_uid: im->im_type = IDMAP_TYPE_USER; fallthrough; case Opt_find_gid: 
im->im_conv = IDMAP_CONV_NAMETOID; ret = match_strlcpy(im->im_name, &substr, IDMAP_NAMESZ); break; case Opt_find_user: im->im_type = IDMAP_TYPE_USER; fallthrough; case Opt_find_group: im->im_conv = IDMAP_CONV_IDTONAME; ret = match_int(&substr, &im->im_id); if (ret) goto out; break; default: ret = -EINVAL; goto out; } msg->data = im; msg->len = sizeof(struct idmap_msg); out: return ret; } static bool nfs_idmap_prepare_pipe_upcall(struct idmap *idmap, struct idmap_legacy_upcalldata *data) { if (idmap->idmap_upcall_data != NULL) { WARN_ON_ONCE(1); return false; } idmap->idmap_upcall_data = data; return true; } static void nfs_idmap_complete_pipe_upcall(struct idmap_legacy_upcalldata *data, int ret) { complete_request_key(data->authkey, ret); key_put(data->authkey); kfree(data); } static void nfs_idmap_abort_pipe_upcall(struct idmap *idmap, struct idmap_legacy_upcalldata *data, int ret) { if (cmpxchg(&idmap->idmap_upcall_data, data, NULL) == data) nfs_idmap_complete_pipe_upcall(data, ret); } static int nfs_idmap_legacy_upcall(struct key *authkey, void *aux) { struct idmap_legacy_upcalldata *data; struct request_key_auth *rka = get_request_key_auth(authkey); struct rpc_pipe_msg *msg; struct idmap_msg *im; struct idmap *idmap = aux; struct key *key = rka->target_key; int ret = -ENOKEY; if (!aux) goto out1; /* msg and im are freed in idmap_pipe_destroy_msg */ ret = -ENOMEM; data = kzalloc_obj(*data); if (!data) goto out1; msg = &data->pipe_msg; im = &data->idmap_msg; data->idmap = idmap; data->authkey = key_get(authkey); ret = nfs_idmap_prepare_message(key->description, idmap, im, msg); if (ret < 0) goto out2; ret = -EAGAIN; if (!nfs_idmap_prepare_pipe_upcall(idmap, data)) goto out2; ret = rpc_queue_upcall(idmap->idmap_pipe, msg); if (ret < 0) nfs_idmap_abort_pipe_upcall(idmap, data, ret); return ret; out2: kfree(data); out1: complete_request_key(authkey, ret); return ret; } static int nfs_idmap_instantiate(struct key *key, struct key *authkey, char *data, size_t datalen) 
{ return key_instantiate_and_link(key, data, datalen, id_resolver_cache->thread_keyring, authkey); } static int nfs_idmap_read_and_verify_message(struct idmap_msg *im, struct idmap_msg *upcall, struct key *key, struct key *authkey) { char id_str[NFS_UINT_MAXLEN]; size_t len; int ret = -ENOKEY; /* ret = -ENOKEY */ if (upcall->im_type != im->im_type || upcall->im_conv != im->im_conv) goto out; switch (im->im_conv) { case IDMAP_CONV_NAMETOID: if (strcmp(upcall->im_name, im->im_name) != 0) break; /* Note: here we store the NUL terminator too */ len = 1 + nfs_map_numeric_to_string(im->im_id, id_str, sizeof(id_str)); ret = nfs_idmap_instantiate(key, authkey, id_str, len); break; case IDMAP_CONV_IDTONAME: if (upcall->im_id != im->im_id) break; len = strlen(im->im_name); ret = nfs_idmap_instantiate(key, authkey, im->im_name, len); break; default: ret = -EINVAL; } out: return ret; } static ssize_t idmap_pipe_downcall(struct file *filp, const char __user *src, size_t mlen) { struct request_key_auth *rka; struct rpc_inode *rpci = RPC_I(file_inode(filp)); struct idmap *idmap = (struct idmap *)rpci->private; struct idmap_legacy_upcalldata *data; struct key *authkey; struct idmap_msg im; size_t namelen_in; int ret = -ENOKEY; /* If instantiation is successful, anyone waiting for key construction * will have been woken up and someone else may now have used * idmap_key_cons - so after this point we may no longer touch it. 
*/ data = xchg(&idmap->idmap_upcall_data, NULL); if (data == NULL) goto out_noupcall; authkey = data->authkey; rka = get_request_key_auth(authkey); if (mlen != sizeof(im)) { ret = -ENOSPC; goto out; } if (copy_from_user(&im, src, mlen) != 0) { ret = -EFAULT; goto out; } if (!(im.im_status & IDMAP_STATUS_SUCCESS)) { ret = -ENOKEY; goto out; } namelen_in = strnlen(im.im_name, IDMAP_NAMESZ); if (namelen_in == 0 || namelen_in == IDMAP_NAMESZ) { ret = -EINVAL; goto out; } ret = nfs_idmap_read_and_verify_message(&im, &data->idmap_msg, rka->target_key, authkey); if (ret >= 0) { key_set_timeout(rka->target_key, nfs_idmap_cache_timeout); ret = mlen; } out: nfs_idmap_complete_pipe_upcall(data, ret); out_noupcall: return ret; } static void idmap_pipe_destroy_msg(struct rpc_pipe_msg *msg) { struct idmap_legacy_upcalldata *data = container_of(msg, struct idmap_legacy_upcalldata, pipe_msg); struct idmap *idmap = data->idmap; if (msg->errno) nfs_idmap_abort_pipe_upcall(idmap, data, msg->errno); } static void idmap_release_pipe(struct inode *inode) { struct rpc_inode *rpci = RPC_I(inode); struct idmap *idmap = (struct idmap *)rpci->private; struct idmap_legacy_upcalldata *data; data = xchg(&idmap->idmap_upcall_data, NULL); if (data) nfs_idmap_complete_pipe_upcall(data, -EPIPE); } int nfs_map_name_to_uid(const struct nfs_server *server, const char *name, size_t namelen, kuid_t *uid) { struct idmap *idmap = server->nfs_client->cl_idmap; __u32 id = -1; int ret = 0; if (!nfs_map_string_to_numeric(name, namelen, &id)) ret = nfs_idmap_lookup_id(name, namelen, "uid", &id, idmap); if (ret == 0) { *uid = make_kuid(idmap_userns(idmap), id); if (!uid_valid(*uid)) ret = -ERANGE; } trace_nfs4_map_name_to_uid(name, namelen, id, ret); return ret; } int nfs_map_group_to_gid(const struct nfs_server *server, const char *name, size_t namelen, kgid_t *gid) { struct idmap *idmap = server->nfs_client->cl_idmap; __u32 id = -1; int ret = 0; if (!nfs_map_string_to_numeric(name, namelen, &id)) ret = 
nfs_idmap_lookup_id(name, namelen, "gid", &id, idmap); if (ret == 0) { *gid = make_kgid(idmap_userns(idmap), id); if (!gid_valid(*gid)) ret = -ERANGE; } trace_nfs4_map_group_to_gid(name, namelen, id, ret); return ret; } int nfs_map_uid_to_name(const struct nfs_server *server, kuid_t uid, char *buf, size_t buflen) { struct idmap *idmap = server->nfs_client->cl_idmap; int ret = -EINVAL; __u32 id; id = from_kuid_munged(idmap_userns(idmap), uid); if (!(server->caps & NFS_CAP_UIDGID_NOMAP)) ret = nfs_idmap_lookup_name(id, "user", buf, buflen, idmap); if (ret < 0) ret = nfs_map_numeric_to_string(id, buf, buflen); trace_nfs4_map_uid_to_name(buf, ret, id, ret); return ret; } int nfs_map_gid_to_group(const struct nfs_server *server, kgid_t gid, char *buf, size_t buflen) { struct idmap *idmap = server->nfs_client->cl_idmap; int ret = -EINVAL; __u32 id; id = from_kgid_munged(idmap_userns(idmap), gid); if (!(server->caps & NFS_CAP_UIDGID_NOMAP)) ret = nfs_idmap_lookup_name(id, "group", buf, buflen, idmap); if (ret < 0) ret = nfs_map_numeric_to_string(id, buf, buflen); trace_nfs4_map_gid_to_group(buf, ret, id, ret); return ret; }
57 56 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_KCOV_H
#define _LINUX_KCOV_H

#include <linux/sched.h>
#include <uapi/linux/kcov.h>

struct task_struct;

#ifdef CONFIG_KCOV

/* Values stored in a task's kcov_mode field. */
enum kcov_mode {
	/* Coverage collection is not enabled yet. */
	KCOV_MODE_DISABLED = 0,
	/* KCOV was initialized, but tracing mode hasn't been chosen yet. */
	KCOV_MODE_INIT = 1,
	/*
	 * Tracing coverage collection mode.
	 * Covered PCs are collected in a per-task buffer.
	 */
	KCOV_MODE_TRACE_PC = 2,
	/* Collecting comparison operands mode. */
	KCOV_MODE_TRACE_CMP = 3,
	/* The process owns a KCOV remote reference. */
	KCOV_MODE_REMOTE = 4,
};

/*
 * Flag bit OR-ed into kcov_mode by kcov_prepare_switch() and cleared again
 * by kcov_finish_switch(); it sits well above the enum kcov_mode values.
 */
#define KCOV_IN_CTXSW	(1 << 30)

void kcov_task_init(struct task_struct *t);
void kcov_task_exit(struct task_struct *t);

/* Set KCOV_IN_CTXSW in task @t's kcov_mode. */
#define kcov_prepare_switch(t)			\
do {						\
	(t)->kcov_mode |= KCOV_IN_CTXSW;	\
} while (0)

/* Clear KCOV_IN_CTXSW in task @t's kcov_mode. */
#define kcov_finish_switch(t)			\
do {						\
	(t)->kcov_mode &= ~KCOV_IN_CTXSW;	\
} while (0)

/* See Documentation/dev-tools/kcov.rst for usage details. */
void kcov_remote_start(u64 handle);
void kcov_remote_stop(void);
u64 kcov_common_handle(void);

/* Start a remote section using a KCOV_SUBSYSTEM_COMMON handle built from @id. */
static inline void kcov_remote_start_common(u64 id)
{
	kcov_remote_start(kcov_remote_handle(KCOV_SUBSYSTEM_COMMON, id));
}

/* Start a remote section using a KCOV_SUBSYSTEM_USB handle built from @id. */
static inline void kcov_remote_start_usb(u64 id)
{
	kcov_remote_start(kcov_remote_handle(KCOV_SUBSYSTEM_USB, id));
}

/*
 * The softirq flavor of kcov_remote_*() functions is introduced as a temporary
 * work around for kcov's lack of nested remote coverage sections support in
 * task context. Adding support for nested sections is tracked in:
 * https://bugzilla.kernel.org/show_bug.cgi?id=210337
 */

/* Like kcov_remote_start_usb(), but a no-op unless serving a softirq outside hardirq. */
static inline void kcov_remote_start_usb_softirq(u64 id)
{
	if (in_serving_softirq() && !in_hardirq())
		kcov_remote_start_usb(id);
}

/* Like kcov_remote_stop(), but a no-op unless serving a softirq outside hardirq. */
static inline void kcov_remote_stop_softirq(void)
{
	if (in_serving_softirq() && !in_hardirq())
		kcov_remote_stop();
}

/* 64-bit operand type of the comparison hooks; the underlying type depends on CONFIG_64BIT. */
#ifdef CONFIG_64BIT
typedef unsigned long kcov_u64;
#else
typedef unsigned long long kcov_u64;
#endif

/*
 * Instrumentation hooks.
 * NOTE(review): presumably called from compiler-inserted sanitizer-coverage
 * instrumentation rather than by hand - confirm against kernel/kcov.c.
 */
void __sanitizer_cov_trace_pc(void);
void __sanitizer_cov_trace_cmp1(u8 arg1, u8 arg2);
void __sanitizer_cov_trace_cmp2(u16 arg1, u16 arg2);
void __sanitizer_cov_trace_cmp4(u32 arg1, u32 arg2);
void __sanitizer_cov_trace_cmp8(kcov_u64 arg1, kcov_u64 arg2);
void __sanitizer_cov_trace_const_cmp1(u8 arg1, u8 arg2);
void __sanitizer_cov_trace_const_cmp2(u16 arg1, u16 arg2);
void __sanitizer_cov_trace_const_cmp4(u32 arg1, u32 arg2);
void __sanitizer_cov_trace_const_cmp8(kcov_u64 arg1, kcov_u64 arg2);
void __sanitizer_cov_trace_switch(kcov_u64 val, void *cases);

#else

/* CONFIG_KCOV disabled: every entry point is an empty inline stub. */
static inline void kcov_task_init(struct task_struct *t) {}
static inline void kcov_task_exit(struct task_struct *t) {}
static inline void kcov_prepare_switch(struct task_struct *t) {}
static inline void kcov_finish_switch(struct task_struct *t) {}
static inline void kcov_remote_start(u64 handle) {}
static inline void kcov_remote_stop(void) {}
static inline u64 kcov_common_handle(void)
{
	return 0;
}
static inline void kcov_remote_start_common(u64 id) {}
static inline void kcov_remote_start_usb(u64 id) {}
static inline void kcov_remote_start_usb_softirq(u64 id) {}
static inline void kcov_remote_stop_softirq(void) {}

#endif /* CONFIG_KCOV */
#endif /* _LINUX_KCOV_H */
3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 
527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 // SPDX-License-Identifier: GPL-2.0-or-later /* * lzx_decompress.c - A decompressor for the LZX compression format, which can * be used in "System Compressed" files. This is based on the code from wimlib. * This code only supports a window size (dictionary size) of 32768 bytes, since * this is the only size used in System Compression. * * Copyright (C) 2015 Eric Biggers */ #include "decompress_common.h" #include "lib.h" /* Number of literal byte values */ #define LZX_NUM_CHARS 256 /* The smallest and largest allowed match lengths */ #define LZX_MIN_MATCH_LEN 2 #define LZX_MAX_MATCH_LEN 257 /* Number of distinct match lengths that can be represented */ #define LZX_NUM_LENS (LZX_MAX_MATCH_LEN - LZX_MIN_MATCH_LEN + 1) /* Number of match lengths for which no length symbol is required */ #define LZX_NUM_PRIMARY_LENS 7 #define LZX_NUM_LEN_HEADERS (LZX_NUM_PRIMARY_LENS + 1) /* Valid values of the 3-bit block type field */ #define LZX_BLOCKTYPE_VERBATIM 1 #define LZX_BLOCKTYPE_ALIGNED 2 #define LZX_BLOCKTYPE_UNCOMPRESSED 3 /* Number of offset slots for a window size of 32768 */ #define LZX_NUM_OFFSET_SLOTS 30 /* Number of symbols in the main code for a window size of 32768 */ #define LZX_MAINCODE_NUM_SYMBOLS \ (LZX_NUM_CHARS + (LZX_NUM_OFFSET_SLOTS * LZX_NUM_LEN_HEADERS)) /* Number of symbols in the length code */ #define LZX_LENCODE_NUM_SYMBOLS (LZX_NUM_LENS - LZX_NUM_PRIMARY_LENS) /* Number of symbols in the 
precode */
#define LZX_PRECODE_NUM_SYMBOLS		20

/* Number of bits in which each precode codeword length is represented */
#define LZX_PRECODE_ELEMENT_SIZE	4

/* Number of low-order bits of each match offset that are entropy-encoded in
 * aligned offset blocks */
#define LZX_NUM_ALIGNED_OFFSET_BITS	3

/* Number of symbols in the aligned offset code */
#define LZX_ALIGNEDCODE_NUM_SYMBOLS	(1 << LZX_NUM_ALIGNED_OFFSET_BITS)

/* Mask for the match offset bits that are entropy-encoded in aligned offset
 * blocks */
#define LZX_ALIGNED_OFFSET_BITMASK	((1 << LZX_NUM_ALIGNED_OFFSET_BITS) - 1)

/* Number of bits in which each aligned offset codeword length is represented */
#define LZX_ALIGNEDCODE_ELEMENT_SIZE	3

/* Maximum lengths (in bits) of the codewords in each Huffman code.
 * The precode and aligned-code limits are the largest values representable
 * in their respective length fields (15 and 7). */
#define LZX_MAX_MAIN_CODEWORD_LEN	16
#define LZX_MAX_LEN_CODEWORD_LEN	16
#define LZX_MAX_PRE_CODEWORD_LEN	((1 << LZX_PRECODE_ELEMENT_SIZE) - 1)
#define LZX_MAX_ALIGNED_CODEWORD_LEN	((1 << LZX_ALIGNEDCODE_ELEMENT_SIZE) - 1)

/* The default "filesize" value used in pre/post-processing.  In the LZX format
 * used in cabinet files this value must be given to the decompressor, whereas
 * in the LZX format used in WIM files and system-compressed files this value is
 * fixed at 12000000. */
#define LZX_DEFAULT_FILESIZE		12000000

/* Assumed block size when the encoded block size begins with a 0 bit. */
#define LZX_DEFAULT_BLOCK_SIZE		32768

/* Number of offsets in the recent (or "repeat") offsets queue. */
#define LZX_NUM_RECENT_OFFSETS		3

/* These values are chosen for fast decompression.
 * Each one is the log2 size of the direct-lookup part of the matching
 * decode table and is the table_bits argument handed to read_huffsym(). */
#define LZX_MAINCODE_TABLEBITS		11
#define LZX_LENCODE_TABLEBITS		10
#define LZX_PRECODE_TABLEBITS		6
#define LZX_ALIGNEDCODE_TABLEBITS	7

/* Extra slack appended to the maincode_lens and lencode_lens arrays.
 * NOTE(review): presumably bounds how far the codeword-length reader may
 * run past the end of those arrays - confirm against the reader. */
#define LZX_READ_LENS_MAX_OVERRUN	50

/* Mapping: offset slot => first match offset that uses that offset slot.
*/ static const u32 lzx_offset_slot_base[LZX_NUM_OFFSET_SLOTS + 1] = { 0, 1, 2, 3, 4, /* 0 --- 4 */ 6, 8, 12, 16, 24, /* 5 --- 9 */ 32, 48, 64, 96, 128, /* 10 --- 14 */ 192, 256, 384, 512, 768, /* 15 --- 19 */ 1024, 1536, 2048, 3072, 4096, /* 20 --- 24 */ 6144, 8192, 12288, 16384, 24576, /* 25 --- 29 */ 32768, /* extra */ }; /* Mapping: offset slot => how many extra bits must be read and added to the * corresponding offset slot base to decode the match offset. */ static const u8 lzx_extra_offset_bits[LZX_NUM_OFFSET_SLOTS] = { 0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, }; /* Reusable heap-allocated memory for LZX decompression */ struct lzx_decompressor { /* Huffman decoding tables, and arrays that map symbols to codeword * lengths */ u16 maincode_decode_table[(1 << LZX_MAINCODE_TABLEBITS) + (LZX_MAINCODE_NUM_SYMBOLS * 2)]; u8 maincode_lens[LZX_MAINCODE_NUM_SYMBOLS + LZX_READ_LENS_MAX_OVERRUN]; u16 lencode_decode_table[(1 << LZX_LENCODE_TABLEBITS) + (LZX_LENCODE_NUM_SYMBOLS * 2)]; u8 lencode_lens[LZX_LENCODE_NUM_SYMBOLS + LZX_READ_LENS_MAX_OVERRUN]; u16 alignedcode_decode_table[(1 << LZX_ALIGNEDCODE_TABLEBITS) + (LZX_ALIGNEDCODE_NUM_SYMBOLS * 2)]; u8 alignedcode_lens[LZX_ALIGNEDCODE_NUM_SYMBOLS]; u16 precode_decode_table[(1 << LZX_PRECODE_TABLEBITS) + (LZX_PRECODE_NUM_SYMBOLS * 2)]; u8 precode_lens[LZX_PRECODE_NUM_SYMBOLS]; /* Temporary space for make_huffman_decode_table() */ u16 working_space[2 * (1 + LZX_MAX_MAIN_CODEWORD_LEN) + LZX_MAINCODE_NUM_SYMBOLS]; }; static void undo_e8_translation(void *target, s32 input_pos) { s32 abs_offset, rel_offset; abs_offset = get_unaligned_le32(target); if (abs_offset >= 0) { if (abs_offset < LZX_DEFAULT_FILESIZE) { /* "good translation" */ rel_offset = abs_offset - input_pos; put_unaligned_le32(rel_offset, target); } } else { if (abs_offset >= -input_pos) { /* "compensating translation" */ rel_offset = abs_offset + LZX_DEFAULT_FILESIZE; put_unaligned_le32(rel_offset, 
target); } } } /* * Undo the 'E8' preprocessing used in LZX. Before compression, the * uncompressed data was preprocessed by changing the targets of suspected x86 * CALL instructions from relative offsets to absolute offsets. After * match/literal decoding, the decompressor must undo the translation. */ static void lzx_postprocess(u8 *data, u32 size) { /* * A worthwhile optimization is to push the end-of-buffer check into the * relatively rare E8 case. This is possible if we replace the last six * bytes of data with E8 bytes; then we are guaranteed to hit an E8 byte * before reaching end-of-buffer. In addition, this scheme guarantees * that no translation can begin following an E8 byte in the last 10 * bytes because a 4-byte offset containing E8 as its high byte is a * large negative number that is not valid for translation. That is * exactly what we need. */ u8 *tail; u8 saved_bytes[6]; u8 *p; if (size <= 10) return; tail = &data[size - 6]; memcpy(saved_bytes, tail, 6); memset(tail, 0xE8, 6); p = data; for (;;) { while (*p != 0xE8) p++; if (p >= tail) break; undo_e8_translation(p + 1, p - data); p += 5; } memcpy(tail, saved_bytes, 6); } /* Read a Huffman-encoded symbol using the precode. */ static forceinline u32 read_presym(const struct lzx_decompressor *d, struct input_bitstream *is) { return read_huffsym(is, d->precode_decode_table, LZX_PRECODE_TABLEBITS, LZX_MAX_PRE_CODEWORD_LEN); } /* Read a Huffman-encoded symbol using the main code. */ static forceinline u32 read_mainsym(const struct lzx_decompressor *d, struct input_bitstream *is) { return read_huffsym(is, d->maincode_decode_table, LZX_MAINCODE_TABLEBITS, LZX_MAX_MAIN_CODEWORD_LEN); } /* Read a Huffman-encoded symbol using the length code. */ static forceinline u32 read_lensym(const struct lzx_decompressor *d, struct input_bitstream *is) { return read_huffsym(is, d->lencode_decode_table, LZX_LENCODE_TABLEBITS, LZX_MAX_LEN_CODEWORD_LEN); } /* Read a Huffman-encoded symbol using the aligned offset code. 
*/ static forceinline u32 read_alignedsym(const struct lzx_decompressor *d, struct input_bitstream *is) { return read_huffsym(is, d->alignedcode_decode_table, LZX_ALIGNEDCODE_TABLEBITS, LZX_MAX_ALIGNED_CODEWORD_LEN); } /* * Read the precode from the compressed input bitstream, then use it to decode * @num_lens codeword length values. * * @is: The input bitstream. * * @lens: An array that contains the length values from the previous time * the codeword lengths for this Huffman code were read, or all 0's * if this is the first time. This array must have at least * (@num_lens + LZX_READ_LENS_MAX_OVERRUN) entries. * * @num_lens: Number of length values to decode. * * Returns 0 on success, or -1 if the data was invalid. */ static int lzx_read_codeword_lens(struct lzx_decompressor *d, struct input_bitstream *is, u8 *lens, u32 num_lens) { u8 *len_ptr = lens; u8 *lens_end = lens + num_lens; int i; /* Read the lengths of the precode codewords. These are given * explicitly. */ for (i = 0; i < LZX_PRECODE_NUM_SYMBOLS; i++) { d->precode_lens[i] = bitstream_read_bits(is, LZX_PRECODE_ELEMENT_SIZE); } /* Make the decoding table for the precode. */ if (make_huffman_decode_table(d->precode_decode_table, LZX_PRECODE_NUM_SYMBOLS, LZX_PRECODE_TABLEBITS, d->precode_lens, LZX_MAX_PRE_CODEWORD_LEN, d->working_space)) return -1; /* Decode the codeword lengths. */ do { u32 presym; u8 len; /* Read the next precode symbol. 
*/ presym = read_presym(d, is); if (presym < 17) { /* Difference from old length */ len = *len_ptr - presym; if ((s8)len < 0) len += 17; *len_ptr++ = len; } else { /* Special RLE values */ u32 run_len; if (presym == 17) { /* Run of 0's */ run_len = 4 + bitstream_read_bits(is, 4); len = 0; } else if (presym == 18) { /* Longer run of 0's */ run_len = 20 + bitstream_read_bits(is, 5); len = 0; } else { /* Run of identical lengths */ run_len = 4 + bitstream_read_bits(is, 1); presym = read_presym(d, is); if (presym > 17) return -1; len = *len_ptr - presym; if ((s8)len < 0) len += 17; } do { *len_ptr++ = len; } while (--run_len); /* Worst case overrun is when presym == 18, * run_len == 20 + 31, and only 1 length was remaining. * So LZX_READ_LENS_MAX_OVERRUN == 50. * * Overrun while reading the first half of maincode_lens * can corrupt the previous values in the second half. * This doesn't really matter because the resulting * lengths will still be in range, and data that * generates overruns is invalid anyway. */ } } while (len_ptr < lens_end); return 0; } /* * Read the header of an LZX block and save the block type and (uncompressed) * size in *block_type_ret and *block_size_ret, respectively. * * If the block is compressed, also update the Huffman decode @tables with the * new Huffman codes. If the block is uncompressed, also update the match * offset @queue with the new match offsets. * * Return 0 on success, or -1 if the data was invalid. */ static int lzx_read_block_header(struct lzx_decompressor *d, struct input_bitstream *is, int *block_type_ret, u32 *block_size_ret, u32 recent_offsets[]) { int block_type; u32 block_size; int i; bitstream_ensure_bits(is, 4); /* The first three bits tell us what kind of block it is, and should be * one of the LZX_BLOCKTYPE_* values. */ block_type = bitstream_pop_bits(is, 3); /* Read the block size. 
*/ if (bitstream_pop_bits(is, 1)) { block_size = LZX_DEFAULT_BLOCK_SIZE; } else { block_size = 0; block_size |= bitstream_read_bits(is, 8); block_size <<= 8; block_size |= bitstream_read_bits(is, 8); } switch (block_type) { case LZX_BLOCKTYPE_ALIGNED: /* Read the aligned offset code and prepare its decode table. */ for (i = 0; i < LZX_ALIGNEDCODE_NUM_SYMBOLS; i++) { d->alignedcode_lens[i] = bitstream_read_bits(is, LZX_ALIGNEDCODE_ELEMENT_SIZE); } if (make_huffman_decode_table(d->alignedcode_decode_table, LZX_ALIGNEDCODE_NUM_SYMBOLS, LZX_ALIGNEDCODE_TABLEBITS, d->alignedcode_lens, LZX_MAX_ALIGNED_CODEWORD_LEN, d->working_space)) return -1; /* Fall though, since the rest of the header for aligned offset * blocks is the same as that for verbatim blocks. */ fallthrough; case LZX_BLOCKTYPE_VERBATIM: /* Read the main code and prepare its decode table. * * Note that the codeword lengths in the main code are encoded * in two parts: one part for literal symbols, and one part for * match symbols. */ if (lzx_read_codeword_lens(d, is, d->maincode_lens, LZX_NUM_CHARS)) return -1; if (lzx_read_codeword_lens(d, is, d->maincode_lens + LZX_NUM_CHARS, LZX_MAINCODE_NUM_SYMBOLS - LZX_NUM_CHARS)) return -1; if (make_huffman_decode_table(d->maincode_decode_table, LZX_MAINCODE_NUM_SYMBOLS, LZX_MAINCODE_TABLEBITS, d->maincode_lens, LZX_MAX_MAIN_CODEWORD_LEN, d->working_space)) return -1; /* Read the length code and prepare its decode table. */ if (lzx_read_codeword_lens(d, is, d->lencode_lens, LZX_LENCODE_NUM_SYMBOLS)) return -1; if (make_huffman_decode_table(d->lencode_decode_table, LZX_LENCODE_NUM_SYMBOLS, LZX_LENCODE_TABLEBITS, d->lencode_lens, LZX_MAX_LEN_CODEWORD_LEN, d->working_space)) return -1; break; case LZX_BLOCKTYPE_UNCOMPRESSED: /* Before reading the three recent offsets from the uncompressed * block header, the stream must be aligned on a 16-bit * boundary. But if the stream is *already* aligned, then the * next 16 bits must be discarded. 
*/ bitstream_ensure_bits(is, 1); bitstream_align(is); recent_offsets[0] = bitstream_read_u32(is); recent_offsets[1] = bitstream_read_u32(is); recent_offsets[2] = bitstream_read_u32(is); /* Offsets of 0 are invalid. */ if (recent_offsets[0] == 0 || recent_offsets[1] == 0 || recent_offsets[2] == 0) return -1; break; default: /* Unrecognized block type. */ return -1; } *block_type_ret = block_type; *block_size_ret = block_size; return 0; } /* Decompress a block of LZX-compressed data. */ static int lzx_decompress_block(const struct lzx_decompressor *d, struct input_bitstream *is, int block_type, u32 block_size, u8 * const out_begin, u8 *out_next, u32 recent_offsets[]) { u8 * const block_end = out_next + block_size; u32 ones_if_aligned = 0U - (block_type == LZX_BLOCKTYPE_ALIGNED); do { u32 mainsym; u32 match_len; u32 match_offset; u32 offset_slot; u32 num_extra_bits; mainsym = read_mainsym(d, is); if (mainsym < LZX_NUM_CHARS) { /* Literal */ *out_next++ = mainsym; continue; } /* Match */ /* Decode the length header and offset slot. */ mainsym -= LZX_NUM_CHARS; match_len = mainsym % LZX_NUM_LEN_HEADERS; offset_slot = mainsym / LZX_NUM_LEN_HEADERS; /* If needed, read a length symbol to decode the full length. */ if (match_len == LZX_NUM_PRIMARY_LENS) match_len += read_lensym(d, is); match_len += LZX_MIN_MATCH_LEN; if (offset_slot < LZX_NUM_RECENT_OFFSETS) { /* Repeat offset */ /* Note: This isn't a real LRU queue, since using the R2 * offset doesn't bump the R1 offset down to R2. This * quirk allows all 3 recent offsets to be handled by * the same code. (For R0, the swap is a no-op.) */ match_offset = recent_offsets[offset_slot]; swap(recent_offsets[offset_slot], recent_offsets[0]); } else { /* Explicit offset */ /* Look up the number of extra bits that need to be read * to decode offsets with this offset slot. */ num_extra_bits = lzx_extra_offset_bits[offset_slot]; /* Start with the offset slot base value. 
*/ match_offset = lzx_offset_slot_base[offset_slot]; /* In aligned offset blocks, the low-order 3 bits of * each offset are encoded using the aligned offset * code. Otherwise, all the extra bits are literal. */ if ((num_extra_bits & ones_if_aligned) >= LZX_NUM_ALIGNED_OFFSET_BITS) { match_offset += bitstream_read_bits(is, num_extra_bits - LZX_NUM_ALIGNED_OFFSET_BITS) << LZX_NUM_ALIGNED_OFFSET_BITS; match_offset += read_alignedsym(d, is); } else { match_offset += bitstream_read_bits(is, num_extra_bits); } /* Adjust the offset. */ match_offset -= (LZX_NUM_RECENT_OFFSETS - 1); /* Update the recent offsets. */ recent_offsets[2] = recent_offsets[1]; recent_offsets[1] = recent_offsets[0]; recent_offsets[0] = match_offset; } /* Validate the match, then copy it to the current position. */ if (match_len > (size_t)(block_end - out_next)) return -1; if (match_offset > (size_t)(out_next - out_begin)) return -1; out_next = lz_copy(out_next, match_len, match_offset, block_end, LZX_MIN_MATCH_LEN); } while (out_next != block_end); return 0; } /* * lzx_allocate_decompressor - Allocate an LZX decompressor * * Return the pointer to the decompressor on success, or return NULL and set * errno on failure. */ struct lzx_decompressor *lzx_allocate_decompressor(void) { return kmalloc_obj(struct lzx_decompressor, GFP_NOFS); } /* * lzx_decompress - Decompress a buffer of LZX-compressed data * * @decompressor: A decompressor allocated with lzx_allocate_decompressor() * @compressed_data: The buffer of data to decompress * @compressed_size: Number of bytes of compressed data * @uncompressed_data: The buffer in which to store the decompressed data * @uncompressed_size: The number of bytes the data decompresses into * * Return 0 on success, or return -1 and set errno on failure. 
*/
int lzx_decompress(struct lzx_decompressor *decompressor,
		   const void *compressed_data, size_t compressed_size,
		   void *uncompressed_data, size_t uncompressed_size)
{
	u8 * const out_begin = uncompressed_data;
	u8 * const out_end = out_begin + uncompressed_size;
	u8 *out_next = out_begin;
	u32 recent_offsets[LZX_NUM_RECENT_OFFSETS] = {1, 1, 1};
	struct input_bitstream is;
	int e8_status = 0;

	init_input_bitstream(&is, compressed_data, compressed_size);

	/* Delta decoding of codeword lengths starts from all 0's. */
	memset(decompressor->maincode_lens, 0, LZX_MAINCODE_NUM_SYMBOLS);
	memset(decompressor->lencode_lens, 0, LZX_LENCODE_NUM_SYMBOLS);

	/* Consume blocks until the output buffer is full. */
	while (out_next != out_end) {
		int block_type;
		u32 block_size;

		if (lzx_read_block_header(decompressor, &is, &block_type,
					  &block_size, recent_offsets))
			return -1;

		/* A block must be non-empty and fit in what is left. */
		if (block_size < 1 ||
		    block_size > (size_t)(out_end - out_next))
			return -1;

		if (block_type == LZX_BLOCKTYPE_UNCOMPRESSED) {
			/* Raw bytes are copied straight from the input. */
			out_next = bitstream_read_bytes(&is, out_next,
							block_size);
			if (!out_next)
				return -1;

			/* An odd-sized block is followed by one extra byte. */
			if (block_size & 1)
				bitstream_read_byte(&is);

			e8_status = 1;
			continue;
		}

		/* Verbatim or aligned offset block */
		if (lzx_decompress_block(decompressor, &is, block_type,
					 block_size, out_begin, out_next,
					 recent_offsets))
			return -1;

		/* A zero codeword length for 0xe8 means no E8 literal here. */
		e8_status |= decompressor->maincode_lens[0xe8];
		out_next += block_size;
	}

	/* Postprocess the data unless it cannot possibly contain 0xe8 bytes. */
	if (e8_status)
		lzx_postprocess(uncompressed_data, uncompressed_size);

	return 0;
}

/*
 * lzx_free_decompressor - Free an LZX decompressor
 *
 * @decompressor:       A decompressor that was allocated with
 *			lzx_allocate_decompressor(), or NULL.
 */
void lzx_free_decompressor(struct lzx_decompressor *decompressor)
{
	kfree(decompressor);
}
5 5 5 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 
525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 // SPDX-License-Identifier: GPL-2.0-only /* * RDMA resource limiting controller for cgroups. * * Used to allow a cgroup hierarchy to stop processes from consuming * additional RDMA resources after a certain limit is reached. * * Copyright (C) 2016 Parav Pandit <pandit.parav@gmail.com> */ #include <linux/bitops.h> #include <linux/slab.h> #include <linux/seq_file.h> #include <linux/cgroup.h> #include <linux/parser.h> #include <linux/cgroup_rdma.h> #define RDMACG_MAX_STR "max" /* * Protects list of resource pools maintained on per cgroup basis * and rdma device list. */ static DEFINE_MUTEX(rdmacg_mutex); static LIST_HEAD(rdmacg_devices); enum rdmacg_file_type { RDMACG_RESOURCE_TYPE_MAX, RDMACG_RESOURCE_TYPE_STAT, }; /* * resource table definition as to be seen by the user. * Need to add entries to it when more resources are * added/defined at IB verb/core layer. */ static char const *rdmacg_resource_names[] = { [RDMACG_RESOURCE_HCA_HANDLE] = "hca_handle", [RDMACG_RESOURCE_HCA_OBJECT] = "hca_object", }; /* resource tracker for each resource of rdma cgroup */ struct rdmacg_resource { int max; int usage; }; /* * resource pool object which represents per cgroup, per device * resources. There are multiple instances of this object per cgroup, * therefore it cannot be embedded within rdma_cgroup structure. It * is maintained as list. 
*/ struct rdmacg_resource_pool { struct rdmacg_device *device; struct rdmacg_resource resources[RDMACG_RESOURCE_MAX]; struct list_head cg_node; struct list_head dev_node; /* count active user tasks of this pool */ u64 usage_sum; /* total number counts which are set to max */ int num_max_cnt; }; static struct rdma_cgroup *css_rdmacg(struct cgroup_subsys_state *css) { return container_of(css, struct rdma_cgroup, css); } static struct rdma_cgroup *parent_rdmacg(struct rdma_cgroup *cg) { return css_rdmacg(cg->css.parent); } static inline struct rdma_cgroup *get_current_rdmacg(void) { return css_rdmacg(task_get_css(current, rdma_cgrp_id)); } static void set_resource_limit(struct rdmacg_resource_pool *rpool, int index, int new_max) { if (new_max == S32_MAX) { if (rpool->resources[index].max != S32_MAX) rpool->num_max_cnt++; } else { if (rpool->resources[index].max == S32_MAX) rpool->num_max_cnt--; } rpool->resources[index].max = new_max; } static void set_all_resource_max_limit(struct rdmacg_resource_pool *rpool) { int i; for (i = 0; i < RDMACG_RESOURCE_MAX; i++) set_resource_limit(rpool, i, S32_MAX); } static void free_cg_rpool_locked(struct rdmacg_resource_pool *rpool) { lockdep_assert_held(&rdmacg_mutex); list_del(&rpool->cg_node); list_del(&rpool->dev_node); kfree(rpool); } static struct rdmacg_resource_pool * find_cg_rpool_locked(struct rdma_cgroup *cg, struct rdmacg_device *device) { struct rdmacg_resource_pool *pool; lockdep_assert_held(&rdmacg_mutex); list_for_each_entry(pool, &cg->rpools, cg_node) if (pool->device == device) return pool; return NULL; } static struct rdmacg_resource_pool * get_cg_rpool_locked(struct rdma_cgroup *cg, struct rdmacg_device *device) { struct rdmacg_resource_pool *rpool; rpool = find_cg_rpool_locked(cg, device); if (rpool) return rpool; rpool = kzalloc_obj(*rpool); if (!rpool) return ERR_PTR(-ENOMEM); rpool->device = device; set_all_resource_max_limit(rpool); INIT_LIST_HEAD(&rpool->cg_node); INIT_LIST_HEAD(&rpool->dev_node); 
list_add_tail(&rpool->cg_node, &cg->rpools); list_add_tail(&rpool->dev_node, &device->rpools); return rpool; } /** * uncharge_cg_locked - uncharge resource for rdma cgroup * @cg: pointer to cg to uncharge and all parents in hierarchy * @device: pointer to rdmacg device * @index: index of the resource to uncharge in cg (resource pool) * * It also frees the resource pool which was created as part of * charging operation when there are no resources attached to * resource pool. */ static void uncharge_cg_locked(struct rdma_cgroup *cg, struct rdmacg_device *device, enum rdmacg_resource_type index) { struct rdmacg_resource_pool *rpool; rpool = find_cg_rpool_locked(cg, device); /* * rpool cannot be null at this stage. Let kernel operate in case * if there a bug in IB stack or rdma controller, instead of crashing * the system. */ if (unlikely(!rpool)) { pr_warn("Invalid device %p or rdma cgroup %p\n", device, cg); return; } rpool->resources[index].usage--; /* * A negative count (or overflow) is invalid, * it indicates a bug in the rdma controller. */ WARN_ON_ONCE(rpool->resources[index].usage < 0); rpool->usage_sum--; if (rpool->usage_sum == 0 && rpool->num_max_cnt == RDMACG_RESOURCE_MAX) { /* * No user of the rpool and all entries are set to max, so * safe to delete this rpool. 
*/ free_cg_rpool_locked(rpool); } } /** * rdmacg_uncharge_hierarchy - hierarchically uncharge rdma resource count * @cg: pointer to cg to uncharge and all parents in hierarchy * @device: pointer to rdmacg device * @stop_cg: while traversing hirerchy, when meet with stop_cg cgroup * stop uncharging * @index: index of the resource to uncharge in cg in given resource pool */ static void rdmacg_uncharge_hierarchy(struct rdma_cgroup *cg, struct rdmacg_device *device, struct rdma_cgroup *stop_cg, enum rdmacg_resource_type index) { struct rdma_cgroup *p; mutex_lock(&rdmacg_mutex); for (p = cg; p != stop_cg; p = parent_rdmacg(p)) uncharge_cg_locked(p, device, index); mutex_unlock(&rdmacg_mutex); css_put(&cg->css); } /** * rdmacg_uncharge - hierarchically uncharge rdma resource count * @cg: pointer to cg to uncharge and all parents in hierarchy * @device: pointer to rdmacg device * @index: index of the resource to uncharge in cgroup in given resource pool */ void rdmacg_uncharge(struct rdma_cgroup *cg, struct rdmacg_device *device, enum rdmacg_resource_type index) { if (index >= RDMACG_RESOURCE_MAX) return; rdmacg_uncharge_hierarchy(cg, device, NULL, index); } EXPORT_SYMBOL(rdmacg_uncharge); /** * rdmacg_try_charge - hierarchically try to charge the rdma resource * @rdmacg: pointer to rdma cgroup which will own this resource * @device: pointer to rdmacg device * @index: index of the resource to charge in cgroup (resource pool) * * This function follows charging resource in hierarchical way. * It will fail if the charge would cause the new value to exceed the * hierarchical limit. * Returns 0 if the charge succeeded, otherwise -EAGAIN, -ENOMEM or -EINVAL. * Returns pointer to rdmacg for this resource when charging is successful. * * Charger needs to account resources on two criteria. * (a) per cgroup & (b) per device resource usage. * Per cgroup resource usage ensures that tasks of cgroup doesn't cross * the configured limits. 
Per device provides granular configuration * in multi device usage. It allocates resource pool in the hierarchy * for each parent it come across for first resource. Later on resource * pool will be available. Therefore it will be much faster thereon * to charge/uncharge. */ int rdmacg_try_charge(struct rdma_cgroup **rdmacg, struct rdmacg_device *device, enum rdmacg_resource_type index) { struct rdma_cgroup *cg, *p; struct rdmacg_resource_pool *rpool; s64 new; int ret = 0; if (index >= RDMACG_RESOURCE_MAX) return -EINVAL; /* * hold on to css, as cgroup can be removed but resource * accounting happens on css. */ cg = get_current_rdmacg(); mutex_lock(&rdmacg_mutex); for (p = cg; p; p = parent_rdmacg(p)) { rpool = get_cg_rpool_locked(p, device); if (IS_ERR(rpool)) { ret = PTR_ERR(rpool); goto err; } else { new = (s64)rpool->resources[index].usage + 1; if (new > rpool->resources[index].max) { ret = -EAGAIN; goto err; } else { rpool->resources[index].usage = new; rpool->usage_sum++; } } } mutex_unlock(&rdmacg_mutex); *rdmacg = cg; return 0; err: mutex_unlock(&rdmacg_mutex); rdmacg_uncharge_hierarchy(cg, device, p, index); return ret; } EXPORT_SYMBOL(rdmacg_try_charge); /** * rdmacg_register_device - register rdmacg device to rdma controller. * @device: pointer to rdmacg device whose resources need to be accounted. * * If IB stack wish a device to participate in rdma cgroup resource * tracking, it must invoke this API to register with rdma cgroup before * any user space application can start using the RDMA resources. */ void rdmacg_register_device(struct rdmacg_device *device) { INIT_LIST_HEAD(&device->dev_node); INIT_LIST_HEAD(&device->rpools); mutex_lock(&rdmacg_mutex); list_add_tail(&device->dev_node, &rdmacg_devices); mutex_unlock(&rdmacg_mutex); } EXPORT_SYMBOL(rdmacg_register_device); /** * rdmacg_unregister_device - unregister rdmacg device from rdma controller. 
* @device: pointer to rdmacg device which was previously registered with rdma * controller using rdmacg_register_device(). * * IB stack must invoke this after all the resources of the IB device * are destroyed and after ensuring that no more resources will be created * when this API is invoked. */ void rdmacg_unregister_device(struct rdmacg_device *device) { struct rdmacg_resource_pool *rpool, *tmp; /* * Synchronize with any active resource settings, * usage query happening via configfs. */ mutex_lock(&rdmacg_mutex); list_del_init(&device->dev_node); /* * Now that this device is off the cgroup list, its safe to free * all the rpool resources. */ list_for_each_entry_safe(rpool, tmp, &device->rpools, dev_node) free_cg_rpool_locked(rpool); mutex_unlock(&rdmacg_mutex); } EXPORT_SYMBOL(rdmacg_unregister_device); static int parse_resource(char *c, int *intval) { substring_t argstr; char *name, *value = c; size_t len; int ret, i; name = strsep(&value, "="); if (!name || !value) return -EINVAL; i = match_string(rdmacg_resource_names, RDMACG_RESOURCE_MAX, name); if (i < 0) return i; len = strlen(value); argstr.from = value; argstr.to = value + len; ret = match_int(&argstr, intval); if (ret >= 0) { if (*intval < 0) return -EINVAL; return i; } if (strncmp(value, RDMACG_MAX_STR, len) == 0) { *intval = S32_MAX; return i; } return -EINVAL; } static int rdmacg_parse_limits(char *options, int *new_limits, unsigned long *enables) { char *c; int err = -EINVAL; /* parse resource options */ while ((c = strsep(&options, " ")) != NULL) { int index, intval; index = parse_resource(c, &intval); if (index < 0) goto err; new_limits[index] = intval; *enables |= BIT(index); } return 0; err: return err; } static struct rdmacg_device *rdmacg_get_device_locked(const char *name) { struct rdmacg_device *device; lockdep_assert_held(&rdmacg_mutex); list_for_each_entry(device, &rdmacg_devices, dev_node) if (!strcmp(name, device->name)) return device; return NULL; } static ssize_t 
rdmacg_resource_set_max(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { struct rdma_cgroup *cg = css_rdmacg(of_css(of)); const char *dev_name; struct rdmacg_resource_pool *rpool; struct rdmacg_device *device; char *options = strstrip(buf); int *new_limits; unsigned long enables = 0; int i = 0, ret = 0; /* extract the device name first */ dev_name = strsep(&options, " "); if (!dev_name) { ret = -EINVAL; goto err; } new_limits = kzalloc_objs(int, RDMACG_RESOURCE_MAX); if (!new_limits) { ret = -ENOMEM; goto err; } ret = rdmacg_parse_limits(options, new_limits, &enables); if (ret) goto parse_err; /* acquire lock to synchronize with hot plug devices */ mutex_lock(&rdmacg_mutex); device = rdmacg_get_device_locked(dev_name); if (!device) { ret = -ENODEV; goto dev_err; } rpool = get_cg_rpool_locked(cg, device); if (IS_ERR(rpool)) { ret = PTR_ERR(rpool); goto dev_err; } /* now set the new limits of the rpool */ for_each_set_bit(i, &enables, RDMACG_RESOURCE_MAX) set_resource_limit(rpool, i, new_limits[i]); if (rpool->usage_sum == 0 && rpool->num_max_cnt == RDMACG_RESOURCE_MAX) { /* * No user of the rpool and all entries are set to max, so * safe to delete this rpool. 
*/ free_cg_rpool_locked(rpool); } dev_err: mutex_unlock(&rdmacg_mutex); parse_err: kfree(new_limits); err: return ret ?: nbytes; } static void print_rpool_values(struct seq_file *sf, struct rdmacg_resource_pool *rpool) { enum rdmacg_file_type sf_type; int i; u32 value; sf_type = seq_cft(sf)->private; for (i = 0; i < RDMACG_RESOURCE_MAX; i++) { seq_puts(sf, rdmacg_resource_names[i]); seq_putc(sf, '='); if (sf_type == RDMACG_RESOURCE_TYPE_MAX) { if (rpool) value = rpool->resources[i].max; else value = S32_MAX; } else { if (rpool) value = rpool->resources[i].usage; else value = 0; } if (value == S32_MAX) seq_puts(sf, RDMACG_MAX_STR); else seq_printf(sf, "%d", value); seq_putc(sf, ' '); } } static int rdmacg_resource_read(struct seq_file *sf, void *v) { struct rdmacg_device *device; struct rdmacg_resource_pool *rpool; struct rdma_cgroup *cg = css_rdmacg(seq_css(sf)); mutex_lock(&rdmacg_mutex); list_for_each_entry(device, &rdmacg_devices, dev_node) { seq_printf(sf, "%s ", device->name); rpool = find_cg_rpool_locked(cg, device); print_rpool_values(sf, rpool); seq_putc(sf, '\n'); } mutex_unlock(&rdmacg_mutex); return 0; } static struct cftype rdmacg_files[] = { { .name = "max", .write = rdmacg_resource_set_max, .seq_show = rdmacg_resource_read, .private = RDMACG_RESOURCE_TYPE_MAX, .flags = CFTYPE_NOT_ON_ROOT, }, { .name = "current", .seq_show = rdmacg_resource_read, .private = RDMACG_RESOURCE_TYPE_STAT, .flags = CFTYPE_NOT_ON_ROOT, }, { } /* terminate */ }; static struct cgroup_subsys_state * rdmacg_css_alloc(struct cgroup_subsys_state *parent) { struct rdma_cgroup *cg; cg = kzalloc_obj(*cg); if (!cg) return ERR_PTR(-ENOMEM); INIT_LIST_HEAD(&cg->rpools); return &cg->css; } static void rdmacg_css_free(struct cgroup_subsys_state *css) { struct rdma_cgroup *cg = css_rdmacg(css); kfree(cg); } /** * rdmacg_css_offline - cgroup css_offline callback * @css: css of interest * * This function is called when @css is about to go away and responsible * for shooting down all rdmacg 
associated with @css. As part of that it * marks all the resource pool entries to max value, so that when resources are * uncharged, associated resource pool can be freed as well. */ static void rdmacg_css_offline(struct cgroup_subsys_state *css) { struct rdma_cgroup *cg = css_rdmacg(css); struct rdmacg_resource_pool *rpool; mutex_lock(&rdmacg_mutex); list_for_each_entry(rpool, &cg->rpools, cg_node) set_all_resource_max_limit(rpool); mutex_unlock(&rdmacg_mutex); } struct cgroup_subsys rdma_cgrp_subsys = { .css_alloc = rdmacg_css_alloc, .css_free = rdmacg_css_free, .css_offline = rdmacg_css_offline, .legacy_cftypes = rdmacg_files, .dfl_cftypes = rdmacg_files, };
71 43 49 43 46 46 3 3 18 18 8 2 6 5 1 2 2 15 5 15 10 1 3 4 1 1 1 2 2 2 2 1 1 2 1 1 15 1 1 1 1 59 1 61 6 1 2 2 1 1 1 1 4 2 2 3 2 2 1 1 2 13 13 13 12 13 12 13 12 11 11 10 10 1 1 1 1 1 2 1 1 1 1 2 1 1 1 7 7 3 3 13 3 1 1 1 71 71 71 71 71 71 71 71 71 49 49 49 49 49 49 49 49 49 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 
459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714
// SPDX-License-Identifier: GPL-2.0
#include <linux/mount.h>
#include <linux/pseudo_fs.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/proc_fs.h>
#include <linux/proc_ns.h>
#include <linux/magic.h>
#include <linux/ktime.h>
#include <linux/seq_file.h>
#include <linux/pid_namespace.h>
#include <linux/user_namespace.h>
#include <linux/nsfs.h>
#include <linux/uaccess.h>
#include <linux/mnt_namespace.h>
#include <linux/ipc_namespace.h>
#include <linux/time_namespace.h>
#include <linux/utsname.h>
#include <linux/exportfs.h>
#include <linux/nstree.h>
#include <net/net_namespace.h>

#include "mount.h"
#include "internal.h"

/* nsfs mount; handed to path_from_stashed() whenever a namespace path is built. */
static struct vfsmount *nsfs_mnt;
/* Path copied out to callers by nsfs_get_root(). */
static struct path nsfs_root_path = {};

/**
 * nsfs_get_root - hand out the nsfs root path
 * @path: output; overwritten with a copy of nsfs_root_path
 *
 * Copies nsfs_root_path into @path and then calls path_get() on the copy.
 */
void nsfs_get_root(struct path *path)
{
	*path = nsfs_root_path;
	path_get(path);
}

/* Forward declaration: ns_ioctl() is referenced by ns_file_operations below. */
static long ns_ioctl(struct file *filp, unsigned int ioctl,
			unsigned long arg);
/* File operations installed on nsfs inodes; only the ioctl entry points are set. */
static const struct file_operations ns_file_operations = {
	.unlocked_ioctl = ns_ioctl,
	.compat_ioctl
= compat_ptr_ioctl, }; static char *ns_dname(struct dentry *dentry, char *buffer, int buflen) { struct inode *inode = d_inode(dentry); struct ns_common *ns = inode->i_private; const struct proc_ns_operations *ns_ops = ns->ops; return dynamic_dname(buffer, buflen, "%s:[%llu]", ns_ops->name, inode->i_ino); } const struct dentry_operations ns_dentry_operations = { .d_dname = ns_dname, .d_prune = stashed_dentry_prune, }; static void nsfs_evict(struct inode *inode) { struct ns_common *ns = inode->i_private; __ns_ref_active_put(ns); clear_inode(inode); ns->ops->put(ns); } int ns_get_path_cb(struct path *path, ns_get_path_helper_t *ns_get_cb, void *private_data) { struct ns_common *ns; ns = ns_get_cb(private_data); if (!ns) return -ENOENT; return path_from_stashed(&ns->stashed, nsfs_mnt, ns, path); } struct ns_get_path_task_args { const struct proc_ns_operations *ns_ops; struct task_struct *task; }; static struct ns_common *ns_get_path_task(void *private_data) { struct ns_get_path_task_args *args = private_data; return args->ns_ops->get(args->task); } int ns_get_path(struct path *path, struct task_struct *task, const struct proc_ns_operations *ns_ops) { struct ns_get_path_task_args args = { .ns_ops = ns_ops, .task = task, }; return ns_get_path_cb(path, ns_get_path_task, &args); } struct file *open_namespace_file(struct ns_common *ns) { struct path path __free(path_put) = {}; int err; /* call first to consume reference */ err = path_from_stashed(&ns->stashed, nsfs_mnt, ns, &path); if (err < 0) return ERR_PTR(err); return dentry_open(&path, O_RDONLY, current_cred()); } /** * open_namespace - open a namespace * @ns: the namespace to open * * This will consume a reference to @ns indendent of success or failure. * * Return: A file descriptor on success or a negative error code on failure. 
*/ int open_namespace(struct ns_common *ns) { struct path path __free(path_put) = {}; int err; /* call first to consume reference */ err = path_from_stashed(&ns->stashed, nsfs_mnt, ns, &path); if (err < 0) return err; return FD_ADD(O_CLOEXEC, dentry_open(&path, O_RDONLY, current_cred())); } int open_related_ns(struct ns_common *ns, struct ns_common *(*get_ns)(struct ns_common *ns)) { struct ns_common *relative; relative = get_ns(ns); if (IS_ERR(relative)) return PTR_ERR(relative); return open_namespace(relative); } EXPORT_SYMBOL_GPL(open_related_ns); static int copy_ns_info_to_user(const struct mnt_namespace *mnt_ns, struct mnt_ns_info __user *uinfo, size_t usize, struct mnt_ns_info *kinfo) { /* * If userspace and the kernel have the same struct size it can just * be copied. If userspace provides an older struct, only the bits that * userspace knows about will be copied. If userspace provides a new * struct, only the bits that the kernel knows aobut will be copied and * the size value will be set to the size the kernel knows about. */ kinfo->size = min(usize, sizeof(*kinfo)); kinfo->mnt_ns_id = mnt_ns->ns.ns_id; kinfo->nr_mounts = READ_ONCE(mnt_ns->nr_mounts); /* Subtract the root mount of the mount namespace. */ if (kinfo->nr_mounts) kinfo->nr_mounts--; if (copy_to_user(uinfo, kinfo, kinfo->size)) return -EFAULT; return 0; } static bool nsfs_ioctl_valid(unsigned int cmd) { switch (cmd) { case NS_GET_USERNS: case NS_GET_PARENT: case NS_GET_NSTYPE: case NS_GET_OWNER_UID: case NS_GET_MNTNS_ID: case NS_GET_PID_FROM_PIDNS: case NS_GET_TGID_FROM_PIDNS: case NS_GET_PID_IN_PIDNS: case NS_GET_TGID_IN_PIDNS: case NS_GET_ID: return true; } /* Extensible ioctls require some extra handling. 
*/
	/* Matched by command number; extensible_ioctl_valid() checks the rest. */
	switch (_IOC_NR(cmd)) {
	case _IOC_NR(NS_MNT_GET_INFO):
		return extensible_ioctl_valid(cmd, NS_MNT_GET_INFO, MNT_NS_INFO_SIZE_VER0);
	case _IOC_NR(NS_MNT_GET_NEXT):
		return extensible_ioctl_valid(cmd, NS_MNT_GET_NEXT, MNT_NS_INFO_SIZE_VER0);
	case _IOC_NR(NS_MNT_GET_PREV):
		return extensible_ioctl_valid(cmd, NS_MNT_GET_PREV, MNT_NS_INFO_SIZE_VER0);
	}

	return false;
}

/*
 * Permission gate for nsfs ioctls: NS_MNT_GET_NEXT and NS_MNT_GET_PREV are
 * only allowed when may_see_all_namespaces() says so; every other command
 * number is allowed here.
 */
static bool may_use_nsfs_ioctl(unsigned int cmd)
{
	switch (_IOC_NR(cmd)) {
	case _IOC_NR(NS_MNT_GET_NEXT):
		fallthrough;
	case _IOC_NR(NS_MNT_GET_PREV):
		return may_see_all_namespaces();
	}
	return true;
}

/*
 * ioctl handler for nsfs files.
 *
 * Commands rejected by nsfs_ioctl_valid() return -ENOIOCTLCMD and commands
 * refused by may_use_nsfs_ioctl() return -EPERM. Everything else is
 * dispatched first on the exact command value and then, for the extensible
 * mount-namespace commands, on _IOC_NR().
 */
static long ns_ioctl(struct file *filp, unsigned int ioctl,
			unsigned long arg)
{
	struct user_namespace *user_ns;
	struct pid_namespace *pid_ns;
	struct task_struct *tsk;
	struct ns_common *ns;
	struct mnt_namespace *mnt_ns;
	bool previous = false;
	uid_t __user *argp;
	uid_t uid;
	int ret;

	if (!nsfs_ioctl_valid(ioctl))
		return -ENOIOCTLCMD;
	if (!may_use_nsfs_ioctl(ioctl))
		return -EPERM;

	/* Namespace backing the file the ioctl was issued on. */
	ns = get_proc_ns(file_inode(filp));
	switch (ioctl) {
	case NS_GET_USERNS:
		return open_related_ns(ns, ns_get_owner);
	case NS_GET_PARENT:
		/* Only namespace types that provide ->get_parent support this. */
		if (!ns->ops->get_parent)
			return -EINVAL;
		return open_related_ns(ns, ns->ops->get_parent);
	case NS_GET_NSTYPE:
		return ns->ns_type;
	case NS_GET_OWNER_UID:
		if (ns->ns_type != CLONE_NEWUSER)
			return -EINVAL;
		user_ns = container_of(ns, struct user_namespace, ns);
		argp = (uid_t __user *) arg;
		/* Report the owner as seen from the caller's user namespace. */
		uid = from_kuid_munged(current_user_ns(), user_ns->owner);
		return put_user(uid, argp);
	case NS_GET_PID_FROM_PIDNS:
		fallthrough;
	case NS_GET_TGID_FROM_PIDNS:
		fallthrough;
	case NS_GET_PID_IN_PIDNS:
		fallthrough;
	case NS_GET_TGID_IN_PIDNS: {
		if (ns->ns_type != CLONE_NEWPID)
			return -EINVAL;

		ret = -ESRCH;
		pid_ns = container_of(ns, struct pid_namespace, ns);

		/* RCU guard is held from here until this case block is left. */
		guard(rcu)();

		/*
		 * The *_IN_PIDNS variants look @arg up with find_task_by_vpid();
		 * the *_FROM_PIDNS variants look it up inside @pid_ns.
		 */
		if (ioctl == NS_GET_PID_IN_PIDNS ||
		    ioctl == NS_GET_TGID_IN_PIDNS)
			tsk = find_task_by_vpid(arg);
		else
			tsk = find_task_by_pid_ns(arg, pid_ns);
		if (!tsk)
			return ret;

		switch (ioctl) {
		case NS_GET_PID_FROM_PIDNS:
			ret = task_pid_vnr(tsk);
			break;
		case NS_GET_TGID_FROM_PIDNS:
			ret = task_tgid_vnr(tsk);
			break;
		case NS_GET_PID_IN_PIDNS:
			ret = task_pid_nr_ns(tsk, pid_ns);
			break;
		case NS_GET_TGID_IN_PIDNS:
			ret = task_tgid_nr_ns(tsk, pid_ns);
			break;
		default:
			ret = 0;
			break;
		}

		/* A translated id of 0 is reported as -ESRCH. */
		if (!ret)
			ret = -ESRCH;
		return ret;
	}
	case NS_GET_MNTNS_ID:
		/* Same as NS_GET_ID, but restricted to mount namespaces. */
		if (ns->ns_type != CLONE_NEWNS)
			return -EINVAL;
		fallthrough;
	case NS_GET_ID: {
		__u64 __user *idp;
		__u64 id;

		idp = (__u64 __user *)arg;
		id = ns->ns_id;
		return put_user(id, idp);
	}
	}

	/* extensible ioctls */
	switch (_IOC_NR(ioctl)) {
	case _IOC_NR(NS_MNT_GET_INFO): {
		struct mnt_ns_info kinfo = {};
		struct mnt_ns_info __user *uinfo = (struct mnt_ns_info __user *)arg;
		/* Size of the userspace struct as encoded in the ioctl number. */
		size_t usize = _IOC_SIZE(ioctl);

		if (ns->ns_type != CLONE_NEWNS)
			return -EINVAL;

		if (!uinfo)
			return -EINVAL;

		if (usize < MNT_NS_INFO_SIZE_VER0)
			return -EINVAL;

		return copy_ns_info_to_user(to_mnt_ns(ns), uinfo, usize, &kinfo);
	}
	case _IOC_NR(NS_MNT_GET_PREV):
		previous = true;
		fallthrough;
	case _IOC_NR(NS_MNT_GET_NEXT): {
		struct mnt_ns_info kinfo = {};
		struct mnt_ns_info __user *uinfo = (struct mnt_ns_info __user *)arg;
		struct path path __free(path_put) = {};
		size_t usize = _IOC_SIZE(ioctl);

		if (ns->ns_type != CLONE_NEWNS)
			return -EINVAL;

		if (usize < MNT_NS_INFO_SIZE_VER0)
			return -EINVAL;

		/* @previous selects the direction of the walk. */
		mnt_ns = get_sequential_mnt_ns(to_mnt_ns(ns), previous);
		if (IS_ERR(mnt_ns))
			return PTR_ERR(mnt_ns);

		ns = to_ns_common(mnt_ns);
		/* Transfer ownership of @mnt_ns reference to @path. */
		ret = path_from_stashed(&ns->stashed, nsfs_mnt, ns, &path);
		if (ret)
			return ret;

		FD_PREPARE(fdf, O_CLOEXEC,
			   dentry_open(&path, O_RDONLY, current_cred()));
		if (fdf.err)
			return fdf.err;
		/*
		 * If @uinfo is passed return all information about the
		 * mount namespace as well.
		 *
		 * NOTE(review): the copy below is unconditional; a NULL
		 * @uinfo is not special-cased in this branch.
		 */
		ret = copy_ns_info_to_user(to_mnt_ns(ns), uinfo, usize, &kinfo);
		if (ret)
			return ret;
		/* Only publish the descriptor once the info copy succeeded. */
		ret = fd_publish(fdf);
		break;
	}
	default:
		ret = -ENOTTY;
	}

	return ret;
}

/*
 * Format the name of @task's namespace selected by @ns_ops into @buf as
 * "<name>:[<inum>]", preferring ns_ops->real_ns_name over ns_ops->name when
 * it is set. Returns the snprintf() result, or -ENOENT when ns_ops->get()
 * yields no namespace.
 */
int ns_get_name(char *buf, size_t size, struct task_struct *task,
			const struct proc_ns_operations *ns_ops)
{
	struct ns_common *ns;
	int res = -ENOENT;
	const char *name;
	ns = ns_ops->get(task);
	if (ns) {
		name = ns_ops->real_ns_name ? : ns_ops->name;
		res = snprintf(buf, size, "%s:[%u]", name, ns->inum);
		/* Drop the reference obtained by ->get() above. */
		ns_ops->put(ns);
	}
	return res;
}

/* True when @file uses ns_file_operations, i.e. it is an nsfs file. */
bool proc_ns_file(const struct file *file)
{
	return file->f_op == &ns_file_operations;
}

/**
 * ns_match() - Returns true if current namespace matches dev/ino provided.
 * @ns: current namespace
 * @dev: dev_t from nsfs that will be matched against current nsfs
 * @ino: ino_t from nsfs that will be matched against current nsfs
 *
 * Return: true if dev and ino matches the current nsfs.
 */
bool ns_match(const struct ns_common *ns, dev_t dev, ino_t ino)
{
	return (ns->inum == ino) && (nsfs_mnt->mnt_sb->s_dev == dev);
}


/*
 * ->show_path: print "<ops name>:[<inode number>]" for the namespace stored
 * in the dentry's inode. Always returns 0.
 */
static int nsfs_show_path(struct seq_file *seq, struct dentry *dentry)
{
	struct inode *inode = d_inode(dentry);
	const struct ns_common *ns = inode->i_private;
	const struct proc_ns_operations *ns_ops = ns->ops;

	seq_printf(seq, "%s:[%llu]", ns_ops->name, inode->i_ino);
	return 0;
}

/* Superblock operations for nsfs. */
static const struct super_operations nsfs_ops = {
	.statfs = simple_statfs,
	.evict_inode = nsfs_evict,
	.show_path = nsfs_show_path,
	.drop_inode = inode_just_drop,
};

/*
 * stashed_operations ->init_inode: bind a new nsfs inode to the namespace
 * passed in @data (stored in i_private), make it world-readable, install
 * ns_file_operations and use the namespace's inum as inode number.
 */
static int nsfs_init_inode(struct inode *inode, void *data)
{
	struct ns_common *ns = data;

	inode->i_private = data;
	inode->i_mode |= S_IRUGO;
	inode->i_fop = &ns_file_operations;
	inode->i_ino = ns->inum;

	/*
	 * Bring the namespace subtree back to life if we have to. This
	 * can happen when e.g., all processes using a network namespace
	 * and all namespace files or namespace file bind-mounts have
	 * died but there are still sockets pinning it. The SIOCGSKNS
	 * ioctl on such a socket will resurrect the relevant namespace
	 * subtree.
*/ __ns_ref_active_get(ns); return 0; } static void nsfs_put_data(void *data) { struct ns_common *ns = data; ns->ops->put(ns); } static const struct stashed_operations nsfs_stashed_ops = { .init_inode = nsfs_init_inode, .put_data = nsfs_put_data, }; #define NSFS_FID_SIZE_U32_VER0 (NSFS_FILE_HANDLE_SIZE_VER0 / sizeof(u32)) #define NSFS_FID_SIZE_U32_LATEST (NSFS_FILE_HANDLE_SIZE_LATEST / sizeof(u32)) static int nsfs_encode_fh(struct inode *inode, u32 *fh, int *max_len, struct inode *parent) { struct nsfs_file_handle *fid = (struct nsfs_file_handle *)fh; struct ns_common *ns = inode->i_private; int len = *max_len; if (parent) return FILEID_INVALID; if (len < NSFS_FID_SIZE_U32_VER0) { *max_len = NSFS_FID_SIZE_U32_LATEST; return FILEID_INVALID; } else if (len > NSFS_FID_SIZE_U32_LATEST) { *max_len = NSFS_FID_SIZE_U32_LATEST; } fid->ns_id = ns->ns_id; fid->ns_type = ns->ns_type; fid->ns_inum = inode->i_ino; return FILEID_NSFS; } bool is_current_namespace(struct ns_common *ns) { switch (ns->ns_type) { #ifdef CONFIG_CGROUPS case CLONE_NEWCGROUP: return current_in_namespace(to_cg_ns(ns)); #endif #ifdef CONFIG_IPC_NS case CLONE_NEWIPC: return current_in_namespace(to_ipc_ns(ns)); #endif case CLONE_NEWNS: return current_in_namespace(to_mnt_ns(ns)); #ifdef CONFIG_NET_NS case CLONE_NEWNET: return current_in_namespace(to_net_ns(ns)); #endif #ifdef CONFIG_PID_NS case CLONE_NEWPID: return current_in_namespace(to_pid_ns(ns)); #endif #ifdef CONFIG_TIME_NS case CLONE_NEWTIME: return current_in_namespace(to_time_ns(ns)); #endif #ifdef CONFIG_USER_NS case CLONE_NEWUSER: return current_in_namespace(to_user_ns(ns)); #endif #ifdef CONFIG_UTS_NS case CLONE_NEWUTS: return current_in_namespace(to_uts_ns(ns)); #endif default: VFS_WARN_ON_ONCE(true); return false; } } static struct dentry *nsfs_fh_to_dentry(struct super_block *sb, struct fid *fh, int fh_len, int fh_type) { struct path path __free(path_put) = {}; struct nsfs_file_handle *fid = (struct nsfs_file_handle *)fh; struct user_namespace 
*owning_ns = NULL; struct ns_common *ns; int ret; if (fh_len < NSFS_FID_SIZE_U32_VER0) return NULL; /* Check that any trailing bytes are zero. */ if ((fh_len > NSFS_FID_SIZE_U32_LATEST) && memchr_inv((void *)fid + NSFS_FID_SIZE_U32_LATEST, 0, fh_len - NSFS_FID_SIZE_U32_LATEST)) return NULL; switch (fh_type) { case FILEID_NSFS: break; default: return NULL; } if (!fid->ns_id) return NULL; /* Either both are set or both are unset. */ if (!fid->ns_inum != !fid->ns_type) return NULL; scoped_guard(rcu) { ns = ns_tree_lookup_rcu(fid->ns_id, fid->ns_type); if (!ns) return NULL; VFS_WARN_ON_ONCE(ns->ns_id != fid->ns_id); if (fid->ns_inum && (fid->ns_inum != ns->inum)) return NULL; if (fid->ns_type && (fid->ns_type != ns->ns_type)) return NULL; /* * This is racy because we're not actually taking an * active reference. IOW, it could happen that the * namespace becomes inactive after this check. * We don't care because nsfs_init_inode() will just * resurrect the relevant namespace tree for us. If it * has been active here we just allow it's resurrection. * We could try to take an active reference here and * then drop it again. But really, why bother. 
*/ if (!ns_get_unless_inactive(ns)) return NULL; } switch (ns->ns_type) { #ifdef CONFIG_CGROUPS case CLONE_NEWCGROUP: if (!current_in_namespace(to_cg_ns(ns))) owning_ns = to_cg_ns(ns)->user_ns; break; #endif #ifdef CONFIG_IPC_NS case CLONE_NEWIPC: if (!current_in_namespace(to_ipc_ns(ns))) owning_ns = to_ipc_ns(ns)->user_ns; break; #endif case CLONE_NEWNS: if (!current_in_namespace(to_mnt_ns(ns))) owning_ns = to_mnt_ns(ns)->user_ns; break; #ifdef CONFIG_NET_NS case CLONE_NEWNET: if (!current_in_namespace(to_net_ns(ns))) owning_ns = to_net_ns(ns)->user_ns; break; #endif #ifdef CONFIG_PID_NS case CLONE_NEWPID: if (!current_in_namespace(to_pid_ns(ns))) { owning_ns = to_pid_ns(ns)->user_ns; } else if (!READ_ONCE(to_pid_ns(ns)->child_reaper)) { ns->ops->put(ns); return ERR_PTR(-EPERM); } break; #endif #ifdef CONFIG_TIME_NS case CLONE_NEWTIME: if (!current_in_namespace(to_time_ns(ns))) owning_ns = to_time_ns(ns)->user_ns; break; #endif #ifdef CONFIG_USER_NS case CLONE_NEWUSER: if (!current_in_namespace(to_user_ns(ns))) owning_ns = to_user_ns(ns); break; #endif #ifdef CONFIG_UTS_NS case CLONE_NEWUTS: if (!current_in_namespace(to_uts_ns(ns))) owning_ns = to_uts_ns(ns)->user_ns; break; #endif default: return ERR_PTR(-EOPNOTSUPP); } if (owning_ns && !may_see_all_namespaces()) { ns->ops->put(ns); return ERR_PTR(-EPERM); } /* path_from_stashed() unconditionally consumes the reference. */ ret = path_from_stashed(&ns->stashed, nsfs_mnt, ns, &path); if (ret) return ERR_PTR(ret); return no_free_ptr(path.dentry); } static int nsfs_export_permission(struct handle_to_path_ctx *ctx, unsigned int oflags) { /* nsfs_fh_to_dentry() performs all permission checks. 
*/ return 0; } static struct file *nsfs_export_open(const struct path *path, unsigned int oflags) { return file_open_root(path, "", oflags, 0); } static const struct export_operations nsfs_export_operations = { .encode_fh = nsfs_encode_fh, .fh_to_dentry = nsfs_fh_to_dentry, .open = nsfs_export_open, .permission = nsfs_export_permission, }; static int nsfs_init_fs_context(struct fs_context *fc) { struct pseudo_fs_context *ctx = init_pseudo(fc, NSFS_MAGIC); if (!ctx) return -ENOMEM; fc->s_iflags |= SB_I_NOEXEC | SB_I_NODEV; ctx->s_d_flags |= DCACHE_DONTCACHE; ctx->ops = &nsfs_ops; ctx->eops = &nsfs_export_operations; ctx->dops = &ns_dentry_operations; fc->s_fs_info = (void *)&nsfs_stashed_ops; return 0; } static struct file_system_type nsfs = { .name = "nsfs",