50 2001 2469 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _NF_CONNTRACK_TSTAMP_H #define _NF_CONNTRACK_TSTAMP_H #include <net/net_namespace.h> #include <linux/netfilter/nf_conntrack_common.h> #include <linux/netfilter/nf_conntrack_tuple_common.h> #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_conntrack_extend.h> struct nf_conn_tstamp { u_int64_t start; u_int64_t stop; }; static inline struct nf_conn_tstamp *nf_conn_tstamp_find(const struct nf_conn *ct) { #ifdef CONFIG_NF_CONNTRACK_TIMESTAMP return nf_ct_ext_find(ct, NF_CT_EXT_TSTAMP); #else return NULL; #endif } static inline struct nf_conn_tstamp *nf_ct_tstamp_ext_add(struct nf_conn *ct, gfp_t gfp) { #ifdef CONFIG_NF_CONNTRACK_TIMESTAMP struct net *net = nf_ct_net(ct); if (!net->ct.sysctl_tstamp) return NULL; return nf_ct_ext_add(ct, NF_CT_EXT_TSTAMP, gfp); #else return NULL; #endif }; #ifdef CONFIG_NF_CONNTRACK_TIMESTAMP void nf_conntrack_tstamp_pernet_init(struct net *net); #else static inline void nf_conntrack_tstamp_pernet_init(struct net *net) {} #endif /* CONFIG_NF_CONNTRACK_TIMESTAMP */ #endif /* _NF_CONNTRACK_TSTAMP_H */ |
225 225 223 1 223 1 2 2 9 1 2 6 4 35 34 30 13 2 1 1 108 106 2 1 24 16 1 1 6 6 6 6 3 3 5 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 | // SPDX-License-Identifier: GPL-2.0-only /* * fs/crypto/hooks.c * * Encryption hooks for higher-level filesystem operations. */ #include "fscrypt_private.h" /** * fscrypt_file_open() - prepare to open a possibly-encrypted regular file * @inode: the inode being opened * @filp: the struct file being set up * * Currently, an encrypted regular file can only be opened if its encryption key * is available; access to the raw encrypted contents is not supported. * Therefore, we first set up the inode's encryption key (if not already done) * and return an error if it's unavailable. * * We also verify that if the parent directory (from the path via which the file * is being opened) is encrypted, then the inode being opened uses the same * encryption policy. This is needed as part of the enforcement that all files * in an encrypted directory tree use the same encryption policy, as a * protection against certain types of offline attacks. Note that this check is * needed even when opening an *unencrypted* file, since it's forbidden to have * an unencrypted file in an encrypted directory. * * Return: 0 on success, -ENOKEY if the key is missing, or another -errno code */ int fscrypt_file_open(struct inode *inode, struct file *filp) { int err; struct dentry *dentry, *dentry_parent; struct inode *inode_parent; err = fscrypt_require_key(inode); if (err) return err; dentry = file_dentry(filp); /* * Getting a reference to the parent dentry is needed for the actual * encryption policy comparison, but it's expensive on multi-core * systems. Since this function runs on unencrypted files too, start * with a lightweight RCU-mode check for the parent directory being * unencrypted (in which case it's fine for the child to be either * unencrypted, or encrypted with any policy). Only continue on to the * full policy check if the parent directory is actually encrypted. */ rcu_read_lock(); dentry_parent = READ_ONCE(dentry->d_parent); inode_parent = d_inode_rcu(dentry_parent); if (inode_parent != NULL && !IS_ENCRYPTED(inode_parent)) { rcu_read_unlock(); return 0; } rcu_read_unlock(); dentry_parent = dget_parent(dentry); if (!fscrypt_has_permitted_context(d_inode(dentry_parent), inode)) { fscrypt_warn(inode, "Inconsistent encryption context (parent directory: %lu)", d_inode(dentry_parent)->i_ino); err = -EPERM; } dput(dentry_parent); return err; } EXPORT_SYMBOL_GPL(fscrypt_file_open); int __fscrypt_prepare_link(struct inode *inode, struct inode *dir, struct dentry *dentry) { if (fscrypt_is_nokey_name(dentry)) return -ENOKEY; /* * We don't need to separately check that the directory inode's key is * available, as it's implied by the dentry not being a no-key name. */ if (!fscrypt_has_permitted_context(dir, inode)) return -EXDEV; return 0; } EXPORT_SYMBOL_GPL(__fscrypt_prepare_link); int __fscrypt_prepare_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { if (fscrypt_is_nokey_name(old_dentry) || fscrypt_is_nokey_name(new_dentry)) return -ENOKEY; /* * We don't need to separately check that the directory inodes' keys are * available, as it's implied by the dentries not being no-key names. */ if (old_dir != new_dir) { if (IS_ENCRYPTED(new_dir) && !fscrypt_has_permitted_context(new_dir, d_inode(old_dentry))) return -EXDEV; if ((flags & RENAME_EXCHANGE) && IS_ENCRYPTED(old_dir) && !fscrypt_has_permitted_context(old_dir, d_inode(new_dentry))) return -EXDEV; } return 0; } EXPORT_SYMBOL_GPL(__fscrypt_prepare_rename); int __fscrypt_prepare_lookup(struct inode *dir, struct dentry *dentry, struct fscrypt_name *fname) { int err = fscrypt_setup_filename(dir, &dentry->d_name, 1, fname); if (err && err != -ENOENT) return err; fscrypt_prepare_dentry(dentry, fname->is_nokey_name); return err; } EXPORT_SYMBOL_GPL(__fscrypt_prepare_lookup); /** * fscrypt_prepare_lookup_partial() - prepare lookup without filename setup * @dir: the encrypted directory being searched * @dentry: the dentry being looked up in @dir * * This function should be used by the ->lookup and ->atomic_open methods of * filesystems that handle filename encryption and no-key name encoding * themselves and thus can't use fscrypt_prepare_lookup(). Like * fscrypt_prepare_lookup(), this will try to set up the directory's encryption * key and will set DCACHE_NOKEY_NAME on the dentry if the key is unavailable. * However, this function doesn't set up a struct fscrypt_name for the filename. * * Return: 0 on success; -errno on error. Note that the encryption key being * unavailable is not considered an error. It is also not an error if * the encryption policy is unsupported by this kernel; that is treated * like the key being unavailable, so that files can still be deleted. */ int fscrypt_prepare_lookup_partial(struct inode *dir, struct dentry *dentry) { int err = fscrypt_get_encryption_info(dir, true); bool is_nokey_name = (!err && !fscrypt_has_encryption_key(dir)); fscrypt_prepare_dentry(dentry, is_nokey_name); return err; } EXPORT_SYMBOL_GPL(fscrypt_prepare_lookup_partial); int __fscrypt_prepare_readdir(struct inode *dir) { return fscrypt_get_encryption_info(dir, true); } EXPORT_SYMBOL_GPL(__fscrypt_prepare_readdir); int __fscrypt_prepare_setattr(struct dentry *dentry, struct iattr *attr) { if (attr->ia_valid & ATTR_SIZE) return fscrypt_require_key(d_inode(dentry)); return 0; } EXPORT_SYMBOL_GPL(__fscrypt_prepare_setattr); /** * fscrypt_prepare_setflags() - prepare to change flags with FS_IOC_SETFLAGS * @inode: the inode on which flags are being changed * @oldflags: the old flags * @flags: the new flags * * The caller should be holding i_rwsem for write. * * Return: 0 on success; -errno if the flags change isn't allowed or if * another error occurs. */ int fscrypt_prepare_setflags(struct inode *inode, unsigned int oldflags, unsigned int flags) { struct fscrypt_inode_info *ci; struct fscrypt_master_key *mk; int err; /* * When the CASEFOLD flag is set on an encrypted directory, we must * derive the secret key needed for the dirhash. This is only possible * if the directory uses a v2 encryption policy. */ if (IS_ENCRYPTED(inode) && (flags & ~oldflags & FS_CASEFOLD_FL)) { err = fscrypt_require_key(inode); if (err) return err; ci = inode->i_crypt_info; if (ci->ci_policy.version != FSCRYPT_POLICY_V2) return -EINVAL; mk = ci->ci_master_key; down_read(&mk->mk_sem); if (mk->mk_present) err = fscrypt_derive_dirhash_key(ci, mk); else err = -ENOKEY; up_read(&mk->mk_sem); return err; } return 0; } /** * fscrypt_prepare_symlink() - prepare to create a possibly-encrypted symlink * @dir: directory in which the symlink is being created * @target: plaintext symlink target * @len: length of @target excluding null terminator * @max_len: space the filesystem has available to store the symlink target * @disk_link: (out) the on-disk symlink target being prepared * * This function computes the size the symlink target will require on-disk, * stores it in @disk_link->len, and validates it against @max_len. An * encrypted symlink may be longer than the original. * * Additionally, @disk_link->name is set to @target if the symlink will be * unencrypted, but left NULL if the symlink will be encrypted. For encrypted * symlinks, the filesystem must call fscrypt_encrypt_symlink() to create the * on-disk target later. (The reason for the two-step process is that some * filesystems need to know the size of the symlink target before creating the * inode, e.g. to determine whether it will be a "fast" or "slow" symlink.) * * Return: 0 on success, -ENAMETOOLONG if the symlink target is too long, * -ENOKEY if the encryption key is missing, or another -errno code if a problem * occurred while setting up the encryption key. */ int fscrypt_prepare_symlink(struct inode *dir, const char *target, unsigned int len, unsigned int max_len, struct fscrypt_str *disk_link) { const union fscrypt_policy *policy; /* * To calculate the size of the encrypted symlink target we need to know * the amount of NUL padding, which is determined by the flags set in * the encryption policy which will be inherited from the directory. */ policy = fscrypt_policy_to_inherit(dir); if (policy == NULL) { /* Not encrypted */ disk_link->name = (unsigned char *)target; disk_link->len = len + 1; if (disk_link->len > max_len) return -ENAMETOOLONG; return 0; } if (IS_ERR(policy)) return PTR_ERR(policy); /* * Calculate the size of the encrypted symlink and verify it won't * exceed max_len. Note that for historical reasons, encrypted symlink * targets are prefixed with the ciphertext length, despite this * actually being redundant with i_size. This decreases by 2 bytes the * longest symlink target we can accept. * * We could recover 1 byte by not counting a null terminator, but * counting it (even though it is meaningless for ciphertext) is simpler * for now since filesystems will assume it is there and subtract it. */ if (!__fscrypt_fname_encrypted_size(policy, len, max_len - sizeof(struct fscrypt_symlink_data) - 1, &disk_link->len)) return -ENAMETOOLONG; disk_link->len += sizeof(struct fscrypt_symlink_data) + 1; disk_link->name = NULL; return 0; } EXPORT_SYMBOL_GPL(fscrypt_prepare_symlink); int __fscrypt_encrypt_symlink(struct inode *inode, const char *target, unsigned int len, struct fscrypt_str *disk_link) { int err; struct qstr iname = QSTR_INIT(target, len); struct fscrypt_symlink_data *sd; unsigned int ciphertext_len; /* * fscrypt_prepare_new_inode() should have already set up the new * symlink inode's encryption key. We don't wait until now to do it, * since we may be in a filesystem transaction now. */ if (WARN_ON_ONCE(!fscrypt_has_encryption_key(inode))) return -ENOKEY; if (disk_link->name) { /* filesystem-provided buffer */ sd = (struct fscrypt_symlink_data *)disk_link->name; } else { sd = kmalloc(disk_link->len, GFP_NOFS); if (!sd) return -ENOMEM; } ciphertext_len = disk_link->len - sizeof(*sd) - 1; sd->len = cpu_to_le16(ciphertext_len); err = fscrypt_fname_encrypt(inode, &iname, sd->encrypted_path, ciphertext_len); if (err) goto err_free_sd; /* * Null-terminating the ciphertext doesn't make sense, but we still * count the null terminator in the length, so we might as well * initialize it just in case the filesystem writes it out. */ sd->encrypted_path[ciphertext_len] = '\0'; /* Cache the plaintext symlink target for later use by get_link() */ err = -ENOMEM; inode->i_link = kmemdup(target, len + 1, GFP_NOFS); if (!inode->i_link) goto err_free_sd; if (!disk_link->name) disk_link->name = (unsigned char *)sd; return 0; err_free_sd: if (!disk_link->name) kfree(sd); return err; } EXPORT_SYMBOL_GPL(__fscrypt_encrypt_symlink); /** * fscrypt_get_symlink() - get the target of an encrypted symlink * @inode: the symlink inode * @caddr: the on-disk contents of the symlink * @max_size: size of @caddr buffer * @done: if successful, will be set up to free the returned target if needed * * If the symlink's encryption key is available, we decrypt its target. * Otherwise, we encode its target for presentation. * * This may sleep, so the filesystem must have dropped out of RCU mode already. * * Return: the presentable symlink target or an ERR_PTR() */ const char *fscrypt_get_symlink(struct inode *inode, const void *caddr, unsigned int max_size, struct delayed_call *done) { const struct fscrypt_symlink_data *sd; struct fscrypt_str cstr, pstr; bool has_key; int err; /* This is for encrypted symlinks only */ if (WARN_ON_ONCE(!IS_ENCRYPTED(inode))) return ERR_PTR(-EINVAL); /* If the decrypted target is already cached, just return it. */ pstr.name = READ_ONCE(inode->i_link); if (pstr.name) return pstr.name; /* * Try to set up the symlink's encryption key, but we can continue * regardless of whether the key is available or not. */ err = fscrypt_get_encryption_info(inode, false); if (err) return ERR_PTR(err); has_key = fscrypt_has_encryption_key(inode); /* * For historical reasons, encrypted symlink targets are prefixed with * the ciphertext length, even though this is redundant with i_size. */ if (max_size < sizeof(*sd) + 1) return ERR_PTR(-EUCLEAN); sd = caddr; cstr.name = (unsigned char *)sd->encrypted_path; cstr.len = le16_to_cpu(sd->len); if (cstr.len == 0) return ERR_PTR(-EUCLEAN); if (cstr.len + sizeof(*sd) > max_size) return ERR_PTR(-EUCLEAN); err = fscrypt_fname_alloc_buffer(cstr.len, &pstr); if (err) return ERR_PTR(err); err = fscrypt_fname_disk_to_usr(inode, 0, 0, &cstr, &pstr); if (err) goto err_kfree; err = -EUCLEAN; if (pstr.name[0] == '\0') goto err_kfree; pstr.name[pstr.len] = '\0'; /* * Cache decrypted symlink targets in i_link for later use. Don't cache * symlink targets encoded without the key, since those become outdated * once the key is added. This pairs with the READ_ONCE() above and in * the VFS path lookup code. */ if (!has_key || cmpxchg_release(&inode->i_link, NULL, pstr.name) != NULL) set_delayed_call(done, kfree_link, pstr.name); return pstr.name; err_kfree: kfree(pstr.name); return ERR_PTR(err); } EXPORT_SYMBOL_GPL(fscrypt_get_symlink); /** * fscrypt_symlink_getattr() - set the correct st_size for encrypted symlinks * @path: the path for the encrypted symlink being queried * @stat: the struct being filled with the symlink's attributes * * Override st_size of encrypted symlinks to be the length of the decrypted * symlink target (or the no-key encoded symlink target, if the key is * unavailable) rather than the length of the encrypted symlink target. This is * necessary for st_size to match the symlink target that userspace actually * sees. POSIX requires this, and some userspace programs depend on it. * * This requires reading the symlink target from disk if needed, setting up the * inode's encryption key if possible, and then decrypting or encoding the * symlink target. This makes lstat() more heavyweight than is normally the * case. However, decrypted symlink targets will be cached in ->i_link, so * usually the symlink won't have to be read and decrypted again later if/when * it is actually followed, readlink() is called, or lstat() is called again. * * Return: 0 on success, -errno on failure */ int fscrypt_symlink_getattr(const struct path *path, struct kstat *stat) { struct dentry *dentry = path->dentry; struct inode *inode = d_inode(dentry); const char *link; DEFINE_DELAYED_CALL(done); /* * To get the symlink target that userspace will see (whether it's the * decrypted target or the no-key encoded target), we can just get it in * the same way the VFS does during path resolution and readlink(). */ link = READ_ONCE(inode->i_link); if (!link) { link = inode->i_op->get_link(dentry, inode, &done); if (IS_ERR(link)) return PTR_ERR(link); } stat->size = strlen(link); do_delayed_call(&done); return 0; } EXPORT_SYMBOL_GPL(fscrypt_symlink_getattr); |
2 2 2 10 3 15 1 15 2 10 13 1 12 10 2 2 2 2 2 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 | // SPDX-License-Identifier: GPL-2.0 /* * fs/ext4/verity.c: fs-verity support for ext4 * * Copyright 2019 Google LLC */ /* * Implementation of fsverity_operations for ext4. * * ext4 stores the verity metadata (Merkle tree and fsverity_descriptor) past * the end of the file, starting at the first 64K boundary beyond i_size. This * approach works because (a) verity files are readonly, and (b) pages fully * beyond i_size aren't visible to userspace but can be read/written internally * by ext4 with only some relatively small changes to ext4. This approach * avoids having to depend on the EA_INODE feature and on rearchitecturing * ext4's xattr support to support paging multi-gigabyte xattrs into memory, and * to support encrypting xattrs. Note that the verity metadata *must* be * encrypted when the file is, since it contains hashes of the plaintext data. * * Using a 64K boundary rather than a 4K one keeps things ready for * architectures with 64K pages, and it doesn't necessarily waste space on-disk * since there can be a hole between i_size and the start of the Merkle tree. */ #include <linux/quotaops.h> #include "ext4.h" #include "ext4_extents.h" #include "ext4_jbd2.h" static inline loff_t ext4_verity_metadata_pos(const struct inode *inode) { return round_up(inode->i_size, 65536); } /* * Read some verity metadata from the inode. __vfs_read() can't be used because * we need to read beyond i_size. */ static int pagecache_read(struct inode *inode, void *buf, size_t count, loff_t pos) { while (count) { struct folio *folio; size_t n; folio = read_mapping_folio(inode->i_mapping, pos >> PAGE_SHIFT, NULL); if (IS_ERR(folio)) return PTR_ERR(folio); n = memcpy_from_file_folio(buf, folio, pos, count); folio_put(folio); buf += n; pos += n; count -= n; } return 0; } /* * Write some verity metadata to the inode for FS_IOC_ENABLE_VERITY. * kernel_write() can't be used because the file descriptor is readonly. */ static int pagecache_write(struct inode *inode, const void *buf, size_t count, loff_t pos) { struct address_space *mapping = inode->i_mapping; const struct address_space_operations *aops = mapping->a_ops; if (pos + count > inode->i_sb->s_maxbytes) return -EFBIG; while (count) { size_t n = min_t(size_t, count, PAGE_SIZE - offset_in_page(pos)); struct page *page; void *fsdata = NULL; int res; res = aops->write_begin(NULL, mapping, pos, n, &page, &fsdata); if (res) return res; memcpy_to_page(page, offset_in_page(pos), buf, n); res = aops->write_end(NULL, mapping, pos, n, n, page, fsdata); if (res < 0) return res; if (res != n) return -EIO; buf += n; pos += n; count -= n; } return 0; } static int ext4_begin_enable_verity(struct file *filp) { struct inode *inode = file_inode(filp); const int credits = 2; /* superblock and inode for ext4_orphan_add() */ handle_t *handle; int err; if (IS_DAX(inode) || ext4_test_inode_flag(inode, EXT4_INODE_DAX)) return -EINVAL; if (ext4_verity_in_progress(inode)) return -EBUSY; /* * Since the file was opened readonly, we have to initialize the jbd * inode and quotas here and not rely on ->open() doing it. This must * be done before evicting the inline data. */ err = ext4_inode_attach_jinode(inode); if (err) return err; err = dquot_initialize(inode); if (err) return err; err = ext4_convert_inline_data(inode); if (err) return err; if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { ext4_warning_inode(inode, "verity is only allowed on extent-based files"); return -EOPNOTSUPP; } /* * ext4 uses the last allocated block to find the verity descriptor, so * we must remove any other blocks past EOF which might confuse things. */ err = ext4_truncate(inode); if (err) return err; handle = ext4_journal_start(inode, EXT4_HT_INODE, credits); if (IS_ERR(handle)) return PTR_ERR(handle); err = ext4_orphan_add(handle, inode); if (err == 0) ext4_set_inode_state(inode, EXT4_STATE_VERITY_IN_PROGRESS); ext4_journal_stop(handle); return err; } /* * ext4 stores the verity descriptor beginning on the next filesystem block * boundary after the Merkle tree. Then, the descriptor size is stored in the * last 4 bytes of the last allocated filesystem block --- which is either the * block in which the descriptor ends, or the next block after that if there * weren't at least 4 bytes remaining. * * We can't simply store the descriptor in an xattr because it *must* be * encrypted when ext4 encryption is used, but ext4 encryption doesn't encrypt * xattrs. Also, if the descriptor includes a large signature blob it may be * too large to store in an xattr without the EA_INODE feature. */ static int ext4_write_verity_descriptor(struct inode *inode, const void *desc, size_t desc_size, u64 merkle_tree_size) { const u64 desc_pos = round_up(ext4_verity_metadata_pos(inode) + merkle_tree_size, i_blocksize(inode)); const u64 desc_end = desc_pos + desc_size; const __le32 desc_size_disk = cpu_to_le32(desc_size); const u64 desc_size_pos = round_up(desc_end + sizeof(desc_size_disk), i_blocksize(inode)) - sizeof(desc_size_disk); int err; err = pagecache_write(inode, desc, desc_size, desc_pos); if (err) return err; return pagecache_write(inode, &desc_size_disk, sizeof(desc_size_disk), desc_size_pos); } static int ext4_end_enable_verity(struct file *filp, const void *desc, size_t desc_size, u64 merkle_tree_size) { struct inode *inode = file_inode(filp); const int credits = 2; /* superblock and inode for ext4_orphan_del() */ handle_t *handle; struct ext4_iloc iloc; int err = 0; /* * If an error already occurred (which fs/verity/ signals by passing * desc == NULL), then only clean-up is needed. */ if (desc == NULL) goto cleanup; /* Append the verity descriptor. */ err = ext4_write_verity_descriptor(inode, desc, desc_size, merkle_tree_size); if (err) goto cleanup; /* * Write all pages (both data and verity metadata). Note that this must * happen before clearing EXT4_STATE_VERITY_IN_PROGRESS; otherwise pages * beyond i_size won't be written properly. For crash consistency, this * also must happen before the verity inode flag gets persisted. */ err = filemap_write_and_wait(inode->i_mapping); if (err) goto cleanup; /* * Finally, set the verity inode flag and remove the inode from the * orphan list (in a single transaction). */ handle = ext4_journal_start(inode, EXT4_HT_INODE, credits); if (IS_ERR(handle)) { err = PTR_ERR(handle); goto cleanup; } err = ext4_orphan_del(handle, inode); if (err) goto stop_and_cleanup; err = ext4_reserve_inode_write(handle, inode, &iloc); if (err) goto stop_and_cleanup; ext4_set_inode_flag(inode, EXT4_INODE_VERITY); ext4_set_inode_flags(inode, false); err = ext4_mark_iloc_dirty(handle, inode, &iloc); if (err) goto stop_and_cleanup; ext4_journal_stop(handle); ext4_clear_inode_state(inode, EXT4_STATE_VERITY_IN_PROGRESS); return 0; stop_and_cleanup: ext4_journal_stop(handle); cleanup: /* * Verity failed to be enabled, so clean up by truncating any verity * metadata that was written beyond i_size (both from cache and from * disk), removing the inode from the orphan list (if it wasn't done * already), and clearing EXT4_STATE_VERITY_IN_PROGRESS. */ truncate_inode_pages(inode->i_mapping, inode->i_size); ext4_truncate(inode); ext4_orphan_del(NULL, inode); ext4_clear_inode_state(inode, EXT4_STATE_VERITY_IN_PROGRESS); return err; } static int ext4_get_verity_descriptor_location(struct inode *inode, size_t *desc_size_ret, u64 *desc_pos_ret) { struct ext4_ext_path *path; struct ext4_extent *last_extent; u32 end_lblk; u64 desc_size_pos; __le32 desc_size_disk; u32 desc_size; u64 desc_pos; int err; /* * Descriptor size is in last 4 bytes of last allocated block. * See ext4_write_verity_descriptor(). */ if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { EXT4_ERROR_INODE(inode, "verity file doesn't use extents"); return -EFSCORRUPTED; } path = ext4_find_extent(inode, EXT_MAX_BLOCKS - 1, NULL, 0); if (IS_ERR(path)) return PTR_ERR(path); last_extent = path[path->p_depth].p_ext; if (!last_extent) { EXT4_ERROR_INODE(inode, "verity file has no extents"); ext4_free_ext_path(path); return -EFSCORRUPTED; } end_lblk = le32_to_cpu(last_extent->ee_block) + ext4_ext_get_actual_len(last_extent); desc_size_pos = (u64)end_lblk << inode->i_blkbits; ext4_free_ext_path(path); if (desc_size_pos < sizeof(desc_size_disk)) goto bad; desc_size_pos -= sizeof(desc_size_disk); err = pagecache_read(inode, &desc_size_disk, sizeof(desc_size_disk), desc_size_pos); if (err) return err; desc_size = le32_to_cpu(desc_size_disk); /* * The descriptor is stored just before the desc_size_disk, but starting * on a filesystem block boundary. */ if (desc_size > INT_MAX || desc_size > desc_size_pos) goto bad; desc_pos = round_down(desc_size_pos - desc_size, i_blocksize(inode)); if (desc_pos < ext4_verity_metadata_pos(inode)) goto bad; *desc_size_ret = desc_size; *desc_pos_ret = desc_pos; return 0; bad: EXT4_ERROR_INODE(inode, "verity file corrupted; can't find descriptor"); return -EFSCORRUPTED; } static int ext4_get_verity_descriptor(struct inode *inode, void *buf, size_t buf_size) { size_t desc_size = 0; u64 desc_pos = 0; int err; err = ext4_get_verity_descriptor_location(inode, &desc_size, &desc_pos); if (err) return err; if (buf_size) { if (desc_size > buf_size) return -ERANGE; err = pagecache_read(inode, buf, desc_size, desc_pos); if (err) return err; } return desc_size; } static struct page *ext4_read_merkle_tree_page(struct inode *inode, pgoff_t index, unsigned long num_ra_pages) { struct folio *folio; index += ext4_verity_metadata_pos(inode) >> PAGE_SHIFT; folio = __filemap_get_folio(inode->i_mapping, index, FGP_ACCESSED, 0); if (IS_ERR(folio) || !folio_test_uptodate(folio)) { DEFINE_READAHEAD(ractl, NULL, NULL, inode->i_mapping, index); if (!IS_ERR(folio)) folio_put(folio); else if (num_ra_pages > 1) page_cache_ra_unbounded(&ractl, num_ra_pages, 0); folio = read_mapping_folio(inode->i_mapping, index, NULL); if (IS_ERR(folio)) return ERR_CAST(folio); } return folio_file_page(folio, index); } static int ext4_write_merkle_tree_block(struct inode *inode, const void *buf, u64 pos, unsigned int size) { pos += ext4_verity_metadata_pos(inode); return pagecache_write(inode, buf, size, pos); } const struct fsverity_operations ext4_verityops = { .begin_enable_verity = ext4_begin_enable_verity, .end_enable_verity = ext4_end_enable_verity, .get_verity_descriptor = ext4_get_verity_descriptor, .read_merkle_tree_page = ext4_read_merkle_tree_page, .write_merkle_tree_block = ext4_write_merkle_tree_block, }; |
4840 105 4737 4722 1 2303 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 | // SPDX-License-Identifier: GPL-2.0 /* * Functions related to generic timeout handling of requests. */ #include <linux/kernel.h> #include <linux/module.h> #include <linux/blkdev.h> #include <linux/fault-inject.h> #include "blk.h" #include "blk-mq.h" #ifdef CONFIG_FAIL_IO_TIMEOUT static DECLARE_FAULT_ATTR(fail_io_timeout); static int __init setup_fail_io_timeout(char *str) { return setup_fault_attr(&fail_io_timeout, str); } __setup("fail_io_timeout=", setup_fail_io_timeout); bool __blk_should_fake_timeout(struct request_queue *q) { return should_fail(&fail_io_timeout, 1); } EXPORT_SYMBOL_GPL(__blk_should_fake_timeout); static int __init fail_io_timeout_debugfs(void) { struct dentry *dir = fault_create_debugfs_attr("fail_io_timeout", NULL, &fail_io_timeout); return PTR_ERR_OR_ZERO(dir); } late_initcall(fail_io_timeout_debugfs); ssize_t part_timeout_show(struct device *dev, struct device_attribute *attr, char *buf) { struct gendisk *disk = dev_to_disk(dev); int set = test_bit(QUEUE_FLAG_FAIL_IO, &disk->queue->queue_flags); return sprintf(buf, "%d\n", set != 0); } ssize_t part_timeout_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct gendisk *disk = dev_to_disk(dev); int val; if (count) { struct request_queue *q = disk->queue; char *p = (char *) buf; val = simple_strtoul(p, &p, 10); if (val) blk_queue_flag_set(QUEUE_FLAG_FAIL_IO, q); else blk_queue_flag_clear(QUEUE_FLAG_FAIL_IO, q); } return count; } #endif /* CONFIG_FAIL_IO_TIMEOUT */ /** * blk_abort_request - Request recovery for the specified command * @req: pointer to the request of interest * * This function requests that the block layer start recovery for the * request by deleting the timer and calling the q's timeout function. * LLDDs who implement their own error recovery MAY ignore the timeout * event if they generated blk_abort_request. */ void blk_abort_request(struct request *req) { /* * All we need to ensure is that timeout scan takes place * immediately and that scan sees the new timeout value. * No need for fancy synchronizations. */ WRITE_ONCE(req->deadline, jiffies); kblockd_schedule_work(&req->q->timeout_work); } EXPORT_SYMBOL_GPL(blk_abort_request); static unsigned long blk_timeout_mask __read_mostly; static int __init blk_timeout_init(void) { blk_timeout_mask = roundup_pow_of_two(HZ) - 1; return 0; } late_initcall(blk_timeout_init); /* * Just a rough estimate, we don't care about specific values for timeouts. */ static inline unsigned long blk_round_jiffies(unsigned long j) { return (j + blk_timeout_mask) + 1; } unsigned long blk_rq_timeout(unsigned long timeout) { unsigned long maxt; maxt = blk_round_jiffies(jiffies + BLK_MAX_TIMEOUT); if (time_after(timeout, maxt)) timeout = maxt; return timeout; } /** * blk_add_timer - Start timeout timer for a single request * @req: request that is about to start running. * * Notes: * Each request has its own timer, and as it is added to the queue, we * set up the timer. When the request completes, we cancel the timer. */ void blk_add_timer(struct request *req) { struct request_queue *q = req->q; unsigned long expiry; /* * Some LLDs, like scsi, peek at the timeout to prevent a * command from being retried forever. */ if (!req->timeout) req->timeout = q->rq_timeout; req->rq_flags &= ~RQF_TIMED_OUT; expiry = jiffies + req->timeout; WRITE_ONCE(req->deadline, expiry); /* * If the timer isn't already pending or this timeout is earlier * than an existing one, modify the timer. Round up to next nearest * second. */ expiry = blk_rq_timeout(blk_round_jiffies(expiry)); if (!timer_pending(&q->timeout) || time_before(expiry, q->timeout.expires)) { unsigned long diff = q->timeout.expires - expiry; /* * Due to added timer slack to group timers, the timer * will often be a little in front of what we asked for. * So apply some tolerance here too, otherwise we keep * modifying the timer because expires for value X * will be X + something. */ if (!timer_pending(&q->timeout) || (diff >= HZ / 2)) mod_timer(&q->timeout, expiry); } } |
4 3 5 2 6 6 2 6 6 8 2 1 6 6 6 4 1 16 16 3 10 6 10 2 8 2 1 5 5 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 | // SPDX-License-Identifier: GPL-2.0-or-later /* * drivers/media/radio/si470x/radio-si470x-usb.c * * USB driver for radios with Silicon Labs Si470x FM Radio Receivers * * Copyright (c) 2009 Tobias Lorenz <tobias.lorenz@gmx.net> */ /* * ToDo: * - add firmware download/update support */ /* driver definitions */ #define DRIVER_AUTHOR "Tobias Lorenz <tobias.lorenz@gmx.net>" #define DRIVER_CARD "Silicon Labs Si470x FM Radio" #define DRIVER_DESC "USB radio driver for Si470x FM Radio Receivers" #define DRIVER_VERSION "1.0.10" /* kernel includes */ #include <linux/usb.h> #include <linux/hid.h> #include <linux/slab.h> #include "radio-si470x.h" /* USB Device ID List */ static const struct usb_device_id si470x_usb_driver_id_table[] = { /* Silicon Labs USB FM Radio Reference Design */ { USB_DEVICE_AND_INTERFACE_INFO(0x10c4, 0x818a, USB_CLASS_HID, 0, 0) }, /* ADS/Tech FM Radio Receiver (formerly Instant FM Music) */ { USB_DEVICE_AND_INTERFACE_INFO(0x06e1, 0xa155, USB_CLASS_HID, 0, 0) }, /* KWorld USB FM Radio SnapMusic Mobile 700 (FM700) */ { USB_DEVICE_AND_INTERFACE_INFO(0x1b80, 0xd700, USB_CLASS_HID, 0, 0) }, /* Sanei Electric, Inc. FM USB Radio (sold as DealExtreme.com PCear) */ { USB_DEVICE_AND_INTERFACE_INFO(0x10c5, 0x819a, USB_CLASS_HID, 0, 0) }, /* Axentia ALERT FM USB Receiver */ { USB_DEVICE_AND_INTERFACE_INFO(0x12cf, 0x7111, USB_CLASS_HID, 0, 0) }, /* Terminating entry */ { } }; MODULE_DEVICE_TABLE(usb, si470x_usb_driver_id_table); /************************************************************************** * Module Parameters **************************************************************************/ /* Radio Nr */ static int radio_nr = -1; module_param(radio_nr, int, 0444); MODULE_PARM_DESC(radio_nr, "Radio Nr"); /* USB timeout */ static unsigned int usb_timeout = 500; module_param(usb_timeout, uint, 0644); MODULE_PARM_DESC(usb_timeout, "USB timeout (ms): *500*"); /* RDS buffer blocks */ static unsigned int rds_buf = 100; module_param(rds_buf, uint, 0444); MODULE_PARM_DESC(rds_buf, "RDS buffer entries: *100*"); /* RDS maximum block errors */ static unsigned short max_rds_errors = 1; /* 0 means 0 errors requiring correction */ /* 1 means 1-2 errors requiring correction (used by original USBRadio.exe) */ /* 2 means 3-5 errors requiring correction */ /* 3 means 6+ errors or errors in checkword, correction not possible */ module_param(max_rds_errors, ushort, 0644); MODULE_PARM_DESC(max_rds_errors, "RDS maximum block errors: *1*"); /************************************************************************** * USB HID Reports **************************************************************************/ /* Reports 1-16 give direct read/write access to the 16 Si470x registers */ /* with the (REPORT_ID - 1) corresponding to the register address across USB */ /* endpoint 0 using GET_REPORT and SET_REPORT */ #define REGISTER_REPORT_SIZE (RADIO_REGISTER_SIZE + 1) #define REGISTER_REPORT(reg) ((reg) + 1) /* Report 17 gives direct read/write access to the entire Si470x register */ /* map across endpoint 0 using GET_REPORT and SET_REPORT */ #define ENTIRE_REPORT_SIZE (RADIO_REGISTER_NUM * RADIO_REGISTER_SIZE + 1) #define ENTIRE_REPORT 17 /* Report 18 is used to send the lowest 6 Si470x registers up the HID */ /* interrupt endpoint 1 to Windows every 20 milliseconds for status */ #define RDS_REPORT_SIZE (RDS_REGISTER_NUM * RADIO_REGISTER_SIZE + 1) #define RDS_REPORT 18 /* Report 19: LED state */ #define LED_REPORT_SIZE 3 #define LED_REPORT 19 /* Report 19: stream */ #define STREAM_REPORT_SIZE 3 #define STREAM_REPORT 19 /* Report 20: scratch */ #define SCRATCH_PAGE_SIZE 63 #define SCRATCH_REPORT_SIZE (SCRATCH_PAGE_SIZE + 1) #define SCRATCH_REPORT 20 /* Reports 19-22: flash upgrade of the C8051F321 */ #define WRITE_REPORT_SIZE 4 #define WRITE_REPORT 19 #define FLASH_REPORT_SIZE 64 #define FLASH_REPORT 20 #define CRC_REPORT_SIZE 3 #define CRC_REPORT 21 #define RESPONSE_REPORT_SIZE 2 #define RESPONSE_REPORT 22 /* Report 23: currently unused, but can accept 60 byte reports on the HID */ /* interrupt out endpoint 2 every 1 millisecond */ #define UNUSED_REPORT 23 #define MAX_REPORT_SIZE 64 /************************************************************************** * Software/Hardware Versions from Scratch Page **************************************************************************/ #define RADIO_HW_VERSION 1 /************************************************************************** * LED State Definitions **************************************************************************/ #define LED_COMMAND 0x35 #define NO_CHANGE_LED 0x00 #define ALL_COLOR_LED 0x01 /* streaming state */ #define BLINK_GREEN_LED 0x02 /* connect state */ #define BLINK_RED_LED 0x04 #define BLINK_ORANGE_LED 0x10 /* disconnect state */ #define SOLID_GREEN_LED 0x20 /* tuning/seeking state */ #define SOLID_RED_LED 0x40 /* bootload state */ #define SOLID_ORANGE_LED 0x80 /************************************************************************** * Stream State Definitions **************************************************************************/ #define STREAM_COMMAND 0x36 #define STREAM_VIDPID 0x00 #define STREAM_AUDIO 0xff /************************************************************************** * Bootloader / Flash Commands **************************************************************************/ /* unique id sent to bootloader and required to put into a bootload state */ #define UNIQUE_BL_ID 0x34 /* mask for the flash data */ #define FLASH_DATA_MASK 0x55 /* bootloader commands */ #define GET_SW_VERSION_COMMAND 0x00 #define SET_PAGE_COMMAND 0x01 #define ERASE_PAGE_COMMAND 0x02 #define WRITE_PAGE_COMMAND 0x03 #define CRC_ON_PAGE_COMMAND 0x04 #define READ_FLASH_BYTE_COMMAND 0x05 #define RESET_DEVICE_COMMAND 0x06 #define GET_HW_VERSION_COMMAND 0x07 #define BLANK 0xff /* bootloader command responses */ #define COMMAND_OK 0x01 #define COMMAND_FAILED 0x02 #define COMMAND_PENDING 0x03 /************************************************************************** * General Driver Functions - REGISTER_REPORTs **************************************************************************/ /* * si470x_get_report - receive a HID report */ static int si470x_get_report(struct si470x_device *radio, void *buf, int size) { unsigned char *report = buf; int retval; retval = usb_control_msg(radio->usbdev, usb_rcvctrlpipe(radio->usbdev, 0), HID_REQ_GET_REPORT, USB_TYPE_CLASS | USB_RECIP_INTERFACE | USB_DIR_IN, report[0], 2, buf, size, usb_timeout); if (retval < 0) dev_warn(&radio->intf->dev, "si470x_get_report: usb_control_msg returned %d\n", retval); return retval; } /* * si470x_set_report - send a HID report */ static int si470x_set_report(struct si470x_device *radio, void *buf, int size) { unsigned char *report = buf; int retval; retval = usb_control_msg(radio->usbdev, usb_sndctrlpipe(radio->usbdev, 0), HID_REQ_SET_REPORT, USB_TYPE_CLASS | USB_RECIP_INTERFACE | USB_DIR_OUT, report[0], 2, buf, size, usb_timeout); if (retval < 0) dev_warn(&radio->intf->dev, "si470x_set_report: usb_control_msg returned %d\n", retval); return retval; } /* * si470x_get_register - read register */ static int si470x_get_register(struct si470x_device *radio, int regnr) { int retval; radio->usb_buf[0] = REGISTER_REPORT(regnr); retval = si470x_get_report(radio, radio->usb_buf, REGISTER_REPORT_SIZE); if (retval >= 0) radio->registers[regnr] = get_unaligned_be16(&radio->usb_buf[1]); return (retval < 0) ? -EINVAL : 0; } /* * si470x_set_register - write register */ static int si470x_set_register(struct si470x_device *radio, int regnr) { int retval; radio->usb_buf[0] = REGISTER_REPORT(regnr); put_unaligned_be16(radio->registers[regnr], &radio->usb_buf[1]); retval = si470x_set_report(radio, radio->usb_buf, REGISTER_REPORT_SIZE); return (retval < 0) ? -EINVAL : 0; } /************************************************************************** * General Driver Functions - ENTIRE_REPORT **************************************************************************/ /* * si470x_get_all_registers - read entire registers */ static int si470x_get_all_registers(struct si470x_device *radio) { int retval; unsigned char regnr; radio->usb_buf[0] = ENTIRE_REPORT; retval = si470x_get_report(radio, radio->usb_buf, ENTIRE_REPORT_SIZE); if (retval >= 0) for (regnr = 0; regnr < RADIO_REGISTER_NUM; regnr++) radio->registers[regnr] = get_unaligned_be16( &radio->usb_buf[regnr * RADIO_REGISTER_SIZE + 1]); return (retval < 0) ? -EINVAL : 0; } /************************************************************************** * General Driver Functions - LED_REPORT **************************************************************************/ /* * si470x_set_led_state - sets the led state */ static int si470x_set_led_state(struct si470x_device *radio, unsigned char led_state) { int retval; radio->usb_buf[0] = LED_REPORT; radio->usb_buf[1] = LED_COMMAND; radio->usb_buf[2] = led_state; retval = si470x_set_report(radio, radio->usb_buf, LED_REPORT_SIZE); return (retval < 0) ? -EINVAL : 0; } /************************************************************************** * General Driver Functions - SCRATCH_REPORT **************************************************************************/ /* * si470x_get_scratch_versions - gets the scratch page and version infos */ static int si470x_get_scratch_page_versions(struct si470x_device *radio) { int retval; radio->usb_buf[0] = SCRATCH_REPORT; retval = si470x_get_report(radio, radio->usb_buf, SCRATCH_REPORT_SIZE); if (retval < 0) dev_warn(&radio->intf->dev, "si470x_get_scratch: si470x_get_report returned %d\n", retval); else { radio->software_version = radio->usb_buf[1]; radio->hardware_version = radio->usb_buf[2]; } return (retval < 0) ? -EINVAL : 0; } /************************************************************************** * RDS Driver Functions **************************************************************************/ /* * si470x_int_in_callback - rds callback and processing function * * TODO: do we need to use mutex locks in some sections? */ static void si470x_int_in_callback(struct urb *urb) { struct si470x_device *radio = urb->context; int retval; unsigned char regnr; unsigned char blocknum; unsigned short bler; /* rds block errors */ unsigned short rds; unsigned char tmpbuf[3]; if (urb->status) { if (urb->status == -ENOENT || urb->status == -ECONNRESET || urb->status == -ESHUTDOWN) { return; } else { dev_warn(&radio->intf->dev, "non-zero urb status (%d)\n", urb->status); goto resubmit; /* Maybe we can recover. */ } } /* Sometimes the device returns len 0 packets */ if (urb->actual_length != RDS_REPORT_SIZE) goto resubmit; radio->registers[STATUSRSSI] = get_unaligned_be16(&radio->int_in_buffer[1]); if (radio->registers[STATUSRSSI] & STATUSRSSI_STC) complete(&radio->completion); if ((radio->registers[SYSCONFIG1] & SYSCONFIG1_RDS)) { /* Update RDS registers with URB data */ for (regnr = 1; regnr < RDS_REGISTER_NUM; regnr++) radio->registers[STATUSRSSI + regnr] = get_unaligned_be16(&radio->int_in_buffer[ regnr * RADIO_REGISTER_SIZE + 1]); /* get rds blocks */ if ((radio->registers[STATUSRSSI] & STATUSRSSI_RDSR) == 0) { /* No RDS group ready, better luck next time */ goto resubmit; } if ((radio->registers[STATUSRSSI] & STATUSRSSI_RDSS) == 0) { /* RDS decoder not synchronized */ goto resubmit; } for (blocknum = 0; blocknum < 4; blocknum++) { switch (blocknum) { default: bler = (radio->registers[STATUSRSSI] & STATUSRSSI_BLERA) >> 9; rds = radio->registers[RDSA]; break; case 1: bler = (radio->registers[READCHAN] & READCHAN_BLERB) >> 14; rds = radio->registers[RDSB]; break; case 2: bler = (radio->registers[READCHAN] & READCHAN_BLERC) >> 12; rds = radio->registers[RDSC]; break; case 3: bler = (radio->registers[READCHAN] & READCHAN_BLERD) >> 10; rds = radio->registers[RDSD]; break; } /* Fill the V4L2 RDS buffer */ put_unaligned_le16(rds, &tmpbuf); tmpbuf[2] = blocknum; /* offset name */ tmpbuf[2] |= blocknum << 3; /* received offset */ if (bler > max_rds_errors) tmpbuf[2] |= 0x80; /* uncorrectable errors */ else if (bler > 0) tmpbuf[2] |= 0x40; /* corrected error(s) */ /* copy RDS block to internal buffer */ memcpy(&radio->buffer[radio->wr_index], &tmpbuf, 3); radio->wr_index += 3; /* wrap write pointer */ if (radio->wr_index >= radio->buf_size) radio->wr_index = 0; /* check for overflow */ if (radio->wr_index == radio->rd_index) { /* increment and wrap read pointer */ radio->rd_index += 3; if (radio->rd_index >= radio->buf_size) radio->rd_index = 0; } } if (radio->wr_index != radio->rd_index) wake_up_interruptible(&radio->read_queue); } resubmit: /* Resubmit if we're still running. */ if (radio->int_in_running && radio->usbdev) { retval = usb_submit_urb(radio->int_in_urb, GFP_ATOMIC); if (retval) { dev_warn(&radio->intf->dev, "resubmitting urb failed (%d)", retval); radio->int_in_running = 0; } } radio->status_rssi_auto_update = radio->int_in_running; } static int si470x_fops_open(struct file *file) { return v4l2_fh_open(file); } static int si470x_fops_release(struct file *file) { return v4l2_fh_release(file); } static void si470x_usb_release(struct v4l2_device *v4l2_dev) { struct si470x_device *radio = container_of(v4l2_dev, struct si470x_device, v4l2_dev); usb_free_urb(radio->int_in_urb); v4l2_ctrl_handler_free(&radio->hdl); v4l2_device_unregister(&radio->v4l2_dev); kfree(radio->int_in_buffer); kfree(radio->buffer); kfree(radio->usb_buf); kfree(radio); } /************************************************************************** * Video4Linux Interface **************************************************************************/ /* * si470x_vidioc_querycap - query device capabilities */ static int si470x_vidioc_querycap(struct file *file, void *priv, struct v4l2_capability *capability) { struct si470x_device *radio = video_drvdata(file); strscpy(capability->driver, DRIVER_NAME, sizeof(capability->driver)); strscpy(capability->card, DRIVER_CARD, sizeof(capability->card)); usb_make_path(radio->usbdev, capability->bus_info, sizeof(capability->bus_info)); return 0; } static int si470x_start_usb(struct si470x_device *radio) { int retval; /* initialize interrupt urb */ usb_fill_int_urb(radio->int_in_urb, radio->usbdev, usb_rcvintpipe(radio->usbdev, radio->int_in_endpoint->bEndpointAddress), radio->int_in_buffer, le16_to_cpu(radio->int_in_endpoint->wMaxPacketSize), si470x_int_in_callback, radio, radio->int_in_endpoint->bInterval); radio->int_in_running = 1; mb(); retval = usb_submit_urb(radio->int_in_urb, GFP_KERNEL); if (retval) { dev_info(&radio->intf->dev, "submitting int urb failed (%d)\n", retval); radio->int_in_running = 0; } radio->status_rssi_auto_update = radio->int_in_running; /* start radio */ retval = si470x_start(radio); if (retval < 0) return retval; v4l2_ctrl_handler_setup(&radio->hdl); return retval; } /************************************************************************** * USB Interface **************************************************************************/ /* * si470x_usb_driver_probe - probe for the device */ static int si470x_usb_driver_probe(struct usb_interface *intf, const struct usb_device_id *id) { struct si470x_device *radio; struct usb_host_interface *iface_desc; struct usb_endpoint_descriptor *endpoint; int i, int_end_size, retval; unsigned char version_warning = 0; /* private data allocation and initialization */ radio = kzalloc(sizeof(struct si470x_device), GFP_KERNEL); if (!radio) { retval = -ENOMEM; goto err_initial; } radio->usb_buf = kmalloc(MAX_REPORT_SIZE, GFP_KERNEL); if (radio->usb_buf == NULL) { retval = -ENOMEM; goto err_radio; } radio->usbdev = interface_to_usbdev(intf); radio->intf = intf; radio->band = 1; /* Default to 76 - 108 MHz */ mutex_init(&radio->lock); init_completion(&radio->completion); radio->get_register = si470x_get_register; radio->set_register = si470x_set_register; radio->fops_open = si470x_fops_open; radio->fops_release = si470x_fops_release; radio->vidioc_querycap = si470x_vidioc_querycap; iface_desc = intf->cur_altsetting; /* Set up interrupt endpoint information. */ for (i = 0; i < iface_desc->desc.bNumEndpoints; ++i) { endpoint = &iface_desc->endpoint[i].desc; if (usb_endpoint_is_int_in(endpoint)) radio->int_in_endpoint = endpoint; } if (!radio->int_in_endpoint) { dev_info(&intf->dev, "could not find interrupt in endpoint\n"); retval = -EIO; goto err_usbbuf; } int_end_size = le16_to_cpu(radio->int_in_endpoint->wMaxPacketSize); radio->int_in_buffer = kmalloc(int_end_size, GFP_KERNEL); if (!radio->int_in_buffer) { dev_info(&intf->dev, "could not allocate int_in_buffer"); retval = -ENOMEM; goto err_usbbuf; } radio->int_in_urb = usb_alloc_urb(0, GFP_KERNEL); if (!radio->int_in_urb) { retval = -ENOMEM; goto err_intbuffer; } radio->v4l2_dev.release = si470x_usb_release; /* * The si470x SiLabs reference design uses the same USB IDs as * 'Thanko's Raremono' si4734 based receiver. So check here which we * have: attempt to read the device ID from the si470x: the lower 12 * bits should be 0x0242 for the si470x. * * We use this check to determine which device we are dealing with. */ if (id->idVendor == 0x10c4 && id->idProduct == 0x818a) { retval = usb_control_msg(radio->usbdev, usb_rcvctrlpipe(radio->usbdev, 0), HID_REQ_GET_REPORT, USB_TYPE_CLASS | USB_RECIP_INTERFACE | USB_DIR_IN, 1, 2, radio->usb_buf, 3, 500); if (retval != 3 || (get_unaligned_be16(&radio->usb_buf[1]) & 0xfff) != 0x0242) { dev_info(&intf->dev, "this is not a si470x device.\n"); retval = -ENODEV; goto err_urb; } } retval = v4l2_device_register(&intf->dev, &radio->v4l2_dev); if (retval < 0) { dev_err(&intf->dev, "couldn't register v4l2_device\n"); goto err_urb; } v4l2_ctrl_handler_init(&radio->hdl, 2); v4l2_ctrl_new_std(&radio->hdl, &si470x_ctrl_ops, V4L2_CID_AUDIO_MUTE, 0, 1, 1, 1); v4l2_ctrl_new_std(&radio->hdl, &si470x_ctrl_ops, V4L2_CID_AUDIO_VOLUME, 0, 15, 1, 15); if (radio->hdl.error) { retval = radio->hdl.error; dev_err(&intf->dev, "couldn't register control\n"); goto err_dev; } radio->videodev = si470x_viddev_template; radio->videodev.ctrl_handler = &radio->hdl; radio->videodev.lock = &radio->lock; radio->videodev.v4l2_dev = &radio->v4l2_dev; radio->videodev.release = video_device_release_empty; radio->videodev.device_caps = V4L2_CAP_HW_FREQ_SEEK | V4L2_CAP_READWRITE | V4L2_CAP_TUNER | V4L2_CAP_RADIO | V4L2_CAP_RDS_CAPTURE; video_set_drvdata(&radio->videodev, radio); /* get device and chip versions */ if (si470x_get_all_registers(radio) < 0) { retval = -EIO; goto err_ctrl; } dev_info(&intf->dev, "DeviceID=0x%4.4hx ChipID=0x%4.4hx\n", radio->registers[DEVICEID], radio->registers[SI_CHIPID]); if ((radio->registers[SI_CHIPID] & SI_CHIPID_FIRMWARE) < RADIO_FW_VERSION) { dev_warn(&intf->dev, "This driver is known to work with firmware version %u, but the device has firmware version %u.\n", RADIO_FW_VERSION, radio->registers[SI_CHIPID] & SI_CHIPID_FIRMWARE); version_warning = 1; } /* get software and hardware versions */ if (si470x_get_scratch_page_versions(radio) < 0) { retval = -EIO; goto err_ctrl; } dev_info(&intf->dev, "software version %d, hardware version %d\n", radio->software_version, radio->hardware_version); if (radio->hardware_version < RADIO_HW_VERSION) { dev_warn(&intf->dev, "This driver is known to work with hardware version %u, but the device has hardware version %u.\n", RADIO_HW_VERSION, radio->hardware_version); version_warning = 1; } /* give out version warning */ if (version_warning == 1) { dev_warn(&intf->dev, "If you have some trouble using this driver, please report to V4L ML at linux-media@vger.kernel.org\n"); } /* set led to connect state */ si470x_set_led_state(radio, BLINK_GREEN_LED); /* rds buffer allocation */ radio->buf_size = rds_buf * 3; radio->buffer = kmalloc(radio->buf_size, GFP_KERNEL); if (!radio->buffer) { retval = -EIO; goto err_ctrl; } /* rds buffer configuration */ radio->wr_index = 0; radio->rd_index = 0; init_waitqueue_head(&radio->read_queue); usb_set_intfdata(intf, radio); /* start radio */ retval = si470x_start_usb(radio); if (retval < 0 && !radio->int_in_running) goto err_buf; else if (retval < 0) /* in case of radio->int_in_running == 1 */ goto err_all; /* set initial frequency */ si470x_set_freq(radio, 87.5 * FREQ_MUL); /* available in all regions */ /* register video device */ retval = video_register_device(&radio->videodev, VFL_TYPE_RADIO, radio_nr); if (retval) { dev_err(&intf->dev, "Could not register video device\n"); goto err_all; } return 0; err_all: usb_kill_urb(radio->int_in_urb); err_buf: kfree(radio->buffer); err_ctrl: v4l2_ctrl_handler_free(&radio->hdl); err_dev: v4l2_device_unregister(&radio->v4l2_dev); err_urb: usb_free_urb(radio->int_in_urb); err_intbuffer: kfree(radio->int_in_buffer); err_usbbuf: kfree(radio->usb_buf); err_radio: kfree(radio); err_initial: return retval; } /* * si470x_usb_driver_suspend - suspend the device */ static int si470x_usb_driver_suspend(struct usb_interface *intf, pm_message_t message) { struct si470x_device *radio = usb_get_intfdata(intf); dev_info(&intf->dev, "suspending now...\n"); /* shutdown interrupt handler */ if (radio->int_in_running) { radio->int_in_running = 0; if (radio->int_in_urb) usb_kill_urb(radio->int_in_urb); } /* cancel read processes */ wake_up_interruptible(&radio->read_queue); /* stop radio */ si470x_stop(radio); return 0; } /* * si470x_usb_driver_resume - resume the device */ static int si470x_usb_driver_resume(struct usb_interface *intf) { struct si470x_device *radio = usb_get_intfdata(intf); int ret; dev_info(&intf->dev, "resuming now...\n"); /* start radio */ ret = si470x_start_usb(radio); if (ret == 0) v4l2_ctrl_handler_setup(&radio->hdl); return ret; } /* * si470x_usb_driver_disconnect - disconnect the device */ static void si470x_usb_driver_disconnect(struct usb_interface *intf) { struct si470x_device *radio = usb_get_intfdata(intf); mutex_lock(&radio->lock); v4l2_device_disconnect(&radio->v4l2_dev); video_unregister_device(&radio->videodev); usb_kill_urb(radio->int_in_urb); usb_set_intfdata(intf, NULL); mutex_unlock(&radio->lock); v4l2_device_put(&radio->v4l2_dev); } /* * si470x_usb_driver - usb driver interface * * A note on suspend/resume: this driver had only empty suspend/resume * functions, and when I tried to test suspend/resume it always disconnected * instead of resuming (using my ADS InstantFM stick). So I've decided to * remove these callbacks until someone else with better hardware can * implement and test this. */ static struct usb_driver si470x_usb_driver = { .name = DRIVER_NAME, .probe = si470x_usb_driver_probe, .disconnect = si470x_usb_driver_disconnect, .suspend = si470x_usb_driver_suspend, .resume = si470x_usb_driver_resume, .reset_resume = si470x_usb_driver_resume, .id_table = si470x_usb_driver_id_table, }; module_usb_driver(si470x_usb_driver); MODULE_LICENSE("GPL"); MODULE_AUTHOR(DRIVER_AUTHOR); MODULE_DESCRIPTION(DRIVER_DESC); MODULE_VERSION(DRIVER_VERSION); |
50 6 49 41 29 51 43 30 14 43 25 25 25 25 6 5 6 6 6 5 4 1 6 6 6 6 17 10 6 24 3 21 16 8 17 6 5 6 8 17 19 2 11 6 13 13 13 13 19 4 7 7 7 7 2 2 2 2 2 2 2 2 2 3 3 2 1 1 1 2 2 2 2 2 1 2 24 3 27 27 24 7 17 20 6 5 1 1 5 5 30 1 29 18 18 1 19 45 47 47 31 31 10 30 47 22 46 46 46 19 14 6 12 8 7 19 45 4 22 19 8 32 31 22 18 22 13 14 27 26 1 1 9 9 9 40 39 40 40 40 56 50 23 5 6 6 6 52 6 5 66 54 56 13 46 56 1 3 53 49 11 55 5 52 14 45 1 25 61 1 62 1 1 9 3 8 3 3 8 9 2 3 5 5 2 2 16 2 9 5 7 4 4 4 7 1 7 7 7 16 6 6 6 6 6 2 5 4 5 10 2 8 9 9 16 1 2 13 5 12 2 13 8 8 8 13 13 13 13 4 1 5 8 13 27 19 1 67 25 56 68 8 56 56 56 3 3 56 3 56 56 43 42 41 28 20 1 23 17 7 23 41 42 43 6 42 45 45 43 9 4 11 11 9 10 1 2 9 10 1 5 2 8 38 38 1 37 27 27 26 26 3 25 1 25 25 25 25 25 25 25 25 25 25 25 25 25 1 25 24 3 25 6 18 2 19 1 25 1 25 25 25 25 1 25 25 27 26 3 3 27 27 23 23 23 1 1 20 20 3 3 3 3 3 3 3 3 10 9 3 5 39 24 29 24 12 17 6 10 9 6 10 1 7 3 10 23 40 41 18 2 17 17 18 19 19 19 6 18 13 13 8 8 8 1 8 7 1 1 1 8 1 7 2 7 9 8 7 3 6 72 73 13 13 13 12 13 13 3 11 11 11 13 13 13 13 13 14 4 10 32 21 21 21 21 21 21 2 1 1 2 1 1 3 3 3 3 3 3 2 1 1 3 2 1 3 2 1 3 3 2 2 2 2 2 2 2 3 3 3 6 6 6 6 6 2 6 6 6 73 75 41 42 1 1 1 14 14 14 13 14 7 3 9 2 1 9 8 8 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 5321 5322 5323 5324 5325 5326 5327 5328 5329 5330 5331 5332 5333 5334 5335 5336 5337 5338 5339 5340 5341 5342 5343 5344 5345 5346 5347 5348 5349 5350 5351 5352 5353 5354 5355 5356 5357 5358 5359 5360 5361 5362 5363 5364 5365 5366 5367 5368 5369 5370 5371 5372 5373 5374 5375 5376 5377 5378 5379 5380 5381 5382 5383 5384 5385 5386 5387 5388 5389 5390 5391 5392 5393 5394 5395 5396 5397 5398 5399 5400 5401 5402 5403 5404 5405 5406 5407 5408 5409 5410 5411 5412 5413 5414 5415 5416 5417 5418 5419 5420 5421 5422 5423 5424 5425 5426 5427 5428 5429 5430 5431 5432 5433 5434 5435 5436 5437 5438 5439 5440 5441 5442 5443 5444 5445 5446 5447 5448 5449 5450 5451 5452 5453 5454 5455 5456 5457 5458 5459 5460 5461 5462 5463 5464 5465 5466 5467 5468 5469 5470 5471 5472 5473 5474 5475 5476 5477 5478 5479 5480 5481 5482 5483 5484 5485 5486 5487 5488 5489 5490 5491 5492 5493 5494 5495 5496 5497 5498 5499 5500 5501 5502 5503 5504 5505 5506 5507 5508 5509 5510 5511 5512 5513 5514 5515 5516 5517 5518 5519 5520 5521 5522 5523 5524 5525 5526 5527 5528 5529 5530 5531 5532 5533 5534 5535 5536 5537 5538 5539 5540 5541 5542 5543 5544 5545 5546 5547 5548 5549 5550 5551 5552 5553 5554 5555 5556 5557 5558 5559 5560 5561 5562 5563 5564 5565 5566 5567 5568 5569 5570 5571 5572 5573 5574 5575 5576 5577 5578 5579 5580 5581 5582 5583 5584 5585 5586 5587 5588 5589 5590 5591 5592 5593 5594 5595 5596 5597 5598 5599 5600 5601 5602 5603 5604 5605 5606 5607 5608 5609 5610 5611 5612 5613 5614 5615 5616 5617 5618 5619 5620 5621 5622 5623 5624 5625 5626 5627 5628 5629 5630 5631 5632 5633 5634 5635 5636 5637 5638 5639 5640 5641 5642 5643 5644 5645 5646 5647 5648 5649 5650 5651 5652 5653 5654 5655 5656 5657 5658 5659 5660 5661 5662 5663 5664 5665 5666 5667 5668 5669 5670 5671 5672 5673 5674 5675 5676 5677 5678 5679 5680 5681 5682 5683 5684 5685 5686 5687 5688 5689 5690 5691 5692 5693 5694 5695 5696 5697 5698 5699 5700 5701 5702 5703 5704 5705 5706 5707 5708 5709 5710 5711 5712 5713 5714 5715 5716 5717 5718 5719 5720 5721 5722 5723 5724 5725 5726 5727 5728 5729 5730 5731 5732 5733 5734 5735 5736 5737 5738 5739 5740 5741 5742 5743 5744 5745 5746 5747 5748 5749 5750 5751 5752 5753 5754 5755 5756 5757 5758 5759 5760 5761 5762 5763 5764 5765 5766 5767 5768 5769 5770 5771 5772 5773 5774 5775 5776 5777 5778 5779 5780 5781 5782 5783 5784 5785 5786 5787 5788 5789 5790 5791 5792 5793 5794 5795 5796 5797 5798 5799 5800 5801 5802 5803 5804 5805 5806 5807 5808 5809 5810 5811 5812 5813 5814 5815 5816 5817 5818 5819 5820 5821 5822 5823 5824 5825 5826 5827 5828 5829 5830 5831 5832 5833 5834 5835 5836 5837 5838 5839 5840 5841 5842 5843 5844 5845 5846 5847 5848 5849 5850 5851 5852 5853 5854 5855 5856 5857 5858 5859 5860 5861 5862 5863 5864 5865 5866 5867 5868 5869 5870 5871 5872 5873 5874 5875 5876 5877 5878 5879 5880 5881 5882 5883 5884 5885 5886 5887 5888 5889 5890 5891 5892 5893 5894 5895 5896 5897 5898 5899 5900 5901 5902 5903 5904 5905 5906 5907 5908 5909 5910 5911 5912 5913 5914 5915 5916 5917 5918 5919 5920 5921 5922 5923 5924 5925 5926 5927 5928 5929 5930 5931 5932 5933 5934 5935 5936 5937 5938 5939 5940 5941 5942 5943 5944 5945 5946 5947 5948 5949 5950 5951 5952 5953 5954 5955 5956 5957 5958 5959 5960 5961 5962 5963 5964 5965 5966 5967 5968 5969 5970 5971 5972 5973 5974 5975 5976 5977 5978 5979 5980 5981 5982 5983 5984 5985 5986 5987 5988 5989 5990 5991 5992 5993 5994 5995 5996 5997 5998 5999 6000 6001 6002 6003 6004 6005 6006 6007 6008 6009 6010 6011 6012 6013 6014 6015 6016 6017 6018 6019 6020 6021 6022 6023 6024 6025 6026 6027 6028 6029 6030 6031 6032 6033 6034 6035 6036 6037 6038 6039 6040 6041 6042 6043 6044 6045 6046 6047 6048 6049 6050 6051 6052 6053 6054 6055 6056 6057 6058 6059 6060 6061 6062 6063 6064 6065 6066 6067 6068 6069 6070 6071 6072 6073 6074 6075 6076 6077 6078 6079 6080 6081 6082 6083 6084 6085 6086 6087 6088 6089 6090 6091 6092 6093 6094 6095 6096 6097 6098 6099 6100 6101 6102 6103 6104 6105 6106 6107 6108 6109 6110 6111 6112 6113 6114 6115 6116 6117 6118 6119 6120 6121 6122 6123 6124 6125 6126 6127 6128 6129 6130 6131 6132 6133 6134 6135 6136 6137 6138 6139 6140 6141 6142 6143 6144 6145 6146 6147 6148 6149 6150 6151 6152 6153 6154 6155 6156 6157 6158 6159 6160 6161 6162 6163 6164 6165 6166 6167 6168 6169 6170 6171 6172 6173 6174 6175 6176 6177 6178 6179 6180 6181 6182 6183 6184 6185 6186 6187 6188 6189 6190 6191 6192 6193 6194 6195 6196 6197 6198 6199 6200 6201 6202 6203 6204 6205 6206 6207 6208 6209 6210 6211 6212 6213 6214 6215 6216 6217 6218 6219 6220 6221 6222 6223 6224 6225 6226 6227 6228 6229 6230 6231 6232 6233 6234 6235 6236 6237 6238 6239 6240 6241 6242 6243 6244 6245 6246 6247 6248 6249 6250 6251 6252 6253 6254 6255 6256 6257 6258 6259 6260 6261 6262 6263 6264 6265 6266 6267 6268 6269 6270 6271 6272 6273 6274 6275 6276 6277 6278 6279 6280 6281 6282 6283 6284 6285 6286 6287 6288 6289 6290 6291 6292 6293 6294 6295 6296 6297 6298 6299 6300 6301 6302 6303 6304 6305 6306 6307 6308 6309 6310 6311 6312 6313 6314 6315 6316 6317 6318 6319 6320 6321 6322 6323 6324 6325 6326 6327 6328 6329 6330 6331 6332 6333 6334 6335 6336 6337 6338 6339 6340 6341 6342 6343 6344 6345 6346 6347 6348 6349 6350 6351 6352 6353 6354 6355 6356 6357 6358 6359 6360 6361 6362 6363 6364 6365 6366 6367 6368 6369 6370 6371 6372 6373 6374 6375 6376 6377 6378 6379 6380 6381 6382 6383 6384 6385 6386 6387 6388 6389 6390 6391 6392 6393 6394 6395 6396 6397 6398 6399 6400 6401 6402 6403 6404 6405 6406 6407 6408 6409 6410 6411 6412 6413 6414 6415 6416 6417 6418 6419 6420 6421 6422 6423 6424 6425 6426 6427 6428 6429 6430 6431 6432 6433 6434 6435 6436 6437 6438 6439 6440 6441 6442 6443 6444 6445 6446 6447 6448 6449 6450 6451 6452 6453 6454 6455 6456 6457 6458 6459 6460 6461 6462 6463 6464 6465 6466 6467 6468 6469 6470 6471 6472 6473 6474 6475 6476 6477 6478 6479 6480 6481 6482 6483 6484 6485 6486 6487 6488 6489 6490 6491 6492 6493 6494 6495 6496 6497 6498 6499 6500 6501 6502 6503 6504 6505 6506 6507 6508 6509 6510 6511 6512 6513 6514 6515 6516 6517 6518 6519 6520 6521 6522 6523 6524 6525 6526 6527 6528 6529 6530 6531 6532 6533 6534 6535 6536 6537 6538 6539 6540 6541 6542 6543 6544 6545 6546 6547 6548 6549 6550 6551 6552 6553 6554 6555 6556 6557 6558 6559 6560 6561 6562 6563 6564 6565 6566 6567 6568 6569 6570 6571 6572 6573 6574 6575 6576 6577 6578 6579 6580 6581 6582 6583 6584 6585 6586 6587 6588 6589 6590 6591 6592 6593 6594 6595 6596 6597 6598 6599 6600 6601 6602 6603 6604 6605 6606 6607 6608 6609 6610 6611 6612 6613 6614 6615 6616 6617 6618 6619 6620 6621 6622 6623 6624 6625 6626 6627 6628 6629 6630 6631 6632 6633 6634 6635 6636 6637 6638 6639 6640 6641 6642 6643 6644 6645 6646 6647 6648 6649 6650 6651 6652 6653 6654 6655 6656 6657 6658 6659 6660 6661 6662 6663 6664 6665 6666 6667 6668 6669 6670 6671 6672 6673 6674 6675 6676 6677 6678 6679 6680 6681 6682 6683 6684 6685 6686 6687 6688 6689 6690 6691 6692 6693 6694 6695 6696 6697 6698 6699 6700 6701 6702 6703 6704 6705 6706 6707 6708 6709 6710 6711 6712 6713 6714 6715 6716 6717 6718 6719 6720 6721 6722 6723 6724 6725 6726 6727 6728 6729 6730 6731 6732 6733 6734 6735 6736 6737 6738 6739 6740 6741 6742 6743 6744 6745 6746 6747 6748 6749 6750 6751 6752 6753 6754 6755 6756 6757 6758 6759 6760 6761 6762 6763 6764 6765 6766 6767 6768 6769 6770 6771 6772 6773 6774 6775 6776 6777 6778 6779 6780 6781 6782 6783 6784 6785 6786 6787 6788 6789 6790 6791 6792 6793 6794 6795 6796 6797 6798 6799 6800 6801 6802 6803 6804 6805 6806 6807 6808 6809 6810 6811 6812 6813 6814 6815 6816 6817 6818 6819 6820 6821 6822 6823 6824 6825 6826 6827 6828 6829 6830 6831 6832 6833 6834 6835 6836 6837 6838 6839 6840 6841 6842 6843 6844 6845 6846 6847 6848 6849 6850 6851 6852 6853 6854 6855 6856 6857 6858 6859 6860 6861 6862 6863 6864 6865 6866 6867 6868 6869 6870 6871 6872 6873 6874 6875 6876 6877 6878 6879 6880 6881 6882 6883 6884 6885 6886 6887 6888 6889 6890 6891 6892 6893 6894 6895 6896 6897 6898 6899 6900 6901 6902 6903 6904 6905 6906 6907 6908 6909 6910 6911 6912 6913 6914 6915 6916 6917 6918 6919 6920 6921 6922 6923 6924 6925 6926 6927 6928 6929 6930 6931 6932 6933 6934 6935 6936 6937 6938 6939 6940 6941 6942 6943 6944 6945 6946 6947 6948 6949 6950 6951 6952 6953 6954 6955 6956 6957 6958 6959 6960 6961 6962 6963 6964 6965 6966 6967 6968 6969 6970 6971 6972 6973 6974 6975 6976 6977 6978 6979 6980 6981 6982 6983 6984 6985 6986 6987 6988 6989 6990 6991 6992 6993 6994 6995 6996 6997 6998 6999 7000 7001 7002 7003 7004 7005 7006 7007 7008 7009 7010 7011 7012 7013 7014 7015 7016 7017 7018 7019 7020 7021 7022 7023 7024 7025 7026 7027 7028 7029 7030 7031 7032 7033 7034 7035 7036 7037 7038 7039 7040 7041 7042 7043 7044 7045 7046 7047 7048 7049 7050 7051 7052 7053 7054 7055 7056 7057 7058 7059 7060 7061 7062 7063 7064 7065 7066 7067 7068 7069 7070 7071 7072 7073 7074 7075 7076 7077 7078 7079 7080 7081 7082 7083 7084 7085 7086 7087 7088 7089 7090 7091 7092 7093 7094 7095 7096 7097 7098 7099 7100 7101 7102 7103 7104 7105 7106 7107 7108 7109 7110 7111 7112 7113 7114 7115 7116 7117 7118 7119 7120 7121 7122 7123 7124 7125 7126 7127 7128 7129 7130 7131 7132 7133 7134 7135 7136 7137 7138 7139 7140 7141 7142 7143 7144 7145 7146 7147 7148 7149 7150 7151 7152 7153 7154 7155 7156 7157 7158 7159 7160 7161 7162 7163 7164 7165 7166 7167 7168 7169 7170 7171 7172 7173 7174 7175 7176 7177 7178 7179 7180 7181 7182 7183 7184 7185 7186 7187 7188 7189 7190 7191 7192 7193 7194 7195 7196 7197 7198 7199 7200 7201 7202 7203 7204 7205 7206 7207 7208 7209 7210 7211 7212 7213 7214 7215 7216 7217 7218 7219 7220 7221 7222 7223 7224 7225 7226 7227 7228 7229 7230 7231 7232 7233 7234 7235 7236 7237 7238 7239 7240 7241 7242 7243 7244 7245 7246 7247 7248 7249 7250 7251 7252 7253 7254 7255 7256 7257 7258 7259 7260 7261 7262 7263 7264 7265 7266 7267 7268 7269 7270 7271 7272 7273 7274 7275 7276 7277 7278 7279 7280 7281 7282 7283 7284 7285 7286 7287 7288 7289 7290 7291 7292 7293 7294 7295 7296 7297 7298 7299 7300 7301 7302 7303 7304 7305 7306 7307 7308 7309 7310 7311 7312 7313 7314 7315 7316 7317 7318 7319 7320 7321 7322 7323 7324 7325 7326 7327 7328 7329 7330 7331 7332 7333 7334 7335 7336 7337 7338 7339 7340 7341 7342 7343 7344 7345 7346 7347 7348 7349 7350 7351 7352 7353 7354 7355 7356 7357 7358 7359 7360 7361 7362 7363 7364 7365 7366 7367 7368 7369 7370 7371 7372 7373 7374 7375 7376 7377 7378 7379 7380 7381 7382 7383 7384 7385 7386 7387 7388 7389 7390 7391 7392 7393 7394 7395 7396 7397 7398 7399 7400 7401 7402 7403 7404 7405 7406 7407 7408 7409 7410 7411 7412 7413 7414 7415 7416 7417 7418 7419 7420 7421 7422 7423 7424 7425 7426 7427 7428 7429 7430 7431 7432 7433 7434 7435 7436 7437 7438 7439 7440 7441 7442 7443 7444 7445 7446 7447 7448 7449 7450 7451 7452 7453 7454 7455 7456 7457 7458 7459 7460 7461 7462 7463 7464 7465 7466 7467 7468 7469 7470 7471 7472 7473 7474 7475 7476 7477 7478 7479 7480 7481 7482 7483 7484 7485 7486 7487 7488 7489 7490 7491 7492 7493 7494 7495 7496 7497 7498 7499 7500 7501 7502 7503 7504 7505 7506 7507 7508 7509 7510 7511 7512 7513 7514 7515 7516 7517 7518 7519 7520 7521 7522 7523 7524 7525 7526 7527 7528 7529 7530 7531 7532 7533 7534 7535 7536 7537 7538 7539 7540 7541 7542 7543 7544 7545 7546 7547 7548 7549 7550 7551 7552 7553 7554 7555 7556 7557 7558 7559 7560 7561 7562 7563 7564 7565 7566 7567 7568 7569 7570 7571 7572 7573 7574 7575 7576 7577 7578 7579 7580 7581 7582 7583 7584 7585 7586 7587 7588 7589 7590 7591 7592 7593 7594 7595 7596 7597 7598 7599 7600 7601 7602 7603 7604 7605 7606 7607 7608 7609 7610 7611 7612 7613 7614 7615 7616 7617 7618 7619 7620 7621 7622 7623 7624 7625 7626 7627 7628 7629 7630 7631 7632 7633 7634 7635 7636 7637 7638 7639 7640 7641 7642 7643 7644 7645 7646 7647 7648 7649 7650 7651 7652 7653 7654 7655 7656 7657 7658 7659 7660 7661 7662 7663 7664 7665 7666 7667 7668 7669 7670 7671 7672 7673 7674 7675 7676 7677 7678 7679 7680 7681 7682 7683 7684 7685 7686 7687 7688 7689 7690 7691 7692 7693 7694 7695 7696 7697 7698 7699 7700 7701 7702 7703 7704 7705 7706 7707 7708 7709 7710 7711 7712 7713 7714 7715 7716 7717 7718 7719 7720 7721 7722 7723 7724 7725 7726 7727 7728 7729 7730 7731 7732 7733 7734 7735 7736 7737 7738 7739 7740 7741 7742 7743 7744 7745 7746 7747 7748 7749 7750 7751 7752 7753 7754 7755 7756 7757 7758 7759 7760 7761 7762 7763 7764 7765 7766 7767 7768 7769 7770 7771 7772 7773 7774 7775 7776 7777 7778 7779 7780 7781 7782 7783 7784 7785 7786 7787 7788 7789 7790 7791 7792 7793 7794 7795 7796 7797 7798 7799 7800 7801 7802 7803 7804 7805 7806 7807 7808 7809 7810 7811 7812 7813 7814 7815 7816 7817 7818 7819 7820 7821 7822 7823 7824 7825 7826 7827 7828 7829 7830 7831 7832 7833 7834 7835 7836 7837 7838 7839 7840 7841 7842 7843 7844 7845 7846 7847 7848 7849 7850 7851 7852 7853 7854 7855 7856 7857 7858 7859 7860 7861 7862 7863 7864 7865 7866 7867 7868 7869 7870 7871 7872 7873 7874 7875 7876 7877 7878 7879 7880 7881 7882 7883 7884 7885 7886 7887 7888 7889 7890 7891 7892 7893 7894 7895 7896 7897 7898 7899 7900 7901 7902 7903 7904 7905 7906 7907 7908 7909 7910 7911 7912 7913 7914 7915 7916 7917 7918 7919 7920 7921 7922 7923 7924 7925 7926 7927 7928 7929 7930 7931 7932 7933 7934 7935 7936 7937 7938 7939 7940 7941 7942 7943 7944 7945 7946 7947 7948 7949 7950 7951 7952 7953 7954 7955 7956 7957 7958 7959 7960 7961 7962 7963 7964 7965 7966 7967 7968 7969 7970 7971 7972 7973 7974 7975 7976 7977 7978 7979 7980 7981 7982 7983 7984 7985 7986 7987 7988 7989 7990 7991 7992 7993 7994 7995 7996 7997 7998 7999 8000 8001 8002 8003 8004 8005 8006 8007 8008 8009 8010 8011 8012 8013 8014 8015 8016 8017 8018 8019 8020 8021 8022 8023 8024 8025 8026 8027 8028 8029 8030 8031 8032 8033 8034 8035 8036 8037 8038 8039 8040 8041 8042 8043 8044 8045 8046 8047 8048 8049 8050 8051 8052 8053 8054 8055 8056 8057 8058 8059 8060 8061 8062 8063 8064 8065 8066 8067 8068 8069 8070 8071 8072 8073 8074 8075 8076 8077 8078 8079 8080 8081 8082 8083 8084 8085 8086 8087 8088 8089 8090 8091 8092 8093 8094 8095 8096 8097 8098 8099 8100 8101 8102 8103 8104 8105 8106 8107 8108 8109 8110 8111 8112 8113 8114 8115 8116 8117 8118 8119 8120 8121 8122 8123 8124 8125 8126 8127 8128 8129 8130 8131 8132 8133 8134 8135 8136 8137 8138 8139 8140 8141 8142 8143 8144 8145 8146 8147 8148 8149 8150 8151 8152 8153 8154 8155 8156 8157 8158 8159 8160 8161 8162 8163 8164 8165 8166 8167 8168 8169 8170 8171 8172 8173 8174 8175 8176 8177 8178 8179 8180 8181 8182 8183 8184 8185 8186 8187 8188 8189 8190 8191 8192 8193 8194 8195 8196 8197 8198 8199 8200 8201 8202 8203 8204 8205 8206 8207 8208 8209 8210 8211 8212 8213 8214 8215 8216 8217 8218 8219 8220 8221 8222 8223 8224 8225 8226 8227 8228 8229 8230 8231 8232 8233 8234 8235 8236 8237 8238 8239 8240 8241 8242 8243 8244 8245 8246 8247 8248 8249 8250 8251 8252 8253 8254 8255 8256 8257 8258 8259 8260 8261 8262 8263 8264 8265 8266 8267 8268 8269 8270 8271 8272 8273 8274 8275 8276 8277 8278 8279 8280 8281 8282 8283 8284 8285 8286 8287 8288 8289 8290 8291 8292 8293 8294 8295 8296 8297 8298 8299 8300 8301 8302 8303 8304 8305 8306 8307 8308 8309 8310 8311 8312 8313 8314 8315 8316 8317 8318 8319 8320 8321 8322 8323 8324 8325 8326 8327 8328 8329 8330 8331 8332 8333 8334 8335 8336 8337 8338 8339 8340 8341 8342 8343 8344 8345 8346 8347 8348 8349 8350 8351 8352 8353 8354 8355 8356 8357 8358 8359 8360 8361 8362 8363 8364 8365 8366 8367 8368 8369 8370 8371 8372 8373 8374 8375 8376 8377 8378 8379 8380 8381 8382 8383 8384 8385 8386 8387 8388 8389 8390 8391 8392 8393 8394 8395 8396 8397 8398 8399 8400 8401 8402 8403 8404 8405 8406 8407 8408 8409 8410 8411 8412 8413 8414 8415 8416 8417 8418 8419 8420 8421 8422 8423 8424 8425 8426 8427 8428 8429 8430 8431 8432 8433 8434 8435 8436 8437 8438 8439 8440 8441 8442 8443 8444 8445 8446 8447 8448 8449 8450 8451 8452 8453 8454 8455 8456 8457 8458 8459 8460 8461 8462 8463 8464 8465 8466 8467 8468 8469 8470 8471 8472 8473 8474 8475 8476 8477 8478 8479 8480 8481 8482 8483 8484 8485 8486 8487 8488 8489 8490 8491 8492 8493 8494 8495 8496 8497 8498 8499 8500 8501 8502 8503 8504 8505 8506 8507 8508 8509 8510 8511 8512 8513 8514 8515 8516 8517 8518 8519 8520 8521 8522 8523 8524 8525 8526 8527 8528 8529 8530 8531 8532 8533 8534 8535 8536 8537 8538 8539 8540 8541 8542 8543 8544 8545 8546 8547 8548 8549 8550 8551 8552 8553 8554 8555 8556 8557 8558 8559 8560 8561 8562 8563 8564 8565 8566 8567 8568 8569 8570 8571 8572 8573 8574 8575 8576 8577 8578 8579 8580 8581 8582 8583 8584 8585 8586 8587 8588 8589 8590 8591 8592 8593 8594 8595 8596 8597 8598 8599 8600 8601 8602 8603 8604 8605 8606 8607 8608 8609 8610 8611 8612 8613 8614 8615 8616 8617 8618 8619 8620 8621 8622 8623 8624 8625 8626 8627 8628 8629 8630 8631 8632 8633 8634 8635 8636 8637 8638 8639 8640 8641 8642 8643 8644 8645 8646 8647 8648 8649 8650 8651 8652 8653 8654 8655 8656 8657 8658 8659 8660 8661 8662 8663 8664 8665 8666 8667 8668 8669 8670 8671 8672 8673 8674 8675 8676 8677 8678 8679 8680 8681 8682 8683 8684 8685 8686 8687 8688 8689 8690 8691 8692 8693 8694 8695 8696 8697 8698 8699 8700 8701 8702 8703 8704 8705 8706 8707 8708 8709 8710 8711 8712 8713 8714 8715 8716 8717 8718 8719 8720 8721 8722 8723 8724 8725 8726 8727 8728 8729 8730 8731 8732 8733 8734 8735 8736 8737 8738 8739 8740 8741 8742 8743 8744 8745 8746 8747 8748 8749 8750 8751 8752 8753 8754 8755 8756 8757 8758 8759 8760 8761 8762 8763 8764 8765 8766 8767 8768 8769 8770 8771 8772 8773 8774 8775 8776 8777 8778 8779 8780 8781 8782 8783 8784 8785 8786 8787 8788 8789 8790 8791 8792 8793 8794 8795 8796 8797 8798 8799 8800 8801 8802 8803 8804 8805 8806 8807 8808 8809 8810 8811 8812 8813 8814 8815 8816 8817 8818 8819 8820 8821 8822 8823 8824 8825 8826 8827 8828 8829 8830 8831 8832 8833 8834 8835 8836 8837 8838 8839 8840 8841 8842 8843 8844 8845 8846 8847 8848 8849 8850 8851 8852 8853 8854 8855 8856 8857 8858 8859 8860 8861 8862 8863 8864 8865 8866 8867 8868 8869 8870 8871 8872 8873 8874 8875 8876 8877 8878 8879 8880 8881 8882 8883 8884 8885 8886 8887 8888 8889 8890 8891 8892 8893 8894 8895 8896 8897 8898 8899 8900 8901 8902 8903 8904 8905 8906 8907 8908 8909 8910 8911 8912 8913 8914 8915 8916 8917 8918 8919 8920 8921 8922 8923 8924 8925 8926 8927 8928 8929 8930 8931 8932 8933 8934 8935 8936 8937 8938 8939 8940 8941 8942 8943 8944 8945 8946 8947 8948 8949 8950 8951 8952 8953 8954 8955 8956 8957 8958 8959 8960 8961 8962 8963 8964 8965 8966 8967 8968 8969 8970 8971 8972 8973 8974 8975 8976 8977 8978 8979 8980 8981 8982 8983 8984 8985 8986 8987 8988 8989 8990 8991 8992 8993 8994 8995 8996 8997 8998 8999 9000 9001 9002 9003 9004 9005 9006 9007 9008 9009 9010 9011 9012 9013 9014 9015 9016 9017 9018 9019 9020 9021 9022 9023 9024 9025 9026 9027 9028 9029 9030 9031 9032 9033 9034 9035 9036 9037 9038 9039 9040 9041 9042 9043 9044 9045 9046 9047 9048 9049 9050 9051 9052 9053 9054 9055 9056 9057 9058 9059 9060 9061 9062 9063 9064 9065 9066 9067 9068 9069 9070 9071 9072 9073 9074 9075 9076 9077 9078 9079 9080 9081 9082 9083 9084 9085 9086 9087 9088 9089 9090 9091 9092 9093 9094 9095 9096 9097 9098 9099 9100 9101 9102 9103 9104 9105 9106 9107 9108 9109 9110 9111 9112 9113 9114 9115 9116 9117 9118 9119 9120 9121 9122 9123 9124 9125 9126 9127 9128 9129 9130 9131 9132 9133 9134 9135 9136 9137 9138 9139 9140 9141 9142 9143 9144 9145 9146 9147 9148 9149 9150 9151 9152 9153 9154 9155 9156 9157 9158 9159 9160 9161 9162 9163 9164 9165 9166 9167 9168 9169 9170 9171 9172 9173 9174 9175 9176 9177 9178 9179 9180 9181 9182 9183 9184 9185 9186 9187 9188 9189 9190 9191 9192 9193 9194 9195 9196 9197 9198 9199 9200 9201 9202 9203 9204 9205 9206 9207 9208 9209 9210 9211 9212 9213 9214 9215 9216 9217 9218 9219 9220 9221 9222 9223 9224 9225 9226 9227 9228 9229 9230 9231 9232 9233 9234 9235 9236 9237 9238 9239 9240 9241 9242 9243 9244 9245 9246 9247 9248 9249 9250 9251 9252 9253 9254 9255 9256 9257 9258 9259 9260 9261 9262 9263 9264 9265 9266 9267 9268 9269 9270 9271 9272 9273 9274 9275 9276 9277 9278 9279 9280 9281 9282 9283 9284 9285 9286 9287 9288 9289 9290 9291 9292 9293 9294 9295 9296 9297 9298 9299 9300 9301 9302 9303 9304 9305 9306 9307 9308 9309 9310 9311 9312 9313 9314 9315 9316 9317 9318 9319 9320 9321 9322 9323 9324 9325 9326 9327 9328 9329 9330 9331 9332 9333 9334 9335 9336 9337 9338 9339 9340 9341 9342 9343 9344 9345 9346 9347 9348 9349 9350 9351 9352 9353 9354 9355 9356 9357 9358 9359 9360 9361 9362 9363 9364 9365 9366 9367 9368 9369 9370 9371 9372 9373 9374 9375 9376 9377 9378 9379 9380 9381 9382 9383 9384 9385 9386 9387 9388 9389 9390 9391 9392 9393 9394 9395 9396 9397 9398 9399 9400 9401 9402 9403 9404 9405 9406 9407 9408 9409 9410 9411 9412 9413 9414 9415 9416 9417 9418 9419 9420 9421 9422 9423 9424 9425 9426 9427 9428 9429 9430 9431 9432 9433 9434 9435 9436 9437 9438 9439 9440 9441 9442 9443 9444 9445 9446 9447 9448 9449 9450 9451 9452 9453 9454 9455 9456 9457 9458 9459 9460 9461 9462 9463 9464 9465 9466 9467 9468 9469 9470 9471 9472 9473 9474 9475 9476 9477 9478 9479 9480 9481 9482 9483 9484 9485 9486 9487 9488 9489 9490 9491 9492 9493 9494 9495 9496 9497 9498 9499 9500 9501 9502 9503 9504 9505 9506 9507 9508 9509 9510 9511 9512 9513 9514 9515 9516 9517 9518 9519 9520 9521 9522 9523 9524 9525 9526 9527 9528 9529 9530 9531 9532 9533 9534 9535 9536 9537 9538 9539 9540 9541 9542 9543 9544 9545 9546 9547 9548 9549 9550 9551 9552 9553 9554 9555 9556 9557 9558 9559 9560 9561 9562 9563 9564 9565 9566 9567 9568 9569 9570 9571 9572 9573 9574 9575 9576 9577 9578 9579 9580 9581 9582 9583 9584 9585 9586 9587 9588 9589 9590 9591 9592 9593 9594 9595 9596 9597 9598 9599 9600 9601 9602 9603 9604 9605 9606 9607 9608 9609 9610 9611 9612 9613 9614 9615 9616 9617 9618 9619 9620 9621 9622 9623 9624 9625 9626 9627 9628 9629 9630 9631 9632 9633 9634 9635 9636 9637 9638 9639 9640 9641 9642 9643 9644 9645 9646 9647 9648 9649 9650 9651 9652 9653 9654 9655 9656 9657 9658 9659 9660 9661 9662 9663 9664 9665 9666 9667 9668 9669 9670 9671 9672 9673 9674 9675 9676 9677 9678 9679 9680 9681 9682 9683 9684 9685 9686 9687 9688 9689 9690 9691 9692 9693 9694 9695 9696 9697 9698 9699 9700 9701 9702 9703 9704 9705 9706 9707 9708 9709 9710 9711 9712 9713 9714 9715 9716 9717 9718 9719 9720 9721 9722 9723 9724 9725 9726 9727 9728 9729 9730 9731 9732 9733 9734 9735 9736 9737 9738 9739 9740 9741 9742 9743 9744 9745 9746 9747 9748 9749 9750 9751 9752 9753 9754 9755 9756 9757 9758 9759 9760 9761 9762 9763 9764 9765 9766 9767 9768 9769 9770 9771 9772 9773 9774 9775 9776 9777 9778 9779 9780 9781 9782 9783 9784 9785 9786 9787 9788 9789 9790 9791 9792 9793 9794 9795 9796 9797 9798 9799 9800 9801 9802 9803 9804 9805 9806 9807 9808 9809 9810 9811 9812 9813 9814 9815 9816 9817 9818 9819 9820 9821 9822 9823 9824 9825 9826 9827 9828 9829 9830 9831 9832 9833 9834 9835 9836 9837 9838 9839 9840 9841 9842 9843 9844 9845 9846 9847 9848 9849 9850 9851 9852 9853 9854 9855 9856 9857 9858 9859 9860 9861 9862 9863 9864 9865 9866 9867 9868 9869 9870 9871 9872 9873 9874 9875 9876 9877 9878 9879 9880 9881 9882 9883 9884 9885 9886 9887 9888 9889 9890 9891 9892 9893 9894 9895 9896 9897 9898 9899 9900 9901 9902 9903 9904 9905 9906 9907 9908 9909 9910 9911 9912 9913 9914 9915 9916 9917 9918 9919 9920 9921 9922 9923 9924 9925 9926 9927 9928 9929 9930 9931 9932 9933 9934 9935 9936 9937 9938 9939 9940 9941 9942 9943 9944 9945 9946 9947 9948 9949 9950 9951 9952 9953 9954 9955 9956 9957 9958 9959 9960 9961 9962 9963 9964 9965 9966 9967 9968 9969 9970 9971 9972 9973 9974 9975 9976 9977 9978 9979 9980 9981 9982 9983 9984 9985 9986 9987 9988 9989 9990 9991 9992 9993 9994 9995 9996 9997 9998 9999 10000 10001 10002 10003 10004 10005 10006 10007 10008 10009 10010 10011 10012 10013 10014 10015 10016 10017 10018 10019 10020 10021 10022 10023 10024 10025 10026 10027 10028 10029 10030 10031 10032 10033 10034 10035 10036 10037 10038 10039 10040 10041 10042 10043 10044 10045 10046 10047 10048 10049 10050 10051 10052 10053 10054 10055 10056 10057 10058 10059 10060 10061 10062 10063 10064 10065 10066 10067 10068 10069 10070 10071 10072 10073 10074 10075 10076 10077 10078 10079 10080 10081 10082 10083 10084 10085 10086 10087 10088 10089 10090 10091 10092 10093 10094 10095 10096 10097 10098 10099 10100 10101 10102 10103 10104 10105 10106 10107 10108 10109 10110 10111 10112 10113 10114 10115 10116 10117 10118 10119 10120 10121 10122 10123 10124 10125 10126 10127 10128 10129 10130 10131 10132 10133 10134 10135 10136 10137 10138 10139 10140 10141 10142 10143 10144 10145 10146 10147 10148 10149 10150 10151 10152 10153 10154 10155 10156 10157 10158 10159 10160 10161 10162 10163 10164 10165 10166 10167 10168 10169 10170 10171 10172 10173 10174 10175 10176 10177 10178 10179 10180 10181 10182 10183 10184 10185 10186 10187 10188 10189 10190 10191 | // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2007 Oracle. All rights reserved. */ #include <crypto/hash.h> #include <linux/kernel.h> #include <linux/bio.h> #include <linux/blk-cgroup.h> #include <linux/file.h> #include <linux/fs.h> #include <linux/pagemap.h> #include <linux/highmem.h> #include <linux/time.h> #include <linux/init.h> #include <linux/string.h> #include <linux/backing-dev.h> #include <linux/writeback.h> #include <linux/compat.h> #include <linux/xattr.h> #include <linux/posix_acl.h> #include <linux/falloc.h> #include <linux/slab.h> #include <linux/ratelimit.h> #include <linux/btrfs.h> #include <linux/blkdev.h> #include <linux/posix_acl_xattr.h> #include <linux/uio.h> #include <linux/magic.h> #include <linux/iversion.h> #include <linux/swap.h> #include <linux/migrate.h> #include <linux/sched/mm.h> #include <linux/iomap.h> #include <asm/unaligned.h> #include <linux/fsverity.h> #include "misc.h" #include "ctree.h" #include "disk-io.h" #include "transaction.h" #include "btrfs_inode.h" #include "ordered-data.h" #include "xattr.h" #include "tree-log.h" #include "bio.h" #include "compression.h" #include "locking.h" #include "props.h" #include "qgroup.h" #include "delalloc-space.h" #include "block-group.h" #include "space-info.h" #include "zoned.h" #include "subpage.h" #include "inode-item.h" #include "fs.h" #include "accessors.h" #include "extent-tree.h" #include "root-tree.h" #include "defrag.h" #include "dir-item.h" #include "file-item.h" #include "uuid-tree.h" #include "ioctl.h" #include "file.h" #include "acl.h" #include "relocation.h" #include "verity.h" #include "super.h" #include "orphan.h" #include "backref.h" #include "raid-stripe-tree.h" #include "fiemap.h" struct btrfs_iget_args { u64 ino; struct btrfs_root *root; }; struct btrfs_rename_ctx { /* Output field. Stores the index number of the old directory entry. */ u64 index; }; /* * Used by data_reloc_print_warning_inode() to pass needed info for filename * resolution and output of error message. */ struct data_reloc_warn { struct btrfs_path path; struct btrfs_fs_info *fs_info; u64 extent_item_size; u64 logical; int mirror_num; }; /* * For the file_extent_tree, we want to hold the inode lock when we lookup and * update the disk_i_size, but lockdep will complain because our io_tree we hold * the tree lock and get the inode lock when setting delalloc. These two things * are unrelated, so make a class for the file_extent_tree so we don't get the * two locking patterns mixed up. */ static struct lock_class_key file_extent_tree_class; static const struct inode_operations btrfs_dir_inode_operations; static const struct inode_operations btrfs_symlink_inode_operations; static const struct inode_operations btrfs_special_inode_operations; static const struct inode_operations btrfs_file_inode_operations; static const struct address_space_operations btrfs_aops; static const struct file_operations btrfs_dir_file_operations; static struct kmem_cache *btrfs_inode_cachep; static int btrfs_setsize(struct inode *inode, struct iattr *attr); static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback); static noinline int run_delalloc_cow(struct btrfs_inode *inode, struct page *locked_page, u64 start, u64 end, struct writeback_control *wbc, bool pages_dirty); static int data_reloc_print_warning_inode(u64 inum, u64 offset, u64 num_bytes, u64 root, void *warn_ctx) { struct data_reloc_warn *warn = warn_ctx; struct btrfs_fs_info *fs_info = warn->fs_info; struct extent_buffer *eb; struct btrfs_inode_item *inode_item; struct inode_fs_paths *ipath = NULL; struct btrfs_root *local_root; struct btrfs_key key; unsigned int nofs_flag; u32 nlink; int ret; local_root = btrfs_get_fs_root(fs_info, root, true); if (IS_ERR(local_root)) { ret = PTR_ERR(local_root); goto err; } /* This makes the path point to (inum INODE_ITEM ioff). */ key.objectid = inum; key.type = BTRFS_INODE_ITEM_KEY; key.offset = 0; ret = btrfs_search_slot(NULL, local_root, &key, &warn->path, 0, 0); if (ret) { btrfs_put_root(local_root); btrfs_release_path(&warn->path); goto err; } eb = warn->path.nodes[0]; inode_item = btrfs_item_ptr(eb, warn->path.slots[0], struct btrfs_inode_item); nlink = btrfs_inode_nlink(eb, inode_item); btrfs_release_path(&warn->path); nofs_flag = memalloc_nofs_save(); ipath = init_ipath(4096, local_root, &warn->path); memalloc_nofs_restore(nofs_flag); if (IS_ERR(ipath)) { btrfs_put_root(local_root); ret = PTR_ERR(ipath); ipath = NULL; /* * -ENOMEM, not a critical error, just output an generic error * without filename. */ btrfs_warn(fs_info, "checksum error at logical %llu mirror %u root %llu, inode %llu offset %llu", warn->logical, warn->mirror_num, root, inum, offset); return ret; } ret = paths_from_inode(inum, ipath); if (ret < 0) goto err; /* * We deliberately ignore the bit ipath might have been too small to * hold all of the paths here */ for (int i = 0; i < ipath->fspath->elem_cnt; i++) { btrfs_warn(fs_info, "checksum error at logical %llu mirror %u root %llu inode %llu offset %llu length %u links %u (path: %s)", warn->logical, warn->mirror_num, root, inum, offset, fs_info->sectorsize, nlink, (char *)(unsigned long)ipath->fspath->val[i]); } btrfs_put_root(local_root); free_ipath(ipath); return 0; err: btrfs_warn(fs_info, "checksum error at logical %llu mirror %u root %llu inode %llu offset %llu, path resolving failed with ret=%d", warn->logical, warn->mirror_num, root, inum, offset, ret); free_ipath(ipath); return ret; } /* * Do extra user-friendly error output (e.g. lookup all the affected files). * * Return true if we succeeded doing the backref lookup. * Return false if such lookup failed, and has to fallback to the old error message. */ static void print_data_reloc_error(const struct btrfs_inode *inode, u64 file_off, const u8 *csum, const u8 *csum_expected, int mirror_num) { struct btrfs_fs_info *fs_info = inode->root->fs_info; struct btrfs_path path = { 0 }; struct btrfs_key found_key = { 0 }; struct extent_buffer *eb; struct btrfs_extent_item *ei; const u32 csum_size = fs_info->csum_size; u64 logical; u64 flags; u32 item_size; int ret; mutex_lock(&fs_info->reloc_mutex); logical = btrfs_get_reloc_bg_bytenr(fs_info); mutex_unlock(&fs_info->reloc_mutex); if (logical == U64_MAX) { btrfs_warn_rl(fs_info, "has data reloc tree but no running relocation"); btrfs_warn_rl(fs_info, "csum failed root %lld ino %llu off %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d", btrfs_root_id(inode->root), btrfs_ino(inode), file_off, CSUM_FMT_VALUE(csum_size, csum), CSUM_FMT_VALUE(csum_size, csum_expected), mirror_num); return; } logical += file_off; btrfs_warn_rl(fs_info, "csum failed root %lld ino %llu off %llu logical %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d", btrfs_root_id(inode->root), btrfs_ino(inode), file_off, logical, CSUM_FMT_VALUE(csum_size, csum), CSUM_FMT_VALUE(csum_size, csum_expected), mirror_num); ret = extent_from_logical(fs_info, logical, &path, &found_key, &flags); if (ret < 0) { btrfs_err_rl(fs_info, "failed to lookup extent item for logical %llu: %d", logical, ret); return; } eb = path.nodes[0]; ei = btrfs_item_ptr(eb, path.slots[0], struct btrfs_extent_item); item_size = btrfs_item_size(eb, path.slots[0]); if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { unsigned long ptr = 0; u64 ref_root; u8 ref_level; while (true) { ret = tree_backref_for_extent(&ptr, eb, &found_key, ei, item_size, &ref_root, &ref_level); if (ret < 0) { btrfs_warn_rl(fs_info, "failed to resolve tree backref for logical %llu: %d", logical, ret); break; } if (ret > 0) break; btrfs_warn_rl(fs_info, "csum error at logical %llu mirror %u: metadata %s (level %d) in tree %llu", logical, mirror_num, (ref_level ? "node" : "leaf"), ref_level, ref_root); } btrfs_release_path(&path); } else { struct btrfs_backref_walk_ctx ctx = { 0 }; struct data_reloc_warn reloc_warn = { 0 }; btrfs_release_path(&path); ctx.bytenr = found_key.objectid; ctx.extent_item_pos = logical - found_key.objectid; ctx.fs_info = fs_info; reloc_warn.logical = logical; reloc_warn.extent_item_size = found_key.offset; reloc_warn.mirror_num = mirror_num; reloc_warn.fs_info = fs_info; iterate_extent_inodes(&ctx, true, data_reloc_print_warning_inode, &reloc_warn); } } static void __cold btrfs_print_data_csum_error(struct btrfs_inode *inode, u64 logical_start, u8 *csum, u8 *csum_expected, int mirror_num) { struct btrfs_root *root = inode->root; const u32 csum_size = root->fs_info->csum_size; /* For data reloc tree, it's better to do a backref lookup instead. */ if (btrfs_root_id(root) == BTRFS_DATA_RELOC_TREE_OBJECTID) return print_data_reloc_error(inode, logical_start, csum, csum_expected, mirror_num); /* Output without objectid, which is more meaningful */ if (btrfs_root_id(root) >= BTRFS_LAST_FREE_OBJECTID) { btrfs_warn_rl(root->fs_info, "csum failed root %lld ino %lld off %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d", btrfs_root_id(root), btrfs_ino(inode), logical_start, CSUM_FMT_VALUE(csum_size, csum), CSUM_FMT_VALUE(csum_size, csum_expected), mirror_num); } else { btrfs_warn_rl(root->fs_info, "csum failed root %llu ino %llu off %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d", btrfs_root_id(root), btrfs_ino(inode), logical_start, CSUM_FMT_VALUE(csum_size, csum), CSUM_FMT_VALUE(csum_size, csum_expected), mirror_num); } } /* * Lock inode i_rwsem based on arguments passed. * * ilock_flags can have the following bit set: * * BTRFS_ILOCK_SHARED - acquire a shared lock on the inode * BTRFS_ILOCK_TRY - try to acquire the lock, if fails on first attempt * return -EAGAIN * BTRFS_ILOCK_MMAP - acquire a write lock on the i_mmap_lock */ int btrfs_inode_lock(struct btrfs_inode *inode, unsigned int ilock_flags) { if (ilock_flags & BTRFS_ILOCK_SHARED) { if (ilock_flags & BTRFS_ILOCK_TRY) { if (!inode_trylock_shared(&inode->vfs_inode)) return -EAGAIN; else return 0; } inode_lock_shared(&inode->vfs_inode); } else { if (ilock_flags & BTRFS_ILOCK_TRY) { if (!inode_trylock(&inode->vfs_inode)) return -EAGAIN; else return 0; } inode_lock(&inode->vfs_inode); } if (ilock_flags & BTRFS_ILOCK_MMAP) down_write(&inode->i_mmap_lock); return 0; } /* * Unock inode i_rwsem. * * ilock_flags should contain the same bits set as passed to btrfs_inode_lock() * to decide whether the lock acquired is shared or exclusive. */ void btrfs_inode_unlock(struct btrfs_inode *inode, unsigned int ilock_flags) { if (ilock_flags & BTRFS_ILOCK_MMAP) up_write(&inode->i_mmap_lock); if (ilock_flags & BTRFS_ILOCK_SHARED) inode_unlock_shared(&inode->vfs_inode); else inode_unlock(&inode->vfs_inode); } /* * Cleanup all submitted ordered extents in specified range to handle errors * from the btrfs_run_delalloc_range() callback. * * NOTE: caller must ensure that when an error happens, it can not call * extent_clear_unlock_delalloc() to clear both the bits EXTENT_DO_ACCOUNTING * and EXTENT_DELALLOC simultaneously, because that causes the reserved metadata * to be released, which we want to happen only when finishing the ordered * extent (btrfs_finish_ordered_io()). */ static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode, struct page *locked_page, u64 offset, u64 bytes) { unsigned long index = offset >> PAGE_SHIFT; unsigned long end_index = (offset + bytes - 1) >> PAGE_SHIFT; u64 page_start = 0, page_end = 0; struct page *page; if (locked_page) { page_start = page_offset(locked_page); page_end = page_start + PAGE_SIZE - 1; } while (index <= end_index) { /* * For locked page, we will call btrfs_mark_ordered_io_finished * through btrfs_mark_ordered_io_finished() on it * in run_delalloc_range() for the error handling, which will * clear page Ordered and run the ordered extent accounting. * * Here we can't just clear the Ordered bit, or * btrfs_mark_ordered_io_finished() would skip the accounting * for the page range, and the ordered extent will never finish. */ if (locked_page && index == (page_start >> PAGE_SHIFT)) { index++; continue; } page = find_get_page(inode->vfs_inode.i_mapping, index); index++; if (!page) continue; /* * Here we just clear all Ordered bits for every page in the * range, then btrfs_mark_ordered_io_finished() will handle * the ordered extent accounting for the range. */ btrfs_folio_clamp_clear_ordered(inode->root->fs_info, page_folio(page), offset, bytes); put_page(page); } if (locked_page) { /* The locked page covers the full range, nothing needs to be done */ if (bytes + offset <= page_start + PAGE_SIZE) return; /* * In case this page belongs to the delalloc range being * instantiated then skip it, since the first page of a range is * going to be properly cleaned up by the caller of * run_delalloc_range */ if (page_start >= offset && page_end <= (offset + bytes - 1)) { bytes = offset + bytes - page_offset(locked_page) - PAGE_SIZE; offset = page_offset(locked_page) + PAGE_SIZE; } } return btrfs_mark_ordered_io_finished(inode, NULL, offset, bytes, false); } static int btrfs_dirty_inode(struct btrfs_inode *inode); static int btrfs_init_inode_security(struct btrfs_trans_handle *trans, struct btrfs_new_inode_args *args) { int err; if (args->default_acl) { err = __btrfs_set_acl(trans, args->inode, args->default_acl, ACL_TYPE_DEFAULT); if (err) return err; } if (args->acl) { err = __btrfs_set_acl(trans, args->inode, args->acl, ACL_TYPE_ACCESS); if (err) return err; } if (!args->default_acl && !args->acl) cache_no_acl(args->inode); return btrfs_xattr_security_init(trans, args->inode, args->dir, &args->dentry->d_name); } /* * this does all the hard work for inserting an inline extent into * the btree. The caller should have done a btrfs_drop_extents so that * no overlapping inline items exist in the btree */ static int insert_inline_extent(struct btrfs_trans_handle *trans, struct btrfs_path *path, struct btrfs_inode *inode, bool extent_inserted, size_t size, size_t compressed_size, int compress_type, struct folio *compressed_folio, bool update_i_size) { struct btrfs_root *root = inode->root; struct extent_buffer *leaf; struct page *page = NULL; const u32 sectorsize = trans->fs_info->sectorsize; char *kaddr; unsigned long ptr; struct btrfs_file_extent_item *ei; int ret; size_t cur_size = size; u64 i_size; /* * The decompressed size must still be no larger than a sector. Under * heavy race, we can have size == 0 passed in, but that shouldn't be a * big deal and we can continue the insertion. */ ASSERT(size <= sectorsize); /* * The compressed size also needs to be no larger than a sector. * That's also why we only need one page as the parameter. */ if (compressed_folio) ASSERT(compressed_size <= sectorsize); else ASSERT(compressed_size == 0); if (compressed_size && compressed_folio) cur_size = compressed_size; if (!extent_inserted) { struct btrfs_key key; size_t datasize; key.objectid = btrfs_ino(inode); key.offset = 0; key.type = BTRFS_EXTENT_DATA_KEY; datasize = btrfs_file_extent_calc_inline_size(cur_size); ret = btrfs_insert_empty_item(trans, root, path, &key, datasize); if (ret) goto fail; } leaf = path->nodes[0]; ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); btrfs_set_file_extent_generation(leaf, ei, trans->transid); btrfs_set_file_extent_type(leaf, ei, BTRFS_FILE_EXTENT_INLINE); btrfs_set_file_extent_encryption(leaf, ei, 0); btrfs_set_file_extent_other_encoding(leaf, ei, 0); btrfs_set_file_extent_ram_bytes(leaf, ei, size); ptr = btrfs_file_extent_inline_start(ei); if (compress_type != BTRFS_COMPRESS_NONE) { kaddr = kmap_local_folio(compressed_folio, 0); write_extent_buffer(leaf, kaddr, ptr, compressed_size); kunmap_local(kaddr); btrfs_set_file_extent_compression(leaf, ei, compress_type); } else { page = find_get_page(inode->vfs_inode.i_mapping, 0); btrfs_set_file_extent_compression(leaf, ei, 0); kaddr = kmap_local_page(page); write_extent_buffer(leaf, kaddr, ptr, size); kunmap_local(kaddr); put_page(page); } btrfs_mark_buffer_dirty(trans, leaf); btrfs_release_path(path); /* * We align size to sectorsize for inline extents just for simplicity * sake. */ ret = btrfs_inode_set_file_extent_range(inode, 0, ALIGN(size, root->fs_info->sectorsize)); if (ret) goto fail; /* * We're an inline extent, so nobody can extend the file past i_size * without locking a page we already have locked. * * We must do any i_size and inode updates before we unlock the pages. * Otherwise we could end up racing with unlink. */ i_size = i_size_read(&inode->vfs_inode); if (update_i_size && size > i_size) { i_size_write(&inode->vfs_inode, size); i_size = size; } inode->disk_i_size = i_size; fail: return ret; } static bool can_cow_file_range_inline(struct btrfs_inode *inode, u64 offset, u64 size, size_t compressed_size) { struct btrfs_fs_info *fs_info = inode->root->fs_info; u64 data_len = (compressed_size ?: size); /* Inline extents must start at offset 0. */ if (offset != 0) return false; /* * Due to the page size limit, for subpage we can only trigger the * writeback for the dirty sectors of page, that means data writeback * is doing more writeback than what we want. * * This is especially unexpected for some call sites like fallocate, * where we only increase i_size after everything is done. * This means we can trigger inline extent even if we didn't want to. * So here we skip inline extent creation completely. */ if (fs_info->sectorsize != PAGE_SIZE) return false; /* Inline extents are limited to sectorsize. */ if (size > fs_info->sectorsize) return false; /* We cannot exceed the maximum inline data size. */ if (data_len > BTRFS_MAX_INLINE_DATA_SIZE(fs_info)) return false; /* We cannot exceed the user specified max_inline size. */ if (data_len > fs_info->max_inline) return false; /* Inline extents must be the entirety of the file. */ if (size < i_size_read(&inode->vfs_inode)) return false; return true; } /* * conditionally insert an inline extent into the file. This * does the checks required to make sure the data is small enough * to fit as an inline extent. * * If being used directly, you must have already checked we're allowed to cow * the range by getting true from can_cow_file_range_inline(). */ static noinline int __cow_file_range_inline(struct btrfs_inode *inode, u64 offset, u64 size, size_t compressed_size, int compress_type, struct folio *compressed_folio, bool update_i_size) { struct btrfs_drop_extents_args drop_args = { 0 }; struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_trans_handle *trans; u64 data_len = (compressed_size ?: size); int ret; struct btrfs_path *path; path = btrfs_alloc_path(); if (!path) return -ENOMEM; trans = btrfs_join_transaction(root); if (IS_ERR(trans)) { btrfs_free_path(path); return PTR_ERR(trans); } trans->block_rsv = &inode->block_rsv; drop_args.path = path; drop_args.start = 0; drop_args.end = fs_info->sectorsize; drop_args.drop_cache = true; drop_args.replace_extent = true; drop_args.extent_item_size = btrfs_file_extent_calc_inline_size(data_len); ret = btrfs_drop_extents(trans, root, inode, &drop_args); if (ret) { btrfs_abort_transaction(trans, ret); goto out; } ret = insert_inline_extent(trans, path, inode, drop_args.extent_inserted, size, compressed_size, compress_type, compressed_folio, update_i_size); if (ret && ret != -ENOSPC) { btrfs_abort_transaction(trans, ret); goto out; } else if (ret == -ENOSPC) { ret = 1; goto out; } btrfs_update_inode_bytes(inode, size, drop_args.bytes_found); ret = btrfs_update_inode(trans, inode); if (ret && ret != -ENOSPC) { btrfs_abort_transaction(trans, ret); goto out; } else if (ret == -ENOSPC) { ret = 1; goto out; } btrfs_set_inode_full_sync(inode); out: /* * Don't forget to free the reserved space, as for inlined extent * it won't count as data extent, free them directly here. * And at reserve time, it's always aligned to page size, so * just free one page here. */ btrfs_qgroup_free_data(inode, NULL, 0, PAGE_SIZE, NULL); btrfs_free_path(path); btrfs_end_transaction(trans); return ret; } static noinline int cow_file_range_inline(struct btrfs_inode *inode, struct page *locked_page, u64 offset, u64 end, size_t compressed_size, int compress_type, struct folio *compressed_folio, bool update_i_size) { struct extent_state *cached = NULL; unsigned long clear_flags = EXTENT_DELALLOC | EXTENT_DELALLOC_NEW | EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING | EXTENT_LOCKED; u64 size = min_t(u64, i_size_read(&inode->vfs_inode), end + 1); int ret; if (!can_cow_file_range_inline(inode, offset, size, compressed_size)) return 1; lock_extent(&inode->io_tree, offset, end, &cached); ret = __cow_file_range_inline(inode, offset, size, compressed_size, compress_type, compressed_folio, update_i_size); if (ret > 0) { unlock_extent(&inode->io_tree, offset, end, &cached); return ret; } if (ret == 0) locked_page = NULL; extent_clear_unlock_delalloc(inode, offset, end, locked_page, &cached, clear_flags, PAGE_UNLOCK | PAGE_START_WRITEBACK | PAGE_END_WRITEBACK); return ret; } struct async_extent { u64 start; u64 ram_size; u64 compressed_size; struct folio **folios; unsigned long nr_folios; int compress_type; struct list_head list; }; struct async_chunk { struct btrfs_inode *inode; struct page *locked_page; u64 start; u64 end; blk_opf_t write_flags; struct list_head extents; struct cgroup_subsys_state *blkcg_css; struct btrfs_work work; struct async_cow *async_cow; }; struct async_cow { atomic_t num_chunks; struct async_chunk chunks[]; }; static noinline int add_async_extent(struct async_chunk *cow, u64 start, u64 ram_size, u64 compressed_size, struct folio **folios, unsigned long nr_folios, int compress_type) { struct async_extent *async_extent; async_extent = kmalloc(sizeof(*async_extent), GFP_NOFS); if (!async_extent) return -ENOMEM; async_extent->start = start; async_extent->ram_size = ram_size; async_extent->compressed_size = compressed_size; async_extent->folios = folios; async_extent->nr_folios = nr_folios; async_extent->compress_type = compress_type; list_add_tail(&async_extent->list, &cow->extents); return 0; } /* * Check if the inode needs to be submitted to compression, based on mount * options, defragmentation, properties or heuristics. */ static inline int inode_need_compress(struct btrfs_inode *inode, u64 start, u64 end) { struct btrfs_fs_info *fs_info = inode->root->fs_info; if (!btrfs_inode_can_compress(inode)) { WARN(IS_ENABLED(CONFIG_BTRFS_DEBUG), KERN_ERR "BTRFS: unexpected compression for ino %llu\n", btrfs_ino(inode)); return 0; } /* * Special check for subpage. * * We lock the full page then run each delalloc range in the page, thus * for the following case, we will hit some subpage specific corner case: * * 0 32K 64K * | |///////| |///////| * \- A \- B * * In above case, both range A and range B will try to unlock the full * page [0, 64K), causing the one finished later will have page * unlocked already, triggering various page lock requirement BUG_ON()s. * * So here we add an artificial limit that subpage compression can only * if the range is fully page aligned. * * In theory we only need to ensure the first page is fully covered, but * the tailing partial page will be locked until the full compression * finishes, delaying the write of other range. * * TODO: Make btrfs_run_delalloc_range() to lock all delalloc range * first to prevent any submitted async extent to unlock the full page. * By this, we can ensure for subpage case that only the last async_cow * will unlock the full page. */ if (fs_info->sectorsize < PAGE_SIZE) { if (!PAGE_ALIGNED(start) || !PAGE_ALIGNED(end + 1)) return 0; } /* force compress */ if (btrfs_test_opt(fs_info, FORCE_COMPRESS)) return 1; /* defrag ioctl */ if (inode->defrag_compress) return 1; /* bad compression ratios */ if (inode->flags & BTRFS_INODE_NOCOMPRESS) return 0; if (btrfs_test_opt(fs_info, COMPRESS) || inode->flags & BTRFS_INODE_COMPRESS || inode->prop_compress) return btrfs_compress_heuristic(inode, start, end); return 0; } static inline void inode_should_defrag(struct btrfs_inode *inode, u64 start, u64 end, u64 num_bytes, u32 small_write) { /* If this is a small write inside eof, kick off a defrag */ if (num_bytes < small_write && (start > 0 || end + 1 < inode->disk_i_size)) btrfs_add_inode_defrag(NULL, inode, small_write); } static int extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end) { unsigned long end_index = end >> PAGE_SHIFT; struct page *page; int ret = 0; for (unsigned long index = start >> PAGE_SHIFT; index <= end_index; index++) { page = find_get_page(inode->i_mapping, index); if (unlikely(!page)) { if (!ret) ret = -ENOENT; continue; } clear_page_dirty_for_io(page); put_page(page); } return ret; } /* * Work queue call back to started compression on a file and pages. * * This is done inside an ordered work queue, and the compression is spread * across many cpus. The actual IO submission is step two, and the ordered work * queue takes care of making sure that happens in the same order things were * put onto the queue by writepages and friends. * * If this code finds it can't get good compression, it puts an entry onto the * work queue to write the uncompressed bytes. This makes sure that both * compressed inodes and uncompressed inodes are written in the same order that * the flusher thread sent them down. */ static void compress_file_range(struct btrfs_work *work) { struct async_chunk *async_chunk = container_of(work, struct async_chunk, work); struct btrfs_inode *inode = async_chunk->inode; struct btrfs_fs_info *fs_info = inode->root->fs_info; struct address_space *mapping = inode->vfs_inode.i_mapping; u64 blocksize = fs_info->sectorsize; u64 start = async_chunk->start; u64 end = async_chunk->end; u64 actual_end; u64 i_size; int ret = 0; struct folio **folios; unsigned long nr_folios; unsigned long total_compressed = 0; unsigned long total_in = 0; unsigned int poff; int i; int compress_type = fs_info->compress_type; inode_should_defrag(inode, start, end, end - start + 1, SZ_16K); /* * We need to call clear_page_dirty_for_io on each page in the range. * Otherwise applications with the file mmap'd can wander in and change * the page contents while we are compressing them. */ ret = extent_range_clear_dirty_for_io(&inode->vfs_inode, start, end); /* * All the folios should have been locked thus no failure. * * And even if some folios are missing, btrfs_compress_folios() * would handle them correctly, so here just do an ASSERT() check for * early logic errors. */ ASSERT(ret == 0); /* * We need to save i_size before now because it could change in between * us evaluating the size and assigning it. This is because we lock and * unlock the page in truncate and fallocate, and then modify the i_size * later on. * * The barriers are to emulate READ_ONCE, remove that once i_size_read * does that for us. */ barrier(); i_size = i_size_read(&inode->vfs_inode); barrier(); actual_end = min_t(u64, i_size, end + 1); again: folios = NULL; nr_folios = (end >> PAGE_SHIFT) - (start >> PAGE_SHIFT) + 1; nr_folios = min_t(unsigned long, nr_folios, BTRFS_MAX_COMPRESSED_PAGES); /* * we don't want to send crud past the end of i_size through * compression, that's just a waste of CPU time. So, if the * end of the file is before the start of our current * requested range of bytes, we bail out to the uncompressed * cleanup code that can deal with all of this. * * It isn't really the fastest way to fix things, but this is a * very uncommon corner. */ if (actual_end <= start) goto cleanup_and_bail_uncompressed; total_compressed = actual_end - start; /* * Skip compression for a small file range(<=blocksize) that * isn't an inline extent, since it doesn't save disk space at all. */ if (total_compressed <= blocksize && (start > 0 || end + 1 < inode->disk_i_size)) goto cleanup_and_bail_uncompressed; /* * For subpage case, we require full page alignment for the sector * aligned range. * Thus we must also check against @actual_end, not just @end. */ if (blocksize < PAGE_SIZE) { if (!PAGE_ALIGNED(start) || !PAGE_ALIGNED(round_up(actual_end, blocksize))) goto cleanup_and_bail_uncompressed; } total_compressed = min_t(unsigned long, total_compressed, BTRFS_MAX_UNCOMPRESSED); total_in = 0; ret = 0; /* * We do compression for mount -o compress and when the inode has not * been flagged as NOCOMPRESS. This flag can change at any time if we * discover bad compression ratios. */ if (!inode_need_compress(inode, start, end)) goto cleanup_and_bail_uncompressed; folios = kcalloc(nr_folios, sizeof(struct folio *), GFP_NOFS); if (!folios) { /* * Memory allocation failure is not a fatal error, we can fall * back to uncompressed code. */ goto cleanup_and_bail_uncompressed; } if (inode->defrag_compress) compress_type = inode->defrag_compress; else if (inode->prop_compress) compress_type = inode->prop_compress; /* Compression level is applied here. */ ret = btrfs_compress_folios(compress_type | (fs_info->compress_level << 4), mapping, start, folios, &nr_folios, &total_in, &total_compressed); if (ret) goto mark_incompressible; /* * Zero the tail end of the last page, as we might be sending it down * to disk. */ poff = offset_in_page(total_compressed); if (poff) folio_zero_range(folios[nr_folios - 1], poff, PAGE_SIZE - poff); /* * Try to create an inline extent. * * If we didn't compress the entire range, try to create an uncompressed * inline extent, else a compressed one. * * Check cow_file_range() for why we don't even try to create inline * extent for the subpage case. */ if (total_in < actual_end) ret = cow_file_range_inline(inode, NULL, start, end, 0, BTRFS_COMPRESS_NONE, NULL, false); else ret = cow_file_range_inline(inode, NULL, start, end, total_compressed, compress_type, folios[0], false); if (ret <= 0) { if (ret < 0) mapping_set_error(mapping, -EIO); goto free_pages; } /* * We aren't doing an inline extent. Round the compressed size up to a * block size boundary so the allocator does sane things. */ total_compressed = ALIGN(total_compressed, blocksize); /* * One last check to make sure the compression is really a win, compare * the page count read with the blocks on disk, compression must free at * least one sector. */ total_in = round_up(total_in, fs_info->sectorsize); if (total_compressed + blocksize > total_in) goto mark_incompressible; /* * The async work queues will take care of doing actual allocation on * disk for these compressed pages, and will submit the bios. */ ret = add_async_extent(async_chunk, start, total_in, total_compressed, folios, nr_folios, compress_type); BUG_ON(ret); if (start + total_in < end) { start += total_in; cond_resched(); goto again; } return; mark_incompressible: if (!btrfs_test_opt(fs_info, FORCE_COMPRESS) && !inode->prop_compress) inode->flags |= BTRFS_INODE_NOCOMPRESS; cleanup_and_bail_uncompressed: ret = add_async_extent(async_chunk, start, end - start + 1, 0, NULL, 0, BTRFS_COMPRESS_NONE); BUG_ON(ret); free_pages: if (folios) { for (i = 0; i < nr_folios; i++) { WARN_ON(folios[i]->mapping); btrfs_free_compr_folio(folios[i]); } kfree(folios); } } static void free_async_extent_pages(struct async_extent *async_extent) { int i; if (!async_extent->folios) return; for (i = 0; i < async_extent->nr_folios; i++) { WARN_ON(async_extent->folios[i]->mapping); btrfs_free_compr_folio(async_extent->folios[i]); } kfree(async_extent->folios); async_extent->nr_folios = 0; async_extent->folios = NULL; } static void submit_uncompressed_range(struct btrfs_inode *inode, struct async_extent *async_extent, struct page *locked_page) { u64 start = async_extent->start; u64 end = async_extent->start + async_extent->ram_size - 1; int ret; struct writeback_control wbc = { .sync_mode = WB_SYNC_ALL, .range_start = start, .range_end = end, .no_cgroup_owner = 1, }; wbc_attach_fdatawrite_inode(&wbc, &inode->vfs_inode); ret = run_delalloc_cow(inode, locked_page, start, end, &wbc, false); wbc_detach_inode(&wbc); if (ret < 0) { btrfs_cleanup_ordered_extents(inode, locked_page, start, end - start + 1); if (locked_page) { const u64 page_start = page_offset(locked_page); set_page_writeback(locked_page); end_page_writeback(locked_page); btrfs_mark_ordered_io_finished(inode, locked_page, page_start, PAGE_SIZE, !ret); mapping_set_error(locked_page->mapping, ret); unlock_page(locked_page); } } } static void submit_one_async_extent(struct async_chunk *async_chunk, struct async_extent *async_extent, u64 *alloc_hint) { struct btrfs_inode *inode = async_chunk->inode; struct extent_io_tree *io_tree = &inode->io_tree; struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_ordered_extent *ordered; struct btrfs_file_extent file_extent; struct btrfs_key ins; struct page *locked_page = NULL; struct extent_state *cached = NULL; struct extent_map *em; int ret = 0; u64 start = async_extent->start; u64 end = async_extent->start + async_extent->ram_size - 1; if (async_chunk->blkcg_css) kthread_associate_blkcg(async_chunk->blkcg_css); /* * If async_chunk->locked_page is in the async_extent range, we need to * handle it. */ if (async_chunk->locked_page) { u64 locked_page_start = page_offset(async_chunk->locked_page); u64 locked_page_end = locked_page_start + PAGE_SIZE - 1; if (!(start >= locked_page_end || end <= locked_page_start)) locked_page = async_chunk->locked_page; } if (async_extent->compress_type == BTRFS_COMPRESS_NONE) { submit_uncompressed_range(inode, async_extent, locked_page); goto done; } ret = btrfs_reserve_extent(root, async_extent->ram_size, async_extent->compressed_size, async_extent->compressed_size, 0, *alloc_hint, &ins, 1, 1); if (ret) { /* * We can't reserve contiguous space for the compressed size. * Unlikely, but it's possible that we could have enough * non-contiguous space for the uncompressed size instead. So * fall back to uncompressed. */ submit_uncompressed_range(inode, async_extent, locked_page); goto done; } lock_extent(io_tree, start, end, &cached); /* Here we're doing allocation and writeback of the compressed pages */ file_extent.disk_bytenr = ins.objectid; file_extent.disk_num_bytes = ins.offset; file_extent.ram_bytes = async_extent->ram_size; file_extent.num_bytes = async_extent->ram_size; file_extent.offset = 0; file_extent.compression = async_extent->compress_type; em = btrfs_create_io_em(inode, start, &file_extent, BTRFS_ORDERED_COMPRESSED); if (IS_ERR(em)) { ret = PTR_ERR(em); goto out_free_reserve; } free_extent_map(em); ordered = btrfs_alloc_ordered_extent(inode, start, &file_extent, 1 << BTRFS_ORDERED_COMPRESSED); if (IS_ERR(ordered)) { btrfs_drop_extent_map_range(inode, start, end, false); ret = PTR_ERR(ordered); goto out_free_reserve; } btrfs_dec_block_group_reservations(fs_info, ins.objectid); /* Clear dirty, set writeback and unlock the pages. */ extent_clear_unlock_delalloc(inode, start, end, NULL, &cached, EXTENT_LOCKED | EXTENT_DELALLOC, PAGE_UNLOCK | PAGE_START_WRITEBACK); btrfs_submit_compressed_write(ordered, async_extent->folios, /* compressed_folios */ async_extent->nr_folios, async_chunk->write_flags, true); *alloc_hint = ins.objectid + ins.offset; done: if (async_chunk->blkcg_css) kthread_associate_blkcg(NULL); kfree(async_extent); return; out_free_reserve: btrfs_dec_block_group_reservations(fs_info, ins.objectid); btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1); mapping_set_error(inode->vfs_inode.i_mapping, -EIO); extent_clear_unlock_delalloc(inode, start, end, NULL, &cached, EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW | EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING, PAGE_UNLOCK | PAGE_START_WRITEBACK | PAGE_END_WRITEBACK); free_async_extent_pages(async_extent); if (async_chunk->blkcg_css) kthread_associate_blkcg(NULL); btrfs_debug(fs_info, "async extent submission failed root=%lld inode=%llu start=%llu len=%llu ret=%d", btrfs_root_id(root), btrfs_ino(inode), start, async_extent->ram_size, ret); kfree(async_extent); } u64 btrfs_get_extent_allocation_hint(struct btrfs_inode *inode, u64 start, u64 num_bytes) { struct extent_map_tree *em_tree = &inode->extent_tree; struct extent_map *em; u64 alloc_hint = 0; read_lock(&em_tree->lock); em = search_extent_mapping(em_tree, start, num_bytes); if (em) { /* * if block start isn't an actual block number then find the * first block in this inode and use that as a hint. If that * block is also bogus then just don't worry about it. */ if (em->disk_bytenr >= EXTENT_MAP_LAST_BYTE) { free_extent_map(em); em = search_extent_mapping(em_tree, 0, 0); if (em && em->disk_bytenr < EXTENT_MAP_LAST_BYTE) alloc_hint = extent_map_block_start(em); if (em) free_extent_map(em); } else { alloc_hint = extent_map_block_start(em); free_extent_map(em); } } read_unlock(&em_tree->lock); return alloc_hint; } /* * when extent_io.c finds a delayed allocation range in the file, * the call backs end up in this code. The basic idea is to * allocate extents on disk for the range, and create ordered data structs * in ram to track those extents. * * locked_page is the page that writepage had locked already. We use * it to make sure we don't do extra locks or unlocks. * * When this function fails, it unlocks all pages except @locked_page. * * When this function successfully creates an inline extent, it returns 1 and * unlocks all pages including locked_page and starts I/O on them. * (In reality inline extents are limited to a single page, so locked_page is * the only page handled anyway). * * When this function succeed and creates a normal extent, the page locking * status depends on the passed in flags: * * - If @keep_locked is set, all pages are kept locked. * - Else all pages except for @locked_page are unlocked. * * When a failure happens in the second or later iteration of the * while-loop, the ordered extents created in previous iterations are kept * intact. So, the caller must clean them up by calling * btrfs_cleanup_ordered_extents(). See btrfs_run_delalloc_range() for * example. */ static noinline int cow_file_range(struct btrfs_inode *inode, struct page *locked_page, u64 start, u64 end, u64 *done_offset, bool keep_locked, bool no_inline) { struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; struct extent_state *cached = NULL; u64 alloc_hint = 0; u64 orig_start = start; u64 num_bytes; unsigned long ram_size; u64 cur_alloc_size = 0; u64 min_alloc_size; u64 blocksize = fs_info->sectorsize; struct btrfs_key ins; struct extent_map *em; unsigned clear_bits; unsigned long page_ops; bool extent_reserved = false; int ret = 0; if (btrfs_is_free_space_inode(inode)) { ret = -EINVAL; goto out_unlock; } num_bytes = ALIGN(end - start + 1, blocksize); num_bytes = max(blocksize, num_bytes); ASSERT(num_bytes <= btrfs_super_total_bytes(fs_info->super_copy)); inode_should_defrag(inode, start, end, num_bytes, SZ_64K); if (!no_inline) { /* lets try to make an inline extent */ ret = cow_file_range_inline(inode, locked_page, start, end, 0, BTRFS_COMPRESS_NONE, NULL, false); if (ret <= 0) { /* * We succeeded, return 1 so the caller knows we're done * with this page and already handled the IO. * * If there was an error then cow_file_range_inline() has * already done the cleanup. */ if (ret == 0) ret = 1; goto done; } } alloc_hint = btrfs_get_extent_allocation_hint(inode, start, num_bytes); /* * Relocation relies on the relocated extents to have exactly the same * size as the original extents. Normally writeback for relocation data * extents follows a NOCOW path because relocation preallocates the * extents. However, due to an operation such as scrub turning a block * group to RO mode, it may fallback to COW mode, so we must make sure * an extent allocated during COW has exactly the requested size and can * not be split into smaller extents, otherwise relocation breaks and * fails during the stage where it updates the bytenr of file extent * items. */ if (btrfs_is_data_reloc_root(root)) min_alloc_size = num_bytes; else min_alloc_size = fs_info->sectorsize; while (num_bytes > 0) { struct btrfs_ordered_extent *ordered; struct btrfs_file_extent file_extent; cur_alloc_size = num_bytes; ret = btrfs_reserve_extent(root, cur_alloc_size, cur_alloc_size, min_alloc_size, 0, alloc_hint, &ins, 1, 1); if (ret == -EAGAIN) { /* * btrfs_reserve_extent only returns -EAGAIN for zoned * file systems, which is an indication that there are * no active zones to allocate from at the moment. * * If this is the first loop iteration, wait for at * least one zone to finish before retrying the * allocation. Otherwise ask the caller to write out * the already allocated blocks before coming back to * us, or return -ENOSPC if it can't handle retries. */ ASSERT(btrfs_is_zoned(fs_info)); if (start == orig_start) { wait_on_bit_io(&inode->root->fs_info->flags, BTRFS_FS_NEED_ZONE_FINISH, TASK_UNINTERRUPTIBLE); continue; } if (done_offset) { *done_offset = start - 1; return 0; } ret = -ENOSPC; } if (ret < 0) goto out_unlock; cur_alloc_size = ins.offset; extent_reserved = true; ram_size = ins.offset; file_extent.disk_bytenr = ins.objectid; file_extent.disk_num_bytes = ins.offset; file_extent.num_bytes = ins.offset; file_extent.ram_bytes = ins.offset; file_extent.offset = 0; file_extent.compression = BTRFS_COMPRESS_NONE; lock_extent(&inode->io_tree, start, start + ram_size - 1, &cached); em = btrfs_create_io_em(inode, start, &file_extent, BTRFS_ORDERED_REGULAR); if (IS_ERR(em)) { unlock_extent(&inode->io_tree, start, start + ram_size - 1, &cached); ret = PTR_ERR(em); goto out_reserve; } free_extent_map(em); ordered = btrfs_alloc_ordered_extent(inode, start, &file_extent, 1 << BTRFS_ORDERED_REGULAR); if (IS_ERR(ordered)) { unlock_extent(&inode->io_tree, start, start + ram_size - 1, &cached); ret = PTR_ERR(ordered); goto out_drop_extent_cache; } if (btrfs_is_data_reloc_root(root)) { ret = btrfs_reloc_clone_csums(ordered); /* * Only drop cache here, and process as normal. * * We must not allow extent_clear_unlock_delalloc() * at out_unlock label to free meta of this ordered * extent, as its meta should be freed by * btrfs_finish_ordered_io(). * * So we must continue until @start is increased to * skip current ordered extent. */ if (ret) btrfs_drop_extent_map_range(inode, start, start + ram_size - 1, false); } btrfs_put_ordered_extent(ordered); btrfs_dec_block_group_reservations(fs_info, ins.objectid); /* * We're not doing compressed IO, don't unlock the first page * (which the caller expects to stay locked), don't clear any * dirty bits and don't set any writeback bits * * Do set the Ordered (Private2) bit so we know this page was * properly setup for writepage. */ page_ops = (keep_locked ? 0 : PAGE_UNLOCK); page_ops |= PAGE_SET_ORDERED; extent_clear_unlock_delalloc(inode, start, start + ram_size - 1, locked_page, &cached, EXTENT_LOCKED | EXTENT_DELALLOC, page_ops); if (num_bytes < cur_alloc_size) num_bytes = 0; else num_bytes -= cur_alloc_size; alloc_hint = ins.objectid + ins.offset; start += cur_alloc_size; extent_reserved = false; /* * btrfs_reloc_clone_csums() error, since start is increased * extent_clear_unlock_delalloc() at out_unlock label won't * free metadata of current ordered extent, we're OK to exit. */ if (ret) goto out_unlock; } done: if (done_offset) *done_offset = end; return ret; out_drop_extent_cache: btrfs_drop_extent_map_range(inode, start, start + ram_size - 1, false); out_reserve: btrfs_dec_block_group_reservations(fs_info, ins.objectid); btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1); out_unlock: /* * Now, we have three regions to clean up: * * |-------(1)----|---(2)---|-------------(3)----------| * `- orig_start `- start `- start + cur_alloc_size `- end * * We process each region below. */ clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW | EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV; page_ops = PAGE_UNLOCK | PAGE_START_WRITEBACK | PAGE_END_WRITEBACK; /* * For the range (1). We have already instantiated the ordered extents * for this region. They are cleaned up by * btrfs_cleanup_ordered_extents() in e.g, * btrfs_run_delalloc_range(). EXTENT_LOCKED | EXTENT_DELALLOC are * already cleared in the above loop. And, EXTENT_DELALLOC_NEW | * EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV are handled by the cleanup * function. * * However, in case of @keep_locked, we still need to unlock the pages * (except @locked_page) to ensure all the pages are unlocked. */ if (keep_locked && orig_start < start) { if (!locked_page) mapping_set_error(inode->vfs_inode.i_mapping, ret); extent_clear_unlock_delalloc(inode, orig_start, start - 1, locked_page, NULL, 0, page_ops); } /* * At this point we're unlocked, we want to make sure we're only * clearing these flags under the extent lock, so lock the rest of the * range and clear everything up. */ lock_extent(&inode->io_tree, start, end, NULL); /* * For the range (2). If we reserved an extent for our delalloc range * (or a subrange) and failed to create the respective ordered extent, * then it means that when we reserved the extent we decremented the * extent's size from the data space_info's bytes_may_use counter and * incremented the space_info's bytes_reserved counter by the same * amount. We must make sure extent_clear_unlock_delalloc() does not try * to decrement again the data space_info's bytes_may_use counter, * therefore we do not pass it the flag EXTENT_CLEAR_DATA_RESV. */ if (extent_reserved) { extent_clear_unlock_delalloc(inode, start, start + cur_alloc_size - 1, locked_page, &cached, clear_bits, page_ops); btrfs_qgroup_free_data(inode, NULL, start, cur_alloc_size, NULL); start += cur_alloc_size; } /* * For the range (3). We never touched the region. In addition to the * clear_bits above, we add EXTENT_CLEAR_DATA_RESV to release the data * space_info's bytes_may_use counter, reserved in * btrfs_check_data_free_space(). */ if (start < end) { clear_bits |= EXTENT_CLEAR_DATA_RESV; extent_clear_unlock_delalloc(inode, start, end, locked_page, &cached, clear_bits, page_ops); btrfs_qgroup_free_data(inode, NULL, start, cur_alloc_size, NULL); } return ret; } /* * Phase two of compressed writeback. This is the ordered portion of the code, * which only gets called in the order the work was queued. We walk all the * async extents created by compress_file_range and send them down to the disk. * * If called with @do_free == true then it'll try to finish the work and free * the work struct eventually. */ static noinline void submit_compressed_extents(struct btrfs_work *work, bool do_free) { struct async_chunk *async_chunk = container_of(work, struct async_chunk, work); struct btrfs_fs_info *fs_info = btrfs_work_owner(work); struct async_extent *async_extent; unsigned long nr_pages; u64 alloc_hint = 0; if (do_free) { struct async_cow *async_cow; btrfs_add_delayed_iput(async_chunk->inode); if (async_chunk->blkcg_css) css_put(async_chunk->blkcg_css); async_cow = async_chunk->async_cow; if (atomic_dec_and_test(&async_cow->num_chunks)) kvfree(async_cow); return; } nr_pages = (async_chunk->end - async_chunk->start + PAGE_SIZE) >> PAGE_SHIFT; while (!list_empty(&async_chunk->extents)) { async_extent = list_entry(async_chunk->extents.next, struct async_extent, list); list_del(&async_extent->list); submit_one_async_extent(async_chunk, async_extent, &alloc_hint); } /* atomic_sub_return implies a barrier */ if (atomic_sub_return(nr_pages, &fs_info->async_delalloc_pages) < 5 * SZ_1M) cond_wake_up_nomb(&fs_info->async_submit_wait); } static bool run_delalloc_compressed(struct btrfs_inode *inode, struct page *locked_page, u64 start, u64 end, struct writeback_control *wbc) { struct btrfs_fs_info *fs_info = inode->root->fs_info; struct cgroup_subsys_state *blkcg_css = wbc_blkcg_css(wbc); struct async_cow *ctx; struct async_chunk *async_chunk; unsigned long nr_pages; u64 num_chunks = DIV_ROUND_UP(end - start, SZ_512K); int i; unsigned nofs_flag; const blk_opf_t write_flags = wbc_to_write_flags(wbc); nofs_flag = memalloc_nofs_save(); ctx = kvmalloc(struct_size(ctx, chunks, num_chunks), GFP_KERNEL); memalloc_nofs_restore(nofs_flag); if (!ctx) return false; set_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, &inode->runtime_flags); async_chunk = ctx->chunks; atomic_set(&ctx->num_chunks, num_chunks); for (i = 0; i < num_chunks; i++) { u64 cur_end = min(end, start + SZ_512K - 1); /* * igrab is called higher up in the call chain, take only the * lightweight reference for the callback lifetime */ ihold(&inode->vfs_inode); async_chunk[i].async_cow = ctx; async_chunk[i].inode = inode; async_chunk[i].start = start; async_chunk[i].end = cur_end; async_chunk[i].write_flags = write_flags; INIT_LIST_HEAD(&async_chunk[i].extents); /* * The locked_page comes all the way from writepage and its * the original page we were actually given. As we spread * this large delalloc region across multiple async_chunk * structs, only the first struct needs a pointer to locked_page * * This way we don't need racey decisions about who is supposed * to unlock it. */ if (locked_page) { /* * Depending on the compressibility, the pages might or * might not go through async. We want all of them to * be accounted against wbc once. Let's do it here * before the paths diverge. wbc accounting is used * only for foreign writeback detection and doesn't * need full accuracy. Just account the whole thing * against the first page. */ wbc_account_cgroup_owner(wbc, locked_page, cur_end - start); async_chunk[i].locked_page = locked_page; locked_page = NULL; } else { async_chunk[i].locked_page = NULL; } if (blkcg_css != blkcg_root_css) { css_get(blkcg_css); async_chunk[i].blkcg_css = blkcg_css; async_chunk[i].write_flags |= REQ_BTRFS_CGROUP_PUNT; } else { async_chunk[i].blkcg_css = NULL; } btrfs_init_work(&async_chunk[i].work, compress_file_range, submit_compressed_extents); nr_pages = DIV_ROUND_UP(cur_end - start, PAGE_SIZE); atomic_add(nr_pages, &fs_info->async_delalloc_pages); btrfs_queue_work(fs_info->delalloc_workers, &async_chunk[i].work); start = cur_end + 1; } return true; } /* * Run the delalloc range from start to end, and write back any dirty pages * covered by the range. */ static noinline int run_delalloc_cow(struct btrfs_inode *inode, struct page *locked_page, u64 start, u64 end, struct writeback_control *wbc, bool pages_dirty) { u64 done_offset = end; int ret; while (start <= end) { ret = cow_file_range(inode, locked_page, start, end, &done_offset, true, false); if (ret) return ret; extent_write_locked_range(&inode->vfs_inode, locked_page, start, done_offset, wbc, pages_dirty); start = done_offset + 1; } return 1; } static int fallback_to_cow(struct btrfs_inode *inode, struct page *locked_page, const u64 start, const u64 end) { const bool is_space_ino = btrfs_is_free_space_inode(inode); const bool is_reloc_ino = btrfs_is_data_reloc_root(inode->root); const u64 range_bytes = end + 1 - start; struct extent_io_tree *io_tree = &inode->io_tree; struct extent_state *cached_state = NULL; u64 range_start = start; u64 count; int ret; /* * If EXTENT_NORESERVE is set it means that when the buffered write was * made we had not enough available data space and therefore we did not * reserve data space for it, since we though we could do NOCOW for the * respective file range (either there is prealloc extent or the inode * has the NOCOW bit set). * * However when we need to fallback to COW mode (because for example the * block group for the corresponding extent was turned to RO mode by a * scrub or relocation) we need to do the following: * * 1) We increment the bytes_may_use counter of the data space info. * If COW succeeds, it allocates a new data extent and after doing * that it decrements the space info's bytes_may_use counter and * increments its bytes_reserved counter by the same amount (we do * this at btrfs_add_reserved_bytes()). So we need to increment the * bytes_may_use counter to compensate (when space is reserved at * buffered write time, the bytes_may_use counter is incremented); * * 2) We clear the EXTENT_NORESERVE bit from the range. We do this so * that if the COW path fails for any reason, it decrements (through * extent_clear_unlock_delalloc()) the bytes_may_use counter of the * data space info, which we incremented in the step above. * * If we need to fallback to cow and the inode corresponds to a free * space cache inode or an inode of the data relocation tree, we must * also increment bytes_may_use of the data space_info for the same * reason. Space caches and relocated data extents always get a prealloc * extent for them, however scrub or balance may have set the block * group that contains that extent to RO mode and therefore force COW * when starting writeback. */ lock_extent(io_tree, start, end, &cached_state); count = count_range_bits(io_tree, &range_start, end, range_bytes, EXTENT_NORESERVE, 0, NULL); if (count > 0 || is_space_ino || is_reloc_ino) { u64 bytes = count; struct btrfs_fs_info *fs_info = inode->root->fs_info; struct btrfs_space_info *sinfo = fs_info->data_sinfo; if (is_space_ino || is_reloc_ino) bytes = range_bytes; spin_lock(&sinfo->lock); btrfs_space_info_update_bytes_may_use(fs_info, sinfo, bytes); spin_unlock(&sinfo->lock); if (count > 0) clear_extent_bit(io_tree, start, end, EXTENT_NORESERVE, NULL); } unlock_extent(io_tree, start, end, &cached_state); /* * Don't try to create inline extents, as a mix of inline extent that * is written out and unlocked directly and a normal NOCOW extent * doesn't work. */ ret = cow_file_range(inode, locked_page, start, end, NULL, false, true); ASSERT(ret != 1); return ret; } struct can_nocow_file_extent_args { /* Input fields. */ /* Start file offset of the range we want to NOCOW. */ u64 start; /* End file offset (inclusive) of the range we want to NOCOW. */ u64 end; bool writeback_path; bool strict; /* * Free the path passed to can_nocow_file_extent() once it's not needed * anymore. */ bool free_path; /* * Output fields. Only set when can_nocow_file_extent() returns 1. * The expected file extent for the NOCOW write. */ struct btrfs_file_extent file_extent; }; /* * Check if we can NOCOW the file extent that the path points to. * This function may return with the path released, so the caller should check * if path->nodes[0] is NULL or not if it needs to use the path afterwards. * * Returns: < 0 on error * 0 if we can not NOCOW * 1 if we can NOCOW */ static int can_nocow_file_extent(struct btrfs_path *path, struct btrfs_key *key, struct btrfs_inode *inode, struct can_nocow_file_extent_args *args) { const bool is_freespace_inode = btrfs_is_free_space_inode(inode); struct extent_buffer *leaf = path->nodes[0]; struct btrfs_root *root = inode->root; struct btrfs_file_extent_item *fi; struct btrfs_root *csum_root; u64 io_start; u64 extent_end; u8 extent_type; int can_nocow = 0; int ret = 0; bool nowait = path->nowait; fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); extent_type = btrfs_file_extent_type(leaf, fi); if (extent_type == BTRFS_FILE_EXTENT_INLINE) goto out; if (!(inode->flags & BTRFS_INODE_NODATACOW) && extent_type == BTRFS_FILE_EXTENT_REG) goto out; /* * If the extent was created before the generation where the last snapshot * for its subvolume was created, then this implies the extent is shared, * hence we must COW. */ if (!args->strict && btrfs_file_extent_generation(leaf, fi) <= btrfs_root_last_snapshot(&root->root_item)) goto out; /* An explicit hole, must COW. */ if (btrfs_file_extent_disk_bytenr(leaf, fi) == 0) goto out; /* Compressed/encrypted/encoded extents must be COWed. */ if (btrfs_file_extent_compression(leaf, fi) || btrfs_file_extent_encryption(leaf, fi) || btrfs_file_extent_other_encoding(leaf, fi)) goto out; extent_end = btrfs_file_extent_end(path); args->file_extent.disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); args->file_extent.disk_num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi); args->file_extent.ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi); args->file_extent.offset = btrfs_file_extent_offset(leaf, fi); args->file_extent.compression = btrfs_file_extent_compression(leaf, fi); /* * The following checks can be expensive, as they need to take other * locks and do btree or rbtree searches, so release the path to avoid * blocking other tasks for too long. */ btrfs_release_path(path); ret = btrfs_cross_ref_exist(root, btrfs_ino(inode), key->offset - args->file_extent.offset, args->file_extent.disk_bytenr, args->strict, path); WARN_ON_ONCE(ret > 0 && is_freespace_inode); if (ret != 0) goto out; if (args->free_path) { /* * We don't need the path anymore, plus through the * btrfs_lookup_csums_list() call below we will end up allocating * another path. So free the path to avoid unnecessary extra * memory usage. */ btrfs_free_path(path); path = NULL; } /* If there are pending snapshots for this root, we must COW. */ if (args->writeback_path && !is_freespace_inode && atomic_read(&root->snapshot_force_cow)) goto out; args->file_extent.num_bytes = min(args->end + 1, extent_end) - args->start; args->file_extent.offset += args->start - key->offset; io_start = args->file_extent.disk_bytenr + args->file_extent.offset; /* * Force COW if csums exist in the range. This ensures that csums for a * given extent are either valid or do not exist. */ csum_root = btrfs_csum_root(root->fs_info, io_start); ret = btrfs_lookup_csums_list(csum_root, io_start, io_start + args->file_extent.num_bytes - 1, NULL, nowait); WARN_ON_ONCE(ret > 0 && is_freespace_inode); if (ret != 0) goto out; can_nocow = 1; out: if (args->free_path && path) btrfs_free_path(path); return ret < 0 ? ret : can_nocow; } /* * when nowcow writeback call back. This checks for snapshots or COW copies * of the extents that exist in the file, and COWs the file as required. * * If no cow copies or snapshots exist, we write directly to the existing * blocks on disk */ static noinline int run_delalloc_nocow(struct btrfs_inode *inode, struct page *locked_page, const u64 start, const u64 end) { struct btrfs_fs_info *fs_info = inode->root->fs_info; struct btrfs_root *root = inode->root; struct btrfs_path *path; u64 cow_start = (u64)-1; u64 cur_offset = start; int ret; bool check_prev = true; u64 ino = btrfs_ino(inode); struct can_nocow_file_extent_args nocow_args = { 0 }; /* * Normally on a zoned device we're only doing COW writes, but in case * of relocation on a zoned filesystem serializes I/O so that we're only * writing sequentially and can end up here as well. */ ASSERT(!btrfs_is_zoned(fs_info) || btrfs_is_data_reloc_root(root)); path = btrfs_alloc_path(); if (!path) { ret = -ENOMEM; goto error; } nocow_args.end = end; nocow_args.writeback_path = true; while (cur_offset <= end) { struct btrfs_block_group *nocow_bg = NULL; struct btrfs_ordered_extent *ordered; struct btrfs_key found_key; struct btrfs_file_extent_item *fi; struct extent_buffer *leaf; struct extent_state *cached_state = NULL; u64 extent_end; u64 nocow_end; int extent_type; bool is_prealloc; ret = btrfs_lookup_file_extent(NULL, root, path, ino, cur_offset, 0); if (ret < 0) goto error; /* * If there is no extent for our range when doing the initial * search, then go back to the previous slot as it will be the * one containing the search offset */ if (ret > 0 && path->slots[0] > 0 && check_prev) { leaf = path->nodes[0]; btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0] - 1); if (found_key.objectid == ino && found_key.type == BTRFS_EXTENT_DATA_KEY) path->slots[0]--; } check_prev = false; next_slot: /* Go to next leaf if we have exhausted the current one */ leaf = path->nodes[0]; if (path->slots[0] >= btrfs_header_nritems(leaf)) { ret = btrfs_next_leaf(root, path); if (ret < 0) goto error; if (ret > 0) break; leaf = path->nodes[0]; } btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); /* Didn't find anything for our INO */ if (found_key.objectid > ino) break; /* * Keep searching until we find an EXTENT_ITEM or there are no * more extents for this inode */ if (WARN_ON_ONCE(found_key.objectid < ino) || found_key.type < BTRFS_EXTENT_DATA_KEY) { path->slots[0]++; goto next_slot; } /* Found key is not EXTENT_DATA_KEY or starts after req range */ if (found_key.type > BTRFS_EXTENT_DATA_KEY || found_key.offset > end) break; /* * If the found extent starts after requested offset, then * adjust extent_end to be right before this extent begins */ if (found_key.offset > cur_offset) { extent_end = found_key.offset; extent_type = 0; goto must_cow; } /* * Found extent which begins before our range and potentially * intersect it */ fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); extent_type = btrfs_file_extent_type(leaf, fi); /* If this is triggered then we have a memory corruption. */ ASSERT(extent_type < BTRFS_NR_FILE_EXTENT_TYPES); if (WARN_ON(extent_type >= BTRFS_NR_FILE_EXTENT_TYPES)) { ret = -EUCLEAN; goto error; } extent_end = btrfs_file_extent_end(path); /* * If the extent we got ends before our current offset, skip to * the next extent. */ if (extent_end <= cur_offset) { path->slots[0]++; goto next_slot; } nocow_args.start = cur_offset; ret = can_nocow_file_extent(path, &found_key, inode, &nocow_args); if (ret < 0) goto error; if (ret == 0) goto must_cow; ret = 0; nocow_bg = btrfs_inc_nocow_writers(fs_info, nocow_args.file_extent.disk_bytenr + nocow_args.file_extent.offset); if (!nocow_bg) { must_cow: /* * If we can't perform NOCOW writeback for the range, * then record the beginning of the range that needs to * be COWed. It will be written out before the next * NOCOW range if we find one, or when exiting this * loop. */ if (cow_start == (u64)-1) cow_start = cur_offset; cur_offset = extent_end; if (cur_offset > end) break; if (!path->nodes[0]) continue; path->slots[0]++; goto next_slot; } /* * COW range from cow_start to found_key.offset - 1. As the key * will contain the beginning of the first extent that can be * NOCOW, following one which needs to be COW'ed */ if (cow_start != (u64)-1) { ret = fallback_to_cow(inode, locked_page, cow_start, found_key.offset - 1); cow_start = (u64)-1; if (ret) { btrfs_dec_nocow_writers(nocow_bg); goto error; } } nocow_end = cur_offset + nocow_args.file_extent.num_bytes - 1; lock_extent(&inode->io_tree, cur_offset, nocow_end, &cached_state); is_prealloc = extent_type == BTRFS_FILE_EXTENT_PREALLOC; if (is_prealloc) { struct extent_map *em; em = btrfs_create_io_em(inode, cur_offset, &nocow_args.file_extent, BTRFS_ORDERED_PREALLOC); if (IS_ERR(em)) { unlock_extent(&inode->io_tree, cur_offset, nocow_end, &cached_state); btrfs_dec_nocow_writers(nocow_bg); ret = PTR_ERR(em); goto error; } free_extent_map(em); } ordered = btrfs_alloc_ordered_extent(inode, cur_offset, &nocow_args.file_extent, is_prealloc ? (1 << BTRFS_ORDERED_PREALLOC) : (1 << BTRFS_ORDERED_NOCOW)); btrfs_dec_nocow_writers(nocow_bg); if (IS_ERR(ordered)) { if (is_prealloc) { btrfs_drop_extent_map_range(inode, cur_offset, nocow_end, false); } unlock_extent(&inode->io_tree, cur_offset, nocow_end, &cached_state); ret = PTR_ERR(ordered); goto error; } if (btrfs_is_data_reloc_root(root)) /* * Error handled later, as we must prevent * extent_clear_unlock_delalloc() in error handler * from freeing metadata of created ordered extent. */ ret = btrfs_reloc_clone_csums(ordered); btrfs_put_ordered_extent(ordered); extent_clear_unlock_delalloc(inode, cur_offset, nocow_end, locked_page, &cached_state, EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_CLEAR_DATA_RESV, PAGE_UNLOCK | PAGE_SET_ORDERED); cur_offset = extent_end; /* * btrfs_reloc_clone_csums() error, now we're OK to call error * handler, as metadata for created ordered extent will only * be freed by btrfs_finish_ordered_io(). */ if (ret) goto error; } btrfs_release_path(path); if (cur_offset <= end && cow_start == (u64)-1) cow_start = cur_offset; if (cow_start != (u64)-1) { cur_offset = end; ret = fallback_to_cow(inode, locked_page, cow_start, end); cow_start = (u64)-1; if (ret) goto error; } btrfs_free_path(path); return 0; error: /* * If an error happened while a COW region is outstanding, cur_offset * needs to be reset to cow_start to ensure the COW region is unlocked * as well. */ if (cow_start != (u64)-1) cur_offset = cow_start; /* * We need to lock the extent here because we're clearing DELALLOC and * we're not locked at this point. */ if (cur_offset < end) { struct extent_state *cached = NULL; lock_extent(&inode->io_tree, cur_offset, end, &cached); extent_clear_unlock_delalloc(inode, cur_offset, end, locked_page, &cached, EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING, PAGE_UNLOCK | PAGE_START_WRITEBACK | PAGE_END_WRITEBACK); btrfs_qgroup_free_data(inode, NULL, cur_offset, end - cur_offset + 1, NULL); } btrfs_free_path(path); return ret; } static bool should_nocow(struct btrfs_inode *inode, u64 start, u64 end) { if (inode->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)) { if (inode->defrag_bytes && test_range_bit_exists(&inode->io_tree, start, end, EXTENT_DEFRAG)) return false; return true; } return false; } /* * Function to process delayed allocation (create CoW) for ranges which are * being touched for the first time. */ int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page, u64 start, u64 end, struct writeback_control *wbc) { const bool zoned = btrfs_is_zoned(inode->root->fs_info); int ret; /* * The range must cover part of the @locked_page, or a return of 1 * can confuse the caller. */ ASSERT(!(end <= page_offset(locked_page) || start >= page_offset(locked_page) + PAGE_SIZE)); if (should_nocow(inode, start, end)) { ret = run_delalloc_nocow(inode, locked_page, start, end); goto out; } if (btrfs_inode_can_compress(inode) && inode_need_compress(inode, start, end) && run_delalloc_compressed(inode, locked_page, start, end, wbc)) return 1; if (zoned) ret = run_delalloc_cow(inode, locked_page, start, end, wbc, true); else ret = cow_file_range(inode, locked_page, start, end, NULL, false, false); out: if (ret < 0) btrfs_cleanup_ordered_extents(inode, locked_page, start, end - start + 1); return ret; } void btrfs_split_delalloc_extent(struct btrfs_inode *inode, struct extent_state *orig, u64 split) { struct btrfs_fs_info *fs_info = inode->root->fs_info; u64 size; lockdep_assert_held(&inode->io_tree.lock); /* not delalloc, ignore it */ if (!(orig->state & EXTENT_DELALLOC)) return; size = orig->end - orig->start + 1; if (size > fs_info->max_extent_size) { u32 num_extents; u64 new_size; /* * See the explanation in btrfs_merge_delalloc_extent, the same * applies here, just in reverse. */ new_size = orig->end - split + 1; num_extents = count_max_extents(fs_info, new_size); new_size = split - orig->start; num_extents += count_max_extents(fs_info, new_size); if (count_max_extents(fs_info, size) >= num_extents) return; } spin_lock(&inode->lock); btrfs_mod_outstanding_extents(inode, 1); spin_unlock(&inode->lock); } /* * Handle merged delayed allocation extents so we can keep track of new extents * that are just merged onto old extents, such as when we are doing sequential * writes, so we can properly account for the metadata space we'll need. */ void btrfs_merge_delalloc_extent(struct btrfs_inode *inode, struct extent_state *new, struct extent_state *other) { struct btrfs_fs_info *fs_info = inode->root->fs_info; u64 new_size, old_size; u32 num_extents; lockdep_assert_held(&inode->io_tree.lock); /* not delalloc, ignore it */ if (!(other->state & EXTENT_DELALLOC)) return; if (new->start > other->start) new_size = new->end - other->start + 1; else new_size = other->end - new->start + 1; /* we're not bigger than the max, unreserve the space and go */ if (new_size <= fs_info->max_extent_size) { spin_lock(&inode->lock); btrfs_mod_outstanding_extents(inode, -1); spin_unlock(&inode->lock); return; } /* * We have to add up either side to figure out how many extents were * accounted for before we merged into one big extent. If the number of * extents we accounted for is <= the amount we need for the new range * then we can return, otherwise drop. Think of it like this * * [ 4k][MAX_SIZE] * * So we've grown the extent by a MAX_SIZE extent, this would mean we * need 2 outstanding extents, on one side we have 1 and the other side * we have 1 so they are == and we can return. But in this case * * [MAX_SIZE+4k][MAX_SIZE+4k] * * Each range on their own accounts for 2 extents, but merged together * they are only 3 extents worth of accounting, so we need to drop in * this case. */ old_size = other->end - other->start + 1; num_extents = count_max_extents(fs_info, old_size); old_size = new->end - new->start + 1; num_extents += count_max_extents(fs_info, old_size); if (count_max_extents(fs_info, new_size) >= num_extents) return; spin_lock(&inode->lock); btrfs_mod_outstanding_extents(inode, -1); spin_unlock(&inode->lock); } static void btrfs_add_delalloc_inode(struct btrfs_inode *inode) { struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; spin_lock(&root->delalloc_lock); ASSERT(list_empty(&inode->delalloc_inodes)); list_add_tail(&inode->delalloc_inodes, &root->delalloc_inodes); root->nr_delalloc_inodes++; if (root->nr_delalloc_inodes == 1) { spin_lock(&fs_info->delalloc_root_lock); ASSERT(list_empty(&root->delalloc_root)); list_add_tail(&root->delalloc_root, &fs_info->delalloc_roots); spin_unlock(&fs_info->delalloc_root_lock); } spin_unlock(&root->delalloc_lock); } void btrfs_del_delalloc_inode(struct btrfs_inode *inode) { struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; lockdep_assert_held(&root->delalloc_lock); /* * We may be called after the inode was already deleted from the list, * namely in the transaction abort path btrfs_destroy_delalloc_inodes(), * and then later through btrfs_clear_delalloc_extent() while the inode * still has ->delalloc_bytes > 0. */ if (!list_empty(&inode->delalloc_inodes)) { list_del_init(&inode->delalloc_inodes); root->nr_delalloc_inodes--; if (!root->nr_delalloc_inodes) { ASSERT(list_empty(&root->delalloc_inodes)); spin_lock(&fs_info->delalloc_root_lock); ASSERT(!list_empty(&root->delalloc_root)); list_del_init(&root->delalloc_root); spin_unlock(&fs_info->delalloc_root_lock); } } } /* * Properly track delayed allocation bytes in the inode and to maintain the * list of inodes that have pending delalloc work to be done. */ void btrfs_set_delalloc_extent(struct btrfs_inode *inode, struct extent_state *state, u32 bits) { struct btrfs_fs_info *fs_info = inode->root->fs_info; lockdep_assert_held(&inode->io_tree.lock); if ((bits & EXTENT_DEFRAG) && !(bits & EXTENT_DELALLOC)) WARN_ON(1); /* * set_bit and clear bit hooks normally require _irqsave/restore * but in this case, we are only testing for the DELALLOC * bit, which is only set or cleared with irqs on */ if (!(state->state & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) { u64 len = state->end + 1 - state->start; u64 prev_delalloc_bytes; u32 num_extents = count_max_extents(fs_info, len); spin_lock(&inode->lock); btrfs_mod_outstanding_extents(inode, num_extents); spin_unlock(&inode->lock); /* For sanity tests */ if (btrfs_is_testing(fs_info)) return; percpu_counter_add_batch(&fs_info->delalloc_bytes, len, fs_info->delalloc_batch); spin_lock(&inode->lock); prev_delalloc_bytes = inode->delalloc_bytes; inode->delalloc_bytes += len; if (bits & EXTENT_DEFRAG) inode->defrag_bytes += len; spin_unlock(&inode->lock); /* * We don't need to be under the protection of the inode's lock, * because we are called while holding the inode's io_tree lock * and are therefore protected against concurrent calls of this * function and btrfs_clear_delalloc_extent(). */ if (!btrfs_is_free_space_inode(inode) && prev_delalloc_bytes == 0) btrfs_add_delalloc_inode(inode); } if (!(state->state & EXTENT_DELALLOC_NEW) && (bits & EXTENT_DELALLOC_NEW)) { spin_lock(&inode->lock); inode->new_delalloc_bytes += state->end + 1 - state->start; spin_unlock(&inode->lock); } } /* * Once a range is no longer delalloc this function ensures that proper * accounting happens. */ void btrfs_clear_delalloc_extent(struct btrfs_inode *inode, struct extent_state *state, u32 bits) { struct btrfs_fs_info *fs_info = inode->root->fs_info; u64 len = state->end + 1 - state->start; u32 num_extents = count_max_extents(fs_info, len); lockdep_assert_held(&inode->io_tree.lock); if ((state->state & EXTENT_DEFRAG) && (bits & EXTENT_DEFRAG)) { spin_lock(&inode->lock); inode->defrag_bytes -= len; spin_unlock(&inode->lock); } /* * set_bit and clear bit hooks normally require _irqsave/restore * but in this case, we are only testing for the DELALLOC * bit, which is only set or cleared with irqs on */ if ((state->state & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) { struct btrfs_root *root = inode->root; u64 new_delalloc_bytes; spin_lock(&inode->lock); btrfs_mod_outstanding_extents(inode, -num_extents); spin_unlock(&inode->lock); /* * We don't reserve metadata space for space cache inodes so we * don't need to call delalloc_release_metadata if there is an * error. */ if (bits & EXTENT_CLEAR_META_RESV && root != fs_info->tree_root) btrfs_delalloc_release_metadata(inode, len, true); /* For sanity tests. */ if (btrfs_is_testing(fs_info)) return; if (!btrfs_is_data_reloc_root(root) && !btrfs_is_free_space_inode(inode) && !(state->state & EXTENT_NORESERVE) && (bits & EXTENT_CLEAR_DATA_RESV)) btrfs_free_reserved_data_space_noquota(fs_info, len); percpu_counter_add_batch(&fs_info->delalloc_bytes, -len, fs_info->delalloc_batch); spin_lock(&inode->lock); inode->delalloc_bytes -= len; new_delalloc_bytes = inode->delalloc_bytes; spin_unlock(&inode->lock); /* * We don't need to be under the protection of the inode's lock, * because we are called while holding the inode's io_tree lock * and are therefore protected against concurrent calls of this * function and btrfs_set_delalloc_extent(). */ if (!btrfs_is_free_space_inode(inode) && new_delalloc_bytes == 0) { spin_lock(&root->delalloc_lock); btrfs_del_delalloc_inode(inode); spin_unlock(&root->delalloc_lock); } } if ((state->state & EXTENT_DELALLOC_NEW) && (bits & EXTENT_DELALLOC_NEW)) { spin_lock(&inode->lock); ASSERT(inode->new_delalloc_bytes >= len); inode->new_delalloc_bytes -= len; if (bits & EXTENT_ADD_INODE_BYTES) inode_add_bytes(&inode->vfs_inode, len); spin_unlock(&inode->lock); } } /* * given a list of ordered sums record them in the inode. This happens * at IO completion time based on sums calculated at bio submission time. */ static int add_pending_csums(struct btrfs_trans_handle *trans, struct list_head *list) { struct btrfs_ordered_sum *sum; struct btrfs_root *csum_root = NULL; int ret; list_for_each_entry(sum, list, list) { trans->adding_csums = true; if (!csum_root) csum_root = btrfs_csum_root(trans->fs_info, sum->logical); ret = btrfs_csum_file_blocks(trans, csum_root, sum); trans->adding_csums = false; if (ret) return ret; } return 0; } static int btrfs_find_new_delalloc_bytes(struct btrfs_inode *inode, const u64 start, const u64 len, struct extent_state **cached_state) { u64 search_start = start; const u64 end = start + len - 1; while (search_start < end) { const u64 search_len = end - search_start + 1; struct extent_map *em; u64 em_len; int ret = 0; em = btrfs_get_extent(inode, NULL, search_start, search_len); if (IS_ERR(em)) return PTR_ERR(em); if (em->disk_bytenr != EXTENT_MAP_HOLE) goto next; em_len = em->len; if (em->start < search_start) em_len -= search_start - em->start; if (em_len > search_len) em_len = search_len; ret = set_extent_bit(&inode->io_tree, search_start, search_start + em_len - 1, EXTENT_DELALLOC_NEW, cached_state); next: search_start = extent_map_end(em); free_extent_map(em); if (ret) return ret; } return 0; } int btrfs_set_extent_delalloc(struct btrfs_inode *inode, u64 start, u64 end, unsigned int extra_bits, struct extent_state **cached_state) { WARN_ON(PAGE_ALIGNED(end)); if (start >= i_size_read(&inode->vfs_inode) && !(inode->flags & BTRFS_INODE_PREALLOC)) { /* * There can't be any extents following eof in this case so just * set the delalloc new bit for the range directly. */ extra_bits |= EXTENT_DELALLOC_NEW; } else { int ret; ret = btrfs_find_new_delalloc_bytes(inode, start, end + 1 - start, cached_state); if (ret) return ret; } return set_extent_bit(&inode->io_tree, start, end, EXTENT_DELALLOC | extra_bits, cached_state); } /* see btrfs_writepage_start_hook for details on why this is required */ struct btrfs_writepage_fixup { struct page *page; struct btrfs_inode *inode; struct btrfs_work work; }; static void btrfs_writepage_fixup_worker(struct btrfs_work *work) { struct btrfs_writepage_fixup *fixup = container_of(work, struct btrfs_writepage_fixup, work); struct btrfs_ordered_extent *ordered; struct extent_state *cached_state = NULL; struct extent_changeset *data_reserved = NULL; struct page *page = fixup->page; struct btrfs_inode *inode = fixup->inode; struct btrfs_fs_info *fs_info = inode->root->fs_info; u64 page_start = page_offset(page); u64 page_end = page_offset(page) + PAGE_SIZE - 1; int ret = 0; bool free_delalloc_space = true; /* * This is similar to page_mkwrite, we need to reserve the space before * we take the page lock. */ ret = btrfs_delalloc_reserve_space(inode, &data_reserved, page_start, PAGE_SIZE); again: lock_page(page); /* * Before we queued this fixup, we took a reference on the page. * page->mapping may go NULL, but it shouldn't be moved to a different * address space. */ if (!page->mapping || !PageDirty(page) || !PageChecked(page)) { /* * Unfortunately this is a little tricky, either * * 1) We got here and our page had already been dealt with and * we reserved our space, thus ret == 0, so we need to just * drop our space reservation and bail. This can happen the * first time we come into the fixup worker, or could happen * while waiting for the ordered extent. * 2) Our page was already dealt with, but we happened to get an * ENOSPC above from the btrfs_delalloc_reserve_space. In * this case we obviously don't have anything to release, but * because the page was already dealt with we don't want to * mark the page with an error, so make sure we're resetting * ret to 0. This is why we have this check _before_ the ret * check, because we do not want to have a surprise ENOSPC * when the page was already properly dealt with. */ if (!ret) { btrfs_delalloc_release_extents(inode, PAGE_SIZE); btrfs_delalloc_release_space(inode, data_reserved, page_start, PAGE_SIZE, true); } ret = 0; goto out_page; } /* * We can't mess with the page state unless it is locked, so now that * it is locked bail if we failed to make our space reservation. */ if (ret) goto out_page; lock_extent(&inode->io_tree, page_start, page_end, &cached_state); /* already ordered? We're done */ if (PageOrdered(page)) goto out_reserved; ordered = btrfs_lookup_ordered_range(inode, page_start, PAGE_SIZE); if (ordered) { unlock_extent(&inode->io_tree, page_start, page_end, &cached_state); unlock_page(page); btrfs_start_ordered_extent(ordered); btrfs_put_ordered_extent(ordered); goto again; } ret = btrfs_set_extent_delalloc(inode, page_start, page_end, 0, &cached_state); if (ret) goto out_reserved; /* * Everything went as planned, we're now the owner of a dirty page with * delayed allocation bits set and space reserved for our COW * destination. * * The page was dirty when we started, nothing should have cleaned it. */ BUG_ON(!PageDirty(page)); free_delalloc_space = false; out_reserved: btrfs_delalloc_release_extents(inode, PAGE_SIZE); if (free_delalloc_space) btrfs_delalloc_release_space(inode, data_reserved, page_start, PAGE_SIZE, true); unlock_extent(&inode->io_tree, page_start, page_end, &cached_state); out_page: if (ret) { /* * We hit ENOSPC or other errors. Update the mapping and page * to reflect the errors and clean the page. */ mapping_set_error(page->mapping, ret); btrfs_mark_ordered_io_finished(inode, page, page_start, PAGE_SIZE, !ret); clear_page_dirty_for_io(page); } btrfs_folio_clear_checked(fs_info, page_folio(page), page_start, PAGE_SIZE); unlock_page(page); put_page(page); kfree(fixup); extent_changeset_free(data_reserved); /* * As a precaution, do a delayed iput in case it would be the last iput * that could need flushing space. Recursing back to fixup worker would * deadlock. */ btrfs_add_delayed_iput(inode); } /* * There are a few paths in the higher layers of the kernel that directly * set the page dirty bit without asking the filesystem if it is a * good idea. This causes problems because we want to make sure COW * properly happens and the data=ordered rules are followed. * * In our case any range that doesn't have the ORDERED bit set * hasn't been properly setup for IO. We kick off an async process * to fix it up. The async helper will wait for ordered extents, set * the delalloc bit and make it safe to write the page. */ int btrfs_writepage_cow_fixup(struct page *page) { struct inode *inode = page->mapping->host; struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct btrfs_writepage_fixup *fixup; /* This page has ordered extent covering it already */ if (PageOrdered(page)) return 0; /* * PageChecked is set below when we create a fixup worker for this page, * don't try to create another one if we're already PageChecked() * * The extent_io writepage code will redirty the page if we send back * EAGAIN. */ if (PageChecked(page)) return -EAGAIN; fixup = kzalloc(sizeof(*fixup), GFP_NOFS); if (!fixup) return -EAGAIN; /* * We are already holding a reference to this inode from * write_cache_pages. We need to hold it because the space reservation * takes place outside of the page lock, and we can't trust * page->mapping outside of the page lock. */ ihold(inode); btrfs_folio_set_checked(fs_info, page_folio(page), page_offset(page), PAGE_SIZE); get_page(page); btrfs_init_work(&fixup->work, btrfs_writepage_fixup_worker, NULL); fixup->page = page; fixup->inode = BTRFS_I(inode); btrfs_queue_work(fs_info->fixup_workers, &fixup->work); return -EAGAIN; } static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, u64 file_pos, struct btrfs_file_extent_item *stack_fi, const bool update_inode_bytes, u64 qgroup_reserved) { struct btrfs_root *root = inode->root; const u64 sectorsize = root->fs_info->sectorsize; struct btrfs_path *path; struct extent_buffer *leaf; struct btrfs_key ins; u64 disk_num_bytes = btrfs_stack_file_extent_disk_num_bytes(stack_fi); u64 disk_bytenr = btrfs_stack_file_extent_disk_bytenr(stack_fi); u64 offset = btrfs_stack_file_extent_offset(stack_fi); u64 num_bytes = btrfs_stack_file_extent_num_bytes(stack_fi); u64 ram_bytes = btrfs_stack_file_extent_ram_bytes(stack_fi); struct btrfs_drop_extents_args drop_args = { 0 }; int ret; path = btrfs_alloc_path(); if (!path) return -ENOMEM; /* * we may be replacing one extent in the tree with another. * The new extent is pinned in the extent map, and we don't want * to drop it from the cache until it is completely in the btree. * * So, tell btrfs_drop_extents to leave this extent in the cache. * the caller is expected to unpin it and allow it to be merged * with the others. */ drop_args.path = path; drop_args.start = file_pos; drop_args.end = file_pos + num_bytes; drop_args.replace_extent = true; drop_args.extent_item_size = sizeof(*stack_fi); ret = btrfs_drop_extents(trans, root, inode, &drop_args); if (ret) goto out; if (!drop_args.extent_inserted) { ins.objectid = btrfs_ino(inode); ins.offset = file_pos; ins.type = BTRFS_EXTENT_DATA_KEY; ret = btrfs_insert_empty_item(trans, root, path, &ins, sizeof(*stack_fi)); if (ret) goto out; } leaf = path->nodes[0]; btrfs_set_stack_file_extent_generation(stack_fi, trans->transid); write_extent_buffer(leaf, stack_fi, btrfs_item_ptr_offset(leaf, path->slots[0]), sizeof(struct btrfs_file_extent_item)); btrfs_mark_buffer_dirty(trans, leaf); btrfs_release_path(path); /* * If we dropped an inline extent here, we know the range where it is * was not marked with the EXTENT_DELALLOC_NEW bit, so we update the * number of bytes only for that range containing the inline extent. * The remaining of the range will be processed when clearning the * EXTENT_DELALLOC_BIT bit through the ordered extent completion. */ if (file_pos == 0 && !IS_ALIGNED(drop_args.bytes_found, sectorsize)) { u64 inline_size = round_down(drop_args.bytes_found, sectorsize); inline_size = drop_args.bytes_found - inline_size; btrfs_update_inode_bytes(inode, sectorsize, inline_size); drop_args.bytes_found -= inline_size; num_bytes -= sectorsize; } if (update_inode_bytes) btrfs_update_inode_bytes(inode, num_bytes, drop_args.bytes_found); ins.objectid = disk_bytenr; ins.offset = disk_num_bytes; ins.type = BTRFS_EXTENT_ITEM_KEY; ret = btrfs_inode_set_file_extent_range(inode, file_pos, ram_bytes); if (ret) goto out; ret = btrfs_alloc_reserved_file_extent(trans, root, btrfs_ino(inode), file_pos - offset, qgroup_reserved, &ins); out: btrfs_free_path(path); return ret; } static void btrfs_release_delalloc_bytes(struct btrfs_fs_info *fs_info, u64 start, u64 len) { struct btrfs_block_group *cache; cache = btrfs_lookup_block_group(fs_info, start); ASSERT(cache); spin_lock(&cache->lock); cache->delalloc_bytes -= len; spin_unlock(&cache->lock); btrfs_put_block_group(cache); } static int insert_ordered_extent_file_extent(struct btrfs_trans_handle *trans, struct btrfs_ordered_extent *oe) { struct btrfs_file_extent_item stack_fi; bool update_inode_bytes; u64 num_bytes = oe->num_bytes; u64 ram_bytes = oe->ram_bytes; memset(&stack_fi, 0, sizeof(stack_fi)); btrfs_set_stack_file_extent_type(&stack_fi, BTRFS_FILE_EXTENT_REG); btrfs_set_stack_file_extent_disk_bytenr(&stack_fi, oe->disk_bytenr); btrfs_set_stack_file_extent_disk_num_bytes(&stack_fi, oe->disk_num_bytes); btrfs_set_stack_file_extent_offset(&stack_fi, oe->offset); if (test_bit(BTRFS_ORDERED_TRUNCATED, &oe->flags)) num_bytes = oe->truncated_len; btrfs_set_stack_file_extent_num_bytes(&stack_fi, num_bytes); btrfs_set_stack_file_extent_ram_bytes(&stack_fi, ram_bytes); btrfs_set_stack_file_extent_compression(&stack_fi, oe->compress_type); /* Encryption and other encoding is reserved and all 0 */ /* * For delalloc, when completing an ordered extent we update the inode's * bytes when clearing the range in the inode's io tree, so pass false * as the argument 'update_inode_bytes' to insert_reserved_file_extent(), * except if the ordered extent was truncated. */ update_inode_bytes = test_bit(BTRFS_ORDERED_DIRECT, &oe->flags) || test_bit(BTRFS_ORDERED_ENCODED, &oe->flags) || test_bit(BTRFS_ORDERED_TRUNCATED, &oe->flags); return insert_reserved_file_extent(trans, oe->inode, oe->file_offset, &stack_fi, update_inode_bytes, oe->qgroup_rsv); } /* * As ordered data IO finishes, this gets called so we can finish * an ordered extent if the range of bytes in the file it covers are * fully written. */ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent) { struct btrfs_inode *inode = ordered_extent->inode; struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_trans_handle *trans = NULL; struct extent_io_tree *io_tree = &inode->io_tree; struct extent_state *cached_state = NULL; u64 start, end; int compress_type = 0; int ret = 0; u64 logical_len = ordered_extent->num_bytes; bool freespace_inode; bool truncated = false; bool clear_reserved_extent = true; unsigned int clear_bits = EXTENT_DEFRAG; start = ordered_extent->file_offset; end = start + ordered_extent->num_bytes - 1; if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) && !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags) && !test_bit(BTRFS_ORDERED_DIRECT, &ordered_extent->flags) && !test_bit(BTRFS_ORDERED_ENCODED, &ordered_extent->flags)) clear_bits |= EXTENT_DELALLOC_NEW; freespace_inode = btrfs_is_free_space_inode(inode); if (!freespace_inode) btrfs_lockdep_acquire(fs_info, btrfs_ordered_extent); if (test_bit(BTRFS_ORDERED_IOERR, &ordered_extent->flags)) { ret = -EIO; goto out; } if (btrfs_is_zoned(fs_info)) btrfs_zone_finish_endio(fs_info, ordered_extent->disk_bytenr, ordered_extent->disk_num_bytes); if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags)) { truncated = true; logical_len = ordered_extent->truncated_len; /* Truncated the entire extent, don't bother adding */ if (!logical_len) goto out; } if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) { BUG_ON(!list_empty(&ordered_extent->list)); /* Logic error */ btrfs_inode_safe_disk_i_size_write(inode, 0); if (freespace_inode) trans = btrfs_join_transaction_spacecache(root); else trans = btrfs_join_transaction(root); if (IS_ERR(trans)) { ret = PTR_ERR(trans); trans = NULL; goto out; } trans->block_rsv = &inode->block_rsv; ret = btrfs_update_inode_fallback(trans, inode); if (ret) /* -ENOMEM or corruption */ btrfs_abort_transaction(trans, ret); goto out; } clear_bits |= EXTENT_LOCKED; lock_extent(io_tree, start, end, &cached_state); if (freespace_inode) trans = btrfs_join_transaction_spacecache(root); else trans = btrfs_join_transaction(root); if (IS_ERR(trans)) { ret = PTR_ERR(trans); trans = NULL; goto out; } trans->block_rsv = &inode->block_rsv; ret = btrfs_insert_raid_extent(trans, ordered_extent); if (ret) goto out; if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags)) compress_type = ordered_extent->compress_type; if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) { BUG_ON(compress_type); ret = btrfs_mark_extent_written(trans, inode, ordered_extent->file_offset, ordered_extent->file_offset + logical_len); btrfs_zoned_release_data_reloc_bg(fs_info, ordered_extent->disk_bytenr, ordered_extent->disk_num_bytes); } else { BUG_ON(root == fs_info->tree_root); ret = insert_ordered_extent_file_extent(trans, ordered_extent); if (!ret) { clear_reserved_extent = false; btrfs_release_delalloc_bytes(fs_info, ordered_extent->disk_bytenr, ordered_extent->disk_num_bytes); } } if (ret < 0) { btrfs_abort_transaction(trans, ret); goto out; } ret = unpin_extent_cache(inode, ordered_extent->file_offset, ordered_extent->num_bytes, trans->transid); if (ret < 0) { btrfs_abort_transaction(trans, ret); goto out; } ret = add_pending_csums(trans, &ordered_extent->list); if (ret) { btrfs_abort_transaction(trans, ret); goto out; } /* * If this is a new delalloc range, clear its new delalloc flag to * update the inode's number of bytes. This needs to be done first * before updating the inode item. */ if ((clear_bits & EXTENT_DELALLOC_NEW) && !test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags)) clear_extent_bit(&inode->io_tree, start, end, EXTENT_DELALLOC_NEW | EXTENT_ADD_INODE_BYTES, &cached_state); btrfs_inode_safe_disk_i_size_write(inode, 0); ret = btrfs_update_inode_fallback(trans, inode); if (ret) { /* -ENOMEM or corruption */ btrfs_abort_transaction(trans, ret); goto out; } out: clear_extent_bit(&inode->io_tree, start, end, clear_bits, &cached_state); if (trans) btrfs_end_transaction(trans); if (ret || truncated) { u64 unwritten_start = start; /* * If we failed to finish this ordered extent for any reason we * need to make sure BTRFS_ORDERED_IOERR is set on the ordered * extent, and mark the inode with the error if it wasn't * already set. Any error during writeback would have already * set the mapping error, so we need to set it if we're the ones * marking this ordered extent as failed. */ if (ret) btrfs_mark_ordered_extent_error(ordered_extent); if (truncated) unwritten_start += logical_len; clear_extent_uptodate(io_tree, unwritten_start, end, NULL); /* * Drop extent maps for the part of the extent we didn't write. * * We have an exception here for the free_space_inode, this is * because when we do btrfs_get_extent() on the free space inode * we will search the commit root. If this is a new block group * we won't find anything, and we will trip over the assert in * writepage where we do ASSERT(em->block_start != * EXTENT_MAP_HOLE). * * Theoretically we could also skip this for any NOCOW extent as * we don't mess with the extent map tree in the NOCOW case, but * for now simply skip this if we are the free space inode. */ if (!btrfs_is_free_space_inode(inode)) btrfs_drop_extent_map_range(inode, unwritten_start, end, false); /* * If the ordered extent had an IOERR or something else went * wrong we need to return the space for this ordered extent * back to the allocator. We only free the extent in the * truncated case if we didn't write out the extent at all. * * If we made it past insert_reserved_file_extent before we * errored out then we don't need to do this as the accounting * has already been done. */ if ((ret || !logical_len) && clear_reserved_extent && !test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) && !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) { /* * Discard the range before returning it back to the * free space pool */ if (ret && btrfs_test_opt(fs_info, DISCARD_SYNC)) btrfs_discard_extent(fs_info, ordered_extent->disk_bytenr, ordered_extent->disk_num_bytes, NULL); btrfs_free_reserved_extent(fs_info, ordered_extent->disk_bytenr, ordered_extent->disk_num_bytes, 1); /* * Actually free the qgroup rsv which was released when * the ordered extent was created. */ btrfs_qgroup_free_refroot(fs_info, btrfs_root_id(inode->root), ordered_extent->qgroup_rsv, BTRFS_QGROUP_RSV_DATA); } } /* * This needs to be done to make sure anybody waiting knows we are done * updating everything for this ordered extent. */ btrfs_remove_ordered_extent(inode, ordered_extent); /* once for us */ btrfs_put_ordered_extent(ordered_extent); /* once for the tree */ btrfs_put_ordered_extent(ordered_extent); return ret; } int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered) { if (btrfs_is_zoned(ordered->inode->root->fs_info) && !test_bit(BTRFS_ORDERED_IOERR, &ordered->flags) && list_empty(&ordered->bioc_list)) btrfs_finish_ordered_zoned(ordered); return btrfs_finish_one_ordered(ordered); } /* * Verify the checksum for a single sector without any extra action that depend * on the type of I/O. */ int btrfs_check_sector_csum(struct btrfs_fs_info *fs_info, struct page *page, u32 pgoff, u8 *csum, const u8 * const csum_expected) { SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); char *kaddr; ASSERT(pgoff + fs_info->sectorsize <= PAGE_SIZE); shash->tfm = fs_info->csum_shash; kaddr = kmap_local_page(page) + pgoff; crypto_shash_digest(shash, kaddr, fs_info->sectorsize, csum); kunmap_local(kaddr); if (memcmp(csum, csum_expected, fs_info->csum_size)) return -EIO; return 0; } /* * Verify the checksum of a single data sector. * * @bbio: btrfs_io_bio which contains the csum * @dev: device the sector is on * @bio_offset: offset to the beginning of the bio (in bytes) * @bv: bio_vec to check * * Check if the checksum on a data block is valid. When a checksum mismatch is * detected, report the error and fill the corrupted range with zero. * * Return %true if the sector is ok or had no checksum to start with, else %false. */ bool btrfs_data_csum_ok(struct btrfs_bio *bbio, struct btrfs_device *dev, u32 bio_offset, struct bio_vec *bv) { struct btrfs_inode *inode = bbio->inode; struct btrfs_fs_info *fs_info = inode->root->fs_info; u64 file_offset = bbio->file_offset + bio_offset; u64 end = file_offset + bv->bv_len - 1; u8 *csum_expected; u8 csum[BTRFS_CSUM_SIZE]; ASSERT(bv->bv_len == fs_info->sectorsize); if (!bbio->csum) return true; if (btrfs_is_data_reloc_root(inode->root) && test_range_bit(&inode->io_tree, file_offset, end, EXTENT_NODATASUM, NULL)) { /* Skip the range without csum for data reloc inode */ clear_extent_bits(&inode->io_tree, file_offset, end, EXTENT_NODATASUM); return true; } csum_expected = bbio->csum + (bio_offset >> fs_info->sectorsize_bits) * fs_info->csum_size; if (btrfs_check_sector_csum(fs_info, bv->bv_page, bv->bv_offset, csum, csum_expected)) goto zeroit; return true; zeroit: btrfs_print_data_csum_error(inode, file_offset, csum, csum_expected, bbio->mirror_num); if (dev) btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS); memzero_bvec(bv); return false; } /* * Perform a delayed iput on @inode. * * @inode: The inode we want to perform iput on * * This function uses the generic vfs_inode::i_count to track whether we should * just decrement it (in case it's > 1) or if this is the last iput then link * the inode to the delayed iput machinery. Delayed iputs are processed at * transaction commit time/superblock commit/cleaner kthread. */ void btrfs_add_delayed_iput(struct btrfs_inode *inode) { struct btrfs_fs_info *fs_info = inode->root->fs_info; unsigned long flags; if (atomic_add_unless(&inode->vfs_inode.i_count, -1, 1)) return; atomic_inc(&fs_info->nr_delayed_iputs); /* * Need to be irq safe here because we can be called from either an irq * context (see bio.c and btrfs_put_ordered_extent()) or a non-irq * context. */ spin_lock_irqsave(&fs_info->delayed_iput_lock, flags); ASSERT(list_empty(&inode->delayed_iput)); list_add_tail(&inode->delayed_iput, &fs_info->delayed_iputs); spin_unlock_irqrestore(&fs_info->delayed_iput_lock, flags); if (!test_bit(BTRFS_FS_CLEANER_RUNNING, &fs_info->flags)) wake_up_process(fs_info->cleaner_kthread); } static void run_delayed_iput_locked(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode) { list_del_init(&inode->delayed_iput); spin_unlock_irq(&fs_info->delayed_iput_lock); iput(&inode->vfs_inode); if (atomic_dec_and_test(&fs_info->nr_delayed_iputs)) wake_up(&fs_info->delayed_iputs_wait); spin_lock_irq(&fs_info->delayed_iput_lock); } static void btrfs_run_delayed_iput(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode) { if (!list_empty(&inode->delayed_iput)) { spin_lock_irq(&fs_info->delayed_iput_lock); if (!list_empty(&inode->delayed_iput)) run_delayed_iput_locked(fs_info, inode); spin_unlock_irq(&fs_info->delayed_iput_lock); } } void btrfs_run_delayed_iputs(struct btrfs_fs_info *fs_info) { /* * btrfs_put_ordered_extent() can run in irq context (see bio.c), which * calls btrfs_add_delayed_iput() and that needs to lock * fs_info->delayed_iput_lock. So we need to disable irqs here to * prevent a deadlock. */ spin_lock_irq(&fs_info->delayed_iput_lock); while (!list_empty(&fs_info->delayed_iputs)) { struct btrfs_inode *inode; inode = list_first_entry(&fs_info->delayed_iputs, struct btrfs_inode, delayed_iput); run_delayed_iput_locked(fs_info, inode); if (need_resched()) { spin_unlock_irq(&fs_info->delayed_iput_lock); cond_resched(); spin_lock_irq(&fs_info->delayed_iput_lock); } } spin_unlock_irq(&fs_info->delayed_iput_lock); } /* * Wait for flushing all delayed iputs * * @fs_info: the filesystem * * This will wait on any delayed iputs that are currently running with KILLABLE * set. Once they are all done running we will return, unless we are killed in * which case we return EINTR. This helps in user operations like fallocate etc * that might get blocked on the iputs. * * Return EINTR if we were killed, 0 if nothing's pending */ int btrfs_wait_on_delayed_iputs(struct btrfs_fs_info *fs_info) { int ret = wait_event_killable(fs_info->delayed_iputs_wait, atomic_read(&fs_info->nr_delayed_iputs) == 0); if (ret) return -EINTR; return 0; } /* * This creates an orphan entry for the given inode in case something goes wrong * in the middle of an unlink. */ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct btrfs_inode *inode) { int ret; ret = btrfs_insert_orphan_item(trans, inode->root, btrfs_ino(inode)); if (ret && ret != -EEXIST) { btrfs_abort_transaction(trans, ret); return ret; } return 0; } /* * We have done the delete so we can go ahead and remove the orphan item for * this particular inode. */ static int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct btrfs_inode *inode) { return btrfs_del_orphan_item(trans, inode->root, btrfs_ino(inode)); } /* * this cleans up any orphans that may be left on the list from the last use * of this root. */ int btrfs_orphan_cleanup(struct btrfs_root *root) { struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_path *path; struct extent_buffer *leaf; struct btrfs_key key, found_key; struct btrfs_trans_handle *trans; struct inode *inode; u64 last_objectid = 0; int ret = 0, nr_unlink = 0; if (test_and_set_bit(BTRFS_ROOT_ORPHAN_CLEANUP, &root->state)) return 0; path = btrfs_alloc_path(); if (!path) { ret = -ENOMEM; goto out; } path->reada = READA_BACK; key.objectid = BTRFS_ORPHAN_OBJECTID; key.type = BTRFS_ORPHAN_ITEM_KEY; key.offset = (u64)-1; while (1) { ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) goto out; /* * if ret == 0 means we found what we were searching for, which * is weird, but possible, so only screw with path if we didn't * find the key and see if we have stuff that matches */ if (ret > 0) { ret = 0; if (path->slots[0] == 0) break; path->slots[0]--; } /* pull out the item */ leaf = path->nodes[0]; btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); /* make sure the item matches what we want */ if (found_key.objectid != BTRFS_ORPHAN_OBJECTID) break; if (found_key.type != BTRFS_ORPHAN_ITEM_KEY) break; /* release the path since we're done with it */ btrfs_release_path(path); /* * this is where we are basically btrfs_lookup, without the * crossing root thing. we store the inode number in the * offset of the orphan item. */ if (found_key.offset == last_objectid) { /* * We found the same inode as before. This means we were * not able to remove its items via eviction triggered * by an iput(). A transaction abort may have happened, * due to -ENOSPC for example, so try to grab the error * that lead to a transaction abort, if any. */ btrfs_err(fs_info, "Error removing orphan entry, stopping orphan cleanup"); ret = BTRFS_FS_ERROR(fs_info) ?: -EINVAL; goto out; } last_objectid = found_key.offset; found_key.objectid = found_key.offset; found_key.type = BTRFS_INODE_ITEM_KEY; found_key.offset = 0; inode = btrfs_iget(last_objectid, root); if (IS_ERR(inode)) { ret = PTR_ERR(inode); inode = NULL; if (ret != -ENOENT) goto out; } if (!inode && root == fs_info->tree_root) { struct btrfs_root *dead_root; int is_dead_root = 0; /* * This is an orphan in the tree root. Currently these * could come from 2 sources: * a) a root (snapshot/subvolume) deletion in progress * b) a free space cache inode * We need to distinguish those two, as the orphan item * for a root must not get deleted before the deletion * of the snapshot/subvolume's tree completes. * * btrfs_find_orphan_roots() ran before us, which has * found all deleted roots and loaded them into * fs_info->fs_roots_radix. So here we can find if an * orphan item corresponds to a deleted root by looking * up the root from that radix tree. */ spin_lock(&fs_info->fs_roots_radix_lock); dead_root = radix_tree_lookup(&fs_info->fs_roots_radix, (unsigned long)found_key.objectid); if (dead_root && btrfs_root_refs(&dead_root->root_item) == 0) is_dead_root = 1; spin_unlock(&fs_info->fs_roots_radix_lock); if (is_dead_root) { /* prevent this orphan from being found again */ key.offset = found_key.objectid - 1; continue; } } /* * If we have an inode with links, there are a couple of * possibilities: * * 1. We were halfway through creating fsverity metadata for the * file. In that case, the orphan item represents incomplete * fsverity metadata which must be cleaned up with * btrfs_drop_verity_items and deleting the orphan item. * 2. Old kernels (before v3.12) used to create an * orphan item for truncate indicating that there were possibly * extent items past i_size that needed to be deleted. In v3.12, * truncate was changed to update i_size in sync with the extent * items, but the (useless) orphan item was still created. Since * v4.18, we don't create the orphan item for truncate at all. * * So, this item could mean that we need to do a truncate, but * only if this filesystem was last used on a pre-v3.12 kernel * and was not cleanly unmounted. The odds of that are quite * slim, and it's a pain to do the truncate now, so just delete * the orphan item. * * It's also possible that this orphan item was supposed to be * deleted but wasn't. The inode number may have been reused, * but either way, we can delete the orphan item. */ if (!inode || inode->i_nlink) { if (inode) { ret = btrfs_drop_verity_items(BTRFS_I(inode)); iput(inode); inode = NULL; if (ret) goto out; } trans = btrfs_start_transaction(root, 1); if (IS_ERR(trans)) { ret = PTR_ERR(trans); goto out; } btrfs_debug(fs_info, "auto deleting %Lu", found_key.objectid); ret = btrfs_del_orphan_item(trans, root, found_key.objectid); btrfs_end_transaction(trans); if (ret) goto out; continue; } nr_unlink++; /* this will do delete_inode and everything for us */ iput(inode); } /* release the path since we're done with it */ btrfs_release_path(path); if (test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state)) { trans = btrfs_join_transaction(root); if (!IS_ERR(trans)) btrfs_end_transaction(trans); } if (nr_unlink) btrfs_debug(fs_info, "unlinked %d orphans", nr_unlink); out: if (ret) btrfs_err(fs_info, "could not do orphan cleanup %d", ret); btrfs_free_path(path); return ret; } /* * very simple check to peek ahead in the leaf looking for xattrs. If we * don't find any xattrs, we know there can't be any acls. * * slot is the slot the inode is in, objectid is the objectid of the inode */ static noinline int acls_after_inode_item(struct extent_buffer *leaf, int slot, u64 objectid, int *first_xattr_slot) { u32 nritems = btrfs_header_nritems(leaf); struct btrfs_key found_key; static u64 xattr_access = 0; static u64 xattr_default = 0; int scanned = 0; if (!xattr_access) { xattr_access = btrfs_name_hash(XATTR_NAME_POSIX_ACL_ACCESS, strlen(XATTR_NAME_POSIX_ACL_ACCESS)); xattr_default = btrfs_name_hash(XATTR_NAME_POSIX_ACL_DEFAULT, strlen(XATTR_NAME_POSIX_ACL_DEFAULT)); } slot++; *first_xattr_slot = -1; while (slot < nritems) { btrfs_item_key_to_cpu(leaf, &found_key, slot); /* we found a different objectid, there must not be acls */ if (found_key.objectid != objectid) return 0; /* we found an xattr, assume we've got an acl */ if (found_key.type == BTRFS_XATTR_ITEM_KEY) { if (*first_xattr_slot == -1) *first_xattr_slot = slot; if (found_key.offset == xattr_access || found_key.offset == xattr_default) return 1; } /* * we found a key greater than an xattr key, there can't * be any acls later on */ if (found_key.type > BTRFS_XATTR_ITEM_KEY) return 0; slot++; scanned++; /* * it goes inode, inode backrefs, xattrs, extents, * so if there are a ton of hard links to an inode there can * be a lot of backrefs. Don't waste time searching too hard, * this is just an optimization */ if (scanned >= 8) break; } /* we hit the end of the leaf before we found an xattr or * something larger than an xattr. We have to assume the inode * has acls */ if (*first_xattr_slot == -1) *first_xattr_slot = slot; return 1; } static int btrfs_init_file_extent_tree(struct btrfs_inode *inode) { struct btrfs_fs_info *fs_info = inode->root->fs_info; if (WARN_ON_ONCE(inode->file_extent_tree)) return 0; if (btrfs_fs_incompat(fs_info, NO_HOLES)) return 0; if (!S_ISREG(inode->vfs_inode.i_mode)) return 0; if (btrfs_is_free_space_inode(inode)) return 0; inode->file_extent_tree = kmalloc(sizeof(struct extent_io_tree), GFP_KERNEL); if (!inode->file_extent_tree) return -ENOMEM; extent_io_tree_init(fs_info, inode->file_extent_tree, IO_TREE_INODE_FILE_EXTENT); /* Lockdep class is set only for the file extent tree. */ lockdep_set_class(&inode->file_extent_tree->lock, &file_extent_tree_class); return 0; } /* * read an inode from the btree into the in-memory inode */ static int btrfs_read_locked_inode(struct inode *inode, struct btrfs_path *in_path) { struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct btrfs_path *path = in_path; struct extent_buffer *leaf; struct btrfs_inode_item *inode_item; struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_key location; unsigned long ptr; int maybe_acls; u32 rdev; int ret; bool filled = false; int first_xattr_slot; ret = btrfs_init_file_extent_tree(BTRFS_I(inode)); if (ret) return ret; ret = btrfs_fill_inode(inode, &rdev); if (!ret) filled = true; if (!path) { path = btrfs_alloc_path(); if (!path) return -ENOMEM; } btrfs_get_inode_key(BTRFS_I(inode), &location); ret = btrfs_lookup_inode(NULL, root, path, &location, 0); if (ret) { if (path != in_path) btrfs_free_path(path); return ret; } leaf = path->nodes[0]; if (filled) goto cache_index; inode_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item); inode->i_mode = btrfs_inode_mode(leaf, inode_item); set_nlink(inode, btrfs_inode_nlink(leaf, inode_item)); i_uid_write(inode, btrfs_inode_uid(leaf, inode_item)); i_gid_write(inode, btrfs_inode_gid(leaf, inode_item)); btrfs_i_size_write(BTRFS_I(inode), btrfs_inode_size(leaf, inode_item)); btrfs_inode_set_file_extent_range(BTRFS_I(inode), 0, round_up(i_size_read(inode), fs_info->sectorsize)); inode_set_atime(inode, btrfs_timespec_sec(leaf, &inode_item->atime), btrfs_timespec_nsec(leaf, &inode_item->atime)); inode_set_mtime(inode, btrfs_timespec_sec(leaf, &inode_item->mtime), btrfs_timespec_nsec(leaf, &inode_item->mtime)); inode_set_ctime(inode, btrfs_timespec_sec(leaf, &inode_item->ctime), btrfs_timespec_nsec(leaf, &inode_item->ctime)); BTRFS_I(inode)->i_otime_sec = btrfs_timespec_sec(leaf, &inode_item->otime); BTRFS_I(inode)->i_otime_nsec = btrfs_timespec_nsec(leaf, &inode_item->otime); inode_set_bytes(inode, btrfs_inode_nbytes(leaf, inode_item)); BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item); BTRFS_I(inode)->last_trans = btrfs_inode_transid(leaf, inode_item); inode_set_iversion_queried(inode, btrfs_inode_sequence(leaf, inode_item)); inode->i_generation = BTRFS_I(inode)->generation; inode->i_rdev = 0; rdev = btrfs_inode_rdev(leaf, inode_item); if (S_ISDIR(inode->i_mode)) BTRFS_I(inode)->index_cnt = (u64)-1; btrfs_inode_split_flags(btrfs_inode_flags(leaf, inode_item), &BTRFS_I(inode)->flags, &BTRFS_I(inode)->ro_flags); cache_index: /* * If we were modified in the current generation and evicted from memory * and then re-read we need to do a full sync since we don't have any * idea about which extents were modified before we were evicted from * cache. * * This is required for both inode re-read from disk and delayed inode * in the delayed_nodes xarray. */ if (BTRFS_I(inode)->last_trans == btrfs_get_fs_generation(fs_info)) set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags); /* * We don't persist the id of the transaction where an unlink operation * against the inode was last made. So here we assume the inode might * have been evicted, and therefore the exact value of last_unlink_trans * lost, and set it to last_trans to avoid metadata inconsistencies * between the inode and its parent if the inode is fsync'ed and the log * replayed. For example, in the scenario: * * touch mydir/foo * ln mydir/foo mydir/bar * sync * unlink mydir/bar * echo 2 > /proc/sys/vm/drop_caches # evicts inode * xfs_io -c fsync mydir/foo * <power failure> * mount fs, triggers fsync log replay * * We must make sure that when we fsync our inode foo we also log its * parent inode, otherwise after log replay the parent still has the * dentry with the "bar" name but our inode foo has a link count of 1 * and doesn't have an inode ref with the name "bar" anymore. * * Setting last_unlink_trans to last_trans is a pessimistic approach, * but it guarantees correctness at the expense of occasional full * transaction commits on fsync if our inode is a directory, or if our * inode is not a directory, logging its parent unnecessarily. */ BTRFS_I(inode)->last_unlink_trans = BTRFS_I(inode)->last_trans; /* * Same logic as for last_unlink_trans. We don't persist the generation * of the last transaction where this inode was used for a reflink * operation, so after eviction and reloading the inode we must be * pessimistic and assume the last transaction that modified the inode. */ BTRFS_I(inode)->last_reflink_trans = BTRFS_I(inode)->last_trans; path->slots[0]++; if (inode->i_nlink != 1 || path->slots[0] >= btrfs_header_nritems(leaf)) goto cache_acl; btrfs_item_key_to_cpu(leaf, &location, path->slots[0]); if (location.objectid != btrfs_ino(BTRFS_I(inode))) goto cache_acl; ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); if (location.type == BTRFS_INODE_REF_KEY) { struct btrfs_inode_ref *ref; ref = (struct btrfs_inode_ref *)ptr; BTRFS_I(inode)->dir_index = btrfs_inode_ref_index(leaf, ref); } else if (location.type == BTRFS_INODE_EXTREF_KEY) { struct btrfs_inode_extref *extref; extref = (struct btrfs_inode_extref *)ptr; BTRFS_I(inode)->dir_index = btrfs_inode_extref_index(leaf, extref); } cache_acl: /* * try to precache a NULL acl entry for files that don't have * any xattrs or acls */ maybe_acls = acls_after_inode_item(leaf, path->slots[0], btrfs_ino(BTRFS_I(inode)), &first_xattr_slot); if (first_xattr_slot != -1) { path->slots[0] = first_xattr_slot; ret = btrfs_load_inode_props(inode, path); if (ret) btrfs_err(fs_info, "error loading props for ino %llu (root %llu): %d", btrfs_ino(BTRFS_I(inode)), btrfs_root_id(root), ret); } if (path != in_path) btrfs_free_path(path); if (!maybe_acls) cache_no_acl(inode); switch (inode->i_mode & S_IFMT) { case S_IFREG: inode->i_mapping->a_ops = &btrfs_aops; inode->i_fop = &btrfs_file_operations; inode->i_op = &btrfs_file_inode_operations; break; case S_IFDIR: inode->i_fop = &btrfs_dir_file_operations; inode->i_op = &btrfs_dir_inode_operations; break; case S_IFLNK: inode->i_op = &btrfs_symlink_inode_operations; inode_nohighmem(inode); inode->i_mapping->a_ops = &btrfs_aops; break; default: inode->i_op = &btrfs_special_inode_operations; init_special_inode(inode, inode->i_mode, rdev); break; } btrfs_sync_inode_flags_to_i_flags(inode); return 0; } /* * given a leaf and an inode, copy the inode fields into the leaf */ static void fill_inode_item(struct btrfs_trans_handle *trans, struct extent_buffer *leaf, struct btrfs_inode_item *item, struct inode *inode) { struct btrfs_map_token token; u64 flags; btrfs_init_map_token(&token, leaf); btrfs_set_token_inode_uid(&token, item, i_uid_read(inode)); btrfs_set_token_inode_gid(&token, item, i_gid_read(inode)); btrfs_set_token_inode_size(&token, item, BTRFS_I(inode)->disk_i_size); btrfs_set_token_inode_mode(&token, item, inode->i_mode); btrfs_set_token_inode_nlink(&token, item, inode->i_nlink); btrfs_set_token_timespec_sec(&token, &item->atime, inode_get_atime_sec(inode)); btrfs_set_token_timespec_nsec(&token, &item->atime, inode_get_atime_nsec(inode)); btrfs_set_token_timespec_sec(&token, &item->mtime, inode_get_mtime_sec(inode)); btrfs_set_token_timespec_nsec(&token, &item->mtime, inode_get_mtime_nsec(inode)); btrfs_set_token_timespec_sec(&token, &item->ctime, inode_get_ctime_sec(inode)); btrfs_set_token_timespec_nsec(&token, &item->ctime, inode_get_ctime_nsec(inode)); btrfs_set_token_timespec_sec(&token, &item->otime, BTRFS_I(inode)->i_otime_sec); btrfs_set_token_timespec_nsec(&token, &item->otime, BTRFS_I(inode)->i_otime_nsec); btrfs_set_token_inode_nbytes(&token, item, inode_get_bytes(inode)); btrfs_set_token_inode_generation(&token, item, BTRFS_I(inode)->generation); btrfs_set_token_inode_sequence(&token, item, inode_peek_iversion(inode)); btrfs_set_token_inode_transid(&token, item, trans->transid); btrfs_set_token_inode_rdev(&token, item, inode->i_rdev); flags = btrfs_inode_combine_flags(BTRFS_I(inode)->flags, BTRFS_I(inode)->ro_flags); btrfs_set_token_inode_flags(&token, item, flags); btrfs_set_token_inode_block_group(&token, item, 0); } /* * copy everything in the in-memory inode into the btree. */ static noinline int btrfs_update_inode_item(struct btrfs_trans_handle *trans, struct btrfs_inode *inode) { struct btrfs_inode_item *inode_item; struct btrfs_path *path; struct extent_buffer *leaf; struct btrfs_key key; int ret; path = btrfs_alloc_path(); if (!path) return -ENOMEM; btrfs_get_inode_key(inode, &key); ret = btrfs_lookup_inode(trans, inode->root, path, &key, 1); if (ret) { if (ret > 0) ret = -ENOENT; goto failed; } leaf = path->nodes[0]; inode_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item); fill_inode_item(trans, leaf, inode_item, &inode->vfs_inode); btrfs_mark_buffer_dirty(trans, leaf); btrfs_set_inode_last_trans(trans, inode); ret = 0; failed: btrfs_free_path(path); return ret; } /* * copy everything in the in-memory inode into the btree. */ int btrfs_update_inode(struct btrfs_trans_handle *trans, struct btrfs_inode *inode) { struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; int ret; /* * If the inode is a free space inode, we can deadlock during commit * if we put it into the delayed code. * * The data relocation inode should also be directly updated * without delay */ if (!btrfs_is_free_space_inode(inode) && !btrfs_is_data_reloc_root(root) && !test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) { btrfs_update_root_times(trans, root); ret = btrfs_delayed_update_inode(trans, inode); if (!ret) btrfs_set_inode_last_trans(trans, inode); return ret; } return btrfs_update_inode_item(trans, inode); } int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans, struct btrfs_inode *inode) { int ret; ret = btrfs_update_inode(trans, inode); if (ret == -ENOSPC) return btrfs_update_inode_item(trans, inode); return ret; } /* * unlink helper that gets used here in inode.c and in the tree logging * recovery code. It remove a link in a directory with a given name, and * also drops the back refs in the inode to the directory */ static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans, struct btrfs_inode *dir, struct btrfs_inode *inode, const struct fscrypt_str *name, struct btrfs_rename_ctx *rename_ctx) { struct btrfs_root *root = dir->root; struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_path *path; int ret = 0; struct btrfs_dir_item *di; u64 index; u64 ino = btrfs_ino(inode); u64 dir_ino = btrfs_ino(dir); path = btrfs_alloc_path(); if (!path) { ret = -ENOMEM; goto out; } di = btrfs_lookup_dir_item(trans, root, path, dir_ino, name, -1); if (IS_ERR_OR_NULL(di)) { ret = di ? PTR_ERR(di) : -ENOENT; goto err; } ret = btrfs_delete_one_dir_name(trans, root, path, di); if (ret) goto err; btrfs_release_path(path); /* * If we don't have dir index, we have to get it by looking up * the inode ref, since we get the inode ref, remove it directly, * it is unnecessary to do delayed deletion. * * But if we have dir index, needn't search inode ref to get it. * Since the inode ref is close to the inode item, it is better * that we delay to delete it, and just do this deletion when * we update the inode item. */ if (inode->dir_index) { ret = btrfs_delayed_delete_inode_ref(inode); if (!ret) { index = inode->dir_index; goto skip_backref; } } ret = btrfs_del_inode_ref(trans, root, name, ino, dir_ino, &index); if (ret) { btrfs_info(fs_info, "failed to delete reference to %.*s, inode %llu parent %llu", name->len, name->name, ino, dir_ino); btrfs_abort_transaction(trans, ret); goto err; } skip_backref: if (rename_ctx) rename_ctx->index = index; ret = btrfs_delete_delayed_dir_index(trans, dir, index); if (ret) { btrfs_abort_transaction(trans, ret); goto err; } /* * If we are in a rename context, we don't need to update anything in the * log. That will be done later during the rename by btrfs_log_new_name(). * Besides that, doing it here would only cause extra unnecessary btree * operations on the log tree, increasing latency for applications. */ if (!rename_ctx) { btrfs_del_inode_ref_in_log(trans, root, name, inode, dir_ino); btrfs_del_dir_entries_in_log(trans, root, name, dir, index); } /* * If we have a pending delayed iput we could end up with the final iput * being run in btrfs-cleaner context. If we have enough of these built * up we can end up burning a lot of time in btrfs-cleaner without any * way to throttle the unlinks. Since we're currently holding a ref on * the inode we can run the delayed iput here without any issues as the * final iput won't be done until after we drop the ref we're currently * holding. */ btrfs_run_delayed_iput(fs_info, inode); err: btrfs_free_path(path); if (ret) goto out; btrfs_i_size_write(dir, dir->vfs_inode.i_size - name->len * 2); inode_inc_iversion(&inode->vfs_inode); inode_set_ctime_current(&inode->vfs_inode); inode_inc_iversion(&dir->vfs_inode); inode_set_mtime_to_ts(&dir->vfs_inode, inode_set_ctime_current(&dir->vfs_inode)); ret = btrfs_update_inode(trans, dir); out: return ret; } int btrfs_unlink_inode(struct btrfs_trans_handle *trans, struct btrfs_inode *dir, struct btrfs_inode *inode, const struct fscrypt_str *name) { int ret; ret = __btrfs_unlink_inode(trans, dir, inode, name, NULL); if (!ret) { drop_nlink(&inode->vfs_inode); ret = btrfs_update_inode(trans, inode); } return ret; } /* * helper to start transaction for unlink and rmdir. * * unlink and rmdir are special in btrfs, they do not always free space, so * if we cannot make our reservations the normal way try and see if there is * plenty of slack room in the global reserve to migrate, otherwise we cannot * allow the unlink to occur. */ static struct btrfs_trans_handle *__unlink_start_trans(struct btrfs_inode *dir) { struct btrfs_root *root = dir->root; return btrfs_start_transaction_fallback_global_rsv(root, BTRFS_UNLINK_METADATA_UNITS); } static int btrfs_unlink(struct inode *dir, struct dentry *dentry) { struct btrfs_trans_handle *trans; struct inode *inode = d_inode(dentry); int ret; struct fscrypt_name fname; ret = fscrypt_setup_filename(dir, &dentry->d_name, 1, &fname); if (ret) return ret; /* This needs to handle no-key deletions later on */ trans = __unlink_start_trans(BTRFS_I(dir)); if (IS_ERR(trans)) { ret = PTR_ERR(trans); goto fscrypt_free; } btrfs_record_unlink_dir(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)), false); ret = btrfs_unlink_inode(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)), &fname.disk_name); if (ret) goto end_trans; if (inode->i_nlink == 0) { ret = btrfs_orphan_add(trans, BTRFS_I(inode)); if (ret) goto end_trans; } end_trans: btrfs_end_transaction(trans); btrfs_btree_balance_dirty(BTRFS_I(dir)->root->fs_info); fscrypt_free: fscrypt_free_filename(&fname); return ret; } static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, struct btrfs_inode *dir, struct dentry *dentry) { struct btrfs_root *root = dir->root; struct btrfs_inode *inode = BTRFS_I(d_inode(dentry)); struct btrfs_path *path; struct extent_buffer *leaf; struct btrfs_dir_item *di; struct btrfs_key key; u64 index; int ret; u64 objectid; u64 dir_ino = btrfs_ino(dir); struct fscrypt_name fname; ret = fscrypt_setup_filename(&dir->vfs_inode, &dentry->d_name, 1, &fname); if (ret) return ret; /* This needs to handle no-key deletions later on */ if (btrfs_ino(inode) == BTRFS_FIRST_FREE_OBJECTID) { objectid = btrfs_root_id(inode->root); } else if (btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) { objectid = inode->ref_root_id; } else { WARN_ON(1); fscrypt_free_filename(&fname); return -EINVAL; } path = btrfs_alloc_path(); if (!path) { ret = -ENOMEM; goto out; } di = btrfs_lookup_dir_item(trans, root, path, dir_ino, &fname.disk_name, -1); if (IS_ERR_OR_NULL(di)) { ret = di ? PTR_ERR(di) : -ENOENT; goto out; } leaf = path->nodes[0]; btrfs_dir_item_key_to_cpu(leaf, di, &key); WARN_ON(key.type != BTRFS_ROOT_ITEM_KEY || key.objectid != objectid); ret = btrfs_delete_one_dir_name(trans, root, path, di); if (ret) { btrfs_abort_transaction(trans, ret); goto out; } btrfs_release_path(path); /* * This is a placeholder inode for a subvolume we didn't have a * reference to at the time of the snapshot creation. In the meantime * we could have renamed the real subvol link into our snapshot, so * depending on btrfs_del_root_ref to return -ENOENT here is incorrect. * Instead simply lookup the dir_index_item for this entry so we can * remove it. Otherwise we know we have a ref to the root and we can * call btrfs_del_root_ref, and it _shouldn't_ fail. */ if (btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) { di = btrfs_search_dir_index_item(root, path, dir_ino, &fname.disk_name); if (IS_ERR_OR_NULL(di)) { if (!di) ret = -ENOENT; else ret = PTR_ERR(di); btrfs_abort_transaction(trans, ret); goto out; } leaf = path->nodes[0]; btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); index = key.offset; btrfs_release_path(path); } else { ret = btrfs_del_root_ref(trans, objectid, btrfs_root_id(root), dir_ino, &index, &fname.disk_name); if (ret) { btrfs_abort_transaction(trans, ret); goto out; } } ret = btrfs_delete_delayed_dir_index(trans, dir, index); if (ret) { btrfs_abort_transaction(trans, ret); goto out; } btrfs_i_size_write(dir, dir->vfs_inode.i_size - fname.disk_name.len * 2); inode_inc_iversion(&dir->vfs_inode); inode_set_mtime_to_ts(&dir->vfs_inode, inode_set_ctime_current(&dir->vfs_inode)); ret = btrfs_update_inode_fallback(trans, dir); if (ret) btrfs_abort_transaction(trans, ret); out: btrfs_free_path(path); fscrypt_free_filename(&fname); return ret; } /* * Helper to check if the subvolume references other subvolumes or if it's * default. */ static noinline int may_destroy_subvol(struct btrfs_root *root) { struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_path *path; struct btrfs_dir_item *di; struct btrfs_key key; struct fscrypt_str name = FSTR_INIT("default", 7); u64 dir_id; int ret; path = btrfs_alloc_path(); if (!path) return -ENOMEM; /* Make sure this root isn't set as the default subvol */ dir_id = btrfs_super_root_dir(fs_info->super_copy); di = btrfs_lookup_dir_item(NULL, fs_info->tree_root, path, dir_id, &name, 0); if (di && !IS_ERR(di)) { btrfs_dir_item_key_to_cpu(path->nodes[0], di, &key); if (key.objectid == btrfs_root_id(root)) { ret = -EPERM; btrfs_err(fs_info, "deleting default subvolume %llu is not allowed", key.objectid); goto out; } btrfs_release_path(path); } key.objectid = btrfs_root_id(root); key.type = BTRFS_ROOT_REF_KEY; key.offset = (u64)-1; ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0); if (ret < 0) goto out; if (ret == 0) { /* * Key with offset -1 found, there would have to exist a root * with such id, but this is out of valid range. */ ret = -EUCLEAN; goto out; } ret = 0; if (path->slots[0] > 0) { path->slots[0]--; btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); if (key.objectid == btrfs_root_id(root) && key.type == BTRFS_ROOT_REF_KEY) ret = -ENOTEMPTY; } out: btrfs_free_path(path); return ret; } /* Delete all dentries for inodes belonging to the root */ static void btrfs_prune_dentries(struct btrfs_root *root) { struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_inode *inode; u64 min_ino = 0; if (!BTRFS_FS_ERROR(fs_info)) WARN_ON(btrfs_root_refs(&root->root_item) != 0); inode = btrfs_find_first_inode(root, min_ino); while (inode) { if (atomic_read(&inode->vfs_inode.i_count) > 1) d_prune_aliases(&inode->vfs_inode); min_ino = btrfs_ino(inode) + 1; /* * btrfs_drop_inode() will have it removed from the inode * cache when its usage count hits zero. */ iput(&inode->vfs_inode); cond_resched(); inode = btrfs_find_first_inode(root, min_ino); } } int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry) { struct btrfs_root *root = dir->root; struct btrfs_fs_info *fs_info = root->fs_info; struct inode *inode = d_inode(dentry); struct btrfs_root *dest = BTRFS_I(inode)->root; struct btrfs_trans_handle *trans; struct btrfs_block_rsv block_rsv; u64 root_flags; u64 qgroup_reserved = 0; int ret; down_write(&fs_info->subvol_sem); /* * Don't allow to delete a subvolume with send in progress. This is * inside the inode lock so the error handling that has to drop the bit * again is not run concurrently. */ spin_lock(&dest->root_item_lock); if (dest->send_in_progress) { spin_unlock(&dest->root_item_lock); btrfs_warn(fs_info, "attempt to delete subvolume %llu during send", btrfs_root_id(dest)); ret = -EPERM; goto out_up_write; } if (atomic_read(&dest->nr_swapfiles)) { spin_unlock(&dest->root_item_lock); btrfs_warn(fs_info, "attempt to delete subvolume %llu with active swapfile", btrfs_root_id(root)); ret = -EPERM; goto out_up_write; } root_flags = btrfs_root_flags(&dest->root_item); btrfs_set_root_flags(&dest->root_item, root_flags | BTRFS_ROOT_SUBVOL_DEAD); spin_unlock(&dest->root_item_lock); ret = may_destroy_subvol(dest); if (ret) goto out_undead; btrfs_init_block_rsv(&block_rsv, BTRFS_BLOCK_RSV_TEMP); /* * One for dir inode, * two for dir entries, * two for root ref/backref. */ ret = btrfs_subvolume_reserve_metadata(root, &block_rsv, 5, true); if (ret) goto out_undead; qgroup_reserved = block_rsv.qgroup_rsv_reserved; trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { ret = PTR_ERR(trans); goto out_release; } btrfs_qgroup_convert_reserved_meta(root, qgroup_reserved); qgroup_reserved = 0; trans->block_rsv = &block_rsv; trans->bytes_reserved = block_rsv.size; btrfs_record_snapshot_destroy(trans, dir); ret = btrfs_unlink_subvol(trans, dir, dentry); if (ret) { btrfs_abort_transaction(trans, ret); goto out_end_trans; } ret = btrfs_record_root_in_trans(trans, dest); if (ret) { btrfs_abort_transaction(trans, ret); goto out_end_trans; } memset(&dest->root_item.drop_progress, 0, sizeof(dest->root_item.drop_progress)); btrfs_set_root_drop_level(&dest->root_item, 0); btrfs_set_root_refs(&dest->root_item, 0); if (!test_and_set_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &dest->state)) { ret = btrfs_insert_orphan_item(trans, fs_info->tree_root, btrfs_root_id(dest)); if (ret) { btrfs_abort_transaction(trans, ret); goto out_end_trans; } } ret = btrfs_uuid_tree_remove(trans, dest->root_item.uuid, BTRFS_UUID_KEY_SUBVOL, btrfs_root_id(dest)); if (ret && ret != -ENOENT) { btrfs_abort_transaction(trans, ret); goto out_end_trans; } if (!btrfs_is_empty_uuid(dest->root_item.received_uuid)) { ret = btrfs_uuid_tree_remove(trans, dest->root_item.received_uuid, BTRFS_UUID_KEY_RECEIVED_SUBVOL, btrfs_root_id(dest)); if (ret && ret != -ENOENT) { btrfs_abort_transaction(trans, ret); goto out_end_trans; } } free_anon_bdev(dest->anon_dev); dest->anon_dev = 0; out_end_trans: trans->block_rsv = NULL; trans->bytes_reserved = 0; ret = btrfs_end_transaction(trans); inode->i_flags |= S_DEAD; out_release: btrfs_block_rsv_release(fs_info, &block_rsv, (u64)-1, NULL); if (qgroup_reserved) btrfs_qgroup_free_meta_prealloc(root, qgroup_reserved); out_undead: if (ret) { spin_lock(&dest->root_item_lock); root_flags = btrfs_root_flags(&dest->root_item); btrfs_set_root_flags(&dest->root_item, root_flags & ~BTRFS_ROOT_SUBVOL_DEAD); spin_unlock(&dest->root_item_lock); } out_up_write: up_write(&fs_info->subvol_sem); if (!ret) { d_invalidate(dentry); btrfs_prune_dentries(dest); ASSERT(dest->send_in_progress == 0); } return ret; } static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) { struct inode *inode = d_inode(dentry); struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; int ret = 0; struct btrfs_trans_handle *trans; u64 last_unlink_trans; struct fscrypt_name fname; if (inode->i_size > BTRFS_EMPTY_DIR_SIZE) return -ENOTEMPTY; if (btrfs_ino(BTRFS_I(inode)) == BTRFS_FIRST_FREE_OBJECTID) { if (unlikely(btrfs_fs_incompat(fs_info, EXTENT_TREE_V2))) { btrfs_err(fs_info, "extent tree v2 doesn't support snapshot deletion yet"); return -EOPNOTSUPP; } return btrfs_delete_subvolume(BTRFS_I(dir), dentry); } ret = fscrypt_setup_filename(dir, &dentry->d_name, 1, &fname); if (ret) return ret; /* This needs to handle no-key deletions later on */ trans = __unlink_start_trans(BTRFS_I(dir)); if (IS_ERR(trans)) { ret = PTR_ERR(trans); goto out_notrans; } if (unlikely(btrfs_ino(BTRFS_I(inode)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) { ret = btrfs_unlink_subvol(trans, BTRFS_I(dir), dentry); goto out; } ret = btrfs_orphan_add(trans, BTRFS_I(inode)); if (ret) goto out; last_unlink_trans = BTRFS_I(inode)->last_unlink_trans; /* now the directory is empty */ ret = btrfs_unlink_inode(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)), &fname.disk_name); if (!ret) { btrfs_i_size_write(BTRFS_I(inode), 0); /* * Propagate the last_unlink_trans value of the deleted dir to * its parent directory. This is to prevent an unrecoverable * log tree in the case we do something like this: * 1) create dir foo * 2) create snapshot under dir foo * 3) delete the snapshot * 4) rmdir foo * 5) mkdir foo * 6) fsync foo or some file inside foo */ if (last_unlink_trans >= trans->transid) BTRFS_I(dir)->last_unlink_trans = last_unlink_trans; } out: btrfs_end_transaction(trans); out_notrans: btrfs_btree_balance_dirty(fs_info); fscrypt_free_filename(&fname); return ret; } /* * Read, zero a chunk and write a block. * * @inode - inode that we're zeroing * @from - the offset to start zeroing * @len - the length to zero, 0 to zero the entire range respective to the * offset * @front - zero up to the offset instead of from the offset on * * This will find the block for the "from" offset and cow the block and zero the * part we want to zero. This is used with truncate and hole punching. */ int btrfs_truncate_block(struct btrfs_inode *inode, loff_t from, loff_t len, int front) { struct btrfs_fs_info *fs_info = inode->root->fs_info; struct address_space *mapping = inode->vfs_inode.i_mapping; struct extent_io_tree *io_tree = &inode->io_tree; struct btrfs_ordered_extent *ordered; struct extent_state *cached_state = NULL; struct extent_changeset *data_reserved = NULL; bool only_release_metadata = false; u32 blocksize = fs_info->sectorsize; pgoff_t index = from >> PAGE_SHIFT; unsigned offset = from & (blocksize - 1); struct folio *folio; gfp_t mask = btrfs_alloc_write_mask(mapping); size_t write_bytes = blocksize; int ret = 0; u64 block_start; u64 block_end; if (IS_ALIGNED(offset, blocksize) && (!len || IS_ALIGNED(len, blocksize))) goto out; block_start = round_down(from, blocksize); block_end = block_start + blocksize - 1; ret = btrfs_check_data_free_space(inode, &data_reserved, block_start, blocksize, false); if (ret < 0) { if (btrfs_check_nocow_lock(inode, block_start, &write_bytes, false) > 0) { /* For nocow case, no need to reserve data space */ only_release_metadata = true; } else { goto out; } } ret = btrfs_delalloc_reserve_metadata(inode, blocksize, blocksize, false); if (ret < 0) { if (!only_release_metadata) btrfs_free_reserved_data_space(inode, data_reserved, block_start, blocksize); goto out; } again: folio = __filemap_get_folio(mapping, index, FGP_LOCK | FGP_ACCESSED | FGP_CREAT, mask); if (IS_ERR(folio)) { btrfs_delalloc_release_space(inode, data_reserved, block_start, blocksize, true); btrfs_delalloc_release_extents(inode, blocksize); ret = -ENOMEM; goto out; } if (!folio_test_uptodate(folio)) { ret = btrfs_read_folio(NULL, folio); folio_lock(folio); if (folio->mapping != mapping) { folio_unlock(folio); folio_put(folio); goto again; } if (!folio_test_uptodate(folio)) { ret = -EIO; goto out_unlock; } } /* * We unlock the page after the io is completed and then re-lock it * above. release_folio() could have come in between that and cleared * folio private, but left the page in the mapping. Set the page mapped * here to make sure it's properly set for the subpage stuff. */ ret = set_folio_extent_mapped(folio); if (ret < 0) goto out_unlock; folio_wait_writeback(folio); lock_extent(io_tree, block_start, block_end, &cached_state); ordered = btrfs_lookup_ordered_extent(inode, block_start); if (ordered) { unlock_extent(io_tree, block_start, block_end, &cached_state); folio_unlock(folio); folio_put(folio); btrfs_start_ordered_extent(ordered); btrfs_put_ordered_extent(ordered); goto again; } clear_extent_bit(&inode->io_tree, block_start, block_end, EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, &cached_state); ret = btrfs_set_extent_delalloc(inode, block_start, block_end, 0, &cached_state); if (ret) { unlock_extent(io_tree, block_start, block_end, &cached_state); goto out_unlock; } if (offset != blocksize) { if (!len) len = blocksize - offset; if (front) folio_zero_range(folio, block_start - folio_pos(folio), offset); else folio_zero_range(folio, (block_start - folio_pos(folio)) + offset, len); } btrfs_folio_clear_checked(fs_info, folio, block_start, block_end + 1 - block_start); btrfs_folio_set_dirty(fs_info, folio, block_start, block_end + 1 - block_start); unlock_extent(io_tree, block_start, block_end, &cached_state); if (only_release_metadata) set_extent_bit(&inode->io_tree, block_start, block_end, EXTENT_NORESERVE, NULL); out_unlock: if (ret) { if (only_release_metadata) btrfs_delalloc_release_metadata(inode, blocksize, true); else btrfs_delalloc_release_space(inode, data_reserved, block_start, blocksize, true); } btrfs_delalloc_release_extents(inode, blocksize); folio_unlock(folio); folio_put(folio); out: if (only_release_metadata) btrfs_check_nocow_unlock(inode); extent_changeset_free(data_reserved); return ret; } static int maybe_insert_hole(struct btrfs_inode *inode, u64 offset, u64 len) { struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_trans_handle *trans; struct btrfs_drop_extents_args drop_args = { 0 }; int ret; /* * If NO_HOLES is enabled, we don't need to do anything. * Later, up in the call chain, either btrfs_set_inode_last_sub_trans() * or btrfs_update_inode() will be called, which guarantee that the next * fsync will know this inode was changed and needs to be logged. */ if (btrfs_fs_incompat(fs_info, NO_HOLES)) return 0; /* * 1 - for the one we're dropping * 1 - for the one we're adding * 1 - for updating the inode. */ trans = btrfs_start_transaction(root, 3); if (IS_ERR(trans)) return PTR_ERR(trans); drop_args.start = offset; drop_args.end = offset + len; drop_args.drop_cache = true; ret = btrfs_drop_extents(trans, root, inode, &drop_args); if (ret) { btrfs_abort_transaction(trans, ret); btrfs_end_transaction(trans); return ret; } ret = btrfs_insert_hole_extent(trans, root, btrfs_ino(inode), offset, len); if (ret) { btrfs_abort_transaction(trans, ret); } else { btrfs_update_inode_bytes(inode, 0, drop_args.bytes_found); btrfs_update_inode(trans, inode); } btrfs_end_transaction(trans); return ret; } /* * This function puts in dummy file extents for the area we're creating a hole * for. So if we are truncating this file to a larger size we need to insert * these file extents so that btrfs_get_extent will return a EXTENT_MAP_HOLE for * the range between oldsize and size */ int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size) { struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; struct extent_io_tree *io_tree = &inode->io_tree; struct extent_map *em = NULL; struct extent_state *cached_state = NULL; u64 hole_start = ALIGN(oldsize, fs_info->sectorsize); u64 block_end = ALIGN(size, fs_info->sectorsize); u64 last_byte; u64 cur_offset; u64 hole_size; int ret = 0; /* * If our size started in the middle of a block we need to zero out the * rest of the block before we expand the i_size, otherwise we could * expose stale data. */ ret = btrfs_truncate_block(inode, oldsize, 0, 0); if (ret) return ret; if (size <= hole_start) return 0; btrfs_lock_and_flush_ordered_range(inode, hole_start, block_end - 1, &cached_state); cur_offset = hole_start; while (1) { em = btrfs_get_extent(inode, NULL, cur_offset, block_end - cur_offset); if (IS_ERR(em)) { ret = PTR_ERR(em); em = NULL; break; } last_byte = min(extent_map_end(em), block_end); last_byte = ALIGN(last_byte, fs_info->sectorsize); hole_size = last_byte - cur_offset; if (!(em->flags & EXTENT_FLAG_PREALLOC)) { struct extent_map *hole_em; ret = maybe_insert_hole(inode, cur_offset, hole_size); if (ret) break; ret = btrfs_inode_set_file_extent_range(inode, cur_offset, hole_size); if (ret) break; hole_em = alloc_extent_map(); if (!hole_em) { btrfs_drop_extent_map_range(inode, cur_offset, cur_offset + hole_size - 1, false); btrfs_set_inode_full_sync(inode); goto next; } hole_em->start = cur_offset; hole_em->len = hole_size; hole_em->disk_bytenr = EXTENT_MAP_HOLE; hole_em->disk_num_bytes = 0; hole_em->ram_bytes = hole_size; hole_em->generation = btrfs_get_fs_generation(fs_info); ret = btrfs_replace_extent_map_range(inode, hole_em, true); free_extent_map(hole_em); } else { ret = btrfs_inode_set_file_extent_range(inode, cur_offset, hole_size); if (ret) break; } next: free_extent_map(em); em = NULL; cur_offset = last_byte; if (cur_offset >= block_end) break; } free_extent_map(em); unlock_extent(io_tree, hole_start, block_end - 1, &cached_state); return ret; } static int btrfs_setsize(struct inode *inode, struct iattr *attr) { struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_trans_handle *trans; loff_t oldsize = i_size_read(inode); loff_t newsize = attr->ia_size; int mask = attr->ia_valid; int ret; /* * The regular truncate() case without ATTR_CTIME and ATTR_MTIME is a * special case where we need to update the times despite not having * these flags set. For all other operations the VFS set these flags * explicitly if it wants a timestamp update. */ if (newsize != oldsize) { inode_inc_iversion(inode); if (!(mask & (ATTR_CTIME | ATTR_MTIME))) { inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); } } if (newsize > oldsize) { /* * Don't do an expanding truncate while snapshotting is ongoing. * This is to ensure the snapshot captures a fully consistent * state of this file - if the snapshot captures this expanding * truncation, it must capture all writes that happened before * this truncation. */ btrfs_drew_write_lock(&root->snapshot_lock); ret = btrfs_cont_expand(BTRFS_I(inode), oldsize, newsize); if (ret) { btrfs_drew_write_unlock(&root->snapshot_lock); return ret; } trans = btrfs_start_transaction(root, 1); if (IS_ERR(trans)) { btrfs_drew_write_unlock(&root->snapshot_lock); return PTR_ERR(trans); } i_size_write(inode, newsize); btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0); pagecache_isize_extended(inode, oldsize, newsize); ret = btrfs_update_inode(trans, BTRFS_I(inode)); btrfs_drew_write_unlock(&root->snapshot_lock); btrfs_end_transaction(trans); } else { struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); if (btrfs_is_zoned(fs_info)) { ret = btrfs_wait_ordered_range(BTRFS_I(inode), ALIGN(newsize, fs_info->sectorsize), (u64)-1); if (ret) return ret; } /* * We're truncating a file that used to have good data down to * zero. Make sure any new writes to the file get on disk * on close. */ if (newsize == 0) set_bit(BTRFS_INODE_FLUSH_ON_CLOSE, &BTRFS_I(inode)->runtime_flags); truncate_setsize(inode, newsize); inode_dio_wait(inode); ret = btrfs_truncate(BTRFS_I(inode), newsize == oldsize); if (ret && inode->i_nlink) { int err; /* * Truncate failed, so fix up the in-memory size. We * adjusted disk_i_size down as we removed extents, so * wait for disk_i_size to be stable and then update the * in-memory size to match. */ err = btrfs_wait_ordered_range(BTRFS_I(inode), 0, (u64)-1); if (err) return err; i_size_write(inode, BTRFS_I(inode)->disk_i_size); } } return ret; } static int btrfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { struct inode *inode = d_inode(dentry); struct btrfs_root *root = BTRFS_I(inode)->root; int err; if (btrfs_root_readonly(root)) return -EROFS; err = setattr_prepare(idmap, dentry, attr); if (err) return err; if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) { err = btrfs_setsize(inode, attr); if (err) return err; } if (attr->ia_valid) { setattr_copy(idmap, inode, attr); inode_inc_iversion(inode); err = btrfs_dirty_inode(BTRFS_I(inode)); if (!err && attr->ia_valid & ATTR_MODE) err = posix_acl_chmod(idmap, dentry, inode->i_mode); } return err; } /* * While truncating the inode pages during eviction, we get the VFS * calling btrfs_invalidate_folio() against each folio of the inode. This * is slow because the calls to btrfs_invalidate_folio() result in a * huge amount of calls to lock_extent() and clear_extent_bit(), * which keep merging and splitting extent_state structures over and over, * wasting lots of time. * * Therefore if the inode is being evicted, let btrfs_invalidate_folio() * skip all those expensive operations on a per folio basis and do only * the ordered io finishing, while we release here the extent_map and * extent_state structures, without the excessive merging and splitting. */ static void evict_inode_truncate_pages(struct inode *inode) { struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; struct rb_node *node; ASSERT(inode->i_state & I_FREEING); truncate_inode_pages_final(&inode->i_data); btrfs_drop_extent_map_range(BTRFS_I(inode), 0, (u64)-1, false); /* * Keep looping until we have no more ranges in the io tree. * We can have ongoing bios started by readahead that have * their endio callback (extent_io.c:end_bio_extent_readpage) * still in progress (unlocked the pages in the bio but did not yet * unlocked the ranges in the io tree). Therefore this means some * ranges can still be locked and eviction started because before * submitting those bios, which are executed by a separate task (work * queue kthread), inode references (inode->i_count) were not taken * (which would be dropped in the end io callback of each bio). * Therefore here we effectively end up waiting for those bios and * anyone else holding locked ranges without having bumped the inode's * reference count - if we don't do it, when they access the inode's * io_tree to unlock a range it may be too late, leading to an * use-after-free issue. */ spin_lock(&io_tree->lock); while (!RB_EMPTY_ROOT(&io_tree->state)) { struct extent_state *state; struct extent_state *cached_state = NULL; u64 start; u64 end; unsigned state_flags; node = rb_first(&io_tree->state); state = rb_entry(node, struct extent_state, rb_node); start = state->start; end = state->end; state_flags = state->state; spin_unlock(&io_tree->lock); lock_extent(io_tree, start, end, &cached_state); /* * If still has DELALLOC flag, the extent didn't reach disk, * and its reserved space won't be freed by delayed_ref. * So we need to free its reserved space here. * (Refer to comment in btrfs_invalidate_folio, case 2) * * Note, end is the bytenr of last byte, so we need + 1 here. */ if (state_flags & EXTENT_DELALLOC) btrfs_qgroup_free_data(BTRFS_I(inode), NULL, start, end - start + 1, NULL); clear_extent_bit(io_tree, start, end, EXTENT_CLEAR_ALL_BITS | EXTENT_DO_ACCOUNTING, &cached_state); cond_resched(); spin_lock(&io_tree->lock); } spin_unlock(&io_tree->lock); } static struct btrfs_trans_handle *evict_refill_and_join(struct btrfs_root *root, struct btrfs_block_rsv *rsv) { struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_trans_handle *trans; u64 delayed_refs_extra = btrfs_calc_delayed_ref_bytes(fs_info, 1); int ret; /* * Eviction should be taking place at some place safe because of our * delayed iputs. However the normal flushing code will run delayed * iputs, so we cannot use FLUSH_ALL otherwise we'll deadlock. * * We reserve the delayed_refs_extra here again because we can't use * btrfs_start_transaction(root, 0) for the same deadlocky reason as * above. We reserve our extra bit here because we generate a ton of * delayed refs activity by truncating. * * BTRFS_RESERVE_FLUSH_EVICT will steal from the global_rsv if it can, * if we fail to make this reservation we can re-try without the * delayed_refs_extra so we can make some forward progress. */ ret = btrfs_block_rsv_refill(fs_info, rsv, rsv->size + delayed_refs_extra, BTRFS_RESERVE_FLUSH_EVICT); if (ret) { ret = btrfs_block_rsv_refill(fs_info, rsv, rsv->size, BTRFS_RESERVE_FLUSH_EVICT); if (ret) { btrfs_warn(fs_info, "could not allocate space for delete; will truncate on mount"); return ERR_PTR(-ENOSPC); } delayed_refs_extra = 0; } trans = btrfs_join_transaction(root); if (IS_ERR(trans)) return trans; if (delayed_refs_extra) { trans->block_rsv = &fs_info->trans_block_rsv; trans->bytes_reserved = delayed_refs_extra; btrfs_block_rsv_migrate(rsv, trans->block_rsv, delayed_refs_extra, true); } return trans; } void btrfs_evict_inode(struct inode *inode) { struct btrfs_fs_info *fs_info; struct btrfs_trans_handle *trans; struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_block_rsv *rsv = NULL; int ret; trace_btrfs_inode_evict(inode); if (!root) { fsverity_cleanup_inode(inode); clear_inode(inode); return; } fs_info = inode_to_fs_info(inode); evict_inode_truncate_pages(inode); if (inode->i_nlink && ((btrfs_root_refs(&root->root_item) != 0 && btrfs_root_id(root) != BTRFS_ROOT_TREE_OBJECTID) || btrfs_is_free_space_inode(BTRFS_I(inode)))) goto out; if (is_bad_inode(inode)) goto out; if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) goto out; if (inode->i_nlink > 0) { BUG_ON(btrfs_root_refs(&root->root_item) != 0 && btrfs_root_id(root) != BTRFS_ROOT_TREE_OBJECTID); goto out; } /* * This makes sure the inode item in tree is uptodate and the space for * the inode update is released. */ ret = btrfs_commit_inode_delayed_inode(BTRFS_I(inode)); if (ret) goto out; /* * This drops any pending insert or delete operations we have for this * inode. We could have a delayed dir index deletion queued up, but * we're removing the inode completely so that'll be taken care of in * the truncate. */ btrfs_kill_delayed_inode_items(BTRFS_I(inode)); rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP); if (!rsv) goto out; rsv->size = btrfs_calc_metadata_size(fs_info, 1); rsv->failfast = true; btrfs_i_size_write(BTRFS_I(inode), 0); while (1) { struct btrfs_truncate_control control = { .inode = BTRFS_I(inode), .ino = btrfs_ino(BTRFS_I(inode)), .new_size = 0, .min_type = 0, }; trans = evict_refill_and_join(root, rsv); if (IS_ERR(trans)) goto out; trans->block_rsv = rsv; ret = btrfs_truncate_inode_items(trans, root, &control); trans->block_rsv = &fs_info->trans_block_rsv; btrfs_end_transaction(trans); /* * We have not added new delayed items for our inode after we * have flushed its delayed items, so no need to throttle on * delayed items. However we have modified extent buffers. */ btrfs_btree_balance_dirty_nodelay(fs_info); if (ret && ret != -ENOSPC && ret != -EAGAIN) goto out; else if (!ret) break; } /* * Errors here aren't a big deal, it just means we leave orphan items in * the tree. They will be cleaned up on the next mount. If the inode * number gets reused, cleanup deletes the orphan item without doing * anything, and unlink reuses the existing orphan item. * * If it turns out that we are dropping too many of these, we might want * to add a mechanism for retrying these after a commit. */ trans = evict_refill_and_join(root, rsv); if (!IS_ERR(trans)) { trans->block_rsv = rsv; btrfs_orphan_del(trans, BTRFS_I(inode)); trans->block_rsv = &fs_info->trans_block_rsv; btrfs_end_transaction(trans); } out: btrfs_free_block_rsv(fs_info, rsv); /* * If we didn't successfully delete, the orphan item will still be in * the tree and we'll retry on the next mount. Again, we might also want * to retry these periodically in the future. */ btrfs_remove_delayed_node(BTRFS_I(inode)); fsverity_cleanup_inode(inode); clear_inode(inode); } /* * Return the key found in the dir entry in the location pointer, fill @type * with BTRFS_FT_*, and return 0. * * If no dir entries were found, returns -ENOENT. * If found a corrupted location in dir entry, returns -EUCLEAN. */ static int btrfs_inode_by_name(struct btrfs_inode *dir, struct dentry *dentry, struct btrfs_key *location, u8 *type) { struct btrfs_dir_item *di; struct btrfs_path *path; struct btrfs_root *root = dir->root; int ret = 0; struct fscrypt_name fname; path = btrfs_alloc_path(); if (!path) return -ENOMEM; ret = fscrypt_setup_filename(&dir->vfs_inode, &dentry->d_name, 1, &fname); if (ret < 0) goto out; /* * fscrypt_setup_filename() should never return a positive value, but * gcc on sparc/parisc thinks it can, so assert that doesn't happen. */ ASSERT(ret == 0); /* This needs to handle no-key deletions later on */ di = btrfs_lookup_dir_item(NULL, root, path, btrfs_ino(dir), &fname.disk_name, 0); if (IS_ERR_OR_NULL(di)) { ret = di ? PTR_ERR(di) : -ENOENT; goto out; } btrfs_dir_item_key_to_cpu(path->nodes[0], di, location); if (location->type != BTRFS_INODE_ITEM_KEY && location->type != BTRFS_ROOT_ITEM_KEY) { ret = -EUCLEAN; btrfs_warn(root->fs_info, "%s gets something invalid in DIR_ITEM (name %s, directory ino %llu, location(%llu %u %llu))", __func__, fname.disk_name.name, btrfs_ino(dir), location->objectid, location->type, location->offset); } if (!ret) *type = btrfs_dir_ftype(path->nodes[0], di); out: fscrypt_free_filename(&fname); btrfs_free_path(path); return ret; } /* * when we hit a tree root in a directory, the btrfs part of the inode * needs to be changed to reflect the root directory of the tree root. This * is kind of like crossing a mount point. */ static int fixup_tree_root_location(struct btrfs_fs_info *fs_info, struct btrfs_inode *dir, struct dentry *dentry, struct btrfs_key *location, struct btrfs_root **sub_root) { struct btrfs_path *path; struct btrfs_root *new_root; struct btrfs_root_ref *ref; struct extent_buffer *leaf; struct btrfs_key key; int ret; int err = 0; struct fscrypt_name fname; ret = fscrypt_setup_filename(&dir->vfs_inode, &dentry->d_name, 0, &fname); if (ret) return ret; path = btrfs_alloc_path(); if (!path) { err = -ENOMEM; goto out; } err = -ENOENT; key.objectid = btrfs_root_id(dir->root); key.type = BTRFS_ROOT_REF_KEY; key.offset = location->objectid; ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0); if (ret) { if (ret < 0) err = ret; goto out; } leaf = path->nodes[0]; ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref); if (btrfs_root_ref_dirid(leaf, ref) != btrfs_ino(dir) || btrfs_root_ref_name_len(leaf, ref) != fname.disk_name.len) goto out; ret = memcmp_extent_buffer(leaf, fname.disk_name.name, (unsigned long)(ref + 1), fname.disk_name.len); if (ret) goto out; btrfs_release_path(path); new_root = btrfs_get_fs_root(fs_info, location->objectid, true); if (IS_ERR(new_root)) { err = PTR_ERR(new_root); goto out; } *sub_root = new_root; location->objectid = btrfs_root_dirid(&new_root->root_item); location->type = BTRFS_INODE_ITEM_KEY; location->offset = 0; err = 0; out: btrfs_free_path(path); fscrypt_free_filename(&fname); return err; } static int btrfs_add_inode_to_root(struct btrfs_inode *inode, bool prealloc) { struct btrfs_root *root = inode->root; struct btrfs_inode *existing; const u64 ino = btrfs_ino(inode); int ret; if (inode_unhashed(&inode->vfs_inode)) return 0; if (prealloc) { ret = xa_reserve(&root->inodes, ino, GFP_NOFS); if (ret) return ret; } existing = xa_store(&root->inodes, ino, inode, GFP_ATOMIC); if (xa_is_err(existing)) { ret = xa_err(existing); ASSERT(ret != -EINVAL); ASSERT(ret != -ENOMEM); return ret; } else if (existing) { WARN_ON(!(existing->vfs_inode.i_state & (I_WILL_FREE | I_FREEING))); } return 0; } static void btrfs_del_inode_from_root(struct btrfs_inode *inode) { struct btrfs_root *root = inode->root; struct btrfs_inode *entry; bool empty = false; xa_lock(&root->inodes); entry = __xa_erase(&root->inodes, btrfs_ino(inode)); if (entry == inode) empty = xa_empty(&root->inodes); xa_unlock(&root->inodes); if (empty && btrfs_root_refs(&root->root_item) == 0) { xa_lock(&root->inodes); empty = xa_empty(&root->inodes); xa_unlock(&root->inodes); if (empty) btrfs_add_dead_root(root); } } static int btrfs_init_locked_inode(struct inode *inode, void *p) { struct btrfs_iget_args *args = p; btrfs_set_inode_number(BTRFS_I(inode), args->ino); BTRFS_I(inode)->root = btrfs_grab_root(args->root); if (args->root && args->root == args->root->fs_info->tree_root && args->ino != BTRFS_BTREE_INODE_OBJECTID) set_bit(BTRFS_INODE_FREE_SPACE_INODE, &BTRFS_I(inode)->runtime_flags); return 0; } static int btrfs_find_actor(struct inode *inode, void *opaque) { struct btrfs_iget_args *args = opaque; return args->ino == btrfs_ino(BTRFS_I(inode)) && args->root == BTRFS_I(inode)->root; } static struct inode *btrfs_iget_locked(u64 ino, struct btrfs_root *root) { struct inode *inode; struct btrfs_iget_args args; unsigned long hashval = btrfs_inode_hash(ino, root); args.ino = ino; args.root = root; inode = iget5_locked_rcu(root->fs_info->sb, hashval, btrfs_find_actor, btrfs_init_locked_inode, (void *)&args); return inode; } /* * Get an inode object given its inode number and corresponding root. * Path can be preallocated to prevent recursing back to iget through * allocator. NULL is also valid but may require an additional allocation * later. */ struct inode *btrfs_iget_path(u64 ino, struct btrfs_root *root, struct btrfs_path *path) { struct inode *inode; int ret; inode = btrfs_iget_locked(ino, root); if (!inode) return ERR_PTR(-ENOMEM); if (!(inode->i_state & I_NEW)) return inode; ret = btrfs_read_locked_inode(inode, path); /* * ret > 0 can come from btrfs_search_slot called by * btrfs_read_locked_inode(), this means the inode item was not found. */ if (ret > 0) ret = -ENOENT; if (ret < 0) goto error; ret = btrfs_add_inode_to_root(BTRFS_I(inode), true); if (ret < 0) goto error; unlock_new_inode(inode); return inode; error: iget_failed(inode); return ERR_PTR(ret); } struct inode *btrfs_iget(u64 ino, struct btrfs_root *root) { return btrfs_iget_path(ino, root, NULL); } static struct inode *new_simple_dir(struct inode *dir, struct btrfs_key *key, struct btrfs_root *root) { struct timespec64 ts; struct inode *inode = new_inode(dir->i_sb); if (!inode) return ERR_PTR(-ENOMEM); BTRFS_I(inode)->root = btrfs_grab_root(root); BTRFS_I(inode)->ref_root_id = key->objectid; set_bit(BTRFS_INODE_ROOT_STUB, &BTRFS_I(inode)->runtime_flags); set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags); btrfs_set_inode_number(BTRFS_I(inode), BTRFS_EMPTY_SUBVOL_DIR_OBJECTID); /* * We only need lookup, the rest is read-only and there's no inode * associated with the dentry */ inode->i_op = &simple_dir_inode_operations; inode->i_opflags &= ~IOP_XATTR; inode->i_fop = &simple_dir_operations; inode->i_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO; ts = inode_set_ctime_current(inode); inode_set_mtime_to_ts(inode, ts); inode_set_atime_to_ts(inode, inode_get_atime(dir)); BTRFS_I(inode)->i_otime_sec = ts.tv_sec; BTRFS_I(inode)->i_otime_nsec = ts.tv_nsec; inode->i_uid = dir->i_uid; inode->i_gid = dir->i_gid; return inode; } static_assert(BTRFS_FT_UNKNOWN == FT_UNKNOWN); static_assert(BTRFS_FT_REG_FILE == FT_REG_FILE); static_assert(BTRFS_FT_DIR == FT_DIR); static_assert(BTRFS_FT_CHRDEV == FT_CHRDEV); static_assert(BTRFS_FT_BLKDEV == FT_BLKDEV); static_assert(BTRFS_FT_FIFO == FT_FIFO); static_assert(BTRFS_FT_SOCK == FT_SOCK); static_assert(BTRFS_FT_SYMLINK == FT_SYMLINK); static inline u8 btrfs_inode_type(struct inode *inode) { return fs_umode_to_ftype(inode->i_mode); } struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) { struct btrfs_fs_info *fs_info = inode_to_fs_info(dir); struct inode *inode; struct btrfs_root *root = BTRFS_I(dir)->root; struct btrfs_root *sub_root = root; struct btrfs_key location = { 0 }; u8 di_type = 0; int ret = 0; if (dentry->d_name.len > BTRFS_NAME_LEN) return ERR_PTR(-ENAMETOOLONG); ret = btrfs_inode_by_name(BTRFS_I(dir), dentry, &location, &di_type); if (ret < 0) return ERR_PTR(ret); if (location.type == BTRFS_INODE_ITEM_KEY) { inode = btrfs_iget(location.objectid, root); if (IS_ERR(inode)) return inode; /* Do extra check against inode mode with di_type */ if (btrfs_inode_type(inode) != di_type) { btrfs_crit(fs_info, "inode mode mismatch with dir: inode mode=0%o btrfs type=%u dir type=%u", inode->i_mode, btrfs_inode_type(inode), di_type); iput(inode); return ERR_PTR(-EUCLEAN); } return inode; } ret = fixup_tree_root_location(fs_info, BTRFS_I(dir), dentry, &location, &sub_root); if (ret < 0) { if (ret != -ENOENT) inode = ERR_PTR(ret); else inode = new_simple_dir(dir, &location, root); } else { inode = btrfs_iget(location.objectid, sub_root); btrfs_put_root(sub_root); if (IS_ERR(inode)) return inode; down_read(&fs_info->cleanup_work_sem); if (!sb_rdonly(inode->i_sb)) ret = btrfs_orphan_cleanup(sub_root); up_read(&fs_info->cleanup_work_sem); if (ret) { iput(inode); inode = ERR_PTR(ret); } } return inode; } static int btrfs_dentry_delete(const struct dentry *dentry) { struct btrfs_root *root; struct inode *inode = d_inode(dentry); if (!inode && !IS_ROOT(dentry)) inode = d_inode(dentry->d_parent); if (inode) { root = BTRFS_I(inode)->root; if (btrfs_root_refs(&root->root_item) == 0) return 1; if (btrfs_ino(BTRFS_I(inode)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) return 1; } return 0; } static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { struct inode *inode = btrfs_lookup_dentry(dir, dentry); if (inode == ERR_PTR(-ENOENT)) inode = NULL; return d_splice_alias(inode, dentry); } /* * Find the highest existing sequence number in a directory and then set the * in-memory index_cnt variable to the first free sequence number. */ static int btrfs_set_inode_index_count(struct btrfs_inode *inode) { struct btrfs_root *root = inode->root; struct btrfs_key key, found_key; struct btrfs_path *path; struct extent_buffer *leaf; int ret; key.objectid = btrfs_ino(inode); key.type = BTRFS_DIR_INDEX_KEY; key.offset = (u64)-1; path = btrfs_alloc_path(); if (!path) return -ENOMEM; ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) goto out; /* FIXME: we should be able to handle this */ if (ret == 0) goto out; ret = 0; if (path->slots[0] == 0) { inode->index_cnt = BTRFS_DIR_START_INDEX; goto out; } path->slots[0]--; leaf = path->nodes[0]; btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); if (found_key.objectid != btrfs_ino(inode) || found_key.type != BTRFS_DIR_INDEX_KEY) { inode->index_cnt = BTRFS_DIR_START_INDEX; goto out; } inode->index_cnt = found_key.offset + 1; out: btrfs_free_path(path); return ret; } static int btrfs_get_dir_last_index(struct btrfs_inode *dir, u64 *index) { int ret = 0; btrfs_inode_lock(dir, 0); if (dir->index_cnt == (u64)-1) { ret = btrfs_inode_delayed_dir_index_count(dir); if (ret) { ret = btrfs_set_inode_index_count(dir); if (ret) goto out; } } /* index_cnt is the index number of next new entry, so decrement it. */ *index = dir->index_cnt - 1; out: btrfs_inode_unlock(dir, 0); return ret; } /* * All this infrastructure exists because dir_emit can fault, and we are holding * the tree lock when doing readdir. For now just allocate a buffer and copy * our information into that, and then dir_emit from the buffer. This is * similar to what NFS does, only we don't keep the buffer around in pagecache * because I'm afraid I'll mess that up. Long term we need to make filldir do * copy_to_user_inatomic so we don't have to worry about page faulting under the * tree lock. */ static int btrfs_opendir(struct inode *inode, struct file *file) { struct btrfs_file_private *private; u64 last_index; int ret; ret = btrfs_get_dir_last_index(BTRFS_I(inode), &last_index); if (ret) return ret; private = kzalloc(sizeof(struct btrfs_file_private), GFP_KERNEL); if (!private) return -ENOMEM; private->last_index = last_index; private->filldir_buf = kzalloc(PAGE_SIZE, GFP_KERNEL); if (!private->filldir_buf) { kfree(private); return -ENOMEM; } file->private_data = private; return 0; } static loff_t btrfs_dir_llseek(struct file *file, loff_t offset, int whence) { struct btrfs_file_private *private = file->private_data; int ret; ret = btrfs_get_dir_last_index(BTRFS_I(file_inode(file)), &private->last_index); if (ret) return ret; return generic_file_llseek(file, offset, whence); } struct dir_entry { u64 ino; u64 offset; unsigned type; int name_len; }; static int btrfs_filldir(void *addr, int entries, struct dir_context *ctx) { while (entries--) { struct dir_entry *entry = addr; char *name = (char *)(entry + 1); ctx->pos = get_unaligned(&entry->offset); if (!dir_emit(ctx, name, get_unaligned(&entry->name_len), get_unaligned(&entry->ino), get_unaligned(&entry->type))) return 1; addr += sizeof(struct dir_entry) + get_unaligned(&entry->name_len); ctx->pos++; } return 0; } static int btrfs_real_readdir(struct file *file, struct dir_context *ctx) { struct inode *inode = file_inode(file); struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_file_private *private = file->private_data; struct btrfs_dir_item *di; struct btrfs_key key; struct btrfs_key found_key; struct btrfs_path *path; void *addr; LIST_HEAD(ins_list); LIST_HEAD(del_list); int ret; char *name_ptr; int name_len; int entries = 0; int total_len = 0; bool put = false; struct btrfs_key location; if (!dir_emit_dots(file, ctx)) return 0; path = btrfs_alloc_path(); if (!path) return -ENOMEM; addr = private->filldir_buf; path->reada = READA_FORWARD; put = btrfs_readdir_get_delayed_items(BTRFS_I(inode), private->last_index, &ins_list, &del_list); again: key.type = BTRFS_DIR_INDEX_KEY; key.offset = ctx->pos; key.objectid = btrfs_ino(BTRFS_I(inode)); btrfs_for_each_slot(root, &key, &found_key, path, ret) { struct dir_entry *entry; struct extent_buffer *leaf = path->nodes[0]; u8 ftype; if (found_key.objectid != key.objectid) break; if (found_key.type != BTRFS_DIR_INDEX_KEY) break; if (found_key.offset < ctx->pos) continue; if (found_key.offset > private->last_index) break; if (btrfs_should_delete_dir_index(&del_list, found_key.offset)) continue; di = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dir_item); name_len = btrfs_dir_name_len(leaf, di); if ((total_len + sizeof(struct dir_entry) + name_len) >= PAGE_SIZE) { btrfs_release_path(path); ret = btrfs_filldir(private->filldir_buf, entries, ctx); if (ret) goto nopos; addr = private->filldir_buf; entries = 0; total_len = 0; goto again; } ftype = btrfs_dir_flags_to_ftype(btrfs_dir_flags(leaf, di)); entry = addr; name_ptr = (char *)(entry + 1); read_extent_buffer(leaf, name_ptr, (unsigned long)(di + 1), name_len); put_unaligned(name_len, &entry->name_len); put_unaligned(fs_ftype_to_dtype(ftype), &entry->type); btrfs_dir_item_key_to_cpu(leaf, di, &location); put_unaligned(location.objectid, &entry->ino); put_unaligned(found_key.offset, &entry->offset); entries++; addr += sizeof(struct dir_entry) + name_len; total_len += sizeof(struct dir_entry) + name_len; } /* Catch error encountered during iteration */ if (ret < 0) goto err; btrfs_release_path(path); ret = btrfs_filldir(private->filldir_buf, entries, ctx); if (ret) goto nopos; ret = btrfs_readdir_delayed_dir_index(ctx, &ins_list); if (ret) goto nopos; /* * Stop new entries from being returned after we return the last * entry. * * New directory entries are assigned a strictly increasing * offset. This means that new entries created during readdir * are *guaranteed* to be seen in the future by that readdir. * This has broken buggy programs which operate on names as * they're returned by readdir. Until we re-use freed offsets * we have this hack to stop new entries from being returned * under the assumption that they'll never reach this huge * offset. * * This is being careful not to overflow 32bit loff_t unless the * last entry requires it because doing so has broken 32bit apps * in the past. */ if (ctx->pos >= INT_MAX) ctx->pos = LLONG_MAX; else ctx->pos = INT_MAX; nopos: ret = 0; err: if (put) btrfs_readdir_put_delayed_items(BTRFS_I(inode), &ins_list, &del_list); btrfs_free_path(path); return ret; } /* * This is somewhat expensive, updating the tree every time the * inode changes. But, it is most likely to find the inode in cache. * FIXME, needs more benchmarking...there are no reasons other than performance * to keep or drop this code. */ static int btrfs_dirty_inode(struct btrfs_inode *inode) { struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_trans_handle *trans; int ret; if (test_bit(BTRFS_INODE_DUMMY, &inode->runtime_flags)) return 0; trans = btrfs_join_transaction(root); if (IS_ERR(trans)) return PTR_ERR(trans); ret = btrfs_update_inode(trans, inode); if (ret == -ENOSPC || ret == -EDQUOT) { /* whoops, lets try again with the full transaction */ btrfs_end_transaction(trans); trans = btrfs_start_transaction(root, 1); if (IS_ERR(trans)) return PTR_ERR(trans); ret = btrfs_update_inode(trans, inode); } btrfs_end_transaction(trans); if (inode->delayed_node) btrfs_balance_delayed_items(fs_info); return ret; } /* * This is a copy of file_update_time. We need this so we can return error on * ENOSPC for updating the inode in the case of file write and mmap writes. */ static int btrfs_update_time(struct inode *inode, int flags) { struct btrfs_root *root = BTRFS_I(inode)->root; bool dirty; if (btrfs_root_readonly(root)) return -EROFS; dirty = inode_update_timestamps(inode, flags); return dirty ? btrfs_dirty_inode(BTRFS_I(inode)) : 0; } /* * helper to find a free sequence number in a given directory. This current * code is very simple, later versions will do smarter things in the btree */ int btrfs_set_inode_index(struct btrfs_inode *dir, u64 *index) { int ret = 0; if (dir->index_cnt == (u64)-1) { ret = btrfs_inode_delayed_dir_index_count(dir); if (ret) { ret = btrfs_set_inode_index_count(dir); if (ret) return ret; } } *index = dir->index_cnt; dir->index_cnt++; return ret; } static int btrfs_insert_inode_locked(struct inode *inode) { struct btrfs_iget_args args; args.ino = btrfs_ino(BTRFS_I(inode)); args.root = BTRFS_I(inode)->root; return insert_inode_locked4(inode, btrfs_inode_hash(inode->i_ino, BTRFS_I(inode)->root), btrfs_find_actor, &args); } int btrfs_new_inode_prepare(struct btrfs_new_inode_args *args, unsigned int *trans_num_items) { struct inode *dir = args->dir; struct inode *inode = args->inode; int ret; if (!args->orphan) { ret = fscrypt_setup_filename(dir, &args->dentry->d_name, 0, &args->fname); if (ret) return ret; } ret = posix_acl_create(dir, &inode->i_mode, &args->default_acl, &args->acl); if (ret) { fscrypt_free_filename(&args->fname); return ret; } /* 1 to add inode item */ *trans_num_items = 1; /* 1 to add compression property */ if (BTRFS_I(dir)->prop_compress) (*trans_num_items)++; /* 1 to add default ACL xattr */ if (args->default_acl) (*trans_num_items)++; /* 1 to add access ACL xattr */ if (args->acl) (*trans_num_items)++; #ifdef CONFIG_SECURITY /* 1 to add LSM xattr */ if (dir->i_security) (*trans_num_items)++; #endif if (args->orphan) { /* 1 to add orphan item */ (*trans_num_items)++; } else { /* * 1 to add dir item * 1 to add dir index * 1 to update parent inode item * * No need for 1 unit for the inode ref item because it is * inserted in a batch together with the inode item at * btrfs_create_new_inode(). */ *trans_num_items += 3; } return 0; } void btrfs_new_inode_args_destroy(struct btrfs_new_inode_args *args) { posix_acl_release(args->acl); posix_acl_release(args->default_acl); fscrypt_free_filename(&args->fname); } /* * Inherit flags from the parent inode. * * Currently only the compression flags and the cow flags are inherited. */ static void btrfs_inherit_iflags(struct btrfs_inode *inode, struct btrfs_inode *dir) { unsigned int flags; flags = dir->flags; if (flags & BTRFS_INODE_NOCOMPRESS) { inode->flags &= ~BTRFS_INODE_COMPRESS; inode->flags |= BTRFS_INODE_NOCOMPRESS; } else if (flags & BTRFS_INODE_COMPRESS) { inode->flags &= ~BTRFS_INODE_NOCOMPRESS; inode->flags |= BTRFS_INODE_COMPRESS; } if (flags & BTRFS_INODE_NODATACOW) { inode->flags |= BTRFS_INODE_NODATACOW; if (S_ISREG(inode->vfs_inode.i_mode)) inode->flags |= BTRFS_INODE_NODATASUM; } btrfs_sync_inode_flags_to_i_flags(&inode->vfs_inode); } int btrfs_create_new_inode(struct btrfs_trans_handle *trans, struct btrfs_new_inode_args *args) { struct timespec64 ts; struct inode *dir = args->dir; struct inode *inode = args->inode; const struct fscrypt_str *name = args->orphan ? NULL : &args->fname.disk_name; struct btrfs_fs_info *fs_info = inode_to_fs_info(dir); struct btrfs_root *root; struct btrfs_inode_item *inode_item; struct btrfs_path *path; u64 objectid; struct btrfs_inode_ref *ref; struct btrfs_key key[2]; u32 sizes[2]; struct btrfs_item_batch batch; unsigned long ptr; int ret; bool xa_reserved = false; path = btrfs_alloc_path(); if (!path) return -ENOMEM; if (!args->subvol) BTRFS_I(inode)->root = btrfs_grab_root(BTRFS_I(dir)->root); root = BTRFS_I(inode)->root; ret = btrfs_init_file_extent_tree(BTRFS_I(inode)); if (ret) goto out; ret = btrfs_get_free_objectid(root, &objectid); if (ret) goto out; btrfs_set_inode_number(BTRFS_I(inode), objectid); ret = xa_reserve(&root->inodes, objectid, GFP_NOFS); if (ret) goto out; xa_reserved = true; if (args->orphan) { /* * O_TMPFILE, set link count to 0, so that after this point, we * fill in an inode item with the correct link count. */ set_nlink(inode, 0); } else { trace_btrfs_inode_request(dir); ret = btrfs_set_inode_index(BTRFS_I(dir), &BTRFS_I(inode)->dir_index); if (ret) goto out; } if (S_ISDIR(inode->i_mode)) BTRFS_I(inode)->index_cnt = BTRFS_DIR_START_INDEX; BTRFS_I(inode)->generation = trans->transid; inode->i_generation = BTRFS_I(inode)->generation; /* * We don't have any capability xattrs set here yet, shortcut any * queries for the xattrs here. If we add them later via the inode * security init path or any other path this flag will be cleared. */ set_bit(BTRFS_INODE_NO_CAP_XATTR, &BTRFS_I(inode)->runtime_flags); /* * Subvolumes don't inherit flags from their parent directory. * Originally this was probably by accident, but we probably can't * change it now without compatibility issues. */ if (!args->subvol) btrfs_inherit_iflags(BTRFS_I(inode), BTRFS_I(dir)); if (S_ISREG(inode->i_mode)) { if (btrfs_test_opt(fs_info, NODATASUM)) BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM; if (btrfs_test_opt(fs_info, NODATACOW)) BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW | BTRFS_INODE_NODATASUM; } ret = btrfs_insert_inode_locked(inode); if (ret < 0) { if (!args->orphan) BTRFS_I(dir)->index_cnt--; goto out; } /* * We could have gotten an inode number from somebody who was fsynced * and then removed in this same transaction, so let's just set full * sync since it will be a full sync anyway and this will blow away the * old info in the log. */ btrfs_set_inode_full_sync(BTRFS_I(inode)); key[0].objectid = objectid; key[0].type = BTRFS_INODE_ITEM_KEY; key[0].offset = 0; sizes[0] = sizeof(struct btrfs_inode_item); if (!args->orphan) { /* * Start new inodes with an inode_ref. This is slightly more * efficient for small numbers of hard links since they will * be packed into one item. Extended refs will kick in if we * add more hard links than can fit in the ref item. */ key[1].objectid = objectid; key[1].type = BTRFS_INODE_REF_KEY; if (args->subvol) { key[1].offset = objectid; sizes[1] = 2 + sizeof(*ref); } else { key[1].offset = btrfs_ino(BTRFS_I(dir)); sizes[1] = name->len + sizeof(*ref); } } batch.keys = &key[0]; batch.data_sizes = &sizes[0]; batch.total_data_size = sizes[0] + (args->orphan ? 0 : sizes[1]); batch.nr = args->orphan ? 1 : 2; ret = btrfs_insert_empty_items(trans, root, path, &batch); if (ret != 0) { btrfs_abort_transaction(trans, ret); goto discard; } ts = simple_inode_init_ts(inode); BTRFS_I(inode)->i_otime_sec = ts.tv_sec; BTRFS_I(inode)->i_otime_nsec = ts.tv_nsec; /* * We're going to fill the inode item now, so at this point the inode * must be fully initialized. */ inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_inode_item); memzero_extent_buffer(path->nodes[0], (unsigned long)inode_item, sizeof(*inode_item)); fill_inode_item(trans, path->nodes[0], inode_item, inode); if (!args->orphan) { ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1, struct btrfs_inode_ref); ptr = (unsigned long)(ref + 1); if (args->subvol) { btrfs_set_inode_ref_name_len(path->nodes[0], ref, 2); btrfs_set_inode_ref_index(path->nodes[0], ref, 0); write_extent_buffer(path->nodes[0], "..", ptr, 2); } else { btrfs_set_inode_ref_name_len(path->nodes[0], ref, name->len); btrfs_set_inode_ref_index(path->nodes[0], ref, BTRFS_I(inode)->dir_index); write_extent_buffer(path->nodes[0], name->name, ptr, name->len); } } btrfs_mark_buffer_dirty(trans, path->nodes[0]); /* * We don't need the path anymore, plus inheriting properties, adding * ACLs, security xattrs, orphan item or adding the link, will result in * allocating yet another path. So just free our path. */ btrfs_free_path(path); path = NULL; if (args->subvol) { struct inode *parent; /* * Subvolumes inherit properties from their parent subvolume, * not the directory they were created in. */ parent = btrfs_iget(BTRFS_FIRST_FREE_OBJECTID, BTRFS_I(dir)->root); if (IS_ERR(parent)) { ret = PTR_ERR(parent); } else { ret = btrfs_inode_inherit_props(trans, inode, parent); iput(parent); } } else { ret = btrfs_inode_inherit_props(trans, inode, dir); } if (ret) { btrfs_err(fs_info, "error inheriting props for ino %llu (root %llu): %d", btrfs_ino(BTRFS_I(inode)), btrfs_root_id(root), ret); } /* * Subvolumes don't inherit ACLs or get passed to the LSM. This is * probably a bug. */ if (!args->subvol) { ret = btrfs_init_inode_security(trans, args); if (ret) { btrfs_abort_transaction(trans, ret); goto discard; } } ret = btrfs_add_inode_to_root(BTRFS_I(inode), false); if (WARN_ON(ret)) { /* Shouldn't happen, we used xa_reserve() before. */ btrfs_abort_transaction(trans, ret); goto discard; } trace_btrfs_inode_new(inode); btrfs_set_inode_last_trans(trans, BTRFS_I(inode)); btrfs_update_root_times(trans, root); if (args->orphan) { ret = btrfs_orphan_add(trans, BTRFS_I(inode)); } else { ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), name, 0, BTRFS_I(inode)->dir_index); } if (ret) { btrfs_abort_transaction(trans, ret); goto discard; } return 0; discard: /* * discard_new_inode() calls iput(), but the caller owns the reference * to the inode. */ ihold(inode); discard_new_inode(inode); out: if (xa_reserved) xa_release(&root->inodes, objectid); btrfs_free_path(path); return ret; } /* * utility function to add 'inode' into 'parent_inode' with * a give name and a given sequence number. * if 'add_backref' is true, also insert a backref from the * inode to the parent directory. */ int btrfs_add_link(struct btrfs_trans_handle *trans, struct btrfs_inode *parent_inode, struct btrfs_inode *inode, const struct fscrypt_str *name, int add_backref, u64 index) { int ret = 0; struct btrfs_key key; struct btrfs_root *root = parent_inode->root; u64 ino = btrfs_ino(inode); u64 parent_ino = btrfs_ino(parent_inode); if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) { memcpy(&key, &inode->root->root_key, sizeof(key)); } else { key.objectid = ino; key.type = BTRFS_INODE_ITEM_KEY; key.offset = 0; } if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) { ret = btrfs_add_root_ref(trans, key.objectid, btrfs_root_id(root), parent_ino, index, name); } else if (add_backref) { ret = btrfs_insert_inode_ref(trans, root, name, ino, parent_ino, index); } /* Nothing to clean up yet */ if (ret) return ret; ret = btrfs_insert_dir_item(trans, name, parent_inode, &key, btrfs_inode_type(&inode->vfs_inode), index); if (ret == -EEXIST || ret == -EOVERFLOW) goto fail_dir_item; else if (ret) { btrfs_abort_transaction(trans, ret); return ret; } btrfs_i_size_write(parent_inode, parent_inode->vfs_inode.i_size + name->len * 2); inode_inc_iversion(&parent_inode->vfs_inode); /* * If we are replaying a log tree, we do not want to update the mtime * and ctime of the parent directory with the current time, since the * log replay procedure is responsible for setting them to their correct * values (the ones it had when the fsync was done). */ if (!test_bit(BTRFS_FS_LOG_RECOVERING, &root->fs_info->flags)) inode_set_mtime_to_ts(&parent_inode->vfs_inode, inode_set_ctime_current(&parent_inode->vfs_inode)); ret = btrfs_update_inode(trans, parent_inode); if (ret) btrfs_abort_transaction(trans, ret); return ret; fail_dir_item: if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) { u64 local_index; int err; err = btrfs_del_root_ref(trans, key.objectid, btrfs_root_id(root), parent_ino, &local_index, name); if (err) btrfs_abort_transaction(trans, err); } else if (add_backref) { u64 local_index; int err; err = btrfs_del_inode_ref(trans, root, name, ino, parent_ino, &local_index); if (err) btrfs_abort_transaction(trans, err); } /* Return the original error code */ return ret; } static int btrfs_create_common(struct inode *dir, struct dentry *dentry, struct inode *inode) { struct btrfs_fs_info *fs_info = inode_to_fs_info(dir); struct btrfs_root *root = BTRFS_I(dir)->root; struct btrfs_new_inode_args new_inode_args = { .dir = dir, .dentry = dentry, .inode = inode, }; unsigned int trans_num_items; struct btrfs_trans_handle *trans; int err; err = btrfs_new_inode_prepare(&new_inode_args, &trans_num_items); if (err) goto out_inode; trans = btrfs_start_transaction(root, trans_num_items); if (IS_ERR(trans)) { err = PTR_ERR(trans); goto out_new_inode_args; } err = btrfs_create_new_inode(trans, &new_inode_args); if (!err) d_instantiate_new(dentry, inode); btrfs_end_transaction(trans); btrfs_btree_balance_dirty(fs_info); out_new_inode_args: btrfs_new_inode_args_destroy(&new_inode_args); out_inode: if (err) iput(inode); return err; } static int btrfs_mknod(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev) { struct inode *inode; inode = new_inode(dir->i_sb); if (!inode) return -ENOMEM; inode_init_owner(idmap, inode, dir, mode); inode->i_op = &btrfs_special_inode_operations; init_special_inode(inode, inode->i_mode, rdev); return btrfs_create_common(dir, dentry, inode); } static int btrfs_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { struct inode *inode; inode = new_inode(dir->i_sb); if (!inode) return -ENOMEM; inode_init_owner(idmap, inode, dir, mode); inode->i_fop = &btrfs_file_operations; inode->i_op = &btrfs_file_inode_operations; inode->i_mapping->a_ops = &btrfs_aops; return btrfs_create_common(dir, dentry, inode); } static int btrfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) { struct btrfs_trans_handle *trans = NULL; struct btrfs_root *root = BTRFS_I(dir)->root; struct inode *inode = d_inode(old_dentry); struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct fscrypt_name fname; u64 index; int err; int drop_inode = 0; /* do not allow sys_link's with other subvols of the same device */ if (btrfs_root_id(root) != btrfs_root_id(BTRFS_I(inode)->root)) return -EXDEV; if (inode->i_nlink >= BTRFS_LINK_MAX) return -EMLINK; err = fscrypt_setup_filename(dir, &dentry->d_name, 0, &fname); if (err) goto fail; err = btrfs_set_inode_index(BTRFS_I(dir), &index); if (err) goto fail; /* * 2 items for inode and inode ref * 2 items for dir items * 1 item for parent inode * 1 item for orphan item deletion if O_TMPFILE */ trans = btrfs_start_transaction(root, inode->i_nlink ? 5 : 6); if (IS_ERR(trans)) { err = PTR_ERR(trans); trans = NULL; goto fail; } /* There are several dir indexes for this inode, clear the cache. */ BTRFS_I(inode)->dir_index = 0ULL; inc_nlink(inode); inode_inc_iversion(inode); inode_set_ctime_current(inode); ihold(inode); set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags); err = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), &fname.disk_name, 1, index); if (err) { drop_inode = 1; } else { struct dentry *parent = dentry->d_parent; err = btrfs_update_inode(trans, BTRFS_I(inode)); if (err) goto fail; if (inode->i_nlink == 1) { /* * If new hard link count is 1, it's a file created * with open(2) O_TMPFILE flag. */ err = btrfs_orphan_del(trans, BTRFS_I(inode)); if (err) goto fail; } d_instantiate(dentry, inode); btrfs_log_new_name(trans, old_dentry, NULL, 0, parent); } fail: fscrypt_free_filename(&fname); if (trans) btrfs_end_transaction(trans); if (drop_inode) { inode_dec_link_count(inode); iput(inode); } btrfs_btree_balance_dirty(fs_info); return err; } static int btrfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode) { struct inode *inode; inode = new_inode(dir->i_sb); if (!inode) return -ENOMEM; inode_init_owner(idmap, inode, dir, S_IFDIR | mode); inode->i_op = &btrfs_dir_inode_operations; inode->i_fop = &btrfs_dir_file_operations; return btrfs_create_common(dir, dentry, inode); } static noinline int uncompress_inline(struct btrfs_path *path, struct page *page, struct btrfs_file_extent_item *item) { int ret; struct extent_buffer *leaf = path->nodes[0]; char *tmp; size_t max_size; unsigned long inline_size; unsigned long ptr; int compress_type; compress_type = btrfs_file_extent_compression(leaf, item); max_size = btrfs_file_extent_ram_bytes(leaf, item); inline_size = btrfs_file_extent_inline_item_len(leaf, path->slots[0]); tmp = kmalloc(inline_size, GFP_NOFS); if (!tmp) return -ENOMEM; ptr = btrfs_file_extent_inline_start(item); read_extent_buffer(leaf, tmp, ptr, inline_size); max_size = min_t(unsigned long, PAGE_SIZE, max_size); ret = btrfs_decompress(compress_type, tmp, page, 0, inline_size, max_size); /* * decompression code contains a memset to fill in any space between the end * of the uncompressed data and the end of max_size in case the decompressed * data ends up shorter than ram_bytes. That doesn't cover the hole between * the end of an inline extent and the beginning of the next block, so we * cover that region here. */ if (max_size < PAGE_SIZE) memzero_page(page, max_size, PAGE_SIZE - max_size); kfree(tmp); return ret; } static int read_inline_extent(struct btrfs_inode *inode, struct btrfs_path *path, struct page *page) { struct btrfs_file_extent_item *fi; void *kaddr; size_t copy_size; if (!page || PageUptodate(page)) return 0; ASSERT(page_offset(page) == 0); fi = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_file_extent_item); if (btrfs_file_extent_compression(path->nodes[0], fi) != BTRFS_COMPRESS_NONE) return uncompress_inline(path, page, fi); copy_size = min_t(u64, PAGE_SIZE, btrfs_file_extent_ram_bytes(path->nodes[0], fi)); kaddr = kmap_local_page(page); read_extent_buffer(path->nodes[0], kaddr, btrfs_file_extent_inline_start(fi), copy_size); kunmap_local(kaddr); if (copy_size < PAGE_SIZE) memzero_page(page, copy_size, PAGE_SIZE - copy_size); return 0; } /* * Lookup the first extent overlapping a range in a file. * * @inode: file to search in * @page: page to read extent data into if the extent is inline * @start: file offset * @len: length of range starting at @start * * Return the first &struct extent_map which overlaps the given range, reading * it from the B-tree and caching it if necessary. Note that there may be more * extents which overlap the given range after the returned extent_map. * * If @page is not NULL and the extent is inline, this also reads the extent * data directly into the page and marks the extent up to date in the io_tree. * * Return: ERR_PTR on error, non-NULL extent_map on success. */ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, struct page *page, u64 start, u64 len) { struct btrfs_fs_info *fs_info = inode->root->fs_info; int ret = 0; u64 extent_start = 0; u64 extent_end = 0; u64 objectid = btrfs_ino(inode); int extent_type = -1; struct btrfs_path *path = NULL; struct btrfs_root *root = inode->root; struct btrfs_file_extent_item *item; struct extent_buffer *leaf; struct btrfs_key found_key; struct extent_map *em = NULL; struct extent_map_tree *em_tree = &inode->extent_tree; read_lock(&em_tree->lock); em = lookup_extent_mapping(em_tree, start, len); read_unlock(&em_tree->lock); if (em) { if (em->start > start || em->start + em->len <= start) free_extent_map(em); else if (em->disk_bytenr == EXTENT_MAP_INLINE && page) free_extent_map(em); else goto out; } em = alloc_extent_map(); if (!em) { ret = -ENOMEM; goto out; } em->start = EXTENT_MAP_HOLE; em->disk_bytenr = EXTENT_MAP_HOLE; em->len = (u64)-1; path = btrfs_alloc_path(); if (!path) { ret = -ENOMEM; goto out; } /* Chances are we'll be called again, so go ahead and do readahead */ path->reada = READA_FORWARD; /* * The same explanation in load_free_space_cache applies here as well, * we only read when we're loading the free space cache, and at that * point the commit_root has everything we need. */ if (btrfs_is_free_space_inode(inode)) { path->search_commit_root = 1; path->skip_locking = 1; } ret = btrfs_lookup_file_extent(NULL, root, path, objectid, start, 0); if (ret < 0) { goto out; } else if (ret > 0) { if (path->slots[0] == 0) goto not_found; path->slots[0]--; ret = 0; } leaf = path->nodes[0]; item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); if (found_key.objectid != objectid || found_key.type != BTRFS_EXTENT_DATA_KEY) { /* * If we backup past the first extent we want to move forward * and see if there is an extent in front of us, otherwise we'll * say there is a hole for our whole search range which can * cause problems. */ extent_end = start; goto next; } extent_type = btrfs_file_extent_type(leaf, item); extent_start = found_key.offset; extent_end = btrfs_file_extent_end(path); if (extent_type == BTRFS_FILE_EXTENT_REG || extent_type == BTRFS_FILE_EXTENT_PREALLOC) { /* Only regular file could have regular/prealloc extent */ if (!S_ISREG(inode->vfs_inode.i_mode)) { ret = -EUCLEAN; btrfs_crit(fs_info, "regular/prealloc extent found for non-regular inode %llu", btrfs_ino(inode)); goto out; } trace_btrfs_get_extent_show_fi_regular(inode, leaf, item, extent_start); } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { trace_btrfs_get_extent_show_fi_inline(inode, leaf, item, path->slots[0], extent_start); } next: if (start >= extent_end) { path->slots[0]++; if (path->slots[0] >= btrfs_header_nritems(leaf)) { ret = btrfs_next_leaf(root, path); if (ret < 0) goto out; else if (ret > 0) goto not_found; leaf = path->nodes[0]; } btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); if (found_key.objectid != objectid || found_key.type != BTRFS_EXTENT_DATA_KEY) goto not_found; if (start + len <= found_key.offset) goto not_found; if (start > found_key.offset) goto next; /* New extent overlaps with existing one */ em->start = start; em->len = found_key.offset - start; em->disk_bytenr = EXTENT_MAP_HOLE; goto insert; } btrfs_extent_item_to_extent_map(inode, path, item, em); if (extent_type == BTRFS_FILE_EXTENT_REG || extent_type == BTRFS_FILE_EXTENT_PREALLOC) { goto insert; } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { /* * Inline extent can only exist at file offset 0. This is * ensured by tree-checker and inline extent creation path. * Thus all members representing file offsets should be zero. */ ASSERT(extent_start == 0); ASSERT(em->start == 0); /* * btrfs_extent_item_to_extent_map() should have properly * initialized em members already. * * Other members are not utilized for inline extents. */ ASSERT(em->disk_bytenr == EXTENT_MAP_INLINE); ASSERT(em->len == fs_info->sectorsize); ret = read_inline_extent(inode, path, page); if (ret < 0) goto out; goto insert; } not_found: em->start = start; em->len = len; em->disk_bytenr = EXTENT_MAP_HOLE; insert: ret = 0; btrfs_release_path(path); if (em->start > start || extent_map_end(em) <= start) { btrfs_err(fs_info, "bad extent! em: [%llu %llu] passed [%llu %llu]", em->start, em->len, start, len); ret = -EIO; goto out; } write_lock(&em_tree->lock); ret = btrfs_add_extent_mapping(inode, &em, start, len); write_unlock(&em_tree->lock); out: btrfs_free_path(path); trace_btrfs_get_extent(root, inode, em); if (ret) { free_extent_map(em); return ERR_PTR(ret); } return em; } static bool btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr) { struct btrfs_block_group *block_group; bool readonly = false; block_group = btrfs_lookup_block_group(fs_info, bytenr); if (!block_group || block_group->ro) readonly = true; if (block_group) btrfs_put_block_group(block_group); return readonly; } /* * Check if we can do nocow write into the range [@offset, @offset + @len) * * @offset: File offset * @len: The length to write, will be updated to the nocow writeable * range * @orig_start: (optional) Return the original file offset of the file extent * @orig_len: (optional) Return the original on-disk length of the file extent * @ram_bytes: (optional) Return the ram_bytes of the file extent * @strict: if true, omit optimizations that might force us into unnecessary * cow. e.g., don't trust generation number. * * Return: * >0 and update @len if we can do nocow write * 0 if we can't do nocow write * <0 if error happened * * NOTE: This only checks the file extents, caller is responsible to wait for * any ordered extents. */ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, struct btrfs_file_extent *file_extent, bool nowait, bool strict) { struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct can_nocow_file_extent_args nocow_args = { 0 }; struct btrfs_path *path; int ret; struct extent_buffer *leaf; struct btrfs_root *root = BTRFS_I(inode)->root; struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; struct btrfs_file_extent_item *fi; struct btrfs_key key; int found_type; path = btrfs_alloc_path(); if (!path) return -ENOMEM; path->nowait = nowait; ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(BTRFS_I(inode)), offset, 0); if (ret < 0) goto out; if (ret == 1) { if (path->slots[0] == 0) { /* can't find the item, must cow */ ret = 0; goto out; } path->slots[0]--; } ret = 0; leaf = path->nodes[0]; btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); if (key.objectid != btrfs_ino(BTRFS_I(inode)) || key.type != BTRFS_EXTENT_DATA_KEY) { /* not our file or wrong item type, must cow */ goto out; } if (key.offset > offset) { /* Wrong offset, must cow */ goto out; } if (btrfs_file_extent_end(path) <= offset) goto out; fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); found_type = btrfs_file_extent_type(leaf, fi); nocow_args.start = offset; nocow_args.end = offset + *len - 1; nocow_args.strict = strict; nocow_args.free_path = true; ret = can_nocow_file_extent(path, &key, BTRFS_I(inode), &nocow_args); /* can_nocow_file_extent() has freed the path. */ path = NULL; if (ret != 1) { /* Treat errors as not being able to NOCOW. */ ret = 0; goto out; } ret = 0; if (btrfs_extent_readonly(fs_info, nocow_args.file_extent.disk_bytenr + nocow_args.file_extent.offset)) goto out; if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) && found_type == BTRFS_FILE_EXTENT_PREALLOC) { u64 range_end; range_end = round_up(offset + nocow_args.file_extent.num_bytes, root->fs_info->sectorsize) - 1; ret = test_range_bit_exists(io_tree, offset, range_end, EXTENT_DELALLOC); if (ret) { ret = -EAGAIN; goto out; } } if (file_extent) memcpy(file_extent, &nocow_args.file_extent, sizeof(*file_extent)); *len = nocow_args.file_extent.num_bytes; ret = 1; out: btrfs_free_path(path); return ret; } /* The callers of this must take lock_extent() */ struct extent_map *btrfs_create_io_em(struct btrfs_inode *inode, u64 start, const struct btrfs_file_extent *file_extent, int type) { struct extent_map *em; int ret; /* * Note the missing NOCOW type. * * For pure NOCOW writes, we should not create an io extent map, but * just reusing the existing one. * Only PREALLOC writes (NOCOW write into preallocated range) can * create an io extent map. */ ASSERT(type == BTRFS_ORDERED_PREALLOC || type == BTRFS_ORDERED_COMPRESSED || type == BTRFS_ORDERED_REGULAR); switch (type) { case BTRFS_ORDERED_PREALLOC: /* We're only referring part of a larger preallocated extent. */ ASSERT(file_extent->num_bytes <= file_extent->ram_bytes); break; case BTRFS_ORDERED_REGULAR: /* COW results a new extent matching our file extent size. */ ASSERT(file_extent->disk_num_bytes == file_extent->num_bytes); ASSERT(file_extent->ram_bytes == file_extent->num_bytes); /* Since it's a new extent, we should not have any offset. */ ASSERT(file_extent->offset == 0); break; case BTRFS_ORDERED_COMPRESSED: /* Must be compressed. */ ASSERT(file_extent->compression != BTRFS_COMPRESS_NONE); /* * Encoded write can make us to refer to part of the * uncompressed extent. */ ASSERT(file_extent->num_bytes <= file_extent->ram_bytes); break; } em = alloc_extent_map(); if (!em) return ERR_PTR(-ENOMEM); em->start = start; em->len = file_extent->num_bytes; em->disk_bytenr = file_extent->disk_bytenr; em->disk_num_bytes = file_extent->disk_num_bytes; em->ram_bytes = file_extent->ram_bytes; em->generation = -1; em->offset = file_extent->offset; em->flags |= EXTENT_FLAG_PINNED; if (type == BTRFS_ORDERED_COMPRESSED) extent_map_set_compression(em, file_extent->compression); ret = btrfs_replace_extent_map_range(inode, em, true); if (ret) { free_extent_map(em); return ERR_PTR(ret); } /* em got 2 refs now, callers needs to do free_extent_map once. */ return em; } /* * For release_folio() and invalidate_folio() we have a race window where * folio_end_writeback() is called but the subpage spinlock is not yet released. * If we continue to release/invalidate the page, we could cause use-after-free * for subpage spinlock. So this function is to spin and wait for subpage * spinlock. */ static void wait_subpage_spinlock(struct page *page) { struct btrfs_fs_info *fs_info = page_to_fs_info(page); struct folio *folio = page_folio(page); struct btrfs_subpage *subpage; if (!btrfs_is_subpage(fs_info, page->mapping)) return; ASSERT(folio_test_private(folio) && folio_get_private(folio)); subpage = folio_get_private(folio); /* * This may look insane as we just acquire the spinlock and release it, * without doing anything. But we just want to make sure no one is * still holding the subpage spinlock. * And since the page is not dirty nor writeback, and we have page * locked, the only possible way to hold a spinlock is from the endio * function to clear page writeback. * * Here we just acquire the spinlock so that all existing callers * should exit and we're safe to release/invalidate the page. */ spin_lock_irq(&subpage->lock); spin_unlock_irq(&subpage->lock); } static int btrfs_launder_folio(struct folio *folio) { return btrfs_qgroup_free_data(folio_to_inode(folio), NULL, folio_pos(folio), PAGE_SIZE, NULL); } static bool __btrfs_release_folio(struct folio *folio, gfp_t gfp_flags) { if (try_release_extent_mapping(&folio->page, gfp_flags)) { wait_subpage_spinlock(&folio->page); clear_page_extent_mapped(&folio->page); return true; } return false; } static bool btrfs_release_folio(struct folio *folio, gfp_t gfp_flags) { if (folio_test_writeback(folio) || folio_test_dirty(folio)) return false; return __btrfs_release_folio(folio, gfp_flags); } #ifdef CONFIG_MIGRATION static int btrfs_migrate_folio(struct address_space *mapping, struct folio *dst, struct folio *src, enum migrate_mode mode) { int ret = filemap_migrate_folio(mapping, dst, src, mode); if (ret != MIGRATEPAGE_SUCCESS) return ret; if (folio_test_ordered(src)) { folio_clear_ordered(src); folio_set_ordered(dst); } return MIGRATEPAGE_SUCCESS; } #else #define btrfs_migrate_folio NULL #endif static void btrfs_invalidate_folio(struct folio *folio, size_t offset, size_t length) { struct btrfs_inode *inode = folio_to_inode(folio); struct btrfs_fs_info *fs_info = inode->root->fs_info; struct extent_io_tree *tree = &inode->io_tree; struct extent_state *cached_state = NULL; u64 page_start = folio_pos(folio); u64 page_end = page_start + folio_size(folio) - 1; u64 cur; int inode_evicting = inode->vfs_inode.i_state & I_FREEING; /* * We have folio locked so no new ordered extent can be created on this * page, nor bio can be submitted for this folio. * * But already submitted bio can still be finished on this folio. * Furthermore, endio function won't skip folio which has Ordered * (Private2) already cleared, so it's possible for endio and * invalidate_folio to do the same ordered extent accounting twice * on one folio. * * So here we wait for any submitted bios to finish, so that we won't * do double ordered extent accounting on the same folio. */ folio_wait_writeback(folio); wait_subpage_spinlock(&folio->page); /* * For subpage case, we have call sites like * btrfs_punch_hole_lock_range() which passes range not aligned to * sectorsize. * If the range doesn't cover the full folio, we don't need to and * shouldn't clear page extent mapped, as folio->private can still * record subpage dirty bits for other part of the range. * * For cases that invalidate the full folio even the range doesn't * cover the full folio, like invalidating the last folio, we're * still safe to wait for ordered extent to finish. */ if (!(offset == 0 && length == folio_size(folio))) { btrfs_release_folio(folio, GFP_NOFS); return; } if (!inode_evicting) lock_extent(tree, page_start, page_end, &cached_state); cur = page_start; while (cur < page_end) { struct btrfs_ordered_extent *ordered; u64 range_end; u32 range_len; u32 extra_flags = 0; ordered = btrfs_lookup_first_ordered_range(inode, cur, page_end + 1 - cur); if (!ordered) { range_end = page_end; /* * No ordered extent covering this range, we are safe * to delete all extent states in the range. */ extra_flags = EXTENT_CLEAR_ALL_BITS; goto next; } if (ordered->file_offset > cur) { /* * There is a range between [cur, oe->file_offset) not * covered by any ordered extent. * We are safe to delete all extent states, and handle * the ordered extent in the next iteration. */ range_end = ordered->file_offset - 1; extra_flags = EXTENT_CLEAR_ALL_BITS; goto next; } range_end = min(ordered->file_offset + ordered->num_bytes - 1, page_end); ASSERT(range_end + 1 - cur < U32_MAX); range_len = range_end + 1 - cur; if (!btrfs_folio_test_ordered(fs_info, folio, cur, range_len)) { /* * If Ordered (Private2) is cleared, it means endio has * already been executed for the range. * We can't delete the extent states as * btrfs_finish_ordered_io() may still use some of them. */ goto next; } btrfs_folio_clear_ordered(fs_info, folio, cur, range_len); /* * IO on this page will never be started, so we need to account * for any ordered extents now. Don't clear EXTENT_DELALLOC_NEW * here, must leave that up for the ordered extent completion. * * This will also unlock the range for incoming * btrfs_finish_ordered_io(). */ if (!inode_evicting) clear_extent_bit(tree, cur, range_end, EXTENT_DELALLOC | EXTENT_LOCKED | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, &cached_state); spin_lock_irq(&inode->ordered_tree_lock); set_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags); ordered->truncated_len = min(ordered->truncated_len, cur - ordered->file_offset); spin_unlock_irq(&inode->ordered_tree_lock); /* * If the ordered extent has finished, we're safe to delete all * the extent states of the range, otherwise * btrfs_finish_ordered_io() will get executed by endio for * other pages, so we can't delete extent states. */ if (btrfs_dec_test_ordered_pending(inode, &ordered, cur, range_end + 1 - cur)) { btrfs_finish_ordered_io(ordered); /* * The ordered extent has finished, now we're again * safe to delete all extent states of the range. */ extra_flags = EXTENT_CLEAR_ALL_BITS; } next: if (ordered) btrfs_put_ordered_extent(ordered); /* * Qgroup reserved space handler * Sector(s) here will be either: * * 1) Already written to disk or bio already finished * Then its QGROUP_RESERVED bit in io_tree is already cleared. * Qgroup will be handled by its qgroup_record then. * btrfs_qgroup_free_data() call will do nothing here. * * 2) Not written to disk yet * Then btrfs_qgroup_free_data() call will clear the * QGROUP_RESERVED bit of its io_tree, and free the qgroup * reserved data space. * Since the IO will never happen for this page. */ btrfs_qgroup_free_data(inode, NULL, cur, range_end + 1 - cur, NULL); if (!inode_evicting) { clear_extent_bit(tree, cur, range_end, EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_UPTODATE | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG | extra_flags, &cached_state); } cur = range_end + 1; } /* * We have iterated through all ordered extents of the page, the page * should not have Ordered (Private2) anymore, or the above iteration * did something wrong. */ ASSERT(!folio_test_ordered(folio)); btrfs_folio_clear_checked(fs_info, folio, folio_pos(folio), folio_size(folio)); if (!inode_evicting) __btrfs_release_folio(folio, GFP_NOFS); clear_page_extent_mapped(&folio->page); } static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback) { struct btrfs_truncate_control control = { .inode = inode, .ino = btrfs_ino(inode), .min_type = BTRFS_EXTENT_DATA_KEY, .clear_extent_range = true, }; struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_block_rsv *rsv; int ret; struct btrfs_trans_handle *trans; u64 mask = fs_info->sectorsize - 1; const u64 min_size = btrfs_calc_metadata_size(fs_info, 1); if (!skip_writeback) { ret = btrfs_wait_ordered_range(inode, inode->vfs_inode.i_size & (~mask), (u64)-1); if (ret) return ret; } /* * Yes ladies and gentlemen, this is indeed ugly. We have a couple of * things going on here: * * 1) We need to reserve space to update our inode. * * 2) We need to have something to cache all the space that is going to * be free'd up by the truncate operation, but also have some slack * space reserved in case it uses space during the truncate (thank you * very much snapshotting). * * And we need these to be separate. The fact is we can use a lot of * space doing the truncate, and we have no earthly idea how much space * we will use, so we need the truncate reservation to be separate so it * doesn't end up using space reserved for updating the inode. We also * need to be able to stop the transaction and start a new one, which * means we need to be able to update the inode several times, and we * have no idea of knowing how many times that will be, so we can't just * reserve 1 item for the entirety of the operation, so that has to be * done separately as well. * * So that leaves us with * * 1) rsv - for the truncate reservation, which we will steal from the * transaction reservation. * 2) fs_info->trans_block_rsv - this will have 1 items worth left for * updating the inode. */ rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP); if (!rsv) return -ENOMEM; rsv->size = min_size; rsv->failfast = true; /* * 1 for the truncate slack space * 1 for updating the inode. */ trans = btrfs_start_transaction(root, 2); if (IS_ERR(trans)) { ret = PTR_ERR(trans); goto out; } /* Migrate the slack space for the truncate to our reserve */ ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, rsv, min_size, false); /* * We have reserved 2 metadata units when we started the transaction and * min_size matches 1 unit, so this should never fail, but if it does, * it's not critical we just fail truncation. */ if (WARN_ON(ret)) { btrfs_end_transaction(trans); goto out; } trans->block_rsv = rsv; while (1) { struct extent_state *cached_state = NULL; const u64 new_size = inode->vfs_inode.i_size; const u64 lock_start = ALIGN_DOWN(new_size, fs_info->sectorsize); control.new_size = new_size; lock_extent(&inode->io_tree, lock_start, (u64)-1, &cached_state); /* * We want to drop from the next block forward in case this new * size is not block aligned since we will be keeping the last * block of the extent just the way it is. */ btrfs_drop_extent_map_range(inode, ALIGN(new_size, fs_info->sectorsize), (u64)-1, false); ret = btrfs_truncate_inode_items(trans, root, &control); inode_sub_bytes(&inode->vfs_inode, control.sub_bytes); btrfs_inode_safe_disk_i_size_write(inode, control.last_size); unlock_extent(&inode->io_tree, lock_start, (u64)-1, &cached_state); trans->block_rsv = &fs_info->trans_block_rsv; if (ret != -ENOSPC && ret != -EAGAIN) break; ret = btrfs_update_inode(trans, inode); if (ret) break; btrfs_end_transaction(trans); btrfs_btree_balance_dirty(fs_info); trans = btrfs_start_transaction(root, 2); if (IS_ERR(trans)) { ret = PTR_ERR(trans); trans = NULL; break; } btrfs_block_rsv_release(fs_info, rsv, -1, NULL); ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, rsv, min_size, false); /* * We have reserved 2 metadata units when we started the * transaction and min_size matches 1 unit, so this should never * fail, but if it does, it's not critical we just fail truncation. */ if (WARN_ON(ret)) break; trans->block_rsv = rsv; } /* * We can't call btrfs_truncate_block inside a trans handle as we could * deadlock with freeze, if we got BTRFS_NEED_TRUNCATE_BLOCK then we * know we've truncated everything except the last little bit, and can * do btrfs_truncate_block and then update the disk_i_size. */ if (ret == BTRFS_NEED_TRUNCATE_BLOCK) { btrfs_end_transaction(trans); btrfs_btree_balance_dirty(fs_info); ret = btrfs_truncate_block(inode, inode->vfs_inode.i_size, 0, 0); if (ret) goto out; trans = btrfs_start_transaction(root, 1); if (IS_ERR(trans)) { ret = PTR_ERR(trans); goto out; } btrfs_inode_safe_disk_i_size_write(inode, 0); } if (trans) { int ret2; trans->block_rsv = &fs_info->trans_block_rsv; ret2 = btrfs_update_inode(trans, inode); if (ret2 && !ret) ret = ret2; ret2 = btrfs_end_transaction(trans); if (ret2 && !ret) ret = ret2; btrfs_btree_balance_dirty(fs_info); } out: btrfs_free_block_rsv(fs_info, rsv); /* * So if we truncate and then write and fsync we normally would just * write the extents that changed, which is a problem if we need to * first truncate that entire inode. So set this flag so we write out * all of the extents in the inode to the sync log so we're completely * safe. * * If no extents were dropped or trimmed we don't need to force the next * fsync to truncate all the inode's items from the log and re-log them * all. This means the truncate operation did not change the file size, * or changed it to a smaller size but there was only an implicit hole * between the old i_size and the new i_size, and there were no prealloc * extents beyond i_size to drop. */ if (control.extents_found > 0) btrfs_set_inode_full_sync(inode); return ret; } struct inode *btrfs_new_subvol_inode(struct mnt_idmap *idmap, struct inode *dir) { struct inode *inode; inode = new_inode(dir->i_sb); if (inode) { /* * Subvolumes don't inherit the sgid bit or the parent's gid if * the parent's sgid bit is set. This is probably a bug. */ inode_init_owner(idmap, inode, NULL, S_IFDIR | (~current_umask() & S_IRWXUGO)); inode->i_op = &btrfs_dir_inode_operations; inode->i_fop = &btrfs_dir_file_operations; } return inode; } struct inode *btrfs_alloc_inode(struct super_block *sb) { struct btrfs_fs_info *fs_info = btrfs_sb(sb); struct btrfs_inode *ei; struct inode *inode; ei = alloc_inode_sb(sb, btrfs_inode_cachep, GFP_KERNEL); if (!ei) return NULL; ei->root = NULL; ei->generation = 0; ei->last_trans = 0; ei->last_sub_trans = 0; ei->logged_trans = 0; ei->delalloc_bytes = 0; ei->new_delalloc_bytes = 0; ei->defrag_bytes = 0; ei->disk_i_size = 0; ei->flags = 0; ei->ro_flags = 0; /* * ->index_cnt will be properly initialized later when creating a new * inode (btrfs_create_new_inode()) or when reading an existing inode * from disk (btrfs_read_locked_inode()). */ ei->csum_bytes = 0; ei->dir_index = 0; ei->last_unlink_trans = 0; ei->last_reflink_trans = 0; ei->last_log_commit = 0; spin_lock_init(&ei->lock); ei->outstanding_extents = 0; if (sb->s_magic != BTRFS_TEST_MAGIC) btrfs_init_metadata_block_rsv(fs_info, &ei->block_rsv, BTRFS_BLOCK_RSV_DELALLOC); ei->runtime_flags = 0; ei->prop_compress = BTRFS_COMPRESS_NONE; ei->defrag_compress = BTRFS_COMPRESS_NONE; ei->delayed_node = NULL; ei->i_otime_sec = 0; ei->i_otime_nsec = 0; inode = &ei->vfs_inode; extent_map_tree_init(&ei->extent_tree); /* This io tree sets the valid inode. */ extent_io_tree_init(fs_info, &ei->io_tree, IO_TREE_INODE_IO); ei->io_tree.inode = ei; ei->file_extent_tree = NULL; mutex_init(&ei->log_mutex); spin_lock_init(&ei->ordered_tree_lock); ei->ordered_tree = RB_ROOT; ei->ordered_tree_last = NULL; INIT_LIST_HEAD(&ei->delalloc_inodes); INIT_LIST_HEAD(&ei->delayed_iput); init_rwsem(&ei->i_mmap_lock); return inode; } #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS void btrfs_test_destroy_inode(struct inode *inode) { btrfs_drop_extent_map_range(BTRFS_I(inode), 0, (u64)-1, false); kfree(BTRFS_I(inode)->file_extent_tree); kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode)); } #endif void btrfs_free_inode(struct inode *inode) { kfree(BTRFS_I(inode)->file_extent_tree); kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode)); } void btrfs_destroy_inode(struct inode *vfs_inode) { struct btrfs_ordered_extent *ordered; struct btrfs_inode *inode = BTRFS_I(vfs_inode); struct btrfs_root *root = inode->root; bool freespace_inode; WARN_ON(!hlist_empty(&vfs_inode->i_dentry)); WARN_ON(vfs_inode->i_data.nrpages); WARN_ON(inode->block_rsv.reserved); WARN_ON(inode->block_rsv.size); WARN_ON(inode->outstanding_extents); if (!S_ISDIR(vfs_inode->i_mode)) { WARN_ON(inode->delalloc_bytes); WARN_ON(inode->new_delalloc_bytes); WARN_ON(inode->csum_bytes); } if (!root || !btrfs_is_data_reloc_root(root)) WARN_ON(inode->defrag_bytes); /* * This can happen where we create an inode, but somebody else also * created the same inode and we need to destroy the one we already * created. */ if (!root) return; /* * If this is a free space inode do not take the ordered extents lockdep * map. */ freespace_inode = btrfs_is_free_space_inode(inode); while (1) { ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1); if (!ordered) break; else { btrfs_err(root->fs_info, "found ordered extent %llu %llu on inode cleanup", ordered->file_offset, ordered->num_bytes); if (!freespace_inode) btrfs_lockdep_acquire(root->fs_info, btrfs_ordered_extent); btrfs_remove_ordered_extent(inode, ordered); btrfs_put_ordered_extent(ordered); btrfs_put_ordered_extent(ordered); } } btrfs_qgroup_check_reserved_leak(inode); btrfs_del_inode_from_root(inode); btrfs_drop_extent_map_range(inode, 0, (u64)-1, false); btrfs_inode_clear_file_extent_range(inode, 0, (u64)-1); btrfs_put_root(inode->root); } int btrfs_drop_inode(struct inode *inode) { struct btrfs_root *root = BTRFS_I(inode)->root; if (root == NULL) return 1; /* the snap/subvol tree is on deleting */ if (btrfs_root_refs(&root->root_item) == 0) return 1; else return generic_drop_inode(inode); } static void init_once(void *foo) { struct btrfs_inode *ei = foo; inode_init_once(&ei->vfs_inode); } void __cold btrfs_destroy_cachep(void) { /* * Make sure all delayed rcu free inodes are flushed before we * destroy cache. */ rcu_barrier(); kmem_cache_destroy(btrfs_inode_cachep); } int __init btrfs_init_cachep(void) { btrfs_inode_cachep = kmem_cache_create("btrfs_inode", sizeof(struct btrfs_inode), 0, SLAB_RECLAIM_ACCOUNT | SLAB_ACCOUNT, init_once); if (!btrfs_inode_cachep) return -ENOMEM; return 0; } static int btrfs_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int flags) { u64 delalloc_bytes; u64 inode_bytes; struct inode *inode = d_inode(path->dentry); u32 blocksize = btrfs_sb(inode->i_sb)->sectorsize; u32 bi_flags = BTRFS_I(inode)->flags; u32 bi_ro_flags = BTRFS_I(inode)->ro_flags; stat->result_mask |= STATX_BTIME; stat->btime.tv_sec = BTRFS_I(inode)->i_otime_sec; stat->btime.tv_nsec = BTRFS_I(inode)->i_otime_nsec; if (bi_flags & BTRFS_INODE_APPEND) stat->attributes |= STATX_ATTR_APPEND; if (bi_flags & BTRFS_INODE_COMPRESS) stat->attributes |= STATX_ATTR_COMPRESSED; if (bi_flags & BTRFS_INODE_IMMUTABLE) stat->attributes |= STATX_ATTR_IMMUTABLE; if (bi_flags & BTRFS_INODE_NODUMP) stat->attributes |= STATX_ATTR_NODUMP; if (bi_ro_flags & BTRFS_INODE_RO_VERITY) stat->attributes |= STATX_ATTR_VERITY; stat->attributes_mask |= (STATX_ATTR_APPEND | STATX_ATTR_COMPRESSED | STATX_ATTR_IMMUTABLE | STATX_ATTR_NODUMP); generic_fillattr(idmap, request_mask, inode, stat); stat->dev = BTRFS_I(inode)->root->anon_dev; stat->subvol = BTRFS_I(inode)->root->root_key.objectid; stat->result_mask |= STATX_SUBVOL; spin_lock(&BTRFS_I(inode)->lock); delalloc_bytes = BTRFS_I(inode)->new_delalloc_bytes; inode_bytes = inode_get_bytes(inode); spin_unlock(&BTRFS_I(inode)->lock); stat->blocks = (ALIGN(inode_bytes, blocksize) + ALIGN(delalloc_bytes, blocksize)) >> SECTOR_SHIFT; return 0; } static int btrfs_rename_exchange(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry) { struct btrfs_fs_info *fs_info = inode_to_fs_info(old_dir); struct btrfs_trans_handle *trans; unsigned int trans_num_items; struct btrfs_root *root = BTRFS_I(old_dir)->root; struct btrfs_root *dest = BTRFS_I(new_dir)->root; struct inode *new_inode = new_dentry->d_inode; struct inode *old_inode = old_dentry->d_inode; struct btrfs_rename_ctx old_rename_ctx; struct btrfs_rename_ctx new_rename_ctx; u64 old_ino = btrfs_ino(BTRFS_I(old_inode)); u64 new_ino = btrfs_ino(BTRFS_I(new_inode)); u64 old_idx = 0; u64 new_idx = 0; int ret; int ret2; bool need_abort = false; struct fscrypt_name old_fname, new_fname; struct fscrypt_str *old_name, *new_name; /* * For non-subvolumes allow exchange only within one subvolume, in the * same inode namespace. Two subvolumes (represented as directory) can * be exchanged as they're a logical link and have a fixed inode number. */ if (root != dest && (old_ino != BTRFS_FIRST_FREE_OBJECTID || new_ino != BTRFS_FIRST_FREE_OBJECTID)) return -EXDEV; ret = fscrypt_setup_filename(old_dir, &old_dentry->d_name, 0, &old_fname); if (ret) return ret; ret = fscrypt_setup_filename(new_dir, &new_dentry->d_name, 0, &new_fname); if (ret) { fscrypt_free_filename(&old_fname); return ret; } old_name = &old_fname.disk_name; new_name = &new_fname.disk_name; /* close the race window with snapshot create/destroy ioctl */ if (old_ino == BTRFS_FIRST_FREE_OBJECTID || new_ino == BTRFS_FIRST_FREE_OBJECTID) down_read(&fs_info->subvol_sem); /* * For each inode: * 1 to remove old dir item * 1 to remove old dir index * 1 to add new dir item * 1 to add new dir index * 1 to update parent inode * * If the parents are the same, we only need to account for one */ trans_num_items = (old_dir == new_dir ? 9 : 10); if (old_ino == BTRFS_FIRST_FREE_OBJECTID) { /* * 1 to remove old root ref * 1 to remove old root backref * 1 to add new root ref * 1 to add new root backref */ trans_num_items += 4; } else { /* * 1 to update inode item * 1 to remove old inode ref * 1 to add new inode ref */ trans_num_items += 3; } if (new_ino == BTRFS_FIRST_FREE_OBJECTID) trans_num_items += 4; else trans_num_items += 3; trans = btrfs_start_transaction(root, trans_num_items); if (IS_ERR(trans)) { ret = PTR_ERR(trans); goto out_notrans; } if (dest != root) { ret = btrfs_record_root_in_trans(trans, dest); if (ret) goto out_fail; } /* * We need to find a free sequence number both in the source and * in the destination directory for the exchange. */ ret = btrfs_set_inode_index(BTRFS_I(new_dir), &old_idx); if (ret) goto out_fail; ret = btrfs_set_inode_index(BTRFS_I(old_dir), &new_idx); if (ret) goto out_fail; BTRFS_I(old_inode)->dir_index = 0ULL; BTRFS_I(new_inode)->dir_index = 0ULL; /* Reference for the source. */ if (old_ino == BTRFS_FIRST_FREE_OBJECTID) { /* force full log commit if subvolume involved. */ btrfs_set_log_full_commit(trans); } else { ret = btrfs_insert_inode_ref(trans, dest, new_name, old_ino, btrfs_ino(BTRFS_I(new_dir)), old_idx); if (ret) goto out_fail; need_abort = true; } /* And now for the dest. */ if (new_ino == BTRFS_FIRST_FREE_OBJECTID) { /* force full log commit if subvolume involved. */ btrfs_set_log_full_commit(trans); } else { ret = btrfs_insert_inode_ref(trans, root, old_name, new_ino, btrfs_ino(BTRFS_I(old_dir)), new_idx); if (ret) { if (need_abort) btrfs_abort_transaction(trans, ret); goto out_fail; } } /* Update inode version and ctime/mtime. */ inode_inc_iversion(old_dir); inode_inc_iversion(new_dir); inode_inc_iversion(old_inode); inode_inc_iversion(new_inode); simple_rename_timestamp(old_dir, old_dentry, new_dir, new_dentry); if (old_dentry->d_parent != new_dentry->d_parent) { btrfs_record_unlink_dir(trans, BTRFS_I(old_dir), BTRFS_I(old_inode), true); btrfs_record_unlink_dir(trans, BTRFS_I(new_dir), BTRFS_I(new_inode), true); } /* src is a subvolume */ if (old_ino == BTRFS_FIRST_FREE_OBJECTID) { ret = btrfs_unlink_subvol(trans, BTRFS_I(old_dir), old_dentry); } else { /* src is an inode */ ret = __btrfs_unlink_inode(trans, BTRFS_I(old_dir), BTRFS_I(old_dentry->d_inode), old_name, &old_rename_ctx); if (!ret) ret = btrfs_update_inode(trans, BTRFS_I(old_inode)); } if (ret) { btrfs_abort_transaction(trans, ret); goto out_fail; } /* dest is a subvolume */ if (new_ino == BTRFS_FIRST_FREE_OBJECTID) { ret = btrfs_unlink_subvol(trans, BTRFS_I(new_dir), new_dentry); } else { /* dest is an inode */ ret = __btrfs_unlink_inode(trans, BTRFS_I(new_dir), BTRFS_I(new_dentry->d_inode), new_name, &new_rename_ctx); if (!ret) ret = btrfs_update_inode(trans, BTRFS_I(new_inode)); } if (ret) { btrfs_abort_transaction(trans, ret); goto out_fail; } ret = btrfs_add_link(trans, BTRFS_I(new_dir), BTRFS_I(old_inode), new_name, 0, old_idx); if (ret) { btrfs_abort_transaction(trans, ret); goto out_fail; } ret = btrfs_add_link(trans, BTRFS_I(old_dir), BTRFS_I(new_inode), old_name, 0, new_idx); if (ret) { btrfs_abort_transaction(trans, ret); goto out_fail; } if (old_inode->i_nlink == 1) BTRFS_I(old_inode)->dir_index = old_idx; if (new_inode->i_nlink == 1) BTRFS_I(new_inode)->dir_index = new_idx; /* * Now pin the logs of the roots. We do it to ensure that no other task * can sync the logs while we are in progress with the rename, because * that could result in an inconsistency in case any of the inodes that * are part of this rename operation were logged before. */ if (old_ino != BTRFS_FIRST_FREE_OBJECTID) btrfs_pin_log_trans(root); if (new_ino != BTRFS_FIRST_FREE_OBJECTID) btrfs_pin_log_trans(dest); /* Do the log updates for all inodes. */ if (old_ino != BTRFS_FIRST_FREE_OBJECTID) btrfs_log_new_name(trans, old_dentry, BTRFS_I(old_dir), old_rename_ctx.index, new_dentry->d_parent); if (new_ino != BTRFS_FIRST_FREE_OBJECTID) btrfs_log_new_name(trans, new_dentry, BTRFS_I(new_dir), new_rename_ctx.index, old_dentry->d_parent); /* Now unpin the logs. */ if (old_ino != BTRFS_FIRST_FREE_OBJECTID) btrfs_end_log_trans(root); if (new_ino != BTRFS_FIRST_FREE_OBJECTID) btrfs_end_log_trans(dest); out_fail: ret2 = btrfs_end_transaction(trans); ret = ret ? ret : ret2; out_notrans: if (new_ino == BTRFS_FIRST_FREE_OBJECTID || old_ino == BTRFS_FIRST_FREE_OBJECTID) up_read(&fs_info->subvol_sem); fscrypt_free_filename(&new_fname); fscrypt_free_filename(&old_fname); return ret; } static struct inode *new_whiteout_inode(struct mnt_idmap *idmap, struct inode *dir) { struct inode *inode; inode = new_inode(dir->i_sb); if (inode) { inode_init_owner(idmap, inode, dir, S_IFCHR | WHITEOUT_MODE); inode->i_op = &btrfs_special_inode_operations; init_special_inode(inode, inode->i_mode, WHITEOUT_DEV); } return inode; } static int btrfs_rename(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { struct btrfs_fs_info *fs_info = inode_to_fs_info(old_dir); struct btrfs_new_inode_args whiteout_args = { .dir = old_dir, .dentry = old_dentry, }; struct btrfs_trans_handle *trans; unsigned int trans_num_items; struct btrfs_root *root = BTRFS_I(old_dir)->root; struct btrfs_root *dest = BTRFS_I(new_dir)->root; struct inode *new_inode = d_inode(new_dentry); struct inode *old_inode = d_inode(old_dentry); struct btrfs_rename_ctx rename_ctx; u64 index = 0; int ret; int ret2; u64 old_ino = btrfs_ino(BTRFS_I(old_inode)); struct fscrypt_name old_fname, new_fname; if (btrfs_ino(BTRFS_I(new_dir)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) return -EPERM; /* we only allow rename subvolume link between subvolumes */ if (old_ino != BTRFS_FIRST_FREE_OBJECTID && root != dest) return -EXDEV; if (old_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID || (new_inode && btrfs_ino(BTRFS_I(new_inode)) == BTRFS_FIRST_FREE_OBJECTID)) return -ENOTEMPTY; if (S_ISDIR(old_inode->i_mode) && new_inode && new_inode->i_size > BTRFS_EMPTY_DIR_SIZE) return -ENOTEMPTY; ret = fscrypt_setup_filename(old_dir, &old_dentry->d_name, 0, &old_fname); if (ret) return ret; ret = fscrypt_setup_filename(new_dir, &new_dentry->d_name, 0, &new_fname); if (ret) { fscrypt_free_filename(&old_fname); return ret; } /* check for collisions, even if the name isn't there */ ret = btrfs_check_dir_item_collision(dest, new_dir->i_ino, &new_fname.disk_name); if (ret) { if (ret == -EEXIST) { /* we shouldn't get * eexist without a new_inode */ if (WARN_ON(!new_inode)) { goto out_fscrypt_names; } } else { /* maybe -EOVERFLOW */ goto out_fscrypt_names; } } ret = 0; /* * we're using rename to replace one file with another. Start IO on it * now so we don't add too much work to the end of the transaction */ if (new_inode && S_ISREG(old_inode->i_mode) && new_inode->i_size) filemap_flush(old_inode->i_mapping); if (flags & RENAME_WHITEOUT) { whiteout_args.inode = new_whiteout_inode(idmap, old_dir); if (!whiteout_args.inode) { ret = -ENOMEM; goto out_fscrypt_names; } ret = btrfs_new_inode_prepare(&whiteout_args, &trans_num_items); if (ret) goto out_whiteout_inode; } else { /* 1 to update the old parent inode. */ trans_num_items = 1; } if (old_ino == BTRFS_FIRST_FREE_OBJECTID) { /* Close the race window with snapshot create/destroy ioctl */ down_read(&fs_info->subvol_sem); /* * 1 to remove old root ref * 1 to remove old root backref * 1 to add new root ref * 1 to add new root backref */ trans_num_items += 4; } else { /* * 1 to update inode * 1 to remove old inode ref * 1 to add new inode ref */ trans_num_items += 3; } /* * 1 to remove old dir item * 1 to remove old dir index * 1 to add new dir item * 1 to add new dir index */ trans_num_items += 4; /* 1 to update new parent inode if it's not the same as the old parent */ if (new_dir != old_dir) trans_num_items++; if (new_inode) { /* * 1 to update inode * 1 to remove inode ref * 1 to remove dir item * 1 to remove dir index * 1 to possibly add orphan item */ trans_num_items += 5; } trans = btrfs_start_transaction(root, trans_num_items); if (IS_ERR(trans)) { ret = PTR_ERR(trans); goto out_notrans; } if (dest != root) { ret = btrfs_record_root_in_trans(trans, dest); if (ret) goto out_fail; } ret = btrfs_set_inode_index(BTRFS_I(new_dir), &index); if (ret) goto out_fail; BTRFS_I(old_inode)->dir_index = 0ULL; if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) { /* force full log commit if subvolume involved. */ btrfs_set_log_full_commit(trans); } else { ret = btrfs_insert_inode_ref(trans, dest, &new_fname.disk_name, old_ino, btrfs_ino(BTRFS_I(new_dir)), index); if (ret) goto out_fail; } inode_inc_iversion(old_dir); inode_inc_iversion(new_dir); inode_inc_iversion(old_inode); simple_rename_timestamp(old_dir, old_dentry, new_dir, new_dentry); if (old_dentry->d_parent != new_dentry->d_parent) btrfs_record_unlink_dir(trans, BTRFS_I(old_dir), BTRFS_I(old_inode), true); if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) { ret = btrfs_unlink_subvol(trans, BTRFS_I(old_dir), old_dentry); } else { ret = __btrfs_unlink_inode(trans, BTRFS_I(old_dir), BTRFS_I(d_inode(old_dentry)), &old_fname.disk_name, &rename_ctx); if (!ret) ret = btrfs_update_inode(trans, BTRFS_I(old_inode)); } if (ret) { btrfs_abort_transaction(trans, ret); goto out_fail; } if (new_inode) { inode_inc_iversion(new_inode); if (unlikely(btrfs_ino(BTRFS_I(new_inode)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) { ret = btrfs_unlink_subvol(trans, BTRFS_I(new_dir), new_dentry); BUG_ON(new_inode->i_nlink == 0); } else { ret = btrfs_unlink_inode(trans, BTRFS_I(new_dir), BTRFS_I(d_inode(new_dentry)), &new_fname.disk_name); } if (!ret && new_inode->i_nlink == 0) ret = btrfs_orphan_add(trans, BTRFS_I(d_inode(new_dentry))); if (ret) { btrfs_abort_transaction(trans, ret); goto out_fail; } } ret = btrfs_add_link(trans, BTRFS_I(new_dir), BTRFS_I(old_inode), &new_fname.disk_name, 0, index); if (ret) { btrfs_abort_transaction(trans, ret); goto out_fail; } if (old_inode->i_nlink == 1) BTRFS_I(old_inode)->dir_index = index; if (old_ino != BTRFS_FIRST_FREE_OBJECTID) btrfs_log_new_name(trans, old_dentry, BTRFS_I(old_dir), rename_ctx.index, new_dentry->d_parent); if (flags & RENAME_WHITEOUT) { ret = btrfs_create_new_inode(trans, &whiteout_args); if (ret) { btrfs_abort_transaction(trans, ret); goto out_fail; } else { unlock_new_inode(whiteout_args.inode); iput(whiteout_args.inode); whiteout_args.inode = NULL; } } out_fail: ret2 = btrfs_end_transaction(trans); ret = ret ? ret : ret2; out_notrans: if (old_ino == BTRFS_FIRST_FREE_OBJECTID) up_read(&fs_info->subvol_sem); if (flags & RENAME_WHITEOUT) btrfs_new_inode_args_destroy(&whiteout_args); out_whiteout_inode: if (flags & RENAME_WHITEOUT) iput(whiteout_args.inode); out_fscrypt_names: fscrypt_free_filename(&old_fname); fscrypt_free_filename(&new_fname); return ret; } static int btrfs_rename2(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { int ret; if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT)) return -EINVAL; if (flags & RENAME_EXCHANGE) ret = btrfs_rename_exchange(old_dir, old_dentry, new_dir, new_dentry); else ret = btrfs_rename(idmap, old_dir, old_dentry, new_dir, new_dentry, flags); btrfs_btree_balance_dirty(BTRFS_I(new_dir)->root->fs_info); return ret; } struct btrfs_delalloc_work { struct inode *inode; struct completion completion; struct list_head list; struct btrfs_work work; }; static void btrfs_run_delalloc_work(struct btrfs_work *work) { struct btrfs_delalloc_work *delalloc_work; struct inode *inode; delalloc_work = container_of(work, struct btrfs_delalloc_work, work); inode = delalloc_work->inode; filemap_flush(inode->i_mapping); if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, &BTRFS_I(inode)->runtime_flags)) filemap_flush(inode->i_mapping); iput(inode); complete(&delalloc_work->completion); } static struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode) { struct btrfs_delalloc_work *work; work = kmalloc(sizeof(*work), GFP_NOFS); if (!work) return NULL; init_completion(&work->completion); INIT_LIST_HEAD(&work->list); work->inode = inode; btrfs_init_work(&work->work, btrfs_run_delalloc_work, NULL); return work; } /* * some fairly slow code that needs optimization. This walks the list * of all the inodes with pending delalloc and forces them to disk. */ static int start_delalloc_inodes(struct btrfs_root *root, struct writeback_control *wbc, bool snapshot, bool in_reclaim_context) { struct btrfs_inode *binode; struct inode *inode; struct btrfs_delalloc_work *work, *next; LIST_HEAD(works); LIST_HEAD(splice); int ret = 0; bool full_flush = wbc->nr_to_write == LONG_MAX; mutex_lock(&root->delalloc_mutex); spin_lock(&root->delalloc_lock); list_splice_init(&root->delalloc_inodes, &splice); while (!list_empty(&splice)) { binode = list_entry(splice.next, struct btrfs_inode, delalloc_inodes); list_move_tail(&binode->delalloc_inodes, &root->delalloc_inodes); if (in_reclaim_context && test_bit(BTRFS_INODE_NO_DELALLOC_FLUSH, &binode->runtime_flags)) continue; inode = igrab(&binode->vfs_inode); if (!inode) { cond_resched_lock(&root->delalloc_lock); continue; } spin_unlock(&root->delalloc_lock); if (snapshot) set_bit(BTRFS_INODE_SNAPSHOT_FLUSH, &binode->runtime_flags); if (full_flush) { work = btrfs_alloc_delalloc_work(inode); if (!work) { iput(inode); ret = -ENOMEM; goto out; } list_add_tail(&work->list, &works); btrfs_queue_work(root->fs_info->flush_workers, &work->work); } else { ret = filemap_fdatawrite_wbc(inode->i_mapping, wbc); btrfs_add_delayed_iput(BTRFS_I(inode)); if (ret || wbc->nr_to_write <= 0) goto out; } cond_resched(); spin_lock(&root->delalloc_lock); } spin_unlock(&root->delalloc_lock); out: list_for_each_entry_safe(work, next, &works, list) { list_del_init(&work->list); wait_for_completion(&work->completion); kfree(work); } if (!list_empty(&splice)) { spin_lock(&root->delalloc_lock); list_splice_tail(&splice, &root->delalloc_inodes); spin_unlock(&root->delalloc_lock); } mutex_unlock(&root->delalloc_mutex); return ret; } int btrfs_start_delalloc_snapshot(struct btrfs_root *root, bool in_reclaim_context) { struct writeback_control wbc = { .nr_to_write = LONG_MAX, .sync_mode = WB_SYNC_NONE, .range_start = 0, .range_end = LLONG_MAX, }; struct btrfs_fs_info *fs_info = root->fs_info; if (BTRFS_FS_ERROR(fs_info)) return -EROFS; return start_delalloc_inodes(root, &wbc, true, in_reclaim_context); } int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, long nr, bool in_reclaim_context) { struct writeback_control wbc = { .nr_to_write = nr, .sync_mode = WB_SYNC_NONE, .range_start = 0, .range_end = LLONG_MAX, }; struct btrfs_root *root; LIST_HEAD(splice); int ret; if (BTRFS_FS_ERROR(fs_info)) return -EROFS; mutex_lock(&fs_info->delalloc_root_mutex); spin_lock(&fs_info->delalloc_root_lock); list_splice_init(&fs_info->delalloc_roots, &splice); while (!list_empty(&splice)) { /* * Reset nr_to_write here so we know that we're doing a full * flush. */ if (nr == LONG_MAX) wbc.nr_to_write = LONG_MAX; root = list_first_entry(&splice, struct btrfs_root, delalloc_root); root = btrfs_grab_root(root); BUG_ON(!root); list_move_tail(&root->delalloc_root, &fs_info->delalloc_roots); spin_unlock(&fs_info->delalloc_root_lock); ret = start_delalloc_inodes(root, &wbc, false, in_reclaim_context); btrfs_put_root(root); if (ret < 0 || wbc.nr_to_write <= 0) goto out; spin_lock(&fs_info->delalloc_root_lock); } spin_unlock(&fs_info->delalloc_root_lock); ret = 0; out: if (!list_empty(&splice)) { spin_lock(&fs_info->delalloc_root_lock); list_splice_tail(&splice, &fs_info->delalloc_roots); spin_unlock(&fs_info->delalloc_root_lock); } mutex_unlock(&fs_info->delalloc_root_mutex); return ret; } static int btrfs_symlink(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, const char *symname) { struct btrfs_fs_info *fs_info = inode_to_fs_info(dir); struct btrfs_trans_handle *trans; struct btrfs_root *root = BTRFS_I(dir)->root; struct btrfs_path *path; struct btrfs_key key; struct inode *inode; struct btrfs_new_inode_args new_inode_args = { .dir = dir, .dentry = dentry, }; unsigned int trans_num_items; int err; int name_len; int datasize; unsigned long ptr; struct btrfs_file_extent_item *ei; struct extent_buffer *leaf; name_len = strlen(symname); if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(fs_info)) return -ENAMETOOLONG; inode = new_inode(dir->i_sb); if (!inode) return -ENOMEM; inode_init_owner(idmap, inode, dir, S_IFLNK | S_IRWXUGO); inode->i_op = &btrfs_symlink_inode_operations; inode_nohighmem(inode); inode->i_mapping->a_ops = &btrfs_aops; btrfs_i_size_write(BTRFS_I(inode), name_len); inode_set_bytes(inode, name_len); new_inode_args.inode = inode; err = btrfs_new_inode_prepare(&new_inode_args, &trans_num_items); if (err) goto out_inode; /* 1 additional item for the inline extent */ trans_num_items++; trans = btrfs_start_transaction(root, trans_num_items); if (IS_ERR(trans)) { err = PTR_ERR(trans); goto out_new_inode_args; } err = btrfs_create_new_inode(trans, &new_inode_args); if (err) goto out; path = btrfs_alloc_path(); if (!path) { err = -ENOMEM; btrfs_abort_transaction(trans, err); discard_new_inode(inode); inode = NULL; goto out; } key.objectid = btrfs_ino(BTRFS_I(inode)); key.offset = 0; key.type = BTRFS_EXTENT_DATA_KEY; datasize = btrfs_file_extent_calc_inline_size(name_len); err = btrfs_insert_empty_item(trans, root, path, &key, datasize); if (err) { btrfs_abort_transaction(trans, err); btrfs_free_path(path); discard_new_inode(inode); inode = NULL; goto out; } leaf = path->nodes[0]; ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); btrfs_set_file_extent_generation(leaf, ei, trans->transid); btrfs_set_file_extent_type(leaf, ei, BTRFS_FILE_EXTENT_INLINE); btrfs_set_file_extent_encryption(leaf, ei, 0); btrfs_set_file_extent_compression(leaf, ei, 0); btrfs_set_file_extent_other_encoding(leaf, ei, 0); btrfs_set_file_extent_ram_bytes(leaf, ei, name_len); ptr = btrfs_file_extent_inline_start(ei); write_extent_buffer(leaf, symname, ptr, name_len); btrfs_mark_buffer_dirty(trans, leaf); btrfs_free_path(path); d_instantiate_new(dentry, inode); err = 0; out: btrfs_end_transaction(trans); btrfs_btree_balance_dirty(fs_info); out_new_inode_args: btrfs_new_inode_args_destroy(&new_inode_args); out_inode: if (err) iput(inode); return err; } static struct btrfs_trans_handle *insert_prealloc_file_extent( struct btrfs_trans_handle *trans_in, struct btrfs_inode *inode, struct btrfs_key *ins, u64 file_offset) { struct btrfs_file_extent_item stack_fi; struct btrfs_replace_extent_info extent_info; struct btrfs_trans_handle *trans = trans_in; struct btrfs_path *path; u64 start = ins->objectid; u64 len = ins->offset; u64 qgroup_released = 0; int ret; memset(&stack_fi, 0, sizeof(stack_fi)); btrfs_set_stack_file_extent_type(&stack_fi, BTRFS_FILE_EXTENT_PREALLOC); btrfs_set_stack_file_extent_disk_bytenr(&stack_fi, start); btrfs_set_stack_file_extent_disk_num_bytes(&stack_fi, len); btrfs_set_stack_file_extent_num_bytes(&stack_fi, len); btrfs_set_stack_file_extent_ram_bytes(&stack_fi, len); btrfs_set_stack_file_extent_compression(&stack_fi, BTRFS_COMPRESS_NONE); /* Encryption and other encoding is reserved and all 0 */ ret = btrfs_qgroup_release_data(inode, file_offset, len, &qgroup_released); if (ret < 0) return ERR_PTR(ret); if (trans) { ret = insert_reserved_file_extent(trans, inode, file_offset, &stack_fi, true, qgroup_released); if (ret) goto free_qgroup; return trans; } extent_info.disk_offset = start; extent_info.disk_len = len; extent_info.data_offset = 0; extent_info.data_len = len; extent_info.file_offset = file_offset; extent_info.extent_buf = (char *)&stack_fi; extent_info.is_new_extent = true; extent_info.update_times = true; extent_info.qgroup_reserved = qgroup_released; extent_info.insertions = 0; path = btrfs_alloc_path(); if (!path) { ret = -ENOMEM; goto free_qgroup; } ret = btrfs_replace_file_extents(inode, path, file_offset, file_offset + len - 1, &extent_info, &trans); btrfs_free_path(path); if (ret) goto free_qgroup; return trans; free_qgroup: /* * We have released qgroup data range at the beginning of the function, * and normally qgroup_released bytes will be freed when committing * transaction. * But if we error out early, we have to free what we have released * or we leak qgroup data reservation. */ btrfs_qgroup_free_refroot(inode->root->fs_info, btrfs_root_id(inode->root), qgroup_released, BTRFS_QGROUP_RSV_DATA); return ERR_PTR(ret); } static int __btrfs_prealloc_file_range(struct inode *inode, int mode, u64 start, u64 num_bytes, u64 min_size, loff_t actual_len, u64 *alloc_hint, struct btrfs_trans_handle *trans) { struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct extent_map *em; struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_key ins; u64 cur_offset = start; u64 clear_offset = start; u64 i_size; u64 cur_bytes; u64 last_alloc = (u64)-1; int ret = 0; bool own_trans = true; u64 end = start + num_bytes - 1; if (trans) own_trans = false; while (num_bytes > 0) { cur_bytes = min_t(u64, num_bytes, SZ_256M); cur_bytes = max(cur_bytes, min_size); /* * If we are severely fragmented we could end up with really * small allocations, so if the allocator is returning small * chunks lets make its job easier by only searching for those * sized chunks. */ cur_bytes = min(cur_bytes, last_alloc); ret = btrfs_reserve_extent(root, cur_bytes, cur_bytes, min_size, 0, *alloc_hint, &ins, 1, 0); if (ret) break; /* * We've reserved this space, and thus converted it from * ->bytes_may_use to ->bytes_reserved. Any error that happens * from here on out we will only need to clear our reservation * for the remaining unreserved area, so advance our * clear_offset by our extent size. */ clear_offset += ins.offset; last_alloc = ins.offset; trans = insert_prealloc_file_extent(trans, BTRFS_I(inode), &ins, cur_offset); /* * Now that we inserted the prealloc extent we can finally * decrement the number of reservations in the block group. * If we did it before, we could race with relocation and have * relocation miss the reserved extent, making it fail later. */ btrfs_dec_block_group_reservations(fs_info, ins.objectid); if (IS_ERR(trans)) { ret = PTR_ERR(trans); btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 0); break; } em = alloc_extent_map(); if (!em) { btrfs_drop_extent_map_range(BTRFS_I(inode), cur_offset, cur_offset + ins.offset - 1, false); btrfs_set_inode_full_sync(BTRFS_I(inode)); goto next; } em->start = cur_offset; em->len = ins.offset; em->disk_bytenr = ins.objectid; em->offset = 0; em->disk_num_bytes = ins.offset; em->ram_bytes = ins.offset; em->flags |= EXTENT_FLAG_PREALLOC; em->generation = trans->transid; ret = btrfs_replace_extent_map_range(BTRFS_I(inode), em, true); free_extent_map(em); next: num_bytes -= ins.offset; cur_offset += ins.offset; *alloc_hint = ins.objectid + ins.offset; inode_inc_iversion(inode); inode_set_ctime_current(inode); BTRFS_I(inode)->flags |= BTRFS_INODE_PREALLOC; if (!(mode & FALLOC_FL_KEEP_SIZE) && (actual_len > inode->i_size) && (cur_offset > inode->i_size)) { if (cur_offset > actual_len) i_size = actual_len; else i_size = cur_offset; i_size_write(inode, i_size); btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0); } ret = btrfs_update_inode(trans, BTRFS_I(inode)); if (ret) { btrfs_abort_transaction(trans, ret); if (own_trans) btrfs_end_transaction(trans); break; } if (own_trans) { btrfs_end_transaction(trans); trans = NULL; } } if (clear_offset < end) btrfs_free_reserved_data_space(BTRFS_I(inode), NULL, clear_offset, end - clear_offset + 1); return ret; } int btrfs_prealloc_file_range(struct inode *inode, int mode, u64 start, u64 num_bytes, u64 min_size, loff_t actual_len, u64 *alloc_hint) { return __btrfs_prealloc_file_range(inode, mode, start, num_bytes, min_size, actual_len, alloc_hint, NULL); } int btrfs_prealloc_file_range_trans(struct inode *inode, struct btrfs_trans_handle *trans, int mode, u64 start, u64 num_bytes, u64 min_size, loff_t actual_len, u64 *alloc_hint) { return __btrfs_prealloc_file_range(inode, mode, start, num_bytes, min_size, actual_len, alloc_hint, trans); } static int btrfs_permission(struct mnt_idmap *idmap, struct inode *inode, int mask) { struct btrfs_root *root = BTRFS_I(inode)->root; umode_t mode = inode->i_mode; if (mask & MAY_WRITE && (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode))) { if (btrfs_root_readonly(root)) return -EROFS; if (BTRFS_I(inode)->flags & BTRFS_INODE_READONLY) return -EACCES; } return generic_permission(idmap, inode, mask); } static int btrfs_tmpfile(struct mnt_idmap *idmap, struct inode *dir, struct file *file, umode_t mode) { struct btrfs_fs_info *fs_info = inode_to_fs_info(dir); struct btrfs_trans_handle *trans; struct btrfs_root *root = BTRFS_I(dir)->root; struct inode *inode; struct btrfs_new_inode_args new_inode_args = { .dir = dir, .dentry = file->f_path.dentry, .orphan = true, }; unsigned int trans_num_items; int ret; inode = new_inode(dir->i_sb); if (!inode) return -ENOMEM; inode_init_owner(idmap, inode, dir, mode); inode->i_fop = &btrfs_file_operations; inode->i_op = &btrfs_file_inode_operations; inode->i_mapping->a_ops = &btrfs_aops; new_inode_args.inode = inode; ret = btrfs_new_inode_prepare(&new_inode_args, &trans_num_items); if (ret) goto out_inode; trans = btrfs_start_transaction(root, trans_num_items); if (IS_ERR(trans)) { ret = PTR_ERR(trans); goto out_new_inode_args; } ret = btrfs_create_new_inode(trans, &new_inode_args); /* * We set number of links to 0 in btrfs_create_new_inode(), and here we * set it to 1 because d_tmpfile() will issue a warning if the count is * 0, through: * * d_tmpfile() -> inode_dec_link_count() -> drop_nlink() */ set_nlink(inode, 1); if (!ret) { d_tmpfile(file, inode); unlock_new_inode(inode); mark_inode_dirty(inode); } btrfs_end_transaction(trans); btrfs_btree_balance_dirty(fs_info); out_new_inode_args: btrfs_new_inode_args_destroy(&new_inode_args); out_inode: if (ret) iput(inode); return finish_open_simple(file, ret); } void btrfs_set_range_writeback(struct btrfs_inode *inode, u64 start, u64 end) { struct btrfs_fs_info *fs_info = inode->root->fs_info; unsigned long index = start >> PAGE_SHIFT; unsigned long end_index = end >> PAGE_SHIFT; struct page *page; u32 len; ASSERT(end + 1 - start <= U32_MAX); len = end + 1 - start; while (index <= end_index) { page = find_get_page(inode->vfs_inode.i_mapping, index); ASSERT(page); /* Pages should be in the extent_io_tree */ /* This is for data, which doesn't yet support larger folio. */ ASSERT(folio_order(page_folio(page)) == 0); btrfs_folio_set_writeback(fs_info, page_folio(page), start, len); put_page(page); index++; } } int btrfs_encoded_io_compression_from_extent(struct btrfs_fs_info *fs_info, int compress_type) { switch (compress_type) { case BTRFS_COMPRESS_NONE: return BTRFS_ENCODED_IO_COMPRESSION_NONE; case BTRFS_COMPRESS_ZLIB: return BTRFS_ENCODED_IO_COMPRESSION_ZLIB; case BTRFS_COMPRESS_LZO: /* * The LZO format depends on the sector size. 64K is the maximum * sector size that we support. */ if (fs_info->sectorsize < SZ_4K || fs_info->sectorsize > SZ_64K) return -EINVAL; return BTRFS_ENCODED_IO_COMPRESSION_LZO_4K + (fs_info->sectorsize_bits - 12); case BTRFS_COMPRESS_ZSTD: return BTRFS_ENCODED_IO_COMPRESSION_ZSTD; default: return -EUCLEAN; } } static ssize_t btrfs_encoded_read_inline( struct kiocb *iocb, struct iov_iter *iter, u64 start, u64 lockend, struct extent_state **cached_state, u64 extent_start, size_t count, struct btrfs_ioctl_encoded_io_args *encoded, bool *unlocked) { struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp)); struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; struct extent_io_tree *io_tree = &inode->io_tree; struct btrfs_path *path; struct extent_buffer *leaf; struct btrfs_file_extent_item *item; u64 ram_bytes; unsigned long ptr; void *tmp; ssize_t ret; path = btrfs_alloc_path(); if (!path) { ret = -ENOMEM; goto out; } ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(inode), extent_start, 0); if (ret) { if (ret > 0) { /* The extent item disappeared? */ ret = -EIO; } goto out; } leaf = path->nodes[0]; item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); ram_bytes = btrfs_file_extent_ram_bytes(leaf, item); ptr = btrfs_file_extent_inline_start(item); encoded->len = min_t(u64, extent_start + ram_bytes, inode->vfs_inode.i_size) - iocb->ki_pos; ret = btrfs_encoded_io_compression_from_extent(fs_info, btrfs_file_extent_compression(leaf, item)); if (ret < 0) goto out; encoded->compression = ret; if (encoded->compression) { size_t inline_size; inline_size = btrfs_file_extent_inline_item_len(leaf, path->slots[0]); if (inline_size > count) { ret = -ENOBUFS; goto out; } count = inline_size; encoded->unencoded_len = ram_bytes; encoded->unencoded_offset = iocb->ki_pos - extent_start; } else { count = min_t(u64, count, encoded->len); encoded->len = count; encoded->unencoded_len = count; ptr += iocb->ki_pos - extent_start; } tmp = kmalloc(count, GFP_NOFS); if (!tmp) { ret = -ENOMEM; goto out; } read_extent_buffer(leaf, tmp, ptr, count); btrfs_release_path(path); unlock_extent(io_tree, start, lockend, cached_state); btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); *unlocked = true; ret = copy_to_iter(tmp, count, iter); if (ret != count) ret = -EFAULT; kfree(tmp); out: btrfs_free_path(path); return ret; } struct btrfs_encoded_read_private { wait_queue_head_t wait; atomic_t pending; blk_status_t status; }; static void btrfs_encoded_read_endio(struct btrfs_bio *bbio) { struct btrfs_encoded_read_private *priv = bbio->private; if (bbio->bio.bi_status) { /* * The memory barrier implied by the atomic_dec_return() here * pairs with the memory barrier implied by the * atomic_dec_return() or io_wait_event() in * btrfs_encoded_read_regular_fill_pages() to ensure that this * write is observed before the load of status in * btrfs_encoded_read_regular_fill_pages(). */ WRITE_ONCE(priv->status, bbio->bio.bi_status); } if (!atomic_dec_return(&priv->pending)) wake_up(&priv->wait); bio_put(&bbio->bio); } int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, u64 file_offset, u64 disk_bytenr, u64 disk_io_size, struct page **pages) { struct btrfs_fs_info *fs_info = inode->root->fs_info; struct btrfs_encoded_read_private priv = { .pending = ATOMIC_INIT(1), }; unsigned long i = 0; struct btrfs_bio *bbio; init_waitqueue_head(&priv.wait); bbio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, fs_info, btrfs_encoded_read_endio, &priv); bbio->bio.bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT; bbio->inode = inode; do { size_t bytes = min_t(u64, disk_io_size, PAGE_SIZE); if (bio_add_page(&bbio->bio, pages[i], bytes, 0) < bytes) { atomic_inc(&priv.pending); btrfs_submit_bio(bbio, 0); bbio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, fs_info, btrfs_encoded_read_endio, &priv); bbio->bio.bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT; bbio->inode = inode; continue; } i++; disk_bytenr += bytes; disk_io_size -= bytes; } while (disk_io_size); atomic_inc(&priv.pending); btrfs_submit_bio(bbio, 0); if (atomic_dec_return(&priv.pending)) io_wait_event(priv.wait, !atomic_read(&priv.pending)); /* See btrfs_encoded_read_endio() for ordering. */ return blk_status_to_errno(READ_ONCE(priv.status)); } static ssize_t btrfs_encoded_read_regular(struct kiocb *iocb, struct iov_iter *iter, u64 start, u64 lockend, struct extent_state **cached_state, u64 disk_bytenr, u64 disk_io_size, size_t count, bool compressed, bool *unlocked) { struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp)); struct extent_io_tree *io_tree = &inode->io_tree; struct page **pages; unsigned long nr_pages, i; u64 cur; size_t page_offset; ssize_t ret; nr_pages = DIV_ROUND_UP(disk_io_size, PAGE_SIZE); pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS); if (!pages) return -ENOMEM; ret = btrfs_alloc_page_array(nr_pages, pages, false); if (ret) { ret = -ENOMEM; goto out; } ret = btrfs_encoded_read_regular_fill_pages(inode, start, disk_bytenr, disk_io_size, pages); if (ret) goto out; unlock_extent(io_tree, start, lockend, cached_state); btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); *unlocked = true; if (compressed) { i = 0; page_offset = 0; } else { i = (iocb->ki_pos - start) >> PAGE_SHIFT; page_offset = (iocb->ki_pos - start) & (PAGE_SIZE - 1); } cur = 0; while (cur < count) { size_t bytes = min_t(size_t, count - cur, PAGE_SIZE - page_offset); if (copy_page_to_iter(pages[i], page_offset, bytes, iter) != bytes) { ret = -EFAULT; goto out; } i++; cur += bytes; page_offset = 0; } ret = count; out: for (i = 0; i < nr_pages; i++) { if (pages[i]) __free_page(pages[i]); } kfree(pages); return ret; } ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter, struct btrfs_ioctl_encoded_io_args *encoded) { struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp)); struct btrfs_fs_info *fs_info = inode->root->fs_info; struct extent_io_tree *io_tree = &inode->io_tree; ssize_t ret; size_t count = iov_iter_count(iter); u64 start, lockend, disk_bytenr, disk_io_size; struct extent_state *cached_state = NULL; struct extent_map *em; bool unlocked = false; file_accessed(iocb->ki_filp); btrfs_inode_lock(inode, BTRFS_ILOCK_SHARED); if (iocb->ki_pos >= inode->vfs_inode.i_size) { btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); return 0; } start = ALIGN_DOWN(iocb->ki_pos, fs_info->sectorsize); /* * We don't know how long the extent containing iocb->ki_pos is, but if * it's compressed we know that it won't be longer than this. */ lockend = start + BTRFS_MAX_UNCOMPRESSED - 1; for (;;) { struct btrfs_ordered_extent *ordered; ret = btrfs_wait_ordered_range(inode, start, lockend - start + 1); if (ret) goto out_unlock_inode; lock_extent(io_tree, start, lockend, &cached_state); ordered = btrfs_lookup_ordered_range(inode, start, lockend - start + 1); if (!ordered) break; btrfs_put_ordered_extent(ordered); unlock_extent(io_tree, start, lockend, &cached_state); cond_resched(); } em = btrfs_get_extent(inode, NULL, start, lockend - start + 1); if (IS_ERR(em)) { ret = PTR_ERR(em); goto out_unlock_extent; } if (em->disk_bytenr == EXTENT_MAP_INLINE) { u64 extent_start = em->start; /* * For inline extents we get everything we need out of the * extent item. */ free_extent_map(em); em = NULL; ret = btrfs_encoded_read_inline(iocb, iter, start, lockend, &cached_state, extent_start, count, encoded, &unlocked); goto out; } /* * We only want to return up to EOF even if the extent extends beyond * that. */ encoded->len = min_t(u64, extent_map_end(em), inode->vfs_inode.i_size) - iocb->ki_pos; if (em->disk_bytenr == EXTENT_MAP_HOLE || (em->flags & EXTENT_FLAG_PREALLOC)) { disk_bytenr = EXTENT_MAP_HOLE; count = min_t(u64, count, encoded->len); encoded->len = count; encoded->unencoded_len = count; } else if (extent_map_is_compressed(em)) { disk_bytenr = em->disk_bytenr; /* * Bail if the buffer isn't large enough to return the whole * compressed extent. */ if (em->disk_num_bytes > count) { ret = -ENOBUFS; goto out_em; } disk_io_size = em->disk_num_bytes; count = em->disk_num_bytes; encoded->unencoded_len = em->ram_bytes; encoded->unencoded_offset = iocb->ki_pos - (em->start - em->offset); ret = btrfs_encoded_io_compression_from_extent(fs_info, extent_map_compression(em)); if (ret < 0) goto out_em; encoded->compression = ret; } else { disk_bytenr = extent_map_block_start(em) + (start - em->start); if (encoded->len > count) encoded->len = count; /* * Don't read beyond what we locked. This also limits the page * allocations that we'll do. */ disk_io_size = min(lockend + 1, iocb->ki_pos + encoded->len) - start; count = start + disk_io_size - iocb->ki_pos; encoded->len = count; encoded->unencoded_len = count; disk_io_size = ALIGN(disk_io_size, fs_info->sectorsize); } free_extent_map(em); em = NULL; if (disk_bytenr == EXTENT_MAP_HOLE) { unlock_extent(io_tree, start, lockend, &cached_state); btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); unlocked = true; ret = iov_iter_zero(count, iter); if (ret != count) ret = -EFAULT; } else { ret = btrfs_encoded_read_regular(iocb, iter, start, lockend, &cached_state, disk_bytenr, disk_io_size, count, encoded->compression, &unlocked); } out: if (ret >= 0) iocb->ki_pos += encoded->len; out_em: free_extent_map(em); out_unlock_extent: if (!unlocked) unlock_extent(io_tree, start, lockend, &cached_state); out_unlock_inode: if (!unlocked) btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); return ret; } ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from, const struct btrfs_ioctl_encoded_io_args *encoded) { struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp)); struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; struct extent_io_tree *io_tree = &inode->io_tree; struct extent_changeset *data_reserved = NULL; struct extent_state *cached_state = NULL; struct btrfs_ordered_extent *ordered; struct btrfs_file_extent file_extent; int compression; size_t orig_count; u64 start, end; u64 num_bytes, ram_bytes, disk_num_bytes; unsigned long nr_folios, i; struct folio **folios; struct btrfs_key ins; bool extent_reserved = false; struct extent_map *em; ssize_t ret; switch (encoded->compression) { case BTRFS_ENCODED_IO_COMPRESSION_ZLIB: compression = BTRFS_COMPRESS_ZLIB; break; case BTRFS_ENCODED_IO_COMPRESSION_ZSTD: compression = BTRFS_COMPRESS_ZSTD; break; case BTRFS_ENCODED_IO_COMPRESSION_LZO_4K: case BTRFS_ENCODED_IO_COMPRESSION_LZO_8K: case BTRFS_ENCODED_IO_COMPRESSION_LZO_16K: case BTRFS_ENCODED_IO_COMPRESSION_LZO_32K: case BTRFS_ENCODED_IO_COMPRESSION_LZO_64K: /* The sector size must match for LZO. */ if (encoded->compression - BTRFS_ENCODED_IO_COMPRESSION_LZO_4K + 12 != fs_info->sectorsize_bits) return -EINVAL; compression = BTRFS_COMPRESS_LZO; break; default: return -EINVAL; } if (encoded->encryption != BTRFS_ENCODED_IO_ENCRYPTION_NONE) return -EINVAL; /* * Compressed extents should always have checksums, so error out if we * have a NOCOW file or inode was created while mounted with NODATASUM. */ if (inode->flags & BTRFS_INODE_NODATASUM) return -EINVAL; orig_count = iov_iter_count(from); /* The extent size must be sane. */ if (encoded->unencoded_len > BTRFS_MAX_UNCOMPRESSED || orig_count > BTRFS_MAX_COMPRESSED || orig_count == 0) return -EINVAL; /* * The compressed data must be smaller than the decompressed data. * * It's of course possible for data to compress to larger or the same * size, but the buffered I/O path falls back to no compression for such * data, and we don't want to break any assumptions by creating these * extents. * * Note that this is less strict than the current check we have that the * compressed data must be at least one sector smaller than the * decompressed data. We only want to enforce the weaker requirement * from old kernels that it is at least one byte smaller. */ if (orig_count >= encoded->unencoded_len) return -EINVAL; /* The extent must start on a sector boundary. */ start = iocb->ki_pos; if (!IS_ALIGNED(start, fs_info->sectorsize)) return -EINVAL; /* * The extent must end on a sector boundary. However, we allow a write * which ends at or extends i_size to have an unaligned length; we round * up the extent size and set i_size to the unaligned end. */ if (start + encoded->len < inode->vfs_inode.i_size && !IS_ALIGNED(start + encoded->len, fs_info->sectorsize)) return -EINVAL; /* Finally, the offset in the unencoded data must be sector-aligned. */ if (!IS_ALIGNED(encoded->unencoded_offset, fs_info->sectorsize)) return -EINVAL; num_bytes = ALIGN(encoded->len, fs_info->sectorsize); ram_bytes = ALIGN(encoded->unencoded_len, fs_info->sectorsize); end = start + num_bytes - 1; /* * If the extent cannot be inline, the compressed data on disk must be * sector-aligned. For convenience, we extend it with zeroes if it * isn't. */ disk_num_bytes = ALIGN(orig_count, fs_info->sectorsize); nr_folios = DIV_ROUND_UP(disk_num_bytes, PAGE_SIZE); folios = kvcalloc(nr_folios, sizeof(struct page *), GFP_KERNEL_ACCOUNT); if (!folios) return -ENOMEM; for (i = 0; i < nr_folios; i++) { size_t bytes = min_t(size_t, PAGE_SIZE, iov_iter_count(from)); char *kaddr; folios[i] = folio_alloc(GFP_KERNEL_ACCOUNT, 0); if (!folios[i]) { ret = -ENOMEM; goto out_folios; } kaddr = kmap_local_folio(folios[i], 0); if (copy_from_iter(kaddr, bytes, from) != bytes) { kunmap_local(kaddr); ret = -EFAULT; goto out_folios; } if (bytes < PAGE_SIZE) memset(kaddr + bytes, 0, PAGE_SIZE - bytes); kunmap_local(kaddr); } for (;;) { struct btrfs_ordered_extent *ordered; ret = btrfs_wait_ordered_range(inode, start, num_bytes); if (ret) goto out_folios; ret = invalidate_inode_pages2_range(inode->vfs_inode.i_mapping, start >> PAGE_SHIFT, end >> PAGE_SHIFT); if (ret) goto out_folios; lock_extent(io_tree, start, end, &cached_state); ordered = btrfs_lookup_ordered_range(inode, start, num_bytes); if (!ordered && !filemap_range_has_page(inode->vfs_inode.i_mapping, start, end)) break; if (ordered) btrfs_put_ordered_extent(ordered); unlock_extent(io_tree, start, end, &cached_state); cond_resched(); } /* * We don't use the higher-level delalloc space functions because our * num_bytes and disk_num_bytes are different. */ ret = btrfs_alloc_data_chunk_ondemand(inode, disk_num_bytes); if (ret) goto out_unlock; ret = btrfs_qgroup_reserve_data(inode, &data_reserved, start, num_bytes); if (ret) goto out_free_data_space; ret = btrfs_delalloc_reserve_metadata(inode, num_bytes, disk_num_bytes, false); if (ret) goto out_qgroup_free_data; /* Try an inline extent first. */ if (encoded->unencoded_len == encoded->len && encoded->unencoded_offset == 0 && can_cow_file_range_inline(inode, start, encoded->len, orig_count)) { ret = __cow_file_range_inline(inode, start, encoded->len, orig_count, compression, folios[0], true); if (ret <= 0) { if (ret == 0) ret = orig_count; goto out_delalloc_release; } } ret = btrfs_reserve_extent(root, disk_num_bytes, disk_num_bytes, disk_num_bytes, 0, 0, &ins, 1, 1); if (ret) goto out_delalloc_release; extent_reserved = true; file_extent.disk_bytenr = ins.objectid; file_extent.disk_num_bytes = ins.offset; file_extent.num_bytes = num_bytes; file_extent.ram_bytes = ram_bytes; file_extent.offset = encoded->unencoded_offset; file_extent.compression = compression; em = btrfs_create_io_em(inode, start, &file_extent, BTRFS_ORDERED_COMPRESSED); if (IS_ERR(em)) { ret = PTR_ERR(em); goto out_free_reserved; } free_extent_map(em); ordered = btrfs_alloc_ordered_extent(inode, start, &file_extent, (1 << BTRFS_ORDERED_ENCODED) | (1 << BTRFS_ORDERED_COMPRESSED)); if (IS_ERR(ordered)) { btrfs_drop_extent_map_range(inode, start, end, false); ret = PTR_ERR(ordered); goto out_free_reserved; } btrfs_dec_block_group_reservations(fs_info, ins.objectid); if (start + encoded->len > inode->vfs_inode.i_size) i_size_write(&inode->vfs_inode, start + encoded->len); unlock_extent(io_tree, start, end, &cached_state); btrfs_delalloc_release_extents(inode, num_bytes); btrfs_submit_compressed_write(ordered, folios, nr_folios, 0, false); ret = orig_count; goto out; out_free_reserved: btrfs_dec_block_group_reservations(fs_info, ins.objectid); btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1); out_delalloc_release: btrfs_delalloc_release_extents(inode, num_bytes); btrfs_delalloc_release_metadata(inode, disk_num_bytes, ret < 0); out_qgroup_free_data: if (ret < 0) btrfs_qgroup_free_data(inode, data_reserved, start, num_bytes, NULL); out_free_data_space: /* * If btrfs_reserve_extent() succeeded, then we already decremented * bytes_may_use. */ if (!extent_reserved) btrfs_free_reserved_data_space_noquota(fs_info, disk_num_bytes); out_unlock: unlock_extent(io_tree, start, end, &cached_state); out_folios: for (i = 0; i < nr_folios; i++) { if (folios[i]) folio_put(folios[i]); } kvfree(folios); out: if (ret >= 0) iocb->ki_pos += encoded->len; return ret; } #ifdef CONFIG_SWAP /* * Add an entry indicating a block group or device which is pinned by a * swapfile. Returns 0 on success, 1 if there is already an entry for it, or a * negative errno on failure. */ static int btrfs_add_swapfile_pin(struct inode *inode, void *ptr, bool is_block_group) { struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; struct btrfs_swapfile_pin *sp, *entry; struct rb_node **p; struct rb_node *parent = NULL; sp = kmalloc(sizeof(*sp), GFP_NOFS); if (!sp) return -ENOMEM; sp->ptr = ptr; sp->inode = inode; sp->is_block_group = is_block_group; sp->bg_extent_count = 1; spin_lock(&fs_info->swapfile_pins_lock); p = &fs_info->swapfile_pins.rb_node; while (*p) { parent = *p; entry = rb_entry(parent, struct btrfs_swapfile_pin, node); if (sp->ptr < entry->ptr || (sp->ptr == entry->ptr && sp->inode < entry->inode)) { p = &(*p)->rb_left; } else if (sp->ptr > entry->ptr || (sp->ptr == entry->ptr && sp->inode > entry->inode)) { p = &(*p)->rb_right; } else { if (is_block_group) entry->bg_extent_count++; spin_unlock(&fs_info->swapfile_pins_lock); kfree(sp); return 1; } } rb_link_node(&sp->node, parent, p); rb_insert_color(&sp->node, &fs_info->swapfile_pins); spin_unlock(&fs_info->swapfile_pins_lock); return 0; } /* Free all of the entries pinned by this swapfile. */ static void btrfs_free_swapfile_pins(struct inode *inode) { struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; struct btrfs_swapfile_pin *sp; struct rb_node *node, *next; spin_lock(&fs_info->swapfile_pins_lock); node = rb_first(&fs_info->swapfile_pins); while (node) { next = rb_next(node); sp = rb_entry(node, struct btrfs_swapfile_pin, node); if (sp->inode == inode) { rb_erase(&sp->node, &fs_info->swapfile_pins); if (sp->is_block_group) { btrfs_dec_block_group_swap_extents(sp->ptr, sp->bg_extent_count); btrfs_put_block_group(sp->ptr); } kfree(sp); } node = next; } spin_unlock(&fs_info->swapfile_pins_lock); } struct btrfs_swap_info { u64 start; u64 block_start; u64 block_len; u64 lowest_ppage; u64 highest_ppage; unsigned long nr_pages; int nr_extents; }; static int btrfs_add_swap_extent(struct swap_info_struct *sis, struct btrfs_swap_info *bsi) { unsigned long nr_pages; unsigned long max_pages; u64 first_ppage, first_ppage_reported, next_ppage; int ret; /* * Our swapfile may have had its size extended after the swap header was * written. In that case activating the swapfile should not go beyond * the max size set in the swap header. */ if (bsi->nr_pages >= sis->max) return 0; max_pages = sis->max - bsi->nr_pages; first_ppage = PAGE_ALIGN(bsi->block_start) >> PAGE_SHIFT; next_ppage = PAGE_ALIGN_DOWN(bsi->block_start + bsi->block_len) >> PAGE_SHIFT; if (first_ppage >= next_ppage) return 0; nr_pages = next_ppage - first_ppage; nr_pages = min(nr_pages, max_pages); first_ppage_reported = first_ppage; if (bsi->start == 0) first_ppage_reported++; if (bsi->lowest_ppage > first_ppage_reported) bsi->lowest_ppage = first_ppage_reported; if (bsi->highest_ppage < (next_ppage - 1)) bsi->highest_ppage = next_ppage - 1; ret = add_swap_extent(sis, bsi->nr_pages, nr_pages, first_ppage); if (ret < 0) return ret; bsi->nr_extents += ret; bsi->nr_pages += nr_pages; return 0; } static void btrfs_swap_deactivate(struct file *file) { struct inode *inode = file_inode(file); btrfs_free_swapfile_pins(inode); atomic_dec(&BTRFS_I(inode)->root->nr_swapfiles); } static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, sector_t *span) { struct inode *inode = file_inode(file); struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_fs_info *fs_info = root->fs_info; struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; struct extent_state *cached_state = NULL; struct extent_map *em = NULL; struct btrfs_chunk_map *map = NULL; struct btrfs_device *device = NULL; struct btrfs_swap_info bsi = { .lowest_ppage = (sector_t)-1ULL, }; int ret = 0; u64 isize; u64 start; /* * If the swap file was just created, make sure delalloc is done. If the * file changes again after this, the user is doing something stupid and * we don't really care. */ ret = btrfs_wait_ordered_range(BTRFS_I(inode), 0, (u64)-1); if (ret) return ret; /* * The inode is locked, so these flags won't change after we check them. */ if (BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS) { btrfs_warn(fs_info, "swapfile must not be compressed"); return -EINVAL; } if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW)) { btrfs_warn(fs_info, "swapfile must not be copy-on-write"); return -EINVAL; } if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) { btrfs_warn(fs_info, "swapfile must not be checksummed"); return -EINVAL; } /* * Balance or device remove/replace/resize can move stuff around from * under us. The exclop protection makes sure they aren't running/won't * run concurrently while we are mapping the swap extents, and * fs_info->swapfile_pins prevents them from running while the swap * file is active and moving the extents. Note that this also prevents * a concurrent device add which isn't actually necessary, but it's not * really worth the trouble to allow it. */ if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_SWAP_ACTIVATE)) { btrfs_warn(fs_info, "cannot activate swapfile while exclusive operation is running"); return -EBUSY; } /* * Prevent snapshot creation while we are activating the swap file. * We do not want to race with snapshot creation. If snapshot creation * already started before we bumped nr_swapfiles from 0 to 1 and * completes before the first write into the swap file after it is * activated, than that write would fallback to COW. */ if (!btrfs_drew_try_write_lock(&root->snapshot_lock)) { btrfs_exclop_finish(fs_info); btrfs_warn(fs_info, "cannot activate swapfile because snapshot creation is in progress"); return -EINVAL; } /* * Snapshots can create extents which require COW even if NODATACOW is * set. We use this counter to prevent snapshots. We must increment it * before walking the extents because we don't want a concurrent * snapshot to run after we've already checked the extents. * * It is possible that subvolume is marked for deletion but still not * removed yet. To prevent this race, we check the root status before * activating the swapfile. */ spin_lock(&root->root_item_lock); if (btrfs_root_dead(root)) { spin_unlock(&root->root_item_lock); btrfs_exclop_finish(fs_info); btrfs_warn(fs_info, "cannot activate swapfile because subvolume %llu is being deleted", btrfs_root_id(root)); return -EPERM; } atomic_inc(&root->nr_swapfiles); spin_unlock(&root->root_item_lock); isize = ALIGN_DOWN(inode->i_size, fs_info->sectorsize); lock_extent(io_tree, 0, isize - 1, &cached_state); start = 0; while (start < isize) { u64 logical_block_start, physical_block_start; struct btrfs_block_group *bg; u64 len = isize - start; em = btrfs_get_extent(BTRFS_I(inode), NULL, start, len); if (IS_ERR(em)) { ret = PTR_ERR(em); goto out; } if (em->disk_bytenr == EXTENT_MAP_HOLE) { btrfs_warn(fs_info, "swapfile must not have holes"); ret = -EINVAL; goto out; } if (em->disk_bytenr == EXTENT_MAP_INLINE) { /* * It's unlikely we'll ever actually find ourselves * here, as a file small enough to fit inline won't be * big enough to store more than the swap header, but in * case something changes in the future, let's catch it * here rather than later. */ btrfs_warn(fs_info, "swapfile must not be inline"); ret = -EINVAL; goto out; } if (extent_map_is_compressed(em)) { btrfs_warn(fs_info, "swapfile must not be compressed"); ret = -EINVAL; goto out; } logical_block_start = extent_map_block_start(em) + (start - em->start); len = min(len, em->len - (start - em->start)); free_extent_map(em); em = NULL; ret = can_nocow_extent(inode, start, &len, NULL, false, true); if (ret < 0) { goto out; } else if (ret) { ret = 0; } else { btrfs_warn(fs_info, "swapfile must not be copy-on-write"); ret = -EINVAL; goto out; } map = btrfs_get_chunk_map(fs_info, logical_block_start, len); if (IS_ERR(map)) { ret = PTR_ERR(map); goto out; } if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) { btrfs_warn(fs_info, "swapfile must have single data profile"); ret = -EINVAL; goto out; } if (device == NULL) { device = map->stripes[0].dev; ret = btrfs_add_swapfile_pin(inode, device, false); if (ret == 1) ret = 0; else if (ret) goto out; } else if (device != map->stripes[0].dev) { btrfs_warn(fs_info, "swapfile must be on one device"); ret = -EINVAL; goto out; } physical_block_start = (map->stripes[0].physical + (logical_block_start - map->start)); len = min(len, map->chunk_len - (logical_block_start - map->start)); btrfs_free_chunk_map(map); map = NULL; bg = btrfs_lookup_block_group(fs_info, logical_block_start); if (!bg) { btrfs_warn(fs_info, "could not find block group containing swapfile"); ret = -EINVAL; goto out; } if (!btrfs_inc_block_group_swap_extents(bg)) { btrfs_warn(fs_info, "block group for swapfile at %llu is read-only%s", bg->start, atomic_read(&fs_info->scrubs_running) ? " (scrub running)" : ""); btrfs_put_block_group(bg); ret = -EINVAL; goto out; } ret = btrfs_add_swapfile_pin(inode, bg, true); if (ret) { btrfs_put_block_group(bg); if (ret == 1) ret = 0; else goto out; } if (bsi.block_len && bsi.block_start + bsi.block_len == physical_block_start) { bsi.block_len += len; } else { if (bsi.block_len) { ret = btrfs_add_swap_extent(sis, &bsi); if (ret) goto out; } bsi.start = start; bsi.block_start = physical_block_start; bsi.block_len = len; } start += len; } if (bsi.block_len) ret = btrfs_add_swap_extent(sis, &bsi); out: if (!IS_ERR_OR_NULL(em)) free_extent_map(em); if (!IS_ERR_OR_NULL(map)) btrfs_free_chunk_map(map); unlock_extent(io_tree, 0, isize - 1, &cached_state); if (ret) btrfs_swap_deactivate(file); btrfs_drew_write_unlock(&root->snapshot_lock); btrfs_exclop_finish(fs_info); if (ret) return ret; if (device) sis->bdev = device->bdev; *span = bsi.highest_ppage - bsi.lowest_ppage + 1; sis->max = bsi.nr_pages; sis->pages = bsi.nr_pages - 1; sis->highest_bit = bsi.nr_pages - 1; return bsi.nr_extents; } #else static void btrfs_swap_deactivate(struct file *file) { } static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, sector_t *span) { return -EOPNOTSUPP; } #endif /* * Update the number of bytes used in the VFS' inode. When we replace extents in * a range (clone, dedupe, fallocate's zero range), we must update the number of * bytes used by the inode in an atomic manner, so that concurrent stat(2) calls * always get a correct value. */ void btrfs_update_inode_bytes(struct btrfs_inode *inode, const u64 add_bytes, const u64 del_bytes) { if (add_bytes == del_bytes) return; spin_lock(&inode->lock); if (del_bytes > 0) inode_sub_bytes(&inode->vfs_inode, del_bytes); if (add_bytes > 0) inode_add_bytes(&inode->vfs_inode, add_bytes); spin_unlock(&inode->lock); } /* * Verify that there are no ordered extents for a given file range. * * @inode: The target inode. * @start: Start offset of the file range, should be sector size aligned. * @end: End offset (inclusive) of the file range, its value +1 should be * sector size aligned. * * This should typically be used for cases where we locked an inode's VFS lock in * exclusive mode, we have also locked the inode's i_mmap_lock in exclusive mode, * we have flushed all delalloc in the range, we have waited for all ordered * extents in the range to complete and finally we have locked the file range in * the inode's io_tree. */ void btrfs_assert_inode_range_clean(struct btrfs_inode *inode, u64 start, u64 end) { struct btrfs_root *root = inode->root; struct btrfs_ordered_extent *ordered; if (!IS_ENABLED(CONFIG_BTRFS_ASSERT)) return; ordered = btrfs_lookup_first_ordered_range(inode, start, end + 1 - start); if (ordered) { btrfs_err(root->fs_info, "found unexpected ordered extent in file range [%llu, %llu] for inode %llu root %llu (ordered range [%llu, %llu])", start, end, btrfs_ino(inode), btrfs_root_id(root), ordered->file_offset, ordered->file_offset + ordered->num_bytes - 1); btrfs_put_ordered_extent(ordered); } ASSERT(ordered == NULL); } /* * Find the first inode with a minimum number. * * @root: The root to search for. * @min_ino: The minimum inode number. * * Find the first inode in the @root with a number >= @min_ino and return it. * Returns NULL if no such inode found. */ struct btrfs_inode *btrfs_find_first_inode(struct btrfs_root *root, u64 min_ino) { struct btrfs_inode *inode; unsigned long from = min_ino; xa_lock(&root->inodes); while (true) { inode = xa_find(&root->inodes, &from, ULONG_MAX, XA_PRESENT); if (!inode) break; if (igrab(&inode->vfs_inode)) break; from = btrfs_ino(inode) + 1; cond_resched_lock(&root->inodes.xa_lock); } xa_unlock(&root->inodes); return inode; } static const struct inode_operations btrfs_dir_inode_operations = { .getattr = btrfs_getattr, .lookup = btrfs_lookup, .create = btrfs_create, .unlink = btrfs_unlink, .link = btrfs_link, .mkdir = btrfs_mkdir, .rmdir = btrfs_rmdir, .rename = btrfs_rename2, .symlink = btrfs_symlink, .setattr = btrfs_setattr, .mknod = btrfs_mknod, .listxattr = btrfs_listxattr, .permission = btrfs_permission, .get_inode_acl = btrfs_get_acl, .set_acl = btrfs_set_acl, .update_time = btrfs_update_time, .tmpfile = btrfs_tmpfile, .fileattr_get = btrfs_fileattr_get, .fileattr_set = btrfs_fileattr_set, }; static const struct file_operations btrfs_dir_file_operations = { .llseek = btrfs_dir_llseek, .read = generic_read_dir, .iterate_shared = btrfs_real_readdir, .open = btrfs_opendir, .unlocked_ioctl = btrfs_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = btrfs_compat_ioctl, #endif .release = btrfs_release_file, .fsync = btrfs_sync_file, }; /* * btrfs doesn't support the bmap operation because swapfiles * use bmap to make a mapping of extents in the file. They assume * these extents won't change over the life of the file and they * use the bmap result to do IO directly to the drive. * * the btrfs bmap call would return logical addresses that aren't * suitable for IO and they also will change frequently as COW * operations happen. So, swapfile + btrfs == corruption. * * For now we're avoiding this by dropping bmap. */ static const struct address_space_operations btrfs_aops = { .read_folio = btrfs_read_folio, .writepages = btrfs_writepages, .readahead = btrfs_readahead, .invalidate_folio = btrfs_invalidate_folio, .launder_folio = btrfs_launder_folio, .release_folio = btrfs_release_folio, .migrate_folio = btrfs_migrate_folio, .dirty_folio = filemap_dirty_folio, .error_remove_folio = generic_error_remove_folio, .swap_activate = btrfs_swap_activate, .swap_deactivate = btrfs_swap_deactivate, }; static const struct inode_operations btrfs_file_inode_operations = { .getattr = btrfs_getattr, .setattr = btrfs_setattr, .listxattr = btrfs_listxattr, .permission = btrfs_permission, .fiemap = btrfs_fiemap, .get_inode_acl = btrfs_get_acl, .set_acl = btrfs_set_acl, .update_time = btrfs_update_time, .fileattr_get = btrfs_fileattr_get, .fileattr_set = btrfs_fileattr_set, }; static const struct inode_operations btrfs_special_inode_operations = { .getattr = btrfs_getattr, .setattr = btrfs_setattr, .permission = btrfs_permission, .listxattr = btrfs_listxattr, .get_inode_acl = btrfs_get_acl, .set_acl = btrfs_set_acl, .update_time = btrfs_update_time, }; static const struct inode_operations btrfs_symlink_inode_operations = { .get_link = page_get_link, .getattr = btrfs_getattr, .setattr = btrfs_setattr, .permission = btrfs_permission, .listxattr = btrfs_listxattr, .update_time = btrfs_update_time, }; const struct dentry_operations btrfs_dentry_operations = { .d_delete = btrfs_dentry_delete, }; |
151 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 | /* SPDX-License-Identifier: GPL-2.0 */ /* taskstats_kern.h - kernel header for per-task statistics interface * * Copyright (C) Shailabh Nagar, IBM Corp. 2006 * (C) Balbir Singh, IBM Corp. 2006 */ #ifndef _LINUX_TASKSTATS_KERN_H #define _LINUX_TASKSTATS_KERN_H #include <linux/taskstats.h> #include <linux/sched/signal.h> #include <linux/slab.h> #ifdef CONFIG_TASKSTATS extern struct kmem_cache *taskstats_cache; extern struct mutex taskstats_exit_mutex; static inline void taskstats_tgid_free(struct signal_struct *sig) { if (sig->stats) kmem_cache_free(taskstats_cache, sig->stats); } extern void taskstats_exit(struct task_struct *, int group_dead); extern void taskstats_init_early(void); #else static inline void taskstats_exit(struct task_struct *tsk, int group_dead) {} static inline void taskstats_tgid_free(struct signal_struct *sig) {} static inline void taskstats_init_early(void) {} #endif /* CONFIG_TASKSTATS */ #endif |
123 3983 2729 3940 82 6315 3978 360 114 76 76 32 76 76 76 75 1 76 76 71 10 3283 75 3501 3505 3240 4 163 109 268 3267 9 38 10 18 38 38 38 37 351 28 3983 3978 3984 336 34 334 73 4117 3969 360 342 17 362 717 3363 3993 2 3980 3985 3994 3504 1993 2006 6 343 845 1338 16 17 6065 6060 2803 4490 371 4378 23 547 548 8 13 297 4 296 297 297 292 5 5 6 286 292 1 7 13 1 2 11 509 1 301 9 207 207 486 5 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 | // SPDX-License-Identifier: GPL-2.0 /* * Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de> * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar * Copyright(C) 2006-2007 Timesys Corp., Thomas Gleixner * * High-resolution kernel timers * * In contrast to the low-resolution timeout API, aka timer wheel, * hrtimers provide finer resolution and accuracy depending on system * configuration and capabilities. * * Started by: Thomas Gleixner and Ingo Molnar * * Credits: * Based on the original timer wheel code * * Help, testing, suggestions, bugfixes, improvements were * provided by: * * George Anzinger, Andrew Morton, Steven Rostedt, Roman Zippel * et. al. */ #include <linux/cpu.h> #include <linux/export.h> #include <linux/percpu.h> #include <linux/hrtimer.h> #include <linux/notifier.h> #include <linux/syscalls.h> #include <linux/interrupt.h> #include <linux/tick.h> #include <linux/err.h> #include <linux/debugobjects.h> #include <linux/sched/signal.h> #include <linux/sched/sysctl.h> #include <linux/sched/rt.h> #include <linux/sched/deadline.h> #include <linux/sched/nohz.h> #include <linux/sched/debug.h> #include <linux/sched/isolation.h> #include <linux/timer.h> #include <linux/freezer.h> #include <linux/compat.h> #include <linux/uaccess.h> #include <trace/events/timer.h> #include "tick-internal.h" /* * Masks for selecting the soft and hard context timers from * cpu_base->active */ #define MASK_SHIFT (HRTIMER_BASE_MONOTONIC_SOFT) #define HRTIMER_ACTIVE_HARD ((1U << MASK_SHIFT) - 1) #define HRTIMER_ACTIVE_SOFT (HRTIMER_ACTIVE_HARD << MASK_SHIFT) #define HRTIMER_ACTIVE_ALL (HRTIMER_ACTIVE_SOFT | HRTIMER_ACTIVE_HARD) /* * The timer bases: * * There are more clockids than hrtimer bases. Thus, we index * into the timer bases by the hrtimer_base_type enum. When trying * to reach a base using a clockid, hrtimer_clockid_to_base() * is used to convert from clockid to the proper hrtimer_base_type. */ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) = { .lock = __RAW_SPIN_LOCK_UNLOCKED(hrtimer_bases.lock), .clock_base = { { .index = HRTIMER_BASE_MONOTONIC, .clockid = CLOCK_MONOTONIC, .get_time = &ktime_get, }, { .index = HRTIMER_BASE_REALTIME, .clockid = CLOCK_REALTIME, .get_time = &ktime_get_real, }, { .index = HRTIMER_BASE_BOOTTIME, .clockid = CLOCK_BOOTTIME, .get_time = &ktime_get_boottime, }, { .index = HRTIMER_BASE_TAI, .clockid = CLOCK_TAI, .get_time = &ktime_get_clocktai, }, { .index = HRTIMER_BASE_MONOTONIC_SOFT, .clockid = CLOCK_MONOTONIC, .get_time = &ktime_get, }, { .index = HRTIMER_BASE_REALTIME_SOFT, .clockid = CLOCK_REALTIME, .get_time = &ktime_get_real, }, { .index = HRTIMER_BASE_BOOTTIME_SOFT, .clockid = CLOCK_BOOTTIME, .get_time = &ktime_get_boottime, }, { .index = HRTIMER_BASE_TAI_SOFT, .clockid = CLOCK_TAI, .get_time = &ktime_get_clocktai, }, } }; static const int hrtimer_clock_to_base_table[MAX_CLOCKS] = { /* Make sure we catch unsupported clockids */ [0 ... MAX_CLOCKS - 1] = HRTIMER_MAX_CLOCK_BASES, [CLOCK_REALTIME] = HRTIMER_BASE_REALTIME, [CLOCK_MONOTONIC] = HRTIMER_BASE_MONOTONIC, [CLOCK_BOOTTIME] = HRTIMER_BASE_BOOTTIME, [CLOCK_TAI] = HRTIMER_BASE_TAI, }; /* * Functions and macros which are different for UP/SMP systems are kept in a * single place */ #ifdef CONFIG_SMP /* * We require the migration_base for lock_hrtimer_base()/switch_hrtimer_base() * such that hrtimer_callback_running() can unconditionally dereference * timer->base->cpu_base */ static struct hrtimer_cpu_base migration_cpu_base = { .clock_base = { { .cpu_base = &migration_cpu_base, .seq = SEQCNT_RAW_SPINLOCK_ZERO(migration_cpu_base.seq, &migration_cpu_base.lock), }, }, }; #define migration_base migration_cpu_base.clock_base[0] static inline bool is_migration_base(struct hrtimer_clock_base *base) { return base == &migration_base; } /* * We are using hashed locking: holding per_cpu(hrtimer_bases)[n].lock * means that all timers which are tied to this base via timer->base are * locked, and the base itself is locked too. * * So __run_timers/migrate_timers can safely modify all timers which could * be found on the lists/queues. * * When the timer's base is locked, and the timer removed from list, it is * possible to set timer->base = &migration_base and drop the lock: the timer * remains locked. */ static struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags) __acquires(&timer->base->lock) { struct hrtimer_clock_base *base; for (;;) { base = READ_ONCE(timer->base); if (likely(base != &migration_base)) { raw_spin_lock_irqsave(&base->cpu_base->lock, *flags); if (likely(base == timer->base)) return base; /* The timer has migrated to another CPU: */ raw_spin_unlock_irqrestore(&base->cpu_base->lock, *flags); } cpu_relax(); } } /* * We do not migrate the timer when it is expiring before the next * event on the target cpu. When high resolution is enabled, we cannot * reprogram the target cpu hardware and we would cause it to fire * late. To keep it simple, we handle the high resolution enabled and * disabled case similar. * * Called with cpu_base->lock of target cpu held. */ static int hrtimer_check_target(struct hrtimer *timer, struct hrtimer_clock_base *new_base) { ktime_t expires; expires = ktime_sub(hrtimer_get_expires(timer), new_base->offset); return expires < new_base->cpu_base->expires_next; } static inline struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *base, int pinned) { #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) if (static_branch_likely(&timers_migration_enabled) && !pinned) return &per_cpu(hrtimer_bases, get_nohz_timer_target()); #endif return base; } /* * We switch the timer base to a power-optimized selected CPU target, * if: * - NO_HZ_COMMON is enabled * - timer migration is enabled * - the timer callback is not running * - the timer is not the first expiring timer on the new target * * If one of the above requirements is not fulfilled we move the timer * to the current CPU or leave it on the previously assigned CPU if * the timer callback is currently running. */ static inline struct hrtimer_clock_base * switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base, int pinned) { struct hrtimer_cpu_base *new_cpu_base, *this_cpu_base; struct hrtimer_clock_base *new_base; int basenum = base->index; this_cpu_base = this_cpu_ptr(&hrtimer_bases); new_cpu_base = get_target_base(this_cpu_base, pinned); again: new_base = &new_cpu_base->clock_base[basenum]; if (base != new_base) { /* * We are trying to move timer to new_base. * However we can't change timer's base while it is running, * so we keep it on the same CPU. No hassle vs. reprogramming * the event source in the high resolution case. The softirq * code will take care of this when the timer function has * completed. There is no conflict as we hold the lock until * the timer is enqueued. */ if (unlikely(hrtimer_callback_running(timer))) return base; /* See the comment in lock_hrtimer_base() */ WRITE_ONCE(timer->base, &migration_base); raw_spin_unlock(&base->cpu_base->lock); raw_spin_lock(&new_base->cpu_base->lock); if (new_cpu_base != this_cpu_base && hrtimer_check_target(timer, new_base)) { raw_spin_unlock(&new_base->cpu_base->lock); raw_spin_lock(&base->cpu_base->lock); new_cpu_base = this_cpu_base; WRITE_ONCE(timer->base, base); goto again; } WRITE_ONCE(timer->base, new_base); } else { if (new_cpu_base != this_cpu_base && hrtimer_check_target(timer, new_base)) { new_cpu_base = this_cpu_base; goto again; } } return new_base; } #else /* CONFIG_SMP */ static inline bool is_migration_base(struct hrtimer_clock_base *base) { return false; } static inline struct hrtimer_clock_base * lock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags) __acquires(&timer->base->cpu_base->lock) { struct hrtimer_clock_base *base = timer->base; raw_spin_lock_irqsave(&base->cpu_base->lock, *flags); return base; } # define switch_hrtimer_base(t, b, p) (b) #endif /* !CONFIG_SMP */ /* * Functions for the union type storage format of ktime_t which are * too large for inlining: */ #if BITS_PER_LONG < 64 /* * Divide a ktime value by a nanosecond value */ s64 __ktime_divns(const ktime_t kt, s64 div) { int sft = 0; s64 dclc; u64 tmp; dclc = ktime_to_ns(kt); tmp = dclc < 0 ? -dclc : dclc; /* Make sure the divisor is less than 2^32: */ while (div >> 32) { sft++; div >>= 1; } tmp >>= sft; do_div(tmp, (u32) div); return dclc < 0 ? -tmp : tmp; } EXPORT_SYMBOL_GPL(__ktime_divns); #endif /* BITS_PER_LONG >= 64 */ /* * Add two ktime values and do a safety check for overflow: */ ktime_t ktime_add_safe(const ktime_t lhs, const ktime_t rhs) { ktime_t res = ktime_add_unsafe(lhs, rhs); /* * We use KTIME_SEC_MAX here, the maximum timeout which we can * return to user space in a timespec: */ if (res < 0 || res < lhs || res < rhs) res = ktime_set(KTIME_SEC_MAX, 0); return res; } EXPORT_SYMBOL_GPL(ktime_add_safe); #ifdef CONFIG_DEBUG_OBJECTS_TIMERS static const struct debug_obj_descr hrtimer_debug_descr; static void *hrtimer_debug_hint(void *addr) { return ((struct hrtimer *) addr)->function; } /* * fixup_init is called when: * - an active object is initialized */ static bool hrtimer_fixup_init(void *addr, enum debug_obj_state state) { struct hrtimer *timer = addr; switch (state) { case ODEBUG_STATE_ACTIVE: hrtimer_cancel(timer); debug_object_init(timer, &hrtimer_debug_descr); return true; default: return false; } } /* * fixup_activate is called when: * - an active object is activated * - an unknown non-static object is activated */ static bool hrtimer_fixup_activate(void *addr, enum debug_obj_state state) { switch (state) { case ODEBUG_STATE_ACTIVE: WARN_ON(1); fallthrough; default: return false; } } /* * fixup_free is called when: * - an active object is freed */ static bool hrtimer_fixup_free(void *addr, enum debug_obj_state state) { struct hrtimer *timer = addr; switch (state) { case ODEBUG_STATE_ACTIVE: hrtimer_cancel(timer); debug_object_free(timer, &hrtimer_debug_descr); return true; default: return false; } } static const struct debug_obj_descr hrtimer_debug_descr = { .name = "hrtimer", .debug_hint = hrtimer_debug_hint, .fixup_init = hrtimer_fixup_init, .fixup_activate = hrtimer_fixup_activate, .fixup_free = hrtimer_fixup_free, }; static inline void debug_hrtimer_init(struct hrtimer *timer) { debug_object_init(timer, &hrtimer_debug_descr); } static inline void debug_hrtimer_activate(struct hrtimer *timer, enum hrtimer_mode mode) { debug_object_activate(timer, &hrtimer_debug_descr); } static inline void debug_hrtimer_deactivate(struct hrtimer *timer) { debug_object_deactivate(timer, &hrtimer_debug_descr); } static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id, enum hrtimer_mode mode); void hrtimer_init_on_stack(struct hrtimer *timer, clockid_t clock_id, enum hrtimer_mode mode) { debug_object_init_on_stack(timer, &hrtimer_debug_descr); __hrtimer_init(timer, clock_id, mode); } EXPORT_SYMBOL_GPL(hrtimer_init_on_stack); static void __hrtimer_init_sleeper(struct hrtimer_sleeper *sl, clockid_t clock_id, enum hrtimer_mode mode); void hrtimer_init_sleeper_on_stack(struct hrtimer_sleeper *sl, clockid_t clock_id, enum hrtimer_mode mode) { debug_object_init_on_stack(&sl->timer, &hrtimer_debug_descr); __hrtimer_init_sleeper(sl, clock_id, mode); } EXPORT_SYMBOL_GPL(hrtimer_init_sleeper_on_stack); void destroy_hrtimer_on_stack(struct hrtimer *timer) { debug_object_free(timer, &hrtimer_debug_descr); } EXPORT_SYMBOL_GPL(destroy_hrtimer_on_stack); #else static inline void debug_hrtimer_init(struct hrtimer *timer) { } static inline void debug_hrtimer_activate(struct hrtimer *timer, enum hrtimer_mode mode) { } static inline void debug_hrtimer_deactivate(struct hrtimer *timer) { } #endif static inline void debug_init(struct hrtimer *timer, clockid_t clockid, enum hrtimer_mode mode) { debug_hrtimer_init(timer); trace_hrtimer_init(timer, clockid, mode); } static inline void debug_activate(struct hrtimer *timer, enum hrtimer_mode mode) { debug_hrtimer_activate(timer, mode); trace_hrtimer_start(timer, mode); } static inline void debug_deactivate(struct hrtimer *timer) { debug_hrtimer_deactivate(timer); trace_hrtimer_cancel(timer); } static struct hrtimer_clock_base * __next_base(struct hrtimer_cpu_base *cpu_base, unsigned int *active) { unsigned int idx; if (!*active) return NULL; idx = __ffs(*active); *active &= ~(1U << idx); return &cpu_base->clock_base[idx]; } #define for_each_active_base(base, cpu_base, active) \ while ((base = __next_base((cpu_base), &(active)))) static ktime_t __hrtimer_next_event_base(struct hrtimer_cpu_base *cpu_base, const struct hrtimer *exclude, unsigned int active, ktime_t expires_next) { struct hrtimer_clock_base *base; ktime_t expires; for_each_active_base(base, cpu_base, active) { struct timerqueue_node *next; struct hrtimer *timer; next = timerqueue_getnext(&base->active); timer = container_of(next, struct hrtimer, node); if (timer == exclude) { /* Get to the next timer in the queue. */ next = timerqueue_iterate_next(next); if (!next) continue; timer = container_of(next, struct hrtimer, node); } expires = ktime_sub(hrtimer_get_expires(timer), base->offset); if (expires < expires_next) { expires_next = expires; /* Skip cpu_base update if a timer is being excluded. */ if (exclude) continue; if (timer->is_soft) cpu_base->softirq_next_timer = timer; else cpu_base->next_timer = timer; } } /* * clock_was_set() might have changed base->offset of any of * the clock bases so the result might be negative. Fix it up * to prevent a false positive in clockevents_program_event(). */ if (expires_next < 0) expires_next = 0; return expires_next; } /* * Recomputes cpu_base::*next_timer and returns the earliest expires_next * but does not set cpu_base::*expires_next, that is done by * hrtimer[_force]_reprogram and hrtimer_interrupt only. When updating * cpu_base::*expires_next right away, reprogramming logic would no longer * work. * * When a softirq is pending, we can ignore the HRTIMER_ACTIVE_SOFT bases, * those timers will get run whenever the softirq gets handled, at the end of * hrtimer_run_softirq(), hrtimer_update_softirq_timer() will re-add these bases. * * Therefore softirq values are those from the HRTIMER_ACTIVE_SOFT clock bases. * The !softirq values are the minima across HRTIMER_ACTIVE_ALL, unless an actual * softirq is pending, in which case they're the minima of HRTIMER_ACTIVE_HARD. * * @active_mask must be one of: * - HRTIMER_ACTIVE_ALL, * - HRTIMER_ACTIVE_SOFT, or * - HRTIMER_ACTIVE_HARD. */ static ktime_t __hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base, unsigned int active_mask) { unsigned int active; struct hrtimer *next_timer = NULL; ktime_t expires_next = KTIME_MAX; if (!cpu_base->softirq_activated && (active_mask & HRTIMER_ACTIVE_SOFT)) { active = cpu_base->active_bases & HRTIMER_ACTIVE_SOFT; cpu_base->softirq_next_timer = NULL; expires_next = __hrtimer_next_event_base(cpu_base, NULL, active, KTIME_MAX); next_timer = cpu_base->softirq_next_timer; } if (active_mask & HRTIMER_ACTIVE_HARD) { active = cpu_base->active_bases & HRTIMER_ACTIVE_HARD; cpu_base->next_timer = next_timer; expires_next = __hrtimer_next_event_base(cpu_base, NULL, active, expires_next); } return expires_next; } static ktime_t hrtimer_update_next_event(struct hrtimer_cpu_base *cpu_base) { ktime_t expires_next, soft = KTIME_MAX; /* * If the soft interrupt has already been activated, ignore the * soft bases. They will be handled in the already raised soft * interrupt. */ if (!cpu_base->softirq_activated) { soft = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_SOFT); /* * Update the soft expiry time. clock_settime() might have * affected it. */ cpu_base->softirq_expires_next = soft; } expires_next = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_HARD); /* * If a softirq timer is expiring first, update cpu_base->next_timer * and program the hardware with the soft expiry time. */ if (expires_next > soft) { cpu_base->next_timer = cpu_base->softirq_next_timer; expires_next = soft; } return expires_next; } static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base) { ktime_t *offs_real = &base->clock_base[HRTIMER_BASE_REALTIME].offset; ktime_t *offs_boot = &base->clock_base[HRTIMER_BASE_BOOTTIME].offset; ktime_t *offs_tai = &base->clock_base[HRTIMER_BASE_TAI].offset; ktime_t now = ktime_get_update_offsets_now(&base->clock_was_set_seq, offs_real, offs_boot, offs_tai); base->clock_base[HRTIMER_BASE_REALTIME_SOFT].offset = *offs_real; base->clock_base[HRTIMER_BASE_BOOTTIME_SOFT].offset = *offs_boot; base->clock_base[HRTIMER_BASE_TAI_SOFT].offset = *offs_tai; return now; } /* * Is the high resolution mode active ? */ static inline int hrtimer_hres_active(struct hrtimer_cpu_base *cpu_base) { return IS_ENABLED(CONFIG_HIGH_RES_TIMERS) ? cpu_base->hres_active : 0; } static void __hrtimer_reprogram(struct hrtimer_cpu_base *cpu_base, struct hrtimer *next_timer, ktime_t expires_next) { cpu_base->expires_next = expires_next; /* * If hres is not active, hardware does not have to be * reprogrammed yet. * * If a hang was detected in the last timer interrupt then we * leave the hang delay active in the hardware. We want the * system to make progress. That also prevents the following * scenario: * T1 expires 50ms from now * T2 expires 5s from now * * T1 is removed, so this code is called and would reprogram * the hardware to 5s from now. Any hrtimer_start after that * will not reprogram the hardware due to hang_detected being * set. So we'd effectively block all timers until the T2 event * fires. */ if (!hrtimer_hres_active(cpu_base) || cpu_base->hang_detected) return; tick_program_event(expires_next, 1); } /* * Reprogram the event source with checking both queues for the * next event * Called with interrupts disabled and base->lock held */ static void hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal) { ktime_t expires_next; expires_next = hrtimer_update_next_event(cpu_base); if (skip_equal && expires_next == cpu_base->expires_next) return; __hrtimer_reprogram(cpu_base, cpu_base->next_timer, expires_next); } /* High resolution timer related functions */ #ifdef CONFIG_HIGH_RES_TIMERS /* * High resolution timer enabled ? */ static bool hrtimer_hres_enabled __read_mostly = true; unsigned int hrtimer_resolution __read_mostly = LOW_RES_NSEC; EXPORT_SYMBOL_GPL(hrtimer_resolution); /* * Enable / Disable high resolution mode */ static int __init setup_hrtimer_hres(char *str) { return (kstrtobool(str, &hrtimer_hres_enabled) == 0); } __setup("highres=", setup_hrtimer_hres); /* * hrtimer_high_res_enabled - query, if the highres mode is enabled */ static inline int hrtimer_is_hres_enabled(void) { return hrtimer_hres_enabled; } static void retrigger_next_event(void *arg); /* * Switch to high resolution mode */ static void hrtimer_switch_to_hres(void) { struct hrtimer_cpu_base *base = this_cpu_ptr(&hrtimer_bases); if (tick_init_highres()) { pr_warn("Could not switch to high resolution mode on CPU %u\n", base->cpu); return; } base->hres_active = 1; hrtimer_resolution = HIGH_RES_NSEC; tick_setup_sched_timer(true); /* "Retrigger" the interrupt to get things going */ retrigger_next_event(NULL); } #else static inline int hrtimer_is_hres_enabled(void) { return 0; } static inline void hrtimer_switch_to_hres(void) { } #endif /* CONFIG_HIGH_RES_TIMERS */ /* * Retrigger next event is called after clock was set with interrupts * disabled through an SMP function call or directly from low level * resume code. * * This is only invoked when: * - CONFIG_HIGH_RES_TIMERS is enabled. * - CONFIG_NOHZ_COMMON is enabled * * For the other cases this function is empty and because the call sites * are optimized out it vanishes as well, i.e. no need for lots of * #ifdeffery. */ static void retrigger_next_event(void *arg) { struct hrtimer_cpu_base *base = this_cpu_ptr(&hrtimer_bases); /* * When high resolution mode or nohz is active, then the offsets of * CLOCK_REALTIME/TAI/BOOTTIME have to be updated. Otherwise the * next tick will take care of that. * * If high resolution mode is active then the next expiring timer * must be reevaluated and the clock event device reprogrammed if * necessary. * * In the NOHZ case the update of the offset and the reevaluation * of the next expiring timer is enough. The return from the SMP * function call will take care of the reprogramming in case the * CPU was in a NOHZ idle sleep. */ if (!hrtimer_hres_active(base) && !tick_nohz_active) return; raw_spin_lock(&base->lock); hrtimer_update_base(base); if (hrtimer_hres_active(base)) hrtimer_force_reprogram(base, 0); else hrtimer_update_next_event(base); raw_spin_unlock(&base->lock); } /* * When a timer is enqueued and expires earlier than the already enqueued * timers, we have to check, whether it expires earlier than the timer for * which the clock event device was armed. * * Called with interrupts disabled and base->cpu_base.lock held */ static void hrtimer_reprogram(struct hrtimer *timer, bool reprogram) { struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); struct hrtimer_clock_base *base = timer->base; ktime_t expires = ktime_sub(hrtimer_get_expires(timer), base->offset); WARN_ON_ONCE(hrtimer_get_expires_tv64(timer) < 0); /* * CLOCK_REALTIME timer might be requested with an absolute * expiry time which is less than base->offset. Set it to 0. */ if (expires < 0) expires = 0; if (timer->is_soft) { /* * soft hrtimer could be started on a remote CPU. In this * case softirq_expires_next needs to be updated on the * remote CPU. The soft hrtimer will not expire before the * first hard hrtimer on the remote CPU - * hrtimer_check_target() prevents this case. */ struct hrtimer_cpu_base *timer_cpu_base = base->cpu_base; if (timer_cpu_base->softirq_activated) return; if (!ktime_before(expires, timer_cpu_base->softirq_expires_next)) return; timer_cpu_base->softirq_next_timer = timer; timer_cpu_base->softirq_expires_next = expires; if (!ktime_before(expires, timer_cpu_base->expires_next) || !reprogram) return; } /* * If the timer is not on the current cpu, we cannot reprogram * the other cpus clock event device. */ if (base->cpu_base != cpu_base) return; if (expires >= cpu_base->expires_next) return; /* * If the hrtimer interrupt is running, then it will reevaluate the * clock bases and reprogram the clock event device. */ if (cpu_base->in_hrtirq) return; cpu_base->next_timer = timer; __hrtimer_reprogram(cpu_base, timer, expires); } static bool update_needs_ipi(struct hrtimer_cpu_base *cpu_base, unsigned int active) { struct hrtimer_clock_base *base; unsigned int seq; ktime_t expires; /* * Update the base offsets unconditionally so the following * checks whether the SMP function call is required works. * * The update is safe even when the remote CPU is in the hrtimer * interrupt or the hrtimer soft interrupt and expiring affected * bases. Either it will see the update before handling a base or * it will see it when it finishes the processing and reevaluates * the next expiring timer. */ seq = cpu_base->clock_was_set_seq; hrtimer_update_base(cpu_base); /* * If the sequence did not change over the update then the * remote CPU already handled it. */ if (seq == cpu_base->clock_was_set_seq) return false; /* * If the remote CPU is currently handling an hrtimer interrupt, it * will reevaluate the first expiring timer of all clock bases * before reprogramming. Nothing to do here. */ if (cpu_base->in_hrtirq) return false; /* * Walk the affected clock bases and check whether the first expiring * timer in a clock base is moving ahead of the first expiring timer of * @cpu_base. If so, the IPI must be invoked because per CPU clock * event devices cannot be remotely reprogrammed. */ active &= cpu_base->active_bases; for_each_active_base(base, cpu_base, active) { struct timerqueue_node *next; next = timerqueue_getnext(&base->active); expires = ktime_sub(next->expires, base->offset); if (expires < cpu_base->expires_next) return true; /* Extra check for softirq clock bases */ if (base->clockid < HRTIMER_BASE_MONOTONIC_SOFT) continue; if (cpu_base->softirq_activated) continue; if (expires < cpu_base->softirq_expires_next) return true; } return false; } /* * Clock was set. This might affect CLOCK_REALTIME, CLOCK_TAI and * CLOCK_BOOTTIME (for late sleep time injection). * * This requires to update the offsets for these clocks * vs. CLOCK_MONOTONIC. When high resolution timers are enabled, then this * also requires to eventually reprogram the per CPU clock event devices * when the change moves an affected timer ahead of the first expiring * timer on that CPU. Obviously remote per CPU clock event devices cannot * be reprogrammed. The other reason why an IPI has to be sent is when the * system is in !HIGH_RES and NOHZ mode. The NOHZ mode updates the offsets * in the tick, which obviously might be stopped, so this has to bring out * the remote CPU which might sleep in idle to get this sorted. */ void clock_was_set(unsigned int bases) { struct hrtimer_cpu_base *cpu_base = raw_cpu_ptr(&hrtimer_bases); cpumask_var_t mask; int cpu; if (!hrtimer_hres_active(cpu_base) && !tick_nohz_active) goto out_timerfd; if (!zalloc_cpumask_var(&mask, GFP_KERNEL)) { on_each_cpu(retrigger_next_event, NULL, 1); goto out_timerfd; } /* Avoid interrupting CPUs if possible */ cpus_read_lock(); for_each_online_cpu(cpu) { unsigned long flags; cpu_base = &per_cpu(hrtimer_bases, cpu); raw_spin_lock_irqsave(&cpu_base->lock, flags); if (update_needs_ipi(cpu_base, bases)) cpumask_set_cpu(cpu, mask); raw_spin_unlock_irqrestore(&cpu_base->lock, flags); } preempt_disable(); smp_call_function_many(mask, retrigger_next_event, NULL, 1); preempt_enable(); cpus_read_unlock(); free_cpumask_var(mask); out_timerfd: timerfd_clock_was_set(); } static void clock_was_set_work(struct work_struct *work) { clock_was_set(CLOCK_SET_WALL); } static DECLARE_WORK(hrtimer_work, clock_was_set_work); /* * Called from timekeeping code to reprogram the hrtimer interrupt device * on all cpus and to notify timerfd. */ void clock_was_set_delayed(void) { schedule_work(&hrtimer_work); } /* * Called during resume either directly from via timekeeping_resume() * or in the case of s2idle from tick_unfreeze() to ensure that the * hrtimers are up to date. */ void hrtimers_resume_local(void) { lockdep_assert_irqs_disabled(); /* Retrigger on the local CPU */ retrigger_next_event(NULL); } /* * Counterpart to lock_hrtimer_base above: */ static inline void unlock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags) __releases(&timer->base->cpu_base->lock) { raw_spin_unlock_irqrestore(&timer->base->cpu_base->lock, *flags); } /** * hrtimer_forward() - forward the timer expiry * @timer: hrtimer to forward * @now: forward past this time * @interval: the interval to forward * * Forward the timer expiry so it will expire in the future. * * .. note:: * This only updates the timer expiry value and does not requeue the timer. * * There is also a variant of the function hrtimer_forward_now(). * * Context: Can be safely called from the callback function of @timer. If called * from other contexts @timer must neither be enqueued nor running the * callback and the caller needs to take care of serialization. * * Return: The number of overruns are returned. */ u64 hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval) { u64 orun = 1; ktime_t delta; delta = ktime_sub(now, hrtimer_get_expires(timer)); if (delta < 0) return 0; if (WARN_ON(timer->state & HRTIMER_STATE_ENQUEUED)) return 0; if (interval < hrtimer_resolution) interval = hrtimer_resolution; if (unlikely(delta >= interval)) { s64 incr = ktime_to_ns(interval); orun = ktime_divns(delta, incr); hrtimer_add_expires_ns(timer, incr * orun); if (hrtimer_get_expires_tv64(timer) > now) return orun; /* * This (and the ktime_add() below) is the * correction for exact: */ orun++; } hrtimer_add_expires(timer, interval); return orun; } EXPORT_SYMBOL_GPL(hrtimer_forward); /* * enqueue_hrtimer - internal function to (re)start a timer * * The timer is inserted in expiry order. Insertion into the * red black tree is O(log(n)). Must hold the base lock. * * Returns 1 when the new timer is the leftmost timer in the tree. */ static int enqueue_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, enum hrtimer_mode mode) { debug_activate(timer, mode); WARN_ON_ONCE(!base->cpu_base->online); base->cpu_base->active_bases |= 1 << base->index; /* Pairs with the lockless read in hrtimer_is_queued() */ WRITE_ONCE(timer->state, HRTIMER_STATE_ENQUEUED); return timerqueue_add(&base->active, &timer->node); } /* * __remove_hrtimer - internal function to remove a timer * * Caller must hold the base lock. * * High resolution timer mode reprograms the clock event device when the * timer is the one which expires next. The caller can disable this by setting * reprogram to zero. This is useful, when the context does a reprogramming * anyway (e.g. timer interrupt) */ static void __remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, u8 newstate, int reprogram) { struct hrtimer_cpu_base *cpu_base = base->cpu_base; u8 state = timer->state; /* Pairs with the lockless read in hrtimer_is_queued() */ WRITE_ONCE(timer->state, newstate); if (!(state & HRTIMER_STATE_ENQUEUED)) return; if (!timerqueue_del(&base->active, &timer->node)) cpu_base->active_bases &= ~(1 << base->index); /* * Note: If reprogram is false we do not update * cpu_base->next_timer. This happens when we remove the first * timer on a remote cpu. No harm as we never dereference * cpu_base->next_timer. So the worst thing what can happen is * an superfluous call to hrtimer_force_reprogram() on the * remote cpu later on if the same timer gets enqueued again. */ if (reprogram && timer == cpu_base->next_timer) hrtimer_force_reprogram(cpu_base, 1); } /* * remove hrtimer, called with base lock held */ static inline int remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, bool restart, bool keep_local) { u8 state = timer->state; if (state & HRTIMER_STATE_ENQUEUED) { bool reprogram; /* * Remove the timer and force reprogramming when high * resolution mode is active and the timer is on the current * CPU. If we remove a timer on another CPU, reprogramming is * skipped. The interrupt event on this CPU is fired and * reprogramming happens in the interrupt handler. This is a * rare case and less expensive than a smp call. */ debug_deactivate(timer); reprogram = base->cpu_base == this_cpu_ptr(&hrtimer_bases); /* * If the timer is not restarted then reprogramming is * required if the timer is local. If it is local and about * to be restarted, avoid programming it twice (on removal * and a moment later when it's requeued). */ if (!restart) state = HRTIMER_STATE_INACTIVE; else reprogram &= !keep_local; __remove_hrtimer(timer, base, state, reprogram); return 1; } return 0; } static inline ktime_t hrtimer_update_lowres(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode) { #ifdef CONFIG_TIME_LOW_RES /* * CONFIG_TIME_LOW_RES indicates that the system has no way to return * granular time values. For relative timers we add hrtimer_resolution * (i.e. one jiffie) to prevent short timeouts. */ timer->is_rel = mode & HRTIMER_MODE_REL; if (timer->is_rel) tim = ktime_add_safe(tim, hrtimer_resolution); #endif return tim; } static void hrtimer_update_softirq_timer(struct hrtimer_cpu_base *cpu_base, bool reprogram) { ktime_t expires; /* * Find the next SOFT expiration. */ expires = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_SOFT); /* * reprogramming needs to be triggered, even if the next soft * hrtimer expires at the same time than the next hard * hrtimer. cpu_base->softirq_expires_next needs to be updated! */ if (expires == KTIME_MAX) return; /* * cpu_base->*next_timer is recomputed by __hrtimer_get_next_event() * cpu_base->*expires_next is only set by hrtimer_reprogram() */ hrtimer_reprogram(cpu_base->softirq_next_timer, reprogram); } static int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, u64 delta_ns, const enum hrtimer_mode mode, struct hrtimer_clock_base *base) { struct hrtimer_clock_base *new_base; bool force_local, first; /* * If the timer is on the local cpu base and is the first expiring * timer then this might end up reprogramming the hardware twice * (on removal and on enqueue). To avoid that by prevent the * reprogram on removal, keep the timer local to the current CPU * and enforce reprogramming after it is queued no matter whether * it is the new first expiring timer again or not. */ force_local = base->cpu_base == this_cpu_ptr(&hrtimer_bases); force_local &= base->cpu_base->next_timer == timer; /* * Remove an active timer from the queue. In case it is not queued * on the current CPU, make sure that remove_hrtimer() updates the * remote data correctly. * * If it's on the current CPU and the first expiring timer, then * skip reprogramming, keep the timer local and enforce * reprogramming later if it was the first expiring timer. This * avoids programming the underlying clock event twice (once at * removal and once after enqueue). */ remove_hrtimer(timer, base, true, force_local); if (mode & HRTIMER_MODE_REL) tim = ktime_add_safe(tim, base->get_time()); tim = hrtimer_update_lowres(timer, tim, mode); hrtimer_set_expires_range_ns(timer, tim, delta_ns); /* Switch the timer base, if necessary: */ if (!force_local) { new_base = switch_hrtimer_base(timer, base, mode & HRTIMER_MODE_PINNED); } else { new_base = base; } first = enqueue_hrtimer(timer, new_base, mode); if (!force_local) return first; /* * Timer was forced to stay on the current CPU to avoid * reprogramming on removal and enqueue. Force reprogram the * hardware by evaluating the new first expiring timer. */ hrtimer_force_reprogram(new_base->cpu_base, 1); return 0; } /** * hrtimer_start_range_ns - (re)start an hrtimer * @timer: the timer to be added * @tim: expiry time * @delta_ns: "slack" range for the timer * @mode: timer mode: absolute (HRTIMER_MODE_ABS) or * relative (HRTIMER_MODE_REL), and pinned (HRTIMER_MODE_PINNED); * softirq based mode is considered for debug purpose only! */ void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, u64 delta_ns, const enum hrtimer_mode mode) { struct hrtimer_clock_base *base; unsigned long flags; if (WARN_ON_ONCE(!timer->function)) return; /* * Check whether the HRTIMER_MODE_SOFT bit and hrtimer.is_soft * match on CONFIG_PREEMPT_RT = n. With PREEMPT_RT check the hard * expiry mode because unmarked timers are moved to softirq expiry. */ if (!IS_ENABLED(CONFIG_PREEMPT_RT)) WARN_ON_ONCE(!(mode & HRTIMER_MODE_SOFT) ^ !timer->is_soft); else WARN_ON_ONCE(!(mode & HRTIMER_MODE_HARD) ^ !timer->is_hard); base = lock_hrtimer_base(timer, &flags); if (__hrtimer_start_range_ns(timer, tim, delta_ns, mode, base)) hrtimer_reprogram(timer, true); unlock_hrtimer_base(timer, &flags); } EXPORT_SYMBOL_GPL(hrtimer_start_range_ns); /** * hrtimer_try_to_cancel - try to deactivate a timer * @timer: hrtimer to stop * * Returns: * * * 0 when the timer was not active * * 1 when the timer was active * * -1 when the timer is currently executing the callback function and * cannot be stopped */ int hrtimer_try_to_cancel(struct hrtimer *timer) { struct hrtimer_clock_base *base; unsigned long flags; int ret = -1; /* * Check lockless first. If the timer is not active (neither * enqueued nor running the callback, nothing to do here. The * base lock does not serialize against a concurrent enqueue, * so we can avoid taking it. */ if (!hrtimer_active(timer)) return 0; base = lock_hrtimer_base(timer, &flags); if (!hrtimer_callback_running(timer)) ret = remove_hrtimer(timer, base, false, false); unlock_hrtimer_base(timer, &flags); return ret; } EXPORT_SYMBOL_GPL(hrtimer_try_to_cancel); #ifdef CONFIG_PREEMPT_RT static void hrtimer_cpu_base_init_expiry_lock(struct hrtimer_cpu_base *base) { spin_lock_init(&base->softirq_expiry_lock); } static void hrtimer_cpu_base_lock_expiry(struct hrtimer_cpu_base *base) { spin_lock(&base->softirq_expiry_lock); } static void hrtimer_cpu_base_unlock_expiry(struct hrtimer_cpu_base *base) { spin_unlock(&base->softirq_expiry_lock); } /* * The counterpart to hrtimer_cancel_wait_running(). * * If there is a waiter for cpu_base->expiry_lock, then it was waiting for * the timer callback to finish. Drop expiry_lock and reacquire it. That * allows the waiter to acquire the lock and make progress. */ static void hrtimer_sync_wait_running(struct hrtimer_cpu_base *cpu_base, unsigned long flags) { if (atomic_read(&cpu_base->timer_waiters)) { raw_spin_unlock_irqrestore(&cpu_base->lock, flags); spin_unlock(&cpu_base->softirq_expiry_lock); spin_lock(&cpu_base->softirq_expiry_lock); raw_spin_lock_irq(&cpu_base->lock); } } /* * This function is called on PREEMPT_RT kernels when the fast path * deletion of a timer failed because the timer callback function was * running. * * This prevents priority inversion: if the soft irq thread is preempted * in the middle of a timer callback, then calling del_timer_sync() can * lead to two issues: * * - If the caller is on a remote CPU then it has to spin wait for the timer * handler to complete. This can result in unbound priority inversion. * * - If the caller originates from the task which preempted the timer * handler on the same CPU, then spin waiting for the timer handler to * complete is never going to end. */ void hrtimer_cancel_wait_running(const struct hrtimer *timer) { /* Lockless read. Prevent the compiler from reloading it below */ struct hrtimer_clock_base *base = READ_ONCE(timer->base); /* * Just relax if the timer expires in hard interrupt context or if * it is currently on the migration base. */ if (!timer->is_soft || is_migration_base(base)) { cpu_relax(); return; } /* * Mark the base as contended and grab the expiry lock, which is * held by the softirq across the timer callback. Drop the lock * immediately so the softirq can expire the next timer. In theory * the timer could already be running again, but that's more than * unlikely and just causes another wait loop. */ atomic_inc(&base->cpu_base->timer_waiters); spin_lock_bh(&base->cpu_base->softirq_expiry_lock); atomic_dec(&base->cpu_base->timer_waiters); spin_unlock_bh(&base->cpu_base->softirq_expiry_lock); } #else static inline void hrtimer_cpu_base_init_expiry_lock(struct hrtimer_cpu_base *base) { } static inline void hrtimer_cpu_base_lock_expiry(struct hrtimer_cpu_base *base) { } static inline void hrtimer_cpu_base_unlock_expiry(struct hrtimer_cpu_base *base) { } static inline void hrtimer_sync_wait_running(struct hrtimer_cpu_base *base, unsigned long flags) { } #endif /** * hrtimer_cancel - cancel a timer and wait for the handler to finish. * @timer: the timer to be cancelled * * Returns: * 0 when the timer was not active * 1 when the timer was active */ int hrtimer_cancel(struct hrtimer *timer) { int ret; do { ret = hrtimer_try_to_cancel(timer); if (ret < 0) hrtimer_cancel_wait_running(timer); } while (ret < 0); return ret; } EXPORT_SYMBOL_GPL(hrtimer_cancel); /** * __hrtimer_get_remaining - get remaining time for the timer * @timer: the timer to read * @adjust: adjust relative timers when CONFIG_TIME_LOW_RES=y */ ktime_t __hrtimer_get_remaining(const struct hrtimer *timer, bool adjust) { unsigned long flags; ktime_t rem; lock_hrtimer_base(timer, &flags); if (IS_ENABLED(CONFIG_TIME_LOW_RES) && adjust) rem = hrtimer_expires_remaining_adjusted(timer); else rem = hrtimer_expires_remaining(timer); unlock_hrtimer_base(timer, &flags); return rem; } EXPORT_SYMBOL_GPL(__hrtimer_get_remaining); #ifdef CONFIG_NO_HZ_COMMON /** * hrtimer_get_next_event - get the time until next expiry event * * Returns the next expiry time or KTIME_MAX if no timer is pending. */ u64 hrtimer_get_next_event(void) { struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); u64 expires = KTIME_MAX; unsigned long flags; raw_spin_lock_irqsave(&cpu_base->lock, flags); if (!hrtimer_hres_active(cpu_base)) expires = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_ALL); raw_spin_unlock_irqrestore(&cpu_base->lock, flags); return expires; } /** * hrtimer_next_event_without - time until next expiry event w/o one timer * @exclude: timer to exclude * * Returns the next expiry time over all timers except for the @exclude one or * KTIME_MAX if none of them is pending. */ u64 hrtimer_next_event_without(const struct hrtimer *exclude) { struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); u64 expires = KTIME_MAX; unsigned long flags; raw_spin_lock_irqsave(&cpu_base->lock, flags); if (hrtimer_hres_active(cpu_base)) { unsigned int active; if (!cpu_base->softirq_activated) { active = cpu_base->active_bases & HRTIMER_ACTIVE_SOFT; expires = __hrtimer_next_event_base(cpu_base, exclude, active, KTIME_MAX); } active = cpu_base->active_bases & HRTIMER_ACTIVE_HARD; expires = __hrtimer_next_event_base(cpu_base, exclude, active, expires); } raw_spin_unlock_irqrestore(&cpu_base->lock, flags); return expires; } #endif static inline int hrtimer_clockid_to_base(clockid_t clock_id) { if (likely(clock_id < MAX_CLOCKS)) { int base = hrtimer_clock_to_base_table[clock_id]; if (likely(base != HRTIMER_MAX_CLOCK_BASES)) return base; } WARN(1, "Invalid clockid %d. Using MONOTONIC\n", clock_id); return HRTIMER_BASE_MONOTONIC; } static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id, enum hrtimer_mode mode) { bool softtimer = !!(mode & HRTIMER_MODE_SOFT); struct hrtimer_cpu_base *cpu_base; int base; /* * On PREEMPT_RT enabled kernels hrtimers which are not explicitly * marked for hard interrupt expiry mode are moved into soft * interrupt context for latency reasons and because the callbacks * can invoke functions which might sleep on RT, e.g. spin_lock(). */ if (IS_ENABLED(CONFIG_PREEMPT_RT) && !(mode & HRTIMER_MODE_HARD)) softtimer = true; memset(timer, 0, sizeof(struct hrtimer)); cpu_base = raw_cpu_ptr(&hrtimer_bases); /* * POSIX magic: Relative CLOCK_REALTIME timers are not affected by * clock modifications, so they needs to become CLOCK_MONOTONIC to * ensure POSIX compliance. */ if (clock_id == CLOCK_REALTIME && mode & HRTIMER_MODE_REL) clock_id = CLOCK_MONOTONIC; base = softtimer ? HRTIMER_MAX_CLOCK_BASES / 2 : 0; base += hrtimer_clockid_to_base(clock_id); timer->is_soft = softtimer; timer->is_hard = !!(mode & HRTIMER_MODE_HARD); timer->base = &cpu_base->clock_base[base]; timerqueue_init(&timer->node); } /** * hrtimer_init - initialize a timer to the given clock * @timer: the timer to be initialized * @clock_id: the clock to be used * @mode: The modes which are relevant for initialization: * HRTIMER_MODE_ABS, HRTIMER_MODE_REL, HRTIMER_MODE_ABS_SOFT, * HRTIMER_MODE_REL_SOFT * * The PINNED variants of the above can be handed in, * but the PINNED bit is ignored as pinning happens * when the hrtimer is started */ void hrtimer_init(struct hrtimer *timer, clockid_t clock_id, enum hrtimer_mode mode) { debug_init(timer, clock_id, mode); __hrtimer_init(timer, clock_id, mode); } EXPORT_SYMBOL_GPL(hrtimer_init); /* * A timer is active, when it is enqueued into the rbtree or the * callback function is running or it's in the state of being migrated * to another cpu. * * It is important for this function to not return a false negative. */ bool hrtimer_active(const struct hrtimer *timer) { struct hrtimer_clock_base *base; unsigned int seq; do { base = READ_ONCE(timer->base); seq = raw_read_seqcount_begin(&base->seq); if (timer->state != HRTIMER_STATE_INACTIVE || base->running == timer) return true; } while (read_seqcount_retry(&base->seq, seq) || base != READ_ONCE(timer->base)); return false; } EXPORT_SYMBOL_GPL(hrtimer_active); /* * The write_seqcount_barrier()s in __run_hrtimer() split the thing into 3 * distinct sections: * * - queued: the timer is queued * - callback: the timer is being ran * - post: the timer is inactive or (re)queued * * On the read side we ensure we observe timer->state and cpu_base->running * from the same section, if anything changed while we looked at it, we retry. * This includes timer->base changing because sequence numbers alone are * insufficient for that. * * The sequence numbers are required because otherwise we could still observe * a false negative if the read side got smeared over multiple consecutive * __run_hrtimer() invocations. */ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base, struct hrtimer_clock_base *base, struct hrtimer *timer, ktime_t *now, unsigned long flags) __must_hold(&cpu_base->lock) { enum hrtimer_restart (*fn)(struct hrtimer *); bool expires_in_hardirq; int restart; lockdep_assert_held(&cpu_base->lock); debug_deactivate(timer); base->running = timer; /* * Separate the ->running assignment from the ->state assignment. * * As with a regular write barrier, this ensures the read side in * hrtimer_active() cannot observe base->running == NULL && * timer->state == INACTIVE. */ raw_write_seqcount_barrier(&base->seq); __remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE, 0); fn = timer->function; /* * Clear the 'is relative' flag for the TIME_LOW_RES case. If the * timer is restarted with a period then it becomes an absolute * timer. If its not restarted it does not matter. */ if (IS_ENABLED(CONFIG_TIME_LOW_RES)) timer->is_rel = false; /* * The timer is marked as running in the CPU base, so it is * protected against migration to a different CPU even if the lock * is dropped. */ raw_spin_unlock_irqrestore(&cpu_base->lock, flags); trace_hrtimer_expire_entry(timer, now); expires_in_hardirq = lockdep_hrtimer_enter(timer); restart = fn(timer); lockdep_hrtimer_exit(expires_in_hardirq); trace_hrtimer_expire_exit(timer); raw_spin_lock_irq(&cpu_base->lock); /* * Note: We clear the running state after enqueue_hrtimer and * we do not reprogram the event hardware. Happens either in * hrtimer_start_range_ns() or in hrtimer_interrupt() * * Note: Because we dropped the cpu_base->lock above, * hrtimer_start_range_ns() can have popped in and enqueued the timer * for us already. */ if (restart != HRTIMER_NORESTART && !(timer->state & HRTIMER_STATE_ENQUEUED)) enqueue_hrtimer(timer, base, HRTIMER_MODE_ABS); /* * Separate the ->running assignment from the ->state assignment. * * As with a regular write barrier, this ensures the read side in * hrtimer_active() cannot observe base->running.timer == NULL && * timer->state == INACTIVE. */ raw_write_seqcount_barrier(&base->seq); WARN_ON_ONCE(base->running != timer); base->running = NULL; } static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now, unsigned long flags, unsigned int active_mask) { struct hrtimer_clock_base *base; unsigned int active = cpu_base->active_bases & active_mask; for_each_active_base(base, cpu_base, active) { struct timerqueue_node *node; ktime_t basenow; basenow = ktime_add(now, base->offset); while ((node = timerqueue_getnext(&base->active))) { struct hrtimer *timer; timer = container_of(node, struct hrtimer, node); /* * The immediate goal for using the softexpires is * minimizing wakeups, not running timers at the * earliest interrupt after their soft expiration. * This allows us to avoid using a Priority Search * Tree, which can answer a stabbing query for * overlapping intervals and instead use the simple * BST we already have. * We don't add extra wakeups by delaying timers that * are right-of a not yet expired timer, because that * timer will have to trigger a wakeup anyway. */ if (basenow < hrtimer_get_softexpires_tv64(timer)) break; __run_hrtimer(cpu_base, base, timer, &basenow, flags); if (active_mask == HRTIMER_ACTIVE_SOFT) hrtimer_sync_wait_running(cpu_base, flags); } } } static __latent_entropy void hrtimer_run_softirq(struct softirq_action *h) { struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); unsigned long flags; ktime_t now; hrtimer_cpu_base_lock_expiry(cpu_base); raw_spin_lock_irqsave(&cpu_base->lock, flags); now = hrtimer_update_base(cpu_base); __hrtimer_run_queues(cpu_base, now, flags, HRTIMER_ACTIVE_SOFT); cpu_base->softirq_activated = 0; hrtimer_update_softirq_timer(cpu_base, true); raw_spin_unlock_irqrestore(&cpu_base->lock, flags); hrtimer_cpu_base_unlock_expiry(cpu_base); } #ifdef CONFIG_HIGH_RES_TIMERS /* * High resolution timer interrupt * Called with interrupts disabled */ void hrtimer_interrupt(struct clock_event_device *dev) { struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); ktime_t expires_next, now, entry_time, delta; unsigned long flags; int retries = 0; BUG_ON(!cpu_base->hres_active); cpu_base->nr_events++; dev->next_event = KTIME_MAX; raw_spin_lock_irqsave(&cpu_base->lock, flags); entry_time = now = hrtimer_update_base(cpu_base); retry: cpu_base->in_hrtirq = 1; /* * We set expires_next to KTIME_MAX here with cpu_base->lock * held to prevent that a timer is enqueued in our queue via * the migration code. This does not affect enqueueing of * timers which run their callback and need to be requeued on * this CPU. */ cpu_base->expires_next = KTIME_MAX; if (!ktime_before(now, cpu_base->softirq_expires_next)) { cpu_base->softirq_expires_next = KTIME_MAX; cpu_base->softirq_activated = 1; raise_softirq_irqoff(HRTIMER_SOFTIRQ); } __hrtimer_run_queues(cpu_base, now, flags, HRTIMER_ACTIVE_HARD); /* Reevaluate the clock bases for the [soft] next expiry */ expires_next = hrtimer_update_next_event(cpu_base); /* * Store the new expiry value so the migration code can verify * against it. */ cpu_base->expires_next = expires_next; cpu_base->in_hrtirq = 0; raw_spin_unlock_irqrestore(&cpu_base->lock, flags); /* Reprogramming necessary ? */ if (!tick_program_event(expires_next, 0)) { cpu_base->hang_detected = 0; return; } /* * The next timer was already expired due to: * - tracing * - long lasting callbacks * - being scheduled away when running in a VM * * We need to prevent that we loop forever in the hrtimer * interrupt routine. We give it 3 attempts to avoid * overreacting on some spurious event. * * Acquire base lock for updating the offsets and retrieving * the current time. */ raw_spin_lock_irqsave(&cpu_base->lock, flags); now = hrtimer_update_base(cpu_base); cpu_base->nr_retries++; if (++retries < 3) goto retry; /* * Give the system a chance to do something else than looping * here. We stored the entry time, so we know exactly how long * we spent here. We schedule the next event this amount of * time away. */ cpu_base->nr_hangs++; cpu_base->hang_detected = 1; raw_spin_unlock_irqrestore(&cpu_base->lock, flags); delta = ktime_sub(now, entry_time); if ((unsigned int)delta > cpu_base->max_hang_time) cpu_base->max_hang_time = (unsigned int) delta; /* * Limit it to a sensible value as we enforce a longer * delay. Give the CPU at least 100ms to catch up. */ if (delta > 100 * NSEC_PER_MSEC) expires_next = ktime_add_ns(now, 100 * NSEC_PER_MSEC); else expires_next = ktime_add(now, delta); tick_program_event(expires_next, 1); pr_warn_once("hrtimer: interrupt took %llu ns\n", ktime_to_ns(delta)); } #endif /* !CONFIG_HIGH_RES_TIMERS */ /* * Called from run_local_timers in hardirq context every jiffy */ void hrtimer_run_queues(void) { struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); unsigned long flags; ktime_t now; if (hrtimer_hres_active(cpu_base)) return; /* * This _is_ ugly: We have to check periodically, whether we * can switch to highres and / or nohz mode. The clocksource * switch happens with xtime_lock held. Notification from * there only sets the check bit in the tick_oneshot code, * otherwise we might deadlock vs. xtime_lock. */ if (tick_check_oneshot_change(!hrtimer_is_hres_enabled())) { hrtimer_switch_to_hres(); return; } raw_spin_lock_irqsave(&cpu_base->lock, flags); now = hrtimer_update_base(cpu_base); if (!ktime_before(now, cpu_base->softirq_expires_next)) { cpu_base->softirq_expires_next = KTIME_MAX; cpu_base->softirq_activated = 1; raise_softirq_irqoff(HRTIMER_SOFTIRQ); } __hrtimer_run_queues(cpu_base, now, flags, HRTIMER_ACTIVE_HARD); raw_spin_unlock_irqrestore(&cpu_base->lock, flags); } /* * Sleep related functions: */ static enum hrtimer_restart hrtimer_wakeup(struct hrtimer *timer) { struct hrtimer_sleeper *t = container_of(timer, struct hrtimer_sleeper, timer); struct task_struct *task = t->task; t->task = NULL; if (task) wake_up_process(task); return HRTIMER_NORESTART; } /** * hrtimer_sleeper_start_expires - Start a hrtimer sleeper timer * @sl: sleeper to be started * @mode: timer mode abs/rel * * Wrapper around hrtimer_start_expires() for hrtimer_sleeper based timers * to allow PREEMPT_RT to tweak the delivery mode (soft/hardirq context) */ void hrtimer_sleeper_start_expires(struct hrtimer_sleeper *sl, enum hrtimer_mode mode) { /* * Make the enqueue delivery mode check work on RT. If the sleeper * was initialized for hard interrupt delivery, force the mode bit. * This is a special case for hrtimer_sleepers because * hrtimer_init_sleeper() determines the delivery mode on RT so the * fiddling with this decision is avoided at the call sites. */ if (IS_ENABLED(CONFIG_PREEMPT_RT) && sl->timer.is_hard) mode |= HRTIMER_MODE_HARD; hrtimer_start_expires(&sl->timer, mode); } EXPORT_SYMBOL_GPL(hrtimer_sleeper_start_expires); static void __hrtimer_init_sleeper(struct hrtimer_sleeper *sl, clockid_t clock_id, enum hrtimer_mode mode) { /* * On PREEMPT_RT enabled kernels hrtimers which are not explicitly * marked for hard interrupt expiry mode are moved into soft * interrupt context either for latency reasons or because the * hrtimer callback takes regular spinlocks or invokes other * functions which are not suitable for hard interrupt context on * PREEMPT_RT. * * The hrtimer_sleeper callback is RT compatible in hard interrupt * context, but there is a latency concern: Untrusted userspace can * spawn many threads which arm timers for the same expiry time on * the same CPU. That causes a latency spike due to the wakeup of * a gazillion threads. * * OTOH, privileged real-time user space applications rely on the * low latency of hard interrupt wakeups. If the current task is in * a real-time scheduling class, mark the mode for hard interrupt * expiry. */ if (IS_ENABLED(CONFIG_PREEMPT_RT)) { if (task_is_realtime(current) && !(mode & HRTIMER_MODE_SOFT)) mode |= HRTIMER_MODE_HARD; } __hrtimer_init(&sl->timer, clock_id, mode); sl->timer.function = hrtimer_wakeup; sl->task = current; } /** * hrtimer_init_sleeper - initialize sleeper to the given clock * @sl: sleeper to be initialized * @clock_id: the clock to be used * @mode: timer mode abs/rel */ void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, clockid_t clock_id, enum hrtimer_mode mode) { debug_init(&sl->timer, clock_id, mode); __hrtimer_init_sleeper(sl, clock_id, mode); } EXPORT_SYMBOL_GPL(hrtimer_init_sleeper); int nanosleep_copyout(struct restart_block *restart, struct timespec64 *ts) { switch(restart->nanosleep.type) { #ifdef CONFIG_COMPAT_32BIT_TIME case TT_COMPAT: if (put_old_timespec32(ts, restart->nanosleep.compat_rmtp)) return -EFAULT; break; #endif case TT_NATIVE: if (put_timespec64(ts, restart->nanosleep.rmtp)) return -EFAULT; break; default: BUG(); } return -ERESTART_RESTARTBLOCK; } static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mode) { struct restart_block *restart; do { set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE); hrtimer_sleeper_start_expires(t, mode); if (likely(t->task)) schedule(); hrtimer_cancel(&t->timer); mode = HRTIMER_MODE_ABS; } while (t->task && !signal_pending(current)); __set_current_state(TASK_RUNNING); if (!t->task) return 0; restart = ¤t->restart_block; if (restart->nanosleep.type != TT_NONE) { ktime_t rem = hrtimer_expires_remaining(&t->timer); struct timespec64 rmt; if (rem <= 0) return 0; rmt = ktime_to_timespec64(rem); return nanosleep_copyout(restart, &rmt); } return -ERESTART_RESTARTBLOCK; } static long __sched hrtimer_nanosleep_restart(struct restart_block *restart) { struct hrtimer_sleeper t; int ret; hrtimer_init_sleeper_on_stack(&t, restart->nanosleep.clockid, HRTIMER_MODE_ABS); hrtimer_set_expires_tv64(&t.timer, restart->nanosleep.expires); ret = do_nanosleep(&t, HRTIMER_MODE_ABS); destroy_hrtimer_on_stack(&t.timer); return ret; } long hrtimer_nanosleep(ktime_t rqtp, const enum hrtimer_mode mode, const clockid_t clockid) { struct restart_block *restart; struct hrtimer_sleeper t; int ret = 0; u64 slack; slack = current->timer_slack_ns; if (rt_task(current)) slack = 0; hrtimer_init_sleeper_on_stack(&t, clockid, mode); hrtimer_set_expires_range_ns(&t.timer, rqtp, slack); ret = do_nanosleep(&t, mode); if (ret != -ERESTART_RESTARTBLOCK) goto out; /* Absolute timers do not update the rmtp value and restart: */ if (mode == HRTIMER_MODE_ABS) { ret = -ERESTARTNOHAND; goto out; } restart = ¤t->restart_block; restart->nanosleep.clockid = t.timer.base->clockid; restart->nanosleep.expires = hrtimer_get_expires_tv64(&t.timer); set_restart_fn(restart, hrtimer_nanosleep_restart); out: destroy_hrtimer_on_stack(&t.timer); return ret; } #ifdef CONFIG_64BIT SYSCALL_DEFINE2(nanosleep, struct __kernel_timespec __user *, rqtp, struct __kernel_timespec __user *, rmtp) { struct timespec64 tu; if (get_timespec64(&tu, rqtp)) return -EFAULT; if (!timespec64_valid(&tu)) return -EINVAL; current->restart_block.fn = do_no_restart_syscall; current->restart_block.nanosleep.type = rmtp ? TT_NATIVE : TT_NONE; current->restart_block.nanosleep.rmtp = rmtp; return hrtimer_nanosleep(timespec64_to_ktime(tu), HRTIMER_MODE_REL, CLOCK_MONOTONIC); } #endif #ifdef CONFIG_COMPAT_32BIT_TIME SYSCALL_DEFINE2(nanosleep_time32, struct old_timespec32 __user *, rqtp, struct old_timespec32 __user *, rmtp) { struct timespec64 tu; if (get_old_timespec32(&tu, rqtp)) return -EFAULT; if (!timespec64_valid(&tu)) return -EINVAL; current->restart_block.fn = do_no_restart_syscall; current->restart_block.nanosleep.type = rmtp ? TT_COMPAT : TT_NONE; current->restart_block.nanosleep.compat_rmtp = rmtp; return hrtimer_nanosleep(timespec64_to_ktime(tu), HRTIMER_MODE_REL, CLOCK_MONOTONIC); } #endif /* * Functions related to boot-time initialization: */ int hrtimers_prepare_cpu(unsigned int cpu) { struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu); int i; for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { struct hrtimer_clock_base *clock_b = &cpu_base->clock_base[i]; clock_b->cpu_base = cpu_base; seqcount_raw_spinlock_init(&clock_b->seq, &cpu_base->lock); timerqueue_init_head(&clock_b->active); } cpu_base->cpu = cpu; cpu_base->active_bases = 0; cpu_base->hres_active = 0; cpu_base->hang_detected = 0; cpu_base->next_timer = NULL; cpu_base->softirq_next_timer = NULL; cpu_base->expires_next = KTIME_MAX; cpu_base->softirq_expires_next = KTIME_MAX; cpu_base->online = 1; hrtimer_cpu_base_init_expiry_lock(cpu_base); return 0; } #ifdef CONFIG_HOTPLUG_CPU static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base, struct hrtimer_clock_base *new_base) { struct hrtimer *timer; struct timerqueue_node *node; while ((node = timerqueue_getnext(&old_base->active))) { timer = container_of(node, struct hrtimer, node); BUG_ON(hrtimer_callback_running(timer)); debug_deactivate(timer); /* * Mark it as ENQUEUED not INACTIVE otherwise the * timer could be seen as !active and just vanish away * under us on another CPU */ __remove_hrtimer(timer, old_base, HRTIMER_STATE_ENQUEUED, 0); timer->base = new_base; /* * Enqueue the timers on the new cpu. This does not * reprogram the event device in case the timer * expires before the earliest on this CPU, but we run * hrtimer_interrupt after we migrated everything to * sort out already expired timers and reprogram the * event device. */ enqueue_hrtimer(timer, new_base, HRTIMER_MODE_ABS); } } int hrtimers_cpu_dying(unsigned int dying_cpu) { int i, ncpu = cpumask_any_and(cpu_active_mask, housekeeping_cpumask(HK_TYPE_TIMER)); struct hrtimer_cpu_base *old_base, *new_base; old_base = this_cpu_ptr(&hrtimer_bases); new_base = &per_cpu(hrtimer_bases, ncpu); /* * The caller is globally serialized and nobody else * takes two locks at once, deadlock is not possible. */ raw_spin_lock(&old_base->lock); raw_spin_lock_nested(&new_base->lock, SINGLE_DEPTH_NESTING); for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { migrate_hrtimer_list(&old_base->clock_base[i], &new_base->clock_base[i]); } /* * The migration might have changed the first expiring softirq * timer on this CPU. Update it. */ __hrtimer_get_next_event(new_base, HRTIMER_ACTIVE_SOFT); /* Tell the other CPU to retrigger the next event */ smp_call_function_single(ncpu, retrigger_next_event, NULL, 0); raw_spin_unlock(&new_base->lock); old_base->online = 0; raw_spin_unlock(&old_base->lock); return 0; } #endif /* CONFIG_HOTPLUG_CPU */ void __init hrtimers_init(void) { hrtimers_prepare_cpu(smp_processor_id()); open_softirq(HRTIMER_SOFTIRQ, hrtimer_run_softirq); } /** * schedule_hrtimeout_range_clock - sleep until timeout * @expires: timeout value (ktime_t) * @delta: slack in expires timeout (ktime_t) for SCHED_OTHER tasks * @mode: timer mode * @clock_id: timer clock to be used */ int __sched schedule_hrtimeout_range_clock(ktime_t *expires, u64 delta, const enum hrtimer_mode mode, clockid_t clock_id) { struct hrtimer_sleeper t; /* * Optimize when a zero timeout value is given. It does not * matter whether this is an absolute or a relative time. */ if (expires && *expires == 0) { __set_current_state(TASK_RUNNING); return 0; } /* * A NULL parameter means "infinite" */ if (!expires) { schedule(); return -EINTR; } /* * Override any slack passed by the user if under * rt contraints. */ if (rt_task(current)) delta = 0; hrtimer_init_sleeper_on_stack(&t, clock_id, mode); hrtimer_set_expires_range_ns(&t.timer, *expires, delta); hrtimer_sleeper_start_expires(&t, mode); if (likely(t.task)) schedule(); hrtimer_cancel(&t.timer); destroy_hrtimer_on_stack(&t.timer); __set_current_state(TASK_RUNNING); return !t.task ? 0 : -EINTR; } EXPORT_SYMBOL_GPL(schedule_hrtimeout_range_clock); /** * schedule_hrtimeout_range - sleep until timeout * @expires: timeout value (ktime_t) * @delta: slack in expires timeout (ktime_t) for SCHED_OTHER tasks * @mode: timer mode * * Make the current task sleep until the given expiry time has * elapsed. The routine will return immediately unless * the current task state has been set (see set_current_state()). * * The @delta argument gives the kernel the freedom to schedule the * actual wakeup to a time that is both power and performance friendly * for regular (non RT/DL) tasks. * The kernel give the normal best effort behavior for "@expires+@delta", * but may decide to fire the timer earlier, but no earlier than @expires. * * You can set the task state as follows - * * %TASK_UNINTERRUPTIBLE - at least @timeout time is guaranteed to * pass before the routine returns unless the current task is explicitly * woken up, (e.g. by wake_up_process()). * * %TASK_INTERRUPTIBLE - the routine may return early if a signal is * delivered to the current task or the current task is explicitly woken * up. * * The current task state is guaranteed to be TASK_RUNNING when this * routine returns. * * Returns 0 when the timer has expired. If the task was woken before the * timer expired by a signal (only possible in state TASK_INTERRUPTIBLE) or * by an explicit wakeup, it returns -EINTR. */ int __sched schedule_hrtimeout_range(ktime_t *expires, u64 delta, const enum hrtimer_mode mode) { return schedule_hrtimeout_range_clock(expires, delta, mode, CLOCK_MONOTONIC); } EXPORT_SYMBOL_GPL(schedule_hrtimeout_range); /** * schedule_hrtimeout - sleep until timeout * @expires: timeout value (ktime_t) * @mode: timer mode * * Make the current task sleep until the given expiry time has * elapsed. The routine will return immediately unless * the current task state has been set (see set_current_state()). * * You can set the task state as follows - * * %TASK_UNINTERRUPTIBLE - at least @timeout time is guaranteed to * pass before the routine returns unless the current task is explicitly * woken up, (e.g. by wake_up_process()). * * %TASK_INTERRUPTIBLE - the routine may return early if a signal is * delivered to the current task or the current task is explicitly woken * up. * * The current task state is guaranteed to be TASK_RUNNING when this * routine returns. * * Returns 0 when the timer has expired. If the task was woken before the * timer expired by a signal (only possible in state TASK_INTERRUPTIBLE) or * by an explicit wakeup, it returns -EINTR. */ int __sched schedule_hrtimeout(ktime_t *expires, const enum hrtimer_mode mode) { return schedule_hrtimeout_range(expires, 0, mode); } EXPORT_SYMBOL_GPL(schedule_hrtimeout); |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 | /* SPDX-License-Identifier: GPL-2.0 */ /* * linux/fs/ufs/swab.h * * Copyright (C) 1997, 1998 Francois-Rene Rideau <fare@tunes.org> * Copyright (C) 1998 Jakub Jelinek <jj@ultra.linux.cz> * Copyright (C) 2001 Christoph Hellwig <hch@infradead.org> */ #ifndef _UFS_SWAB_H #define _UFS_SWAB_H /* * Notes: * HERE WE ASSUME EITHER BIG OR LITTLE ENDIAN UFSes * in case there are ufs implementations that have strange bytesexes, * you'll need to modify code here as well as in ufs_super.c and ufs_fs.h * to support them. */ enum { BYTESEX_LE, BYTESEX_BE }; static inline u64 fs64_to_cpu(struct super_block *sbp, __fs64 n) { if (UFS_SB(sbp)->s_bytesex == BYTESEX_LE) return le64_to_cpu((__force __le64)n); else return be64_to_cpu((__force __be64)n); } static inline __fs64 cpu_to_fs64(struct super_block *sbp, u64 n) { if (UFS_SB(sbp)->s_bytesex == BYTESEX_LE) return (__force __fs64)cpu_to_le64(n); else return (__force __fs64)cpu_to_be64(n); } static inline u32 fs32_to_cpu(struct super_block *sbp, __fs32 n) { if (UFS_SB(sbp)->s_bytesex == BYTESEX_LE) return le32_to_cpu((__force __le32)n); else return be32_to_cpu((__force __be32)n); } static inline __fs32 cpu_to_fs32(struct super_block *sbp, u32 n) { if (UFS_SB(sbp)->s_bytesex == BYTESEX_LE) return (__force __fs32)cpu_to_le32(n); else return (__force __fs32)cpu_to_be32(n); } static inline void fs32_add(struct super_block *sbp, __fs32 *n, int d) { if (UFS_SB(sbp)->s_bytesex == BYTESEX_LE) le32_add_cpu((__le32 *)n, d); else be32_add_cpu((__be32 *)n, d); } static inline void fs32_sub(struct super_block *sbp, __fs32 *n, int d) { if (UFS_SB(sbp)->s_bytesex == BYTESEX_LE) le32_add_cpu((__le32 *)n, -d); else be32_add_cpu((__be32 *)n, -d); } static inline u16 fs16_to_cpu(struct super_block *sbp, __fs16 n) { if (UFS_SB(sbp)->s_bytesex == BYTESEX_LE) return le16_to_cpu((__force __le16)n); else return be16_to_cpu((__force __be16)n); } static inline __fs16 cpu_to_fs16(struct super_block *sbp, u16 n) { if (UFS_SB(sbp)->s_bytesex == BYTESEX_LE) return (__force __fs16)cpu_to_le16(n); else return (__force __fs16)cpu_to_be16(n); } static inline void fs16_add(struct super_block *sbp, __fs16 *n, int d) { if (UFS_SB(sbp)->s_bytesex == BYTESEX_LE) le16_add_cpu((__le16 *)n, d); else be16_add_cpu((__be16 *)n, d); } static inline void fs16_sub(struct super_block *sbp, __fs16 *n, int d) { if (UFS_SB(sbp)->s_bytesex == BYTESEX_LE) le16_add_cpu((__le16 *)n, -d); else be16_add_cpu((__be16 *)n, -d); } #endif /* _UFS_SWAB_H */ |
8 24 27 42 52 62 215 68 38 2 60 7 103 141 4 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 | /* * net/tipc/trace.h: TIPC tracepoints * * Copyright (c) 2018, Ericsson AB * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the names of the copyright holders nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * Alternatively, this software may be distributed under the terms of the * GNU General Public License ("GPL") version 2 as published by the Free * Software Foundation. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "ASIS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #undef TRACE_SYSTEM #define TRACE_SYSTEM tipc #if !defined(_TIPC_TRACE_H) || defined(TRACE_HEADER_MULTI_READ) #define _TIPC_TRACE_H #include <linux/tracepoint.h> #include "core.h" #include "link.h" #include "socket.h" #include "node.h" #define SKB_LMIN (100) #define SKB_LMAX (SKB_LMIN * 2) #define LIST_LMIN (SKB_LMIN * 3) #define LIST_LMAX (SKB_LMIN * 11) #define SK_LMIN (SKB_LMIN * 2) #define SK_LMAX (SKB_LMIN * 11) #define LINK_LMIN (SKB_LMIN) #define LINK_LMAX (SKB_LMIN * 16) #define NODE_LMIN (SKB_LMIN) #define NODE_LMAX (SKB_LMIN * 11) #ifndef __TIPC_TRACE_ENUM #define __TIPC_TRACE_ENUM enum { TIPC_DUMP_NONE = 0, TIPC_DUMP_TRANSMQ = 1, TIPC_DUMP_BACKLOGQ = (1 << 1), TIPC_DUMP_DEFERDQ = (1 << 2), TIPC_DUMP_INPUTQ = (1 << 3), TIPC_DUMP_WAKEUP = (1 << 4), TIPC_DUMP_SK_SNDQ = (1 << 8), TIPC_DUMP_SK_RCVQ = (1 << 9), TIPC_DUMP_SK_BKLGQ = (1 << 10), TIPC_DUMP_ALL = 0xffffu }; #endif /* Link & Node FSM states: */ #define state_sym(val) \ __print_symbolic(val, \ {(0xe), "ESTABLISHED" },\ {(0xe << 4), "ESTABLISHING" },\ {(0x1 << 8), "RESET" },\ {(0x2 << 12), "RESETTING" },\ {(0xd << 16), "PEER_RESET" },\ {(0xf << 20), "FAILINGOVER" },\ {(0xc << 24), "SYNCHING" },\ {(0xdd), "SELF_DOWN_PEER_DOWN" },\ {(0xaa), "SELF_UP_PEER_UP" },\ {(0xd1), "SELF_DOWN_PEER_LEAVING" },\ {(0xac), "SELF_UP_PEER_COMING" },\ {(0xca), "SELF_COMING_PEER_UP" },\ {(0x1d), "SELF_LEAVING_PEER_DOWN" },\ {(0xf0), "FAILINGOVER" },\ {(0xcc), "SYNCHING" }) /* Link & Node FSM events: */ #define evt_sym(val) \ __print_symbolic(val, \ {(0xec1ab1e), "ESTABLISH_EVT" },\ {(0x9eed0e), "PEER_RESET_EVT" },\ {(0xfa110e), "FAILURE_EVT" },\ {(0x10ca1d0e), "RESET_EVT" },\ {(0xfa110bee), "FAILOVER_BEGIN_EVT" },\ {(0xfa110ede), "FAILOVER_END_EVT" },\ {(0xc1ccbee), "SYNCH_BEGIN_EVT" },\ {(0xc1ccede), "SYNCH_END_EVT" },\ {(0xece), "SELF_ESTABL_CONTACT_EVT" },\ {(0x1ce), "SELF_LOST_CONTACT_EVT" },\ {(0x9ece), "PEER_ESTABL_CONTACT_EVT" },\ {(0x91ce), "PEER_LOST_CONTACT_EVT" },\ {(0xfbe), "FAILOVER_BEGIN_EVT" },\ {(0xfee), "FAILOVER_END_EVT" },\ {(0xcbe), "SYNCH_BEGIN_EVT" },\ {(0xcee), "SYNCH_END_EVT" }) /* Bearer, net device events: */ #define dev_evt_sym(val) \ __print_symbolic(val, \ {(NETDEV_CHANGE), "NETDEV_CHANGE" },\ {(NETDEV_GOING_DOWN), "NETDEV_GOING_DOWN" },\ {(NETDEV_UP), "NETDEV_UP" },\ {(NETDEV_CHANGEMTU), "NETDEV_CHANGEMTU" },\ {(NETDEV_CHANGEADDR), "NETDEV_CHANGEADDR" },\ {(NETDEV_UNREGISTER), "NETDEV_UNREGISTER" },\ {(NETDEV_CHANGENAME), "NETDEV_CHANGENAME" }) extern unsigned long sysctl_tipc_sk_filter[5] __read_mostly; int tipc_skb_dump(struct sk_buff *skb, bool more, char *buf); int tipc_list_dump(struct sk_buff_head *list, bool more, char *buf); int tipc_sk_dump(struct sock *sk, u16 dqueues, char *buf); int tipc_link_dump(struct tipc_link *l, u16 dqueues, char *buf); int tipc_node_dump(struct tipc_node *n, bool more, char *buf); bool tipc_sk_filtering(struct sock *sk); DECLARE_EVENT_CLASS(tipc_skb_class, TP_PROTO(struct sk_buff *skb, bool more, const char *header), TP_ARGS(skb, more, header), TP_STRUCT__entry( __string(header, header) __dynamic_array(char, buf, (more) ? SKB_LMAX : SKB_LMIN) ), TP_fast_assign( __assign_str(header); tipc_skb_dump(skb, more, __get_str(buf)); ), TP_printk("%s\n%s", __get_str(header), __get_str(buf)) ) #define DEFINE_SKB_EVENT(name) \ DEFINE_EVENT(tipc_skb_class, name, \ TP_PROTO(struct sk_buff *skb, bool more, const char *header), \ TP_ARGS(skb, more, header)) DEFINE_SKB_EVENT(tipc_skb_dump); DEFINE_SKB_EVENT(tipc_proto_build); DEFINE_SKB_EVENT(tipc_proto_rcv); DECLARE_EVENT_CLASS(tipc_list_class, TP_PROTO(struct sk_buff_head *list, bool more, const char *header), TP_ARGS(list, more, header), TP_STRUCT__entry( __string(header, header) __dynamic_array(char, buf, (more) ? LIST_LMAX : LIST_LMIN) ), TP_fast_assign( __assign_str(header); tipc_list_dump(list, more, __get_str(buf)); ), TP_printk("%s\n%s", __get_str(header), __get_str(buf)) ); #define DEFINE_LIST_EVENT(name) \ DEFINE_EVENT(tipc_list_class, name, \ TP_PROTO(struct sk_buff_head *list, bool more, const char *header), \ TP_ARGS(list, more, header)) DEFINE_LIST_EVENT(tipc_list_dump); DECLARE_EVENT_CLASS(tipc_sk_class, TP_PROTO(struct sock *sk, struct sk_buff *skb, u16 dqueues, const char *header), TP_ARGS(sk, skb, dqueues, header), TP_STRUCT__entry( __string(header, header) __field(u32, portid) __dynamic_array(char, buf, (dqueues) ? SK_LMAX : SK_LMIN) __dynamic_array(char, skb_buf, (skb) ? SKB_LMIN : 1) ), TP_fast_assign( __assign_str(header); __entry->portid = tipc_sock_get_portid(sk); tipc_sk_dump(sk, dqueues, __get_str(buf)); if (skb) tipc_skb_dump(skb, false, __get_str(skb_buf)); else *(__get_str(skb_buf)) = '\0'; ), TP_printk("<%u> %s\n%s%s", __entry->portid, __get_str(header), __get_str(skb_buf), __get_str(buf)) ); #define DEFINE_SK_EVENT_FILTER(name) \ DEFINE_EVENT_CONDITION(tipc_sk_class, name, \ TP_PROTO(struct sock *sk, struct sk_buff *skb, u16 dqueues, \ const char *header), \ TP_ARGS(sk, skb, dqueues, header), \ TP_CONDITION(tipc_sk_filtering(sk))) DEFINE_SK_EVENT_FILTER(tipc_sk_dump); DEFINE_SK_EVENT_FILTER(tipc_sk_create); DEFINE_SK_EVENT_FILTER(tipc_sk_sendmcast); DEFINE_SK_EVENT_FILTER(tipc_sk_sendmsg); DEFINE_SK_EVENT_FILTER(tipc_sk_sendstream); DEFINE_SK_EVENT_FILTER(tipc_sk_poll); DEFINE_SK_EVENT_FILTER(tipc_sk_filter_rcv); DEFINE_SK_EVENT_FILTER(tipc_sk_advance_rx); DEFINE_SK_EVENT_FILTER(tipc_sk_rej_msg); DEFINE_SK_EVENT_FILTER(tipc_sk_drop_msg); DEFINE_SK_EVENT_FILTER(tipc_sk_release); DEFINE_SK_EVENT_FILTER(tipc_sk_shutdown); #define DEFINE_SK_EVENT_FILTER_COND(name, cond) \ DEFINE_EVENT_CONDITION(tipc_sk_class, name, \ TP_PROTO(struct sock *sk, struct sk_buff *skb, u16 dqueues, \ const char *header), \ TP_ARGS(sk, skb, dqueues, header), \ TP_CONDITION(tipc_sk_filtering(sk) && (cond))) DEFINE_SK_EVENT_FILTER_COND(tipc_sk_overlimit1, tipc_sk_overlimit1(sk, skb)); DEFINE_SK_EVENT_FILTER_COND(tipc_sk_overlimit2, tipc_sk_overlimit2(sk, skb)); DECLARE_EVENT_CLASS(tipc_link_class, TP_PROTO(struct tipc_link *l, u16 dqueues, const char *header), TP_ARGS(l, dqueues, header), TP_STRUCT__entry( __string(header, header) __array(char, name, TIPC_MAX_LINK_NAME) __dynamic_array(char, buf, (dqueues) ? LINK_LMAX : LINK_LMIN) ), TP_fast_assign( __assign_str(header); memcpy(__entry->name, tipc_link_name(l), TIPC_MAX_LINK_NAME); tipc_link_dump(l, dqueues, __get_str(buf)); ), TP_printk("<%s> %s\n%s", __entry->name, __get_str(header), __get_str(buf)) ); #define DEFINE_LINK_EVENT(name) \ DEFINE_EVENT(tipc_link_class, name, \ TP_PROTO(struct tipc_link *l, u16 dqueues, const char *header), \ TP_ARGS(l, dqueues, header)) DEFINE_LINK_EVENT(tipc_link_dump); DEFINE_LINK_EVENT(tipc_link_conges); DEFINE_LINK_EVENT(tipc_link_timeout); DEFINE_LINK_EVENT(tipc_link_reset); #define DEFINE_LINK_EVENT_COND(name, cond) \ DEFINE_EVENT_CONDITION(tipc_link_class, name, \ TP_PROTO(struct tipc_link *l, u16 dqueues, const char *header), \ TP_ARGS(l, dqueues, header), \ TP_CONDITION(cond)) DEFINE_LINK_EVENT_COND(tipc_link_too_silent, tipc_link_too_silent(l)); DECLARE_EVENT_CLASS(tipc_link_transmq_class, TP_PROTO(struct tipc_link *r, u16 f, u16 t, struct sk_buff_head *tq), TP_ARGS(r, f, t, tq), TP_STRUCT__entry( __array(char, name, TIPC_MAX_LINK_NAME) __field(u16, from) __field(u16, to) __field(u32, len) __field(u16, fseqno) __field(u16, lseqno) ), TP_fast_assign( memcpy(__entry->name, tipc_link_name(r), TIPC_MAX_LINK_NAME); __entry->from = f; __entry->to = t; __entry->len = skb_queue_len(tq); __entry->fseqno = __entry->len ? msg_seqno(buf_msg(skb_peek(tq))) : 0; __entry->lseqno = __entry->len ? msg_seqno(buf_msg(skb_peek_tail(tq))) : 0; ), TP_printk("<%s> retrans req: [%u-%u] transmq: %u [%u-%u]\n", __entry->name, __entry->from, __entry->to, __entry->len, __entry->fseqno, __entry->lseqno) ); DEFINE_EVENT_CONDITION(tipc_link_transmq_class, tipc_link_retrans, TP_PROTO(struct tipc_link *r, u16 f, u16 t, struct sk_buff_head *tq), TP_ARGS(r, f, t, tq), TP_CONDITION(less_eq(f, t)) ); DEFINE_EVENT_PRINT(tipc_link_transmq_class, tipc_link_bc_ack, TP_PROTO(struct tipc_link *r, u16 f, u16 t, struct sk_buff_head *tq), TP_ARGS(r, f, t, tq), TP_printk("<%s> acked: %u gap: %u transmq: %u [%u-%u]\n", __entry->name, __entry->from, __entry->to, __entry->len, __entry->fseqno, __entry->lseqno) ); DECLARE_EVENT_CLASS(tipc_node_class, TP_PROTO(struct tipc_node *n, bool more, const char *header), TP_ARGS(n, more, header), TP_STRUCT__entry( __string(header, header) __field(u32, addr) __dynamic_array(char, buf, (more) ? NODE_LMAX : NODE_LMIN) ), TP_fast_assign( __assign_str(header); __entry->addr = tipc_node_get_addr(n); tipc_node_dump(n, more, __get_str(buf)); ), TP_printk("<%x> %s\n%s", __entry->addr, __get_str(header), __get_str(buf)) ); #define DEFINE_NODE_EVENT(name) \ DEFINE_EVENT(tipc_node_class, name, \ TP_PROTO(struct tipc_node *n, bool more, const char *header), \ TP_ARGS(n, more, header)) DEFINE_NODE_EVENT(tipc_node_dump); DEFINE_NODE_EVENT(tipc_node_create); DEFINE_NODE_EVENT(tipc_node_delete); DEFINE_NODE_EVENT(tipc_node_lost_contact); DEFINE_NODE_EVENT(tipc_node_timeout); DEFINE_NODE_EVENT(tipc_node_link_up); DEFINE_NODE_EVENT(tipc_node_link_down); DEFINE_NODE_EVENT(tipc_node_reset_links); DEFINE_NODE_EVENT(tipc_node_check_state); DECLARE_EVENT_CLASS(tipc_fsm_class, TP_PROTO(const char *name, u32 os, u32 ns, int evt), TP_ARGS(name, os, ns, evt), TP_STRUCT__entry( __string(name, name) __field(u32, os) __field(u32, ns) __field(u32, evt) ), TP_fast_assign( __assign_str(name); __entry->os = os; __entry->ns = ns; __entry->evt = evt; ), TP_printk("<%s> %s--(%s)->%s\n", __get_str(name), state_sym(__entry->os), evt_sym(__entry->evt), state_sym(__entry->ns)) ); #define DEFINE_FSM_EVENT(fsm_name) \ DEFINE_EVENT(tipc_fsm_class, fsm_name, \ TP_PROTO(const char *name, u32 os, u32 ns, int evt), \ TP_ARGS(name, os, ns, evt)) DEFINE_FSM_EVENT(tipc_link_fsm); DEFINE_FSM_EVENT(tipc_node_fsm); TRACE_EVENT(tipc_l2_device_event, TP_PROTO(struct net_device *dev, struct tipc_bearer *b, unsigned long evt), TP_ARGS(dev, b, evt), TP_STRUCT__entry( __string(dev_name, dev->name) __string(b_name, b->name) __field(unsigned long, evt) __field(u8, b_up) __field(u8, carrier) __field(u8, oper) ), TP_fast_assign( __assign_str(dev_name); __assign_str(b_name); __entry->evt = evt; __entry->b_up = test_bit(0, &b->up); __entry->carrier = netif_carrier_ok(dev); __entry->oper = netif_oper_up(dev); ), TP_printk("%s on: <%s>/<%s> oper: %s carrier: %s bearer: %s\n", dev_evt_sym(__entry->evt), __get_str(dev_name), __get_str(b_name), (__entry->oper) ? "up" : "down", (__entry->carrier) ? "ok" : "notok", (__entry->b_up) ? "up" : "down") ); #endif /* _TIPC_TRACE_H */ /* This part must be outside protection */ #undef TRACE_INCLUDE_PATH #define TRACE_INCLUDE_PATH . #undef TRACE_INCLUDE_FILE #define TRACE_INCLUDE_FILE trace #include <trace/define_trace.h> |
4 1 1 2 1 1 1 1 1 2 1 1 1 1 15 14 15 2 2 2 2 5 3 3 3 3 1 2 1 1 1 24 11 24 2 419 424 4 2 2 302 302 4 28 28 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 | // SPDX-License-Identifier: GPL-2.0 #include <linux/bpf-cgroup.h> #include <linux/bpf.h> #include <linux/bpf_local_storage.h> #include <linux/btf.h> #include <linux/bug.h> #include <linux/filter.h> #include <linux/mm.h> #include <linux/rbtree.h> #include <linux/slab.h> #include <uapi/linux/btf.h> #include <linux/btf_ids.h> #ifdef CONFIG_CGROUP_BPF #include "../cgroup/cgroup-internal.h" #define LOCAL_STORAGE_CREATE_FLAG_MASK \ (BPF_F_NUMA_NODE | BPF_F_ACCESS_MASK) struct bpf_cgroup_storage_map { struct bpf_map map; spinlock_t lock; struct rb_root root; struct list_head list; }; static struct bpf_cgroup_storage_map *map_to_storage(struct bpf_map *map) { return container_of(map, struct bpf_cgroup_storage_map, map); } static bool attach_type_isolated(const struct bpf_map *map) { return map->key_size == sizeof(struct bpf_cgroup_storage_key); } static int bpf_cgroup_storage_key_cmp(const struct bpf_cgroup_storage_map *map, const void *_key1, const void *_key2) { if (attach_type_isolated(&map->map)) { const struct bpf_cgroup_storage_key *key1 = _key1; const struct bpf_cgroup_storage_key *key2 = _key2; if (key1->cgroup_inode_id < key2->cgroup_inode_id) return -1; else if (key1->cgroup_inode_id > key2->cgroup_inode_id) return 1; else if (key1->attach_type < key2->attach_type) return -1; else if (key1->attach_type > key2->attach_type) return 1; } else { const __u64 *cgroup_inode_id1 = _key1; const __u64 *cgroup_inode_id2 = _key2; if (*cgroup_inode_id1 < *cgroup_inode_id2) return -1; else if (*cgroup_inode_id1 > *cgroup_inode_id2) return 1; } return 0; } struct bpf_cgroup_storage * cgroup_storage_lookup(struct bpf_cgroup_storage_map *map, void *key, bool locked) { struct rb_root *root = &map->root; struct rb_node *node; if (!locked) spin_lock_bh(&map->lock); node = root->rb_node; while (node) { struct bpf_cgroup_storage *storage; storage = container_of(node, struct bpf_cgroup_storage, node); switch (bpf_cgroup_storage_key_cmp(map, key, &storage->key)) { case -1: node = node->rb_left; break; case 1: node = node->rb_right; break; default: if (!locked) spin_unlock_bh(&map->lock); return storage; } } if (!locked) spin_unlock_bh(&map->lock); return NULL; } static int cgroup_storage_insert(struct bpf_cgroup_storage_map *map, struct bpf_cgroup_storage *storage) { struct rb_root *root = &map->root; struct rb_node **new = &(root->rb_node), *parent = NULL; while (*new) { struct bpf_cgroup_storage *this; this = container_of(*new, struct bpf_cgroup_storage, node); parent = *new; switch (bpf_cgroup_storage_key_cmp(map, &storage->key, &this->key)) { case -1: new = &((*new)->rb_left); break; case 1: new = &((*new)->rb_right); break; default: return -EEXIST; } } rb_link_node(&storage->node, parent, new); rb_insert_color(&storage->node, root); return 0; } static void *cgroup_storage_lookup_elem(struct bpf_map *_map, void *key) { struct bpf_cgroup_storage_map *map = map_to_storage(_map); struct bpf_cgroup_storage *storage; storage = cgroup_storage_lookup(map, key, false); if (!storage) return NULL; return &READ_ONCE(storage->buf)->data[0]; } static long cgroup_storage_update_elem(struct bpf_map *map, void *key, void *value, u64 flags) { struct bpf_cgroup_storage *storage; struct bpf_storage_buffer *new; if (unlikely(flags & ~(BPF_F_LOCK | BPF_EXIST))) return -EINVAL; if (unlikely((flags & BPF_F_LOCK) && !btf_record_has_field(map->record, BPF_SPIN_LOCK))) return -EINVAL; storage = cgroup_storage_lookup((struct bpf_cgroup_storage_map *)map, key, false); if (!storage) return -ENOENT; if (flags & BPF_F_LOCK) { copy_map_value_locked(map, storage->buf->data, value, false); return 0; } new = bpf_map_kmalloc_node(map, struct_size(new, data, map->value_size), __GFP_ZERO | GFP_NOWAIT | __GFP_NOWARN, map->numa_node); if (!new) return -ENOMEM; memcpy(&new->data[0], value, map->value_size); check_and_init_map_value(map, new->data); new = xchg(&storage->buf, new); kfree_rcu(new, rcu); return 0; } int bpf_percpu_cgroup_storage_copy(struct bpf_map *_map, void *key, void *value) { struct bpf_cgroup_storage_map *map = map_to_storage(_map); struct bpf_cgroup_storage *storage; int cpu, off = 0; u32 size; rcu_read_lock(); storage = cgroup_storage_lookup(map, key, false); if (!storage) { rcu_read_unlock(); return -ENOENT; } /* per_cpu areas are zero-filled and bpf programs can only * access 'value_size' of them, so copying rounded areas * will not leak any kernel data */ size = round_up(_map->value_size, 8); for_each_possible_cpu(cpu) { bpf_long_memcpy(value + off, per_cpu_ptr(storage->percpu_buf, cpu), size); off += size; } rcu_read_unlock(); return 0; } int bpf_percpu_cgroup_storage_update(struct bpf_map *_map, void *key, void *value, u64 map_flags) { struct bpf_cgroup_storage_map *map = map_to_storage(_map); struct bpf_cgroup_storage *storage; int cpu, off = 0; u32 size; if (map_flags != BPF_ANY && map_flags != BPF_EXIST) return -EINVAL; rcu_read_lock(); storage = cgroup_storage_lookup(map, key, false); if (!storage) { rcu_read_unlock(); return -ENOENT; } /* the user space will provide round_up(value_size, 8) bytes that * will be copied into per-cpu area. bpf programs can only access * value_size of it. During lookup the same extra bytes will be * returned or zeros which were zero-filled by percpu_alloc, * so no kernel data leaks possible */ size = round_up(_map->value_size, 8); for_each_possible_cpu(cpu) { bpf_long_memcpy(per_cpu_ptr(storage->percpu_buf, cpu), value + off, size); off += size; } rcu_read_unlock(); return 0; } static int cgroup_storage_get_next_key(struct bpf_map *_map, void *key, void *_next_key) { struct bpf_cgroup_storage_map *map = map_to_storage(_map); struct bpf_cgroup_storage *storage; spin_lock_bh(&map->lock); if (list_empty(&map->list)) goto enoent; if (key) { storage = cgroup_storage_lookup(map, key, true); if (!storage) goto enoent; storage = list_next_entry(storage, list_map); if (!storage) goto enoent; } else { storage = list_first_entry(&map->list, struct bpf_cgroup_storage, list_map); } spin_unlock_bh(&map->lock); if (attach_type_isolated(&map->map)) { struct bpf_cgroup_storage_key *next = _next_key; *next = storage->key; } else { __u64 *next = _next_key; *next = storage->key.cgroup_inode_id; } return 0; enoent: spin_unlock_bh(&map->lock); return -ENOENT; } static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr) { __u32 max_value_size = BPF_LOCAL_STORAGE_MAX_VALUE_SIZE; int numa_node = bpf_map_attr_numa_node(attr); struct bpf_cgroup_storage_map *map; /* percpu is bound by PCPU_MIN_UNIT_SIZE, non-percu * is the same as other local storages. */ if (attr->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) max_value_size = min_t(__u32, max_value_size, PCPU_MIN_UNIT_SIZE); if (attr->key_size != sizeof(struct bpf_cgroup_storage_key) && attr->key_size != sizeof(__u64)) return ERR_PTR(-EINVAL); if (attr->value_size == 0) return ERR_PTR(-EINVAL); if (attr->value_size > max_value_size) return ERR_PTR(-E2BIG); if (attr->map_flags & ~LOCAL_STORAGE_CREATE_FLAG_MASK || !bpf_map_flags_access_ok(attr->map_flags)) return ERR_PTR(-EINVAL); if (attr->max_entries) /* max_entries is not used and enforced to be 0 */ return ERR_PTR(-EINVAL); map = bpf_map_area_alloc(sizeof(struct bpf_cgroup_storage_map), numa_node); if (!map) return ERR_PTR(-ENOMEM); /* copy mandatory map attributes */ bpf_map_init_from_attr(&map->map, attr); spin_lock_init(&map->lock); map->root = RB_ROOT; INIT_LIST_HEAD(&map->list); return &map->map; } static void cgroup_storage_map_free(struct bpf_map *_map) { struct bpf_cgroup_storage_map *map = map_to_storage(_map); struct list_head *storages = &map->list; struct bpf_cgroup_storage *storage, *stmp; cgroup_lock(); list_for_each_entry_safe(storage, stmp, storages, list_map) { bpf_cgroup_storage_unlink(storage); bpf_cgroup_storage_free(storage); } cgroup_unlock(); WARN_ON(!RB_EMPTY_ROOT(&map->root)); WARN_ON(!list_empty(&map->list)); bpf_map_area_free(map); } static long cgroup_storage_delete_elem(struct bpf_map *map, void *key) { return -EINVAL; } static int cgroup_storage_check_btf(const struct bpf_map *map, const struct btf *btf, const struct btf_type *key_type, const struct btf_type *value_type) { if (attach_type_isolated(map)) { struct btf_member *m; u32 offset, size; /* Key is expected to be of struct bpf_cgroup_storage_key type, * which is: * struct bpf_cgroup_storage_key { * __u64 cgroup_inode_id; * __u32 attach_type; * }; */ /* * Key_type must be a structure with two fields. */ if (BTF_INFO_KIND(key_type->info) != BTF_KIND_STRUCT || BTF_INFO_VLEN(key_type->info) != 2) return -EINVAL; /* * The first field must be a 64 bit integer at 0 offset. */ m = (struct btf_member *)(key_type + 1); size = sizeof_field(struct bpf_cgroup_storage_key, cgroup_inode_id); if (!btf_member_is_reg_int(btf, key_type, m, 0, size)) return -EINVAL; /* * The second field must be a 32 bit integer at 64 bit offset. */ m++; offset = offsetof(struct bpf_cgroup_storage_key, attach_type); size = sizeof_field(struct bpf_cgroup_storage_key, attach_type); if (!btf_member_is_reg_int(btf, key_type, m, offset, size)) return -EINVAL; } else { u32 int_data; /* * Key is expected to be u64, which stores the cgroup_inode_id */ if (BTF_INFO_KIND(key_type->info) != BTF_KIND_INT) return -EINVAL; int_data = *(u32 *)(key_type + 1); if (BTF_INT_BITS(int_data) != 64 || BTF_INT_OFFSET(int_data)) return -EINVAL; } return 0; } static void cgroup_storage_seq_show_elem(struct bpf_map *map, void *key, struct seq_file *m) { enum bpf_cgroup_storage_type stype; struct bpf_cgroup_storage *storage; int cpu; rcu_read_lock(); storage = cgroup_storage_lookup(map_to_storage(map), key, false); if (!storage) { rcu_read_unlock(); return; } btf_type_seq_show(map->btf, map->btf_key_type_id, key, m); stype = cgroup_storage_type(map); if (stype == BPF_CGROUP_STORAGE_SHARED) { seq_puts(m, ": "); btf_type_seq_show(map->btf, map->btf_value_type_id, &READ_ONCE(storage->buf)->data[0], m); seq_puts(m, "\n"); } else { seq_puts(m, ": {\n"); for_each_possible_cpu(cpu) { seq_printf(m, "\tcpu%d: ", cpu); btf_type_seq_show(map->btf, map->btf_value_type_id, per_cpu_ptr(storage->percpu_buf, cpu), m); seq_puts(m, "\n"); } seq_puts(m, "}\n"); } rcu_read_unlock(); } static u64 cgroup_storage_map_usage(const struct bpf_map *map) { /* Currently the dynamically allocated elements are not counted. */ return sizeof(struct bpf_cgroup_storage_map); } BTF_ID_LIST_SINGLE(cgroup_storage_map_btf_ids, struct, bpf_cgroup_storage_map) const struct bpf_map_ops cgroup_storage_map_ops = { .map_alloc = cgroup_storage_map_alloc, .map_free = cgroup_storage_map_free, .map_get_next_key = cgroup_storage_get_next_key, .map_lookup_elem = cgroup_storage_lookup_elem, .map_update_elem = cgroup_storage_update_elem, .map_delete_elem = cgroup_storage_delete_elem, .map_check_btf = cgroup_storage_check_btf, .map_seq_show_elem = cgroup_storage_seq_show_elem, .map_mem_usage = cgroup_storage_map_usage, .map_btf_id = &cgroup_storage_map_btf_ids[0], }; int bpf_cgroup_storage_assign(struct bpf_prog_aux *aux, struct bpf_map *_map) { enum bpf_cgroup_storage_type stype = cgroup_storage_type(_map); if (aux->cgroup_storage[stype] && aux->cgroup_storage[stype] != _map) return -EBUSY; aux->cgroup_storage[stype] = _map; return 0; } static size_t bpf_cgroup_storage_calculate_size(struct bpf_map *map, u32 *pages) { size_t size; if (cgroup_storage_type(map) == BPF_CGROUP_STORAGE_SHARED) { size = sizeof(struct bpf_storage_buffer) + map->value_size; *pages = round_up(sizeof(struct bpf_cgroup_storage) + size, PAGE_SIZE) >> PAGE_SHIFT; } else { size = map->value_size; *pages = round_up(round_up(size, 8) * num_possible_cpus(), PAGE_SIZE) >> PAGE_SHIFT; } return size; } struct bpf_cgroup_storage *bpf_cgroup_storage_alloc(struct bpf_prog *prog, enum bpf_cgroup_storage_type stype) { const gfp_t gfp = __GFP_ZERO | GFP_USER; struct bpf_cgroup_storage *storage; struct bpf_map *map; size_t size; u32 pages; map = prog->aux->cgroup_storage[stype]; if (!map) return NULL; size = bpf_cgroup_storage_calculate_size(map, &pages); storage = bpf_map_kmalloc_node(map, sizeof(struct bpf_cgroup_storage), gfp, map->numa_node); if (!storage) goto enomem; if (stype == BPF_CGROUP_STORAGE_SHARED) { storage->buf = bpf_map_kmalloc_node(map, size, gfp, map->numa_node); if (!storage->buf) goto enomem; check_and_init_map_value(map, storage->buf->data); } else { storage->percpu_buf = bpf_map_alloc_percpu(map, size, 8, gfp); if (!storage->percpu_buf) goto enomem; } storage->map = (struct bpf_cgroup_storage_map *)map; return storage; enomem: kfree(storage); return ERR_PTR(-ENOMEM); } static void free_shared_cgroup_storage_rcu(struct rcu_head *rcu) { struct bpf_cgroup_storage *storage = container_of(rcu, struct bpf_cgroup_storage, rcu); kfree(storage->buf); kfree(storage); } static void free_percpu_cgroup_storage_rcu(struct rcu_head *rcu) { struct bpf_cgroup_storage *storage = container_of(rcu, struct bpf_cgroup_storage, rcu); free_percpu(storage->percpu_buf); kfree(storage); } void bpf_cgroup_storage_free(struct bpf_cgroup_storage *storage) { enum bpf_cgroup_storage_type stype; struct bpf_map *map; if (!storage) return; map = &storage->map->map; stype = cgroup_storage_type(map); if (stype == BPF_CGROUP_STORAGE_SHARED) call_rcu(&storage->rcu, free_shared_cgroup_storage_rcu); else call_rcu(&storage->rcu, free_percpu_cgroup_storage_rcu); } void bpf_cgroup_storage_link(struct bpf_cgroup_storage *storage, struct cgroup *cgroup, enum bpf_attach_type type) { struct bpf_cgroup_storage_map *map; if (!storage) return; storage->key.attach_type = type; storage->key.cgroup_inode_id = cgroup_id(cgroup); map = storage->map; spin_lock_bh(&map->lock); WARN_ON(cgroup_storage_insert(map, storage)); list_add(&storage->list_map, &map->list); list_add(&storage->list_cg, &cgroup->bpf.storages); spin_unlock_bh(&map->lock); } void bpf_cgroup_storage_unlink(struct bpf_cgroup_storage *storage) { struct bpf_cgroup_storage_map *map; struct rb_root *root; if (!storage) return; map = storage->map; spin_lock_bh(&map->lock); root = &map->root; rb_erase(&storage->node, root); list_del(&storage->list_map); list_del(&storage->list_cg); spin_unlock_bh(&map->lock); } #endif |
2 1 3 2 6 1 5 1 1 94 1 95 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Squashfs - a compressed read only filesystem for Linux * * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008 * Phillip Lougher <phillip@squashfs.org.uk> * * export.c */ /* * This file implements code to make Squashfs filesystems exportable (NFS etc.) * * The export code uses an inode lookup table to map inode numbers passed in * filehandles to an inode location on disk. This table is stored compressed * into metadata blocks. A second index table is used to locate these. This * second index table for speed of access (and because it is small) is read at * mount time and cached in memory. * * The inode lookup table is used only by the export code, inode disk * locations are directly encoded in directories, enabling direct access * without an intermediate lookup for all operations except the export ops. */ #include <linux/fs.h> #include <linux/vfs.h> #include <linux/dcache.h> #include <linux/exportfs.h> #include <linux/slab.h> #include "squashfs_fs.h" #include "squashfs_fs_sb.h" #include "squashfs_fs_i.h" #include "squashfs.h" /* * Look-up inode number (ino) in table, returning the inode location. */ static long long squashfs_inode_lookup(struct super_block *sb, int ino_num) { struct squashfs_sb_info *msblk = sb->s_fs_info; int blk = SQUASHFS_LOOKUP_BLOCK(ino_num - 1); int offset = SQUASHFS_LOOKUP_BLOCK_OFFSET(ino_num - 1); u64 start; __le64 ino; int err; TRACE("Entered squashfs_inode_lookup, inode_number = %d\n", ino_num); if (ino_num == 0 || (ino_num - 1) >= msblk->inodes) return -EINVAL; start = le64_to_cpu(msblk->inode_lookup_table[blk]); err = squashfs_read_metadata(sb, &ino, &start, &offset, sizeof(ino)); if (err < 0) return err; TRACE("squashfs_inode_lookup, inode = 0x%llx\n", (u64) le64_to_cpu(ino)); return le64_to_cpu(ino); } static struct dentry *squashfs_export_iget(struct super_block *sb, unsigned int ino_num) { long long ino; struct dentry *dentry = ERR_PTR(-ENOENT); TRACE("Entered squashfs_export_iget\n"); ino = squashfs_inode_lookup(sb, ino_num); if (ino >= 0) dentry = d_obtain_alias(squashfs_iget(sb, ino, ino_num)); return dentry; } static struct dentry *squashfs_fh_to_dentry(struct super_block *sb, struct fid *fid, int fh_len, int fh_type) { if ((fh_type != FILEID_INO32_GEN && fh_type != FILEID_INO32_GEN_PARENT) || fh_len < 2) return NULL; return squashfs_export_iget(sb, fid->i32.ino); } static struct dentry *squashfs_fh_to_parent(struct super_block *sb, struct fid *fid, int fh_len, int fh_type) { if (fh_type != FILEID_INO32_GEN_PARENT || fh_len < 4) return NULL; return squashfs_export_iget(sb, fid->i32.parent_ino); } static struct dentry *squashfs_get_parent(struct dentry *child) { struct inode *inode = d_inode(child); unsigned int parent_ino = squashfs_i(inode)->parent; return squashfs_export_iget(inode->i_sb, parent_ino); } /* * Read uncompressed inode lookup table indexes off disk into memory */ __le64 *squashfs_read_inode_lookup_table(struct super_block *sb, u64 lookup_table_start, u64 next_table, unsigned int inodes) { unsigned int length = SQUASHFS_LOOKUP_BLOCK_BYTES(inodes); unsigned int indexes = SQUASHFS_LOOKUP_BLOCKS(inodes); int n; __le64 *table; u64 start, end; TRACE("In read_inode_lookup_table, length %d\n", length); /* Sanity check values */ /* there should always be at least one inode */ if (inodes == 0) return ERR_PTR(-EINVAL); /* * The computed size of the lookup table (length bytes) should exactly * match the table start and end points */ if (length != (next_table - lookup_table_start)) return ERR_PTR(-EINVAL); table = squashfs_read_table(sb, lookup_table_start, length); if (IS_ERR(table)) return table; /* * table0], table[1], ... table[indexes - 1] store the locations * of the compressed inode lookup blocks. Each entry should be * less than the next (i.e. table[0] < table[1]), and the difference * between them should be SQUASHFS_METADATA_SIZE or less. * table[indexes - 1] should be less than lookup_table_start, and * again the difference should be SQUASHFS_METADATA_SIZE or less */ for (n = 0; n < (indexes - 1); n++) { start = le64_to_cpu(table[n]); end = le64_to_cpu(table[n + 1]); if (start >= end || (end - start) > (SQUASHFS_METADATA_SIZE + SQUASHFS_BLOCK_OFFSET)) { kfree(table); return ERR_PTR(-EINVAL); } } start = le64_to_cpu(table[indexes - 1]); if (start >= lookup_table_start || (lookup_table_start - start) > (SQUASHFS_METADATA_SIZE + SQUASHFS_BLOCK_OFFSET)) { kfree(table); return ERR_PTR(-EINVAL); } return table; } const struct export_operations squashfs_export_ops = { .encode_fh = generic_encode_ino32_fh, .fh_to_dentry = squashfs_fh_to_dentry, .fh_to_parent = squashfs_fh_to_parent, .get_parent = squashfs_get_parent }; |
6 6 6 6 6 6 7 7 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 5321 5322 5323 5324 5325 5326 5327 5328 5329 5330 5331 5332 5333 5334 5335 5336 5337 5338 5339 5340 5341 5342 5343 5344 5345 5346 5347 5348 5349 5350 5351 5352 5353 5354 5355 5356 5357 5358 5359 5360 5361 5362 5363 5364 5365 5366 5367 5368 5369 5370 5371 5372 5373 5374 5375 5376 5377 5378 5379 5380 5381 5382 5383 5384 5385 5386 5387 5388 5389 5390 5391 5392 5393 5394 5395 5396 5397 5398 5399 5400 5401 5402 5403 5404 5405 5406 5407 5408 5409 5410 5411 5412 5413 5414 5415 5416 5417 5418 5419 5420 5421 5422 5423 5424 5425 5426 5427 5428 5429 5430 5431 5432 5433 5434 5435 5436 5437 5438 5439 5440 5441 5442 5443 5444 5445 5446 5447 5448 5449 5450 5451 5452 5453 5454 5455 5456 5457 5458 5459 5460 5461 5462 5463 5464 5465 5466 5467 5468 5469 5470 5471 5472 5473 5474 5475 5476 5477 5478 5479 5480 5481 5482 5483 5484 5485 5486 5487 5488 5489 5490 5491 5492 5493 5494 5495 5496 5497 5498 5499 5500 5501 5502 5503 5504 5505 5506 5507 5508 5509 5510 5511 5512 5513 5514 5515 5516 5517 5518 5519 5520 5521 5522 5523 5524 5525 5526 5527 5528 5529 5530 5531 5532 5533 5534 5535 5536 5537 5538 5539 5540 5541 5542 5543 5544 5545 5546 5547 5548 5549 5550 5551 5552 5553 5554 5555 5556 5557 5558 5559 5560 5561 5562 5563 5564 5565 5566 5567 5568 5569 5570 5571 5572 5573 5574 5575 5576 5577 5578 5579 5580 5581 5582 5583 5584 5585 5586 5587 5588 5589 5590 5591 5592 5593 5594 5595 5596 5597 5598 5599 5600 5601 5602 5603 5604 5605 5606 5607 5608 5609 5610 5611 5612 5613 5614 5615 5616 5617 5618 5619 5620 5621 5622 5623 5624 5625 5626 5627 5628 5629 5630 5631 5632 5633 5634 5635 5636 5637 5638 5639 5640 5641 5642 5643 5644 5645 5646 5647 5648 5649 5650 5651 5652 5653 5654 5655 5656 5657 5658 5659 5660 5661 5662 5663 5664 5665 5666 5667 5668 5669 5670 5671 5672 5673 5674 5675 5676 5677 5678 5679 5680 5681 5682 5683 5684 5685 5686 5687 5688 5689 5690 5691 5692 5693 5694 5695 5696 5697 5698 5699 5700 5701 5702 5703 5704 5705 5706 5707 5708 5709 5710 5711 5712 5713 5714 5715 5716 5717 5718 5719 5720 5721 5722 5723 5724 5725 5726 5727 5728 5729 5730 5731 5732 5733 5734 5735 5736 5737 5738 5739 5740 5741 5742 5743 5744 5745 5746 5747 5748 5749 5750 5751 5752 5753 5754 5755 5756 5757 5758 5759 5760 5761 5762 5763 5764 5765 5766 5767 5768 5769 5770 5771 5772 5773 5774 5775 5776 5777 5778 5779 5780 5781 5782 5783 5784 5785 5786 5787 5788 5789 5790 5791 5792 5793 5794 5795 5796 5797 5798 5799 5800 5801 5802 5803 5804 5805 5806 5807 5808 5809 5810 5811 5812 5813 5814 5815 5816 5817 5818 5819 5820 5821 5822 5823 5824 5825 5826 5827 5828 5829 5830 5831 5832 5833 5834 5835 5836 5837 5838 5839 5840 5841 5842 5843 5844 5845 5846 5847 5848 5849 5850 5851 5852 5853 5854 5855 5856 5857 5858 5859 5860 5861 5862 5863 5864 5865 5866 5867 5868 5869 5870 5871 5872 5873 5874 5875 5876 5877 5878 5879 5880 5881 5882 5883 5884 5885 5886 5887 5888 5889 5890 5891 5892 5893 5894 5895 5896 5897 5898 5899 5900 5901 5902 5903 5904 5905 5906 5907 5908 5909 5910 5911 5912 5913 5914 5915 5916 5917 5918 5919 5920 5921 5922 5923 5924 5925 5926 5927 5928 5929 5930 5931 5932 5933 5934 5935 5936 5937 5938 5939 5940 5941 5942 5943 5944 5945 5946 5947 5948 5949 5950 5951 5952 5953 5954 5955 5956 5957 5958 5959 5960 5961 5962 5963 5964 5965 5966 5967 5968 5969 5970 5971 5972 5973 5974 5975 5976 5977 5978 5979 5980 5981 5982 5983 5984 5985 5986 5987 5988 5989 5990 5991 5992 5993 5994 5995 5996 5997 5998 5999 6000 6001 6002 6003 6004 6005 6006 6007 6008 6009 6010 6011 6012 6013 6014 6015 6016 6017 6018 6019 6020 6021 6022 6023 6024 6025 6026 6027 6028 6029 6030 6031 6032 6033 6034 6035 6036 6037 6038 6039 6040 6041 6042 6043 6044 6045 6046 6047 6048 6049 6050 6051 6052 6053 6054 6055 6056 6057 6058 6059 6060 6061 6062 6063 6064 6065 6066 6067 6068 6069 6070 6071 6072 6073 6074 6075 6076 6077 6078 6079 6080 6081 6082 6083 6084 6085 6086 6087 6088 6089 6090 6091 6092 6093 6094 6095 6096 6097 6098 6099 6100 6101 6102 6103 6104 6105 6106 6107 6108 6109 6110 6111 6112 6113 6114 6115 6116 6117 6118 6119 6120 6121 6122 6123 6124 6125 6126 6127 6128 6129 6130 6131 6132 6133 6134 6135 6136 6137 6138 6139 6140 6141 6142 6143 6144 6145 6146 6147 6148 6149 6150 6151 6152 6153 6154 6155 6156 6157 6158 6159 6160 6161 6162 6163 6164 6165 6166 6167 6168 6169 6170 6171 6172 6173 6174 6175 6176 6177 6178 6179 6180 6181 6182 6183 6184 6185 6186 6187 6188 6189 6190 6191 6192 6193 6194 6195 6196 6197 6198 6199 6200 6201 6202 6203 6204 6205 6206 6207 6208 6209 6210 6211 6212 6213 6214 6215 6216 6217 6218 6219 6220 6221 6222 6223 6224 6225 6226 6227 6228 6229 6230 6231 6232 6233 6234 6235 6236 6237 6238 6239 6240 6241 6242 6243 6244 6245 6246 6247 6248 6249 6250 6251 6252 6253 6254 6255 6256 6257 6258 6259 6260 6261 6262 6263 6264 6265 6266 6267 6268 6269 6270 6271 6272 6273 6274 6275 6276 6277 6278 6279 6280 6281 6282 6283 6284 6285 6286 6287 6288 6289 6290 6291 6292 6293 6294 6295 6296 6297 6298 6299 6300 6301 6302 6303 6304 6305 6306 6307 6308 6309 6310 6311 6312 6313 6314 6315 6316 6317 6318 6319 6320 6321 6322 6323 6324 6325 6326 6327 6328 6329 6330 6331 6332 6333 6334 6335 6336 6337 6338 6339 6340 6341 6342 6343 6344 6345 6346 6347 6348 6349 6350 6351 6352 6353 6354 6355 6356 6357 6358 6359 6360 6361 6362 6363 6364 6365 6366 6367 6368 6369 6370 6371 6372 6373 6374 6375 6376 6377 6378 6379 6380 6381 6382 6383 6384 6385 6386 6387 6388 6389 6390 6391 6392 6393 6394 6395 6396 6397 6398 6399 6400 6401 6402 6403 6404 6405 6406 6407 6408 6409 6410 6411 6412 6413 6414 6415 6416 6417 6418 6419 6420 6421 6422 6423 6424 6425 6426 6427 6428 6429 6430 6431 6432 6433 6434 6435 6436 6437 6438 6439 6440 6441 6442 6443 6444 6445 6446 6447 6448 6449 | // SPDX-License-Identifier: GPL-2.0 #include <linux/ceph/ceph_debug.h> #include <linux/fs.h> #include <linux/wait.h> #include <linux/slab.h> #include <linux/gfp.h> #include <linux/sched.h> #include <linux/debugfs.h> #include <linux/seq_file.h> #include <linux/ratelimit.h> #include <linux/bits.h> #include <linux/ktime.h> #include <linux/bitmap.h> #include <linux/mnt_idmapping.h> #include "super.h" #include "mds_client.h" #include "crypto.h" #include <linux/ceph/ceph_features.h> #include <linux/ceph/messenger.h> #include <linux/ceph/decode.h> #include <linux/ceph/pagelist.h> #include <linux/ceph/auth.h> #include <linux/ceph/debugfs.h> #define RECONNECT_MAX_SIZE (INT_MAX - PAGE_SIZE) /* * A cluster of MDS (metadata server) daemons is responsible for * managing the file system namespace (the directory hierarchy and * inodes) and for coordinating shared access to storage. Metadata is * partitioning hierarchically across a number of servers, and that * partition varies over time as the cluster adjusts the distribution * in order to balance load. * * The MDS client is primarily responsible to managing synchronous * metadata requests for operations like open, unlink, and so forth. * If there is a MDS failure, we find out about it when we (possibly * request and) receive a new MDS map, and can resubmit affected * requests. * * For the most part, though, we take advantage of a lossless * communications channel to the MDS, and do not need to worry about * timing out or resubmitting requests. * * We maintain a stateful "session" with each MDS we interact with. * Within each session, we sent periodic heartbeat messages to ensure * any capabilities or leases we have been issues remain valid. If * the session times out and goes stale, our leases and capabilities * are no longer valid. */ struct ceph_reconnect_state { struct ceph_mds_session *session; int nr_caps, nr_realms; struct ceph_pagelist *pagelist; unsigned msg_version; bool allow_multi; }; static void __wake_requests(struct ceph_mds_client *mdsc, struct list_head *head); static void ceph_cap_release_work(struct work_struct *work); static void ceph_cap_reclaim_work(struct work_struct *work); static const struct ceph_connection_operations mds_con_ops; /* * mds reply parsing */ static int parse_reply_info_quota(void **p, void *end, struct ceph_mds_reply_info_in *info) { u8 struct_v, struct_compat; u32 struct_len; ceph_decode_8_safe(p, end, struct_v, bad); ceph_decode_8_safe(p, end, struct_compat, bad); /* struct_v is expected to be >= 1. we only * understand encoding with struct_compat == 1. */ if (!struct_v || struct_compat != 1) goto bad; ceph_decode_32_safe(p, end, struct_len, bad); ceph_decode_need(p, end, struct_len, bad); end = *p + struct_len; ceph_decode_64_safe(p, end, info->max_bytes, bad); ceph_decode_64_safe(p, end, info->max_files, bad); *p = end; return 0; bad: return -EIO; } /* * parse individual inode info */ static int parse_reply_info_in(void **p, void *end, struct ceph_mds_reply_info_in *info, u64 features) { int err = 0; u8 struct_v = 0; if (features == (u64)-1) { u32 struct_len; u8 struct_compat; ceph_decode_8_safe(p, end, struct_v, bad); ceph_decode_8_safe(p, end, struct_compat, bad); /* struct_v is expected to be >= 1. we only understand * encoding with struct_compat == 1. */ if (!struct_v || struct_compat != 1) goto bad; ceph_decode_32_safe(p, end, struct_len, bad); ceph_decode_need(p, end, struct_len, bad); end = *p + struct_len; } ceph_decode_need(p, end, sizeof(struct ceph_mds_reply_inode), bad); info->in = *p; *p += sizeof(struct ceph_mds_reply_inode) + sizeof(*info->in->fragtree.splits) * le32_to_cpu(info->in->fragtree.nsplits); ceph_decode_32_safe(p, end, info->symlink_len, bad); ceph_decode_need(p, end, info->symlink_len, bad); info->symlink = *p; *p += info->symlink_len; ceph_decode_copy_safe(p, end, &info->dir_layout, sizeof(info->dir_layout), bad); ceph_decode_32_safe(p, end, info->xattr_len, bad); ceph_decode_need(p, end, info->xattr_len, bad); info->xattr_data = *p; *p += info->xattr_len; if (features == (u64)-1) { /* inline data */ ceph_decode_64_safe(p, end, info->inline_version, bad); ceph_decode_32_safe(p, end, info->inline_len, bad); ceph_decode_need(p, end, info->inline_len, bad); info->inline_data = *p; *p += info->inline_len; /* quota */ err = parse_reply_info_quota(p, end, info); if (err < 0) goto out_bad; /* pool namespace */ ceph_decode_32_safe(p, end, info->pool_ns_len, bad); if (info->pool_ns_len > 0) { ceph_decode_need(p, end, info->pool_ns_len, bad); info->pool_ns_data = *p; *p += info->pool_ns_len; } /* btime */ ceph_decode_need(p, end, sizeof(info->btime), bad); ceph_decode_copy(p, &info->btime, sizeof(info->btime)); /* change attribute */ ceph_decode_64_safe(p, end, info->change_attr, bad); /* dir pin */ if (struct_v >= 2) { ceph_decode_32_safe(p, end, info->dir_pin, bad); } else { info->dir_pin = -ENODATA; } /* snapshot birth time, remains zero for v<=2 */ if (struct_v >= 3) { ceph_decode_need(p, end, sizeof(info->snap_btime), bad); ceph_decode_copy(p, &info->snap_btime, sizeof(info->snap_btime)); } else { memset(&info->snap_btime, 0, sizeof(info->snap_btime)); } /* snapshot count, remains zero for v<=3 */ if (struct_v >= 4) { ceph_decode_64_safe(p, end, info->rsnaps, bad); } else { info->rsnaps = 0; } if (struct_v >= 5) { u32 alen; ceph_decode_32_safe(p, end, alen, bad); while (alen--) { u32 len; /* key */ ceph_decode_32_safe(p, end, len, bad); ceph_decode_skip_n(p, end, len, bad); /* value */ ceph_decode_32_safe(p, end, len, bad); ceph_decode_skip_n(p, end, len, bad); } } /* fscrypt flag -- ignore */ if (struct_v >= 6) ceph_decode_skip_8(p, end, bad); info->fscrypt_auth = NULL; info->fscrypt_auth_len = 0; info->fscrypt_file = NULL; info->fscrypt_file_len = 0; if (struct_v >= 7) { ceph_decode_32_safe(p, end, info->fscrypt_auth_len, bad); if (info->fscrypt_auth_len) { info->fscrypt_auth = kmalloc(info->fscrypt_auth_len, GFP_KERNEL); if (!info->fscrypt_auth) return -ENOMEM; ceph_decode_copy_safe(p, end, info->fscrypt_auth, info->fscrypt_auth_len, bad); } ceph_decode_32_safe(p, end, info->fscrypt_file_len, bad); if (info->fscrypt_file_len) { info->fscrypt_file = kmalloc(info->fscrypt_file_len, GFP_KERNEL); if (!info->fscrypt_file) return -ENOMEM; ceph_decode_copy_safe(p, end, info->fscrypt_file, info->fscrypt_file_len, bad); } } *p = end; } else { /* legacy (unversioned) struct */ if (features & CEPH_FEATURE_MDS_INLINE_DATA) { ceph_decode_64_safe(p, end, info->inline_version, bad); ceph_decode_32_safe(p, end, info->inline_len, bad); ceph_decode_need(p, end, info->inline_len, bad); info->inline_data = *p; *p += info->inline_len; } else info->inline_version = CEPH_INLINE_NONE; if (features & CEPH_FEATURE_MDS_QUOTA) { err = parse_reply_info_quota(p, end, info); if (err < 0) goto out_bad; } else { info->max_bytes = 0; info->max_files = 0; } info->pool_ns_len = 0; info->pool_ns_data = NULL; if (features & CEPH_FEATURE_FS_FILE_LAYOUT_V2) { ceph_decode_32_safe(p, end, info->pool_ns_len, bad); if (info->pool_ns_len > 0) { ceph_decode_need(p, end, info->pool_ns_len, bad); info->pool_ns_data = *p; *p += info->pool_ns_len; } } if (features & CEPH_FEATURE_FS_BTIME) { ceph_decode_need(p, end, sizeof(info->btime), bad); ceph_decode_copy(p, &info->btime, sizeof(info->btime)); ceph_decode_64_safe(p, end, info->change_attr, bad); } info->dir_pin = -ENODATA; /* info->snap_btime and info->rsnaps remain zero */ } return 0; bad: err = -EIO; out_bad: return err; } static int parse_reply_info_dir(void **p, void *end, struct ceph_mds_reply_dirfrag **dirfrag, u64 features) { if (features == (u64)-1) { u8 struct_v, struct_compat; u32 struct_len; ceph_decode_8_safe(p, end, struct_v, bad); ceph_decode_8_safe(p, end, struct_compat, bad); /* struct_v is expected to be >= 1. we only understand * encoding whose struct_compat == 1. */ if (!struct_v || struct_compat != 1) goto bad; ceph_decode_32_safe(p, end, struct_len, bad); ceph_decode_need(p, end, struct_len, bad); end = *p + struct_len; } ceph_decode_need(p, end, sizeof(**dirfrag), bad); *dirfrag = *p; *p += sizeof(**dirfrag) + sizeof(u32) * le32_to_cpu((*dirfrag)->ndist); if (unlikely(*p > end)) goto bad; if (features == (u64)-1) *p = end; return 0; bad: return -EIO; } static int parse_reply_info_lease(void **p, void *end, struct ceph_mds_reply_lease **lease, u64 features, u32 *altname_len, u8 **altname) { u8 struct_v; u32 struct_len; void *lend; if (features == (u64)-1) { u8 struct_compat; ceph_decode_8_safe(p, end, struct_v, bad); ceph_decode_8_safe(p, end, struct_compat, bad); /* struct_v is expected to be >= 1. we only understand * encoding whose struct_compat == 1. */ if (!struct_v || struct_compat != 1) goto bad; ceph_decode_32_safe(p, end, struct_len, bad); } else { struct_len = sizeof(**lease); *altname_len = 0; *altname = NULL; } lend = *p + struct_len; ceph_decode_need(p, end, struct_len, bad); *lease = *p; *p += sizeof(**lease); if (features == (u64)-1) { if (struct_v >= 2) { ceph_decode_32_safe(p, end, *altname_len, bad); ceph_decode_need(p, end, *altname_len, bad); *altname = *p; *p += *altname_len; } else { *altname = NULL; *altname_len = 0; } } *p = lend; return 0; bad: return -EIO; } /* * parse a normal reply, which may contain a (dir+)dentry and/or a * target inode. */ static int parse_reply_info_trace(void **p, void *end, struct ceph_mds_reply_info_parsed *info, u64 features) { int err; if (info->head->is_dentry) { err = parse_reply_info_in(p, end, &info->diri, features); if (err < 0) goto out_bad; err = parse_reply_info_dir(p, end, &info->dirfrag, features); if (err < 0) goto out_bad; ceph_decode_32_safe(p, end, info->dname_len, bad); ceph_decode_need(p, end, info->dname_len, bad); info->dname = *p; *p += info->dname_len; err = parse_reply_info_lease(p, end, &info->dlease, features, &info->altname_len, &info->altname); if (err < 0) goto out_bad; } if (info->head->is_target) { err = parse_reply_info_in(p, end, &info->targeti, features); if (err < 0) goto out_bad; } if (unlikely(*p != end)) goto bad; return 0; bad: err = -EIO; out_bad: pr_err("problem parsing mds trace %d\n", err); return err; } /* * parse readdir results */ static int parse_reply_info_readdir(void **p, void *end, struct ceph_mds_request *req, u64 features) { struct ceph_mds_reply_info_parsed *info = &req->r_reply_info; struct ceph_client *cl = req->r_mdsc->fsc->client; u32 num, i = 0; int err; err = parse_reply_info_dir(p, end, &info->dir_dir, features); if (err < 0) goto out_bad; ceph_decode_need(p, end, sizeof(num) + 2, bad); num = ceph_decode_32(p); { u16 flags = ceph_decode_16(p); info->dir_end = !!(flags & CEPH_READDIR_FRAG_END); info->dir_complete = !!(flags & CEPH_READDIR_FRAG_COMPLETE); info->hash_order = !!(flags & CEPH_READDIR_HASH_ORDER); info->offset_hash = !!(flags & CEPH_READDIR_OFFSET_HASH); } if (num == 0) goto done; BUG_ON(!info->dir_entries); if ((unsigned long)(info->dir_entries + num) > (unsigned long)info->dir_entries + info->dir_buf_size) { pr_err_client(cl, "dir contents are larger than expected\n"); WARN_ON(1); goto bad; } info->dir_nr = num; while (num) { struct inode *inode = d_inode(req->r_dentry); struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_mds_reply_dir_entry *rde = info->dir_entries + i; struct fscrypt_str tname = FSTR_INIT(NULL, 0); struct fscrypt_str oname = FSTR_INIT(NULL, 0); struct ceph_fname fname; u32 altname_len, _name_len; u8 *altname, *_name; /* dentry */ ceph_decode_32_safe(p, end, _name_len, bad); ceph_decode_need(p, end, _name_len, bad); _name = *p; *p += _name_len; doutc(cl, "parsed dir dname '%.*s'\n", _name_len, _name); if (info->hash_order) rde->raw_hash = ceph_str_hash(ci->i_dir_layout.dl_dir_hash, _name, _name_len); /* dentry lease */ err = parse_reply_info_lease(p, end, &rde->lease, features, &altname_len, &altname); if (err) goto out_bad; /* * Try to dencrypt the dentry names and update them * in the ceph_mds_reply_dir_entry struct. */ fname.dir = inode; fname.name = _name; fname.name_len = _name_len; fname.ctext = altname; fname.ctext_len = altname_len; /* * The _name_len maybe larger than altname_len, such as * when the human readable name length is in range of * (CEPH_NOHASH_NAME_MAX, CEPH_NOHASH_NAME_MAX + SHA256_DIGEST_SIZE), * then the copy in ceph_fname_to_usr will corrupt the * data if there has no encryption key. * * Just set the no_copy flag and then if there has no * encryption key the oname.name will be assigned to * _name always. */ fname.no_copy = true; if (altname_len == 0) { /* * Set tname to _name, and this will be used * to do the base64_decode in-place. It's * safe because the decoded string should * always be shorter, which is 3/4 of origin * string. */ tname.name = _name; /* * Set oname to _name too, and this will be * used to do the dencryption in-place. */ oname.name = _name; oname.len = _name_len; } else { /* * This will do the decryption only in-place * from altname cryptext directly. */ oname.name = altname; oname.len = altname_len; } rde->is_nokey = false; err = ceph_fname_to_usr(&fname, &tname, &oname, &rde->is_nokey); if (err) { pr_err_client(cl, "unable to decode %.*s, got %d\n", _name_len, _name, err); goto out_bad; } rde->name = oname.name; rde->name_len = oname.len; /* inode */ err = parse_reply_info_in(p, end, &rde->inode, features); if (err < 0) goto out_bad; /* ceph_readdir_prepopulate() will update it */ rde->offset = 0; i++; num--; } done: /* Skip over any unrecognized fields */ *p = end; return 0; bad: err = -EIO; out_bad: pr_err_client(cl, "problem parsing dir contents %d\n", err); return err; } /* * parse fcntl F_GETLK results */ static int parse_reply_info_filelock(void **p, void *end, struct ceph_mds_reply_info_parsed *info, u64 features) { if (*p + sizeof(*info->filelock_reply) > end) goto bad; info->filelock_reply = *p; /* Skip over any unrecognized fields */ *p = end; return 0; bad: return -EIO; } #if BITS_PER_LONG == 64 #define DELEGATED_INO_AVAILABLE xa_mk_value(1) static int ceph_parse_deleg_inos(void **p, void *end, struct ceph_mds_session *s) { struct ceph_client *cl = s->s_mdsc->fsc->client; u32 sets; ceph_decode_32_safe(p, end, sets, bad); doutc(cl, "got %u sets of delegated inodes\n", sets); while (sets--) { u64 start, len; ceph_decode_64_safe(p, end, start, bad); ceph_decode_64_safe(p, end, len, bad); /* Don't accept a delegation of system inodes */ if (start < CEPH_INO_SYSTEM_BASE) { pr_warn_ratelimited_client(cl, "ignoring reserved inode range delegation (start=0x%llx len=0x%llx)\n", start, len); continue; } while (len--) { int err = xa_insert(&s->s_delegated_inos, start++, DELEGATED_INO_AVAILABLE, GFP_KERNEL); if (!err) { doutc(cl, "added delegated inode 0x%llx\n", start - 1); } else if (err == -EBUSY) { pr_warn_client(cl, "MDS delegated inode 0x%llx more than once.\n", start - 1); } else { return err; } } } return 0; bad: return -EIO; } u64 ceph_get_deleg_ino(struct ceph_mds_session *s) { unsigned long ino; void *val; xa_for_each(&s->s_delegated_inos, ino, val) { val = xa_erase(&s->s_delegated_inos, ino); if (val == DELEGATED_INO_AVAILABLE) return ino; } return 0; } int ceph_restore_deleg_ino(struct ceph_mds_session *s, u64 ino) { return xa_insert(&s->s_delegated_inos, ino, DELEGATED_INO_AVAILABLE, GFP_KERNEL); } #else /* BITS_PER_LONG == 64 */ /* * FIXME: xarrays can't handle 64-bit indexes on a 32-bit arch. For now, just * ignore delegated_inos on 32 bit arch. Maybe eventually add xarrays for top * and bottom words? */ static int ceph_parse_deleg_inos(void **p, void *end, struct ceph_mds_session *s) { u32 sets; ceph_decode_32_safe(p, end, sets, bad); if (sets) ceph_decode_skip_n(p, end, sets * 2 * sizeof(__le64), bad); return 0; bad: return -EIO; } u64 ceph_get_deleg_ino(struct ceph_mds_session *s) { return 0; } int ceph_restore_deleg_ino(struct ceph_mds_session *s, u64 ino) { return 0; } #endif /* BITS_PER_LONG == 64 */ /* * parse create results */ static int parse_reply_info_create(void **p, void *end, struct ceph_mds_reply_info_parsed *info, u64 features, struct ceph_mds_session *s) { int ret; if (features == (u64)-1 || (features & CEPH_FEATURE_REPLY_CREATE_INODE)) { if (*p == end) { /* Malformed reply? */ info->has_create_ino = false; } else if (test_bit(CEPHFS_FEATURE_DELEG_INO, &s->s_features)) { info->has_create_ino = true; /* struct_v, struct_compat, and len */ ceph_decode_skip_n(p, end, 2 + sizeof(u32), bad); ceph_decode_64_safe(p, end, info->ino, bad); ret = ceph_parse_deleg_inos(p, end, s); if (ret) return ret; } else { /* legacy */ ceph_decode_64_safe(p, end, info->ino, bad); info->has_create_ino = true; } } else { if (*p != end) goto bad; } /* Skip over any unrecognized fields */ *p = end; return 0; bad: return -EIO; } static int parse_reply_info_getvxattr(void **p, void *end, struct ceph_mds_reply_info_parsed *info, u64 features) { u32 value_len; ceph_decode_skip_8(p, end, bad); /* skip current version: 1 */ ceph_decode_skip_8(p, end, bad); /* skip first version: 1 */ ceph_decode_skip_32(p, end, bad); /* skip payload length */ ceph_decode_32_safe(p, end, value_len, bad); if (value_len == end - *p) { info->xattr_info.xattr_value = *p; info->xattr_info.xattr_value_len = value_len; *p = end; return value_len; } bad: return -EIO; } /* * parse extra results */ static int parse_reply_info_extra(void **p, void *end, struct ceph_mds_request *req, u64 features, struct ceph_mds_session *s) { struct ceph_mds_reply_info_parsed *info = &req->r_reply_info; u32 op = le32_to_cpu(info->head->op); if (op == CEPH_MDS_OP_GETFILELOCK) return parse_reply_info_filelock(p, end, info, features); else if (op == CEPH_MDS_OP_READDIR || op == CEPH_MDS_OP_LSSNAP) return parse_reply_info_readdir(p, end, req, features); else if (op == CEPH_MDS_OP_CREATE) return parse_reply_info_create(p, end, info, features, s); else if (op == CEPH_MDS_OP_GETVXATTR) return parse_reply_info_getvxattr(p, end, info, features); else return -EIO; } /* * parse entire mds reply */ static int parse_reply_info(struct ceph_mds_session *s, struct ceph_msg *msg, struct ceph_mds_request *req, u64 features) { struct ceph_mds_reply_info_parsed *info = &req->r_reply_info; struct ceph_client *cl = s->s_mdsc->fsc->client; void *p, *end; u32 len; int err; info->head = msg->front.iov_base; p = msg->front.iov_base + sizeof(struct ceph_mds_reply_head); end = p + msg->front.iov_len - sizeof(struct ceph_mds_reply_head); /* trace */ ceph_decode_32_safe(&p, end, len, bad); if (len > 0) { ceph_decode_need(&p, end, len, bad); err = parse_reply_info_trace(&p, p+len, info, features); if (err < 0) goto out_bad; } /* extra */ ceph_decode_32_safe(&p, end, len, bad); if (len > 0) { ceph_decode_need(&p, end, len, bad); err = parse_reply_info_extra(&p, p+len, req, features, s); if (err < 0) goto out_bad; } /* snap blob */ ceph_decode_32_safe(&p, end, len, bad); info->snapblob_len = len; info->snapblob = p; p += len; if (p != end) goto bad; return 0; bad: err = -EIO; out_bad: pr_err_client(cl, "mds parse_reply err %d\n", err); ceph_msg_dump(msg); return err; } static void destroy_reply_info(struct ceph_mds_reply_info_parsed *info) { int i; kfree(info->diri.fscrypt_auth); kfree(info->diri.fscrypt_file); kfree(info->targeti.fscrypt_auth); kfree(info->targeti.fscrypt_file); if (!info->dir_entries) return; for (i = 0; i < info->dir_nr; i++) { struct ceph_mds_reply_dir_entry *rde = info->dir_entries + i; kfree(rde->inode.fscrypt_auth); kfree(rde->inode.fscrypt_file); } free_pages((unsigned long)info->dir_entries, get_order(info->dir_buf_size)); } /* * In async unlink case the kclient won't wait for the first reply * from MDS and just drop all the links and unhash the dentry and then * succeeds immediately. * * For any new create/link/rename,etc requests followed by using the * same file names we must wait for the first reply of the inflight * unlink request, or the MDS possibly will fail these following * requests with -EEXIST if the inflight async unlink request was * delayed for some reasons. * * And the worst case is that for the none async openc request it will * successfully open the file if the CDentry hasn't been unlinked yet, * but later the previous delayed async unlink request will remove the * CDenty. That means the just created file is possiblly deleted later * by accident. * * We need to wait for the inflight async unlink requests to finish * when creating new files/directories by using the same file names. */ int ceph_wait_on_conflict_unlink(struct dentry *dentry) { struct ceph_fs_client *fsc = ceph_sb_to_fs_client(dentry->d_sb); struct ceph_client *cl = fsc->client; struct dentry *pdentry = dentry->d_parent; struct dentry *udentry, *found = NULL; struct ceph_dentry_info *di; struct qstr dname; u32 hash = dentry->d_name.hash; int err; dname.name = dentry->d_name.name; dname.len = dentry->d_name.len; rcu_read_lock(); hash_for_each_possible_rcu(fsc->async_unlink_conflict, di, hnode, hash) { udentry = di->dentry; spin_lock(&udentry->d_lock); if (udentry->d_name.hash != hash) goto next; if (unlikely(udentry->d_parent != pdentry)) goto next; if (!hash_hashed(&di->hnode)) goto next; if (!test_bit(CEPH_DENTRY_ASYNC_UNLINK_BIT, &di->flags)) pr_warn_client(cl, "dentry %p:%pd async unlink bit is not set\n", dentry, dentry); if (!d_same_name(udentry, pdentry, &dname)) goto next; found = dget_dlock(udentry); spin_unlock(&udentry->d_lock); break; next: spin_unlock(&udentry->d_lock); } rcu_read_unlock(); if (likely(!found)) return 0; doutc(cl, "dentry %p:%pd conflict with old %p:%pd\n", dentry, dentry, found, found); err = wait_on_bit(&di->flags, CEPH_DENTRY_ASYNC_UNLINK_BIT, TASK_KILLABLE); dput(found); return err; } /* * sessions */ const char *ceph_session_state_name(int s) { switch (s) { case CEPH_MDS_SESSION_NEW: return "new"; case CEPH_MDS_SESSION_OPENING: return "opening"; case CEPH_MDS_SESSION_OPEN: return "open"; case CEPH_MDS_SESSION_HUNG: return "hung"; case CEPH_MDS_SESSION_CLOSING: return "closing"; case CEPH_MDS_SESSION_CLOSED: return "closed"; case CEPH_MDS_SESSION_RESTARTING: return "restarting"; case CEPH_MDS_SESSION_RECONNECTING: return "reconnecting"; case CEPH_MDS_SESSION_REJECTED: return "rejected"; default: return "???"; } } struct ceph_mds_session *ceph_get_mds_session(struct ceph_mds_session *s) { if (refcount_inc_not_zero(&s->s_ref)) return s; return NULL; } void ceph_put_mds_session(struct ceph_mds_session *s) { if (IS_ERR_OR_NULL(s)) return; if (refcount_dec_and_test(&s->s_ref)) { if (s->s_auth.authorizer) ceph_auth_destroy_authorizer(s->s_auth.authorizer); WARN_ON(mutex_is_locked(&s->s_mutex)); xa_destroy(&s->s_delegated_inos); kfree(s); } } /* * called under mdsc->mutex */ struct ceph_mds_session *__ceph_lookup_mds_session(struct ceph_mds_client *mdsc, int mds) { if (mds >= mdsc->max_sessions || !mdsc->sessions[mds]) return NULL; return ceph_get_mds_session(mdsc->sessions[mds]); } static bool __have_session(struct ceph_mds_client *mdsc, int mds) { if (mds >= mdsc->max_sessions || !mdsc->sessions[mds]) return false; else return true; } static int __verify_registered_session(struct ceph_mds_client *mdsc, struct ceph_mds_session *s) { if (s->s_mds >= mdsc->max_sessions || mdsc->sessions[s->s_mds] != s) return -ENOENT; return 0; } /* * create+register a new session for given mds. * called under mdsc->mutex. */ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc, int mds) { struct ceph_client *cl = mdsc->fsc->client; struct ceph_mds_session *s; if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_FENCE_IO) return ERR_PTR(-EIO); if (mds >= mdsc->mdsmap->possible_max_rank) return ERR_PTR(-EINVAL); s = kzalloc(sizeof(*s), GFP_NOFS); if (!s) return ERR_PTR(-ENOMEM); if (mds >= mdsc->max_sessions) { int newmax = 1 << get_count_order(mds + 1); struct ceph_mds_session **sa; doutc(cl, "realloc to %d\n", newmax); sa = kcalloc(newmax, sizeof(void *), GFP_NOFS); if (!sa) goto fail_realloc; if (mdsc->sessions) { memcpy(sa, mdsc->sessions, mdsc->max_sessions * sizeof(void *)); kfree(mdsc->sessions); } mdsc->sessions = sa; mdsc->max_sessions = newmax; } doutc(cl, "mds%d\n", mds); s->s_mdsc = mdsc; s->s_mds = mds; s->s_state = CEPH_MDS_SESSION_NEW; mutex_init(&s->s_mutex); ceph_con_init(&s->s_con, s, &mds_con_ops, &mdsc->fsc->client->msgr); atomic_set(&s->s_cap_gen, 1); s->s_cap_ttl = jiffies - 1; spin_lock_init(&s->s_cap_lock); INIT_LIST_HEAD(&s->s_caps); refcount_set(&s->s_ref, 1); INIT_LIST_HEAD(&s->s_waiting); INIT_LIST_HEAD(&s->s_unsafe); xa_init(&s->s_delegated_inos); INIT_LIST_HEAD(&s->s_cap_releases); INIT_WORK(&s->s_cap_release_work, ceph_cap_release_work); INIT_LIST_HEAD(&s->s_cap_dirty); INIT_LIST_HEAD(&s->s_cap_flushing); mdsc->sessions[mds] = s; atomic_inc(&mdsc->num_sessions); refcount_inc(&s->s_ref); /* one ref to sessions[], one to caller */ ceph_con_open(&s->s_con, CEPH_ENTITY_TYPE_MDS, mds, ceph_mdsmap_get_addr(mdsc->mdsmap, mds)); return s; fail_realloc: kfree(s); return ERR_PTR(-ENOMEM); } /* * called under mdsc->mutex */ static void __unregister_session(struct ceph_mds_client *mdsc, struct ceph_mds_session *s) { doutc(mdsc->fsc->client, "mds%d %p\n", s->s_mds, s); BUG_ON(mdsc->sessions[s->s_mds] != s); mdsc->sessions[s->s_mds] = NULL; ceph_con_close(&s->s_con); ceph_put_mds_session(s); atomic_dec(&mdsc->num_sessions); } /* * drop session refs in request. * * should be last request ref, or hold mdsc->mutex */ static void put_request_session(struct ceph_mds_request *req) { if (req->r_session) { ceph_put_mds_session(req->r_session); req->r_session = NULL; } } void ceph_mdsc_iterate_sessions(struct ceph_mds_client *mdsc, void (*cb)(struct ceph_mds_session *), bool check_state) { int mds; mutex_lock(&mdsc->mutex); for (mds = 0; mds < mdsc->max_sessions; ++mds) { struct ceph_mds_session *s; s = __ceph_lookup_mds_session(mdsc, mds); if (!s) continue; if (check_state && !check_session_state(s)) { ceph_put_mds_session(s); continue; } mutex_unlock(&mdsc->mutex); cb(s); ceph_put_mds_session(s); mutex_lock(&mdsc->mutex); } mutex_unlock(&mdsc->mutex); } void ceph_mdsc_release_request(struct kref *kref) { struct ceph_mds_request *req = container_of(kref, struct ceph_mds_request, r_kref); ceph_mdsc_release_dir_caps_async(req); destroy_reply_info(&req->r_reply_info); if (req->r_request) ceph_msg_put(req->r_request); if (req->r_reply) ceph_msg_put(req->r_reply); if (req->r_inode) { ceph_put_cap_refs(ceph_inode(req->r_inode), CEPH_CAP_PIN); iput(req->r_inode); } if (req->r_parent) { ceph_put_cap_refs(ceph_inode(req->r_parent), CEPH_CAP_PIN); iput(req->r_parent); } iput(req->r_target_inode); iput(req->r_new_inode); if (req->r_dentry) dput(req->r_dentry); if (req->r_old_dentry) dput(req->r_old_dentry); if (req->r_old_dentry_dir) { /* * track (and drop pins for) r_old_dentry_dir * separately, since r_old_dentry's d_parent may have * changed between the dir mutex being dropped and * this request being freed. */ ceph_put_cap_refs(ceph_inode(req->r_old_dentry_dir), CEPH_CAP_PIN); iput(req->r_old_dentry_dir); } kfree(req->r_path1); kfree(req->r_path2); put_cred(req->r_cred); if (req->r_mnt_idmap) mnt_idmap_put(req->r_mnt_idmap); if (req->r_pagelist) ceph_pagelist_release(req->r_pagelist); kfree(req->r_fscrypt_auth); kfree(req->r_altname); put_request_session(req); ceph_unreserve_caps(req->r_mdsc, &req->r_caps_reservation); WARN_ON_ONCE(!list_empty(&req->r_wait)); kmem_cache_free(ceph_mds_request_cachep, req); } DEFINE_RB_FUNCS(request, struct ceph_mds_request, r_tid, r_node) /* * lookup session, bump ref if found. * * called under mdsc->mutex. */ static struct ceph_mds_request * lookup_get_request(struct ceph_mds_client *mdsc, u64 tid) { struct ceph_mds_request *req; req = lookup_request(&mdsc->request_tree, tid); if (req) ceph_mdsc_get_request(req); return req; } /* * Register an in-flight request, and assign a tid. Link to directory * are modifying (if any). * * Called under mdsc->mutex. */ static void __register_request(struct ceph_mds_client *mdsc, struct ceph_mds_request *req, struct inode *dir) { struct ceph_client *cl = mdsc->fsc->client; int ret = 0; req->r_tid = ++mdsc->last_tid; if (req->r_num_caps) { ret = ceph_reserve_caps(mdsc, &req->r_caps_reservation, req->r_num_caps); if (ret < 0) { pr_err_client(cl, "%p failed to reserve caps: %d\n", req, ret); /* set req->r_err to fail early from __do_request */ req->r_err = ret; return; } } doutc(cl, "%p tid %lld\n", req, req->r_tid); ceph_mdsc_get_request(req); insert_request(&mdsc->request_tree, req); req->r_cred = get_current_cred(); if (!req->r_mnt_idmap) req->r_mnt_idmap = &nop_mnt_idmap; if (mdsc->oldest_tid == 0 && req->r_op != CEPH_MDS_OP_SETFILELOCK) mdsc->oldest_tid = req->r_tid; if (dir) { struct ceph_inode_info *ci = ceph_inode(dir); ihold(dir); req->r_unsafe_dir = dir; spin_lock(&ci->i_unsafe_lock); list_add_tail(&req->r_unsafe_dir_item, &ci->i_unsafe_dirops); spin_unlock(&ci->i_unsafe_lock); } } static void __unregister_request(struct ceph_mds_client *mdsc, struct ceph_mds_request *req) { doutc(mdsc->fsc->client, "%p tid %lld\n", req, req->r_tid); /* Never leave an unregistered request on an unsafe list! */ list_del_init(&req->r_unsafe_item); if (req->r_tid == mdsc->oldest_tid) { struct rb_node *p = rb_next(&req->r_node); mdsc->oldest_tid = 0; while (p) { struct ceph_mds_request *next_req = rb_entry(p, struct ceph_mds_request, r_node); if (next_req->r_op != CEPH_MDS_OP_SETFILELOCK) { mdsc->oldest_tid = next_req->r_tid; break; } p = rb_next(p); } } erase_request(&mdsc->request_tree, req); if (req->r_unsafe_dir) { struct ceph_inode_info *ci = ceph_inode(req->r_unsafe_dir); spin_lock(&ci->i_unsafe_lock); list_del_init(&req->r_unsafe_dir_item); spin_unlock(&ci->i_unsafe_lock); } if (req->r_target_inode && test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) { struct ceph_inode_info *ci = ceph_inode(req->r_target_inode); spin_lock(&ci->i_unsafe_lock); list_del_init(&req->r_unsafe_target_item); spin_unlock(&ci->i_unsafe_lock); } if (req->r_unsafe_dir) { iput(req->r_unsafe_dir); req->r_unsafe_dir = NULL; } complete_all(&req->r_safe_completion); ceph_mdsc_put_request(req); } /* * Walk back up the dentry tree until we hit a dentry representing a * non-snapshot inode. We do this using the rcu_read_lock (which must be held * when calling this) to ensure that the objects won't disappear while we're * working with them. Once we hit a candidate dentry, we attempt to take a * reference to it, and return that as the result. */ static struct inode *get_nonsnap_parent(struct dentry *dentry) { struct inode *inode = NULL; while (dentry && !IS_ROOT(dentry)) { inode = d_inode_rcu(dentry); if (!inode || ceph_snap(inode) == CEPH_NOSNAP) break; dentry = dentry->d_parent; } if (inode) inode = igrab(inode); return inode; } /* * Choose mds to send request to next. If there is a hint set in the * request (e.g., due to a prior forward hint from the mds), use that. * Otherwise, consult frag tree and/or caps to identify the * appropriate mds. If all else fails, choose randomly. * * Called under mdsc->mutex. */ static int __choose_mds(struct ceph_mds_client *mdsc, struct ceph_mds_request *req, bool *random) { struct inode *inode; struct ceph_inode_info *ci; struct ceph_cap *cap; int mode = req->r_direct_mode; int mds = -1; u32 hash = req->r_direct_hash; bool is_hash = test_bit(CEPH_MDS_R_DIRECT_IS_HASH, &req->r_req_flags); struct ceph_client *cl = mdsc->fsc->client; if (random) *random = false; /* * is there a specific mds we should try? ignore hint if we have * no session and the mds is not up (active or recovering). */ if (req->r_resend_mds >= 0 && (__have_session(mdsc, req->r_resend_mds) || ceph_mdsmap_get_state(mdsc->mdsmap, req->r_resend_mds) > 0)) { doutc(cl, "using resend_mds mds%d\n", req->r_resend_mds); return req->r_resend_mds; } if (mode == USE_RANDOM_MDS) goto random; inode = NULL; if (req->r_inode) { if (ceph_snap(req->r_inode) != CEPH_SNAPDIR) { inode = req->r_inode; ihold(inode); } else { /* req->r_dentry is non-null for LSSNAP request */ rcu_read_lock(); inode = get_nonsnap_parent(req->r_dentry); rcu_read_unlock(); doutc(cl, "using snapdir's parent %p %llx.%llx\n", inode, ceph_vinop(inode)); } } else if (req->r_dentry) { /* ignore race with rename; old or new d_parent is okay */ struct dentry *parent; struct inode *dir; rcu_read_lock(); parent = READ_ONCE(req->r_dentry->d_parent); dir = req->r_parent ? : d_inode_rcu(parent); if (!dir || dir->i_sb != mdsc->fsc->sb) { /* not this fs or parent went negative */ inode = d_inode(req->r_dentry); if (inode) ihold(inode); } else if (ceph_snap(dir) != CEPH_NOSNAP) { /* direct snapped/virtual snapdir requests * based on parent dir inode */ inode = get_nonsnap_parent(parent); doutc(cl, "using nonsnap parent %p %llx.%llx\n", inode, ceph_vinop(inode)); } else { /* dentry target */ inode = d_inode(req->r_dentry); if (!inode || mode == USE_AUTH_MDS) { /* dir + name */ inode = igrab(dir); hash = ceph_dentry_hash(dir, req->r_dentry); is_hash = true; } else { ihold(inode); } } rcu_read_unlock(); } if (!inode) goto random; doutc(cl, "%p %llx.%llx is_hash=%d (0x%x) mode %d\n", inode, ceph_vinop(inode), (int)is_hash, hash, mode); ci = ceph_inode(inode); if (is_hash && S_ISDIR(inode->i_mode)) { struct ceph_inode_frag frag; int found; ceph_choose_frag(ci, hash, &frag, &found); if (found) { if (mode == USE_ANY_MDS && frag.ndist > 0) { u8 r; /* choose a random replica */ get_random_bytes(&r, 1); r %= frag.ndist; mds = frag.dist[r]; doutc(cl, "%p %llx.%llx frag %u mds%d (%d/%d)\n", inode, ceph_vinop(inode), frag.frag, mds, (int)r, frag.ndist); if (ceph_mdsmap_get_state(mdsc->mdsmap, mds) >= CEPH_MDS_STATE_ACTIVE && !ceph_mdsmap_is_laggy(mdsc->mdsmap, mds)) goto out; } /* since this file/dir wasn't known to be * replicated, then we want to look for the * authoritative mds. */ if (frag.mds >= 0) { /* choose auth mds */ mds = frag.mds; doutc(cl, "%p %llx.%llx frag %u mds%d (auth)\n", inode, ceph_vinop(inode), frag.frag, mds); if (ceph_mdsmap_get_state(mdsc->mdsmap, mds) >= CEPH_MDS_STATE_ACTIVE) { if (!ceph_mdsmap_is_laggy(mdsc->mdsmap, mds)) goto out; } } mode = USE_AUTH_MDS; } } spin_lock(&ci->i_ceph_lock); cap = NULL; if (mode == USE_AUTH_MDS) cap = ci->i_auth_cap; if (!cap && !RB_EMPTY_ROOT(&ci->i_caps)) cap = rb_entry(rb_first(&ci->i_caps), struct ceph_cap, ci_node); if (!cap) { spin_unlock(&ci->i_ceph_lock); iput(inode); goto random; } mds = cap->session->s_mds; doutc(cl, "%p %llx.%llx mds%d (%scap %p)\n", inode, ceph_vinop(inode), mds, cap == ci->i_auth_cap ? "auth " : "", cap); spin_unlock(&ci->i_ceph_lock); out: iput(inode); return mds; random: if (random) *random = true; mds = ceph_mdsmap_get_random_mds(mdsc->mdsmap); doutc(cl, "chose random mds%d\n", mds); return mds; } /* * session messages */ struct ceph_msg *ceph_create_session_msg(u32 op, u64 seq) { struct ceph_msg *msg; struct ceph_mds_session_head *h; msg = ceph_msg_new(CEPH_MSG_CLIENT_SESSION, sizeof(*h), GFP_NOFS, false); if (!msg) { pr_err("ENOMEM creating session %s msg\n", ceph_session_op_name(op)); return NULL; } h = msg->front.iov_base; h->op = cpu_to_le32(op); h->seq = cpu_to_le64(seq); return msg; } static const unsigned char feature_bits[] = CEPHFS_FEATURES_CLIENT_SUPPORTED; #define FEATURE_BYTES(c) (DIV_ROUND_UP((size_t)feature_bits[c - 1] + 1, 64) * 8) static int encode_supported_features(void **p, void *end) { static const size_t count = ARRAY_SIZE(feature_bits); if (count > 0) { size_t i; size_t size = FEATURE_BYTES(count); unsigned long bit; if (WARN_ON_ONCE(*p + 4 + size > end)) return -ERANGE; ceph_encode_32(p, size); memset(*p, 0, size); for (i = 0; i < count; i++) { bit = feature_bits[i]; ((unsigned char *)(*p))[bit / 8] |= BIT(bit % 8); } *p += size; } else { if (WARN_ON_ONCE(*p + 4 > end)) return -ERANGE; ceph_encode_32(p, 0); } return 0; } static const unsigned char metric_bits[] = CEPHFS_METRIC_SPEC_CLIENT_SUPPORTED; #define METRIC_BYTES(cnt) (DIV_ROUND_UP((size_t)metric_bits[cnt - 1] + 1, 64) * 8) static int encode_metric_spec(void **p, void *end) { static const size_t count = ARRAY_SIZE(metric_bits); /* header */ if (WARN_ON_ONCE(*p + 2 > end)) return -ERANGE; ceph_encode_8(p, 1); /* version */ ceph_encode_8(p, 1); /* compat */ if (count > 0) { size_t i; size_t size = METRIC_BYTES(count); if (WARN_ON_ONCE(*p + 4 + 4 + size > end)) return -ERANGE; /* metric spec info length */ ceph_encode_32(p, 4 + size); /* metric spec */ ceph_encode_32(p, size); memset(*p, 0, size); for (i = 0; i < count; i++) ((unsigned char *)(*p))[i / 8] |= BIT(metric_bits[i] % 8); *p += size; } else { if (WARN_ON_ONCE(*p + 4 + 4 > end)) return -ERANGE; /* metric spec info length */ ceph_encode_32(p, 4); /* metric spec */ ceph_encode_32(p, 0); } return 0; } /* * session message, specialization for CEPH_SESSION_REQUEST_OPEN * to include additional client metadata fields. */ static struct ceph_msg * create_session_full_msg(struct ceph_mds_client *mdsc, int op, u64 seq) { struct ceph_msg *msg; struct ceph_mds_session_head *h; int i; int extra_bytes = 0; int metadata_key_count = 0; struct ceph_options *opt = mdsc->fsc->client->options; struct ceph_mount_options *fsopt = mdsc->fsc->mount_options; struct ceph_client *cl = mdsc->fsc->client; size_t size, count; void *p, *end; int ret; const char* metadata[][2] = { {"hostname", mdsc->nodename}, {"kernel_version", init_utsname()->release}, {"entity_id", opt->name ? : ""}, {"root", fsopt->server_path ? : "/"}, {NULL, NULL} }; /* Calculate serialized length of metadata */ extra_bytes = 4; /* map length */ for (i = 0; metadata[i][0]; ++i) { extra_bytes += 8 + strlen(metadata[i][0]) + strlen(metadata[i][1]); metadata_key_count++; } /* supported feature */ size = 0; count = ARRAY_SIZE(feature_bits); if (count > 0) size = FEATURE_BYTES(count); extra_bytes += 4 + size; /* metric spec */ size = 0; count = ARRAY_SIZE(metric_bits); if (count > 0) size = METRIC_BYTES(count); extra_bytes += 2 + 4 + 4 + size; /* flags, mds auth caps and oldest_client_tid */ extra_bytes += 4 + 4 + 8; /* Allocate the message */ msg = ceph_msg_new(CEPH_MSG_CLIENT_SESSION, sizeof(*h) + extra_bytes, GFP_NOFS, false); if (!msg) { pr_err_client(cl, "ENOMEM creating session open msg\n"); return ERR_PTR(-ENOMEM); } p = msg->front.iov_base; end = p + msg->front.iov_len; h = p; h->op = cpu_to_le32(op); h->seq = cpu_to_le64(seq); /* * Serialize client metadata into waiting buffer space, using * the format that userspace expects for map<string, string> * * ClientSession messages with metadata are v7 */ msg->hdr.version = cpu_to_le16(7); msg->hdr.compat_version = cpu_to_le16(1); /* The write pointer, following the session_head structure */ p += sizeof(*h); /* Number of entries in the map */ ceph_encode_32(&p, metadata_key_count); /* Two length-prefixed strings for each entry in the map */ for (i = 0; metadata[i][0]; ++i) { size_t const key_len = strlen(metadata[i][0]); size_t const val_len = strlen(metadata[i][1]); ceph_encode_32(&p, key_len); memcpy(p, metadata[i][0], key_len); p += key_len; ceph_encode_32(&p, val_len); memcpy(p, metadata[i][1], val_len); p += val_len; } ret = encode_supported_features(&p, end); if (ret) { pr_err_client(cl, "encode_supported_features failed!\n"); ceph_msg_put(msg); return ERR_PTR(ret); } ret = encode_metric_spec(&p, end); if (ret) { pr_err_client(cl, "encode_metric_spec failed!\n"); ceph_msg_put(msg); return ERR_PTR(ret); } /* version == 5, flags */ ceph_encode_32(&p, 0); /* version == 6, mds auth caps */ ceph_encode_32(&p, 0); /* version == 7, oldest_client_tid */ ceph_encode_64(&p, mdsc->oldest_tid); msg->front.iov_len = p - msg->front.iov_base; msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); return msg; } /* * send session open request. * * called under mdsc->mutex */ static int __open_session(struct ceph_mds_client *mdsc, struct ceph_mds_session *session) { struct ceph_msg *msg; int mstate; int mds = session->s_mds; if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_FENCE_IO) return -EIO; /* wait for mds to go active? */ mstate = ceph_mdsmap_get_state(mdsc->mdsmap, mds); doutc(mdsc->fsc->client, "open_session to mds%d (%s)\n", mds, ceph_mds_state_name(mstate)); session->s_state = CEPH_MDS_SESSION_OPENING; session->s_renew_requested = jiffies; /* send connect message */ msg = create_session_full_msg(mdsc, CEPH_SESSION_REQUEST_OPEN, session->s_seq); if (IS_ERR(msg)) return PTR_ERR(msg); ceph_con_send(&session->s_con, msg); return 0; } /* * open sessions for any export targets for the given mds * * called under mdsc->mutex */ static struct ceph_mds_session * __open_export_target_session(struct ceph_mds_client *mdsc, int target) { struct ceph_mds_session *session; int ret; session = __ceph_lookup_mds_session(mdsc, target); if (!session) { session = register_session(mdsc, target); if (IS_ERR(session)) return session; } if (session->s_state == CEPH_MDS_SESSION_NEW || session->s_state == CEPH_MDS_SESSION_CLOSING) { ret = __open_session(mdsc, session); if (ret) return ERR_PTR(ret); } return session; } struct ceph_mds_session * ceph_mdsc_open_export_target_session(struct ceph_mds_client *mdsc, int target) { struct ceph_mds_session *session; struct ceph_client *cl = mdsc->fsc->client; doutc(cl, "to mds%d\n", target); mutex_lock(&mdsc->mutex); session = __open_export_target_session(mdsc, target); mutex_unlock(&mdsc->mutex); return session; } static void __open_export_target_sessions(struct ceph_mds_client *mdsc, struct ceph_mds_session *session) { struct ceph_mds_info *mi; struct ceph_mds_session *ts; int i, mds = session->s_mds; struct ceph_client *cl = mdsc->fsc->client; if (mds >= mdsc->mdsmap->possible_max_rank) return; mi = &mdsc->mdsmap->m_info[mds]; doutc(cl, "for mds%d (%d targets)\n", session->s_mds, mi->num_export_targets); for (i = 0; i < mi->num_export_targets; i++) { ts = __open_export_target_session(mdsc, mi->export_targets[i]); ceph_put_mds_session(ts); } } void ceph_mdsc_open_export_target_sessions(struct ceph_mds_client *mdsc, struct ceph_mds_session *session) { mutex_lock(&mdsc->mutex); __open_export_target_sessions(mdsc, session); mutex_unlock(&mdsc->mutex); } /* * session caps */ static void detach_cap_releases(struct ceph_mds_session *session, struct list_head *target) { struct ceph_client *cl = session->s_mdsc->fsc->client; lockdep_assert_held(&session->s_cap_lock); list_splice_init(&session->s_cap_releases, target); session->s_num_cap_releases = 0; doutc(cl, "mds%d\n", session->s_mds); } static void dispose_cap_releases(struct ceph_mds_client *mdsc, struct list_head *dispose) { while (!list_empty(dispose)) { struct ceph_cap *cap; /* zero out the in-progress message */ cap = list_first_entry(dispose, struct ceph_cap, session_caps); list_del(&cap->session_caps); ceph_put_cap(mdsc, cap); } } static void cleanup_session_requests(struct ceph_mds_client *mdsc, struct ceph_mds_session *session) { struct ceph_client *cl = mdsc->fsc->client; struct ceph_mds_request *req; struct rb_node *p; doutc(cl, "mds%d\n", session->s_mds); mutex_lock(&mdsc->mutex); while (!list_empty(&session->s_unsafe)) { req = list_first_entry(&session->s_unsafe, struct ceph_mds_request, r_unsafe_item); pr_warn_ratelimited_client(cl, " dropping unsafe request %llu\n", req->r_tid); if (req->r_target_inode) mapping_set_error(req->r_target_inode->i_mapping, -EIO); if (req->r_unsafe_dir) mapping_set_error(req->r_unsafe_dir->i_mapping, -EIO); __unregister_request(mdsc, req); } /* zero r_attempts, so kick_requests() will re-send requests */ p = rb_first(&mdsc->request_tree); while (p) { req = rb_entry(p, struct ceph_mds_request, r_node); p = rb_next(p); if (req->r_session && req->r_session->s_mds == session->s_mds) req->r_attempts = 0; } mutex_unlock(&mdsc->mutex); } /* * Helper to safely iterate over all caps associated with a session, with * special care taken to handle a racing __ceph_remove_cap(). * * Caller must hold session s_mutex. */ int ceph_iterate_session_caps(struct ceph_mds_session *session, int (*cb)(struct inode *, int mds, void *), void *arg) { struct ceph_client *cl = session->s_mdsc->fsc->client; struct list_head *p; struct ceph_cap *cap; struct inode *inode, *last_inode = NULL; struct ceph_cap *old_cap = NULL; int ret; doutc(cl, "%p mds%d\n", session, session->s_mds); spin_lock(&session->s_cap_lock); p = session->s_caps.next; while (p != &session->s_caps) { int mds; cap = list_entry(p, struct ceph_cap, session_caps); inode = igrab(&cap->ci->netfs.inode); if (!inode) { p = p->next; continue; } session->s_cap_iterator = cap; mds = cap->mds; spin_unlock(&session->s_cap_lock); if (last_inode) { iput(last_inode); last_inode = NULL; } if (old_cap) { ceph_put_cap(session->s_mdsc, old_cap); old_cap = NULL; } ret = cb(inode, mds, arg); last_inode = inode; spin_lock(&session->s_cap_lock); p = p->next; if (!cap->ci) { doutc(cl, "finishing cap %p removal\n", cap); BUG_ON(cap->session != session); cap->session = NULL; list_del_init(&cap->session_caps); session->s_nr_caps--; atomic64_dec(&session->s_mdsc->metric.total_caps); if (cap->queue_release) __ceph_queue_cap_release(session, cap); else old_cap = cap; /* put_cap it w/o locks held */ } if (ret < 0) goto out; } ret = 0; out: session->s_cap_iterator = NULL; spin_unlock(&session->s_cap_lock); iput(last_inode); if (old_cap) ceph_put_cap(session->s_mdsc, old_cap); return ret; } static int remove_session_caps_cb(struct inode *inode, int mds, void *arg) { struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_client *cl = ceph_inode_to_client(inode); bool invalidate = false; struct ceph_cap *cap; int iputs = 0; spin_lock(&ci->i_ceph_lock); cap = __get_cap_for_mds(ci, mds); if (cap) { doutc(cl, " removing cap %p, ci is %p, inode is %p\n", cap, ci, &ci->netfs.inode); iputs = ceph_purge_inode_cap(inode, cap, &invalidate); } spin_unlock(&ci->i_ceph_lock); if (cap) wake_up_all(&ci->i_cap_wq); if (invalidate) ceph_queue_invalidate(inode); while (iputs--) iput(inode); return 0; } /* * caller must hold session s_mutex */ static void remove_session_caps(struct ceph_mds_session *session) { struct ceph_fs_client *fsc = session->s_mdsc->fsc; struct super_block *sb = fsc->sb; LIST_HEAD(dispose); doutc(fsc->client, "on %p\n", session); ceph_iterate_session_caps(session, remove_session_caps_cb, fsc); wake_up_all(&fsc->mdsc->cap_flushing_wq); spin_lock(&session->s_cap_lock); if (session->s_nr_caps > 0) { struct inode *inode; struct ceph_cap *cap, *prev = NULL; struct ceph_vino vino; /* * iterate_session_caps() skips inodes that are being * deleted, we need to wait until deletions are complete. * __wait_on_freeing_inode() is designed for the job, * but it is not exported, so use lookup inode function * to access it. */ while (!list_empty(&session->s_caps)) { cap = list_entry(session->s_caps.next, struct ceph_cap, session_caps); if (cap == prev) break; prev = cap; vino = cap->ci->i_vino; spin_unlock(&session->s_cap_lock); inode = ceph_find_inode(sb, vino); iput(inode); spin_lock(&session->s_cap_lock); } } // drop cap expires and unlock s_cap_lock detach_cap_releases(session, &dispose); BUG_ON(session->s_nr_caps > 0); BUG_ON(!list_empty(&session->s_cap_flushing)); spin_unlock(&session->s_cap_lock); dispose_cap_releases(session->s_mdsc, &dispose); } enum { RECONNECT, RENEWCAPS, FORCE_RO, }; /* * wake up any threads waiting on this session's caps. if the cap is * old (didn't get renewed on the client reconnect), remove it now. * * caller must hold s_mutex. */ static int wake_up_session_cb(struct inode *inode, int mds, void *arg) { struct ceph_inode_info *ci = ceph_inode(inode); unsigned long ev = (unsigned long)arg; if (ev == RECONNECT) { spin_lock(&ci->i_ceph_lock); ci->i_wanted_max_size = 0; ci->i_requested_max_size = 0; spin_unlock(&ci->i_ceph_lock); } else if (ev == RENEWCAPS) { struct ceph_cap *cap; spin_lock(&ci->i_ceph_lock); cap = __get_cap_for_mds(ci, mds); /* mds did not re-issue stale cap */ if (cap && cap->cap_gen < atomic_read(&cap->session->s_cap_gen)) cap->issued = cap->implemented = CEPH_CAP_PIN; spin_unlock(&ci->i_ceph_lock); } else if (ev == FORCE_RO) { } wake_up_all(&ci->i_cap_wq); return 0; } static void wake_up_session_caps(struct ceph_mds_session *session, int ev) { struct ceph_client *cl = session->s_mdsc->fsc->client; doutc(cl, "session %p mds%d\n", session, session->s_mds); ceph_iterate_session_caps(session, wake_up_session_cb, (void *)(unsigned long)ev); } /* * Send periodic message to MDS renewing all currently held caps. The * ack will reset the expiration for all caps from this session. * * caller holds s_mutex */ static int send_renew_caps(struct ceph_mds_client *mdsc, struct ceph_mds_session *session) { struct ceph_client *cl = mdsc->fsc->client; struct ceph_msg *msg; int state; if (time_after_eq(jiffies, session->s_cap_ttl) && time_after_eq(session->s_cap_ttl, session->s_renew_requested)) pr_info_client(cl, "mds%d caps stale\n", session->s_mds); session->s_renew_requested = jiffies; /* do not try to renew caps until a recovering mds has reconnected * with its clients. */ state = ceph_mdsmap_get_state(mdsc->mdsmap, session->s_mds); if (state < CEPH_MDS_STATE_RECONNECT) { doutc(cl, "ignoring mds%d (%s)\n", session->s_mds, ceph_mds_state_name(state)); return 0; } doutc(cl, "to mds%d (%s)\n", session->s_mds, ceph_mds_state_name(state)); msg = create_session_full_msg(mdsc, CEPH_SESSION_REQUEST_RENEWCAPS, ++session->s_renew_seq); if (IS_ERR(msg)) return PTR_ERR(msg); ceph_con_send(&session->s_con, msg); return 0; } static int send_flushmsg_ack(struct ceph_mds_client *mdsc, struct ceph_mds_session *session, u64 seq) { struct ceph_client *cl = mdsc->fsc->client; struct ceph_msg *msg; doutc(cl, "to mds%d (%s)s seq %lld\n", session->s_mds, ceph_session_state_name(session->s_state), seq); msg = ceph_create_session_msg(CEPH_SESSION_FLUSHMSG_ACK, seq); if (!msg) return -ENOMEM; ceph_con_send(&session->s_con, msg); return 0; } /* * Note new cap ttl, and any transition from stale -> not stale (fresh?). * * Called under session->s_mutex */ static void renewed_caps(struct ceph_mds_client *mdsc, struct ceph_mds_session *session, int is_renew) { struct ceph_client *cl = mdsc->fsc->client; int was_stale; int wake = 0; spin_lock(&session->s_cap_lock); was_stale = is_renew && time_after_eq(jiffies, session->s_cap_ttl); session->s_cap_ttl = session->s_renew_requested + mdsc->mdsmap->m_session_timeout*HZ; if (was_stale) { if (time_before(jiffies, session->s_cap_ttl)) { pr_info_client(cl, "mds%d caps renewed\n", session->s_mds); wake = 1; } else { pr_info_client(cl, "mds%d caps still stale\n", session->s_mds); } } doutc(cl, "mds%d ttl now %lu, was %s, now %s\n", session->s_mds, session->s_cap_ttl, was_stale ? "stale" : "fresh", time_before(jiffies, session->s_cap_ttl) ? "stale" : "fresh"); spin_unlock(&session->s_cap_lock); if (wake) wake_up_session_caps(session, RENEWCAPS); } /* * send a session close request */ static int request_close_session(struct ceph_mds_session *session) { struct ceph_client *cl = session->s_mdsc->fsc->client; struct ceph_msg *msg; doutc(cl, "mds%d state %s seq %lld\n", session->s_mds, ceph_session_state_name(session->s_state), session->s_seq); msg = ceph_create_session_msg(CEPH_SESSION_REQUEST_CLOSE, session->s_seq); if (!msg) return -ENOMEM; ceph_con_send(&session->s_con, msg); return 1; } /* * Called with s_mutex held. */ static int __close_session(struct ceph_mds_client *mdsc, struct ceph_mds_session *session) { if (session->s_state >= CEPH_MDS_SESSION_CLOSING) return 0; session->s_state = CEPH_MDS_SESSION_CLOSING; return request_close_session(session); } static bool drop_negative_children(struct dentry *dentry) { struct dentry *child; bool all_negative = true; if (!d_is_dir(dentry)) goto out; spin_lock(&dentry->d_lock); hlist_for_each_entry(child, &dentry->d_children, d_sib) { if (d_really_is_positive(child)) { all_negative = false; break; } } spin_unlock(&dentry->d_lock); if (all_negative) shrink_dcache_parent(dentry); out: return all_negative; } /* * Trim old(er) caps. * * Because we can't cache an inode without one or more caps, we do * this indirectly: if a cap is unused, we prune its aliases, at which * point the inode will hopefully get dropped to. * * Yes, this is a bit sloppy. Our only real goal here is to respond to * memory pressure from the MDS, though, so it needn't be perfect. */ static int trim_caps_cb(struct inode *inode, int mds, void *arg) { struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb); struct ceph_client *cl = mdsc->fsc->client; int *remaining = arg; struct ceph_inode_info *ci = ceph_inode(inode); int used, wanted, oissued, mine; struct ceph_cap *cap; if (*remaining <= 0) return -1; spin_lock(&ci->i_ceph_lock); cap = __get_cap_for_mds(ci, mds); if (!cap) { spin_unlock(&ci->i_ceph_lock); return 0; } mine = cap->issued | cap->implemented; used = __ceph_caps_used(ci); wanted = __ceph_caps_file_wanted(ci); oissued = __ceph_caps_issued_other(ci, cap); doutc(cl, "%p %llx.%llx cap %p mine %s oissued %s used %s wanted %s\n", inode, ceph_vinop(inode), cap, ceph_cap_string(mine), ceph_cap_string(oissued), ceph_cap_string(used), ceph_cap_string(wanted)); if (cap == ci->i_auth_cap) { if (ci->i_dirty_caps || ci->i_flushing_caps || !list_empty(&ci->i_cap_snaps)) goto out; if ((used | wanted) & CEPH_CAP_ANY_WR) goto out; /* Note: it's possible that i_filelock_ref becomes non-zero * after dropping auth caps. It doesn't hurt because reply * of lock mds request will re-add auth caps. */ if (atomic_read(&ci->i_filelock_ref) > 0) goto out; } /* The inode has cached pages, but it's no longer used. * we can safely drop it */ if (S_ISREG(inode->i_mode) && wanted == 0 && used == CEPH_CAP_FILE_CACHE && !(oissued & CEPH_CAP_FILE_CACHE)) { used = 0; oissued = 0; } if ((used | wanted) & ~oissued & mine) goto out; /* we need these caps */ if (oissued) { /* we aren't the only cap.. just remove us */ ceph_remove_cap(mdsc, cap, true); (*remaining)--; } else { struct dentry *dentry; /* try dropping referring dentries */ spin_unlock(&ci->i_ceph_lock); dentry = d_find_any_alias(inode); if (dentry && drop_negative_children(dentry)) { int count; dput(dentry); d_prune_aliases(inode); count = atomic_read(&inode->i_count); if (count == 1) (*remaining)--; doutc(cl, "%p %llx.%llx cap %p pruned, count now %d\n", inode, ceph_vinop(inode), cap, count); } else { dput(dentry); } return 0; } out: spin_unlock(&ci->i_ceph_lock); return 0; } /* * Trim session cap count down to some max number. */ int ceph_trim_caps(struct ceph_mds_client *mdsc, struct ceph_mds_session *session, int max_caps) { struct ceph_client *cl = mdsc->fsc->client; int trim_caps = session->s_nr_caps - max_caps; doutc(cl, "mds%d start: %d / %d, trim %d\n", session->s_mds, session->s_nr_caps, max_caps, trim_caps); if (trim_caps > 0) { int remaining = trim_caps; ceph_iterate_session_caps(session, trim_caps_cb, &remaining); doutc(cl, "mds%d done: %d / %d, trimmed %d\n", session->s_mds, session->s_nr_caps, max_caps, trim_caps - remaining); } ceph_flush_cap_releases(mdsc, session); return 0; } static int check_caps_flush(struct ceph_mds_client *mdsc, u64 want_flush_tid) { struct ceph_client *cl = mdsc->fsc->client; int ret = 1; spin_lock(&mdsc->cap_dirty_lock); if (!list_empty(&mdsc->cap_flush_list)) { struct ceph_cap_flush *cf = list_first_entry(&mdsc->cap_flush_list, struct ceph_cap_flush, g_list); if (cf->tid <= want_flush_tid) { doutc(cl, "still flushing tid %llu <= %llu\n", cf->tid, want_flush_tid); ret = 0; } } spin_unlock(&mdsc->cap_dirty_lock); return ret; } /* * flush all dirty inode data to disk. * * returns true if we've flushed through want_flush_tid */ static void wait_caps_flush(struct ceph_mds_client *mdsc, u64 want_flush_tid) { struct ceph_client *cl = mdsc->fsc->client; doutc(cl, "want %llu\n", want_flush_tid); wait_event(mdsc->cap_flushing_wq, check_caps_flush(mdsc, want_flush_tid)); doutc(cl, "ok, flushed thru %llu\n", want_flush_tid); } /* * called under s_mutex */ static void ceph_send_cap_releases(struct ceph_mds_client *mdsc, struct ceph_mds_session *session) { struct ceph_client *cl = mdsc->fsc->client; struct ceph_msg *msg = NULL; struct ceph_mds_cap_release *head; struct ceph_mds_cap_item *item; struct ceph_osd_client *osdc = &mdsc->fsc->client->osdc; struct ceph_cap *cap; LIST_HEAD(tmp_list); int num_cap_releases; __le32 barrier, *cap_barrier; down_read(&osdc->lock); barrier = cpu_to_le32(osdc->epoch_barrier); up_read(&osdc->lock); spin_lock(&session->s_cap_lock); again: list_splice_init(&session->s_cap_releases, &tmp_list); num_cap_releases = session->s_num_cap_releases; session->s_num_cap_releases = 0; spin_unlock(&session->s_cap_lock); while (!list_empty(&tmp_list)) { if (!msg) { msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPRELEASE, PAGE_SIZE, GFP_NOFS, false); if (!msg) goto out_err; head = msg->front.iov_base; head->num = cpu_to_le32(0); msg->front.iov_len = sizeof(*head); msg->hdr.version = cpu_to_le16(2); msg->hdr.compat_version = cpu_to_le16(1); } cap = list_first_entry(&tmp_list, struct ceph_cap, session_caps); list_del(&cap->session_caps); num_cap_releases--; head = msg->front.iov_base; put_unaligned_le32(get_unaligned_le32(&head->num) + 1, &head->num); item = msg->front.iov_base + msg->front.iov_len; item->ino = cpu_to_le64(cap->cap_ino); item->cap_id = cpu_to_le64(cap->cap_id); item->migrate_seq = cpu_to_le32(cap->mseq); item->seq = cpu_to_le32(cap->issue_seq); msg->front.iov_len += sizeof(*item); ceph_put_cap(mdsc, cap); if (le32_to_cpu(head->num) == CEPH_CAPS_PER_RELEASE) { // Append cap_barrier field cap_barrier = msg->front.iov_base + msg->front.iov_len; *cap_barrier = barrier; msg->front.iov_len += sizeof(*cap_barrier); msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); doutc(cl, "mds%d %p\n", session->s_mds, msg); ceph_con_send(&session->s_con, msg); msg = NULL; } } BUG_ON(num_cap_releases != 0); spin_lock(&session->s_cap_lock); if (!list_empty(&session->s_cap_releases)) goto again; spin_unlock(&session->s_cap_lock); if (msg) { // Append cap_barrier field cap_barrier = msg->front.iov_base + msg->front.iov_len; *cap_barrier = barrier; msg->front.iov_len += sizeof(*cap_barrier); msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); doutc(cl, "mds%d %p\n", session->s_mds, msg); ceph_con_send(&session->s_con, msg); } return; out_err: pr_err_client(cl, "mds%d, failed to allocate message\n", session->s_mds); spin_lock(&session->s_cap_lock); list_splice(&tmp_list, &session->s_cap_releases); session->s_num_cap_releases += num_cap_releases; spin_unlock(&session->s_cap_lock); } static void ceph_cap_release_work(struct work_struct *work) { struct ceph_mds_session *session = container_of(work, struct ceph_mds_session, s_cap_release_work); mutex_lock(&session->s_mutex); if (session->s_state == CEPH_MDS_SESSION_OPEN || session->s_state == CEPH_MDS_SESSION_HUNG) ceph_send_cap_releases(session->s_mdsc, session); mutex_unlock(&session->s_mutex); ceph_put_mds_session(session); } void ceph_flush_cap_releases(struct ceph_mds_client *mdsc, struct ceph_mds_session *session) { struct ceph_client *cl = mdsc->fsc->client; if (mdsc->stopping) return; ceph_get_mds_session(session); if (queue_work(mdsc->fsc->cap_wq, &session->s_cap_release_work)) { doutc(cl, "cap release work queued\n"); } else { ceph_put_mds_session(session); doutc(cl, "failed to queue cap release work\n"); } } /* * caller holds session->s_cap_lock */ void __ceph_queue_cap_release(struct ceph_mds_session *session, struct ceph_cap *cap) { list_add_tail(&cap->session_caps, &session->s_cap_releases); session->s_num_cap_releases++; if (!(session->s_num_cap_releases % CEPH_CAPS_PER_RELEASE)) ceph_flush_cap_releases(session->s_mdsc, session); } static void ceph_cap_reclaim_work(struct work_struct *work) { struct ceph_mds_client *mdsc = container_of(work, struct ceph_mds_client, cap_reclaim_work); int ret = ceph_trim_dentries(mdsc); if (ret == -EAGAIN) ceph_queue_cap_reclaim_work(mdsc); } void ceph_queue_cap_reclaim_work(struct ceph_mds_client *mdsc) { struct ceph_client *cl = mdsc->fsc->client; if (mdsc->stopping) return; if (queue_work(mdsc->fsc->cap_wq, &mdsc->cap_reclaim_work)) { doutc(cl, "caps reclaim work queued\n"); } else { doutc(cl, "failed to queue caps release work\n"); } } void ceph_reclaim_caps_nr(struct ceph_mds_client *mdsc, int nr) { int val; if (!nr) return; val = atomic_add_return(nr, &mdsc->cap_reclaim_pending); if ((val % CEPH_CAPS_PER_RELEASE) < nr) { atomic_set(&mdsc->cap_reclaim_pending, 0); ceph_queue_cap_reclaim_work(mdsc); } } void ceph_queue_cap_unlink_work(struct ceph_mds_client *mdsc) { struct ceph_client *cl = mdsc->fsc->client; if (mdsc->stopping) return; if (queue_work(mdsc->fsc->cap_wq, &mdsc->cap_unlink_work)) { doutc(cl, "caps unlink work queued\n"); } else { doutc(cl, "failed to queue caps unlink work\n"); } } static void ceph_cap_unlink_work(struct work_struct *work) { struct ceph_mds_client *mdsc = container_of(work, struct ceph_mds_client, cap_unlink_work); struct ceph_client *cl = mdsc->fsc->client; doutc(cl, "begin\n"); spin_lock(&mdsc->cap_delay_lock); while (!list_empty(&mdsc->cap_unlink_delay_list)) { struct ceph_inode_info *ci; struct inode *inode; ci = list_first_entry(&mdsc->cap_unlink_delay_list, struct ceph_inode_info, i_cap_delay_list); list_del_init(&ci->i_cap_delay_list); inode = igrab(&ci->netfs.inode); if (inode) { spin_unlock(&mdsc->cap_delay_lock); doutc(cl, "on %p %llx.%llx\n", inode, ceph_vinop(inode)); ceph_check_caps(ci, CHECK_CAPS_FLUSH); iput(inode); spin_lock(&mdsc->cap_delay_lock); } } spin_unlock(&mdsc->cap_delay_lock); doutc(cl, "done\n"); } /* * requests */ int ceph_alloc_readdir_reply_buffer(struct ceph_mds_request *req, struct inode *dir) { struct ceph_inode_info *ci = ceph_inode(dir); struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info; struct ceph_mount_options *opt = req->r_mdsc->fsc->mount_options; size_t size = sizeof(struct ceph_mds_reply_dir_entry); unsigned int num_entries; int order; spin_lock(&ci->i_ceph_lock); num_entries = ci->i_files + ci->i_subdirs; spin_unlock(&ci->i_ceph_lock); num_entries = max(num_entries, 1U); num_entries = min(num_entries, opt->max_readdir); order = get_order(size * num_entries); while (order >= 0) { rinfo->dir_entries = (void*)__get_free_pages(GFP_KERNEL | __GFP_NOWARN | __GFP_ZERO, order); if (rinfo->dir_entries) break; order--; } if (!rinfo->dir_entries) return -ENOMEM; num_entries = (PAGE_SIZE << order) / size; num_entries = min(num_entries, opt->max_readdir); rinfo->dir_buf_size = PAGE_SIZE << order; req->r_num_caps = num_entries + 1; req->r_args.readdir.max_entries = cpu_to_le32(num_entries); req->r_args.readdir.max_bytes = cpu_to_le32(opt->max_readdir_bytes); return 0; } /* * Create an mds request. */ struct ceph_mds_request * ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode) { struct ceph_mds_request *req; req = kmem_cache_zalloc(ceph_mds_request_cachep, GFP_NOFS); if (!req) return ERR_PTR(-ENOMEM); mutex_init(&req->r_fill_mutex); req->r_mdsc = mdsc; req->r_started = jiffies; req->r_start_latency = ktime_get(); req->r_resend_mds = -1; INIT_LIST_HEAD(&req->r_unsafe_dir_item); INIT_LIST_HEAD(&req->r_unsafe_target_item); req->r_fmode = -1; req->r_feature_needed = -1; kref_init(&req->r_kref); RB_CLEAR_NODE(&req->r_node); INIT_LIST_HEAD(&req->r_wait); init_completion(&req->r_completion); init_completion(&req->r_safe_completion); INIT_LIST_HEAD(&req->r_unsafe_item); ktime_get_coarse_real_ts64(&req->r_stamp); req->r_op = op; req->r_direct_mode = mode; return req; } /* * return oldest (lowest) request, tid in request tree, 0 if none. * * called under mdsc->mutex. */ static struct ceph_mds_request *__get_oldest_req(struct ceph_mds_client *mdsc) { if (RB_EMPTY_ROOT(&mdsc->request_tree)) return NULL; return rb_entry(rb_first(&mdsc->request_tree), struct ceph_mds_request, r_node); } static inline u64 __get_oldest_tid(struct ceph_mds_client *mdsc) { return mdsc->oldest_tid; } #if IS_ENABLED(CONFIG_FS_ENCRYPTION) static u8 *get_fscrypt_altname(const struct ceph_mds_request *req, u32 *plen) { struct inode *dir = req->r_parent; struct dentry *dentry = req->r_dentry; u8 *cryptbuf = NULL; u32 len = 0; int ret = 0; /* only encode if we have parent and dentry */ if (!dir || !dentry) goto success; /* No-op unless this is encrypted */ if (!IS_ENCRYPTED(dir)) goto success; ret = ceph_fscrypt_prepare_readdir(dir); if (ret < 0) return ERR_PTR(ret); /* No key? Just ignore it. */ if (!fscrypt_has_encryption_key(dir)) goto success; if (!fscrypt_fname_encrypted_size(dir, dentry->d_name.len, NAME_MAX, &len)) { WARN_ON_ONCE(1); return ERR_PTR(-ENAMETOOLONG); } /* No need to append altname if name is short enough */ if (len <= CEPH_NOHASH_NAME_MAX) { len = 0; goto success; } cryptbuf = kmalloc(len, GFP_KERNEL); if (!cryptbuf) return ERR_PTR(-ENOMEM); ret = fscrypt_fname_encrypt(dir, &dentry->d_name, cryptbuf, len); if (ret) { kfree(cryptbuf); return ERR_PTR(ret); } success: *plen = len; return cryptbuf; } #else static u8 *get_fscrypt_altname(const struct ceph_mds_request *req, u32 *plen) { *plen = 0; return NULL; } #endif /** * ceph_mdsc_build_path - build a path string to a given dentry * @mdsc: mds client * @dentry: dentry to which path should be built * @plen: returned length of string * @pbase: returned base inode number * @for_wire: is this path going to be sent to the MDS? * * Build a string that represents the path to the dentry. This is mostly called * for two different purposes: * * 1) we need to build a path string to send to the MDS (for_wire == true) * 2) we need a path string for local presentation (e.g. debugfs) * (for_wire == false) * * The path is built in reverse, starting with the dentry. Walk back up toward * the root, building the path until the first non-snapped inode is reached * (for_wire) or the root inode is reached (!for_wire). * * Encode hidden .snap dirs as a double /, i.e. * foo/.snap/bar -> foo//bar */ char *ceph_mdsc_build_path(struct ceph_mds_client *mdsc, struct dentry *dentry, int *plen, u64 *pbase, int for_wire) { struct ceph_client *cl = mdsc->fsc->client; struct dentry *cur; struct inode *inode; char *path; int pos; unsigned seq; u64 base; if (!dentry) return ERR_PTR(-EINVAL); path = __getname(); if (!path) return ERR_PTR(-ENOMEM); retry: pos = PATH_MAX - 1; path[pos] = '\0'; seq = read_seqbegin(&rename_lock); cur = dget(dentry); for (;;) { struct dentry *parent; spin_lock(&cur->d_lock); inode = d_inode(cur); if (inode && ceph_snap(inode) == CEPH_SNAPDIR) { doutc(cl, "path+%d: %p SNAPDIR\n", pos, cur); spin_unlock(&cur->d_lock); parent = dget_parent(cur); } else if (for_wire && inode && dentry != cur && ceph_snap(inode) == CEPH_NOSNAP) { spin_unlock(&cur->d_lock); pos++; /* get rid of any prepended '/' */ break; } else if (!for_wire || !IS_ENCRYPTED(d_inode(cur->d_parent))) { pos -= cur->d_name.len; if (pos < 0) { spin_unlock(&cur->d_lock); break; } memcpy(path + pos, cur->d_name.name, cur->d_name.len); spin_unlock(&cur->d_lock); parent = dget_parent(cur); } else { int len, ret; char buf[NAME_MAX]; /* * Proactively copy name into buf, in case we need to * present it as-is. */ memcpy(buf, cur->d_name.name, cur->d_name.len); len = cur->d_name.len; spin_unlock(&cur->d_lock); parent = dget_parent(cur); ret = ceph_fscrypt_prepare_readdir(d_inode(parent)); if (ret < 0) { dput(parent); dput(cur); return ERR_PTR(ret); } if (fscrypt_has_encryption_key(d_inode(parent))) { len = ceph_encode_encrypted_fname(d_inode(parent), cur, buf); if (len < 0) { dput(parent); dput(cur); return ERR_PTR(len); } } pos -= len; if (pos < 0) { dput(parent); break; } memcpy(path + pos, buf, len); } dput(cur); cur = parent; /* Are we at the root? */ if (IS_ROOT(cur)) break; /* Are we out of buffer? */ if (--pos < 0) break; path[pos] = '/'; } inode = d_inode(cur); base = inode ? ceph_ino(inode) : 0; dput(cur); if (read_seqretry(&rename_lock, seq)) goto retry; if (pos < 0) { /* * A rename didn't occur, but somehow we didn't end up where * we thought we would. Throw a warning and try again. */ pr_warn_client(cl, "did not end path lookup where expected (pos = %d)\n", pos); goto retry; } *pbase = base; *plen = PATH_MAX - 1 - pos; doutc(cl, "on %p %d built %llx '%.*s'\n", dentry, d_count(dentry), base, *plen, path + pos); return path + pos; } static int build_dentry_path(struct ceph_mds_client *mdsc, struct dentry *dentry, struct inode *dir, const char **ppath, int *ppathlen, u64 *pino, bool *pfreepath, bool parent_locked) { char *path; rcu_read_lock(); if (!dir) dir = d_inode_rcu(dentry->d_parent); if (dir && parent_locked && ceph_snap(dir) == CEPH_NOSNAP && !IS_ENCRYPTED(dir)) { *pino = ceph_ino(dir); rcu_read_unlock(); *ppath = dentry->d_name.name; *ppathlen = dentry->d_name.len; return 0; } rcu_read_unlock(); path = ceph_mdsc_build_path(mdsc, dentry, ppathlen, pino, 1); if (IS_ERR(path)) return PTR_ERR(path); *ppath = path; *pfreepath = true; return 0; } static int build_inode_path(struct inode *inode, const char **ppath, int *ppathlen, u64 *pino, bool *pfreepath) { struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb); struct dentry *dentry; char *path; if (ceph_snap(inode) == CEPH_NOSNAP) { *pino = ceph_ino(inode); *ppathlen = 0; return 0; } dentry = d_find_alias(inode); path = ceph_mdsc_build_path(mdsc, dentry, ppathlen, pino, 1); dput(dentry); if (IS_ERR(path)) return PTR_ERR(path); *ppath = path; *pfreepath = true; return 0; } /* * request arguments may be specified via an inode *, a dentry *, or * an explicit ino+path. */ static int set_request_path_attr(struct ceph_mds_client *mdsc, struct inode *rinode, struct dentry *rdentry, struct inode *rdiri, const char *rpath, u64 rino, const char **ppath, int *pathlen, u64 *ino, bool *freepath, bool parent_locked) { struct ceph_client *cl = mdsc->fsc->client; int r = 0; if (rinode) { r = build_inode_path(rinode, ppath, pathlen, ino, freepath); doutc(cl, " inode %p %llx.%llx\n", rinode, ceph_ino(rinode), ceph_snap(rinode)); } else if (rdentry) { r = build_dentry_path(mdsc, rdentry, rdiri, ppath, pathlen, ino, freepath, parent_locked); doutc(cl, " dentry %p %llx/%.*s\n", rdentry, *ino, *pathlen, *ppath); } else if (rpath || rino) { *ino = rino; *ppath = rpath; *pathlen = rpath ? strlen(rpath) : 0; doutc(cl, " path %.*s\n", *pathlen, rpath); } return r; } static void encode_mclientrequest_tail(void **p, const struct ceph_mds_request *req) { struct ceph_timespec ts; int i; ceph_encode_timespec64(&ts, &req->r_stamp); ceph_encode_copy(p, &ts, sizeof(ts)); /* v4: gid_list */ ceph_encode_32(p, req->r_cred->group_info->ngroups); for (i = 0; i < req->r_cred->group_info->ngroups; i++) ceph_encode_64(p, from_kgid(&init_user_ns, req->r_cred->group_info->gid[i])); /* v5: altname */ ceph_encode_32(p, req->r_altname_len); ceph_encode_copy(p, req->r_altname, req->r_altname_len); /* v6: fscrypt_auth and fscrypt_file */ if (req->r_fscrypt_auth) { u32 authlen = ceph_fscrypt_auth_len(req->r_fscrypt_auth); ceph_encode_32(p, authlen); ceph_encode_copy(p, req->r_fscrypt_auth, authlen); } else { ceph_encode_32(p, 0); } if (test_bit(CEPH_MDS_R_FSCRYPT_FILE, &req->r_req_flags)) { ceph_encode_32(p, sizeof(__le64)); ceph_encode_64(p, req->r_fscrypt_file); } else { ceph_encode_32(p, 0); } } static inline u16 mds_supported_head_version(struct ceph_mds_session *session) { if (!test_bit(CEPHFS_FEATURE_32BITS_RETRY_FWD, &session->s_features)) return 1; if (!test_bit(CEPHFS_FEATURE_HAS_OWNER_UIDGID, &session->s_features)) return 2; return CEPH_MDS_REQUEST_HEAD_VERSION; } static struct ceph_mds_request_head_legacy * find_legacy_request_head(void *p, u64 features) { bool legacy = !(features & CEPH_FEATURE_FS_BTIME); struct ceph_mds_request_head_old *ohead; if (legacy) return (struct ceph_mds_request_head_legacy *)p; ohead = (struct ceph_mds_request_head_old *)p; return (struct ceph_mds_request_head_legacy *)&ohead->oldest_client_tid; } /* * called under mdsc->mutex */ static struct ceph_msg *create_request_message(struct ceph_mds_session *session, struct ceph_mds_request *req, bool drop_cap_releases) { int mds = session->s_mds; struct ceph_mds_client *mdsc = session->s_mdsc; struct ceph_client *cl = mdsc->fsc->client; struct ceph_msg *msg; struct ceph_mds_request_head_legacy *lhead; const char *path1 = NULL; const char *path2 = NULL; u64 ino1 = 0, ino2 = 0; int pathlen1 = 0, pathlen2 = 0; bool freepath1 = false, freepath2 = false; struct dentry *old_dentry = NULL; int len; u16 releases; void *p, *end; int ret; bool legacy = !(session->s_con.peer_features & CEPH_FEATURE_FS_BTIME); u16 request_head_version = mds_supported_head_version(session); kuid_t caller_fsuid = req->r_cred->fsuid; kgid_t caller_fsgid = req->r_cred->fsgid; ret = set_request_path_attr(mdsc, req->r_inode, req->r_dentry, req->r_parent, req->r_path1, req->r_ino1.ino, &path1, &pathlen1, &ino1, &freepath1, test_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags)); if (ret < 0) { msg = ERR_PTR(ret); goto out; } /* If r_old_dentry is set, then assume that its parent is locked */ if (req->r_old_dentry && !(req->r_old_dentry->d_flags & DCACHE_DISCONNECTED)) old_dentry = req->r_old_dentry; ret = set_request_path_attr(mdsc, NULL, old_dentry, req->r_old_dentry_dir, req->r_path2, req->r_ino2.ino, &path2, &pathlen2, &ino2, &freepath2, true); if (ret < 0) { msg = ERR_PTR(ret); goto out_free1; } req->r_altname = get_fscrypt_altname(req, &req->r_altname_len); if (IS_ERR(req->r_altname)) { msg = ERR_CAST(req->r_altname); req->r_altname = NULL; goto out_free2; } /* * For old cephs without supporting the 32bit retry/fwd feature * it will copy the raw memories directly when decoding the * requests. While new cephs will decode the head depending the * version member, so we need to make sure it will be compatible * with them both. */ if (legacy) len = sizeof(struct ceph_mds_request_head_legacy); else if (request_head_version == 1) len = sizeof(struct ceph_mds_request_head_old); else if (request_head_version == 2) len = offsetofend(struct ceph_mds_request_head, ext_num_fwd); else len = sizeof(struct ceph_mds_request_head); /* filepaths */ len += 2 * (1 + sizeof(u32) + sizeof(u64)); len += pathlen1 + pathlen2; /* cap releases */ len += sizeof(struct ceph_mds_request_release) * (!!req->r_inode_drop + !!req->r_dentry_drop + !!req->r_old_inode_drop + !!req->r_old_dentry_drop); if (req->r_dentry_drop) len += pathlen1; if (req->r_old_dentry_drop) len += pathlen2; /* MClientRequest tail */ /* req->r_stamp */ len += sizeof(struct ceph_timespec); /* gid list */ len += sizeof(u32) + (sizeof(u64) * req->r_cred->group_info->ngroups); /* alternate name */ len += sizeof(u32) + req->r_altname_len; /* fscrypt_auth */ len += sizeof(u32); // fscrypt_auth if (req->r_fscrypt_auth) len += ceph_fscrypt_auth_len(req->r_fscrypt_auth); /* fscrypt_file */ len += sizeof(u32); if (test_bit(CEPH_MDS_R_FSCRYPT_FILE, &req->r_req_flags)) len += sizeof(__le64); msg = ceph_msg_new2(CEPH_MSG_CLIENT_REQUEST, len, 1, GFP_NOFS, false); if (!msg) { msg = ERR_PTR(-ENOMEM); goto out_free2; } msg->hdr.tid = cpu_to_le64(req->r_tid); lhead = find_legacy_request_head(msg->front.iov_base, session->s_con.peer_features); if ((req->r_mnt_idmap != &nop_mnt_idmap) && !test_bit(CEPHFS_FEATURE_HAS_OWNER_UIDGID, &session->s_features)) { WARN_ON_ONCE(!IS_CEPH_MDS_OP_NEWINODE(req->r_op)); if (enable_unsafe_idmap) { pr_warn_once_client(cl, "idmapped mount is used and CEPHFS_FEATURE_HAS_OWNER_UIDGID" " is not supported by MDS. UID/GID-based restrictions may" " not work properly.\n"); caller_fsuid = from_vfsuid(req->r_mnt_idmap, &init_user_ns, VFSUIDT_INIT(req->r_cred->fsuid)); caller_fsgid = from_vfsgid(req->r_mnt_idmap, &init_user_ns, VFSGIDT_INIT(req->r_cred->fsgid)); } else { pr_err_ratelimited_client(cl, "idmapped mount is used and CEPHFS_FEATURE_HAS_OWNER_UIDGID" " is not supported by MDS. Fail request with -EIO.\n"); ret = -EIO; goto out_err; } } /* * The ceph_mds_request_head_legacy didn't contain a version field, and * one was added when we moved the message version from 3->4. */ if (legacy) { msg->hdr.version = cpu_to_le16(3); p = msg->front.iov_base + sizeof(*lhead); } else if (request_head_version == 1) { struct ceph_mds_request_head_old *ohead = msg->front.iov_base; msg->hdr.version = cpu_to_le16(4); ohead->version = cpu_to_le16(1); p = msg->front.iov_base + sizeof(*ohead); } else if (request_head_version == 2) { struct ceph_mds_request_head *nhead = msg->front.iov_base; msg->hdr.version = cpu_to_le16(6); nhead->version = cpu_to_le16(2); p = msg->front.iov_base + offsetofend(struct ceph_mds_request_head, ext_num_fwd); } else { struct ceph_mds_request_head *nhead = msg->front.iov_base; kuid_t owner_fsuid; kgid_t owner_fsgid; msg->hdr.version = cpu_to_le16(6); nhead->version = cpu_to_le16(CEPH_MDS_REQUEST_HEAD_VERSION); nhead->struct_len = cpu_to_le32(sizeof(struct ceph_mds_request_head)); if (IS_CEPH_MDS_OP_NEWINODE(req->r_op)) { owner_fsuid = from_vfsuid(req->r_mnt_idmap, &init_user_ns, VFSUIDT_INIT(req->r_cred->fsuid)); owner_fsgid = from_vfsgid(req->r_mnt_idmap, &init_user_ns, VFSGIDT_INIT(req->r_cred->fsgid)); nhead->owner_uid = cpu_to_le32(from_kuid(&init_user_ns, owner_fsuid)); nhead->owner_gid = cpu_to_le32(from_kgid(&init_user_ns, owner_fsgid)); } else { nhead->owner_uid = cpu_to_le32(-1); nhead->owner_gid = cpu_to_le32(-1); } p = msg->front.iov_base + sizeof(*nhead); } end = msg->front.iov_base + msg->front.iov_len; lhead->mdsmap_epoch = cpu_to_le32(mdsc->mdsmap->m_epoch); lhead->op = cpu_to_le32(req->r_op); lhead->caller_uid = cpu_to_le32(from_kuid(&init_user_ns, caller_fsuid)); lhead->caller_gid = cpu_to_le32(from_kgid(&init_user_ns, caller_fsgid)); lhead->ino = cpu_to_le64(req->r_deleg_ino); lhead->args = req->r_args; ceph_encode_filepath(&p, end, ino1, path1); ceph_encode_filepath(&p, end, ino2, path2); /* make note of release offset, in case we need to replay */ req->r_request_release_offset = p - msg->front.iov_base; /* cap releases */ releases = 0; if (req->r_inode_drop) releases += ceph_encode_inode_release(&p, req->r_inode ? req->r_inode : d_inode(req->r_dentry), mds, req->r_inode_drop, req->r_inode_unless, req->r_op == CEPH_MDS_OP_READDIR); if (req->r_dentry_drop) { ret = ceph_encode_dentry_release(&p, req->r_dentry, req->r_parent, mds, req->r_dentry_drop, req->r_dentry_unless); if (ret < 0) goto out_err; releases += ret; } if (req->r_old_dentry_drop) { ret = ceph_encode_dentry_release(&p, req->r_old_dentry, req->r_old_dentry_dir, mds, req->r_old_dentry_drop, req->r_old_dentry_unless); if (ret < 0) goto out_err; releases += ret; } if (req->r_old_inode_drop) releases += ceph_encode_inode_release(&p, d_inode(req->r_old_dentry), mds, req->r_old_inode_drop, req->r_old_inode_unless, 0); if (drop_cap_releases) { releases = 0; p = msg->front.iov_base + req->r_request_release_offset; } lhead->num_releases = cpu_to_le16(releases); encode_mclientrequest_tail(&p, req); if (WARN_ON_ONCE(p > end)) { ceph_msg_put(msg); msg = ERR_PTR(-ERANGE); goto out_free2; } msg->front.iov_len = p - msg->front.iov_base; msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); if (req->r_pagelist) { struct ceph_pagelist *pagelist = req->r_pagelist; ceph_msg_data_add_pagelist(msg, pagelist); msg->hdr.data_len = cpu_to_le32(pagelist->length); } else { msg->hdr.data_len = 0; } msg->hdr.data_off = cpu_to_le16(0); out_free2: if (freepath2) ceph_mdsc_free_path((char *)path2, pathlen2); out_free1: if (freepath1) ceph_mdsc_free_path((char *)path1, pathlen1); out: return msg; out_err: ceph_msg_put(msg); msg = ERR_PTR(ret); goto out_free2; } /* * called under mdsc->mutex if error, under no mutex if * success. */ static void complete_request(struct ceph_mds_client *mdsc, struct ceph_mds_request *req) { req->r_end_latency = ktime_get(); if (req->r_callback) req->r_callback(mdsc, req); complete_all(&req->r_completion); } /* * called under mdsc->mutex */ static int __prepare_send_request(struct ceph_mds_session *session, struct ceph_mds_request *req, bool drop_cap_releases) { int mds = session->s_mds; struct ceph_mds_client *mdsc = session->s_mdsc; struct ceph_client *cl = mdsc->fsc->client; struct ceph_mds_request_head_legacy *lhead; struct ceph_mds_request_head *nhead; struct ceph_msg *msg; int flags = 0, old_max_retry; bool old_version = !test_bit(CEPHFS_FEATURE_32BITS_RETRY_FWD, &session->s_features); /* * Avoid inifinite retrying after overflow. The client will * increase the retry count and if the MDS is old version, * so we limit to retry at most 256 times. */ if (req->r_attempts) { old_max_retry = sizeof_field(struct ceph_mds_request_head_old, num_retry); old_max_retry = 1 << (old_max_retry * BITS_PER_BYTE); if ((old_version && req->r_attempts >= old_max_retry) || ((uint32_t)req->r_attempts >= U32_MAX)) { pr_warn_ratelimited_client(cl, "request tid %llu seq overflow\n", req->r_tid); return -EMULTIHOP; } } req->r_attempts++; if (req->r_inode) { struct ceph_cap *cap = ceph_get_cap_for_mds(ceph_inode(req->r_inode), mds); if (cap) req->r_sent_on_mseq = cap->mseq; else req->r_sent_on_mseq = -1; } doutc(cl, "%p tid %lld %s (attempt %d)\n", req, req->r_tid, ceph_mds_op_name(req->r_op), req->r_attempts); if (test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) { void *p; /* * Replay. Do not regenerate message (and rebuild * paths, etc.); just use the original message. * Rebuilding paths will break for renames because * d_move mangles the src name. */ msg = req->r_request; lhead = find_legacy_request_head(msg->front.iov_base, session->s_con.peer_features); flags = le32_to_cpu(lhead->flags); flags |= CEPH_MDS_FLAG_REPLAY; lhead->flags = cpu_to_le32(flags); if (req->r_target_inode) lhead->ino = cpu_to_le64(ceph_ino(req->r_target_inode)); lhead->num_retry = req->r_attempts - 1; if (!old_version) { nhead = (struct ceph_mds_request_head*)msg->front.iov_base; nhead->ext_num_retry = cpu_to_le32(req->r_attempts - 1); } /* remove cap/dentry releases from message */ lhead->num_releases = 0; p = msg->front.iov_base + req->r_request_release_offset; encode_mclientrequest_tail(&p, req); msg->front.iov_len = p - msg->front.iov_base; msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); return 0; } if (req->r_request) { ceph_msg_put(req->r_request); req->r_request = NULL; } msg = create_request_message(session, req, drop_cap_releases); if (IS_ERR(msg)) { req->r_err = PTR_ERR(msg); return PTR_ERR(msg); } req->r_request = msg; lhead = find_legacy_request_head(msg->front.iov_base, session->s_con.peer_features); lhead->oldest_client_tid = cpu_to_le64(__get_oldest_tid(mdsc)); if (test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) flags |= CEPH_MDS_FLAG_REPLAY; if (test_bit(CEPH_MDS_R_ASYNC, &req->r_req_flags)) flags |= CEPH_MDS_FLAG_ASYNC; if (req->r_parent) flags |= CEPH_MDS_FLAG_WANT_DENTRY; lhead->flags = cpu_to_le32(flags); lhead->num_fwd = req->r_num_fwd; lhead->num_retry = req->r_attempts - 1; if (!old_version) { nhead = (struct ceph_mds_request_head*)msg->front.iov_base; nhead->ext_num_fwd = cpu_to_le32(req->r_num_fwd); nhead->ext_num_retry = cpu_to_le32(req->r_attempts - 1); } doutc(cl, " r_parent = %p\n", req->r_parent); return 0; } /* * called under mdsc->mutex */ static int __send_request(struct ceph_mds_session *session, struct ceph_mds_request *req, bool drop_cap_releases) { int err; err = __prepare_send_request(session, req, drop_cap_releases); if (!err) { ceph_msg_get(req->r_request); ceph_con_send(&session->s_con, req->r_request); } return err; } /* * send request, or put it on the appropriate wait list. */ static void __do_request(struct ceph_mds_client *mdsc, struct ceph_mds_request *req) { struct ceph_client *cl = mdsc->fsc->client; struct ceph_mds_session *session = NULL; int mds = -1; int err = 0; bool random; if (req->r_err || test_bit(CEPH_MDS_R_GOT_RESULT, &req->r_req_flags)) { if (test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags)) __unregister_request(mdsc, req); return; } if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_FENCE_IO) { doutc(cl, "metadata corrupted\n"); err = -EIO; goto finish; } if (req->r_timeout && time_after_eq(jiffies, req->r_started + req->r_timeout)) { doutc(cl, "timed out\n"); err = -ETIMEDOUT; goto finish; } if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) { doutc(cl, "forced umount\n"); err = -EIO; goto finish; } if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_MOUNTING) { if (mdsc->mdsmap_err) { err = mdsc->mdsmap_err; doutc(cl, "mdsmap err %d\n", err); goto finish; } if (mdsc->mdsmap->m_epoch == 0) { doutc(cl, "no mdsmap, waiting for map\n"); list_add(&req->r_wait, &mdsc->waiting_for_map); return; } if (!(mdsc->fsc->mount_options->flags & CEPH_MOUNT_OPT_MOUNTWAIT) && !ceph_mdsmap_is_cluster_available(mdsc->mdsmap)) { err = -EHOSTUNREACH; goto finish; } } put_request_session(req); mds = __choose_mds(mdsc, req, &random); if (mds < 0 || ceph_mdsmap_get_state(mdsc->mdsmap, mds) < CEPH_MDS_STATE_ACTIVE) { if (test_bit(CEPH_MDS_R_ASYNC, &req->r_req_flags)) { err = -EJUKEBOX; goto finish; } doutc(cl, "no mds or not active, waiting for map\n"); list_add(&req->r_wait, &mdsc->waiting_for_map); return; } /* get, open session */ session = __ceph_lookup_mds_session(mdsc, mds); if (!session) { session = register_session(mdsc, mds); if (IS_ERR(session)) { err = PTR_ERR(session); goto finish; } } req->r_session = ceph_get_mds_session(session); doutc(cl, "mds%d session %p state %s\n", mds, session, ceph_session_state_name(session->s_state)); /* * The old ceph will crash the MDSs when see unknown OPs */ if (req->r_feature_needed > 0 && !test_bit(req->r_feature_needed, &session->s_features)) { err = -EOPNOTSUPP; goto out_session; } if (session->s_state != CEPH_MDS_SESSION_OPEN && session->s_state != CEPH_MDS_SESSION_HUNG) { /* * We cannot queue async requests since the caps and delegated * inodes are bound to the session. Just return -EJUKEBOX and * let the caller retry a sync request in that case. */ if (test_bit(CEPH_MDS_R_ASYNC, &req->r_req_flags)) { err = -EJUKEBOX; goto out_session; } /* * If the session has been REJECTED, then return a hard error, * unless it's a CLEANRECOVER mount, in which case we'll queue * it to the mdsc queue. */ if (session->s_state == CEPH_MDS_SESSION_REJECTED) { if (ceph_test_mount_opt(mdsc->fsc, CLEANRECOVER)) list_add(&req->r_wait, &mdsc->waiting_for_map); else err = -EACCES; goto out_session; } if (session->s_state == CEPH_MDS_SESSION_NEW || session->s_state == CEPH_MDS_SESSION_CLOSING) { err = __open_session(mdsc, session); if (err) goto out_session; /* retry the same mds later */ if (random) req->r_resend_mds = mds; } list_add(&req->r_wait, &session->s_waiting); goto out_session; } /* send request */ req->r_resend_mds = -1; /* forget any previous mds hint */ if (req->r_request_started == 0) /* note request start time */ req->r_request_started = jiffies; /* * For async create we will choose the auth MDS of frag in parent * directory to send the request and ususally this works fine, but * if the migrated the dirtory to another MDS before it could handle * it the request will be forwarded. * * And then the auth cap will be changed. */ if (test_bit(CEPH_MDS_R_ASYNC, &req->r_req_flags) && req->r_num_fwd) { struct ceph_dentry_info *di = ceph_dentry(req->r_dentry); struct ceph_inode_info *ci; struct ceph_cap *cap; /* * The request maybe handled very fast and the new inode * hasn't been linked to the dentry yet. We need to wait * for the ceph_finish_async_create(), which shouldn't be * stuck too long or fail in thoery, to finish when forwarding * the request. */ if (!d_inode(req->r_dentry)) { err = wait_on_bit(&di->flags, CEPH_DENTRY_ASYNC_CREATE_BIT, TASK_KILLABLE); if (err) { mutex_lock(&req->r_fill_mutex); set_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags); mutex_unlock(&req->r_fill_mutex); goto out_session; } } ci = ceph_inode(d_inode(req->r_dentry)); spin_lock(&ci->i_ceph_lock); cap = ci->i_auth_cap; if (ci->i_ceph_flags & CEPH_I_ASYNC_CREATE && mds != cap->mds) { doutc(cl, "session changed for auth cap %d -> %d\n", cap->session->s_mds, session->s_mds); /* Remove the auth cap from old session */ spin_lock(&cap->session->s_cap_lock); cap->session->s_nr_caps--; list_del_init(&cap->session_caps); spin_unlock(&cap->session->s_cap_lock); /* Add the auth cap to the new session */ cap->mds = mds; cap->session = session; spin_lock(&session->s_cap_lock); session->s_nr_caps++; list_add_tail(&cap->session_caps, &session->s_caps); spin_unlock(&session->s_cap_lock); change_auth_cap_ses(ci, session); } spin_unlock(&ci->i_ceph_lock); } err = __send_request(session, req, false); out_session: ceph_put_mds_session(session); finish: if (err) { doutc(cl, "early error %d\n", err); req->r_err = err; complete_request(mdsc, req); __unregister_request(mdsc, req); } return; } /* * called under mdsc->mutex */ static void __wake_requests(struct ceph_mds_client *mdsc, struct list_head *head) { struct ceph_client *cl = mdsc->fsc->client; struct ceph_mds_request *req; LIST_HEAD(tmp_list); list_splice_init(head, &tmp_list); while (!list_empty(&tmp_list)) { req = list_entry(tmp_list.next, struct ceph_mds_request, r_wait); list_del_init(&req->r_wait); doutc(cl, " wake request %p tid %llu\n", req, req->r_tid); __do_request(mdsc, req); } } /* * Wake up threads with requests pending for @mds, so that they can * resubmit their requests to a possibly different mds. */ static void kick_requests(struct ceph_mds_client *mdsc, int mds) { struct ceph_client *cl = mdsc->fsc->client; struct ceph_mds_request *req; struct rb_node *p = rb_first(&mdsc->request_tree); doutc(cl, "kick_requests mds%d\n", mds); while (p) { req = rb_entry(p, struct ceph_mds_request, r_node); p = rb_next(p); if (test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) continue; if (req->r_attempts > 0) continue; /* only new requests */ if (req->r_session && req->r_session->s_mds == mds) { doutc(cl, " kicking tid %llu\n", req->r_tid); list_del_init(&req->r_wait); __do_request(mdsc, req); } } } int ceph_mdsc_submit_request(struct ceph_mds_client *mdsc, struct inode *dir, struct ceph_mds_request *req) { struct ceph_client *cl = mdsc->fsc->client; int err = 0; /* take CAP_PIN refs for r_inode, r_parent, r_old_dentry */ if (req->r_inode) ceph_get_cap_refs(ceph_inode(req->r_inode), CEPH_CAP_PIN); if (req->r_parent) { struct ceph_inode_info *ci = ceph_inode(req->r_parent); int fmode = (req->r_op & CEPH_MDS_OP_WRITE) ? CEPH_FILE_MODE_WR : CEPH_FILE_MODE_RD; spin_lock(&ci->i_ceph_lock); ceph_take_cap_refs(ci, CEPH_CAP_PIN, false); __ceph_touch_fmode(ci, mdsc, fmode); spin_unlock(&ci->i_ceph_lock); } if (req->r_old_dentry_dir) ceph_get_cap_refs(ceph_inode(req->r_old_dentry_dir), CEPH_CAP_PIN); if (req->r_inode) { err = ceph_wait_on_async_create(req->r_inode); if (err) { doutc(cl, "wait for async create returned: %d\n", err); return err; } } if (!err && req->r_old_inode) { err = ceph_wait_on_async_create(req->r_old_inode); if (err) { doutc(cl, "wait for async create returned: %d\n", err); return err; } } doutc(cl, "submit_request on %p for inode %p\n", req, dir); mutex_lock(&mdsc->mutex); __register_request(mdsc, req, dir); __do_request(mdsc, req); err = req->r_err; mutex_unlock(&mdsc->mutex); return err; } int ceph_mdsc_wait_request(struct ceph_mds_client *mdsc, struct ceph_mds_request *req, ceph_mds_request_wait_callback_t wait_func) { struct ceph_client *cl = mdsc->fsc->client; int err; /* wait */ doutc(cl, "do_request waiting\n"); if (wait_func) { err = wait_func(mdsc, req); } else { long timeleft = wait_for_completion_killable_timeout( &req->r_completion, ceph_timeout_jiffies(req->r_timeout)); if (timeleft > 0) err = 0; else if (!timeleft) err = -ETIMEDOUT; /* timed out */ else err = timeleft; /* killed */ } doutc(cl, "do_request waited, got %d\n", err); mutex_lock(&mdsc->mutex); /* only abort if we didn't race with a real reply */ if (test_bit(CEPH_MDS_R_GOT_RESULT, &req->r_req_flags)) { err = le32_to_cpu(req->r_reply_info.head->result); } else if (err < 0) { doutc(cl, "aborted request %lld with %d\n", req->r_tid, err); /* * ensure we aren't running concurrently with * ceph_fill_trace or ceph_readdir_prepopulate, which * rely on locks (dir mutex) held by our caller. */ mutex_lock(&req->r_fill_mutex); req->r_err = err; set_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags); mutex_unlock(&req->r_fill_mutex); if (req->r_parent && (req->r_op & CEPH_MDS_OP_WRITE)) ceph_invalidate_dir_request(req); } else { err = req->r_err; } mutex_unlock(&mdsc->mutex); return err; } /* * Synchrously perform an mds request. Take care of all of the * session setup, forwarding, retry details. */ int ceph_mdsc_do_request(struct ceph_mds_client *mdsc, struct inode *dir, struct ceph_mds_request *req) { struct ceph_client *cl = mdsc->fsc->client; int err; doutc(cl, "do_request on %p\n", req); /* issue */ err = ceph_mdsc_submit_request(mdsc, dir, req); if (!err) err = ceph_mdsc_wait_request(mdsc, req, NULL); doutc(cl, "do_request %p done, result %d\n", req, err); return err; } /* * Invalidate dir's completeness, dentry lease state on an aborted MDS * namespace request. */ void ceph_invalidate_dir_request(struct ceph_mds_request *req) { struct inode *dir = req->r_parent; struct inode *old_dir = req->r_old_dentry_dir; struct ceph_client *cl = req->r_mdsc->fsc->client; doutc(cl, "invalidate_dir_request %p %p (complete, lease(s))\n", dir, old_dir); ceph_dir_clear_complete(dir); if (old_dir) ceph_dir_clear_complete(old_dir); if (req->r_dentry) ceph_invalidate_dentry_lease(req->r_dentry); if (req->r_old_dentry) ceph_invalidate_dentry_lease(req->r_old_dentry); } /* * Handle mds reply. * * We take the session mutex and parse and process the reply immediately. * This preserves the logical ordering of replies, capabilities, etc., sent * by the MDS as they are applied to our local cache. */ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) { struct ceph_mds_client *mdsc = session->s_mdsc; struct ceph_client *cl = mdsc->fsc->client; struct ceph_mds_request *req; struct ceph_mds_reply_head *head = msg->front.iov_base; struct ceph_mds_reply_info_parsed *rinfo; /* parsed reply info */ struct ceph_snap_realm *realm; u64 tid; int err, result; int mds = session->s_mds; bool close_sessions = false; if (msg->front.iov_len < sizeof(*head)) { pr_err_client(cl, "got corrupt (short) reply\n"); ceph_msg_dump(msg); return; } /* get request, session */ tid = le64_to_cpu(msg->hdr.tid); mutex_lock(&mdsc->mutex); req = lookup_get_request(mdsc, tid); if (!req) { doutc(cl, "on unknown tid %llu\n", tid); mutex_unlock(&mdsc->mutex); return; } doutc(cl, "handle_reply %p\n", req); /* correct session? */ if (req->r_session != session) { pr_err_client(cl, "got %llu on session mds%d not mds%d\n", tid, session->s_mds, req->r_session ? req->r_session->s_mds : -1); mutex_unlock(&mdsc->mutex); goto out; } /* dup? */ if ((test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags) && !head->safe) || (test_bit(CEPH_MDS_R_GOT_SAFE, &req->r_req_flags) && head->safe)) { pr_warn_client(cl, "got a dup %s reply on %llu from mds%d\n", head->safe ? "safe" : "unsafe", tid, mds); mutex_unlock(&mdsc->mutex); goto out; } if (test_bit(CEPH_MDS_R_GOT_SAFE, &req->r_req_flags)) { pr_warn_client(cl, "got unsafe after safe on %llu from mds%d\n", tid, mds); mutex_unlock(&mdsc->mutex); goto out; } result = le32_to_cpu(head->result); if (head->safe) { set_bit(CEPH_MDS_R_GOT_SAFE, &req->r_req_flags); __unregister_request(mdsc, req); /* last request during umount? */ if (mdsc->stopping && !__get_oldest_req(mdsc)) complete_all(&mdsc->safe_umount_waiters); if (test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) { /* * We already handled the unsafe response, now do the * cleanup. No need to examine the response; the MDS * doesn't include any result info in the safe * response. And even if it did, there is nothing * useful we could do with a revised return value. */ doutc(cl, "got safe reply %llu, mds%d\n", tid, mds); mutex_unlock(&mdsc->mutex); goto out; } } else { set_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags); list_add_tail(&req->r_unsafe_item, &req->r_session->s_unsafe); } doutc(cl, "tid %lld result %d\n", tid, result); if (test_bit(CEPHFS_FEATURE_REPLY_ENCODING, &session->s_features)) err = parse_reply_info(session, msg, req, (u64)-1); else err = parse_reply_info(session, msg, req, session->s_con.peer_features); mutex_unlock(&mdsc->mutex); /* Must find target inode outside of mutexes to avoid deadlocks */ rinfo = &req->r_reply_info; if ((err >= 0) && rinfo->head->is_target) { struct inode *in = xchg(&req->r_new_inode, NULL); struct ceph_vino tvino = { .ino = le64_to_cpu(rinfo->targeti.in->ino), .snap = le64_to_cpu(rinfo->targeti.in->snapid) }; /* * If we ended up opening an existing inode, discard * r_new_inode */ if (req->r_op == CEPH_MDS_OP_CREATE && !req->r_reply_info.has_create_ino) { /* This should never happen on an async create */ WARN_ON_ONCE(req->r_deleg_ino); iput(in); in = NULL; } in = ceph_get_inode(mdsc->fsc->sb, tvino, in); if (IS_ERR(in)) { err = PTR_ERR(in); mutex_lock(&session->s_mutex); goto out_err; } req->r_target_inode = in; } mutex_lock(&session->s_mutex); if (err < 0) { pr_err_client(cl, "got corrupt reply mds%d(tid:%lld)\n", mds, tid); ceph_msg_dump(msg); goto out_err; } /* snap trace */ realm = NULL; if (rinfo->snapblob_len) { down_write(&mdsc->snap_rwsem); err = ceph_update_snap_trace(mdsc, rinfo->snapblob, rinfo->snapblob + rinfo->snapblob_len, le32_to_cpu(head->op) == CEPH_MDS_OP_RMSNAP, &realm); if (err) { up_write(&mdsc->snap_rwsem); close_sessions = true; if (err == -EIO) ceph_msg_dump(msg); goto out_err; } downgrade_write(&mdsc->snap_rwsem); } else { down_read(&mdsc->snap_rwsem); } /* insert trace into our cache */ mutex_lock(&req->r_fill_mutex); current->journal_info = req; err = ceph_fill_trace(mdsc->fsc->sb, req); if (err == 0) { if (result == 0 && (req->r_op == CEPH_MDS_OP_READDIR || req->r_op == CEPH_MDS_OP_LSSNAP)) err = ceph_readdir_prepopulate(req, req->r_session); } current->journal_info = NULL; mutex_unlock(&req->r_fill_mutex); up_read(&mdsc->snap_rwsem); if (realm) ceph_put_snap_realm(mdsc, realm); if (err == 0) { if (req->r_target_inode && test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) { struct ceph_inode_info *ci = ceph_inode(req->r_target_inode); spin_lock(&ci->i_unsafe_lock); list_add_tail(&req->r_unsafe_target_item, &ci->i_unsafe_iops); spin_unlock(&ci->i_unsafe_lock); } ceph_unreserve_caps(mdsc, &req->r_caps_reservation); } out_err: mutex_lock(&mdsc->mutex); if (!test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags)) { if (err) { req->r_err = err; } else { req->r_reply = ceph_msg_get(msg); set_bit(CEPH_MDS_R_GOT_RESULT, &req->r_req_flags); } } else { doutc(cl, "reply arrived after request %lld was aborted\n", tid); } mutex_unlock(&mdsc->mutex); mutex_unlock(&session->s_mutex); /* kick calling process */ complete_request(mdsc, req); ceph_update_metadata_metrics(&mdsc->metric, req->r_start_latency, req->r_end_latency, err); out: ceph_mdsc_put_request(req); /* Defer closing the sessions after s_mutex lock being released */ if (close_sessions) ceph_mdsc_close_sessions(mdsc); return; } /* * handle mds notification that our request has been forwarded. */ static void handle_forward(struct ceph_mds_client *mdsc, struct ceph_mds_session *session, struct ceph_msg *msg) { struct ceph_client *cl = mdsc->fsc->client; struct ceph_mds_request *req; u64 tid = le64_to_cpu(msg->hdr.tid); u32 next_mds; u32 fwd_seq; int err = -EINVAL; void *p = msg->front.iov_base; void *end = p + msg->front.iov_len; bool aborted = false; ceph_decode_need(&p, end, 2*sizeof(u32), bad); next_mds = ceph_decode_32(&p); fwd_seq = ceph_decode_32(&p); mutex_lock(&mdsc->mutex); req = lookup_get_request(mdsc, tid); if (!req) { mutex_unlock(&mdsc->mutex); doutc(cl, "forward tid %llu to mds%d - req dne\n", tid, next_mds); return; /* dup reply? */ } if (test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags)) { doutc(cl, "forward tid %llu aborted, unregistering\n", tid); __unregister_request(mdsc, req); } else if (fwd_seq <= req->r_num_fwd || (uint32_t)fwd_seq >= U32_MAX) { /* * Avoid inifinite retrying after overflow. * * The MDS will increase the fwd count and in client side * if the num_fwd is less than the one saved in request * that means the MDS is an old version and overflowed of * 8 bits. */ mutex_lock(&req->r_fill_mutex); req->r_err = -EMULTIHOP; set_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags); mutex_unlock(&req->r_fill_mutex); aborted = true; pr_warn_ratelimited_client(cl, "forward tid %llu seq overflow\n", tid); } else { /* resend. forward race not possible; mds would drop */ doutc(cl, "forward tid %llu to mds%d (we resend)\n", tid, next_mds); BUG_ON(req->r_err); BUG_ON(test_bit(CEPH_MDS_R_GOT_RESULT, &req->r_req_flags)); req->r_attempts = 0; req->r_num_fwd = fwd_seq; req->r_resend_mds = next_mds; put_request_session(req); __do_request(mdsc, req); } mutex_unlock(&mdsc->mutex); /* kick calling process */ if (aborted) complete_request(mdsc, req); ceph_mdsc_put_request(req); return; bad: pr_err_client(cl, "decode error err=%d\n", err); ceph_msg_dump(msg); } static int __decode_session_metadata(void **p, void *end, bool *blocklisted) { /* map<string,string> */ u32 n; bool err_str; ceph_decode_32_safe(p, end, n, bad); while (n-- > 0) { u32 len; ceph_decode_32_safe(p, end, len, bad); ceph_decode_need(p, end, len, bad); err_str = !strncmp(*p, "error_string", len); *p += len; ceph_decode_32_safe(p, end, len, bad); ceph_decode_need(p, end, len, bad); /* * Match "blocklisted (blacklisted)" from newer MDSes, * or "blacklisted" from older MDSes. */ if (err_str && strnstr(*p, "blacklisted", len)) *blocklisted = true; *p += len; } return 0; bad: return -1; } /* * handle a mds session control message */ static void handle_session(struct ceph_mds_session *session, struct ceph_msg *msg) { struct ceph_mds_client *mdsc = session->s_mdsc; struct ceph_client *cl = mdsc->fsc->client; int mds = session->s_mds; int msg_version = le16_to_cpu(msg->hdr.version); void *p = msg->front.iov_base; void *end = p + msg->front.iov_len; struct ceph_mds_session_head *h; struct ceph_mds_cap_auth *cap_auths = NULL; u32 op, cap_auths_num = 0; u64 seq, features = 0; int wake = 0; bool blocklisted = false; u32 i; /* decode */ ceph_decode_need(&p, end, sizeof(*h), bad); h = p; p += sizeof(*h); op = le32_to_cpu(h->op); seq = le64_to_cpu(h->seq); if (msg_version >= 3) { u32 len; /* version >= 2 and < 5, decode metadata, skip otherwise * as it's handled via flags. */ if (msg_version >= 5) ceph_decode_skip_map(&p, end, string, string, bad); else if (__decode_session_metadata(&p, end, &blocklisted) < 0) goto bad; /* version >= 3, feature bits */ ceph_decode_32_safe(&p, end, len, bad); if (len) { ceph_decode_64_safe(&p, end, features, bad); p += len - sizeof(features); } } if (msg_version >= 5) { u32 flags, len; /* version >= 4 */ ceph_decode_skip_16(&p, end, bad); /* struct_v, struct_cv */ ceph_decode_32_safe(&p, end, len, bad); /* len */ ceph_decode_skip_n(&p, end, len, bad); /* metric_spec */ /* version >= 5, flags */ ceph_decode_32_safe(&p, end, flags, bad); if (flags & CEPH_SESSION_BLOCKLISTED) { pr_warn_client(cl, "mds%d session blocklisted\n", session->s_mds); blocklisted = true; } } if (msg_version >= 6) { ceph_decode_32_safe(&p, end, cap_auths_num, bad); doutc(cl, "cap_auths_num %d\n", cap_auths_num); if (cap_auths_num && op != CEPH_SESSION_OPEN) { WARN_ON_ONCE(op != CEPH_SESSION_OPEN); goto skip_cap_auths; } cap_auths = kcalloc(cap_auths_num, sizeof(struct ceph_mds_cap_auth), GFP_KERNEL); if (!cap_auths) { pr_err_client(cl, "No memory for cap_auths\n"); return; } for (i = 0; i < cap_auths_num; i++) { u32 _len, j; /* struct_v, struct_compat, and struct_len in MDSCapAuth */ ceph_decode_skip_n(&p, end, 2 + sizeof(u32), bad); /* struct_v, struct_compat, and struct_len in MDSCapMatch */ ceph_decode_skip_n(&p, end, 2 + sizeof(u32), bad); ceph_decode_64_safe(&p, end, cap_auths[i].match.uid, bad); ceph_decode_32_safe(&p, end, _len, bad); if (_len) { cap_auths[i].match.gids = kcalloc(_len, sizeof(u32), GFP_KERNEL); if (!cap_auths[i].match.gids) { pr_err_client(cl, "No memory for gids\n"); goto fail; } cap_auths[i].match.num_gids = _len; for (j = 0; j < _len; j++) ceph_decode_32_safe(&p, end, cap_auths[i].match.gids[j], bad); } ceph_decode_32_safe(&p, end, _len, bad); if (_len) { cap_auths[i].match.path = kcalloc(_len + 1, sizeof(char), GFP_KERNEL); if (!cap_auths[i].match.path) { pr_err_client(cl, "No memory for path\n"); goto fail; } ceph_decode_copy(&p, cap_auths[i].match.path, _len); /* Remove the tailing '/' */ while (_len && cap_auths[i].match.path[_len - 1] == '/') { cap_auths[i].match.path[_len - 1] = '\0'; _len -= 1; } } ceph_decode_32_safe(&p, end, _len, bad); if (_len) { cap_auths[i].match.fs_name = kcalloc(_len + 1, sizeof(char), GFP_KERNEL); if (!cap_auths[i].match.fs_name) { pr_err_client(cl, "No memory for fs_name\n"); goto fail; } ceph_decode_copy(&p, cap_auths[i].match.fs_name, _len); } ceph_decode_8_safe(&p, end, cap_auths[i].match.root_squash, bad); ceph_decode_8_safe(&p, end, cap_auths[i].readable, bad); ceph_decode_8_safe(&p, end, cap_auths[i].writeable, bad); doutc(cl, "uid %lld, num_gids %u, path %s, fs_name %s, root_squash %d, readable %d, writeable %d\n", cap_auths[i].match.uid, cap_auths[i].match.num_gids, cap_auths[i].match.path, cap_auths[i].match.fs_name, cap_auths[i].match.root_squash, cap_auths[i].readable, cap_auths[i].writeable); } } skip_cap_auths: mutex_lock(&mdsc->mutex); if (op == CEPH_SESSION_OPEN) { if (mdsc->s_cap_auths) { for (i = 0; i < mdsc->s_cap_auths_num; i++) { kfree(mdsc->s_cap_auths[i].match.gids); kfree(mdsc->s_cap_auths[i].match.path); kfree(mdsc->s_cap_auths[i].match.fs_name); } kfree(mdsc->s_cap_auths); } mdsc->s_cap_auths_num = cap_auths_num; mdsc->s_cap_auths = cap_auths; } if (op == CEPH_SESSION_CLOSE) { ceph_get_mds_session(session); __unregister_session(mdsc, session); } /* FIXME: this ttl calculation is generous */ session->s_ttl = jiffies + HZ*mdsc->mdsmap->m_session_autoclose; mutex_unlock(&mdsc->mutex); mutex_lock(&session->s_mutex); doutc(cl, "mds%d %s %p state %s seq %llu\n", mds, ceph_session_op_name(op), session, ceph_session_state_name(session->s_state), seq); if (session->s_state == CEPH_MDS_SESSION_HUNG) { session->s_state = CEPH_MDS_SESSION_OPEN; pr_info_client(cl, "mds%d came back\n", session->s_mds); } switch (op) { case CEPH_SESSION_OPEN: if (session->s_state == CEPH_MDS_SESSION_RECONNECTING) pr_info_client(cl, "mds%d reconnect success\n", session->s_mds); session->s_features = features; if (session->s_state == CEPH_MDS_SESSION_OPEN) { pr_notice_client(cl, "mds%d is already opened\n", session->s_mds); } else { session->s_state = CEPH_MDS_SESSION_OPEN; renewed_caps(mdsc, session, 0); if (test_bit(CEPHFS_FEATURE_METRIC_COLLECT, &session->s_features)) metric_schedule_delayed(&mdsc->metric); } /* * The connection maybe broken and the session in client * side has been reinitialized, need to update the seq * anyway. */ if (!session->s_seq && seq) session->s_seq = seq; wake = 1; if (mdsc->stopping) __close_session(mdsc, session); break; case CEPH_SESSION_RENEWCAPS: if (session->s_renew_seq == seq) renewed_caps(mdsc, session, 1); break; case CEPH_SESSION_CLOSE: if (session->s_state == CEPH_MDS_SESSION_RECONNECTING) pr_info_client(cl, "mds%d reconnect denied\n", session->s_mds); session->s_state = CEPH_MDS_SESSION_CLOSED; cleanup_session_requests(mdsc, session); remove_session_caps(session); wake = 2; /* for good measure */ wake_up_all(&mdsc->session_close_wq); break; case CEPH_SESSION_STALE: pr_info_client(cl, "mds%d caps went stale, renewing\n", session->s_mds); atomic_inc(&session->s_cap_gen); session->s_cap_ttl = jiffies - 1; send_renew_caps(mdsc, session); break; case CEPH_SESSION_RECALL_STATE: ceph_trim_caps(mdsc, session, le32_to_cpu(h->max_caps)); break; case CEPH_SESSION_FLUSHMSG: /* flush cap releases */ spin_lock(&session->s_cap_lock); if (session->s_num_cap_releases) ceph_flush_cap_releases(mdsc, session); spin_unlock(&session->s_cap_lock); send_flushmsg_ack(mdsc, session, seq); break; case CEPH_SESSION_FORCE_RO: doutc(cl, "force_session_readonly %p\n", session); spin_lock(&session->s_cap_lock); session->s_readonly = true; spin_unlock(&session->s_cap_lock); wake_up_session_caps(session, FORCE_RO); break; case CEPH_SESSION_REJECT: WARN_ON(session->s_state != CEPH_MDS_SESSION_OPENING); pr_info_client(cl, "mds%d rejected session\n", session->s_mds); session->s_state = CEPH_MDS_SESSION_REJECTED; cleanup_session_requests(mdsc, session); remove_session_caps(session); if (blocklisted) mdsc->fsc->blocklisted = true; wake = 2; /* for good measure */ break; default: pr_err_client(cl, "bad op %d mds%d\n", op, mds); WARN_ON(1); } mutex_unlock(&session->s_mutex); if (wake) { mutex_lock(&mdsc->mutex); __wake_requests(mdsc, &session->s_waiting); if (wake == 2) kick_requests(mdsc, mds); mutex_unlock(&mdsc->mutex); } if (op == CEPH_SESSION_CLOSE) ceph_put_mds_session(session); return; bad: pr_err_client(cl, "corrupt message mds%d len %d\n", mds, (int)msg->front.iov_len); ceph_msg_dump(msg); fail: for (i = 0; i < cap_auths_num; i++) { kfree(cap_auths[i].match.gids); kfree(cap_auths[i].match.path); kfree(cap_auths[i].match.fs_name); } kfree(cap_auths); return; } void ceph_mdsc_release_dir_caps(struct ceph_mds_request *req) { struct ceph_client *cl = req->r_mdsc->fsc->client; int dcaps; dcaps = xchg(&req->r_dir_caps, 0); if (dcaps) { doutc(cl, "releasing r_dir_caps=%s\n", ceph_cap_string(dcaps)); ceph_put_cap_refs(ceph_inode(req->r_parent), dcaps); } } void ceph_mdsc_release_dir_caps_async(struct ceph_mds_request *req) { struct ceph_client *cl = req->r_mdsc->fsc->client; int dcaps; dcaps = xchg(&req->r_dir_caps, 0); if (dcaps) { doutc(cl, "releasing r_dir_caps=%s\n", ceph_cap_string(dcaps)); ceph_put_cap_refs_async(ceph_inode(req->r_parent), dcaps); } } /* * called under session->mutex. */ static void replay_unsafe_requests(struct ceph_mds_client *mdsc, struct ceph_mds_session *session) { struct ceph_mds_request *req, *nreq; struct rb_node *p; doutc(mdsc->fsc->client, "mds%d\n", session->s_mds); mutex_lock(&mdsc->mutex); list_for_each_entry_safe(req, nreq, &session->s_unsafe, r_unsafe_item) __send_request(session, req, true); /* * also re-send old requests when MDS enters reconnect stage. So that MDS * can process completed request in clientreplay stage. */ p = rb_first(&mdsc->request_tree); while (p) { req = rb_entry(p, struct ceph_mds_request, r_node); p = rb_next(p); if (test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) continue; if (req->r_attempts == 0) continue; /* only old requests */ if (!req->r_session) continue; if (req->r_session->s_mds != session->s_mds) continue; ceph_mdsc_release_dir_caps_async(req); __send_request(session, req, true); } mutex_unlock(&mdsc->mutex); } static int send_reconnect_partial(struct ceph_reconnect_state *recon_state) { struct ceph_msg *reply; struct ceph_pagelist *_pagelist; struct page *page; __le32 *addr; int err = -ENOMEM; if (!recon_state->allow_multi) return -ENOSPC; /* can't handle message that contains both caps and realm */ BUG_ON(!recon_state->nr_caps == !recon_state->nr_realms); /* pre-allocate new pagelist */ _pagelist = ceph_pagelist_alloc(GFP_NOFS); if (!_pagelist) return -ENOMEM; reply = ceph_msg_new2(CEPH_MSG_CLIENT_RECONNECT, 0, 1, GFP_NOFS, false); if (!reply) goto fail_msg; /* placeholder for nr_caps */ err = ceph_pagelist_encode_32(_pagelist, 0); if (err < 0) goto fail; if (recon_state->nr_caps) { /* currently encoding caps */ err = ceph_pagelist_encode_32(recon_state->pagelist, 0); if (err) goto fail; } else { /* placeholder for nr_realms (currently encoding relams) */ err = ceph_pagelist_encode_32(_pagelist, 0); if (err < 0) goto fail; } err = ceph_pagelist_encode_8(recon_state->pagelist, 1); if (err) goto fail; page = list_first_entry(&recon_state->pagelist->head, struct page, lru); addr = kmap_atomic(page); if (recon_state->nr_caps) { /* currently encoding caps */ *addr = cpu_to_le32(recon_state->nr_caps); } else { /* currently encoding relams */ *(addr + 1) = cpu_to_le32(recon_state->nr_realms); } kunmap_atomic(addr); reply->hdr.version = cpu_to_le16(5); reply->hdr.compat_version = cpu_to_le16(4); reply->hdr.data_len = cpu_to_le32(recon_state->pagelist->length); ceph_msg_data_add_pagelist(reply, recon_state->pagelist); ceph_con_send(&recon_state->session->s_con, reply); ceph_pagelist_release(recon_state->pagelist); recon_state->pagelist = _pagelist; recon_state->nr_caps = 0; recon_state->nr_realms = 0; recon_state->msg_version = 5; return 0; fail: ceph_msg_put(reply); fail_msg: ceph_pagelist_release(_pagelist); return err; } static struct dentry* d_find_primary(struct inode *inode) { struct dentry *alias, *dn = NULL; if (hlist_empty(&inode->i_dentry)) return NULL; spin_lock(&inode->i_lock); if (hlist_empty(&inode->i_dentry)) goto out_unlock; if (S_ISDIR(inode->i_mode)) { alias = hlist_entry(inode->i_dentry.first, struct dentry, d_u.d_alias); if (!IS_ROOT(alias)) dn = dget(alias); goto out_unlock; } hlist_for_each_entry(alias, &inode->i_dentry, d_u.d_alias) { spin_lock(&alias->d_lock); if (!d_unhashed(alias) && (ceph_dentry(alias)->flags & CEPH_DENTRY_PRIMARY_LINK)) { dn = dget_dlock(alias); } spin_unlock(&alias->d_lock); if (dn) break; } out_unlock: spin_unlock(&inode->i_lock); return dn; } /* * Encode information about a cap for a reconnect with the MDS. */ static int reconnect_caps_cb(struct inode *inode, int mds, void *arg) { struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb); struct ceph_client *cl = ceph_inode_to_client(inode); union { struct ceph_mds_cap_reconnect v2; struct ceph_mds_cap_reconnect_v1 v1; } rec; struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_reconnect_state *recon_state = arg; struct ceph_pagelist *pagelist = recon_state->pagelist; struct dentry *dentry; struct ceph_cap *cap; char *path; int pathlen = 0, err; u64 pathbase; u64 snap_follows; dentry = d_find_primary(inode); if (dentry) { /* set pathbase to parent dir when msg_version >= 2 */ path = ceph_mdsc_build_path(mdsc, dentry, &pathlen, &pathbase, recon_state->msg_version >= 2); dput(dentry); if (IS_ERR(path)) { err = PTR_ERR(path); goto out_err; } } else { path = NULL; pathbase = 0; } spin_lock(&ci->i_ceph_lock); cap = __get_cap_for_mds(ci, mds); if (!cap) { spin_unlock(&ci->i_ceph_lock); err = 0; goto out_err; } doutc(cl, " adding %p ino %llx.%llx cap %p %lld %s\n", inode, ceph_vinop(inode), cap, cap->cap_id, ceph_cap_string(cap->issued)); cap->seq = 0; /* reset cap seq */ cap->issue_seq = 0; /* and issue_seq */ cap->mseq = 0; /* and migrate_seq */ cap->cap_gen = atomic_read(&cap->session->s_cap_gen); /* These are lost when the session goes away */ if (S_ISDIR(inode->i_mode)) { if (cap->issued & CEPH_CAP_DIR_CREATE) { ceph_put_string(rcu_dereference_raw(ci->i_cached_layout.pool_ns)); memset(&ci->i_cached_layout, 0, sizeof(ci->i_cached_layout)); } cap->issued &= ~CEPH_CAP_ANY_DIR_OPS; } if (recon_state->msg_version >= 2) { rec.v2.cap_id = cpu_to_le64(cap->cap_id); rec.v2.wanted = cpu_to_le32(__ceph_caps_wanted(ci)); rec.v2.issued = cpu_to_le32(cap->issued); rec.v2.snaprealm = cpu_to_le64(ci->i_snap_realm->ino); rec.v2.pathbase = cpu_to_le64(pathbase); rec.v2.flock_len = (__force __le32) ((ci->i_ceph_flags & CEPH_I_ERROR_FILELOCK) ? 0 : 1); } else { struct timespec64 ts; rec.v1.cap_id = cpu_to_le64(cap->cap_id); rec.v1.wanted = cpu_to_le32(__ceph_caps_wanted(ci)); rec.v1.issued = cpu_to_le32(cap->issued); rec.v1.size = cpu_to_le64(i_size_read(inode)); ts = inode_get_mtime(inode); ceph_encode_timespec64(&rec.v1.mtime, &ts); ts = inode_get_atime(inode); ceph_encode_timespec64(&rec.v1.atime, &ts); rec.v1.snaprealm = cpu_to_le64(ci->i_snap_realm->ino); rec.v1.pathbase = cpu_to_le64(pathbase); } if (list_empty(&ci->i_cap_snaps)) { snap_follows = ci->i_head_snapc ? ci->i_head_snapc->seq : 0; } else { struct ceph_cap_snap *capsnap = list_first_entry(&ci->i_cap_snaps, struct ceph_cap_snap, ci_item); snap_follows = capsnap->follows; } spin_unlock(&ci->i_ceph_lock); if (recon_state->msg_version >= 2) { int num_fcntl_locks, num_flock_locks; struct ceph_filelock *flocks = NULL; size_t struct_len, total_len = sizeof(u64); u8 struct_v = 0; encode_again: if (rec.v2.flock_len) { ceph_count_locks(inode, &num_fcntl_locks, &num_flock_locks); } else { num_fcntl_locks = 0; num_flock_locks = 0; } if (num_fcntl_locks + num_flock_locks > 0) { flocks = kmalloc_array(num_fcntl_locks + num_flock_locks, sizeof(struct ceph_filelock), GFP_NOFS); if (!flocks) { err = -ENOMEM; goto out_err; } err = ceph_encode_locks_to_buffer(inode, flocks, num_fcntl_locks, num_flock_locks); if (err) { kfree(flocks); flocks = NULL; if (err == -ENOSPC) goto encode_again; goto out_err; } } else { kfree(flocks); flocks = NULL; } if (recon_state->msg_version >= 3) { /* version, compat_version and struct_len */ total_len += 2 * sizeof(u8) + sizeof(u32); struct_v = 2; } /* * number of encoded locks is stable, so copy to pagelist */ struct_len = 2 * sizeof(u32) + (num_fcntl_locks + num_flock_locks) * sizeof(struct ceph_filelock); rec.v2.flock_len = cpu_to_le32(struct_len); struct_len += sizeof(u32) + pathlen + sizeof(rec.v2); if (struct_v >= 2) struct_len += sizeof(u64); /* snap_follows */ total_len += struct_len; if (pagelist->length + total_len > RECONNECT_MAX_SIZE) { err = send_reconnect_partial(recon_state); if (err) goto out_freeflocks; pagelist = recon_state->pagelist; } err = ceph_pagelist_reserve(pagelist, total_len); if (err) goto out_freeflocks; ceph_pagelist_encode_64(pagelist, ceph_ino(inode)); if (recon_state->msg_version >= 3) { ceph_pagelist_encode_8(pagelist, struct_v); ceph_pagelist_encode_8(pagelist, 1); ceph_pagelist_encode_32(pagelist, struct_len); } ceph_pagelist_encode_string(pagelist, path, pathlen); ceph_pagelist_append(pagelist, &rec, sizeof(rec.v2)); ceph_locks_to_pagelist(flocks, pagelist, num_fcntl_locks, num_flock_locks); if (struct_v >= 2) ceph_pagelist_encode_64(pagelist, snap_follows); out_freeflocks: kfree(flocks); } else { err = ceph_pagelist_reserve(pagelist, sizeof(u64) + sizeof(u32) + pathlen + sizeof(rec.v1)); if (err) goto out_err; ceph_pagelist_encode_64(pagelist, ceph_ino(inode)); ceph_pagelist_encode_string(pagelist, path, pathlen); ceph_pagelist_append(pagelist, &rec, sizeof(rec.v1)); } out_err: ceph_mdsc_free_path(path, pathlen); if (!err) recon_state->nr_caps++; return err; } static int encode_snap_realms(struct ceph_mds_client *mdsc, struct ceph_reconnect_state *recon_state) { struct rb_node *p; struct ceph_pagelist *pagelist = recon_state->pagelist; struct ceph_client *cl = mdsc->fsc->client; int err = 0; if (recon_state->msg_version >= 4) { err = ceph_pagelist_encode_32(pagelist, mdsc->num_snap_realms); if (err < 0) goto fail; } /* * snaprealms. we provide mds with the ino, seq (version), and * parent for all of our realms. If the mds has any newer info, * it will tell us. */ for (p = rb_first(&mdsc->snap_realms); p; p = rb_next(p)) { struct ceph_snap_realm *realm = rb_entry(p, struct ceph_snap_realm, node); struct ceph_mds_snaprealm_reconnect sr_rec; if (recon_state->msg_version >= 4) { size_t need = sizeof(u8) * 2 + sizeof(u32) + sizeof(sr_rec); if (pagelist->length + need > RECONNECT_MAX_SIZE) { err = send_reconnect_partial(recon_state); if (err) goto fail; pagelist = recon_state->pagelist; } err = ceph_pagelist_reserve(pagelist, need); if (err) goto fail; ceph_pagelist_encode_8(pagelist, 1); ceph_pagelist_encode_8(pagelist, 1); ceph_pagelist_encode_32(pagelist, sizeof(sr_rec)); } doutc(cl, " adding snap realm %llx seq %lld parent %llx\n", realm->ino, realm->seq, realm->parent_ino); sr_rec.ino = cpu_to_le64(realm->ino); sr_rec.seq = cpu_to_le64(realm->seq); sr_rec.parent = cpu_to_le64(realm->parent_ino); err = ceph_pagelist_append(pagelist, &sr_rec, sizeof(sr_rec)); if (err) goto fail; recon_state->nr_realms++; } fail: return err; } /* * If an MDS fails and recovers, clients need to reconnect in order to * reestablish shared state. This includes all caps issued through * this session _and_ the snap_realm hierarchy. Because it's not * clear which snap realms the mds cares about, we send everything we * know about.. that ensures we'll then get any new info the * recovering MDS might have. * * This is a relatively heavyweight operation, but it's rare. */ static void send_mds_reconnect(struct ceph_mds_client *mdsc, struct ceph_mds_session *session) { struct ceph_client *cl = mdsc->fsc->client; struct ceph_msg *reply; int mds = session->s_mds; int err = -ENOMEM; struct ceph_reconnect_state recon_state = { .session = session, }; LIST_HEAD(dispose); pr_info_client(cl, "mds%d reconnect start\n", mds); recon_state.pagelist = ceph_pagelist_alloc(GFP_NOFS); if (!recon_state.pagelist) goto fail_nopagelist; reply = ceph_msg_new2(CEPH_MSG_CLIENT_RECONNECT, 0, 1, GFP_NOFS, false); if (!reply) goto fail_nomsg; xa_destroy(&session->s_delegated_inos); mutex_lock(&session->s_mutex); session->s_state = CEPH_MDS_SESSION_RECONNECTING; session->s_seq = 0; doutc(cl, "session %p state %s\n", session, ceph_session_state_name(session->s_state)); atomic_inc(&session->s_cap_gen); spin_lock(&session->s_cap_lock); /* don't know if session is readonly */ session->s_readonly = 0; /* * notify __ceph_remove_cap() that we are composing cap reconnect. * If a cap get released before being added to the cap reconnect, * __ceph_remove_cap() should skip queuing cap release. */ session->s_cap_reconnect = 1; /* drop old cap expires; we're about to reestablish that state */ detach_cap_releases(session, &dispose); spin_unlock(&session->s_cap_lock); dispose_cap_releases(mdsc, &dispose); /* trim unused caps to reduce MDS's cache rejoin time */ if (mdsc->fsc->sb->s_root) shrink_dcache_parent(mdsc->fsc->sb->s_root); ceph_con_close(&session->s_con); ceph_con_open(&session->s_con, CEPH_ENTITY_TYPE_MDS, mds, ceph_mdsmap_get_addr(mdsc->mdsmap, mds)); /* replay unsafe requests */ replay_unsafe_requests(mdsc, session); ceph_early_kick_flushing_caps(mdsc, session); down_read(&mdsc->snap_rwsem); /* placeholder for nr_caps */ err = ceph_pagelist_encode_32(recon_state.pagelist, 0); if (err) goto fail; if (test_bit(CEPHFS_FEATURE_MULTI_RECONNECT, &session->s_features)) { recon_state.msg_version = 3; recon_state.allow_multi = true; } else if (session->s_con.peer_features & CEPH_FEATURE_MDSENC) { recon_state.msg_version = 3; } else { recon_state.msg_version = 2; } /* trsaverse this session's caps */ err = ceph_iterate_session_caps(session, reconnect_caps_cb, &recon_state); spin_lock(&session->s_cap_lock); session->s_cap_reconnect = 0; spin_unlock(&session->s_cap_lock); if (err < 0) goto fail; /* check if all realms can be encoded into current message */ if (mdsc->num_snap_realms) { size_t total_len = recon_state.pagelist->length + mdsc->num_snap_realms * sizeof(struct ceph_mds_snaprealm_reconnect); if (recon_state.msg_version >= 4) { /* number of realms */ total_len += sizeof(u32); /* version, compat_version and struct_len */ total_len += mdsc->num_snap_realms * (2 * sizeof(u8) + sizeof(u32)); } if (total_len > RECONNECT_MAX_SIZE) { if (!recon_state.allow_multi) { err = -ENOSPC; goto fail; } if (recon_state.nr_caps) { err = send_reconnect_partial(&recon_state); if (err) goto fail; } recon_state.msg_version = 5; } } err = encode_snap_realms(mdsc, &recon_state); if (err < 0) goto fail; if (recon_state.msg_version >= 5) { err = ceph_pagelist_encode_8(recon_state.pagelist, 0); if (err < 0) goto fail; } if (recon_state.nr_caps || recon_state.nr_realms) { struct page *page = list_first_entry(&recon_state.pagelist->head, struct page, lru); __le32 *addr = kmap_atomic(page); if (recon_state.nr_caps) { WARN_ON(recon_state.nr_realms != mdsc->num_snap_realms); *addr = cpu_to_le32(recon_state.nr_caps); } else if (recon_state.msg_version >= 4) { *(addr + 1) = cpu_to_le32(recon_state.nr_realms); } kunmap_atomic(addr); } reply->hdr.version = cpu_to_le16(recon_state.msg_version); if (recon_state.msg_version >= 4) reply->hdr.compat_version = cpu_to_le16(4); reply->hdr.data_len = cpu_to_le32(recon_state.pagelist->length); ceph_msg_data_add_pagelist(reply, recon_state.pagelist); ceph_con_send(&session->s_con, reply); mutex_unlock(&session->s_mutex); mutex_lock(&mdsc->mutex); __wake_requests(mdsc, &session->s_waiting); mutex_unlock(&mdsc->mutex); up_read(&mdsc->snap_rwsem); ceph_pagelist_release(recon_state.pagelist); return; fail: ceph_msg_put(reply); up_read(&mdsc->snap_rwsem); mutex_unlock(&session->s_mutex); fail_nomsg: ceph_pagelist_release(recon_state.pagelist); fail_nopagelist: pr_err_client(cl, "error %d preparing reconnect for mds%d\n", err, mds); return; } /* * compare old and new mdsmaps, kicking requests * and closing out old connections as necessary * * called under mdsc->mutex. */ static void check_new_map(struct ceph_mds_client *mdsc, struct ceph_mdsmap *newmap, struct ceph_mdsmap *oldmap) { int i, j, err; int oldstate, newstate; struct ceph_mds_session *s; unsigned long targets[DIV_ROUND_UP(CEPH_MAX_MDS, sizeof(unsigned long))] = {0}; struct ceph_client *cl = mdsc->fsc->client; doutc(cl, "new %u old %u\n", newmap->m_epoch, oldmap->m_epoch); if (newmap->m_info) { for (i = 0; i < newmap->possible_max_rank; i++) { for (j = 0; j < newmap->m_info[i].num_export_targets; j++) set_bit(newmap->m_info[i].export_targets[j], targets); } } for (i = 0; i < oldmap->possible_max_rank && i < mdsc->max_sessions; i++) { if (!mdsc->sessions[i]) continue; s = mdsc->sessions[i]; oldstate = ceph_mdsmap_get_state(oldmap, i); newstate = ceph_mdsmap_get_state(newmap, i); doutc(cl, "mds%d state %s%s -> %s%s (session %s)\n", i, ceph_mds_state_name(oldstate), ceph_mdsmap_is_laggy(oldmap, i) ? " (laggy)" : "", ceph_mds_state_name(newstate), ceph_mdsmap_is_laggy(newmap, i) ? " (laggy)" : "", ceph_session_state_name(s->s_state)); if (i >= newmap->possible_max_rank) { /* force close session for stopped mds */ ceph_get_mds_session(s); __unregister_session(mdsc, s); __wake_requests(mdsc, &s->s_waiting); mutex_unlock(&mdsc->mutex); mutex_lock(&s->s_mutex); cleanup_session_requests(mdsc, s); remove_session_caps(s); mutex_unlock(&s->s_mutex); ceph_put_mds_session(s); mutex_lock(&mdsc->mutex); kick_requests(mdsc, i); continue; } if (memcmp(ceph_mdsmap_get_addr(oldmap, i), ceph_mdsmap_get_addr(newmap, i), sizeof(struct ceph_entity_addr))) { /* just close it */ mutex_unlock(&mdsc->mutex); mutex_lock(&s->s_mutex); mutex_lock(&mdsc->mutex); ceph_con_close(&s->s_con); mutex_unlock(&s->s_mutex); s->s_state = CEPH_MDS_SESSION_RESTARTING; } else if (oldstate == newstate) { continue; /* nothing new with this mds */ } /* * send reconnect? */ if (s->s_state == CEPH_MDS_SESSION_RESTARTING && newstate >= CEPH_MDS_STATE_RECONNECT) { mutex_unlock(&mdsc->mutex); clear_bit(i, targets); send_mds_reconnect(mdsc, s); mutex_lock(&mdsc->mutex); } /* * kick request on any mds that has gone active. */ if (oldstate < CEPH_MDS_STATE_ACTIVE && newstate >= CEPH_MDS_STATE_ACTIVE) { if (oldstate != CEPH_MDS_STATE_CREATING && oldstate != CEPH_MDS_STATE_STARTING) pr_info_client(cl, "mds%d recovery completed\n", s->s_mds); kick_requests(mdsc, i); mutex_unlock(&mdsc->mutex); mutex_lock(&s->s_mutex); mutex_lock(&mdsc->mutex); ceph_kick_flushing_caps(mdsc, s); mutex_unlock(&s->s_mutex); wake_up_session_caps(s, RECONNECT); } } /* * Only open and reconnect sessions that don't exist yet. */ for (i = 0; i < newmap->possible_max_rank; i++) { /* * In case the import MDS is crashed just after * the EImportStart journal is flushed, so when * a standby MDS takes over it and is replaying * the EImportStart journal the new MDS daemon * will wait the client to reconnect it, but the * client may never register/open the session yet. * * Will try to reconnect that MDS daemon if the * rank number is in the export targets array and * is the up:reconnect state. */ newstate = ceph_mdsmap_get_state(newmap, i); if (!test_bit(i, targets) || newstate != CEPH_MDS_STATE_RECONNECT) continue; /* * The session maybe registered and opened by some * requests which were choosing random MDSes during * the mdsc->mutex's unlock/lock gap below in rare * case. But the related MDS daemon will just queue * that requests and be still waiting for the client's * reconnection request in up:reconnect state. */ s = __ceph_lookup_mds_session(mdsc, i); if (likely(!s)) { s = __open_export_target_session(mdsc, i); if (IS_ERR(s)) { err = PTR_ERR(s); pr_err_client(cl, "failed to open export target session, err %d\n", err); continue; } } doutc(cl, "send reconnect to export target mds.%d\n", i); mutex_unlock(&mdsc->mutex); send_mds_reconnect(mdsc, s); ceph_put_mds_session(s); mutex_lock(&mdsc->mutex); } for (i = 0; i < newmap->possible_max_rank && i < mdsc->max_sessions; i++) { s = mdsc->sessions[i]; if (!s) continue; if (!ceph_mdsmap_is_laggy(newmap, i)) continue; if (s->s_state == CEPH_MDS_SESSION_OPEN || s->s_state == CEPH_MDS_SESSION_HUNG || s->s_state == CEPH_MDS_SESSION_CLOSING) { doutc(cl, " connecting to export targets of laggy mds%d\n", i); __open_export_target_sessions(mdsc, s); } } } /* * leases */ /* * caller must hold session s_mutex, dentry->d_lock */ void __ceph_mdsc_drop_dentry_lease(struct dentry *dentry) { struct ceph_dentry_info *di = ceph_dentry(dentry); ceph_put_mds_session(di->lease_session); di->lease_session = NULL; } static void handle_lease(struct ceph_mds_client *mdsc, struct ceph_mds_session *session, struct ceph_msg *msg) { struct ceph_client *cl = mdsc->fsc->client; struct super_block *sb = mdsc->fsc->sb; struct inode *inode; struct dentry *parent, *dentry; struct ceph_dentry_info *di; int mds = session->s_mds; struct ceph_mds_lease *h = msg->front.iov_base; u32 seq; struct ceph_vino vino; struct qstr dname; int release = 0; doutc(cl, "from mds%d\n", mds); if (!ceph_inc_mds_stopping_blocker(mdsc, session)) return; /* decode */ if (msg->front.iov_len < sizeof(*h) + sizeof(u32)) goto bad; vino.ino = le64_to_cpu(h->ino); vino.snap = CEPH_NOSNAP; seq = le32_to_cpu(h->seq); dname.len = get_unaligned_le32(h + 1); if (msg->front.iov_len < sizeof(*h) + sizeof(u32) + dname.len) goto bad; dname.name = (void *)(h + 1) + sizeof(u32); /* lookup inode */ inode = ceph_find_inode(sb, vino); doutc(cl, "%s, ino %llx %p %.*s\n", ceph_lease_op_name(h->action), vino.ino, inode, dname.len, dname.name); mutex_lock(&session->s_mutex); if (!inode) { doutc(cl, "no inode %llx\n", vino.ino); goto release; } /* dentry */ parent = d_find_alias(inode); if (!parent) { doutc(cl, "no parent dentry on inode %p\n", inode); WARN_ON(1); goto release; /* hrm... */ } dname.hash = full_name_hash(parent, dname.name, dname.len); dentry = d_lookup(parent, &dname); dput(parent); if (!dentry) goto release; spin_lock(&dentry->d_lock); di = ceph_dentry(dentry); switch (h->action) { case CEPH_MDS_LEASE_REVOKE: if (di->lease_session == session) { if (ceph_seq_cmp(di->lease_seq, seq) > 0) h->seq = cpu_to_le32(di->lease_seq); __ceph_mdsc_drop_dentry_lease(dentry); } release = 1; break; case CEPH_MDS_LEASE_RENEW: if (di->lease_session == session && di->lease_gen == atomic_read(&session->s_cap_gen) && di->lease_renew_from && di->lease_renew_after == 0) { unsigned long duration = msecs_to_jiffies(le32_to_cpu(h->duration_ms)); di->lease_seq = seq; di->time = di->lease_renew_from + duration; di->lease_renew_after = di->lease_renew_from + (duration >> 1); di->lease_renew_from = 0; } break; } spin_unlock(&dentry->d_lock); dput(dentry); if (!release) goto out; release: /* let's just reuse the same message */ h->action = CEPH_MDS_LEASE_REVOKE_ACK; ceph_msg_get(msg); ceph_con_send(&session->s_con, msg); out: mutex_unlock(&session->s_mutex); iput(inode); ceph_dec_mds_stopping_blocker(mdsc); return; bad: ceph_dec_mds_stopping_blocker(mdsc); pr_err_client(cl, "corrupt lease message\n"); ceph_msg_dump(msg); } void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session, struct dentry *dentry, char action, u32 seq) { struct ceph_client *cl = session->s_mdsc->fsc->client; struct ceph_msg *msg; struct ceph_mds_lease *lease; struct inode *dir; int len = sizeof(*lease) + sizeof(u32) + NAME_MAX; doutc(cl, "identry %p %s to mds%d\n", dentry, ceph_lease_op_name(action), session->s_mds); msg = ceph_msg_new(CEPH_MSG_CLIENT_LEASE, len, GFP_NOFS, false); if (!msg) return; lease = msg->front.iov_base; lease->action = action; lease->seq = cpu_to_le32(seq); spin_lock(&dentry->d_lock); dir = d_inode(dentry->d_parent); lease->ino = cpu_to_le64(ceph_ino(dir)); lease->first = lease->last = cpu_to_le64(ceph_snap(dir)); put_unaligned_le32(dentry->d_name.len, lease + 1); memcpy((void *)(lease + 1) + 4, dentry->d_name.name, dentry->d_name.len); spin_unlock(&dentry->d_lock); ceph_con_send(&session->s_con, msg); } /* * lock unlock the session, to wait ongoing session activities */ static void lock_unlock_session(struct ceph_mds_session *s) { mutex_lock(&s->s_mutex); mutex_unlock(&s->s_mutex); } static void maybe_recover_session(struct ceph_mds_client *mdsc) { struct ceph_client *cl = mdsc->fsc->client; struct ceph_fs_client *fsc = mdsc->fsc; if (!ceph_test_mount_opt(fsc, CLEANRECOVER)) return; if (READ_ONCE(fsc->mount_state) != CEPH_MOUNT_MOUNTED) return; if (!READ_ONCE(fsc->blocklisted)) return; pr_info_client(cl, "auto reconnect after blocklisted\n"); ceph_force_reconnect(fsc->sb); } bool check_session_state(struct ceph_mds_session *s) { struct ceph_client *cl = s->s_mdsc->fsc->client; switch (s->s_state) { case CEPH_MDS_SESSION_OPEN: if (s->s_ttl && time_after(jiffies, s->s_ttl)) { s->s_state = CEPH_MDS_SESSION_HUNG; pr_info_client(cl, "mds%d hung\n", s->s_mds); } break; case CEPH_MDS_SESSION_CLOSING: case CEPH_MDS_SESSION_NEW: case CEPH_MDS_SESSION_RESTARTING: case CEPH_MDS_SESSION_CLOSED: case CEPH_MDS_SESSION_REJECTED: return false; } return true; } /* * If the sequence is incremented while we're waiting on a REQUEST_CLOSE reply, * then we need to retransmit that request. */ void inc_session_sequence(struct ceph_mds_session *s) { struct ceph_client *cl = s->s_mdsc->fsc->client; lockdep_assert_held(&s->s_mutex); s->s_seq++; if (s->s_state == CEPH_MDS_SESSION_CLOSING) { int ret; doutc(cl, "resending session close request for mds%d\n", s->s_mds); ret = request_close_session(s); if (ret < 0) pr_err_client(cl, "unable to close session to mds%d: %d\n", s->s_mds, ret); } } /* * delayed work -- periodically trim expired leases, renew caps with mds. If * the @delay parameter is set to 0 or if it's more than 5 secs, the default * workqueue delay value of 5 secs will be used. */ static void schedule_delayed(struct ceph_mds_client *mdsc, unsigned long delay) { unsigned long max_delay = HZ * 5; /* 5 secs default delay */ if (!delay || (delay > max_delay)) delay = max_delay; schedule_delayed_work(&mdsc->delayed_work, round_jiffies_relative(delay)); } static void delayed_work(struct work_struct *work) { struct ceph_mds_client *mdsc = container_of(work, struct ceph_mds_client, delayed_work.work); unsigned long delay; int renew_interval; int renew_caps; int i; doutc(mdsc->fsc->client, "mdsc delayed_work\n"); if (mdsc->stopping >= CEPH_MDSC_STOPPING_FLUSHED) return; mutex_lock(&mdsc->mutex); renew_interval = mdsc->mdsmap->m_session_timeout >> 2; renew_caps = time_after_eq(jiffies, HZ*renew_interval + mdsc->last_renew_caps); if (renew_caps) mdsc->last_renew_caps = jiffies; for (i = 0; i < mdsc->max_sessions; i++) { struct ceph_mds_session *s = __ceph_lookup_mds_session(mdsc, i); if (!s) continue; if (!check_session_state(s)) { ceph_put_mds_session(s); continue; } mutex_unlock(&mdsc->mutex); ceph_flush_cap_releases(mdsc, s); mutex_lock(&s->s_mutex); if (renew_caps) send_renew_caps(mdsc, s); else ceph_con_keepalive(&s->s_con); if (s->s_state == CEPH_MDS_SESSION_OPEN || s->s_state == CEPH_MDS_SESSION_HUNG) ceph_send_cap_releases(mdsc, s); mutex_unlock(&s->s_mutex); ceph_put_mds_session(s); mutex_lock(&mdsc->mutex); } mutex_unlock(&mdsc->mutex); delay = ceph_check_delayed_caps(mdsc); ceph_queue_cap_reclaim_work(mdsc); ceph_trim_snapid_map(mdsc); maybe_recover_session(mdsc); schedule_delayed(mdsc, delay); } int ceph_mdsc_init(struct ceph_fs_client *fsc) { struct ceph_mds_client *mdsc; int err; mdsc = kzalloc(sizeof(struct ceph_mds_client), GFP_NOFS); if (!mdsc) return -ENOMEM; mdsc->fsc = fsc; mutex_init(&mdsc->mutex); mdsc->mdsmap = kzalloc(sizeof(*mdsc->mdsmap), GFP_NOFS); if (!mdsc->mdsmap) { err = -ENOMEM; goto err_mdsc; } init_completion(&mdsc->safe_umount_waiters); spin_lock_init(&mdsc->stopping_lock); atomic_set(&mdsc->stopping_blockers, 0); init_completion(&mdsc->stopping_waiter); init_waitqueue_head(&mdsc->session_close_wq); INIT_LIST_HEAD(&mdsc->waiting_for_map); mdsc->quotarealms_inodes = RB_ROOT; mutex_init(&mdsc->quotarealms_inodes_mutex); init_rwsem(&mdsc->snap_rwsem); mdsc->snap_realms = RB_ROOT; INIT_LIST_HEAD(&mdsc->snap_empty); spin_lock_init(&mdsc->snap_empty_lock); mdsc->request_tree = RB_ROOT; INIT_DELAYED_WORK(&mdsc->delayed_work, delayed_work); mdsc->last_renew_caps = jiffies; INIT_LIST_HEAD(&mdsc->cap_delay_list); #ifdef CONFIG_DEBUG_FS INIT_LIST_HEAD(&mdsc->cap_wait_list); #endif spin_lock_init(&mdsc->cap_delay_lock); INIT_LIST_HEAD(&mdsc->cap_unlink_delay_list); INIT_LIST_HEAD(&mdsc->snap_flush_list); spin_lock_init(&mdsc->snap_flush_lock); mdsc->last_cap_flush_tid = 1; INIT_LIST_HEAD(&mdsc->cap_flush_list); INIT_LIST_HEAD(&mdsc->cap_dirty_migrating); spin_lock_init(&mdsc->cap_dirty_lock); init_waitqueue_head(&mdsc->cap_flushing_wq); INIT_WORK(&mdsc->cap_reclaim_work, ceph_cap_reclaim_work); INIT_WORK(&mdsc->cap_unlink_work, ceph_cap_unlink_work); err = ceph_metric_init(&mdsc->metric); if (err) goto err_mdsmap; spin_lock_init(&mdsc->dentry_list_lock); INIT_LIST_HEAD(&mdsc->dentry_leases); INIT_LIST_HEAD(&mdsc->dentry_dir_leases); ceph_caps_init(mdsc); ceph_adjust_caps_max_min(mdsc, fsc->mount_options); spin_lock_init(&mdsc->snapid_map_lock); mdsc->snapid_map_tree = RB_ROOT; INIT_LIST_HEAD(&mdsc->snapid_map_lru); init_rwsem(&mdsc->pool_perm_rwsem); mdsc->pool_perm_tree = RB_ROOT; strscpy(mdsc->nodename, utsname()->nodename, sizeof(mdsc->nodename)); fsc->mdsc = mdsc; return 0; err_mdsmap: kfree(mdsc->mdsmap); err_mdsc: kfree(mdsc); return err; } /* * Wait for safe replies on open mds requests. If we time out, drop * all requests from the tree to avoid dangling dentry refs. */ static void wait_requests(struct ceph_mds_client *mdsc) { struct ceph_client *cl = mdsc->fsc->client; struct ceph_options *opts = mdsc->fsc->client->options; struct ceph_mds_request *req; mutex_lock(&mdsc->mutex); if (__get_oldest_req(mdsc)) { mutex_unlock(&mdsc->mutex); doutc(cl, "waiting for requests\n"); wait_for_completion_timeout(&mdsc->safe_umount_waiters, ceph_timeout_jiffies(opts->mount_timeout)); /* tear down remaining requests */ mutex_lock(&mdsc->mutex); while ((req = __get_oldest_req(mdsc))) { doutc(cl, "timed out on tid %llu\n", req->r_tid); list_del_init(&req->r_wait); __unregister_request(mdsc, req); } } mutex_unlock(&mdsc->mutex); doutc(cl, "done\n"); } void send_flush_mdlog(struct ceph_mds_session *s) { struct ceph_client *cl = s->s_mdsc->fsc->client; struct ceph_msg *msg; /* * Pre-luminous MDS crashes when it sees an unknown session request */ if (!CEPH_HAVE_FEATURE(s->s_con.peer_features, SERVER_LUMINOUS)) return; mutex_lock(&s->s_mutex); doutc(cl, "request mdlog flush to mds%d (%s)s seq %lld\n", s->s_mds, ceph_session_state_name(s->s_state), s->s_seq); msg = ceph_create_session_msg(CEPH_SESSION_REQUEST_FLUSH_MDLOG, s->s_seq); if (!msg) { pr_err_client(cl, "failed to request mdlog flush to mds%d (%s) seq %lld\n", s->s_mds, ceph_session_state_name(s->s_state), s->s_seq); } else { ceph_con_send(&s->s_con, msg); } mutex_unlock(&s->s_mutex); } static int ceph_mds_auth_match(struct ceph_mds_client *mdsc, struct ceph_mds_cap_auth *auth, char *tpath) { const struct cred *cred = get_current_cred(); u32 caller_uid = from_kuid(&init_user_ns, cred->fsuid); u32 caller_gid = from_kgid(&init_user_ns, cred->fsgid); struct ceph_client *cl = mdsc->fsc->client; const char *spath = mdsc->fsc->mount_options->server_path; bool gid_matched = false; u32 gid, tlen, len; int i, j; doutc(cl, "match.uid %lld\n", auth->match.uid); if (auth->match.uid != MDS_AUTH_UID_ANY) { if (auth->match.uid != caller_uid) return 0; if (auth->match.num_gids) { for (i = 0; i < auth->match.num_gids; i++) { if (caller_gid == auth->match.gids[i]) gid_matched = true; } if (!gid_matched && cred->group_info->ngroups) { for (i = 0; i < cred->group_info->ngroups; i++) { gid = from_kgid(&init_user_ns, cred->group_info->gid[i]); for (j = 0; j < auth->match.num_gids; j++) { if (gid == auth->match.gids[j]) { gid_matched = true; break; } } if (gid_matched) break; } } if (!gid_matched) return 0; } } /* path match */ if (auth->match.path) { if (!tpath) return 0; tlen = strlen(tpath); len = strlen(auth->match.path); if (len) { char *_tpath = tpath; bool free_tpath = false; int m, n; doutc(cl, "server path %s, tpath %s, match.path %s\n", spath, tpath, auth->match.path); if (spath && (m = strlen(spath)) != 1) { /* mount path + '/' + tpath + an extra space */ n = m + 1 + tlen + 1; _tpath = kmalloc(n, GFP_NOFS); if (!_tpath) return -ENOMEM; /* remove the leading '/' */ snprintf(_tpath, n, "%s/%s", spath + 1, tpath); free_tpath = true; tlen = strlen(_tpath); } /* * Please note the tailing '/' for match.path has already * been removed when parsing. * * Remove the tailing '/' for the target path. */ while (tlen && _tpath[tlen - 1] == '/') { _tpath[tlen - 1] = '\0'; tlen -= 1; } doutc(cl, "_tpath %s\n", _tpath); /* * In case first == _tpath && tlen == len: * match.path=/foo --> /foo _path=/foo --> match * match.path=/foo/ --> /foo _path=/foo --> match * * In case first == _tmatch.path && tlen > len: * match.path=/foo/ --> /foo _path=/foo/ --> match * match.path=/foo --> /foo _path=/foo/ --> match * match.path=/foo/ --> /foo _path=/foo/d --> match * match.path=/foo --> /foo _path=/food --> mismatch * * All the other cases --> mismatch */ char *first = strstr(_tpath, auth->match.path); if (first != _tpath) { if (free_tpath) kfree(_tpath); return 0; } if (tlen > len && _tpath[len] != '/') { if (free_tpath) kfree(_tpath); return 0; } } } doutc(cl, "matched\n"); return 1; } int ceph_mds_check_access(struct ceph_mds_client *mdsc, char *tpath, int mask) { const struct cred *cred = get_current_cred(); u32 caller_uid = from_kuid(&init_user_ns, cred->fsuid); u32 caller_gid = from_kgid(&init_user_ns, cred->fsgid); struct ceph_mds_cap_auth *rw_perms_s = NULL; struct ceph_client *cl = mdsc->fsc->client; bool root_squash_perms = true; int i, err; doutc(cl, "tpath '%s', mask %d, caller_uid %d, caller_gid %d\n", tpath, mask, caller_uid, caller_gid); for (i = 0; i < mdsc->s_cap_auths_num; i++) { struct ceph_mds_cap_auth *s = &mdsc->s_cap_auths[i]; err = ceph_mds_auth_match(mdsc, s, tpath); if (err < 0) { return err; } else if (err > 0) { /* always follow the last auth caps' permision */ root_squash_perms = true; rw_perms_s = NULL; if ((mask & MAY_WRITE) && s->writeable && s->match.root_squash && (!caller_uid || !caller_gid)) root_squash_perms = false; if (((mask & MAY_WRITE) && !s->writeable) || ((mask & MAY_READ) && !s->readable)) rw_perms_s = s; } } doutc(cl, "root_squash_perms %d, rw_perms_s %p\n", root_squash_perms, rw_perms_s); if (root_squash_perms && rw_perms_s == NULL) { doutc(cl, "access allowed\n"); return 0; } if (!root_squash_perms) { doutc(cl, "root_squash is enabled and user(%d %d) isn't allowed to write", caller_uid, caller_gid); } if (rw_perms_s) { doutc(cl, "mds auth caps readable/writeable %d/%d while request r/w %d/%d", rw_perms_s->readable, rw_perms_s->writeable, !!(mask & MAY_READ), !!(mask & MAY_WRITE)); } doutc(cl, "access denied\n"); return -EACCES; } /* * called before mount is ro, and before dentries are torn down. * (hmm, does this still race with new lookups?) */ void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc) { doutc(mdsc->fsc->client, "begin\n"); mdsc->stopping = CEPH_MDSC_STOPPING_BEGIN; ceph_mdsc_iterate_sessions(mdsc, send_flush_mdlog, true); ceph_mdsc_iterate_sessions(mdsc, lock_unlock_session, false); ceph_flush_dirty_caps(mdsc); wait_requests(mdsc); /* * wait for reply handlers to drop their request refs and * their inode/dcache refs */ ceph_msgr_flush(); ceph_cleanup_quotarealms_inodes(mdsc); doutc(mdsc->fsc->client, "done\n"); } /* * flush the mdlog and wait for all write mds requests to flush. */ static void flush_mdlog_and_wait_mdsc_unsafe_requests(struct ceph_mds_client *mdsc, u64 want_tid) { struct ceph_client *cl = mdsc->fsc->client; struct ceph_mds_request *req = NULL, *nextreq; struct ceph_mds_session *last_session = NULL; struct rb_node *n; mutex_lock(&mdsc->mutex); doutc(cl, "want %lld\n", want_tid); restart: req = __get_oldest_req(mdsc); while (req && req->r_tid <= want_tid) { /* find next request */ n = rb_next(&req->r_node); if (n) nextreq = rb_entry(n, struct ceph_mds_request, r_node); else nextreq = NULL; if (req->r_op != CEPH_MDS_OP_SETFILELOCK && (req->r_op & CEPH_MDS_OP_WRITE)) { struct ceph_mds_session *s = req->r_session; if (!s) { req = nextreq; continue; } /* write op */ ceph_mdsc_get_request(req); if (nextreq) ceph_mdsc_get_request(nextreq); s = ceph_get_mds_session(s); mutex_unlock(&mdsc->mutex); /* send flush mdlog request to MDS */ if (last_session != s) { send_flush_mdlog(s); ceph_put_mds_session(last_session); last_session = s; } else { ceph_put_mds_session(s); } doutc(cl, "wait on %llu (want %llu)\n", req->r_tid, want_tid); wait_for_completion(&req->r_safe_completion); mutex_lock(&mdsc->mutex); ceph_mdsc_put_request(req); if (!nextreq) break; /* next dne before, so we're done! */ if (RB_EMPTY_NODE(&nextreq->r_node)) { /* next request was removed from tree */ ceph_mdsc_put_request(nextreq); goto restart; } ceph_mdsc_put_request(nextreq); /* won't go away */ } req = nextreq; } mutex_unlock(&mdsc->mutex); ceph_put_mds_session(last_session); doutc(cl, "done\n"); } void ceph_mdsc_sync(struct ceph_mds_client *mdsc) { struct ceph_client *cl = mdsc->fsc->client; u64 want_tid, want_flush; if (READ_ONCE(mdsc->fsc->mount_state) >= CEPH_MOUNT_SHUTDOWN) return; doutc(cl, "sync\n"); mutex_lock(&mdsc->mutex); want_tid = mdsc->last_tid; mutex_unlock(&mdsc->mutex); ceph_flush_dirty_caps(mdsc); spin_lock(&mdsc->cap_dirty_lock); want_flush = mdsc->last_cap_flush_tid; if (!list_empty(&mdsc->cap_flush_list)) { struct ceph_cap_flush *cf = list_last_entry(&mdsc->cap_flush_list, struct ceph_cap_flush, g_list); cf->wake = true; } spin_unlock(&mdsc->cap_dirty_lock); doutc(cl, "sync want tid %lld flush_seq %lld\n", want_tid, want_flush); flush_mdlog_and_wait_mdsc_unsafe_requests(mdsc, want_tid); wait_caps_flush(mdsc, want_flush); } /* * true if all sessions are closed, or we force unmount */ static bool done_closing_sessions(struct ceph_mds_client *mdsc, int skipped) { if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) return true; return atomic_read(&mdsc->num_sessions) <= skipped; } /* * called after sb is ro or when metadata corrupted. */ void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc) { struct ceph_options *opts = mdsc->fsc->client->options; struct ceph_client *cl = mdsc->fsc->client; struct ceph_mds_session *session; int i; int skipped = 0; doutc(cl, "begin\n"); /* close sessions */ mutex_lock(&mdsc->mutex); for (i = 0; i < mdsc->max_sessions; i++) { session = __ceph_lookup_mds_session(mdsc, i); if (!session) continue; mutex_unlock(&mdsc->mutex); mutex_lock(&session->s_mutex); if (__close_session(mdsc, session) <= 0) skipped++; mutex_unlock(&session->s_mutex); ceph_put_mds_session(session); mutex_lock(&mdsc->mutex); } mutex_unlock(&mdsc->mutex); doutc(cl, "waiting for sessions to close\n"); wait_event_timeout(mdsc->session_close_wq, done_closing_sessions(mdsc, skipped), ceph_timeout_jiffies(opts->mount_timeout)); /* tear down remaining sessions */ mutex_lock(&mdsc->mutex); for (i = 0; i < mdsc->max_sessions; i++) { if (mdsc->sessions[i]) { session = ceph_get_mds_session(mdsc->sessions[i]); __unregister_session(mdsc, session); mutex_unlock(&mdsc->mutex); mutex_lock(&session->s_mutex); remove_session_caps(session); mutex_unlock(&session->s_mutex); ceph_put_mds_session(session); mutex_lock(&mdsc->mutex); } } WARN_ON(!list_empty(&mdsc->cap_delay_list)); mutex_unlock(&mdsc->mutex); ceph_cleanup_snapid_map(mdsc); ceph_cleanup_global_and_empty_realms(mdsc); cancel_work_sync(&mdsc->cap_reclaim_work); cancel_work_sync(&mdsc->cap_unlink_work); cancel_delayed_work_sync(&mdsc->delayed_work); /* cancel timer */ doutc(cl, "done\n"); } void ceph_mdsc_force_umount(struct ceph_mds_client *mdsc) { struct ceph_mds_session *session; int mds; doutc(mdsc->fsc->client, "force umount\n"); mutex_lock(&mdsc->mutex); for (mds = 0; mds < mdsc->max_sessions; mds++) { session = __ceph_lookup_mds_session(mdsc, mds); if (!session) continue; if (session->s_state == CEPH_MDS_SESSION_REJECTED) __unregister_session(mdsc, session); __wake_requests(mdsc, &session->s_waiting); mutex_unlock(&mdsc->mutex); mutex_lock(&session->s_mutex); __close_session(mdsc, session); if (session->s_state == CEPH_MDS_SESSION_CLOSING) { cleanup_session_requests(mdsc, session); remove_session_caps(session); } mutex_unlock(&session->s_mutex); ceph_put_mds_session(session); mutex_lock(&mdsc->mutex); kick_requests(mdsc, mds); } __wake_requests(mdsc, &mdsc->waiting_for_map); mutex_unlock(&mdsc->mutex); } static void ceph_mdsc_stop(struct ceph_mds_client *mdsc) { doutc(mdsc->fsc->client, "stop\n"); /* * Make sure the delayed work stopped before releasing * the resources. * * Because the cancel_delayed_work_sync() will only * guarantee that the work finishes executing. But the * delayed work will re-arm itself again after that. */ flush_delayed_work(&mdsc->delayed_work); if (mdsc->mdsmap) ceph_mdsmap_destroy(mdsc->mdsmap); kfree(mdsc->sessions); ceph_caps_finalize(mdsc); ceph_pool_perm_destroy(mdsc); } void ceph_mdsc_destroy(struct ceph_fs_client *fsc) { struct ceph_mds_client *mdsc = fsc->mdsc; doutc(fsc->client, "%p\n", mdsc); if (!mdsc) return; /* flush out any connection work with references to us */ ceph_msgr_flush(); ceph_mdsc_stop(mdsc); ceph_metric_destroy(&mdsc->metric); fsc->mdsc = NULL; kfree(mdsc); doutc(fsc->client, "%p done\n", mdsc); } void ceph_mdsc_handle_fsmap(struct ceph_mds_client *mdsc, struct ceph_msg *msg) { struct ceph_fs_client *fsc = mdsc->fsc; struct ceph_client *cl = fsc->client; const char *mds_namespace = fsc->mount_options->mds_namespace; void *p = msg->front.iov_base; void *end = p + msg->front.iov_len; u32 epoch; u32 num_fs; u32 mount_fscid = (u32)-1; int err = -EINVAL; ceph_decode_need(&p, end, sizeof(u32), bad); epoch = ceph_decode_32(&p); doutc(cl, "epoch %u\n", epoch); /* struct_v, struct_cv, map_len, epoch, legacy_client_fscid */ ceph_decode_skip_n(&p, end, 2 + sizeof(u32) * 3, bad); ceph_decode_32_safe(&p, end, num_fs, bad); while (num_fs-- > 0) { void *info_p, *info_end; u32 info_len; u32 fscid, namelen; ceph_decode_need(&p, end, 2 + sizeof(u32), bad); p += 2; // info_v, info_cv info_len = ceph_decode_32(&p); ceph_decode_need(&p, end, info_len, bad); info_p = p; info_end = p + info_len; p = info_end; ceph_decode_need(&info_p, info_end, sizeof(u32) * 2, bad); fscid = ceph_decode_32(&info_p); namelen = ceph_decode_32(&info_p); ceph_decode_need(&info_p, info_end, namelen, bad); if (mds_namespace && strlen(mds_namespace) == namelen && !strncmp(mds_namespace, (char *)info_p, namelen)) { mount_fscid = fscid; break; } } ceph_monc_got_map(&fsc->client->monc, CEPH_SUB_FSMAP, epoch); if (mount_fscid != (u32)-1) { fsc->client->monc.fs_cluster_id = mount_fscid; ceph_monc_want_map(&fsc->client->monc, CEPH_SUB_MDSMAP, 0, true); ceph_monc_renew_subs(&fsc->client->monc); } else { err = -ENOENT; goto err_out; } return; bad: pr_err_client(cl, "error decoding fsmap %d. Shutting down mount.\n", err); ceph_umount_begin(mdsc->fsc->sb); ceph_msg_dump(msg); err_out: mutex_lock(&mdsc->mutex); mdsc->mdsmap_err = err; __wake_requests(mdsc, &mdsc->waiting_for_map); mutex_unlock(&mdsc->mutex); } /* * handle mds map update. */ void ceph_mdsc_handle_mdsmap(struct ceph_mds_client *mdsc, struct ceph_msg *msg) { struct ceph_client *cl = mdsc->fsc->client; u32 epoch; u32 maplen; void *p = msg->front.iov_base; void *end = p + msg->front.iov_len; struct ceph_mdsmap *newmap, *oldmap; struct ceph_fsid fsid; int err = -EINVAL; ceph_decode_need(&p, end, sizeof(fsid)+2*sizeof(u32), bad); ceph_decode_copy(&p, &fsid, sizeof(fsid)); if (ceph_check_fsid(mdsc->fsc->client, &fsid) < 0) return; epoch = ceph_decode_32(&p); maplen = ceph_decode_32(&p); doutc(cl, "epoch %u len %d\n", epoch, (int)maplen); /* do we need it? */ mutex_lock(&mdsc->mutex); if (mdsc->mdsmap && epoch <= mdsc->mdsmap->m_epoch) { doutc(cl, "epoch %u <= our %u\n", epoch, mdsc->mdsmap->m_epoch); mutex_unlock(&mdsc->mutex); return; } newmap = ceph_mdsmap_decode(mdsc, &p, end, ceph_msgr2(mdsc->fsc->client)); if (IS_ERR(newmap)) { err = PTR_ERR(newmap); goto bad_unlock; } /* swap into place */ if (mdsc->mdsmap) { oldmap = mdsc->mdsmap; mdsc->mdsmap = newmap; check_new_map(mdsc, newmap, oldmap); ceph_mdsmap_destroy(oldmap); } else { mdsc->mdsmap = newmap; /* first mds map */ } mdsc->fsc->max_file_size = min((loff_t)mdsc->mdsmap->m_max_file_size, MAX_LFS_FILESIZE); __wake_requests(mdsc, &mdsc->waiting_for_map); ceph_monc_got_map(&mdsc->fsc->client->monc, CEPH_SUB_MDSMAP, mdsc->mdsmap->m_epoch); mutex_unlock(&mdsc->mutex); schedule_delayed(mdsc, 0); return; bad_unlock: mutex_unlock(&mdsc->mutex); bad: pr_err_client(cl, "error decoding mdsmap %d. Shutting down mount.\n", err); ceph_umount_begin(mdsc->fsc->sb); ceph_msg_dump(msg); return; } static struct ceph_connection *mds_get_con(struct ceph_connection *con) { struct ceph_mds_session *s = con->private; if (ceph_get_mds_session(s)) return con; return NULL; } static void mds_put_con(struct ceph_connection *con) { struct ceph_mds_session *s = con->private; ceph_put_mds_session(s); } /* * if the client is unresponsive for long enough, the mds will kill * the session entirely. */ static void mds_peer_reset(struct ceph_connection *con) { struct ceph_mds_session *s = con->private; struct ceph_mds_client *mdsc = s->s_mdsc; pr_warn_client(mdsc->fsc->client, "mds%d closed our session\n", s->s_mds); if (READ_ONCE(mdsc->fsc->mount_state) != CEPH_MOUNT_FENCE_IO && ceph_mdsmap_get_state(mdsc->mdsmap, s->s_mds) >= CEPH_MDS_STATE_RECONNECT) send_mds_reconnect(mdsc, s); } static void mds_dispatch(struct ceph_connection *con, struct ceph_msg *msg) { struct ceph_mds_session *s = con->private; struct ceph_mds_client *mdsc = s->s_mdsc; struct ceph_client *cl = mdsc->fsc->client; int type = le16_to_cpu(msg->hdr.type); mutex_lock(&mdsc->mutex); if (__verify_registered_session(mdsc, s) < 0) { mutex_unlock(&mdsc->mutex); goto out; } mutex_unlock(&mdsc->mutex); switch (type) { case CEPH_MSG_MDS_MAP: ceph_mdsc_handle_mdsmap(mdsc, msg); break; case CEPH_MSG_FS_MAP_USER: ceph_mdsc_handle_fsmap(mdsc, msg); break; case CEPH_MSG_CLIENT_SESSION: handle_session(s, msg); break; case CEPH_MSG_CLIENT_REPLY: handle_reply(s, msg); break; case CEPH_MSG_CLIENT_REQUEST_FORWARD: handle_forward(mdsc, s, msg); break; case CEPH_MSG_CLIENT_CAPS: ceph_handle_caps(s, msg); break; case CEPH_MSG_CLIENT_SNAP: ceph_handle_snap(mdsc, s, msg); break; case CEPH_MSG_CLIENT_LEASE: handle_lease(mdsc, s, msg); break; case CEPH_MSG_CLIENT_QUOTA: ceph_handle_quota(mdsc, s, msg); break; default: pr_err_client(cl, "received unknown message type %d %s\n", type, ceph_msg_type_name(type)); } out: ceph_msg_put(msg); } /* * authentication */ /* * Note: returned pointer is the address of a structure that's * managed separately. Caller must *not* attempt to free it. */ static struct ceph_auth_handshake * mds_get_authorizer(struct ceph_connection *con, int *proto, int force_new) { struct ceph_mds_session *s = con->private; struct ceph_mds_client *mdsc = s->s_mdsc; struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth; struct ceph_auth_handshake *auth = &s->s_auth; int ret; ret = __ceph_auth_get_authorizer(ac, auth, CEPH_ENTITY_TYPE_MDS, force_new, proto, NULL, NULL); if (ret) return ERR_PTR(ret); return auth; } static int mds_add_authorizer_challenge(struct ceph_connection *con, void *challenge_buf, int challenge_buf_len) { struct ceph_mds_session *s = con->private; struct ceph_mds_client *mdsc = s->s_mdsc; struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth; return ceph_auth_add_authorizer_challenge(ac, s->s_auth.authorizer, challenge_buf, challenge_buf_len); } static int mds_verify_authorizer_reply(struct ceph_connection *con) { struct ceph_mds_session *s = con->private; struct ceph_mds_client *mdsc = s->s_mdsc; struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth; struct ceph_auth_handshake *auth = &s->s_auth; return ceph_auth_verify_authorizer_reply(ac, auth->authorizer, auth->authorizer_reply_buf, auth->authorizer_reply_buf_len, NULL, NULL, NULL, NULL); } static int mds_invalidate_authorizer(struct ceph_connection *con) { struct ceph_mds_session *s = con->private; struct ceph_mds_client *mdsc = s->s_mdsc; struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth; ceph_auth_invalidate_authorizer(ac, CEPH_ENTITY_TYPE_MDS); return ceph_monc_validate_auth(&mdsc->fsc->client->monc); } static int mds_get_auth_request(struct ceph_connection *con, void *buf, int *buf_len, void **authorizer, int *authorizer_len) { struct ceph_mds_session *s = con->private; struct ceph_auth_client *ac = s->s_mdsc->fsc->client->monc.auth; struct ceph_auth_handshake *auth = &s->s_auth; int ret; ret = ceph_auth_get_authorizer(ac, auth, CEPH_ENTITY_TYPE_MDS, buf, buf_len); if (ret) return ret; *authorizer = auth->authorizer_buf; *authorizer_len = auth->authorizer_buf_len; return 0; } static int mds_handle_auth_reply_more(struct ceph_connection *con, void *reply, int reply_len, void *buf, int *buf_len, void **authorizer, int *authorizer_len) { struct ceph_mds_session *s = con->private; struct ceph_auth_client *ac = s->s_mdsc->fsc->client->monc.auth; struct ceph_auth_handshake *auth = &s->s_auth; int ret; ret = ceph_auth_handle_svc_reply_more(ac, auth, reply, reply_len, buf, buf_len); if (ret) return ret; *authorizer = auth->authorizer_buf; *authorizer_len = auth->authorizer_buf_len; return 0; } static int mds_handle_auth_done(struct ceph_connection *con, u64 global_id, void *reply, int reply_len, u8 *session_key, int *session_key_len, u8 *con_secret, int *con_secret_len) { struct ceph_mds_session *s = con->private; struct ceph_auth_client *ac = s->s_mdsc->fsc->client->monc.auth; struct ceph_auth_handshake *auth = &s->s_auth; return ceph_auth_handle_svc_reply_done(ac, auth, reply, reply_len, session_key, session_key_len, con_secret, con_secret_len); } static int mds_handle_auth_bad_method(struct ceph_connection *con, int used_proto, int result, const int *allowed_protos, int proto_cnt, const int *allowed_modes, int mode_cnt) { struct ceph_mds_session *s = con->private; struct ceph_mon_client *monc = &s->s_mdsc->fsc->client->monc; int ret; if (ceph_auth_handle_bad_authorizer(monc->auth, CEPH_ENTITY_TYPE_MDS, used_proto, result, allowed_protos, proto_cnt, allowed_modes, mode_cnt)) { ret = ceph_monc_validate_auth(monc); if (ret) return ret; } return -EACCES; } static struct ceph_msg *mds_alloc_msg(struct ceph_connection *con, struct ceph_msg_header *hdr, int *skip) { struct ceph_msg *msg; int type = (int) le16_to_cpu(hdr->type); int front_len = (int) le32_to_cpu(hdr->front_len); if (con->in_msg) return con->in_msg; *skip = 0; msg = ceph_msg_new(type, front_len, GFP_NOFS, false); if (!msg) { pr_err("unable to allocate msg type %d len %d\n", type, front_len); return NULL; } return msg; } static int mds_sign_message(struct ceph_msg *msg) { struct ceph_mds_session *s = msg->con->private; struct ceph_auth_handshake *auth = &s->s_auth; return ceph_auth_sign_message(auth, msg); } static int mds_check_message_signature(struct ceph_msg *msg) { struct ceph_mds_session *s = msg->con->private; struct ceph_auth_handshake *auth = &s->s_auth; return ceph_auth_check_message_signature(auth, msg); } static const struct ceph_connection_operations mds_con_ops = { .get = mds_get_con, .put = mds_put_con, .alloc_msg = mds_alloc_msg, .dispatch = mds_dispatch, .peer_reset = mds_peer_reset, .get_authorizer = mds_get_authorizer, .add_authorizer_challenge = mds_add_authorizer_challenge, .verify_authorizer_reply = mds_verify_authorizer_reply, .invalidate_authorizer = mds_invalidate_authorizer, .sign_message = mds_sign_message, .check_message_signature = mds_check_message_signature, .get_auth_request = mds_get_auth_request, .handle_auth_reply_more = mds_handle_auth_reply_more, .handle_auth_done = mds_handle_auth_done, .handle_auth_bad_method = mds_handle_auth_bad_method, }; /* eof */ |
14 13 5 5 8 3 5 1 3 3 5 4 1 2 1 20 1 19 1 22 2 17 2 2 13 10 2 1 11 1 5 2 3 3 2 2 5 5 5 5 5 9 12 9 1 1 10 8 2 7 8 1 7 10 10 12 2 10 10 9 1 2 2 1 5 1 1 1 1 1 1 1 12 12 1 3 3 5 5 1 1 17 3 3 3 3 5 4 3 1 1 1 5 5 6 6 6 6 3 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 | // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2007-2014 Nicira, Inc. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/init.h> #include <linux/module.h> #include <linux/if_arp.h> #include <linux/if_vlan.h> #include <linux/in.h> #include <linux/ip.h> #include <linux/jhash.h> #include <linux/delay.h> #include <linux/time.h> #include <linux/etherdevice.h> #include <linux/kernel.h> #include <linux/kthread.h> #include <linux/mutex.h> #include <linux/percpu.h> #include <linux/rcupdate.h> #include <linux/tcp.h> #include <linux/udp.h> #include <linux/ethtool.h> #include <linux/wait.h> #include <asm/div64.h> #include <linux/highmem.h> #include <linux/netfilter_bridge.h> #include <linux/netfilter_ipv4.h> #include <linux/inetdevice.h> #include <linux/list.h> #include <linux/openvswitch.h> #include <linux/rculist.h> #include <linux/dmi.h> #include <net/genetlink.h> #include <net/gso.h> #include <net/net_namespace.h> #include <net/netns/generic.h> #include <net/pkt_cls.h> #include "datapath.h" #include "drop.h" #include "flow.h" #include "flow_table.h" #include "flow_netlink.h" #include "meter.h" #include "openvswitch_trace.h" #include "vport-internal_dev.h" #include "vport-netdev.h" unsigned int ovs_net_id __read_mostly; static struct genl_family dp_packet_genl_family; static struct genl_family dp_flow_genl_family; static struct genl_family dp_datapath_genl_family; static const struct nla_policy flow_policy[]; static const struct genl_multicast_group ovs_dp_flow_multicast_group = { .name = OVS_FLOW_MCGROUP, }; static const struct genl_multicast_group ovs_dp_datapath_multicast_group = { .name = OVS_DATAPATH_MCGROUP, }; static const struct genl_multicast_group ovs_dp_vport_multicast_group = { .name = OVS_VPORT_MCGROUP, }; /* Check if need to build a reply message. * OVS userspace sets the NLM_F_ECHO flag if it needs the reply. */ static bool ovs_must_notify(struct genl_family *family, struct genl_info *info, unsigned int group) { return info->nlhdr->nlmsg_flags & NLM_F_ECHO || genl_has_listeners(family, genl_info_net(info), group); } static void ovs_notify(struct genl_family *family, struct sk_buff *skb, struct genl_info *info) { genl_notify(family, skb, info, 0, GFP_KERNEL); } /** * DOC: Locking: * * All writes e.g. Writes to device state (add/remove datapath, port, set * operations on vports, etc.), Writes to other state (flow table * modifications, set miscellaneous datapath parameters, etc.) are protected * by ovs_lock. * * Reads are protected by RCU. * * There are a few special cases (mostly stats) that have their own * synchronization but they nest under all of above and don't interact with * each other. * * The RTNL lock nests inside ovs_mutex. */ static DEFINE_MUTEX(ovs_mutex); void ovs_lock(void) { mutex_lock(&ovs_mutex); } void ovs_unlock(void) { mutex_unlock(&ovs_mutex); } #ifdef CONFIG_LOCKDEP int lockdep_ovsl_is_held(void) { if (debug_locks) return lockdep_is_held(&ovs_mutex); else return 1; } #endif static struct vport *new_vport(const struct vport_parms *); static int queue_gso_packets(struct datapath *dp, struct sk_buff *, const struct sw_flow_key *, const struct dp_upcall_info *, uint32_t cutlen); static int queue_userspace_packet(struct datapath *dp, struct sk_buff *, const struct sw_flow_key *, const struct dp_upcall_info *, uint32_t cutlen); static void ovs_dp_masks_rebalance(struct work_struct *work); static int ovs_dp_set_upcall_portids(struct datapath *, const struct nlattr *); /* Must be called with rcu_read_lock or ovs_mutex. */ const char *ovs_dp_name(const struct datapath *dp) { struct vport *vport = ovs_vport_ovsl_rcu(dp, OVSP_LOCAL); return ovs_vport_name(vport); } static int get_dpifindex(const struct datapath *dp) { struct vport *local; int ifindex; rcu_read_lock(); local = ovs_vport_rcu(dp, OVSP_LOCAL); if (local) ifindex = local->dev->ifindex; else ifindex = 0; rcu_read_unlock(); return ifindex; } static void destroy_dp_rcu(struct rcu_head *rcu) { struct datapath *dp = container_of(rcu, struct datapath, rcu); ovs_flow_tbl_destroy(&dp->table); free_percpu(dp->stats_percpu); kfree(dp->ports); ovs_meters_exit(dp); kfree(rcu_dereference_raw(dp->upcall_portids)); kfree(dp); } static struct hlist_head *vport_hash_bucket(const struct datapath *dp, u16 port_no) { return &dp->ports[port_no & (DP_VPORT_HASH_BUCKETS - 1)]; } /* Called with ovs_mutex or RCU read lock. */ struct vport *ovs_lookup_vport(const struct datapath *dp, u16 port_no) { struct vport *vport; struct hlist_head *head; head = vport_hash_bucket(dp, port_no); hlist_for_each_entry_rcu(vport, head, dp_hash_node, lockdep_ovsl_is_held()) { if (vport->port_no == port_no) return vport; } return NULL; } /* Called with ovs_mutex. */ static struct vport *new_vport(const struct vport_parms *parms) { struct vport *vport; vport = ovs_vport_add(parms); if (!IS_ERR(vport)) { struct datapath *dp = parms->dp; struct hlist_head *head = vport_hash_bucket(dp, vport->port_no); hlist_add_head_rcu(&vport->dp_hash_node, head); } return vport; } static void ovs_vport_update_upcall_stats(struct sk_buff *skb, const struct dp_upcall_info *upcall_info, bool upcall_result) { struct vport *p = OVS_CB(skb)->input_vport; struct vport_upcall_stats_percpu *stats; if (upcall_info->cmd != OVS_PACKET_CMD_MISS && upcall_info->cmd != OVS_PACKET_CMD_ACTION) return; stats = this_cpu_ptr(p->upcall_stats); u64_stats_update_begin(&stats->syncp); if (upcall_result) u64_stats_inc(&stats->n_success); else u64_stats_inc(&stats->n_fail); u64_stats_update_end(&stats->syncp); } void ovs_dp_detach_port(struct vport *p) { ASSERT_OVSL(); /* First drop references to device. */ hlist_del_rcu(&p->dp_hash_node); /* Then destroy it. */ ovs_vport_del(p); } /* Must be called with rcu_read_lock. */ void ovs_dp_process_packet(struct sk_buff *skb, struct sw_flow_key *key) { const struct vport *p = OVS_CB(skb)->input_vport; struct datapath *dp = p->dp; struct sw_flow *flow; struct sw_flow_actions *sf_acts; struct dp_stats_percpu *stats; u64 *stats_counter; u32 n_mask_hit; u32 n_cache_hit; int error; stats = this_cpu_ptr(dp->stats_percpu); /* Look up flow. */ flow = ovs_flow_tbl_lookup_stats(&dp->table, key, skb_get_hash(skb), &n_mask_hit, &n_cache_hit); if (unlikely(!flow)) { struct dp_upcall_info upcall; memset(&upcall, 0, sizeof(upcall)); upcall.cmd = OVS_PACKET_CMD_MISS; if (dp->user_features & OVS_DP_F_DISPATCH_UPCALL_PER_CPU) upcall.portid = ovs_dp_get_upcall_portid(dp, smp_processor_id()); else upcall.portid = ovs_vport_find_upcall_portid(p, skb); upcall.mru = OVS_CB(skb)->mru; error = ovs_dp_upcall(dp, skb, key, &upcall, 0); switch (error) { case 0: case -EAGAIN: case -ERESTARTSYS: case -EINTR: consume_skb(skb); break; default: kfree_skb(skb); break; } stats_counter = &stats->n_missed; goto out; } ovs_flow_stats_update(flow, key->tp.flags, skb); sf_acts = rcu_dereference(flow->sf_acts); error = ovs_execute_actions(dp, skb, sf_acts, key); if (unlikely(error)) net_dbg_ratelimited("ovs: action execution error on datapath %s: %d\n", ovs_dp_name(dp), error); stats_counter = &stats->n_hit; out: /* Update datapath statistics. */ u64_stats_update_begin(&stats->syncp); (*stats_counter)++; stats->n_mask_hit += n_mask_hit; stats->n_cache_hit += n_cache_hit; u64_stats_update_end(&stats->syncp); } int ovs_dp_upcall(struct datapath *dp, struct sk_buff *skb, const struct sw_flow_key *key, const struct dp_upcall_info *upcall_info, uint32_t cutlen) { struct dp_stats_percpu *stats; int err; if (trace_ovs_dp_upcall_enabled()) trace_ovs_dp_upcall(dp, skb, key, upcall_info); if (upcall_info->portid == 0) { err = -ENOTCONN; goto err; } if (!skb_is_gso(skb)) err = queue_userspace_packet(dp, skb, key, upcall_info, cutlen); else err = queue_gso_packets(dp, skb, key, upcall_info, cutlen); ovs_vport_update_upcall_stats(skb, upcall_info, !err); if (err) goto err; return 0; err: stats = this_cpu_ptr(dp->stats_percpu); u64_stats_update_begin(&stats->syncp); stats->n_lost++; u64_stats_update_end(&stats->syncp); return err; } static int queue_gso_packets(struct datapath *dp, struct sk_buff *skb, const struct sw_flow_key *key, const struct dp_upcall_info *upcall_info, uint32_t cutlen) { unsigned int gso_type = skb_shinfo(skb)->gso_type; struct sw_flow_key later_key; struct sk_buff *segs, *nskb; int err; BUILD_BUG_ON(sizeof(*OVS_CB(skb)) > SKB_GSO_CB_OFFSET); segs = __skb_gso_segment(skb, NETIF_F_SG, false); if (IS_ERR(segs)) return PTR_ERR(segs); if (segs == NULL) return -EINVAL; if (gso_type & SKB_GSO_UDP) { /* The initial flow key extracted by ovs_flow_key_extract() * in this case is for a first fragment, so we need to * properly mark later fragments. */ later_key = *key; later_key.ip.frag = OVS_FRAG_TYPE_LATER; } /* Queue all of the segments. */ skb_list_walk_safe(segs, skb, nskb) { if (gso_type & SKB_GSO_UDP && skb != segs) key = &later_key; err = queue_userspace_packet(dp, skb, key, upcall_info, cutlen); if (err) break; } /* Free all of the segments. */ skb_list_walk_safe(segs, skb, nskb) { if (err) kfree_skb(skb); else consume_skb(skb); } return err; } static size_t upcall_msg_size(const struct dp_upcall_info *upcall_info, unsigned int hdrlen, int actions_attrlen) { size_t size = NLMSG_ALIGN(sizeof(struct ovs_header)) + nla_total_size(hdrlen) /* OVS_PACKET_ATTR_PACKET */ + nla_total_size(ovs_key_attr_size()) /* OVS_PACKET_ATTR_KEY */ + nla_total_size(sizeof(unsigned int)) /* OVS_PACKET_ATTR_LEN */ + nla_total_size(sizeof(u64)); /* OVS_PACKET_ATTR_HASH */ /* OVS_PACKET_ATTR_USERDATA */ if (upcall_info->userdata) size += NLA_ALIGN(upcall_info->userdata->nla_len); /* OVS_PACKET_ATTR_EGRESS_TUN_KEY */ if (upcall_info->egress_tun_info) size += nla_total_size(ovs_tun_key_attr_size()); /* OVS_PACKET_ATTR_ACTIONS */ if (upcall_info->actions_len) size += nla_total_size(actions_attrlen); /* OVS_PACKET_ATTR_MRU */ if (upcall_info->mru) size += nla_total_size(sizeof(upcall_info->mru)); return size; } static void pad_packet(struct datapath *dp, struct sk_buff *skb) { if (!(dp->user_features & OVS_DP_F_UNALIGNED)) { size_t plen = NLA_ALIGN(skb->len) - skb->len; if (plen > 0) skb_put_zero(skb, plen); } } static int queue_userspace_packet(struct datapath *dp, struct sk_buff *skb, const struct sw_flow_key *key, const struct dp_upcall_info *upcall_info, uint32_t cutlen) { struct ovs_header *upcall; struct sk_buff *nskb = NULL; struct sk_buff *user_skb = NULL; /* to be queued to userspace */ struct nlattr *nla; size_t len; unsigned int hlen; int err, dp_ifindex; u64 hash; dp_ifindex = get_dpifindex(dp); if (!dp_ifindex) return -ENODEV; if (skb_vlan_tag_present(skb)) { nskb = skb_clone(skb, GFP_ATOMIC); if (!nskb) return -ENOMEM; nskb = __vlan_hwaccel_push_inside(nskb); if (!nskb) return -ENOMEM; skb = nskb; } if (nla_attr_size(skb->len) > USHRT_MAX) { err = -EFBIG; goto out; } /* Complete checksum if needed */ if (skb->ip_summed == CHECKSUM_PARTIAL && (err = skb_csum_hwoffload_help(skb, 0))) goto out; /* Older versions of OVS user space enforce alignment of the last * Netlink attribute to NLA_ALIGNTO which would require extensive * padding logic. Only perform zerocopy if padding is not required. */ if (dp->user_features & OVS_DP_F_UNALIGNED) hlen = skb_zerocopy_headlen(skb); else hlen = skb->len; len = upcall_msg_size(upcall_info, hlen - cutlen, OVS_CB(skb)->acts_origlen); user_skb = genlmsg_new(len, GFP_ATOMIC); if (!user_skb) { err = -ENOMEM; goto out; } upcall = genlmsg_put(user_skb, 0, 0, &dp_packet_genl_family, 0, upcall_info->cmd); if (!upcall) { err = -EINVAL; goto out; } upcall->dp_ifindex = dp_ifindex; err = ovs_nla_put_key(key, key, OVS_PACKET_ATTR_KEY, false, user_skb); if (err) goto out; if (upcall_info->userdata) __nla_put(user_skb, OVS_PACKET_ATTR_USERDATA, nla_len(upcall_info->userdata), nla_data(upcall_info->userdata)); if (upcall_info->egress_tun_info) { nla = nla_nest_start_noflag(user_skb, OVS_PACKET_ATTR_EGRESS_TUN_KEY); if (!nla) { err = -EMSGSIZE; goto out; } err = ovs_nla_put_tunnel_info(user_skb, upcall_info->egress_tun_info); if (err) goto out; nla_nest_end(user_skb, nla); } if (upcall_info->actions_len) { nla = nla_nest_start_noflag(user_skb, OVS_PACKET_ATTR_ACTIONS); if (!nla) { err = -EMSGSIZE; goto out; } err = ovs_nla_put_actions(upcall_info->actions, upcall_info->actions_len, user_skb); if (!err) nla_nest_end(user_skb, nla); else nla_nest_cancel(user_skb, nla); } /* Add OVS_PACKET_ATTR_MRU */ if (upcall_info->mru && nla_put_u16(user_skb, OVS_PACKET_ATTR_MRU, upcall_info->mru)) { err = -ENOBUFS; goto out; } /* Add OVS_PACKET_ATTR_LEN when packet is truncated */ if (cutlen > 0 && nla_put_u32(user_skb, OVS_PACKET_ATTR_LEN, skb->len)) { err = -ENOBUFS; goto out; } /* Add OVS_PACKET_ATTR_HASH */ hash = skb_get_hash_raw(skb); if (skb->sw_hash) hash |= OVS_PACKET_HASH_SW_BIT; if (skb->l4_hash) hash |= OVS_PACKET_HASH_L4_BIT; if (nla_put(user_skb, OVS_PACKET_ATTR_HASH, sizeof (u64), &hash)) { err = -ENOBUFS; goto out; } /* Only reserve room for attribute header, packet data is added * in skb_zerocopy() */ if (!(nla = nla_reserve(user_skb, OVS_PACKET_ATTR_PACKET, 0))) { err = -ENOBUFS; goto out; } nla->nla_len = nla_attr_size(skb->len - cutlen); err = skb_zerocopy(user_skb, skb, skb->len - cutlen, hlen); if (err) goto out; /* Pad OVS_PACKET_ATTR_PACKET if linear copy was performed */ pad_packet(dp, user_skb); ((struct nlmsghdr *) user_skb->data)->nlmsg_len = user_skb->len; err = genlmsg_unicast(ovs_dp_get_net(dp), user_skb, upcall_info->portid); user_skb = NULL; out: if (err) skb_tx_error(skb); consume_skb(user_skb); consume_skb(nskb); return err; } static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info) { struct ovs_header *ovs_header = genl_info_userhdr(info); struct net *net = sock_net(skb->sk); struct nlattr **a = info->attrs; struct sw_flow_actions *acts; struct sk_buff *packet; struct sw_flow *flow; struct sw_flow_actions *sf_acts; struct datapath *dp; struct vport *input_vport; u16 mru = 0; u64 hash; int len; int err; bool log = !a[OVS_PACKET_ATTR_PROBE]; err = -EINVAL; if (!a[OVS_PACKET_ATTR_PACKET] || !a[OVS_PACKET_ATTR_KEY] || !a[OVS_PACKET_ATTR_ACTIONS]) goto err; len = nla_len(a[OVS_PACKET_ATTR_PACKET]); packet = __dev_alloc_skb(NET_IP_ALIGN + len, GFP_KERNEL); err = -ENOMEM; if (!packet) goto err; skb_reserve(packet, NET_IP_ALIGN); nla_memcpy(__skb_put(packet, len), a[OVS_PACKET_ATTR_PACKET], len); /* Set packet's mru */ if (a[OVS_PACKET_ATTR_MRU]) { mru = nla_get_u16(a[OVS_PACKET_ATTR_MRU]); packet->ignore_df = 1; } OVS_CB(packet)->mru = mru; if (a[OVS_PACKET_ATTR_HASH]) { hash = nla_get_u64(a[OVS_PACKET_ATTR_HASH]); __skb_set_hash(packet, hash & 0xFFFFFFFFULL, !!(hash & OVS_PACKET_HASH_SW_BIT), !!(hash & OVS_PACKET_HASH_L4_BIT)); } /* Build an sw_flow for sending this packet. */ flow = ovs_flow_alloc(); err = PTR_ERR(flow); if (IS_ERR(flow)) goto err_kfree_skb; err = ovs_flow_key_extract_userspace(net, a[OVS_PACKET_ATTR_KEY], packet, &flow->key, log); if (err) goto err_flow_free; err = ovs_nla_copy_actions(net, a[OVS_PACKET_ATTR_ACTIONS], &flow->key, &acts, log); if (err) goto err_flow_free; rcu_assign_pointer(flow->sf_acts, acts); packet->priority = flow->key.phy.priority; packet->mark = flow->key.phy.skb_mark; rcu_read_lock(); dp = get_dp_rcu(net, ovs_header->dp_ifindex); err = -ENODEV; if (!dp) goto err_unlock; input_vport = ovs_vport_rcu(dp, flow->key.phy.in_port); if (!input_vport) input_vport = ovs_vport_rcu(dp, OVSP_LOCAL); if (!input_vport) goto err_unlock; packet->dev = input_vport->dev; OVS_CB(packet)->input_vport = input_vport; sf_acts = rcu_dereference(flow->sf_acts); local_bh_disable(); err = ovs_execute_actions(dp, packet, sf_acts, &flow->key); local_bh_enable(); rcu_read_unlock(); ovs_flow_free(flow, false); return err; err_unlock: rcu_read_unlock(); err_flow_free: ovs_flow_free(flow, false); err_kfree_skb: kfree_skb(packet); err: return err; } static const struct nla_policy packet_policy[OVS_PACKET_ATTR_MAX + 1] = { [OVS_PACKET_ATTR_PACKET] = { .len = ETH_HLEN }, [OVS_PACKET_ATTR_KEY] = { .type = NLA_NESTED }, [OVS_PACKET_ATTR_ACTIONS] = { .type = NLA_NESTED }, [OVS_PACKET_ATTR_PROBE] = { .type = NLA_FLAG }, [OVS_PACKET_ATTR_MRU] = { .type = NLA_U16 }, [OVS_PACKET_ATTR_HASH] = { .type = NLA_U64 }, }; static const struct genl_small_ops dp_packet_genl_ops[] = { { .cmd = OVS_PACKET_CMD_EXECUTE, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ .doit = ovs_packet_cmd_execute } }; static struct genl_family dp_packet_genl_family __ro_after_init = { .hdrsize = sizeof(struct ovs_header), .name = OVS_PACKET_FAMILY, .version = OVS_PACKET_VERSION, .maxattr = OVS_PACKET_ATTR_MAX, .policy = packet_policy, .netnsok = true, .parallel_ops = true, .small_ops = dp_packet_genl_ops, .n_small_ops = ARRAY_SIZE(dp_packet_genl_ops), .resv_start_op = OVS_PACKET_CMD_EXECUTE + 1, .module = THIS_MODULE, }; static void get_dp_stats(const struct datapath *dp, struct ovs_dp_stats *stats, struct ovs_dp_megaflow_stats *mega_stats) { int i; memset(mega_stats, 0, sizeof(*mega_stats)); stats->n_flows = ovs_flow_tbl_count(&dp->table); mega_stats->n_masks = ovs_flow_tbl_num_masks(&dp->table); stats->n_hit = stats->n_missed = stats->n_lost = 0; for_each_possible_cpu(i) { const struct dp_stats_percpu *percpu_stats; struct dp_stats_percpu local_stats; unsigned int start; percpu_stats = per_cpu_ptr(dp->stats_percpu, i); do { start = u64_stats_fetch_begin(&percpu_stats->syncp); local_stats = *percpu_stats; } while (u64_stats_fetch_retry(&percpu_stats->syncp, start)); stats->n_hit += local_stats.n_hit; stats->n_missed += local_stats.n_missed; stats->n_lost += local_stats.n_lost; mega_stats->n_mask_hit += local_stats.n_mask_hit; mega_stats->n_cache_hit += local_stats.n_cache_hit; } } static bool should_fill_key(const struct sw_flow_id *sfid, uint32_t ufid_flags) { return ovs_identifier_is_ufid(sfid) && !(ufid_flags & OVS_UFID_F_OMIT_KEY); } static bool should_fill_mask(uint32_t ufid_flags) { return !(ufid_flags & OVS_UFID_F_OMIT_MASK); } static bool should_fill_actions(uint32_t ufid_flags) { return !(ufid_flags & OVS_UFID_F_OMIT_ACTIONS); } static size_t ovs_flow_cmd_msg_size(const struct sw_flow_actions *acts, const struct sw_flow_id *sfid, uint32_t ufid_flags) { size_t len = NLMSG_ALIGN(sizeof(struct ovs_header)); /* OVS_FLOW_ATTR_UFID, or unmasked flow key as fallback * see ovs_nla_put_identifier() */ if (sfid && ovs_identifier_is_ufid(sfid)) len += nla_total_size(sfid->ufid_len); else len += nla_total_size(ovs_key_attr_size()); /* OVS_FLOW_ATTR_KEY */ if (!sfid || should_fill_key(sfid, ufid_flags)) len += nla_total_size(ovs_key_attr_size()); /* OVS_FLOW_ATTR_MASK */ if (should_fill_mask(ufid_flags)) len += nla_total_size(ovs_key_attr_size()); /* OVS_FLOW_ATTR_ACTIONS */ if (should_fill_actions(ufid_flags)) len += nla_total_size(acts->orig_len); return len + nla_total_size_64bit(sizeof(struct ovs_flow_stats)) /* OVS_FLOW_ATTR_STATS */ + nla_total_size(1) /* OVS_FLOW_ATTR_TCP_FLAGS */ + nla_total_size_64bit(8); /* OVS_FLOW_ATTR_USED */ } /* Called with ovs_mutex or RCU read lock. */ static int ovs_flow_cmd_fill_stats(const struct sw_flow *flow, struct sk_buff *skb) { struct ovs_flow_stats stats; __be16 tcp_flags; unsigned long used; ovs_flow_stats_get(flow, &stats, &used, &tcp_flags); if (used && nla_put_u64_64bit(skb, OVS_FLOW_ATTR_USED, ovs_flow_used_time(used), OVS_FLOW_ATTR_PAD)) return -EMSGSIZE; if (stats.n_packets && nla_put_64bit(skb, OVS_FLOW_ATTR_STATS, sizeof(struct ovs_flow_stats), &stats, OVS_FLOW_ATTR_PAD)) return -EMSGSIZE; if ((u8)ntohs(tcp_flags) && nla_put_u8(skb, OVS_FLOW_ATTR_TCP_FLAGS, (u8)ntohs(tcp_flags))) return -EMSGSIZE; return 0; } /* Called with ovs_mutex or RCU read lock. */ static int ovs_flow_cmd_fill_actions(const struct sw_flow *flow, struct sk_buff *skb, int skb_orig_len) { struct nlattr *start; int err; /* If OVS_FLOW_ATTR_ACTIONS doesn't fit, skip dumping the actions if * this is the first flow to be dumped into 'skb'. This is unusual for * Netlink but individual action lists can be longer than * NLMSG_GOODSIZE and thus entirely undumpable if we didn't do this. * The userspace caller can always fetch the actions separately if it * really wants them. (Most userspace callers in fact don't care.) * * This can only fail for dump operations because the skb is always * properly sized for single flows. */ start = nla_nest_start_noflag(skb, OVS_FLOW_ATTR_ACTIONS); if (start) { const struct sw_flow_actions *sf_acts; sf_acts = rcu_dereference_ovsl(flow->sf_acts); err = ovs_nla_put_actions(sf_acts->actions, sf_acts->actions_len, skb); if (!err) nla_nest_end(skb, start); else { if (skb_orig_len) return err; nla_nest_cancel(skb, start); } } else if (skb_orig_len) { return -EMSGSIZE; } return 0; } /* Called with ovs_mutex or RCU read lock. */ static int ovs_flow_cmd_fill_info(const struct sw_flow *flow, int dp_ifindex, struct sk_buff *skb, u32 portid, u32 seq, u32 flags, u8 cmd, u32 ufid_flags) { const int skb_orig_len = skb->len; struct ovs_header *ovs_header; int err; ovs_header = genlmsg_put(skb, portid, seq, &dp_flow_genl_family, flags, cmd); if (!ovs_header) return -EMSGSIZE; ovs_header->dp_ifindex = dp_ifindex; err = ovs_nla_put_identifier(flow, skb); if (err) goto error; if (should_fill_key(&flow->id, ufid_flags)) { err = ovs_nla_put_masked_key(flow, skb); if (err) goto error; } if (should_fill_mask(ufid_flags)) { err = ovs_nla_put_mask(flow, skb); if (err) goto error; } err = ovs_flow_cmd_fill_stats(flow, skb); if (err) goto error; if (should_fill_actions(ufid_flags)) { err = ovs_flow_cmd_fill_actions(flow, skb, skb_orig_len); if (err) goto error; } genlmsg_end(skb, ovs_header); return 0; error: genlmsg_cancel(skb, ovs_header); return err; } /* May not be called with RCU read lock. */ static struct sk_buff *ovs_flow_cmd_alloc_info(const struct sw_flow_actions *acts, const struct sw_flow_id *sfid, struct genl_info *info, bool always, uint32_t ufid_flags) { struct sk_buff *skb; size_t len; if (!always && !ovs_must_notify(&dp_flow_genl_family, info, 0)) return NULL; len = ovs_flow_cmd_msg_size(acts, sfid, ufid_flags); skb = genlmsg_new(len, GFP_KERNEL); if (!skb) return ERR_PTR(-ENOMEM); return skb; } /* Called with ovs_mutex. */ static struct sk_buff *ovs_flow_cmd_build_info(const struct sw_flow *flow, int dp_ifindex, struct genl_info *info, u8 cmd, bool always, u32 ufid_flags) { struct sk_buff *skb; int retval; skb = ovs_flow_cmd_alloc_info(ovsl_dereference(flow->sf_acts), &flow->id, info, always, ufid_flags); if (IS_ERR_OR_NULL(skb)) return skb; retval = ovs_flow_cmd_fill_info(flow, dp_ifindex, skb, info->snd_portid, info->snd_seq, 0, cmd, ufid_flags); if (WARN_ON_ONCE(retval < 0)) { kfree_skb(skb); skb = ERR_PTR(retval); } return skb; } static int ovs_flow_cmd_new(struct sk_buff *skb, struct genl_info *info) { struct net *net = sock_net(skb->sk); struct nlattr **a = info->attrs; struct ovs_header *ovs_header = genl_info_userhdr(info); struct sw_flow *flow = NULL, *new_flow; struct sw_flow_mask mask; struct sk_buff *reply; struct datapath *dp; struct sw_flow_key *key; struct sw_flow_actions *acts; struct sw_flow_match match; u32 ufid_flags = ovs_nla_get_ufid_flags(a[OVS_FLOW_ATTR_UFID_FLAGS]); int error; bool log = !a[OVS_FLOW_ATTR_PROBE]; /* Must have key and actions. */ error = -EINVAL; if (!a[OVS_FLOW_ATTR_KEY]) { OVS_NLERR(log, "Flow key attr not present in new flow."); goto error; } if (!a[OVS_FLOW_ATTR_ACTIONS]) { OVS_NLERR(log, "Flow actions attr not present in new flow."); goto error; } /* Most of the time we need to allocate a new flow, do it before * locking. */ new_flow = ovs_flow_alloc(); if (IS_ERR(new_flow)) { error = PTR_ERR(new_flow); goto error; } /* Extract key. */ key = kzalloc(sizeof(*key), GFP_KERNEL); if (!key) { error = -ENOMEM; goto err_kfree_flow; } ovs_match_init(&match, key, false, &mask); error = ovs_nla_get_match(net, &match, a[OVS_FLOW_ATTR_KEY], a[OVS_FLOW_ATTR_MASK], log); if (error) goto err_kfree_key; ovs_flow_mask_key(&new_flow->key, key, true, &mask); /* Extract flow identifier. */ error = ovs_nla_get_identifier(&new_flow->id, a[OVS_FLOW_ATTR_UFID], key, log); if (error) goto err_kfree_key; /* Validate actions. */ error = ovs_nla_copy_actions(net, a[OVS_FLOW_ATTR_ACTIONS], &new_flow->key, &acts, log); if (error) { OVS_NLERR(log, "Flow actions may not be safe on all matching packets."); goto err_kfree_key; } reply = ovs_flow_cmd_alloc_info(acts, &new_flow->id, info, false, ufid_flags); if (IS_ERR(reply)) { error = PTR_ERR(reply); goto err_kfree_acts; } ovs_lock(); dp = get_dp(net, ovs_header->dp_ifindex); if (unlikely(!dp)) { error = -ENODEV; goto err_unlock_ovs; } /* Check if this is a duplicate flow */ if (ovs_identifier_is_ufid(&new_flow->id)) flow = ovs_flow_tbl_lookup_ufid(&dp->table, &new_flow->id); if (!flow) flow = ovs_flow_tbl_lookup(&dp->table, key); if (likely(!flow)) { rcu_assign_pointer(new_flow->sf_acts, acts); /* Put flow in bucket. */ error = ovs_flow_tbl_insert(&dp->table, new_flow, &mask); if (unlikely(error)) { acts = NULL; goto err_unlock_ovs; } if (unlikely(reply)) { error = ovs_flow_cmd_fill_info(new_flow, ovs_header->dp_ifindex, reply, info->snd_portid, info->snd_seq, 0, OVS_FLOW_CMD_NEW, ufid_flags); BUG_ON(error < 0); } ovs_unlock(); } else { struct sw_flow_actions *old_acts; /* Bail out if we're not allowed to modify an existing flow. * We accept NLM_F_CREATE in place of the intended NLM_F_EXCL * because Generic Netlink treats the latter as a dump * request. We also accept NLM_F_EXCL in case that bug ever * gets fixed. */ if (unlikely(info->nlhdr->nlmsg_flags & (NLM_F_CREATE | NLM_F_EXCL))) { error = -EEXIST; goto err_unlock_ovs; } /* The flow identifier has to be the same for flow updates. * Look for any overlapping flow. */ if (unlikely(!ovs_flow_cmp(flow, &match))) { if (ovs_identifier_is_key(&flow->id)) flow = ovs_flow_tbl_lookup_exact(&dp->table, &match); else /* UFID matches but key is different */ flow = NULL; if (!flow) { error = -ENOENT; goto err_unlock_ovs; } } /* Update actions. */ old_acts = ovsl_dereference(flow->sf_acts); rcu_assign_pointer(flow->sf_acts, acts); if (unlikely(reply)) { error = ovs_flow_cmd_fill_info(flow, ovs_header->dp_ifindex, reply, info->snd_portid, info->snd_seq, 0, OVS_FLOW_CMD_NEW, ufid_flags); BUG_ON(error < 0); } ovs_unlock(); ovs_nla_free_flow_actions_rcu(old_acts); ovs_flow_free(new_flow, false); } if (reply) ovs_notify(&dp_flow_genl_family, reply, info); kfree(key); return 0; err_unlock_ovs: ovs_unlock(); kfree_skb(reply); err_kfree_acts: ovs_nla_free_flow_actions(acts); err_kfree_key: kfree(key); err_kfree_flow: ovs_flow_free(new_flow, false); error: return error; } /* Factor out action copy to avoid "Wframe-larger-than=1024" warning. */ static noinline_for_stack struct sw_flow_actions *get_flow_actions(struct net *net, const struct nlattr *a, const struct sw_flow_key *key, const struct sw_flow_mask *mask, bool log) { struct sw_flow_actions *acts; struct sw_flow_key masked_key; int error; ovs_flow_mask_key(&masked_key, key, true, mask); error = ovs_nla_copy_actions(net, a, &masked_key, &acts, log); if (error) { OVS_NLERR(log, "Actions may not be safe on all matching packets"); return ERR_PTR(error); } return acts; } /* Factor out match-init and action-copy to avoid * "Wframe-larger-than=1024" warning. Because mask is only * used to get actions, we new a function to save some * stack space. * * If there are not key and action attrs, we return 0 * directly. In the case, the caller will also not use the * match as before. If there is action attr, we try to get * actions and save them to *acts. Before returning from * the function, we reset the match->mask pointer. Because * we should not to return match object with dangling reference * to mask. * */ static noinline_for_stack int ovs_nla_init_match_and_action(struct net *net, struct sw_flow_match *match, struct sw_flow_key *key, struct nlattr **a, struct sw_flow_actions **acts, bool log) { struct sw_flow_mask mask; int error = 0; if (a[OVS_FLOW_ATTR_KEY]) { ovs_match_init(match, key, true, &mask); error = ovs_nla_get_match(net, match, a[OVS_FLOW_ATTR_KEY], a[OVS_FLOW_ATTR_MASK], log); if (error) goto error; } if (a[OVS_FLOW_ATTR_ACTIONS]) { if (!a[OVS_FLOW_ATTR_KEY]) { OVS_NLERR(log, "Flow key attribute not present in set flow."); error = -EINVAL; goto error; } *acts = get_flow_actions(net, a[OVS_FLOW_ATTR_ACTIONS], key, &mask, log); if (IS_ERR(*acts)) { error = PTR_ERR(*acts); goto error; } } /* On success, error is 0. */ error: match->mask = NULL; return error; } static int ovs_flow_cmd_set(struct sk_buff *skb, struct genl_info *info) { struct net *net = sock_net(skb->sk); struct nlattr **a = info->attrs; struct ovs_header *ovs_header = genl_info_userhdr(info); struct sw_flow_key key; struct sw_flow *flow; struct sk_buff *reply = NULL; struct datapath *dp; struct sw_flow_actions *old_acts = NULL, *acts = NULL; struct sw_flow_match match; struct sw_flow_id sfid; u32 ufid_flags = ovs_nla_get_ufid_flags(a[OVS_FLOW_ATTR_UFID_FLAGS]); int error = 0; bool log = !a[OVS_FLOW_ATTR_PROBE]; bool ufid_present; ufid_present = ovs_nla_get_ufid(&sfid, a[OVS_FLOW_ATTR_UFID], log); if (!a[OVS_FLOW_ATTR_KEY] && !ufid_present) { OVS_NLERR(log, "Flow set message rejected, Key attribute missing."); return -EINVAL; } error = ovs_nla_init_match_and_action(net, &match, &key, a, &acts, log); if (error) goto error; if (acts) { /* Can allocate before locking if have acts. */ reply = ovs_flow_cmd_alloc_info(acts, &sfid, info, false, ufid_flags); if (IS_ERR(reply)) { error = PTR_ERR(reply); goto err_kfree_acts; } } ovs_lock(); dp = get_dp(net, ovs_header->dp_ifindex); if (unlikely(!dp)) { error = -ENODEV; goto err_unlock_ovs; } /* Check that the flow exists. */ if (ufid_present) flow = ovs_flow_tbl_lookup_ufid(&dp->table, &sfid); else flow = ovs_flow_tbl_lookup_exact(&dp->table, &match); if (unlikely(!flow)) { error = -ENOENT; goto err_unlock_ovs; } /* Update actions, if present. */ if (likely(acts)) { old_acts = ovsl_dereference(flow->sf_acts); rcu_assign_pointer(flow->sf_acts, acts); if (unlikely(reply)) { error = ovs_flow_cmd_fill_info(flow, ovs_header->dp_ifindex, reply, info->snd_portid, info->snd_seq, 0, OVS_FLOW_CMD_SET, ufid_flags); BUG_ON(error < 0); } } else { /* Could not alloc without acts before locking. */ reply = ovs_flow_cmd_build_info(flow, ovs_header->dp_ifindex, info, OVS_FLOW_CMD_SET, false, ufid_flags); if (IS_ERR(reply)) { error = PTR_ERR(reply); goto err_unlock_ovs; } } /* Clear stats. */ if (a[OVS_FLOW_ATTR_CLEAR]) ovs_flow_stats_clear(flow); ovs_unlock(); if (reply) ovs_notify(&dp_flow_genl_family, reply, info); if (old_acts) ovs_nla_free_flow_actions_rcu(old_acts); return 0; err_unlock_ovs: ovs_unlock(); kfree_skb(reply); err_kfree_acts: ovs_nla_free_flow_actions(acts); error: return error; } static int ovs_flow_cmd_get(struct sk_buff *skb, struct genl_info *info) { struct nlattr **a = info->attrs; struct ovs_header *ovs_header = genl_info_userhdr(info); struct net *net = sock_net(skb->sk); struct sw_flow_key key; struct sk_buff *reply; struct sw_flow *flow; struct datapath *dp; struct sw_flow_match match; struct sw_flow_id ufid; u32 ufid_flags = ovs_nla_get_ufid_flags(a[OVS_FLOW_ATTR_UFID_FLAGS]); int err = 0; bool log = !a[OVS_FLOW_ATTR_PROBE]; bool ufid_present; ufid_present = ovs_nla_get_ufid(&ufid, a[OVS_FLOW_ATTR_UFID], log); if (a[OVS_FLOW_ATTR_KEY]) { ovs_match_init(&match, &key, true, NULL); err = ovs_nla_get_match(net, &match, a[OVS_FLOW_ATTR_KEY], NULL, log); } else if (!ufid_present) { OVS_NLERR(log, "Flow get message rejected, Key attribute missing."); err = -EINVAL; } if (err) return err; ovs_lock(); dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex); if (!dp) { err = -ENODEV; goto unlock; } if (ufid_present) flow = ovs_flow_tbl_lookup_ufid(&dp->table, &ufid); else flow = ovs_flow_tbl_lookup_exact(&dp->table, &match); if (!flow) { err = -ENOENT; goto unlock; } reply = ovs_flow_cmd_build_info(flow, ovs_header->dp_ifindex, info, OVS_FLOW_CMD_GET, true, ufid_flags); if (IS_ERR(reply)) { err = PTR_ERR(reply); goto unlock; } ovs_unlock(); return genlmsg_reply(reply, info); unlock: ovs_unlock(); return err; } static int ovs_flow_cmd_del(struct sk_buff *skb, struct genl_info *info) { struct nlattr **a = info->attrs; struct ovs_header *ovs_header = genl_info_userhdr(info); struct net *net = sock_net(skb->sk); struct sw_flow_key key; struct sk_buff *reply; struct sw_flow *flow = NULL; struct datapath *dp; struct sw_flow_match match; struct sw_flow_id ufid; u32 ufid_flags = ovs_nla_get_ufid_flags(a[OVS_FLOW_ATTR_UFID_FLAGS]); int err; bool log = !a[OVS_FLOW_ATTR_PROBE]; bool ufid_present; ufid_present = ovs_nla_get_ufid(&ufid, a[OVS_FLOW_ATTR_UFID], log); if (a[OVS_FLOW_ATTR_KEY]) { ovs_match_init(&match, &key, true, NULL); err = ovs_nla_get_match(net, &match, a[OVS_FLOW_ATTR_KEY], NULL, log); if (unlikely(err)) return err; } ovs_lock(); dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex); if (unlikely(!dp)) { err = -ENODEV; goto unlock; } if (unlikely(!a[OVS_FLOW_ATTR_KEY] && !ufid_present)) { err = ovs_flow_tbl_flush(&dp->table); goto unlock; } if (ufid_present) flow = ovs_flow_tbl_lookup_ufid(&dp->table, &ufid); else flow = ovs_flow_tbl_lookup_exact(&dp->table, &match); if (unlikely(!flow)) { err = -ENOENT; goto unlock; } ovs_flow_tbl_remove(&dp->table, flow); ovs_unlock(); reply = ovs_flow_cmd_alloc_info((const struct sw_flow_actions __force *) flow->sf_acts, &flow->id, info, false, ufid_flags); if (likely(reply)) { if (!IS_ERR(reply)) { rcu_read_lock(); /*To keep RCU checker happy. */ err = ovs_flow_cmd_fill_info(flow, ovs_header->dp_ifindex, reply, info->snd_portid, info->snd_seq, 0, OVS_FLOW_CMD_DEL, ufid_flags); rcu_read_unlock(); if (WARN_ON_ONCE(err < 0)) { kfree_skb(reply); goto out_free; } ovs_notify(&dp_flow_genl_family, reply, info); } else { netlink_set_err(sock_net(skb->sk)->genl_sock, 0, 0, PTR_ERR(reply)); } } out_free: ovs_flow_free(flow, true); return 0; unlock: ovs_unlock(); return err; } static int ovs_flow_cmd_dump(struct sk_buff *skb, struct netlink_callback *cb) { struct nlattr *a[__OVS_FLOW_ATTR_MAX]; struct ovs_header *ovs_header = genlmsg_data(nlmsg_data(cb->nlh)); struct table_instance *ti; struct datapath *dp; u32 ufid_flags; int err; err = genlmsg_parse_deprecated(cb->nlh, &dp_flow_genl_family, a, OVS_FLOW_ATTR_MAX, flow_policy, NULL); if (err) return err; ufid_flags = ovs_nla_get_ufid_flags(a[OVS_FLOW_ATTR_UFID_FLAGS]); rcu_read_lock(); dp = get_dp_rcu(sock_net(skb->sk), ovs_header->dp_ifindex); if (!dp) { rcu_read_unlock(); return -ENODEV; } ti = rcu_dereference(dp->table.ti); for (;;) { struct sw_flow *flow; u32 bucket, obj; bucket = cb->args[0]; obj = cb->args[1]; flow = ovs_flow_tbl_dump_next(ti, &bucket, &obj); if (!flow) break; if (ovs_flow_cmd_fill_info(flow, ovs_header->dp_ifindex, skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, OVS_FLOW_CMD_GET, ufid_flags) < 0) break; cb->args[0] = bucket; cb->args[1] = obj; } rcu_read_unlock(); return skb->len; } static const struct nla_policy flow_policy[OVS_FLOW_ATTR_MAX + 1] = { [OVS_FLOW_ATTR_KEY] = { .type = NLA_NESTED }, [OVS_FLOW_ATTR_MASK] = { .type = NLA_NESTED }, [OVS_FLOW_ATTR_ACTIONS] = { .type = NLA_NESTED }, [OVS_FLOW_ATTR_CLEAR] = { .type = NLA_FLAG }, [OVS_FLOW_ATTR_PROBE] = { .type = NLA_FLAG }, [OVS_FLOW_ATTR_UFID] = { .type = NLA_UNSPEC, .len = 1 }, [OVS_FLOW_ATTR_UFID_FLAGS] = { .type = NLA_U32 }, }; static const struct genl_small_ops dp_flow_genl_ops[] = { { .cmd = OVS_FLOW_CMD_NEW, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ .doit = ovs_flow_cmd_new }, { .cmd = OVS_FLOW_CMD_DEL, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ .doit = ovs_flow_cmd_del }, { .cmd = OVS_FLOW_CMD_GET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = 0, /* OK for unprivileged users. */ .doit = ovs_flow_cmd_get, .dumpit = ovs_flow_cmd_dump }, { .cmd = OVS_FLOW_CMD_SET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ .doit = ovs_flow_cmd_set, }, }; static struct genl_family dp_flow_genl_family __ro_after_init = { .hdrsize = sizeof(struct ovs_header), .name = OVS_FLOW_FAMILY, .version = OVS_FLOW_VERSION, .maxattr = OVS_FLOW_ATTR_MAX, .policy = flow_policy, .netnsok = true, .parallel_ops = true, .small_ops = dp_flow_genl_ops, .n_small_ops = ARRAY_SIZE(dp_flow_genl_ops), .resv_start_op = OVS_FLOW_CMD_SET + 1, .mcgrps = &ovs_dp_flow_multicast_group, .n_mcgrps = 1, .module = THIS_MODULE, }; static size_t ovs_dp_cmd_msg_size(void) { size_t msgsize = NLMSG_ALIGN(sizeof(struct ovs_header)); msgsize += nla_total_size(IFNAMSIZ); msgsize += nla_total_size_64bit(sizeof(struct ovs_dp_stats)); msgsize += nla_total_size_64bit(sizeof(struct ovs_dp_megaflow_stats)); msgsize += nla_total_size(sizeof(u32)); /* OVS_DP_ATTR_USER_FEATURES */ msgsize += nla_total_size(sizeof(u32)); /* OVS_DP_ATTR_MASKS_CACHE_SIZE */ msgsize += nla_total_size(sizeof(u32) * nr_cpu_ids); /* OVS_DP_ATTR_PER_CPU_PIDS */ return msgsize; } /* Called with ovs_mutex. */ static int ovs_dp_cmd_fill_info(struct datapath *dp, struct sk_buff *skb, u32 portid, u32 seq, u32 flags, u8 cmd) { struct ovs_header *ovs_header; struct ovs_dp_stats dp_stats; struct ovs_dp_megaflow_stats dp_megaflow_stats; struct dp_nlsk_pids *pids = ovsl_dereference(dp->upcall_portids); int err, pids_len; ovs_header = genlmsg_put(skb, portid, seq, &dp_datapath_genl_family, flags, cmd); if (!ovs_header) goto error; ovs_header->dp_ifindex = get_dpifindex(dp); err = nla_put_string(skb, OVS_DP_ATTR_NAME, ovs_dp_name(dp)); if (err) goto nla_put_failure; get_dp_stats(dp, &dp_stats, &dp_megaflow_stats); if (nla_put_64bit(skb, OVS_DP_ATTR_STATS, sizeof(struct ovs_dp_stats), &dp_stats, OVS_DP_ATTR_PAD)) goto nla_put_failure; if (nla_put_64bit(skb, OVS_DP_ATTR_MEGAFLOW_STATS, sizeof(struct ovs_dp_megaflow_stats), &dp_megaflow_stats, OVS_DP_ATTR_PAD)) goto nla_put_failure; if (nla_put_u32(skb, OVS_DP_ATTR_USER_FEATURES, dp->user_features)) goto nla_put_failure; if (nla_put_u32(skb, OVS_DP_ATTR_MASKS_CACHE_SIZE, ovs_flow_tbl_masks_cache_size(&dp->table))) goto nla_put_failure; if (dp->user_features & OVS_DP_F_DISPATCH_UPCALL_PER_CPU && pids) { pids_len = min(pids->n_pids, nr_cpu_ids) * sizeof(u32); if (nla_put(skb, OVS_DP_ATTR_PER_CPU_PIDS, pids_len, &pids->pids)) goto nla_put_failure; } genlmsg_end(skb, ovs_header); return 0; nla_put_failure: genlmsg_cancel(skb, ovs_header); error: return -EMSGSIZE; } static struct sk_buff *ovs_dp_cmd_alloc_info(void) { return genlmsg_new(ovs_dp_cmd_msg_size(), GFP_KERNEL); } /* Called with rcu_read_lock or ovs_mutex. */ static struct datapath *lookup_datapath(struct net *net, const struct ovs_header *ovs_header, struct nlattr *a[OVS_DP_ATTR_MAX + 1]) { struct datapath *dp; if (!a[OVS_DP_ATTR_NAME]) dp = get_dp(net, ovs_header->dp_ifindex); else { struct vport *vport; vport = ovs_vport_locate(net, nla_data(a[OVS_DP_ATTR_NAME])); dp = vport && vport->port_no == OVSP_LOCAL ? vport->dp : NULL; } return dp ? dp : ERR_PTR(-ENODEV); } static void ovs_dp_reset_user_features(struct sk_buff *skb, struct genl_info *info) { struct datapath *dp; dp = lookup_datapath(sock_net(skb->sk), genl_info_userhdr(info), info->attrs); if (IS_ERR(dp)) return; pr_warn("%s: Dropping previously announced user features\n", ovs_dp_name(dp)); dp->user_features = 0; } static int ovs_dp_set_upcall_portids(struct datapath *dp, const struct nlattr *ids) { struct dp_nlsk_pids *old, *dp_nlsk_pids; if (!nla_len(ids) || nla_len(ids) % sizeof(u32)) return -EINVAL; old = ovsl_dereference(dp->upcall_portids); dp_nlsk_pids = kmalloc(sizeof(*dp_nlsk_pids) + nla_len(ids), GFP_KERNEL); if (!dp_nlsk_pids) return -ENOMEM; dp_nlsk_pids->n_pids = nla_len(ids) / sizeof(u32); nla_memcpy(dp_nlsk_pids->pids, ids, nla_len(ids)); rcu_assign_pointer(dp->upcall_portids, dp_nlsk_pids); kfree_rcu(old, rcu); return 0; } u32 ovs_dp_get_upcall_portid(const struct datapath *dp, uint32_t cpu_id) { struct dp_nlsk_pids *dp_nlsk_pids; dp_nlsk_pids = rcu_dereference(dp->upcall_portids); if (dp_nlsk_pids) { if (cpu_id < dp_nlsk_pids->n_pids) { return dp_nlsk_pids->pids[cpu_id]; } else if (dp_nlsk_pids->n_pids > 0 && cpu_id >= dp_nlsk_pids->n_pids) { /* If the number of netlink PIDs is mismatched with * the number of CPUs as seen by the kernel, log this * and send the upcall to an arbitrary socket (0) in * order to not drop packets */ pr_info_ratelimited("cpu_id mismatch with handler threads"); return dp_nlsk_pids->pids[cpu_id % dp_nlsk_pids->n_pids]; } else { return 0; } } else { return 0; } } static int ovs_dp_change(struct datapath *dp, struct nlattr *a[]) { u32 user_features = 0, old_features = dp->user_features; int err; if (a[OVS_DP_ATTR_USER_FEATURES]) { user_features = nla_get_u32(a[OVS_DP_ATTR_USER_FEATURES]); if (user_features & ~(OVS_DP_F_VPORT_PIDS | OVS_DP_F_UNALIGNED | OVS_DP_F_TC_RECIRC_SHARING | OVS_DP_F_DISPATCH_UPCALL_PER_CPU)) return -EOPNOTSUPP; #if !IS_ENABLED(CONFIG_NET_TC_SKB_EXT) if (user_features & OVS_DP_F_TC_RECIRC_SHARING) return -EOPNOTSUPP; #endif } if (a[OVS_DP_ATTR_MASKS_CACHE_SIZE]) { int err; u32 cache_size; cache_size = nla_get_u32(a[OVS_DP_ATTR_MASKS_CACHE_SIZE]); err = ovs_flow_tbl_masks_cache_resize(&dp->table, cache_size); if (err) return err; } dp->user_features = user_features; if (dp->user_features & OVS_DP_F_DISPATCH_UPCALL_PER_CPU && a[OVS_DP_ATTR_PER_CPU_PIDS]) { /* Upcall Netlink Port IDs have been updated */ err = ovs_dp_set_upcall_portids(dp, a[OVS_DP_ATTR_PER_CPU_PIDS]); if (err) return err; } if ((dp->user_features & OVS_DP_F_TC_RECIRC_SHARING) && !(old_features & OVS_DP_F_TC_RECIRC_SHARING)) tc_skb_ext_tc_enable(); else if (!(dp->user_features & OVS_DP_F_TC_RECIRC_SHARING) && (old_features & OVS_DP_F_TC_RECIRC_SHARING)) tc_skb_ext_tc_disable(); return 0; } static int ovs_dp_stats_init(struct datapath *dp) { dp->stats_percpu = netdev_alloc_pcpu_stats(struct dp_stats_percpu); if (!dp->stats_percpu) return -ENOMEM; return 0; } static int ovs_dp_vport_init(struct datapath *dp) { int i; dp->ports = kmalloc_array(DP_VPORT_HASH_BUCKETS, sizeof(struct hlist_head), GFP_KERNEL); if (!dp->ports) return -ENOMEM; for (i = 0; i < DP_VPORT_HASH_BUCKETS; i++) INIT_HLIST_HEAD(&dp->ports[i]); return 0; } static int ovs_dp_cmd_new(struct sk_buff *skb, struct genl_info *info) { struct nlattr **a = info->attrs; struct vport_parms parms; struct sk_buff *reply; struct datapath *dp; struct vport *vport; struct ovs_net *ovs_net; int err; err = -EINVAL; if (!a[OVS_DP_ATTR_NAME] || !a[OVS_DP_ATTR_UPCALL_PID]) goto err; reply = ovs_dp_cmd_alloc_info(); if (!reply) return -ENOMEM; err = -ENOMEM; dp = kzalloc(sizeof(*dp), GFP_KERNEL); if (dp == NULL) goto err_destroy_reply; ovs_dp_set_net(dp, sock_net(skb->sk)); /* Allocate table. */ err = ovs_flow_tbl_init(&dp->table); if (err) goto err_destroy_dp; err = ovs_dp_stats_init(dp); if (err) goto err_destroy_table; err = ovs_dp_vport_init(dp); if (err) goto err_destroy_stats; err = ovs_meters_init(dp); if (err) goto err_destroy_ports; /* Set up our datapath device. */ parms.name = nla_data(a[OVS_DP_ATTR_NAME]); parms.type = OVS_VPORT_TYPE_INTERNAL; parms.options = NULL; parms.dp = dp; parms.port_no = OVSP_LOCAL; parms.upcall_portids = a[OVS_DP_ATTR_UPCALL_PID]; parms.desired_ifindex = a[OVS_DP_ATTR_IFINDEX] ? nla_get_s32(a[OVS_DP_ATTR_IFINDEX]) : 0; /* So far only local changes have been made, now need the lock. */ ovs_lock(); err = ovs_dp_change(dp, a); if (err) goto err_unlock_and_destroy_meters; vport = new_vport(&parms); if (IS_ERR(vport)) { err = PTR_ERR(vport); if (err == -EBUSY) err = -EEXIST; if (err == -EEXIST) { /* An outdated user space instance that does not understand * the concept of user_features has attempted to create a new * datapath and is likely to reuse it. Drop all user features. */ if (info->genlhdr->version < OVS_DP_VER_FEATURES) ovs_dp_reset_user_features(skb, info); } goto err_destroy_portids; } err = ovs_dp_cmd_fill_info(dp, reply, info->snd_portid, info->snd_seq, 0, OVS_DP_CMD_NEW); BUG_ON(err < 0); ovs_net = net_generic(ovs_dp_get_net(dp), ovs_net_id); list_add_tail_rcu(&dp->list_node, &ovs_net->dps); ovs_unlock(); ovs_notify(&dp_datapath_genl_family, reply, info); return 0; err_destroy_portids: kfree(rcu_dereference_raw(dp->upcall_portids)); err_unlock_and_destroy_meters: ovs_unlock(); ovs_meters_exit(dp); err_destroy_ports: kfree(dp->ports); err_destroy_stats: free_percpu(dp->stats_percpu); err_destroy_table: ovs_flow_tbl_destroy(&dp->table); err_destroy_dp: kfree(dp); err_destroy_reply: kfree_skb(reply); err: return err; } /* Called with ovs_mutex. */ static void __dp_destroy(struct datapath *dp) { struct flow_table *table = &dp->table; int i; if (dp->user_features & OVS_DP_F_TC_RECIRC_SHARING) tc_skb_ext_tc_disable(); for (i = 0; i < DP_VPORT_HASH_BUCKETS; i++) { struct vport *vport; struct hlist_node *n; hlist_for_each_entry_safe(vport, n, &dp->ports[i], dp_hash_node) if (vport->port_no != OVSP_LOCAL) ovs_dp_detach_port(vport); } list_del_rcu(&dp->list_node); /* OVSP_LOCAL is datapath internal port. We need to make sure that * all ports in datapath are destroyed first before freeing datapath. */ ovs_dp_detach_port(ovs_vport_ovsl(dp, OVSP_LOCAL)); /* Flush sw_flow in the tables. RCU cb only releases resource * such as dp, ports and tables. That may avoid some issues * such as RCU usage warning. */ table_instance_flow_flush(table, ovsl_dereference(table->ti), ovsl_dereference(table->ufid_ti)); /* RCU destroy the ports, meters and flow tables. */ call_rcu(&dp->rcu, destroy_dp_rcu); } static int ovs_dp_cmd_del(struct sk_buff *skb, struct genl_info *info) { struct sk_buff *reply; struct datapath *dp; int err; reply = ovs_dp_cmd_alloc_info(); if (!reply) return -ENOMEM; ovs_lock(); dp = lookup_datapath(sock_net(skb->sk), genl_info_userhdr(info), info->attrs); err = PTR_ERR(dp); if (IS_ERR(dp)) goto err_unlock_free; err = ovs_dp_cmd_fill_info(dp, reply, info->snd_portid, info->snd_seq, 0, OVS_DP_CMD_DEL); BUG_ON(err < 0); __dp_destroy(dp); ovs_unlock(); ovs_notify(&dp_datapath_genl_family, reply, info); return 0; err_unlock_free: ovs_unlock(); kfree_skb(reply); return err; } static int ovs_dp_cmd_set(struct sk_buff *skb, struct genl_info *info) { struct sk_buff *reply; struct datapath *dp; int err; reply = ovs_dp_cmd_alloc_info(); if (!reply) return -ENOMEM; ovs_lock(); dp = lookup_datapath(sock_net(skb->sk), genl_info_userhdr(info), info->attrs); err = PTR_ERR(dp); if (IS_ERR(dp)) goto err_unlock_free; err = ovs_dp_change(dp, info->attrs); if (err) goto err_unlock_free; err = ovs_dp_cmd_fill_info(dp, reply, info->snd_portid, info->snd_seq, 0, OVS_DP_CMD_SET); BUG_ON(err < 0); ovs_unlock(); ovs_notify(&dp_datapath_genl_family, reply, info); return 0; err_unlock_free: ovs_unlock(); kfree_skb(reply); return err; } static int ovs_dp_cmd_get(struct sk_buff *skb, struct genl_info *info) { struct sk_buff *reply; struct datapath *dp; int err; reply = ovs_dp_cmd_alloc_info(); if (!reply) return -ENOMEM; ovs_lock(); dp = lookup_datapath(sock_net(skb->sk), genl_info_userhdr(info), info->attrs); if (IS_ERR(dp)) { err = PTR_ERR(dp); goto err_unlock_free; } err = ovs_dp_cmd_fill_info(dp, reply, info->snd_portid, info->snd_seq, 0, OVS_DP_CMD_GET); BUG_ON(err < 0); ovs_unlock(); return genlmsg_reply(reply, info); err_unlock_free: ovs_unlock(); kfree_skb(reply); return err; } static int ovs_dp_cmd_dump(struct sk_buff *skb, struct netlink_callback *cb) { struct ovs_net *ovs_net = net_generic(sock_net(skb->sk), ovs_net_id); struct datapath *dp; int skip = cb->args[0]; int i = 0; ovs_lock(); list_for_each_entry(dp, &ovs_net->dps, list_node) { if (i >= skip && ovs_dp_cmd_fill_info(dp, skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, OVS_DP_CMD_GET) < 0) break; i++; } ovs_unlock(); cb->args[0] = i; return skb->len; } static const struct nla_policy datapath_policy[OVS_DP_ATTR_MAX + 1] = { [OVS_DP_ATTR_NAME] = { .type = NLA_NUL_STRING, .len = IFNAMSIZ - 1 }, [OVS_DP_ATTR_UPCALL_PID] = { .type = NLA_U32 }, [OVS_DP_ATTR_USER_FEATURES] = { .type = NLA_U32 }, [OVS_DP_ATTR_MASKS_CACHE_SIZE] = NLA_POLICY_RANGE(NLA_U32, 0, PCPU_MIN_UNIT_SIZE / sizeof(struct mask_cache_entry)), [OVS_DP_ATTR_IFINDEX] = NLA_POLICY_MIN(NLA_S32, 0), }; static const struct genl_small_ops dp_datapath_genl_ops[] = { { .cmd = OVS_DP_CMD_NEW, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ .doit = ovs_dp_cmd_new }, { .cmd = OVS_DP_CMD_DEL, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ .doit = ovs_dp_cmd_del }, { .cmd = OVS_DP_CMD_GET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = 0, /* OK for unprivileged users. */ .doit = ovs_dp_cmd_get, .dumpit = ovs_dp_cmd_dump }, { .cmd = OVS_DP_CMD_SET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ .doit = ovs_dp_cmd_set, }, }; static struct genl_family dp_datapath_genl_family __ro_after_init = { .hdrsize = sizeof(struct ovs_header), .name = OVS_DATAPATH_FAMILY, .version = OVS_DATAPATH_VERSION, .maxattr = OVS_DP_ATTR_MAX, .policy = datapath_policy, .netnsok = true, .parallel_ops = true, .small_ops = dp_datapath_genl_ops, .n_small_ops = ARRAY_SIZE(dp_datapath_genl_ops), .resv_start_op = OVS_DP_CMD_SET + 1, .mcgrps = &ovs_dp_datapath_multicast_group, .n_mcgrps = 1, .module = THIS_MODULE, }; /* Called with ovs_mutex or RCU read lock. */ static int ovs_vport_cmd_fill_info(struct vport *vport, struct sk_buff *skb, struct net *net, u32 portid, u32 seq, u32 flags, u8 cmd, gfp_t gfp) { struct ovs_header *ovs_header; struct ovs_vport_stats vport_stats; int err; ovs_header = genlmsg_put(skb, portid, seq, &dp_vport_genl_family, flags, cmd); if (!ovs_header) return -EMSGSIZE; ovs_header->dp_ifindex = get_dpifindex(vport->dp); if (nla_put_u32(skb, OVS_VPORT_ATTR_PORT_NO, vport->port_no) || nla_put_u32(skb, OVS_VPORT_ATTR_TYPE, vport->ops->type) || nla_put_string(skb, OVS_VPORT_ATTR_NAME, ovs_vport_name(vport)) || nla_put_u32(skb, OVS_VPORT_ATTR_IFINDEX, vport->dev->ifindex)) goto nla_put_failure; if (!net_eq(net, dev_net(vport->dev))) { int id = peernet2id_alloc(net, dev_net(vport->dev), gfp); if (nla_put_s32(skb, OVS_VPORT_ATTR_NETNSID, id)) goto nla_put_failure; } ovs_vport_get_stats(vport, &vport_stats); if (nla_put_64bit(skb, OVS_VPORT_ATTR_STATS, sizeof(struct ovs_vport_stats), &vport_stats, OVS_VPORT_ATTR_PAD)) goto nla_put_failure; if (ovs_vport_get_upcall_stats(vport, skb)) goto nla_put_failure; if (ovs_vport_get_upcall_portids(vport, skb)) goto nla_put_failure; err = ovs_vport_get_options(vport, skb); if (err == -EMSGSIZE) goto error; genlmsg_end(skb, ovs_header); return 0; nla_put_failure: err = -EMSGSIZE; error: genlmsg_cancel(skb, ovs_header); return err; } static struct sk_buff *ovs_vport_cmd_alloc_info(void) { return nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); } /* Called with ovs_mutex, only via ovs_dp_notify_wq(). */ struct sk_buff *ovs_vport_cmd_build_info(struct vport *vport, struct net *net, u32 portid, u32 seq, u8 cmd) { struct sk_buff *skb; int retval; skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!skb) return ERR_PTR(-ENOMEM); retval = ovs_vport_cmd_fill_info(vport, skb, net, portid, seq, 0, cmd, GFP_KERNEL); BUG_ON(retval < 0); return skb; } /* Called with ovs_mutex or RCU read lock. */ static struct vport *lookup_vport(struct net *net, const struct ovs_header *ovs_header, struct nlattr *a[OVS_VPORT_ATTR_MAX + 1]) { struct datapath *dp; struct vport *vport; if (a[OVS_VPORT_ATTR_IFINDEX]) return ERR_PTR(-EOPNOTSUPP); if (a[OVS_VPORT_ATTR_NAME]) { vport = ovs_vport_locate(net, nla_data(a[OVS_VPORT_ATTR_NAME])); if (!vport) return ERR_PTR(-ENODEV); if (ovs_header->dp_ifindex && ovs_header->dp_ifindex != get_dpifindex(vport->dp)) return ERR_PTR(-ENODEV); return vport; } else if (a[OVS_VPORT_ATTR_PORT_NO]) { u32 port_no = nla_get_u32(a[OVS_VPORT_ATTR_PORT_NO]); if (port_no >= DP_MAX_PORTS) return ERR_PTR(-EFBIG); dp = get_dp(net, ovs_header->dp_ifindex); if (!dp) return ERR_PTR(-ENODEV); vport = ovs_vport_ovsl_rcu(dp, port_no); if (!vport) return ERR_PTR(-ENODEV); return vport; } else return ERR_PTR(-EINVAL); } static unsigned int ovs_get_max_headroom(struct datapath *dp) { unsigned int dev_headroom, max_headroom = 0; struct net_device *dev; struct vport *vport; int i; for (i = 0; i < DP_VPORT_HASH_BUCKETS; i++) { hlist_for_each_entry_rcu(vport, &dp->ports[i], dp_hash_node, lockdep_ovsl_is_held()) { dev = vport->dev; dev_headroom = netdev_get_fwd_headroom(dev); if (dev_headroom > max_headroom) max_headroom = dev_headroom; } } return max_headroom; } /* Called with ovs_mutex */ static void ovs_update_headroom(struct datapath *dp, unsigned int new_headroom) { struct vport *vport; int i; dp->max_headroom = new_headroom; for (i = 0; i < DP_VPORT_HASH_BUCKETS; i++) { hlist_for_each_entry_rcu(vport, &dp->ports[i], dp_hash_node, lockdep_ovsl_is_held()) netdev_set_rx_headroom(vport->dev, new_headroom); } } static int ovs_vport_cmd_new(struct sk_buff *skb, struct genl_info *info) { struct nlattr **a = info->attrs; struct ovs_header *ovs_header = genl_info_userhdr(info); struct vport_parms parms; struct sk_buff *reply; struct vport *vport; struct datapath *dp; unsigned int new_headroom; u32 port_no; int err; if (!a[OVS_VPORT_ATTR_NAME] || !a[OVS_VPORT_ATTR_TYPE] || !a[OVS_VPORT_ATTR_UPCALL_PID]) return -EINVAL; parms.type = nla_get_u32(a[OVS_VPORT_ATTR_TYPE]); if (a[OVS_VPORT_ATTR_IFINDEX] && parms.type != OVS_VPORT_TYPE_INTERNAL) return -EOPNOTSUPP; port_no = a[OVS_VPORT_ATTR_PORT_NO] ? nla_get_u32(a[OVS_VPORT_ATTR_PORT_NO]) : 0; if (port_no >= DP_MAX_PORTS) return -EFBIG; reply = ovs_vport_cmd_alloc_info(); if (!reply) return -ENOMEM; ovs_lock(); restart: dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex); err = -ENODEV; if (!dp) goto exit_unlock_free; if (port_no) { vport = ovs_vport_ovsl(dp, port_no); err = -EBUSY; if (vport) goto exit_unlock_free; } else { for (port_no = 1; ; port_no++) { if (port_no >= DP_MAX_PORTS) { err = -EFBIG; goto exit_unlock_free; } vport = ovs_vport_ovsl(dp, port_no); if (!vport) break; } } parms.name = nla_data(a[OVS_VPORT_ATTR_NAME]); parms.options = a[OVS_VPORT_ATTR_OPTIONS]; parms.dp = dp; parms.port_no = port_no; parms.upcall_portids = a[OVS_VPORT_ATTR_UPCALL_PID]; parms.desired_ifindex = a[OVS_VPORT_ATTR_IFINDEX] ? nla_get_s32(a[OVS_VPORT_ATTR_IFINDEX]) : 0; vport = new_vport(&parms); err = PTR_ERR(vport); if (IS_ERR(vport)) { if (err == -EAGAIN) goto restart; goto exit_unlock_free; } err = ovs_vport_cmd_fill_info(vport, reply, genl_info_net(info), info->snd_portid, info->snd_seq, 0, OVS_VPORT_CMD_NEW, GFP_KERNEL); new_headroom = netdev_get_fwd_headroom(vport->dev); if (new_headroom > dp->max_headroom) ovs_update_headroom(dp, new_headroom); else netdev_set_rx_headroom(vport->dev, dp->max_headroom); BUG_ON(err < 0); ovs_unlock(); ovs_notify(&dp_vport_genl_family, reply, info); return 0; exit_unlock_free: ovs_unlock(); kfree_skb(reply); return err; } static int ovs_vport_cmd_set(struct sk_buff *skb, struct genl_info *info) { struct nlattr **a = info->attrs; struct sk_buff *reply; struct vport *vport; int err; reply = ovs_vport_cmd_alloc_info(); if (!reply) return -ENOMEM; ovs_lock(); vport = lookup_vport(sock_net(skb->sk), genl_info_userhdr(info), a); err = PTR_ERR(vport); if (IS_ERR(vport)) goto exit_unlock_free; if (a[OVS_VPORT_ATTR_TYPE] && nla_get_u32(a[OVS_VPORT_ATTR_TYPE]) != vport->ops->type) { err = -EINVAL; goto exit_unlock_free; } if (a[OVS_VPORT_ATTR_OPTIONS]) { err = ovs_vport_set_options(vport, a[OVS_VPORT_ATTR_OPTIONS]); if (err) goto exit_unlock_free; } if (a[OVS_VPORT_ATTR_UPCALL_PID]) { struct nlattr *ids = a[OVS_VPORT_ATTR_UPCALL_PID]; err = ovs_vport_set_upcall_portids(vport, ids); if (err) goto exit_unlock_free; } err = ovs_vport_cmd_fill_info(vport, reply, genl_info_net(info), info->snd_portid, info->snd_seq, 0, OVS_VPORT_CMD_SET, GFP_KERNEL); BUG_ON(err < 0); ovs_unlock(); ovs_notify(&dp_vport_genl_family, reply, info); return 0; exit_unlock_free: ovs_unlock(); kfree_skb(reply); return err; } static int ovs_vport_cmd_del(struct sk_buff *skb, struct genl_info *info) { bool update_headroom = false; struct nlattr **a = info->attrs; struct sk_buff *reply; struct datapath *dp; struct vport *vport; unsigned int new_headroom; int err; reply = ovs_vport_cmd_alloc_info(); if (!reply) return -ENOMEM; ovs_lock(); vport = lookup_vport(sock_net(skb->sk), genl_info_userhdr(info), a); err = PTR_ERR(vport); if (IS_ERR(vport)) goto exit_unlock_free; if (vport->port_no == OVSP_LOCAL) { err = -EINVAL; goto exit_unlock_free; } err = ovs_vport_cmd_fill_info(vport, reply, genl_info_net(info), info->snd_portid, info->snd_seq, 0, OVS_VPORT_CMD_DEL, GFP_KERNEL); BUG_ON(err < 0); /* the vport deletion may trigger dp headroom update */ dp = vport->dp; if (netdev_get_fwd_headroom(vport->dev) == dp->max_headroom) update_headroom = true; netdev_reset_rx_headroom(vport->dev); ovs_dp_detach_port(vport); if (update_headroom) { new_headroom = ovs_get_max_headroom(dp); if (new_headroom < dp->max_headroom) ovs_update_headroom(dp, new_headroom); } ovs_unlock(); ovs_notify(&dp_vport_genl_family, reply, info); return 0; exit_unlock_free: ovs_unlock(); kfree_skb(reply); return err; } static int ovs_vport_cmd_get(struct sk_buff *skb, struct genl_info *info) { struct nlattr **a = info->attrs; struct ovs_header *ovs_header = genl_info_userhdr(info); struct sk_buff *reply; struct vport *vport; int err; reply = ovs_vport_cmd_alloc_info(); if (!reply) return -ENOMEM; rcu_read_lock(); vport = lookup_vport(sock_net(skb->sk), ovs_header, a); err = PTR_ERR(vport); if (IS_ERR(vport)) goto exit_unlock_free; err = ovs_vport_cmd_fill_info(vport, reply, genl_info_net(info), info->snd_portid, info->snd_seq, 0, OVS_VPORT_CMD_GET, GFP_ATOMIC); BUG_ON(err < 0); rcu_read_unlock(); return genlmsg_reply(reply, info); exit_unlock_free: rcu_read_unlock(); kfree_skb(reply); return err; } static int ovs_vport_cmd_dump(struct sk_buff *skb, struct netlink_callback *cb) { struct ovs_header *ovs_header = genlmsg_data(nlmsg_data(cb->nlh)); struct datapath *dp; int bucket = cb->args[0], skip = cb->args[1]; int i, j = 0; rcu_read_lock(); dp = get_dp_rcu(sock_net(skb->sk), ovs_header->dp_ifindex); if (!dp) { rcu_read_unlock(); return -ENODEV; } for (i = bucket; i < DP_VPORT_HASH_BUCKETS; i++) { struct vport *vport; j = 0; hlist_for_each_entry_rcu(vport, &dp->ports[i], dp_hash_node) { if (j >= skip && ovs_vport_cmd_fill_info(vport, skb, sock_net(skb->sk), NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, OVS_VPORT_CMD_GET, GFP_ATOMIC) < 0) goto out; j++; } skip = 0; } out: rcu_read_unlock(); cb->args[0] = i; cb->args[1] = j; return skb->len; } static void ovs_dp_masks_rebalance(struct work_struct *work) { struct ovs_net *ovs_net = container_of(work, struct ovs_net, masks_rebalance.work); struct datapath *dp; ovs_lock(); list_for_each_entry(dp, &ovs_net->dps, list_node) ovs_flow_masks_rebalance(&dp->table); ovs_unlock(); schedule_delayed_work(&ovs_net->masks_rebalance, msecs_to_jiffies(DP_MASKS_REBALANCE_INTERVAL)); } static const struct nla_policy vport_policy[OVS_VPORT_ATTR_MAX + 1] = { [OVS_VPORT_ATTR_NAME] = { .type = NLA_NUL_STRING, .len = IFNAMSIZ - 1 }, [OVS_VPORT_ATTR_STATS] = { .len = sizeof(struct ovs_vport_stats) }, [OVS_VPORT_ATTR_PORT_NO] = { .type = NLA_U32 }, [OVS_VPORT_ATTR_TYPE] = { .type = NLA_U32 }, [OVS_VPORT_ATTR_UPCALL_PID] = { .type = NLA_UNSPEC }, [OVS_VPORT_ATTR_OPTIONS] = { .type = NLA_NESTED }, [OVS_VPORT_ATTR_IFINDEX] = NLA_POLICY_MIN(NLA_S32, 0), [OVS_VPORT_ATTR_NETNSID] = { .type = NLA_S32 }, [OVS_VPORT_ATTR_UPCALL_STATS] = { .type = NLA_NESTED }, }; static const struct genl_small_ops dp_vport_genl_ops[] = { { .cmd = OVS_VPORT_CMD_NEW, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ .doit = ovs_vport_cmd_new }, { .cmd = OVS_VPORT_CMD_DEL, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ .doit = ovs_vport_cmd_del }, { .cmd = OVS_VPORT_CMD_GET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = 0, /* OK for unprivileged users. */ .doit = ovs_vport_cmd_get, .dumpit = ovs_vport_cmd_dump }, { .cmd = OVS_VPORT_CMD_SET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ .doit = ovs_vport_cmd_set, }, }; struct genl_family dp_vport_genl_family __ro_after_init = { .hdrsize = sizeof(struct ovs_header), .name = OVS_VPORT_FAMILY, .version = OVS_VPORT_VERSION, .maxattr = OVS_VPORT_ATTR_MAX, .policy = vport_policy, .netnsok = true, .parallel_ops = true, .small_ops = dp_vport_genl_ops, .n_small_ops = ARRAY_SIZE(dp_vport_genl_ops), .resv_start_op = OVS_VPORT_CMD_SET + 1, .mcgrps = &ovs_dp_vport_multicast_group, .n_mcgrps = 1, .module = THIS_MODULE, }; static struct genl_family * const dp_genl_families[] = { &dp_datapath_genl_family, &dp_vport_genl_family, &dp_flow_genl_family, &dp_packet_genl_family, &dp_meter_genl_family, #if IS_ENABLED(CONFIG_NETFILTER_CONNCOUNT) &dp_ct_limit_genl_family, #endif }; static void dp_unregister_genl(int n_families) { int i; for (i = 0; i < n_families; i++) genl_unregister_family(dp_genl_families[i]); } static int __init dp_register_genl(void) { int err; int i; for (i = 0; i < ARRAY_SIZE(dp_genl_families); i++) { err = genl_register_family(dp_genl_families[i]); if (err) goto error; } return 0; error: dp_unregister_genl(i); return err; } static int __net_init ovs_init_net(struct net *net) { struct ovs_net *ovs_net = net_generic(net, ovs_net_id); int err; INIT_LIST_HEAD(&ovs_net->dps); INIT_WORK(&ovs_net->dp_notify_work, ovs_dp_notify_wq); INIT_DELAYED_WORK(&ovs_net->masks_rebalance, ovs_dp_masks_rebalance); err = ovs_ct_init(net); if (err) return err; schedule_delayed_work(&ovs_net->masks_rebalance, msecs_to_jiffies(DP_MASKS_REBALANCE_INTERVAL)); return 0; } static void __net_exit list_vports_from_net(struct net *net, struct net *dnet, struct list_head *head) { struct ovs_net *ovs_net = net_generic(net, ovs_net_id); struct datapath *dp; list_for_each_entry(dp, &ovs_net->dps, list_node) { int i; for (i = 0; i < DP_VPORT_HASH_BUCKETS; i++) { struct vport *vport; hlist_for_each_entry(vport, &dp->ports[i], dp_hash_node) { if (vport->ops->type != OVS_VPORT_TYPE_INTERNAL) continue; if (dev_net(vport->dev) == dnet) list_add(&vport->detach_list, head); } } } } static void __net_exit ovs_exit_net(struct net *dnet) { struct datapath *dp, *dp_next; struct ovs_net *ovs_net = net_generic(dnet, ovs_net_id); struct vport *vport, *vport_next; struct net *net; LIST_HEAD(head); ovs_lock(); ovs_ct_exit(dnet); list_for_each_entry_safe(dp, dp_next, &ovs_net->dps, list_node) __dp_destroy(dp); down_read(&net_rwsem); for_each_net(net) list_vports_from_net(net, dnet, &head); up_read(&net_rwsem); /* Detach all vports from given namespace. */ list_for_each_entry_safe(vport, vport_next, &head, detach_list) { list_del(&vport->detach_list); ovs_dp_detach_port(vport); } ovs_unlock(); cancel_delayed_work_sync(&ovs_net->masks_rebalance); cancel_work_sync(&ovs_net->dp_notify_work); } static struct pernet_operations ovs_net_ops = { .init = ovs_init_net, .exit = ovs_exit_net, .id = &ovs_net_id, .size = sizeof(struct ovs_net), }; static const char * const ovs_drop_reasons[] = { #define S(x) (#x), OVS_DROP_REASONS(S) #undef S }; static struct drop_reason_list drop_reason_list_ovs = { .reasons = ovs_drop_reasons, .n_reasons = ARRAY_SIZE(ovs_drop_reasons), }; static int __init dp_init(void) { int err; BUILD_BUG_ON(sizeof(struct ovs_skb_cb) > sizeof_field(struct sk_buff, cb)); pr_info("Open vSwitch switching datapath\n"); err = action_fifos_init(); if (err) goto error; err = ovs_internal_dev_rtnl_link_register(); if (err) goto error_action_fifos_exit; err = ovs_flow_init(); if (err) goto error_unreg_rtnl_link; err = ovs_vport_init(); if (err) goto error_flow_exit; err = register_pernet_device(&ovs_net_ops); if (err) goto error_vport_exit; err = register_netdevice_notifier(&ovs_dp_device_notifier); if (err) goto error_netns_exit; err = ovs_netdev_init(); if (err) goto error_unreg_notifier; err = dp_register_genl(); if (err < 0) goto error_unreg_netdev; drop_reasons_register_subsys(SKB_DROP_REASON_SUBSYS_OPENVSWITCH, &drop_reason_list_ovs); return 0; error_unreg_netdev: ovs_netdev_exit(); error_unreg_notifier: unregister_netdevice_notifier(&ovs_dp_device_notifier); error_netns_exit: unregister_pernet_device(&ovs_net_ops); error_vport_exit: ovs_vport_exit(); error_flow_exit: ovs_flow_exit(); error_unreg_rtnl_link: ovs_internal_dev_rtnl_link_unregister(); error_action_fifos_exit: action_fifos_exit(); error: return err; } static void dp_cleanup(void) { dp_unregister_genl(ARRAY_SIZE(dp_genl_families)); ovs_netdev_exit(); unregister_netdevice_notifier(&ovs_dp_device_notifier); unregister_pernet_device(&ovs_net_ops); drop_reasons_unregister_subsys(SKB_DROP_REASON_SUBSYS_OPENVSWITCH); rcu_barrier(); ovs_vport_exit(); ovs_flow_exit(); ovs_internal_dev_rtnl_link_unregister(); action_fifos_exit(); } module_init(dp_init); module_exit(dp_cleanup); MODULE_DESCRIPTION("Open vSwitch switching datapath"); MODULE_LICENSE("GPL"); MODULE_ALIAS_GENL_FAMILY(OVS_DATAPATH_FAMILY); MODULE_ALIAS_GENL_FAMILY(OVS_VPORT_FAMILY); MODULE_ALIAS_GENL_FAMILY(OVS_FLOW_FAMILY); MODULE_ALIAS_GENL_FAMILY(OVS_PACKET_FAMILY); MODULE_ALIAS_GENL_FAMILY(OVS_METER_FAMILY); MODULE_ALIAS_GENL_FAMILY(OVS_CT_LIMIT_FAMILY); |
9 1 7 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 | // SPDX-License-Identifier: GPL-2.0-only /* * xt_u32 - kernel module to match u32 packet content * * Original author: Don Cohen <don@isis.cs3-inc.com> * (C) CC Computer Consultants GmbH, 2007 */ #include <linux/module.h> #include <linux/moduleparam.h> #include <linux/spinlock.h> #include <linux/skbuff.h> #include <linux/types.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter/xt_u32.h> static bool u32_match_it(const struct xt_u32 *data, const struct sk_buff *skb) { const struct xt_u32_test *ct; unsigned int testind; unsigned int nnums; unsigned int nvals; unsigned int i; __be32 n; u_int32_t pos; u_int32_t val; u_int32_t at; /* * Small example: "0 >> 28 == 4 && 8 & 0xFF0000 >> 16 = 6, 17" * (=IPv4 and (TCP or UDP)). Outer loop runs over the "&&" operands. */ for (testind = 0; testind < data->ntests; ++testind) { ct = &data->tests[testind]; at = 0; pos = ct->location[0].number; if (skb->len < 4 || pos > skb->len - 4) return false; if (skb_copy_bits(skb, pos, &n, sizeof(n)) < 0) BUG(); val = ntohl(n); nnums = ct->nnums; /* Inner loop runs over "&", "<<", ">>" and "@" operands */ for (i = 1; i < nnums; ++i) { u_int32_t number = ct->location[i].number; switch (ct->location[i].nextop) { case XT_U32_AND: val &= number; break; case XT_U32_LEFTSH: val <<= number; break; case XT_U32_RIGHTSH: val >>= number; break; case XT_U32_AT: if (at + val < at) return false; at += val; pos = number; if (at + 4 < at || skb->len < at + 4 || pos > skb->len - at - 4) return false; if (skb_copy_bits(skb, at + pos, &n, sizeof(n)) < 0) BUG(); val = ntohl(n); break; } } /* Run over the "," and ":" operands */ nvals = ct->nvalues; for (i = 0; i < nvals; ++i) if (ct->value[i].min <= val && val <= ct->value[i].max) break; if (i >= ct->nvalues) return false; } return true; } static bool u32_mt(const struct sk_buff *skb, struct xt_action_param *par) { const struct xt_u32 *data = par->matchinfo; bool ret; ret = u32_match_it(data, skb); return ret ^ data->invert; } static int u32_mt_checkentry(const struct xt_mtchk_param *par) { const struct xt_u32 *data = par->matchinfo; const struct xt_u32_test *ct; unsigned int i; if (data->ntests > ARRAY_SIZE(data->tests)) return -EINVAL; for (i = 0; i < data->ntests; ++i) { ct = &data->tests[i]; if (ct->nnums > ARRAY_SIZE(ct->location) || ct->nvalues > ARRAY_SIZE(ct->value)) return -EINVAL; } return 0; } static struct xt_match xt_u32_mt_reg __read_mostly = { .name = "u32", .revision = 0, .family = NFPROTO_UNSPEC, .match = u32_mt, .checkentry = u32_mt_checkentry, .matchsize = sizeof(struct xt_u32), .me = THIS_MODULE, }; static int __init u32_mt_init(void) { return xt_register_match(&xt_u32_mt_reg); } static void __exit u32_mt_exit(void) { xt_unregister_match(&xt_u32_mt_reg); } module_init(u32_mt_init); module_exit(u32_mt_exit); MODULE_AUTHOR("Jan Engelhardt <jengelh@medozas.de>"); MODULE_DESCRIPTION("Xtables: arbitrary byte matching"); MODULE_LICENSE("GPL"); MODULE_ALIAS("ipt_u32"); MODULE_ALIAS("ip6t_u32"); |
13875 8269 291 52 8499 5383 5383 5917 84 1375 5087 287 15 5296 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 | /* SPDX-License-Identifier: GPL-2.0 */ /* * Percpu refcounts: * (C) 2012 Google, Inc. * Author: Kent Overstreet <koverstreet@google.com> * * This implements a refcount with similar semantics to atomic_t - atomic_inc(), * atomic_dec_and_test() - but percpu. * * There's one important difference between percpu refs and normal atomic_t * refcounts; you have to keep track of your initial refcount, and then when you * start shutting down you call percpu_ref_kill() _before_ dropping the initial * refcount. * * The refcount will have a range of 0 to ((1U << 31) - 1), i.e. one bit less * than an atomic_t - this is because of the way shutdown works, see * percpu_ref_kill()/PERCPU_COUNT_BIAS. * * Before you call percpu_ref_kill(), percpu_ref_put() does not check for the * refcount hitting 0 - it can't, if it was in percpu mode. percpu_ref_kill() * puts the ref back in single atomic_t mode, collecting the per cpu refs and * issuing the appropriate barriers, and then marks the ref as shutting down so * that percpu_ref_put() will check for the ref hitting 0. After it returns, * it's safe to drop the initial ref. * * USAGE: * * See fs/aio.c for some example usage; it's used there for struct kioctx, which * is created when userspaces calls io_setup(), and destroyed when userspace * calls io_destroy() or the process exits. * * In the aio code, kill_ioctx() is called when we wish to destroy a kioctx; it * removes the kioctx from the proccess's table of kioctxs and kills percpu_ref. * After that, there can't be any new users of the kioctx (from lookup_ioctx()) * and it's then safe to drop the initial ref with percpu_ref_put(). * * Note that the free path, free_ioctx(), needs to go through explicit call_rcu() * to synchronize with RCU protected lookup_ioctx(). percpu_ref operations don't * imply RCU grace periods of any kind and if a user wants to combine percpu_ref * with RCU protection, it must be done explicitly. * * Code that does a two stage shutdown like this often needs some kind of * explicit synchronization to ensure the initial refcount can only be dropped * once - percpu_ref_kill() does this for you, it returns true once and false if * someone else already called it. The aio code uses it this way, but it's not * necessary if the code has some other mechanism to synchronize teardown. * around. */ #ifndef _LINUX_PERCPU_REFCOUNT_H #define _LINUX_PERCPU_REFCOUNT_H #include <linux/atomic.h> #include <linux/percpu.h> #include <linux/rcupdate.h> #include <linux/types.h> #include <linux/gfp.h> struct percpu_ref; typedef void (percpu_ref_func_t)(struct percpu_ref *); /* flags set in the lower bits of percpu_ref->percpu_count_ptr */ enum { __PERCPU_REF_ATOMIC = 1LU << 0, /* operating in atomic mode */ __PERCPU_REF_DEAD = 1LU << 1, /* (being) killed */ __PERCPU_REF_ATOMIC_DEAD = __PERCPU_REF_ATOMIC | __PERCPU_REF_DEAD, __PERCPU_REF_FLAG_BITS = 2, }; /* @flags for percpu_ref_init() */ enum { /* * Start w/ ref == 1 in atomic mode. Can be switched to percpu * operation using percpu_ref_switch_to_percpu(). If initialized * with this flag, the ref will stay in atomic mode until * percpu_ref_switch_to_percpu() is invoked on it. * Implies ALLOW_REINIT. */ PERCPU_REF_INIT_ATOMIC = 1 << 0, /* * Start dead w/ ref == 0 in atomic mode. Must be revived with * percpu_ref_reinit() before used. Implies INIT_ATOMIC and * ALLOW_REINIT. */ PERCPU_REF_INIT_DEAD = 1 << 1, /* * Allow switching from atomic mode to percpu mode. */ PERCPU_REF_ALLOW_REINIT = 1 << 2, }; struct percpu_ref_data { atomic_long_t count; percpu_ref_func_t *release; percpu_ref_func_t *confirm_switch; bool force_atomic:1; bool allow_reinit:1; struct rcu_head rcu; struct percpu_ref *ref; }; struct percpu_ref { /* * The low bit of the pointer indicates whether the ref is in percpu * mode; if set, then get/put will manipulate the atomic_t. */ unsigned long percpu_count_ptr; /* * 'percpu_ref' is often embedded into user structure, and only * 'percpu_count_ptr' is required in fast path, move other fields * into 'percpu_ref_data', so we can reduce memory footprint in * fast path. */ struct percpu_ref_data *data; }; int __must_check percpu_ref_init(struct percpu_ref *ref, percpu_ref_func_t *release, unsigned int flags, gfp_t gfp); void percpu_ref_exit(struct percpu_ref *ref); void percpu_ref_switch_to_atomic(struct percpu_ref *ref, percpu_ref_func_t *confirm_switch); void percpu_ref_switch_to_atomic_sync(struct percpu_ref *ref); void percpu_ref_switch_to_percpu(struct percpu_ref *ref); void percpu_ref_kill_and_confirm(struct percpu_ref *ref, percpu_ref_func_t *confirm_kill); void percpu_ref_resurrect(struct percpu_ref *ref); void percpu_ref_reinit(struct percpu_ref *ref); bool percpu_ref_is_zero(struct percpu_ref *ref); /** * percpu_ref_kill - drop the initial ref * @ref: percpu_ref to kill * * Must be used to drop the initial ref on a percpu refcount; must be called * precisely once before shutdown. * * Switches @ref into atomic mode before gathering up the percpu counters * and dropping the initial ref. * * There are no implied RCU grace periods between kill and release. */ static inline void percpu_ref_kill(struct percpu_ref *ref) { percpu_ref_kill_and_confirm(ref, NULL); } /* * Internal helper. Don't use outside percpu-refcount proper. The * function doesn't return the pointer and let the caller test it for NULL * because doing so forces the compiler to generate two conditional * branches as it can't assume that @ref->percpu_count is not NULL. */ static inline bool __ref_is_percpu(struct percpu_ref *ref, unsigned long __percpu **percpu_countp) { unsigned long percpu_ptr; /* * The value of @ref->percpu_count_ptr is tested for * !__PERCPU_REF_ATOMIC, which may be set asynchronously, and then * used as a pointer. If the compiler generates a separate fetch * when using it as a pointer, __PERCPU_REF_ATOMIC may be set in * between contaminating the pointer value, meaning that * READ_ONCE() is required when fetching it. * * The dependency ordering from the READ_ONCE() pairs * with smp_store_release() in __percpu_ref_switch_to_percpu(). */ percpu_ptr = READ_ONCE(ref->percpu_count_ptr); /* * Theoretically, the following could test just ATOMIC; however, * then we'd have to mask off DEAD separately as DEAD may be * visible without ATOMIC if we race with percpu_ref_kill(). DEAD * implies ATOMIC anyway. Test them together. */ if (unlikely(percpu_ptr & __PERCPU_REF_ATOMIC_DEAD)) return false; *percpu_countp = (unsigned long __percpu *)percpu_ptr; return true; } /** * percpu_ref_get_many - increment a percpu refcount * @ref: percpu_ref to get * @nr: number of references to get * * Analogous to atomic_long_add(). * * This function is safe to call as long as @ref is between init and exit. */ static inline void percpu_ref_get_many(struct percpu_ref *ref, unsigned long nr) { unsigned long __percpu *percpu_count; rcu_read_lock(); if (__ref_is_percpu(ref, &percpu_count)) this_cpu_add(*percpu_count, nr); else atomic_long_add(nr, &ref->data->count); rcu_read_unlock(); } /** * percpu_ref_get - increment a percpu refcount * @ref: percpu_ref to get * * Analogous to atomic_long_inc(). * * This function is safe to call as long as @ref is between init and exit. */ static inline void percpu_ref_get(struct percpu_ref *ref) { percpu_ref_get_many(ref, 1); } /** * percpu_ref_tryget_many - try to increment a percpu refcount * @ref: percpu_ref to try-get * @nr: number of references to get * * Increment a percpu refcount by @nr unless its count already reached zero. * Returns %true on success; %false on failure. * * This function is safe to call as long as @ref is between init and exit. */ static inline bool percpu_ref_tryget_many(struct percpu_ref *ref, unsigned long nr) { unsigned long __percpu *percpu_count; bool ret; rcu_read_lock(); if (__ref_is_percpu(ref, &percpu_count)) { this_cpu_add(*percpu_count, nr); ret = true; } else { ret = atomic_long_add_unless(&ref->data->count, nr, 0); } rcu_read_unlock(); return ret; } /** * percpu_ref_tryget - try to increment a percpu refcount * @ref: percpu_ref to try-get * * Increment a percpu refcount unless its count already reached zero. * Returns %true on success; %false on failure. * * This function is safe to call as long as @ref is between init and exit. */ static inline bool percpu_ref_tryget(struct percpu_ref *ref) { return percpu_ref_tryget_many(ref, 1); } /** * percpu_ref_tryget_live_rcu - same as percpu_ref_tryget_live() but the * caller is responsible for taking RCU. * * This function is safe to call as long as @ref is between init and exit. */ static inline bool percpu_ref_tryget_live_rcu(struct percpu_ref *ref) { unsigned long __percpu *percpu_count; bool ret = false; WARN_ON_ONCE(!rcu_read_lock_held()); if (likely(__ref_is_percpu(ref, &percpu_count))) { this_cpu_inc(*percpu_count); ret = true; } else if (!(ref->percpu_count_ptr & __PERCPU_REF_DEAD)) { ret = atomic_long_inc_not_zero(&ref->data->count); } return ret; } /** * percpu_ref_tryget_live - try to increment a live percpu refcount * @ref: percpu_ref to try-get * * Increment a percpu refcount unless it has already been killed. Returns * %true on success; %false on failure. * * Completion of percpu_ref_kill() in itself doesn't guarantee that this * function will fail. For such guarantee, percpu_ref_kill_and_confirm() * should be used. After the confirm_kill callback is invoked, it's * guaranteed that no new reference will be given out by * percpu_ref_tryget_live(). * * This function is safe to call as long as @ref is between init and exit. */ static inline bool percpu_ref_tryget_live(struct percpu_ref *ref) { bool ret = false; rcu_read_lock(); ret = percpu_ref_tryget_live_rcu(ref); rcu_read_unlock(); return ret; } /** * percpu_ref_put_many - decrement a percpu refcount * @ref: percpu_ref to put * @nr: number of references to put * * Decrement the refcount, and if 0, call the release function (which was passed * to percpu_ref_init()) * * This function is safe to call as long as @ref is between init and exit. */ static inline void percpu_ref_put_many(struct percpu_ref *ref, unsigned long nr) { unsigned long __percpu *percpu_count; rcu_read_lock(); if (__ref_is_percpu(ref, &percpu_count)) this_cpu_sub(*percpu_count, nr); else if (unlikely(atomic_long_sub_and_test(nr, &ref->data->count))) ref->data->release(ref); rcu_read_unlock(); } /** * percpu_ref_put - decrement a percpu refcount * @ref: percpu_ref to put * * Decrement the refcount, and if 0, call the release function (which was passed * to percpu_ref_init()) * * This function is safe to call as long as @ref is between init and exit. */ static inline void percpu_ref_put(struct percpu_ref *ref) { percpu_ref_put_many(ref, 1); } /** * percpu_ref_is_dying - test whether a percpu refcount is dying or dead * @ref: percpu_ref to test * * Returns %true if @ref is dying or dead. * * This function is safe to call as long as @ref is between init and exit * and the caller is responsible for synchronizing against state changes. */ static inline bool percpu_ref_is_dying(struct percpu_ref *ref) { return ref->percpu_count_ptr & __PERCPU_REF_DEAD; } #endif |
18 19 20 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 | // SPDX-License-Identifier: GPL-2.0-only /* * ksyms_common.c: A split of kernel/kallsyms.c * Contains a few generic function definations independent of config KALLSYMS. */ #include <linux/kallsyms.h> #include <linux/security.h> static inline int kallsyms_for_perf(void) { #ifdef CONFIG_PERF_EVENTS extern int sysctl_perf_event_paranoid; if (sysctl_perf_event_paranoid <= 1) return 1; #endif return 0; } /* * We show kallsyms information even to normal users if we've enabled * kernel profiling and are explicitly not paranoid (so kptr_restrict * is clear, and sysctl_perf_event_paranoid isn't set). * * Otherwise, require CAP_SYSLOG (assuming kptr_restrict isn't set to * block even that). */ bool kallsyms_show_value(const struct cred *cred) { switch (kptr_restrict) { case 0: if (kallsyms_for_perf()) return true; fallthrough; case 1: if (security_capable(cred, &init_user_ns, CAP_SYSLOG, CAP_OPT_NOAUDIT) == 0) return true; fallthrough; default: return false; } } |
2 1 2 2 5 2 3 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 | // SPDX-License-Identifier: GPL-2.0-or-later /* * eCryptfs: Linux filesystem encryption layer * * Copyright (C) 1997-2003 Erez Zadok * Copyright (C) 2001-2003 Stony Brook University * Copyright (C) 2004-2007 International Business Machines Corp. * Author(s): Michael A. Halcrow <mahalcro@us.ibm.com> * Michael C. Thompson <mcthomps@us.ibm.com> * Tyler Hicks <code@tyhicks.com> */ #include <linux/dcache.h> #include <linux/file.h> #include <linux/module.h> #include <linux/namei.h> #include <linux/skbuff.h> #include <linux/mount.h> #include <linux/pagemap.h> #include <linux/key.h> #include <linux/parser.h> #include <linux/fs_stack.h> #include <linux/slab.h> #include <linux/magic.h> #include "ecryptfs_kernel.h" /* * Module parameter that defines the ecryptfs_verbosity level. */ int ecryptfs_verbosity = 0; module_param(ecryptfs_verbosity, int, 0); MODULE_PARM_DESC(ecryptfs_verbosity, "Initial verbosity level (0 or 1; defaults to " "0, which is Quiet)"); /* * Module parameter that defines the number of message buffer elements */ unsigned int ecryptfs_message_buf_len = ECRYPTFS_DEFAULT_MSG_CTX_ELEMS; module_param(ecryptfs_message_buf_len, uint, 0); MODULE_PARM_DESC(ecryptfs_message_buf_len, "Number of message buffer elements"); /* * Module parameter that defines the maximum guaranteed amount of time to wait * for a response from ecryptfsd. The actual sleep time will be, more than * likely, a small amount greater than this specified value, but only less if * the message successfully arrives. */ signed long ecryptfs_message_wait_timeout = ECRYPTFS_MAX_MSG_CTX_TTL / HZ; module_param(ecryptfs_message_wait_timeout, long, 0); MODULE_PARM_DESC(ecryptfs_message_wait_timeout, "Maximum number of seconds that an operation will " "sleep while waiting for a message response from " "userspace"); /* * Module parameter that is an estimate of the maximum number of users * that will be concurrently using eCryptfs. Set this to the right * value to balance performance and memory use. */ unsigned int ecryptfs_number_of_users = ECRYPTFS_DEFAULT_NUM_USERS; module_param(ecryptfs_number_of_users, uint, 0); MODULE_PARM_DESC(ecryptfs_number_of_users, "An estimate of the number of " "concurrent users of eCryptfs"); void __ecryptfs_printk(const char *fmt, ...) { va_list args; va_start(args, fmt); if (fmt[1] == '7') { /* KERN_DEBUG */ if (ecryptfs_verbosity >= 1) vprintk(fmt, args); } else vprintk(fmt, args); va_end(args); } /* * ecryptfs_init_lower_file * @ecryptfs_dentry: Fully initialized eCryptfs dentry object, with * the lower dentry and the lower mount set * * eCryptfs only ever keeps a single open file for every lower * inode. All I/O operations to the lower inode occur through that * file. When the first eCryptfs dentry that interposes with the first * lower dentry for that inode is created, this function creates the * lower file struct and associates it with the eCryptfs * inode. When all eCryptfs files associated with the inode are released, the * file is closed. * * The lower file will be opened with read/write permissions, if * possible. Otherwise, it is opened read-only. * * This function does nothing if a lower file is already * associated with the eCryptfs inode. * * Returns zero on success; non-zero otherwise */ static int ecryptfs_init_lower_file(struct dentry *dentry, struct file **lower_file) { const struct cred *cred = current_cred(); const struct path *path = ecryptfs_dentry_to_lower_path(dentry); int rc; rc = ecryptfs_privileged_open(lower_file, path->dentry, path->mnt, cred); if (rc) { printk(KERN_ERR "Error opening lower file " "for lower_dentry [0x%p] and lower_mnt [0x%p]; " "rc = [%d]\n", path->dentry, path->mnt, rc); (*lower_file) = NULL; } return rc; } int ecryptfs_get_lower_file(struct dentry *dentry, struct inode *inode) { struct ecryptfs_inode_info *inode_info; int count, rc = 0; inode_info = ecryptfs_inode_to_private(inode); mutex_lock(&inode_info->lower_file_mutex); count = atomic_inc_return(&inode_info->lower_file_count); if (WARN_ON_ONCE(count < 1)) rc = -EINVAL; else if (count == 1) { rc = ecryptfs_init_lower_file(dentry, &inode_info->lower_file); if (rc) atomic_set(&inode_info->lower_file_count, 0); } mutex_unlock(&inode_info->lower_file_mutex); return rc; } void ecryptfs_put_lower_file(struct inode *inode) { struct ecryptfs_inode_info *inode_info; inode_info = ecryptfs_inode_to_private(inode); if (atomic_dec_and_mutex_lock(&inode_info->lower_file_count, &inode_info->lower_file_mutex)) { filemap_write_and_wait(inode->i_mapping); fput(inode_info->lower_file); inode_info->lower_file = NULL; mutex_unlock(&inode_info->lower_file_mutex); } } enum { ecryptfs_opt_sig, ecryptfs_opt_ecryptfs_sig, ecryptfs_opt_cipher, ecryptfs_opt_ecryptfs_cipher, ecryptfs_opt_ecryptfs_key_bytes, ecryptfs_opt_passthrough, ecryptfs_opt_xattr_metadata, ecryptfs_opt_encrypted_view, ecryptfs_opt_fnek_sig, ecryptfs_opt_fn_cipher, ecryptfs_opt_fn_cipher_key_bytes, ecryptfs_opt_unlink_sigs, ecryptfs_opt_mount_auth_tok_only, ecryptfs_opt_check_dev_ruid, ecryptfs_opt_err }; static const match_table_t tokens = { {ecryptfs_opt_sig, "sig=%s"}, {ecryptfs_opt_ecryptfs_sig, "ecryptfs_sig=%s"}, {ecryptfs_opt_cipher, "cipher=%s"}, {ecryptfs_opt_ecryptfs_cipher, "ecryptfs_cipher=%s"}, {ecryptfs_opt_ecryptfs_key_bytes, "ecryptfs_key_bytes=%u"}, {ecryptfs_opt_passthrough, "ecryptfs_passthrough"}, {ecryptfs_opt_xattr_metadata, "ecryptfs_xattr_metadata"}, {ecryptfs_opt_encrypted_view, "ecryptfs_encrypted_view"}, {ecryptfs_opt_fnek_sig, "ecryptfs_fnek_sig=%s"}, {ecryptfs_opt_fn_cipher, "ecryptfs_fn_cipher=%s"}, {ecryptfs_opt_fn_cipher_key_bytes, "ecryptfs_fn_key_bytes=%u"}, {ecryptfs_opt_unlink_sigs, "ecryptfs_unlink_sigs"}, {ecryptfs_opt_mount_auth_tok_only, "ecryptfs_mount_auth_tok_only"}, {ecryptfs_opt_check_dev_ruid, "ecryptfs_check_dev_ruid"}, {ecryptfs_opt_err, NULL} }; static int ecryptfs_init_global_auth_toks( struct ecryptfs_mount_crypt_stat *mount_crypt_stat) { struct ecryptfs_global_auth_tok *global_auth_tok; struct ecryptfs_auth_tok *auth_tok; int rc = 0; list_for_each_entry(global_auth_tok, &mount_crypt_stat->global_auth_tok_list, mount_crypt_stat_list) { rc = ecryptfs_keyring_auth_tok_for_sig( &global_auth_tok->global_auth_tok_key, &auth_tok, global_auth_tok->sig); if (rc) { printk(KERN_ERR "Could not find valid key in user " "session keyring for sig specified in mount " "option: [%s]\n", global_auth_tok->sig); global_auth_tok->flags |= ECRYPTFS_AUTH_TOK_INVALID; goto out; } else { global_auth_tok->flags &= ~ECRYPTFS_AUTH_TOK_INVALID; up_write(&(global_auth_tok->global_auth_tok_key)->sem); } } out: return rc; } static void ecryptfs_init_mount_crypt_stat( struct ecryptfs_mount_crypt_stat *mount_crypt_stat) { memset((void *)mount_crypt_stat, 0, sizeof(struct ecryptfs_mount_crypt_stat)); INIT_LIST_HEAD(&mount_crypt_stat->global_auth_tok_list); mutex_init(&mount_crypt_stat->global_auth_tok_list_mutex); mount_crypt_stat->flags |= ECRYPTFS_MOUNT_CRYPT_STAT_INITIALIZED; } /** * ecryptfs_parse_options * @sbi: The ecryptfs super block * @options: The options passed to the kernel * @check_ruid: set to 1 if device uid should be checked against the ruid * * Parse mount options: * debug=N - ecryptfs_verbosity level for debug output * sig=XXX - description(signature) of the key to use * * Returns the dentry object of the lower-level (lower/interposed) * directory; We want to mount our stackable file system on top of * that lower directory. * * The signature of the key to use must be the description of a key * already in the keyring. Mounting will fail if the key can not be * found. * * Returns zero on success; non-zero on error */ static int ecryptfs_parse_options(struct ecryptfs_sb_info *sbi, char *options, uid_t *check_ruid) { char *p; int rc = 0; int sig_set = 0; int cipher_name_set = 0; int fn_cipher_name_set = 0; int cipher_key_bytes; int cipher_key_bytes_set = 0; int fn_cipher_key_bytes; int fn_cipher_key_bytes_set = 0; struct ecryptfs_mount_crypt_stat *mount_crypt_stat = &sbi->mount_crypt_stat; substring_t args[MAX_OPT_ARGS]; int token; char *sig_src; char *cipher_name_src; char *fn_cipher_name_src; char *fnek_src; char *cipher_key_bytes_src; char *fn_cipher_key_bytes_src; u8 cipher_code; *check_ruid = 0; if (!options) { rc = -EINVAL; goto out; } ecryptfs_init_mount_crypt_stat(mount_crypt_stat); while ((p = strsep(&options, ",")) != NULL) { if (!*p) continue; token = match_token(p, tokens, args); switch (token) { case ecryptfs_opt_sig: case ecryptfs_opt_ecryptfs_sig: sig_src = args[0].from; rc = ecryptfs_add_global_auth_tok(mount_crypt_stat, sig_src, 0); if (rc) { printk(KERN_ERR "Error attempting to register " "global sig; rc = [%d]\n", rc); goto out; } sig_set = 1; break; case ecryptfs_opt_cipher: case ecryptfs_opt_ecryptfs_cipher: cipher_name_src = args[0].from; strscpy(mount_crypt_stat->global_default_cipher_name, cipher_name_src); cipher_name_set = 1; break; case ecryptfs_opt_ecryptfs_key_bytes: cipher_key_bytes_src = args[0].from; cipher_key_bytes = (int)simple_strtol(cipher_key_bytes_src, &cipher_key_bytes_src, 0); mount_crypt_stat->global_default_cipher_key_size = cipher_key_bytes; cipher_key_bytes_set = 1; break; case ecryptfs_opt_passthrough: mount_crypt_stat->flags |= ECRYPTFS_PLAINTEXT_PASSTHROUGH_ENABLED; break; case ecryptfs_opt_xattr_metadata: mount_crypt_stat->flags |= ECRYPTFS_XATTR_METADATA_ENABLED; break; case ecryptfs_opt_encrypted_view: mount_crypt_stat->flags |= ECRYPTFS_XATTR_METADATA_ENABLED; mount_crypt_stat->flags |= ECRYPTFS_ENCRYPTED_VIEW_ENABLED; break; case ecryptfs_opt_fnek_sig: fnek_src = args[0].from; strscpy(mount_crypt_stat->global_default_fnek_sig, fnek_src); rc = ecryptfs_add_global_auth_tok( mount_crypt_stat, mount_crypt_stat->global_default_fnek_sig, ECRYPTFS_AUTH_TOK_FNEK); if (rc) { printk(KERN_ERR "Error attempting to register " "global fnek sig [%s]; rc = [%d]\n", mount_crypt_stat->global_default_fnek_sig, rc); goto out; } mount_crypt_stat->flags |= (ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES | ECRYPTFS_GLOBAL_ENCFN_USE_MOUNT_FNEK); break; case ecryptfs_opt_fn_cipher: fn_cipher_name_src = args[0].from; strscpy(mount_crypt_stat->global_default_fn_cipher_name, fn_cipher_name_src); fn_cipher_name_set = 1; break; case ecryptfs_opt_fn_cipher_key_bytes: fn_cipher_key_bytes_src = args[0].from; fn_cipher_key_bytes = (int)simple_strtol(fn_cipher_key_bytes_src, &fn_cipher_key_bytes_src, 0); mount_crypt_stat->global_default_fn_cipher_key_bytes = fn_cipher_key_bytes; fn_cipher_key_bytes_set = 1; break; case ecryptfs_opt_unlink_sigs: mount_crypt_stat->flags |= ECRYPTFS_UNLINK_SIGS; break; case ecryptfs_opt_mount_auth_tok_only: mount_crypt_stat->flags |= ECRYPTFS_GLOBAL_MOUNT_AUTH_TOK_ONLY; break; case ecryptfs_opt_check_dev_ruid: *check_ruid = 1; break; case ecryptfs_opt_err: default: printk(KERN_WARNING "%s: eCryptfs: unrecognized option [%s]\n", __func__, p); } } if (!sig_set) { rc = -EINVAL; ecryptfs_printk(KERN_ERR, "You must supply at least one valid " "auth tok signature as a mount " "parameter; see the eCryptfs README\n"); goto out; } if (!cipher_name_set) { int cipher_name_len = strlen(ECRYPTFS_DEFAULT_CIPHER); BUG_ON(cipher_name_len > ECRYPTFS_MAX_CIPHER_NAME_SIZE); strcpy(mount_crypt_stat->global_default_cipher_name, ECRYPTFS_DEFAULT_CIPHER); } if ((mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES) && !fn_cipher_name_set) strcpy(mount_crypt_stat->global_default_fn_cipher_name, mount_crypt_stat->global_default_cipher_name); if (!cipher_key_bytes_set) mount_crypt_stat->global_default_cipher_key_size = 0; if ((mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES) && !fn_cipher_key_bytes_set) mount_crypt_stat->global_default_fn_cipher_key_bytes = mount_crypt_stat->global_default_cipher_key_size; cipher_code = ecryptfs_code_for_cipher_string( mount_crypt_stat->global_default_cipher_name, mount_crypt_stat->global_default_cipher_key_size); if (!cipher_code) { ecryptfs_printk(KERN_ERR, "eCryptfs doesn't support cipher: %s\n", mount_crypt_stat->global_default_cipher_name); rc = -EINVAL; goto out; } mutex_lock(&key_tfm_list_mutex); if (!ecryptfs_tfm_exists(mount_crypt_stat->global_default_cipher_name, NULL)) { rc = ecryptfs_add_new_key_tfm( NULL, mount_crypt_stat->global_default_cipher_name, mount_crypt_stat->global_default_cipher_key_size); if (rc) { printk(KERN_ERR "Error attempting to initialize " "cipher with name = [%s] and key size = [%td]; " "rc = [%d]\n", mount_crypt_stat->global_default_cipher_name, mount_crypt_stat->global_default_cipher_key_size, rc); rc = -EINVAL; mutex_unlock(&key_tfm_list_mutex); goto out; } } if ((mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES) && !ecryptfs_tfm_exists( mount_crypt_stat->global_default_fn_cipher_name, NULL)) { rc = ecryptfs_add_new_key_tfm( NULL, mount_crypt_stat->global_default_fn_cipher_name, mount_crypt_stat->global_default_fn_cipher_key_bytes); if (rc) { printk(KERN_ERR "Error attempting to initialize " "cipher with name = [%s] and key size = [%td]; " "rc = [%d]\n", mount_crypt_stat->global_default_fn_cipher_name, mount_crypt_stat->global_default_fn_cipher_key_bytes, rc); rc = -EINVAL; mutex_unlock(&key_tfm_list_mutex); goto out; } } mutex_unlock(&key_tfm_list_mutex); rc = ecryptfs_init_global_auth_toks(mount_crypt_stat); if (rc) printk(KERN_WARNING "One or more global auth toks could not " "properly register; rc = [%d]\n", rc); out: return rc; } struct kmem_cache *ecryptfs_sb_info_cache; static struct file_system_type ecryptfs_fs_type; /* * ecryptfs_mount * @fs_type: The filesystem type that the superblock should belong to * @flags: The flags associated with the mount * @dev_name: The path to mount over * @raw_data: The options passed into the kernel */ static struct dentry *ecryptfs_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *raw_data) { struct super_block *s; struct ecryptfs_sb_info *sbi; struct ecryptfs_mount_crypt_stat *mount_crypt_stat; struct ecryptfs_dentry_info *root_info; const char *err = "Getting sb failed"; struct inode *inode; struct path path; uid_t check_ruid; int rc; sbi = kmem_cache_zalloc(ecryptfs_sb_info_cache, GFP_KERNEL); if (!sbi) { rc = -ENOMEM; goto out; } if (!dev_name) { rc = -EINVAL; err = "Device name cannot be null"; goto out; } rc = ecryptfs_parse_options(sbi, raw_data, &check_ruid); if (rc) { err = "Error parsing options"; goto out; } mount_crypt_stat = &sbi->mount_crypt_stat; s = sget(fs_type, NULL, set_anon_super, flags, NULL); if (IS_ERR(s)) { rc = PTR_ERR(s); goto out; } rc = super_setup_bdi(s); if (rc) goto out1; ecryptfs_set_superblock_private(s, sbi); /* ->kill_sb() will take care of sbi after that point */ sbi = NULL; s->s_op = &ecryptfs_sops; s->s_xattr = ecryptfs_xattr_handlers; s->s_d_op = &ecryptfs_dops; err = "Reading sb failed"; rc = kern_path(dev_name, LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &path); if (rc) { ecryptfs_printk(KERN_WARNING, "kern_path() failed\n"); goto out1; } if (path.dentry->d_sb->s_type == &ecryptfs_fs_type) { rc = -EINVAL; printk(KERN_ERR "Mount on filesystem of type " "eCryptfs explicitly disallowed due to " "known incompatibilities\n"); goto out_free; } if (is_idmapped_mnt(path.mnt)) { rc = -EINVAL; printk(KERN_ERR "Mounting on idmapped mounts currently disallowed\n"); goto out_free; } if (check_ruid && !uid_eq(d_inode(path.dentry)->i_uid, current_uid())) { rc = -EPERM; printk(KERN_ERR "Mount of device (uid: %d) not owned by " "requested user (uid: %d)\n", i_uid_read(d_inode(path.dentry)), from_kuid(&init_user_ns, current_uid())); goto out_free; } ecryptfs_set_superblock_lower(s, path.dentry->d_sb); /** * Set the POSIX ACL flag based on whether they're enabled in the lower * mount. */ s->s_flags = flags & ~SB_POSIXACL; s->s_flags |= path.dentry->d_sb->s_flags & SB_POSIXACL; /** * Force a read-only eCryptfs mount when: * 1) The lower mount is ro * 2) The ecryptfs_encrypted_view mount option is specified */ if (sb_rdonly(path.dentry->d_sb) || mount_crypt_stat->flags & ECRYPTFS_ENCRYPTED_VIEW_ENABLED) s->s_flags |= SB_RDONLY; s->s_maxbytes = path.dentry->d_sb->s_maxbytes; s->s_blocksize = path.dentry->d_sb->s_blocksize; s->s_magic = ECRYPTFS_SUPER_MAGIC; s->s_stack_depth = path.dentry->d_sb->s_stack_depth + 1; rc = -EINVAL; if (s->s_stack_depth > FILESYSTEM_MAX_STACK_DEPTH) { pr_err("eCryptfs: maximum fs stacking depth exceeded\n"); goto out_free; } inode = ecryptfs_get_inode(d_inode(path.dentry), s); rc = PTR_ERR(inode); if (IS_ERR(inode)) goto out_free; s->s_root = d_make_root(inode); if (!s->s_root) { rc = -ENOMEM; goto out_free; } rc = -ENOMEM; root_info = kmem_cache_zalloc(ecryptfs_dentry_info_cache, GFP_KERNEL); if (!root_info) goto out_free; /* ->kill_sb() will take care of root_info */ ecryptfs_set_dentry_private(s->s_root, root_info); root_info->lower_path = path; s->s_flags |= SB_ACTIVE; return dget(s->s_root); out_free: path_put(&path); out1: deactivate_locked_super(s); out: if (sbi) { ecryptfs_destroy_mount_crypt_stat(&sbi->mount_crypt_stat); kmem_cache_free(ecryptfs_sb_info_cache, sbi); } printk(KERN_ERR "%s; rc = [%d]\n", err, rc); return ERR_PTR(rc); } /** * ecryptfs_kill_block_super * @sb: The ecryptfs super block * * Used to bring the superblock down and free the private data. */ static void ecryptfs_kill_block_super(struct super_block *sb) { struct ecryptfs_sb_info *sb_info = ecryptfs_superblock_to_private(sb); kill_anon_super(sb); if (!sb_info) return; ecryptfs_destroy_mount_crypt_stat(&sb_info->mount_crypt_stat); kmem_cache_free(ecryptfs_sb_info_cache, sb_info); } static struct file_system_type ecryptfs_fs_type = { .owner = THIS_MODULE, .name = "ecryptfs", .mount = ecryptfs_mount, .kill_sb = ecryptfs_kill_block_super, .fs_flags = 0 }; MODULE_ALIAS_FS("ecryptfs"); /* * inode_info_init_once * * Initializes the ecryptfs_inode_info_cache when it is created */ static void inode_info_init_once(void *vptr) { struct ecryptfs_inode_info *ei = (struct ecryptfs_inode_info *)vptr; inode_init_once(&ei->vfs_inode); } static struct ecryptfs_cache_info { struct kmem_cache **cache; const char *name; size_t size; slab_flags_t flags; void (*ctor)(void *obj); } ecryptfs_cache_infos[] = { { .cache = &ecryptfs_auth_tok_list_item_cache, .name = "ecryptfs_auth_tok_list_item", .size = sizeof(struct ecryptfs_auth_tok_list_item), }, { .cache = &ecryptfs_file_info_cache, .name = "ecryptfs_file_cache", .size = sizeof(struct ecryptfs_file_info), }, { .cache = &ecryptfs_dentry_info_cache, .name = "ecryptfs_dentry_info_cache", .size = sizeof(struct ecryptfs_dentry_info), }, { .cache = &ecryptfs_inode_info_cache, .name = "ecryptfs_inode_cache", .size = sizeof(struct ecryptfs_inode_info), .flags = SLAB_ACCOUNT, .ctor = inode_info_init_once, }, { .cache = &ecryptfs_sb_info_cache, .name = "ecryptfs_sb_cache", .size = sizeof(struct ecryptfs_sb_info), }, { .cache = &ecryptfs_header_cache, .name = "ecryptfs_headers", .size = PAGE_SIZE, }, { .cache = &ecryptfs_xattr_cache, .name = "ecryptfs_xattr_cache", .size = PAGE_SIZE, }, { .cache = &ecryptfs_key_record_cache, .name = "ecryptfs_key_record_cache", .size = sizeof(struct ecryptfs_key_record), }, { .cache = &ecryptfs_key_sig_cache, .name = "ecryptfs_key_sig_cache", .size = sizeof(struct ecryptfs_key_sig), }, { .cache = &ecryptfs_global_auth_tok_cache, .name = "ecryptfs_global_auth_tok_cache", .size = sizeof(struct ecryptfs_global_auth_tok), }, { .cache = &ecryptfs_key_tfm_cache, .name = "ecryptfs_key_tfm_cache", .size = sizeof(struct ecryptfs_key_tfm), }, }; static void ecryptfs_free_kmem_caches(void) { int i; /* * Make sure all delayed rcu free inodes are flushed before we * destroy cache. */ rcu_barrier(); for (i = 0; i < ARRAY_SIZE(ecryptfs_cache_infos); i++) { struct ecryptfs_cache_info *info; info = &ecryptfs_cache_infos[i]; kmem_cache_destroy(*(info->cache)); } } /** * ecryptfs_init_kmem_caches * * Returns zero on success; non-zero otherwise */ static int ecryptfs_init_kmem_caches(void) { int i; for (i = 0; i < ARRAY_SIZE(ecryptfs_cache_infos); i++) { struct ecryptfs_cache_info *info; info = &ecryptfs_cache_infos[i]; *(info->cache) = kmem_cache_create(info->name, info->size, 0, SLAB_HWCACHE_ALIGN | info->flags, info->ctor); if (!*(info->cache)) { ecryptfs_free_kmem_caches(); ecryptfs_printk(KERN_WARNING, "%s: " "kmem_cache_create failed\n", info->name); return -ENOMEM; } } return 0; } static struct kobject *ecryptfs_kobj; static ssize_t version_show(struct kobject *kobj, struct kobj_attribute *attr, char *buff) { return snprintf(buff, PAGE_SIZE, "%d\n", ECRYPTFS_VERSIONING_MASK); } static struct kobj_attribute version_attr = __ATTR_RO(version); static struct attribute *attributes[] = { &version_attr.attr, NULL, }; static const struct attribute_group attr_group = { .attrs = attributes, }; static int do_sysfs_registration(void) { int rc; ecryptfs_kobj = kobject_create_and_add("ecryptfs", fs_kobj); if (!ecryptfs_kobj) { printk(KERN_ERR "Unable to create ecryptfs kset\n"); rc = -ENOMEM; goto out; } rc = sysfs_create_group(ecryptfs_kobj, &attr_group); if (rc) { printk(KERN_ERR "Unable to create ecryptfs version attributes\n"); kobject_put(ecryptfs_kobj); } out: return rc; } static void do_sysfs_unregistration(void) { sysfs_remove_group(ecryptfs_kobj, &attr_group); kobject_put(ecryptfs_kobj); } static int __init ecryptfs_init(void) { int rc; if (ECRYPTFS_DEFAULT_EXTENT_SIZE > PAGE_SIZE) { rc = -EINVAL; ecryptfs_printk(KERN_ERR, "The eCryptfs extent size is " "larger than the host's page size, and so " "eCryptfs cannot run on this system. The " "default eCryptfs extent size is [%u] bytes; " "the page size is [%lu] bytes.\n", ECRYPTFS_DEFAULT_EXTENT_SIZE, (unsigned long)PAGE_SIZE); goto out; } rc = ecryptfs_init_kmem_caches(); if (rc) { printk(KERN_ERR "Failed to allocate one or more kmem_cache objects\n"); goto out; } rc = do_sysfs_registration(); if (rc) { printk(KERN_ERR "sysfs registration failed\n"); goto out_free_kmem_caches; } rc = ecryptfs_init_kthread(); if (rc) { printk(KERN_ERR "%s: kthread initialization failed; " "rc = [%d]\n", __func__, rc); goto out_do_sysfs_unregistration; } rc = ecryptfs_init_messaging(); if (rc) { printk(KERN_ERR "Failure occurred while attempting to " "initialize the communications channel to " "ecryptfsd\n"); goto out_destroy_kthread; } rc = ecryptfs_init_crypto(); if (rc) { printk(KERN_ERR "Failure whilst attempting to init crypto; " "rc = [%d]\n", rc); goto out_release_messaging; } rc = register_filesystem(&ecryptfs_fs_type); if (rc) { printk(KERN_ERR "Failed to register filesystem\n"); goto out_destroy_crypto; } if (ecryptfs_verbosity > 0) printk(KERN_CRIT "eCryptfs verbosity set to %d. Secret values " "will be written to the syslog!\n", ecryptfs_verbosity); goto out; out_destroy_crypto: ecryptfs_destroy_crypto(); out_release_messaging: ecryptfs_release_messaging(); out_destroy_kthread: ecryptfs_destroy_kthread(); out_do_sysfs_unregistration: do_sysfs_unregistration(); out_free_kmem_caches: ecryptfs_free_kmem_caches(); out: return rc; } static void __exit ecryptfs_exit(void) { int rc; rc = ecryptfs_destroy_crypto(); if (rc) printk(KERN_ERR "Failure whilst attempting to destroy crypto; " "rc = [%d]\n", rc); ecryptfs_release_messaging(); ecryptfs_destroy_kthread(); do_sysfs_unregistration(); unregister_filesystem(&ecryptfs_fs_type); ecryptfs_free_kmem_caches(); } MODULE_AUTHOR("Michael A. Halcrow <mhalcrow@us.ibm.com>"); MODULE_DESCRIPTION("eCryptfs"); MODULE_LICENSE("GPL"); module_init(ecryptfs_init) module_exit(ecryptfs_exit) |
9 9 9 13 13 9 9 9 9 9 9 9 9 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 | // SPDX-License-Identifier: GPL-2.0-only /* * scsi_logging.c * * Copyright (C) 2014 SUSE Linux Products GmbH * Copyright (C) 2014 Hannes Reinecke <hare@suse.de> */ #include <linux/kernel.h> #include <linux/atomic.h> #include <scsi/scsi.h> #include <scsi/scsi_cmnd.h> #include <scsi/scsi_device.h> #include <scsi/scsi_eh.h> #include <scsi/scsi_dbg.h> static char *scsi_log_reserve_buffer(size_t *len) { *len = 128; return kmalloc(*len, GFP_ATOMIC); } static void scsi_log_release_buffer(char *bufptr) { kfree(bufptr); } static inline const char *scmd_name(const struct scsi_cmnd *scmd) { struct request *rq = scsi_cmd_to_rq((struct scsi_cmnd *)scmd); if (!rq->q || !rq->q->disk) return NULL; return rq->q->disk->disk_name; } static size_t sdev_format_header(char *logbuf, size_t logbuf_len, const char *name, int tag) { size_t off = 0; if (name) off += scnprintf(logbuf + off, logbuf_len - off, "[%s] ", name); if (WARN_ON(off >= logbuf_len)) return off; if (tag >= 0) off += scnprintf(logbuf + off, logbuf_len - off, "tag#%d ", tag); return off; } void sdev_prefix_printk(const char *level, const struct scsi_device *sdev, const char *name, const char *fmt, ...) { va_list args; char *logbuf; size_t off = 0, logbuf_len; if (!sdev) return; logbuf = scsi_log_reserve_buffer(&logbuf_len); if (!logbuf) return; if (name) off += scnprintf(logbuf + off, logbuf_len - off, "[%s] ", name); if (!WARN_ON(off >= logbuf_len)) { va_start(args, fmt); off += vscnprintf(logbuf + off, logbuf_len - off, fmt, args); va_end(args); } dev_printk(level, &sdev->sdev_gendev, "%s", logbuf); scsi_log_release_buffer(logbuf); } EXPORT_SYMBOL(sdev_prefix_printk); void scmd_printk(const char *level, const struct scsi_cmnd *scmd, const char *fmt, ...) { va_list args; char *logbuf; size_t off = 0, logbuf_len; if (!scmd) return; logbuf = scsi_log_reserve_buffer(&logbuf_len); if (!logbuf) return; off = sdev_format_header(logbuf, logbuf_len, scmd_name(scmd), scsi_cmd_to_rq((struct scsi_cmnd *)scmd)->tag); if (off < logbuf_len) { va_start(args, fmt); off += vscnprintf(logbuf + off, logbuf_len - off, fmt, args); va_end(args); } dev_printk(level, &scmd->device->sdev_gendev, "%s", logbuf); scsi_log_release_buffer(logbuf); } EXPORT_SYMBOL(scmd_printk); static size_t scsi_format_opcode_name(char *buffer, size_t buf_len, const unsigned char *cdbp) { int sa, cdb0; const char *cdb_name = NULL, *sa_name = NULL; size_t off; cdb0 = cdbp[0]; if (cdb0 == VARIABLE_LENGTH_CMD) { int len = scsi_varlen_cdb_length(cdbp); if (len < 10) { off = scnprintf(buffer, buf_len, "short variable length command, len=%d", len); return off; } sa = (cdbp[8] << 8) + cdbp[9]; } else sa = cdbp[1] & 0x1f; if (!scsi_opcode_sa_name(cdb0, sa, &cdb_name, &sa_name)) { if (cdb_name) off = scnprintf(buffer, buf_len, "%s", cdb_name); else { off = scnprintf(buffer, buf_len, "opcode=0x%x", cdb0); if (WARN_ON(off >= buf_len)) return off; if (cdb0 >= VENDOR_SPECIFIC_CDB) off += scnprintf(buffer + off, buf_len - off, " (vendor)"); else if (cdb0 >= 0x60 && cdb0 < 0x7e) off += scnprintf(buffer + off, buf_len - off, " (reserved)"); } } else { if (sa_name) off = scnprintf(buffer, buf_len, "%s", sa_name); else if (cdb_name) off = scnprintf(buffer, buf_len, "%s, sa=0x%x", cdb_name, sa); else off = scnprintf(buffer, buf_len, "opcode=0x%x, sa=0x%x", cdb0, sa); } WARN_ON(off >= buf_len); return off; } size_t __scsi_format_command(char *logbuf, size_t logbuf_len, const unsigned char *cdb, size_t cdb_len) { int len, k; size_t off; off = scsi_format_opcode_name(logbuf, logbuf_len, cdb); if (off >= logbuf_len) return off; len = scsi_command_size(cdb); if (cdb_len < len) len = cdb_len; /* print out all bytes in cdb */ for (k = 0; k < len; ++k) { if (off > logbuf_len - 3) break; off += scnprintf(logbuf + off, logbuf_len - off, " %02x", cdb[k]); } return off; } EXPORT_SYMBOL(__scsi_format_command); void scsi_print_command(struct scsi_cmnd *cmd) { int k; char *logbuf; size_t off, logbuf_len; logbuf = scsi_log_reserve_buffer(&logbuf_len); if (!logbuf) return; off = sdev_format_header(logbuf, logbuf_len, scmd_name(cmd), scsi_cmd_to_rq(cmd)->tag); if (off >= logbuf_len) goto out_printk; off += scnprintf(logbuf + off, logbuf_len - off, "CDB: "); if (WARN_ON(off >= logbuf_len)) goto out_printk; off += scsi_format_opcode_name(logbuf + off, logbuf_len - off, cmd->cmnd); if (off >= logbuf_len) goto out_printk; /* print out all bytes in cdb */ if (cmd->cmd_len > 16) { /* Print opcode in one line and use separate lines for CDB */ off += scnprintf(logbuf + off, logbuf_len - off, "\n"); dev_printk(KERN_INFO, &cmd->device->sdev_gendev, "%s", logbuf); for (k = 0; k < cmd->cmd_len; k += 16) { size_t linelen = min(cmd->cmd_len - k, 16); off = sdev_format_header(logbuf, logbuf_len, scmd_name(cmd), scsi_cmd_to_rq(cmd)->tag); if (!WARN_ON(off > logbuf_len - 58)) { off += scnprintf(logbuf + off, logbuf_len - off, "CDB[%02x]: ", k); hex_dump_to_buffer(&cmd->cmnd[k], linelen, 16, 1, logbuf + off, logbuf_len - off, false); } dev_printk(KERN_INFO, &cmd->device->sdev_gendev, "%s", logbuf); } goto out; } if (!WARN_ON(off > logbuf_len - 49)) { off += scnprintf(logbuf + off, logbuf_len - off, " "); hex_dump_to_buffer(cmd->cmnd, cmd->cmd_len, 16, 1, logbuf + off, logbuf_len - off, false); } out_printk: dev_printk(KERN_INFO, &cmd->device->sdev_gendev, "%s", logbuf); out: scsi_log_release_buffer(logbuf); } EXPORT_SYMBOL(scsi_print_command); static size_t scsi_format_extd_sense(char *buffer, size_t buf_len, unsigned char asc, unsigned char ascq) { size_t off = 0; const char *extd_sense_fmt = NULL; const char *extd_sense_str = scsi_extd_sense_format(asc, ascq, &extd_sense_fmt); if (extd_sense_str) { off = scnprintf(buffer, buf_len, "Add. Sense: %s", extd_sense_str); if (extd_sense_fmt) off += scnprintf(buffer + off, buf_len - off, "(%s%x)", extd_sense_fmt, ascq); } else { if (asc >= 0x80) off = scnprintf(buffer, buf_len, "<<vendor>>"); off += scnprintf(buffer + off, buf_len - off, "ASC=0x%x ", asc); if (ascq >= 0x80) off += scnprintf(buffer + off, buf_len - off, "<<vendor>>"); off += scnprintf(buffer + off, buf_len - off, "ASCQ=0x%x ", ascq); } return off; } static size_t scsi_format_sense_hdr(char *buffer, size_t buf_len, const struct scsi_sense_hdr *sshdr) { const char *sense_txt; size_t off; off = scnprintf(buffer, buf_len, "Sense Key : "); sense_txt = scsi_sense_key_string(sshdr->sense_key); if (sense_txt) off += scnprintf(buffer + off, buf_len - off, "%s ", sense_txt); else off += scnprintf(buffer + off, buf_len - off, "0x%x ", sshdr->sense_key); off += scnprintf(buffer + off, buf_len - off, scsi_sense_is_deferred(sshdr) ? "[deferred] " : "[current] "); if (sshdr->response_code >= 0x72) off += scnprintf(buffer + off, buf_len - off, "[descriptor] "); return off; } static void scsi_log_dump_sense(const struct scsi_device *sdev, const char *name, int tag, const unsigned char *sense_buffer, int sense_len) { char *logbuf; size_t logbuf_len; int i; logbuf = scsi_log_reserve_buffer(&logbuf_len); if (!logbuf) return; for (i = 0; i < sense_len; i += 16) { int len = min(sense_len - i, 16); size_t off; off = sdev_format_header(logbuf, logbuf_len, name, tag); hex_dump_to_buffer(&sense_buffer[i], len, 16, 1, logbuf + off, logbuf_len - off, false); dev_printk(KERN_INFO, &sdev->sdev_gendev, "%s", logbuf); } scsi_log_release_buffer(logbuf); } static void scsi_log_print_sense_hdr(const struct scsi_device *sdev, const char *name, int tag, const struct scsi_sense_hdr *sshdr) { char *logbuf; size_t off, logbuf_len; logbuf = scsi_log_reserve_buffer(&logbuf_len); if (!logbuf) return; off = sdev_format_header(logbuf, logbuf_len, name, tag); off += scsi_format_sense_hdr(logbuf + off, logbuf_len - off, sshdr); dev_printk(KERN_INFO, &sdev->sdev_gendev, "%s", logbuf); scsi_log_release_buffer(logbuf); logbuf = scsi_log_reserve_buffer(&logbuf_len); if (!logbuf) return; off = sdev_format_header(logbuf, logbuf_len, name, tag); off += scsi_format_extd_sense(logbuf + off, logbuf_len - off, sshdr->asc, sshdr->ascq); dev_printk(KERN_INFO, &sdev->sdev_gendev, "%s", logbuf); scsi_log_release_buffer(logbuf); } static void scsi_log_print_sense(const struct scsi_device *sdev, const char *name, int tag, const unsigned char *sense_buffer, int sense_len) { struct scsi_sense_hdr sshdr; if (scsi_normalize_sense(sense_buffer, sense_len, &sshdr)) scsi_log_print_sense_hdr(sdev, name, tag, &sshdr); else scsi_log_dump_sense(sdev, name, tag, sense_buffer, sense_len); } /* * Print normalized SCSI sense header with a prefix. */ void scsi_print_sense_hdr(const struct scsi_device *sdev, const char *name, const struct scsi_sense_hdr *sshdr) { scsi_log_print_sense_hdr(sdev, name, -1, sshdr); } EXPORT_SYMBOL(scsi_print_sense_hdr); /* Normalize and print sense buffer with name prefix */ void __scsi_print_sense(const struct scsi_device *sdev, const char *name, const unsigned char *sense_buffer, int sense_len) { scsi_log_print_sense(sdev, name, -1, sense_buffer, sense_len); } EXPORT_SYMBOL(__scsi_print_sense); /* Normalize and print sense buffer in SCSI command */ void scsi_print_sense(const struct scsi_cmnd *cmd) { scsi_log_print_sense(cmd->device, scmd_name(cmd), scsi_cmd_to_rq((struct scsi_cmnd *)cmd)->tag, cmd->sense_buffer, SCSI_SENSE_BUFFERSIZE); } EXPORT_SYMBOL(scsi_print_sense); void scsi_print_result(const struct scsi_cmnd *cmd, const char *msg, int disposition) { char *logbuf; size_t off, logbuf_len; const char *mlret_string = scsi_mlreturn_string(disposition); const char *hb_string = scsi_hostbyte_string(cmd->result); unsigned long cmd_age = (jiffies - cmd->jiffies_at_alloc) / HZ; logbuf = scsi_log_reserve_buffer(&logbuf_len); if (!logbuf) return; off = sdev_format_header(logbuf, logbuf_len, scmd_name(cmd), scsi_cmd_to_rq((struct scsi_cmnd *)cmd)->tag); if (off >= logbuf_len) goto out_printk; if (msg) { off += scnprintf(logbuf + off, logbuf_len - off, "%s: ", msg); if (WARN_ON(off >= logbuf_len)) goto out_printk; } if (mlret_string) off += scnprintf(logbuf + off, logbuf_len - off, "%s ", mlret_string); else off += scnprintf(logbuf + off, logbuf_len - off, "UNKNOWN(0x%02x) ", disposition); if (WARN_ON(off >= logbuf_len)) goto out_printk; off += scnprintf(logbuf + off, logbuf_len - off, "Result: "); if (WARN_ON(off >= logbuf_len)) goto out_printk; if (hb_string) off += scnprintf(logbuf + off, logbuf_len - off, "hostbyte=%s ", hb_string); else off += scnprintf(logbuf + off, logbuf_len - off, "hostbyte=0x%02x ", host_byte(cmd->result)); if (WARN_ON(off >= logbuf_len)) goto out_printk; off += scnprintf(logbuf + off, logbuf_len - off, "driverbyte=DRIVER_OK "); off += scnprintf(logbuf + off, logbuf_len - off, "cmd_age=%lus", cmd_age); out_printk: dev_printk(KERN_INFO, &cmd->device->sdev_gendev, "%s", logbuf); scsi_log_release_buffer(logbuf); } EXPORT_SYMBOL(scsi_print_result); |
6 3 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 | // SPDX-License-Identifier: GPL-2.0-only /* module that allows mangling of the arp payload */ #include <linux/module.h> #include <linux/netfilter.h> #include <linux/netfilter_arp/arpt_mangle.h> #include <net/sock.h> MODULE_LICENSE("GPL"); MODULE_AUTHOR("Bart De Schuymer <bdschuym@pandora.be>"); MODULE_DESCRIPTION("arptables arp payload mangle target"); static unsigned int target(struct sk_buff *skb, const struct xt_action_param *par) { const struct arpt_mangle *mangle = par->targinfo; const struct arphdr *arp; unsigned char *arpptr; int pln, hln; if (skb_ensure_writable(skb, skb->len)) return NF_DROP; arp = arp_hdr(skb); arpptr = skb_network_header(skb) + sizeof(*arp); pln = arp->ar_pln; hln = arp->ar_hln; /* We assume that pln and hln were checked in the match */ if (mangle->flags & ARPT_MANGLE_SDEV) { if (ARPT_DEV_ADDR_LEN_MAX < hln || (arpptr + hln > skb_tail_pointer(skb))) return NF_DROP; memcpy(arpptr, mangle->src_devaddr, hln); } arpptr += hln; if (mangle->flags & ARPT_MANGLE_SIP) { if (ARPT_MANGLE_ADDR_LEN_MAX < pln || (arpptr + pln > skb_tail_pointer(skb))) return NF_DROP; memcpy(arpptr, &mangle->u_s.src_ip, pln); } arpptr += pln; if (mangle->flags & ARPT_MANGLE_TDEV) { if (ARPT_DEV_ADDR_LEN_MAX < hln || (arpptr + hln > skb_tail_pointer(skb))) return NF_DROP; memcpy(arpptr, mangle->tgt_devaddr, hln); } arpptr += hln; if (mangle->flags & ARPT_MANGLE_TIP) { if (ARPT_MANGLE_ADDR_LEN_MAX < pln || (arpptr + pln > skb_tail_pointer(skb))) return NF_DROP; memcpy(arpptr, &mangle->u_t.tgt_ip, pln); } return mangle->target; } static int checkentry(const struct xt_tgchk_param *par) { const struct arpt_mangle *mangle = par->targinfo; if (mangle->flags & ~ARPT_MANGLE_MASK || !(mangle->flags & ARPT_MANGLE_MASK)) return -EINVAL; if (mangle->target != NF_DROP && mangle->target != NF_ACCEPT && mangle->target != XT_CONTINUE) return -EINVAL; return 0; } static struct xt_target arpt_mangle_reg __read_mostly = { .name = "mangle", .family = NFPROTO_ARP, .target = target, .targetsize = sizeof(struct arpt_mangle), .checkentry = checkentry, .me = THIS_MODULE, }; static int __init arpt_mangle_init(void) { return xt_register_target(&arpt_mangle_reg); } static void __exit arpt_mangle_fini(void) { xt_unregister_target(&arpt_mangle_reg); } module_init(arpt_mangle_init); module_exit(arpt_mangle_fini); |
30 30 2 30 27 12 1 12 1 12 12 3 3 11 12 12 8 7 12 1 8 1 12 12 12 12 12 12 16 12 4 4 12 12 6 12 12 12 12 12 6 12 6 12 12 12 12 12 12 12 18 18 5 1 4 4 14 1 4 4 4 4 4 30 27 3 29 30 30 74 69 5 22 17 12 6 3 64 4 62 46 4 13 10 2 3 4 27 10 5 9 3 2 80 2 6 29 1 105 104 93 12 3 2 2 5 55 2 14 2 2 11 3 20 13 19 1 59 3 1 1 82 2 1 71 2 2 11 1 51 15 61 53 54 10 25 35 3 30 5 31 14 28 2 30 30 2 30 80 80 48 32 13 13 13 7 13 13 13 6 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 | /* * Copyright (c) 2006, 2018 Oracle and/or its affiliates. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. * */ #include <linux/kernel.h> #include <linux/moduleparam.h> #include <linux/gfp.h> #include <net/sock.h> #include <linux/in.h> #include <linux/list.h> #include <linux/ratelimit.h> #include <linux/export.h> #include <linux/sizes.h> #include "rds.h" /* When transmitting messages in rds_send_xmit, we need to emerge from * time to time and briefly release the CPU. Otherwise the softlock watchdog * will kick our shin. * Also, it seems fairer to not let one busy connection stall all the * others. * * send_batch_count is the number of times we'll loop in send_xmit. Setting * it to 0 will restore the old behavior (where we looped until we had * drained the queue). */ static int send_batch_count = SZ_1K; module_param(send_batch_count, int, 0444); MODULE_PARM_DESC(send_batch_count, " batch factor when working the send queue"); static void rds_send_remove_from_sock(struct list_head *messages, int status); /* * Reset the send state. Callers must ensure that this doesn't race with * rds_send_xmit(). */ void rds_send_path_reset(struct rds_conn_path *cp) { struct rds_message *rm, *tmp; unsigned long flags; if (cp->cp_xmit_rm) { rm = cp->cp_xmit_rm; cp->cp_xmit_rm = NULL; /* Tell the user the RDMA op is no longer mapped by the * transport. This isn't entirely true (it's flushed out * independently) but as the connection is down, there's * no ongoing RDMA to/from that memory */ rds_message_unmapped(rm); rds_message_put(rm); } cp->cp_xmit_sg = 0; cp->cp_xmit_hdr_off = 0; cp->cp_xmit_data_off = 0; cp->cp_xmit_atomic_sent = 0; cp->cp_xmit_rdma_sent = 0; cp->cp_xmit_data_sent = 0; cp->cp_conn->c_map_queued = 0; cp->cp_unacked_packets = rds_sysctl_max_unacked_packets; cp->cp_unacked_bytes = rds_sysctl_max_unacked_bytes; /* Mark messages as retransmissions, and move them to the send q */ spin_lock_irqsave(&cp->cp_lock, flags); list_for_each_entry_safe(rm, tmp, &cp->cp_retrans, m_conn_item) { set_bit(RDS_MSG_ACK_REQUIRED, &rm->m_flags); set_bit(RDS_MSG_RETRANSMITTED, &rm->m_flags); } list_splice_init(&cp->cp_retrans, &cp->cp_send_queue); spin_unlock_irqrestore(&cp->cp_lock, flags); } EXPORT_SYMBOL_GPL(rds_send_path_reset); static int acquire_in_xmit(struct rds_conn_path *cp) { return test_and_set_bit_lock(RDS_IN_XMIT, &cp->cp_flags) == 0; } static void release_in_xmit(struct rds_conn_path *cp) { clear_bit_unlock(RDS_IN_XMIT, &cp->cp_flags); /* * We don't use wait_on_bit()/wake_up_bit() because our waking is in a * hot path and finding waiters is very rare. We don't want to walk * the system-wide hashed waitqueue buckets in the fast path only to * almost never find waiters. */ if (waitqueue_active(&cp->cp_waitq)) wake_up_all(&cp->cp_waitq); } /* * We're making the conscious trade-off here to only send one message * down the connection at a time. * Pro: * - tx queueing is a simple fifo list * - reassembly is optional and easily done by transports per conn * - no per flow rx lookup at all, straight to the socket * - less per-frag memory and wire overhead * Con: * - queued acks can be delayed behind large messages * Depends: * - small message latency is higher behind queued large messages * - large message latency isn't starved by intervening small sends */ int rds_send_xmit(struct rds_conn_path *cp) { struct rds_connection *conn = cp->cp_conn; struct rds_message *rm; unsigned long flags; unsigned int tmp; struct scatterlist *sg; int ret = 0; LIST_HEAD(to_be_dropped); int batch_count; unsigned long send_gen = 0; int same_rm = 0; restart: batch_count = 0; /* * sendmsg calls here after having queued its message on the send * queue. We only have one task feeding the connection at a time. If * another thread is already feeding the queue then we back off. This * avoids blocking the caller and trading per-connection data between * caches per message. */ if (!acquire_in_xmit(cp)) { rds_stats_inc(s_send_lock_contention); ret = -ENOMEM; goto out; } if (rds_destroy_pending(cp->cp_conn)) { release_in_xmit(cp); ret = -ENETUNREACH; /* dont requeue send work */ goto out; } /* * we record the send generation after doing the xmit acquire. * if someone else manages to jump in and do some work, we'll use * this to avoid a goto restart farther down. * * The acquire_in_xmit() check above ensures that only one * caller can increment c_send_gen at any time. */ send_gen = READ_ONCE(cp->cp_send_gen) + 1; WRITE_ONCE(cp->cp_send_gen, send_gen); /* * rds_conn_shutdown() sets the conn state and then tests RDS_IN_XMIT, * we do the opposite to avoid races. */ if (!rds_conn_path_up(cp)) { release_in_xmit(cp); ret = 0; goto out; } if (conn->c_trans->xmit_path_prepare) conn->c_trans->xmit_path_prepare(cp); /* * spin trying to push headers and data down the connection until * the connection doesn't make forward progress. */ while (1) { rm = cp->cp_xmit_rm; if (!rm) { same_rm = 0; } else { same_rm++; if (same_rm >= 4096) { rds_stats_inc(s_send_stuck_rm); ret = -EAGAIN; break; } } /* * If between sending messages, we can send a pending congestion * map update. */ if (!rm && test_and_clear_bit(0, &conn->c_map_queued)) { rm = rds_cong_update_alloc(conn); if (IS_ERR(rm)) { ret = PTR_ERR(rm); break; } rm->data.op_active = 1; rm->m_inc.i_conn_path = cp; rm->m_inc.i_conn = cp->cp_conn; cp->cp_xmit_rm = rm; } /* * If not already working on one, grab the next message. * * cp_xmit_rm holds a ref while we're sending this message down * the connction. We can use this ref while holding the * send_sem.. rds_send_reset() is serialized with it. */ if (!rm) { unsigned int len; batch_count++; /* we want to process as big a batch as we can, but * we also want to avoid softlockups. If we've been * through a lot of messages, lets back off and see * if anyone else jumps in */ if (batch_count >= send_batch_count) goto over_batch; spin_lock_irqsave(&cp->cp_lock, flags); if (!list_empty(&cp->cp_send_queue)) { rm = list_entry(cp->cp_send_queue.next, struct rds_message, m_conn_item); rds_message_addref(rm); /* * Move the message from the send queue to the retransmit * list right away. */ list_move_tail(&rm->m_conn_item, &cp->cp_retrans); } spin_unlock_irqrestore(&cp->cp_lock, flags); if (!rm) break; /* Unfortunately, the way Infiniband deals with * RDMA to a bad MR key is by moving the entire * queue pair to error state. We could possibly * recover from that, but right now we drop the * connection. * Therefore, we never retransmit messages with RDMA ops. */ if (test_bit(RDS_MSG_FLUSH, &rm->m_flags) || (rm->rdma.op_active && test_bit(RDS_MSG_RETRANSMITTED, &rm->m_flags))) { spin_lock_irqsave(&cp->cp_lock, flags); if (test_and_clear_bit(RDS_MSG_ON_CONN, &rm->m_flags)) list_move(&rm->m_conn_item, &to_be_dropped); spin_unlock_irqrestore(&cp->cp_lock, flags); continue; } /* Require an ACK every once in a while */ len = ntohl(rm->m_inc.i_hdr.h_len); if (cp->cp_unacked_packets == 0 || cp->cp_unacked_bytes < len) { set_bit(RDS_MSG_ACK_REQUIRED, &rm->m_flags); cp->cp_unacked_packets = rds_sysctl_max_unacked_packets; cp->cp_unacked_bytes = rds_sysctl_max_unacked_bytes; rds_stats_inc(s_send_ack_required); } else { cp->cp_unacked_bytes -= len; cp->cp_unacked_packets--; } cp->cp_xmit_rm = rm; } /* The transport either sends the whole rdma or none of it */ if (rm->rdma.op_active && !cp->cp_xmit_rdma_sent) { rm->m_final_op = &rm->rdma; /* The transport owns the mapped memory for now. * You can't unmap it while it's on the send queue */ set_bit(RDS_MSG_MAPPED, &rm->m_flags); ret = conn->c_trans->xmit_rdma(conn, &rm->rdma); if (ret) { clear_bit(RDS_MSG_MAPPED, &rm->m_flags); wake_up_interruptible(&rm->m_flush_wait); break; } cp->cp_xmit_rdma_sent = 1; } if (rm->atomic.op_active && !cp->cp_xmit_atomic_sent) { rm->m_final_op = &rm->atomic; /* The transport owns the mapped memory for now. * You can't unmap it while it's on the send queue */ set_bit(RDS_MSG_MAPPED, &rm->m_flags); ret = conn->c_trans->xmit_atomic(conn, &rm->atomic); if (ret) { clear_bit(RDS_MSG_MAPPED, &rm->m_flags); wake_up_interruptible(&rm->m_flush_wait); break; } cp->cp_xmit_atomic_sent = 1; } /* * A number of cases require an RDS header to be sent * even if there is no data. * We permit 0-byte sends; rds-ping depends on this. * However, if there are exclusively attached silent ops, * we skip the hdr/data send, to enable silent operation. */ if (rm->data.op_nents == 0) { int ops_present; int all_ops_are_silent = 1; ops_present = (rm->atomic.op_active || rm->rdma.op_active); if (rm->atomic.op_active && !rm->atomic.op_silent) all_ops_are_silent = 0; if (rm->rdma.op_active && !rm->rdma.op_silent) all_ops_are_silent = 0; if (ops_present && all_ops_are_silent && !rm->m_rdma_cookie) rm->data.op_active = 0; } if (rm->data.op_active && !cp->cp_xmit_data_sent) { rm->m_final_op = &rm->data; ret = conn->c_trans->xmit(conn, rm, cp->cp_xmit_hdr_off, cp->cp_xmit_sg, cp->cp_xmit_data_off); if (ret <= 0) break; if (cp->cp_xmit_hdr_off < sizeof(struct rds_header)) { tmp = min_t(int, ret, sizeof(struct rds_header) - cp->cp_xmit_hdr_off); cp->cp_xmit_hdr_off += tmp; ret -= tmp; } sg = &rm->data.op_sg[cp->cp_xmit_sg]; while (ret) { tmp = min_t(int, ret, sg->length - cp->cp_xmit_data_off); cp->cp_xmit_data_off += tmp; ret -= tmp; if (cp->cp_xmit_data_off == sg->length) { cp->cp_xmit_data_off = 0; sg++; cp->cp_xmit_sg++; BUG_ON(ret != 0 && cp->cp_xmit_sg == rm->data.op_nents); } } if (cp->cp_xmit_hdr_off == sizeof(struct rds_header) && (cp->cp_xmit_sg == rm->data.op_nents)) cp->cp_xmit_data_sent = 1; } /* * A rm will only take multiple times through this loop * if there is a data op. Thus, if the data is sent (or there was * none), then we're done with the rm. */ if (!rm->data.op_active || cp->cp_xmit_data_sent) { cp->cp_xmit_rm = NULL; cp->cp_xmit_sg = 0; cp->cp_xmit_hdr_off = 0; cp->cp_xmit_data_off = 0; cp->cp_xmit_rdma_sent = 0; cp->cp_xmit_atomic_sent = 0; cp->cp_xmit_data_sent = 0; rds_message_put(rm); } } over_batch: if (conn->c_trans->xmit_path_complete) conn->c_trans->xmit_path_complete(cp); release_in_xmit(cp); /* Nuke any messages we decided not to retransmit. */ if (!list_empty(&to_be_dropped)) { /* irqs on here, so we can put(), unlike above */ list_for_each_entry(rm, &to_be_dropped, m_conn_item) rds_message_put(rm); rds_send_remove_from_sock(&to_be_dropped, RDS_RDMA_DROPPED); } /* * Other senders can queue a message after we last test the send queue * but before we clear RDS_IN_XMIT. In that case they'd back off and * not try and send their newly queued message. We need to check the * send queue after having cleared RDS_IN_XMIT so that their message * doesn't get stuck on the send queue. * * If the transport cannot continue (i.e ret != 0), then it must * call us when more room is available, such as from the tx * completion handler. * * We have an extra generation check here so that if someone manages * to jump in after our release_in_xmit, we'll see that they have done * some work and we will skip our goto */ if (ret == 0) { bool raced; smp_mb(); raced = send_gen != READ_ONCE(cp->cp_send_gen); if ((test_bit(0, &conn->c_map_queued) || !list_empty(&cp->cp_send_queue)) && !raced) { if (batch_count < send_batch_count) goto restart; rcu_read_lock(); if (rds_destroy_pending(cp->cp_conn)) ret = -ENETUNREACH; else queue_delayed_work(rds_wq, &cp->cp_send_w, 1); rcu_read_unlock(); } else if (raced) { rds_stats_inc(s_send_lock_queue_raced); } } out: return ret; } EXPORT_SYMBOL_GPL(rds_send_xmit); static void rds_send_sndbuf_remove(struct rds_sock *rs, struct rds_message *rm) { u32 len = be32_to_cpu(rm->m_inc.i_hdr.h_len); assert_spin_locked(&rs->rs_lock); BUG_ON(rs->rs_snd_bytes < len); rs->rs_snd_bytes -= len; if (rs->rs_snd_bytes == 0) rds_stats_inc(s_send_queue_empty); } static inline int rds_send_is_acked(struct rds_message *rm, u64 ack, is_acked_func is_acked) { if (is_acked) return is_acked(rm, ack); return be64_to_cpu(rm->m_inc.i_hdr.h_sequence) <= ack; } /* * This is pretty similar to what happens below in the ACK * handling code - except that we call here as soon as we get * the IB send completion on the RDMA op and the accompanying * message. */ void rds_rdma_send_complete(struct rds_message *rm, int status) { struct rds_sock *rs = NULL; struct rm_rdma_op *ro; struct rds_notifier *notifier; unsigned long flags; spin_lock_irqsave(&rm->m_rs_lock, flags); ro = &rm->rdma; if (test_bit(RDS_MSG_ON_SOCK, &rm->m_flags) && ro->op_active && ro->op_notify && ro->op_notifier) { notifier = ro->op_notifier; rs = rm->m_rs; sock_hold(rds_rs_to_sk(rs)); notifier->n_status = status; spin_lock(&rs->rs_lock); list_add_tail(¬ifier->n_list, &rs->rs_notify_queue); spin_unlock(&rs->rs_lock); ro->op_notifier = NULL; } spin_unlock_irqrestore(&rm->m_rs_lock, flags); if (rs) { rds_wake_sk_sleep(rs); sock_put(rds_rs_to_sk(rs)); } } EXPORT_SYMBOL_GPL(rds_rdma_send_complete); /* * Just like above, except looks at atomic op */ void rds_atomic_send_complete(struct rds_message *rm, int status) { struct rds_sock *rs = NULL; struct rm_atomic_op *ao; struct rds_notifier *notifier; unsigned long flags; spin_lock_irqsave(&rm->m_rs_lock, flags); ao = &rm->atomic; if (test_bit(RDS_MSG_ON_SOCK, &rm->m_flags) && ao->op_active && ao->op_notify && ao->op_notifier) { notifier = ao->op_notifier; rs = rm->m_rs; sock_hold(rds_rs_to_sk(rs)); notifier->n_status = status; spin_lock(&rs->rs_lock); list_add_tail(¬ifier->n_list, &rs->rs_notify_queue); spin_unlock(&rs->rs_lock); ao->op_notifier = NULL; } spin_unlock_irqrestore(&rm->m_rs_lock, flags); if (rs) { rds_wake_sk_sleep(rs); sock_put(rds_rs_to_sk(rs)); } } EXPORT_SYMBOL_GPL(rds_atomic_send_complete); /* * This is the same as rds_rdma_send_complete except we * don't do any locking - we have all the ingredients (message, * socket, socket lock) and can just move the notifier. */ static inline void __rds_send_complete(struct rds_sock *rs, struct rds_message *rm, int status) { struct rm_rdma_op *ro; struct rm_atomic_op *ao; ro = &rm->rdma; if (ro->op_active && ro->op_notify && ro->op_notifier) { ro->op_notifier->n_status = status; list_add_tail(&ro->op_notifier->n_list, &rs->rs_notify_queue); ro->op_notifier = NULL; } ao = &rm->atomic; if (ao->op_active && ao->op_notify && ao->op_notifier) { ao->op_notifier->n_status = status; list_add_tail(&ao->op_notifier->n_list, &rs->rs_notify_queue); ao->op_notifier = NULL; } /* No need to wake the app - caller does this */ } /* * This removes messages from the socket's list if they're on it. The list * argument must be private to the caller, we must be able to modify it * without locks. The messages must have a reference held for their * position on the list. This function will drop that reference after * removing the messages from the 'messages' list regardless of if it found * the messages on the socket list or not. */ static void rds_send_remove_from_sock(struct list_head *messages, int status) { unsigned long flags; struct rds_sock *rs = NULL; struct rds_message *rm; while (!list_empty(messages)) { int was_on_sock = 0; rm = list_entry(messages->next, struct rds_message, m_conn_item); list_del_init(&rm->m_conn_item); /* * If we see this flag cleared then we're *sure* that someone * else beat us to removing it from the sock. If we race * with their flag update we'll get the lock and then really * see that the flag has been cleared. * * The message spinlock makes sure nobody clears rm->m_rs * while we're messing with it. It does not prevent the * message from being removed from the socket, though. */ spin_lock_irqsave(&rm->m_rs_lock, flags); if (!test_bit(RDS_MSG_ON_SOCK, &rm->m_flags)) goto unlock_and_drop; if (rs != rm->m_rs) { if (rs) { rds_wake_sk_sleep(rs); sock_put(rds_rs_to_sk(rs)); } rs = rm->m_rs; if (rs) sock_hold(rds_rs_to_sk(rs)); } if (!rs) goto unlock_and_drop; spin_lock(&rs->rs_lock); if (test_and_clear_bit(RDS_MSG_ON_SOCK, &rm->m_flags)) { struct rm_rdma_op *ro = &rm->rdma; struct rds_notifier *notifier; list_del_init(&rm->m_sock_item); rds_send_sndbuf_remove(rs, rm); if (ro->op_active && ro->op_notifier && (ro->op_notify || (ro->op_recverr && status))) { notifier = ro->op_notifier; list_add_tail(¬ifier->n_list, &rs->rs_notify_queue); if (!notifier->n_status) notifier->n_status = status; rm->rdma.op_notifier = NULL; } was_on_sock = 1; } spin_unlock(&rs->rs_lock); unlock_and_drop: spin_unlock_irqrestore(&rm->m_rs_lock, flags); rds_message_put(rm); if (was_on_sock) rds_message_put(rm); } if (rs) { rds_wake_sk_sleep(rs); sock_put(rds_rs_to_sk(rs)); } } /* * Transports call here when they've determined that the receiver queued * messages up to, and including, the given sequence number. Messages are * moved to the retrans queue when rds_send_xmit picks them off the send * queue. This means that in the TCP case, the message may not have been * assigned the m_ack_seq yet - but that's fine as long as tcp_is_acked * checks the RDS_MSG_HAS_ACK_SEQ bit. */ void rds_send_path_drop_acked(struct rds_conn_path *cp, u64 ack, is_acked_func is_acked) { struct rds_message *rm, *tmp; unsigned long flags; LIST_HEAD(list); spin_lock_irqsave(&cp->cp_lock, flags); list_for_each_entry_safe(rm, tmp, &cp->cp_retrans, m_conn_item) { if (!rds_send_is_acked(rm, ack, is_acked)) break; list_move(&rm->m_conn_item, &list); clear_bit(RDS_MSG_ON_CONN, &rm->m_flags); } /* order flag updates with spin locks */ if (!list_empty(&list)) smp_mb__after_atomic(); spin_unlock_irqrestore(&cp->cp_lock, flags); /* now remove the messages from the sock list as needed */ rds_send_remove_from_sock(&list, RDS_RDMA_SUCCESS); } EXPORT_SYMBOL_GPL(rds_send_path_drop_acked); void rds_send_drop_acked(struct rds_connection *conn, u64 ack, is_acked_func is_acked) { WARN_ON(conn->c_trans->t_mp_capable); rds_send_path_drop_acked(&conn->c_path[0], ack, is_acked); } EXPORT_SYMBOL_GPL(rds_send_drop_acked); void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in6 *dest) { struct rds_message *rm, *tmp; struct rds_connection *conn; struct rds_conn_path *cp; unsigned long flags; LIST_HEAD(list); /* get all the messages we're dropping under the rs lock */ spin_lock_irqsave(&rs->rs_lock, flags); list_for_each_entry_safe(rm, tmp, &rs->rs_send_queue, m_sock_item) { if (dest && (!ipv6_addr_equal(&dest->sin6_addr, &rm->m_daddr) || dest->sin6_port != rm->m_inc.i_hdr.h_dport)) continue; list_move(&rm->m_sock_item, &list); rds_send_sndbuf_remove(rs, rm); clear_bit(RDS_MSG_ON_SOCK, &rm->m_flags); } /* order flag updates with the rs lock */ smp_mb__after_atomic(); spin_unlock_irqrestore(&rs->rs_lock, flags); if (list_empty(&list)) return; /* Remove the messages from the conn */ list_for_each_entry(rm, &list, m_sock_item) { conn = rm->m_inc.i_conn; if (conn->c_trans->t_mp_capable) cp = rm->m_inc.i_conn_path; else cp = &conn->c_path[0]; spin_lock_irqsave(&cp->cp_lock, flags); /* * Maybe someone else beat us to removing rm from the conn. * If we race with their flag update we'll get the lock and * then really see that the flag has been cleared. */ if (!test_and_clear_bit(RDS_MSG_ON_CONN, &rm->m_flags)) { spin_unlock_irqrestore(&cp->cp_lock, flags); continue; } list_del_init(&rm->m_conn_item); spin_unlock_irqrestore(&cp->cp_lock, flags); /* * Couldn't grab m_rs_lock in top loop (lock ordering), * but we can now. */ spin_lock_irqsave(&rm->m_rs_lock, flags); spin_lock(&rs->rs_lock); __rds_send_complete(rs, rm, RDS_RDMA_CANCELED); spin_unlock(&rs->rs_lock); spin_unlock_irqrestore(&rm->m_rs_lock, flags); rds_message_put(rm); } rds_wake_sk_sleep(rs); while (!list_empty(&list)) { rm = list_entry(list.next, struct rds_message, m_sock_item); list_del_init(&rm->m_sock_item); rds_message_wait(rm); /* just in case the code above skipped this message * because RDS_MSG_ON_CONN wasn't set, run it again here * taking m_rs_lock is the only thing that keeps us * from racing with ack processing. */ spin_lock_irqsave(&rm->m_rs_lock, flags); spin_lock(&rs->rs_lock); __rds_send_complete(rs, rm, RDS_RDMA_CANCELED); spin_unlock(&rs->rs_lock); spin_unlock_irqrestore(&rm->m_rs_lock, flags); rds_message_put(rm); } } /* * we only want this to fire once so we use the callers 'queued'. It's * possible that another thread can race with us and remove the * message from the flow with RDS_CANCEL_SENT_TO. */ static int rds_send_queue_rm(struct rds_sock *rs, struct rds_connection *conn, struct rds_conn_path *cp, struct rds_message *rm, __be16 sport, __be16 dport, int *queued) { unsigned long flags; u32 len; if (*queued) goto out; len = be32_to_cpu(rm->m_inc.i_hdr.h_len); /* this is the only place which holds both the socket's rs_lock * and the connection's c_lock */ spin_lock_irqsave(&rs->rs_lock, flags); /* * If there is a little space in sndbuf, we don't queue anything, * and userspace gets -EAGAIN. But poll() indicates there's send * room. This can lead to bad behavior (spinning) if snd_bytes isn't * freed up by incoming acks. So we check the *old* value of * rs_snd_bytes here to allow the last msg to exceed the buffer, * and poll() now knows no more data can be sent. */ if (rs->rs_snd_bytes < rds_sk_sndbuf(rs)) { rs->rs_snd_bytes += len; /* let recv side know we are close to send space exhaustion. * This is probably not the optimal way to do it, as this * means we set the flag on *all* messages as soon as our * throughput hits a certain threshold. */ if (rs->rs_snd_bytes >= rds_sk_sndbuf(rs) / 2) set_bit(RDS_MSG_ACK_REQUIRED, &rm->m_flags); list_add_tail(&rm->m_sock_item, &rs->rs_send_queue); set_bit(RDS_MSG_ON_SOCK, &rm->m_flags); rds_message_addref(rm); sock_hold(rds_rs_to_sk(rs)); rm->m_rs = rs; /* The code ordering is a little weird, but we're trying to minimize the time we hold c_lock */ rds_message_populate_header(&rm->m_inc.i_hdr, sport, dport, 0); rm->m_inc.i_conn = conn; rm->m_inc.i_conn_path = cp; rds_message_addref(rm); spin_lock(&cp->cp_lock); rm->m_inc.i_hdr.h_sequence = cpu_to_be64(cp->cp_next_tx_seq++); list_add_tail(&rm->m_conn_item, &cp->cp_send_queue); set_bit(RDS_MSG_ON_CONN, &rm->m_flags); spin_unlock(&cp->cp_lock); rdsdebug("queued msg %p len %d, rs %p bytes %d seq %llu\n", rm, len, rs, rs->rs_snd_bytes, (unsigned long long)be64_to_cpu(rm->m_inc.i_hdr.h_sequence)); *queued = 1; } spin_unlock_irqrestore(&rs->rs_lock, flags); out: return *queued; } /* * rds_message is getting to be quite complicated, and we'd like to allocate * it all in one go. This figures out how big it needs to be up front. */ static int rds_rm_size(struct msghdr *msg, int num_sgs, struct rds_iov_vector_arr *vct) { struct cmsghdr *cmsg; int size = 0; int cmsg_groups = 0; int retval; bool zcopy_cookie = false; struct rds_iov_vector *iov, *tmp_iov; if (num_sgs < 0) return -EINVAL; for_each_cmsghdr(cmsg, msg) { if (!CMSG_OK(msg, cmsg)) return -EINVAL; if (cmsg->cmsg_level != SOL_RDS) continue; switch (cmsg->cmsg_type) { case RDS_CMSG_RDMA_ARGS: if (vct->indx >= vct->len) { vct->len += vct->incr; tmp_iov = krealloc(vct->vec, vct->len * sizeof(struct rds_iov_vector), GFP_KERNEL); if (!tmp_iov) { vct->len -= vct->incr; return -ENOMEM; } vct->vec = tmp_iov; } iov = &vct->vec[vct->indx]; memset(iov, 0, sizeof(struct rds_iov_vector)); vct->indx++; cmsg_groups |= 1; retval = rds_rdma_extra_size(CMSG_DATA(cmsg), iov); if (retval < 0) return retval; size += retval; break; case RDS_CMSG_ZCOPY_COOKIE: zcopy_cookie = true; fallthrough; case RDS_CMSG_RDMA_DEST: case RDS_CMSG_RDMA_MAP: cmsg_groups |= 2; /* these are valid but do no add any size */ break; case RDS_CMSG_ATOMIC_CSWP: case RDS_CMSG_ATOMIC_FADD: case RDS_CMSG_MASKED_ATOMIC_CSWP: case RDS_CMSG_MASKED_ATOMIC_FADD: cmsg_groups |= 1; size += sizeof(struct scatterlist); break; default: return -EINVAL; } } if ((msg->msg_flags & MSG_ZEROCOPY) && !zcopy_cookie) return -EINVAL; size += num_sgs * sizeof(struct scatterlist); /* Ensure (DEST, MAP) are never used with (ARGS, ATOMIC) */ if (cmsg_groups == 3) return -EINVAL; return size; } static int rds_cmsg_zcopy(struct rds_sock *rs, struct rds_message *rm, struct cmsghdr *cmsg) { u32 *cookie; if (cmsg->cmsg_len < CMSG_LEN(sizeof(*cookie)) || !rm->data.op_mmp_znotifier) return -EINVAL; cookie = CMSG_DATA(cmsg); rm->data.op_mmp_znotifier->z_cookie = *cookie; return 0; } static int rds_cmsg_send(struct rds_sock *rs, struct rds_message *rm, struct msghdr *msg, int *allocated_mr, struct rds_iov_vector_arr *vct) { struct cmsghdr *cmsg; int ret = 0, ind = 0; for_each_cmsghdr(cmsg, msg) { if (!CMSG_OK(msg, cmsg)) return -EINVAL; if (cmsg->cmsg_level != SOL_RDS) continue; /* As a side effect, RDMA_DEST and RDMA_MAP will set * rm->rdma.m_rdma_cookie and rm->rdma.m_rdma_mr. */ switch (cmsg->cmsg_type) { case RDS_CMSG_RDMA_ARGS: if (ind >= vct->indx) return -ENOMEM; ret = rds_cmsg_rdma_args(rs, rm, cmsg, &vct->vec[ind]); ind++; break; case RDS_CMSG_RDMA_DEST: ret = rds_cmsg_rdma_dest(rs, rm, cmsg); break; case RDS_CMSG_RDMA_MAP: ret = rds_cmsg_rdma_map(rs, rm, cmsg); if (!ret) *allocated_mr = 1; else if (ret == -ENODEV) /* Accommodate the get_mr() case which can fail * if connection isn't established yet. */ ret = -EAGAIN; break; case RDS_CMSG_ATOMIC_CSWP: case RDS_CMSG_ATOMIC_FADD: case RDS_CMSG_MASKED_ATOMIC_CSWP: case RDS_CMSG_MASKED_ATOMIC_FADD: ret = rds_cmsg_atomic(rs, rm, cmsg); break; case RDS_CMSG_ZCOPY_COOKIE: ret = rds_cmsg_zcopy(rs, rm, cmsg); break; default: return -EINVAL; } if (ret) break; } return ret; } static int rds_send_mprds_hash(struct rds_sock *rs, struct rds_connection *conn, int nonblock) { int hash; if (conn->c_npaths == 0) hash = RDS_MPATH_HASH(rs, RDS_MPATH_WORKERS); else hash = RDS_MPATH_HASH(rs, conn->c_npaths); if (conn->c_npaths == 0 && hash != 0) { rds_send_ping(conn, 0); /* The underlying connection is not up yet. Need to wait * until it is up to be sure that the non-zero c_path can be * used. But if we are interrupted, we have to use the zero * c_path in case the connection ends up being non-MP capable. */ if (conn->c_npaths == 0) { /* Cannot wait for the connection be made, so just use * the base c_path. */ if (nonblock) return 0; if (wait_event_interruptible(conn->c_hs_waitq, conn->c_npaths != 0)) hash = 0; } if (conn->c_npaths == 1) hash = 0; } return hash; } static int rds_rdma_bytes(struct msghdr *msg, size_t *rdma_bytes) { struct rds_rdma_args *args; struct cmsghdr *cmsg; for_each_cmsghdr(cmsg, msg) { if (!CMSG_OK(msg, cmsg)) return -EINVAL; if (cmsg->cmsg_level != SOL_RDS) continue; if (cmsg->cmsg_type == RDS_CMSG_RDMA_ARGS) { if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct rds_rdma_args))) return -EINVAL; args = CMSG_DATA(cmsg); *rdma_bytes += args->remote_vec.bytes; } } return 0; } int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len) { struct sock *sk = sock->sk; struct rds_sock *rs = rds_sk_to_rs(sk); DECLARE_SOCKADDR(struct sockaddr_in6 *, sin6, msg->msg_name); DECLARE_SOCKADDR(struct sockaddr_in *, usin, msg->msg_name); __be16 dport; struct rds_message *rm = NULL; struct rds_connection *conn; int ret = 0; int queued = 0, allocated_mr = 0; int nonblock = msg->msg_flags & MSG_DONTWAIT; long timeo = sock_sndtimeo(sk, nonblock); struct rds_conn_path *cpath; struct in6_addr daddr; __u32 scope_id = 0; size_t rdma_payload_len = 0; bool zcopy = ((msg->msg_flags & MSG_ZEROCOPY) && sock_flag(rds_rs_to_sk(rs), SOCK_ZEROCOPY)); int num_sgs = DIV_ROUND_UP(payload_len, PAGE_SIZE); int namelen; struct rds_iov_vector_arr vct; int ind; memset(&vct, 0, sizeof(vct)); /* expect 1 RDMA CMSG per rds_sendmsg. can still grow if more needed. */ vct.incr = 1; /* Mirror Linux UDP mirror of BSD error message compatibility */ /* XXX: Perhaps MSG_MORE someday */ if (msg->msg_flags & ~(MSG_DONTWAIT | MSG_CMSG_COMPAT | MSG_ZEROCOPY)) { ret = -EOPNOTSUPP; goto out; } namelen = msg->msg_namelen; if (namelen != 0) { if (namelen < sizeof(*usin)) { ret = -EINVAL; goto out; } switch (usin->sin_family) { case AF_INET: if (usin->sin_addr.s_addr == htonl(INADDR_ANY) || usin->sin_addr.s_addr == htonl(INADDR_BROADCAST) || ipv4_is_multicast(usin->sin_addr.s_addr)) { ret = -EINVAL; goto out; } ipv6_addr_set_v4mapped(usin->sin_addr.s_addr, &daddr); dport = usin->sin_port; break; #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: { int addr_type; if (namelen < sizeof(*sin6)) { ret = -EINVAL; goto out; } addr_type = ipv6_addr_type(&sin6->sin6_addr); if (!(addr_type & IPV6_ADDR_UNICAST)) { __be32 addr4; if (!(addr_type & IPV6_ADDR_MAPPED)) { ret = -EINVAL; goto out; } /* It is a mapped address. Need to do some * sanity checks. */ addr4 = sin6->sin6_addr.s6_addr32[3]; if (addr4 == htonl(INADDR_ANY) || addr4 == htonl(INADDR_BROADCAST) || ipv4_is_multicast(addr4)) { ret = -EINVAL; goto out; } } if (addr_type & IPV6_ADDR_LINKLOCAL) { if (sin6->sin6_scope_id == 0) { ret = -EINVAL; goto out; } scope_id = sin6->sin6_scope_id; } daddr = sin6->sin6_addr; dport = sin6->sin6_port; break; } #endif default: ret = -EINVAL; goto out; } } else { /* We only care about consistency with ->connect() */ lock_sock(sk); daddr = rs->rs_conn_addr; dport = rs->rs_conn_port; scope_id = rs->rs_bound_scope_id; release_sock(sk); } lock_sock(sk); if (ipv6_addr_any(&rs->rs_bound_addr) || ipv6_addr_any(&daddr)) { release_sock(sk); ret = -ENOTCONN; goto out; } else if (namelen != 0) { /* Cannot send to an IPv4 address using an IPv6 source * address and cannot send to an IPv6 address using an * IPv4 source address. */ if (ipv6_addr_v4mapped(&daddr) ^ ipv6_addr_v4mapped(&rs->rs_bound_addr)) { release_sock(sk); ret = -EOPNOTSUPP; goto out; } /* If the socket is already bound to a link local address, * it can only send to peers on the same link. But allow * communicating between link local and non-link local address. */ if (scope_id != rs->rs_bound_scope_id) { if (!scope_id) { scope_id = rs->rs_bound_scope_id; } else if (rs->rs_bound_scope_id) { release_sock(sk); ret = -EINVAL; goto out; } } } release_sock(sk); ret = rds_rdma_bytes(msg, &rdma_payload_len); if (ret) goto out; if (max_t(size_t, payload_len, rdma_payload_len) > RDS_MAX_MSG_SIZE) { ret = -EMSGSIZE; goto out; } if (payload_len > rds_sk_sndbuf(rs)) { ret = -EMSGSIZE; goto out; } if (zcopy) { if (rs->rs_transport->t_type != RDS_TRANS_TCP) { ret = -EOPNOTSUPP; goto out; } num_sgs = iov_iter_npages(&msg->msg_iter, INT_MAX); } /* size of rm including all sgs */ ret = rds_rm_size(msg, num_sgs, &vct); if (ret < 0) goto out; rm = rds_message_alloc(ret, GFP_KERNEL); if (!rm) { ret = -ENOMEM; goto out; } /* Attach data to the rm */ if (payload_len) { rm->data.op_sg = rds_message_alloc_sgs(rm, num_sgs); if (IS_ERR(rm->data.op_sg)) { ret = PTR_ERR(rm->data.op_sg); goto out; } ret = rds_message_copy_from_user(rm, &msg->msg_iter, zcopy); if (ret) goto out; } rm->data.op_active = 1; rm->m_daddr = daddr; /* rds_conn_create has a spinlock that runs with IRQ off. * Caching the conn in the socket helps a lot. */ if (rs->rs_conn && ipv6_addr_equal(&rs->rs_conn->c_faddr, &daddr) && rs->rs_tos == rs->rs_conn->c_tos) { conn = rs->rs_conn; } else { conn = rds_conn_create_outgoing(sock_net(sock->sk), &rs->rs_bound_addr, &daddr, rs->rs_transport, rs->rs_tos, sock->sk->sk_allocation, scope_id); if (IS_ERR(conn)) { ret = PTR_ERR(conn); goto out; } rs->rs_conn = conn; } if (conn->c_trans->t_mp_capable) cpath = &conn->c_path[rds_send_mprds_hash(rs, conn, nonblock)]; else cpath = &conn->c_path[0]; rm->m_conn_path = cpath; /* Parse any control messages the user may have included. */ ret = rds_cmsg_send(rs, rm, msg, &allocated_mr, &vct); if (ret) goto out; if (rm->rdma.op_active && !conn->c_trans->xmit_rdma) { printk_ratelimited(KERN_NOTICE "rdma_op %p conn xmit_rdma %p\n", &rm->rdma, conn->c_trans->xmit_rdma); ret = -EOPNOTSUPP; goto out; } if (rm->atomic.op_active && !conn->c_trans->xmit_atomic) { printk_ratelimited(KERN_NOTICE "atomic_op %p conn xmit_atomic %p\n", &rm->atomic, conn->c_trans->xmit_atomic); ret = -EOPNOTSUPP; goto out; } if (rds_destroy_pending(conn)) { ret = -EAGAIN; goto out; } if (rds_conn_path_down(cpath)) rds_check_all_paths(conn); ret = rds_cong_wait(conn->c_fcong, dport, nonblock, rs); if (ret) { rs->rs_seen_congestion = 1; goto out; } while (!rds_send_queue_rm(rs, conn, cpath, rm, rs->rs_bound_port, dport, &queued)) { rds_stats_inc(s_send_queue_full); if (nonblock) { ret = -EAGAIN; goto out; } timeo = wait_event_interruptible_timeout(*sk_sleep(sk), rds_send_queue_rm(rs, conn, cpath, rm, rs->rs_bound_port, dport, &queued), timeo); rdsdebug("sendmsg woke queued %d timeo %ld\n", queued, timeo); if (timeo > 0 || timeo == MAX_SCHEDULE_TIMEOUT) continue; ret = timeo; if (ret == 0) ret = -ETIMEDOUT; goto out; } /* * By now we've committed to the send. We reuse rds_send_worker() * to retry sends in the rds thread if the transport asks us to. */ rds_stats_inc(s_send_queued); ret = rds_send_xmit(cpath); if (ret == -ENOMEM || ret == -EAGAIN) { ret = 0; rcu_read_lock(); if (rds_destroy_pending(cpath->cp_conn)) ret = -ENETUNREACH; else queue_delayed_work(rds_wq, &cpath->cp_send_w, 1); rcu_read_unlock(); } if (ret) goto out; rds_message_put(rm); for (ind = 0; ind < vct.indx; ind++) kfree(vct.vec[ind].iov); kfree(vct.vec); return payload_len; out: for (ind = 0; ind < vct.indx; ind++) kfree(vct.vec[ind].iov); kfree(vct.vec); /* If the user included a RDMA_MAP cmsg, we allocated a MR on the fly. * If the sendmsg goes through, we keep the MR. If it fails with EAGAIN * or in any other way, we need to destroy the MR again */ if (allocated_mr) rds_rdma_unuse(rs, rds_rdma_cookie_key(rm->m_rdma_cookie), 1); if (rm) rds_message_put(rm); return ret; } /* * send out a probe. Can be shared by rds_send_ping, * rds_send_pong, rds_send_hb. * rds_send_hb should use h_flags * RDS_FLAG_HB_PING|RDS_FLAG_ACK_REQUIRED * or * RDS_FLAG_HB_PONG|RDS_FLAG_ACK_REQUIRED */ static int rds_send_probe(struct rds_conn_path *cp, __be16 sport, __be16 dport, u8 h_flags) { struct rds_message *rm; unsigned long flags; int ret = 0; rm = rds_message_alloc(0, GFP_ATOMIC); if (!rm) { ret = -ENOMEM; goto out; } rm->m_daddr = cp->cp_conn->c_faddr; rm->data.op_active = 1; rds_conn_path_connect_if_down(cp); ret = rds_cong_wait(cp->cp_conn->c_fcong, dport, 1, NULL); if (ret) goto out; spin_lock_irqsave(&cp->cp_lock, flags); list_add_tail(&rm->m_conn_item, &cp->cp_send_queue); set_bit(RDS_MSG_ON_CONN, &rm->m_flags); rds_message_addref(rm); rm->m_inc.i_conn = cp->cp_conn; rm->m_inc.i_conn_path = cp; rds_message_populate_header(&rm->m_inc.i_hdr, sport, dport, cp->cp_next_tx_seq); rm->m_inc.i_hdr.h_flags |= h_flags; cp->cp_next_tx_seq++; if (RDS_HS_PROBE(be16_to_cpu(sport), be16_to_cpu(dport)) && cp->cp_conn->c_trans->t_mp_capable) { u16 npaths = cpu_to_be16(RDS_MPATH_WORKERS); u32 my_gen_num = cpu_to_be32(cp->cp_conn->c_my_gen_num); rds_message_add_extension(&rm->m_inc.i_hdr, RDS_EXTHDR_NPATHS, &npaths, sizeof(npaths)); rds_message_add_extension(&rm->m_inc.i_hdr, RDS_EXTHDR_GEN_NUM, &my_gen_num, sizeof(u32)); } spin_unlock_irqrestore(&cp->cp_lock, flags); rds_stats_inc(s_send_queued); rds_stats_inc(s_send_pong); /* schedule the send work on rds_wq */ rcu_read_lock(); if (!rds_destroy_pending(cp->cp_conn)) queue_delayed_work(rds_wq, &cp->cp_send_w, 1); rcu_read_unlock(); rds_message_put(rm); return 0; out: if (rm) rds_message_put(rm); return ret; } int rds_send_pong(struct rds_conn_path *cp, __be16 dport) { return rds_send_probe(cp, 0, dport, 0); } void rds_send_ping(struct rds_connection *conn, int cp_index) { unsigned long flags; struct rds_conn_path *cp = &conn->c_path[cp_index]; spin_lock_irqsave(&cp->cp_lock, flags); if (conn->c_ping_triggered) { spin_unlock_irqrestore(&cp->cp_lock, flags); return; } conn->c_ping_triggered = 1; spin_unlock_irqrestore(&cp->cp_lock, flags); rds_send_probe(cp, cpu_to_be16(RDS_FLAG_PROBE_PORT), 0, 0); } EXPORT_SYMBOL_GPL(rds_send_ping); |
32 41 18 10 12 2330 4 2327 42 42 32 32 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 | // SPDX-License-Identifier: GPL-2.0-only /* * The "user cache". * * (C) Copyright 1991-2000 Linus Torvalds * * We have a per-user structure to keep track of how many * processes, files etc the user has claimed, in order to be * able to have per-user limits for system resources. */ #include <linux/init.h> #include <linux/sched.h> #include <linux/slab.h> #include <linux/bitops.h> #include <linux/key.h> #include <linux/sched/user.h> #include <linux/interrupt.h> #include <linux/export.h> #include <linux/user_namespace.h> #include <linux/binfmts.h> #include <linux/proc_ns.h> #if IS_ENABLED(CONFIG_BINFMT_MISC) struct binfmt_misc init_binfmt_misc = { .entries = LIST_HEAD_INIT(init_binfmt_misc.entries), .enabled = true, .entries_lock = __RW_LOCK_UNLOCKED(init_binfmt_misc.entries_lock), }; EXPORT_SYMBOL_GPL(init_binfmt_misc); #endif /* * userns count is 1 for root user, 1 for init_uts_ns, * and 1 for... ? */ struct user_namespace init_user_ns = { .uid_map = { .nr_extents = 1, { .extent[0] = { .first = 0, .lower_first = 0, .count = 4294967295U, }, }, }, .gid_map = { .nr_extents = 1, { .extent[0] = { .first = 0, .lower_first = 0, .count = 4294967295U, }, }, }, .projid_map = { .nr_extents = 1, { .extent[0] = { .first = 0, .lower_first = 0, .count = 4294967295U, }, }, }, .ns.count = REFCOUNT_INIT(3), .owner = GLOBAL_ROOT_UID, .group = GLOBAL_ROOT_GID, .ns.inum = PROC_USER_INIT_INO, #ifdef CONFIG_USER_NS .ns.ops = &userns_operations, #endif .flags = USERNS_INIT_FLAGS, #ifdef CONFIG_KEYS .keyring_name_list = LIST_HEAD_INIT(init_user_ns.keyring_name_list), .keyring_sem = __RWSEM_INITIALIZER(init_user_ns.keyring_sem), #endif #if IS_ENABLED(CONFIG_BINFMT_MISC) .binfmt_misc = &init_binfmt_misc, #endif }; EXPORT_SYMBOL_GPL(init_user_ns); /* * UID task count cache, to get fast user lookup in "alloc_uid" * when changing user ID's (ie setuid() and friends). */ #define UIDHASH_BITS (IS_ENABLED(CONFIG_BASE_SMALL) ? 3 : 7) #define UIDHASH_SZ (1 << UIDHASH_BITS) #define UIDHASH_MASK (UIDHASH_SZ - 1) #define __uidhashfn(uid) (((uid >> UIDHASH_BITS) + uid) & UIDHASH_MASK) #define uidhashentry(uid) (uidhash_table + __uidhashfn((__kuid_val(uid)))) static struct kmem_cache *uid_cachep; static struct hlist_head uidhash_table[UIDHASH_SZ]; /* * The uidhash_lock is mostly taken from process context, but it is * occasionally also taken from softirq/tasklet context, when * task-structs get RCU-freed. Hence all locking must be softirq-safe. * But free_uid() is also called with local interrupts disabled, and running * local_bh_enable() with local interrupts disabled is an error - we'll run * softirq callbacks, and they can unconditionally enable interrupts, and * the caller of free_uid() didn't expect that.. */ static DEFINE_SPINLOCK(uidhash_lock); /* root_user.__count is 1, for init task cred */ struct user_struct root_user = { .__count = REFCOUNT_INIT(1), .uid = GLOBAL_ROOT_UID, .ratelimit = RATELIMIT_STATE_INIT(root_user.ratelimit, 0, 0), }; /* * These routines must be called with the uidhash spinlock held! */ static void uid_hash_insert(struct user_struct *up, struct hlist_head *hashent) { hlist_add_head(&up->uidhash_node, hashent); } static void uid_hash_remove(struct user_struct *up) { hlist_del_init(&up->uidhash_node); } static struct user_struct *uid_hash_find(kuid_t uid, struct hlist_head *hashent) { struct user_struct *user; hlist_for_each_entry(user, hashent, uidhash_node) { if (uid_eq(user->uid, uid)) { refcount_inc(&user->__count); return user; } } return NULL; } static int user_epoll_alloc(struct user_struct *up) { #ifdef CONFIG_EPOLL return percpu_counter_init(&up->epoll_watches, 0, GFP_KERNEL); #else return 0; #endif } static void user_epoll_free(struct user_struct *up) { #ifdef CONFIG_EPOLL percpu_counter_destroy(&up->epoll_watches); #endif } /* IRQs are disabled and uidhash_lock is held upon function entry. * IRQ state (as stored in flags) is restored and uidhash_lock released * upon function exit. */ static void free_user(struct user_struct *up, unsigned long flags) __releases(&uidhash_lock) { uid_hash_remove(up); spin_unlock_irqrestore(&uidhash_lock, flags); user_epoll_free(up); kmem_cache_free(uid_cachep, up); } /* * Locate the user_struct for the passed UID. If found, take a ref on it. The * caller must undo that ref with free_uid(). * * If the user_struct could not be found, return NULL. */ struct user_struct *find_user(kuid_t uid) { struct user_struct *ret; unsigned long flags; spin_lock_irqsave(&uidhash_lock, flags); ret = uid_hash_find(uid, uidhashentry(uid)); spin_unlock_irqrestore(&uidhash_lock, flags); return ret; } void free_uid(struct user_struct *up) { unsigned long flags; if (!up) return; if (refcount_dec_and_lock_irqsave(&up->__count, &uidhash_lock, &flags)) free_user(up, flags); } EXPORT_SYMBOL_GPL(free_uid); struct user_struct *alloc_uid(kuid_t uid) { struct hlist_head *hashent = uidhashentry(uid); struct user_struct *up, *new; spin_lock_irq(&uidhash_lock); up = uid_hash_find(uid, hashent); spin_unlock_irq(&uidhash_lock); if (!up) { new = kmem_cache_zalloc(uid_cachep, GFP_KERNEL); if (!new) return NULL; new->uid = uid; refcount_set(&new->__count, 1); if (user_epoll_alloc(new)) { kmem_cache_free(uid_cachep, new); return NULL; } ratelimit_state_init(&new->ratelimit, HZ, 100); ratelimit_set_flags(&new->ratelimit, RATELIMIT_MSG_ON_RELEASE); /* * Before adding this, check whether we raced * on adding the same user already.. */ spin_lock_irq(&uidhash_lock); up = uid_hash_find(uid, hashent); if (up) { user_epoll_free(new); kmem_cache_free(uid_cachep, new); } else { uid_hash_insert(new, hashent); up = new; } spin_unlock_irq(&uidhash_lock); } return up; } static int __init uid_cache_init(void) { int n; uid_cachep = kmem_cache_create("uid_cache", sizeof(struct user_struct), 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); for(n = 0; n < UIDHASH_SZ; ++n) INIT_HLIST_HEAD(uidhash_table + n); if (user_epoll_alloc(&root_user)) panic("root_user epoll percpu counter alloc failed"); /* Insert the root user immediately (init already runs as root) */ spin_lock_irq(&uidhash_lock); uid_hash_insert(&root_user, uidhashentry(GLOBAL_ROOT_UID)); spin_unlock_irq(&uidhash_lock); return 0; } subsys_initcall(uid_cache_init); |
17464 17473 356 104 262 378 378 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 | // SPDX-License-Identifier: GPL-2.0-only /* * Lock-less NULL terminated single linked list * * The basic atomic operation of this list is cmpxchg on long. On * architectures that don't have NMI-safe cmpxchg implementation, the * list can NOT be used in NMI handlers. So code that uses the list in * an NMI handler should depend on CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG. * * Copyright 2010,2011 Intel Corp. * Author: Huang Ying <ying.huang@intel.com> */ #include <linux/kernel.h> #include <linux/export.h> #include <linux/llist.h> /** * llist_add_batch - add several linked entries in batch * @new_first: first entry in batch to be added * @new_last: last entry in batch to be added * @head: the head for your lock-less list * * Return whether list is empty before adding. */ bool llist_add_batch(struct llist_node *new_first, struct llist_node *new_last, struct llist_head *head) { struct llist_node *first = READ_ONCE(head->first); do { new_last->next = first; } while (!try_cmpxchg(&head->first, &first, new_first)); return !first; } EXPORT_SYMBOL_GPL(llist_add_batch); /** * llist_del_first - delete the first entry of lock-less list * @head: the head for your lock-less list * * If list is empty, return NULL, otherwise, return the first entry * deleted, this is the newest added one. * * Only one llist_del_first user can be used simultaneously with * multiple llist_add users without lock. Because otherwise * llist_del_first, llist_add, llist_add (or llist_del_all, llist_add, * llist_add) sequence in another user may change @head->first->next, * but keep @head->first. If multiple consumers are needed, please * use llist_del_all or use lock between consumers. */ struct llist_node *llist_del_first(struct llist_head *head) { struct llist_node *entry, *next; entry = smp_load_acquire(&head->first); do { if (entry == NULL) return NULL; next = READ_ONCE(entry->next); } while (!try_cmpxchg(&head->first, &entry, next)); return entry; } EXPORT_SYMBOL_GPL(llist_del_first); /** * llist_del_first_this - delete given entry of lock-less list if it is first * @head: the head for your lock-less list * @this: a list entry. * * If head of the list is given entry, delete and return %true else * return %false. * * Multiple callers can safely call this concurrently with multiple * llist_add() callers, providing all the callers offer a different @this. */ bool llist_del_first_this(struct llist_head *head, struct llist_node *this) { struct llist_node *entry, *next; /* acquire ensures orderig wrt try_cmpxchg() is llist_del_first() */ entry = smp_load_acquire(&head->first); do { if (entry != this) return false; next = READ_ONCE(entry->next); } while (!try_cmpxchg(&head->first, &entry, next)); return true; } EXPORT_SYMBOL_GPL(llist_del_first_this); /** * llist_reverse_order - reverse order of a llist chain * @head: first item of the list to be reversed * * Reverse the order of a chain of llist entries and return the * new first entry. */ struct llist_node *llist_reverse_order(struct llist_node *head) { struct llist_node *new_head = NULL; while (head) { struct llist_node *tmp = head; head = head->next; tmp->next = new_head; new_head = tmp; } return new_head; } EXPORT_SYMBOL_GPL(llist_reverse_order); |
7 7 5811 5811 5 4 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 | // SPDX-License-Identifier: GPL-2.0 #include <linux/kernel.h> #include <linux/of.h> #include <linux/of_device.h> #include <linux/of_address.h> #include <linux/of_iommu.h> #include <linux/of_reserved_mem.h> #include <linux/dma-direct.h> /* for bus_dma_region */ #include <linux/dma-map-ops.h> #include <linux/init.h> #include <linux/mod_devicetable.h> #include <linux/slab.h> #include <linux/platform_device.h> #include <asm/errno.h> #include "of_private.h" /** * of_match_device - Tell if a struct device matches an of_device_id list * @matches: array of of device match structures to search in * @dev: the of device structure to match against * * Used by a driver to check whether an platform_device present in the * system is in its list of supported devices. */ const struct of_device_id *of_match_device(const struct of_device_id *matches, const struct device *dev) { if (!matches || !dev->of_node || dev->of_node_reused) return NULL; return of_match_node(matches, dev->of_node); } EXPORT_SYMBOL(of_match_device); static void of_dma_set_restricted_buffer(struct device *dev, struct device_node *np) { struct device_node *node, *of_node = dev->of_node; int count, i; if (!IS_ENABLED(CONFIG_DMA_RESTRICTED_POOL)) return; count = of_property_count_elems_of_size(of_node, "memory-region", sizeof(u32)); /* * If dev->of_node doesn't exist or doesn't contain memory-region, try * the OF node having DMA configuration. */ if (count <= 0) { of_node = np; count = of_property_count_elems_of_size( of_node, "memory-region", sizeof(u32)); } for (i = 0; i < count; i++) { node = of_parse_phandle(of_node, "memory-region", i); /* * There might be multiple memory regions, but only one * restricted-dma-pool region is allowed. */ if (of_device_is_compatible(node, "restricted-dma-pool") && of_device_is_available(node)) { of_node_put(node); break; } of_node_put(node); } /* * Attempt to initialize a restricted-dma-pool region if one was found. * Note that count can hold a negative error code. */ if (i < count && of_reserved_mem_device_init_by_idx(dev, of_node, i)) dev_warn(dev, "failed to initialise \"restricted-dma-pool\" memory node\n"); } /** * of_dma_configure_id - Setup DMA configuration * @dev: Device to apply DMA configuration * @np: Pointer to OF node having DMA configuration * @force_dma: Whether device is to be set up by of_dma_configure() even if * DMA capability is not explicitly described by firmware. * @id: Optional const pointer value input id * * Try to get devices's DMA configuration from DT and update it * accordingly. * * If platform code needs to use its own special DMA configuration, it * can use a platform bus notifier and handle BUS_NOTIFY_ADD_DEVICE events * to fix up DMA configuration. */ int of_dma_configure_id(struct device *dev, struct device_node *np, bool force_dma, const u32 *id) { const struct bus_dma_region *map = NULL; struct device_node *bus_np; u64 mask, end = 0; bool coherent, set_map = false; int ret; if (np == dev->of_node) bus_np = __of_get_dma_parent(np); else bus_np = of_node_get(np); ret = of_dma_get_range(bus_np, &map); of_node_put(bus_np); if (ret < 0) { /* * For legacy reasons, we have to assume some devices need * DMA configuration regardless of whether "dma-ranges" is * correctly specified or not. */ if (!force_dma) return ret == -ENODEV ? 0 : ret; } else { /* Determine the overall bounds of all DMA regions */ end = dma_range_map_max(map); set_map = true; } /* * If @dev is expected to be DMA-capable then the bus code that created * it should have initialised its dma_mask pointer by this point. For * now, we'll continue the legacy behaviour of coercing it to the * coherent mask if not, but we'll no longer do so quietly. */ if (!dev->dma_mask) { dev_warn(dev, "DMA mask not set\n"); dev->dma_mask = &dev->coherent_dma_mask; } if (!end && dev->coherent_dma_mask) end = dev->coherent_dma_mask; else if (!end) end = (1ULL << 32) - 1; /* * Limit coherent and dma mask based on size and default mask * set by the driver. */ mask = DMA_BIT_MASK(ilog2(end) + 1); dev->coherent_dma_mask &= mask; *dev->dma_mask &= mask; /* ...but only set bus limit and range map if we found valid dma-ranges earlier */ if (set_map) { dev->bus_dma_limit = end; dev->dma_range_map = map; } coherent = of_dma_is_coherent(np); dev_dbg(dev, "device is%sdma coherent\n", coherent ? " " : " not "); ret = of_iommu_configure(dev, np, id); if (ret == -EPROBE_DEFER) { /* Don't touch range map if it wasn't set from a valid dma-ranges */ if (set_map) dev->dma_range_map = NULL; kfree(map); return -EPROBE_DEFER; } /* Take all other IOMMU errors to mean we'll just carry on without it */ dev_dbg(dev, "device is%sbehind an iommu\n", !ret ? " " : " not "); arch_setup_dma_ops(dev, coherent); if (ret) of_dma_set_restricted_buffer(dev, np); return 0; } EXPORT_SYMBOL_GPL(of_dma_configure_id); const void *of_device_get_match_data(const struct device *dev) { const struct of_device_id *match; match = of_match_device(dev->driver->of_match_table, dev); if (!match) return NULL; return match->data; } EXPORT_SYMBOL(of_device_get_match_data); /** * of_device_modalias - Fill buffer with newline terminated modalias string * @dev: Calling device * @str: Modalias string * @len: Size of @str */ ssize_t of_device_modalias(struct device *dev, char *str, ssize_t len) { ssize_t sl; if (!dev || !dev->of_node || dev->of_node_reused) return -ENODEV; sl = of_modalias(dev->of_node, str, len - 2); if (sl < 0) return sl; if (sl > len - 2) return -ENOMEM; str[sl++] = '\n'; str[sl] = 0; return sl; } EXPORT_SYMBOL_GPL(of_device_modalias); /** * of_device_uevent - Display OF related uevent information * @dev: Device to display the uevent information for * @env: Kernel object's userspace event reference to fill up */ void of_device_uevent(const struct device *dev, struct kobj_uevent_env *env) { const char *compat, *type; struct alias_prop *app; struct property *p; int seen = 0; if ((!dev) || (!dev->of_node)) return; add_uevent_var(env, "OF_NAME=%pOFn", dev->of_node); add_uevent_var(env, "OF_FULLNAME=%pOF", dev->of_node); type = of_node_get_device_type(dev->of_node); if (type) add_uevent_var(env, "OF_TYPE=%s", type); /* Since the compatible field can contain pretty much anything * it's not really legal to split it out with commas. We split it * up using a number of environment variables instead. */ of_property_for_each_string(dev->of_node, "compatible", p, compat) { add_uevent_var(env, "OF_COMPATIBLE_%d=%s", seen, compat); seen++; } add_uevent_var(env, "OF_COMPATIBLE_N=%d", seen); seen = 0; mutex_lock(&of_mutex); list_for_each_entry(app, &aliases_lookup, link) { if (dev->of_node == app->np) { add_uevent_var(env, "OF_ALIAS_%d=%s", seen, app->alias); seen++; } } mutex_unlock(&of_mutex); } EXPORT_SYMBOL_GPL(of_device_uevent); int of_device_uevent_modalias(const struct device *dev, struct kobj_uevent_env *env) { int sl; if ((!dev) || (!dev->of_node) || dev->of_node_reused) return -ENODEV; /* Devicetree modalias is tricky, we add it in 2 steps */ if (add_uevent_var(env, "MODALIAS=")) return -ENOMEM; sl = of_modalias(dev->of_node, &env->buf[env->buflen-1], sizeof(env->buf) - env->buflen); if (sl < 0) return sl; if (sl >= (sizeof(env->buf) - env->buflen)) return -ENOMEM; env->buflen += sl; return 0; } EXPORT_SYMBOL_GPL(of_device_uevent_modalias); /** * of_device_make_bus_id - Use the device node data to assign a unique name * @dev: pointer to device structure that is linked to a device tree node * * This routine will first try using the translated bus address to * derive a unique name. If it cannot, then it will prepend names from * parent nodes until a unique name can be derived. */ void of_device_make_bus_id(struct device *dev) { struct device_node *node = dev->of_node; const __be32 *reg; u64 addr; u32 mask; /* Construct the name, using parent nodes if necessary to ensure uniqueness */ while (node->parent) { /* * If the address can be translated, then that is as much * uniqueness as we need. Make it the first component and return */ reg = of_get_property(node, "reg", NULL); if (reg && (addr = of_translate_address(node, reg)) != OF_BAD_ADDR) { if (!of_property_read_u32(node, "mask", &mask)) dev_set_name(dev, dev_name(dev) ? "%llx.%x.%pOFn:%s" : "%llx.%x.%pOFn", addr, ffs(mask) - 1, node, dev_name(dev)); else dev_set_name(dev, dev_name(dev) ? "%llx.%pOFn:%s" : "%llx.%pOFn", addr, node, dev_name(dev)); return; } /* format arguments only used if dev_name() resolves to NULL */ dev_set_name(dev, dev_name(dev) ? "%s:%s" : "%s", kbasename(node->full_name), dev_name(dev)); node = node->parent; } } EXPORT_SYMBOL_GPL(of_device_make_bus_id); |
1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 | /* * CoreChip-sz SR9700 one chip USB 1.1 Ethernet Devices * * Author : Liu Junliang <liujunliang_ljl@163.com> * * Based on dm9601.c * * This file is licensed under the terms of the GNU General Public License * version 2. This program is licensed "as is" without any warranty of any * kind, whether express or implied. */ #include <linux/module.h> #include <linux/sched.h> #include <linux/stddef.h> #include <linux/netdevice.h> #include <linux/etherdevice.h> #include <linux/ethtool.h> #include <linux/mii.h> #include <linux/usb.h> #include <linux/crc32.h> #include <linux/usb/usbnet.h> #include "sr9700.h" static int sr_read(struct usbnet *dev, u8 reg, u16 length, void *data) { int err; err = usbnet_read_cmd(dev, SR_RD_REGS, SR_REQ_RD_REG, 0, reg, data, length); if ((err != length) && (err >= 0)) err = -EINVAL; return err; } static int sr_write(struct usbnet *dev, u8 reg, u16 length, void *data) { int err; err = usbnet_write_cmd(dev, SR_WR_REGS, SR_REQ_WR_REG, 0, reg, data, length); if ((err >= 0) && (err < length)) err = -EINVAL; return err; } static int sr_read_reg(struct usbnet *dev, u8 reg, u8 *value) { return sr_read(dev, reg, 1, value); } static int sr_write_reg(struct usbnet *dev, u8 reg, u8 value) { return usbnet_write_cmd(dev, SR_WR_REGS, SR_REQ_WR_REG, value, reg, NULL, 0); } static void sr_write_async(struct usbnet *dev, u8 reg, u16 length, const void *data) { usbnet_write_cmd_async(dev, SR_WR_REGS, SR_REQ_WR_REG, 0, reg, data, length); } static void sr_write_reg_async(struct usbnet *dev, u8 reg, u8 value) { usbnet_write_cmd_async(dev, SR_WR_REGS, SR_REQ_WR_REG, value, reg, NULL, 0); } static int wait_phy_eeprom_ready(struct usbnet *dev, int phy) { int i; for (i = 0; i < SR_SHARE_TIMEOUT; i++) { u8 tmp = 0; int ret; udelay(1); ret = sr_read_reg(dev, SR_EPCR, &tmp); if (ret < 0) return ret; /* ready */ if (!(tmp & EPCR_ERRE)) return 0; } netdev_err(dev->net, "%s write timed out!\n", phy ? "phy" : "eeprom"); return -EIO; } static int sr_share_read_word(struct usbnet *dev, int phy, u8 reg, __le16 *value) { int ret; mutex_lock(&dev->phy_mutex); sr_write_reg(dev, SR_EPAR, phy ? (reg | EPAR_PHY_ADR) : reg); sr_write_reg(dev, SR_EPCR, phy ? (EPCR_EPOS | EPCR_ERPRR) : EPCR_ERPRR); ret = wait_phy_eeprom_ready(dev, phy); if (ret < 0) goto out_unlock; sr_write_reg(dev, SR_EPCR, 0x0); ret = sr_read(dev, SR_EPDR, 2, value); netdev_dbg(dev->net, "read shared %d 0x%02x returned 0x%04x, %d\n", phy, reg, *value, ret); out_unlock: mutex_unlock(&dev->phy_mutex); return ret; } static int sr_share_write_word(struct usbnet *dev, int phy, u8 reg, __le16 value) { int ret; mutex_lock(&dev->phy_mutex); ret = sr_write(dev, SR_EPDR, 2, &value); if (ret < 0) goto out_unlock; sr_write_reg(dev, SR_EPAR, phy ? (reg | EPAR_PHY_ADR) : reg); sr_write_reg(dev, SR_EPCR, phy ? (EPCR_WEP | EPCR_EPOS | EPCR_ERPRW) : (EPCR_WEP | EPCR_ERPRW)); ret = wait_phy_eeprom_ready(dev, phy); if (ret < 0) goto out_unlock; sr_write_reg(dev, SR_EPCR, 0x0); out_unlock: mutex_unlock(&dev->phy_mutex); return ret; } static int sr_read_eeprom_word(struct usbnet *dev, u8 offset, void *value) { return sr_share_read_word(dev, 0, offset, value); } static int sr9700_get_eeprom_len(struct net_device *netdev) { return SR_EEPROM_LEN; } static int sr9700_get_eeprom(struct net_device *netdev, struct ethtool_eeprom *eeprom, u8 *data) { struct usbnet *dev = netdev_priv(netdev); __le16 *buf = (__le16 *)data; int ret = 0; int i; /* access is 16bit */ if ((eeprom->offset & 0x01) || (eeprom->len & 0x01)) return -EINVAL; for (i = 0; i < eeprom->len / 2; i++) { ret = sr_read_eeprom_word(dev, eeprom->offset / 2 + i, buf + i); if (ret < 0) break; } return ret; } static int sr_mdio_read(struct net_device *netdev, int phy_id, int loc) { struct usbnet *dev = netdev_priv(netdev); __le16 res; int rc = 0; int err; if (phy_id) { netdev_dbg(netdev, "Only internal phy supported\n"); return 0; } /* Access NSR_LINKST bit for link status instead of MII_BMSR */ if (loc == MII_BMSR) { u8 value; err = sr_read_reg(dev, SR_NSR, &value); if (err < 0) return err; if (value & NSR_LINKST) rc = 1; } err = sr_share_read_word(dev, 1, loc, &res); if (err < 0) return err; if (rc == 1) res = le16_to_cpu(res) | BMSR_LSTATUS; else res = le16_to_cpu(res) & ~BMSR_LSTATUS; netdev_dbg(netdev, "sr_mdio_read() phy_id=0x%02x, loc=0x%02x, returns=0x%04x\n", phy_id, loc, res); return res; } static void sr_mdio_write(struct net_device *netdev, int phy_id, int loc, int val) { struct usbnet *dev = netdev_priv(netdev); __le16 res = cpu_to_le16(val); if (phy_id) { netdev_dbg(netdev, "Only internal phy supported\n"); return; } netdev_dbg(netdev, "sr_mdio_write() phy_id=0x%02x, loc=0x%02x, val=0x%04x\n", phy_id, loc, val); sr_share_write_word(dev, 1, loc, res); } static u32 sr9700_get_link(struct net_device *netdev) { struct usbnet *dev = netdev_priv(netdev); u8 value = 0; int rc = 0; /* Get the Link Status directly */ sr_read_reg(dev, SR_NSR, &value); if (value & NSR_LINKST) rc = 1; return rc; } static int sr9700_ioctl(struct net_device *netdev, struct ifreq *rq, int cmd) { struct usbnet *dev = netdev_priv(netdev); return generic_mii_ioctl(&dev->mii, if_mii(rq), cmd, NULL); } static const struct ethtool_ops sr9700_ethtool_ops = { .get_drvinfo = usbnet_get_drvinfo, .get_link = sr9700_get_link, .get_msglevel = usbnet_get_msglevel, .set_msglevel = usbnet_set_msglevel, .get_eeprom_len = sr9700_get_eeprom_len, .get_eeprom = sr9700_get_eeprom, .nway_reset = usbnet_nway_reset, .get_link_ksettings = usbnet_get_link_ksettings_mii, .set_link_ksettings = usbnet_set_link_ksettings_mii, }; static void sr9700_set_multicast(struct net_device *netdev) { struct usbnet *dev = netdev_priv(netdev); /* We use the 20 byte dev->data for our 8 byte filter buffer * to avoid allocating memory that is tricky to free later */ u8 *hashes = (u8 *)&dev->data; /* rx_ctl setting : enable, disable_long, disable_crc */ u8 rx_ctl = RCR_RXEN | RCR_DIS_CRC | RCR_DIS_LONG; memset(hashes, 0x00, SR_MCAST_SIZE); /* broadcast address */ hashes[SR_MCAST_SIZE - 1] |= SR_MCAST_ADDR_FLAG; if (netdev->flags & IFF_PROMISC) { rx_ctl |= RCR_PRMSC; } else if (netdev->flags & IFF_ALLMULTI || netdev_mc_count(netdev) > SR_MCAST_MAX) { rx_ctl |= RCR_RUNT; } else if (!netdev_mc_empty(netdev)) { struct netdev_hw_addr *ha; netdev_for_each_mc_addr(ha, netdev) { u32 crc = ether_crc(ETH_ALEN, ha->addr) >> 26; hashes[crc >> 3] |= 1 << (crc & 0x7); } } sr_write_async(dev, SR_MAR, SR_MCAST_SIZE, hashes); sr_write_reg_async(dev, SR_RCR, rx_ctl); } static int sr9700_set_mac_address(struct net_device *netdev, void *p) { struct usbnet *dev = netdev_priv(netdev); struct sockaddr *addr = p; if (!is_valid_ether_addr(addr->sa_data)) { netdev_err(netdev, "not setting invalid mac address %pM\n", addr->sa_data); return -EINVAL; } eth_hw_addr_set(netdev, addr->sa_data); sr_write_async(dev, SR_PAR, 6, netdev->dev_addr); return 0; } static const struct net_device_ops sr9700_netdev_ops = { .ndo_open = usbnet_open, .ndo_stop = usbnet_stop, .ndo_start_xmit = usbnet_start_xmit, .ndo_tx_timeout = usbnet_tx_timeout, .ndo_change_mtu = usbnet_change_mtu, .ndo_get_stats64 = dev_get_tstats64, .ndo_validate_addr = eth_validate_addr, .ndo_eth_ioctl = sr9700_ioctl, .ndo_set_rx_mode = sr9700_set_multicast, .ndo_set_mac_address = sr9700_set_mac_address, }; static int sr9700_bind(struct usbnet *dev, struct usb_interface *intf) { struct net_device *netdev; struct mii_if_info *mii; u8 addr[ETH_ALEN]; int ret; ret = usbnet_get_endpoints(dev, intf); if (ret) goto out; netdev = dev->net; netdev->netdev_ops = &sr9700_netdev_ops; netdev->ethtool_ops = &sr9700_ethtool_ops; netdev->hard_header_len += SR_TX_OVERHEAD; dev->hard_mtu = netdev->mtu + netdev->hard_header_len; /* bulkin buffer is preferably not less than 3K */ dev->rx_urb_size = 3072; mii = &dev->mii; mii->dev = netdev; mii->mdio_read = sr_mdio_read; mii->mdio_write = sr_mdio_write; mii->phy_id_mask = 0x1f; mii->reg_num_mask = 0x1f; sr_write_reg(dev, SR_NCR, NCR_RST); udelay(20); /* read MAC * After Chip Power on, the Chip will reload the MAC from * EEPROM automatically to PAR. In case there is no EEPROM externally, * a default MAC address is stored in PAR for making chip work properly. */ if (sr_read(dev, SR_PAR, ETH_ALEN, addr) < 0) { netdev_err(netdev, "Error reading MAC address\n"); ret = -ENODEV; goto out; } eth_hw_addr_set(netdev, addr); /* power up and reset phy */ sr_write_reg(dev, SR_PRR, PRR_PHY_RST); /* at least 10ms, here 20ms for safe */ msleep(20); sr_write_reg(dev, SR_PRR, 0); /* at least 1ms, here 2ms for reading right register */ udelay(2 * 1000); /* receive broadcast packets */ sr9700_set_multicast(netdev); sr_mdio_write(netdev, mii->phy_id, MII_BMCR, BMCR_RESET); sr_mdio_write(netdev, mii->phy_id, MII_ADVERTISE, ADVERTISE_ALL | ADVERTISE_CSMA | ADVERTISE_PAUSE_CAP); mii_nway_restart(mii); out: return ret; } static int sr9700_rx_fixup(struct usbnet *dev, struct sk_buff *skb) { struct sk_buff *sr_skb; int len; /* skb content (packets) format : * p0 p1 p2 ...... pm * / \ * / \ * / \ * / \ * p0b0 p0b1 p0b2 p0b3 ...... p0b(n-4) p0b(n-3)...p0bn * * p0 : packet 0 * p0b0 : packet 0 byte 0 * * b0: rx status * b1: packet length (incl crc) low * b2: packet length (incl crc) high * b3..n-4: packet data * bn-3..bn: ethernet packet crc */ if (unlikely(skb->len < SR_RX_OVERHEAD)) { netdev_err(dev->net, "unexpected tiny rx frame\n"); return 0; } /* one skb may contains multiple packets */ while (skb->len > SR_RX_OVERHEAD) { if (skb->data[0] != 0x40) return 0; /* ignore the CRC length */ len = (skb->data[1] | (skb->data[2] << 8)) - 4; if (len > ETH_FRAME_LEN || len > skb->len || len < 0) return 0; /* the last packet of current skb */ if (skb->len == (len + SR_RX_OVERHEAD)) { skb_pull(skb, 3); skb->len = len; skb_set_tail_pointer(skb, len); return 2; } sr_skb = netdev_alloc_skb_ip_align(dev->net, len); if (!sr_skb) return 0; skb_put(sr_skb, len); memcpy(sr_skb->data, skb->data + 3, len); usbnet_skb_return(dev, sr_skb); skb_pull(skb, len + SR_RX_OVERHEAD); } return 0; } static struct sk_buff *sr9700_tx_fixup(struct usbnet *dev, struct sk_buff *skb, gfp_t flags) { int len; /* SR9700 can only send out one ethernet packet at once. * * b0 b1 b2 b3 ...... b(n-4) b(n-3)...bn * * b0: rx status * b1: packet length (incl crc) low * b2: packet length (incl crc) high * b3..n-4: packet data * bn-3..bn: ethernet packet crc */ len = skb->len; if (skb_cow_head(skb, SR_TX_OVERHEAD)) { dev_kfree_skb_any(skb); return NULL; } __skb_push(skb, SR_TX_OVERHEAD); /* usbnet adds padding if length is a multiple of packet size * if so, adjust length value in header */ if ((skb->len % dev->maxpacket) == 0) len++; skb->data[0] = len; skb->data[1] = len >> 8; return skb; } static void sr9700_status(struct usbnet *dev, struct urb *urb) { int link; u8 *buf; /* format: b0: net status b1: tx status 1 b2: tx status 2 b3: rx status b4: rx overflow b5: rx count b6: tx count b7: gpr */ if (urb->actual_length < 8) return; buf = urb->transfer_buffer; link = !!(buf[0] & 0x40); if (netif_carrier_ok(dev->net) != link) { usbnet_link_change(dev, link, 1); netdev_dbg(dev->net, "Link Status is: %d\n", link); } } static int sr9700_link_reset(struct usbnet *dev) { struct ethtool_cmd ecmd; mii_check_media(&dev->mii, 1, 1); mii_ethtool_gset(&dev->mii, &ecmd); netdev_dbg(dev->net, "link_reset() speed: %d duplex: %d\n", ecmd.speed, ecmd.duplex); return 0; } static const struct driver_info sr9700_driver_info = { .description = "CoreChip SR9700 USB Ethernet", .flags = FLAG_ETHER, .bind = sr9700_bind, .rx_fixup = sr9700_rx_fixup, .tx_fixup = sr9700_tx_fixup, .status = sr9700_status, .link_reset = sr9700_link_reset, .reset = sr9700_link_reset, }; static const struct usb_device_id products[] = { { USB_DEVICE(0x0fe6, 0x9700), /* SR9700 device */ .driver_info = (unsigned long)&sr9700_driver_info, }, {}, /* END */ }; MODULE_DEVICE_TABLE(usb, products); static struct usb_driver sr9700_usb_driver = { .name = "sr9700", .id_table = products, .probe = usbnet_probe, .disconnect = usbnet_disconnect, .suspend = usbnet_suspend, .resume = usbnet_resume, .disable_hub_initiated_lpm = 1, }; module_usb_driver(sr9700_usb_driver); MODULE_AUTHOR("liujl <liujunliang_ljl@163.com>"); MODULE_DESCRIPTION("SR9700 one chip USB 1.1 USB to Ethernet device from http://www.corechip-sz.com/"); MODULE_LICENSE("GPL"); |
1 6 5 1 1 1 1 6 4 2 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 | // SPDX-License-Identifier: GPL-2.0 /* * usb-serial driver for Quatech USB 2 devices * * Copyright (C) 2012 Bill Pemberton (wfp5p@virginia.edu) * * These devices all have only 1 bulk in and 1 bulk out that is shared * for all serial ports. * */ #include <asm/unaligned.h> #include <linux/errno.h> #include <linux/slab.h> #include <linux/tty.h> #include <linux/tty_driver.h> #include <linux/tty_flip.h> #include <linux/module.h> #include <linux/serial.h> #include <linux/usb.h> #include <linux/usb/serial.h> #include <linux/serial_reg.h> #include <linux/uaccess.h> /* default urb timeout for usb operations */ #define QT2_USB_TIMEOUT USB_CTRL_SET_TIMEOUT #define QT_OPEN_CLOSE_CHANNEL 0xca #define QT_SET_GET_DEVICE 0xc2 #define QT_SET_GET_REGISTER 0xc0 #define QT_GET_SET_PREBUF_TRIG_LVL 0xcc #define QT_SET_ATF 0xcd #define QT_TRANSFER_IN 0xc0 #define QT_HW_FLOW_CONTROL_MASK 0xc5 #define QT_SW_FLOW_CONTROL_MASK 0xc6 #define QT2_BREAK_CONTROL 0xc8 #define QT2_GET_SET_UART 0xc1 #define QT2_FLUSH_DEVICE 0xc4 #define QT2_GET_SET_QMCR 0xe1 #define QT2_QMCR_RS232 0x40 #define QT2_QMCR_RS422 0x10 #define SERIAL_CRTSCTS ((UART_MCR_RTS << 8) | UART_MSR_CTS) #define SERIAL_EVEN_PARITY (UART_LCR_PARITY | UART_LCR_EPAR) /* status bytes for the device */ #define QT2_CONTROL_BYTE 0x1b #define QT2_LINE_STATUS 0x00 /* following 1 byte is line status */ #define QT2_MODEM_STATUS 0x01 /* following 1 byte is modem status */ #define QT2_XMIT_HOLD 0x02 /* following 2 bytes are ?? */ #define QT2_CHANGE_PORT 0x03 /* following 1 byte is port to change to */ #define QT2_REC_FLUSH 0x04 /* no following info */ #define QT2_XMIT_FLUSH 0x05 /* no following info */ #define QT2_CONTROL_ESCAPE 0xff /* pass through previous 2 control bytes */ #define MAX_BAUD_RATE 921600 #define DEFAULT_BAUD_RATE 9600 #define QT2_READ_BUFFER_SIZE 512 /* size of read buffer */ #define QT2_WRITE_BUFFER_SIZE 512 /* size of write buffer */ #define QT2_WRITE_CONTROL_SIZE 5 /* control bytes used for a write */ #define DRIVER_DESC "Quatech 2nd gen USB to Serial Driver" #define USB_VENDOR_ID_QUATECH 0x061d #define QUATECH_SSU2_100 0xC120 /* RS232 single port */ #define QUATECH_DSU2_100 0xC140 /* RS232 dual port */ #define QUATECH_DSU2_400 0xC150 /* RS232/422/485 dual port */ #define QUATECH_QSU2_100 0xC160 /* RS232 four port */ #define QUATECH_QSU2_400 0xC170 /* RS232/422/485 four port */ #define QUATECH_ESU2_100 0xC1A0 /* RS232 eight port */ #define QUATECH_ESU2_400 0xC180 /* RS232/422/485 eight port */ struct qt2_device_detail { int product_id; int num_ports; }; #define QT_DETAILS(prod, ports) \ .product_id = (prod), \ .num_ports = (ports) static const struct qt2_device_detail qt2_device_details[] = { {QT_DETAILS(QUATECH_SSU2_100, 1)}, {QT_DETAILS(QUATECH_DSU2_400, 2)}, {QT_DETAILS(QUATECH_DSU2_100, 2)}, {QT_DETAILS(QUATECH_QSU2_400, 4)}, {QT_DETAILS(QUATECH_QSU2_100, 4)}, {QT_DETAILS(QUATECH_ESU2_400, 8)}, {QT_DETAILS(QUATECH_ESU2_100, 8)}, {QT_DETAILS(0, 0)} /* Terminating entry */ }; static const struct usb_device_id id_table[] = { {USB_DEVICE(USB_VENDOR_ID_QUATECH, QUATECH_SSU2_100)}, {USB_DEVICE(USB_VENDOR_ID_QUATECH, QUATECH_DSU2_100)}, {USB_DEVICE(USB_VENDOR_ID_QUATECH, QUATECH_DSU2_400)}, {USB_DEVICE(USB_VENDOR_ID_QUATECH, QUATECH_QSU2_100)}, {USB_DEVICE(USB_VENDOR_ID_QUATECH, QUATECH_QSU2_400)}, {USB_DEVICE(USB_VENDOR_ID_QUATECH, QUATECH_ESU2_100)}, {USB_DEVICE(USB_VENDOR_ID_QUATECH, QUATECH_ESU2_400)}, {} /* Terminating entry */ }; MODULE_DEVICE_TABLE(usb, id_table); struct qt2_serial_private { unsigned char current_port; /* current port for incoming data */ struct urb *read_urb; /* shared among all ports */ char *read_buffer; }; struct qt2_port_private { u8 device_port; spinlock_t urb_lock; bool urb_in_use; struct urb *write_urb; char *write_buffer; spinlock_t lock; u8 shadowLSR; u8 shadowMSR; struct usb_serial_port *port; }; static void qt2_update_lsr(struct usb_serial_port *port, unsigned char *ch); static void qt2_update_msr(struct usb_serial_port *port, unsigned char *ch); static void qt2_write_bulk_callback(struct urb *urb); static void qt2_read_bulk_callback(struct urb *urb); static void qt2_release(struct usb_serial *serial) { struct qt2_serial_private *serial_priv; serial_priv = usb_get_serial_data(serial); usb_kill_urb(serial_priv->read_urb); usb_free_urb(serial_priv->read_urb); kfree(serial_priv->read_buffer); kfree(serial_priv); } static inline int calc_baud_divisor(int baudrate) { int divisor, rem; divisor = MAX_BAUD_RATE / baudrate; rem = MAX_BAUD_RATE % baudrate; /* Round to nearest divisor */ if (((rem * 2) >= baudrate) && (baudrate != 110)) divisor++; return divisor; } static inline int qt2_set_port_config(struct usb_device *dev, unsigned char port_number, u16 baudrate, u16 lcr) { int divisor = calc_baud_divisor(baudrate); u16 index = ((u16) (lcr << 8) | (u16) (port_number)); return usb_control_msg(dev, usb_sndctrlpipe(dev, 0), QT2_GET_SET_UART, 0x40, divisor, index, NULL, 0, QT2_USB_TIMEOUT); } static inline int qt2_control_msg(struct usb_device *dev, u8 request, u16 data, u16 index) { return usb_control_msg(dev, usb_sndctrlpipe(dev, 0), request, 0x40, data, index, NULL, 0, QT2_USB_TIMEOUT); } static inline int qt2_getregister(struct usb_device *dev, u8 uart, u8 reg, u8 *data) { int ret; ret = usb_control_msg(dev, usb_rcvctrlpipe(dev, 0), QT_SET_GET_REGISTER, 0xc0, reg, uart, data, sizeof(*data), QT2_USB_TIMEOUT); if (ret < (int)sizeof(*data)) { if (ret >= 0) ret = -EIO; } return ret; } static inline int qt2_setregister(struct usb_device *dev, u8 uart, u8 reg, u16 data) { u16 value = (data << 8) | reg; return usb_control_msg(dev, usb_sndctrlpipe(dev, 0), QT_SET_GET_REGISTER, 0x40, value, uart, NULL, 0, QT2_USB_TIMEOUT); } static inline int update_mctrl(struct qt2_port_private *port_priv, unsigned int set, unsigned int clear) { struct usb_serial_port *port = port_priv->port; struct usb_device *dev = port->serial->dev; unsigned urb_value; int status; if (((set | clear) & (TIOCM_DTR | TIOCM_RTS)) == 0) { dev_dbg(&port->dev, "update_mctrl - DTR|RTS not being set|cleared\n"); return 0; /* no change */ } clear &= ~set; /* 'set' takes precedence over 'clear' */ urb_value = 0; if (set & TIOCM_DTR) urb_value |= UART_MCR_DTR; if (set & TIOCM_RTS) urb_value |= UART_MCR_RTS; status = qt2_setregister(dev, port_priv->device_port, UART_MCR, urb_value); if (status < 0) dev_err(&port->dev, "update_mctrl - Error from MODEM_CTRL urb: %i\n", status); return status; } static int qt2_calc_num_ports(struct usb_serial *serial, struct usb_serial_endpoints *epds) { struct qt2_device_detail d; int i; for (i = 0; d = qt2_device_details[i], d.product_id != 0; i++) { if (d.product_id == le16_to_cpu(serial->dev->descriptor.idProduct)) return d.num_ports; } /* we didn't recognize the device */ dev_err(&serial->dev->dev, "don't know the number of ports, assuming 1\n"); return 1; } static void qt2_set_termios(struct tty_struct *tty, struct usb_serial_port *port, const struct ktermios *old_termios) { struct usb_device *dev = port->serial->dev; struct qt2_port_private *port_priv; struct ktermios *termios = &tty->termios; u16 baud; unsigned int cflag = termios->c_cflag; u16 new_lcr = 0; int status; port_priv = usb_get_serial_port_data(port); if (cflag & PARENB) { if (cflag & PARODD) new_lcr |= UART_LCR_PARITY; else new_lcr |= SERIAL_EVEN_PARITY; } new_lcr |= UART_LCR_WLEN(tty_get_char_size(cflag)); baud = tty_get_baud_rate(tty); if (!baud) baud = 9600; status = qt2_set_port_config(dev, port_priv->device_port, baud, new_lcr); if (status < 0) dev_err(&port->dev, "%s - qt2_set_port_config failed: %i\n", __func__, status); if (cflag & CRTSCTS) status = qt2_control_msg(dev, QT_HW_FLOW_CONTROL_MASK, SERIAL_CRTSCTS, port_priv->device_port); else status = qt2_control_msg(dev, QT_HW_FLOW_CONTROL_MASK, 0, port_priv->device_port); if (status < 0) dev_err(&port->dev, "%s - set HW flow control failed: %i\n", __func__, status); if (I_IXOFF(tty) || I_IXON(tty)) { u16 x = ((u16) (START_CHAR(tty) << 8) | (u16) (STOP_CHAR(tty))); status = qt2_control_msg(dev, QT_SW_FLOW_CONTROL_MASK, x, port_priv->device_port); } else status = qt2_control_msg(dev, QT_SW_FLOW_CONTROL_MASK, 0, port_priv->device_port); if (status < 0) dev_err(&port->dev, "%s - set SW flow control failed: %i\n", __func__, status); } static int qt2_open(struct tty_struct *tty, struct usb_serial_port *port) { struct usb_serial *serial; struct qt2_port_private *port_priv; u8 *data; u16 device_port; int status; unsigned long flags; device_port = port->port_number; serial = port->serial; port_priv = usb_get_serial_port_data(port); /* set the port to RS232 mode */ status = qt2_control_msg(serial->dev, QT2_GET_SET_QMCR, QT2_QMCR_RS232, device_port); if (status < 0) { dev_err(&port->dev, "%s failed to set RS232 mode for port %i error %i\n", __func__, device_port, status); return status; } data = kzalloc(2, GFP_KERNEL); if (!data) return -ENOMEM; /* open the port */ status = usb_control_msg(serial->dev, usb_rcvctrlpipe(serial->dev, 0), QT_OPEN_CLOSE_CHANNEL, 0xc0, 0, device_port, data, 2, QT2_USB_TIMEOUT); if (status < 2) { dev_err(&port->dev, "%s - open port failed %i\n", __func__, status); if (status >= 0) status = -EIO; kfree(data); return status; } spin_lock_irqsave(&port_priv->lock, flags); port_priv->shadowLSR = data[0]; port_priv->shadowMSR = data[1]; spin_unlock_irqrestore(&port_priv->lock, flags); kfree(data); /* set to default speed and 8bit word size */ status = qt2_set_port_config(serial->dev, device_port, DEFAULT_BAUD_RATE, UART_LCR_WLEN8); if (status < 0) { dev_err(&port->dev, "%s - initial setup failed (%i)\n", __func__, device_port); return status; } port_priv->device_port = (u8) device_port; if (tty) qt2_set_termios(tty, port, &tty->termios); return 0; } static void qt2_close(struct usb_serial_port *port) { struct usb_serial *serial; struct qt2_port_private *port_priv; int i; serial = port->serial; port_priv = usb_get_serial_port_data(port); usb_kill_urb(port_priv->write_urb); /* flush the port transmit buffer */ i = usb_control_msg(serial->dev, usb_sndctrlpipe(serial->dev, 0), QT2_FLUSH_DEVICE, 0x40, 1, port_priv->device_port, NULL, 0, QT2_USB_TIMEOUT); if (i < 0) dev_err(&port->dev, "%s - transmit buffer flush failed: %i\n", __func__, i); /* flush the port receive buffer */ i = usb_control_msg(serial->dev, usb_sndctrlpipe(serial->dev, 0), QT2_FLUSH_DEVICE, 0x40, 0, port_priv->device_port, NULL, 0, QT2_USB_TIMEOUT); if (i < 0) dev_err(&port->dev, "%s - receive buffer flush failed: %i\n", __func__, i); /* close the port */ i = usb_control_msg(serial->dev, usb_sndctrlpipe(serial->dev, 0), QT_OPEN_CLOSE_CHANNEL, 0x40, 0, port_priv->device_port, NULL, 0, QT2_USB_TIMEOUT); if (i < 0) dev_err(&port->dev, "%s - close port failed %i\n", __func__, i); } static void qt2_disconnect(struct usb_serial *serial) { struct qt2_serial_private *serial_priv = usb_get_serial_data(serial); usb_kill_urb(serial_priv->read_urb); } static void qt2_process_status(struct usb_serial_port *port, unsigned char *ch) { switch (*ch) { case QT2_LINE_STATUS: qt2_update_lsr(port, ch + 1); break; case QT2_MODEM_STATUS: qt2_update_msr(port, ch + 1); break; } } static void qt2_process_read_urb(struct urb *urb) { struct usb_serial *serial; struct qt2_serial_private *serial_priv; struct usb_serial_port *port; bool escapeflag; unsigned char *ch; int i; unsigned char newport; int len = urb->actual_length; if (!len) return; ch = urb->transfer_buffer; serial = urb->context; serial_priv = usb_get_serial_data(serial); port = serial->port[serial_priv->current_port]; for (i = 0; i < urb->actual_length; i++) { ch = (unsigned char *)urb->transfer_buffer + i; if ((i <= (len - 3)) && (*ch == QT2_CONTROL_BYTE) && (*(ch + 1) == QT2_CONTROL_BYTE)) { escapeflag = false; switch (*(ch + 2)) { case QT2_LINE_STATUS: case QT2_MODEM_STATUS: if (i > (len - 4)) { dev_warn(&port->dev, "%s - status message too short\n", __func__); break; } qt2_process_status(port, ch + 2); i += 3; escapeflag = true; break; case QT2_XMIT_HOLD: if (i > (len - 5)) { dev_warn(&port->dev, "%s - xmit_empty message too short\n", __func__); break; } /* bytes_written = (ch[1] << 4) + ch[0]; */ i += 4; escapeflag = true; break; case QT2_CHANGE_PORT: if (i > (len - 4)) { dev_warn(&port->dev, "%s - change_port message too short\n", __func__); break; } tty_flip_buffer_push(&port->port); newport = *(ch + 3); if (newport > serial->num_ports) { dev_err(&port->dev, "%s - port change to invalid port: %i\n", __func__, newport); break; } serial_priv->current_port = newport; port = serial->port[serial_priv->current_port]; i += 3; escapeflag = true; break; case QT2_REC_FLUSH: case QT2_XMIT_FLUSH: i += 2; escapeflag = true; break; case QT2_CONTROL_ESCAPE: tty_insert_flip_string(&port->port, ch, 2); i += 2; escapeflag = true; break; default: dev_warn(&port->dev, "%s - unsupported command %i\n", __func__, *(ch + 2)); break; } if (escapeflag) continue; } tty_insert_flip_char(&port->port, *ch, TTY_NORMAL); } tty_flip_buffer_push(&port->port); } static void qt2_write_bulk_callback(struct urb *urb) { struct usb_serial_port *port; struct qt2_port_private *port_priv; unsigned long flags; port = urb->context; port_priv = usb_get_serial_port_data(port); spin_lock_irqsave(&port_priv->urb_lock, flags); port_priv->urb_in_use = false; usb_serial_port_softint(port); spin_unlock_irqrestore(&port_priv->urb_lock, flags); } static void qt2_read_bulk_callback(struct urb *urb) { struct usb_serial *serial = urb->context; int status; if (urb->status) { dev_warn(&serial->dev->dev, "%s - non-zero urb status: %i\n", __func__, urb->status); return; } qt2_process_read_urb(urb); status = usb_submit_urb(urb, GFP_ATOMIC); if (status != 0) dev_err(&serial->dev->dev, "%s - resubmit read urb failed: %i\n", __func__, status); } static int qt2_setup_urbs(struct usb_serial *serial) { struct usb_serial_port *port0; struct qt2_serial_private *serial_priv; int status; port0 = serial->port[0]; serial_priv = usb_get_serial_data(serial); serial_priv->read_urb = usb_alloc_urb(0, GFP_KERNEL); if (!serial_priv->read_urb) return -ENOMEM; usb_fill_bulk_urb(serial_priv->read_urb, serial->dev, usb_rcvbulkpipe(serial->dev, port0->bulk_in_endpointAddress), serial_priv->read_buffer, QT2_READ_BUFFER_SIZE, qt2_read_bulk_callback, serial); status = usb_submit_urb(serial_priv->read_urb, GFP_KERNEL); if (status != 0) { dev_err(&serial->dev->dev, "%s - submit read urb failed %i\n", __func__, status); usb_free_urb(serial_priv->read_urb); return status; } return 0; } static int qt2_attach(struct usb_serial *serial) { struct qt2_serial_private *serial_priv; int status; /* power on unit */ status = usb_control_msg(serial->dev, usb_sndctrlpipe(serial->dev, 0), 0xc2, 0x40, 0x8000, 0, NULL, 0, QT2_USB_TIMEOUT); if (status < 0) { dev_err(&serial->dev->dev, "%s - failed to power on unit: %i\n", __func__, status); return status; } serial_priv = kzalloc(sizeof(*serial_priv), GFP_KERNEL); if (!serial_priv) return -ENOMEM; serial_priv->read_buffer = kmalloc(QT2_READ_BUFFER_SIZE, GFP_KERNEL); if (!serial_priv->read_buffer) { status = -ENOMEM; goto err_buf; } usb_set_serial_data(serial, serial_priv); status = qt2_setup_urbs(serial); if (status != 0) goto attach_failed; return 0; attach_failed: kfree(serial_priv->read_buffer); err_buf: kfree(serial_priv); return status; } static int qt2_port_probe(struct usb_serial_port *port) { struct usb_serial *serial = port->serial; struct qt2_port_private *port_priv; u8 bEndpointAddress; port_priv = kzalloc(sizeof(*port_priv), GFP_KERNEL); if (!port_priv) return -ENOMEM; spin_lock_init(&port_priv->lock); spin_lock_init(&port_priv->urb_lock); port_priv->port = port; port_priv->write_buffer = kmalloc(QT2_WRITE_BUFFER_SIZE, GFP_KERNEL); if (!port_priv->write_buffer) goto err_buf; port_priv->write_urb = usb_alloc_urb(0, GFP_KERNEL); if (!port_priv->write_urb) goto err_urb; bEndpointAddress = serial->port[0]->bulk_out_endpointAddress; usb_fill_bulk_urb(port_priv->write_urb, serial->dev, usb_sndbulkpipe(serial->dev, bEndpointAddress), port_priv->write_buffer, QT2_WRITE_BUFFER_SIZE, qt2_write_bulk_callback, port); usb_set_serial_port_data(port, port_priv); return 0; err_urb: kfree(port_priv->write_buffer); err_buf: kfree(port_priv); return -ENOMEM; } static void qt2_port_remove(struct usb_serial_port *port) { struct qt2_port_private *port_priv; port_priv = usb_get_serial_port_data(port); usb_free_urb(port_priv->write_urb); kfree(port_priv->write_buffer); kfree(port_priv); } static int qt2_tiocmget(struct tty_struct *tty) { struct usb_serial_port *port = tty->driver_data; struct usb_device *dev = port->serial->dev; struct qt2_port_private *port_priv = usb_get_serial_port_data(port); u8 *d; int r; d = kzalloc(2, GFP_KERNEL); if (!d) return -ENOMEM; r = qt2_getregister(dev, port_priv->device_port, UART_MCR, d); if (r < 0) goto mget_out; r = qt2_getregister(dev, port_priv->device_port, UART_MSR, d + 1); if (r < 0) goto mget_out; r = (d[0] & UART_MCR_DTR ? TIOCM_DTR : 0) | (d[0] & UART_MCR_RTS ? TIOCM_RTS : 0) | (d[1] & UART_MSR_CTS ? TIOCM_CTS : 0) | (d[1] & UART_MSR_DCD ? TIOCM_CAR : 0) | (d[1] & UART_MSR_RI ? TIOCM_RI : 0) | (d[1] & UART_MSR_DSR ? TIOCM_DSR : 0); mget_out: kfree(d); return r; } static int qt2_tiocmset(struct tty_struct *tty, unsigned int set, unsigned int clear) { struct qt2_port_private *port_priv; port_priv = usb_get_serial_port_data(tty->driver_data); return update_mctrl(port_priv, set, clear); } static int qt2_break_ctl(struct tty_struct *tty, int break_state) { struct usb_serial_port *port = tty->driver_data; struct qt2_port_private *port_priv; int status; u16 val; port_priv = usb_get_serial_port_data(port); val = (break_state == -1) ? 1 : 0; status = qt2_control_msg(port->serial->dev, QT2_BREAK_CONTROL, val, port_priv->device_port); if (status < 0) { dev_warn(&port->dev, "%s - failed to send control message: %i\n", __func__, status); return status; } return 0; } static void qt2_dtr_rts(struct usb_serial_port *port, int on) { struct usb_device *dev = port->serial->dev; struct qt2_port_private *port_priv = usb_get_serial_port_data(port); /* Disable flow control */ if (!on) { if (qt2_setregister(dev, port_priv->device_port, UART_MCR, 0) < 0) dev_warn(&port->dev, "error from flowcontrol urb\n"); } /* drop RTS and DTR */ if (on) update_mctrl(port_priv, TIOCM_DTR | TIOCM_RTS, 0); else update_mctrl(port_priv, 0, TIOCM_DTR | TIOCM_RTS); } static void qt2_update_msr(struct usb_serial_port *port, unsigned char *ch) { struct qt2_port_private *port_priv; u8 newMSR = (u8) *ch; unsigned long flags; /* May be called from qt2_process_read_urb() for an unbound port. */ port_priv = usb_get_serial_port_data(port); if (!port_priv) return; spin_lock_irqsave(&port_priv->lock, flags); port_priv->shadowMSR = newMSR; spin_unlock_irqrestore(&port_priv->lock, flags); if (newMSR & UART_MSR_ANY_DELTA) { /* update input line counters */ if (newMSR & UART_MSR_DCTS) port->icount.cts++; if (newMSR & UART_MSR_DDSR) port->icount.dsr++; if (newMSR & UART_MSR_DDCD) port->icount.dcd++; if (newMSR & UART_MSR_TERI) port->icount.rng++; wake_up_interruptible(&port->port.delta_msr_wait); } } static void qt2_update_lsr(struct usb_serial_port *port, unsigned char *ch) { struct qt2_port_private *port_priv; struct async_icount *icount; unsigned long flags; u8 newLSR = (u8) *ch; /* May be called from qt2_process_read_urb() for an unbound port. */ port_priv = usb_get_serial_port_data(port); if (!port_priv) return; if (newLSR & UART_LSR_BI) newLSR &= (u8) (UART_LSR_OE | UART_LSR_BI); spin_lock_irqsave(&port_priv->lock, flags); port_priv->shadowLSR = newLSR; spin_unlock_irqrestore(&port_priv->lock, flags); icount = &port->icount; if (newLSR & UART_LSR_BRK_ERROR_BITS) { if (newLSR & UART_LSR_BI) icount->brk++; if (newLSR & UART_LSR_OE) icount->overrun++; if (newLSR & UART_LSR_PE) icount->parity++; if (newLSR & UART_LSR_FE) icount->frame++; } } static unsigned int qt2_write_room(struct tty_struct *tty) { struct usb_serial_port *port = tty->driver_data; struct qt2_port_private *port_priv; unsigned long flags; unsigned int r; port_priv = usb_get_serial_port_data(port); spin_lock_irqsave(&port_priv->urb_lock, flags); if (port_priv->urb_in_use) r = 0; else r = QT2_WRITE_BUFFER_SIZE - QT2_WRITE_CONTROL_SIZE; spin_unlock_irqrestore(&port_priv->urb_lock, flags); return r; } static int qt2_write(struct tty_struct *tty, struct usb_serial_port *port, const unsigned char *buf, int count) { struct qt2_port_private *port_priv; struct urb *write_urb; unsigned char *data; unsigned long flags; int status; int bytes_out = 0; port_priv = usb_get_serial_port_data(port); if (port_priv->write_urb == NULL) { dev_err(&port->dev, "%s - no output urb\n", __func__); return 0; } write_urb = port_priv->write_urb; count = min(count, QT2_WRITE_BUFFER_SIZE - QT2_WRITE_CONTROL_SIZE); data = write_urb->transfer_buffer; spin_lock_irqsave(&port_priv->urb_lock, flags); if (port_priv->urb_in_use) { dev_err(&port->dev, "qt2_write - urb is in use\n"); goto write_out; } *data++ = QT2_CONTROL_BYTE; *data++ = QT2_CONTROL_BYTE; *data++ = port_priv->device_port; put_unaligned_le16(count, data); data += 2; memcpy(data, buf, count); write_urb->transfer_buffer_length = count + QT2_WRITE_CONTROL_SIZE; status = usb_submit_urb(write_urb, GFP_ATOMIC); if (status == 0) { port_priv->urb_in_use = true; bytes_out += count; } write_out: spin_unlock_irqrestore(&port_priv->urb_lock, flags); return bytes_out; } static struct usb_serial_driver qt2_device = { .driver = { .owner = THIS_MODULE, .name = "quatech-serial", }, .description = DRIVER_DESC, .id_table = id_table, .open = qt2_open, .close = qt2_close, .write = qt2_write, .write_room = qt2_write_room, .calc_num_ports = qt2_calc_num_ports, .attach = qt2_attach, .release = qt2_release, .disconnect = qt2_disconnect, .port_probe = qt2_port_probe, .port_remove = qt2_port_remove, .dtr_rts = qt2_dtr_rts, .break_ctl = qt2_break_ctl, .tiocmget = qt2_tiocmget, .tiocmset = qt2_tiocmset, .tiocmiwait = usb_serial_generic_tiocmiwait, .get_icount = usb_serial_generic_get_icount, .set_termios = qt2_set_termios, }; static struct usb_serial_driver *const serial_drivers[] = { &qt2_device, NULL }; module_usb_serial_driver(serial_drivers, id_table); MODULE_DESCRIPTION(DRIVER_DESC); MODULE_LICENSE("GPL v2"); |
109 58 191 69 196 49 35 47 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 | /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM power #if !defined(_TRACE_POWER_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_POWER_H #include <linux/cpufreq.h> #include <linux/ktime.h> #include <linux/pm_qos.h> #include <linux/tracepoint.h> #include <linux/trace_events.h> #define TPS(x) tracepoint_string(x) DECLARE_EVENT_CLASS(cpu, TP_PROTO(unsigned int state, unsigned int cpu_id), TP_ARGS(state, cpu_id), TP_STRUCT__entry( __field( u32, state ) __field( u32, cpu_id ) ), TP_fast_assign( __entry->state = state; __entry->cpu_id = cpu_id; ), TP_printk("state=%lu cpu_id=%lu", (unsigned long)__entry->state, (unsigned long)__entry->cpu_id) ); DEFINE_EVENT(cpu, cpu_idle, TP_PROTO(unsigned int state, unsigned int cpu_id), TP_ARGS(state, cpu_id) ); TRACE_EVENT(cpu_idle_miss, TP_PROTO(unsigned int cpu_id, unsigned int state, bool below), TP_ARGS(cpu_id, state, below), TP_STRUCT__entry( __field(u32, cpu_id) __field(u32, state) __field(bool, below) ), TP_fast_assign( __entry->cpu_id = cpu_id; __entry->state = state; __entry->below = below; ), TP_printk("cpu_id=%lu state=%lu type=%s", (unsigned long)__entry->cpu_id, (unsigned long)__entry->state, (__entry->below)?"below":"above") ); TRACE_EVENT(powernv_throttle, TP_PROTO(int chip_id, const char *reason, int pmax), TP_ARGS(chip_id, reason, pmax), TP_STRUCT__entry( __field(int, chip_id) __string(reason, reason) __field(int, pmax) ), TP_fast_assign( __entry->chip_id = chip_id; __assign_str(reason); __entry->pmax = pmax; ), TP_printk("Chip %d Pmax %d %s", __entry->chip_id, __entry->pmax, __get_str(reason)) ); TRACE_EVENT(pstate_sample, TP_PROTO(u32 core_busy, u32 scaled_busy, u32 from, u32 to, u64 mperf, u64 aperf, u64 tsc, u32 freq, u32 io_boost ), TP_ARGS(core_busy, scaled_busy, from, to, mperf, aperf, tsc, freq, io_boost ), TP_STRUCT__entry( __field(u32, core_busy) __field(u32, scaled_busy) __field(u32, from) __field(u32, to) __field(u64, mperf) __field(u64, aperf) __field(u64, tsc) __field(u32, freq) __field(u32, io_boost) ), TP_fast_assign( __entry->core_busy = core_busy; __entry->scaled_busy = scaled_busy; __entry->from = from; __entry->to = to; __entry->mperf = mperf; __entry->aperf = aperf; __entry->tsc = tsc; __entry->freq = freq; __entry->io_boost = io_boost; ), TP_printk("core_busy=%lu scaled=%lu from=%lu to=%lu mperf=%llu aperf=%llu tsc=%llu freq=%lu io_boost=%lu", (unsigned long)__entry->core_busy, (unsigned long)__entry->scaled_busy, (unsigned long)__entry->from, (unsigned long)__entry->to, (unsigned long long)__entry->mperf, (unsigned long long)__entry->aperf, (unsigned long long)__entry->tsc, (unsigned long)__entry->freq, (unsigned long)__entry->io_boost ) ); /* This file can get included multiple times, TRACE_HEADER_MULTI_READ at top */ #ifndef _PWR_EVENT_AVOID_DOUBLE_DEFINING #define _PWR_EVENT_AVOID_DOUBLE_DEFINING #define PWR_EVENT_EXIT -1 #endif #define pm_verb_symbolic(event) \ __print_symbolic(event, \ { PM_EVENT_SUSPEND, "suspend" }, \ { PM_EVENT_RESUME, "resume" }, \ { PM_EVENT_FREEZE, "freeze" }, \ { PM_EVENT_QUIESCE, "quiesce" }, \ { PM_EVENT_HIBERNATE, "hibernate" }, \ { PM_EVENT_THAW, "thaw" }, \ { PM_EVENT_RESTORE, "restore" }, \ { PM_EVENT_RECOVER, "recover" }) DEFINE_EVENT(cpu, cpu_frequency, TP_PROTO(unsigned int frequency, unsigned int cpu_id), TP_ARGS(frequency, cpu_id) ); TRACE_EVENT(cpu_frequency_limits, TP_PROTO(struct cpufreq_policy *policy), TP_ARGS(policy), TP_STRUCT__entry( __field(u32, min_freq) __field(u32, max_freq) __field(u32, cpu_id) ), TP_fast_assign( __entry->min_freq = policy->min; __entry->max_freq = policy->max; __entry->cpu_id = policy->cpu; ), TP_printk("min=%lu max=%lu cpu_id=%lu", (unsigned long)__entry->min_freq, (unsigned long)__entry->max_freq, (unsigned long)__entry->cpu_id) ); TRACE_EVENT(device_pm_callback_start, TP_PROTO(struct device *dev, const char *pm_ops, int event), TP_ARGS(dev, pm_ops, event), TP_STRUCT__entry( __string(device, dev_name(dev)) __string(driver, dev_driver_string(dev)) __string(parent, dev->parent ? dev_name(dev->parent) : "none") __string(pm_ops, pm_ops ? pm_ops : "none ") __field(int, event) ), TP_fast_assign( __assign_str(device); __assign_str(driver); __assign_str(parent); __assign_str(pm_ops); __entry->event = event; ), TP_printk("%s %s, parent: %s, %s[%s]", __get_str(driver), __get_str(device), __get_str(parent), __get_str(pm_ops), pm_verb_symbolic(__entry->event)) ); TRACE_EVENT(device_pm_callback_end, TP_PROTO(struct device *dev, int error), TP_ARGS(dev, error), TP_STRUCT__entry( __string(device, dev_name(dev)) __string(driver, dev_driver_string(dev)) __field(int, error) ), TP_fast_assign( __assign_str(device); __assign_str(driver); __entry->error = error; ), TP_printk("%s %s, err=%d", __get_str(driver), __get_str(device), __entry->error) ); TRACE_EVENT(suspend_resume, TP_PROTO(const char *action, int val, bool start), TP_ARGS(action, val, start), TP_STRUCT__entry( __field(const char *, action) __field(int, val) __field(bool, start) ), TP_fast_assign( __entry->action = action; __entry->val = val; __entry->start = start; ), TP_printk("%s[%u] %s", __entry->action, (unsigned int)__entry->val, (__entry->start)?"begin":"end") ); DECLARE_EVENT_CLASS(wakeup_source, TP_PROTO(const char *name, unsigned int state), TP_ARGS(name, state), TP_STRUCT__entry( __string( name, name ) __field( u64, state ) ), TP_fast_assign( __assign_str(name); __entry->state = state; ), TP_printk("%s state=0x%lx", __get_str(name), (unsigned long)__entry->state) ); DEFINE_EVENT(wakeup_source, wakeup_source_activate, TP_PROTO(const char *name, unsigned int state), TP_ARGS(name, state) ); DEFINE_EVENT(wakeup_source, wakeup_source_deactivate, TP_PROTO(const char *name, unsigned int state), TP_ARGS(name, state) ); /* * The clock events are used for clock enable/disable and for * clock rate change */ DECLARE_EVENT_CLASS(clock, TP_PROTO(const char *name, unsigned int state, unsigned int cpu_id), TP_ARGS(name, state, cpu_id), TP_STRUCT__entry( __string( name, name ) __field( u64, state ) __field( u64, cpu_id ) ), TP_fast_assign( __assign_str(name); __entry->state = state; __entry->cpu_id = cpu_id; ), TP_printk("%s state=%lu cpu_id=%lu", __get_str(name), (unsigned long)__entry->state, (unsigned long)__entry->cpu_id) ); DEFINE_EVENT(clock, clock_enable, TP_PROTO(const char *name, unsigned int state, unsigned int cpu_id), TP_ARGS(name, state, cpu_id) ); DEFINE_EVENT(clock, clock_disable, TP_PROTO(const char *name, unsigned int state, unsigned int cpu_id), TP_ARGS(name, state, cpu_id) ); DEFINE_EVENT(clock, clock_set_rate, TP_PROTO(const char *name, unsigned int state, unsigned int cpu_id), TP_ARGS(name, state, cpu_id) ); /* * The power domain events are used for power domains transitions */ DECLARE_EVENT_CLASS(power_domain, TP_PROTO(const char *name, unsigned int state, unsigned int cpu_id), TP_ARGS(name, state, cpu_id), TP_STRUCT__entry( __string( name, name ) __field( u64, state ) __field( u64, cpu_id ) ), TP_fast_assign( __assign_str(name); __entry->state = state; __entry->cpu_id = cpu_id; ), TP_printk("%s state=%lu cpu_id=%lu", __get_str(name), (unsigned long)__entry->state, (unsigned long)__entry->cpu_id) ); DEFINE_EVENT(power_domain, power_domain_target, TP_PROTO(const char *name, unsigned int state, unsigned int cpu_id), TP_ARGS(name, state, cpu_id) ); /* * CPU latency QoS events used for global CPU latency QoS list updates */ DECLARE_EVENT_CLASS(cpu_latency_qos_request, TP_PROTO(s32 value), TP_ARGS(value), TP_STRUCT__entry( __field( s32, value ) ), TP_fast_assign( __entry->value = value; ), TP_printk("CPU_DMA_LATENCY value=%d", __entry->value) ); DEFINE_EVENT(cpu_latency_qos_request, pm_qos_add_request, TP_PROTO(s32 value), TP_ARGS(value) ); DEFINE_EVENT(cpu_latency_qos_request, pm_qos_update_request, TP_PROTO(s32 value), TP_ARGS(value) ); DEFINE_EVENT(cpu_latency_qos_request, pm_qos_remove_request, TP_PROTO(s32 value), TP_ARGS(value) ); /* * General PM QoS events used for updates of PM QoS request lists */ DECLARE_EVENT_CLASS(pm_qos_update, TP_PROTO(enum pm_qos_req_action action, int prev_value, int curr_value), TP_ARGS(action, prev_value, curr_value), TP_STRUCT__entry( __field( enum pm_qos_req_action, action ) __field( int, prev_value ) __field( int, curr_value ) ), TP_fast_assign( __entry->action = action; __entry->prev_value = prev_value; __entry->curr_value = curr_value; ), TP_printk("action=%s prev_value=%d curr_value=%d", __print_symbolic(__entry->action, { PM_QOS_ADD_REQ, "ADD_REQ" }, { PM_QOS_UPDATE_REQ, "UPDATE_REQ" }, { PM_QOS_REMOVE_REQ, "REMOVE_REQ" }), __entry->prev_value, __entry->curr_value) ); DEFINE_EVENT(pm_qos_update, pm_qos_update_target, TP_PROTO(enum pm_qos_req_action action, int prev_value, int curr_value), TP_ARGS(action, prev_value, curr_value) ); DEFINE_EVENT_PRINT(pm_qos_update, pm_qos_update_flags, TP_PROTO(enum pm_qos_req_action action, int prev_value, int curr_value), TP_ARGS(action, prev_value, curr_value), TP_printk("action=%s prev_value=0x%x curr_value=0x%x", __print_symbolic(__entry->action, { PM_QOS_ADD_REQ, "ADD_REQ" }, { PM_QOS_UPDATE_REQ, "UPDATE_REQ" }, { PM_QOS_REMOVE_REQ, "REMOVE_REQ" }), __entry->prev_value, __entry->curr_value) ); DECLARE_EVENT_CLASS(dev_pm_qos_request, TP_PROTO(const char *name, enum dev_pm_qos_req_type type, s32 new_value), TP_ARGS(name, type, new_value), TP_STRUCT__entry( __string( name, name ) __field( enum dev_pm_qos_req_type, type ) __field( s32, new_value ) ), TP_fast_assign( __assign_str(name); __entry->type = type; __entry->new_value = new_value; ), TP_printk("device=%s type=%s new_value=%d", __get_str(name), __print_symbolic(__entry->type, { DEV_PM_QOS_RESUME_LATENCY, "DEV_PM_QOS_RESUME_LATENCY" }, { DEV_PM_QOS_FLAGS, "DEV_PM_QOS_FLAGS" }), __entry->new_value) ); DEFINE_EVENT(dev_pm_qos_request, dev_pm_qos_add_request, TP_PROTO(const char *name, enum dev_pm_qos_req_type type, s32 new_value), TP_ARGS(name, type, new_value) ); DEFINE_EVENT(dev_pm_qos_request, dev_pm_qos_update_request, TP_PROTO(const char *name, enum dev_pm_qos_req_type type, s32 new_value), TP_ARGS(name, type, new_value) ); DEFINE_EVENT(dev_pm_qos_request, dev_pm_qos_remove_request, TP_PROTO(const char *name, enum dev_pm_qos_req_type type, s32 new_value), TP_ARGS(name, type, new_value) ); TRACE_EVENT(guest_halt_poll_ns, TP_PROTO(bool grow, unsigned int new, unsigned int old), TP_ARGS(grow, new, old), TP_STRUCT__entry( __field(bool, grow) __field(unsigned int, new) __field(unsigned int, old) ), TP_fast_assign( __entry->grow = grow; __entry->new = new; __entry->old = old; ), TP_printk("halt_poll_ns %u (%s %u)", __entry->new, __entry->grow ? "grow" : "shrink", __entry->old) ); #define trace_guest_halt_poll_ns_grow(new, old) \ trace_guest_halt_poll_ns(true, new, old) #define trace_guest_halt_poll_ns_shrink(new, old) \ trace_guest_halt_poll_ns(false, new, old) #endif /* _TRACE_POWER_H */ /* This part must be outside protection */ #include <trace/define_trace.h> |
4 16 16 1 1 3 1 1 1 4 4 5 4 16 5 15 16 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 | // SPDX-License-Identifier: GPL-2.0 /* * Key setup for v1 encryption policies * * Copyright 2015, 2019 Google LLC */ /* * This file implements compatibility functions for the original encryption * policy version ("v1"), including: * * - Deriving per-file encryption keys using the AES-128-ECB based KDF * (rather than the new method of using HKDF-SHA512) * * - Retrieving fscrypt master keys from process-subscribed keyrings * (rather than the new method of using a filesystem-level keyring) * * - Handling policies with the DIRECT_KEY flag set using a master key table * (rather than the new method of implementing DIRECT_KEY with per-mode keys * managed alongside the master keys in the filesystem-level keyring) */ #include <crypto/skcipher.h> #include <crypto/utils.h> #include <keys/user-type.h> #include <linux/hashtable.h> #include <linux/scatterlist.h> #include "fscrypt_private.h" /* Table of keys referenced by DIRECT_KEY policies */ static DEFINE_HASHTABLE(fscrypt_direct_keys, 6); /* 6 bits = 64 buckets */ static DEFINE_SPINLOCK(fscrypt_direct_keys_lock); /* * v1 key derivation function. This generates the derived key by encrypting the * master key with AES-128-ECB using the nonce as the AES key. This provides a * unique derived key with sufficient entropy for each inode. However, it's * nonstandard, non-extensible, doesn't evenly distribute the entropy from the * master key, and is trivially reversible: an attacker who compromises a * derived key can "decrypt" it to get back to the master key, then derive any * other key. For all new code, use HKDF instead. * * The master key must be at least as long as the derived key. If the master * key is longer, then only the first 'derived_keysize' bytes are used. */ static int derive_key_aes(const u8 *master_key, const u8 nonce[FSCRYPT_FILE_NONCE_SIZE], u8 *derived_key, unsigned int derived_keysize) { int res = 0; struct skcipher_request *req = NULL; DECLARE_CRYPTO_WAIT(wait); struct scatterlist src_sg, dst_sg; struct crypto_skcipher *tfm = crypto_alloc_skcipher("ecb(aes)", 0, 0); if (IS_ERR(tfm)) { res = PTR_ERR(tfm); tfm = NULL; goto out; } crypto_skcipher_set_flags(tfm, CRYPTO_TFM_REQ_FORBID_WEAK_KEYS); req = skcipher_request_alloc(tfm, GFP_KERNEL); if (!req) { res = -ENOMEM; goto out; } skcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, crypto_req_done, &wait); res = crypto_skcipher_setkey(tfm, nonce, FSCRYPT_FILE_NONCE_SIZE); if (res < 0) goto out; sg_init_one(&src_sg, master_key, derived_keysize); sg_init_one(&dst_sg, derived_key, derived_keysize); skcipher_request_set_crypt(req, &src_sg, &dst_sg, derived_keysize, NULL); res = crypto_wait_req(crypto_skcipher_encrypt(req), &wait); out: skcipher_request_free(req); crypto_free_skcipher(tfm); return res; } /* * Search the current task's subscribed keyrings for a "logon" key with * description prefix:descriptor, and if found acquire a read lock on it and * return a pointer to its validated payload in *payload_ret. */ static struct key * find_and_lock_process_key(const char *prefix, const u8 descriptor[FSCRYPT_KEY_DESCRIPTOR_SIZE], unsigned int min_keysize, const struct fscrypt_key **payload_ret) { char *description; struct key *key; const struct user_key_payload *ukp; const struct fscrypt_key *payload; description = kasprintf(GFP_KERNEL, "%s%*phN", prefix, FSCRYPT_KEY_DESCRIPTOR_SIZE, descriptor); if (!description) return ERR_PTR(-ENOMEM); key = request_key(&key_type_logon, description, NULL); kfree(description); if (IS_ERR(key)) return key; down_read(&key->sem); ukp = user_key_payload_locked(key); if (!ukp) /* was the key revoked before we acquired its semaphore? */ goto invalid; payload = (const struct fscrypt_key *)ukp->data; if (ukp->datalen != sizeof(struct fscrypt_key) || payload->size < 1 || payload->size > FSCRYPT_MAX_KEY_SIZE) { fscrypt_warn(NULL, "key with description '%s' has invalid payload", key->description); goto invalid; } if (payload->size < min_keysize) { fscrypt_warn(NULL, "key with description '%s' is too short (got %u bytes, need %u+ bytes)", key->description, payload->size, min_keysize); goto invalid; } *payload_ret = payload; return key; invalid: up_read(&key->sem); key_put(key); return ERR_PTR(-ENOKEY); } /* Master key referenced by DIRECT_KEY policy */ struct fscrypt_direct_key { struct super_block *dk_sb; struct hlist_node dk_node; refcount_t dk_refcount; const struct fscrypt_mode *dk_mode; struct fscrypt_prepared_key dk_key; u8 dk_descriptor[FSCRYPT_KEY_DESCRIPTOR_SIZE]; u8 dk_raw[FSCRYPT_MAX_KEY_SIZE]; }; static void free_direct_key(struct fscrypt_direct_key *dk) { if (dk) { fscrypt_destroy_prepared_key(dk->dk_sb, &dk->dk_key); kfree_sensitive(dk); } } void fscrypt_put_direct_key(struct fscrypt_direct_key *dk) { if (!refcount_dec_and_lock(&dk->dk_refcount, &fscrypt_direct_keys_lock)) return; hash_del(&dk->dk_node); spin_unlock(&fscrypt_direct_keys_lock); free_direct_key(dk); } /* * Find/insert the given key into the fscrypt_direct_keys table. If found, it * is returned with elevated refcount, and 'to_insert' is freed if non-NULL. If * not found, 'to_insert' is inserted and returned if it's non-NULL; otherwise * NULL is returned. */ static struct fscrypt_direct_key * find_or_insert_direct_key(struct fscrypt_direct_key *to_insert, const u8 *raw_key, const struct fscrypt_inode_info *ci) { unsigned long hash_key; struct fscrypt_direct_key *dk; /* * Careful: to avoid potentially leaking secret key bytes via timing * information, we must key the hash table by descriptor rather than by * raw key, and use crypto_memneq() when comparing raw keys. */ BUILD_BUG_ON(sizeof(hash_key) > FSCRYPT_KEY_DESCRIPTOR_SIZE); memcpy(&hash_key, ci->ci_policy.v1.master_key_descriptor, sizeof(hash_key)); spin_lock(&fscrypt_direct_keys_lock); hash_for_each_possible(fscrypt_direct_keys, dk, dk_node, hash_key) { if (memcmp(ci->ci_policy.v1.master_key_descriptor, dk->dk_descriptor, FSCRYPT_KEY_DESCRIPTOR_SIZE) != 0) continue; if (ci->ci_mode != dk->dk_mode) continue; if (!fscrypt_is_key_prepared(&dk->dk_key, ci)) continue; if (crypto_memneq(raw_key, dk->dk_raw, ci->ci_mode->keysize)) continue; /* using existing tfm with same (descriptor, mode, raw_key) */ refcount_inc(&dk->dk_refcount); spin_unlock(&fscrypt_direct_keys_lock); free_direct_key(to_insert); return dk; } if (to_insert) hash_add(fscrypt_direct_keys, &to_insert->dk_node, hash_key); spin_unlock(&fscrypt_direct_keys_lock); return to_insert; } /* Prepare to encrypt directly using the master key in the given mode */ static struct fscrypt_direct_key * fscrypt_get_direct_key(const struct fscrypt_inode_info *ci, const u8 *raw_key) { struct fscrypt_direct_key *dk; int err; /* Is there already a tfm for this key? */ dk = find_or_insert_direct_key(NULL, raw_key, ci); if (dk) return dk; /* Nope, allocate one. */ dk = kzalloc(sizeof(*dk), GFP_KERNEL); if (!dk) return ERR_PTR(-ENOMEM); dk->dk_sb = ci->ci_inode->i_sb; refcount_set(&dk->dk_refcount, 1); dk->dk_mode = ci->ci_mode; err = fscrypt_prepare_key(&dk->dk_key, raw_key, ci); if (err) goto err_free_dk; memcpy(dk->dk_descriptor, ci->ci_policy.v1.master_key_descriptor, FSCRYPT_KEY_DESCRIPTOR_SIZE); memcpy(dk->dk_raw, raw_key, ci->ci_mode->keysize); return find_or_insert_direct_key(dk, raw_key, ci); err_free_dk: free_direct_key(dk); return ERR_PTR(err); } /* v1 policy, DIRECT_KEY: use the master key directly */ static int setup_v1_file_key_direct(struct fscrypt_inode_info *ci, const u8 *raw_master_key) { struct fscrypt_direct_key *dk; dk = fscrypt_get_direct_key(ci, raw_master_key); if (IS_ERR(dk)) return PTR_ERR(dk); ci->ci_direct_key = dk; ci->ci_enc_key = dk->dk_key; return 0; } /* v1 policy, !DIRECT_KEY: derive the file's encryption key */ static int setup_v1_file_key_derived(struct fscrypt_inode_info *ci, const u8 *raw_master_key) { u8 *derived_key; int err; /* * This cannot be a stack buffer because it will be passed to the * scatterlist crypto API during derive_key_aes(). */ derived_key = kmalloc(ci->ci_mode->keysize, GFP_KERNEL); if (!derived_key) return -ENOMEM; err = derive_key_aes(raw_master_key, ci->ci_nonce, derived_key, ci->ci_mode->keysize); if (err) goto out; err = fscrypt_set_per_file_enc_key(ci, derived_key); out: kfree_sensitive(derived_key); return err; } int fscrypt_setup_v1_file_key(struct fscrypt_inode_info *ci, const u8 *raw_master_key) { if (ci->ci_policy.v1.flags & FSCRYPT_POLICY_FLAG_DIRECT_KEY) return setup_v1_file_key_direct(ci, raw_master_key); else return setup_v1_file_key_derived(ci, raw_master_key); } int fscrypt_setup_v1_file_key_via_subscribed_keyrings(struct fscrypt_inode_info *ci) { const struct super_block *sb = ci->ci_inode->i_sb; struct key *key; const struct fscrypt_key *payload; int err; key = find_and_lock_process_key(FSCRYPT_KEY_DESC_PREFIX, ci->ci_policy.v1.master_key_descriptor, ci->ci_mode->keysize, &payload); if (key == ERR_PTR(-ENOKEY) && sb->s_cop->legacy_key_prefix) { key = find_and_lock_process_key(sb->s_cop->legacy_key_prefix, ci->ci_policy.v1.master_key_descriptor, ci->ci_mode->keysize, &payload); } if (IS_ERR(key)) return PTR_ERR(key); err = fscrypt_setup_v1_file_key(ci, payload->raw); up_read(&key->sem); key_put(key); return err; } |
1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_FSGSBASE_H #define _ASM_FSGSBASE_H #ifndef __ASSEMBLY__ #ifdef CONFIG_X86_64 #include <asm/msr.h> /* * Read/write a task's FSBASE or GSBASE. This returns the value that * the FS/GS base would have (if the task were to be resumed). These * work on the current task or on a non-running (typically stopped * ptrace child) task. */ extern unsigned long x86_fsbase_read_task(struct task_struct *task); extern unsigned long x86_gsbase_read_task(struct task_struct *task); extern void x86_fsbase_write_task(struct task_struct *task, unsigned long fsbase); extern void x86_gsbase_write_task(struct task_struct *task, unsigned long gsbase); /* Must be protected by X86_FEATURE_FSGSBASE check. */ static __always_inline unsigned long rdfsbase(void) { unsigned long fsbase; asm volatile("rdfsbase %0" : "=r" (fsbase) :: "memory"); return fsbase; } static __always_inline unsigned long rdgsbase(void) { unsigned long gsbase; asm volatile("rdgsbase %0" : "=r" (gsbase) :: "memory"); return gsbase; } static __always_inline void wrfsbase(unsigned long fsbase) { asm volatile("wrfsbase %0" :: "r" (fsbase) : "memory"); } static __always_inline void wrgsbase(unsigned long gsbase) { asm volatile("wrgsbase %0" :: "r" (gsbase) : "memory"); } #include <asm/cpufeature.h> /* Helper functions for reading/writing FS/GS base */ static inline unsigned long x86_fsbase_read_cpu(void) { unsigned long fsbase; if (boot_cpu_has(X86_FEATURE_FSGSBASE)) fsbase = rdfsbase(); else rdmsrl(MSR_FS_BASE, fsbase); return fsbase; } static inline void x86_fsbase_write_cpu(unsigned long fsbase) { if (boot_cpu_has(X86_FEATURE_FSGSBASE)) wrfsbase(fsbase); else wrmsrl(MSR_FS_BASE, fsbase); } extern unsigned long x86_gsbase_read_cpu_inactive(void); extern void x86_gsbase_write_cpu_inactive(unsigned long gsbase); extern unsigned long x86_fsgsbase_read_task(struct task_struct *task, unsigned short selector); #endif /* CONFIG_X86_64 */ #endif /* __ASSEMBLY__ */ #endif /* _ASM_FSGSBASE_H */ |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 | /* SPDX-License-Identifier: GPL-2.0-only */ /* * VMware vSockets Driver * * Copyright (C) 2007-2013 VMware, Inc. All rights reserved. */ #ifndef __AF_VSOCK_H__ #define __AF_VSOCK_H__ #include <linux/kernel.h> #include <linux/workqueue.h> #include <net/sock.h> #include <uapi/linux/vm_sockets.h> #include "vsock_addr.h" #define LAST_RESERVED_PORT 1023 #define VSOCK_HASH_SIZE 251 extern struct list_head vsock_bind_table[VSOCK_HASH_SIZE + 1]; extern struct list_head vsock_connected_table[VSOCK_HASH_SIZE]; extern spinlock_t vsock_table_lock; #define vsock_sk(__sk) ((struct vsock_sock *)__sk) #define sk_vsock(__vsk) (&(__vsk)->sk) struct vsock_sock { /* sk must be the first member. */ struct sock sk; const struct vsock_transport *transport; struct sockaddr_vm local_addr; struct sockaddr_vm remote_addr; /* Links for the global tables of bound and connected sockets. */ struct list_head bound_table; struct list_head connected_table; /* Accessed without the socket lock held. This means it can never be * modified outsided of socket create or destruct. */ bool trusted; bool cached_peer_allow_dgram; /* Dgram communication allowed to * cached peer? */ u32 cached_peer; /* Context ID of last dgram destination check. */ const struct cred *owner; /* Rest are SOCK_STREAM only. */ long connect_timeout; /* Listening socket that this came from. */ struct sock *listener; /* Used for pending list and accept queue during connection handshake. * The listening socket is the head for both lists. Sockets created * for connection requests are placed in the pending list until they * are connected, at which point they are put in the accept queue list * so they can be accepted in accept(). If accept() cannot accept the * connection, it is marked as rejected so the cleanup function knows * to clean up the socket. */ struct list_head pending_links; struct list_head accept_queue; bool rejected; struct delayed_work connect_work; struct delayed_work pending_work; struct delayed_work close_work; bool close_work_scheduled; u32 peer_shutdown; bool sent_request; bool ignore_connecting_rst; /* Protected by lock_sock(sk) */ u64 buffer_size; u64 buffer_min_size; u64 buffer_max_size; /* Private to transport. */ void *trans; }; s64 vsock_connectible_has_data(struct vsock_sock *vsk); s64 vsock_stream_has_data(struct vsock_sock *vsk); s64 vsock_stream_has_space(struct vsock_sock *vsk); struct sock *vsock_create_connected(struct sock *parent); void vsock_data_ready(struct sock *sk); /**** TRANSPORT ****/ struct vsock_transport_recv_notify_data { u64 data1; /* Transport-defined. */ u64 data2; /* Transport-defined. */ bool notify_on_block; }; struct vsock_transport_send_notify_data { u64 data1; /* Transport-defined. */ u64 data2; /* Transport-defined. */ }; /* Transport features flags */ /* Transport provides host->guest communication */ #define VSOCK_TRANSPORT_F_H2G 0x00000001 /* Transport provides guest->host communication */ #define VSOCK_TRANSPORT_F_G2H 0x00000002 /* Transport provides DGRAM communication */ #define VSOCK_TRANSPORT_F_DGRAM 0x00000004 /* Transport provides local (loopback) communication */ #define VSOCK_TRANSPORT_F_LOCAL 0x00000008 struct vsock_transport { struct module *module; /* Initialize/tear-down socket. */ int (*init)(struct vsock_sock *, struct vsock_sock *); void (*destruct)(struct vsock_sock *); void (*release)(struct vsock_sock *); /* Cancel all pending packets sent on vsock. */ int (*cancel_pkt)(struct vsock_sock *vsk); /* Connections. */ int (*connect)(struct vsock_sock *); /* DGRAM. */ int (*dgram_bind)(struct vsock_sock *, struct sockaddr_vm *); int (*dgram_dequeue)(struct vsock_sock *vsk, struct msghdr *msg, size_t len, int flags); int (*dgram_enqueue)(struct vsock_sock *, struct sockaddr_vm *, struct msghdr *, size_t len); bool (*dgram_allow)(u32 cid, u32 port); /* STREAM. */ /* TODO: stream_bind() */ ssize_t (*stream_dequeue)(struct vsock_sock *, struct msghdr *, size_t len, int flags); ssize_t (*stream_enqueue)(struct vsock_sock *, struct msghdr *, size_t len); s64 (*stream_has_data)(struct vsock_sock *); s64 (*stream_has_space)(struct vsock_sock *); u64 (*stream_rcvhiwat)(struct vsock_sock *); bool (*stream_is_active)(struct vsock_sock *); bool (*stream_allow)(u32 cid, u32 port); /* SEQ_PACKET. */ ssize_t (*seqpacket_dequeue)(struct vsock_sock *vsk, struct msghdr *msg, int flags); int (*seqpacket_enqueue)(struct vsock_sock *vsk, struct msghdr *msg, size_t len); bool (*seqpacket_allow)(u32 remote_cid); u32 (*seqpacket_has_data)(struct vsock_sock *vsk); /* Notification. */ int (*notify_poll_in)(struct vsock_sock *, size_t, bool *); int (*notify_poll_out)(struct vsock_sock *, size_t, bool *); int (*notify_recv_init)(struct vsock_sock *, size_t, struct vsock_transport_recv_notify_data *); int (*notify_recv_pre_block)(struct vsock_sock *, size_t, struct vsock_transport_recv_notify_data *); int (*notify_recv_pre_dequeue)(struct vsock_sock *, size_t, struct vsock_transport_recv_notify_data *); int (*notify_recv_post_dequeue)(struct vsock_sock *, size_t, ssize_t, bool, struct vsock_transport_recv_notify_data *); int (*notify_send_init)(struct vsock_sock *, struct vsock_transport_send_notify_data *); int (*notify_send_pre_block)(struct vsock_sock *, struct vsock_transport_send_notify_data *); int (*notify_send_pre_enqueue)(struct vsock_sock *, struct vsock_transport_send_notify_data *); int (*notify_send_post_enqueue)(struct vsock_sock *, ssize_t, struct vsock_transport_send_notify_data *); /* sk_lock held by the caller */ void (*notify_buffer_size)(struct vsock_sock *, u64 *); int (*notify_set_rcvlowat)(struct vsock_sock *vsk, int val); /* Shutdown. */ int (*shutdown)(struct vsock_sock *, int); /* Addressing. */ u32 (*get_local_cid)(void); /* Read a single skb */ int (*read_skb)(struct vsock_sock *, skb_read_actor_t); /* Zero-copy. */ bool (*msgzerocopy_allow)(void); }; /**** CORE ****/ int vsock_core_register(const struct vsock_transport *t, int features); void vsock_core_unregister(const struct vsock_transport *t); /* The transport may downcast this to access transport-specific functions */ const struct vsock_transport *vsock_core_get_transport(struct vsock_sock *vsk); /**** UTILS ****/ /* vsock_table_lock must be held */ static inline bool __vsock_in_bound_table(struct vsock_sock *vsk) { return !list_empty(&vsk->bound_table); } /* vsock_table_lock must be held */ static inline bool __vsock_in_connected_table(struct vsock_sock *vsk) { return !list_empty(&vsk->connected_table); } void vsock_add_pending(struct sock *listener, struct sock *pending); void vsock_remove_pending(struct sock *listener, struct sock *pending); void vsock_enqueue_accept(struct sock *listener, struct sock *connected); void vsock_insert_connected(struct vsock_sock *vsk); void vsock_remove_bound(struct vsock_sock *vsk); void vsock_remove_connected(struct vsock_sock *vsk); struct sock *vsock_find_bound_socket(struct sockaddr_vm *addr); struct sock *vsock_find_connected_socket(struct sockaddr_vm *src, struct sockaddr_vm *dst); void vsock_remove_sock(struct vsock_sock *vsk); void vsock_for_each_connected_socket(struct vsock_transport *transport, void (*fn)(struct sock *sk)); int vsock_assign_transport(struct vsock_sock *vsk, struct vsock_sock *psk); bool vsock_find_cid(unsigned int cid); /**** TAP ****/ struct vsock_tap { struct net_device *dev; struct module *module; struct list_head list; }; int vsock_add_tap(struct vsock_tap *vt); int vsock_remove_tap(struct vsock_tap *vt); void vsock_deliver_tap(struct sk_buff *build_skb(void *opaque), void *opaque); int __vsock_connectible_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, int flags); int vsock_connectible_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, int flags); int __vsock_dgram_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, int flags); int vsock_dgram_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, int flags); #ifdef CONFIG_BPF_SYSCALL extern struct proto vsock_proto; int vsock_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore); void __init vsock_bpf_build_proto(void); #else static inline void __init vsock_bpf_build_proto(void) {} #endif static inline bool vsock_msgzerocopy_allow(const struct vsock_transport *t) { return t->msgzerocopy_allow && t->msgzerocopy_allow(); } #endif /* __AF_VSOCK_H__ */ |
1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 | // SPDX-License-Identifier: GPL-2.0-only /* * CAN driver for esd electronics gmbh CAN-USB/2, CAN-USB/3 and CAN-USB/Micro * * Copyright (C) 2010-2012 esd electronic system design gmbh, Matthias Fuchs <socketcan@esd.eu> * Copyright (C) 2022-2023 esd electronics gmbh, Frank Jungclaus <frank.jungclaus@esd.eu> */ #include <linux/can.h> #include <linux/can/dev.h> #include <linux/can/error.h> #include <linux/ethtool.h> #include <linux/module.h> #include <linux/netdevice.h> #include <linux/signal.h> #include <linux/slab.h> #include <linux/units.h> #include <linux/usb.h> MODULE_AUTHOR("Matthias Fuchs <socketcan@esd.eu>"); MODULE_AUTHOR("Frank Jungclaus <frank.jungclaus@esd.eu>"); MODULE_DESCRIPTION("CAN driver for esd electronics gmbh CAN-USB/2, CAN-USB/3 and CAN-USB/Micro interfaces"); MODULE_LICENSE("GPL v2"); /* USB vendor and product ID */ #define ESD_USB_ESDGMBH_VENDOR_ID 0x0ab4 #define ESD_USB_CANUSB2_PRODUCT_ID 0x0010 #define ESD_USB_CANUSBM_PRODUCT_ID 0x0011 #define ESD_USB_CANUSB3_PRODUCT_ID 0x0014 /* CAN controller clock frequencies */ #define ESD_USB_2_CAN_CLOCK (60 * MEGA) /* Hz */ #define ESD_USB_M_CAN_CLOCK (36 * MEGA) /* Hz */ #define ESD_USB_3_CAN_CLOCK (80 * MEGA) /* Hz */ /* Maximum number of CAN nets */ #define ESD_USB_MAX_NETS 2 /* USB commands */ #define ESD_USB_CMD_VERSION 1 /* also used for VERSION_REPLY */ #define ESD_USB_CMD_CAN_RX 2 /* device to host only */ #define ESD_USB_CMD_CAN_TX 3 /* also used for TX_DONE */ #define ESD_USB_CMD_SETBAUD 4 /* also used for SETBAUD_REPLY */ #define ESD_USB_CMD_TS 5 /* also used for TS_REPLY */ #define ESD_USB_CMD_IDADD 6 /* also used for IDADD_REPLY */ /* esd CAN message flags - dlc field */ #define ESD_USB_RTR BIT(4) #define ESD_USB_NO_BRS BIT(4) #define ESD_USB_ESI BIT(5) #define ESD_USB_FD BIT(7) /* esd CAN message flags - id field */ #define ESD_USB_EXTID BIT(29) #define ESD_USB_EVENT BIT(30) #define ESD_USB_IDMASK GENMASK(28, 0) /* esd CAN event ids */ #define ESD_USB_EV_CAN_ERROR_EXT 2 /* CAN controller specific diagnostic data */ /* baudrate message flags */ #define ESD_USB_LOM BIT(30) /* Listen Only Mode */ #define ESD_USB_UBR BIT(31) /* User Bit Rate (controller BTR) in bits 0..27 */ #define ESD_USB_NO_BAUDRATE GENMASK(30, 0) /* bit rate unconfigured */ /* bit timing esd CAN-USB */ #define ESD_USB_2_TSEG1_SHIFT 16 #define ESD_USB_2_TSEG2_SHIFT 20 #define ESD_USB_2_SJW_SHIFT 14 #define ESD_USB_M_SJW_SHIFT 24 #define ESD_USB_TRIPLE_SAMPLES BIT(23) /* Transmitter Delay Compensation */ #define ESD_USB_3_TDC_MODE_AUTO 0 /* esd IDADD message */ #define ESD_USB_ID_ENABLE BIT(7) #define ESD_USB_MAX_ID_SEGMENT 64 /* SJA1000 ECC register (emulated by usb firmware) */ #define ESD_USB_SJA1000_ECC_SEG GENMASK(4, 0) #define ESD_USB_SJA1000_ECC_DIR BIT(5) #define ESD_USB_SJA1000_ECC_ERR BIT(2, 1) #define ESD_USB_SJA1000_ECC_BIT 0x00 #define ESD_USB_SJA1000_ECC_FORM BIT(6) #define ESD_USB_SJA1000_ECC_STUFF BIT(7) #define ESD_USB_SJA1000_ECC_MASK GENMASK(7, 6) /* esd bus state event codes */ #define ESD_USB_BUSSTATE_MASK GENMASK(7, 6) #define ESD_USB_BUSSTATE_WARN BIT(6) #define ESD_USB_BUSSTATE_ERRPASSIVE BIT(7) #define ESD_USB_BUSSTATE_BUSOFF GENMASK(7, 6) #define ESD_USB_RX_BUFFER_SIZE 1024 #define ESD_USB_MAX_RX_URBS 4 #define ESD_USB_MAX_TX_URBS 16 /* must be power of 2 */ /* Modes for CAN-USB/3, to be used for esd_usb_3_set_baudrate_msg_x.mode */ #define ESD_USB_3_BAUDRATE_MODE_DISABLE 0 /* remove from bus */ #define ESD_USB_3_BAUDRATE_MODE_INDEX 1 /* ESD (CiA) bit rate idx */ #define ESD_USB_3_BAUDRATE_MODE_BTR_CTRL 2 /* BTR values (controller)*/ #define ESD_USB_3_BAUDRATE_MODE_BTR_CANONICAL 3 /* BTR values (canonical) */ #define ESD_USB_3_BAUDRATE_MODE_NUM 4 /* numerical bit rate */ #define ESD_USB_3_BAUDRATE_MODE_AUTOBAUD 5 /* autobaud */ /* Flags for CAN-USB/3, to be used for esd_usb_3_set_baudrate_msg_x.flags */ #define ESD_USB_3_BAUDRATE_FLAG_FD BIT(0) /* enable CAN FD mode */ #define ESD_USB_3_BAUDRATE_FLAG_LOM BIT(1) /* enable listen only mode */ #define ESD_USB_3_BAUDRATE_FLAG_STM BIT(2) /* enable self test mode */ #define ESD_USB_3_BAUDRATE_FLAG_TRS BIT(3) /* enable triple sampling */ #define ESD_USB_3_BAUDRATE_FLAG_TXP BIT(4) /* enable transmit pause */ struct esd_usb_header_msg { u8 len; /* total message length in 32bit words */ u8 cmd; u8 rsvd[2]; }; struct esd_usb_version_msg { u8 len; /* total message length in 32bit words */ u8 cmd; u8 rsvd; u8 flags; __le32 drv_version; }; struct esd_usb_version_reply_msg { u8 len; /* total message length in 32bit words */ u8 cmd; u8 nets; u8 features; __le32 version; u8 name[16]; __le32 rsvd; __le32 ts; }; struct esd_usb_rx_msg { u8 len; /* total message length in 32bit words */ u8 cmd; u8 net; u8 dlc; __le32 ts; __le32 id; /* upper 3 bits contain flags */ union { u8 data[CAN_MAX_DLEN]; u8 data_fd[CANFD_MAX_DLEN]; struct { u8 status; /* CAN Controller Status */ u8 ecc; /* Error Capture Register */ u8 rec; /* RX Error Counter */ u8 tec; /* TX Error Counter */ } ev_can_err_ext; /* For ESD_EV_CAN_ERROR_EXT */ }; }; struct esd_usb_tx_msg { u8 len; /* total message length in 32bit words */ u8 cmd; u8 net; u8 dlc; u32 hnd; /* opaque handle, not used by device */ __le32 id; /* upper 3 bits contain flags */ union { u8 data[CAN_MAX_DLEN]; u8 data_fd[CANFD_MAX_DLEN]; }; }; struct esd_usb_tx_done_msg { u8 len; /* total message length in 32bit words */ u8 cmd; u8 net; u8 status; u32 hnd; /* opaque handle, not used by device */ __le32 ts; }; struct esd_usb_id_filter_msg { u8 len; /* total message length in 32bit words */ u8 cmd; u8 net; u8 option; __le32 mask[ESD_USB_MAX_ID_SEGMENT + 1]; /* +1 for 29bit extended IDs */ }; struct esd_usb_set_baudrate_msg { u8 len; /* total message length in 32bit words */ u8 cmd; u8 net; u8 rsvd; __le32 baud; }; /* CAN-USB/3 baudrate configuration, used for nominal as well as for data bit rate */ struct esd_usb_3_baudrate_cfg { __le16 brp; /* bit rate pre-scaler */ __le16 tseg1; /* time segment before sample point */ __le16 tseg2; /* time segment after sample point */ __le16 sjw; /* synchronization jump Width */ }; /* In principle, the esd CAN-USB/3 supports Transmitter Delay Compensation (TDC), * but currently only the automatic TDC mode is supported by this driver. * An implementation for manual TDC configuration will follow. * * For information about struct esd_usb_3_tdc_cfg, see * NTCAN Application Developers Manual, 6.2.25 NTCAN_TDC_CFG + related chapters * https://esd.eu/fileadmin/esd/docs/manuals/NTCAN_Part1_Function_API_Manual_en_56.pdf */ struct esd_usb_3_tdc_cfg { u8 tdc_mode; /* transmitter delay compensation mode */ u8 ssp_offset; /* secondary sample point offset in mtq */ s8 ssp_shift; /* secondary sample point shift in mtq */ u8 tdc_filter; /* TDC filter in mtq */ }; /* Extended version of the above set_baudrate_msg for a CAN-USB/3 * to define the CAN bit timing configuration of the CAN controller in * CAN FD mode as well as in Classical CAN mode. * * The payload of this command is a NTCAN_BAUDRATE_X structure according to * esd electronics gmbh, NTCAN Application Developers Manual, 6.2.15 NTCAN_BAUDRATE_X * https://esd.eu/fileadmin/esd/docs/manuals/NTCAN_Part1_Function_API_Manual_en_56.pdf */ struct esd_usb_3_set_baudrate_msg_x { u8 len; /* total message length in 32bit words */ u8 cmd; u8 net; u8 rsvd; /*reserved */ /* Payload ... */ __le16 mode; /* mode word, see ESD_USB_3_BAUDRATE_MODE_xxx */ __le16 flags; /* control flags, see ESD_USB_3_BAUDRATE_FLAG_xxx */ struct esd_usb_3_tdc_cfg tdc; /* TDC configuration */ struct esd_usb_3_baudrate_cfg nom; /* nominal bit rate */ struct esd_usb_3_baudrate_cfg data; /* data bit rate */ }; /* Main message type used between library and application */ union __packed esd_usb_msg { struct esd_usb_header_msg hdr; struct esd_usb_version_msg version; struct esd_usb_version_reply_msg version_reply; struct esd_usb_rx_msg rx; struct esd_usb_tx_msg tx; struct esd_usb_tx_done_msg txdone; struct esd_usb_set_baudrate_msg setbaud; struct esd_usb_3_set_baudrate_msg_x setbaud_x; struct esd_usb_id_filter_msg filter; }; static struct usb_device_id esd_usb_table[] = { {USB_DEVICE(ESD_USB_ESDGMBH_VENDOR_ID, ESD_USB_CANUSB2_PRODUCT_ID)}, {USB_DEVICE(ESD_USB_ESDGMBH_VENDOR_ID, ESD_USB_CANUSBM_PRODUCT_ID)}, {USB_DEVICE(ESD_USB_ESDGMBH_VENDOR_ID, ESD_USB_CANUSB3_PRODUCT_ID)}, {} }; MODULE_DEVICE_TABLE(usb, esd_usb_table); struct esd_usb_net_priv; struct esd_tx_urb_context { struct esd_usb_net_priv *priv; u32 echo_index; }; struct esd_usb { struct usb_device *udev; struct esd_usb_net_priv *nets[ESD_USB_MAX_NETS]; struct usb_anchor rx_submitted; int net_count; u32 version; int rxinitdone; void *rxbuf[ESD_USB_MAX_RX_URBS]; dma_addr_t rxbuf_dma[ESD_USB_MAX_RX_URBS]; }; struct esd_usb_net_priv { struct can_priv can; /* must be the first member */ atomic_t active_tx_jobs; struct usb_anchor tx_submitted; struct esd_tx_urb_context tx_contexts[ESD_USB_MAX_TX_URBS]; struct esd_usb *usb; struct net_device *netdev; int index; u8 old_state; struct can_berr_counter bec; }; static void esd_usb_rx_event(struct esd_usb_net_priv *priv, union esd_usb_msg *msg) { struct net_device_stats *stats = &priv->netdev->stats; struct can_frame *cf; struct sk_buff *skb; u32 id = le32_to_cpu(msg->rx.id) & ESD_USB_IDMASK; if (id == ESD_USB_EV_CAN_ERROR_EXT) { u8 state = msg->rx.ev_can_err_ext.status; u8 ecc = msg->rx.ev_can_err_ext.ecc; priv->bec.rxerr = msg->rx.ev_can_err_ext.rec; priv->bec.txerr = msg->rx.ev_can_err_ext.tec; netdev_dbg(priv->netdev, "CAN_ERR_EV_EXT: dlc=%#02x state=%02x ecc=%02x rec=%02x tec=%02x\n", msg->rx.dlc, state, ecc, priv->bec.rxerr, priv->bec.txerr); /* if berr-reporting is off, only pass through on state change ... */ if (!(priv->can.ctrlmode & CAN_CTRLMODE_BERR_REPORTING) && state == priv->old_state) return; skb = alloc_can_err_skb(priv->netdev, &cf); if (!skb) stats->rx_dropped++; if (state != priv->old_state) { enum can_state tx_state, rx_state; enum can_state new_state = CAN_STATE_ERROR_ACTIVE; priv->old_state = state; switch (state & ESD_USB_BUSSTATE_MASK) { case ESD_USB_BUSSTATE_BUSOFF: new_state = CAN_STATE_BUS_OFF; can_bus_off(priv->netdev); break; case ESD_USB_BUSSTATE_WARN: new_state = CAN_STATE_ERROR_WARNING; break; case ESD_USB_BUSSTATE_ERRPASSIVE: new_state = CAN_STATE_ERROR_PASSIVE; break; default: new_state = CAN_STATE_ERROR_ACTIVE; priv->bec.txerr = 0; priv->bec.rxerr = 0; break; } if (new_state != priv->can.state) { tx_state = (priv->bec.txerr >= priv->bec.rxerr) ? new_state : 0; rx_state = (priv->bec.txerr <= priv->bec.rxerr) ? new_state : 0; can_change_state(priv->netdev, cf, tx_state, rx_state); } } else if (skb) { priv->can.can_stats.bus_error++; stats->rx_errors++; cf->can_id |= CAN_ERR_PROT | CAN_ERR_BUSERROR; switch (ecc & ESD_USB_SJA1000_ECC_MASK) { case ESD_USB_SJA1000_ECC_BIT: cf->data[2] |= CAN_ERR_PROT_BIT; break; case ESD_USB_SJA1000_ECC_FORM: cf->data[2] |= CAN_ERR_PROT_FORM; break; case ESD_USB_SJA1000_ECC_STUFF: cf->data[2] |= CAN_ERR_PROT_STUFF; break; default: break; } /* Error occurred during transmission? */ if (!(ecc & ESD_USB_SJA1000_ECC_DIR)) cf->data[2] |= CAN_ERR_PROT_TX; /* Bit stream position in CAN frame as the error was detected */ cf->data[3] = ecc & ESD_USB_SJA1000_ECC_SEG; } if (skb) { cf->can_id |= CAN_ERR_CNT; cf->data[6] = priv->bec.txerr; cf->data[7] = priv->bec.rxerr; netif_rx(skb); } } } static void esd_usb_rx_can_msg(struct esd_usb_net_priv *priv, union esd_usb_msg *msg) { struct net_device_stats *stats = &priv->netdev->stats; struct can_frame *cf; struct canfd_frame *cfd; struct sk_buff *skb; u32 id; u8 len; if (!netif_device_present(priv->netdev)) return; id = le32_to_cpu(msg->rx.id); if (id & ESD_USB_EVENT) { esd_usb_rx_event(priv, msg); } else { if (msg->rx.dlc & ESD_USB_FD) { skb = alloc_canfd_skb(priv->netdev, &cfd); } else { skb = alloc_can_skb(priv->netdev, &cf); cfd = (struct canfd_frame *)cf; } if (skb == NULL) { stats->rx_dropped++; return; } cfd->can_id = id & ESD_USB_IDMASK; if (msg->rx.dlc & ESD_USB_FD) { /* masking by 0x0F is already done within can_fd_dlc2len() */ cfd->len = can_fd_dlc2len(msg->rx.dlc); len = cfd->len; if ((msg->rx.dlc & ESD_USB_NO_BRS) == 0) cfd->flags |= CANFD_BRS; if (msg->rx.dlc & ESD_USB_ESI) cfd->flags |= CANFD_ESI; } else { can_frame_set_cc_len(cf, msg->rx.dlc & ~ESD_USB_RTR, priv->can.ctrlmode); len = cf->len; if (msg->rx.dlc & ESD_USB_RTR) { cf->can_id |= CAN_RTR_FLAG; len = 0; } } if (id & ESD_USB_EXTID) cfd->can_id |= CAN_EFF_FLAG; memcpy(cfd->data, msg->rx.data_fd, len); stats->rx_bytes += len; stats->rx_packets++; netif_rx(skb); } } static void esd_usb_tx_done_msg(struct esd_usb_net_priv *priv, union esd_usb_msg *msg) { struct net_device_stats *stats = &priv->netdev->stats; struct net_device *netdev = priv->netdev; struct esd_tx_urb_context *context; if (!netif_device_present(netdev)) return; context = &priv->tx_contexts[msg->txdone.hnd & (ESD_USB_MAX_TX_URBS - 1)]; if (!msg->txdone.status) { stats->tx_packets++; stats->tx_bytes += can_get_echo_skb(netdev, context->echo_index, NULL); } else { stats->tx_errors++; can_free_echo_skb(netdev, context->echo_index, NULL); } /* Release context */ context->echo_index = ESD_USB_MAX_TX_URBS; atomic_dec(&priv->active_tx_jobs); netif_wake_queue(netdev); } static void esd_usb_read_bulk_callback(struct urb *urb) { struct esd_usb *dev = urb->context; int retval; int pos = 0; int i; switch (urb->status) { case 0: /* success */ break; case -ENOENT: case -EPIPE: case -EPROTO: case -ESHUTDOWN: return; default: dev_info(dev->udev->dev.parent, "Rx URB aborted (%d)\n", urb->status); goto resubmit_urb; } while (pos < urb->actual_length) { union esd_usb_msg *msg; msg = (union esd_usb_msg *)(urb->transfer_buffer + pos); switch (msg->hdr.cmd) { case ESD_USB_CMD_CAN_RX: if (msg->rx.net >= dev->net_count) { dev_err(dev->udev->dev.parent, "format error\n"); break; } esd_usb_rx_can_msg(dev->nets[msg->rx.net], msg); break; case ESD_USB_CMD_CAN_TX: if (msg->txdone.net >= dev->net_count) { dev_err(dev->udev->dev.parent, "format error\n"); break; } esd_usb_tx_done_msg(dev->nets[msg->txdone.net], msg); break; } pos += msg->hdr.len * sizeof(u32); /* convert to # of bytes */ if (pos > urb->actual_length) { dev_err(dev->udev->dev.parent, "format error\n"); break; } } resubmit_urb: usb_fill_bulk_urb(urb, dev->udev, usb_rcvbulkpipe(dev->udev, 1), urb->transfer_buffer, ESD_USB_RX_BUFFER_SIZE, esd_usb_read_bulk_callback, dev); retval = usb_submit_urb(urb, GFP_ATOMIC); if (retval == -ENODEV) { for (i = 0; i < dev->net_count; i++) { if (dev->nets[i]) netif_device_detach(dev->nets[i]->netdev); } } else if (retval) { dev_err(dev->udev->dev.parent, "failed resubmitting read bulk urb: %d\n", retval); } } /* callback for bulk IN urb */ static void esd_usb_write_bulk_callback(struct urb *urb) { struct esd_tx_urb_context *context = urb->context; struct esd_usb_net_priv *priv; struct net_device *netdev; size_t size = sizeof(union esd_usb_msg); WARN_ON(!context); priv = context->priv; netdev = priv->netdev; /* free up our allocated buffer */ usb_free_coherent(urb->dev, size, urb->transfer_buffer, urb->transfer_dma); if (!netif_device_present(netdev)) return; if (urb->status) netdev_info(netdev, "Tx URB aborted (%d)\n", urb->status); netif_trans_update(netdev); } static ssize_t firmware_show(struct device *d, struct device_attribute *attr, char *buf) { struct usb_interface *intf = to_usb_interface(d); struct esd_usb *dev = usb_get_intfdata(intf); return sprintf(buf, "%d.%d.%d\n", (dev->version >> 12) & 0xf, (dev->version >> 8) & 0xf, dev->version & 0xff); } static DEVICE_ATTR_RO(firmware); static ssize_t hardware_show(struct device *d, struct device_attribute *attr, char *buf) { struct usb_interface *intf = to_usb_interface(d); struct esd_usb *dev = usb_get_intfdata(intf); return sprintf(buf, "%d.%d.%d\n", (dev->version >> 28) & 0xf, (dev->version >> 24) & 0xf, (dev->version >> 16) & 0xff); } static DEVICE_ATTR_RO(hardware); static ssize_t nets_show(struct device *d, struct device_attribute *attr, char *buf) { struct usb_interface *intf = to_usb_interface(d); struct esd_usb *dev = usb_get_intfdata(intf); return sprintf(buf, "%d", dev->net_count); } static DEVICE_ATTR_RO(nets); static int esd_usb_send_msg(struct esd_usb *dev, union esd_usb_msg *msg) { int actual_length; return usb_bulk_msg(dev->udev, usb_sndbulkpipe(dev->udev, 2), msg, msg->hdr.len * sizeof(u32), /* convert to # of bytes */ &actual_length, 1000); } static int esd_usb_wait_msg(struct esd_usb *dev, union esd_usb_msg *msg) { int actual_length; return usb_bulk_msg(dev->udev, usb_rcvbulkpipe(dev->udev, 1), msg, sizeof(*msg), &actual_length, 1000); } static int esd_usb_setup_rx_urbs(struct esd_usb *dev) { int i, err = 0; if (dev->rxinitdone) return 0; for (i = 0; i < ESD_USB_MAX_RX_URBS; i++) { struct urb *urb = NULL; u8 *buf = NULL; dma_addr_t buf_dma; /* create a URB, and a buffer for it */ urb = usb_alloc_urb(0, GFP_KERNEL); if (!urb) { err = -ENOMEM; break; } buf = usb_alloc_coherent(dev->udev, ESD_USB_RX_BUFFER_SIZE, GFP_KERNEL, &buf_dma); if (!buf) { dev_warn(dev->udev->dev.parent, "No memory left for USB buffer\n"); err = -ENOMEM; goto freeurb; } urb->transfer_dma = buf_dma; usb_fill_bulk_urb(urb, dev->udev, usb_rcvbulkpipe(dev->udev, 1), buf, ESD_USB_RX_BUFFER_SIZE, esd_usb_read_bulk_callback, dev); urb->transfer_flags |= URB_NO_TRANSFER_DMA_MAP; usb_anchor_urb(urb, &dev->rx_submitted); err = usb_submit_urb(urb, GFP_KERNEL); if (err) { usb_unanchor_urb(urb); usb_free_coherent(dev->udev, ESD_USB_RX_BUFFER_SIZE, buf, urb->transfer_dma); goto freeurb; } dev->rxbuf[i] = buf; dev->rxbuf_dma[i] = buf_dma; freeurb: /* Drop reference, USB core will take care of freeing it */ usb_free_urb(urb); if (err) break; } /* Did we submit any URBs */ if (i == 0) { dev_err(dev->udev->dev.parent, "couldn't setup read URBs\n"); return err; } /* Warn if we've couldn't transmit all the URBs */ if (i < ESD_USB_MAX_RX_URBS) { dev_warn(dev->udev->dev.parent, "rx performance may be slow\n"); } dev->rxinitdone = 1; return 0; } /* Start interface */ static int esd_usb_start(struct esd_usb_net_priv *priv) { struct esd_usb *dev = priv->usb; struct net_device *netdev = priv->netdev; union esd_usb_msg *msg; int err, i; msg = kmalloc(sizeof(*msg), GFP_KERNEL); if (!msg) { err = -ENOMEM; goto out; } /* Enable all IDs * The IDADD message takes up to 64 32 bit bitmasks (2048 bits). * Each bit represents one 11 bit CAN identifier. A set bit * enables reception of the corresponding CAN identifier. A cleared * bit disabled this identifier. An additional bitmask value * following the CAN 2.0A bits is used to enable reception of * extended CAN frames. Only the LSB of this final mask is checked * for the complete 29 bit ID range. The IDADD message also allows * filter configuration for an ID subset. In this case you can add * the number of the starting bitmask (0..64) to the filter.option * field followed by only some bitmasks. */ msg->hdr.cmd = ESD_USB_CMD_IDADD; msg->hdr.len = sizeof(struct esd_usb_id_filter_msg) / sizeof(u32); /* # of 32bit words */ msg->filter.net = priv->index; msg->filter.option = ESD_USB_ID_ENABLE; /* start with segment 0 */ for (i = 0; i < ESD_USB_MAX_ID_SEGMENT; i++) msg->filter.mask[i] = cpu_to_le32(GENMASK(31, 0)); /* enable 29bit extended IDs */ msg->filter.mask[ESD_USB_MAX_ID_SEGMENT] = cpu_to_le32(BIT(0)); err = esd_usb_send_msg(dev, msg); if (err) goto out; err = esd_usb_setup_rx_urbs(dev); if (err) goto out; priv->can.state = CAN_STATE_ERROR_ACTIVE; out: if (err == -ENODEV) netif_device_detach(netdev); if (err) netdev_err(netdev, "couldn't start device: %d\n", err); kfree(msg); return err; } static void unlink_all_urbs(struct esd_usb *dev) { struct esd_usb_net_priv *priv; int i, j; usb_kill_anchored_urbs(&dev->rx_submitted); for (i = 0; i < ESD_USB_MAX_RX_URBS; ++i) usb_free_coherent(dev->udev, ESD_USB_RX_BUFFER_SIZE, dev->rxbuf[i], dev->rxbuf_dma[i]); for (i = 0; i < dev->net_count; i++) { priv = dev->nets[i]; if (priv) { usb_kill_anchored_urbs(&priv->tx_submitted); atomic_set(&priv->active_tx_jobs, 0); for (j = 0; j < ESD_USB_MAX_TX_URBS; j++) priv->tx_contexts[j].echo_index = ESD_USB_MAX_TX_URBS; } } } static int esd_usb_open(struct net_device *netdev) { struct esd_usb_net_priv *priv = netdev_priv(netdev); int err; /* common open */ err = open_candev(netdev); if (err) return err; /* finally start device */ err = esd_usb_start(priv); if (err) { netdev_warn(netdev, "couldn't start device: %d\n", err); close_candev(netdev); return err; } netif_start_queue(netdev); return 0; } static netdev_tx_t esd_usb_start_xmit(struct sk_buff *skb, struct net_device *netdev) { struct esd_usb_net_priv *priv = netdev_priv(netdev); struct esd_usb *dev = priv->usb; struct esd_tx_urb_context *context = NULL; struct net_device_stats *stats = &netdev->stats; struct canfd_frame *cfd = (struct canfd_frame *)skb->data; union esd_usb_msg *msg; struct urb *urb; u8 *buf; int i, err; int ret = NETDEV_TX_OK; size_t size = sizeof(union esd_usb_msg); if (can_dev_dropped_skb(netdev, skb)) return NETDEV_TX_OK; /* create a URB, and a buffer for it, and copy the data to the URB */ urb = usb_alloc_urb(0, GFP_ATOMIC); if (!urb) { stats->tx_dropped++; dev_kfree_skb(skb); goto nourbmem; } buf = usb_alloc_coherent(dev->udev, size, GFP_ATOMIC, &urb->transfer_dma); if (!buf) { netdev_err(netdev, "No memory left for USB buffer\n"); stats->tx_dropped++; dev_kfree_skb(skb); goto nobufmem; } msg = (union esd_usb_msg *)buf; /* minimal length as # of 32bit words */ msg->hdr.len = offsetof(struct esd_usb_tx_msg, data) / sizeof(u32); msg->hdr.cmd = ESD_USB_CMD_CAN_TX; msg->tx.net = priv->index; if (can_is_canfd_skb(skb)) { msg->tx.dlc = can_fd_len2dlc(cfd->len); msg->tx.dlc |= ESD_USB_FD; if ((cfd->flags & CANFD_BRS) == 0) msg->tx.dlc |= ESD_USB_NO_BRS; } else { msg->tx.dlc = can_get_cc_dlc((struct can_frame *)cfd, priv->can.ctrlmode); if (cfd->can_id & CAN_RTR_FLAG) msg->tx.dlc |= ESD_USB_RTR; } msg->tx.id = cpu_to_le32(cfd->can_id & CAN_ERR_MASK); if (cfd->can_id & CAN_EFF_FLAG) msg->tx.id |= cpu_to_le32(ESD_USB_EXTID); memcpy(msg->tx.data_fd, cfd->data, cfd->len); /* round up, then divide by 4 to add the payload length as # of 32bit words */ msg->hdr.len += DIV_ROUND_UP(cfd->len, sizeof(u32)); for (i = 0; i < ESD_USB_MAX_TX_URBS; i++) { if (priv->tx_contexts[i].echo_index == ESD_USB_MAX_TX_URBS) { context = &priv->tx_contexts[i]; break; } } /* This may never happen */ if (!context) { netdev_warn(netdev, "couldn't find free context\n"); ret = NETDEV_TX_BUSY; goto releasebuf; } context->priv = priv; context->echo_index = i; /* hnd must not be 0 - MSB is stripped in txdone handling */ msg->tx.hnd = BIT(31) | i; /* returned in TX done message */ usb_fill_bulk_urb(urb, dev->udev, usb_sndbulkpipe(dev->udev, 2), buf, msg->hdr.len * sizeof(u32), /* convert to # of bytes */ esd_usb_write_bulk_callback, context); urb->transfer_flags |= URB_NO_TRANSFER_DMA_MAP; usb_anchor_urb(urb, &priv->tx_submitted); can_put_echo_skb(skb, netdev, context->echo_index, 0); atomic_inc(&priv->active_tx_jobs); /* Slow down tx path */ if (atomic_read(&priv->active_tx_jobs) >= ESD_USB_MAX_TX_URBS) netif_stop_queue(netdev); err = usb_submit_urb(urb, GFP_ATOMIC); if (err) { can_free_echo_skb(netdev, context->echo_index, NULL); atomic_dec(&priv->active_tx_jobs); usb_unanchor_urb(urb); stats->tx_dropped++; if (err == -ENODEV) netif_device_detach(netdev); else netdev_warn(netdev, "failed tx_urb %d\n", err); goto releasebuf; } netif_trans_update(netdev); /* Release our reference to this URB, the USB core will eventually free * it entirely. */ usb_free_urb(urb); return NETDEV_TX_OK; releasebuf: usb_free_coherent(dev->udev, size, buf, urb->transfer_dma); nobufmem: usb_free_urb(urb); nourbmem: return ret; } static int esd_usb_close(struct net_device *netdev) { struct esd_usb_net_priv *priv = netdev_priv(netdev); union esd_usb_msg *msg; int i; msg = kmalloc(sizeof(*msg), GFP_KERNEL); if (!msg) return -ENOMEM; /* Disable all IDs (see esd_usb_start()) */ msg->hdr.cmd = ESD_USB_CMD_IDADD; msg->hdr.len = sizeof(struct esd_usb_id_filter_msg) / sizeof(u32);/* # of 32bit words */ msg->filter.net = priv->index; msg->filter.option = ESD_USB_ID_ENABLE; /* start with segment 0 */ for (i = 0; i <= ESD_USB_MAX_ID_SEGMENT; i++) msg->filter.mask[i] = 0; if (esd_usb_send_msg(priv->usb, msg) < 0) netdev_err(netdev, "sending idadd message failed\n"); /* set CAN controller to reset mode */ msg->hdr.len = sizeof(struct esd_usb_set_baudrate_msg) / sizeof(u32); /* # of 32bit words */ msg->hdr.cmd = ESD_USB_CMD_SETBAUD; msg->setbaud.net = priv->index; msg->setbaud.rsvd = 0; msg->setbaud.baud = cpu_to_le32(ESD_USB_NO_BAUDRATE); if (esd_usb_send_msg(priv->usb, msg) < 0) netdev_err(netdev, "sending setbaud message failed\n"); priv->can.state = CAN_STATE_STOPPED; netif_stop_queue(netdev); close_candev(netdev); kfree(msg); return 0; } static const struct net_device_ops esd_usb_netdev_ops = { .ndo_open = esd_usb_open, .ndo_stop = esd_usb_close, .ndo_start_xmit = esd_usb_start_xmit, .ndo_change_mtu = can_change_mtu, }; static const struct ethtool_ops esd_usb_ethtool_ops = { .get_ts_info = ethtool_op_get_ts_info, }; static const struct can_bittiming_const esd_usb_2_bittiming_const = { .name = "esd_usb_2", .tseg1_min = 1, .tseg1_max = 16, .tseg2_min = 1, .tseg2_max = 8, .sjw_max = 4, .brp_min = 1, .brp_max = 1024, .brp_inc = 1, }; static int esd_usb_2_set_bittiming(struct net_device *netdev) { const struct can_bittiming_const *btc = &esd_usb_2_bittiming_const; struct esd_usb_net_priv *priv = netdev_priv(netdev); struct can_bittiming *bt = &priv->can.bittiming; union esd_usb_msg *msg; int err; u32 canbtr; int sjw_shift; canbtr = ESD_USB_UBR; if (priv->can.ctrlmode & CAN_CTRLMODE_LISTENONLY) canbtr |= ESD_USB_LOM; canbtr |= (bt->brp - 1) & (btc->brp_max - 1); if (le16_to_cpu(priv->usb->udev->descriptor.idProduct) == ESD_USB_CANUSBM_PRODUCT_ID) sjw_shift = ESD_USB_M_SJW_SHIFT; else sjw_shift = ESD_USB_2_SJW_SHIFT; canbtr |= ((bt->sjw - 1) & (btc->sjw_max - 1)) << sjw_shift; canbtr |= ((bt->prop_seg + bt->phase_seg1 - 1) & (btc->tseg1_max - 1)) << ESD_USB_2_TSEG1_SHIFT; canbtr |= ((bt->phase_seg2 - 1) & (btc->tseg2_max - 1)) << ESD_USB_2_TSEG2_SHIFT; if (priv->can.ctrlmode & CAN_CTRLMODE_3_SAMPLES) canbtr |= ESD_USB_TRIPLE_SAMPLES; msg = kmalloc(sizeof(*msg), GFP_KERNEL); if (!msg) return -ENOMEM; msg->hdr.len = sizeof(struct esd_usb_set_baudrate_msg) / sizeof(u32); /* # of 32bit words */ msg->hdr.cmd = ESD_USB_CMD_SETBAUD; msg->setbaud.net = priv->index; msg->setbaud.rsvd = 0; msg->setbaud.baud = cpu_to_le32(canbtr); netdev_dbg(netdev, "setting BTR=%#x\n", canbtr); err = esd_usb_send_msg(priv->usb, msg); kfree(msg); return err; } /* Nominal bittiming constants, see * Microchip SAM E70/S70/V70/V71, Data Sheet, Rev. G - 07/2022 * 48.6.8 MCAN Nominal Bit Timing and Prescaler Register */ static const struct can_bittiming_const esd_usb_3_nom_bittiming_const = { .name = "esd_usb_3", .tseg1_min = 2, .tseg1_max = 256, .tseg2_min = 2, .tseg2_max = 128, .sjw_max = 128, .brp_min = 1, .brp_max = 512, .brp_inc = 1, }; /* Data bittiming constants, see * Microchip SAM E70/S70/V70/V71, Data Sheet, Rev. G - 07/2022 * 48.6.4 MCAN Data Bit Timing and Prescaler Register */ static const struct can_bittiming_const esd_usb_3_data_bittiming_const = { .name = "esd_usb_3", .tseg1_min = 2, .tseg1_max = 32, .tseg2_min = 1, .tseg2_max = 16, .sjw_max = 8, .brp_min = 1, .brp_max = 32, .brp_inc = 1, }; static int esd_usb_3_set_bittiming(struct net_device *netdev) { const struct can_bittiming_const *nom_btc = &esd_usb_3_nom_bittiming_const; const struct can_bittiming_const *data_btc = &esd_usb_3_data_bittiming_const; struct esd_usb_net_priv *priv = netdev_priv(netdev); struct can_bittiming *nom_bt = &priv->can.bittiming; struct can_bittiming *data_bt = &priv->can.data_bittiming; struct esd_usb_3_set_baudrate_msg_x *baud_x; union esd_usb_msg *msg; u16 flags = 0; int err; msg = kmalloc(sizeof(*msg), GFP_KERNEL); if (!msg) return -ENOMEM; baud_x = &msg->setbaud_x; /* Canonical is the most reasonable mode for SocketCAN on CAN-USB/3 ... */ baud_x->mode = cpu_to_le16(ESD_USB_3_BAUDRATE_MODE_BTR_CANONICAL); if (priv->can.ctrlmode & CAN_CTRLMODE_LISTENONLY) flags |= ESD_USB_3_BAUDRATE_FLAG_LOM; if (priv->can.ctrlmode & CAN_CTRLMODE_3_SAMPLES) flags |= ESD_USB_3_BAUDRATE_FLAG_TRS; baud_x->nom.brp = cpu_to_le16(nom_bt->brp & (nom_btc->brp_max - 1)); baud_x->nom.sjw = cpu_to_le16(nom_bt->sjw & (nom_btc->sjw_max - 1)); baud_x->nom.tseg1 = cpu_to_le16((nom_bt->prop_seg + nom_bt->phase_seg1) & (nom_btc->tseg1_max - 1)); baud_x->nom.tseg2 = cpu_to_le16(nom_bt->phase_seg2 & (nom_btc->tseg2_max - 1)); if (priv->can.ctrlmode & CAN_CTRLMODE_FD) { baud_x->data.brp = cpu_to_le16(data_bt->brp & (data_btc->brp_max - 1)); baud_x->data.sjw = cpu_to_le16(data_bt->sjw & (data_btc->sjw_max - 1)); baud_x->data.tseg1 = cpu_to_le16((data_bt->prop_seg + data_bt->phase_seg1) & (data_btc->tseg1_max - 1)); baud_x->data.tseg2 = cpu_to_le16(data_bt->phase_seg2 & (data_btc->tseg2_max - 1)); flags |= ESD_USB_3_BAUDRATE_FLAG_FD; } /* Currently this driver only supports the automatic TDC mode */ baud_x->tdc.tdc_mode = ESD_USB_3_TDC_MODE_AUTO; baud_x->tdc.ssp_offset = 0; baud_x->tdc.ssp_shift = 0; baud_x->tdc.tdc_filter = 0; baud_x->flags = cpu_to_le16(flags); baud_x->net = priv->index; baud_x->rsvd = 0; /* set len as # of 32bit words */ msg->hdr.len = sizeof(struct esd_usb_3_set_baudrate_msg_x) / sizeof(u32); msg->hdr.cmd = ESD_USB_CMD_SETBAUD; netdev_dbg(netdev, "ctrlmode=%#x/%#x, esd-net=%u, esd-mode=%#x, esd-flags=%#x\n", priv->can.ctrlmode, priv->can.ctrlmode_supported, priv->index, le16_to_cpu(baud_x->mode), flags); err = esd_usb_send_msg(priv->usb, msg); kfree(msg); return err; } static int esd_usb_get_berr_counter(const struct net_device *netdev, struct can_berr_counter *bec) { struct esd_usb_net_priv *priv = netdev_priv(netdev); bec->txerr = priv->bec.txerr; bec->rxerr = priv->bec.rxerr; return 0; } static int esd_usb_set_mode(struct net_device *netdev, enum can_mode mode) { switch (mode) { case CAN_MODE_START: netif_wake_queue(netdev); break; default: return -EOPNOTSUPP; } return 0; } static int esd_usb_probe_one_net(struct usb_interface *intf, int index) { struct esd_usb *dev = usb_get_intfdata(intf); struct net_device *netdev; struct esd_usb_net_priv *priv; int err = 0; int i; netdev = alloc_candev(sizeof(*priv), ESD_USB_MAX_TX_URBS); if (!netdev) { dev_err(&intf->dev, "couldn't alloc candev\n"); err = -ENOMEM; goto done; } priv = netdev_priv(netdev); init_usb_anchor(&priv->tx_submitted); atomic_set(&priv->active_tx_jobs, 0); for (i = 0; i < ESD_USB_MAX_TX_URBS; i++) priv->tx_contexts[i].echo_index = ESD_USB_MAX_TX_URBS; priv->usb = dev; priv->netdev = netdev; priv->index = index; priv->can.state = CAN_STATE_STOPPED; priv->can.ctrlmode_supported = CAN_CTRLMODE_LISTENONLY | CAN_CTRLMODE_CC_LEN8_DLC | CAN_CTRLMODE_BERR_REPORTING; switch (le16_to_cpu(dev->udev->descriptor.idProduct)) { case ESD_USB_CANUSB3_PRODUCT_ID: priv->can.clock.freq = ESD_USB_3_CAN_CLOCK; priv->can.ctrlmode_supported |= CAN_CTRLMODE_3_SAMPLES; priv->can.ctrlmode_supported |= CAN_CTRLMODE_FD; priv->can.bittiming_const = &esd_usb_3_nom_bittiming_const; priv->can.data_bittiming_const = &esd_usb_3_data_bittiming_const; priv->can.do_set_bittiming = esd_usb_3_set_bittiming; priv->can.do_set_data_bittiming = esd_usb_3_set_bittiming; break; case ESD_USB_CANUSBM_PRODUCT_ID: priv->can.clock.freq = ESD_USB_M_CAN_CLOCK; priv->can.bittiming_const = &esd_usb_2_bittiming_const; priv->can.do_set_bittiming = esd_usb_2_set_bittiming; break; case ESD_USB_CANUSB2_PRODUCT_ID: default: priv->can.clock.freq = ESD_USB_2_CAN_CLOCK; priv->can.ctrlmode_supported |= CAN_CTRLMODE_3_SAMPLES; priv->can.bittiming_const = &esd_usb_2_bittiming_const; priv->can.do_set_bittiming = esd_usb_2_set_bittiming; break; } priv->can.do_set_mode = esd_usb_set_mode; priv->can.do_get_berr_counter = esd_usb_get_berr_counter; netdev->flags |= IFF_ECHO; /* we support local echo */ netdev->netdev_ops = &esd_usb_netdev_ops; netdev->ethtool_ops = &esd_usb_ethtool_ops; SET_NETDEV_DEV(netdev, &intf->dev); netdev->dev_id = index; err = register_candev(netdev); if (err) { dev_err(&intf->dev, "couldn't register CAN device: %d\n", err); free_candev(netdev); err = -ENOMEM; goto done; } dev->nets[index] = priv; netdev_info(netdev, "device %s registered\n", netdev->name); done: return err; } /* probe function for new USB devices * * check version information and number of available * CAN interfaces */ static int esd_usb_probe(struct usb_interface *intf, const struct usb_device_id *id) { struct esd_usb *dev; union esd_usb_msg *msg; int i, err; dev = kzalloc(sizeof(*dev), GFP_KERNEL); if (!dev) { err = -ENOMEM; goto done; } dev->udev = interface_to_usbdev(intf); init_usb_anchor(&dev->rx_submitted); usb_set_intfdata(intf, dev); msg = kmalloc(sizeof(*msg), GFP_KERNEL); if (!msg) { err = -ENOMEM; goto free_msg; } /* query number of CAN interfaces (nets) */ msg->hdr.cmd = ESD_USB_CMD_VERSION; msg->hdr.len = sizeof(struct esd_usb_version_msg) / sizeof(u32); /* # of 32bit words */ msg->version.rsvd = 0; msg->version.flags = 0; msg->version.drv_version = 0; err = esd_usb_send_msg(dev, msg); if (err < 0) { dev_err(&intf->dev, "sending version message failed\n"); goto free_msg; } err = esd_usb_wait_msg(dev, msg); if (err < 0) { dev_err(&intf->dev, "no version message answer\n"); goto free_msg; } dev->net_count = (int)msg->version_reply.nets; dev->version = le32_to_cpu(msg->version_reply.version); if (device_create_file(&intf->dev, &dev_attr_firmware)) dev_err(&intf->dev, "Couldn't create device file for firmware\n"); if (device_create_file(&intf->dev, &dev_attr_hardware)) dev_err(&intf->dev, "Couldn't create device file for hardware\n"); if (device_create_file(&intf->dev, &dev_attr_nets)) dev_err(&intf->dev, "Couldn't create device file for nets\n"); /* do per device probing */ for (i = 0; i < dev->net_count; i++) esd_usb_probe_one_net(intf, i); free_msg: kfree(msg); if (err) kfree(dev); done: return err; } /* called by the usb core when the device is removed from the system */ static void esd_usb_disconnect(struct usb_interface *intf) { struct esd_usb *dev = usb_get_intfdata(intf); struct net_device *netdev; int i; device_remove_file(&intf->dev, &dev_attr_firmware); device_remove_file(&intf->dev, &dev_attr_hardware); device_remove_file(&intf->dev, &dev_attr_nets); usb_set_intfdata(intf, NULL); if (dev) { for (i = 0; i < dev->net_count; i++) { if (dev->nets[i]) { netdev = dev->nets[i]->netdev; unregister_netdev(netdev); free_candev(netdev); } } unlink_all_urbs(dev); kfree(dev); } } /* usb specific object needed to register this driver with the usb subsystem */ static struct usb_driver esd_usb_driver = { .name = KBUILD_MODNAME, .probe = esd_usb_probe, .disconnect = esd_usb_disconnect, .id_table = esd_usb_table, }; module_usb_driver(esd_usb_driver); |
290 1446 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 | /* SPDX-License-Identifier: GPL-2.0-or-later */ /* Integer base 2 logarithm calculation * * Copyright (C) 2006 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ #ifndef _LINUX_LOG2_H #define _LINUX_LOG2_H #include <linux/types.h> #include <linux/bitops.h> /* * non-constant log of base 2 calculators * - the arch may override these in asm/bitops.h if they can be implemented * more efficiently than using fls() and fls64() * - the arch is not required to handle n==0 if implementing the fallback */ #ifndef CONFIG_ARCH_HAS_ILOG2_U32 static __always_inline __attribute__((const)) int __ilog2_u32(u32 n) { return fls(n) - 1; } #endif #ifndef CONFIG_ARCH_HAS_ILOG2_U64 static __always_inline __attribute__((const)) int __ilog2_u64(u64 n) { return fls64(n) - 1; } #endif /** * is_power_of_2() - check if a value is a power of two * @n: the value to check * * Determine whether some value is a power of two, where zero is * *not* considered a power of two. * Return: true if @n is a power of 2, otherwise false. */ static inline __attribute__((const)) bool is_power_of_2(unsigned long n) { return (n != 0 && ((n & (n - 1)) == 0)); } /** * __roundup_pow_of_two() - round up to nearest power of two * @n: value to round up */ static inline __attribute__((const)) unsigned long __roundup_pow_of_two(unsigned long n) { return 1UL << fls_long(n - 1); } /** * __rounddown_pow_of_two() - round down to nearest power of two * @n: value to round down */ static inline __attribute__((const)) unsigned long __rounddown_pow_of_two(unsigned long n) { return 1UL << (fls_long(n) - 1); } /** * const_ilog2 - log base 2 of 32-bit or a 64-bit constant unsigned value * @n: parameter * * Use this where sparse expects a true constant expression, e.g. for array * indices. */ #define const_ilog2(n) \ ( \ __builtin_constant_p(n) ? ( \ (n) < 2 ? 0 : \ (n) & (1ULL << 63) ? 63 : \ (n) & (1ULL << 62) ? 62 : \ (n) & (1ULL << 61) ? 61 : \ (n) & (1ULL << 60) ? 60 : \ (n) & (1ULL << 59) ? 59 : \ (n) & (1ULL << 58) ? 58 : \ (n) & (1ULL << 57) ? 57 : \ (n) & (1ULL << 56) ? 56 : \ (n) & (1ULL << 55) ? 55 : \ (n) & (1ULL << 54) ? 54 : \ (n) & (1ULL << 53) ? 53 : \ (n) & (1ULL << 52) ? 52 : \ (n) & (1ULL << 51) ? 51 : \ (n) & (1ULL << 50) ? 50 : \ (n) & (1ULL << 49) ? 49 : \ (n) & (1ULL << 48) ? 48 : \ (n) & (1ULL << 47) ? 47 : \ (n) & (1ULL << 46) ? 46 : \ (n) & (1ULL << 45) ? 45 : \ (n) & (1ULL << 44) ? 44 : \ (n) & (1ULL << 43) ? 43 : \ (n) & (1ULL << 42) ? 42 : \ (n) & (1ULL << 41) ? 41 : \ (n) & (1ULL << 40) ? 40 : \ (n) & (1ULL << 39) ? 39 : \ (n) & (1ULL << 38) ? 38 : \ (n) & (1ULL << 37) ? 37 : \ (n) & (1ULL << 36) ? 36 : \ (n) & (1ULL << 35) ? 35 : \ (n) & (1ULL << 34) ? 34 : \ (n) & (1ULL << 33) ? 33 : \ (n) & (1ULL << 32) ? 32 : \ (n) & (1ULL << 31) ? 31 : \ (n) & (1ULL << 30) ? 30 : \ (n) & (1ULL << 29) ? 29 : \ (n) & (1ULL << 28) ? 28 : \ (n) & (1ULL << 27) ? 27 : \ (n) & (1ULL << 26) ? 26 : \ (n) & (1ULL << 25) ? 25 : \ (n) & (1ULL << 24) ? 24 : \ (n) & (1ULL << 23) ? 23 : \ (n) & (1ULL << 22) ? 22 : \ (n) & (1ULL << 21) ? 21 : \ (n) & (1ULL << 20) ? 20 : \ (n) & (1ULL << 19) ? 19 : \ (n) & (1ULL << 18) ? 18 : \ (n) & (1ULL << 17) ? 17 : \ (n) & (1ULL << 16) ? 16 : \ (n) & (1ULL << 15) ? 15 : \ (n) & (1ULL << 14) ? 14 : \ (n) & (1ULL << 13) ? 13 : \ (n) & (1ULL << 12) ? 12 : \ (n) & (1ULL << 11) ? 11 : \ (n) & (1ULL << 10) ? 10 : \ (n) & (1ULL << 9) ? 9 : \ (n) & (1ULL << 8) ? 8 : \ (n) & (1ULL << 7) ? 7 : \ (n) & (1ULL << 6) ? 6 : \ (n) & (1ULL << 5) ? 5 : \ (n) & (1ULL << 4) ? 4 : \ (n) & (1ULL << 3) ? 3 : \ (n) & (1ULL << 2) ? 2 : \ 1) : \ -1) /** * ilog2 - log base 2 of 32-bit or a 64-bit unsigned value * @n: parameter * * constant-capable log of base 2 calculation * - this can be used to initialise global variables from constant data, hence * the massive ternary operator construction * * selects the appropriately-sized optimised version depending on sizeof(n) */ #define ilog2(n) \ ( \ __builtin_constant_p(n) ? \ ((n) < 2 ? 0 : \ 63 - __builtin_clzll(n)) : \ (sizeof(n) <= 4) ? \ __ilog2_u32(n) : \ __ilog2_u64(n) \ ) /** * roundup_pow_of_two - round the given value up to nearest power of two * @n: parameter * * round the given value up to the nearest power of two * - the result is undefined when n == 0 * - this can be used to initialise global variables from constant data */ #define roundup_pow_of_two(n) \ ( \ __builtin_constant_p(n) ? ( \ ((n) == 1) ? 1 : \ (1UL << (ilog2((n) - 1) + 1)) \ ) : \ __roundup_pow_of_two(n) \ ) /** * rounddown_pow_of_two - round the given value down to nearest power of two * @n: parameter * * round the given value down to the nearest power of two * - the result is undefined when n == 0 * - this can be used to initialise global variables from constant data */ #define rounddown_pow_of_two(n) \ ( \ __builtin_constant_p(n) ? ( \ (1UL << ilog2(n))) : \ __rounddown_pow_of_two(n) \ ) static inline __attribute_const__ int __order_base_2(unsigned long n) { return n > 1 ? ilog2(n - 1) + 1 : 0; } /** * order_base_2 - calculate the (rounded up) base 2 order of the argument * @n: parameter * * The first few values calculated by this routine: * ob2(0) = 0 * ob2(1) = 0 * ob2(2) = 1 * ob2(3) = 2 * ob2(4) = 2 * ob2(5) = 3 * ... and so on. */ #define order_base_2(n) \ ( \ __builtin_constant_p(n) ? ( \ ((n) == 0 || (n) == 1) ? 0 : \ ilog2((n) - 1) + 1) : \ __order_base_2(n) \ ) static inline __attribute__((const)) int __bits_per(unsigned long n) { if (n < 2) return 1; if (is_power_of_2(n)) return order_base_2(n) + 1; return order_base_2(n); } /** * bits_per - calculate the number of bits required for the argument * @n: parameter * * This is constant-capable and can be used for compile time * initializations, e.g bitfields. * * The first few values calculated by this routine: * bf(0) = 1 * bf(1) = 1 * bf(2) = 2 * bf(3) = 2 * bf(4) = 3 * ... and so on. */ #define bits_per(n) \ ( \ __builtin_constant_p(n) ? ( \ ((n) == 0 || (n) == 1) \ ? 1 : ilog2(n) + 1 \ ) : \ __bits_per(n) \ ) #endif /* _LINUX_LOG2_H */ |
1251 1035 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_NAMEI_H #define _LINUX_NAMEI_H #include <linux/fs.h> #include <linux/kernel.h> #include <linux/path.h> #include <linux/fcntl.h> #include <linux/errno.h> enum { MAX_NESTED_LINKS = 8 }; #define MAXSYMLINKS 40 /* * Type of the last component on LOOKUP_PARENT */ enum {LAST_NORM, LAST_ROOT, LAST_DOT, LAST_DOTDOT}; /* pathwalk mode */ #define LOOKUP_FOLLOW 0x0001 /* follow links at the end */ #define LOOKUP_DIRECTORY 0x0002 /* require a directory */ #define LOOKUP_AUTOMOUNT 0x0004 /* force terminal automount */ #define LOOKUP_EMPTY 0x4000 /* accept empty path [user_... only] */ #define LOOKUP_DOWN 0x8000 /* follow mounts in the starting point */ #define LOOKUP_MOUNTPOINT 0x0080 /* follow mounts in the end */ #define LOOKUP_REVAL 0x0020 /* tell ->d_revalidate() to trust no cache */ #define LOOKUP_RCU 0x0040 /* RCU pathwalk mode; semi-internal */ /* These tell filesystem methods that we are dealing with the final component... */ #define LOOKUP_OPEN 0x0100 /* ... in open */ #define LOOKUP_CREATE 0x0200 /* ... in object creation */ #define LOOKUP_EXCL 0x0400 /* ... in exclusive creation */ #define LOOKUP_RENAME_TARGET 0x0800 /* ... in destination of rename() */ /* internal use only */ #define LOOKUP_PARENT 0x0010 /* Scoping flags for lookup. */ #define LOOKUP_NO_SYMLINKS 0x010000 /* No symlink crossing. */ #define LOOKUP_NO_MAGICLINKS 0x020000 /* No nd_jump_link() crossing. */ #define LOOKUP_NO_XDEV 0x040000 /* No mountpoint crossing. */ #define LOOKUP_BENEATH 0x080000 /* No escaping from starting point. */ #define LOOKUP_IN_ROOT 0x100000 /* Treat dirfd as fs root. */ #define LOOKUP_CACHED 0x200000 /* Only do cached lookup */ #define LOOKUP_LINKAT_EMPTY 0x400000 /* Linkat request with empty path. */ /* LOOKUP_* flags which do scope-related checks based on the dirfd. */ #define LOOKUP_IS_SCOPED (LOOKUP_BENEATH | LOOKUP_IN_ROOT) extern int path_pts(struct path *path); extern int user_path_at(int, const char __user *, unsigned, struct path *); struct dentry *lookup_one_qstr_excl(const struct qstr *name, struct dentry *base, unsigned int flags); extern int kern_path(const char *, unsigned, struct path *); extern struct dentry *kern_path_create(int, const char *, struct path *, unsigned int); extern struct dentry *user_path_create(int, const char __user *, struct path *, unsigned int); extern void done_path_create(struct path *, struct dentry *); extern struct dentry *kern_path_locked(const char *, struct path *); extern struct dentry *user_path_locked_at(int , const char __user *, struct path *); int vfs_path_parent_lookup(struct filename *filename, unsigned int flags, struct path *parent, struct qstr *last, int *type, const struct path *root); int vfs_path_lookup(struct dentry *, struct vfsmount *, const char *, unsigned int, struct path *); extern struct dentry *try_lookup_one_len(const char *, struct dentry *, int); extern struct dentry *lookup_one_len(const char *, struct dentry *, int); extern struct dentry *lookup_one_len_unlocked(const char *, struct dentry *, int); extern struct dentry *lookup_positive_unlocked(const char *, struct dentry *, int); struct dentry *lookup_one(struct mnt_idmap *, const char *, struct dentry *, int); struct dentry *lookup_one_unlocked(struct mnt_idmap *idmap, const char *name, struct dentry *base, int len); struct dentry *lookup_one_positive_unlocked(struct mnt_idmap *idmap, const char *name, struct dentry *base, int len); extern int follow_down_one(struct path *); extern int follow_down(struct path *path, unsigned int flags); extern int follow_up(struct path *); extern struct dentry *lock_rename(struct dentry *, struct dentry *); extern struct dentry *lock_rename_child(struct dentry *, struct dentry *); extern void unlock_rename(struct dentry *, struct dentry *); /** * mode_strip_umask - handle vfs umask stripping * @dir: parent directory of the new inode * @mode: mode of the new inode to be created in @dir * * In most filesystems, umask stripping depends on whether or not the * filesystem supports POSIX ACLs. If the filesystem doesn't support it umask * stripping is done directly in here. If the filesystem does support POSIX * ACLs umask stripping is deferred until the filesystem calls * posix_acl_create(). * * Some filesystems (like NFSv4) also want to avoid umask stripping by the * VFS, but don't support POSIX ACLs. Those filesystems can set SB_I_NOUMASK * to get this effect without declaring that they support POSIX ACLs. * * Returns: mode */ static inline umode_t __must_check mode_strip_umask(const struct inode *dir, umode_t mode) { if (!IS_POSIXACL(dir) && !(dir->i_sb->s_iflags & SB_I_NOUMASK)) mode &= ~current_umask(); return mode; } extern int __must_check nd_jump_link(const struct path *path); static inline void nd_terminate_link(void *name, size_t len, size_t maxlen) { ((char *) name)[min(len, maxlen)] = '\0'; } /** * retry_estale - determine whether the caller should retry an operation * @error: the error that would currently be returned * @flags: flags being used for next lookup attempt * * Check to see if the error code was -ESTALE, and then determine whether * to retry the call based on whether "flags" already has LOOKUP_REVAL set. * * Returns true if the caller should try the operation again. */ static inline bool retry_estale(const long error, const unsigned int flags) { return unlikely(error == -ESTALE && !(flags & LOOKUP_REVAL)); } #endif /* _LINUX_NAMEI_H */ |
139 2133 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_HIGHMEM_INTERNAL_H #define _LINUX_HIGHMEM_INTERNAL_H /* * Outside of CONFIG_HIGHMEM to support X86 32bit iomap_atomic() cruft. */ #ifdef CONFIG_KMAP_LOCAL void *__kmap_local_pfn_prot(unsigned long pfn, pgprot_t prot); void *__kmap_local_page_prot(struct page *page, pgprot_t prot); void kunmap_local_indexed(const void *vaddr); void kmap_local_fork(struct task_struct *tsk); void __kmap_local_sched_out(void); void __kmap_local_sched_in(void); static inline void kmap_assert_nomap(void) { DEBUG_LOCKS_WARN_ON(current->kmap_ctrl.idx); } #else static inline void kmap_local_fork(struct task_struct *tsk) { } static inline void kmap_assert_nomap(void) { } #endif #ifdef CONFIG_HIGHMEM #include <asm/highmem.h> #ifndef ARCH_HAS_KMAP_FLUSH_TLB static inline void kmap_flush_tlb(unsigned long addr) { } #endif #ifndef kmap_prot #define kmap_prot PAGE_KERNEL #endif void *kmap_high(struct page *page); void kunmap_high(struct page *page); void __kmap_flush_unused(void); struct page *__kmap_to_page(void *addr); static inline void *kmap(struct page *page) { void *addr; might_sleep(); if (!PageHighMem(page)) addr = page_address(page); else addr = kmap_high(page); kmap_flush_tlb((unsigned long)addr); return addr; } static inline void kunmap(struct page *page) { might_sleep(); if (!PageHighMem(page)) return; kunmap_high(page); } static inline struct page *kmap_to_page(void *addr) { return __kmap_to_page(addr); } static inline void kmap_flush_unused(void) { __kmap_flush_unused(); } static inline void *kmap_local_page(struct page *page) { return __kmap_local_page_prot(page, kmap_prot); } static inline void *kmap_local_folio(struct folio *folio, size_t offset) { struct page *page = folio_page(folio, offset / PAGE_SIZE); return __kmap_local_page_prot(page, kmap_prot) + offset % PAGE_SIZE; } static inline void *kmap_local_page_prot(struct page *page, pgprot_t prot) { return __kmap_local_page_prot(page, prot); } static inline void *kmap_local_pfn(unsigned long pfn) { return __kmap_local_pfn_prot(pfn, kmap_prot); } static inline void __kunmap_local(const void *vaddr) { kunmap_local_indexed(vaddr); } static inline void *kmap_atomic_prot(struct page *page, pgprot_t prot) { if (IS_ENABLED(CONFIG_PREEMPT_RT)) migrate_disable(); else preempt_disable(); pagefault_disable(); return __kmap_local_page_prot(page, prot); } static inline void *kmap_atomic(struct page *page) { return kmap_atomic_prot(page, kmap_prot); } static inline void *kmap_atomic_pfn(unsigned long pfn) { if (IS_ENABLED(CONFIG_PREEMPT_RT)) migrate_disable(); else preempt_disable(); pagefault_disable(); return __kmap_local_pfn_prot(pfn, kmap_prot); } static inline void __kunmap_atomic(const void *addr) { kunmap_local_indexed(addr); pagefault_enable(); if (IS_ENABLED(CONFIG_PREEMPT_RT)) migrate_enable(); else preempt_enable(); } unsigned long __nr_free_highpages(void); unsigned long __totalhigh_pages(void); static inline unsigned long nr_free_highpages(void) { return __nr_free_highpages(); } static inline unsigned long totalhigh_pages(void) { return __totalhigh_pages(); } static inline bool is_kmap_addr(const void *x) { unsigned long addr = (unsigned long)x; return (addr >= PKMAP_ADDR(0) && addr < PKMAP_ADDR(LAST_PKMAP)) || (addr >= __fix_to_virt(FIX_KMAP_END) && addr < __fix_to_virt(FIX_KMAP_BEGIN)); } #else /* CONFIG_HIGHMEM */ static inline struct page *kmap_to_page(void *addr) { return virt_to_page(addr); } static inline void *kmap(struct page *page) { might_sleep(); return page_address(page); } static inline void kunmap_high(struct page *page) { } static inline void kmap_flush_unused(void) { } static inline void kunmap(struct page *page) { #ifdef ARCH_HAS_FLUSH_ON_KUNMAP kunmap_flush_on_unmap(page_address(page)); #endif } static inline void *kmap_local_page(struct page *page) { return page_address(page); } static inline void *kmap_local_folio(struct folio *folio, size_t offset) { return page_address(&folio->page) + offset; } static inline void *kmap_local_page_prot(struct page *page, pgprot_t prot) { return kmap_local_page(page); } static inline void *kmap_local_pfn(unsigned long pfn) { return kmap_local_page(pfn_to_page(pfn)); } static inline void __kunmap_local(const void *addr) { #ifdef ARCH_HAS_FLUSH_ON_KUNMAP kunmap_flush_on_unmap(PTR_ALIGN_DOWN(addr, PAGE_SIZE)); #endif } static inline void *kmap_atomic(struct page *page) { if (IS_ENABLED(CONFIG_PREEMPT_RT)) migrate_disable(); else preempt_disable(); pagefault_disable(); return page_address(page); } static inline void *kmap_atomic_prot(struct page *page, pgprot_t prot) { return kmap_atomic(page); } static inline void *kmap_atomic_pfn(unsigned long pfn) { return kmap_atomic(pfn_to_page(pfn)); } static inline void __kunmap_atomic(const void *addr) { #ifdef ARCH_HAS_FLUSH_ON_KUNMAP kunmap_flush_on_unmap(PTR_ALIGN_DOWN(addr, PAGE_SIZE)); #endif pagefault_enable(); if (IS_ENABLED(CONFIG_PREEMPT_RT)) migrate_enable(); else preempt_enable(); } static inline unsigned long nr_free_highpages(void) { return 0; } static inline unsigned long totalhigh_pages(void) { return 0; } static inline bool is_kmap_addr(const void *x) { return false; } #endif /* CONFIG_HIGHMEM */ /** * kunmap_atomic - Unmap the virtual address mapped by kmap_atomic() - deprecated! * @__addr: Virtual address to be unmapped * * Unmaps an address previously mapped by kmap_atomic() and re-enables * pagefaults. Depending on PREEMP_RT configuration, re-enables also * migration and preemption. Users should not count on these side effects. * * Mappings should be unmapped in the reverse order that they were mapped. * See kmap_local_page() for details on nesting. * * @__addr can be any address within the mapped page, so there is no need * to subtract any offset that has been added. In contrast to kunmap(), * this function takes the address returned from kmap_atomic(), not the * page passed to it. The compiler will warn you if you pass the page. */ #define kunmap_atomic(__addr) \ do { \ BUILD_BUG_ON(__same_type((__addr), struct page *)); \ __kunmap_atomic(__addr); \ } while (0) /** * kunmap_local - Unmap a page mapped via kmap_local_page(). * @__addr: An address within the page mapped * * @__addr can be any address within the mapped page. Commonly it is the * address return from kmap_local_page(), but it can also include offsets. * * Unmapping should be done in the reverse order of the mapping. See * kmap_local_page() for details. */ #define kunmap_local(__addr) \ do { \ BUILD_BUG_ON(__same_type((__addr), struct page *)); \ __kunmap_local(__addr); \ } while (0) #endif |
342 5 2 1 1 5 5 5 30 29 365 127 365 365 111 24 84 60 111 111 23 24 5 24 24 24 24 111 111 6 111 100 63 29 63 63 63 63 111 111 111 111 111 111 100 111 54 23 110 23 23 23 111 111 111 100 56 24 24 24 24 41 256 311 310 311 25 25 25 7 20 24 24 24 24 278 239 278 278 38 240 38 38 38 239 278 277 278 244 243 244 243 170 113 114 170 24 24 24 24 24 24 23 282 342 3 3 3 3 219 218 2 219 114 25 12 25 24 5 5 25 10 8 5 5 2 2 8 8 8 342 342 342 281 147 342 22 128 128 281 293 342 1 4792 1 4786 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 | // SPDX-License-Identifier: GPL-2.0-only /* * linux/mm/swapfile.c * * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds * Swap reorganised 29.12.95, Stephen Tweedie */ #include <linux/blkdev.h> #include <linux/mm.h> #include <linux/sched/mm.h> #include <linux/sched/task.h> #include <linux/hugetlb.h> #include <linux/mman.h> #include <linux/slab.h> #include <linux/kernel_stat.h> #include <linux/swap.h> #include <linux/vmalloc.h> #include <linux/pagemap.h> #include <linux/namei.h> #include <linux/shmem_fs.h> #include <linux/blk-cgroup.h> #include <linux/random.h> #include <linux/writeback.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/init.h> #include <linux/ksm.h> #include <linux/rmap.h> #include <linux/security.h> #include <linux/backing-dev.h> #include <linux/mutex.h> #include <linux/capability.h> #include <linux/syscalls.h> #include <linux/memcontrol.h> #include <linux/poll.h> #include <linux/oom.h> #include <linux/swapfile.h> #include <linux/export.h> #include <linux/swap_slots.h> #include <linux/sort.h> #include <linux/completion.h> #include <linux/suspend.h> #include <linux/zswap.h> #include <linux/plist.h> #include <asm/tlbflush.h> #include <linux/swapops.h> #include <linux/swap_cgroup.h> #include "internal.h" #include "swap.h" static bool swap_count_continued(struct swap_info_struct *, pgoff_t, unsigned char); static void free_swap_count_continuations(struct swap_info_struct *); static DEFINE_SPINLOCK(swap_lock); static unsigned int nr_swapfiles; atomic_long_t nr_swap_pages; /* * Some modules use swappable objects and may try to swap them out under * memory pressure (via the shrinker). Before doing so, they may wish to * check to see if any swap space is available. */ EXPORT_SYMBOL_GPL(nr_swap_pages); /* protected with swap_lock. reading in vm_swap_full() doesn't need lock */ long total_swap_pages; static int least_priority = -1; unsigned long swapfile_maximum_size; #ifdef CONFIG_MIGRATION bool swap_migration_ad_supported; #endif /* CONFIG_MIGRATION */ static const char Bad_file[] = "Bad swap file entry "; static const char Unused_file[] = "Unused swap file entry "; static const char Bad_offset[] = "Bad swap offset entry "; static const char Unused_offset[] = "Unused swap offset entry "; /* * all active swap_info_structs * protected with swap_lock, and ordered by priority. */ static PLIST_HEAD(swap_active_head); /* * all available (active, not full) swap_info_structs * protected with swap_avail_lock, ordered by priority. * This is used by folio_alloc_swap() instead of swap_active_head * because swap_active_head includes all swap_info_structs, * but folio_alloc_swap() doesn't need to look at full ones. * This uses its own lock instead of swap_lock because when a * swap_info_struct changes between not-full/full, it needs to * add/remove itself to/from this list, but the swap_info_struct->lock * is held and the locking order requires swap_lock to be taken * before any swap_info_struct->lock. */ static struct plist_head *swap_avail_heads; static DEFINE_SPINLOCK(swap_avail_lock); static struct swap_info_struct *swap_info[MAX_SWAPFILES]; static DEFINE_MUTEX(swapon_mutex); static DECLARE_WAIT_QUEUE_HEAD(proc_poll_wait); /* Activity counter to indicate that a swapon or swapoff has occurred */ static atomic_t proc_poll_event = ATOMIC_INIT(0); atomic_t nr_rotate_swap = ATOMIC_INIT(0); static struct swap_info_struct *swap_type_to_swap_info(int type) { if (type >= MAX_SWAPFILES) return NULL; return READ_ONCE(swap_info[type]); /* rcu_dereference() */ } static inline unsigned char swap_count(unsigned char ent) { return ent & ~SWAP_HAS_CACHE; /* may include COUNT_CONTINUED flag */ } /* Reclaim the swap entry anyway if possible */ #define TTRS_ANYWAY 0x1 /* * Reclaim the swap entry if there are no more mappings of the * corresponding page */ #define TTRS_UNMAPPED 0x2 /* Reclaim the swap entry if swap is getting full*/ #define TTRS_FULL 0x4 /* * returns number of pages in the folio that backs the swap entry. If positive, * the folio was reclaimed. If negative, the folio was not reclaimed. If 0, no * folio was associated with the swap entry. */ static int __try_to_reclaim_swap(struct swap_info_struct *si, unsigned long offset, unsigned long flags) { swp_entry_t entry = swp_entry(si->type, offset); struct folio *folio; int ret = 0; folio = filemap_get_folio(swap_address_space(entry), swap_cache_index(entry)); if (IS_ERR(folio)) return 0; /* * When this function is called from scan_swap_map_slots() and it's * called by vmscan.c at reclaiming folios. So we hold a folio lock * here. We have to use trylock for avoiding deadlock. This is a special * case and you should use folio_free_swap() with explicit folio_lock() * in usual operations. */ if (folio_trylock(folio)) { if ((flags & TTRS_ANYWAY) || ((flags & TTRS_UNMAPPED) && !folio_mapped(folio)) || ((flags & TTRS_FULL) && mem_cgroup_swap_full(folio))) ret = folio_free_swap(folio); folio_unlock(folio); } ret = ret ? folio_nr_pages(folio) : -folio_nr_pages(folio); folio_put(folio); return ret; } static inline struct swap_extent *first_se(struct swap_info_struct *sis) { struct rb_node *rb = rb_first(&sis->swap_extent_root); return rb_entry(rb, struct swap_extent, rb_node); } static inline struct swap_extent *next_se(struct swap_extent *se) { struct rb_node *rb = rb_next(&se->rb_node); return rb ? rb_entry(rb, struct swap_extent, rb_node) : NULL; } /* * swapon tell device that all the old swap contents can be discarded, * to allow the swap device to optimize its wear-levelling. */ static int discard_swap(struct swap_info_struct *si) { struct swap_extent *se; sector_t start_block; sector_t nr_blocks; int err = 0; /* Do not discard the swap header page! */ se = first_se(si); start_block = (se->start_block + 1) << (PAGE_SHIFT - 9); nr_blocks = ((sector_t)se->nr_pages - 1) << (PAGE_SHIFT - 9); if (nr_blocks) { err = blkdev_issue_discard(si->bdev, start_block, nr_blocks, GFP_KERNEL); if (err) return err; cond_resched(); } for (se = next_se(se); se; se = next_se(se)) { start_block = se->start_block << (PAGE_SHIFT - 9); nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - 9); err = blkdev_issue_discard(si->bdev, start_block, nr_blocks, GFP_KERNEL); if (err) break; cond_resched(); } return err; /* That will often be -EOPNOTSUPP */ } static struct swap_extent * offset_to_swap_extent(struct swap_info_struct *sis, unsigned long offset) { struct swap_extent *se; struct rb_node *rb; rb = sis->swap_extent_root.rb_node; while (rb) { se = rb_entry(rb, struct swap_extent, rb_node); if (offset < se->start_page) rb = rb->rb_left; else if (offset >= se->start_page + se->nr_pages) rb = rb->rb_right; else return se; } /* It *must* be present */ BUG(); } sector_t swap_folio_sector(struct folio *folio) { struct swap_info_struct *sis = swp_swap_info(folio->swap); struct swap_extent *se; sector_t sector; pgoff_t offset; offset = swp_offset(folio->swap); se = offset_to_swap_extent(sis, offset); sector = se->start_block + (offset - se->start_page); return sector << (PAGE_SHIFT - 9); } /* * swap allocation tell device that a cluster of swap can now be discarded, * to allow the swap device to optimize its wear-levelling. */ static void discard_swap_cluster(struct swap_info_struct *si, pgoff_t start_page, pgoff_t nr_pages) { struct swap_extent *se = offset_to_swap_extent(si, start_page); while (nr_pages) { pgoff_t offset = start_page - se->start_page; sector_t start_block = se->start_block + offset; sector_t nr_blocks = se->nr_pages - offset; if (nr_blocks > nr_pages) nr_blocks = nr_pages; start_page += nr_blocks; nr_pages -= nr_blocks; start_block <<= PAGE_SHIFT - 9; nr_blocks <<= PAGE_SHIFT - 9; if (blkdev_issue_discard(si->bdev, start_block, nr_blocks, GFP_NOIO)) break; se = next_se(se); } } #ifdef CONFIG_THP_SWAP #define SWAPFILE_CLUSTER HPAGE_PMD_NR #define swap_entry_order(order) (order) #else #define SWAPFILE_CLUSTER 256 /* * Define swap_entry_order() as constant to let compiler to optimize * out some code if !CONFIG_THP_SWAP */ #define swap_entry_order(order) 0 #endif #define LATENCY_LIMIT 256 static inline void cluster_set_flag(struct swap_cluster_info *info, unsigned int flag) { info->flags = flag; } static inline unsigned int cluster_count(struct swap_cluster_info *info) { return info->data; } static inline void cluster_set_count(struct swap_cluster_info *info, unsigned int c) { info->data = c; } static inline void cluster_set_count_flag(struct swap_cluster_info *info, unsigned int c, unsigned int f) { info->flags = f; info->data = c; } static inline unsigned int cluster_next(struct swap_cluster_info *info) { return info->data; } static inline void cluster_set_next(struct swap_cluster_info *info, unsigned int n) { info->data = n; } static inline void cluster_set_next_flag(struct swap_cluster_info *info, unsigned int n, unsigned int f) { info->flags = f; info->data = n; } static inline bool cluster_is_free(struct swap_cluster_info *info) { return info->flags & CLUSTER_FLAG_FREE; } static inline bool cluster_is_null(struct swap_cluster_info *info) { return info->flags & CLUSTER_FLAG_NEXT_NULL; } static inline void cluster_set_null(struct swap_cluster_info *info) { info->flags = CLUSTER_FLAG_NEXT_NULL; info->data = 0; } static inline struct swap_cluster_info *lock_cluster(struct swap_info_struct *si, unsigned long offset) { struct swap_cluster_info *ci; ci = si->cluster_info; if (ci) { ci += offset / SWAPFILE_CLUSTER; spin_lock(&ci->lock); } return ci; } static inline void unlock_cluster(struct swap_cluster_info *ci) { if (ci) spin_unlock(&ci->lock); } /* * Determine the locking method in use for this device. Return * swap_cluster_info if SSD-style cluster-based locking is in place. */ static inline struct swap_cluster_info *lock_cluster_or_swap_info( struct swap_info_struct *si, unsigned long offset) { struct swap_cluster_info *ci; /* Try to use fine-grained SSD-style locking if available: */ ci = lock_cluster(si, offset); /* Otherwise, fall back to traditional, coarse locking: */ if (!ci) spin_lock(&si->lock); return ci; } static inline void unlock_cluster_or_swap_info(struct swap_info_struct *si, struct swap_cluster_info *ci) { if (ci) unlock_cluster(ci); else spin_unlock(&si->lock); } static inline bool cluster_list_empty(struct swap_cluster_list *list) { return cluster_is_null(&list->head); } static inline unsigned int cluster_list_first(struct swap_cluster_list *list) { return cluster_next(&list->head); } static void cluster_list_init(struct swap_cluster_list *list) { cluster_set_null(&list->head); cluster_set_null(&list->tail); } static void cluster_list_add_tail(struct swap_cluster_list *list, struct swap_cluster_info *ci, unsigned int idx) { if (cluster_list_empty(list)) { cluster_set_next_flag(&list->head, idx, 0); cluster_set_next_flag(&list->tail, idx, 0); } else { struct swap_cluster_info *ci_tail; unsigned int tail = cluster_next(&list->tail); /* * Nested cluster lock, but both cluster locks are * only acquired when we held swap_info_struct->lock */ ci_tail = ci + tail; spin_lock_nested(&ci_tail->lock, SINGLE_DEPTH_NESTING); cluster_set_next(ci_tail, idx); spin_unlock(&ci_tail->lock); cluster_set_next_flag(&list->tail, idx, 0); } } static unsigned int cluster_list_del_first(struct swap_cluster_list *list, struct swap_cluster_info *ci) { unsigned int idx; idx = cluster_next(&list->head); if (cluster_next(&list->tail) == idx) { cluster_set_null(&list->head); cluster_set_null(&list->tail); } else cluster_set_next_flag(&list->head, cluster_next(&ci[idx]), 0); return idx; } /* Add a cluster to discard list and schedule it to do discard */ static void swap_cluster_schedule_discard(struct swap_info_struct *si, unsigned int idx) { /* * If scan_swap_map_slots() can't find a free cluster, it will check * si->swap_map directly. To make sure the discarding cluster isn't * taken by scan_swap_map_slots(), mark the swap entries bad (occupied). * It will be cleared after discard */ memset(si->swap_map + idx * SWAPFILE_CLUSTER, SWAP_MAP_BAD, SWAPFILE_CLUSTER); cluster_list_add_tail(&si->discard_clusters, si->cluster_info, idx); schedule_work(&si->discard_work); } static void __free_cluster(struct swap_info_struct *si, unsigned long idx) { struct swap_cluster_info *ci = si->cluster_info; cluster_set_flag(ci + idx, CLUSTER_FLAG_FREE); cluster_list_add_tail(&si->free_clusters, ci, idx); } /* * Doing discard actually. After a cluster discard is finished, the cluster * will be added to free cluster list. caller should hold si->lock. */ static void swap_do_scheduled_discard(struct swap_info_struct *si) { struct swap_cluster_info *info, *ci; unsigned int idx; info = si->cluster_info; while (!cluster_list_empty(&si->discard_clusters)) { idx = cluster_list_del_first(&si->discard_clusters, info); spin_unlock(&si->lock); discard_swap_cluster(si, idx * SWAPFILE_CLUSTER, SWAPFILE_CLUSTER); spin_lock(&si->lock); ci = lock_cluster(si, idx * SWAPFILE_CLUSTER); __free_cluster(si, idx); memset(si->swap_map + idx * SWAPFILE_CLUSTER, 0, SWAPFILE_CLUSTER); unlock_cluster(ci); } } static void swap_discard_work(struct work_struct *work) { struct swap_info_struct *si; si = container_of(work, struct swap_info_struct, discard_work); spin_lock(&si->lock); swap_do_scheduled_discard(si); spin_unlock(&si->lock); } static void swap_users_ref_free(struct percpu_ref *ref) { struct swap_info_struct *si; si = container_of(ref, struct swap_info_struct, users); complete(&si->comp); } static void alloc_cluster(struct swap_info_struct *si, unsigned long idx) { struct swap_cluster_info *ci = si->cluster_info; VM_BUG_ON(cluster_list_first(&si->free_clusters) != idx); cluster_list_del_first(&si->free_clusters, ci); cluster_set_count_flag(ci + idx, 0, 0); } static void free_cluster(struct swap_info_struct *si, unsigned long idx) { struct swap_cluster_info *ci = si->cluster_info + idx; VM_BUG_ON(cluster_count(ci) != 0); /* * If the swap is discardable, prepare discard the cluster * instead of free it immediately. The cluster will be freed * after discard. */ if ((si->flags & (SWP_WRITEOK | SWP_PAGE_DISCARD)) == (SWP_WRITEOK | SWP_PAGE_DISCARD)) { swap_cluster_schedule_discard(si, idx); return; } __free_cluster(si, idx); } /* * The cluster corresponding to page_nr will be used. The cluster will be * removed from free cluster list and its usage counter will be increased by * count. */ static void add_cluster_info_page(struct swap_info_struct *p, struct swap_cluster_info *cluster_info, unsigned long page_nr, unsigned long count) { unsigned long idx = page_nr / SWAPFILE_CLUSTER; if (!cluster_info) return; if (cluster_is_free(&cluster_info[idx])) alloc_cluster(p, idx); VM_BUG_ON(cluster_count(&cluster_info[idx]) + count > SWAPFILE_CLUSTER); cluster_set_count(&cluster_info[idx], cluster_count(&cluster_info[idx]) + count); } /* * The cluster corresponding to page_nr will be used. The cluster will be * removed from free cluster list and its usage counter will be increased by 1. */ static void inc_cluster_info_page(struct swap_info_struct *p, struct swap_cluster_info *cluster_info, unsigned long page_nr) { add_cluster_info_page(p, cluster_info, page_nr, 1); } /* * The cluster corresponding to page_nr decreases one usage. If the usage * counter becomes 0, which means no page in the cluster is in using, we can * optionally discard the cluster and add it to free cluster list. */ static void dec_cluster_info_page(struct swap_info_struct *p, struct swap_cluster_info *cluster_info, unsigned long page_nr) { unsigned long idx = page_nr / SWAPFILE_CLUSTER; if (!cluster_info) return; VM_BUG_ON(cluster_count(&cluster_info[idx]) == 0); cluster_set_count(&cluster_info[idx], cluster_count(&cluster_info[idx]) - 1); if (cluster_count(&cluster_info[idx]) == 0) free_cluster(p, idx); } /* * It's possible scan_swap_map_slots() uses a free cluster in the middle of free * cluster list. Avoiding such abuse to avoid list corruption. */ static bool scan_swap_map_ssd_cluster_conflict(struct swap_info_struct *si, unsigned long offset, int order) { struct percpu_cluster *percpu_cluster; bool conflict; offset /= SWAPFILE_CLUSTER; conflict = !cluster_list_empty(&si->free_clusters) && offset != cluster_list_first(&si->free_clusters) && cluster_is_free(&si->cluster_info[offset]); if (!conflict) return false; percpu_cluster = this_cpu_ptr(si->percpu_cluster); percpu_cluster->next[order] = SWAP_NEXT_INVALID; return true; } static inline bool swap_range_empty(char *swap_map, unsigned int start, unsigned int nr_pages) { unsigned int i; for (i = 0; i < nr_pages; i++) { if (swap_map[start + i]) return false; } return true; } /* * Try to get swap entries with specified order from current cpu's swap entry * pool (a cluster). This might involve allocating a new cluster for current CPU * too. */ static bool scan_swap_map_try_ssd_cluster(struct swap_info_struct *si, unsigned long *offset, unsigned long *scan_base, int order) { unsigned int nr_pages = 1 << order; struct percpu_cluster *cluster; struct swap_cluster_info *ci; unsigned int tmp, max; new_cluster: cluster = this_cpu_ptr(si->percpu_cluster); tmp = cluster->next[order]; if (tmp == SWAP_NEXT_INVALID) { if (!cluster_list_empty(&si->free_clusters)) { tmp = cluster_next(&si->free_clusters.head) * SWAPFILE_CLUSTER; } else if (!cluster_list_empty(&si->discard_clusters)) { /* * we don't have free cluster but have some clusters in * discarding, do discard now and reclaim them, then * reread cluster_next_cpu since we dropped si->lock */ swap_do_scheduled_discard(si); *scan_base = this_cpu_read(*si->cluster_next_cpu); *offset = *scan_base; goto new_cluster; } else return false; } /* * Other CPUs can use our cluster if they can't find a free cluster, * check if there is still free entry in the cluster, maintaining * natural alignment. */ max = min_t(unsigned long, si->max, ALIGN(tmp + 1, SWAPFILE_CLUSTER)); if (tmp < max) { ci = lock_cluster(si, tmp); while (tmp < max) { if (swap_range_empty(si->swap_map, tmp, nr_pages)) break; tmp += nr_pages; } unlock_cluster(ci); } if (tmp >= max) { cluster->next[order] = SWAP_NEXT_INVALID; goto new_cluster; } *offset = tmp; *scan_base = tmp; tmp += nr_pages; cluster->next[order] = tmp < max ? tmp : SWAP_NEXT_INVALID; return true; } static void __del_from_avail_list(struct swap_info_struct *p) { int nid; assert_spin_locked(&p->lock); for_each_node(nid) plist_del(&p->avail_lists[nid], &swap_avail_heads[nid]); } static void del_from_avail_list(struct swap_info_struct *p) { spin_lock(&swap_avail_lock); __del_from_avail_list(p); spin_unlock(&swap_avail_lock); } static void swap_range_alloc(struct swap_info_struct *si, unsigned long offset, unsigned int nr_entries) { unsigned int end = offset + nr_entries - 1; if (offset == si->lowest_bit) si->lowest_bit += nr_entries; if (end == si->highest_bit) WRITE_ONCE(si->highest_bit, si->highest_bit - nr_entries); WRITE_ONCE(si->inuse_pages, si->inuse_pages + nr_entries); if (si->inuse_pages == si->pages) { si->lowest_bit = si->max; si->highest_bit = 0; del_from_avail_list(si); } } static void add_to_avail_list(struct swap_info_struct *p) { int nid; spin_lock(&swap_avail_lock); for_each_node(nid) plist_add(&p->avail_lists[nid], &swap_avail_heads[nid]); spin_unlock(&swap_avail_lock); } static void swap_range_free(struct swap_info_struct *si, unsigned long offset, unsigned int nr_entries) { unsigned long begin = offset; unsigned long end = offset + nr_entries - 1; void (*swap_slot_free_notify)(struct block_device *, unsigned long); if (offset < si->lowest_bit) si->lowest_bit = offset; if (end > si->highest_bit) { bool was_full = !si->highest_bit; WRITE_ONCE(si->highest_bit, end); if (was_full && (si->flags & SWP_WRITEOK)) add_to_avail_list(si); } if (si->flags & SWP_BLKDEV) swap_slot_free_notify = si->bdev->bd_disk->fops->swap_slot_free_notify; else swap_slot_free_notify = NULL; while (offset <= end) { arch_swap_invalidate_page(si->type, offset); if (swap_slot_free_notify) swap_slot_free_notify(si->bdev, offset); offset++; } clear_shadow_from_swap_cache(si->type, begin, end); /* * Make sure that try_to_unuse() observes si->inuse_pages reaching 0 * only after the above cleanups are done. */ smp_wmb(); atomic_long_add(nr_entries, &nr_swap_pages); WRITE_ONCE(si->inuse_pages, si->inuse_pages - nr_entries); } static void set_cluster_next(struct swap_info_struct *si, unsigned long next) { unsigned long prev; if (!(si->flags & SWP_SOLIDSTATE)) { si->cluster_next = next; return; } prev = this_cpu_read(*si->cluster_next_cpu); /* * Cross the swap address space size aligned trunk, choose * another trunk randomly to avoid lock contention on swap * address space if possible. */ if ((prev >> SWAP_ADDRESS_SPACE_SHIFT) != (next >> SWAP_ADDRESS_SPACE_SHIFT)) { /* No free swap slots available */ if (si->highest_bit <= si->lowest_bit) return; next = get_random_u32_inclusive(si->lowest_bit, si->highest_bit); next = ALIGN_DOWN(next, SWAP_ADDRESS_SPACE_PAGES); next = max_t(unsigned int, next, si->lowest_bit); } this_cpu_write(*si->cluster_next_cpu, next); } static bool swap_offset_available_and_locked(struct swap_info_struct *si, unsigned long offset) { if (data_race(!si->swap_map[offset])) { spin_lock(&si->lock); return true; } if (vm_swap_full() && READ_ONCE(si->swap_map[offset]) == SWAP_HAS_CACHE) { spin_lock(&si->lock); return true; } return false; } static int scan_swap_map_slots(struct swap_info_struct *si, unsigned char usage, int nr, swp_entry_t slots[], int order) { struct swap_cluster_info *ci; unsigned long offset; unsigned long scan_base; unsigned long last_in_cluster = 0; int latency_ration = LATENCY_LIMIT; unsigned int nr_pages = 1 << order; int n_ret = 0; bool scanned_many = false; /* * We try to cluster swap pages by allocating them sequentially * in swap. Once we've allocated SWAPFILE_CLUSTER pages this * way, however, we resort to first-free allocation, starting * a new cluster. This prevents us from scattering swap pages * all over the entire swap partition, so that we reduce * overall disk seek times between swap pages. -- sct * But we do now try to find an empty cluster. -Andrea * And we let swap pages go all over an SSD partition. Hugh */ if (order > 0) { /* * Should not even be attempting large allocations when huge * page swap is disabled. Warn and fail the allocation. */ if (!IS_ENABLED(CONFIG_THP_SWAP) || nr_pages > SWAPFILE_CLUSTER) { VM_WARN_ON_ONCE(1); return 0; } /* * Swapfile is not block device or not using clusters so unable * to allocate large entries. */ if (!(si->flags & SWP_BLKDEV) || !si->cluster_info) return 0; } si->flags += SWP_SCANNING; /* * Use percpu scan base for SSD to reduce lock contention on * cluster and swap cache. For HDD, sequential access is more * important. */ if (si->flags & SWP_SOLIDSTATE) scan_base = this_cpu_read(*si->cluster_next_cpu); else scan_base = si->cluster_next; offset = scan_base; /* SSD algorithm */ if (si->cluster_info) { if (!scan_swap_map_try_ssd_cluster(si, &offset, &scan_base, order)) { if (order > 0) goto no_page; goto scan; } } else if (unlikely(!si->cluster_nr--)) { if (si->pages - si->inuse_pages < SWAPFILE_CLUSTER) { si->cluster_nr = SWAPFILE_CLUSTER - 1; goto checks; } spin_unlock(&si->lock); /* * If seek is expensive, start searching for new cluster from * start of partition, to minimize the span of allocated swap. * If seek is cheap, that is the SWP_SOLIDSTATE si->cluster_info * case, just handled by scan_swap_map_try_ssd_cluster() above. */ scan_base = offset = si->lowest_bit; last_in_cluster = offset + SWAPFILE_CLUSTER - 1; /* Locate the first empty (unaligned) cluster */ for (; last_in_cluster <= READ_ONCE(si->highest_bit); offset++) { if (si->swap_map[offset]) last_in_cluster = offset + SWAPFILE_CLUSTER; else if (offset == last_in_cluster) { spin_lock(&si->lock); offset -= SWAPFILE_CLUSTER - 1; si->cluster_next = offset; si->cluster_nr = SWAPFILE_CLUSTER - 1; goto checks; } if (unlikely(--latency_ration < 0)) { cond_resched(); latency_ration = LATENCY_LIMIT; } } offset = scan_base; spin_lock(&si->lock); si->cluster_nr = SWAPFILE_CLUSTER - 1; } checks: if (si->cluster_info) { while (scan_swap_map_ssd_cluster_conflict(si, offset, order)) { /* take a break if we already got some slots */ if (n_ret) goto done; if (!scan_swap_map_try_ssd_cluster(si, &offset, &scan_base, order)) { if (order > 0) goto no_page; goto scan; } } } if (!(si->flags & SWP_WRITEOK)) goto no_page; if (!si->highest_bit) goto no_page; if (offset > si->highest_bit) scan_base = offset = si->lowest_bit; ci = lock_cluster(si, offset); /* reuse swap entry of cache-only swap if not busy. */ if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) { int swap_was_freed; unlock_cluster(ci); spin_unlock(&si->lock); swap_was_freed = __try_to_reclaim_swap(si, offset, TTRS_ANYWAY); spin_lock(&si->lock); /* entry was freed successfully, try to use this again */ if (swap_was_freed > 0) goto checks; goto scan; /* check next one */ } if (si->swap_map[offset]) { unlock_cluster(ci); if (!n_ret) goto scan; else goto done; } memset(si->swap_map + offset, usage, nr_pages); add_cluster_info_page(si, si->cluster_info, offset, nr_pages); unlock_cluster(ci); swap_range_alloc(si, offset, nr_pages); slots[n_ret++] = swp_entry(si->type, offset); /* got enough slots or reach max slots? */ if ((n_ret == nr) || (offset >= si->highest_bit)) goto done; /* search for next available slot */ /* time to take a break? */ if (unlikely(--latency_ration < 0)) { if (n_ret) goto done; spin_unlock(&si->lock); cond_resched(); spin_lock(&si->lock); latency_ration = LATENCY_LIMIT; } /* try to get more slots in cluster */ if (si->cluster_info) { if (scan_swap_map_try_ssd_cluster(si, &offset, &scan_base, order)) goto checks; if (order > 0) goto done; } else if (si->cluster_nr && !si->swap_map[++offset]) { /* non-ssd case, still more slots in cluster? */ --si->cluster_nr; goto checks; } /* * Even if there's no free clusters available (fragmented), * try to scan a little more quickly with lock held unless we * have scanned too many slots already. */ if (!scanned_many) { unsigned long scan_limit; if (offset < scan_base) scan_limit = scan_base; else scan_limit = si->highest_bit; for (; offset <= scan_limit && --latency_ration > 0; offset++) { if (!si->swap_map[offset]) goto checks; } } done: if (order == 0) set_cluster_next(si, offset + 1); si->flags -= SWP_SCANNING; return n_ret; scan: VM_WARN_ON(order > 0); spin_unlock(&si->lock); while (++offset <= READ_ONCE(si->highest_bit)) { if (unlikely(--latency_ration < 0)) { cond_resched(); latency_ration = LATENCY_LIMIT; scanned_many = true; } if (swap_offset_available_and_locked(si, offset)) goto checks; } offset = si->lowest_bit; while (offset < scan_base) { if (unlikely(--latency_ration < 0)) { cond_resched(); latency_ration = LATENCY_LIMIT; scanned_many = true; } if (swap_offset_available_and_locked(si, offset)) goto checks; offset++; } spin_lock(&si->lock); no_page: si->flags -= SWP_SCANNING; return n_ret; } static void swap_free_cluster(struct swap_info_struct *si, unsigned long idx) { unsigned long offset = idx * SWAPFILE_CLUSTER; struct swap_cluster_info *ci; ci = lock_cluster(si, offset); memset(si->swap_map + offset, 0, SWAPFILE_CLUSTER); cluster_set_count_flag(ci, 0, 0); free_cluster(si, idx); unlock_cluster(ci); swap_range_free(si, offset, SWAPFILE_CLUSTER); } int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_order) { int order = swap_entry_order(entry_order); unsigned long size = 1 << order; struct swap_info_struct *si, *next; long avail_pgs; int n_ret = 0; int node; spin_lock(&swap_avail_lock); avail_pgs = atomic_long_read(&nr_swap_pages) / size; if (avail_pgs <= 0) { spin_unlock(&swap_avail_lock); goto noswap; } n_goal = min3((long)n_goal, (long)SWAP_BATCH, avail_pgs); atomic_long_sub(n_goal * size, &nr_swap_pages); start_over: node = numa_node_id(); plist_for_each_entry_safe(si, next, &swap_avail_heads[node], avail_lists[node]) { /* requeue si to after same-priority siblings */ plist_requeue(&si->avail_lists[node], &swap_avail_heads[node]); spin_unlock(&swap_avail_lock); spin_lock(&si->lock); if (!si->highest_bit || !(si->flags & SWP_WRITEOK)) { spin_lock(&swap_avail_lock); if (plist_node_empty(&si->avail_lists[node])) { spin_unlock(&si->lock); goto nextsi; } WARN(!si->highest_bit, "swap_info %d in list but !highest_bit\n", si->type); WARN(!(si->flags & SWP_WRITEOK), "swap_info %d in list but !SWP_WRITEOK\n", si->type); __del_from_avail_list(si); spin_unlock(&si->lock); goto nextsi; } n_ret = scan_swap_map_slots(si, SWAP_HAS_CACHE, n_goal, swp_entries, order); spin_unlock(&si->lock); if (n_ret || size > 1) goto check_out; cond_resched(); spin_lock(&swap_avail_lock); nextsi: /* * if we got here, it's likely that si was almost full before, * and since scan_swap_map_slots() can drop the si->lock, * multiple callers probably all tried to get a page from the * same si and it filled up before we could get one; or, the si * filled up between us dropping swap_avail_lock and taking * si->lock. Since we dropped the swap_avail_lock, the * swap_avail_head list may have been modified; so if next is * still in the swap_avail_head list then try it, otherwise * start over if we have not gotten any slots. */ if (plist_node_empty(&next->avail_lists[node])) goto start_over; } spin_unlock(&swap_avail_lock); check_out: if (n_ret < n_goal) atomic_long_add((long)(n_goal - n_ret) * size, &nr_swap_pages); noswap: return n_ret; } static struct swap_info_struct *_swap_info_get(swp_entry_t entry) { struct swap_info_struct *p; unsigned long offset; if (!entry.val) goto out; p = swp_swap_info(entry); if (!p) goto bad_nofile; if (data_race(!(p->flags & SWP_USED))) goto bad_device; offset = swp_offset(entry); if (offset >= p->max) goto bad_offset; if (data_race(!p->swap_map[swp_offset(entry)])) goto bad_free; return p; bad_free: pr_err("%s: %s%08lx\n", __func__, Unused_offset, entry.val); goto out; bad_offset: pr_err("%s: %s%08lx\n", __func__, Bad_offset, entry.val); goto out; bad_device: pr_err("%s: %s%08lx\n", __func__, Unused_file, entry.val); goto out; bad_nofile: pr_err("%s: %s%08lx\n", __func__, Bad_file, entry.val); out: return NULL; } static struct swap_info_struct *swap_info_get_cont(swp_entry_t entry, struct swap_info_struct *q) { struct swap_info_struct *p; p = _swap_info_get(entry); if (p != q) { if (q != NULL) spin_unlock(&q->lock); if (p != NULL) spin_lock(&p->lock); } return p; } static unsigned char __swap_entry_free_locked(struct swap_info_struct *p, unsigned long offset, unsigned char usage) { unsigned char count; unsigned char has_cache; count = p->swap_map[offset]; has_cache = count & SWAP_HAS_CACHE; count &= ~SWAP_HAS_CACHE; if (usage == SWAP_HAS_CACHE) { VM_BUG_ON(!has_cache); has_cache = 0; } else if (count == SWAP_MAP_SHMEM) { /* * Or we could insist on shmem.c using a special * swap_shmem_free() and free_shmem_swap_and_cache()... */ count = 0; } else if ((count & ~COUNT_CONTINUED) <= SWAP_MAP_MAX) { if (count == COUNT_CONTINUED) { if (swap_count_continued(p, offset, count)) count = SWAP_MAP_MAX | COUNT_CONTINUED; else count = SWAP_MAP_MAX; } else count--; } usage = count | has_cache; if (usage) WRITE_ONCE(p->swap_map[offset], usage); else WRITE_ONCE(p->swap_map[offset], SWAP_HAS_CACHE); return usage; } /* * When we get a swap entry, if there aren't some other ways to * prevent swapoff, such as the folio in swap cache is locked, RCU * reader side is locked, etc., the swap entry may become invalid * because of swapoff. Then, we need to enclose all swap related * functions with get_swap_device() and put_swap_device(), unless the * swap functions call get/put_swap_device() by themselves. * * RCU reader side lock (including any spinlock) is sufficient to * prevent swapoff, because synchronize_rcu() is called in swapoff() * before freeing data structures. * * Check whether swap entry is valid in the swap device. If so, * return pointer to swap_info_struct, and keep the swap entry valid * via preventing the swap device from being swapoff, until * put_swap_device() is called. Otherwise return NULL. * * Notice that swapoff or swapoff+swapon can still happen before the * percpu_ref_tryget_live() in get_swap_device() or after the * percpu_ref_put() in put_swap_device() if there isn't any other way * to prevent swapoff. The caller must be prepared for that. For * example, the following situation is possible. * * CPU1 CPU2 * do_swap_page() * ... swapoff+swapon * __read_swap_cache_async() * swapcache_prepare() * __swap_duplicate() * // check swap_map * // verify PTE not changed * * In __swap_duplicate(), the swap_map need to be checked before * changing partly because the specified swap entry may be for another * swap device which has been swapoff. And in do_swap_page(), after * the page is read from the swap device, the PTE is verified not * changed with the page table locked to check whether the swap device * has been swapoff or swapoff+swapon. */ struct swap_info_struct *get_swap_device(swp_entry_t entry) { struct swap_info_struct *si; unsigned long offset; if (!entry.val) goto out; si = swp_swap_info(entry); if (!si) goto bad_nofile; if (!percpu_ref_tryget_live(&si->users)) goto out; /* * Guarantee the si->users are checked before accessing other * fields of swap_info_struct. * * Paired with the spin_unlock() after setup_swap_info() in * enable_swap_info(). */ smp_rmb(); offset = swp_offset(entry); if (offset >= si->max) goto put_out; return si; bad_nofile: pr_err("%s: %s%08lx\n", __func__, Bad_file, entry.val); out: return NULL; put_out: pr_err("%s: %s%08lx\n", __func__, Bad_offset, entry.val); percpu_ref_put(&si->users); return NULL; } static unsigned char __swap_entry_free(struct swap_info_struct *p, swp_entry_t entry) { struct swap_cluster_info *ci; unsigned long offset = swp_offset(entry); unsigned char usage; ci = lock_cluster_or_swap_info(p, offset); usage = __swap_entry_free_locked(p, offset, 1); unlock_cluster_or_swap_info(p, ci); if (!usage) free_swap_slot(entry); return usage; } static void swap_entry_free(struct swap_info_struct *p, swp_entry_t entry) { struct swap_cluster_info *ci; unsigned long offset = swp_offset(entry); unsigned char count; ci = lock_cluster(p, offset); count = p->swap_map[offset]; VM_BUG_ON(count != SWAP_HAS_CACHE); p->swap_map[offset] = 0; dec_cluster_info_page(p, p->cluster_info, offset); unlock_cluster(ci); mem_cgroup_uncharge_swap(entry, 1); swap_range_free(p, offset, 1); } static void cluster_swap_free_nr(struct swap_info_struct *sis, unsigned long offset, int nr_pages) { struct swap_cluster_info *ci; DECLARE_BITMAP(to_free, BITS_PER_LONG) = { 0 }; int i, nr; ci = lock_cluster_or_swap_info(sis, offset); while (nr_pages) { nr = min(BITS_PER_LONG, nr_pages); for (i = 0; i < nr; i++) { if (!__swap_entry_free_locked(sis, offset + i, 1)) bitmap_set(to_free, i, 1); } if (!bitmap_empty(to_free, BITS_PER_LONG)) { unlock_cluster_or_swap_info(sis, ci); for_each_set_bit(i, to_free, BITS_PER_LONG) free_swap_slot(swp_entry(sis->type, offset + i)); if (nr == nr_pages) return; bitmap_clear(to_free, 0, BITS_PER_LONG); ci = lock_cluster_or_swap_info(sis, offset); } offset += nr; nr_pages -= nr; } unlock_cluster_or_swap_info(sis, ci); } /* * Caller has made sure that the swap device corresponding to entry * is still around or has not been recycled. */ void swap_free_nr(swp_entry_t entry, int nr_pages) { int nr; struct swap_info_struct *sis; unsigned long offset = swp_offset(entry); sis = _swap_info_get(entry); if (!sis) return; while (nr_pages) { nr = min_t(int, nr_pages, SWAPFILE_CLUSTER - offset % SWAPFILE_CLUSTER); cluster_swap_free_nr(sis, offset, nr); offset += nr; nr_pages -= nr; } } /* * Called after dropping swapcache to decrease refcnt to swap entries. */ void put_swap_folio(struct folio *folio, swp_entry_t entry) { unsigned long offset = swp_offset(entry); unsigned long idx = offset / SWAPFILE_CLUSTER; struct swap_cluster_info *ci; struct swap_info_struct *si; unsigned char *map; unsigned int i, free_entries = 0; unsigned char val; int size = 1 << swap_entry_order(folio_order(folio)); si = _swap_info_get(entry); if (!si) return; ci = lock_cluster_or_swap_info(si, offset); if (size == SWAPFILE_CLUSTER) { map = si->swap_map + offset; for (i = 0; i < SWAPFILE_CLUSTER; i++) { val = map[i]; VM_BUG_ON(!(val & SWAP_HAS_CACHE)); if (val == SWAP_HAS_CACHE) free_entries++; } if (free_entries == SWAPFILE_CLUSTER) { unlock_cluster_or_swap_info(si, ci); spin_lock(&si->lock); mem_cgroup_uncharge_swap(entry, SWAPFILE_CLUSTER); swap_free_cluster(si, idx); spin_unlock(&si->lock); return; } } for (i = 0; i < size; i++, entry.val++) { if (!__swap_entry_free_locked(si, offset + i, SWAP_HAS_CACHE)) { unlock_cluster_or_swap_info(si, ci); free_swap_slot(entry); if (i == size - 1) return; lock_cluster_or_swap_info(si, offset); } } unlock_cluster_or_swap_info(si, ci); } static int swp_entry_cmp(const void *ent1, const void *ent2) { const swp_entry_t *e1 = ent1, *e2 = ent2; return (int)swp_type(*e1) - (int)swp_type(*e2); } void swapcache_free_entries(swp_entry_t *entries, int n) { struct swap_info_struct *p, *prev; int i; if (n <= 0) return; prev = NULL; p = NULL; /* * Sort swap entries by swap device, so each lock is only taken once. * nr_swapfiles isn't absolutely correct, but the overhead of sort() is * so low that it isn't necessary to optimize further. */ if (nr_swapfiles > 1) sort(entries, n, sizeof(entries[0]), swp_entry_cmp, NULL); for (i = 0; i < n; ++i) { p = swap_info_get_cont(entries[i], prev); if (p) swap_entry_free(p, entries[i]); prev = p; } if (p) spin_unlock(&p->lock); } int __swap_count(swp_entry_t entry) { struct swap_info_struct *si = swp_swap_info(entry); pgoff_t offset = swp_offset(entry); return swap_count(si->swap_map[offset]); } /* * How many references to @entry are currently swapped out? * This does not give an exact answer when swap count is continued, * but does include the high COUNT_CONTINUED flag to allow for that. */ int swap_swapcount(struct swap_info_struct *si, swp_entry_t entry) { pgoff_t offset = swp_offset(entry); struct swap_cluster_info *ci; int count; ci = lock_cluster_or_swap_info(si, offset); count = swap_count(si->swap_map[offset]); unlock_cluster_or_swap_info(si, ci); return count; } /* * How many references to @entry are currently swapped out? * This considers COUNT_CONTINUED so it returns exact answer. */ int swp_swapcount(swp_entry_t entry) { int count, tmp_count, n; struct swap_info_struct *p; struct swap_cluster_info *ci; struct page *page; pgoff_t offset; unsigned char *map; p = _swap_info_get(entry); if (!p) return 0; offset = swp_offset(entry); ci = lock_cluster_or_swap_info(p, offset); count = swap_count(p->swap_map[offset]); if (!(count & COUNT_CONTINUED)) goto out; count &= ~COUNT_CONTINUED; n = SWAP_MAP_MAX + 1; page = vmalloc_to_page(p->swap_map + offset); offset &= ~PAGE_MASK; VM_BUG_ON(page_private(page) != SWP_CONTINUED); do { page = list_next_entry(page, lru); map = kmap_local_page(page); tmp_count = map[offset]; kunmap_local(map); count += (tmp_count & ~COUNT_CONTINUED) * n; n *= (SWAP_CONT_MAX + 1); } while (tmp_count & COUNT_CONTINUED); out: unlock_cluster_or_swap_info(p, ci); return count; } static bool swap_page_trans_huge_swapped(struct swap_info_struct *si, swp_entry_t entry, int order) { struct swap_cluster_info *ci; unsigned char *map = si->swap_map; unsigned int nr_pages = 1 << order; unsigned long roffset = swp_offset(entry); unsigned long offset = round_down(roffset, nr_pages); int i; bool ret = false; ci = lock_cluster_or_swap_info(si, offset); if (!ci || nr_pages == 1) { if (swap_count(map[roffset])) ret = true; goto unlock_out; } for (i = 0; i < nr_pages; i++) { if (swap_count(map[offset + i])) { ret = true; break; } } unlock_out: unlock_cluster_or_swap_info(si, ci); return ret; } static bool folio_swapped(struct folio *folio) { swp_entry_t entry = folio->swap; struct swap_info_struct *si = _swap_info_get(entry); if (!si) return false; if (!IS_ENABLED(CONFIG_THP_SWAP) || likely(!folio_test_large(folio))) return swap_swapcount(si, entry) != 0; return swap_page_trans_huge_swapped(si, entry, folio_order(folio)); } /** * folio_free_swap() - Free the swap space used for this folio. * @folio: The folio to remove. * * If swap is getting full, or if there are no more mappings of this folio, * then call folio_free_swap to free its swap space. * * Return: true if we were able to release the swap space. */ bool folio_free_swap(struct folio *folio) { VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); if (!folio_test_swapcache(folio)) return false; if (folio_test_writeback(folio)) return false; if (folio_swapped(folio)) return false; /* * Once hibernation has begun to create its image of memory, * there's a danger that one of the calls to folio_free_swap() * - most probably a call from __try_to_reclaim_swap() while * hibernation is allocating its own swap pages for the image, * but conceivably even a call from memory reclaim - will free * the swap from a folio which has already been recorded in the * image as a clean swapcache folio, and then reuse its swap for * another page of the image. On waking from hibernation, the * original folio might be freed under memory pressure, then * later read back in from swap, now with the wrong data. * * Hibernation suspends storage while it is writing the image * to disk so check that here. */ if (pm_suspended_storage()) return false; delete_from_swap_cache(folio); folio_set_dirty(folio); return true; } /** * free_swap_and_cache_nr() - Release reference on range of swap entries and * reclaim their cache if no more references remain. * @entry: First entry of range. * @nr: Number of entries in range. * * For each swap entry in the contiguous range, release a reference. If any swap * entries become free, try to reclaim their underlying folios, if present. The * offset range is defined by [entry.offset, entry.offset + nr). */ void free_swap_and_cache_nr(swp_entry_t entry, int nr) { const unsigned long start_offset = swp_offset(entry); const unsigned long end_offset = start_offset + nr; unsigned int type = swp_type(entry); struct swap_info_struct *si; bool any_only_cache = false; unsigned long offset; unsigned char count; if (non_swap_entry(entry)) return; si = get_swap_device(entry); if (!si) return; if (WARN_ON(end_offset > si->max)) goto out; /* * First free all entries in the range. */ for (offset = start_offset; offset < end_offset; offset++) { if (data_race(si->swap_map[offset])) { count = __swap_entry_free(si, swp_entry(type, offset)); if (count == SWAP_HAS_CACHE) any_only_cache = true; } else { WARN_ON_ONCE(1); } } /* * Short-circuit the below loop if none of the entries had their * reference drop to zero. */ if (!any_only_cache) goto out; /* * Now go back over the range trying to reclaim the swap cache. This is * more efficient for large folios because we will only try to reclaim * the swap once per folio in the common case. If we do * __swap_entry_free() and __try_to_reclaim_swap() in the same loop, the * latter will get a reference and lock the folio for every individual * page but will only succeed once the swap slot for every subpage is * zero. */ for (offset = start_offset; offset < end_offset; offset += nr) { nr = 1; if (READ_ONCE(si->swap_map[offset]) == SWAP_HAS_CACHE) { /* * Folios are always naturally aligned in swap so * advance forward to the next boundary. Zero means no * folio was found for the swap entry, so advance by 1 * in this case. Negative value means folio was found * but could not be reclaimed. Here we can still advance * to the next boundary. */ nr = __try_to_reclaim_swap(si, offset, TTRS_UNMAPPED | TTRS_FULL); if (nr == 0) nr = 1; else if (nr < 0) nr = -nr; nr = ALIGN(offset + 1, nr) - offset; } } out: put_swap_device(si); } #ifdef CONFIG_HIBERNATION swp_entry_t get_swap_page_of_type(int type) { struct swap_info_struct *si = swap_type_to_swap_info(type); swp_entry_t entry = {0}; if (!si) goto fail; /* This is called for allocating swap entry, not cache */ spin_lock(&si->lock); if ((si->flags & SWP_WRITEOK) && scan_swap_map_slots(si, 1, 1, &entry, 0)) atomic_long_dec(&nr_swap_pages); spin_unlock(&si->lock); fail: return entry; } /* * Find the swap type that corresponds to given device (if any). * * @offset - number of the PAGE_SIZE-sized block of the device, starting * from 0, in which the swap header is expected to be located. * * This is needed for the suspend to disk (aka swsusp). */ int swap_type_of(dev_t device, sector_t offset) { int type; if (!device) return -1; spin_lock(&swap_lock); for (type = 0; type < nr_swapfiles; type++) { struct swap_info_struct *sis = swap_info[type]; if (!(sis->flags & SWP_WRITEOK)) continue; if (device == sis->bdev->bd_dev) { struct swap_extent *se = first_se(sis); if (se->start_block == offset) { spin_unlock(&swap_lock); return type; } } } spin_unlock(&swap_lock); return -ENODEV; } int find_first_swap(dev_t *device) { int type; spin_lock(&swap_lock); for (type = 0; type < nr_swapfiles; type++) { struct swap_info_struct *sis = swap_info[type]; if (!(sis->flags & SWP_WRITEOK)) continue; *device = sis->bdev->bd_dev; spin_unlock(&swap_lock); return type; } spin_unlock(&swap_lock); return -ENODEV; } /* * Get the (PAGE_SIZE) block corresponding to given offset on the swapdev * corresponding to given index in swap_info (swap type). */ sector_t swapdev_block(int type, pgoff_t offset) { struct swap_info_struct *si = swap_type_to_swap_info(type); struct swap_extent *se; if (!si || !(si->flags & SWP_WRITEOK)) return 0; se = offset_to_swap_extent(si, offset); return se->start_block + (offset - se->start_page); } /* * Return either the total number of swap pages of given type, or the number * of free pages of that type (depending on @free) * * This is needed for software suspend */ unsigned int count_swap_pages(int type, int free) { unsigned int n = 0; spin_lock(&swap_lock); if ((unsigned int)type < nr_swapfiles) { struct swap_info_struct *sis = swap_info[type]; spin_lock(&sis->lock); if (sis->flags & SWP_WRITEOK) { n = sis->pages; if (free) n -= sis->inuse_pages; } spin_unlock(&sis->lock); } spin_unlock(&swap_lock); return n; } #endif /* CONFIG_HIBERNATION */ static inline int pte_same_as_swp(pte_t pte, pte_t swp_pte) { return pte_same(pte_swp_clear_flags(pte), swp_pte); } /* * No need to decide whether this PTE shares the swap entry with others, * just let do_wp_page work it out if a write is requested later - to * force COW, vm_page_prot omits write permission from any private vma. */ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, swp_entry_t entry, struct folio *folio) { struct page *page; struct folio *swapcache; spinlock_t *ptl; pte_t *pte, new_pte, old_pte; bool hwpoisoned = false; int ret = 1; swapcache = folio; folio = ksm_might_need_to_copy(folio, vma, addr); if (unlikely(!folio)) return -ENOMEM; else if (unlikely(folio == ERR_PTR(-EHWPOISON))) { hwpoisoned = true; folio = swapcache; } page = folio_file_page(folio, swp_offset(entry)); if (PageHWPoison(page)) hwpoisoned = true; pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); if (unlikely(!pte || !pte_same_as_swp(ptep_get(pte), swp_entry_to_pte(entry)))) { ret = 0; goto out; } old_pte = ptep_get(pte); if (unlikely(hwpoisoned || !folio_test_uptodate(folio))) { swp_entry_t swp_entry; dec_mm_counter(vma->vm_mm, MM_SWAPENTS); if (hwpoisoned) { swp_entry = make_hwpoison_entry(page); } else { swp_entry = make_poisoned_swp_entry(); } new_pte = swp_entry_to_pte(swp_entry); ret = 0; goto setpte; } /* * Some architectures may have to restore extra metadata to the page * when reading from swap. This metadata may be indexed by swap entry * so this must be called before swap_free(). */ arch_swap_restore(folio_swap(entry, folio), folio); dec_mm_counter(vma->vm_mm, MM_SWAPENTS); inc_mm_counter(vma->vm_mm, MM_ANONPAGES); folio_get(folio); if (folio == swapcache) { rmap_t rmap_flags = RMAP_NONE; /* * See do_swap_page(): writeback would be problematic. * However, we do a folio_wait_writeback() just before this * call and have the folio locked. */ VM_BUG_ON_FOLIO(folio_test_writeback(folio), folio); if (pte_swp_exclusive(old_pte)) rmap_flags |= RMAP_EXCLUSIVE; /* * We currently only expect small !anon folios, which are either * fully exclusive or fully shared. If we ever get large folios * here, we have to be careful. */ if (!folio_test_anon(folio)) { VM_WARN_ON_ONCE(folio_test_large(folio)); VM_WARN_ON_FOLIO(!folio_test_locked(folio), folio); folio_add_new_anon_rmap(folio, vma, addr, rmap_flags); } else { folio_add_anon_rmap_pte(folio, page, vma, addr, rmap_flags); } } else { /* ksm created a completely new copy */ folio_add_new_anon_rmap(folio, vma, addr, RMAP_EXCLUSIVE); folio_add_lru_vma(folio, vma); } new_pte = pte_mkold(mk_pte(page, vma->vm_page_prot)); if (pte_swp_soft_dirty(old_pte)) new_pte = pte_mksoft_dirty(new_pte); if (pte_swp_uffd_wp(old_pte)) new_pte = pte_mkuffd_wp(new_pte); setpte: set_pte_at(vma->vm_mm, addr, pte, new_pte); swap_free(entry); out: if (pte) pte_unmap_unlock(pte, ptl); if (folio != swapcache) { folio_unlock(folio); folio_put(folio); } return ret; } static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, unsigned long end, unsigned int type) { pte_t *pte = NULL; struct swap_info_struct *si; si = swap_info[type]; do { struct folio *folio; unsigned long offset; unsigned char swp_count; swp_entry_t entry; int ret; pte_t ptent; if (!pte++) { pte = pte_offset_map(pmd, addr); if (!pte) break; } ptent = ptep_get_lockless(pte); if (!is_swap_pte(ptent)) continue; entry = pte_to_swp_entry(ptent); if (swp_type(entry) != type) continue; offset = swp_offset(entry); pte_unmap(pte); pte = NULL; folio = swap_cache_get_folio(entry, vma, addr); if (!folio) { struct page *page; struct vm_fault vmf = { .vma = vma, .address = addr, .real_address = addr, .pmd = pmd, }; page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE, &vmf); if (page) folio = page_folio(page); } if (!folio) { swp_count = READ_ONCE(si->swap_map[offset]); if (swp_count == 0 || swp_count == SWAP_MAP_BAD) continue; return -ENOMEM; } folio_lock(folio); folio_wait_writeback(folio); ret = unuse_pte(vma, pmd, addr, entry, folio); if (ret < 0) { folio_unlock(folio); folio_put(folio); return ret; } folio_free_swap(folio); folio_unlock(folio); folio_put(folio); } while (addr += PAGE_SIZE, addr != end); if (pte) pte_unmap(pte); return 0; } static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud, unsigned long addr, unsigned long end, unsigned int type) { pmd_t *pmd; unsigned long next; int ret; pmd = pmd_offset(pud, addr); do { cond_resched(); next = pmd_addr_end(addr, end); ret = unuse_pte_range(vma, pmd, addr, next, type); if (ret) return ret; } while (pmd++, addr = next, addr != end); return 0; } static inline int unuse_pud_range(struct vm_area_struct *vma, p4d_t *p4d, unsigned long addr, unsigned long end, unsigned int type) { pud_t *pud; unsigned long next; int ret; pud = pud_offset(p4d, addr); do { next = pud_addr_end(addr, end); if (pud_none_or_clear_bad(pud)) continue; ret = unuse_pmd_range(vma, pud, addr, next, type); if (ret) return ret; } while (pud++, addr = next, addr != end); return 0; } static inline int unuse_p4d_range(struct vm_area_struct *vma, pgd_t *pgd, unsigned long addr, unsigned long end, unsigned int type) { p4d_t *p4d; unsigned long next; int ret; p4d = p4d_offset(pgd, addr); do { next = p4d_addr_end(addr, end); if (p4d_none_or_clear_bad(p4d)) continue; ret = unuse_pud_range(vma, p4d, addr, next, type); if (ret) return ret; } while (p4d++, addr = next, addr != end); return 0; } static int unuse_vma(struct vm_area_struct *vma, unsigned int type) { pgd_t *pgd; unsigned long addr, end, next; int ret; addr = vma->vm_start; end = vma->vm_end; pgd = pgd_offset(vma->vm_mm, addr); do { next = pgd_addr_end(addr, end); if (pgd_none_or_clear_bad(pgd)) continue; ret = unuse_p4d_range(vma, pgd, addr, next, type); if (ret) return ret; } while (pgd++, addr = next, addr != end); return 0; } static int unuse_mm(struct mm_struct *mm, unsigned int type) { struct vm_area_struct *vma; int ret = 0; VMA_ITERATOR(vmi, mm, 0); mmap_read_lock(mm); for_each_vma(vmi, vma) { if (vma->anon_vma) { ret = unuse_vma(vma, type); if (ret) break; } cond_resched(); } mmap_read_unlock(mm); return ret; } /* * Scan swap_map from current position to next entry still in use. * Return 0 if there are no inuse entries after prev till end of * the map. */ static unsigned int find_next_to_unuse(struct swap_info_struct *si, unsigned int prev) { unsigned int i; unsigned char count; /* * No need for swap_lock here: we're just looking * for whether an entry is in use, not modifying it; false * hits are okay, and sys_swapoff() has already prevented new * allocations from this area (while holding swap_lock). */ for (i = prev + 1; i < si->max; i++) { count = READ_ONCE(si->swap_map[i]); if (count && swap_count(count) != SWAP_MAP_BAD) break; if ((i % LATENCY_LIMIT) == 0) cond_resched(); } if (i == si->max) i = 0; return i; } static int try_to_unuse(unsigned int type) { struct mm_struct *prev_mm; struct mm_struct *mm; struct list_head *p; int retval = 0; struct swap_info_struct *si = swap_info[type]; struct folio *folio; swp_entry_t entry; unsigned int i; if (!READ_ONCE(si->inuse_pages)) goto success; retry: retval = shmem_unuse(type); if (retval) return retval; prev_mm = &init_mm; mmget(prev_mm); spin_lock(&mmlist_lock); p = &init_mm.mmlist; while (READ_ONCE(si->inuse_pages) && !signal_pending(current) && (p = p->next) != &init_mm.mmlist) { mm = list_entry(p, struct mm_struct, mmlist); if (!mmget_not_zero(mm)) continue; spin_unlock(&mmlist_lock); mmput(prev_mm); prev_mm = mm; retval = unuse_mm(mm, type); if (retval) { mmput(prev_mm); return retval; } /* * Make sure that we aren't completely killing * interactive performance. */ cond_resched(); spin_lock(&mmlist_lock); } spin_unlock(&mmlist_lock); mmput(prev_mm); i = 0; while (READ_ONCE(si->inuse_pages) && !signal_pending(current) && (i = find_next_to_unuse(si, i)) != 0) { entry = swp_entry(type, i); folio = filemap_get_folio(swap_address_space(entry), swap_cache_index(entry)); if (IS_ERR(folio)) continue; /* * It is conceivable that a racing task removed this folio from * swap cache just before we acquired the page lock. The folio * might even be back in swap cache on another swap area. But * that is okay, folio_free_swap() only removes stale folios. */ folio_lock(folio); folio_wait_writeback(folio); folio_free_swap(folio); folio_unlock(folio); folio_put(folio); } /* * Lets check again to see if there are still swap entries in the map. * If yes, we would need to do retry the unuse logic again. * Under global memory pressure, swap entries can be reinserted back * into process space after the mmlist loop above passes over them. * * Limit the number of retries? No: when mmget_not_zero() * above fails, that mm is likely to be freeing swap from * exit_mmap(), which proceeds at its own independent pace; * and even shmem_writepage() could have been preempted after * folio_alloc_swap(), temporarily hiding that swap. It's easy * and robust (though cpu-intensive) just to keep retrying. */ if (READ_ONCE(si->inuse_pages)) { if (!signal_pending(current)) goto retry; return -EINTR; } success: /* * Make sure that further cleanups after try_to_unuse() returns happen * after swap_range_free() reduces si->inuse_pages to 0. */ smp_mb(); return 0; } /* * After a successful try_to_unuse, if no swap is now in use, we know * we can empty the mmlist. swap_lock must be held on entry and exit. * Note that mmlist_lock nests inside swap_lock, and an mm must be * added to the mmlist just after page_duplicate - before would be racy. */ static void drain_mmlist(void) { struct list_head *p, *next; unsigned int type; for (type = 0; type < nr_swapfiles; type++) if (swap_info[type]->inuse_pages) return; spin_lock(&mmlist_lock); list_for_each_safe(p, next, &init_mm.mmlist) list_del_init(p); spin_unlock(&mmlist_lock); } /* * Free all of a swapdev's extent information */ static void destroy_swap_extents(struct swap_info_struct *sis) { while (!RB_EMPTY_ROOT(&sis->swap_extent_root)) { struct rb_node *rb = sis->swap_extent_root.rb_node; struct swap_extent *se = rb_entry(rb, struct swap_extent, rb_node); rb_erase(rb, &sis->swap_extent_root); kfree(se); } if (sis->flags & SWP_ACTIVATED) { struct file *swap_file = sis->swap_file; struct address_space *mapping = swap_file->f_mapping; sis->flags &= ~SWP_ACTIVATED; if (mapping->a_ops->swap_deactivate) mapping->a_ops->swap_deactivate(swap_file); } } /* * Add a block range (and the corresponding page range) into this swapdev's * extent tree. * * This function rather assumes that it is called in ascending page order. */ int add_swap_extent(struct swap_info_struct *sis, unsigned long start_page, unsigned long nr_pages, sector_t start_block) { struct rb_node **link = &sis->swap_extent_root.rb_node, *parent = NULL; struct swap_extent *se; struct swap_extent *new_se; /* * place the new node at the right most since the * function is called in ascending page order. */ while (*link) { parent = *link; link = &parent->rb_right; } if (parent) { se = rb_entry(parent, struct swap_extent, rb_node); BUG_ON(se->start_page + se->nr_pages != start_page); if (se->start_block + se->nr_pages == start_block) { /* Merge it */ se->nr_pages += nr_pages; return 0; } } /* No merge, insert a new extent. */ new_se = kmalloc(sizeof(*se), GFP_KERNEL); if (new_se == NULL) return -ENOMEM; new_se->start_page = start_page; new_se->nr_pages = nr_pages; new_se->start_block = start_block; rb_link_node(&new_se->rb_node, parent, link); rb_insert_color(&new_se->rb_node, &sis->swap_extent_root); return 1; } EXPORT_SYMBOL_GPL(add_swap_extent); /* * A `swap extent' is a simple thing which maps a contiguous range of pages * onto a contiguous range of disk blocks. A rbtree of swap extents is * built at swapon time and is then used at swap_writepage/swap_read_folio * time for locating where on disk a page belongs. * * If the swapfile is an S_ISBLK block device, a single extent is installed. * This is done so that the main operating code can treat S_ISBLK and S_ISREG * swap files identically. * * Whether the swapdev is an S_ISREG file or an S_ISBLK blockdev, the swap * extent rbtree operates in PAGE_SIZE disk blocks. Both S_ISREG and S_ISBLK * swapfiles are handled *identically* after swapon time. * * For S_ISREG swapfiles, setup_swap_extents() will walk all the file's blocks * and will parse them into a rbtree, in PAGE_SIZE chunks. If some stray * blocks are found which do not fall within the PAGE_SIZE alignment * requirements, they are simply tossed out - we will never use those blocks * for swapping. * * For all swap devices we set S_SWAPFILE across the life of the swapon. This * prevents users from writing to the swap device, which will corrupt memory. * * The amount of disk space which a single swap extent represents varies. * Typically it is in the 1-4 megabyte range. So we can have hundreds of * extents in the rbtree. - akpm. */ static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span) { struct file *swap_file = sis->swap_file; struct address_space *mapping = swap_file->f_mapping; struct inode *inode = mapping->host; int ret; if (S_ISBLK(inode->i_mode)) { ret = add_swap_extent(sis, 0, sis->max, 0); *span = sis->pages; return ret; } if (mapping->a_ops->swap_activate) { ret = mapping->a_ops->swap_activate(sis, swap_file, span); if (ret < 0) return ret; sis->flags |= SWP_ACTIVATED; if ((sis->flags & SWP_FS_OPS) && sio_pool_init() != 0) { destroy_swap_extents(sis); return -ENOMEM; } return ret; } return generic_swapfile_activate(sis, swap_file, span); } static int swap_node(struct swap_info_struct *p) { struct block_device *bdev; if (p->bdev) bdev = p->bdev; else bdev = p->swap_file->f_inode->i_sb->s_bdev; return bdev ? bdev->bd_disk->node_id : NUMA_NO_NODE; } static void setup_swap_info(struct swap_info_struct *p, int prio, unsigned char *swap_map, struct swap_cluster_info *cluster_info) { int i; if (prio >= 0) p->prio = prio; else p->prio = --least_priority; /* * the plist prio is negated because plist ordering is * low-to-high, while swap ordering is high-to-low */ p->list.prio = -p->prio; for_each_node(i) { if (p->prio >= 0) p->avail_lists[i].prio = -p->prio; else { if (swap_node(p) == i) p->avail_lists[i].prio = 1; else p->avail_lists[i].prio = -p->prio; } } p->swap_map = swap_map; p->cluster_info = cluster_info; } static void _enable_swap_info(struct swap_info_struct *p) { p->flags |= SWP_WRITEOK; atomic_long_add(p->pages, &nr_swap_pages); total_swap_pages += p->pages; assert_spin_locked(&swap_lock); /* * both lists are plists, and thus priority ordered. * swap_active_head needs to be priority ordered for swapoff(), * which on removal of any swap_info_struct with an auto-assigned * (i.e. negative) priority increments the auto-assigned priority * of any lower-priority swap_info_structs. * swap_avail_head needs to be priority ordered for folio_alloc_swap(), * which allocates swap pages from the highest available priority * swap_info_struct. */ plist_add(&p->list, &swap_active_head); /* add to available list iff swap device is not full */ if (p->highest_bit) add_to_avail_list(p); } static void enable_swap_info(struct swap_info_struct *p, int prio, unsigned char *swap_map, struct swap_cluster_info *cluster_info) { spin_lock(&swap_lock); spin_lock(&p->lock); setup_swap_info(p, prio, swap_map, cluster_info); spin_unlock(&p->lock); spin_unlock(&swap_lock); /* * Finished initializing swap device, now it's safe to reference it. */ percpu_ref_resurrect(&p->users); spin_lock(&swap_lock); spin_lock(&p->lock); _enable_swap_info(p); spin_unlock(&p->lock); spin_unlock(&swap_lock); } static void reinsert_swap_info(struct swap_info_struct *p) { spin_lock(&swap_lock); spin_lock(&p->lock); setup_swap_info(p, p->prio, p->swap_map, p->cluster_info); _enable_swap_info(p); spin_unlock(&p->lock); spin_unlock(&swap_lock); } static bool __has_usable_swap(void) { return !plist_head_empty(&swap_active_head); } bool has_usable_swap(void) { bool ret; spin_lock(&swap_lock); ret = __has_usable_swap(); spin_unlock(&swap_lock); return ret; } SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) { struct swap_info_struct *p = NULL; unsigned char *swap_map; struct swap_cluster_info *cluster_info; struct file *swap_file, *victim; struct address_space *mapping; struct inode *inode; struct filename *pathname; int err, found = 0; if (!capable(CAP_SYS_ADMIN)) return -EPERM; BUG_ON(!current->mm); pathname = getname(specialfile); if (IS_ERR(pathname)) return PTR_ERR(pathname); victim = file_open_name(pathname, O_RDWR|O_LARGEFILE, 0); err = PTR_ERR(victim); if (IS_ERR(victim)) goto out; mapping = victim->f_mapping; spin_lock(&swap_lock); plist_for_each_entry(p, &swap_active_head, list) { if (p->flags & SWP_WRITEOK) { if (p->swap_file->f_mapping == mapping) { found = 1; break; } } } if (!found) { err = -EINVAL; spin_unlock(&swap_lock); goto out_dput; } if (!security_vm_enough_memory_mm(current->mm, p->pages)) vm_unacct_memory(p->pages); else { err = -ENOMEM; spin_unlock(&swap_lock); goto out_dput; } spin_lock(&p->lock); del_from_avail_list(p); if (p->prio < 0) { struct swap_info_struct *si = p; int nid; plist_for_each_entry_continue(si, &swap_active_head, list) { si->prio++; si->list.prio--; for_each_node(nid) { if (si->avail_lists[nid].prio != 1) si->avail_lists[nid].prio--; } } least_priority++; } plist_del(&p->list, &swap_active_head); atomic_long_sub(p->pages, &nr_swap_pages); total_swap_pages -= p->pages; p->flags &= ~SWP_WRITEOK; spin_unlock(&p->lock); spin_unlock(&swap_lock); disable_swap_slots_cache_lock(); set_current_oom_origin(); err = try_to_unuse(p->type); clear_current_oom_origin(); if (err) { /* re-insert swap space back into swap_list */ reinsert_swap_info(p); reenable_swap_slots_cache_unlock(); goto out_dput; } reenable_swap_slots_cache_unlock(); /* * Wait for swap operations protected by get/put_swap_device() * to complete. Because of synchronize_rcu() here, all swap * operations protected by RCU reader side lock (including any * spinlock) will be waited too. This makes it easy to * prevent folio_test_swapcache() and the following swap cache * operations from racing with swapoff. */ percpu_ref_kill(&p->users); synchronize_rcu(); wait_for_completion(&p->comp); flush_work(&p->discard_work); destroy_swap_extents(p); if (p->flags & SWP_CONTINUED) free_swap_count_continuations(p); if (!p->bdev || !bdev_nonrot(p->bdev)) atomic_dec(&nr_rotate_swap); mutex_lock(&swapon_mutex); spin_lock(&swap_lock); spin_lock(&p->lock); drain_mmlist(); /* wait for anyone still in scan_swap_map_slots */ p->highest_bit = 0; /* cuts scans short */ while (p->flags >= SWP_SCANNING) { spin_unlock(&p->lock); spin_unlock(&swap_lock); schedule_timeout_uninterruptible(1); spin_lock(&swap_lock); spin_lock(&p->lock); } swap_file = p->swap_file; p->swap_file = NULL; p->max = 0; swap_map = p->swap_map; p->swap_map = NULL; cluster_info = p->cluster_info; p->cluster_info = NULL; spin_unlock(&p->lock); spin_unlock(&swap_lock); arch_swap_invalidate_area(p->type); zswap_swapoff(p->type); mutex_unlock(&swapon_mutex); free_percpu(p->percpu_cluster); p->percpu_cluster = NULL; free_percpu(p->cluster_next_cpu); p->cluster_next_cpu = NULL; vfree(swap_map); kvfree(cluster_info); /* Destroy swap account information */ swap_cgroup_swapoff(p->type); exit_swap_address_space(p->type); inode = mapping->host; inode_lock(inode); inode->i_flags &= ~S_SWAPFILE; inode_unlock(inode); filp_close(swap_file, NULL); /* * Clear the SWP_USED flag after all resources are freed so that swapon * can reuse this swap_info in alloc_swap_info() safely. It is ok to * not hold p->lock after we cleared its SWP_WRITEOK. */ spin_lock(&swap_lock); p->flags = 0; spin_unlock(&swap_lock); err = 0; atomic_inc(&proc_poll_event); wake_up_interruptible(&proc_poll_wait); out_dput: filp_close(victim, NULL); out: putname(pathname); return err; } #ifdef CONFIG_PROC_FS static __poll_t swaps_poll(struct file *file, poll_table *wait) { struct seq_file *seq = file->private_data; poll_wait(file, &proc_poll_wait, wait); if (seq->poll_event != atomic_read(&proc_poll_event)) { seq->poll_event = atomic_read(&proc_poll_event); return EPOLLIN | EPOLLRDNORM | EPOLLERR | EPOLLPRI; } return EPOLLIN | EPOLLRDNORM; } /* iterator */ static void *swap_start(struct seq_file *swap, loff_t *pos) { struct swap_info_struct *si; int type; loff_t l = *pos; mutex_lock(&swapon_mutex); if (!l) return SEQ_START_TOKEN; for (type = 0; (si = swap_type_to_swap_info(type)); type++) { if (!(si->flags & SWP_USED) || !si->swap_map) continue; if (!--l) return si; } return NULL; } static void *swap_next(struct seq_file *swap, void *v, loff_t *pos) { struct swap_info_struct *si = v; int type; if (v == SEQ_START_TOKEN) type = 0; else type = si->type + 1; ++(*pos); for (; (si = swap_type_to_swap_info(type)); type++) { if (!(si->flags & SWP_USED) || !si->swap_map) continue; return si; } return NULL; } static void swap_stop(struct seq_file *swap, void *v) { mutex_unlock(&swapon_mutex); } static int swap_show(struct seq_file *swap, void *v) { struct swap_info_struct *si = v; struct file *file; int len; unsigned long bytes, inuse; if (si == SEQ_START_TOKEN) { seq_puts(swap, "Filename\t\t\t\tType\t\tSize\t\tUsed\t\tPriority\n"); return 0; } bytes = K(si->pages); inuse = K(READ_ONCE(si->inuse_pages)); file = si->swap_file; len = seq_file_path(swap, file, " \t\n\\"); seq_printf(swap, "%*s%s\t%lu\t%s%lu\t%s%d\n", len < 40 ? 40 - len : 1, " ", S_ISBLK(file_inode(file)->i_mode) ? "partition" : "file\t", bytes, bytes < 10000000 ? "\t" : "", inuse, inuse < 10000000 ? "\t" : "", si->prio); return 0; } static const struct seq_operations swaps_op = { .start = swap_start, .next = swap_next, .stop = swap_stop, .show = swap_show }; static int swaps_open(struct inode *inode, struct file *file) { struct seq_file *seq; int ret; ret = seq_open(file, &swaps_op); if (ret) return ret; seq = file->private_data; seq->poll_event = atomic_read(&proc_poll_event); return 0; } static const struct proc_ops swaps_proc_ops = { .proc_flags = PROC_ENTRY_PERMANENT, .proc_open = swaps_open, .proc_read = seq_read, .proc_lseek = seq_lseek, .proc_release = seq_release, .proc_poll = swaps_poll, }; static int __init procswaps_init(void) { proc_create("swaps", 0, NULL, &swaps_proc_ops); return 0; } __initcall(procswaps_init); #endif /* CONFIG_PROC_FS */ #ifdef MAX_SWAPFILES_CHECK static int __init max_swapfiles_check(void) { MAX_SWAPFILES_CHECK(); return 0; } late_initcall(max_swapfiles_check); #endif static struct swap_info_struct *alloc_swap_info(void) { struct swap_info_struct *p; struct swap_info_struct *defer = NULL; unsigned int type; int i; p = kvzalloc(struct_size(p, avail_lists, nr_node_ids), GFP_KERNEL); if (!p) return ERR_PTR(-ENOMEM); if (percpu_ref_init(&p->users, swap_users_ref_free, PERCPU_REF_INIT_DEAD, GFP_KERNEL)) { kvfree(p); return ERR_PTR(-ENOMEM); } spin_lock(&swap_lock); for (type = 0; type < nr_swapfiles; type++) { if (!(swap_info[type]->flags & SWP_USED)) break; } if (type >= MAX_SWAPFILES) { spin_unlock(&swap_lock); percpu_ref_exit(&p->users); kvfree(p); return ERR_PTR(-EPERM); } if (type >= nr_swapfiles) { p->type = type; /* * Publish the swap_info_struct after initializing it. * Note that kvzalloc() above zeroes all its fields. */ smp_store_release(&swap_info[type], p); /* rcu_assign_pointer() */ nr_swapfiles++; } else { defer = p; p = swap_info[type]; /* * Do not memset this entry: a racing procfs swap_next() * would be relying on p->type to remain valid. */ } p->swap_extent_root = RB_ROOT; plist_node_init(&p->list, 0); for_each_node(i) plist_node_init(&p->avail_lists[i], 0); p->flags = SWP_USED; spin_unlock(&swap_lock); if (defer) { percpu_ref_exit(&defer->users); kvfree(defer); } spin_lock_init(&p->lock); spin_lock_init(&p->cont_lock); init_completion(&p->comp); return p; } static int claim_swapfile(struct swap_info_struct *p, struct inode *inode) { if (S_ISBLK(inode->i_mode)) { p->bdev = I_BDEV(inode); /* * Zoned block devices contain zones that have a sequential * write only restriction. Hence zoned block devices are not * suitable for swapping. Disallow them here. */ if (bdev_is_zoned(p->bdev)) return -EINVAL; p->flags |= SWP_BLKDEV; } else if (S_ISREG(inode->i_mode)) { p->bdev = inode->i_sb->s_bdev; } return 0; } /* * Find out how many pages are allowed for a single swap device. There * are two limiting factors: * 1) the number of bits for the swap offset in the swp_entry_t type, and * 2) the number of bits in the swap pte, as defined by the different * architectures. * * In order to find the largest possible bit mask, a swap entry with * swap type 0 and swap offset ~0UL is created, encoded to a swap pte, * decoded to a swp_entry_t again, and finally the swap offset is * extracted. * * This will mask all the bits from the initial ~0UL mask that can't * be encoded in either the swp_entry_t or the architecture definition * of a swap pte. */ unsigned long generic_max_swapfile_size(void) { return swp_offset(pte_to_swp_entry( swp_entry_to_pte(swp_entry(0, ~0UL)))) + 1; } /* Can be overridden by an architecture for additional checks. */ __weak unsigned long arch_max_swapfile_size(void) { return generic_max_swapfile_size(); } static unsigned long read_swap_header(struct swap_info_struct *p, union swap_header *swap_header, struct inode *inode) { int i; unsigned long maxpages; unsigned long swapfilepages; unsigned long last_page; if (memcmp("SWAPSPACE2", swap_header->magic.magic, 10)) { pr_err("Unable to find swap-space signature\n"); return 0; } /* swap partition endianness hack... */ if (swab32(swap_header->info.version) == 1) { swab32s(&swap_header->info.version); swab32s(&swap_header->info.last_page); swab32s(&swap_header->info.nr_badpages); if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES) return 0; for (i = 0; i < swap_header->info.nr_badpages; i++) swab32s(&swap_header->info.badpages[i]); } /* Check the swap header's sub-version */ if (swap_header->info.version != 1) { pr_warn("Unable to handle swap header version %d\n", swap_header->info.version); return 0; } p->lowest_bit = 1; p->cluster_next = 1; p->cluster_nr = 0; maxpages = swapfile_maximum_size; last_page = swap_header->info.last_page; if (!last_page) { pr_warn("Empty swap-file\n"); return 0; } if (last_page > maxpages) { pr_warn("Truncating oversized swap area, only using %luk out of %luk\n", K(maxpages), K(last_page)); } if (maxpages > last_page) { maxpages = last_page + 1; /* p->max is an unsigned int: don't overflow it */ if ((unsigned int)maxpages == 0) maxpages = UINT_MAX; } p->highest_bit = maxpages - 1; if (!maxpages) return 0; swapfilepages = i_size_read(inode) >> PAGE_SHIFT; if (swapfilepages && maxpages > swapfilepages) { pr_warn("Swap area shorter than signature indicates\n"); return 0; } if (swap_header->info.nr_badpages && S_ISREG(inode->i_mode)) return 0; if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES) return 0; return maxpages; } #define SWAP_CLUSTER_INFO_COLS \ DIV_ROUND_UP(L1_CACHE_BYTES, sizeof(struct swap_cluster_info)) #define SWAP_CLUSTER_SPACE_COLS \ DIV_ROUND_UP(SWAP_ADDRESS_SPACE_PAGES, SWAPFILE_CLUSTER) #define SWAP_CLUSTER_COLS \ max_t(unsigned int, SWAP_CLUSTER_INFO_COLS, SWAP_CLUSTER_SPACE_COLS) static int setup_swap_map_and_extents(struct swap_info_struct *p, union swap_header *swap_header, unsigned char *swap_map, struct swap_cluster_info *cluster_info, unsigned long maxpages, sector_t *span) { unsigned int j, k; unsigned int nr_good_pages; int nr_extents; unsigned long nr_clusters = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER); unsigned long col = p->cluster_next / SWAPFILE_CLUSTER % SWAP_CLUSTER_COLS; unsigned long i, idx; nr_good_pages = maxpages - 1; /* omit header page */ cluster_list_init(&p->free_clusters); cluster_list_init(&p->discard_clusters); for (i = 0; i < swap_header->info.nr_badpages; i++) { unsigned int page_nr = swap_header->info.badpages[i]; if (page_nr == 0 || page_nr > swap_header->info.last_page) return -EINVAL; if (page_nr < maxpages) { swap_map[page_nr] = SWAP_MAP_BAD; nr_good_pages--; /* * Haven't marked the cluster free yet, no list * operation involved */ inc_cluster_info_page(p, cluster_info, page_nr); } } /* Haven't marked the cluster free yet, no list operation involved */ for (i = maxpages; i < round_up(maxpages, SWAPFILE_CLUSTER); i++) inc_cluster_info_page(p, cluster_info, i); if (nr_good_pages) { swap_map[0] = SWAP_MAP_BAD; /* * Not mark the cluster free yet, no list * operation involved */ inc_cluster_info_page(p, cluster_info, 0); p->max = maxpages; p->pages = nr_good_pages; nr_extents = setup_swap_extents(p, span); if (nr_extents < 0) return nr_extents; nr_good_pages = p->pages; } if (!nr_good_pages) { pr_warn("Empty swap-file\n"); return -EINVAL; } if (!cluster_info) return nr_extents; /* * Reduce false cache line sharing between cluster_info and * sharing same address space. */ for (k = 0; k < SWAP_CLUSTER_COLS; k++) { j = (k + col) % SWAP_CLUSTER_COLS; for (i = 0; i < DIV_ROUND_UP(nr_clusters, SWAP_CLUSTER_COLS); i++) { idx = i * SWAP_CLUSTER_COLS + j; if (idx >= nr_clusters) continue; if (cluster_count(&cluster_info[idx])) continue; cluster_set_flag(&cluster_info[idx], CLUSTER_FLAG_FREE); cluster_list_add_tail(&p->free_clusters, cluster_info, idx); } } return nr_extents; } SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) { struct swap_info_struct *p; struct filename *name; struct file *swap_file = NULL; struct address_space *mapping; struct dentry *dentry; int prio; int error; union swap_header *swap_header; int nr_extents; sector_t span; unsigned long maxpages; unsigned char *swap_map = NULL; struct swap_cluster_info *cluster_info = NULL; struct page *page = NULL; struct inode *inode = NULL; bool inced_nr_rotate_swap = false; if (swap_flags & ~SWAP_FLAGS_VALID) return -EINVAL; if (!capable(CAP_SYS_ADMIN)) return -EPERM; if (!swap_avail_heads) return -ENOMEM; p = alloc_swap_info(); if (IS_ERR(p)) return PTR_ERR(p); INIT_WORK(&p->discard_work, swap_discard_work); name = getname(specialfile); if (IS_ERR(name)) { error = PTR_ERR(name); name = NULL; goto bad_swap; } swap_file = file_open_name(name, O_RDWR | O_LARGEFILE | O_EXCL, 0); if (IS_ERR(swap_file)) { error = PTR_ERR(swap_file); swap_file = NULL; goto bad_swap; } p->swap_file = swap_file; mapping = swap_file->f_mapping; dentry = swap_file->f_path.dentry; inode = mapping->host; error = claim_swapfile(p, inode); if (unlikely(error)) goto bad_swap; inode_lock(inode); if (d_unlinked(dentry) || cant_mount(dentry)) { error = -ENOENT; goto bad_swap_unlock_inode; } if (IS_SWAPFILE(inode)) { error = -EBUSY; goto bad_swap_unlock_inode; } /* * Read the swap header. */ if (!mapping->a_ops->read_folio) { error = -EINVAL; goto bad_swap_unlock_inode; } page = read_mapping_page(mapping, 0, swap_file); if (IS_ERR(page)) { error = PTR_ERR(page); goto bad_swap_unlock_inode; } swap_header = kmap(page); maxpages = read_swap_header(p, swap_header, inode); if (unlikely(!maxpages)) { error = -EINVAL; goto bad_swap_unlock_inode; } /* OK, set up the swap map and apply the bad block list */ swap_map = vzalloc(maxpages); if (!swap_map) { error = -ENOMEM; goto bad_swap_unlock_inode; } if (p->bdev && bdev_stable_writes(p->bdev)) p->flags |= SWP_STABLE_WRITES; if (p->bdev && bdev_synchronous(p->bdev)) p->flags |= SWP_SYNCHRONOUS_IO; if (p->bdev && bdev_nonrot(p->bdev)) { int cpu, i; unsigned long ci, nr_cluster; p->flags |= SWP_SOLIDSTATE; p->cluster_next_cpu = alloc_percpu(unsigned int); if (!p->cluster_next_cpu) { error = -ENOMEM; goto bad_swap_unlock_inode; } /* * select a random position to start with to help wear leveling * SSD */ for_each_possible_cpu(cpu) { per_cpu(*p->cluster_next_cpu, cpu) = get_random_u32_inclusive(1, p->highest_bit); } nr_cluster = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER); cluster_info = kvcalloc(nr_cluster, sizeof(*cluster_info), GFP_KERNEL); if (!cluster_info) { error = -ENOMEM; goto bad_swap_unlock_inode; } for (ci = 0; ci < nr_cluster; ci++) spin_lock_init(&((cluster_info + ci)->lock)); p->percpu_cluster = alloc_percpu(struct percpu_cluster); if (!p->percpu_cluster) { error = -ENOMEM; goto bad_swap_unlock_inode; } for_each_possible_cpu(cpu) { struct percpu_cluster *cluster; cluster = per_cpu_ptr(p->percpu_cluster, cpu); for (i = 0; i < SWAP_NR_ORDERS; i++) cluster->next[i] = SWAP_NEXT_INVALID; } } else { atomic_inc(&nr_rotate_swap); inced_nr_rotate_swap = true; } error = swap_cgroup_swapon(p->type, maxpages); if (error) goto bad_swap_unlock_inode; nr_extents = setup_swap_map_and_extents(p, swap_header, swap_map, cluster_info, maxpages, &span); if (unlikely(nr_extents < 0)) { error = nr_extents; goto bad_swap_unlock_inode; } if ((swap_flags & SWAP_FLAG_DISCARD) && p->bdev && bdev_max_discard_sectors(p->bdev)) { /* * When discard is enabled for swap with no particular * policy flagged, we set all swap discard flags here in * order to sustain backward compatibility with older * swapon(8) releases. */ p->flags |= (SWP_DISCARDABLE | SWP_AREA_DISCARD | SWP_PAGE_DISCARD); /* * By flagging sys_swapon, a sysadmin can tell us to * either do single-time area discards only, or to just * perform discards for released swap page-clusters. * Now it's time to adjust the p->flags accordingly. */ if (swap_flags & SWAP_FLAG_DISCARD_ONCE) p->flags &= ~SWP_PAGE_DISCARD; else if (swap_flags & SWAP_FLAG_DISCARD_PAGES) p->flags &= ~SWP_AREA_DISCARD; /* issue a swapon-time discard if it's still required */ if (p->flags & SWP_AREA_DISCARD) { int err = discard_swap(p); if (unlikely(err)) pr_err("swapon: discard_swap(%p): %d\n", p, err); } } error = init_swap_address_space(p->type, maxpages); if (error) goto bad_swap_unlock_inode; error = zswap_swapon(p->type, maxpages); if (error) goto free_swap_address_space; /* * Flush any pending IO and dirty mappings before we start using this * swap device. */ inode->i_flags |= S_SWAPFILE; error = inode_drain_writes(inode); if (error) { inode->i_flags &= ~S_SWAPFILE; goto free_swap_zswap; } mutex_lock(&swapon_mutex); prio = -1; if (swap_flags & SWAP_FLAG_PREFER) prio = (swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT; enable_swap_info(p, prio, swap_map, cluster_info); pr_info("Adding %uk swap on %s. Priority:%d extents:%d across:%lluk %s%s%s%s\n", K(p->pages), name->name, p->prio, nr_extents, K((unsigned long long)span), (p->flags & SWP_SOLIDSTATE) ? "SS" : "", (p->flags & SWP_DISCARDABLE) ? "D" : "", (p->flags & SWP_AREA_DISCARD) ? "s" : "", (p->flags & SWP_PAGE_DISCARD) ? "c" : ""); mutex_unlock(&swapon_mutex); atomic_inc(&proc_poll_event); wake_up_interruptible(&proc_poll_wait); error = 0; goto out; free_swap_zswap: zswap_swapoff(p->type); free_swap_address_space: exit_swap_address_space(p->type); bad_swap_unlock_inode: inode_unlock(inode); bad_swap: free_percpu(p->percpu_cluster); p->percpu_cluster = NULL; free_percpu(p->cluster_next_cpu); p->cluster_next_cpu = NULL; inode = NULL; destroy_swap_extents(p); swap_cgroup_swapoff(p->type); spin_lock(&swap_lock); p->swap_file = NULL; p->flags = 0; spin_unlock(&swap_lock); vfree(swap_map); kvfree(cluster_info); if (inced_nr_rotate_swap) atomic_dec(&nr_rotate_swap); if (swap_file) filp_close(swap_file, NULL); out: if (page && !IS_ERR(page)) { kunmap(page); put_page(page); } if (name) putname(name); if (inode) inode_unlock(inode); if (!error) enable_swap_slots_cache(); return error; } void si_swapinfo(struct sysinfo *val) { unsigned int type; unsigned long nr_to_be_unused = 0; spin_lock(&swap_lock); for (type = 0; type < nr_swapfiles; type++) { struct swap_info_struct *si = swap_info[type]; if ((si->flags & SWP_USED) && !(si->flags & SWP_WRITEOK)) nr_to_be_unused += READ_ONCE(si->inuse_pages); } val->freeswap = atomic_long_read(&nr_swap_pages) + nr_to_be_unused; val->totalswap = total_swap_pages + nr_to_be_unused; spin_unlock(&swap_lock); } /* * Verify that a swap entry is valid and increment its swap map count. * * Returns error code in following case. * - success -> 0 * - swp_entry is invalid -> EINVAL * - swp_entry is migration entry -> EINVAL * - swap-cache reference is requested but there is already one. -> EEXIST * - swap-cache reference is requested but the entry is not used. -> ENOENT * - swap-mapped reference requested but needs continued swap count. -> ENOMEM */ static int __swap_duplicate(swp_entry_t entry, unsigned char usage) { struct swap_info_struct *p; struct swap_cluster_info *ci; unsigned long offset; unsigned char count; unsigned char has_cache; int err; p = swp_swap_info(entry); offset = swp_offset(entry); ci = lock_cluster_or_swap_info(p, offset); count = p->swap_map[offset]; /* * swapin_readahead() doesn't check if a swap entry is valid, so the * swap entry could be SWAP_MAP_BAD. Check here with lock held. */ if (unlikely(swap_count(count) == SWAP_MAP_BAD)) { err = -ENOENT; goto unlock_out; } has_cache = count & SWAP_HAS_CACHE; count &= ~SWAP_HAS_CACHE; err = 0; if (usage == SWAP_HAS_CACHE) { /* set SWAP_HAS_CACHE if there is no cache and entry is used */ if (!has_cache && count) has_cache = SWAP_HAS_CACHE; else if (has_cache) /* someone else added cache */ err = -EEXIST; else /* no users remaining */ err = -ENOENT; } else if (count || has_cache) { if ((count & ~COUNT_CONTINUED) < SWAP_MAP_MAX) count += usage; else if ((count & ~COUNT_CONTINUED) > SWAP_MAP_MAX) err = -EINVAL; else if (swap_count_continued(p, offset, count)) count = COUNT_CONTINUED; else err = -ENOMEM; } else err = -ENOENT; /* unused swap entry */ if (!err) WRITE_ONCE(p->swap_map[offset], count | has_cache); unlock_out: unlock_cluster_or_swap_info(p, ci); return err; } /* * Help swapoff by noting that swap entry belongs to shmem/tmpfs * (in which case its reference count is never incremented). */ void swap_shmem_alloc(swp_entry_t entry) { __swap_duplicate(entry, SWAP_MAP_SHMEM); } /* * Increase reference count of swap entry by 1. * Returns 0 for success, or -ENOMEM if a swap_count_continuation is required * but could not be atomically allocated. Returns 0, just as if it succeeded, * if __swap_duplicate() fails for another reason (-EINVAL or -ENOENT), which * might occur if a page table entry has got corrupted. */ int swap_duplicate(swp_entry_t entry) { int err = 0; while (!err && __swap_duplicate(entry, 1) == -ENOMEM) err = add_swap_count_continuation(entry, GFP_ATOMIC); return err; } /* * @entry: swap entry for which we allocate swap cache. * * Called when allocating swap cache for existing swap entry, * This can return error codes. Returns 0 at success. * -EEXIST means there is a swap cache. * Note: return code is different from swap_duplicate(). */ int swapcache_prepare(swp_entry_t entry) { return __swap_duplicate(entry, SWAP_HAS_CACHE); } void swapcache_clear(struct swap_info_struct *si, swp_entry_t entry) { struct swap_cluster_info *ci; unsigned long offset = swp_offset(entry); unsigned char usage; ci = lock_cluster_or_swap_info(si, offset); usage = __swap_entry_free_locked(si, offset, SWAP_HAS_CACHE); unlock_cluster_or_swap_info(si, ci); if (!usage) free_swap_slot(entry); } struct swap_info_struct *swp_swap_info(swp_entry_t entry) { return swap_type_to_swap_info(swp_type(entry)); } /* * out-of-line methods to avoid include hell. */ struct address_space *swapcache_mapping(struct folio *folio) { return swp_swap_info(folio->swap)->swap_file->f_mapping; } EXPORT_SYMBOL_GPL(swapcache_mapping); pgoff_t __folio_swap_cache_index(struct folio *folio) { return swap_cache_index(folio->swap); } EXPORT_SYMBOL_GPL(__folio_swap_cache_index); /* * add_swap_count_continuation - called when a swap count is duplicated * beyond SWAP_MAP_MAX, it allocates a new page and links that to the entry's * page of the original vmalloc'ed swap_map, to hold the continuation count * (for that entry and for its neighbouring PAGE_SIZE swap entries). Called * again when count is duplicated beyond SWAP_MAP_MAX * SWAP_CONT_MAX, etc. * * These continuation pages are seldom referenced: the common paths all work * on the original swap_map, only referring to a continuation page when the * low "digit" of a count is incremented or decremented through SWAP_MAP_MAX. * * add_swap_count_continuation(, GFP_ATOMIC) can be called while holding * page table locks; if it fails, add_swap_count_continuation(, GFP_KERNEL) * can be called after dropping locks. */ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask) { struct swap_info_struct *si; struct swap_cluster_info *ci; struct page *head; struct page *page; struct page *list_page; pgoff_t offset; unsigned char count; int ret = 0; /* * When debugging, it's easier to use __GFP_ZERO here; but it's better * for latency not to zero a page while GFP_ATOMIC and holding locks. */ page = alloc_page(gfp_mask | __GFP_HIGHMEM); si = get_swap_device(entry); if (!si) { /* * An acceptable race has occurred since the failing * __swap_duplicate(): the swap device may be swapoff */ goto outer; } spin_lock(&si->lock); offset = swp_offset(entry); ci = lock_cluster(si, offset); count = swap_count(si->swap_map[offset]); if ((count & ~COUNT_CONTINUED) != SWAP_MAP_MAX) { /* * The higher the swap count, the more likely it is that tasks * will race to add swap count continuation: we need to avoid * over-provisioning. */ goto out; } if (!page) { ret = -ENOMEM; goto out; } head = vmalloc_to_page(si->swap_map + offset); offset &= ~PAGE_MASK; spin_lock(&si->cont_lock); /* * Page allocation does not initialize the page's lru field, * but it does always reset its private field. */ if (!page_private(head)) { BUG_ON(count & COUNT_CONTINUED); INIT_LIST_HEAD(&head->lru); set_page_private(head, SWP_CONTINUED); si->flags |= SWP_CONTINUED; } list_for_each_entry(list_page, &head->lru, lru) { unsigned char *map; /* * If the previous map said no continuation, but we've found * a continuation page, free our allocation and use this one. */ if (!(count & COUNT_CONTINUED)) goto out_unlock_cont; map = kmap_local_page(list_page) + offset; count = *map; kunmap_local(map); /* * If this continuation count now has some space in it, * free our allocation and use this one. */ if ((count & ~COUNT_CONTINUED) != SWAP_CONT_MAX) goto out_unlock_cont; } list_add_tail(&page->lru, &head->lru); page = NULL; /* now it's attached, don't free it */ out_unlock_cont: spin_unlock(&si->cont_lock); out: unlock_cluster(ci); spin_unlock(&si->lock); put_swap_device(si); outer: if (page) __free_page(page); return ret; } /* * swap_count_continued - when the original swap_map count is incremented * from SWAP_MAP_MAX, check if there is already a continuation page to carry * into, carry if so, or else fail until a new continuation page is allocated; * when the original swap_map count is decremented from 0 with continuation, * borrow from the continuation and report whether it still holds more. * Called while __swap_duplicate() or swap_entry_free() holds swap or cluster * lock. */ static bool swap_count_continued(struct swap_info_struct *si, pgoff_t offset, unsigned char count) { struct page *head; struct page *page; unsigned char *map; bool ret; head = vmalloc_to_page(si->swap_map + offset); if (page_private(head) != SWP_CONTINUED) { BUG_ON(count & COUNT_CONTINUED); return false; /* need to add count continuation */ } spin_lock(&si->cont_lock); offset &= ~PAGE_MASK; page = list_next_entry(head, lru); map = kmap_local_page(page) + offset; if (count == SWAP_MAP_MAX) /* initial increment from swap_map */ goto init_map; /* jump over SWAP_CONT_MAX checks */ if (count == (SWAP_MAP_MAX | COUNT_CONTINUED)) { /* incrementing */ /* * Think of how you add 1 to 999 */ while (*map == (SWAP_CONT_MAX | COUNT_CONTINUED)) { kunmap_local(map); page = list_next_entry(page, lru); BUG_ON(page == head); map = kmap_local_page(page) + offset; } if (*map == SWAP_CONT_MAX) { kunmap_local(map); page = list_next_entry(page, lru); if (page == head) { ret = false; /* add count continuation */ goto out; } map = kmap_local_page(page) + offset; init_map: *map = 0; /* we didn't zero the page */ } *map += 1; kunmap_local(map); while ((page = list_prev_entry(page, lru)) != head) { map = kmap_local_page(page) + offset; *map = COUNT_CONTINUED; kunmap_local(map); } ret = true; /* incremented */ } else { /* decrementing */ /* * Think of how you subtract 1 from 1000 */ BUG_ON(count != COUNT_CONTINUED); while (*map == COUNT_CONTINUED) { kunmap_local(map); page = list_next_entry(page, lru); BUG_ON(page == head); map = kmap_local_page(page) + offset; } BUG_ON(*map == 0); *map -= 1; if (*map == 0) count = 0; kunmap_local(map); while ((page = list_prev_entry(page, lru)) != head) { map = kmap_local_page(page) + offset; *map = SWAP_CONT_MAX | count; count = COUNT_CONTINUED; kunmap_local(map); } ret = count == COUNT_CONTINUED; } out: spin_unlock(&si->cont_lock); return ret; } /* * free_swap_count_continuations - swapoff free all the continuation pages * appended to the swap_map, after swap_map is quiesced, before vfree'ing it. */ static void free_swap_count_continuations(struct swap_info_struct *si) { pgoff_t offset; for (offset = 0; offset < si->max; offset += PAGE_SIZE) { struct page *head; head = vmalloc_to_page(si->swap_map + offset); if (page_private(head)) { struct page *page, *next; list_for_each_entry_safe(page, next, &head->lru, lru) { list_del(&page->lru); __free_page(page); } } } } #if defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP) void __folio_throttle_swaprate(struct folio *folio, gfp_t gfp) { struct swap_info_struct *si, *next; int nid = folio_nid(folio); if (!(gfp & __GFP_IO)) return; if (!__has_usable_swap()) return; if (!blk_cgroup_congested()) return; /* * We've already scheduled a throttle, avoid taking the global swap * lock. */ if (current->throttle_disk) return; spin_lock(&swap_avail_lock); plist_for_each_entry_safe(si, next, &swap_avail_heads[nid], avail_lists[nid]) { if (si->bdev) { blkcg_schedule_throttle(si->bdev->bd_disk, true); break; } } spin_unlock(&swap_avail_lock); } #endif static int __init swapfile_init(void) { int nid; swap_avail_heads = kmalloc_array(nr_node_ids, sizeof(struct plist_head), GFP_KERNEL); if (!swap_avail_heads) { pr_emerg("Not enough memory for swap heads, swap is disabled\n"); return -ENOMEM; } for_each_node(nid) plist_head_init(&swap_avail_heads[nid]); swapfile_maximum_size = arch_max_swapfile_size(); #ifdef CONFIG_MIGRATION if (swapfile_maximum_size >= (1UL << SWP_MIG_TOTAL_BITS)) swap_migration_ad_supported = true; #endif /* CONFIG_MIGRATION */ return 0; } subsys_initcall(swapfile_init); |
11 11 6 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 | // SPDX-License-Identifier: GPL-2.0 /* Copyright (C) B.A.T.M.A.N. contributors: * * Antonio Quartulli */ #include "bat_v_ogm.h" #include "main.h" #include <linux/atomic.h> #include <linux/byteorder/generic.h> #include <linux/container_of.h> #include <linux/errno.h> #include <linux/etherdevice.h> #include <linux/gfp.h> #include <linux/if_ether.h> #include <linux/jiffies.h> #include <linux/kref.h> #include <linux/list.h> #include <linux/lockdep.h> #include <linux/minmax.h> #include <linux/mutex.h> #include <linux/netdevice.h> #include <linux/random.h> #include <linux/rculist.h> #include <linux/rcupdate.h> #include <linux/skbuff.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/stddef.h> #include <linux/string.h> #include <linux/types.h> #include <linux/workqueue.h> #include <uapi/linux/batadv_packet.h> #include "bat_algo.h" #include "hard-interface.h" #include "hash.h" #include "log.h" #include "originator.h" #include "routing.h" #include "send.h" #include "translation-table.h" #include "tvlv.h" /** * batadv_v_ogm_orig_get() - retrieve and possibly create an originator node * @bat_priv: the bat priv with all the soft interface information * @addr: the address of the originator * * Return: the orig_node corresponding to the specified address. If such an * object does not exist, it is allocated here. In case of allocation failure * returns NULL. */ struct batadv_orig_node *batadv_v_ogm_orig_get(struct batadv_priv *bat_priv, const u8 *addr) { struct batadv_orig_node *orig_node; int hash_added; orig_node = batadv_orig_hash_find(bat_priv, addr); if (orig_node) return orig_node; orig_node = batadv_orig_node_new(bat_priv, addr); if (!orig_node) return NULL; kref_get(&orig_node->refcount); hash_added = batadv_hash_add(bat_priv->orig_hash, batadv_compare_orig, batadv_choose_orig, orig_node, &orig_node->hash_entry); if (hash_added != 0) { /* remove refcnt for newly created orig_node and hash entry */ batadv_orig_node_put(orig_node); batadv_orig_node_put(orig_node); orig_node = NULL; } return orig_node; } /** * batadv_v_ogm_start_queue_timer() - restart the OGM aggregation timer * @hard_iface: the interface to use to send the OGM */ static void batadv_v_ogm_start_queue_timer(struct batadv_hard_iface *hard_iface) { unsigned int msecs = BATADV_MAX_AGGREGATION_MS * 1000; /* msecs * [0.9, 1.1] */ msecs += get_random_u32_below(msecs / 5) - (msecs / 10); queue_delayed_work(batadv_event_workqueue, &hard_iface->bat_v.aggr_wq, msecs_to_jiffies(msecs / 1000)); } /** * batadv_v_ogm_start_timer() - restart the OGM sending timer * @bat_priv: the bat priv with all the soft interface information */ static void batadv_v_ogm_start_timer(struct batadv_priv *bat_priv) { unsigned long msecs; /* this function may be invoked in different contexts (ogm rescheduling * or hard_iface activation), but the work timer should not be reset */ if (delayed_work_pending(&bat_priv->bat_v.ogm_wq)) return; msecs = atomic_read(&bat_priv->orig_interval) - BATADV_JITTER; msecs += get_random_u32_below(2 * BATADV_JITTER); queue_delayed_work(batadv_event_workqueue, &bat_priv->bat_v.ogm_wq, msecs_to_jiffies(msecs)); } /** * batadv_v_ogm_send_to_if() - send a batman ogm using a given interface * @skb: the OGM to send * @hard_iface: the interface to use to send the OGM */ static void batadv_v_ogm_send_to_if(struct sk_buff *skb, struct batadv_hard_iface *hard_iface) { struct batadv_priv *bat_priv = netdev_priv(hard_iface->soft_iface); if (hard_iface->if_status != BATADV_IF_ACTIVE) { kfree_skb(skb); return; } batadv_inc_counter(bat_priv, BATADV_CNT_MGMT_TX); batadv_add_counter(bat_priv, BATADV_CNT_MGMT_TX_BYTES, skb->len + ETH_HLEN); batadv_send_broadcast_skb(skb, hard_iface); } /** * batadv_v_ogm_len() - OGMv2 packet length * @skb: the OGM to check * * Return: Length of the given OGMv2 packet, including tvlv length, excluding * ethernet header length. */ static unsigned int batadv_v_ogm_len(struct sk_buff *skb) { struct batadv_ogm2_packet *ogm_packet; ogm_packet = (struct batadv_ogm2_packet *)skb->data; return BATADV_OGM2_HLEN + ntohs(ogm_packet->tvlv_len); } /** * batadv_v_ogm_queue_left() - check if given OGM still fits aggregation queue * @skb: the OGM to check * @hard_iface: the interface to use to send the OGM * * Caller needs to hold the hard_iface->bat_v.aggr_list.lock. * * Return: True, if the given OGMv2 packet still fits, false otherwise. */ static bool batadv_v_ogm_queue_left(struct sk_buff *skb, struct batadv_hard_iface *hard_iface) { unsigned int max = min_t(unsigned int, hard_iface->net_dev->mtu, BATADV_MAX_AGGREGATION_BYTES); unsigned int ogm_len = batadv_v_ogm_len(skb); lockdep_assert_held(&hard_iface->bat_v.aggr_list.lock); return hard_iface->bat_v.aggr_len + ogm_len <= max; } /** * batadv_v_ogm_aggr_list_free - free all elements in an aggregation queue * @hard_iface: the interface holding the aggregation queue * * Empties the OGMv2 aggregation queue and frees all the skbs it contains. * * Caller needs to hold the hard_iface->bat_v.aggr_list.lock. */ static void batadv_v_ogm_aggr_list_free(struct batadv_hard_iface *hard_iface) { lockdep_assert_held(&hard_iface->bat_v.aggr_list.lock); __skb_queue_purge(&hard_iface->bat_v.aggr_list); hard_iface->bat_v.aggr_len = 0; } /** * batadv_v_ogm_aggr_send() - flush & send aggregation queue * @hard_iface: the interface with the aggregation queue to flush * * Aggregates all OGMv2 packets currently in the aggregation queue into a * single OGMv2 packet and transmits this aggregate. * * The aggregation queue is empty after this call. * * Caller needs to hold the hard_iface->bat_v.aggr_list.lock. */ static void batadv_v_ogm_aggr_send(struct batadv_hard_iface *hard_iface) { unsigned int aggr_len = hard_iface->bat_v.aggr_len; struct sk_buff *skb_aggr; unsigned int ogm_len; struct sk_buff *skb; lockdep_assert_held(&hard_iface->bat_v.aggr_list.lock); if (!aggr_len) return; skb_aggr = dev_alloc_skb(aggr_len + ETH_HLEN + NET_IP_ALIGN); if (!skb_aggr) { batadv_v_ogm_aggr_list_free(hard_iface); return; } skb_reserve(skb_aggr, ETH_HLEN + NET_IP_ALIGN); skb_reset_network_header(skb_aggr); while ((skb = __skb_dequeue(&hard_iface->bat_v.aggr_list))) { hard_iface->bat_v.aggr_len -= batadv_v_ogm_len(skb); ogm_len = batadv_v_ogm_len(skb); skb_put_data(skb_aggr, skb->data, ogm_len); consume_skb(skb); } batadv_v_ogm_send_to_if(skb_aggr, hard_iface); } /** * batadv_v_ogm_queue_on_if() - queue a batman ogm on a given interface * @skb: the OGM to queue * @hard_iface: the interface to queue the OGM on */ static void batadv_v_ogm_queue_on_if(struct sk_buff *skb, struct batadv_hard_iface *hard_iface) { struct batadv_priv *bat_priv = netdev_priv(hard_iface->soft_iface); if (!atomic_read(&bat_priv->aggregated_ogms)) { batadv_v_ogm_send_to_if(skb, hard_iface); return; } spin_lock_bh(&hard_iface->bat_v.aggr_list.lock); if (!batadv_v_ogm_queue_left(skb, hard_iface)) batadv_v_ogm_aggr_send(hard_iface); hard_iface->bat_v.aggr_len += batadv_v_ogm_len(skb); __skb_queue_tail(&hard_iface->bat_v.aggr_list, skb); spin_unlock_bh(&hard_iface->bat_v.aggr_list.lock); } /** * batadv_v_ogm_send_softif() - periodic worker broadcasting the own OGM * @bat_priv: the bat priv with all the soft interface information */ static void batadv_v_ogm_send_softif(struct batadv_priv *bat_priv) { struct batadv_hard_iface *hard_iface; struct batadv_ogm2_packet *ogm_packet; struct sk_buff *skb, *skb_tmp; unsigned char *ogm_buff; int ogm_buff_len; u16 tvlv_len = 0; int ret; lockdep_assert_held(&bat_priv->bat_v.ogm_buff_mutex); if (atomic_read(&bat_priv->mesh_state) == BATADV_MESH_DEACTIVATING) goto out; ogm_buff = bat_priv->bat_v.ogm_buff; ogm_buff_len = bat_priv->bat_v.ogm_buff_len; /* tt changes have to be committed before the tvlv data is * appended as it may alter the tt tvlv container */ batadv_tt_local_commit_changes(bat_priv); tvlv_len = batadv_tvlv_container_ogm_append(bat_priv, &ogm_buff, &ogm_buff_len, BATADV_OGM2_HLEN); bat_priv->bat_v.ogm_buff = ogm_buff; bat_priv->bat_v.ogm_buff_len = ogm_buff_len; skb = netdev_alloc_skb_ip_align(NULL, ETH_HLEN + ogm_buff_len); if (!skb) goto reschedule; skb_reserve(skb, ETH_HLEN); skb_put_data(skb, ogm_buff, ogm_buff_len); ogm_packet = (struct batadv_ogm2_packet *)skb->data; ogm_packet->seqno = htonl(atomic_read(&bat_priv->bat_v.ogm_seqno)); atomic_inc(&bat_priv->bat_v.ogm_seqno); ogm_packet->tvlv_len = htons(tvlv_len); /* broadcast on every interface */ rcu_read_lock(); list_for_each_entry_rcu(hard_iface, &batadv_hardif_list, list) { if (hard_iface->soft_iface != bat_priv->soft_iface) continue; if (!kref_get_unless_zero(&hard_iface->refcount)) continue; ret = batadv_hardif_no_broadcast(hard_iface, NULL, NULL); if (ret) { char *type; switch (ret) { case BATADV_HARDIF_BCAST_NORECIPIENT: type = "no neighbor"; break; case BATADV_HARDIF_BCAST_DUPFWD: type = "single neighbor is source"; break; case BATADV_HARDIF_BCAST_DUPORIG: type = "single neighbor is originator"; break; default: type = "unknown"; } batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "OGM2 from ourselves on %s suppressed: %s\n", hard_iface->net_dev->name, type); batadv_hardif_put(hard_iface); continue; } batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "Sending own OGM2 packet (originator %pM, seqno %u, throughput %u, TTL %d) on interface %s [%pM]\n", ogm_packet->orig, ntohl(ogm_packet->seqno), ntohl(ogm_packet->throughput), ogm_packet->ttl, hard_iface->net_dev->name, hard_iface->net_dev->dev_addr); /* this skb gets consumed by batadv_v_ogm_send_to_if() */ skb_tmp = skb_clone(skb, GFP_ATOMIC); if (!skb_tmp) { batadv_hardif_put(hard_iface); break; } batadv_v_ogm_queue_on_if(skb_tmp, hard_iface); batadv_hardif_put(hard_iface); } rcu_read_unlock(); consume_skb(skb); reschedule: batadv_v_ogm_start_timer(bat_priv); out: return; } /** * batadv_v_ogm_send() - periodic worker broadcasting the own OGM * @work: work queue item */ static void batadv_v_ogm_send(struct work_struct *work) { struct batadv_priv_bat_v *bat_v; struct batadv_priv *bat_priv; bat_v = container_of(work, struct batadv_priv_bat_v, ogm_wq.work); bat_priv = container_of(bat_v, struct batadv_priv, bat_v); mutex_lock(&bat_priv->bat_v.ogm_buff_mutex); batadv_v_ogm_send_softif(bat_priv); mutex_unlock(&bat_priv->bat_v.ogm_buff_mutex); } /** * batadv_v_ogm_aggr_work() - OGM queue periodic task per interface * @work: work queue item * * Emits aggregated OGM messages in regular intervals. */ void batadv_v_ogm_aggr_work(struct work_struct *work) { struct batadv_hard_iface_bat_v *batv; struct batadv_hard_iface *hard_iface; batv = container_of(work, struct batadv_hard_iface_bat_v, aggr_wq.work); hard_iface = container_of(batv, struct batadv_hard_iface, bat_v); spin_lock_bh(&hard_iface->bat_v.aggr_list.lock); batadv_v_ogm_aggr_send(hard_iface); spin_unlock_bh(&hard_iface->bat_v.aggr_list.lock); batadv_v_ogm_start_queue_timer(hard_iface); } /** * batadv_v_ogm_iface_enable() - prepare an interface for B.A.T.M.A.N. V * @hard_iface: the interface to prepare * * Takes care of scheduling its own OGM sending routine for this interface. * * Return: 0 on success or a negative error code otherwise */ int batadv_v_ogm_iface_enable(struct batadv_hard_iface *hard_iface) { struct batadv_priv *bat_priv = netdev_priv(hard_iface->soft_iface); batadv_v_ogm_start_queue_timer(hard_iface); batadv_v_ogm_start_timer(bat_priv); return 0; } /** * batadv_v_ogm_iface_disable() - release OGM interface private resources * @hard_iface: interface for which the resources have to be released */ void batadv_v_ogm_iface_disable(struct batadv_hard_iface *hard_iface) { cancel_delayed_work_sync(&hard_iface->bat_v.aggr_wq); spin_lock_bh(&hard_iface->bat_v.aggr_list.lock); batadv_v_ogm_aggr_list_free(hard_iface); spin_unlock_bh(&hard_iface->bat_v.aggr_list.lock); } /** * batadv_v_ogm_primary_iface_set() - set a new primary interface * @primary_iface: the new primary interface */ void batadv_v_ogm_primary_iface_set(struct batadv_hard_iface *primary_iface) { struct batadv_priv *bat_priv = netdev_priv(primary_iface->soft_iface); struct batadv_ogm2_packet *ogm_packet; mutex_lock(&bat_priv->bat_v.ogm_buff_mutex); if (!bat_priv->bat_v.ogm_buff) goto unlock; ogm_packet = (struct batadv_ogm2_packet *)bat_priv->bat_v.ogm_buff; ether_addr_copy(ogm_packet->orig, primary_iface->net_dev->dev_addr); unlock: mutex_unlock(&bat_priv->bat_v.ogm_buff_mutex); } /** * batadv_v_forward_penalty() - apply a penalty to the throughput metric * forwarded with B.A.T.M.A.N. V OGMs * @bat_priv: the bat priv with all the soft interface information * @if_incoming: the interface where the OGM has been received * @if_outgoing: the interface where the OGM has to be forwarded to * @throughput: the current throughput * * Apply a penalty on the current throughput metric value based on the * characteristic of the interface where the OGM has been received. * * Initially the per hardif hop penalty is applied to the throughput. After * that the return value is then computed as follows: * - throughput * 50% if the incoming and outgoing interface are the * same WiFi interface and the throughput is above * 1MBit/s * - throughput if the outgoing interface is the default * interface (i.e. this OGM is processed for the * internal table and not forwarded) * - throughput * node hop penalty otherwise * * Return: the penalised throughput metric. */ static u32 batadv_v_forward_penalty(struct batadv_priv *bat_priv, struct batadv_hard_iface *if_incoming, struct batadv_hard_iface *if_outgoing, u32 throughput) { int if_hop_penalty = atomic_read(&if_incoming->hop_penalty); int hop_penalty = atomic_read(&bat_priv->hop_penalty); int hop_penalty_max = BATADV_TQ_MAX_VALUE; /* Apply per hardif hop penalty */ throughput = throughput * (hop_penalty_max - if_hop_penalty) / hop_penalty_max; /* Don't apply hop penalty in default originator table. */ if (if_outgoing == BATADV_IF_DEFAULT) return throughput; /* Forwarding on the same WiFi interface cuts the throughput in half * due to the store & forward characteristics of WIFI. * Very low throughput values are the exception. */ if (throughput > 10 && if_incoming == if_outgoing && !(if_incoming->bat_v.flags & BATADV_FULL_DUPLEX)) return throughput / 2; /* hop penalty of 255 equals 100% */ return throughput * (hop_penalty_max - hop_penalty) / hop_penalty_max; } /** * batadv_v_ogm_forward() - check conditions and forward an OGM to the given * outgoing interface * @bat_priv: the bat priv with all the soft interface information * @ogm_received: previously received OGM to be forwarded * @orig_node: the originator which has been updated * @neigh_node: the neigh_node through with the OGM has been received * @if_incoming: the interface on which this OGM was received on * @if_outgoing: the interface to which the OGM has to be forwarded to * * Forward an OGM to an interface after having altered the throughput metric and * the TTL value contained in it. The original OGM isn't modified. */ static void batadv_v_ogm_forward(struct batadv_priv *bat_priv, const struct batadv_ogm2_packet *ogm_received, struct batadv_orig_node *orig_node, struct batadv_neigh_node *neigh_node, struct batadv_hard_iface *if_incoming, struct batadv_hard_iface *if_outgoing) { struct batadv_neigh_ifinfo *neigh_ifinfo = NULL; struct batadv_orig_ifinfo *orig_ifinfo = NULL; struct batadv_neigh_node *router = NULL; struct batadv_ogm2_packet *ogm_forward; unsigned char *skb_buff; struct sk_buff *skb; size_t packet_len; u16 tvlv_len; /* only forward for specific interfaces, not for the default one. */ if (if_outgoing == BATADV_IF_DEFAULT) goto out; orig_ifinfo = batadv_orig_ifinfo_new(orig_node, if_outgoing); if (!orig_ifinfo) goto out; /* acquire possibly updated router */ router = batadv_orig_router_get(orig_node, if_outgoing); /* strict rule: forward packets coming from the best next hop only */ if (neigh_node != router) goto out; /* don't forward the same seqno twice on one interface */ if (orig_ifinfo->last_seqno_forwarded == ntohl(ogm_received->seqno)) goto out; orig_ifinfo->last_seqno_forwarded = ntohl(ogm_received->seqno); if (ogm_received->ttl <= 1) { batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "ttl exceeded\n"); goto out; } neigh_ifinfo = batadv_neigh_ifinfo_get(neigh_node, if_outgoing); if (!neigh_ifinfo) goto out; tvlv_len = ntohs(ogm_received->tvlv_len); packet_len = BATADV_OGM2_HLEN + tvlv_len; skb = netdev_alloc_skb_ip_align(if_outgoing->net_dev, ETH_HLEN + packet_len); if (!skb) goto out; skb_reserve(skb, ETH_HLEN); skb_buff = skb_put_data(skb, ogm_received, packet_len); /* apply forward penalty */ ogm_forward = (struct batadv_ogm2_packet *)skb_buff; ogm_forward->throughput = htonl(neigh_ifinfo->bat_v.throughput); ogm_forward->ttl--; batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "Forwarding OGM2 packet on %s: throughput %u, ttl %u, received via %s\n", if_outgoing->net_dev->name, ntohl(ogm_forward->throughput), ogm_forward->ttl, if_incoming->net_dev->name); batadv_v_ogm_queue_on_if(skb, if_outgoing); out: batadv_orig_ifinfo_put(orig_ifinfo); batadv_neigh_node_put(router); batadv_neigh_ifinfo_put(neigh_ifinfo); } /** * batadv_v_ogm_metric_update() - update route metric based on OGM * @bat_priv: the bat priv with all the soft interface information * @ogm2: OGM2 structure * @orig_node: Originator structure for which the OGM has been received * @neigh_node: the neigh_node through with the OGM has been received * @if_incoming: the interface where this packet was received * @if_outgoing: the interface for which the packet should be considered * * Return: * 1 if the OGM is new, * 0 if it is not new but valid, * <0 on error (e.g. old OGM) */ static int batadv_v_ogm_metric_update(struct batadv_priv *bat_priv, const struct batadv_ogm2_packet *ogm2, struct batadv_orig_node *orig_node, struct batadv_neigh_node *neigh_node, struct batadv_hard_iface *if_incoming, struct batadv_hard_iface *if_outgoing) { struct batadv_orig_ifinfo *orig_ifinfo; struct batadv_neigh_ifinfo *neigh_ifinfo = NULL; bool protection_started = false; int ret = -EINVAL; u32 path_throughput; s32 seq_diff; orig_ifinfo = batadv_orig_ifinfo_new(orig_node, if_outgoing); if (!orig_ifinfo) goto out; seq_diff = ntohl(ogm2->seqno) - orig_ifinfo->last_real_seqno; if (!hlist_empty(&orig_node->neigh_list) && batadv_window_protected(bat_priv, seq_diff, BATADV_OGM_MAX_AGE, &orig_ifinfo->batman_seqno_reset, &protection_started)) { batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "Drop packet: packet within window protection time from %pM\n", ogm2->orig); batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "Last reset: %ld, %ld\n", orig_ifinfo->batman_seqno_reset, jiffies); goto out; } /* drop packets with old seqnos, however accept the first packet after * a host has been rebooted. */ if (seq_diff < 0 && !protection_started) goto out; neigh_node->last_seen = jiffies; orig_node->last_seen = jiffies; orig_ifinfo->last_real_seqno = ntohl(ogm2->seqno); orig_ifinfo->last_ttl = ogm2->ttl; neigh_ifinfo = batadv_neigh_ifinfo_new(neigh_node, if_outgoing); if (!neigh_ifinfo) goto out; path_throughput = batadv_v_forward_penalty(bat_priv, if_incoming, if_outgoing, ntohl(ogm2->throughput)); neigh_ifinfo->bat_v.throughput = path_throughput; neigh_ifinfo->bat_v.last_seqno = ntohl(ogm2->seqno); neigh_ifinfo->last_ttl = ogm2->ttl; if (seq_diff > 0 || protection_started) ret = 1; else ret = 0; out: batadv_orig_ifinfo_put(orig_ifinfo); batadv_neigh_ifinfo_put(neigh_ifinfo); return ret; } /** * batadv_v_ogm_route_update() - update routes based on OGM * @bat_priv: the bat priv with all the soft interface information * @ethhdr: the Ethernet header of the OGM2 * @ogm2: OGM2 structure * @orig_node: Originator structure for which the OGM has been received * @neigh_node: the neigh_node through with the OGM has been received * @if_incoming: the interface where this packet was received * @if_outgoing: the interface for which the packet should be considered * * Return: true if the packet should be forwarded, false otherwise */ static bool batadv_v_ogm_route_update(struct batadv_priv *bat_priv, const struct ethhdr *ethhdr, const struct batadv_ogm2_packet *ogm2, struct batadv_orig_node *orig_node, struct batadv_neigh_node *neigh_node, struct batadv_hard_iface *if_incoming, struct batadv_hard_iface *if_outgoing) { struct batadv_neigh_node *router = NULL; struct batadv_orig_node *orig_neigh_node; struct batadv_neigh_node *orig_neigh_router = NULL; struct batadv_neigh_ifinfo *router_ifinfo = NULL, *neigh_ifinfo = NULL; u32 router_throughput, neigh_throughput; u32 router_last_seqno; u32 neigh_last_seqno; s32 neigh_seq_diff; bool forward = false; orig_neigh_node = batadv_v_ogm_orig_get(bat_priv, ethhdr->h_source); if (!orig_neigh_node) goto out; orig_neigh_router = batadv_orig_router_get(orig_neigh_node, if_outgoing); /* drop packet if sender is not a direct neighbor and if we * don't route towards it */ router = batadv_orig_router_get(orig_node, if_outgoing); if (router && router->orig_node != orig_node && !orig_neigh_router) { batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "Drop packet: OGM via unknown neighbor!\n"); goto out; } /* Mark the OGM to be considered for forwarding, and update routes * if needed. */ forward = true; batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "Searching and updating originator entry of received packet\n"); /* if this neighbor already is our next hop there is nothing * to change */ if (router == neigh_node) goto out; /* don't consider neighbours with worse throughput. * also switch route if this seqno is BATADV_V_MAX_ORIGDIFF newer than * the last received seqno from our best next hop. */ if (router) { router_ifinfo = batadv_neigh_ifinfo_get(router, if_outgoing); neigh_ifinfo = batadv_neigh_ifinfo_get(neigh_node, if_outgoing); /* if these are not allocated, something is wrong. */ if (!router_ifinfo || !neigh_ifinfo) goto out; neigh_last_seqno = neigh_ifinfo->bat_v.last_seqno; router_last_seqno = router_ifinfo->bat_v.last_seqno; neigh_seq_diff = neigh_last_seqno - router_last_seqno; router_throughput = router_ifinfo->bat_v.throughput; neigh_throughput = neigh_ifinfo->bat_v.throughput; if (neigh_seq_diff < BATADV_OGM_MAX_ORIGDIFF && router_throughput >= neigh_throughput) goto out; } batadv_update_route(bat_priv, orig_node, if_outgoing, neigh_node); out: batadv_neigh_node_put(router); batadv_neigh_node_put(orig_neigh_router); batadv_orig_node_put(orig_neigh_node); batadv_neigh_ifinfo_put(router_ifinfo); batadv_neigh_ifinfo_put(neigh_ifinfo); return forward; } /** * batadv_v_ogm_process_per_outif() - process a batman v OGM for an outgoing if * @bat_priv: the bat priv with all the soft interface information * @ethhdr: the Ethernet header of the OGM2 * @ogm2: OGM2 structure * @orig_node: Originator structure for which the OGM has been received * @neigh_node: the neigh_node through with the OGM has been received * @if_incoming: the interface where this packet was received * @if_outgoing: the interface for which the packet should be considered */ static void batadv_v_ogm_process_per_outif(struct batadv_priv *bat_priv, const struct ethhdr *ethhdr, const struct batadv_ogm2_packet *ogm2, struct batadv_orig_node *orig_node, struct batadv_neigh_node *neigh_node, struct batadv_hard_iface *if_incoming, struct batadv_hard_iface *if_outgoing) { int seqno_age; bool forward; /* first, update the metric with according sanity checks */ seqno_age = batadv_v_ogm_metric_update(bat_priv, ogm2, orig_node, neigh_node, if_incoming, if_outgoing); /* outdated sequence numbers are to be discarded */ if (seqno_age < 0) return; /* only unknown & newer OGMs contain TVLVs we are interested in */ if (seqno_age > 0 && if_outgoing == BATADV_IF_DEFAULT) batadv_tvlv_containers_process(bat_priv, BATADV_OGM2, orig_node, NULL, (unsigned char *)(ogm2 + 1), ntohs(ogm2->tvlv_len)); /* if the metric update went through, update routes if needed */ forward = batadv_v_ogm_route_update(bat_priv, ethhdr, ogm2, orig_node, neigh_node, if_incoming, if_outgoing); /* if the routes have been processed correctly, check and forward */ if (forward) batadv_v_ogm_forward(bat_priv, ogm2, orig_node, neigh_node, if_incoming, if_outgoing); } /** * batadv_v_ogm_aggr_packet() - checks if there is another OGM aggregated * @buff_pos: current position in the skb * @packet_len: total length of the skb * @ogm2_packet: potential OGM2 in buffer * * Return: true if there is enough space for another OGM, false otherwise. */ static bool batadv_v_ogm_aggr_packet(int buff_pos, int packet_len, const struct batadv_ogm2_packet *ogm2_packet) { int next_buff_pos = 0; /* check if there is enough space for the header */ next_buff_pos += buff_pos + sizeof(*ogm2_packet); if (next_buff_pos > packet_len) return false; /* check if there is enough space for the optional TVLV */ next_buff_pos += ntohs(ogm2_packet->tvlv_len); return (next_buff_pos <= packet_len) && (next_buff_pos <= BATADV_MAX_AGGREGATION_BYTES); } /** * batadv_v_ogm_process() - process an incoming batman v OGM * @skb: the skb containing the OGM * @ogm_offset: offset to the OGM which should be processed (for aggregates) * @if_incoming: the interface where this packet was received */ static void batadv_v_ogm_process(const struct sk_buff *skb, int ogm_offset, struct batadv_hard_iface *if_incoming) { struct batadv_priv *bat_priv = netdev_priv(if_incoming->soft_iface); struct ethhdr *ethhdr; struct batadv_orig_node *orig_node = NULL; struct batadv_hardif_neigh_node *hardif_neigh = NULL; struct batadv_neigh_node *neigh_node = NULL; struct batadv_hard_iface *hard_iface; struct batadv_ogm2_packet *ogm_packet; u32 ogm_throughput, link_throughput, path_throughput; int ret; ethhdr = eth_hdr(skb); ogm_packet = (struct batadv_ogm2_packet *)(skb->data + ogm_offset); ogm_throughput = ntohl(ogm_packet->throughput); batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "Received OGM2 packet via NB: %pM, IF: %s [%pM] (from OG: %pM, seqno %u, throughput %u, TTL %u, V %u, tvlv_len %u)\n", ethhdr->h_source, if_incoming->net_dev->name, if_incoming->net_dev->dev_addr, ogm_packet->orig, ntohl(ogm_packet->seqno), ogm_throughput, ogm_packet->ttl, ogm_packet->version, ntohs(ogm_packet->tvlv_len)); if (batadv_is_my_mac(bat_priv, ogm_packet->orig)) { batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "Drop packet: originator packet from ourself\n"); return; } /* If the throughput metric is 0, immediately drop the packet. No need * to create orig_node / neigh_node for an unusable route. */ if (ogm_throughput == 0) { batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "Drop packet: originator packet with throughput metric of 0\n"); return; } /* require ELP packets be to received from this neighbor first */ hardif_neigh = batadv_hardif_neigh_get(if_incoming, ethhdr->h_source); if (!hardif_neigh) { batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "Drop packet: OGM via unknown neighbor!\n"); goto out; } orig_node = batadv_v_ogm_orig_get(bat_priv, ogm_packet->orig); if (!orig_node) goto out; neigh_node = batadv_neigh_node_get_or_create(orig_node, if_incoming, ethhdr->h_source); if (!neigh_node) goto out; /* Update the received throughput metric to match the link * characteristic: * - If this OGM traveled one hop so far (emitted by single hop * neighbor) the path throughput metric equals the link throughput. * - For OGMs traversing more than hop the path throughput metric is * the smaller of the path throughput and the link throughput. */ link_throughput = ewma_throughput_read(&hardif_neigh->bat_v.throughput); path_throughput = min_t(u32, link_throughput, ogm_throughput); ogm_packet->throughput = htonl(path_throughput); batadv_v_ogm_process_per_outif(bat_priv, ethhdr, ogm_packet, orig_node, neigh_node, if_incoming, BATADV_IF_DEFAULT); rcu_read_lock(); list_for_each_entry_rcu(hard_iface, &batadv_hardif_list, list) { if (hard_iface->if_status != BATADV_IF_ACTIVE) continue; if (hard_iface->soft_iface != bat_priv->soft_iface) continue; if (!kref_get_unless_zero(&hard_iface->refcount)) continue; ret = batadv_hardif_no_broadcast(hard_iface, ogm_packet->orig, hardif_neigh->orig); if (ret) { char *type; switch (ret) { case BATADV_HARDIF_BCAST_NORECIPIENT: type = "no neighbor"; break; case BATADV_HARDIF_BCAST_DUPFWD: type = "single neighbor is source"; break; case BATADV_HARDIF_BCAST_DUPORIG: type = "single neighbor is originator"; break; default: type = "unknown"; } batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "OGM2 packet from %pM on %s suppressed: %s\n", ogm_packet->orig, hard_iface->net_dev->name, type); batadv_hardif_put(hard_iface); continue; } batadv_v_ogm_process_per_outif(bat_priv, ethhdr, ogm_packet, orig_node, neigh_node, if_incoming, hard_iface); batadv_hardif_put(hard_iface); } rcu_read_unlock(); out: batadv_orig_node_put(orig_node); batadv_neigh_node_put(neigh_node); batadv_hardif_neigh_put(hardif_neigh); } /** * batadv_v_ogm_packet_recv() - OGM2 receiving handler * @skb: the received OGM * @if_incoming: the interface where this OGM has been received * * Return: NET_RX_SUCCESS and consume the skb on success or returns NET_RX_DROP * (without freeing the skb) on failure */ int batadv_v_ogm_packet_recv(struct sk_buff *skb, struct batadv_hard_iface *if_incoming) { struct batadv_priv *bat_priv = netdev_priv(if_incoming->soft_iface); struct batadv_ogm2_packet *ogm_packet; struct ethhdr *ethhdr; int ogm_offset; u8 *packet_pos; int ret = NET_RX_DROP; /* did we receive a OGM2 packet on an interface that does not have * B.A.T.M.A.N. V enabled ? */ if (strcmp(bat_priv->algo_ops->name, "BATMAN_V") != 0) goto free_skb; if (!batadv_check_management_packet(skb, if_incoming, BATADV_OGM2_HLEN)) goto free_skb; ethhdr = eth_hdr(skb); if (batadv_is_my_mac(bat_priv, ethhdr->h_source)) goto free_skb; batadv_inc_counter(bat_priv, BATADV_CNT_MGMT_RX); batadv_add_counter(bat_priv, BATADV_CNT_MGMT_RX_BYTES, skb->len + ETH_HLEN); ogm_offset = 0; ogm_packet = (struct batadv_ogm2_packet *)skb->data; while (batadv_v_ogm_aggr_packet(ogm_offset, skb_headlen(skb), ogm_packet)) { batadv_v_ogm_process(skb, ogm_offset, if_incoming); ogm_offset += BATADV_OGM2_HLEN; ogm_offset += ntohs(ogm_packet->tvlv_len); packet_pos = skb->data + ogm_offset; ogm_packet = (struct batadv_ogm2_packet *)packet_pos; } ret = NET_RX_SUCCESS; free_skb: if (ret == NET_RX_SUCCESS) consume_skb(skb); else kfree_skb(skb); return ret; } /** * batadv_v_ogm_init() - initialise the OGM2 engine * @bat_priv: the bat priv with all the soft interface information * * Return: 0 on success or a negative error code in case of failure */ int batadv_v_ogm_init(struct batadv_priv *bat_priv) { struct batadv_ogm2_packet *ogm_packet; unsigned char *ogm_buff; u32 random_seqno; bat_priv->bat_v.ogm_buff_len = BATADV_OGM2_HLEN; ogm_buff = kzalloc(bat_priv->bat_v.ogm_buff_len, GFP_ATOMIC); if (!ogm_buff) return -ENOMEM; bat_priv->bat_v.ogm_buff = ogm_buff; ogm_packet = (struct batadv_ogm2_packet *)ogm_buff; ogm_packet->packet_type = BATADV_OGM2; ogm_packet->version = BATADV_COMPAT_VERSION; ogm_packet->ttl = BATADV_TTL; ogm_packet->flags = BATADV_NO_FLAGS; ogm_packet->throughput = htonl(BATADV_THROUGHPUT_MAX_VALUE); /* randomize initial seqno to avoid collision */ get_random_bytes(&random_seqno, sizeof(random_seqno)); atomic_set(&bat_priv->bat_v.ogm_seqno, random_seqno); INIT_DELAYED_WORK(&bat_priv->bat_v.ogm_wq, batadv_v_ogm_send); mutex_init(&bat_priv->bat_v.ogm_buff_mutex); return 0; } /** * batadv_v_ogm_free() - free OGM private resources * @bat_priv: the bat priv with all the soft interface information */ void batadv_v_ogm_free(struct batadv_priv *bat_priv) { cancel_delayed_work_sync(&bat_priv->bat_v.ogm_wq); mutex_lock(&bat_priv->bat_v.ogm_buff_mutex); kfree(bat_priv->bat_v.ogm_buff); bat_priv->bat_v.ogm_buff = NULL; bat_priv->bat_v.ogm_buff_len = 0; mutex_unlock(&bat_priv->bat_v.ogm_buff_mutex); } |
2 1 15 15 15 15 14 15 14 14 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __KVM_X86_VMX_VMCS12_H #define __KVM_X86_VMX_VMCS12_H #include <linux/build_bug.h> #include "vmcs.h" /* * struct vmcs12 describes the state that our guest hypervisor (L1) keeps for a * single nested guest (L2), hence the name vmcs12. Any VMX implementation has * a VMCS structure, and vmcs12 is our emulated VMX's VMCS. This structure is * stored in guest memory specified by VMPTRLD, but is opaque to the guest, * which must access it using VMREAD/VMWRITE/VMCLEAR instructions. * More than one of these structures may exist, if L1 runs multiple L2 guests. * nested_vmx_run() will use the data here to build the vmcs02: a VMCS for the * underlying hardware which will be used to run L2. * This structure is packed to ensure that its layout is identical across * machines (necessary for live migration). * * IMPORTANT: Changing the layout of existing fields in this structure * will break save/restore compatibility with older kvm releases. When * adding new fields, either use space in the reserved padding* arrays * or add the new fields to the end of the structure. */ typedef u64 natural_width; struct __packed vmcs12 { /* According to the Intel spec, a VMCS region must start with the * following two fields. Then follow implementation-specific data. */ struct vmcs_hdr hdr; u32 abort; u32 launch_state; /* set to 0 by VMCLEAR, to 1 by VMLAUNCH */ u32 padding[7]; /* room for future expansion */ u64 io_bitmap_a; u64 io_bitmap_b; u64 msr_bitmap; u64 vm_exit_msr_store_addr; u64 vm_exit_msr_load_addr; u64 vm_entry_msr_load_addr; u64 tsc_offset; u64 virtual_apic_page_addr; u64 apic_access_addr; u64 posted_intr_desc_addr; u64 ept_pointer; u64 eoi_exit_bitmap0; u64 eoi_exit_bitmap1; u64 eoi_exit_bitmap2; u64 eoi_exit_bitmap3; u64 xss_exit_bitmap; u64 guest_physical_address; u64 vmcs_link_pointer; u64 guest_ia32_debugctl; u64 guest_ia32_pat; u64 guest_ia32_efer; u64 guest_ia32_perf_global_ctrl; u64 guest_pdptr0; u64 guest_pdptr1; u64 guest_pdptr2; u64 guest_pdptr3; u64 guest_bndcfgs; u64 host_ia32_pat; u64 host_ia32_efer; u64 host_ia32_perf_global_ctrl; u64 vmread_bitmap; u64 vmwrite_bitmap; u64 vm_function_control; u64 eptp_list_address; u64 pml_address; u64 encls_exiting_bitmap; u64 tsc_multiplier; u64 padding64[1]; /* room for future expansion */ /* * To allow migration of L1 (complete with its L2 guests) between * machines of different natural widths (32 or 64 bit), we cannot have * unsigned long fields with no explicit size. We use u64 (aliased * natural_width) instead. Luckily, x86 is little-endian. */ natural_width cr0_guest_host_mask; natural_width cr4_guest_host_mask; natural_width cr0_read_shadow; natural_width cr4_read_shadow; natural_width dead_space[4]; /* Last remnants of cr3_target_value[0-3]. */ natural_width exit_qualification; natural_width guest_linear_address; natural_width guest_cr0; natural_width guest_cr3; natural_width guest_cr4; natural_width guest_es_base; natural_width guest_cs_base; natural_width guest_ss_base; natural_width guest_ds_base; natural_width guest_fs_base; natural_width guest_gs_base; natural_width guest_ldtr_base; natural_width guest_tr_base; natural_width guest_gdtr_base; natural_width guest_idtr_base; natural_width guest_dr7; natural_width guest_rsp; natural_width guest_rip; natural_width guest_rflags; natural_width guest_pending_dbg_exceptions; natural_width guest_sysenter_esp; natural_width guest_sysenter_eip; natural_width host_cr0; natural_width host_cr3; natural_width host_cr4; natural_width host_fs_base; natural_width host_gs_base; natural_width host_tr_base; natural_width host_gdtr_base; natural_width host_idtr_base; natural_width host_ia32_sysenter_esp; natural_width host_ia32_sysenter_eip; natural_width host_rsp; natural_width host_rip; natural_width paddingl[8]; /* room for future expansion */ u32 pin_based_vm_exec_control; u32 cpu_based_vm_exec_control; u32 exception_bitmap; u32 page_fault_error_code_mask; u32 page_fault_error_code_match; u32 cr3_target_count; u32 vm_exit_controls; u32 vm_exit_msr_store_count; u32 vm_exit_msr_load_count; u32 vm_entry_controls; u32 vm_entry_msr_load_count; u32 vm_entry_intr_info_field; u32 vm_entry_exception_error_code; u32 vm_entry_instruction_len; u32 tpr_threshold; u32 secondary_vm_exec_control; u32 vm_instruction_error; u32 vm_exit_reason; u32 vm_exit_intr_info; u32 vm_exit_intr_error_code; u32 idt_vectoring_info_field; u32 idt_vectoring_error_code; u32 vm_exit_instruction_len; u32 vmx_instruction_info; u32 guest_es_limit; u32 guest_cs_limit; u32 guest_ss_limit; u32 guest_ds_limit; u32 guest_fs_limit; u32 guest_gs_limit; u32 guest_ldtr_limit; u32 guest_tr_limit; u32 guest_gdtr_limit; u32 guest_idtr_limit; u32 guest_es_ar_bytes; u32 guest_cs_ar_bytes; u32 guest_ss_ar_bytes; u32 guest_ds_ar_bytes; u32 guest_fs_ar_bytes; u32 guest_gs_ar_bytes; u32 guest_ldtr_ar_bytes; u32 guest_tr_ar_bytes; u32 guest_interruptibility_info; u32 guest_activity_state; u32 guest_sysenter_cs; u32 host_ia32_sysenter_cs; u32 vmx_preemption_timer_value; u32 padding32[7]; /* room for future expansion */ u16 virtual_processor_id; u16 posted_intr_nv; u16 guest_es_selector; u16 guest_cs_selector; u16 guest_ss_selector; u16 guest_ds_selector; u16 guest_fs_selector; u16 guest_gs_selector; u16 guest_ldtr_selector; u16 guest_tr_selector; u16 guest_intr_status; u16 host_es_selector; u16 host_cs_selector; u16 host_ss_selector; u16 host_ds_selector; u16 host_fs_selector; u16 host_gs_selector; u16 host_tr_selector; u16 guest_pml_index; }; /* * VMCS12_REVISION is KVM's arbitrary ID for the layout of struct vmcs12. KVM * enumerates this value to L1 via MSR_IA32_VMX_BASIC, and checks the revision * ID during nested VMPTRLD to verify that L1 is loading a VMCS that adhere's * to KVM's virtual CPU definition. * * DO NOT change this value, as it will break save/restore compatibility with * older KVM releases. */ #define VMCS12_REVISION 0x11e57ed0 /* * VMCS12_SIZE is the number of bytes L1 should allocate for the VMXON region * and any VMCS region. Although only sizeof(struct vmcs12) are used by the * current implementation, 4K are reserved to avoid future complications and * to preserve userspace ABI. */ #define VMCS12_SIZE KVM_STATE_NESTED_VMX_VMCS_SIZE /* * For save/restore compatibility, the vmcs12 field offsets must not change, * although appending fields and/or filling gaps is obviously allowed. */ #define CHECK_OFFSET(field, loc) \ ASSERT_STRUCT_OFFSET(struct vmcs12, field, loc) static inline void vmx_check_vmcs12_offsets(void) { CHECK_OFFSET(hdr, 0); CHECK_OFFSET(abort, 4); CHECK_OFFSET(launch_state, 8); CHECK_OFFSET(io_bitmap_a, 40); CHECK_OFFSET(io_bitmap_b, 48); CHECK_OFFSET(msr_bitmap, 56); CHECK_OFFSET(vm_exit_msr_store_addr, 64); CHECK_OFFSET(vm_exit_msr_load_addr, 72); CHECK_OFFSET(vm_entry_msr_load_addr, 80); CHECK_OFFSET(tsc_offset, 88); CHECK_OFFSET(virtual_apic_page_addr, 96); CHECK_OFFSET(apic_access_addr, 104); CHECK_OFFSET(posted_intr_desc_addr, 112); CHECK_OFFSET(ept_pointer, 120); CHECK_OFFSET(eoi_exit_bitmap0, 128); CHECK_OFFSET(eoi_exit_bitmap1, 136); CHECK_OFFSET(eoi_exit_bitmap2, 144); CHECK_OFFSET(eoi_exit_bitmap3, 152); CHECK_OFFSET(xss_exit_bitmap, 160); CHECK_OFFSET(guest_physical_address, 168); CHECK_OFFSET(vmcs_link_pointer, 176); CHECK_OFFSET(guest_ia32_debugctl, 184); CHECK_OFFSET(guest_ia32_pat, 192); CHECK_OFFSET(guest_ia32_efer, 200); CHECK_OFFSET(guest_ia32_perf_global_ctrl, 208); CHECK_OFFSET(guest_pdptr0, 216); CHECK_OFFSET(guest_pdptr1, 224); CHECK_OFFSET(guest_pdptr2, 232); CHECK_OFFSET(guest_pdptr3, 240); CHECK_OFFSET(guest_bndcfgs, 248); CHECK_OFFSET(host_ia32_pat, 256); CHECK_OFFSET(host_ia32_efer, 264); CHECK_OFFSET(host_ia32_perf_global_ctrl, 272); CHECK_OFFSET(vmread_bitmap, 280); CHECK_OFFSET(vmwrite_bitmap, 288); CHECK_OFFSET(vm_function_control, 296); CHECK_OFFSET(eptp_list_address, 304); CHECK_OFFSET(pml_address, 312); CHECK_OFFSET(encls_exiting_bitmap, 320); CHECK_OFFSET(tsc_multiplier, 328); CHECK_OFFSET(cr0_guest_host_mask, 344); CHECK_OFFSET(cr4_guest_host_mask, 352); CHECK_OFFSET(cr0_read_shadow, 360); CHECK_OFFSET(cr4_read_shadow, 368); CHECK_OFFSET(dead_space, 376); CHECK_OFFSET(exit_qualification, 408); CHECK_OFFSET(guest_linear_address, 416); CHECK_OFFSET(guest_cr0, 424); CHECK_OFFSET(guest_cr3, 432); CHECK_OFFSET(guest_cr4, 440); CHECK_OFFSET(guest_es_base, 448); CHECK_OFFSET(guest_cs_base, 456); CHECK_OFFSET(guest_ss_base, 464); CHECK_OFFSET(guest_ds_base, 472); CHECK_OFFSET(guest_fs_base, 480); CHECK_OFFSET(guest_gs_base, 488); CHECK_OFFSET(guest_ldtr_base, 496); CHECK_OFFSET(guest_tr_base, 504); CHECK_OFFSET(guest_gdtr_base, 512); CHECK_OFFSET(guest_idtr_base, 520); CHECK_OFFSET(guest_dr7, 528); CHECK_OFFSET(guest_rsp, 536); CHECK_OFFSET(guest_rip, 544); CHECK_OFFSET(guest_rflags, 552); CHECK_OFFSET(guest_pending_dbg_exceptions, 560); CHECK_OFFSET(guest_sysenter_esp, 568); CHECK_OFFSET(guest_sysenter_eip, 576); CHECK_OFFSET(host_cr0, 584); CHECK_OFFSET(host_cr3, 592); CHECK_OFFSET(host_cr4, 600); CHECK_OFFSET(host_fs_base, 608); CHECK_OFFSET(host_gs_base, 616); CHECK_OFFSET(host_tr_base, 624); CHECK_OFFSET(host_gdtr_base, 632); CHECK_OFFSET(host_idtr_base, 640); CHECK_OFFSET(host_ia32_sysenter_esp, 648); CHECK_OFFSET(host_ia32_sysenter_eip, 656); CHECK_OFFSET(host_rsp, 664); CHECK_OFFSET(host_rip, 672); CHECK_OFFSET(pin_based_vm_exec_control, 744); CHECK_OFFSET(cpu_based_vm_exec_control, 748); CHECK_OFFSET(exception_bitmap, 752); CHECK_OFFSET(page_fault_error_code_mask, 756); CHECK_OFFSET(page_fault_error_code_match, 760); CHECK_OFFSET(cr3_target_count, 764); CHECK_OFFSET(vm_exit_controls, 768); CHECK_OFFSET(vm_exit_msr_store_count, 772); CHECK_OFFSET(vm_exit_msr_load_count, 776); CHECK_OFFSET(vm_entry_controls, 780); CHECK_OFFSET(vm_entry_msr_load_count, 784); CHECK_OFFSET(vm_entry_intr_info_field, 788); CHECK_OFFSET(vm_entry_exception_error_code, 792); CHECK_OFFSET(vm_entry_instruction_len, 796); CHECK_OFFSET(tpr_threshold, 800); CHECK_OFFSET(secondary_vm_exec_control, 804); CHECK_OFFSET(vm_instruction_error, 808); CHECK_OFFSET(vm_exit_reason, 812); CHECK_OFFSET(vm_exit_intr_info, 816); CHECK_OFFSET(vm_exit_intr_error_code, 820); CHECK_OFFSET(idt_vectoring_info_field, 824); CHECK_OFFSET(idt_vectoring_error_code, 828); CHECK_OFFSET(vm_exit_instruction_len, 832); CHECK_OFFSET(vmx_instruction_info, 836); CHECK_OFFSET(guest_es_limit, 840); CHECK_OFFSET(guest_cs_limit, 844); CHECK_OFFSET(guest_ss_limit, 848); CHECK_OFFSET(guest_ds_limit, 852); CHECK_OFFSET(guest_fs_limit, 856); CHECK_OFFSET(guest_gs_limit, 860); CHECK_OFFSET(guest_ldtr_limit, 864); CHECK_OFFSET(guest_tr_limit, 868); CHECK_OFFSET(guest_gdtr_limit, 872); CHECK_OFFSET(guest_idtr_limit, 876); CHECK_OFFSET(guest_es_ar_bytes, 880); CHECK_OFFSET(guest_cs_ar_bytes, 884); CHECK_OFFSET(guest_ss_ar_bytes, 888); CHECK_OFFSET(guest_ds_ar_bytes, 892); CHECK_OFFSET(guest_fs_ar_bytes, 896); CHECK_OFFSET(guest_gs_ar_bytes, 900); CHECK_OFFSET(guest_ldtr_ar_bytes, 904); CHECK_OFFSET(guest_tr_ar_bytes, 908); CHECK_OFFSET(guest_interruptibility_info, 912); CHECK_OFFSET(guest_activity_state, 916); CHECK_OFFSET(guest_sysenter_cs, 920); CHECK_OFFSET(host_ia32_sysenter_cs, 924); CHECK_OFFSET(vmx_preemption_timer_value, 928); CHECK_OFFSET(virtual_processor_id, 960); CHECK_OFFSET(posted_intr_nv, 962); CHECK_OFFSET(guest_es_selector, 964); CHECK_OFFSET(guest_cs_selector, 966); CHECK_OFFSET(guest_ss_selector, 968); CHECK_OFFSET(guest_ds_selector, 970); CHECK_OFFSET(guest_fs_selector, 972); CHECK_OFFSET(guest_gs_selector, 974); CHECK_OFFSET(guest_ldtr_selector, 976); CHECK_OFFSET(guest_tr_selector, 978); CHECK_OFFSET(guest_intr_status, 980); CHECK_OFFSET(host_es_selector, 982); CHECK_OFFSET(host_cs_selector, 984); CHECK_OFFSET(host_ss_selector, 986); CHECK_OFFSET(host_ds_selector, 988); CHECK_OFFSET(host_fs_selector, 990); CHECK_OFFSET(host_gs_selector, 992); CHECK_OFFSET(host_tr_selector, 994); CHECK_OFFSET(guest_pml_index, 996); } extern const unsigned short vmcs12_field_offsets[]; extern const unsigned int nr_vmcs12_fields; static inline short get_vmcs12_field_offset(unsigned long field) { unsigned short offset; unsigned int index; if (field >> 15) return -ENOENT; index = ROL16(field, 6); if (index >= nr_vmcs12_fields) return -ENOENT; index = array_index_nospec(index, nr_vmcs12_fields); offset = vmcs12_field_offsets[index]; if (offset == 0) return -ENOENT; return offset; } static inline u64 vmcs12_read_any(struct vmcs12 *vmcs12, unsigned long field, u16 offset) { char *p = (char *)vmcs12 + offset; switch (vmcs_field_width(field)) { case VMCS_FIELD_WIDTH_NATURAL_WIDTH: return *((natural_width *)p); case VMCS_FIELD_WIDTH_U16: return *((u16 *)p); case VMCS_FIELD_WIDTH_U32: return *((u32 *)p); case VMCS_FIELD_WIDTH_U64: return *((u64 *)p); default: WARN_ON_ONCE(1); return -1; } } static inline void vmcs12_write_any(struct vmcs12 *vmcs12, unsigned long field, u16 offset, u64 field_value) { char *p = (char *)vmcs12 + offset; switch (vmcs_field_width(field)) { case VMCS_FIELD_WIDTH_U16: *(u16 *)p = field_value; break; case VMCS_FIELD_WIDTH_U32: *(u32 *)p = field_value; break; case VMCS_FIELD_WIDTH_U64: *(u64 *)p = field_value; break; case VMCS_FIELD_WIDTH_NATURAL_WIDTH: *(natural_width *)p = field_value; break; default: WARN_ON_ONCE(1); break; } } #endif /* __KVM_X86_VMX_VMCS12_H */ |
1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 | /* SPDX-License-Identifier: GPL-2.0+ */ /* * Copyright (C) 2016 Oracle. All Rights Reserved. * Author: Darrick J. Wong <darrick.wong@oracle.com> */ #ifndef __XFS_DEFER_H__ #define __XFS_DEFER_H__ struct xfs_btree_cur; struct xfs_defer_op_type; struct xfs_defer_capture; /* * Save a log intent item and a list of extents, so that we can replay * whatever action had to happen to the extent list and file the log done * item. */ struct xfs_defer_pending { struct list_head dfp_list; /* pending items */ struct list_head dfp_work; /* work items */ struct xfs_log_item *dfp_intent; /* log intent item */ struct xfs_log_item *dfp_done; /* log done item */ const struct xfs_defer_op_type *dfp_ops; unsigned int dfp_count; /* # extent items */ unsigned int dfp_flags; }; /* * Create a log intent item for this deferred item, but don't actually finish * the work. Caller must clear this before the final transaction commit. */ #define XFS_DEFER_PAUSED (1U << 0) #define XFS_DEFER_PENDING_STRINGS \ { XFS_DEFER_PAUSED, "paused" } void xfs_defer_item_pause(struct xfs_trans *tp, struct xfs_defer_pending *dfp); void xfs_defer_item_unpause(struct xfs_trans *tp, struct xfs_defer_pending *dfp); struct xfs_defer_pending *xfs_defer_add(struct xfs_trans *tp, struct list_head *h, const struct xfs_defer_op_type *ops); int xfs_defer_finish_noroll(struct xfs_trans **tp); int xfs_defer_finish(struct xfs_trans **tp); int xfs_defer_finish_one(struct xfs_trans *tp, struct xfs_defer_pending *dfp); void xfs_defer_cancel(struct xfs_trans *); void xfs_defer_move(struct xfs_trans *dtp, struct xfs_trans *stp); /* Description of a deferred type. */ struct xfs_defer_op_type { const char *name; unsigned int max_items; struct xfs_log_item *(*create_intent)(struct xfs_trans *tp, struct list_head *items, unsigned int count, bool sort); void (*abort_intent)(struct xfs_log_item *intent); struct xfs_log_item *(*create_done)(struct xfs_trans *tp, struct xfs_log_item *intent, unsigned int count); int (*finish_item)(struct xfs_trans *tp, struct xfs_log_item *done, struct list_head *item, struct xfs_btree_cur **state); void (*finish_cleanup)(struct xfs_trans *tp, struct xfs_btree_cur *state, int error); void (*cancel_item)(struct list_head *item); int (*recover_work)(struct xfs_defer_pending *dfp, struct list_head *capture_list); struct xfs_log_item *(*relog_intent)(struct xfs_trans *tp, struct xfs_log_item *intent, struct xfs_log_item *done_item); }; extern const struct xfs_defer_op_type xfs_bmap_update_defer_type; extern const struct xfs_defer_op_type xfs_refcount_update_defer_type; extern const struct xfs_defer_op_type xfs_rmap_update_defer_type; extern const struct xfs_defer_op_type xfs_extent_free_defer_type; extern const struct xfs_defer_op_type xfs_agfl_free_defer_type; extern const struct xfs_defer_op_type xfs_attr_defer_type; extern const struct xfs_defer_op_type xfs_exchmaps_defer_type; /* * Deferred operation item relogging limits. */ /* * Rename w/ parent pointers can require up to 5 inodes with deferred ops to * be joined to the transaction: src_dp, target_dp, src_ip, target_ip, and wip. * These inodes are locked in sorted order by their inode numbers */ #define XFS_DEFER_OPS_NR_INODES 5 #define XFS_DEFER_OPS_NR_BUFS 2 /* join up to two buffers */ /* Resources that must be held across a transaction roll. */ struct xfs_defer_resources { /* held buffers */ struct xfs_buf *dr_bp[XFS_DEFER_OPS_NR_BUFS]; /* inodes with no unlock flags */ struct xfs_inode *dr_ip[XFS_DEFER_OPS_NR_INODES]; /* number of held buffers */ unsigned short dr_bufs; /* bitmap of ordered buffers */ unsigned short dr_ordered; /* number of held inodes */ unsigned short dr_inos; }; /* * This structure enables a dfops user to detach the chain of deferred * operations from a transaction so that they can be continued later. */ struct xfs_defer_capture { /* List of other capture structures. */ struct list_head dfc_list; /* Deferred ops state saved from the transaction. */ struct list_head dfc_dfops; unsigned int dfc_tpflags; /* Block reservations for the data and rt devices. */ unsigned int dfc_blkres; unsigned int dfc_rtxres; /* Log reservation saved from the transaction. */ unsigned int dfc_logres; struct xfs_defer_resources dfc_held; }; /* * Functions to capture a chain of deferred operations and continue them later. * This doesn't normally happen except log recovery. */ int xfs_defer_ops_capture_and_commit(struct xfs_trans *tp, struct list_head *capture_list); void xfs_defer_ops_continue(struct xfs_defer_capture *d, struct xfs_trans *tp, struct xfs_defer_resources *dres); void xfs_defer_ops_capture_abort(struct xfs_mount *mp, struct xfs_defer_capture *d); void xfs_defer_resources_rele(struct xfs_defer_resources *dres); void xfs_defer_start_recovery(struct xfs_log_item *lip, struct list_head *r_dfops, const struct xfs_defer_op_type *ops); void xfs_defer_cancel_recovery(struct xfs_mount *mp, struct xfs_defer_pending *dfp); int xfs_defer_finish_recovery(struct xfs_mount *mp, struct xfs_defer_pending *dfp, struct list_head *capture_list); static inline void xfs_defer_add_item( struct xfs_defer_pending *dfp, struct list_head *work) { list_add_tail(work, &dfp->dfp_work); dfp->dfp_count++; } int __init xfs_defer_init_item_caches(void); void xfs_defer_destroy_item_caches(void); void xfs_defer_add_barrier(struct xfs_trans *tp); #endif /* __XFS_DEFER_H__ */ |
12 12 55875 55871 55858 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 | // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 1991, 1992 Linus Torvalds * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs */ #include <linux/sched/debug.h> #include <linux/kallsyms.h> #include <linux/kprobes.h> #include <linux/uaccess.h> #include <linux/hardirq.h> #include <linux/kdebug.h> #include <linux/export.h> #include <linux/ptrace.h> #include <linux/kexec.h> #include <linux/sysfs.h> #include <linux/bug.h> #include <linux/nmi.h> #include <asm/cpu_entry_area.h> #include <asm/stacktrace.h> static const char * const exception_stack_names[] = { [ ESTACK_DF ] = "#DF", [ ESTACK_NMI ] = "NMI", [ ESTACK_DB ] = "#DB", [ ESTACK_MCE ] = "#MC", [ ESTACK_VC ] = "#VC", [ ESTACK_VC2 ] = "#VC2", }; const char *stack_type_name(enum stack_type type) { BUILD_BUG_ON(N_EXCEPTION_STACKS != 6); if (type == STACK_TYPE_TASK) return "TASK"; if (type == STACK_TYPE_IRQ) return "IRQ"; if (type == STACK_TYPE_SOFTIRQ) return "SOFTIRQ"; if (type == STACK_TYPE_ENTRY) { /* * On 64-bit, we have a generic entry stack that we * use for all the kernel entry points, including * SYSENTER. */ return "ENTRY_TRAMPOLINE"; } if (type >= STACK_TYPE_EXCEPTION && type <= STACK_TYPE_EXCEPTION_LAST) return exception_stack_names[type - STACK_TYPE_EXCEPTION]; return NULL; } /** * struct estack_pages - Page descriptor for exception stacks * @offs: Offset from the start of the exception stack area * @size: Size of the exception stack * @type: Type to store in the stack_info struct */ struct estack_pages { u32 offs; u16 size; u16 type; }; #define EPAGERANGE(st) \ [PFN_DOWN(CEA_ESTACK_OFFS(st)) ... \ PFN_DOWN(CEA_ESTACK_OFFS(st) + CEA_ESTACK_SIZE(st) - 1)] = { \ .offs = CEA_ESTACK_OFFS(st), \ .size = CEA_ESTACK_SIZE(st), \ .type = STACK_TYPE_EXCEPTION + ESTACK_ ##st, } /* * Array of exception stack page descriptors. If the stack is larger than * PAGE_SIZE, all pages covering a particular stack will have the same * info. The guard pages including the not mapped DB2 stack are zeroed * out. */ static const struct estack_pages estack_pages[CEA_ESTACK_PAGES] ____cacheline_aligned = { EPAGERANGE(DF), EPAGERANGE(NMI), EPAGERANGE(DB), EPAGERANGE(MCE), EPAGERANGE(VC), EPAGERANGE(VC2), }; static __always_inline bool in_exception_stack(unsigned long *stack, struct stack_info *info) { unsigned long begin, end, stk = (unsigned long)stack; const struct estack_pages *ep; struct pt_regs *regs; unsigned int k; BUILD_BUG_ON(N_EXCEPTION_STACKS != 6); begin = (unsigned long)__this_cpu_read(cea_exception_stacks); /* * Handle the case where stack trace is collected _before_ * cea_exception_stacks had been initialized. */ if (!begin) return false; end = begin + sizeof(struct cea_exception_stacks); /* Bail if @stack is outside the exception stack area. */ if (stk < begin || stk >= end) return false; /* Calc page offset from start of exception stacks */ k = (stk - begin) >> PAGE_SHIFT; /* Lookup the page descriptor */ ep = &estack_pages[k]; /* Guard page? */ if (!ep->size) return false; begin += (unsigned long)ep->offs; end = begin + (unsigned long)ep->size; regs = (struct pt_regs *)end - 1; info->type = ep->type; info->begin = (unsigned long *)begin; info->end = (unsigned long *)end; info->next_sp = (unsigned long *)regs->sp; return true; } static __always_inline bool in_irq_stack(unsigned long *stack, struct stack_info *info) { unsigned long *end = (unsigned long *)this_cpu_read(pcpu_hot.hardirq_stack_ptr); unsigned long *begin; /* * @end points directly to the top most stack entry to avoid a -8 * adjustment in the stack switch hotpath. Adjust it back before * calculating @begin. */ end++; begin = end - (IRQ_STACK_SIZE / sizeof(long)); /* * Due to the switching logic RSP can never be == @end because the * final operation is 'popq %rsp' which means after that RSP points * to the original stack and not to @end. */ if (stack < begin || stack >= end) return false; info->type = STACK_TYPE_IRQ; info->begin = begin; info->end = end; /* * The next stack pointer is stored at the top of the irq stack * before switching to the irq stack. Actual stack entries are all * below that. */ info->next_sp = (unsigned long *)*(end - 1); return true; } bool noinstr get_stack_info_noinstr(unsigned long *stack, struct task_struct *task, struct stack_info *info) { if (in_task_stack(stack, task, info)) return true; if (task != current) return false; if (in_exception_stack(stack, info)) return true; if (in_irq_stack(stack, info)) return true; if (in_entry_stack(stack, info)) return true; return false; } int get_stack_info(unsigned long *stack, struct task_struct *task, struct stack_info *info, unsigned long *visit_mask) { task = task ? : current; if (!stack) goto unknown; if (!get_stack_info_noinstr(stack, task, info)) goto unknown; /* * Make sure we don't iterate through any given stack more than once. * If it comes up a second time then there's something wrong going on: * just break out and report an unknown stack type. */ if (visit_mask) { if (*visit_mask & (1UL << info->type)) { if (task == current) printk_deferred_once(KERN_WARNING "WARNING: stack recursion on stack type %d\n", info->type); goto unknown; } *visit_mask |= 1UL << info->type; } return 0; unknown: info->type = STACK_TYPE_UNKNOWN; return -EINVAL; } |
71 64 2 15 15 3 61 73 118 113 13 92 9 88 4 2 3 3 4 4 4 7 5 108 97 14 46 1 1 45 44 29 5 3 11 9 3 4 4 4 11 9 5 2 1 1 1 12 8 4 7 3 4 35 1 3 31 34 34 4 2 2 46 1 3 1 4 1 3 33 12 2 1 3 6 175 153 6 10 1 5 4 14 31 39 33 7 39 33 2 31 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 | // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 1991, 1992 Linus Torvalds * * Added support for a Unix98-style ptmx device. * -- C. Scott Ananian <cananian@alumni.princeton.edu>, 14-Jan-1998 * */ #include <linux/module.h> #include <linux/errno.h> #include <linux/interrupt.h> #include <linux/tty.h> #include <linux/tty_flip.h> #include <linux/fcntl.h> #include <linux/sched/signal.h> #include <linux/string.h> #include <linux/major.h> #include <linux/mm.h> #include <linux/init.h> #include <linux/device.h> #include <linux/uaccess.h> #include <linux/bitops.h> #include <linux/devpts_fs.h> #include <linux/slab.h> #include <linux/mutex.h> #include <linux/poll.h> #include <linux/mount.h> #include <linux/file.h> #include <linux/ioctl.h> #include <linux/compat.h> #include "tty.h" #undef TTY_DEBUG_HANGUP #ifdef TTY_DEBUG_HANGUP # define tty_debug_hangup(tty, f, args...) tty_debug(tty, f, ##args) #else # define tty_debug_hangup(tty, f, args...) do {} while (0) #endif #ifdef CONFIG_UNIX98_PTYS static struct tty_driver *ptm_driver; static struct tty_driver *pts_driver; static DEFINE_MUTEX(devpts_mutex); #endif static void pty_close(struct tty_struct *tty, struct file *filp) { if (tty->driver->subtype == PTY_TYPE_MASTER) WARN_ON(tty->count > 1); else { if (tty_io_error(tty)) return; if (tty->count > 2) return; } set_bit(TTY_IO_ERROR, &tty->flags); wake_up_interruptible(&tty->read_wait); wake_up_interruptible(&tty->write_wait); spin_lock_irq(&tty->ctrl.lock); tty->ctrl.packet = false; spin_unlock_irq(&tty->ctrl.lock); /* Review - krefs on tty_link ?? */ if (!tty->link) return; set_bit(TTY_OTHER_CLOSED, &tty->link->flags); wake_up_interruptible(&tty->link->read_wait); wake_up_interruptible(&tty->link->write_wait); if (tty->driver->subtype == PTY_TYPE_MASTER) { set_bit(TTY_OTHER_CLOSED, &tty->flags); #ifdef CONFIG_UNIX98_PTYS if (tty->driver == ptm_driver) { mutex_lock(&devpts_mutex); if (tty->link->driver_data) devpts_pty_kill(tty->link->driver_data); mutex_unlock(&devpts_mutex); } #endif tty_vhangup(tty->link); } } /* * The unthrottle routine is called by the line discipline to signal * that it can receive more characters. For PTY's, the TTY_THROTTLED * flag is always set, to force the line discipline to always call the * unthrottle routine when there are fewer than TTY_THRESHOLD_UNTHROTTLE * characters in the queue. This is necessary since each time this * happens, we need to wake up any sleeping processes that could be * (1) trying to send data to the pty, or (2) waiting in wait_until_sent() * for the pty buffer to be drained. */ static void pty_unthrottle(struct tty_struct *tty) { tty_wakeup(tty->link); set_bit(TTY_THROTTLED, &tty->flags); } /** * pty_write - write to a pty * @tty: the tty we write from * @buf: kernel buffer of data * @c: bytes to write * * Our "hardware" write method. Data is coming from the ldisc which * may be in a non sleeping state. We simply throw this at the other * end of the link as if we were an IRQ handler receiving stuff for * the other side of the pty/tty pair. */ static ssize_t pty_write(struct tty_struct *tty, const u8 *buf, size_t c) { struct tty_struct *to = tty->link; if (tty->flow.stopped || !c) return 0; return tty_insert_flip_string_and_push_buffer(to->port, buf, c); } /** * pty_write_room - write space * @tty: tty we are writing from * * Report how many bytes the ldisc can send into the queue for * the other device. */ static unsigned int pty_write_room(struct tty_struct *tty) { if (tty->flow.stopped) return 0; return tty_buffer_space_avail(tty->link->port); } /* Set the lock flag on a pty */ static int pty_set_lock(struct tty_struct *tty, int __user *arg) { int val; if (get_user(val, arg)) return -EFAULT; if (val) set_bit(TTY_PTY_LOCK, &tty->flags); else clear_bit(TTY_PTY_LOCK, &tty->flags); return 0; } static int pty_get_lock(struct tty_struct *tty, int __user *arg) { int locked = test_bit(TTY_PTY_LOCK, &tty->flags); return put_user(locked, arg); } /* Set the packet mode on a pty */ static int pty_set_pktmode(struct tty_struct *tty, int __user *arg) { int pktmode; if (get_user(pktmode, arg)) return -EFAULT; spin_lock_irq(&tty->ctrl.lock); if (pktmode) { if (!tty->ctrl.packet) { tty->link->ctrl.pktstatus = 0; smp_mb(); tty->ctrl.packet = true; } } else tty->ctrl.packet = false; spin_unlock_irq(&tty->ctrl.lock); return 0; } /* Get the packet mode of a pty */ static int pty_get_pktmode(struct tty_struct *tty, int __user *arg) { int pktmode = tty->ctrl.packet; return put_user(pktmode, arg); } /* Send a signal to the slave */ static int pty_signal(struct tty_struct *tty, int sig) { struct pid *pgrp; if (sig != SIGINT && sig != SIGQUIT && sig != SIGTSTP) return -EINVAL; if (tty->link) { pgrp = tty_get_pgrp(tty->link); if (pgrp) kill_pgrp(pgrp, sig, 1); put_pid(pgrp); } return 0; } static void pty_flush_buffer(struct tty_struct *tty) { struct tty_struct *to = tty->link; if (!to) return; tty_buffer_flush(to, NULL); if (to->ctrl.packet) { spin_lock_irq(&tty->ctrl.lock); tty->ctrl.pktstatus |= TIOCPKT_FLUSHWRITE; wake_up_interruptible(&to->read_wait); spin_unlock_irq(&tty->ctrl.lock); } } static int pty_open(struct tty_struct *tty, struct file *filp) { if (!tty || !tty->link) return -ENODEV; if (test_bit(TTY_OTHER_CLOSED, &tty->flags)) goto out; if (test_bit(TTY_PTY_LOCK, &tty->link->flags)) goto out; if (tty->driver->subtype == PTY_TYPE_SLAVE && tty->link->count != 1) goto out; clear_bit(TTY_IO_ERROR, &tty->flags); clear_bit(TTY_OTHER_CLOSED, &tty->link->flags); set_bit(TTY_THROTTLED, &tty->flags); return 0; out: set_bit(TTY_IO_ERROR, &tty->flags); return -EIO; } static void pty_set_termios(struct tty_struct *tty, const struct ktermios *old_termios) { /* See if packet mode change of state. */ if (tty->link && tty->link->ctrl.packet) { int extproc = (old_termios->c_lflag & EXTPROC) | L_EXTPROC(tty); int old_flow = ((old_termios->c_iflag & IXON) && (old_termios->c_cc[VSTOP] == '\023') && (old_termios->c_cc[VSTART] == '\021')); int new_flow = (I_IXON(tty) && STOP_CHAR(tty) == '\023' && START_CHAR(tty) == '\021'); if ((old_flow != new_flow) || extproc) { spin_lock_irq(&tty->ctrl.lock); if (old_flow != new_flow) { tty->ctrl.pktstatus &= ~(TIOCPKT_DOSTOP | TIOCPKT_NOSTOP); if (new_flow) tty->ctrl.pktstatus |= TIOCPKT_DOSTOP; else tty->ctrl.pktstatus |= TIOCPKT_NOSTOP; } if (extproc) tty->ctrl.pktstatus |= TIOCPKT_IOCTL; spin_unlock_irq(&tty->ctrl.lock); wake_up_interruptible(&tty->link->read_wait); } } tty->termios.c_cflag &= ~(CSIZE | PARENB); tty->termios.c_cflag |= (CS8 | CREAD); } /** * pty_resize - resize event * @tty: tty being resized * @ws: window size being set. * * Update the termios variables and send the necessary signals to * peform a terminal resize correctly */ static int pty_resize(struct tty_struct *tty, struct winsize *ws) { struct pid *pgrp, *rpgrp; struct tty_struct *pty = tty->link; /* For a PTY we need to lock the tty side */ mutex_lock(&tty->winsize_mutex); if (!memcmp(ws, &tty->winsize, sizeof(*ws))) goto done; /* Signal the foreground process group of both ptys */ pgrp = tty_get_pgrp(tty); rpgrp = tty_get_pgrp(pty); if (pgrp) kill_pgrp(pgrp, SIGWINCH, 1); if (rpgrp != pgrp && rpgrp) kill_pgrp(rpgrp, SIGWINCH, 1); put_pid(pgrp); put_pid(rpgrp); tty->winsize = *ws; pty->winsize = *ws; /* Never used so will go away soon */ done: mutex_unlock(&tty->winsize_mutex); return 0; } /** * pty_start - start() handler * pty_stop - stop() handler * @tty: tty being flow-controlled * * Propagates the TIOCPKT status to the master pty. * * NB: only the master pty can be in packet mode so only the slave * needs start()/stop() handlers */ static void pty_start(struct tty_struct *tty) { unsigned long flags; if (tty->link && tty->link->ctrl.packet) { spin_lock_irqsave(&tty->ctrl.lock, flags); tty->ctrl.pktstatus &= ~TIOCPKT_STOP; tty->ctrl.pktstatus |= TIOCPKT_START; spin_unlock_irqrestore(&tty->ctrl.lock, flags); wake_up_interruptible_poll(&tty->link->read_wait, EPOLLIN); } } static void pty_stop(struct tty_struct *tty) { unsigned long flags; if (tty->link && tty->link->ctrl.packet) { spin_lock_irqsave(&tty->ctrl.lock, flags); tty->ctrl.pktstatus &= ~TIOCPKT_START; tty->ctrl.pktstatus |= TIOCPKT_STOP; spin_unlock_irqrestore(&tty->ctrl.lock, flags); wake_up_interruptible_poll(&tty->link->read_wait, EPOLLIN); } } /** * pty_common_install - set up the pty pair * @driver: the pty driver * @tty: the tty being instantiated * @legacy: true if this is BSD style * * Perform the initial set up for the tty/pty pair. Called from the * tty layer when the port is first opened. * * Locking: the caller must hold the tty_mutex */ static int pty_common_install(struct tty_driver *driver, struct tty_struct *tty, bool legacy) { struct tty_struct *o_tty; struct tty_port *ports[2]; int idx = tty->index; int retval = -ENOMEM; /* Opening the slave first has always returned -EIO */ if (driver->subtype != PTY_TYPE_MASTER) return -EIO; ports[0] = kmalloc(sizeof **ports, GFP_KERNEL); ports[1] = kmalloc(sizeof **ports, GFP_KERNEL); if (!ports[0] || !ports[1]) goto err; if (!try_module_get(driver->other->owner)) { /* This cannot in fact currently happen */ goto err; } o_tty = alloc_tty_struct(driver->other, idx); if (!o_tty) goto err_put_module; tty_set_lock_subclass(o_tty); lockdep_set_subclass(&o_tty->termios_rwsem, TTY_LOCK_SLAVE); if (legacy) { /* We always use new tty termios data so we can do this the easy way .. */ tty_init_termios(tty); tty_init_termios(o_tty); driver->other->ttys[idx] = o_tty; driver->ttys[idx] = tty; } else { memset(&tty->termios_locked, 0, sizeof(tty->termios_locked)); tty->termios = driver->init_termios; memset(&o_tty->termios_locked, 0, sizeof(tty->termios_locked)); o_tty->termios = driver->other->init_termios; } /* * Everything allocated ... set up the o_tty structure. */ tty_driver_kref_get(driver->other); /* Establish the links in both directions */ tty->link = o_tty; o_tty->link = tty; tty_port_init(ports[0]); tty_port_init(ports[1]); tty_buffer_set_limit(ports[0], 8192); tty_buffer_set_limit(ports[1], 8192); o_tty->port = ports[0]; tty->port = ports[1]; o_tty->port->itty = o_tty; tty_buffer_set_lock_subclass(o_tty->port); tty_driver_kref_get(driver); tty->count++; o_tty->count++; return 0; err_put_module: module_put(driver->other->owner); err: kfree(ports[0]); kfree(ports[1]); return retval; } static void pty_cleanup(struct tty_struct *tty) { tty_port_put(tty->port); } /* Traditional BSD devices */ #ifdef CONFIG_LEGACY_PTYS static int pty_install(struct tty_driver *driver, struct tty_struct *tty) { return pty_common_install(driver, tty, true); } static void pty_remove(struct tty_driver *driver, struct tty_struct *tty) { struct tty_struct *pair = tty->link; driver->ttys[tty->index] = NULL; if (pair) pair->driver->ttys[pair->index] = NULL; } static int pty_bsd_ioctl(struct tty_struct *tty, unsigned int cmd, unsigned long arg) { switch (cmd) { case TIOCSPTLCK: /* Set PT Lock (disallow slave open) */ return pty_set_lock(tty, (int __user *) arg); case TIOCGPTLCK: /* Get PT Lock status */ return pty_get_lock(tty, (int __user *)arg); case TIOCPKT: /* Set PT packet mode */ return pty_set_pktmode(tty, (int __user *)arg); case TIOCGPKT: /* Get PT packet mode */ return pty_get_pktmode(tty, (int __user *)arg); case TIOCSIG: /* Send signal to other side of pty */ return pty_signal(tty, (int) arg); case TIOCGPTN: /* TTY returns ENOTTY, but glibc expects EINVAL here */ return -EINVAL; } return -ENOIOCTLCMD; } #ifdef CONFIG_COMPAT static long pty_bsd_compat_ioctl(struct tty_struct *tty, unsigned int cmd, unsigned long arg) { /* * PTY ioctls don't require any special translation between 32-bit and * 64-bit userspace, they are already compatible. */ return pty_bsd_ioctl(tty, cmd, (unsigned long)compat_ptr(arg)); } #else #define pty_bsd_compat_ioctl NULL #endif static int legacy_count = CONFIG_LEGACY_PTY_COUNT; /* * not really modular, but the easiest way to keep compat with existing * bootargs behaviour is to continue using module_param here. */ module_param(legacy_count, int, 0); /* * The master side of a pty can do TIOCSPTLCK and thus * has pty_bsd_ioctl. */ static const struct tty_operations master_pty_ops_bsd = { .install = pty_install, .open = pty_open, .close = pty_close, .write = pty_write, .write_room = pty_write_room, .flush_buffer = pty_flush_buffer, .unthrottle = pty_unthrottle, .ioctl = pty_bsd_ioctl, .compat_ioctl = pty_bsd_compat_ioctl, .cleanup = pty_cleanup, .resize = pty_resize, .remove = pty_remove }; static const struct tty_operations slave_pty_ops_bsd = { .install = pty_install, .open = pty_open, .close = pty_close, .write = pty_write, .write_room = pty_write_room, .flush_buffer = pty_flush_buffer, .unthrottle = pty_unthrottle, .set_termios = pty_set_termios, .cleanup = pty_cleanup, .resize = pty_resize, .start = pty_start, .stop = pty_stop, .remove = pty_remove }; static void __init legacy_pty_init(void) { struct tty_driver *pty_driver, *pty_slave_driver; if (legacy_count <= 0) return; pty_driver = tty_alloc_driver(legacy_count, TTY_DRIVER_RESET_TERMIOS | TTY_DRIVER_REAL_RAW | TTY_DRIVER_DYNAMIC_ALLOC); if (IS_ERR(pty_driver)) panic("Couldn't allocate pty driver"); pty_slave_driver = tty_alloc_driver(legacy_count, TTY_DRIVER_RESET_TERMIOS | TTY_DRIVER_REAL_RAW | TTY_DRIVER_DYNAMIC_ALLOC); if (IS_ERR(pty_slave_driver)) panic("Couldn't allocate pty slave driver"); pty_driver->driver_name = "pty_master"; pty_driver->name = "pty"; pty_driver->major = PTY_MASTER_MAJOR; pty_driver->minor_start = 0; pty_driver->type = TTY_DRIVER_TYPE_PTY; pty_driver->subtype = PTY_TYPE_MASTER; pty_driver->init_termios = tty_std_termios; pty_driver->init_termios.c_iflag = 0; pty_driver->init_termios.c_oflag = 0; pty_driver->init_termios.c_cflag = B38400 | CS8 | CREAD; pty_driver->init_termios.c_lflag = 0; pty_driver->init_termios.c_ispeed = 38400; pty_driver->init_termios.c_ospeed = 38400; pty_driver->other = pty_slave_driver; tty_set_operations(pty_driver, &master_pty_ops_bsd); pty_slave_driver->driver_name = "pty_slave"; pty_slave_driver->name = "ttyp"; pty_slave_driver->major = PTY_SLAVE_MAJOR; pty_slave_driver->minor_start = 0; pty_slave_driver->type = TTY_DRIVER_TYPE_PTY; pty_slave_driver->subtype = PTY_TYPE_SLAVE; pty_slave_driver->init_termios = tty_std_termios; pty_slave_driver->init_termios.c_cflag = B38400 | CS8 | CREAD; pty_slave_driver->init_termios.c_ispeed = 38400; pty_slave_driver->init_termios.c_ospeed = 38400; pty_slave_driver->other = pty_driver; tty_set_operations(pty_slave_driver, &slave_pty_ops_bsd); if (tty_register_driver(pty_driver)) panic("Couldn't register pty driver"); if (tty_register_driver(pty_slave_driver)) panic("Couldn't register pty slave driver"); } #else static inline void legacy_pty_init(void) { } #endif /* Unix98 devices */ #ifdef CONFIG_UNIX98_PTYS static struct cdev ptmx_cdev; /** * ptm_open_peer - open the peer of a pty * @master: the open struct file of the ptmx device node * @tty: the master of the pty being opened * @flags: the flags for open * * Provide a race free way for userspace to open the slave end of a pty * (where they have the master fd and cannot access or trust the mount * namespace /dev/pts was mounted inside). */ int ptm_open_peer(struct file *master, struct tty_struct *tty, int flags) { int fd; struct file *filp; int retval = -EINVAL; struct path path; if (tty->driver != ptm_driver) return -EIO; fd = get_unused_fd_flags(flags); if (fd < 0) { retval = fd; goto err; } /* Compute the slave's path */ path.mnt = devpts_mntget(master, tty->driver_data); if (IS_ERR(path.mnt)) { retval = PTR_ERR(path.mnt); goto err_put; } path.dentry = tty->link->driver_data; filp = dentry_open(&path, flags, current_cred()); mntput(path.mnt); if (IS_ERR(filp)) { retval = PTR_ERR(filp); goto err_put; } fd_install(fd, filp); return fd; err_put: put_unused_fd(fd); err: return retval; } static int pty_unix98_ioctl(struct tty_struct *tty, unsigned int cmd, unsigned long arg) { switch (cmd) { case TIOCSPTLCK: /* Set PT Lock (disallow slave open) */ return pty_set_lock(tty, (int __user *)arg); case TIOCGPTLCK: /* Get PT Lock status */ return pty_get_lock(tty, (int __user *)arg); case TIOCPKT: /* Set PT packet mode */ return pty_set_pktmode(tty, (int __user *)arg); case TIOCGPKT: /* Get PT packet mode */ return pty_get_pktmode(tty, (int __user *)arg); case TIOCGPTN: /* Get PT Number */ return put_user(tty->index, (unsigned int __user *)arg); case TIOCSIG: /* Send signal to other side of pty */ return pty_signal(tty, (int) arg); } return -ENOIOCTLCMD; } #ifdef CONFIG_COMPAT static long pty_unix98_compat_ioctl(struct tty_struct *tty, unsigned int cmd, unsigned long arg) { /* * PTY ioctls don't require any special translation between 32-bit and * 64-bit userspace, they are already compatible. */ return pty_unix98_ioctl(tty, cmd, cmd == TIOCSIG ? arg : (unsigned long)compat_ptr(arg)); } #else #define pty_unix98_compat_ioctl NULL #endif /** * ptm_unix98_lookup - find a pty master * @driver: ptm driver * @file: unused * @idx: tty index * * Look up a pty master device. Called under the tty_mutex for now. * This provides our locking. */ static struct tty_struct *ptm_unix98_lookup(struct tty_driver *driver, struct file *file, int idx) { /* Master must be open via /dev/ptmx */ return ERR_PTR(-EIO); } /** * pts_unix98_lookup - find a pty slave * @driver: pts driver * @file: file pointer to tty * @idx: tty index * * Look up a pty master device. Called under the tty_mutex for now. * This provides our locking for the tty pointer. */ static struct tty_struct *pts_unix98_lookup(struct tty_driver *driver, struct file *file, int idx) { struct tty_struct *tty; mutex_lock(&devpts_mutex); tty = devpts_get_priv(file->f_path.dentry); mutex_unlock(&devpts_mutex); /* Master must be open before slave */ if (!tty) return ERR_PTR(-EIO); return tty; } static int pty_unix98_install(struct tty_driver *driver, struct tty_struct *tty) { return pty_common_install(driver, tty, false); } /* this is called once with whichever end is closed last */ static void pty_unix98_remove(struct tty_driver *driver, struct tty_struct *tty) { struct pts_fs_info *fsi; if (tty->driver->subtype == PTY_TYPE_MASTER) fsi = tty->driver_data; else fsi = tty->link->driver_data; if (fsi) { devpts_kill_index(fsi, tty->index); devpts_release(fsi); } } static void pty_show_fdinfo(struct tty_struct *tty, struct seq_file *m) { seq_printf(m, "tty-index:\t%d\n", tty->index); } static const struct tty_operations ptm_unix98_ops = { .lookup = ptm_unix98_lookup, .install = pty_unix98_install, .remove = pty_unix98_remove, .open = pty_open, .close = pty_close, .write = pty_write, .write_room = pty_write_room, .flush_buffer = pty_flush_buffer, .unthrottle = pty_unthrottle, .ioctl = pty_unix98_ioctl, .compat_ioctl = pty_unix98_compat_ioctl, .resize = pty_resize, .cleanup = pty_cleanup, .show_fdinfo = pty_show_fdinfo, }; static const struct tty_operations pty_unix98_ops = { .lookup = pts_unix98_lookup, .install = pty_unix98_install, .remove = pty_unix98_remove, .open = pty_open, .close = pty_close, .write = pty_write, .write_room = pty_write_room, .flush_buffer = pty_flush_buffer, .unthrottle = pty_unthrottle, .set_termios = pty_set_termios, .start = pty_start, .stop = pty_stop, .cleanup = pty_cleanup, }; /** * ptmx_open - open a unix 98 pty master * @inode: inode of device file * @filp: file pointer to tty * * Allocate a unix98 pty master device from the ptmx driver. * * Locking: tty_mutex protects the init_dev work. tty->count should * protect the rest. * allocated_ptys_lock handles the list of free pty numbers */ static int ptmx_open(struct inode *inode, struct file *filp) { struct pts_fs_info *fsi; struct tty_struct *tty; struct dentry *dentry; int retval; int index; nonseekable_open(inode, filp); /* We refuse fsnotify events on ptmx, since it's a shared resource */ filp->f_mode |= FMODE_NONOTIFY; retval = tty_alloc_file(filp); if (retval) return retval; fsi = devpts_acquire(filp); if (IS_ERR(fsi)) { retval = PTR_ERR(fsi); goto out_free_file; } /* find a device that is not in use. */ mutex_lock(&devpts_mutex); index = devpts_new_index(fsi); mutex_unlock(&devpts_mutex); retval = index; if (index < 0) goto out_put_fsi; mutex_lock(&tty_mutex); tty = tty_init_dev(ptm_driver, index); /* The tty returned here is locked so we can safely drop the mutex */ mutex_unlock(&tty_mutex); retval = PTR_ERR(tty); if (IS_ERR(tty)) goto out; /* * From here on out, the tty is "live", and the index and * fsi will be killed/put by the tty_release() */ set_bit(TTY_PTY_LOCK, &tty->flags); /* LOCK THE SLAVE */ tty->driver_data = fsi; tty_add_file(tty, filp); dentry = devpts_pty_new(fsi, index, tty->link); if (IS_ERR(dentry)) { retval = PTR_ERR(dentry); goto err_release; } tty->link->driver_data = dentry; retval = ptm_driver->ops->open(tty, filp); if (retval) goto err_release; tty_debug_hangup(tty, "opening (count=%d)\n", tty->count); tty_unlock(tty); return 0; err_release: tty_unlock(tty); // This will also put-ref the fsi tty_release(inode, filp); return retval; out: devpts_kill_index(fsi, index); out_put_fsi: devpts_release(fsi); out_free_file: tty_free_file(filp); return retval; } static struct file_operations ptmx_fops __ro_after_init; static void __init unix98_pty_init(void) { ptm_driver = tty_alloc_driver(NR_UNIX98_PTY_MAX, TTY_DRIVER_RESET_TERMIOS | TTY_DRIVER_REAL_RAW | TTY_DRIVER_DYNAMIC_DEV | TTY_DRIVER_DEVPTS_MEM | TTY_DRIVER_DYNAMIC_ALLOC); if (IS_ERR(ptm_driver)) panic("Couldn't allocate Unix98 ptm driver"); pts_driver = tty_alloc_driver(NR_UNIX98_PTY_MAX, TTY_DRIVER_RESET_TERMIOS | TTY_DRIVER_REAL_RAW | TTY_DRIVER_DYNAMIC_DEV | TTY_DRIVER_DEVPTS_MEM | TTY_DRIVER_DYNAMIC_ALLOC); if (IS_ERR(pts_driver)) panic("Couldn't allocate Unix98 pts driver"); ptm_driver->driver_name = "pty_master"; ptm_driver->name = "ptm"; ptm_driver->major = UNIX98_PTY_MASTER_MAJOR; ptm_driver->minor_start = 0; ptm_driver->type = TTY_DRIVER_TYPE_PTY; ptm_driver->subtype = PTY_TYPE_MASTER; ptm_driver->init_termios = tty_std_termios; ptm_driver->init_termios.c_iflag = 0; ptm_driver->init_termios.c_oflag = 0; ptm_driver->init_termios.c_cflag = B38400 | CS8 | CREAD; ptm_driver->init_termios.c_lflag = 0; ptm_driver->init_termios.c_ispeed = 38400; ptm_driver->init_termios.c_ospeed = 38400; ptm_driver->other = pts_driver; tty_set_operations(ptm_driver, &ptm_unix98_ops); pts_driver->driver_name = "pty_slave"; pts_driver->name = "pts"; pts_driver->major = UNIX98_PTY_SLAVE_MAJOR; pts_driver->minor_start = 0; pts_driver->type = TTY_DRIVER_TYPE_PTY; pts_driver->subtype = PTY_TYPE_SLAVE; pts_driver->init_termios = tty_std_termios; pts_driver->init_termios.c_cflag = B38400 | CS8 | CREAD; pts_driver->init_termios.c_ispeed = 38400; pts_driver->init_termios.c_ospeed = 38400; pts_driver->other = ptm_driver; tty_set_operations(pts_driver, &pty_unix98_ops); if (tty_register_driver(ptm_driver)) panic("Couldn't register Unix98 ptm driver"); if (tty_register_driver(pts_driver)) panic("Couldn't register Unix98 pts driver"); /* Now create the /dev/ptmx special device */ tty_default_fops(&ptmx_fops); ptmx_fops.open = ptmx_open; cdev_init(&ptmx_cdev, &ptmx_fops); if (cdev_add(&ptmx_cdev, MKDEV(TTYAUX_MAJOR, 2), 1) || register_chrdev_region(MKDEV(TTYAUX_MAJOR, 2), 1, "/dev/ptmx") < 0) panic("Couldn't register /dev/ptmx driver"); device_create(&tty_class, NULL, MKDEV(TTYAUX_MAJOR, 2), NULL, "ptmx"); } #else static inline void unix98_pty_init(void) { } #endif static int __init pty_init(void) { legacy_pty_init(); unix98_pty_init(); return 0; } device_initcall(pty_init); |
1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Mirics MSi001 silicon tuner driver * * Copyright (C) 2013 Antti Palosaari <crope@iki.fi> * Copyright (C) 2014 Antti Palosaari <crope@iki.fi> */ #include <linux/module.h> #include <linux/gcd.h> #include <media/v4l2-device.h> #include <media/v4l2-ctrls.h> static const struct v4l2_frequency_band bands[] = { { .type = V4L2_TUNER_RF, .index = 0, .capability = V4L2_TUNER_CAP_1HZ | V4L2_TUNER_CAP_FREQ_BANDS, .rangelow = 49000000, .rangehigh = 263000000, }, { .type = V4L2_TUNER_RF, .index = 1, .capability = V4L2_TUNER_CAP_1HZ | V4L2_TUNER_CAP_FREQ_BANDS, .rangelow = 390000000, .rangehigh = 960000000, }, }; struct msi001_dev { struct spi_device *spi; struct v4l2_subdev sd; /* Controls */ struct v4l2_ctrl_handler hdl; struct v4l2_ctrl *bandwidth_auto; struct v4l2_ctrl *bandwidth; struct v4l2_ctrl *lna_gain; struct v4l2_ctrl *mixer_gain; struct v4l2_ctrl *if_gain; unsigned int f_tuner; }; static inline struct msi001_dev *sd_to_msi001_dev(struct v4l2_subdev *sd) { return container_of(sd, struct msi001_dev, sd); } static int msi001_wreg(struct msi001_dev *dev, u32 data) { /* Register format: 4 bits addr + 20 bits value */ return spi_write(dev->spi, &data, 3); }; static int msi001_set_gain(struct msi001_dev *dev, int lna_gain, int mixer_gain, int if_gain) { struct spi_device *spi = dev->spi; int ret; u32 reg; dev_dbg(&spi->dev, "lna=%d mixer=%d if=%d\n", lna_gain, mixer_gain, if_gain); reg = 1 << 0; reg |= (59 - if_gain) << 4; reg |= 0 << 10; reg |= (1 - mixer_gain) << 12; reg |= (1 - lna_gain) << 13; reg |= 4 << 14; reg |= 0 << 17; ret = msi001_wreg(dev, reg); if (ret) goto err; return 0; err: dev_dbg(&spi->dev, "failed %d\n", ret); return ret; }; static int msi001_set_tuner(struct msi001_dev *dev) { struct spi_device *spi = dev->spi; int ret, i; unsigned int uitmp, div_n, k, k_thresh, k_frac, div_lo, f_if1; u32 reg; u64 f_vco; u8 mode, filter_mode; static const struct { u32 rf; u8 mode; u8 div_lo; } band_lut[] = { { 50000000, 0xe1, 16}, /* AM_MODE2, antenna 2 */ {108000000, 0x42, 32}, /* VHF_MODE */ {330000000, 0x44, 16}, /* B3_MODE */ {960000000, 0x48, 4}, /* B45_MODE */ { ~0U, 0x50, 2}, /* BL_MODE */ }; static const struct { u32 freq; u8 filter_mode; } if_freq_lut[] = { { 0, 0x03}, /* Zero IF */ { 450000, 0x02}, /* 450 kHz IF */ {1620000, 0x01}, /* 1.62 MHz IF */ {2048000, 0x00}, /* 2.048 MHz IF */ }; static const struct { u32 freq; u8 val; } bandwidth_lut[] = { { 200000, 0x00}, /* 200 kHz */ { 300000, 0x01}, /* 300 kHz */ { 600000, 0x02}, /* 600 kHz */ {1536000, 0x03}, /* 1.536 MHz */ {5000000, 0x04}, /* 5 MHz */ {6000000, 0x05}, /* 6 MHz */ {7000000, 0x06}, /* 7 MHz */ {8000000, 0x07}, /* 8 MHz */ }; unsigned int f_rf = dev->f_tuner; /* * bandwidth (Hz) * 200000, 300000, 600000, 1536000, 5000000, 6000000, 7000000, 8000000 */ unsigned int bandwidth; /* * intermediate frequency (Hz) * 0, 450000, 1620000, 2048000 */ unsigned int f_if = 0; #define F_REF 24000000 #define DIV_PRE_N 4 #define F_VCO_STEP div_lo dev_dbg(&spi->dev, "f_rf=%d f_if=%d\n", f_rf, f_if); for (i = 0; i < ARRAY_SIZE(band_lut); i++) { if (f_rf <= band_lut[i].rf) { mode = band_lut[i].mode; div_lo = band_lut[i].div_lo; break; } } if (i == ARRAY_SIZE(band_lut)) { ret = -EINVAL; goto err; } /* AM_MODE is upconverted */ if ((mode >> 0) & 0x1) f_if1 = 5 * F_REF; else f_if1 = 0; for (i = 0; i < ARRAY_SIZE(if_freq_lut); i++) { if (f_if == if_freq_lut[i].freq) { filter_mode = if_freq_lut[i].filter_mode; break; } } if (i == ARRAY_SIZE(if_freq_lut)) { ret = -EINVAL; goto err; } /* filters */ bandwidth = dev->bandwidth->val; bandwidth = clamp(bandwidth, 200000U, 8000000U); for (i = 0; i < ARRAY_SIZE(bandwidth_lut); i++) { if (bandwidth <= bandwidth_lut[i].freq) { bandwidth = bandwidth_lut[i].val; break; } } if (i == ARRAY_SIZE(bandwidth_lut)) { ret = -EINVAL; goto err; } dev->bandwidth->val = bandwidth_lut[i].freq; dev_dbg(&spi->dev, "bandwidth selected=%d\n", bandwidth_lut[i].freq); /* * Fractional-N synthesizer * * +---------------------------------------+ * v | * Fref +----+ +-------+ +----+ +------+ +---+ * ------> | PD | --> | VCO | ------> | /4 | --> | /N.F | <-- | K | * +----+ +-------+ +----+ +------+ +---+ * | * | * v * +-------+ Fout * | /Rout | ------> * +-------+ */ /* Calculate PLL integer and fractional control word. */ f_vco = (u64) (f_rf + f_if + f_if1) * div_lo; div_n = div_u64_rem(f_vco, DIV_PRE_N * F_REF, &k); k_thresh = (DIV_PRE_N * F_REF) / F_VCO_STEP; k_frac = div_u64((u64) k * k_thresh, (DIV_PRE_N * F_REF)); /* Find out greatest common divisor and divide to smaller. */ uitmp = gcd(k_thresh, k_frac); k_thresh /= uitmp; k_frac /= uitmp; /* Force divide to reg max. Resolution will be reduced. */ uitmp = DIV_ROUND_UP(k_thresh, 4095); k_thresh = DIV_ROUND_CLOSEST(k_thresh, uitmp); k_frac = DIV_ROUND_CLOSEST(k_frac, uitmp); /* Calculate real RF set. */ uitmp = (unsigned int) F_REF * DIV_PRE_N * div_n; uitmp += (unsigned int) F_REF * DIV_PRE_N * k_frac / k_thresh; uitmp /= div_lo; dev_dbg(&spi->dev, "f_rf=%u:%u f_vco=%llu div_n=%u k_thresh=%u k_frac=%u div_lo=%u\n", f_rf, uitmp, f_vco, div_n, k_thresh, k_frac, div_lo); ret = msi001_wreg(dev, 0x00000e); if (ret) goto err; ret = msi001_wreg(dev, 0x000003); if (ret) goto err; reg = 0 << 0; reg |= mode << 4; reg |= filter_mode << 12; reg |= bandwidth << 14; reg |= 0x02 << 17; reg |= 0x00 << 20; ret = msi001_wreg(dev, reg); if (ret) goto err; reg = 5 << 0; reg |= k_thresh << 4; reg |= 1 << 19; reg |= 1 << 21; ret = msi001_wreg(dev, reg); if (ret) goto err; reg = 2 << 0; reg |= k_frac << 4; reg |= div_n << 16; ret = msi001_wreg(dev, reg); if (ret) goto err; ret = msi001_set_gain(dev, dev->lna_gain->cur.val, dev->mixer_gain->cur.val, dev->if_gain->cur.val); if (ret) goto err; reg = 6 << 0; reg |= 63 << 4; reg |= 4095 << 10; ret = msi001_wreg(dev, reg); if (ret) goto err; return 0; err: dev_dbg(&spi->dev, "failed %d\n", ret); return ret; } static int msi001_standby(struct v4l2_subdev *sd) { struct msi001_dev *dev = sd_to_msi001_dev(sd); return msi001_wreg(dev, 0x000000); } static int msi001_g_tuner(struct v4l2_subdev *sd, struct v4l2_tuner *v) { struct msi001_dev *dev = sd_to_msi001_dev(sd); struct spi_device *spi = dev->spi; dev_dbg(&spi->dev, "index=%d\n", v->index); strscpy(v->name, "Mirics MSi001", sizeof(v->name)); v->type = V4L2_TUNER_RF; v->capability = V4L2_TUNER_CAP_1HZ | V4L2_TUNER_CAP_FREQ_BANDS; v->rangelow = 49000000; v->rangehigh = 960000000; return 0; } static int msi001_s_tuner(struct v4l2_subdev *sd, const struct v4l2_tuner *v) { struct msi001_dev *dev = sd_to_msi001_dev(sd); struct spi_device *spi = dev->spi; dev_dbg(&spi->dev, "index=%d\n", v->index); return 0; } static int msi001_g_frequency(struct v4l2_subdev *sd, struct v4l2_frequency *f) { struct msi001_dev *dev = sd_to_msi001_dev(sd); struct spi_device *spi = dev->spi; dev_dbg(&spi->dev, "tuner=%d\n", f->tuner); f->frequency = dev->f_tuner; return 0; } static int msi001_s_frequency(struct v4l2_subdev *sd, const struct v4l2_frequency *f) { struct msi001_dev *dev = sd_to_msi001_dev(sd); struct spi_device *spi = dev->spi; unsigned int band; dev_dbg(&spi->dev, "tuner=%d type=%d frequency=%u\n", f->tuner, f->type, f->frequency); if (f->frequency < ((bands[0].rangehigh + bands[1].rangelow) / 2)) band = 0; else band = 1; dev->f_tuner = clamp_t(unsigned int, f->frequency, bands[band].rangelow, bands[band].rangehigh); return msi001_set_tuner(dev); } static int msi001_enum_freq_bands(struct v4l2_subdev *sd, struct v4l2_frequency_band *band) { struct msi001_dev *dev = sd_to_msi001_dev(sd); struct spi_device *spi = dev->spi; dev_dbg(&spi->dev, "tuner=%d type=%d index=%d\n", band->tuner, band->type, band->index); if (band->index >= ARRAY_SIZE(bands)) return -EINVAL; band->capability = bands[band->index].capability; band->rangelow = bands[band->index].rangelow; band->rangehigh = bands[band->index].rangehigh; return 0; } static const struct v4l2_subdev_tuner_ops msi001_tuner_ops = { .standby = msi001_standby, .g_tuner = msi001_g_tuner, .s_tuner = msi001_s_tuner, .g_frequency = msi001_g_frequency, .s_frequency = msi001_s_frequency, .enum_freq_bands = msi001_enum_freq_bands, }; static const struct v4l2_subdev_ops msi001_ops = { .tuner = &msi001_tuner_ops, }; static int msi001_s_ctrl(struct v4l2_ctrl *ctrl) { struct msi001_dev *dev = container_of(ctrl->handler, struct msi001_dev, hdl); struct spi_device *spi = dev->spi; int ret; dev_dbg(&spi->dev, "id=%d name=%s val=%d min=%lld max=%lld step=%lld\n", ctrl->id, ctrl->name, ctrl->val, ctrl->minimum, ctrl->maximum, ctrl->step); switch (ctrl->id) { case V4L2_CID_RF_TUNER_BANDWIDTH_AUTO: case V4L2_CID_RF_TUNER_BANDWIDTH: ret = msi001_set_tuner(dev); break; case V4L2_CID_RF_TUNER_LNA_GAIN: ret = msi001_set_gain(dev, dev->lna_gain->val, dev->mixer_gain->cur.val, dev->if_gain->cur.val); break; case V4L2_CID_RF_TUNER_MIXER_GAIN: ret = msi001_set_gain(dev, dev->lna_gain->cur.val, dev->mixer_gain->val, dev->if_gain->cur.val); break; case V4L2_CID_RF_TUNER_IF_GAIN: ret = msi001_set_gain(dev, dev->lna_gain->cur.val, dev->mixer_gain->cur.val, dev->if_gain->val); break; default: dev_dbg(&spi->dev, "unknown control %d\n", ctrl->id); ret = -EINVAL; } return ret; } static const struct v4l2_ctrl_ops msi001_ctrl_ops = { .s_ctrl = msi001_s_ctrl, }; static int msi001_probe(struct spi_device *spi) { struct msi001_dev *dev; int ret; dev_dbg(&spi->dev, "\n"); dev = kzalloc(sizeof(*dev), GFP_KERNEL); if (!dev) { ret = -ENOMEM; goto err; } dev->spi = spi; dev->f_tuner = bands[0].rangelow; v4l2_spi_subdev_init(&dev->sd, spi, &msi001_ops); /* Register controls */ v4l2_ctrl_handler_init(&dev->hdl, 5); dev->bandwidth_auto = v4l2_ctrl_new_std(&dev->hdl, &msi001_ctrl_ops, V4L2_CID_RF_TUNER_BANDWIDTH_AUTO, 0, 1, 1, 1); dev->bandwidth = v4l2_ctrl_new_std(&dev->hdl, &msi001_ctrl_ops, V4L2_CID_RF_TUNER_BANDWIDTH, 200000, 8000000, 1, 200000); if (dev->hdl.error) { ret = dev->hdl.error; dev_err(&spi->dev, "Could not initialize controls\n"); /* control init failed, free handler */ goto err_ctrl_handler_free; } v4l2_ctrl_auto_cluster(2, &dev->bandwidth_auto, 0, false); dev->lna_gain = v4l2_ctrl_new_std(&dev->hdl, &msi001_ctrl_ops, V4L2_CID_RF_TUNER_LNA_GAIN, 0, 1, 1, 1); dev->mixer_gain = v4l2_ctrl_new_std(&dev->hdl, &msi001_ctrl_ops, V4L2_CID_RF_TUNER_MIXER_GAIN, 0, 1, 1, 1); dev->if_gain = v4l2_ctrl_new_std(&dev->hdl, &msi001_ctrl_ops, V4L2_CID_RF_TUNER_IF_GAIN, 0, 59, 1, 0); if (dev->hdl.error) { ret = dev->hdl.error; dev_err(&spi->dev, "Could not initialize controls\n"); /* control init failed, free handler */ goto err_ctrl_handler_free; } dev->sd.ctrl_handler = &dev->hdl; return 0; err_ctrl_handler_free: v4l2_ctrl_handler_free(&dev->hdl); kfree(dev); err: return ret; } static void msi001_remove(struct spi_device *spi) { struct v4l2_subdev *sd = spi_get_drvdata(spi); struct msi001_dev *dev = sd_to_msi001_dev(sd); dev_dbg(&spi->dev, "\n"); /* * Registered by v4l2_spi_new_subdev() from master driver, but we must * unregister it from here. Weird. */ v4l2_device_unregister_subdev(&dev->sd); v4l2_ctrl_handler_free(&dev->hdl); kfree(dev); } static const struct spi_device_id msi001_id_table[] = { {"msi001", 0}, {} }; MODULE_DEVICE_TABLE(spi, msi001_id_table); static struct spi_driver msi001_driver = { .driver = { .name = "msi001", .suppress_bind_attrs = true, }, .probe = msi001_probe, .remove = msi001_remove, .id_table = msi001_id_table, }; module_spi_driver(msi001_driver); MODULE_AUTHOR("Antti Palosaari <crope@iki.fi>"); MODULE_DESCRIPTION("Mirics MSi001"); MODULE_LICENSE("GPL"); |
12 12 12 2 1 6 5 17 11 6 4 8 8 16 4 1 1 1 1 5 1 18 18 30 30 30 18 1 3 2 1 2 18 18 22 22 5 18 22 21 1 19 1 15 7 36 1 7 2 1 6 22 29 14 6 2 8 9 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 | // SPDX-License-Identifier: GPL-2.0-or-later /* * fs/inotify_user.c - inotify support for userspace * * Authors: * John McCutchan <ttb@tentacle.dhs.org> * Robert Love <rml@novell.com> * * Copyright (C) 2005 John McCutchan * Copyright 2006 Hewlett-Packard Development Company, L.P. * * Copyright (C) 2009 Eric Paris <Red Hat Inc> * inotify was largely rewriten to make use of the fsnotify infrastructure */ #include <linux/file.h> #include <linux/fs.h> /* struct inode */ #include <linux/fsnotify_backend.h> #include <linux/idr.h> #include <linux/init.h> /* fs_initcall */ #include <linux/inotify.h> #include <linux/kernel.h> /* roundup() */ #include <linux/namei.h> /* LOOKUP_FOLLOW */ #include <linux/sched/signal.h> #include <linux/slab.h> /* struct kmem_cache */ #include <linux/syscalls.h> #include <linux/types.h> #include <linux/anon_inodes.h> #include <linux/uaccess.h> #include <linux/poll.h> #include <linux/wait.h> #include <linux/memcontrol.h> #include <linux/security.h> #include "inotify.h" #include "../fdinfo.h" #include <asm/ioctls.h> /* * An inotify watch requires allocating an inotify_inode_mark structure as * well as pinning the watched inode. Doubling the size of a VFS inode * should be more than enough to cover the additional filesystem inode * size increase. */ #define INOTIFY_WATCH_COST (sizeof(struct inotify_inode_mark) + \ 2 * sizeof(struct inode)) /* configurable via /proc/sys/fs/inotify/ */ static int inotify_max_queued_events __read_mostly; struct kmem_cache *inotify_inode_mark_cachep __ro_after_init; #ifdef CONFIG_SYSCTL #include <linux/sysctl.h> static long it_zero = 0; static long it_int_max = INT_MAX; static struct ctl_table inotify_table[] = { { .procname = "max_user_instances", .data = &init_user_ns.ucount_max[UCOUNT_INOTIFY_INSTANCES], .maxlen = sizeof(long), .mode = 0644, .proc_handler = proc_doulongvec_minmax, .extra1 = &it_zero, .extra2 = &it_int_max, }, { .procname = "max_user_watches", .data = &init_user_ns.ucount_max[UCOUNT_INOTIFY_WATCHES], .maxlen = sizeof(long), .mode = 0644, .proc_handler = proc_doulongvec_minmax, .extra1 = &it_zero, .extra2 = &it_int_max, }, { .procname = "max_queued_events", .data = &inotify_max_queued_events, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO }, }; static void __init inotify_sysctls_init(void) { register_sysctl("fs/inotify", inotify_table); } #else #define inotify_sysctls_init() do { } while (0) #endif /* CONFIG_SYSCTL */ static inline __u32 inotify_arg_to_mask(struct inode *inode, u32 arg) { __u32 mask; /* * Everything should receive events when the inode is unmounted. * All directories care about children. */ mask = (FS_UNMOUNT); if (S_ISDIR(inode->i_mode)) mask |= FS_EVENT_ON_CHILD; /* mask off the flags used to open the fd */ mask |= (arg & INOTIFY_USER_MASK); return mask; } #define INOTIFY_MARK_FLAGS \ (FSNOTIFY_MARK_FLAG_EXCL_UNLINK | FSNOTIFY_MARK_FLAG_IN_ONESHOT) static inline unsigned int inotify_arg_to_flags(u32 arg) { unsigned int flags = 0; if (arg & IN_EXCL_UNLINK) flags |= FSNOTIFY_MARK_FLAG_EXCL_UNLINK; if (arg & IN_ONESHOT) flags |= FSNOTIFY_MARK_FLAG_IN_ONESHOT; return flags; } static inline u32 inotify_mask_to_arg(__u32 mask) { return mask & (IN_ALL_EVENTS | IN_ISDIR | IN_UNMOUNT | IN_IGNORED | IN_Q_OVERFLOW); } /* inotify userspace file descriptor functions */ static __poll_t inotify_poll(struct file *file, poll_table *wait) { struct fsnotify_group *group = file->private_data; __poll_t ret = 0; poll_wait(file, &group->notification_waitq, wait); spin_lock(&group->notification_lock); if (!fsnotify_notify_queue_is_empty(group)) ret = EPOLLIN | EPOLLRDNORM; spin_unlock(&group->notification_lock); return ret; } static int round_event_name_len(struct fsnotify_event *fsn_event) { struct inotify_event_info *event; event = INOTIFY_E(fsn_event); if (!event->name_len) return 0; return roundup(event->name_len + 1, sizeof(struct inotify_event)); } /* * Get an inotify_kernel_event if one exists and is small * enough to fit in "count". Return an error pointer if * not large enough. * * Called with the group->notification_lock held. */ static struct fsnotify_event *get_one_event(struct fsnotify_group *group, size_t count) { size_t event_size = sizeof(struct inotify_event); struct fsnotify_event *event; event = fsnotify_peek_first_event(group); if (!event) return NULL; pr_debug("%s: group=%p event=%p\n", __func__, group, event); event_size += round_event_name_len(event); if (event_size > count) return ERR_PTR(-EINVAL); /* held the notification_lock the whole time, so this is the * same event we peeked above */ fsnotify_remove_first_event(group); return event; } /* * Copy an event to user space, returning how much we copied. * * We already checked that the event size is smaller than the * buffer we had in "get_one_event()" above. */ static ssize_t copy_event_to_user(struct fsnotify_group *group, struct fsnotify_event *fsn_event, char __user *buf) { struct inotify_event inotify_event; struct inotify_event_info *event; size_t event_size = sizeof(struct inotify_event); size_t name_len; size_t pad_name_len; pr_debug("%s: group=%p event=%p\n", __func__, group, fsn_event); event = INOTIFY_E(fsn_event); name_len = event->name_len; /* * round up name length so it is a multiple of event_size * plus an extra byte for the terminating '\0'. */ pad_name_len = round_event_name_len(fsn_event); inotify_event.len = pad_name_len; inotify_event.mask = inotify_mask_to_arg(event->mask); inotify_event.wd = event->wd; inotify_event.cookie = event->sync_cookie; /* send the main event */ if (copy_to_user(buf, &inotify_event, event_size)) return -EFAULT; buf += event_size; /* * fsnotify only stores the pathname, so here we have to send the pathname * and then pad that pathname out to a multiple of sizeof(inotify_event) * with zeros. */ if (pad_name_len) { /* copy the path name */ if (copy_to_user(buf, event->name, name_len)) return -EFAULT; buf += name_len; /* fill userspace with 0's */ if (clear_user(buf, pad_name_len - name_len)) return -EFAULT; event_size += pad_name_len; } return event_size; } static ssize_t inotify_read(struct file *file, char __user *buf, size_t count, loff_t *pos) { struct fsnotify_group *group; struct fsnotify_event *kevent; char __user *start; int ret; DEFINE_WAIT_FUNC(wait, woken_wake_function); start = buf; group = file->private_data; add_wait_queue(&group->notification_waitq, &wait); while (1) { spin_lock(&group->notification_lock); kevent = get_one_event(group, count); spin_unlock(&group->notification_lock); pr_debug("%s: group=%p kevent=%p\n", __func__, group, kevent); if (kevent) { ret = PTR_ERR(kevent); if (IS_ERR(kevent)) break; ret = copy_event_to_user(group, kevent, buf); fsnotify_destroy_event(group, kevent); if (ret < 0) break; buf += ret; count -= ret; continue; } ret = -EAGAIN; if (file->f_flags & O_NONBLOCK) break; ret = -ERESTARTSYS; if (signal_pending(current)) break; if (start != buf) break; wait_woken(&wait, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); } remove_wait_queue(&group->notification_waitq, &wait); if (start != buf && ret != -EFAULT) ret = buf - start; return ret; } static int inotify_release(struct inode *ignored, struct file *file) { struct fsnotify_group *group = file->private_data; pr_debug("%s: group=%p\n", __func__, group); /* free this group, matching get was inotify_init->fsnotify_obtain_group */ fsnotify_destroy_group(group); return 0; } static long inotify_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct fsnotify_group *group; struct fsnotify_event *fsn_event; void __user *p; int ret = -ENOTTY; size_t send_len = 0; group = file->private_data; p = (void __user *) arg; pr_debug("%s: group=%p cmd=%u\n", __func__, group, cmd); switch (cmd) { case FIONREAD: spin_lock(&group->notification_lock); list_for_each_entry(fsn_event, &group->notification_list, list) { send_len += sizeof(struct inotify_event); send_len += round_event_name_len(fsn_event); } spin_unlock(&group->notification_lock); ret = put_user(send_len, (int __user *) p); break; #ifdef CONFIG_CHECKPOINT_RESTORE case INOTIFY_IOC_SETNEXTWD: ret = -EINVAL; if (arg >= 1 && arg <= INT_MAX) { struct inotify_group_private_data *data; data = &group->inotify_data; spin_lock(&data->idr_lock); idr_set_cursor(&data->idr, (unsigned int)arg); spin_unlock(&data->idr_lock); ret = 0; } break; #endif /* CONFIG_CHECKPOINT_RESTORE */ } return ret; } static const struct file_operations inotify_fops = { .show_fdinfo = inotify_show_fdinfo, .poll = inotify_poll, .read = inotify_read, .fasync = fsnotify_fasync, .release = inotify_release, .unlocked_ioctl = inotify_ioctl, .compat_ioctl = inotify_ioctl, .llseek = noop_llseek, }; /* * find_inode - resolve a user-given path to a specific inode */ static int inotify_find_inode(const char __user *dirname, struct path *path, unsigned int flags, __u64 mask) { int error; error = user_path_at(AT_FDCWD, dirname, flags, path); if (error) return error; /* you can only watch an inode if you have read permissions on it */ error = path_permission(path, MAY_READ); if (error) { path_put(path); return error; } error = security_path_notify(path, mask, FSNOTIFY_OBJ_TYPE_INODE); if (error) path_put(path); return error; } static int inotify_add_to_idr(struct idr *idr, spinlock_t *idr_lock, struct inotify_inode_mark *i_mark) { int ret; idr_preload(GFP_KERNEL); spin_lock(idr_lock); ret = idr_alloc_cyclic(idr, i_mark, 1, 0, GFP_NOWAIT); if (ret >= 0) { /* we added the mark to the idr, take a reference */ i_mark->wd = ret; fsnotify_get_mark(&i_mark->fsn_mark); } spin_unlock(idr_lock); idr_preload_end(); return ret < 0 ? ret : 0; } static struct inotify_inode_mark *inotify_idr_find_locked(struct fsnotify_group *group, int wd) { struct idr *idr = &group->inotify_data.idr; spinlock_t *idr_lock = &group->inotify_data.idr_lock; struct inotify_inode_mark *i_mark; assert_spin_locked(idr_lock); i_mark = idr_find(idr, wd); if (i_mark) { struct fsnotify_mark *fsn_mark = &i_mark->fsn_mark; fsnotify_get_mark(fsn_mark); /* One ref for being in the idr, one ref we just took */ BUG_ON(refcount_read(&fsn_mark->refcnt) < 2); } return i_mark; } static struct inotify_inode_mark *inotify_idr_find(struct fsnotify_group *group, int wd) { struct inotify_inode_mark *i_mark; spinlock_t *idr_lock = &group->inotify_data.idr_lock; spin_lock(idr_lock); i_mark = inotify_idr_find_locked(group, wd); spin_unlock(idr_lock); return i_mark; } /* * Remove the mark from the idr (if present) and drop the reference * on the mark because it was in the idr. */ static void inotify_remove_from_idr(struct fsnotify_group *group, struct inotify_inode_mark *i_mark) { struct idr *idr = &group->inotify_data.idr; spinlock_t *idr_lock = &group->inotify_data.idr_lock; struct inotify_inode_mark *found_i_mark = NULL; int wd; spin_lock(idr_lock); wd = i_mark->wd; /* * does this i_mark think it is in the idr? we shouldn't get called * if it wasn't.... */ if (wd == -1) { WARN_ONCE(1, "%s: i_mark=%p i_mark->wd=%d i_mark->group=%p\n", __func__, i_mark, i_mark->wd, i_mark->fsn_mark.group); goto out; } /* Lets look in the idr to see if we find it */ found_i_mark = inotify_idr_find_locked(group, wd); if (unlikely(!found_i_mark)) { WARN_ONCE(1, "%s: i_mark=%p i_mark->wd=%d i_mark->group=%p\n", __func__, i_mark, i_mark->wd, i_mark->fsn_mark.group); goto out; } /* * We found an mark in the idr at the right wd, but it's * not the mark we were told to remove. eparis seriously * fucked up somewhere. */ if (unlikely(found_i_mark != i_mark)) { WARN_ONCE(1, "%s: i_mark=%p i_mark->wd=%d i_mark->group=%p " "found_i_mark=%p found_i_mark->wd=%d " "found_i_mark->group=%p\n", __func__, i_mark, i_mark->wd, i_mark->fsn_mark.group, found_i_mark, found_i_mark->wd, found_i_mark->fsn_mark.group); goto out; } /* * One ref for being in the idr * one ref grabbed by inotify_idr_find */ if (unlikely(refcount_read(&i_mark->fsn_mark.refcnt) < 2)) { printk(KERN_ERR "%s: i_mark=%p i_mark->wd=%d i_mark->group=%p\n", __func__, i_mark, i_mark->wd, i_mark->fsn_mark.group); /* we can't really recover with bad ref cnting.. */ BUG(); } idr_remove(idr, wd); /* Removed from the idr, drop that ref. */ fsnotify_put_mark(&i_mark->fsn_mark); out: i_mark->wd = -1; spin_unlock(idr_lock); /* match the ref taken by inotify_idr_find_locked() */ if (found_i_mark) fsnotify_put_mark(&found_i_mark->fsn_mark); } /* * Send IN_IGNORED for this wd, remove this wd from the idr. */ void inotify_ignored_and_remove_idr(struct fsnotify_mark *fsn_mark, struct fsnotify_group *group) { struct inotify_inode_mark *i_mark; /* Queue ignore event for the watch */ inotify_handle_inode_event(fsn_mark, FS_IN_IGNORED, NULL, NULL, NULL, 0); i_mark = container_of(fsn_mark, struct inotify_inode_mark, fsn_mark); /* remove this mark from the idr */ inotify_remove_from_idr(group, i_mark); dec_inotify_watches(group->inotify_data.ucounts); } static int inotify_update_existing_watch(struct fsnotify_group *group, struct inode *inode, u32 arg) { struct fsnotify_mark *fsn_mark; struct inotify_inode_mark *i_mark; __u32 old_mask, new_mask; int replace = !(arg & IN_MASK_ADD); int create = (arg & IN_MASK_CREATE); int ret; fsn_mark = fsnotify_find_inode_mark(inode, group); if (!fsn_mark) return -ENOENT; else if (create) { ret = -EEXIST; goto out; } i_mark = container_of(fsn_mark, struct inotify_inode_mark, fsn_mark); spin_lock(&fsn_mark->lock); old_mask = fsn_mark->mask; if (replace) { fsn_mark->mask = 0; fsn_mark->flags &= ~INOTIFY_MARK_FLAGS; } fsn_mark->mask |= inotify_arg_to_mask(inode, arg); fsn_mark->flags |= inotify_arg_to_flags(arg); new_mask = fsn_mark->mask; spin_unlock(&fsn_mark->lock); if (old_mask != new_mask) { /* more bits in old than in new? */ int dropped = (old_mask & ~new_mask); /* more bits in this fsn_mark than the inode's mask? */ int do_inode = (new_mask & ~inode->i_fsnotify_mask); /* update the inode with this new fsn_mark */ if (dropped || do_inode) fsnotify_recalc_mask(inode->i_fsnotify_marks); } /* return the wd */ ret = i_mark->wd; out: /* match the get from fsnotify_find_mark() */ fsnotify_put_mark(fsn_mark); return ret; } static int inotify_new_watch(struct fsnotify_group *group, struct inode *inode, u32 arg) { struct inotify_inode_mark *tmp_i_mark; int ret; struct idr *idr = &group->inotify_data.idr; spinlock_t *idr_lock = &group->inotify_data.idr_lock; tmp_i_mark = kmem_cache_alloc(inotify_inode_mark_cachep, GFP_KERNEL); if (unlikely(!tmp_i_mark)) return -ENOMEM; fsnotify_init_mark(&tmp_i_mark->fsn_mark, group); tmp_i_mark->fsn_mark.mask = inotify_arg_to_mask(inode, arg); tmp_i_mark->fsn_mark.flags = inotify_arg_to_flags(arg); tmp_i_mark->wd = -1; ret = inotify_add_to_idr(idr, idr_lock, tmp_i_mark); if (ret) goto out_err; /* increment the number of watches the user has */ if (!inc_inotify_watches(group->inotify_data.ucounts)) { inotify_remove_from_idr(group, tmp_i_mark); ret = -ENOSPC; goto out_err; } /* we are on the idr, now get on the inode */ ret = fsnotify_add_inode_mark_locked(&tmp_i_mark->fsn_mark, inode, 0); if (ret) { /* we failed to get on the inode, get off the idr */ inotify_remove_from_idr(group, tmp_i_mark); goto out_err; } /* return the watch descriptor for this new mark */ ret = tmp_i_mark->wd; out_err: /* match the ref from fsnotify_init_mark() */ fsnotify_put_mark(&tmp_i_mark->fsn_mark); return ret; } static int inotify_update_watch(struct fsnotify_group *group, struct inode *inode, u32 arg) { int ret = 0; fsnotify_group_lock(group); /* try to update and existing watch with the new arg */ ret = inotify_update_existing_watch(group, inode, arg); /* no mark present, try to add a new one */ if (ret == -ENOENT) ret = inotify_new_watch(group, inode, arg); fsnotify_group_unlock(group); return ret; } static struct fsnotify_group *inotify_new_group(unsigned int max_events) { struct fsnotify_group *group; struct inotify_event_info *oevent; group = fsnotify_alloc_group(&inotify_fsnotify_ops, FSNOTIFY_GROUP_USER); if (IS_ERR(group)) return group; oevent = kmalloc(sizeof(struct inotify_event_info), GFP_KERNEL_ACCOUNT); if (unlikely(!oevent)) { fsnotify_destroy_group(group); return ERR_PTR(-ENOMEM); } group->overflow_event = &oevent->fse; fsnotify_init_event(group->overflow_event); oevent->mask = FS_Q_OVERFLOW; oevent->wd = -1; oevent->sync_cookie = 0; oevent->name_len = 0; group->max_events = max_events; group->memcg = get_mem_cgroup_from_mm(current->mm); spin_lock_init(&group->inotify_data.idr_lock); idr_init(&group->inotify_data.idr); group->inotify_data.ucounts = inc_ucount(current_user_ns(), current_euid(), UCOUNT_INOTIFY_INSTANCES); if (!group->inotify_data.ucounts) { fsnotify_destroy_group(group); return ERR_PTR(-EMFILE); } return group; } /* inotify syscalls */ static int do_inotify_init(int flags) { struct fsnotify_group *group; int ret; /* Check the IN_* constants for consistency. */ BUILD_BUG_ON(IN_CLOEXEC != O_CLOEXEC); BUILD_BUG_ON(IN_NONBLOCK != O_NONBLOCK); if (flags & ~(IN_CLOEXEC | IN_NONBLOCK)) return -EINVAL; /* fsnotify_obtain_group took a reference to group, we put this when we kill the file in the end */ group = inotify_new_group(inotify_max_queued_events); if (IS_ERR(group)) return PTR_ERR(group); ret = anon_inode_getfd("inotify", &inotify_fops, group, O_RDONLY | flags); if (ret < 0) fsnotify_destroy_group(group); return ret; } SYSCALL_DEFINE1(inotify_init1, int, flags) { return do_inotify_init(flags); } SYSCALL_DEFINE0(inotify_init) { return do_inotify_init(0); } SYSCALL_DEFINE3(inotify_add_watch, int, fd, const char __user *, pathname, u32, mask) { struct fsnotify_group *group; struct inode *inode; struct path path; struct fd f; int ret; unsigned flags = 0; /* * We share a lot of code with fs/dnotify. We also share * the bit layout between inotify's IN_* and the fsnotify * FS_*. This check ensures that only the inotify IN_* * bits get passed in and set in watches/events. */ if (unlikely(mask & ~ALL_INOTIFY_BITS)) return -EINVAL; /* * Require at least one valid bit set in the mask. * Without _something_ set, we would have no events to * watch for. */ if (unlikely(!(mask & ALL_INOTIFY_BITS))) return -EINVAL; f = fdget(fd); if (unlikely(!f.file)) return -EBADF; /* IN_MASK_ADD and IN_MASK_CREATE don't make sense together */ if (unlikely((mask & IN_MASK_ADD) && (mask & IN_MASK_CREATE))) { ret = -EINVAL; goto fput_and_out; } /* verify that this is indeed an inotify instance */ if (unlikely(f.file->f_op != &inotify_fops)) { ret = -EINVAL; goto fput_and_out; } if (!(mask & IN_DONT_FOLLOW)) flags |= LOOKUP_FOLLOW; if (mask & IN_ONLYDIR) flags |= LOOKUP_DIRECTORY; ret = inotify_find_inode(pathname, &path, flags, (mask & IN_ALL_EVENTS)); if (ret) goto fput_and_out; /* inode held in place by reference to path; group by fget on fd */ inode = path.dentry->d_inode; group = f.file->private_data; /* create/update an inode mark */ ret = inotify_update_watch(group, inode, mask); path_put(&path); fput_and_out: fdput(f); return ret; } SYSCALL_DEFINE2(inotify_rm_watch, int, fd, __s32, wd) { struct fsnotify_group *group; struct inotify_inode_mark *i_mark; struct fd f; int ret = -EINVAL; f = fdget(fd); if (unlikely(!f.file)) return -EBADF; /* verify that this is indeed an inotify instance */ if (unlikely(f.file->f_op != &inotify_fops)) goto out; group = f.file->private_data; i_mark = inotify_idr_find(group, wd); if (unlikely(!i_mark)) goto out; ret = 0; fsnotify_destroy_mark(&i_mark->fsn_mark, group); /* match ref taken by inotify_idr_find */ fsnotify_put_mark(&i_mark->fsn_mark); out: fdput(f); return ret; } /* * inotify_user_setup - Our initialization function. Note that we cannot return * error because we have compiled-in VFS hooks. So an (unlikely) failure here * must result in panic(). */ static int __init inotify_user_setup(void) { unsigned long watches_max; struct sysinfo si; si_meminfo(&si); /* * Allow up to 1% of addressable memory to be allocated for inotify * watches (per user) limited to the range [8192, 1048576]. */ watches_max = (((si.totalram - si.totalhigh) / 100) << PAGE_SHIFT) / INOTIFY_WATCH_COST; watches_max = clamp(watches_max, 8192UL, 1048576UL); BUILD_BUG_ON(IN_ACCESS != FS_ACCESS); BUILD_BUG_ON(IN_MODIFY != FS_MODIFY); BUILD_BUG_ON(IN_ATTRIB != FS_ATTRIB); BUILD_BUG_ON(IN_CLOSE_WRITE != FS_CLOSE_WRITE); BUILD_BUG_ON(IN_CLOSE_NOWRITE != FS_CLOSE_NOWRITE); BUILD_BUG_ON(IN_OPEN != FS_OPEN); BUILD_BUG_ON(IN_MOVED_FROM != FS_MOVED_FROM); BUILD_BUG_ON(IN_MOVED_TO != FS_MOVED_TO); BUILD_BUG_ON(IN_CREATE != FS_CREATE); BUILD_BUG_ON(IN_DELETE != FS_DELETE); BUILD_BUG_ON(IN_DELETE_SELF != FS_DELETE_SELF); BUILD_BUG_ON(IN_MOVE_SELF != FS_MOVE_SELF); BUILD_BUG_ON(IN_UNMOUNT != FS_UNMOUNT); BUILD_BUG_ON(IN_Q_OVERFLOW != FS_Q_OVERFLOW); BUILD_BUG_ON(IN_IGNORED != FS_IN_IGNORED); BUILD_BUG_ON(IN_ISDIR != FS_ISDIR); BUILD_BUG_ON(HWEIGHT32(ALL_INOTIFY_BITS) != 22); inotify_inode_mark_cachep = KMEM_CACHE(inotify_inode_mark, SLAB_PANIC|SLAB_ACCOUNT); inotify_max_queued_events = 16384; init_user_ns.ucount_max[UCOUNT_INOTIFY_INSTANCES] = 128; init_user_ns.ucount_max[UCOUNT_INOTIFY_WATCHES] = watches_max; inotify_sysctls_init(); return 0; } fs_initcall(inotify_user_setup); |
1 43 43 19 7 8 9 2 1 1 1 1 2 2 1 1 1 3 2 1 2 1 3 3 3 3 3 1 2 1 1 1 1 24 2 19 3 19 3 18 4 20 2 17 5 17 5 18 4 11 11 22 12 12 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 | // SPDX-License-Identifier: GPL-2.0-only /* * This file contains vfs inode ops for the 9P2000.L protocol. * * Copyright (C) 2004 by Eric Van Hensbergen <ericvh@gmail.com> * Copyright (C) 2002 by Ron Minnich <rminnich@lanl.gov> */ #include <linux/module.h> #include <linux/errno.h> #include <linux/fs.h> #include <linux/file.h> #include <linux/pagemap.h> #include <linux/stat.h> #include <linux/string.h> #include <linux/namei.h> #include <linux/sched.h> #include <linux/slab.h> #include <linux/xattr.h> #include <linux/posix_acl.h> #include <net/9p/9p.h> #include <net/9p/client.h> #include "v9fs.h" #include "v9fs_vfs.h" #include "fid.h" #include "cache.h" #include "xattr.h" #include "acl.h" static int v9fs_vfs_mknod_dotl(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t omode, dev_t rdev); /** * v9fs_get_fsgid_for_create - Helper function to get the gid for a new object * @dir_inode: The directory inode * * Helper function to get the gid for creating a * new file system object. This checks the S_ISGID to determine the owning * group of the new file system object. */ static kgid_t v9fs_get_fsgid_for_create(struct inode *dir_inode) { BUG_ON(dir_inode == NULL); if (dir_inode->i_mode & S_ISGID) { /* set_gid bit is set.*/ return dir_inode->i_gid; } return current_fsgid(); } struct inode * v9fs_fid_iget_dotl(struct super_block *sb, struct p9_fid *fid, bool new) { int retval; struct inode *inode; struct p9_stat_dotl *st; struct v9fs_session_info *v9ses = sb->s_fs_info; inode = iget_locked(sb, QID2INO(&fid->qid)); if (unlikely(!inode)) return ERR_PTR(-ENOMEM); if (!(inode->i_state & I_NEW)) { if (!new) { goto done; } else { /* deal with race condition in inode number reuse */ p9_debug(P9_DEBUG_ERROR, "WARNING: Inode collision %lx\n", inode->i_ino); iput(inode); remove_inode_hash(inode); inode = iget_locked(sb, QID2INO(&fid->qid)); WARN_ON(!(inode->i_state & I_NEW)); } } /* * initialize the inode with the stat info * FIXME!! we may need support for stale inodes * later. */ st = p9_client_getattr_dotl(fid, P9_STATS_BASIC | P9_STATS_GEN); if (IS_ERR(st)) { retval = PTR_ERR(st); goto error; } retval = v9fs_init_inode(v9ses, inode, &fid->qid, st->st_mode, new_decode_dev(st->st_rdev)); v9fs_stat2inode_dotl(st, inode, 0); kfree(st); if (retval) goto error; v9fs_set_netfs_context(inode); v9fs_cache_inode_get_cookie(inode); retval = v9fs_get_acl(inode, fid); if (retval) goto error; unlock_new_inode(inode); done: return inode; error: iget_failed(inode); return ERR_PTR(retval); } struct dotl_openflag_map { int open_flag; int dotl_flag; }; static int v9fs_mapped_dotl_flags(int flags) { int i; int rflags = 0; struct dotl_openflag_map dotl_oflag_map[] = { { O_CREAT, P9_DOTL_CREATE }, { O_EXCL, P9_DOTL_EXCL }, { O_NOCTTY, P9_DOTL_NOCTTY }, { O_APPEND, P9_DOTL_APPEND }, { O_NONBLOCK, P9_DOTL_NONBLOCK }, { O_DSYNC, P9_DOTL_DSYNC }, { FASYNC, P9_DOTL_FASYNC }, { O_DIRECT, P9_DOTL_DIRECT }, { O_LARGEFILE, P9_DOTL_LARGEFILE }, { O_DIRECTORY, P9_DOTL_DIRECTORY }, { O_NOFOLLOW, P9_DOTL_NOFOLLOW }, { O_NOATIME, P9_DOTL_NOATIME }, { O_CLOEXEC, P9_DOTL_CLOEXEC }, { O_SYNC, P9_DOTL_SYNC}, }; for (i = 0; i < ARRAY_SIZE(dotl_oflag_map); i++) { if (flags & dotl_oflag_map[i].open_flag) rflags |= dotl_oflag_map[i].dotl_flag; } return rflags; } /** * v9fs_open_to_dotl_flags- convert Linux specific open flags to * plan 9 open flag. * @flags: flags to convert */ int v9fs_open_to_dotl_flags(int flags) { int rflags = 0; /* * We have same bits for P9_DOTL_READONLY, P9_DOTL_WRONLY * and P9_DOTL_NOACCESS */ rflags |= flags & O_ACCMODE; rflags |= v9fs_mapped_dotl_flags(flags); return rflags; } /** * v9fs_vfs_create_dotl - VFS hook to create files for 9P2000.L protocol. * @idmap: The user namespace of the mount * @dir: directory inode that is being created * @dentry: dentry that is being deleted * @omode: create permissions * @excl: True if the file must not yet exist * */ static int v9fs_vfs_create_dotl(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t omode, bool excl) { return v9fs_vfs_mknod_dotl(idmap, dir, dentry, omode, 0); } static int v9fs_vfs_atomic_open_dotl(struct inode *dir, struct dentry *dentry, struct file *file, unsigned int flags, umode_t omode) { int err = 0; kgid_t gid; umode_t mode; int p9_omode = v9fs_open_to_dotl_flags(flags); const unsigned char *name = NULL; struct p9_qid qid; struct inode *inode; struct p9_fid *fid = NULL; struct p9_fid *dfid = NULL, *ofid = NULL; struct v9fs_session_info *v9ses; struct posix_acl *pacl = NULL, *dacl = NULL; struct dentry *res = NULL; if (d_in_lookup(dentry)) { res = v9fs_vfs_lookup(dir, dentry, 0); if (IS_ERR(res)) return PTR_ERR(res); if (res) dentry = res; } /* Only creates */ if (!(flags & O_CREAT) || d_really_is_positive(dentry)) return finish_no_open(file, res); v9ses = v9fs_inode2v9ses(dir); name = dentry->d_name.name; p9_debug(P9_DEBUG_VFS, "name:%s flags:0x%x mode:0x%x\n", name, flags, omode); dfid = v9fs_parent_fid(dentry); if (IS_ERR(dfid)) { err = PTR_ERR(dfid); p9_debug(P9_DEBUG_VFS, "fid lookup failed %d\n", err); goto out; } /* clone a fid to use for creation */ ofid = clone_fid(dfid); if (IS_ERR(ofid)) { err = PTR_ERR(ofid); p9_debug(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err); goto out; } gid = v9fs_get_fsgid_for_create(dir); mode = omode; /* Update mode based on ACL value */ err = v9fs_acl_mode(dir, &mode, &dacl, &pacl); if (err) { p9_debug(P9_DEBUG_VFS, "Failed to get acl values in create %d\n", err); goto out; } if ((v9ses->cache & CACHE_WRITEBACK) && (p9_omode & P9_OWRITE)) { p9_omode = (p9_omode & ~P9_OWRITE) | P9_ORDWR; p9_debug(P9_DEBUG_CACHE, "write-only file with writeback enabled, creating w/ O_RDWR\n"); } err = p9_client_create_dotl(ofid, name, p9_omode, mode, gid, &qid); if (err < 0) { p9_debug(P9_DEBUG_VFS, "p9_client_open_dotl failed in create %d\n", err); goto out; } v9fs_invalidate_inode_attr(dir); /* instantiate inode and assign the unopened fid to the dentry */ fid = p9_client_walk(dfid, 1, &name, 1); if (IS_ERR(fid)) { err = PTR_ERR(fid); p9_debug(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err); goto out; } inode = v9fs_fid_iget_dotl(dir->i_sb, fid, true); if (IS_ERR(inode)) { err = PTR_ERR(inode); p9_debug(P9_DEBUG_VFS, "inode creation failed %d\n", err); goto out; } /* Now set the ACL based on the default value */ v9fs_set_create_acl(inode, fid, dacl, pacl); v9fs_fid_add(dentry, &fid); d_instantiate(dentry, inode); /* Since we are opening a file, assign the open fid to the file */ err = finish_open(file, dentry, generic_file_open); if (err) goto out; file->private_data = ofid; #ifdef CONFIG_9P_FSCACHE if (v9ses->cache & CACHE_FSCACHE) { struct v9fs_inode *v9inode = V9FS_I(inode); fscache_use_cookie(v9fs_inode_cookie(v9inode), file->f_mode & FMODE_WRITE); } #endif v9fs_fid_add_modes(ofid, v9ses->flags, v9ses->cache, flags); v9fs_open_fid_add(inode, &ofid); file->f_mode |= FMODE_CREATED; out: p9_fid_put(dfid); p9_fid_put(ofid); p9_fid_put(fid); v9fs_put_acl(dacl, pacl); dput(res); return err; } /** * v9fs_vfs_mkdir_dotl - VFS mkdir hook to create a directory * @idmap: The idmap of the mount * @dir: inode that is being unlinked * @dentry: dentry that is being unlinked * @omode: mode for new directory * */ static int v9fs_vfs_mkdir_dotl(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t omode) { int err; struct p9_fid *fid = NULL, *dfid = NULL; kgid_t gid; const unsigned char *name; umode_t mode; struct inode *inode; struct p9_qid qid; struct posix_acl *dacl = NULL, *pacl = NULL; p9_debug(P9_DEBUG_VFS, "name %pd\n", dentry); omode |= S_IFDIR; if (dir->i_mode & S_ISGID) omode |= S_ISGID; dfid = v9fs_parent_fid(dentry); if (IS_ERR(dfid)) { err = PTR_ERR(dfid); p9_debug(P9_DEBUG_VFS, "fid lookup failed %d\n", err); goto error; } gid = v9fs_get_fsgid_for_create(dir); mode = omode; /* Update mode based on ACL value */ err = v9fs_acl_mode(dir, &mode, &dacl, &pacl); if (err) { p9_debug(P9_DEBUG_VFS, "Failed to get acl values in mkdir %d\n", err); goto error; } name = dentry->d_name.name; err = p9_client_mkdir_dotl(dfid, name, mode, gid, &qid); if (err < 0) goto error; fid = p9_client_walk(dfid, 1, &name, 1); if (IS_ERR(fid)) { err = PTR_ERR(fid); p9_debug(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err); goto error; } /* instantiate inode and assign the unopened fid to the dentry */ inode = v9fs_fid_iget_dotl(dir->i_sb, fid, true); if (IS_ERR(inode)) { err = PTR_ERR(inode); p9_debug(P9_DEBUG_VFS, "inode creation failed %d\n", err); goto error; } v9fs_fid_add(dentry, &fid); v9fs_set_create_acl(inode, fid, dacl, pacl); d_instantiate(dentry, inode); err = 0; inc_nlink(dir); v9fs_invalidate_inode_attr(dir); error: p9_fid_put(fid); v9fs_put_acl(dacl, pacl); p9_fid_put(dfid); return err; } static int v9fs_vfs_getattr_dotl(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int flags) { struct dentry *dentry = path->dentry; struct v9fs_session_info *v9ses; struct p9_fid *fid; struct inode *inode = d_inode(dentry); struct p9_stat_dotl *st; p9_debug(P9_DEBUG_VFS, "dentry: %p\n", dentry); v9ses = v9fs_dentry2v9ses(dentry); if (v9ses->cache & (CACHE_META|CACHE_LOOSE)) { generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat); return 0; } else if (v9ses->cache) { if (S_ISREG(inode->i_mode)) { int retval = filemap_fdatawrite(inode->i_mapping); if (retval) p9_debug(P9_DEBUG_ERROR, "flushing writeback during getattr returned %d\n", retval); } } fid = v9fs_fid_lookup(dentry); if (IS_ERR(fid)) return PTR_ERR(fid); /* Ask for all the fields in stat structure. Server will return * whatever it supports */ st = p9_client_getattr_dotl(fid, P9_STATS_ALL); p9_fid_put(fid); if (IS_ERR(st)) return PTR_ERR(st); v9fs_stat2inode_dotl(st, d_inode(dentry), 0); generic_fillattr(&nop_mnt_idmap, request_mask, d_inode(dentry), stat); /* Change block size to what the server returned */ stat->blksize = st->st_blksize; kfree(st); return 0; } /* * Attribute flags. */ #define P9_ATTR_MODE (1 << 0) #define P9_ATTR_UID (1 << 1) #define P9_ATTR_GID (1 << 2) #define P9_ATTR_SIZE (1 << 3) #define P9_ATTR_ATIME (1 << 4) #define P9_ATTR_MTIME (1 << 5) #define P9_ATTR_CTIME (1 << 6) #define P9_ATTR_ATIME_SET (1 << 7) #define P9_ATTR_MTIME_SET (1 << 8) struct dotl_iattr_map { int iattr_valid; int p9_iattr_valid; }; static int v9fs_mapped_iattr_valid(int iattr_valid) { int i; int p9_iattr_valid = 0; struct dotl_iattr_map dotl_iattr_map[] = { { ATTR_MODE, P9_ATTR_MODE }, { ATTR_UID, P9_ATTR_UID }, { ATTR_GID, P9_ATTR_GID }, { ATTR_SIZE, P9_ATTR_SIZE }, { ATTR_ATIME, P9_ATTR_ATIME }, { ATTR_MTIME, P9_ATTR_MTIME }, { ATTR_CTIME, P9_ATTR_CTIME }, { ATTR_ATIME_SET, P9_ATTR_ATIME_SET }, { ATTR_MTIME_SET, P9_ATTR_MTIME_SET }, }; for (i = 0; i < ARRAY_SIZE(dotl_iattr_map); i++) { if (iattr_valid & dotl_iattr_map[i].iattr_valid) p9_iattr_valid |= dotl_iattr_map[i].p9_iattr_valid; } return p9_iattr_valid; } /** * v9fs_vfs_setattr_dotl - set file metadata * @idmap: idmap of the mount * @dentry: file whose metadata to set * @iattr: metadata assignment structure * */ int v9fs_vfs_setattr_dotl(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *iattr) { int retval, use_dentry = 0; struct inode *inode = d_inode(dentry); struct v9fs_session_info __maybe_unused *v9ses; struct p9_fid *fid = NULL; struct p9_iattr_dotl p9attr = { .uid = INVALID_UID, .gid = INVALID_GID, }; p9_debug(P9_DEBUG_VFS, "\n"); retval = setattr_prepare(&nop_mnt_idmap, dentry, iattr); if (retval) return retval; v9ses = v9fs_dentry2v9ses(dentry); p9attr.valid = v9fs_mapped_iattr_valid(iattr->ia_valid); if (iattr->ia_valid & ATTR_MODE) p9attr.mode = iattr->ia_mode; if (iattr->ia_valid & ATTR_UID) p9attr.uid = iattr->ia_uid; if (iattr->ia_valid & ATTR_GID) p9attr.gid = iattr->ia_gid; if (iattr->ia_valid & ATTR_SIZE) p9attr.size = iattr->ia_size; if (iattr->ia_valid & ATTR_ATIME_SET) { p9attr.atime_sec = iattr->ia_atime.tv_sec; p9attr.atime_nsec = iattr->ia_atime.tv_nsec; } if (iattr->ia_valid & ATTR_MTIME_SET) { p9attr.mtime_sec = iattr->ia_mtime.tv_sec; p9attr.mtime_nsec = iattr->ia_mtime.tv_nsec; } if (iattr->ia_valid & ATTR_FILE) { fid = iattr->ia_file->private_data; WARN_ON(!fid); } if (!fid) { fid = v9fs_fid_lookup(dentry); use_dentry = 1; } if (IS_ERR(fid)) return PTR_ERR(fid); /* Write all dirty data */ if (S_ISREG(inode->i_mode)) { retval = filemap_fdatawrite(inode->i_mapping); if (retval < 0) p9_debug(P9_DEBUG_ERROR, "Flushing file prior to setattr failed: %d\n", retval); } retval = p9_client_setattr(fid, &p9attr); if (retval < 0) { if (use_dentry) p9_fid_put(fid); return retval; } if ((iattr->ia_valid & ATTR_SIZE) && iattr->ia_size != i_size_read(inode)) { truncate_setsize(inode, iattr->ia_size); netfs_resize_file(netfs_inode(inode), iattr->ia_size, true); #ifdef CONFIG_9P_FSCACHE if (v9ses->cache & CACHE_FSCACHE) fscache_resize_cookie(v9fs_inode_cookie(V9FS_I(inode)), iattr->ia_size); #endif } v9fs_invalidate_inode_attr(inode); setattr_copy(&nop_mnt_idmap, inode, iattr); mark_inode_dirty(inode); if (iattr->ia_valid & ATTR_MODE) { /* We also want to update ACL when we update mode bits */ retval = v9fs_acl_chmod(inode, fid); if (retval < 0) { if (use_dentry) p9_fid_put(fid); return retval; } } if (use_dentry) p9_fid_put(fid); return 0; } /** * v9fs_stat2inode_dotl - populate an inode structure with stat info * @stat: stat structure * @inode: inode to populate * @flags: ctrl flags (e.g. V9FS_STAT2INODE_KEEP_ISIZE) * */ void v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode, unsigned int flags) { umode_t mode; struct v9fs_inode *v9inode = V9FS_I(inode); if ((stat->st_result_mask & P9_STATS_BASIC) == P9_STATS_BASIC) { inode_set_atime(inode, stat->st_atime_sec, stat->st_atime_nsec); inode_set_mtime(inode, stat->st_mtime_sec, stat->st_mtime_nsec); inode_set_ctime(inode, stat->st_ctime_sec, stat->st_ctime_nsec); inode->i_uid = stat->st_uid; inode->i_gid = stat->st_gid; set_nlink(inode, stat->st_nlink); mode = stat->st_mode & S_IALLUGO; mode |= inode->i_mode & ~S_IALLUGO; inode->i_mode = mode; v9inode->netfs.remote_i_size = stat->st_size; if (!(flags & V9FS_STAT2INODE_KEEP_ISIZE)) v9fs_i_size_write(inode, stat->st_size); inode->i_blocks = stat->st_blocks; } else { if (stat->st_result_mask & P9_STATS_ATIME) { inode_set_atime(inode, stat->st_atime_sec, stat->st_atime_nsec); } if (stat->st_result_mask & P9_STATS_MTIME) { inode_set_mtime(inode, stat->st_mtime_sec, stat->st_mtime_nsec); } if (stat->st_result_mask & P9_STATS_CTIME) { inode_set_ctime(inode, stat->st_ctime_sec, stat->st_ctime_nsec); } if (stat->st_result_mask & P9_STATS_UID) inode->i_uid = stat->st_uid; if (stat->st_result_mask & P9_STATS_GID) inode->i_gid = stat->st_gid; if (stat->st_result_mask & P9_STATS_NLINK) set_nlink(inode, stat->st_nlink); if (stat->st_result_mask & P9_STATS_MODE) { mode = stat->st_mode & S_IALLUGO; mode |= inode->i_mode & ~S_IALLUGO; inode->i_mode = mode; } if (!(flags & V9FS_STAT2INODE_KEEP_ISIZE) && stat->st_result_mask & P9_STATS_SIZE) { v9inode->netfs.remote_i_size = stat->st_size; v9fs_i_size_write(inode, stat->st_size); } if (stat->st_result_mask & P9_STATS_BLOCKS) inode->i_blocks = stat->st_blocks; } if (stat->st_result_mask & P9_STATS_GEN) inode->i_generation = stat->st_gen; /* Currently we don't support P9_STATS_BTIME and P9_STATS_DATA_VERSION * because the inode structure does not have fields for them. */ v9inode->cache_validity &= ~V9FS_INO_INVALID_ATTR; } static int v9fs_vfs_symlink_dotl(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, const char *symname) { int err; kgid_t gid; const unsigned char *name; struct p9_qid qid; struct p9_fid *dfid; struct p9_fid *fid = NULL; name = dentry->d_name.name; p9_debug(P9_DEBUG_VFS, "%lu,%s,%s\n", dir->i_ino, name, symname); dfid = v9fs_parent_fid(dentry); if (IS_ERR(dfid)) { err = PTR_ERR(dfid); p9_debug(P9_DEBUG_VFS, "fid lookup failed %d\n", err); return err; } gid = v9fs_get_fsgid_for_create(dir); /* Server doesn't alter fid on TSYMLINK. Hence no need to clone it. */ err = p9_client_symlink(dfid, name, symname, gid, &qid); if (err < 0) { p9_debug(P9_DEBUG_VFS, "p9_client_symlink failed %d\n", err); goto error; } v9fs_invalidate_inode_attr(dir); error: p9_fid_put(fid); p9_fid_put(dfid); return err; } /** * v9fs_vfs_link_dotl - create a hardlink for dotl * @old_dentry: dentry for file to link to * @dir: inode destination for new link * @dentry: dentry for link * */ static int v9fs_vfs_link_dotl(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) { int err; struct p9_fid *dfid, *oldfid; struct v9fs_session_info *v9ses; p9_debug(P9_DEBUG_VFS, "dir ino: %lu, old_name: %pd, new_name: %pd\n", dir->i_ino, old_dentry, dentry); v9ses = v9fs_inode2v9ses(dir); dfid = v9fs_parent_fid(dentry); if (IS_ERR(dfid)) return PTR_ERR(dfid); oldfid = v9fs_fid_lookup(old_dentry); if (IS_ERR(oldfid)) { p9_fid_put(dfid); return PTR_ERR(oldfid); } err = p9_client_link(dfid, oldfid, dentry->d_name.name); p9_fid_put(dfid); p9_fid_put(oldfid); if (err < 0) { p9_debug(P9_DEBUG_VFS, "p9_client_link failed %d\n", err); return err; } v9fs_invalidate_inode_attr(dir); if (v9ses->cache & (CACHE_META|CACHE_LOOSE)) { /* Get the latest stat info from server. */ struct p9_fid *fid; fid = v9fs_fid_lookup(old_dentry); if (IS_ERR(fid)) return PTR_ERR(fid); v9fs_refresh_inode_dotl(fid, d_inode(old_dentry)); p9_fid_put(fid); } ihold(d_inode(old_dentry)); d_instantiate(dentry, d_inode(old_dentry)); return err; } /** * v9fs_vfs_mknod_dotl - create a special file * @idmap: The idmap of the mount * @dir: inode destination for new link * @dentry: dentry for file * @omode: mode for creation * @rdev: device associated with special file * */ static int v9fs_vfs_mknod_dotl(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t omode, dev_t rdev) { int err; kgid_t gid; const unsigned char *name; umode_t mode; struct p9_fid *fid = NULL, *dfid = NULL; struct inode *inode; struct p9_qid qid; struct posix_acl *dacl = NULL, *pacl = NULL; p9_debug(P9_DEBUG_VFS, " %lu,%pd mode: %x MAJOR: %u MINOR: %u\n", dir->i_ino, dentry, omode, MAJOR(rdev), MINOR(rdev)); dfid = v9fs_parent_fid(dentry); if (IS_ERR(dfid)) { err = PTR_ERR(dfid); p9_debug(P9_DEBUG_VFS, "fid lookup failed %d\n", err); goto error; } gid = v9fs_get_fsgid_for_create(dir); mode = omode; /* Update mode based on ACL value */ err = v9fs_acl_mode(dir, &mode, &dacl, &pacl); if (err) { p9_debug(P9_DEBUG_VFS, "Failed to get acl values in mknod %d\n", err); goto error; } name = dentry->d_name.name; err = p9_client_mknod_dotl(dfid, name, mode, rdev, gid, &qid); if (err < 0) goto error; v9fs_invalidate_inode_attr(dir); fid = p9_client_walk(dfid, 1, &name, 1); if (IS_ERR(fid)) { err = PTR_ERR(fid); p9_debug(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err); goto error; } inode = v9fs_fid_iget_dotl(dir->i_sb, fid, true); if (IS_ERR(inode)) { err = PTR_ERR(inode); p9_debug(P9_DEBUG_VFS, "inode creation failed %d\n", err); goto error; } v9fs_set_create_acl(inode, fid, dacl, pacl); v9fs_fid_add(dentry, &fid); d_instantiate(dentry, inode); err = 0; error: p9_fid_put(fid); v9fs_put_acl(dacl, pacl); p9_fid_put(dfid); return err; } /** * v9fs_vfs_get_link_dotl - follow a symlink path * @dentry: dentry for symlink * @inode: inode for symlink * @done: destructor for return value */ static const char * v9fs_vfs_get_link_dotl(struct dentry *dentry, struct inode *inode, struct delayed_call *done) { struct p9_fid *fid; char *target; int retval; if (!dentry) return ERR_PTR(-ECHILD); p9_debug(P9_DEBUG_VFS, "%pd\n", dentry); fid = v9fs_fid_lookup(dentry); if (IS_ERR(fid)) return ERR_CAST(fid); retval = p9_client_readlink(fid, &target); p9_fid_put(fid); if (retval) return ERR_PTR(retval); set_delayed_call(done, kfree_link, target); return target; } int v9fs_refresh_inode_dotl(struct p9_fid *fid, struct inode *inode) { struct p9_stat_dotl *st; struct v9fs_session_info *v9ses; unsigned int flags; v9ses = v9fs_inode2v9ses(inode); st = p9_client_getattr_dotl(fid, P9_STATS_ALL); if (IS_ERR(st)) return PTR_ERR(st); /* * Don't update inode if the file type is different */ if (inode_wrong_type(inode, st->st_mode)) goto out; /* * We don't want to refresh inode->i_size, * because we may have cached data */ flags = (v9ses->cache & CACHE_LOOSE) ? V9FS_STAT2INODE_KEEP_ISIZE : 0; v9fs_stat2inode_dotl(st, inode, flags); out: kfree(st); return 0; } const struct inode_operations v9fs_dir_inode_operations_dotl = { .create = v9fs_vfs_create_dotl, .atomic_open = v9fs_vfs_atomic_open_dotl, .lookup = v9fs_vfs_lookup, .link = v9fs_vfs_link_dotl, .symlink = v9fs_vfs_symlink_dotl, .unlink = v9fs_vfs_unlink, .mkdir = v9fs_vfs_mkdir_dotl, .rmdir = v9fs_vfs_rmdir, .mknod = v9fs_vfs_mknod_dotl, .rename = v9fs_vfs_rename, .getattr = v9fs_vfs_getattr_dotl, .setattr = v9fs_vfs_setattr_dotl, .listxattr = v9fs_listxattr, .get_inode_acl = v9fs_iop_get_inode_acl, .get_acl = v9fs_iop_get_acl, .set_acl = v9fs_iop_set_acl, }; const struct inode_operations v9fs_file_inode_operations_dotl = { .getattr = v9fs_vfs_getattr_dotl, .setattr = v9fs_vfs_setattr_dotl, .listxattr = v9fs_listxattr, .get_inode_acl = v9fs_iop_get_inode_acl, .get_acl = v9fs_iop_get_acl, .set_acl = v9fs_iop_set_acl, }; const struct inode_operations v9fs_symlink_inode_operations_dotl = { .get_link = v9fs_vfs_get_link_dotl, .getattr = v9fs_vfs_getattr_dotl, .setattr = v9fs_vfs_setattr_dotl, .listxattr = v9fs_listxattr, }; |
678 679 38 641 641 19 4 15 638 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 | // SPDX-License-Identifier: GPL-2.0-only /* * AppArmor security module * * This file contains AppArmor security identifier (secid) manipulation fns * * Copyright 2009-2017 Canonical Ltd. * * AppArmor allocates a unique secid for every label used. If a label * is replaced it receives the secid of the label it is replacing. */ #include <linux/errno.h> #include <linux/err.h> #include <linux/gfp.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/xarray.h> #include "include/cred.h" #include "include/lib.h" #include "include/secid.h" #include "include/label.h" #include "include/policy_ns.h" /* * secids - do not pin labels with a refcount. They rely on the label * properly updating/freeing them */ #define AA_FIRST_SECID 2 static DEFINE_XARRAY_FLAGS(aa_secids, XA_FLAGS_LOCK_IRQ | XA_FLAGS_TRACK_FREE); int apparmor_display_secid_mode; /* * TODO: allow policy to reserve a secid range? * TODO: add secid pinning * TODO: use secid_update in label replace */ /** * aa_secid_update - update a secid mapping to a new label * @secid: secid to update * @label: label the secid will now map to */ void aa_secid_update(u32 secid, struct aa_label *label) { unsigned long flags; xa_lock_irqsave(&aa_secids, flags); __xa_store(&aa_secids, secid, label, 0); xa_unlock_irqrestore(&aa_secids, flags); } /* * see label for inverse aa_label_to_secid */ struct aa_label *aa_secid_to_label(u32 secid) { return xa_load(&aa_secids, secid); } int apparmor_secid_to_secctx(u32 secid, char **secdata, u32 *seclen) { /* TODO: cache secctx and ref count so we don't have to recreate */ struct aa_label *label = aa_secid_to_label(secid); int flags = FLAG_VIEW_SUBNS | FLAG_HIDDEN_UNCONFINED | FLAG_ABS_ROOT; int len; AA_BUG(!seclen); if (!label) return -EINVAL; if (apparmor_display_secid_mode) flags |= FLAG_SHOW_MODE; if (secdata) len = aa_label_asxprint(secdata, root_ns, label, flags, GFP_ATOMIC); else len = aa_label_snxprint(NULL, 0, root_ns, label, flags); if (len < 0) return -ENOMEM; *seclen = len; return 0; } int apparmor_secctx_to_secid(const char *secdata, u32 seclen, u32 *secid) { struct aa_label *label; label = aa_label_strn_parse(&root_ns->unconfined->label, secdata, seclen, GFP_KERNEL, false, false); if (IS_ERR(label)) return PTR_ERR(label); *secid = label->secid; return 0; } void apparmor_release_secctx(char *secdata, u32 seclen) { kfree(secdata); } /** * aa_alloc_secid - allocate a new secid for a profile * @label: the label to allocate a secid for * @gfp: memory allocation flags * * Returns: 0 with @label->secid initialized * <0 returns error with @label->secid set to AA_SECID_INVALID */ int aa_alloc_secid(struct aa_label *label, gfp_t gfp) { unsigned long flags; int ret; xa_lock_irqsave(&aa_secids, flags); ret = __xa_alloc(&aa_secids, &label->secid, label, XA_LIMIT(AA_FIRST_SECID, INT_MAX), gfp); xa_unlock_irqrestore(&aa_secids, flags); if (ret < 0) { label->secid = AA_SECID_INVALID; return ret; } return 0; } /** * aa_free_secid - free a secid * @secid: secid to free */ void aa_free_secid(u32 secid) { unsigned long flags; xa_lock_irqsave(&aa_secids, flags); __xa_erase(&aa_secids, secid); xa_unlock_irqrestore(&aa_secids, flags); } |
2 2 4 4 4 21 4 2 23 180 182 174 8 187 178 183 177 178 179 177 177 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (c) 2016 Mellanox Technologies. All rights reserved. * Copyright (c) 2016 Jiri Pirko <jiri@mellanox.com> */ #include <net/genetlink.h> #define CREATE_TRACE_POINTS #include <trace/events/devlink.h> #include "devl_internal.h" EXPORT_TRACEPOINT_SYMBOL_GPL(devlink_hwmsg); EXPORT_TRACEPOINT_SYMBOL_GPL(devlink_hwerr); EXPORT_TRACEPOINT_SYMBOL_GPL(devlink_trap_report); DEFINE_XARRAY_FLAGS(devlinks, XA_FLAGS_ALLOC); static struct devlink *devlinks_xa_get(unsigned long index) { struct devlink *devlink; rcu_read_lock(); devlink = xa_find(&devlinks, &index, index, DEVLINK_REGISTERED); if (!devlink || !devlink_try_get(devlink)) devlink = NULL; rcu_read_unlock(); return devlink; } /* devlink_rels xarray contains 1:1 relationships between * devlink object and related nested devlink instance. * The xarray index is used to get the nested object from * the nested-in object code. */ static DEFINE_XARRAY_FLAGS(devlink_rels, XA_FLAGS_ALLOC1); #define DEVLINK_REL_IN_USE XA_MARK_0 struct devlink_rel { u32 index; refcount_t refcount; u32 devlink_index; struct { u32 devlink_index; u32 obj_index; devlink_rel_notify_cb_t *notify_cb; devlink_rel_cleanup_cb_t *cleanup_cb; struct delayed_work notify_work; } nested_in; }; static void devlink_rel_free(struct devlink_rel *rel) { xa_erase(&devlink_rels, rel->index); kfree(rel); } static void __devlink_rel_get(struct devlink_rel *rel) { refcount_inc(&rel->refcount); } static void __devlink_rel_put(struct devlink_rel *rel) { if (refcount_dec_and_test(&rel->refcount)) devlink_rel_free(rel); } static void devlink_rel_nested_in_notify_work(struct work_struct *work) { struct devlink_rel *rel = container_of(work, struct devlink_rel, nested_in.notify_work.work); struct devlink *devlink; devlink = devlinks_xa_get(rel->nested_in.devlink_index); if (!devlink) goto rel_put; if (!devl_trylock(devlink)) { devlink_put(devlink); goto reschedule_work; } if (!devl_is_registered(devlink)) { devl_unlock(devlink); devlink_put(devlink); goto rel_put; } if (!xa_get_mark(&devlink_rels, rel->index, DEVLINK_REL_IN_USE)) rel->nested_in.cleanup_cb(devlink, rel->nested_in.obj_index, rel->index); rel->nested_in.notify_cb(devlink, rel->nested_in.obj_index); devl_unlock(devlink); devlink_put(devlink); rel_put: __devlink_rel_put(rel); return; reschedule_work: schedule_delayed_work(&rel->nested_in.notify_work, 1); } static void devlink_rel_nested_in_notify_work_schedule(struct devlink_rel *rel) { __devlink_rel_get(rel); schedule_delayed_work(&rel->nested_in.notify_work, 0); } static struct devlink_rel *devlink_rel_alloc(void) { struct devlink_rel *rel; static u32 next; int err; rel = kzalloc(sizeof(*rel), GFP_KERNEL); if (!rel) return ERR_PTR(-ENOMEM); err = xa_alloc_cyclic(&devlink_rels, &rel->index, rel, xa_limit_32b, &next, GFP_KERNEL); if (err) { kfree(rel); return ERR_PTR(err); } refcount_set(&rel->refcount, 1); INIT_DELAYED_WORK(&rel->nested_in.notify_work, &devlink_rel_nested_in_notify_work); return rel; } static void devlink_rel_put(struct devlink *devlink) { struct devlink_rel *rel = devlink->rel; if (!rel) return; xa_clear_mark(&devlink_rels, rel->index, DEVLINK_REL_IN_USE); devlink_rel_nested_in_notify_work_schedule(rel); __devlink_rel_put(rel); devlink->rel = NULL; } void devlink_rel_nested_in_clear(u32 rel_index) { xa_clear_mark(&devlink_rels, rel_index, DEVLINK_REL_IN_USE); } int devlink_rel_nested_in_add(u32 *rel_index, u32 devlink_index, u32 obj_index, devlink_rel_notify_cb_t *notify_cb, devlink_rel_cleanup_cb_t *cleanup_cb, struct devlink *devlink) { struct devlink_rel *rel = devlink_rel_alloc(); ASSERT_DEVLINK_NOT_REGISTERED(devlink); if (IS_ERR(rel)) return PTR_ERR(rel); rel->devlink_index = devlink->index; rel->nested_in.devlink_index = devlink_index; rel->nested_in.obj_index = obj_index; rel->nested_in.notify_cb = notify_cb; rel->nested_in.cleanup_cb = cleanup_cb; *rel_index = rel->index; xa_set_mark(&devlink_rels, rel->index, DEVLINK_REL_IN_USE); devlink->rel = rel; return 0; } /** * devlink_rel_nested_in_notify - Notify the object this devlink * instance is nested in. * @devlink: devlink * * This is called upon network namespace change of devlink instance. * In case this devlink instance is nested in another devlink object, * a notification of a change of this object should be sent * over netlink. The parent devlink instance lock needs to be * taken during the notification preparation. * However, since the devlink lock of nested instance is held here, * we would end with wrong devlink instance lock ordering and * deadlock. Therefore the work is utilized to avoid that. */ void devlink_rel_nested_in_notify(struct devlink *devlink) { struct devlink_rel *rel = devlink->rel; if (!rel) return; devlink_rel_nested_in_notify_work_schedule(rel); } static struct devlink_rel *devlink_rel_find(unsigned long rel_index) { return xa_find(&devlink_rels, &rel_index, rel_index, DEVLINK_REL_IN_USE); } static struct devlink *devlink_rel_devlink_get(u32 rel_index) { struct devlink_rel *rel; u32 devlink_index; if (!rel_index) return NULL; xa_lock(&devlink_rels); rel = devlink_rel_find(rel_index); if (rel) devlink_index = rel->devlink_index; xa_unlock(&devlink_rels); if (!rel) return NULL; return devlinks_xa_get(devlink_index); } int devlink_rel_devlink_handle_put(struct sk_buff *msg, struct devlink *devlink, u32 rel_index, int attrtype, bool *msg_updated) { struct net *net = devlink_net(devlink); struct devlink *rel_devlink; int err; rel_devlink = devlink_rel_devlink_get(rel_index); if (!rel_devlink) return 0; err = devlink_nl_put_nested_handle(msg, net, rel_devlink, attrtype); devlink_put(rel_devlink); if (!err && msg_updated) *msg_updated = true; return err; } void *devlink_priv(struct devlink *devlink) { return &devlink->priv; } EXPORT_SYMBOL_GPL(devlink_priv); struct devlink *priv_to_devlink(void *priv) { return container_of(priv, struct devlink, priv); } EXPORT_SYMBOL_GPL(priv_to_devlink); struct device *devlink_to_dev(const struct devlink *devlink) { return devlink->dev; } EXPORT_SYMBOL_GPL(devlink_to_dev); struct net *devlink_net(const struct devlink *devlink) { return read_pnet(&devlink->_net); } EXPORT_SYMBOL_GPL(devlink_net); void devl_assert_locked(struct devlink *devlink) { lockdep_assert_held(&devlink->lock); } EXPORT_SYMBOL_GPL(devl_assert_locked); #ifdef CONFIG_LOCKDEP /* For use in conjunction with LOCKDEP only e.g. rcu_dereference_protected() */ bool devl_lock_is_held(struct devlink *devlink) { return lockdep_is_held(&devlink->lock); } EXPORT_SYMBOL_GPL(devl_lock_is_held); #endif void devl_lock(struct devlink *devlink) { mutex_lock(&devlink->lock); } EXPORT_SYMBOL_GPL(devl_lock); int devl_trylock(struct devlink *devlink) { return mutex_trylock(&devlink->lock); } EXPORT_SYMBOL_GPL(devl_trylock); void devl_unlock(struct devlink *devlink) { mutex_unlock(&devlink->lock); } EXPORT_SYMBOL_GPL(devl_unlock); /** * devlink_try_get() - try to obtain a reference on a devlink instance * @devlink: instance to reference * * Obtain a reference on a devlink instance. A reference on a devlink instance * only implies that it's safe to take the instance lock. It does not imply * that the instance is registered, use devl_is_registered() after taking * the instance lock to check registration status. */ struct devlink *__must_check devlink_try_get(struct devlink *devlink) { if (refcount_inc_not_zero(&devlink->refcount)) return devlink; return NULL; } static void devlink_release(struct work_struct *work) { struct devlink *devlink; devlink = container_of(to_rcu_work(work), struct devlink, rwork); mutex_destroy(&devlink->lock); lockdep_unregister_key(&devlink->lock_key); put_device(devlink->dev); kvfree(devlink); } void devlink_put(struct devlink *devlink) { if (refcount_dec_and_test(&devlink->refcount)) queue_rcu_work(system_wq, &devlink->rwork); } struct devlink *devlinks_xa_find_get(struct net *net, unsigned long *indexp) { struct devlink *devlink = NULL; rcu_read_lock(); retry: devlink = xa_find(&devlinks, indexp, ULONG_MAX, DEVLINK_REGISTERED); if (!devlink) goto unlock; if (!devlink_try_get(devlink)) goto next; if (!net_eq(devlink_net(devlink), net)) { devlink_put(devlink); goto next; } unlock: rcu_read_unlock(); return devlink; next: (*indexp)++; goto retry; } /** * devl_register - Register devlink instance * @devlink: devlink */ int devl_register(struct devlink *devlink) { ASSERT_DEVLINK_NOT_REGISTERED(devlink); devl_assert_locked(devlink); xa_set_mark(&devlinks, devlink->index, DEVLINK_REGISTERED); devlink_notify_register(devlink); devlink_rel_nested_in_notify(devlink); return 0; } EXPORT_SYMBOL_GPL(devl_register); void devlink_register(struct devlink *devlink) { devl_lock(devlink); devl_register(devlink); devl_unlock(devlink); } EXPORT_SYMBOL_GPL(devlink_register); /** * devl_unregister - Unregister devlink instance * @devlink: devlink */ void devl_unregister(struct devlink *devlink) { ASSERT_DEVLINK_REGISTERED(devlink); devl_assert_locked(devlink); devlink_notify_unregister(devlink); xa_clear_mark(&devlinks, devlink->index, DEVLINK_REGISTERED); devlink_rel_put(devlink); } EXPORT_SYMBOL_GPL(devl_unregister); void devlink_unregister(struct devlink *devlink) { devl_lock(devlink); devl_unregister(devlink); devl_unlock(devlink); } EXPORT_SYMBOL_GPL(devlink_unregister); /** * devlink_alloc_ns - Allocate new devlink instance resources * in specific namespace * * @ops: ops * @priv_size: size of user private data * @net: net namespace * @dev: parent device * * Allocate new devlink instance resources, including devlink index * and name. */ struct devlink *devlink_alloc_ns(const struct devlink_ops *ops, size_t priv_size, struct net *net, struct device *dev) { struct devlink *devlink; static u32 last_id; int ret; WARN_ON(!ops || !dev); if (!devlink_reload_actions_valid(ops)) return NULL; devlink = kvzalloc(struct_size(devlink, priv, priv_size), GFP_KERNEL); if (!devlink) return NULL; ret = xa_alloc_cyclic(&devlinks, &devlink->index, devlink, xa_limit_31b, &last_id, GFP_KERNEL); if (ret < 0) goto err_xa_alloc; devlink->dev = get_device(dev); devlink->ops = ops; xa_init_flags(&devlink->ports, XA_FLAGS_ALLOC); xa_init_flags(&devlink->params, XA_FLAGS_ALLOC); xa_init_flags(&devlink->snapshot_ids, XA_FLAGS_ALLOC); xa_init_flags(&devlink->nested_rels, XA_FLAGS_ALLOC); write_pnet(&devlink->_net, net); INIT_LIST_HEAD(&devlink->rate_list); INIT_LIST_HEAD(&devlink->linecard_list); INIT_LIST_HEAD(&devlink->sb_list); INIT_LIST_HEAD_RCU(&devlink->dpipe_table_list); INIT_LIST_HEAD(&devlink->resource_list); INIT_LIST_HEAD(&devlink->region_list); INIT_LIST_HEAD(&devlink->reporter_list); INIT_LIST_HEAD(&devlink->trap_list); INIT_LIST_HEAD(&devlink->trap_group_list); INIT_LIST_HEAD(&devlink->trap_policer_list); INIT_RCU_WORK(&devlink->rwork, devlink_release); lockdep_register_key(&devlink->lock_key); mutex_init(&devlink->lock); lockdep_set_class(&devlink->lock, &devlink->lock_key); refcount_set(&devlink->refcount, 1); return devlink; err_xa_alloc: kvfree(devlink); return NULL; } EXPORT_SYMBOL_GPL(devlink_alloc_ns); /** * devlink_free - Free devlink instance resources * * @devlink: devlink */ void devlink_free(struct devlink *devlink) { ASSERT_DEVLINK_NOT_REGISTERED(devlink); WARN_ON(!list_empty(&devlink->trap_policer_list)); WARN_ON(!list_empty(&devlink->trap_group_list)); WARN_ON(!list_empty(&devlink->trap_list)); WARN_ON(!list_empty(&devlink->reporter_list)); WARN_ON(!list_empty(&devlink->region_list)); WARN_ON(!list_empty(&devlink->resource_list)); WARN_ON(!list_empty(&devlink->dpipe_table_list)); WARN_ON(!list_empty(&devlink->sb_list)); WARN_ON(!list_empty(&devlink->rate_list)); WARN_ON(!list_empty(&devlink->linecard_list)); WARN_ON(!xa_empty(&devlink->ports)); xa_destroy(&devlink->nested_rels); xa_destroy(&devlink->snapshot_ids); xa_destroy(&devlink->params); xa_destroy(&devlink->ports); xa_erase(&devlinks, devlink->index); devlink_put(devlink); } EXPORT_SYMBOL_GPL(devlink_free); static void __net_exit devlink_pernet_pre_exit(struct net *net) { struct devlink *devlink; u32 actions_performed; unsigned long index; int err; /* In case network namespace is getting destroyed, reload * all devlink instances from this namespace into init_net. */ devlinks_xa_for_each_registered_get(net, index, devlink) { devl_dev_lock(devlink, true); err = 0; if (devl_is_registered(devlink)) err = devlink_reload(devlink, &init_net, DEVLINK_RELOAD_ACTION_DRIVER_REINIT, DEVLINK_RELOAD_LIMIT_UNSPEC, &actions_performed, NULL); devl_dev_unlock(devlink, true); devlink_put(devlink); if (err && err != -EOPNOTSUPP) pr_warn("Failed to reload devlink instance into init_net\n"); } } static struct pernet_operations devlink_pernet_ops __net_initdata = { .pre_exit = devlink_pernet_pre_exit, }; static struct notifier_block devlink_port_netdevice_nb = { .notifier_call = devlink_port_netdevice_event, }; static int __init devlink_init(void) { int err; err = register_pernet_subsys(&devlink_pernet_ops); if (err) goto out; err = genl_register_family(&devlink_nl_family); if (err) goto out_unreg_pernet_subsys; err = register_netdevice_notifier(&devlink_port_netdevice_nb); if (!err) return 0; genl_unregister_family(&devlink_nl_family); out_unreg_pernet_subsys: unregister_pernet_subsys(&devlink_pernet_ops); out: WARN_ON(err); return err; } subsys_initcall(devlink_init); |
34 20 159 45 124 16 123 21 9 19 19 17 16 9 1 4 1 7 2 2 2 2 17 3 20 2 20 3 17 18 18 2 4 4 18 18 18 18 18 18 18 8 1 1 9 1 2 6 17 1 7 3 8 42 43 7 35 26 1 4 17 8 3 15 3 3 1 9 1 3 5 5 13 2 1 7 1 4 4 1 6 2 4 2 4 22 1 1 8 5 7 21 5 1 1 3 11 2 1 8 9 5 2 3 8 1 3 4 6 3 3 5 1 1 3 1 1 1 1 6 1 3 1 3 1 3 2 1 5 1 18 6 27 1 3 1 25 26 2 1 1 1 14 1 3 8 8 6 2 4 14 1 2 11 1 1 12 2 2 2 6 6 1 1 5 6 10 226 1 2 1 2 2 222 9 10 6 6 13 13 10 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 | /* * Copyright (c) 2005-2006 Intel Corporation. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #include <linux/completion.h> #include <linux/file.h> #include <linux/mutex.h> #include <linux/poll.h> #include <linux/sched.h> #include <linux/idr.h> #include <linux/in.h> #include <linux/in6.h> #include <linux/miscdevice.h> #include <linux/slab.h> #include <linux/sysctl.h> #include <linux/module.h> #include <linux/nsproxy.h> #include <linux/nospec.h> #include <rdma/rdma_user_cm.h> #include <rdma/ib_marshall.h> #include <rdma/rdma_cm.h> #include <rdma/rdma_cm_ib.h> #include <rdma/ib_addr.h> #include <rdma/ib.h> #include <rdma/ib_cm.h> #include <rdma/rdma_netlink.h> #include "core_priv.h" MODULE_AUTHOR("Sean Hefty"); MODULE_DESCRIPTION("RDMA Userspace Connection Manager Access"); MODULE_LICENSE("Dual BSD/GPL"); static unsigned int max_backlog = 1024; static struct ctl_table_header *ucma_ctl_table_hdr; static struct ctl_table ucma_ctl_table[] = { { .procname = "max_backlog", .data = &max_backlog, .maxlen = sizeof max_backlog, .mode = 0644, .proc_handler = proc_dointvec, }, }; struct ucma_file { struct mutex mut; struct file *filp; struct list_head ctx_list; struct list_head event_list; wait_queue_head_t poll_wait; }; struct ucma_context { u32 id; struct completion comp; refcount_t ref; int events_reported; atomic_t backlog; struct ucma_file *file; struct rdma_cm_id *cm_id; struct mutex mutex; u64 uid; struct list_head list; struct list_head mc_list; struct work_struct close_work; }; struct ucma_multicast { struct ucma_context *ctx; u32 id; int events_reported; u64 uid; u8 join_state; struct list_head list; struct sockaddr_storage addr; }; struct ucma_event { struct ucma_context *ctx; struct ucma_context *conn_req_ctx; struct ucma_multicast *mc; struct list_head list; struct rdma_ucm_event_resp resp; }; static DEFINE_XARRAY_ALLOC(ctx_table); static DEFINE_XARRAY_ALLOC(multicast_table); static const struct file_operations ucma_fops; static int ucma_destroy_private_ctx(struct ucma_context *ctx); static inline struct ucma_context *_ucma_find_context(int id, struct ucma_file *file) { struct ucma_context *ctx; ctx = xa_load(&ctx_table, id); if (!ctx) ctx = ERR_PTR(-ENOENT); else if (ctx->file != file) ctx = ERR_PTR(-EINVAL); return ctx; } static struct ucma_context *ucma_get_ctx(struct ucma_file *file, int id) { struct ucma_context *ctx; xa_lock(&ctx_table); ctx = _ucma_find_context(id, file); if (!IS_ERR(ctx)) if (!refcount_inc_not_zero(&ctx->ref)) ctx = ERR_PTR(-ENXIO); xa_unlock(&ctx_table); return ctx; } static void ucma_put_ctx(struct ucma_context *ctx) { if (refcount_dec_and_test(&ctx->ref)) complete(&ctx->comp); } /* * Same as ucm_get_ctx but requires that ->cm_id->device is valid, eg that the * CM_ID is bound. */ static struct ucma_context *ucma_get_ctx_dev(struct ucma_file *file, int id) { struct ucma_context *ctx = ucma_get_ctx(file, id); if (IS_ERR(ctx)) return ctx; if (!ctx->cm_id->device) { ucma_put_ctx(ctx); return ERR_PTR(-EINVAL); } return ctx; } static void ucma_close_id(struct work_struct *work) { struct ucma_context *ctx = container_of(work, struct ucma_context, close_work); /* once all inflight tasks are finished, we close all underlying * resources. The context is still alive till its explicit destryoing * by its creator. This puts back the xarray's reference. */ ucma_put_ctx(ctx); wait_for_completion(&ctx->comp); /* No new events will be generated after destroying the id. */ rdma_destroy_id(ctx->cm_id); /* Reading the cm_id without holding a positive ref is not allowed */ ctx->cm_id = NULL; } static struct ucma_context *ucma_alloc_ctx(struct ucma_file *file) { struct ucma_context *ctx; ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); if (!ctx) return NULL; INIT_WORK(&ctx->close_work, ucma_close_id); init_completion(&ctx->comp); INIT_LIST_HEAD(&ctx->mc_list); /* So list_del() will work if we don't do ucma_finish_ctx() */ INIT_LIST_HEAD(&ctx->list); ctx->file = file; mutex_init(&ctx->mutex); if (xa_alloc(&ctx_table, &ctx->id, NULL, xa_limit_32b, GFP_KERNEL)) { kfree(ctx); return NULL; } return ctx; } static void ucma_set_ctx_cm_id(struct ucma_context *ctx, struct rdma_cm_id *cm_id) { refcount_set(&ctx->ref, 1); ctx->cm_id = cm_id; } static void ucma_finish_ctx(struct ucma_context *ctx) { lockdep_assert_held(&ctx->file->mut); list_add_tail(&ctx->list, &ctx->file->ctx_list); xa_store(&ctx_table, ctx->id, ctx, GFP_KERNEL); } static void ucma_copy_conn_event(struct rdma_ucm_conn_param *dst, struct rdma_conn_param *src) { if (src->private_data_len) memcpy(dst->private_data, src->private_data, src->private_data_len); dst->private_data_len = src->private_data_len; dst->responder_resources = src->responder_resources; dst->initiator_depth = src->initiator_depth; dst->flow_control = src->flow_control; dst->retry_count = src->retry_count; dst->rnr_retry_count = src->rnr_retry_count; dst->srq = src->srq; dst->qp_num = src->qp_num; } static void ucma_copy_ud_event(struct ib_device *device, struct rdma_ucm_ud_param *dst, struct rdma_ud_param *src) { if (src->private_data_len) memcpy(dst->private_data, src->private_data, src->private_data_len); dst->private_data_len = src->private_data_len; ib_copy_ah_attr_to_user(device, &dst->ah_attr, &src->ah_attr); dst->qp_num = src->qp_num; dst->qkey = src->qkey; } static struct ucma_event *ucma_create_uevent(struct ucma_context *ctx, struct rdma_cm_event *event) { struct ucma_event *uevent; uevent = kzalloc(sizeof(*uevent), GFP_KERNEL); if (!uevent) return NULL; uevent->ctx = ctx; switch (event->event) { case RDMA_CM_EVENT_MULTICAST_JOIN: case RDMA_CM_EVENT_MULTICAST_ERROR: uevent->mc = (struct ucma_multicast *) event->param.ud.private_data; uevent->resp.uid = uevent->mc->uid; uevent->resp.id = uevent->mc->id; break; default: uevent->resp.uid = ctx->uid; uevent->resp.id = ctx->id; break; } uevent->resp.event = event->event; uevent->resp.status = event->status; if (ctx->cm_id->qp_type == IB_QPT_UD) ucma_copy_ud_event(ctx->cm_id->device, &uevent->resp.param.ud, &event->param.ud); else ucma_copy_conn_event(&uevent->resp.param.conn, &event->param.conn); uevent->resp.ece.vendor_id = event->ece.vendor_id; uevent->resp.ece.attr_mod = event->ece.attr_mod; return uevent; } static int ucma_connect_event_handler(struct rdma_cm_id *cm_id, struct rdma_cm_event *event) { struct ucma_context *listen_ctx = cm_id->context; struct ucma_context *ctx; struct ucma_event *uevent; if (!atomic_add_unless(&listen_ctx->backlog, -1, 0)) return -ENOMEM; ctx = ucma_alloc_ctx(listen_ctx->file); if (!ctx) goto err_backlog; ucma_set_ctx_cm_id(ctx, cm_id); uevent = ucma_create_uevent(listen_ctx, event); if (!uevent) goto err_alloc; uevent->conn_req_ctx = ctx; uevent->resp.id = ctx->id; ctx->cm_id->context = ctx; mutex_lock(&ctx->file->mut); ucma_finish_ctx(ctx); list_add_tail(&uevent->list, &ctx->file->event_list); mutex_unlock(&ctx->file->mut); wake_up_interruptible(&ctx->file->poll_wait); return 0; err_alloc: ucma_destroy_private_ctx(ctx); err_backlog: atomic_inc(&listen_ctx->backlog); /* Returning error causes the new ID to be destroyed */ return -ENOMEM; } static int ucma_event_handler(struct rdma_cm_id *cm_id, struct rdma_cm_event *event) { struct ucma_event *uevent; struct ucma_context *ctx = cm_id->context; if (event->event == RDMA_CM_EVENT_CONNECT_REQUEST) return ucma_connect_event_handler(cm_id, event); /* * We ignore events for new connections until userspace has set their * context. This can only happen if an error occurs on a new connection * before the user accepts it. This is okay, since the accept will just * fail later. However, we do need to release the underlying HW * resources in case of a device removal event. */ if (ctx->uid) { uevent = ucma_create_uevent(ctx, event); if (!uevent) return 0; mutex_lock(&ctx->file->mut); list_add_tail(&uevent->list, &ctx->file->event_list); mutex_unlock(&ctx->file->mut); wake_up_interruptible(&ctx->file->poll_wait); } if (event->event == RDMA_CM_EVENT_DEVICE_REMOVAL) { xa_lock(&ctx_table); if (xa_load(&ctx_table, ctx->id) == ctx) queue_work(system_unbound_wq, &ctx->close_work); xa_unlock(&ctx_table); } return 0; } static ssize_t ucma_get_event(struct ucma_file *file, const char __user *inbuf, int in_len, int out_len) { struct rdma_ucm_get_event cmd; struct ucma_event *uevent; /* * Old 32 bit user space does not send the 4 byte padding in the * reserved field. We don't care, allow it to keep working. */ if (out_len < sizeof(uevent->resp) - sizeof(uevent->resp.reserved) - sizeof(uevent->resp.ece)) return -ENOSPC; if (copy_from_user(&cmd, inbuf, sizeof(cmd))) return -EFAULT; mutex_lock(&file->mut); while (list_empty(&file->event_list)) { mutex_unlock(&file->mut); if (file->filp->f_flags & O_NONBLOCK) return -EAGAIN; if (wait_event_interruptible(file->poll_wait, !list_empty(&file->event_list))) return -ERESTARTSYS; mutex_lock(&file->mut); } uevent = list_first_entry(&file->event_list, struct ucma_event, list); if (copy_to_user(u64_to_user_ptr(cmd.response), &uevent->resp, min_t(size_t, out_len, sizeof(uevent->resp)))) { mutex_unlock(&file->mut); return -EFAULT; } list_del(&uevent->list); uevent->ctx->events_reported++; if (uevent->mc) uevent->mc->events_reported++; if (uevent->resp.event == RDMA_CM_EVENT_CONNECT_REQUEST) atomic_inc(&uevent->ctx->backlog); mutex_unlock(&file->mut); kfree(uevent); return 0; } static int ucma_get_qp_type(struct rdma_ucm_create_id *cmd, enum ib_qp_type *qp_type) { switch (cmd->ps) { case RDMA_PS_TCP: *qp_type = IB_QPT_RC; return 0; case RDMA_PS_UDP: case RDMA_PS_IPOIB: *qp_type = IB_QPT_UD; return 0; case RDMA_PS_IB: *qp_type = cmd->qp_type; return 0; default: return -EINVAL; } } static ssize_t ucma_create_id(struct ucma_file *file, const char __user *inbuf, int in_len, int out_len) { struct rdma_ucm_create_id cmd; struct rdma_ucm_create_id_resp resp; struct ucma_context *ctx; struct rdma_cm_id *cm_id; enum ib_qp_type qp_type; int ret; if (out_len < sizeof(resp)) return -ENOSPC; if (copy_from_user(&cmd, inbuf, sizeof(cmd))) return -EFAULT; ret = ucma_get_qp_type(&cmd, &qp_type); if (ret) return ret; ctx = ucma_alloc_ctx(file); if (!ctx) return -ENOMEM; ctx->uid = cmd.uid; cm_id = rdma_create_user_id(ucma_event_handler, ctx, cmd.ps, qp_type); if (IS_ERR(cm_id)) { ret = PTR_ERR(cm_id); goto err1; } ucma_set_ctx_cm_id(ctx, cm_id); resp.id = ctx->id; if (copy_to_user(u64_to_user_ptr(cmd.response), &resp, sizeof(resp))) { ret = -EFAULT; goto err1; } mutex_lock(&file->mut); ucma_finish_ctx(ctx); mutex_unlock(&file->mut); return 0; err1: ucma_destroy_private_ctx(ctx); return ret; } static void ucma_cleanup_multicast(struct ucma_context *ctx) { struct ucma_multicast *mc, *tmp; xa_lock(&multicast_table); list_for_each_entry_safe(mc, tmp, &ctx->mc_list, list) { list_del(&mc->list); /* * At this point mc->ctx->ref is 0 so the mc cannot leave the * lock on the reader and this is enough serialization */ __xa_erase(&multicast_table, mc->id); kfree(mc); } xa_unlock(&multicast_table); } static void ucma_cleanup_mc_events(struct ucma_multicast *mc) { struct ucma_event *uevent, *tmp; rdma_lock_handler(mc->ctx->cm_id); mutex_lock(&mc->ctx->file->mut); list_for_each_entry_safe(uevent, tmp, &mc->ctx->file->event_list, list) { if (uevent->mc != mc) continue; list_del(&uevent->list); kfree(uevent); } mutex_unlock(&mc->ctx->file->mut); rdma_unlock_handler(mc->ctx->cm_id); } static int ucma_cleanup_ctx_events(struct ucma_context *ctx) { int events_reported; struct ucma_event *uevent, *tmp; LIST_HEAD(list); /* Cleanup events not yet reported to the user.*/ mutex_lock(&ctx->file->mut); list_for_each_entry_safe(uevent, tmp, &ctx->file->event_list, list) { if (uevent->ctx != ctx) continue; if (uevent->resp.event == RDMA_CM_EVENT_CONNECT_REQUEST && xa_cmpxchg(&ctx_table, uevent->conn_req_ctx->id, uevent->conn_req_ctx, XA_ZERO_ENTRY, GFP_KERNEL) == uevent->conn_req_ctx) { list_move_tail(&uevent->list, &list); continue; } list_del(&uevent->list); kfree(uevent); } list_del(&ctx->list); events_reported = ctx->events_reported; mutex_unlock(&ctx->file->mut); /* * If this was a listening ID then any connections spawned from it that * have not been delivered to userspace are cleaned up too. Must be done * outside any locks. */ list_for_each_entry_safe(uevent, tmp, &list, list) { ucma_destroy_private_ctx(uevent->conn_req_ctx); kfree(uevent); } return events_reported; } /* * When this is called the xarray must have a XA_ZERO_ENTRY in the ctx->id (ie * the ctx is not public to the user). This either because: * - ucma_finish_ctx() hasn't been called * - xa_cmpxchg() succeed to remove the entry (only one thread can succeed) */ static int ucma_destroy_private_ctx(struct ucma_context *ctx) { int events_reported; /* * Destroy the underlying cm_id. New work queuing is prevented now by * the removal from the xarray. Once the work is cancled ref will either * be 0 because the work ran to completion and consumed the ref from the * xarray, or it will be positive because we still have the ref from the * xarray. This can also be 0 in cases where cm_id was never set */ cancel_work_sync(&ctx->close_work); if (refcount_read(&ctx->ref)) ucma_close_id(&ctx->close_work); events_reported = ucma_cleanup_ctx_events(ctx); ucma_cleanup_multicast(ctx); WARN_ON(xa_cmpxchg(&ctx_table, ctx->id, XA_ZERO_ENTRY, NULL, GFP_KERNEL) != NULL); mutex_destroy(&ctx->mutex); kfree(ctx); return events_reported; } static ssize_t ucma_destroy_id(struct ucma_file *file, const char __user *inbuf, int in_len, int out_len) { struct rdma_ucm_destroy_id cmd; struct rdma_ucm_destroy_id_resp resp; struct ucma_context *ctx; int ret = 0; if (out_len < sizeof(resp)) return -ENOSPC; if (copy_from_user(&cmd, inbuf, sizeof(cmd))) return -EFAULT; xa_lock(&ctx_table); ctx = _ucma_find_context(cmd.id, file); if (!IS_ERR(ctx)) { if (__xa_cmpxchg(&ctx_table, ctx->id, ctx, XA_ZERO_ENTRY, GFP_KERNEL) != ctx) ctx = ERR_PTR(-ENOENT); } xa_unlock(&ctx_table); if (IS_ERR(ctx)) return PTR_ERR(ctx); resp.events_reported = ucma_destroy_private_ctx(ctx); if (copy_to_user(u64_to_user_ptr(cmd.response), &resp, sizeof(resp))) ret = -EFAULT; return ret; } static ssize_t ucma_bind_ip(struct ucma_file *file, const char __user *inbuf, int in_len, int out_len) { struct rdma_ucm_bind_ip cmd; struct ucma_context *ctx; int ret; if (copy_from_user(&cmd, inbuf, sizeof(cmd))) return -EFAULT; if (!rdma_addr_size_in6(&cmd.addr)) return -EINVAL; ctx = ucma_get_ctx(file, cmd.id); if (IS_ERR(ctx)) return PTR_ERR(ctx); mutex_lock(&ctx->mutex); ret = rdma_bind_addr(ctx->cm_id, (struct sockaddr *) &cmd.addr); mutex_unlock(&ctx->mutex); ucma_put_ctx(ctx); return ret; } static ssize_t ucma_bind(struct ucma_file *file, const char __user *inbuf, int in_len, int out_len) { struct rdma_ucm_bind cmd; struct ucma_context *ctx; int ret; if (copy_from_user(&cmd, inbuf, sizeof(cmd))) return -EFAULT; if (cmd.reserved || !cmd.addr_size || cmd.addr_size != rdma_addr_size_kss(&cmd.addr)) return -EINVAL; ctx = ucma_get_ctx(file, cmd.id); if (IS_ERR(ctx)) return PTR_ERR(ctx); mutex_lock(&ctx->mutex); ret = rdma_bind_addr(ctx->cm_id, (struct sockaddr *) &cmd.addr); mutex_unlock(&ctx->mutex); ucma_put_ctx(ctx); return ret; } static ssize_t ucma_resolve_ip(struct ucma_file *file, const char __user *inbuf, int in_len, int out_len) { struct rdma_ucm_resolve_ip cmd; struct ucma_context *ctx; int ret; if (copy_from_user(&cmd, inbuf, sizeof(cmd))) return -EFAULT; if ((cmd.src_addr.sin6_family && !rdma_addr_size_in6(&cmd.src_addr)) || !rdma_addr_size_in6(&cmd.dst_addr)) return -EINVAL; ctx = ucma_get_ctx(file, cmd.id); if (IS_ERR(ctx)) return PTR_ERR(ctx); mutex_lock(&ctx->mutex); ret = rdma_resolve_addr(ctx->cm_id, (struct sockaddr *) &cmd.src_addr, (struct sockaddr *) &cmd.dst_addr, cmd.timeout_ms); mutex_unlock(&ctx->mutex); ucma_put_ctx(ctx); return ret; } static ssize_t ucma_resolve_addr(struct ucma_file *file, const char __user *inbuf, int in_len, int out_len) { struct rdma_ucm_resolve_addr cmd; struct ucma_context *ctx; int ret; if (copy_from_user(&cmd, inbuf, sizeof(cmd))) return -EFAULT; if (cmd.reserved || (cmd.src_size && (cmd.src_size != rdma_addr_size_kss(&cmd.src_addr))) || !cmd.dst_size || (cmd.dst_size != rdma_addr_size_kss(&cmd.dst_addr))) return -EINVAL; ctx = ucma_get_ctx(file, cmd.id); if (IS_ERR(ctx)) return PTR_ERR(ctx); mutex_lock(&ctx->mutex); ret = rdma_resolve_addr(ctx->cm_id, (struct sockaddr *) &cmd.src_addr, (struct sockaddr *) &cmd.dst_addr, cmd.timeout_ms); mutex_unlock(&ctx->mutex); ucma_put_ctx(ctx); return ret; } static ssize_t ucma_resolve_route(struct ucma_file *file, const char __user *inbuf, int in_len, int out_len) { struct rdma_ucm_resolve_route cmd; struct ucma_context *ctx; int ret; if (copy_from_user(&cmd, inbuf, sizeof(cmd))) return -EFAULT; ctx = ucma_get_ctx_dev(file, cmd.id); if (IS_ERR(ctx)) return PTR_ERR(ctx); mutex_lock(&ctx->mutex); ret = rdma_resolve_route(ctx->cm_id, cmd.timeout_ms); mutex_unlock(&ctx->mutex); ucma_put_ctx(ctx); return ret; } static void ucma_copy_ib_route(struct rdma_ucm_query_route_resp *resp, struct rdma_route *route) { struct rdma_dev_addr *dev_addr; resp->num_paths = route->num_pri_alt_paths; switch (route->num_pri_alt_paths) { case 0: dev_addr = &route->addr.dev_addr; rdma_addr_get_dgid(dev_addr, (union ib_gid *) &resp->ib_route[0].dgid); rdma_addr_get_sgid(dev_addr, (union ib_gid *) &resp->ib_route[0].sgid); resp->ib_route[0].pkey = cpu_to_be16(ib_addr_get_pkey(dev_addr)); break; case 2: ib_copy_path_rec_to_user(&resp->ib_route[1], &route->path_rec[1]); fallthrough; case 1: ib_copy_path_rec_to_user(&resp->ib_route[0], &route->path_rec[0]); break; default: break; } } static void ucma_copy_iboe_route(struct rdma_ucm_query_route_resp *resp, struct rdma_route *route) { resp->num_paths = route->num_pri_alt_paths; switch (route->num_pri_alt_paths) { case 0: rdma_ip2gid((struct sockaddr *)&route->addr.dst_addr, (union ib_gid *)&resp->ib_route[0].dgid); rdma_ip2gid((struct sockaddr *)&route->addr.src_addr, (union ib_gid *)&resp->ib_route[0].sgid); resp->ib_route[0].pkey = cpu_to_be16(0xffff); break; case 2: ib_copy_path_rec_to_user(&resp->ib_route[1], &route->path_rec[1]); fallthrough; case 1: ib_copy_path_rec_to_user(&resp->ib_route[0], &route->path_rec[0]); break; default: break; } } static void ucma_copy_iw_route(struct rdma_ucm_query_route_resp *resp, struct rdma_route *route) { struct rdma_dev_addr *dev_addr; dev_addr = &route->addr.dev_addr; rdma_addr_get_dgid(dev_addr, (union ib_gid *) &resp->ib_route[0].dgid); rdma_addr_get_sgid(dev_addr, (union ib_gid *) &resp->ib_route[0].sgid); } static ssize_t ucma_query_route(struct ucma_file *file, const char __user *inbuf, int in_len, int out_len) { struct rdma_ucm_query cmd; struct rdma_ucm_query_route_resp resp; struct ucma_context *ctx; struct sockaddr *addr; int ret = 0; if (out_len < offsetof(struct rdma_ucm_query_route_resp, ibdev_index)) return -ENOSPC; if (copy_from_user(&cmd, inbuf, sizeof(cmd))) return -EFAULT; ctx = ucma_get_ctx(file, cmd.id); if (IS_ERR(ctx)) return PTR_ERR(ctx); mutex_lock(&ctx->mutex); memset(&resp, 0, sizeof resp); addr = (struct sockaddr *) &ctx->cm_id->route.addr.src_addr; memcpy(&resp.src_addr, addr, addr->sa_family == AF_INET ? sizeof(struct sockaddr_in) : sizeof(struct sockaddr_in6)); addr = (struct sockaddr *) &ctx->cm_id->route.addr.dst_addr; memcpy(&resp.dst_addr, addr, addr->sa_family == AF_INET ? sizeof(struct sockaddr_in) : sizeof(struct sockaddr_in6)); if (!ctx->cm_id->device) goto out; resp.node_guid = (__force __u64) ctx->cm_id->device->node_guid; resp.ibdev_index = ctx->cm_id->device->index; resp.port_num = ctx->cm_id->port_num; if (rdma_cap_ib_sa(ctx->cm_id->device, ctx->cm_id->port_num)) ucma_copy_ib_route(&resp, &ctx->cm_id->route); else if (rdma_protocol_roce(ctx->cm_id->device, ctx->cm_id->port_num)) ucma_copy_iboe_route(&resp, &ctx->cm_id->route); else if (rdma_protocol_iwarp(ctx->cm_id->device, ctx->cm_id->port_num)) ucma_copy_iw_route(&resp, &ctx->cm_id->route); out: mutex_unlock(&ctx->mutex); if (copy_to_user(u64_to_user_ptr(cmd.response), &resp, min_t(size_t, out_len, sizeof(resp)))) ret = -EFAULT; ucma_put_ctx(ctx); return ret; } static void ucma_query_device_addr(struct rdma_cm_id *cm_id, struct rdma_ucm_query_addr_resp *resp) { if (!cm_id->device) return; resp->node_guid = (__force __u64) cm_id->device->node_guid; resp->ibdev_index = cm_id->device->index; resp->port_num = cm_id->port_num; resp->pkey = (__force __u16) cpu_to_be16( ib_addr_get_pkey(&cm_id->route.addr.dev_addr)); } static ssize_t ucma_query_addr(struct ucma_context *ctx, void __user *response, int out_len) { struct rdma_ucm_query_addr_resp resp; struct sockaddr *addr; int ret = 0; if (out_len < offsetof(struct rdma_ucm_query_addr_resp, ibdev_index)) return -ENOSPC; memset(&resp, 0, sizeof resp); addr = (struct sockaddr *) &ctx->cm_id->route.addr.src_addr; resp.src_size = rdma_addr_size(addr); memcpy(&resp.src_addr, addr, resp.src_size); addr = (struct sockaddr *) &ctx->cm_id->route.addr.dst_addr; resp.dst_size = rdma_addr_size(addr); memcpy(&resp.dst_addr, addr, resp.dst_size); ucma_query_device_addr(ctx->cm_id, &resp); if (copy_to_user(response, &resp, min_t(size_t, out_len, sizeof(resp)))) ret = -EFAULT; return ret; } static ssize_t ucma_query_path(struct ucma_context *ctx, void __user *response, int out_len) { struct rdma_ucm_query_path_resp *resp; int i, ret = 0; if (out_len < sizeof(*resp)) return -ENOSPC; resp = kzalloc(out_len, GFP_KERNEL); if (!resp) return -ENOMEM; resp->num_paths = ctx->cm_id->route.num_pri_alt_paths; for (i = 0, out_len -= sizeof(*resp); i < resp->num_paths && out_len > sizeof(struct ib_path_rec_data); i++, out_len -= sizeof(struct ib_path_rec_data)) { struct sa_path_rec *rec = &ctx->cm_id->route.path_rec[i]; resp->path_data[i].flags = IB_PATH_GMP | IB_PATH_PRIMARY | IB_PATH_BIDIRECTIONAL; if (rec->rec_type == SA_PATH_REC_TYPE_OPA) { struct sa_path_rec ib; sa_convert_path_opa_to_ib(&ib, rec); ib_sa_pack_path(&ib, &resp->path_data[i].path_rec); } else { ib_sa_pack_path(rec, &resp->path_data[i].path_rec); } } if (copy_to_user(response, resp, struct_size(resp, path_data, i))) ret = -EFAULT; kfree(resp); return ret; } static ssize_t ucma_query_gid(struct ucma_context *ctx, void __user *response, int out_len) { struct rdma_ucm_query_addr_resp resp; struct sockaddr_ib *addr; int ret = 0; if (out_len < offsetof(struct rdma_ucm_query_addr_resp, ibdev_index)) return -ENOSPC; memset(&resp, 0, sizeof resp); ucma_query_device_addr(ctx->cm_id, &resp); addr = (struct sockaddr_ib *) &resp.src_addr; resp.src_size = sizeof(*addr); if (ctx->cm_id->route.addr.src_addr.ss_family == AF_IB) { memcpy(addr, &ctx->cm_id->route.addr.src_addr, resp.src_size); } else { addr->sib_family = AF_IB; addr->sib_pkey = (__force __be16) resp.pkey; rdma_read_gids(ctx->cm_id, (union ib_gid *)&addr->sib_addr, NULL); addr->sib_sid = rdma_get_service_id(ctx->cm_id, (struct sockaddr *) &ctx->cm_id->route.addr.src_addr); } addr = (struct sockaddr_ib *) &resp.dst_addr; resp.dst_size = sizeof(*addr); if (ctx->cm_id->route.addr.dst_addr.ss_family == AF_IB) { memcpy(addr, &ctx->cm_id->route.addr.dst_addr, resp.dst_size); } else { addr->sib_family = AF_IB; addr->sib_pkey = (__force __be16) resp.pkey; rdma_read_gids(ctx->cm_id, NULL, (union ib_gid *)&addr->sib_addr); addr->sib_sid = rdma_get_service_id(ctx->cm_id, (struct sockaddr *) &ctx->cm_id->route.addr.dst_addr); } if (copy_to_user(response, &resp, min_t(size_t, out_len, sizeof(resp)))) ret = -EFAULT; return ret; } static ssize_t ucma_query(struct ucma_file *file, const char __user *inbuf, int in_len, int out_len) { struct rdma_ucm_query cmd; struct ucma_context *ctx; void __user *response; int ret; if (copy_from_user(&cmd, inbuf, sizeof(cmd))) return -EFAULT; response = u64_to_user_ptr(cmd.response); ctx = ucma_get_ctx(file, cmd.id); if (IS_ERR(ctx)) return PTR_ERR(ctx); mutex_lock(&ctx->mutex); switch (cmd.option) { case RDMA_USER_CM_QUERY_ADDR: ret = ucma_query_addr(ctx, response, out_len); break; case RDMA_USER_CM_QUERY_PATH: ret = ucma_query_path(ctx, response, out_len); break; case RDMA_USER_CM_QUERY_GID: ret = ucma_query_gid(ctx, response, out_len); break; default: ret = -ENOSYS; break; } mutex_unlock(&ctx->mutex); ucma_put_ctx(ctx); return ret; } static void ucma_copy_conn_param(struct rdma_cm_id *id, struct rdma_conn_param *dst, struct rdma_ucm_conn_param *src) { dst->private_data = src->private_data; dst->private_data_len = src->private_data_len; dst->responder_resources = src->responder_resources; dst->initiator_depth = src->initiator_depth; dst->flow_control = src->flow_control; dst->retry_count = src->retry_count; dst->rnr_retry_count = src->rnr_retry_count; dst->srq = src->srq; dst->qp_num = src->qp_num & 0xFFFFFF; dst->qkey = (id->route.addr.src_addr.ss_family == AF_IB) ? src->qkey : 0; } static ssize_t ucma_connect(struct ucma_file *file, const char __user *inbuf, int in_len, int out_len) { struct rdma_conn_param conn_param; struct rdma_ucm_ece ece = {}; struct rdma_ucm_connect cmd; struct ucma_context *ctx; size_t in_size; int ret; if (in_len < offsetofend(typeof(cmd), reserved)) return -EINVAL; in_size = min_t(size_t, in_len, sizeof(cmd)); if (copy_from_user(&cmd, inbuf, in_size)) return -EFAULT; if (!cmd.conn_param.valid) return -EINVAL; ctx = ucma_get_ctx_dev(file, cmd.id); if (IS_ERR(ctx)) return PTR_ERR(ctx); ucma_copy_conn_param(ctx->cm_id, &conn_param, &cmd.conn_param); if (offsetofend(typeof(cmd), ece) <= in_size) { ece.vendor_id = cmd.ece.vendor_id; ece.attr_mod = cmd.ece.attr_mod; } mutex_lock(&ctx->mutex); ret = rdma_connect_ece(ctx->cm_id, &conn_param, &ece); mutex_unlock(&ctx->mutex); ucma_put_ctx(ctx); return ret; } static ssize_t ucma_listen(struct ucma_file *file, const char __user *inbuf, int in_len, int out_len) { struct rdma_ucm_listen cmd; struct ucma_context *ctx; int ret; if (copy_from_user(&cmd, inbuf, sizeof(cmd))) return -EFAULT; ctx = ucma_get_ctx(file, cmd.id); if (IS_ERR(ctx)) return PTR_ERR(ctx); if (cmd.backlog <= 0 || cmd.backlog > max_backlog) cmd.backlog = max_backlog; atomic_set(&ctx->backlog, cmd.backlog); mutex_lock(&ctx->mutex); ret = rdma_listen(ctx->cm_id, cmd.backlog); mutex_unlock(&ctx->mutex); ucma_put_ctx(ctx); return ret; } static ssize_t ucma_accept(struct ucma_file *file, const char __user *inbuf, int in_len, int out_len) { struct rdma_ucm_accept cmd; struct rdma_conn_param conn_param; struct rdma_ucm_ece ece = {}; struct ucma_context *ctx; size_t in_size; int ret; if (in_len < offsetofend(typeof(cmd), reserved)) return -EINVAL; in_size = min_t(size_t, in_len, sizeof(cmd)); if (copy_from_user(&cmd, inbuf, in_size)) return -EFAULT; ctx = ucma_get_ctx_dev(file, cmd.id); if (IS_ERR(ctx)) return PTR_ERR(ctx); if (offsetofend(typeof(cmd), ece) <= in_size) { ece.vendor_id = cmd.ece.vendor_id; ece.attr_mod = cmd.ece.attr_mod; } if (cmd.conn_param.valid) { ucma_copy_conn_param(ctx->cm_id, &conn_param, &cmd.conn_param); mutex_lock(&ctx->mutex); rdma_lock_handler(ctx->cm_id); ret = rdma_accept_ece(ctx->cm_id, &conn_param, &ece); if (!ret) { /* The uid must be set atomically with the handler */ ctx->uid = cmd.uid; } rdma_unlock_handler(ctx->cm_id); mutex_unlock(&ctx->mutex); } else { mutex_lock(&ctx->mutex); rdma_lock_handler(ctx->cm_id); ret = rdma_accept_ece(ctx->cm_id, NULL, &ece); rdma_unlock_handler(ctx->cm_id); mutex_unlock(&ctx->mutex); } ucma_put_ctx(ctx); return ret; } static ssize_t ucma_reject(struct ucma_file *file, const char __user *inbuf, int in_len, int out_len) { struct rdma_ucm_reject cmd; struct ucma_context *ctx; int ret; if (copy_from_user(&cmd, inbuf, sizeof(cmd))) return -EFAULT; if (!cmd.reason) cmd.reason = IB_CM_REJ_CONSUMER_DEFINED; switch (cmd.reason) { case IB_CM_REJ_CONSUMER_DEFINED: case IB_CM_REJ_VENDOR_OPTION_NOT_SUPPORTED: break; default: return -EINVAL; } ctx = ucma_get_ctx_dev(file, cmd.id); if (IS_ERR(ctx)) return PTR_ERR(ctx); mutex_lock(&ctx->mutex); ret = rdma_reject(ctx->cm_id, cmd.private_data, cmd.private_data_len, cmd.reason); mutex_unlock(&ctx->mutex); ucma_put_ctx(ctx); return ret; } static ssize_t ucma_disconnect(struct ucma_file *file, const char __user *inbuf, int in_len, int out_len) { struct rdma_ucm_disconnect cmd; struct ucma_context *ctx; int ret; if (copy_from_user(&cmd, inbuf, sizeof(cmd))) return -EFAULT; ctx = ucma_get_ctx_dev(file, cmd.id); if (IS_ERR(ctx)) return PTR_ERR(ctx); mutex_lock(&ctx->mutex); ret = rdma_disconnect(ctx->cm_id); mutex_unlock(&ctx->mutex); ucma_put_ctx(ctx); return ret; } static ssize_t ucma_init_qp_attr(struct ucma_file *file, const char __user *inbuf, int in_len, int out_len) { struct rdma_ucm_init_qp_attr cmd; struct ib_uverbs_qp_attr resp; struct ucma_context *ctx; struct ib_qp_attr qp_attr; int ret; if (out_len < sizeof(resp)) return -ENOSPC; if (copy_from_user(&cmd, inbuf, sizeof(cmd))) return -EFAULT; if (cmd.qp_state > IB_QPS_ERR) return -EINVAL; ctx = ucma_get_ctx_dev(file, cmd.id); if (IS_ERR(ctx)) return PTR_ERR(ctx); resp.qp_attr_mask = 0; memset(&qp_attr, 0, sizeof qp_attr); qp_attr.qp_state = cmd.qp_state; mutex_lock(&ctx->mutex); ret = rdma_init_qp_attr(ctx->cm_id, &qp_attr, &resp.qp_attr_mask); mutex_unlock(&ctx->mutex); if (ret) goto out; ib_copy_qp_attr_to_user(ctx->cm_id->device, &resp, &qp_attr); if (copy_to_user(u64_to_user_ptr(cmd.response), &resp, sizeof(resp))) ret = -EFAULT; out: ucma_put_ctx(ctx); return ret; } static int ucma_set_option_id(struct ucma_context *ctx, int optname, void *optval, size_t optlen) { int ret = 0; switch (optname) { case RDMA_OPTION_ID_TOS: if (optlen != sizeof(u8)) { ret = -EINVAL; break; } rdma_set_service_type(ctx->cm_id, *((u8 *) optval)); break; case RDMA_OPTION_ID_REUSEADDR: if (optlen != sizeof(int)) { ret = -EINVAL; break; } ret = rdma_set_reuseaddr(ctx->cm_id, *((int *) optval) ? 1 : 0); break; case RDMA_OPTION_ID_AFONLY: if (optlen != sizeof(int)) { ret = -EINVAL; break; } ret = rdma_set_afonly(ctx->cm_id, *((int *) optval) ? 1 : 0); break; case RDMA_OPTION_ID_ACK_TIMEOUT: if (optlen != sizeof(u8)) { ret = -EINVAL; break; } ret = rdma_set_ack_timeout(ctx->cm_id, *((u8 *)optval)); break; default: ret = -ENOSYS; } return ret; } static int ucma_set_ib_path(struct ucma_context *ctx, struct ib_path_rec_data *path_data, size_t optlen) { struct sa_path_rec sa_path; struct rdma_cm_event event; int ret; if (optlen % sizeof(*path_data)) return -EINVAL; for (; optlen; optlen -= sizeof(*path_data), path_data++) { if (path_data->flags == (IB_PATH_GMP | IB_PATH_PRIMARY | IB_PATH_BIDIRECTIONAL)) break; } if (!optlen) return -EINVAL; if (!ctx->cm_id->device) return -EINVAL; memset(&sa_path, 0, sizeof(sa_path)); sa_path.rec_type = SA_PATH_REC_TYPE_IB; ib_sa_unpack_path(path_data->path_rec, &sa_path); if (rdma_cap_opa_ah(ctx->cm_id->device, ctx->cm_id->port_num)) { struct sa_path_rec opa; sa_convert_path_ib_to_opa(&opa, &sa_path); mutex_lock(&ctx->mutex); ret = rdma_set_ib_path(ctx->cm_id, &opa); mutex_unlock(&ctx->mutex); } else { mutex_lock(&ctx->mutex); ret = rdma_set_ib_path(ctx->cm_id, &sa_path); mutex_unlock(&ctx->mutex); } if (ret) return ret; memset(&event, 0, sizeof event); event.event = RDMA_CM_EVENT_ROUTE_RESOLVED; return ucma_event_handler(ctx->cm_id, &event); } static int ucma_set_option_ib(struct ucma_context *ctx, int optname, void *optval, size_t optlen) { int ret; switch (optname) { case RDMA_OPTION_IB_PATH: ret = ucma_set_ib_path(ctx, optval, optlen); break; default: ret = -ENOSYS; } return ret; } static int ucma_set_option_level(struct ucma_context *ctx, int level, int optname, void *optval, size_t optlen) { int ret; switch (level) { case RDMA_OPTION_ID: mutex_lock(&ctx->mutex); ret = ucma_set_option_id(ctx, optname, optval, optlen); mutex_unlock(&ctx->mutex); break; case RDMA_OPTION_IB: ret = ucma_set_option_ib(ctx, optname, optval, optlen); break; default: ret = -ENOSYS; } return ret; } static ssize_t ucma_set_option(struct ucma_file *file, const char __user *inbuf, int in_len, int out_len) { struct rdma_ucm_set_option cmd; struct ucma_context *ctx; void *optval; int ret; if (copy_from_user(&cmd, inbuf, sizeof(cmd))) return -EFAULT; if (unlikely(cmd.optlen > KMALLOC_MAX_SIZE)) return -EINVAL; ctx = ucma_get_ctx(file, cmd.id); if (IS_ERR(ctx)) return PTR_ERR(ctx); optval = memdup_user(u64_to_user_ptr(cmd.optval), cmd.optlen); if (IS_ERR(optval)) { ret = PTR_ERR(optval); goto out; } ret = ucma_set_option_level(ctx, cmd.level, cmd.optname, optval, cmd.optlen); kfree(optval); out: ucma_put_ctx(ctx); return ret; } static ssize_t ucma_notify(struct ucma_file *file, const char __user *inbuf, int in_len, int out_len) { struct rdma_ucm_notify cmd; struct ucma_context *ctx; int ret = -EINVAL; if (copy_from_user(&cmd, inbuf, sizeof(cmd))) return -EFAULT; ctx = ucma_get_ctx(file, cmd.id); if (IS_ERR(ctx)) return PTR_ERR(ctx); mutex_lock(&ctx->mutex); if (ctx->cm_id->device) ret = rdma_notify(ctx->cm_id, (enum ib_event_type)cmd.event); mutex_unlock(&ctx->mutex); ucma_put_ctx(ctx); return ret; } static ssize_t ucma_process_join(struct ucma_file *file, struct rdma_ucm_join_mcast *cmd, int out_len) { struct rdma_ucm_create_id_resp resp; struct ucma_context *ctx; struct ucma_multicast *mc; struct sockaddr *addr; int ret; u8 join_state; if (out_len < sizeof(resp)) return -ENOSPC; addr = (struct sockaddr *) &cmd->addr; if (cmd->addr_size != rdma_addr_size(addr)) return -EINVAL; if (cmd->join_flags == RDMA_MC_JOIN_FLAG_FULLMEMBER) join_state = BIT(FULLMEMBER_JOIN); else if (cmd->join_flags == RDMA_MC_JOIN_FLAG_SENDONLY_FULLMEMBER) join_state = BIT(SENDONLY_FULLMEMBER_JOIN); else return -EINVAL; ctx = ucma_get_ctx_dev(file, cmd->id); if (IS_ERR(ctx)) return PTR_ERR(ctx); mc = kzalloc(sizeof(*mc), GFP_KERNEL); if (!mc) { ret = -ENOMEM; goto err_put_ctx; } mc->ctx = ctx; mc->join_state = join_state; mc->uid = cmd->uid; memcpy(&mc->addr, addr, cmd->addr_size); xa_lock(&multicast_table); if (__xa_alloc(&multicast_table, &mc->id, NULL, xa_limit_32b, GFP_KERNEL)) { ret = -ENOMEM; goto err_free_mc; } list_add_tail(&mc->list, &ctx->mc_list); xa_unlock(&multicast_table); mutex_lock(&ctx->mutex); ret = rdma_join_multicast(ctx->cm_id, (struct sockaddr *)&mc->addr, join_state, mc); mutex_unlock(&ctx->mutex); if (ret) goto err_xa_erase; resp.id = mc->id; if (copy_to_user(u64_to_user_ptr(cmd->response), &resp, sizeof(resp))) { ret = -EFAULT; goto err_leave_multicast; } xa_store(&multicast_table, mc->id, mc, 0); ucma_put_ctx(ctx); return 0; err_leave_multicast: mutex_lock(&ctx->mutex); rdma_leave_multicast(ctx->cm_id, (struct sockaddr *) &mc->addr); mutex_unlock(&ctx->mutex); ucma_cleanup_mc_events(mc); err_xa_erase: xa_lock(&multicast_table); list_del(&mc->list); __xa_erase(&multicast_table, mc->id); err_free_mc: xa_unlock(&multicast_table); kfree(mc); err_put_ctx: ucma_put_ctx(ctx); return ret; } static ssize_t ucma_join_ip_multicast(struct ucma_file *file, const char __user *inbuf, int in_len, int out_len) { struct rdma_ucm_join_ip_mcast cmd; struct rdma_ucm_join_mcast join_cmd; if (copy_from_user(&cmd, inbuf, sizeof(cmd))) return -EFAULT; join_cmd.response = cmd.response; join_cmd.uid = cmd.uid; join_cmd.id = cmd.id; join_cmd.addr_size = rdma_addr_size_in6(&cmd.addr); if (!join_cmd.addr_size) return -EINVAL; join_cmd.join_flags = RDMA_MC_JOIN_FLAG_FULLMEMBER; memcpy(&join_cmd.addr, &cmd.addr, join_cmd.addr_size); return ucma_process_join(file, &join_cmd, out_len); } static ssize_t ucma_join_multicast(struct ucma_file *file, const char __user *inbuf, int in_len, int out_len) { struct rdma_ucm_join_mcast cmd; if (copy_from_user(&cmd, inbuf, sizeof(cmd))) return -EFAULT; if (!rdma_addr_size_kss(&cmd.addr)) return -EINVAL; return ucma_process_join(file, &cmd, out_len); } static ssize_t ucma_leave_multicast(struct ucma_file *file, const char __user *inbuf, int in_len, int out_len) { struct rdma_ucm_destroy_id cmd; struct rdma_ucm_destroy_id_resp resp; struct ucma_multicast *mc; int ret = 0; if (out_len < sizeof(resp)) return -ENOSPC; if (copy_from_user(&cmd, inbuf, sizeof(cmd))) return -EFAULT; xa_lock(&multicast_table); mc = xa_load(&multicast_table, cmd.id); if (!mc) mc = ERR_PTR(-ENOENT); else if (READ_ONCE(mc->ctx->file) != file) mc = ERR_PTR(-EINVAL); else if (!refcount_inc_not_zero(&mc->ctx->ref)) mc = ERR_PTR(-ENXIO); if (IS_ERR(mc)) { xa_unlock(&multicast_table); ret = PTR_ERR(mc); goto out; } list_del(&mc->list); __xa_erase(&multicast_table, mc->id); xa_unlock(&multicast_table); mutex_lock(&mc->ctx->mutex); rdma_leave_multicast(mc->ctx->cm_id, (struct sockaddr *) &mc->addr); mutex_unlock(&mc->ctx->mutex); ucma_cleanup_mc_events(mc); ucma_put_ctx(mc->ctx); resp.events_reported = mc->events_reported; kfree(mc); if (copy_to_user(u64_to_user_ptr(cmd.response), &resp, sizeof(resp))) ret = -EFAULT; out: return ret; } static ssize_t ucma_migrate_id(struct ucma_file *new_file, const char __user *inbuf, int in_len, int out_len) { struct rdma_ucm_migrate_id cmd; struct rdma_ucm_migrate_resp resp; struct ucma_event *uevent, *tmp; struct ucma_context *ctx; LIST_HEAD(event_list); struct fd f; struct ucma_file *cur_file; int ret = 0; if (copy_from_user(&cmd, inbuf, sizeof(cmd))) return -EFAULT; /* Get current fd to protect against it being closed */ f = fdget(cmd.fd); if (!f.file) return -ENOENT; if (f.file->f_op != &ucma_fops) { ret = -EINVAL; goto file_put; } cur_file = f.file->private_data; /* Validate current fd and prevent destruction of id. */ ctx = ucma_get_ctx(cur_file, cmd.id); if (IS_ERR(ctx)) { ret = PTR_ERR(ctx); goto file_put; } rdma_lock_handler(ctx->cm_id); /* * ctx->file can only be changed under the handler & xa_lock. xa_load() * must be checked again to ensure the ctx hasn't begun destruction * since the ucma_get_ctx(). */ xa_lock(&ctx_table); if (_ucma_find_context(cmd.id, cur_file) != ctx) { xa_unlock(&ctx_table); ret = -ENOENT; goto err_unlock; } ctx->file = new_file; xa_unlock(&ctx_table); mutex_lock(&cur_file->mut); list_del(&ctx->list); /* * At this point lock_handler() prevents addition of new uevents for * this ctx. */ list_for_each_entry_safe(uevent, tmp, &cur_file->event_list, list) if (uevent->ctx == ctx) list_move_tail(&uevent->list, &event_list); resp.events_reported = ctx->events_reported; mutex_unlock(&cur_file->mut); mutex_lock(&new_file->mut); list_add_tail(&ctx->list, &new_file->ctx_list); list_splice_tail(&event_list, &new_file->event_list); mutex_unlock(&new_file->mut); if (copy_to_user(u64_to_user_ptr(cmd.response), &resp, sizeof(resp))) ret = -EFAULT; err_unlock: rdma_unlock_handler(ctx->cm_id); ucma_put_ctx(ctx); file_put: fdput(f); return ret; } static ssize_t (*ucma_cmd_table[])(struct ucma_file *file, const char __user *inbuf, int in_len, int out_len) = { [RDMA_USER_CM_CMD_CREATE_ID] = ucma_create_id, [RDMA_USER_CM_CMD_DESTROY_ID] = ucma_destroy_id, [RDMA_USER_CM_CMD_BIND_IP] = ucma_bind_ip, [RDMA_USER_CM_CMD_RESOLVE_IP] = ucma_resolve_ip, [RDMA_USER_CM_CMD_RESOLVE_ROUTE] = ucma_resolve_route, [RDMA_USER_CM_CMD_QUERY_ROUTE] = ucma_query_route, [RDMA_USER_CM_CMD_CONNECT] = ucma_connect, [RDMA_USER_CM_CMD_LISTEN] = ucma_listen, [RDMA_USER_CM_CMD_ACCEPT] = ucma_accept, [RDMA_USER_CM_CMD_REJECT] = ucma_reject, [RDMA_USER_CM_CMD_DISCONNECT] = ucma_disconnect, [RDMA_USER_CM_CMD_INIT_QP_ATTR] = ucma_init_qp_attr, [RDMA_USER_CM_CMD_GET_EVENT] = ucma_get_event, [RDMA_USER_CM_CMD_GET_OPTION] = NULL, [RDMA_USER_CM_CMD_SET_OPTION] = ucma_set_option, [RDMA_USER_CM_CMD_NOTIFY] = ucma_notify, [RDMA_USER_CM_CMD_JOIN_IP_MCAST] = ucma_join_ip_multicast, [RDMA_USER_CM_CMD_LEAVE_MCAST] = ucma_leave_multicast, [RDMA_USER_CM_CMD_MIGRATE_ID] = ucma_migrate_id, [RDMA_USER_CM_CMD_QUERY] = ucma_query, [RDMA_USER_CM_CMD_BIND] = ucma_bind, [RDMA_USER_CM_CMD_RESOLVE_ADDR] = ucma_resolve_addr, [RDMA_USER_CM_CMD_JOIN_MCAST] = ucma_join_multicast }; static ssize_t ucma_write(struct file *filp, const char __user *buf, size_t len, loff_t *pos) { struct ucma_file *file = filp->private_data; struct rdma_ucm_cmd_hdr hdr; ssize_t ret; if (!ib_safe_file_access(filp)) { pr_err_once("%s: process %d (%s) changed security contexts after opening file descriptor, this is not allowed.\n", __func__, task_tgid_vnr(current), current->comm); return -EACCES; } if (len < sizeof(hdr)) return -EINVAL; if (copy_from_user(&hdr, buf, sizeof(hdr))) return -EFAULT; if (hdr.cmd >= ARRAY_SIZE(ucma_cmd_table)) return -EINVAL; hdr.cmd = array_index_nospec(hdr.cmd, ARRAY_SIZE(ucma_cmd_table)); if (hdr.in + sizeof(hdr) > len) return -EINVAL; if (!ucma_cmd_table[hdr.cmd]) return -ENOSYS; ret = ucma_cmd_table[hdr.cmd](file, buf + sizeof(hdr), hdr.in, hdr.out); if (!ret) ret = len; return ret; } static __poll_t ucma_poll(struct file *filp, struct poll_table_struct *wait) { struct ucma_file *file = filp->private_data; __poll_t mask = 0; poll_wait(filp, &file->poll_wait, wait); if (!list_empty(&file->event_list)) mask = EPOLLIN | EPOLLRDNORM; return mask; } /* * ucma_open() does not need the BKL: * * - no global state is referred to; * - there is no ioctl method to race against; * - no further module initialization is required for open to work * after the device is registered. */ static int ucma_open(struct inode *inode, struct file *filp) { struct ucma_file *file; file = kmalloc(sizeof *file, GFP_KERNEL); if (!file) return -ENOMEM; INIT_LIST_HEAD(&file->event_list); INIT_LIST_HEAD(&file->ctx_list); init_waitqueue_head(&file->poll_wait); mutex_init(&file->mut); filp->private_data = file; file->filp = filp; return stream_open(inode, filp); } static int ucma_close(struct inode *inode, struct file *filp) { struct ucma_file *file = filp->private_data; /* * All paths that touch ctx_list or ctx_list starting from write() are * prevented by this being a FD release function. The list_add_tail() in * ucma_connect_event_handler() can run concurrently, however it only * adds to the list *after* a listening ID. By only reading the first of * the list, and relying on ucma_destroy_private_ctx() to block * ucma_connect_event_handler(), no additional locking is needed. */ while (!list_empty(&file->ctx_list)) { struct ucma_context *ctx = list_first_entry( &file->ctx_list, struct ucma_context, list); WARN_ON(xa_cmpxchg(&ctx_table, ctx->id, ctx, XA_ZERO_ENTRY, GFP_KERNEL) != ctx); ucma_destroy_private_ctx(ctx); } kfree(file); return 0; } static const struct file_operations ucma_fops = { .owner = THIS_MODULE, .open = ucma_open, .release = ucma_close, .write = ucma_write, .poll = ucma_poll, .llseek = no_llseek, }; static struct miscdevice ucma_misc = { .minor = MISC_DYNAMIC_MINOR, .name = "rdma_cm", .nodename = "infiniband/rdma_cm", .mode = 0666, .fops = &ucma_fops, }; static int ucma_get_global_nl_info(struct ib_client_nl_info *res) { res->abi = RDMA_USER_CM_ABI_VERSION; res->cdev = ucma_misc.this_device; return 0; } static struct ib_client rdma_cma_client = { .name = "rdma_cm", .get_global_nl_info = ucma_get_global_nl_info, }; MODULE_ALIAS_RDMA_CLIENT("rdma_cm"); static ssize_t abi_version_show(struct device *dev, struct device_attribute *attr, char *buf) { return sysfs_emit(buf, "%d\n", RDMA_USER_CM_ABI_VERSION); } static DEVICE_ATTR_RO(abi_version); static int __init ucma_init(void) { int ret; ret = misc_register(&ucma_misc); if (ret) return ret; ret = device_create_file(ucma_misc.this_device, &dev_attr_abi_version); if (ret) { pr_err("rdma_ucm: couldn't create abi_version attr\n"); goto err1; } ucma_ctl_table_hdr = register_net_sysctl(&init_net, "net/rdma_ucm", ucma_ctl_table); if (!ucma_ctl_table_hdr) { pr_err("rdma_ucm: couldn't register sysctl paths\n"); ret = -ENOMEM; goto err2; } ret = ib_register_client(&rdma_cma_client); if (ret) goto err3; return 0; err3: unregister_net_sysctl_table(ucma_ctl_table_hdr); err2: device_remove_file(ucma_misc.this_device, &dev_attr_abi_version); err1: misc_deregister(&ucma_misc); return ret; } static void __exit ucma_cleanup(void) { ib_unregister_client(&rdma_cma_client); unregister_net_sysctl_table(ucma_ctl_table_hdr); device_remove_file(ucma_misc.this_device, &dev_attr_abi_version); misc_deregister(&ucma_misc); } module_init(ucma_init); module_exit(ucma_cleanup); |
62 62 35 23 3 55 1 41 8 2 4 3 44 44 2 22 21 41 38 8 35 8 4 35 35 35 281 3 1792 85 1749 72 60 56 57 57 57 57 63 62 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 | // SPDX-License-Identifier: GPL-2.0-or-later /* * IPv6 fragment reassembly for connection tracking * * Copyright (C)2004 USAGI/WIDE Project * * Author: * Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp> * * Based on: net/ipv6/reassembly.c */ #define pr_fmt(fmt) "IPv6-nf: " fmt #include <linux/errno.h> #include <linux/types.h> #include <linux/string.h> #include <linux/net.h> #include <linux/netdevice.h> #include <linux/ipv6.h> #include <linux/slab.h> #include <net/ipv6_frag.h> #include <net/netfilter/ipv6/nf_conntrack_ipv6.h> #include <linux/sysctl.h> #include <linux/netfilter.h> #include <linux/netfilter_ipv6.h> #include <linux/kernel.h> #include <linux/module.h> #include <net/netfilter/ipv6/nf_defrag_ipv6.h> #include <net/netns/generic.h> static const char nf_frags_cache_name[] = "nf-frags"; static unsigned int nf_frag_pernet_id __read_mostly; static struct inet_frags nf_frags; static struct nft_ct_frag6_pernet *nf_frag_pernet(struct net *net) { return net_generic(net, nf_frag_pernet_id); } #ifdef CONFIG_SYSCTL static struct ctl_table nf_ct_frag6_sysctl_table[] = { { .procname = "nf_conntrack_frag6_timeout", .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, { .procname = "nf_conntrack_frag6_low_thresh", .maxlen = sizeof(unsigned long), .mode = 0644, .proc_handler = proc_doulongvec_minmax, }, { .procname = "nf_conntrack_frag6_high_thresh", .maxlen = sizeof(unsigned long), .mode = 0644, .proc_handler = proc_doulongvec_minmax, }, }; static int nf_ct_frag6_sysctl_register(struct net *net) { struct nft_ct_frag6_pernet *nf_frag; struct ctl_table *table; struct ctl_table_header *hdr; table = nf_ct_frag6_sysctl_table; if (!net_eq(net, &init_net)) { table = kmemdup(table, sizeof(nf_ct_frag6_sysctl_table), GFP_KERNEL); if (table == NULL) goto err_alloc; } nf_frag = nf_frag_pernet(net); table[0].data = &nf_frag->fqdir->timeout; table[1].data = &nf_frag->fqdir->low_thresh; table[1].extra2 = &nf_frag->fqdir->high_thresh; table[2].data = &nf_frag->fqdir->high_thresh; table[2].extra1 = &nf_frag->fqdir->low_thresh; hdr = register_net_sysctl_sz(net, "net/netfilter", table, ARRAY_SIZE(nf_ct_frag6_sysctl_table)); if (hdr == NULL) goto err_reg; nf_frag->nf_frag_frags_hdr = hdr; return 0; err_reg: if (!net_eq(net, &init_net)) kfree(table); err_alloc: return -ENOMEM; } static void __net_exit nf_ct_frags6_sysctl_unregister(struct net *net) { struct nft_ct_frag6_pernet *nf_frag = nf_frag_pernet(net); const struct ctl_table *table; table = nf_frag->nf_frag_frags_hdr->ctl_table_arg; unregister_net_sysctl_table(nf_frag->nf_frag_frags_hdr); if (!net_eq(net, &init_net)) kfree(table); } #else static int nf_ct_frag6_sysctl_register(struct net *net) { return 0; } static void __net_exit nf_ct_frags6_sysctl_unregister(struct net *net) { } #endif static int nf_ct_frag6_reasm(struct frag_queue *fq, struct sk_buff *skb, struct sk_buff *prev_tail, struct net_device *dev); static inline u8 ip6_frag_ecn(const struct ipv6hdr *ipv6h) { return 1 << (ipv6_get_dsfield(ipv6h) & INET_ECN_MASK); } static void nf_ct_frag6_expire(struct timer_list *t) { struct inet_frag_queue *frag = from_timer(frag, t, timer); struct frag_queue *fq; fq = container_of(frag, struct frag_queue, q); ip6frag_expire_frag_queue(fq->q.fqdir->net, fq); } /* Creation primitives. */ static struct frag_queue *fq_find(struct net *net, __be32 id, u32 user, const struct ipv6hdr *hdr, int iif) { struct nft_ct_frag6_pernet *nf_frag = nf_frag_pernet(net); struct frag_v6_compare_key key = { .id = id, .saddr = hdr->saddr, .daddr = hdr->daddr, .user = user, .iif = iif, }; struct inet_frag_queue *q; if (!(ipv6_addr_type(&hdr->daddr) & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL))) key.iif = 0; q = inet_frag_find(nf_frag->fqdir, &key); if (!q) return NULL; return container_of(q, struct frag_queue, q); } static int nf_ct_frag6_queue(struct frag_queue *fq, struct sk_buff *skb, const struct frag_hdr *fhdr, int nhoff) { unsigned int payload_len; struct net_device *dev; struct sk_buff *prev; int offset, end, err; u8 ecn; if (fq->q.flags & INET_FRAG_COMPLETE) { pr_debug("Already completed\n"); goto err; } payload_len = ntohs(ipv6_hdr(skb)->payload_len); offset = ntohs(fhdr->frag_off) & ~0x7; end = offset + (payload_len - ((u8 *)(fhdr + 1) - (u8 *)(ipv6_hdr(skb) + 1))); if ((unsigned int)end > IPV6_MAXPLEN) { pr_debug("offset is too large.\n"); return -EINVAL; } ecn = ip6_frag_ecn(ipv6_hdr(skb)); if (skb->ip_summed == CHECKSUM_COMPLETE) { const unsigned char *nh = skb_network_header(skb); skb->csum = csum_sub(skb->csum, csum_partial(nh, (u8 *)(fhdr + 1) - nh, 0)); } /* Is this the final fragment? */ if (!(fhdr->frag_off & htons(IP6_MF))) { /* If we already have some bits beyond end * or have different end, the segment is corrupted. */ if (end < fq->q.len || ((fq->q.flags & INET_FRAG_LAST_IN) && end != fq->q.len)) { pr_debug("already received last fragment\n"); goto err; } fq->q.flags |= INET_FRAG_LAST_IN; fq->q.len = end; } else { /* Check if the fragment is rounded to 8 bytes. * Required by the RFC. */ if (end & 0x7) { /* RFC2460 says always send parameter problem in * this case. -DaveM */ pr_debug("end of fragment not rounded to 8 bytes.\n"); inet_frag_kill(&fq->q); return -EPROTO; } if (end > fq->q.len) { /* Some bits beyond end -> corruption. */ if (fq->q.flags & INET_FRAG_LAST_IN) { pr_debug("last packet already reached.\n"); goto err; } fq->q.len = end; } } if (end == offset) goto err; /* Point into the IP datagram 'data' part. */ if (!pskb_pull(skb, (u8 *) (fhdr + 1) - skb->data)) { pr_debug("queue: message is too short.\n"); goto err; } if (pskb_trim_rcsum(skb, end - offset)) { pr_debug("Can't trim\n"); goto err; } /* Note : skb->rbnode and skb->dev share the same location. */ dev = skb->dev; /* Makes sure compiler wont do silly aliasing games */ barrier(); prev = fq->q.fragments_tail; err = inet_frag_queue_insert(&fq->q, skb, offset, end); if (err) { if (err == IPFRAG_DUP) { /* No error for duplicates, pretend they got queued. */ kfree_skb_reason(skb, SKB_DROP_REASON_DUP_FRAG); return -EINPROGRESS; } goto insert_error; } if (dev) fq->iif = dev->ifindex; fq->q.stamp = skb->tstamp; fq->q.tstamp_type = skb->tstamp_type; fq->q.meat += skb->len; fq->ecn |= ecn; if (payload_len > fq->q.max_size) fq->q.max_size = payload_len; add_frag_mem_limit(fq->q.fqdir, skb->truesize); /* The first fragment. * nhoffset is obtained from the first fragment, of course. */ if (offset == 0) { fq->nhoffset = nhoff; fq->q.flags |= INET_FRAG_FIRST_IN; } if (fq->q.flags == (INET_FRAG_FIRST_IN | INET_FRAG_LAST_IN) && fq->q.meat == fq->q.len) { unsigned long orefdst = skb->_skb_refdst; skb->_skb_refdst = 0UL; err = nf_ct_frag6_reasm(fq, skb, prev, dev); skb->_skb_refdst = orefdst; /* After queue has assumed skb ownership, only 0 or * -EINPROGRESS must be returned. */ return err ? -EINPROGRESS : 0; } skb_dst_drop(skb); skb_orphan(skb); return -EINPROGRESS; insert_error: inet_frag_kill(&fq->q); err: skb_dst_drop(skb); return -EINVAL; } /* * Check if this packet is complete. * * It is called with locked fq, and caller must check that * queue is eligible for reassembly i.e. it is not COMPLETE, * the last and the first frames arrived and all the bits are here. */ static int nf_ct_frag6_reasm(struct frag_queue *fq, struct sk_buff *skb, struct sk_buff *prev_tail, struct net_device *dev) { void *reasm_data; int payload_len; u8 ecn; inet_frag_kill(&fq->q); ecn = ip_frag_ecn_table[fq->ecn]; if (unlikely(ecn == 0xff)) goto err; reasm_data = inet_frag_reasm_prepare(&fq->q, skb, prev_tail); if (!reasm_data) goto err; payload_len = -skb_network_offset(skb) - sizeof(struct ipv6hdr) + fq->q.len - sizeof(struct frag_hdr); if (payload_len > IPV6_MAXPLEN) { net_dbg_ratelimited("nf_ct_frag6_reasm: payload len = %d\n", payload_len); goto err; } /* We have to remove fragment header from datagram and to relocate * header in order to calculate ICV correctly. */ skb_network_header(skb)[fq->nhoffset] = skb_transport_header(skb)[0]; memmove(skb->head + sizeof(struct frag_hdr), skb->head, (skb->data - skb->head) - sizeof(struct frag_hdr)); skb->mac_header += sizeof(struct frag_hdr); skb->network_header += sizeof(struct frag_hdr); skb_reset_transport_header(skb); inet_frag_reasm_finish(&fq->q, skb, reasm_data, false); skb->ignore_df = 1; skb->dev = dev; ipv6_hdr(skb)->payload_len = htons(payload_len); ipv6_change_dsfield(ipv6_hdr(skb), 0xff, ecn); IP6CB(skb)->frag_max_size = sizeof(struct ipv6hdr) + fq->q.max_size; IP6CB(skb)->flags |= IP6SKB_FRAGMENTED; /* Yes, and fold redundant checksum back. 8) */ if (skb->ip_summed == CHECKSUM_COMPLETE) skb->csum = csum_partial(skb_network_header(skb), skb_network_header_len(skb), skb->csum); fq->q.rb_fragments = RB_ROOT; fq->q.fragments_tail = NULL; fq->q.last_run_head = NULL; return 0; err: inet_frag_kill(&fq->q); return -EINVAL; } /* * find the header just before Fragment Header. * * if success return 0 and set ... * (*prevhdrp): the value of "Next Header Field" in the header * just before Fragment Header. * (*prevhoff): the offset of "Next Header Field" in the header * just before Fragment Header. * (*fhoff) : the offset of Fragment Header. * * Based on ipv6_skip_hdr() in net/ipv6/exthdr.c * */ static int find_prev_fhdr(struct sk_buff *skb, u8 *prevhdrp, int *prevhoff, int *fhoff) { u8 nexthdr = ipv6_hdr(skb)->nexthdr; const int netoff = skb_network_offset(skb); u8 prev_nhoff = netoff + offsetof(struct ipv6hdr, nexthdr); int start = netoff + sizeof(struct ipv6hdr); int len = skb->len - start; u8 prevhdr = NEXTHDR_IPV6; while (nexthdr != NEXTHDR_FRAGMENT) { struct ipv6_opt_hdr hdr; int hdrlen; if (!ipv6_ext_hdr(nexthdr)) { return -1; } if (nexthdr == NEXTHDR_NONE) { pr_debug("next header is none\n"); return -1; } if (len < (int)sizeof(struct ipv6_opt_hdr)) { pr_debug("too short\n"); return -1; } if (skb_copy_bits(skb, start, &hdr, sizeof(hdr))) BUG(); if (nexthdr == NEXTHDR_AUTH) hdrlen = ipv6_authlen(&hdr); else hdrlen = ipv6_optlen(&hdr); prevhdr = nexthdr; prev_nhoff = start; nexthdr = hdr.nexthdr; len -= hdrlen; start += hdrlen; } if (len < 0) return -1; *prevhdrp = prevhdr; *prevhoff = prev_nhoff; *fhoff = start; return 0; } int nf_ct_frag6_gather(struct net *net, struct sk_buff *skb, u32 user) { u16 savethdr = skb->transport_header; u8 nexthdr = NEXTHDR_FRAGMENT; int fhoff, nhoff, ret; struct frag_hdr *fhdr; struct frag_queue *fq; struct ipv6hdr *hdr; u8 prevhdr; /* Jumbo payload inhibits frag. header */ if (ipv6_hdr(skb)->payload_len == 0) { pr_debug("payload len = 0\n"); return 0; } if (find_prev_fhdr(skb, &prevhdr, &nhoff, &fhoff) < 0) return 0; /* Discard the first fragment if it does not include all headers * RFC 8200, Section 4.5 */ if (ipv6frag_thdr_truncated(skb, fhoff, &nexthdr)) { pr_debug("Drop incomplete fragment\n"); return 0; } if (!pskb_may_pull(skb, fhoff + sizeof(*fhdr))) return -ENOMEM; skb_set_transport_header(skb, fhoff); hdr = ipv6_hdr(skb); fhdr = (struct frag_hdr *)skb_transport_header(skb); fq = fq_find(net, fhdr->identification, user, hdr, skb->dev ? skb->dev->ifindex : 0); if (fq == NULL) { pr_debug("Can't find and can't create new queue\n"); return -ENOMEM; } spin_lock_bh(&fq->q.lock); ret = nf_ct_frag6_queue(fq, skb, fhdr, nhoff); if (ret == -EPROTO) { skb->transport_header = savethdr; ret = 0; } spin_unlock_bh(&fq->q.lock); inet_frag_put(&fq->q); return ret; } EXPORT_SYMBOL_GPL(nf_ct_frag6_gather); static int nf_ct_net_init(struct net *net) { struct nft_ct_frag6_pernet *nf_frag = nf_frag_pernet(net); int res; res = fqdir_init(&nf_frag->fqdir, &nf_frags, net); if (res < 0) return res; nf_frag->fqdir->high_thresh = IPV6_FRAG_HIGH_THRESH; nf_frag->fqdir->low_thresh = IPV6_FRAG_LOW_THRESH; nf_frag->fqdir->timeout = IPV6_FRAG_TIMEOUT; res = nf_ct_frag6_sysctl_register(net); if (res < 0) fqdir_exit(nf_frag->fqdir); return res; } static void nf_ct_net_pre_exit(struct net *net) { struct nft_ct_frag6_pernet *nf_frag = nf_frag_pernet(net); fqdir_pre_exit(nf_frag->fqdir); } static void nf_ct_net_exit(struct net *net) { struct nft_ct_frag6_pernet *nf_frag = nf_frag_pernet(net); nf_ct_frags6_sysctl_unregister(net); fqdir_exit(nf_frag->fqdir); } static struct pernet_operations nf_ct_net_ops = { .init = nf_ct_net_init, .pre_exit = nf_ct_net_pre_exit, .exit = nf_ct_net_exit, .id = &nf_frag_pernet_id, .size = sizeof(struct nft_ct_frag6_pernet), }; static const struct rhashtable_params nfct_rhash_params = { .head_offset = offsetof(struct inet_frag_queue, node), .hashfn = ip6frag_key_hashfn, .obj_hashfn = ip6frag_obj_hashfn, .obj_cmpfn = ip6frag_obj_cmpfn, .automatic_shrinking = true, }; int nf_ct_frag6_init(void) { int ret = 0; nf_frags.constructor = ip6frag_init; nf_frags.destructor = NULL; nf_frags.qsize = sizeof(struct frag_queue); nf_frags.frag_expire = nf_ct_frag6_expire; nf_frags.frags_cache_name = nf_frags_cache_name; nf_frags.rhash_params = nfct_rhash_params; ret = inet_frags_init(&nf_frags); if (ret) goto out; ret = register_pernet_subsys(&nf_ct_net_ops); if (ret) inet_frags_fini(&nf_frags); out: return ret; } void nf_ct_frag6_cleanup(void) { unregister_pernet_subsys(&nf_ct_net_ops); inet_frags_fini(&nf_frags); } |
432 430 431 434 432 33673 33663 33663 33678 54 54 54 6 6 227 11 225 227 208 226 225 2 2 3 3 1 2 176 54 208 208 383 176 25 196 208 206 208 206 208 72 72 72 72 1112 1115 1115 1113 1114 1117 1115 35130 60090 35103 50332 771 1338 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 | // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 1994 Linus Torvalds * * Pentium III FXSR, SSE support * General FPU state handling cleanups * Gareth Hughes <gareth@valinux.com>, May 2000 */ #include <asm/fpu/api.h> #include <asm/fpu/regset.h> #include <asm/fpu/sched.h> #include <asm/fpu/signal.h> #include <asm/fpu/types.h> #include <asm/traps.h> #include <asm/irq_regs.h> #include <uapi/asm/kvm.h> #include <linux/hardirq.h> #include <linux/pkeys.h> #include <linux/vmalloc.h> #include "context.h" #include "internal.h" #include "legacy.h" #include "xstate.h" #define CREATE_TRACE_POINTS #include <asm/trace/fpu.h> #ifdef CONFIG_X86_64 DEFINE_STATIC_KEY_FALSE(__fpu_state_size_dynamic); DEFINE_PER_CPU(u64, xfd_state); #endif /* The FPU state configuration data for kernel and user space */ struct fpu_state_config fpu_kernel_cfg __ro_after_init; struct fpu_state_config fpu_user_cfg __ro_after_init; /* * Represents the initial FPU state. It's mostly (but not completely) zeroes, * depending on the FPU hardware format: */ struct fpstate init_fpstate __ro_after_init; /* Track in-kernel FPU usage */ static DEFINE_PER_CPU(bool, in_kernel_fpu); /* * Track which context is using the FPU on the CPU: */ DEFINE_PER_CPU(struct fpu *, fpu_fpregs_owner_ctx); /* * Can we use the FPU in kernel mode with the * whole "kernel_fpu_begin/end()" sequence? */ bool irq_fpu_usable(void) { if (WARN_ON_ONCE(in_nmi())) return false; /* In kernel FPU usage already active? */ if (this_cpu_read(in_kernel_fpu)) return false; /* * When not in NMI or hard interrupt context, FPU can be used in: * * - Task context except from within fpregs_lock()'ed critical * regions. * * - Soft interrupt processing context which cannot happen * while in a fpregs_lock()'ed critical region. */ if (!in_hardirq()) return true; /* * In hard interrupt context it's safe when soft interrupts * are enabled, which means the interrupt did not hit in * a fpregs_lock()'ed critical region. */ return !softirq_count(); } EXPORT_SYMBOL(irq_fpu_usable); /* * Track AVX512 state use because it is known to slow the max clock * speed of the core. */ static void update_avx_timestamp(struct fpu *fpu) { #define AVX512_TRACKING_MASK (XFEATURE_MASK_ZMM_Hi256 | XFEATURE_MASK_Hi16_ZMM) if (fpu->fpstate->regs.xsave.header.xfeatures & AVX512_TRACKING_MASK) fpu->avx512_timestamp = jiffies; } /* * Save the FPU register state in fpu->fpstate->regs. The register state is * preserved. * * Must be called with fpregs_lock() held. * * The legacy FNSAVE instruction clears all FPU state unconditionally, so * register state has to be reloaded. That might be a pointless exercise * when the FPU is going to be used by another task right after that. But * this only affects 20+ years old 32bit systems and avoids conditionals all * over the place. * * FXSAVE and all XSAVE variants preserve the FPU register state. */ void save_fpregs_to_fpstate(struct fpu *fpu) { if (likely(use_xsave())) { os_xsave(fpu->fpstate); update_avx_timestamp(fpu); return; } if (likely(use_fxsr())) { fxsave(&fpu->fpstate->regs.fxsave); return; } /* * Legacy FPU register saving, FNSAVE always clears FPU registers, * so we have to reload them from the memory state. */ asm volatile("fnsave %[fp]; fwait" : [fp] "=m" (fpu->fpstate->regs.fsave)); frstor(&fpu->fpstate->regs.fsave); } void restore_fpregs_from_fpstate(struct fpstate *fpstate, u64 mask) { /* * AMD K7/K8 and later CPUs up to Zen don't save/restore * FDP/FIP/FOP unless an exception is pending. Clear the x87 state * here by setting it to fixed values. "m" is a random variable * that should be in L1. */ if (unlikely(static_cpu_has_bug(X86_BUG_FXSAVE_LEAK))) { asm volatile( "fnclex\n\t" "emms\n\t" "fildl %[addr]" /* set F?P to defined value */ : : [addr] "m" (*fpstate)); } if (use_xsave()) { /* * Dynamically enabled features are enabled in XCR0, but * usage requires also that the corresponding bits in XFD * are cleared. If the bits are set then using a related * instruction will raise #NM. This allows to do the * allocation of the larger FPU buffer lazy from #NM or if * the task has no permission to kill it which would happen * via #UD if the feature is disabled in XCR0. * * XFD state is following the same life time rules as * XSTATE and to restore state correctly XFD has to be * updated before XRSTORS otherwise the component would * stay in or go into init state even if the bits are set * in fpstate::regs::xsave::xfeatures. */ xfd_update_state(fpstate); /* * Restoring state always needs to modify all features * which are in @mask even if the current task cannot use * extended features. * * So fpstate->xfeatures cannot be used here, because then * a feature for which the task has no permission but was * used by the previous task would not go into init state. */ mask = fpu_kernel_cfg.max_features & mask; os_xrstor(fpstate, mask); } else { if (use_fxsr()) fxrstor(&fpstate->regs.fxsave); else frstor(&fpstate->regs.fsave); } } void fpu_reset_from_exception_fixup(void) { restore_fpregs_from_fpstate(&init_fpstate, XFEATURE_MASK_FPSTATE); } #if IS_ENABLED(CONFIG_KVM) static void __fpstate_reset(struct fpstate *fpstate, u64 xfd); static void fpu_init_guest_permissions(struct fpu_guest *gfpu) { struct fpu_state_perm *fpuperm; u64 perm; if (!IS_ENABLED(CONFIG_X86_64)) return; spin_lock_irq(¤t->sighand->siglock); fpuperm = ¤t->group_leader->thread.fpu.guest_perm; perm = fpuperm->__state_perm; /* First fpstate allocation locks down permissions. */ WRITE_ONCE(fpuperm->__state_perm, perm | FPU_GUEST_PERM_LOCKED); spin_unlock_irq(¤t->sighand->siglock); gfpu->perm = perm & ~FPU_GUEST_PERM_LOCKED; } bool fpu_alloc_guest_fpstate(struct fpu_guest *gfpu) { struct fpstate *fpstate; unsigned int size; size = fpu_user_cfg.default_size + ALIGN(offsetof(struct fpstate, regs), 64); fpstate = vzalloc(size); if (!fpstate) return false; /* Leave xfd to 0 (the reset value defined by spec) */ __fpstate_reset(fpstate, 0); fpstate_init_user(fpstate); fpstate->is_valloc = true; fpstate->is_guest = true; gfpu->fpstate = fpstate; gfpu->xfeatures = fpu_user_cfg.default_features; gfpu->perm = fpu_user_cfg.default_features; /* * KVM sets the FP+SSE bits in the XSAVE header when copying FPU state * to userspace, even when XSAVE is unsupported, so that restoring FPU * state on a different CPU that does support XSAVE can cleanly load * the incoming state using its natural XSAVE. In other words, KVM's * uABI size may be larger than this host's default size. Conversely, * the default size should never be larger than KVM's base uABI size; * all features that can expand the uABI size must be opt-in. */ gfpu->uabi_size = sizeof(struct kvm_xsave); if (WARN_ON_ONCE(fpu_user_cfg.default_size > gfpu->uabi_size)) gfpu->uabi_size = fpu_user_cfg.default_size; fpu_init_guest_permissions(gfpu); return true; } EXPORT_SYMBOL_GPL(fpu_alloc_guest_fpstate); void fpu_free_guest_fpstate(struct fpu_guest *gfpu) { struct fpstate *fps = gfpu->fpstate; if (!fps) return; if (WARN_ON_ONCE(!fps->is_valloc || !fps->is_guest || fps->in_use)) return; gfpu->fpstate = NULL; vfree(fps); } EXPORT_SYMBOL_GPL(fpu_free_guest_fpstate); /* * fpu_enable_guest_xfd_features - Check xfeatures against guest perm and enable * @guest_fpu: Pointer to the guest FPU container * @xfeatures: Features requested by guest CPUID * * Enable all dynamic xfeatures according to guest perm and requested CPUID. * * Return: 0 on success, error code otherwise */ int fpu_enable_guest_xfd_features(struct fpu_guest *guest_fpu, u64 xfeatures) { lockdep_assert_preemption_enabled(); /* Nothing to do if all requested features are already enabled. */ xfeatures &= ~guest_fpu->xfeatures; if (!xfeatures) return 0; return __xfd_enable_feature(xfeatures, guest_fpu); } EXPORT_SYMBOL_GPL(fpu_enable_guest_xfd_features); #ifdef CONFIG_X86_64 void fpu_update_guest_xfd(struct fpu_guest *guest_fpu, u64 xfd) { fpregs_lock(); guest_fpu->fpstate->xfd = xfd; if (guest_fpu->fpstate->in_use) xfd_update_state(guest_fpu->fpstate); fpregs_unlock(); } EXPORT_SYMBOL_GPL(fpu_update_guest_xfd); /** * fpu_sync_guest_vmexit_xfd_state - Synchronize XFD MSR and software state * * Must be invoked from KVM after a VMEXIT before enabling interrupts when * XFD write emulation is disabled. This is required because the guest can * freely modify XFD and the state at VMEXIT is not guaranteed to be the * same as the state on VMENTER. So software state has to be updated before * any operation which depends on it can take place. * * Note: It can be invoked unconditionally even when write emulation is * enabled for the price of a then pointless MSR read. */ void fpu_sync_guest_vmexit_xfd_state(void) { struct fpstate *fps = current->thread.fpu.fpstate; lockdep_assert_irqs_disabled(); if (fpu_state_size_dynamic()) { rdmsrl(MSR_IA32_XFD, fps->xfd); __this_cpu_write(xfd_state, fps->xfd); } } EXPORT_SYMBOL_GPL(fpu_sync_guest_vmexit_xfd_state); #endif /* CONFIG_X86_64 */ int fpu_swap_kvm_fpstate(struct fpu_guest *guest_fpu, bool enter_guest) { struct fpstate *guest_fps = guest_fpu->fpstate; struct fpu *fpu = ¤t->thread.fpu; struct fpstate *cur_fps = fpu->fpstate; fpregs_lock(); if (!cur_fps->is_confidential && !test_thread_flag(TIF_NEED_FPU_LOAD)) save_fpregs_to_fpstate(fpu); /* Swap fpstate */ if (enter_guest) { fpu->__task_fpstate = cur_fps; fpu->fpstate = guest_fps; guest_fps->in_use = true; } else { guest_fps->in_use = false; fpu->fpstate = fpu->__task_fpstate; fpu->__task_fpstate = NULL; } cur_fps = fpu->fpstate; if (!cur_fps->is_confidential) { /* Includes XFD update */ restore_fpregs_from_fpstate(cur_fps, XFEATURE_MASK_FPSTATE); } else { /* * XSTATE is restored by firmware from encrypted * memory. Make sure XFD state is correct while * running with guest fpstate */ xfd_update_state(cur_fps); } fpregs_mark_activate(); fpregs_unlock(); return 0; } EXPORT_SYMBOL_GPL(fpu_swap_kvm_fpstate); void fpu_copy_guest_fpstate_to_uabi(struct fpu_guest *gfpu, void *buf, unsigned int size, u64 xfeatures, u32 pkru) { struct fpstate *kstate = gfpu->fpstate; union fpregs_state *ustate = buf; struct membuf mb = { .p = buf, .left = size }; if (cpu_feature_enabled(X86_FEATURE_XSAVE)) { __copy_xstate_to_uabi_buf(mb, kstate, xfeatures, pkru, XSTATE_COPY_XSAVE); } else { memcpy(&ustate->fxsave, &kstate->regs.fxsave, sizeof(ustate->fxsave)); /* Make it restorable on a XSAVE enabled host */ ustate->xsave.header.xfeatures = XFEATURE_MASK_FPSSE; } } EXPORT_SYMBOL_GPL(fpu_copy_guest_fpstate_to_uabi); int fpu_copy_uabi_to_guest_fpstate(struct fpu_guest *gfpu, const void *buf, u64 xcr0, u32 *vpkru) { struct fpstate *kstate = gfpu->fpstate; const union fpregs_state *ustate = buf; if (!cpu_feature_enabled(X86_FEATURE_XSAVE)) { if (ustate->xsave.header.xfeatures & ~XFEATURE_MASK_FPSSE) return -EINVAL; if (ustate->fxsave.mxcsr & ~mxcsr_feature_mask) return -EINVAL; memcpy(&kstate->regs.fxsave, &ustate->fxsave, sizeof(ustate->fxsave)); return 0; } if (ustate->xsave.header.xfeatures & ~xcr0) return -EINVAL; /* * Nullify @vpkru to preserve its current value if PKRU's bit isn't set * in the header. KVM's odd ABI is to leave PKRU untouched in this * case (all other components are eventually re-initialized). */ if (!(ustate->xsave.header.xfeatures & XFEATURE_MASK_PKRU)) vpkru = NULL; return copy_uabi_from_kernel_to_xstate(kstate, ustate, vpkru); } EXPORT_SYMBOL_GPL(fpu_copy_uabi_to_guest_fpstate); #endif /* CONFIG_KVM */ void kernel_fpu_begin_mask(unsigned int kfpu_mask) { preempt_disable(); WARN_ON_FPU(!irq_fpu_usable()); WARN_ON_FPU(this_cpu_read(in_kernel_fpu)); this_cpu_write(in_kernel_fpu, true); if (!(current->flags & (PF_KTHREAD | PF_USER_WORKER)) && !test_thread_flag(TIF_NEED_FPU_LOAD)) { set_thread_flag(TIF_NEED_FPU_LOAD); save_fpregs_to_fpstate(¤t->thread.fpu); } __cpu_invalidate_fpregs_state(); /* Put sane initial values into the control registers. */ if (likely(kfpu_mask & KFPU_MXCSR) && boot_cpu_has(X86_FEATURE_XMM)) ldmxcsr(MXCSR_DEFAULT); if (unlikely(kfpu_mask & KFPU_387) && boot_cpu_has(X86_FEATURE_FPU)) asm volatile ("fninit"); } EXPORT_SYMBOL_GPL(kernel_fpu_begin_mask); void kernel_fpu_end(void) { WARN_ON_FPU(!this_cpu_read(in_kernel_fpu)); this_cpu_write(in_kernel_fpu, false); preempt_enable(); } EXPORT_SYMBOL_GPL(kernel_fpu_end); /* * Sync the FPU register state to current's memory register state when the * current task owns the FPU. The hardware register state is preserved. */ void fpu_sync_fpstate(struct fpu *fpu) { WARN_ON_FPU(fpu != ¤t->thread.fpu); fpregs_lock(); trace_x86_fpu_before_save(fpu); if (!test_thread_flag(TIF_NEED_FPU_LOAD)) save_fpregs_to_fpstate(fpu); trace_x86_fpu_after_save(fpu); fpregs_unlock(); } static inline unsigned int init_fpstate_copy_size(void) { if (!use_xsave()) return fpu_kernel_cfg.default_size; /* XSAVE(S) just needs the legacy and the xstate header part */ return sizeof(init_fpstate.regs.xsave); } static inline void fpstate_init_fxstate(struct fpstate *fpstate) { fpstate->regs.fxsave.cwd = 0x37f; fpstate->regs.fxsave.mxcsr = MXCSR_DEFAULT; } /* * Legacy x87 fpstate state init: */ static inline void fpstate_init_fstate(struct fpstate *fpstate) { fpstate->regs.fsave.cwd = 0xffff037fu; fpstate->regs.fsave.swd = 0xffff0000u; fpstate->regs.fsave.twd = 0xffffffffu; fpstate->regs.fsave.fos = 0xffff0000u; } /* * Used in two places: * 1) Early boot to setup init_fpstate for non XSAVE systems * 2) fpu_init_fpstate_user() which is invoked from KVM */ void fpstate_init_user(struct fpstate *fpstate) { if (!cpu_feature_enabled(X86_FEATURE_FPU)) { fpstate_init_soft(&fpstate->regs.soft); return; } xstate_init_xcomp_bv(&fpstate->regs.xsave, fpstate->xfeatures); if (cpu_feature_enabled(X86_FEATURE_FXSR)) fpstate_init_fxstate(fpstate); else fpstate_init_fstate(fpstate); } static void __fpstate_reset(struct fpstate *fpstate, u64 xfd) { /* Initialize sizes and feature masks */ fpstate->size = fpu_kernel_cfg.default_size; fpstate->user_size = fpu_user_cfg.default_size; fpstate->xfeatures = fpu_kernel_cfg.default_features; fpstate->user_xfeatures = fpu_user_cfg.default_features; fpstate->xfd = xfd; } void fpstate_reset(struct fpu *fpu) { /* Set the fpstate pointer to the default fpstate */ fpu->fpstate = &fpu->__fpstate; __fpstate_reset(fpu->fpstate, init_fpstate.xfd); /* Initialize the permission related info in fpu */ fpu->perm.__state_perm = fpu_kernel_cfg.default_features; fpu->perm.__state_size = fpu_kernel_cfg.default_size; fpu->perm.__user_state_size = fpu_user_cfg.default_size; /* Same defaults for guests */ fpu->guest_perm = fpu->perm; } static inline void fpu_inherit_perms(struct fpu *dst_fpu) { if (fpu_state_size_dynamic()) { struct fpu *src_fpu = ¤t->group_leader->thread.fpu; spin_lock_irq(¤t->sighand->siglock); /* Fork also inherits the permissions of the parent */ dst_fpu->perm = src_fpu->perm; dst_fpu->guest_perm = src_fpu->guest_perm; spin_unlock_irq(¤t->sighand->siglock); } } /* A passed ssp of zero will not cause any update */ static int update_fpu_shstk(struct task_struct *dst, unsigned long ssp) { #ifdef CONFIG_X86_USER_SHADOW_STACK struct cet_user_state *xstate; /* If ssp update is not needed. */ if (!ssp) return 0; xstate = get_xsave_addr(&dst->thread.fpu.fpstate->regs.xsave, XFEATURE_CET_USER); /* * If there is a non-zero ssp, then 'dst' must be configured with a shadow * stack and the fpu state should be up to date since it was just copied * from the parent in fpu_clone(). So there must be a valid non-init CET * state location in the buffer. */ if (WARN_ON_ONCE(!xstate)) return 1; xstate->user_ssp = (u64)ssp; #endif return 0; } /* Clone current's FPU state on fork */ int fpu_clone(struct task_struct *dst, unsigned long clone_flags, bool minimal, unsigned long ssp) { struct fpu *src_fpu = ¤t->thread.fpu; struct fpu *dst_fpu = &dst->thread.fpu; /* The new task's FPU state cannot be valid in the hardware. */ dst_fpu->last_cpu = -1; fpstate_reset(dst_fpu); if (!cpu_feature_enabled(X86_FEATURE_FPU)) return 0; /* * Enforce reload for user space tasks and prevent kernel threads * from trying to save the FPU registers on context switch. */ set_tsk_thread_flag(dst, TIF_NEED_FPU_LOAD); /* * No FPU state inheritance for kernel threads and IO * worker threads. */ if (minimal) { /* Clear out the minimal state */ memcpy(&dst_fpu->fpstate->regs, &init_fpstate.regs, init_fpstate_copy_size()); return 0; } /* * If a new feature is added, ensure all dynamic features are * caller-saved from here! */ BUILD_BUG_ON(XFEATURE_MASK_USER_DYNAMIC != XFEATURE_MASK_XTILE_DATA); /* * Save the default portion of the current FPU state into the * clone. Assume all dynamic features to be defined as caller- * saved, which enables skipping both the expansion of fpstate * and the copying of any dynamic state. * * Do not use memcpy() when TIF_NEED_FPU_LOAD is set because * copying is not valid when current uses non-default states. */ fpregs_lock(); if (test_thread_flag(TIF_NEED_FPU_LOAD)) fpregs_restore_userregs(); save_fpregs_to_fpstate(dst_fpu); fpregs_unlock(); if (!(clone_flags & CLONE_THREAD)) fpu_inherit_perms(dst_fpu); /* * Children never inherit PASID state. * Force it to have its init value: */ if (use_xsave()) dst_fpu->fpstate->regs.xsave.header.xfeatures &= ~XFEATURE_MASK_PASID; /* * Update shadow stack pointer, in case it changed during clone. */ if (update_fpu_shstk(dst, ssp)) return 1; trace_x86_fpu_copy_src(src_fpu); trace_x86_fpu_copy_dst(dst_fpu); return 0; } /* * Whitelist the FPU register state embedded into task_struct for hardened * usercopy. */ void fpu_thread_struct_whitelist(unsigned long *offset, unsigned long *size) { *offset = offsetof(struct thread_struct, fpu.__fpstate.regs); *size = fpu_kernel_cfg.default_size; } /* * Drops current FPU state: deactivates the fpregs and * the fpstate. NOTE: it still leaves previous contents * in the fpregs in the eager-FPU case. * * This function can be used in cases where we know that * a state-restore is coming: either an explicit one, * or a reschedule. */ void fpu__drop(struct fpu *fpu) { preempt_disable(); if (fpu == ¤t->thread.fpu) { /* Ignore delayed exceptions from user space */ asm volatile("1: fwait\n" "2:\n" _ASM_EXTABLE(1b, 2b)); fpregs_deactivate(fpu); } trace_x86_fpu_dropped(fpu); preempt_enable(); } /* * Clear FPU registers by setting them up from the init fpstate. * Caller must do fpregs_[un]lock() around it. */ static inline void restore_fpregs_from_init_fpstate(u64 features_mask) { if (use_xsave()) os_xrstor(&init_fpstate, features_mask); else if (use_fxsr()) fxrstor(&init_fpstate.regs.fxsave); else frstor(&init_fpstate.regs.fsave); pkru_write_default(); } /* * Reset current->fpu memory state to the init values. */ static void fpu_reset_fpregs(void) { struct fpu *fpu = ¤t->thread.fpu; fpregs_lock(); __fpu_invalidate_fpregs_state(fpu); /* * This does not change the actual hardware registers. It just * resets the memory image and sets TIF_NEED_FPU_LOAD so a * subsequent return to usermode will reload the registers from the * task's memory image. * * Do not use fpstate_init() here. Just copy init_fpstate which has * the correct content already except for PKRU. * * PKRU handling does not rely on the xstate when restoring for * user space as PKRU is eagerly written in switch_to() and * flush_thread(). */ memcpy(&fpu->fpstate->regs, &init_fpstate.regs, init_fpstate_copy_size()); set_thread_flag(TIF_NEED_FPU_LOAD); fpregs_unlock(); } /* * Reset current's user FPU states to the init states. current's * supervisor states, if any, are not modified by this function. The * caller guarantees that the XSTATE header in memory is intact. */ void fpu__clear_user_states(struct fpu *fpu) { WARN_ON_FPU(fpu != ¤t->thread.fpu); fpregs_lock(); if (!cpu_feature_enabled(X86_FEATURE_FPU)) { fpu_reset_fpregs(); fpregs_unlock(); return; } /* * Ensure that current's supervisor states are loaded into their * corresponding registers. */ if (xfeatures_mask_supervisor() && !fpregs_state_valid(fpu, smp_processor_id())) os_xrstor_supervisor(fpu->fpstate); /* Reset user states in registers. */ restore_fpregs_from_init_fpstate(XFEATURE_MASK_USER_RESTORE); /* * Now all FPU registers have their desired values. Inform the FPU * state machine that current's FPU registers are in the hardware * registers. The memory image does not need to be updated because * any operation relying on it has to save the registers first when * current's FPU is marked active. */ fpregs_mark_activate(); fpregs_unlock(); } void fpu_flush_thread(void) { fpstate_reset(¤t->thread.fpu); fpu_reset_fpregs(); } /* * Load FPU context before returning to userspace. */ void switch_fpu_return(void) { if (!static_cpu_has(X86_FEATURE_FPU)) return; fpregs_restore_userregs(); } EXPORT_SYMBOL_GPL(switch_fpu_return); void fpregs_lock_and_load(void) { /* * fpregs_lock() only disables preemption (mostly). So modifying state * in an interrupt could screw up some in progress fpregs operation. * Warn about it. */ WARN_ON_ONCE(!irq_fpu_usable()); WARN_ON_ONCE(current->flags & PF_KTHREAD); fpregs_lock(); fpregs_assert_state_consistent(); if (test_thread_flag(TIF_NEED_FPU_LOAD)) fpregs_restore_userregs(); } #ifdef CONFIG_X86_DEBUG_FPU /* * If current FPU state according to its tracking (loaded FPU context on this * CPU) is not valid then we must have TIF_NEED_FPU_LOAD set so the context is * loaded on return to userland. */ void fpregs_assert_state_consistent(void) { struct fpu *fpu = ¤t->thread.fpu; if (test_thread_flag(TIF_NEED_FPU_LOAD)) return; WARN_ON_FPU(!fpregs_state_valid(fpu, smp_processor_id())); } EXPORT_SYMBOL_GPL(fpregs_assert_state_consistent); #endif void fpregs_mark_activate(void) { struct fpu *fpu = ¤t->thread.fpu; fpregs_activate(fpu); fpu->last_cpu = smp_processor_id(); clear_thread_flag(TIF_NEED_FPU_LOAD); } /* * x87 math exception handling: */ int fpu__exception_code(struct fpu *fpu, int trap_nr) { int err; if (trap_nr == X86_TRAP_MF) { unsigned short cwd, swd; /* * (~cwd & swd) will mask out exceptions that are not set to unmasked * status. 0x3f is the exception bits in these regs, 0x200 is the * C1 reg you need in case of a stack fault, 0x040 is the stack * fault bit. We should only be taking one exception at a time, * so if this combination doesn't produce any single exception, * then we have a bad program that isn't synchronizing its FPU usage * and it will suffer the consequences since we won't be able to * fully reproduce the context of the exception. */ if (boot_cpu_has(X86_FEATURE_FXSR)) { cwd = fpu->fpstate->regs.fxsave.cwd; swd = fpu->fpstate->regs.fxsave.swd; } else { cwd = (unsigned short)fpu->fpstate->regs.fsave.cwd; swd = (unsigned short)fpu->fpstate->regs.fsave.swd; } err = swd & ~cwd; } else { /* * The SIMD FPU exceptions are handled a little differently, as there * is only a single status/control register. Thus, to determine which * unmasked exception was caught we must mask the exception mask bits * at 0x1f80, and then use these to mask the exception bits at 0x3f. */ unsigned short mxcsr = MXCSR_DEFAULT; if (boot_cpu_has(X86_FEATURE_XMM)) mxcsr = fpu->fpstate->regs.fxsave.mxcsr; err = ~(mxcsr >> 7) & mxcsr; } if (err & 0x001) { /* Invalid op */ /* * swd & 0x240 == 0x040: Stack Underflow * swd & 0x240 == 0x240: Stack Overflow * User must clear the SF bit (0x40) if set */ return FPE_FLTINV; } else if (err & 0x004) { /* Divide by Zero */ return FPE_FLTDIV; } else if (err & 0x008) { /* Overflow */ return FPE_FLTOVF; } else if (err & 0x012) { /* Denormal, Underflow */ return FPE_FLTUND; } else if (err & 0x020) { /* Precision */ return FPE_FLTRES; } /* * If we're using IRQ 13, or supposedly even some trap * X86_TRAP_MF implementations, it's possible * we get a spurious trap, which is not an error. */ return 0; } /* * Initialize register state that may prevent from entering low-power idle. * This function will be invoked from the cpuidle driver only when needed. */ noinstr void fpu_idle_fpregs(void) { /* Note: AMX_TILE being enabled implies XGETBV1 support */ if (cpu_feature_enabled(X86_FEATURE_AMX_TILE) && (xfeatures_in_use() & XFEATURE_MASK_XTILE)) { tile_release(); __this_cpu_write(fpu_fpregs_owner_ctx, NULL); } } |
5 5 5 5 5 955 956 157 158 402 404 51 51 51 51 5 6 6 6 25586 25684 25566 25580 3043 3049 3042 22 1001 341 1987 208 208 565 565 135 135 1347 1344 132 468 343 343 153 152 101 101 441 440 443 91 90 170 169 331 330 5474 81 126 5300 5290 5291 9200 9201 5957 5950 11456 11530 67 10806 115 699 602 1197 1196 90 91 68 45 46 46 46 22 22 22 3760 3773 3762 54 54 52 78 78 78 26 26 26 8 1 5 1 1 5 5 5 1 4 3 1 2 5 36 2 19 14 3 3 3 7 5 1 3 6 3 3 1 2 34 33 6257 6267 28 28 27 63 63 1 1 62 63 62 7 7 1562 1562 1565 987 988 986 36 8 29 36 36 36 1283 1284 1284 1127 440 708 63 1127 1130 1125 1051 1053 1047 1046 1051 1142 1139 1137 1140 1143 166 168 165 168 171 172 171 171 172 18130 18123 18122 18119 18121 16892 1982 352 351 355 355 68 54 6053 6050 6038 6058 1996 4257 232 15490 15469 1 1 12959 67 20 47 145 145 121 120 121 120 121 121 121 121 120 2219 2221 63 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 | // SPDX-License-Identifier: GPL-2.0-only /* * AppArmor security module * * This file contains AppArmor LSM hooks. * * Copyright (C) 1998-2008 Novell/SUSE * Copyright 2009-2010 Canonical Ltd. */ #include <linux/lsm_hooks.h> #include <linux/moduleparam.h> #include <linux/mm.h> #include <linux/mman.h> #include <linux/mount.h> #include <linux/namei.h> #include <linux/ptrace.h> #include <linux/ctype.h> #include <linux/sysctl.h> #include <linux/audit.h> #include <linux/user_namespace.h> #include <linux/netfilter_ipv4.h> #include <linux/netfilter_ipv6.h> #include <linux/zstd.h> #include <net/sock.h> #include <uapi/linux/mount.h> #include <uapi/linux/lsm.h> #include "include/apparmor.h" #include "include/apparmorfs.h" #include "include/audit.h" #include "include/capability.h" #include "include/cred.h" #include "include/file.h" #include "include/ipc.h" #include "include/net.h" #include "include/path.h" #include "include/label.h" #include "include/policy.h" #include "include/policy_ns.h" #include "include/procattr.h" #include "include/mount.h" #include "include/secid.h" /* Flag indicating whether initialization completed */ int apparmor_initialized; union aa_buffer { struct list_head list; DECLARE_FLEX_ARRAY(char, buffer); }; struct aa_local_cache { unsigned int hold; unsigned int count; struct list_head head; }; #define RESERVE_COUNT 2 static int reserve_count = RESERVE_COUNT; static int buffer_count; static LIST_HEAD(aa_global_buffers); static DEFINE_SPINLOCK(aa_buffers_lock); static DEFINE_PER_CPU(struct aa_local_cache, aa_local_buffers); /* * LSM hook functions */ /* * put the associated labels */ static void apparmor_cred_free(struct cred *cred) { aa_put_label(cred_label(cred)); set_cred_label(cred, NULL); } /* * allocate the apparmor part of blank credentials */ static int apparmor_cred_alloc_blank(struct cred *cred, gfp_t gfp) { set_cred_label(cred, NULL); return 0; } /* * prepare new cred label for modification by prepare_cred block */ static int apparmor_cred_prepare(struct cred *new, const struct cred *old, gfp_t gfp) { set_cred_label(new, aa_get_newest_label(cred_label(old))); return 0; } /* * transfer the apparmor data to a blank set of creds */ static void apparmor_cred_transfer(struct cred *new, const struct cred *old) { set_cred_label(new, aa_get_newest_label(cred_label(old))); } static void apparmor_task_free(struct task_struct *task) { aa_free_task_ctx(task_ctx(task)); } static int apparmor_task_alloc(struct task_struct *task, unsigned long clone_flags) { struct aa_task_ctx *new = task_ctx(task); aa_dup_task_ctx(new, task_ctx(current)); return 0; } static int apparmor_ptrace_access_check(struct task_struct *child, unsigned int mode) { struct aa_label *tracer, *tracee; const struct cred *cred; int error; cred = get_task_cred(child); tracee = cred_label(cred); /* ref count on cred */ tracer = __begin_current_label_crit_section(); error = aa_may_ptrace(current_cred(), tracer, cred, tracee, (mode & PTRACE_MODE_READ) ? AA_PTRACE_READ : AA_PTRACE_TRACE); __end_current_label_crit_section(tracer); put_cred(cred); return error; } static int apparmor_ptrace_traceme(struct task_struct *parent) { struct aa_label *tracer, *tracee; const struct cred *cred; int error; tracee = __begin_current_label_crit_section(); cred = get_task_cred(parent); tracer = cred_label(cred); /* ref count on cred */ error = aa_may_ptrace(cred, tracer, current_cred(), tracee, AA_PTRACE_TRACE); put_cred(cred); __end_current_label_crit_section(tracee); return error; } /* Derived from security/commoncap.c:cap_capget */ static int apparmor_capget(const struct task_struct *target, kernel_cap_t *effective, kernel_cap_t *inheritable, kernel_cap_t *permitted) { struct aa_label *label; const struct cred *cred; rcu_read_lock(); cred = __task_cred(target); label = aa_get_newest_cred_label(cred); /* * cap_capget is stacked ahead of this and will * initialize effective and permitted. */ if (!unconfined(label)) { struct aa_profile *profile; struct label_it i; label_for_each_confined(i, label, profile) { struct aa_ruleset *rules; if (COMPLAIN_MODE(profile)) continue; rules = list_first_entry(&profile->rules, typeof(*rules), list); *effective = cap_intersect(*effective, rules->caps.allow); *permitted = cap_intersect(*permitted, rules->caps.allow); } } rcu_read_unlock(); aa_put_label(label); return 0; } static int apparmor_capable(const struct cred *cred, struct user_namespace *ns, int cap, unsigned int opts) { struct aa_label *label; int error = 0; label = aa_get_newest_cred_label(cred); if (!unconfined(label)) error = aa_capable(cred, label, cap, opts); aa_put_label(label); return error; } /** * common_perm - basic common permission check wrapper fn for paths * @op: operation being checked * @path: path to check permission of (NOT NULL) * @mask: requested permissions mask * @cond: conditional info for the permission request (NOT NULL) * * Returns: %0 else error code if error or permission denied */ static int common_perm(const char *op, const struct path *path, u32 mask, struct path_cond *cond) { struct aa_label *label; int error = 0; label = __begin_current_label_crit_section(); if (!unconfined(label)) error = aa_path_perm(op, current_cred(), label, path, 0, mask, cond); __end_current_label_crit_section(label); return error; } /** * common_perm_cond - common permission wrapper around inode cond * @op: operation being checked * @path: location to check (NOT NULL) * @mask: requested permissions mask * * Returns: %0 else error code if error or permission denied */ static int common_perm_cond(const char *op, const struct path *path, u32 mask) { vfsuid_t vfsuid = i_uid_into_vfsuid(mnt_idmap(path->mnt), d_backing_inode(path->dentry)); struct path_cond cond = { vfsuid_into_kuid(vfsuid), d_backing_inode(path->dentry)->i_mode }; if (!path_mediated_fs(path->dentry)) return 0; return common_perm(op, path, mask, &cond); } /** * common_perm_dir_dentry - common permission wrapper when path is dir, dentry * @op: operation being checked * @dir: directory of the dentry (NOT NULL) * @dentry: dentry to check (NOT NULL) * @mask: requested permissions mask * @cond: conditional info for the permission request (NOT NULL) * * Returns: %0 else error code if error or permission denied */ static int common_perm_dir_dentry(const char *op, const struct path *dir, struct dentry *dentry, u32 mask, struct path_cond *cond) { struct path path = { .mnt = dir->mnt, .dentry = dentry }; return common_perm(op, &path, mask, cond); } /** * common_perm_rm - common permission wrapper for operations doing rm * @op: operation being checked * @dir: directory that the dentry is in (NOT NULL) * @dentry: dentry being rm'd (NOT NULL) * @mask: requested permission mask * * Returns: %0 else error code if error or permission denied */ static int common_perm_rm(const char *op, const struct path *dir, struct dentry *dentry, u32 mask) { struct inode *inode = d_backing_inode(dentry); struct path_cond cond = { }; vfsuid_t vfsuid; if (!inode || !path_mediated_fs(dentry)) return 0; vfsuid = i_uid_into_vfsuid(mnt_idmap(dir->mnt), inode); cond.uid = vfsuid_into_kuid(vfsuid); cond.mode = inode->i_mode; return common_perm_dir_dentry(op, dir, dentry, mask, &cond); } /** * common_perm_create - common permission wrapper for operations doing create * @op: operation being checked * @dir: directory that dentry will be created in (NOT NULL) * @dentry: dentry to create (NOT NULL) * @mask: request permission mask * @mode: created file mode * * Returns: %0 else error code if error or permission denied */ static int common_perm_create(const char *op, const struct path *dir, struct dentry *dentry, u32 mask, umode_t mode) { struct path_cond cond = { current_fsuid(), mode }; if (!path_mediated_fs(dir->dentry)) return 0; return common_perm_dir_dentry(op, dir, dentry, mask, &cond); } static int apparmor_path_unlink(const struct path *dir, struct dentry *dentry) { return common_perm_rm(OP_UNLINK, dir, dentry, AA_MAY_DELETE); } static int apparmor_path_mkdir(const struct path *dir, struct dentry *dentry, umode_t mode) { return common_perm_create(OP_MKDIR, dir, dentry, AA_MAY_CREATE, S_IFDIR); } static int apparmor_path_rmdir(const struct path *dir, struct dentry *dentry) { return common_perm_rm(OP_RMDIR, dir, dentry, AA_MAY_DELETE); } static int apparmor_path_mknod(const struct path *dir, struct dentry *dentry, umode_t mode, unsigned int dev) { return common_perm_create(OP_MKNOD, dir, dentry, AA_MAY_CREATE, mode); } static int apparmor_path_truncate(const struct path *path) { return common_perm_cond(OP_TRUNC, path, MAY_WRITE | AA_MAY_SETATTR); } static int apparmor_file_truncate(struct file *file) { return apparmor_path_truncate(&file->f_path); } static int apparmor_path_symlink(const struct path *dir, struct dentry *dentry, const char *old_name) { return common_perm_create(OP_SYMLINK, dir, dentry, AA_MAY_CREATE, S_IFLNK); } static int apparmor_path_link(struct dentry *old_dentry, const struct path *new_dir, struct dentry *new_dentry) { struct aa_label *label; int error = 0; if (!path_mediated_fs(old_dentry)) return 0; label = begin_current_label_crit_section(); if (!unconfined(label)) error = aa_path_link(current_cred(), label, old_dentry, new_dir, new_dentry); end_current_label_crit_section(label); return error; } static int apparmor_path_rename(const struct path *old_dir, struct dentry *old_dentry, const struct path *new_dir, struct dentry *new_dentry, const unsigned int flags) { struct aa_label *label; int error = 0; if (!path_mediated_fs(old_dentry)) return 0; if ((flags & RENAME_EXCHANGE) && !path_mediated_fs(new_dentry)) return 0; label = begin_current_label_crit_section(); if (!unconfined(label)) { struct mnt_idmap *idmap = mnt_idmap(old_dir->mnt); vfsuid_t vfsuid; struct path old_path = { .mnt = old_dir->mnt, .dentry = old_dentry }; struct path new_path = { .mnt = new_dir->mnt, .dentry = new_dentry }; struct path_cond cond = { .mode = d_backing_inode(old_dentry)->i_mode }; vfsuid = i_uid_into_vfsuid(idmap, d_backing_inode(old_dentry)); cond.uid = vfsuid_into_kuid(vfsuid); if (flags & RENAME_EXCHANGE) { struct path_cond cond_exchange = { .mode = d_backing_inode(new_dentry)->i_mode, }; vfsuid = i_uid_into_vfsuid(idmap, d_backing_inode(old_dentry)); cond_exchange.uid = vfsuid_into_kuid(vfsuid); error = aa_path_perm(OP_RENAME_SRC, current_cred(), label, &new_path, 0, MAY_READ | AA_MAY_GETATTR | MAY_WRITE | AA_MAY_SETATTR | AA_MAY_DELETE, &cond_exchange); if (!error) error = aa_path_perm(OP_RENAME_DEST, current_cred(), label, &old_path, 0, MAY_WRITE | AA_MAY_SETATTR | AA_MAY_CREATE, &cond_exchange); } if (!error) error = aa_path_perm(OP_RENAME_SRC, current_cred(), label, &old_path, 0, MAY_READ | AA_MAY_GETATTR | MAY_WRITE | AA_MAY_SETATTR | AA_MAY_DELETE, &cond); if (!error) error = aa_path_perm(OP_RENAME_DEST, current_cred(), label, &new_path, 0, MAY_WRITE | AA_MAY_SETATTR | AA_MAY_CREATE, &cond); } end_current_label_crit_section(label); return error; } static int apparmor_path_chmod(const struct path *path, umode_t mode) { return common_perm_cond(OP_CHMOD, path, AA_MAY_CHMOD); } static int apparmor_path_chown(const struct path *path, kuid_t uid, kgid_t gid) { return common_perm_cond(OP_CHOWN, path, AA_MAY_CHOWN); } static int apparmor_inode_getattr(const struct path *path) { return common_perm_cond(OP_GETATTR, path, AA_MAY_GETATTR); } static int apparmor_file_open(struct file *file) { struct aa_file_ctx *fctx = file_ctx(file); struct aa_label *label; int error = 0; bool needput; if (!path_mediated_fs(file->f_path.dentry)) return 0; /* If in exec, permission is handled by bprm hooks. * Cache permissions granted by the previous exec check, with * implicit read and executable mmap which are required to * actually execute the image. * * Illogically, FMODE_EXEC is in f_flags, not f_mode. */ if (file->f_flags & __FMODE_EXEC) { fctx->allow = MAY_EXEC | MAY_READ | AA_EXEC_MMAP; return 0; } label = aa_get_newest_cred_label_condref(file->f_cred, &needput); if (!unconfined(label)) { struct mnt_idmap *idmap = file_mnt_idmap(file); struct inode *inode = file_inode(file); vfsuid_t vfsuid; struct path_cond cond = { .mode = inode->i_mode, }; vfsuid = i_uid_into_vfsuid(idmap, inode); cond.uid = vfsuid_into_kuid(vfsuid); error = aa_path_perm(OP_OPEN, file->f_cred, label, &file->f_path, 0, aa_map_file_to_perms(file), &cond); /* todo cache full allowed permissions set and state */ fctx->allow = aa_map_file_to_perms(file); } aa_put_label_condref(label, needput); return error; } static int apparmor_file_alloc_security(struct file *file) { struct aa_file_ctx *ctx = file_ctx(file); struct aa_label *label = begin_current_label_crit_section(); spin_lock_init(&ctx->lock); rcu_assign_pointer(ctx->label, aa_get_label(label)); end_current_label_crit_section(label); return 0; } static void apparmor_file_free_security(struct file *file) { struct aa_file_ctx *ctx = file_ctx(file); if (ctx) aa_put_label(rcu_access_pointer(ctx->label)); } static int common_file_perm(const char *op, struct file *file, u32 mask, bool in_atomic) { struct aa_label *label; int error = 0; /* don't reaudit files closed during inheritance */ if (file->f_path.dentry == aa_null.dentry) return -EACCES; label = __begin_current_label_crit_section(); error = aa_file_perm(op, current_cred(), label, file, mask, in_atomic); __end_current_label_crit_section(label); return error; } static int apparmor_file_receive(struct file *file) { return common_file_perm(OP_FRECEIVE, file, aa_map_file_to_perms(file), false); } static int apparmor_file_permission(struct file *file, int mask) { return common_file_perm(OP_FPERM, file, mask, false); } static int apparmor_file_lock(struct file *file, unsigned int cmd) { u32 mask = AA_MAY_LOCK; if (cmd == F_WRLCK) mask |= MAY_WRITE; return common_file_perm(OP_FLOCK, file, mask, false); } static int common_mmap(const char *op, struct file *file, unsigned long prot, unsigned long flags, bool in_atomic) { int mask = 0; if (!file || !file_ctx(file)) return 0; if (prot & PROT_READ) mask |= MAY_READ; /* * Private mappings don't require write perms since they don't * write back to the files */ if ((prot & PROT_WRITE) && !(flags & MAP_PRIVATE)) mask |= MAY_WRITE; if (prot & PROT_EXEC) mask |= AA_EXEC_MMAP; return common_file_perm(op, file, mask, in_atomic); } static int apparmor_mmap_file(struct file *file, unsigned long reqprot, unsigned long prot, unsigned long flags) { return common_mmap(OP_FMMAP, file, prot, flags, GFP_ATOMIC); } static int apparmor_file_mprotect(struct vm_area_struct *vma, unsigned long reqprot, unsigned long prot) { return common_mmap(OP_FMPROT, vma->vm_file, prot, !(vma->vm_flags & VM_SHARED) ? MAP_PRIVATE : 0, false); } #ifdef CONFIG_IO_URING static const char *audit_uring_mask(u32 mask) { if (mask & AA_MAY_CREATE_SQPOLL) return "sqpoll"; if (mask & AA_MAY_OVERRIDE_CRED) return "override_creds"; return ""; } static void audit_uring_cb(struct audit_buffer *ab, void *va) { struct apparmor_audit_data *ad = aad_of_va(va); if (ad->request & AA_URING_PERM_MASK) { audit_log_format(ab, " requested=\"%s\"", audit_uring_mask(ad->request)); if (ad->denied & AA_URING_PERM_MASK) { audit_log_format(ab, " denied=\"%s\"", audit_uring_mask(ad->denied)); } } if (ad->uring.target) { audit_log_format(ab, " tcontext="); aa_label_xaudit(ab, labels_ns(ad->subj_label), ad->uring.target, FLAGS_NONE, GFP_ATOMIC); } } static int profile_uring(struct aa_profile *profile, u32 request, struct aa_label *new, int cap, struct apparmor_audit_data *ad) { unsigned int state; struct aa_ruleset *rules; int error = 0; AA_BUG(!profile); rules = list_first_entry(&profile->rules, typeof(*rules), list); state = RULE_MEDIATES(rules, AA_CLASS_IO_URING); if (state) { struct aa_perms perms = { }; if (new) { aa_label_match(profile, rules, new, state, false, request, &perms); } else { perms = *aa_lookup_perms(rules->policy, state); } aa_apply_modes_to_perms(profile, &perms); error = aa_check_perms(profile, &perms, request, ad, audit_uring_cb); } return error; } /** * apparmor_uring_override_creds - check the requested cred override * @new: the target creds * * Check to see if the current task is allowed to override it's credentials * to service an io_uring operation. */ static int apparmor_uring_override_creds(const struct cred *new) { struct aa_profile *profile; struct aa_label *label; int error; DEFINE_AUDIT_DATA(ad, LSM_AUDIT_DATA_NONE, AA_CLASS_IO_URING, OP_URING_OVERRIDE); ad.uring.target = cred_label(new); label = __begin_current_label_crit_section(); error = fn_for_each(label, profile, profile_uring(profile, AA_MAY_OVERRIDE_CRED, cred_label(new), CAP_SYS_ADMIN, &ad)); __end_current_label_crit_section(label); return error; } /** * apparmor_uring_sqpoll - check if a io_uring polling thread can be created * * Check to see if the current task is allowed to create a new io_uring * kernel polling thread. */ static int apparmor_uring_sqpoll(void) { struct aa_profile *profile; struct aa_label *label; int error; DEFINE_AUDIT_DATA(ad, LSM_AUDIT_DATA_NONE, AA_CLASS_IO_URING, OP_URING_SQPOLL); label = __begin_current_label_crit_section(); error = fn_for_each(label, profile, profile_uring(profile, AA_MAY_CREATE_SQPOLL, NULL, CAP_SYS_ADMIN, &ad)); __end_current_label_crit_section(label); return error; } #endif /* CONFIG_IO_URING */ static int apparmor_sb_mount(const char *dev_name, const struct path *path, const char *type, unsigned long flags, void *data) { struct aa_label *label; int error = 0; /* Discard magic */ if ((flags & MS_MGC_MSK) == MS_MGC_VAL) flags &= ~MS_MGC_MSK; flags &= ~AA_MS_IGNORE_MASK; label = __begin_current_label_crit_section(); if (!unconfined(label)) { if (flags & MS_REMOUNT) error = aa_remount(current_cred(), label, path, flags, data); else if (flags & MS_BIND) error = aa_bind_mount(current_cred(), label, path, dev_name, flags); else if (flags & (MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE)) error = aa_mount_change_type(current_cred(), label, path, flags); else if (flags & MS_MOVE) error = aa_move_mount_old(current_cred(), label, path, dev_name); else error = aa_new_mount(current_cred(), label, dev_name, path, type, flags, data); } __end_current_label_crit_section(label); return error; } static int apparmor_move_mount(const struct path *from_path, const struct path *to_path) { struct aa_label *label; int error = 0; label = __begin_current_label_crit_section(); if (!unconfined(label)) error = aa_move_mount(current_cred(), label, from_path, to_path); __end_current_label_crit_section(label); return error; } static int apparmor_sb_umount(struct vfsmount *mnt, int flags) { struct aa_label *label; int error = 0; label = __begin_current_label_crit_section(); if (!unconfined(label)) error = aa_umount(current_cred(), label, mnt, flags); __end_current_label_crit_section(label); return error; } static int apparmor_sb_pivotroot(const struct path *old_path, const struct path *new_path) { struct aa_label *label; int error = 0; label = aa_get_current_label(); if (!unconfined(label)) error = aa_pivotroot(current_cred(), label, old_path, new_path); aa_put_label(label); return error; } static int apparmor_getselfattr(unsigned int attr, struct lsm_ctx __user *lx, u32 *size, u32 flags) { int error = -ENOENT; struct aa_task_ctx *ctx = task_ctx(current); struct aa_label *label = NULL; char *value = NULL; switch (attr) { case LSM_ATTR_CURRENT: label = aa_get_newest_label(cred_label(current_cred())); break; case LSM_ATTR_PREV: if (ctx->previous) label = aa_get_newest_label(ctx->previous); break; case LSM_ATTR_EXEC: if (ctx->onexec) label = aa_get_newest_label(ctx->onexec); break; default: error = -EOPNOTSUPP; break; } if (label) { error = aa_getprocattr(label, &value, false); if (error > 0) error = lsm_fill_user_ctx(lx, size, value, error, LSM_ID_APPARMOR, 0); kfree(value); } aa_put_label(label); if (error < 0) return error; return 1; } static int apparmor_getprocattr(struct task_struct *task, const char *name, char **value) { int error = -ENOENT; /* released below */ const struct cred *cred = get_task_cred(task); struct aa_task_ctx *ctx = task_ctx(current); struct aa_label *label = NULL; if (strcmp(name, "current") == 0) label = aa_get_newest_label(cred_label(cred)); else if (strcmp(name, "prev") == 0 && ctx->previous) label = aa_get_newest_label(ctx->previous); else if (strcmp(name, "exec") == 0 && ctx->onexec) label = aa_get_newest_label(ctx->onexec); else error = -EINVAL; if (label) error = aa_getprocattr(label, value, true); aa_put_label(label); put_cred(cred); return error; } static int do_setattr(u64 attr, void *value, size_t size) { char *command, *largs = NULL, *args = value; size_t arg_size; int error; DEFINE_AUDIT_DATA(ad, LSM_AUDIT_DATA_NONE, AA_CLASS_NONE, OP_SETPROCATTR); if (size == 0) return -EINVAL; /* AppArmor requires that the buffer must be null terminated atm */ if (args[size - 1] != '\0') { /* null terminate */ largs = args = kmalloc(size + 1, GFP_KERNEL); if (!args) return -ENOMEM; memcpy(args, value, size); args[size] = '\0'; } error = -EINVAL; args = strim(args); command = strsep(&args, " "); if (!args) goto out; args = skip_spaces(args); if (!*args) goto out; arg_size = size - (args - (largs ? largs : (char *) value)); if (attr == LSM_ATTR_CURRENT) { if (strcmp(command, "changehat") == 0) { error = aa_setprocattr_changehat(args, arg_size, AA_CHANGE_NOFLAGS); } else if (strcmp(command, "permhat") == 0) { error = aa_setprocattr_changehat(args, arg_size, AA_CHANGE_TEST); } else if (strcmp(command, "changeprofile") == 0) { error = aa_change_profile(args, AA_CHANGE_NOFLAGS); } else if (strcmp(command, "permprofile") == 0) { error = aa_change_profile(args, AA_CHANGE_TEST); } else if (strcmp(command, "stack") == 0) { error = aa_change_profile(args, AA_CHANGE_STACK); } else goto fail; } else if (attr == LSM_ATTR_EXEC) { if (strcmp(command, "exec") == 0) error = aa_change_profile(args, AA_CHANGE_ONEXEC); else if (strcmp(command, "stack") == 0) error = aa_change_profile(args, (AA_CHANGE_ONEXEC | AA_CHANGE_STACK)); else goto fail; } else /* only support the "current" and "exec" process attributes */ goto fail; if (!error) error = size; out: kfree(largs); return error; fail: ad.subj_label = begin_current_label_crit_section(); if (attr == LSM_ATTR_CURRENT) ad.info = "current"; else if (attr == LSM_ATTR_EXEC) ad.info = "exec"; else ad.info = "invalid"; ad.error = error = -EINVAL; aa_audit_msg(AUDIT_APPARMOR_DENIED, &ad, NULL); end_current_label_crit_section(ad.subj_label); goto out; } static int apparmor_setselfattr(unsigned int attr, struct lsm_ctx *ctx, u32 size, u32 flags) { int rc; if (attr != LSM_ATTR_CURRENT && attr != LSM_ATTR_EXEC) return -EOPNOTSUPP; rc = do_setattr(attr, ctx->ctx, ctx->ctx_len); if (rc > 0) return 0; return rc; } static int apparmor_setprocattr(const char *name, void *value, size_t size) { int attr = lsm_name_to_attr(name); if (attr) return do_setattr(attr, value, size); return -EINVAL; } /** * apparmor_bprm_committing_creds - do task cleanup on committing new creds * @bprm: binprm for the exec (NOT NULL) */ static void apparmor_bprm_committing_creds(const struct linux_binprm *bprm) { struct aa_label *label = aa_current_raw_label(); struct aa_label *new_label = cred_label(bprm->cred); /* bail out if unconfined or not changing profile */ if ((new_label->proxy == label->proxy) || (unconfined(new_label))) return; aa_inherit_files(bprm->cred, current->files); current->pdeath_signal = 0; /* reset soft limits and set hard limits for the new label */ __aa_transition_rlimits(label, new_label); } /** * apparmor_bprm_committed_creds() - do cleanup after new creds committed * @bprm: binprm for the exec (NOT NULL) */ static void apparmor_bprm_committed_creds(const struct linux_binprm *bprm) { /* clear out temporary/transitional state from the context */ aa_clear_task_ctx_trans(task_ctx(current)); return; } static void apparmor_current_getsecid_subj(u32 *secid) { struct aa_label *label = __begin_current_label_crit_section(); *secid = label->secid; __end_current_label_crit_section(label); } static void apparmor_task_getsecid_obj(struct task_struct *p, u32 *secid) { struct aa_label *label = aa_get_task_label(p); *secid = label->secid; aa_put_label(label); } static int apparmor_task_setrlimit(struct task_struct *task, unsigned int resource, struct rlimit *new_rlim) { struct aa_label *label = __begin_current_label_crit_section(); int error = 0; if (!unconfined(label)) error = aa_task_setrlimit(current_cred(), label, task, resource, new_rlim); __end_current_label_crit_section(label); return error; } static int apparmor_task_kill(struct task_struct *target, struct kernel_siginfo *info, int sig, const struct cred *cred) { const struct cred *tc; struct aa_label *cl, *tl; int error; tc = get_task_cred(target); tl = aa_get_newest_cred_label(tc); if (cred) { /* * Dealing with USB IO specific behavior */ cl = aa_get_newest_cred_label(cred); error = aa_may_signal(cred, cl, tc, tl, sig); aa_put_label(cl); } else { cl = __begin_current_label_crit_section(); error = aa_may_signal(current_cred(), cl, tc, tl, sig); __end_current_label_crit_section(cl); } aa_put_label(tl); put_cred(tc); return error; } static int apparmor_userns_create(const struct cred *cred) { struct aa_label *label; struct aa_profile *profile; int error = 0; DEFINE_AUDIT_DATA(ad, LSM_AUDIT_DATA_TASK, AA_CLASS_NS, OP_USERNS_CREATE); ad.subj_cred = current_cred(); label = begin_current_label_crit_section(); if (!unconfined(label)) { error = fn_for_each(label, profile, aa_profile_ns_perm(profile, &ad, AA_USERNS_CREATE)); } end_current_label_crit_section(label); return error; } static int apparmor_sk_alloc_security(struct sock *sk, int family, gfp_t flags) { struct aa_sk_ctx *ctx; ctx = kzalloc(sizeof(*ctx), flags); if (!ctx) return -ENOMEM; sk->sk_security = ctx; return 0; } static void apparmor_sk_free_security(struct sock *sk) { struct aa_sk_ctx *ctx = aa_sock(sk); sk->sk_security = NULL; aa_put_label(ctx->label); aa_put_label(ctx->peer); kfree(ctx); } /** * apparmor_sk_clone_security - clone the sk_security field * @sk: sock to have security cloned * @newsk: sock getting clone */ static void apparmor_sk_clone_security(const struct sock *sk, struct sock *newsk) { struct aa_sk_ctx *ctx = aa_sock(sk); struct aa_sk_ctx *new = aa_sock(newsk); if (new->label) aa_put_label(new->label); new->label = aa_get_label(ctx->label); if (new->peer) aa_put_label(new->peer); new->peer = aa_get_label(ctx->peer); } static int apparmor_socket_create(int family, int type, int protocol, int kern) { struct aa_label *label; int error = 0; AA_BUG(in_interrupt()); label = begin_current_label_crit_section(); if (!(kern || unconfined(label))) error = af_select(family, create_perm(label, family, type, protocol), aa_af_perm(current_cred(), label, OP_CREATE, AA_MAY_CREATE, family, type, protocol)); end_current_label_crit_section(label); return error; } /** * apparmor_socket_post_create - setup the per-socket security struct * @sock: socket that is being setup * @family: family of socket being created * @type: type of the socket * @protocol: protocol of the socket * @kern: socket is a special kernel socket * * Note: * - kernel sockets labeled kernel_t used to use unconfined * - socket may not have sk here if created with sock_create_lite or * sock_alloc. These should be accept cases which will be handled in * sock_graft. */ static int apparmor_socket_post_create(struct socket *sock, int family, int type, int protocol, int kern) { struct aa_label *label; if (kern) { label = aa_get_label(kernel_t); } else label = aa_get_current_label(); if (sock->sk) { struct aa_sk_ctx *ctx = aa_sock(sock->sk); aa_put_label(ctx->label); ctx->label = aa_get_label(label); } aa_put_label(label); return 0; } static int apparmor_socket_bind(struct socket *sock, struct sockaddr *address, int addrlen) { AA_BUG(!sock); AA_BUG(!sock->sk); AA_BUG(!address); AA_BUG(in_interrupt()); return af_select(sock->sk->sk_family, bind_perm(sock, address, addrlen), aa_sk_perm(OP_BIND, AA_MAY_BIND, sock->sk)); } static int apparmor_socket_connect(struct socket *sock, struct sockaddr *address, int addrlen) { AA_BUG(!sock); AA_BUG(!sock->sk); AA_BUG(!address); AA_BUG(in_interrupt()); return af_select(sock->sk->sk_family, connect_perm(sock, address, addrlen), aa_sk_perm(OP_CONNECT, AA_MAY_CONNECT, sock->sk)); } static int apparmor_socket_listen(struct socket *sock, int backlog) { AA_BUG(!sock); AA_BUG(!sock->sk); AA_BUG(in_interrupt()); return af_select(sock->sk->sk_family, listen_perm(sock, backlog), aa_sk_perm(OP_LISTEN, AA_MAY_LISTEN, sock->sk)); } /* * Note: while @newsock is created and has some information, the accept * has not been done. */ static int apparmor_socket_accept(struct socket *sock, struct socket *newsock) { AA_BUG(!sock); AA_BUG(!sock->sk); AA_BUG(!newsock); AA_BUG(in_interrupt()); return af_select(sock->sk->sk_family, accept_perm(sock, newsock), aa_sk_perm(OP_ACCEPT, AA_MAY_ACCEPT, sock->sk)); } static int aa_sock_msg_perm(const char *op, u32 request, struct socket *sock, struct msghdr *msg, int size) { AA_BUG(!sock); AA_BUG(!sock->sk); AA_BUG(!msg); AA_BUG(in_interrupt()); return af_select(sock->sk->sk_family, msg_perm(op, request, sock, msg, size), aa_sk_perm(op, request, sock->sk)); } static int apparmor_socket_sendmsg(struct socket *sock, struct msghdr *msg, int size) { return aa_sock_msg_perm(OP_SENDMSG, AA_MAY_SEND, sock, msg, size); } static int apparmor_socket_recvmsg(struct socket *sock, struct msghdr *msg, int size, int flags) { return aa_sock_msg_perm(OP_RECVMSG, AA_MAY_RECEIVE, sock, msg, size); } /* revaliation, get/set attr, shutdown */ static int aa_sock_perm(const char *op, u32 request, struct socket *sock) { AA_BUG(!sock); AA_BUG(!sock->sk); AA_BUG(in_interrupt()); return af_select(sock->sk->sk_family, sock_perm(op, request, sock), aa_sk_perm(op, request, sock->sk)); } static int apparmor_socket_getsockname(struct socket *sock) { return aa_sock_perm(OP_GETSOCKNAME, AA_MAY_GETATTR, sock); } static int apparmor_socket_getpeername(struct socket *sock) { return aa_sock_perm(OP_GETPEERNAME, AA_MAY_GETATTR, sock); } /* revaliation, get/set attr, opt */ static int aa_sock_opt_perm(const char *op, u32 request, struct socket *sock, int level, int optname) { AA_BUG(!sock); AA_BUG(!sock->sk); AA_BUG(in_interrupt()); return af_select(sock->sk->sk_family, opt_perm(op, request, sock, level, optname), aa_sk_perm(op, request, sock->sk)); } static int apparmor_socket_getsockopt(struct socket *sock, int level, int optname) { return aa_sock_opt_perm(OP_GETSOCKOPT, AA_MAY_GETOPT, sock, level, optname); } static int apparmor_socket_setsockopt(struct socket *sock, int level, int optname) { return aa_sock_opt_perm(OP_SETSOCKOPT, AA_MAY_SETOPT, sock, level, optname); } static int apparmor_socket_shutdown(struct socket *sock, int how) { return aa_sock_perm(OP_SHUTDOWN, AA_MAY_SHUTDOWN, sock); } #ifdef CONFIG_NETWORK_SECMARK /** * apparmor_socket_sock_rcv_skb - check perms before associating skb to sk * @sk: sk to associate @skb with * @skb: skb to check for perms * * Note: can not sleep may be called with locks held * * dont want protocol specific in __skb_recv_datagram() * to deny an incoming connection socket_sock_rcv_skb() */ static int apparmor_socket_sock_rcv_skb(struct sock *sk, struct sk_buff *skb) { struct aa_sk_ctx *ctx = aa_sock(sk); if (!skb->secmark) return 0; /* * If reach here before socket_post_create hook is called, in which * case label is null, drop the packet. */ if (!ctx->label) return -EACCES; return apparmor_secmark_check(ctx->label, OP_RECVMSG, AA_MAY_RECEIVE, skb->secmark, sk); } #endif static struct aa_label *sk_peer_label(struct sock *sk) { struct aa_sk_ctx *ctx = aa_sock(sk); if (ctx->peer) return ctx->peer; return ERR_PTR(-ENOPROTOOPT); } /** * apparmor_socket_getpeersec_stream - get security context of peer * @sock: socket that we are trying to get the peer context of * @optval: output - buffer to copy peer name to * @optlen: output - size of copied name in @optval * @len: size of @optval buffer * Returns: 0 on success, -errno of failure * * Note: for tcp only valid if using ipsec or cipso on lan */ static int apparmor_socket_getpeersec_stream(struct socket *sock, sockptr_t optval, sockptr_t optlen, unsigned int len) { char *name = NULL; int slen, error = 0; struct aa_label *label; struct aa_label *peer; label = begin_current_label_crit_section(); peer = sk_peer_label(sock->sk); if (IS_ERR(peer)) { error = PTR_ERR(peer); goto done; } slen = aa_label_asxprint(&name, labels_ns(label), peer, FLAG_SHOW_MODE | FLAG_VIEW_SUBNS | FLAG_HIDDEN_UNCONFINED, GFP_KERNEL); /* don't include terminating \0 in slen, it breaks some apps */ if (slen < 0) { error = -ENOMEM; goto done; } if (slen > len) { error = -ERANGE; goto done_len; } if (copy_to_sockptr(optval, name, slen)) error = -EFAULT; done_len: if (copy_to_sockptr(optlen, &slen, sizeof(slen))) error = -EFAULT; done: end_current_label_crit_section(label); kfree(name); return error; } /** * apparmor_socket_getpeersec_dgram - get security label of packet * @sock: the peer socket * @skb: packet data * @secid: pointer to where to put the secid of the packet * * Sets the netlabel socket state on sk from parent */ static int apparmor_socket_getpeersec_dgram(struct socket *sock, struct sk_buff *skb, u32 *secid) { /* TODO: requires secid support */ return -ENOPROTOOPT; } /** * apparmor_sock_graft - Initialize newly created socket * @sk: child sock * @parent: parent socket * * Note: could set off of SOCK_CTX(parent) but need to track inode and we can * just set sk security information off of current creating process label * Labeling of sk for accept case - probably should be sock based * instead of task, because of the case where an implicitly labeled * socket is shared by different tasks. */ static void apparmor_sock_graft(struct sock *sk, struct socket *parent) { struct aa_sk_ctx *ctx = aa_sock(sk); if (!ctx->label) ctx->label = aa_get_current_label(); } #ifdef CONFIG_NETWORK_SECMARK static int apparmor_inet_conn_request(const struct sock *sk, struct sk_buff *skb, struct request_sock *req) { struct aa_sk_ctx *ctx = aa_sock(sk); if (!skb->secmark) return 0; return apparmor_secmark_check(ctx->label, OP_CONNECT, AA_MAY_CONNECT, skb->secmark, sk); } #endif /* * The cred blob is a pointer to, not an instance of, an aa_label. */ struct lsm_blob_sizes apparmor_blob_sizes __ro_after_init = { .lbs_cred = sizeof(struct aa_label *), .lbs_file = sizeof(struct aa_file_ctx), .lbs_task = sizeof(struct aa_task_ctx), }; static const struct lsm_id apparmor_lsmid = { .name = "apparmor", .id = LSM_ID_APPARMOR, }; static struct security_hook_list apparmor_hooks[] __ro_after_init = { LSM_HOOK_INIT(ptrace_access_check, apparmor_ptrace_access_check), LSM_HOOK_INIT(ptrace_traceme, apparmor_ptrace_traceme), LSM_HOOK_INIT(capget, apparmor_capget), LSM_HOOK_INIT(capable, apparmor_capable), LSM_HOOK_INIT(move_mount, apparmor_move_mount), LSM_HOOK_INIT(sb_mount, apparmor_sb_mount), LSM_HOOK_INIT(sb_umount, apparmor_sb_umount), LSM_HOOK_INIT(sb_pivotroot, apparmor_sb_pivotroot), LSM_HOOK_INIT(path_link, apparmor_path_link), LSM_HOOK_INIT(path_unlink, apparmor_path_unlink), LSM_HOOK_INIT(path_symlink, apparmor_path_symlink), LSM_HOOK_INIT(path_mkdir, apparmor_path_mkdir), LSM_HOOK_INIT(path_rmdir, apparmor_path_rmdir), LSM_HOOK_INIT(path_mknod, apparmor_path_mknod), LSM_HOOK_INIT(path_rename, apparmor_path_rename), LSM_HOOK_INIT(path_chmod, apparmor_path_chmod), LSM_HOOK_INIT(path_chown, apparmor_path_chown), LSM_HOOK_INIT(path_truncate, apparmor_path_truncate), LSM_HOOK_INIT(inode_getattr, apparmor_inode_getattr), LSM_HOOK_INIT(file_open, apparmor_file_open), LSM_HOOK_INIT(file_receive, apparmor_file_receive), LSM_HOOK_INIT(file_permission, apparmor_file_permission), LSM_HOOK_INIT(file_alloc_security, apparmor_file_alloc_security), LSM_HOOK_INIT(file_free_security, apparmor_file_free_security), LSM_HOOK_INIT(mmap_file, apparmor_mmap_file), LSM_HOOK_INIT(file_mprotect, apparmor_file_mprotect), LSM_HOOK_INIT(file_lock, apparmor_file_lock), LSM_HOOK_INIT(file_truncate, apparmor_file_truncate), LSM_HOOK_INIT(getselfattr, apparmor_getselfattr), LSM_HOOK_INIT(setselfattr, apparmor_setselfattr), LSM_HOOK_INIT(getprocattr, apparmor_getprocattr), LSM_HOOK_INIT(setprocattr, apparmor_setprocattr), LSM_HOOK_INIT(sk_alloc_security, apparmor_sk_alloc_security), LSM_HOOK_INIT(sk_free_security, apparmor_sk_free_security), LSM_HOOK_INIT(sk_clone_security, apparmor_sk_clone_security), LSM_HOOK_INIT(socket_create, apparmor_socket_create), LSM_HOOK_INIT(socket_post_create, apparmor_socket_post_create), LSM_HOOK_INIT(socket_bind, apparmor_socket_bind), LSM_HOOK_INIT(socket_connect, apparmor_socket_connect), LSM_HOOK_INIT(socket_listen, apparmor_socket_listen), LSM_HOOK_INIT(socket_accept, apparmor_socket_accept), LSM_HOOK_INIT(socket_sendmsg, apparmor_socket_sendmsg), LSM_HOOK_INIT(socket_recvmsg, apparmor_socket_recvmsg), LSM_HOOK_INIT(socket_getsockname, apparmor_socket_getsockname), LSM_HOOK_INIT(socket_getpeername, apparmor_socket_getpeername), LSM_HOOK_INIT(socket_getsockopt, apparmor_socket_getsockopt), LSM_HOOK_INIT(socket_setsockopt, apparmor_socket_setsockopt), LSM_HOOK_INIT(socket_shutdown, apparmor_socket_shutdown), #ifdef CONFIG_NETWORK_SECMARK LSM_HOOK_INIT(socket_sock_rcv_skb, apparmor_socket_sock_rcv_skb), #endif LSM_HOOK_INIT(socket_getpeersec_stream, apparmor_socket_getpeersec_stream), LSM_HOOK_INIT(socket_getpeersec_dgram, apparmor_socket_getpeersec_dgram), LSM_HOOK_INIT(sock_graft, apparmor_sock_graft), #ifdef CONFIG_NETWORK_SECMARK LSM_HOOK_INIT(inet_conn_request, apparmor_inet_conn_request), #endif LSM_HOOK_INIT(cred_alloc_blank, apparmor_cred_alloc_blank), LSM_HOOK_INIT(cred_free, apparmor_cred_free), LSM_HOOK_INIT(cred_prepare, apparmor_cred_prepare), LSM_HOOK_INIT(cred_transfer, apparmor_cred_transfer), LSM_HOOK_INIT(bprm_creds_for_exec, apparmor_bprm_creds_for_exec), LSM_HOOK_INIT(bprm_committing_creds, apparmor_bprm_committing_creds), LSM_HOOK_INIT(bprm_committed_creds, apparmor_bprm_committed_creds), LSM_HOOK_INIT(task_free, apparmor_task_free), LSM_HOOK_INIT(task_alloc, apparmor_task_alloc), LSM_HOOK_INIT(current_getsecid_subj, apparmor_current_getsecid_subj), LSM_HOOK_INIT(task_getsecid_obj, apparmor_task_getsecid_obj), LSM_HOOK_INIT(task_setrlimit, apparmor_task_setrlimit), LSM_HOOK_INIT(task_kill, apparmor_task_kill), LSM_HOOK_INIT(userns_create, apparmor_userns_create), #ifdef CONFIG_AUDIT LSM_HOOK_INIT(audit_rule_init, aa_audit_rule_init), LSM_HOOK_INIT(audit_rule_known, aa_audit_rule_known), LSM_HOOK_INIT(audit_rule_match, aa_audit_rule_match), LSM_HOOK_INIT(audit_rule_free, aa_audit_rule_free), #endif LSM_HOOK_INIT(secid_to_secctx, apparmor_secid_to_secctx), LSM_HOOK_INIT(secctx_to_secid, apparmor_secctx_to_secid), LSM_HOOK_INIT(release_secctx, apparmor_release_secctx), #ifdef CONFIG_IO_URING LSM_HOOK_INIT(uring_override_creds, apparmor_uring_override_creds), LSM_HOOK_INIT(uring_sqpoll, apparmor_uring_sqpoll), #endif }; /* * AppArmor sysfs module parameters */ static int param_set_aabool(const char *val, const struct kernel_param *kp); static int param_get_aabool(char *buffer, const struct kernel_param *kp); #define param_check_aabool param_check_bool static const struct kernel_param_ops param_ops_aabool = { .flags = KERNEL_PARAM_OPS_FL_NOARG, .set = param_set_aabool, .get = param_get_aabool }; static int param_set_aauint(const char *val, const struct kernel_param *kp); static int param_get_aauint(char *buffer, const struct kernel_param *kp); #define param_check_aauint param_check_uint static const struct kernel_param_ops param_ops_aauint = { .set = param_set_aauint, .get = param_get_aauint }; static int param_set_aacompressionlevel(const char *val, const struct kernel_param *kp); static int param_get_aacompressionlevel(char *buffer, const struct kernel_param *kp); #define param_check_aacompressionlevel param_check_int static const struct kernel_param_ops param_ops_aacompressionlevel = { .set = param_set_aacompressionlevel, .get = param_get_aacompressionlevel }; static int param_set_aalockpolicy(const char *val, const struct kernel_param *kp); static int param_get_aalockpolicy(char *buffer, const struct kernel_param *kp); #define param_check_aalockpolicy param_check_bool static const struct kernel_param_ops param_ops_aalockpolicy = { .flags = KERNEL_PARAM_OPS_FL_NOARG, .set = param_set_aalockpolicy, .get = param_get_aalockpolicy }; static int param_set_audit(const char *val, const struct kernel_param *kp); static int param_get_audit(char *buffer, const struct kernel_param *kp); static int param_set_mode(const char *val, const struct kernel_param *kp); static int param_get_mode(char *buffer, const struct kernel_param *kp); /* Flag values, also controllable via /sys/module/apparmor/parameters * We define special types as we want to do additional mediation. */ /* AppArmor global enforcement switch - complain, enforce, kill */ enum profile_mode aa_g_profile_mode = APPARMOR_ENFORCE; module_param_call(mode, param_set_mode, param_get_mode, &aa_g_profile_mode, S_IRUSR | S_IWUSR); /* whether policy verification hashing is enabled */ bool aa_g_hash_policy = IS_ENABLED(CONFIG_SECURITY_APPARMOR_HASH_DEFAULT); #ifdef CONFIG_SECURITY_APPARMOR_HASH module_param_named(hash_policy, aa_g_hash_policy, aabool, S_IRUSR | S_IWUSR); #endif /* whether policy exactly as loaded is retained for debug and checkpointing */ bool aa_g_export_binary = IS_ENABLED(CONFIG_SECURITY_APPARMOR_EXPORT_BINARY); #ifdef CONFIG_SECURITY_APPARMOR_EXPORT_BINARY module_param_named(export_binary, aa_g_export_binary, aabool, 0600); #endif /* policy loaddata compression level */ int aa_g_rawdata_compression_level = AA_DEFAULT_CLEVEL; module_param_named(rawdata_compression_level, aa_g_rawdata_compression_level, aacompressionlevel, 0400); /* Debug mode */ bool aa_g_debug = IS_ENABLED(CONFIG_SECURITY_APPARMOR_DEBUG_MESSAGES); module_param_named(debug, aa_g_debug, aabool, S_IRUSR | S_IWUSR); /* Audit mode */ enum audit_mode aa_g_audit; module_param_call(audit, param_set_audit, param_get_audit, &aa_g_audit, S_IRUSR | S_IWUSR); /* Determines if audit header is included in audited messages. This * provides more context if the audit daemon is not running */ bool aa_g_audit_header = true; module_param_named(audit_header, aa_g_audit_header, aabool, S_IRUSR | S_IWUSR); /* lock out loading/removal of policy * TODO: add in at boot loading of policy, which is the only way to * load policy, if lock_policy is set */ bool aa_g_lock_policy; module_param_named(lock_policy, aa_g_lock_policy, aalockpolicy, S_IRUSR | S_IWUSR); /* Syscall logging mode */ bool aa_g_logsyscall; module_param_named(logsyscall, aa_g_logsyscall, aabool, S_IRUSR | S_IWUSR); /* Maximum pathname length before accesses will start getting rejected */ unsigned int aa_g_path_max = 2 * PATH_MAX; module_param_named(path_max, aa_g_path_max, aauint, S_IRUSR); /* Determines how paranoid loading of policy is and how much verification * on the loaded policy is done. * DEPRECATED: read only as strict checking of load is always done now * that none root users (user namespaces) can load policy. */ bool aa_g_paranoid_load = IS_ENABLED(CONFIG_SECURITY_APPARMOR_PARANOID_LOAD); module_param_named(paranoid_load, aa_g_paranoid_load, aabool, S_IRUGO); static int param_get_aaintbool(char *buffer, const struct kernel_param *kp); static int param_set_aaintbool(const char *val, const struct kernel_param *kp); #define param_check_aaintbool param_check_int static const struct kernel_param_ops param_ops_aaintbool = { .set = param_set_aaintbool, .get = param_get_aaintbool }; /* Boot time disable flag */ static int apparmor_enabled __ro_after_init = 1; module_param_named(enabled, apparmor_enabled, aaintbool, 0444); static int __init apparmor_enabled_setup(char *str) { unsigned long enabled; int error = kstrtoul(str, 0, &enabled); if (!error) apparmor_enabled = enabled ? 1 : 0; return 1; } __setup("apparmor=", apparmor_enabled_setup); /* set global flag turning off the ability to load policy */ static int param_set_aalockpolicy(const char *val, const struct kernel_param *kp) { if (!apparmor_enabled) return -EINVAL; if (apparmor_initialized && !aa_current_policy_admin_capable(NULL)) return -EPERM; return param_set_bool(val, kp); } static int param_get_aalockpolicy(char *buffer, const struct kernel_param *kp) { if (!apparmor_enabled) return -EINVAL; if (apparmor_initialized && !aa_current_policy_view_capable(NULL)) return -EPERM; return param_get_bool(buffer, kp); } static int param_set_aabool(const char *val, const struct kernel_param *kp) { if (!apparmor_enabled) return -EINVAL; if (apparmor_initialized && !aa_current_policy_admin_capable(NULL)) return -EPERM; return param_set_bool(val, kp); } static int param_get_aabool(char *buffer, const struct kernel_param *kp) { if (!apparmor_enabled) return -EINVAL; if (apparmor_initialized && !aa_current_policy_view_capable(NULL)) return -EPERM; return param_get_bool(buffer, kp); } static int param_set_aauint(const char *val, const struct kernel_param *kp) { int error; if (!apparmor_enabled) return -EINVAL; /* file is ro but enforce 2nd line check */ if (apparmor_initialized) return -EPERM; error = param_set_uint(val, kp); aa_g_path_max = max_t(uint32_t, aa_g_path_max, sizeof(union aa_buffer)); pr_info("AppArmor: buffer size set to %d bytes\n", aa_g_path_max); return error; } static int param_get_aauint(char *buffer, const struct kernel_param *kp) { if (!apparmor_enabled) return -EINVAL; if (apparmor_initialized && !aa_current_policy_view_capable(NULL)) return -EPERM; return param_get_uint(buffer, kp); } /* Can only be set before AppArmor is initialized (i.e. on boot cmdline). */ static int param_set_aaintbool(const char *val, const struct kernel_param *kp) { struct kernel_param kp_local; bool value; int error; if (apparmor_initialized) return -EPERM; /* Create local copy, with arg pointing to bool type. */ value = !!*((int *)kp->arg); memcpy(&kp_local, kp, sizeof(kp_local)); kp_local.arg = &value; error = param_set_bool(val, &kp_local); if (!error) *((int *)kp->arg) = *((bool *)kp_local.arg); return error; } /* * To avoid changing /sys/module/apparmor/parameters/enabled from Y/N to * 1/0, this converts the "int that is actually bool" back to bool for * display in the /sys filesystem, while keeping it "int" for the LSM * infrastructure. */ static int param_get_aaintbool(char *buffer, const struct kernel_param *kp) { struct kernel_param kp_local; bool value; /* Create local copy, with arg pointing to bool type. */ value = !!*((int *)kp->arg); memcpy(&kp_local, kp, sizeof(kp_local)); kp_local.arg = &value; return param_get_bool(buffer, &kp_local); } static int param_set_aacompressionlevel(const char *val, const struct kernel_param *kp) { int error; if (!apparmor_enabled) return -EINVAL; if (apparmor_initialized) return -EPERM; error = param_set_int(val, kp); aa_g_rawdata_compression_level = clamp(aa_g_rawdata_compression_level, AA_MIN_CLEVEL, AA_MAX_CLEVEL); pr_info("AppArmor: policy rawdata compression level set to %d\n", aa_g_rawdata_compression_level); return error; } static int param_get_aacompressionlevel(char *buffer, const struct kernel_param *kp) { if (!apparmor_enabled) return -EINVAL; if (apparmor_initialized && !aa_current_policy_view_capable(NULL)) return -EPERM; return param_get_int(buffer, kp); } static int param_get_audit(char *buffer, const struct kernel_param *kp) { if (!apparmor_enabled) return -EINVAL; if (apparmor_initialized && !aa_current_policy_view_capable(NULL)) return -EPERM; return sprintf(buffer, "%s", audit_mode_names[aa_g_audit]); } static int param_set_audit(const char *val, const struct kernel_param *kp) { int i; if (!apparmor_enabled) return -EINVAL; if (!val) return -EINVAL; if (apparmor_initialized && !aa_current_policy_admin_capable(NULL)) return -EPERM; i = match_string(audit_mode_names, AUDIT_MAX_INDEX, val); if (i < 0) return -EINVAL; aa_g_audit = i; return 0; } static int param_get_mode(char *buffer, const struct kernel_param *kp) { if (!apparmor_enabled) return -EINVAL; if (apparmor_initialized && !aa_current_policy_view_capable(NULL)) return -EPERM; return sprintf(buffer, "%s", aa_profile_mode_names[aa_g_profile_mode]); } static int param_set_mode(const char *val, const struct kernel_param *kp) { int i; if (!apparmor_enabled) return -EINVAL; if (!val) return -EINVAL; if (apparmor_initialized && !aa_current_policy_admin_capable(NULL)) return -EPERM; i = match_string(aa_profile_mode_names, APPARMOR_MODE_NAMES_MAX_INDEX, val); if (i < 0) return -EINVAL; aa_g_profile_mode = i; return 0; } char *aa_get_buffer(bool in_atomic) { union aa_buffer *aa_buf; struct aa_local_cache *cache; bool try_again = true; gfp_t flags = (GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_NOWARN); /* use per cpu cached buffers first */ cache = get_cpu_ptr(&aa_local_buffers); if (!list_empty(&cache->head)) { aa_buf = list_first_entry(&cache->head, union aa_buffer, list); list_del(&aa_buf->list); cache->hold--; cache->count--; put_cpu_ptr(&aa_local_buffers); return &aa_buf->buffer[0]; } put_cpu_ptr(&aa_local_buffers); if (!spin_trylock(&aa_buffers_lock)) { cache = get_cpu_ptr(&aa_local_buffers); cache->hold += 1; put_cpu_ptr(&aa_local_buffers); spin_lock(&aa_buffers_lock); } else { cache = get_cpu_ptr(&aa_local_buffers); put_cpu_ptr(&aa_local_buffers); } retry: if (buffer_count > reserve_count || (in_atomic && !list_empty(&aa_global_buffers))) { aa_buf = list_first_entry(&aa_global_buffers, union aa_buffer, list); list_del(&aa_buf->list); buffer_count--; spin_unlock(&aa_buffers_lock); return aa_buf->buffer; } if (in_atomic) { /* * out of reserve buffers and in atomic context so increase * how many buffers to keep in reserve */ reserve_count++; flags = GFP_ATOMIC; } spin_unlock(&aa_buffers_lock); if (!in_atomic) might_sleep(); aa_buf = kmalloc(aa_g_path_max, flags); if (!aa_buf) { if (try_again) { try_again = false; spin_lock(&aa_buffers_lock); goto retry; } pr_warn_once("AppArmor: Failed to allocate a memory buffer.\n"); return NULL; } return aa_buf->buffer; } void aa_put_buffer(char *buf) { union aa_buffer *aa_buf; struct aa_local_cache *cache; if (!buf) return; aa_buf = container_of(buf, union aa_buffer, buffer[0]); cache = get_cpu_ptr(&aa_local_buffers); if (!cache->hold) { put_cpu_ptr(&aa_local_buffers); if (spin_trylock(&aa_buffers_lock)) { /* put back on global list */ list_add(&aa_buf->list, &aa_global_buffers); buffer_count++; spin_unlock(&aa_buffers_lock); cache = get_cpu_ptr(&aa_local_buffers); put_cpu_ptr(&aa_local_buffers); return; } /* contention on global list, fallback to percpu */ cache = get_cpu_ptr(&aa_local_buffers); cache->hold += 1; } /* cache in percpu list */ list_add(&aa_buf->list, &cache->head); cache->count++; put_cpu_ptr(&aa_local_buffers); } /* * AppArmor init functions */ /** * set_init_ctx - set a task context and profile on the first task. * * TODO: allow setting an alternate profile than unconfined */ static int __init set_init_ctx(void) { struct cred *cred = (__force struct cred *)current->real_cred; set_cred_label(cred, aa_get_label(ns_unconfined(root_ns))); return 0; } static void destroy_buffers(void) { union aa_buffer *aa_buf; spin_lock(&aa_buffers_lock); while (!list_empty(&aa_global_buffers)) { aa_buf = list_first_entry(&aa_global_buffers, union aa_buffer, list); list_del(&aa_buf->list); spin_unlock(&aa_buffers_lock); kfree(aa_buf); spin_lock(&aa_buffers_lock); } spin_unlock(&aa_buffers_lock); } static int __init alloc_buffers(void) { union aa_buffer *aa_buf; int i, num; /* * per cpu set of cached allocated buffers used to help reduce * lock contention */ for_each_possible_cpu(i) { per_cpu(aa_local_buffers, i).hold = 0; per_cpu(aa_local_buffers, i).count = 0; INIT_LIST_HEAD(&per_cpu(aa_local_buffers, i).head); } /* * A function may require two buffers at once. Usually the buffers are * used for a short period of time and are shared. On UP kernel buffers * two should be enough, with more CPUs it is possible that more * buffers will be used simultaneously. The preallocated pool may grow. * This preallocation has also the side-effect that AppArmor will be * disabled early at boot if aa_g_path_max is extremly high. */ if (num_online_cpus() > 1) num = 4 + RESERVE_COUNT; else num = 2 + RESERVE_COUNT; for (i = 0; i < num; i++) { aa_buf = kmalloc(aa_g_path_max, GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_NOWARN); if (!aa_buf) { destroy_buffers(); return -ENOMEM; } aa_put_buffer(aa_buf->buffer); } return 0; } #ifdef CONFIG_SYSCTL static int apparmor_dointvec(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { if (!aa_current_policy_admin_capable(NULL)) return -EPERM; if (!apparmor_enabled) return -EINVAL; return proc_dointvec(table, write, buffer, lenp, ppos); } static struct ctl_table apparmor_sysctl_table[] = { #ifdef CONFIG_USER_NS { .procname = "unprivileged_userns_apparmor_policy", .data = &unprivileged_userns_apparmor_policy, .maxlen = sizeof(int), .mode = 0600, .proc_handler = apparmor_dointvec, }, #endif /* CONFIG_USER_NS */ { .procname = "apparmor_display_secid_mode", .data = &apparmor_display_secid_mode, .maxlen = sizeof(int), .mode = 0600, .proc_handler = apparmor_dointvec, }, { .procname = "apparmor_restrict_unprivileged_unconfined", .data = &aa_unprivileged_unconfined_restricted, .maxlen = sizeof(int), .mode = 0600, .proc_handler = apparmor_dointvec, }, }; static int __init apparmor_init_sysctl(void) { return register_sysctl("kernel", apparmor_sysctl_table) ? 0 : -ENOMEM; } #else static inline int apparmor_init_sysctl(void) { return 0; } #endif /* CONFIG_SYSCTL */ #if defined(CONFIG_NETFILTER) && defined(CONFIG_NETWORK_SECMARK) static unsigned int apparmor_ip_postroute(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { struct aa_sk_ctx *ctx; struct sock *sk; if (!skb->secmark) return NF_ACCEPT; sk = skb_to_full_sk(skb); if (sk == NULL) return NF_ACCEPT; ctx = aa_sock(sk); if (!apparmor_secmark_check(ctx->label, OP_SENDMSG, AA_MAY_SEND, skb->secmark, sk)) return NF_ACCEPT; return NF_DROP_ERR(-ECONNREFUSED); } static const struct nf_hook_ops apparmor_nf_ops[] = { { .hook = apparmor_ip_postroute, .pf = NFPROTO_IPV4, .hooknum = NF_INET_POST_ROUTING, .priority = NF_IP_PRI_SELINUX_FIRST, }, #if IS_ENABLED(CONFIG_IPV6) { .hook = apparmor_ip_postroute, .pf = NFPROTO_IPV6, .hooknum = NF_INET_POST_ROUTING, .priority = NF_IP6_PRI_SELINUX_FIRST, }, #endif }; static int __net_init apparmor_nf_register(struct net *net) { return nf_register_net_hooks(net, apparmor_nf_ops, ARRAY_SIZE(apparmor_nf_ops)); } static void __net_exit apparmor_nf_unregister(struct net *net) { nf_unregister_net_hooks(net, apparmor_nf_ops, ARRAY_SIZE(apparmor_nf_ops)); } static struct pernet_operations apparmor_net_ops = { .init = apparmor_nf_register, .exit = apparmor_nf_unregister, }; static int __init apparmor_nf_ip_init(void) { int err; if (!apparmor_enabled) return 0; err = register_pernet_subsys(&apparmor_net_ops); if (err) panic("Apparmor: register_pernet_subsys: error %d\n", err); return 0; } __initcall(apparmor_nf_ip_init); #endif static char nulldfa_src[] = { #include "nulldfa.in" }; static struct aa_dfa *nulldfa; static char stacksplitdfa_src[] = { #include "stacksplitdfa.in" }; struct aa_dfa *stacksplitdfa; struct aa_policydb *nullpdb; static int __init aa_setup_dfa_engine(void) { int error = -ENOMEM; nullpdb = aa_alloc_pdb(GFP_KERNEL); if (!nullpdb) return -ENOMEM; nulldfa = aa_dfa_unpack(nulldfa_src, sizeof(nulldfa_src), TO_ACCEPT1_FLAG(YYTD_DATA32) | TO_ACCEPT2_FLAG(YYTD_DATA32)); if (IS_ERR(nulldfa)) { error = PTR_ERR(nulldfa); goto fail; } nullpdb->dfa = aa_get_dfa(nulldfa); nullpdb->perms = kcalloc(2, sizeof(struct aa_perms), GFP_KERNEL); if (!nullpdb->perms) goto fail; nullpdb->size = 2; stacksplitdfa = aa_dfa_unpack(stacksplitdfa_src, sizeof(stacksplitdfa_src), TO_ACCEPT1_FLAG(YYTD_DATA32) | TO_ACCEPT2_FLAG(YYTD_DATA32)); if (IS_ERR(stacksplitdfa)) { error = PTR_ERR(stacksplitdfa); goto fail; } return 0; fail: aa_put_pdb(nullpdb); aa_put_dfa(nulldfa); nullpdb = NULL; nulldfa = NULL; stacksplitdfa = NULL; return error; } static void __init aa_teardown_dfa_engine(void) { aa_put_dfa(stacksplitdfa); aa_put_dfa(nulldfa); aa_put_pdb(nullpdb); nullpdb = NULL; stacksplitdfa = NULL; nulldfa = NULL; } static int __init apparmor_init(void) { int error; error = aa_setup_dfa_engine(); if (error) { AA_ERROR("Unable to setup dfa engine\n"); goto alloc_out; } error = aa_alloc_root_ns(); if (error) { AA_ERROR("Unable to allocate default profile namespace\n"); goto alloc_out; } error = apparmor_init_sysctl(); if (error) { AA_ERROR("Unable to register sysctls\n"); goto alloc_out; } error = alloc_buffers(); if (error) { AA_ERROR("Unable to allocate work buffers\n"); goto alloc_out; } error = set_init_ctx(); if (error) { AA_ERROR("Failed to set context on init task\n"); aa_free_root_ns(); goto buffers_out; } security_add_hooks(apparmor_hooks, ARRAY_SIZE(apparmor_hooks), &apparmor_lsmid); /* Report that AppArmor successfully initialized */ apparmor_initialized = 1; if (aa_g_profile_mode == APPARMOR_COMPLAIN) aa_info_message("AppArmor initialized: complain mode enabled"); else if (aa_g_profile_mode == APPARMOR_KILL) aa_info_message("AppArmor initialized: kill mode enabled"); else aa_info_message("AppArmor initialized"); return error; buffers_out: destroy_buffers(); alloc_out: aa_destroy_aafs(); aa_teardown_dfa_engine(); apparmor_enabled = false; return error; } DEFINE_LSM(apparmor) = { .name = "apparmor", .flags = LSM_FLAG_LEGACY_MAJOR | LSM_FLAG_EXCLUSIVE, .enabled = &apparmor_enabled, .blobs = &apparmor_blob_sizes, .init = apparmor_init, }; |
32 32 27 15 188 189 100 189 188 188 191 90 5 89 89 91 89 91 98 97 98 98 35 97 4 1 98 2 98 4 97 96 97 4 1 4 4 4 4 14 1 13 13 13 1 5 8 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 | // SPDX-License-Identifier: GPL-2.0 /* * fs/proc_namespace.c - handling of /proc/<pid>/{mounts,mountinfo,mountstats} * * In fact, that's a piece of procfs; it's *almost* isolated from * the rest of fs/proc, but has rather close relationships with * fs/namespace.c, thus here instead of fs/proc * */ #include <linux/mnt_namespace.h> #include <linux/nsproxy.h> #include <linux/security.h> #include <linux/fs_struct.h> #include <linux/sched/task.h> #include "proc/internal.h" /* only for get_proc_task() in ->open() */ #include "pnode.h" #include "internal.h" static __poll_t mounts_poll(struct file *file, poll_table *wait) { struct seq_file *m = file->private_data; struct proc_mounts *p = m->private; struct mnt_namespace *ns = p->ns; __poll_t res = EPOLLIN | EPOLLRDNORM; int event; poll_wait(file, &p->ns->poll, wait); event = READ_ONCE(ns->event); if (m->poll_event != event) { m->poll_event = event; res |= EPOLLERR | EPOLLPRI; } return res; } struct proc_fs_opts { int flag; const char *str; }; static int show_sb_opts(struct seq_file *m, struct super_block *sb) { static const struct proc_fs_opts fs_opts[] = { { SB_SYNCHRONOUS, ",sync" }, { SB_DIRSYNC, ",dirsync" }, { SB_MANDLOCK, ",mand" }, { SB_LAZYTIME, ",lazytime" }, { 0, NULL } }; const struct proc_fs_opts *fs_infop; for (fs_infop = fs_opts; fs_infop->flag; fs_infop++) { if (sb->s_flags & fs_infop->flag) seq_puts(m, fs_infop->str); } return security_sb_show_options(m, sb); } static void show_vfsmnt_opts(struct seq_file *m, struct vfsmount *mnt) { static const struct proc_fs_opts mnt_opts[] = { { MNT_NOSUID, ",nosuid" }, { MNT_NODEV, ",nodev" }, { MNT_NOEXEC, ",noexec" }, { MNT_NOATIME, ",noatime" }, { MNT_NODIRATIME, ",nodiratime" }, { MNT_RELATIME, ",relatime" }, { MNT_NOSYMFOLLOW, ",nosymfollow" }, { 0, NULL } }; const struct proc_fs_opts *fs_infop; for (fs_infop = mnt_opts; fs_infop->flag; fs_infop++) { if (mnt->mnt_flags & fs_infop->flag) seq_puts(m, fs_infop->str); } if (is_idmapped_mnt(mnt)) seq_puts(m, ",idmapped"); } static inline void mangle(struct seq_file *m, const char *s) { seq_escape(m, s, " \t\n\\#"); } static void show_type(struct seq_file *m, struct super_block *sb) { mangle(m, sb->s_type->name); if (sb->s_subtype) { seq_putc(m, '.'); mangle(m, sb->s_subtype); } } static int show_vfsmnt(struct seq_file *m, struct vfsmount *mnt) { struct proc_mounts *p = m->private; struct mount *r = real_mount(mnt); struct path mnt_path = { .dentry = mnt->mnt_root, .mnt = mnt }; struct super_block *sb = mnt_path.dentry->d_sb; int err; if (sb->s_op->show_devname) { err = sb->s_op->show_devname(m, mnt_path.dentry); if (err) goto out; } else { mangle(m, r->mnt_devname ? r->mnt_devname : "none"); } seq_putc(m, ' '); /* mountpoints outside of chroot jail will give SEQ_SKIP on this */ err = seq_path_root(m, &mnt_path, &p->root, " \t\n\\"); if (err) goto out; seq_putc(m, ' '); show_type(m, sb); seq_puts(m, __mnt_is_readonly(mnt) ? " ro" : " rw"); err = show_sb_opts(m, sb); if (err) goto out; show_vfsmnt_opts(m, mnt); if (sb->s_op->show_options) err = sb->s_op->show_options(m, mnt_path.dentry); seq_puts(m, " 0 0\n"); out: return err; } static int show_mountinfo(struct seq_file *m, struct vfsmount *mnt) { struct proc_mounts *p = m->private; struct mount *r = real_mount(mnt); struct super_block *sb = mnt->mnt_sb; struct path mnt_path = { .dentry = mnt->mnt_root, .mnt = mnt }; int err; seq_printf(m, "%i %i %u:%u ", r->mnt_id, r->mnt_parent->mnt_id, MAJOR(sb->s_dev), MINOR(sb->s_dev)); err = show_path(m, mnt->mnt_root); if (err) goto out; seq_putc(m, ' '); /* mountpoints outside of chroot jail will give SEQ_SKIP on this */ err = seq_path_root(m, &mnt_path, &p->root, " \t\n\\"); if (err) goto out; seq_puts(m, mnt->mnt_flags & MNT_READONLY ? " ro" : " rw"); show_vfsmnt_opts(m, mnt); /* Tagged fields ("foo:X" or "bar") */ if (IS_MNT_SHARED(r)) seq_printf(m, " shared:%i", r->mnt_group_id); if (IS_MNT_SLAVE(r)) { int master = r->mnt_master->mnt_group_id; int dom = get_dominating_id(r, &p->root); seq_printf(m, " master:%i", master); if (dom && dom != master) seq_printf(m, " propagate_from:%i", dom); } if (IS_MNT_UNBINDABLE(r)) seq_puts(m, " unbindable"); /* Filesystem specific data */ seq_puts(m, " - "); show_type(m, sb); seq_putc(m, ' '); if (sb->s_op->show_devname) { err = sb->s_op->show_devname(m, mnt->mnt_root); if (err) goto out; } else { mangle(m, r->mnt_devname ? r->mnt_devname : "none"); } seq_puts(m, sb_rdonly(sb) ? " ro" : " rw"); err = show_sb_opts(m, sb); if (err) goto out; if (sb->s_op->show_options) err = sb->s_op->show_options(m, mnt->mnt_root); seq_putc(m, '\n'); out: return err; } static int show_vfsstat(struct seq_file *m, struct vfsmount *mnt) { struct proc_mounts *p = m->private; struct mount *r = real_mount(mnt); struct path mnt_path = { .dentry = mnt->mnt_root, .mnt = mnt }; struct super_block *sb = mnt_path.dentry->d_sb; int err; /* device */ if (sb->s_op->show_devname) { seq_puts(m, "device "); err = sb->s_op->show_devname(m, mnt_path.dentry); if (err) goto out; } else { if (r->mnt_devname) { seq_puts(m, "device "); mangle(m, r->mnt_devname); } else seq_puts(m, "no device"); } /* mount point */ seq_puts(m, " mounted on "); /* mountpoints outside of chroot jail will give SEQ_SKIP on this */ err = seq_path_root(m, &mnt_path, &p->root, " \t\n\\"); if (err) goto out; seq_putc(m, ' '); /* file system type */ seq_puts(m, "with fstype "); show_type(m, sb); /* optional statistics */ if (sb->s_op->show_stats) { seq_putc(m, ' '); err = sb->s_op->show_stats(m, mnt_path.dentry); } seq_putc(m, '\n'); out: return err; } static int mounts_open_common(struct inode *inode, struct file *file, int (*show)(struct seq_file *, struct vfsmount *)) { struct task_struct *task = get_proc_task(inode); struct nsproxy *nsp; struct mnt_namespace *ns = NULL; struct path root; struct proc_mounts *p; struct seq_file *m; int ret = -EINVAL; if (!task) goto err; task_lock(task); nsp = task->nsproxy; if (!nsp || !nsp->mnt_ns) { task_unlock(task); put_task_struct(task); goto err; } ns = nsp->mnt_ns; get_mnt_ns(ns); if (!task->fs) { task_unlock(task); put_task_struct(task); ret = -ENOENT; goto err_put_ns; } get_fs_root(task->fs, &root); task_unlock(task); put_task_struct(task); ret = seq_open_private(file, &mounts_op, sizeof(struct proc_mounts)); if (ret) goto err_put_path; m = file->private_data; m->poll_event = ns->event; p = m->private; p->ns = ns; p->root = root; p->show = show; return 0; err_put_path: path_put(&root); err_put_ns: put_mnt_ns(ns); err: return ret; } static int mounts_release(struct inode *inode, struct file *file) { struct seq_file *m = file->private_data; struct proc_mounts *p = m->private; path_put(&p->root); put_mnt_ns(p->ns); return seq_release_private(inode, file); } static int mounts_open(struct inode *inode, struct file *file) { return mounts_open_common(inode, file, show_vfsmnt); } static int mountinfo_open(struct inode *inode, struct file *file) { return mounts_open_common(inode, file, show_mountinfo); } static int mountstats_open(struct inode *inode, struct file *file) { return mounts_open_common(inode, file, show_vfsstat); } const struct file_operations proc_mounts_operations = { .open = mounts_open, .read_iter = seq_read_iter, .splice_read = copy_splice_read, .llseek = seq_lseek, .release = mounts_release, .poll = mounts_poll, }; const struct file_operations proc_mountinfo_operations = { .open = mountinfo_open, .read_iter = seq_read_iter, .splice_read = copy_splice_read, .llseek = seq_lseek, .release = mounts_release, .poll = mounts_poll, }; const struct file_operations proc_mountstats_operations = { .open = mountstats_open, .read_iter = seq_read_iter, .splice_read = copy_splice_read, .llseek = seq_lseek, .release = mounts_release, }; |
1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 3 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 19 19 15 4 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 | // SPDX-License-Identifier: GPL-2.0 /* * thermal.c - Generic Thermal Management Sysfs support. * * Copyright (C) 2008 Intel Corp * Copyright (C) 2008 Zhang Rui <rui.zhang@intel.com> * Copyright (C) 2008 Sujith Thomas <sujith.thomas@intel.com> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/device.h> #include <linux/err.h> #include <linux/export.h> #include <linux/slab.h> #include <linux/kdev_t.h> #include <linux/idr.h> #include <linux/list_sort.h> #include <linux/thermal.h> #include <linux/reboot.h> #include <linux/string.h> #include <linux/of.h> #include <linux/suspend.h> #define CREATE_TRACE_POINTS #include "thermal_trace.h" #include "thermal_core.h" #include "thermal_hwmon.h" static DEFINE_IDA(thermal_tz_ida); static DEFINE_IDA(thermal_cdev_ida); static LIST_HEAD(thermal_tz_list); static LIST_HEAD(thermal_cdev_list); static LIST_HEAD(thermal_governor_list); static DEFINE_MUTEX(thermal_list_lock); static DEFINE_MUTEX(thermal_governor_lock); static struct thermal_governor *def_governor; /* * Governor section: set of functions to handle thermal governors * * Functions to help in the life cycle of thermal governors within * the thermal core and by the thermal governor code. */ static struct thermal_governor *__find_governor(const char *name) { struct thermal_governor *pos; if (!name || !name[0]) return def_governor; list_for_each_entry(pos, &thermal_governor_list, governor_list) if (!strncasecmp(name, pos->name, THERMAL_NAME_LENGTH)) return pos; return NULL; } /** * bind_previous_governor() - bind the previous governor of the thermal zone * @tz: a valid pointer to a struct thermal_zone_device * @failed_gov_name: the name of the governor that failed to register * * Register the previous governor of the thermal zone after a new * governor has failed to be bound. */ static void bind_previous_governor(struct thermal_zone_device *tz, const char *failed_gov_name) { if (tz->governor && tz->governor->bind_to_tz) { if (tz->governor->bind_to_tz(tz)) { dev_err(&tz->device, "governor %s failed to bind and the previous one (%s) failed to bind again, thermal zone %s has no governor\n", failed_gov_name, tz->governor->name, tz->type); tz->governor = NULL; } } } /** * thermal_set_governor() - Switch to another governor * @tz: a valid pointer to a struct thermal_zone_device * @new_gov: pointer to the new governor * * Change the governor of thermal zone @tz. * * Return: 0 on success, an error if the new governor's bind_to_tz() failed. */ static int thermal_set_governor(struct thermal_zone_device *tz, struct thermal_governor *new_gov) { int ret = 0; if (tz->governor && tz->governor->unbind_from_tz) tz->governor->unbind_from_tz(tz); if (new_gov && new_gov->bind_to_tz) { ret = new_gov->bind_to_tz(tz); if (ret) { bind_previous_governor(tz, new_gov->name); return ret; } } tz->governor = new_gov; return ret; } int thermal_register_governor(struct thermal_governor *governor) { int err; const char *name; struct thermal_zone_device *pos; if (!governor) return -EINVAL; mutex_lock(&thermal_governor_lock); err = -EBUSY; if (!__find_governor(governor->name)) { bool match_default; err = 0; list_add(&governor->governor_list, &thermal_governor_list); match_default = !strncmp(governor->name, DEFAULT_THERMAL_GOVERNOR, THERMAL_NAME_LENGTH); if (!def_governor && match_default) def_governor = governor; } mutex_lock(&thermal_list_lock); list_for_each_entry(pos, &thermal_tz_list, node) { /* * only thermal zones with specified tz->tzp->governor_name * may run with tz->govenor unset */ if (pos->governor) continue; name = pos->tzp->governor_name; if (!strncasecmp(name, governor->name, THERMAL_NAME_LENGTH)) { int ret; ret = thermal_set_governor(pos, governor); if (ret) dev_err(&pos->device, "Failed to set governor %s for thermal zone %s: %d\n", governor->name, pos->type, ret); } } mutex_unlock(&thermal_list_lock); mutex_unlock(&thermal_governor_lock); return err; } void thermal_unregister_governor(struct thermal_governor *governor) { struct thermal_zone_device *pos; if (!governor) return; mutex_lock(&thermal_governor_lock); if (!__find_governor(governor->name)) goto exit; mutex_lock(&thermal_list_lock); list_for_each_entry(pos, &thermal_tz_list, node) { if (!strncasecmp(pos->governor->name, governor->name, THERMAL_NAME_LENGTH)) thermal_set_governor(pos, NULL); } mutex_unlock(&thermal_list_lock); list_del(&governor->governor_list); exit: mutex_unlock(&thermal_governor_lock); } int thermal_zone_device_set_policy(struct thermal_zone_device *tz, char *policy) { struct thermal_governor *gov; int ret = -EINVAL; mutex_lock(&thermal_governor_lock); mutex_lock(&tz->lock); gov = __find_governor(strim(policy)); if (!gov) goto exit; ret = thermal_set_governor(tz, gov); exit: mutex_unlock(&tz->lock); mutex_unlock(&thermal_governor_lock); thermal_notify_tz_gov_change(tz, policy); return ret; } int thermal_build_list_of_policies(char *buf) { struct thermal_governor *pos; ssize_t count = 0; mutex_lock(&thermal_governor_lock); list_for_each_entry(pos, &thermal_governor_list, governor_list) { count += sysfs_emit_at(buf, count, "%s ", pos->name); } count += sysfs_emit_at(buf, count, "\n"); mutex_unlock(&thermal_governor_lock); return count; } static void __init thermal_unregister_governors(void) { struct thermal_governor **governor; for_each_governor_table(governor) thermal_unregister_governor(*governor); } static int __init thermal_register_governors(void) { int ret = 0; struct thermal_governor **governor; for_each_governor_table(governor) { ret = thermal_register_governor(*governor); if (ret) { pr_err("Failed to register governor: '%s'", (*governor)->name); break; } pr_info("Registered thermal governor '%s'", (*governor)->name); } if (ret) { struct thermal_governor **gov; for_each_governor_table(gov) { if (gov == governor) break; thermal_unregister_governor(*gov); } } return ret; } static int __thermal_zone_device_set_mode(struct thermal_zone_device *tz, enum thermal_device_mode mode) { if (tz->ops.change_mode) { int ret; ret = tz->ops.change_mode(tz, mode); if (ret) return ret; } tz->mode = mode; return 0; } static void thermal_zone_broken_disable(struct thermal_zone_device *tz) { struct thermal_trip_desc *td; dev_err(&tz->device, "Unable to get temperature, disabling!\n"); /* * This function only runs for enabled thermal zones, so no need to * check for the current mode. */ __thermal_zone_device_set_mode(tz, THERMAL_DEVICE_DISABLED); thermal_notify_tz_disable(tz); for_each_trip_desc(tz, td) { if (td->trip.type == THERMAL_TRIP_CRITICAL && td->trip.temperature > THERMAL_TEMP_INVALID) { dev_crit(&tz->device, "Disabled thermal zone with critical trip point\n"); return; } } } /* * Zone update section: main control loop applied to each zone while monitoring * in polling mode. The monitoring is done using a workqueue. * Same update may be done on a zone by calling thermal_zone_device_update(). * * An update means: * - Non-critical trips will invoke the governor responsible for that zone; * - Hot trips will produce a notification to userspace; * - Critical trip point will cause a system shutdown. */ static void thermal_zone_device_set_polling(struct thermal_zone_device *tz, unsigned long delay) { if (delay) mod_delayed_work(system_freezable_power_efficient_wq, &tz->poll_queue, delay); else cancel_delayed_work(&tz->poll_queue); } static void thermal_zone_recheck(struct thermal_zone_device *tz, int error) { if (error == -EAGAIN) { thermal_zone_device_set_polling(tz, THERMAL_RECHECK_DELAY); return; } /* * Print the message once to reduce log noise. It will be followed by * another one if the temperature cannot be determined after multiple * attempts. */ if (tz->recheck_delay_jiffies == THERMAL_RECHECK_DELAY) dev_info(&tz->device, "Temperature check failed (%d)\n", error); thermal_zone_device_set_polling(tz, tz->recheck_delay_jiffies); tz->recheck_delay_jiffies += max(tz->recheck_delay_jiffies >> 1, 1ULL); if (tz->recheck_delay_jiffies > THERMAL_MAX_RECHECK_DELAY) { thermal_zone_broken_disable(tz); /* * Restore the original recheck delay value to allow the thermal * zone to try to recover when it is reenabled by user space. */ tz->recheck_delay_jiffies = THERMAL_RECHECK_DELAY; } } static void monitor_thermal_zone(struct thermal_zone_device *tz) { if (tz->mode != THERMAL_DEVICE_ENABLED) thermal_zone_device_set_polling(tz, 0); else if (tz->passive > 0) thermal_zone_device_set_polling(tz, tz->passive_delay_jiffies); else if (tz->polling_delay_jiffies) thermal_zone_device_set_polling(tz, tz->polling_delay_jiffies); } static struct thermal_governor *thermal_get_tz_governor(struct thermal_zone_device *tz) { if (tz->governor) return tz->governor; return def_governor; } void thermal_governor_update_tz(struct thermal_zone_device *tz, enum thermal_notify_event reason) { if (!tz->governor || !tz->governor->update_tz) return; tz->governor->update_tz(tz, reason); } static void thermal_zone_device_halt(struct thermal_zone_device *tz, bool shutdown) { /* * poweroff_delay_ms must be a carefully profiled positive value. * Its a must for forced_emergency_poweroff_work to be scheduled. */ int poweroff_delay_ms = CONFIG_THERMAL_EMERGENCY_POWEROFF_DELAY_MS; const char *msg = "Temperature too high"; dev_emerg(&tz->device, "%s: critical temperature reached\n", tz->type); if (shutdown) hw_protection_shutdown(msg, poweroff_delay_ms); else hw_protection_reboot(msg, poweroff_delay_ms); } void thermal_zone_device_critical(struct thermal_zone_device *tz) { thermal_zone_device_halt(tz, true); } EXPORT_SYMBOL(thermal_zone_device_critical); void thermal_zone_device_critical_reboot(struct thermal_zone_device *tz) { thermal_zone_device_halt(tz, false); } static void handle_critical_trips(struct thermal_zone_device *tz, const struct thermal_trip *trip) { trace_thermal_zone_trip(tz, thermal_zone_trip_id(tz, trip), trip->type); if (trip->type == THERMAL_TRIP_CRITICAL) tz->ops.critical(tz); else if (tz->ops.hot) tz->ops.hot(tz); } static void handle_thermal_trip(struct thermal_zone_device *tz, struct thermal_trip_desc *td, struct list_head *way_up_list, struct list_head *way_down_list) { const struct thermal_trip *trip = &td->trip; int old_threshold; if (trip->temperature == THERMAL_TEMP_INVALID) return; /* * If the trip temperature or hysteresis has been updated recently, * the threshold needs to be computed again using the new values. * However, its initial value still reflects the old ones and that * is what needs to be compared with the previous zone temperature * to decide which action to take. */ old_threshold = td->threshold; td->threshold = trip->temperature; if (tz->last_temperature >= old_threshold && tz->last_temperature != THERMAL_TEMP_INIT) { /* * Mitigation is under way, so it needs to stop if the zone * temperature falls below the low temperature of the trip. * In that case, the trip temperature becomes the new threshold. */ if (tz->temperature < trip->temperature - trip->hysteresis) { list_add(&td->notify_list_node, way_down_list); td->notify_temp = trip->temperature - trip->hysteresis; if (trip->type == THERMAL_TRIP_PASSIVE) { tz->passive--; WARN_ON(tz->passive < 0); } } else { td->threshold -= trip->hysteresis; } } else if (tz->temperature >= trip->temperature) { /* * There is no mitigation under way, so it needs to be started * if the zone temperature exceeds the trip one. The new * threshold is then set to the low temperature of the trip. */ list_add_tail(&td->notify_list_node, way_up_list); td->notify_temp = trip->temperature; td->threshold -= trip->hysteresis; if (trip->type == THERMAL_TRIP_PASSIVE) tz->passive++; else if (trip->type == THERMAL_TRIP_CRITICAL || trip->type == THERMAL_TRIP_HOT) handle_critical_trips(tz, trip); } } static void thermal_zone_device_check(struct work_struct *work) { struct thermal_zone_device *tz = container_of(work, struct thermal_zone_device, poll_queue.work); thermal_zone_device_update(tz, THERMAL_EVENT_UNSPECIFIED); } static void thermal_zone_device_init(struct thermal_zone_device *tz) { struct thermal_instance *pos; INIT_DELAYED_WORK(&tz->poll_queue, thermal_zone_device_check); tz->temperature = THERMAL_TEMP_INIT; tz->passive = 0; tz->prev_low_trip = -INT_MAX; tz->prev_high_trip = INT_MAX; list_for_each_entry(pos, &tz->thermal_instances, tz_node) pos->initialized = false; } static void thermal_governor_trip_crossed(struct thermal_governor *governor, struct thermal_zone_device *tz, const struct thermal_trip *trip, bool crossed_up) { if (trip->type == THERMAL_TRIP_HOT || trip->type == THERMAL_TRIP_CRITICAL) return; if (governor->trip_crossed) governor->trip_crossed(tz, trip, crossed_up); } static void thermal_trip_crossed(struct thermal_zone_device *tz, const struct thermal_trip *trip, struct thermal_governor *governor, bool crossed_up) { if (crossed_up) { thermal_notify_tz_trip_up(tz, trip); thermal_debug_tz_trip_up(tz, trip); } else { thermal_notify_tz_trip_down(tz, trip); thermal_debug_tz_trip_down(tz, trip); } thermal_governor_trip_crossed(governor, tz, trip, crossed_up); } static int thermal_trip_notify_cmp(void *not_used, const struct list_head *a, const struct list_head *b) { struct thermal_trip_desc *tda = container_of(a, struct thermal_trip_desc, notify_list_node); struct thermal_trip_desc *tdb = container_of(b, struct thermal_trip_desc, notify_list_node); return tda->notify_temp - tdb->notify_temp; } void __thermal_zone_device_update(struct thermal_zone_device *tz, enum thermal_notify_event event) { struct thermal_governor *governor = thermal_get_tz_governor(tz); struct thermal_trip_desc *td; LIST_HEAD(way_down_list); LIST_HEAD(way_up_list); int temp, ret; if (tz->suspended) return; if (!thermal_zone_device_is_enabled(tz)) return; ret = __thermal_zone_get_temp(tz, &temp); if (ret) { thermal_zone_recheck(tz, ret); return; } else if (temp <= THERMAL_TEMP_INVALID) { /* * Special case: No valid temperature value is available, but * the zone owner does not want the core to do anything about * it. Continue regular zone polling if needed, so that this * function can be called again, but skip everything else. */ goto monitor; } tz->recheck_delay_jiffies = THERMAL_RECHECK_DELAY; tz->last_temperature = tz->temperature; tz->temperature = temp; trace_thermal_temperature(tz); thermal_genl_sampling_temp(tz->id, temp); tz->notify_event = event; for_each_trip_desc(tz, td) handle_thermal_trip(tz, td, &way_up_list, &way_down_list); thermal_zone_set_trips(tz); list_sort(NULL, &way_up_list, thermal_trip_notify_cmp); list_for_each_entry(td, &way_up_list, notify_list_node) thermal_trip_crossed(tz, &td->trip, governor, true); list_sort(NULL, &way_down_list, thermal_trip_notify_cmp); list_for_each_entry_reverse(td, &way_down_list, notify_list_node) thermal_trip_crossed(tz, &td->trip, governor, false); if (governor->manage) governor->manage(tz); thermal_debug_update_trip_stats(tz); monitor: monitor_thermal_zone(tz); } static int thermal_zone_device_set_mode(struct thermal_zone_device *tz, enum thermal_device_mode mode) { int ret; mutex_lock(&tz->lock); /* do nothing if mode isn't changing */ if (mode == tz->mode) { mutex_unlock(&tz->lock); return 0; } ret = __thermal_zone_device_set_mode(tz, mode); if (ret) { mutex_unlock(&tz->lock); return ret; } __thermal_zone_device_update(tz, THERMAL_EVENT_UNSPECIFIED); mutex_unlock(&tz->lock); if (mode == THERMAL_DEVICE_ENABLED) thermal_notify_tz_enable(tz); else thermal_notify_tz_disable(tz); return 0; } int thermal_zone_device_enable(struct thermal_zone_device *tz) { return thermal_zone_device_set_mode(tz, THERMAL_DEVICE_ENABLED); } EXPORT_SYMBOL_GPL(thermal_zone_device_enable); int thermal_zone_device_disable(struct thermal_zone_device *tz) { return thermal_zone_device_set_mode(tz, THERMAL_DEVICE_DISABLED); } EXPORT_SYMBOL_GPL(thermal_zone_device_disable); int thermal_zone_device_is_enabled(struct thermal_zone_device *tz) { lockdep_assert_held(&tz->lock); return tz->mode == THERMAL_DEVICE_ENABLED; } static bool thermal_zone_is_present(struct thermal_zone_device *tz) { return !list_empty(&tz->node); } void thermal_zone_device_update(struct thermal_zone_device *tz, enum thermal_notify_event event) { mutex_lock(&tz->lock); if (thermal_zone_is_present(tz)) __thermal_zone_device_update(tz, event); mutex_unlock(&tz->lock); } EXPORT_SYMBOL_GPL(thermal_zone_device_update); void thermal_zone_trip_down(struct thermal_zone_device *tz, const struct thermal_trip *trip) { thermal_trip_crossed(tz, trip, thermal_get_tz_governor(tz), false); } int for_each_thermal_governor(int (*cb)(struct thermal_governor *, void *), void *data) { struct thermal_governor *gov; int ret = 0; mutex_lock(&thermal_governor_lock); list_for_each_entry(gov, &thermal_governor_list, governor_list) { ret = cb(gov, data); if (ret) break; } mutex_unlock(&thermal_governor_lock); return ret; } int for_each_thermal_cooling_device(int (*cb)(struct thermal_cooling_device *, void *), void *data) { struct thermal_cooling_device *cdev; int ret = 0; mutex_lock(&thermal_list_lock); list_for_each_entry(cdev, &thermal_cdev_list, node) { ret = cb(cdev, data); if (ret) break; } mutex_unlock(&thermal_list_lock); return ret; } int for_each_thermal_zone(int (*cb)(struct thermal_zone_device *, void *), void *data) { struct thermal_zone_device *tz; int ret = 0; mutex_lock(&thermal_list_lock); list_for_each_entry(tz, &thermal_tz_list, node) { ret = cb(tz, data); if (ret) break; } mutex_unlock(&thermal_list_lock); return ret; } struct thermal_zone_device *thermal_zone_get_by_id(int id) { struct thermal_zone_device *tz, *match = NULL; mutex_lock(&thermal_list_lock); list_for_each_entry(tz, &thermal_tz_list, node) { if (tz->id == id) { match = tz; break; } } mutex_unlock(&thermal_list_lock); return match; } /* * Device management section: cooling devices, zones devices, and binding * * Set of functions provided by the thermal core for: * - cooling devices lifecycle: registration, unregistration, * binding, and unbinding. * - thermal zone devices lifecycle: registration, unregistration, * binding, and unbinding. */ /** * thermal_bind_cdev_to_trip - bind a cooling device to a thermal zone * @tz: pointer to struct thermal_zone_device * @trip: trip point the cooling devices is associated with in this zone. * @cdev: pointer to struct thermal_cooling_device * @upper: the Maximum cooling state for this trip point. * THERMAL_NO_LIMIT means no upper limit, * and the cooling device can be in max_state. * @lower: the Minimum cooling state can be used for this trip point. * THERMAL_NO_LIMIT means no lower limit, * and the cooling device can be in cooling state 0. * @weight: The weight of the cooling device to be bound to the * thermal zone. Use THERMAL_WEIGHT_DEFAULT for the * default value * * This interface function bind a thermal cooling device to the certain trip * point of a thermal zone device. * This function is usually called in the thermal zone device .bind callback. * * Return: 0 on success, the proper error value otherwise. */ int thermal_bind_cdev_to_trip(struct thermal_zone_device *tz, const struct thermal_trip *trip, struct thermal_cooling_device *cdev, unsigned long upper, unsigned long lower, unsigned int weight) { struct thermal_instance *dev; struct thermal_instance *pos; struct thermal_zone_device *pos1; struct thermal_cooling_device *pos2; bool upper_no_limit; int result; list_for_each_entry(pos1, &thermal_tz_list, node) { if (pos1 == tz) break; } list_for_each_entry(pos2, &thermal_cdev_list, node) { if (pos2 == cdev) break; } if (tz != pos1 || cdev != pos2) return -EINVAL; /* lower default 0, upper default max_state */ lower = lower == THERMAL_NO_LIMIT ? 0 : lower; if (upper == THERMAL_NO_LIMIT) { upper = cdev->max_state; upper_no_limit = true; } else { upper_no_limit = false; } if (lower > upper || upper > cdev->max_state) return -EINVAL; dev = kzalloc(sizeof(*dev), GFP_KERNEL); if (!dev) return -ENOMEM; dev->tz = tz; dev->cdev = cdev; dev->trip = trip; dev->upper = upper; dev->upper_no_limit = upper_no_limit; dev->lower = lower; dev->target = THERMAL_NO_TARGET; dev->weight = weight; result = ida_alloc(&tz->ida, GFP_KERNEL); if (result < 0) goto free_mem; dev->id = result; sprintf(dev->name, "cdev%d", dev->id); result = sysfs_create_link(&tz->device.kobj, &cdev->device.kobj, dev->name); if (result) goto release_ida; snprintf(dev->attr_name, sizeof(dev->attr_name), "cdev%d_trip_point", dev->id); sysfs_attr_init(&dev->attr.attr); dev->attr.attr.name = dev->attr_name; dev->attr.attr.mode = 0444; dev->attr.show = trip_point_show; result = device_create_file(&tz->device, &dev->attr); if (result) goto remove_symbol_link; snprintf(dev->weight_attr_name, sizeof(dev->weight_attr_name), "cdev%d_weight", dev->id); sysfs_attr_init(&dev->weight_attr.attr); dev->weight_attr.attr.name = dev->weight_attr_name; dev->weight_attr.attr.mode = S_IWUSR | S_IRUGO; dev->weight_attr.show = weight_show; dev->weight_attr.store = weight_store; result = device_create_file(&tz->device, &dev->weight_attr); if (result) goto remove_trip_file; mutex_lock(&tz->lock); mutex_lock(&cdev->lock); list_for_each_entry(pos, &tz->thermal_instances, tz_node) if (pos->tz == tz && pos->trip == trip && pos->cdev == cdev) { result = -EEXIST; break; } if (!result) { list_add_tail(&dev->tz_node, &tz->thermal_instances); list_add_tail(&dev->cdev_node, &cdev->thermal_instances); atomic_set(&tz->need_update, 1); thermal_governor_update_tz(tz, THERMAL_TZ_BIND_CDEV); } mutex_unlock(&cdev->lock); mutex_unlock(&tz->lock); if (!result) return 0; device_remove_file(&tz->device, &dev->weight_attr); remove_trip_file: device_remove_file(&tz->device, &dev->attr); remove_symbol_link: sysfs_remove_link(&tz->device.kobj, dev->name); release_ida: ida_free(&tz->ida, dev->id); free_mem: kfree(dev); return result; } EXPORT_SYMBOL_GPL(thermal_bind_cdev_to_trip); int thermal_zone_bind_cooling_device(struct thermal_zone_device *tz, int trip_index, struct thermal_cooling_device *cdev, unsigned long upper, unsigned long lower, unsigned int weight) { if (trip_index < 0 || trip_index >= tz->num_trips) return -EINVAL; return thermal_bind_cdev_to_trip(tz, &tz->trips[trip_index].trip, cdev, upper, lower, weight); } EXPORT_SYMBOL_GPL(thermal_zone_bind_cooling_device); /** * thermal_unbind_cdev_from_trip - unbind a cooling device from a thermal zone. * @tz: pointer to a struct thermal_zone_device. * @trip: trip point the cooling devices is associated with in this zone. * @cdev: pointer to a struct thermal_cooling_device. * * This interface function unbind a thermal cooling device from the certain * trip point of a thermal zone device. * This function is usually called in the thermal zone device .unbind callback. * * Return: 0 on success, the proper error value otherwise. */ int thermal_unbind_cdev_from_trip(struct thermal_zone_device *tz, const struct thermal_trip *trip, struct thermal_cooling_device *cdev) { struct thermal_instance *pos, *next; mutex_lock(&tz->lock); mutex_lock(&cdev->lock); list_for_each_entry_safe(pos, next, &tz->thermal_instances, tz_node) { if (pos->tz == tz && pos->trip == trip && pos->cdev == cdev) { list_del(&pos->tz_node); list_del(&pos->cdev_node); thermal_governor_update_tz(tz, THERMAL_TZ_UNBIND_CDEV); mutex_unlock(&cdev->lock); mutex_unlock(&tz->lock); goto unbind; } } mutex_unlock(&cdev->lock); mutex_unlock(&tz->lock); return -ENODEV; unbind: device_remove_file(&tz->device, &pos->weight_attr); device_remove_file(&tz->device, &pos->attr); sysfs_remove_link(&tz->device.kobj, pos->name); ida_free(&tz->ida, pos->id); kfree(pos); return 0; } EXPORT_SYMBOL_GPL(thermal_unbind_cdev_from_trip); int thermal_zone_unbind_cooling_device(struct thermal_zone_device *tz, int trip_index, struct thermal_cooling_device *cdev) { if (trip_index < 0 || trip_index >= tz->num_trips) return -EINVAL; return thermal_unbind_cdev_from_trip(tz, &tz->trips[trip_index].trip, cdev); } EXPORT_SYMBOL_GPL(thermal_zone_unbind_cooling_device); static void thermal_release(struct device *dev) { struct thermal_zone_device *tz; struct thermal_cooling_device *cdev; if (!strncmp(dev_name(dev), "thermal_zone", sizeof("thermal_zone") - 1)) { tz = to_thermal_zone(dev); thermal_zone_destroy_device_groups(tz); mutex_destroy(&tz->lock); complete(&tz->removal); } else if (!strncmp(dev_name(dev), "cooling_device", sizeof("cooling_device") - 1)) { cdev = to_cooling_device(dev); thermal_cooling_device_destroy_sysfs(cdev); kfree_const(cdev->type); ida_free(&thermal_cdev_ida, cdev->id); kfree(cdev); } } static struct class *thermal_class; static inline void print_bind_err_msg(struct thermal_zone_device *tz, struct thermal_cooling_device *cdev, int ret) { dev_err(&tz->device, "binding zone %s with cdev %s failed:%d\n", tz->type, cdev->type, ret); } static void bind_cdev(struct thermal_cooling_device *cdev) { int ret; struct thermal_zone_device *pos = NULL; list_for_each_entry(pos, &thermal_tz_list, node) { if (pos->ops.bind) { ret = pos->ops.bind(pos, cdev); if (ret) print_bind_err_msg(pos, cdev, ret); } } } /** * __thermal_cooling_device_register() - register a new thermal cooling device * @np: a pointer to a device tree node. * @type: the thermal cooling device type. * @devdata: device private data. * @ops: standard thermal cooling devices callbacks. * * This interface function adds a new thermal cooling device (fan/processor/...) * to /sys/class/thermal/ folder as cooling_device[0-*]. It tries to bind itself * to all the thermal zone devices registered at the same time. * It also gives the opportunity to link the cooling device to a device tree * node, so that it can be bound to a thermal zone created out of device tree. * * Return: a pointer to the created struct thermal_cooling_device or an * ERR_PTR. Caller must check return value with IS_ERR*() helpers. */ static struct thermal_cooling_device * __thermal_cooling_device_register(struct device_node *np, const char *type, void *devdata, const struct thermal_cooling_device_ops *ops) { struct thermal_cooling_device *cdev; struct thermal_zone_device *pos = NULL; unsigned long current_state; int id, ret; if (!ops || !ops->get_max_state || !ops->get_cur_state || !ops->set_cur_state) return ERR_PTR(-EINVAL); if (!thermal_class) return ERR_PTR(-ENODEV); cdev = kzalloc(sizeof(*cdev), GFP_KERNEL); if (!cdev) return ERR_PTR(-ENOMEM); ret = ida_alloc(&thermal_cdev_ida, GFP_KERNEL); if (ret < 0) goto out_kfree_cdev; cdev->id = ret; id = ret; cdev->type = kstrdup_const(type ? type : "", GFP_KERNEL); if (!cdev->type) { ret = -ENOMEM; goto out_ida_remove; } mutex_init(&cdev->lock); INIT_LIST_HEAD(&cdev->thermal_instances); cdev->np = np; cdev->ops = ops; cdev->updated = false; cdev->device.class = thermal_class; cdev->devdata = devdata; ret = cdev->ops->get_max_state(cdev, &cdev->max_state); if (ret) goto out_cdev_type; /* * The cooling device's current state is only needed for debug * initialization below, so a failure to get it does not cause * the entire cooling device initialization to fail. However, * the debug will not work for the device if its initial state * cannot be determined and drivers are responsible for ensuring * that this will not happen. */ ret = cdev->ops->get_cur_state(cdev, ¤t_state); if (ret) current_state = ULONG_MAX; thermal_cooling_device_setup_sysfs(cdev); ret = dev_set_name(&cdev->device, "cooling_device%d", cdev->id); if (ret) goto out_cooling_dev; ret = device_register(&cdev->device); if (ret) { /* thermal_release() handles rest of the cleanup */ put_device(&cdev->device); return ERR_PTR(ret); } if (current_state <= cdev->max_state) thermal_debug_cdev_add(cdev, current_state); /* Add 'this' new cdev to the global cdev list */ mutex_lock(&thermal_list_lock); list_add(&cdev->node, &thermal_cdev_list); /* Update binding information for 'this' new cdev */ bind_cdev(cdev); list_for_each_entry(pos, &thermal_tz_list, node) if (atomic_cmpxchg(&pos->need_update, 1, 0)) thermal_zone_device_update(pos, THERMAL_EVENT_UNSPECIFIED); mutex_unlock(&thermal_list_lock); return cdev; out_cooling_dev: thermal_cooling_device_destroy_sysfs(cdev); out_cdev_type: kfree_const(cdev->type); out_ida_remove: ida_free(&thermal_cdev_ida, id); out_kfree_cdev: kfree(cdev); return ERR_PTR(ret); } /** * thermal_cooling_device_register() - register a new thermal cooling device * @type: the thermal cooling device type. * @devdata: device private data. * @ops: standard thermal cooling devices callbacks. * * This interface function adds a new thermal cooling device (fan/processor/...) * to /sys/class/thermal/ folder as cooling_device[0-*]. It tries to bind itself * to all the thermal zone devices registered at the same time. * * Return: a pointer to the created struct thermal_cooling_device or an * ERR_PTR. Caller must check return value with IS_ERR*() helpers. */ struct thermal_cooling_device * thermal_cooling_device_register(const char *type, void *devdata, const struct thermal_cooling_device_ops *ops) { return __thermal_cooling_device_register(NULL, type, devdata, ops); } EXPORT_SYMBOL_GPL(thermal_cooling_device_register); /** * thermal_of_cooling_device_register() - register an OF thermal cooling device * @np: a pointer to a device tree node. * @type: the thermal cooling device type. * @devdata: device private data. * @ops: standard thermal cooling devices callbacks. * * This function will register a cooling device with device tree node reference. * This interface function adds a new thermal cooling device (fan/processor/...) * to /sys/class/thermal/ folder as cooling_device[0-*]. It tries to bind itself * to all the thermal zone devices registered at the same time. * * Return: a pointer to the created struct thermal_cooling_device or an * ERR_PTR. Caller must check return value with IS_ERR*() helpers. */ struct thermal_cooling_device * thermal_of_cooling_device_register(struct device_node *np, const char *type, void *devdata, const struct thermal_cooling_device_ops *ops) { return __thermal_cooling_device_register(np, type, devdata, ops); } EXPORT_SYMBOL_GPL(thermal_of_cooling_device_register); static void thermal_cooling_device_release(struct device *dev, void *res) { thermal_cooling_device_unregister( *(struct thermal_cooling_device **)res); } /** * devm_thermal_of_cooling_device_register() - register an OF thermal cooling * device * @dev: a valid struct device pointer of a sensor device. * @np: a pointer to a device tree node. * @type: the thermal cooling device type. * @devdata: device private data. * @ops: standard thermal cooling devices callbacks. * * This function will register a cooling device with device tree node reference. * This interface function adds a new thermal cooling device (fan/processor/...) * to /sys/class/thermal/ folder as cooling_device[0-*]. It tries to bind itself * to all the thermal zone devices registered at the same time. * * Return: a pointer to the created struct thermal_cooling_device or an * ERR_PTR. Caller must check return value with IS_ERR*() helpers. */ struct thermal_cooling_device * devm_thermal_of_cooling_device_register(struct device *dev, struct device_node *np, const char *type, void *devdata, const struct thermal_cooling_device_ops *ops) { struct thermal_cooling_device **ptr, *tcd; ptr = devres_alloc(thermal_cooling_device_release, sizeof(*ptr), GFP_KERNEL); if (!ptr) return ERR_PTR(-ENOMEM); tcd = __thermal_cooling_device_register(np, type, devdata, ops); if (IS_ERR(tcd)) { devres_free(ptr); return tcd; } *ptr = tcd; devres_add(dev, ptr); return tcd; } EXPORT_SYMBOL_GPL(devm_thermal_of_cooling_device_register); static bool thermal_cooling_device_present(struct thermal_cooling_device *cdev) { struct thermal_cooling_device *pos = NULL; list_for_each_entry(pos, &thermal_cdev_list, node) { if (pos == cdev) return true; } return false; } /** * thermal_cooling_device_update - Update a cooling device object * @cdev: Target cooling device. * * Update @cdev to reflect a change of the underlying hardware or platform. * * Must be called when the maximum cooling state of @cdev becomes invalid and so * its .get_max_state() callback needs to be run to produce the new maximum * cooling state value. */ void thermal_cooling_device_update(struct thermal_cooling_device *cdev) { struct thermal_instance *ti; unsigned long state; if (IS_ERR_OR_NULL(cdev)) return; /* * Hold thermal_list_lock throughout the update to prevent the device * from going away while being updated. */ mutex_lock(&thermal_list_lock); if (!thermal_cooling_device_present(cdev)) goto unlock_list; /* * Update under the cdev lock to prevent the state from being set beyond * the new limit concurrently. */ mutex_lock(&cdev->lock); if (cdev->ops->get_max_state(cdev, &cdev->max_state)) goto unlock; thermal_cooling_device_stats_reinit(cdev); list_for_each_entry(ti, &cdev->thermal_instances, cdev_node) { if (ti->upper == cdev->max_state) continue; if (ti->upper < cdev->max_state) { if (ti->upper_no_limit) ti->upper = cdev->max_state; continue; } ti->upper = cdev->max_state; if (ti->lower > ti->upper) ti->lower = ti->upper; if (ti->target == THERMAL_NO_TARGET) continue; if (ti->target > ti->upper) ti->target = ti->upper; } if (cdev->ops->get_cur_state(cdev, &state) || state > cdev->max_state) goto unlock; thermal_cooling_device_stats_update(cdev, state); unlock: mutex_unlock(&cdev->lock); unlock_list: mutex_unlock(&thermal_list_lock); } EXPORT_SYMBOL_GPL(thermal_cooling_device_update); /** * thermal_cooling_device_unregister - removes a thermal cooling device * @cdev: the thermal cooling device to remove. * * thermal_cooling_device_unregister() must be called when a registered * thermal cooling device is no longer needed. */ void thermal_cooling_device_unregister(struct thermal_cooling_device *cdev) { struct thermal_zone_device *tz; if (!cdev) return; thermal_debug_cdev_remove(cdev); mutex_lock(&thermal_list_lock); if (!thermal_cooling_device_present(cdev)) { mutex_unlock(&thermal_list_lock); return; } list_del(&cdev->node); /* Unbind all thermal zones associated with 'this' cdev */ list_for_each_entry(tz, &thermal_tz_list, node) { if (tz->ops.unbind) tz->ops.unbind(tz, cdev); } mutex_unlock(&thermal_list_lock); device_unregister(&cdev->device); } EXPORT_SYMBOL_GPL(thermal_cooling_device_unregister); static void bind_tz(struct thermal_zone_device *tz) { int ret; struct thermal_cooling_device *pos = NULL; if (!tz->ops.bind) return; mutex_lock(&thermal_list_lock); list_for_each_entry(pos, &thermal_cdev_list, node) { ret = tz->ops.bind(tz, pos); if (ret) print_bind_err_msg(tz, pos, ret); } mutex_unlock(&thermal_list_lock); } static void thermal_set_delay_jiffies(unsigned long *delay_jiffies, int delay_ms) { *delay_jiffies = msecs_to_jiffies(delay_ms); if (delay_ms > 1000) *delay_jiffies = round_jiffies(*delay_jiffies); } int thermal_zone_get_crit_temp(struct thermal_zone_device *tz, int *temp) { const struct thermal_trip_desc *td; int ret = -EINVAL; if (tz->ops.get_crit_temp) return tz->ops.get_crit_temp(tz, temp); mutex_lock(&tz->lock); for_each_trip_desc(tz, td) { const struct thermal_trip *trip = &td->trip; if (trip->type == THERMAL_TRIP_CRITICAL) { *temp = trip->temperature; ret = 0; break; } } mutex_unlock(&tz->lock); return ret; } EXPORT_SYMBOL_GPL(thermal_zone_get_crit_temp); /** * thermal_zone_device_register_with_trips() - register a new thermal zone device * @type: the thermal zone device type * @trips: a pointer to an array of thermal trips * @num_trips: the number of trip points the thermal zone support * @devdata: private device data * @ops: standard thermal zone device callbacks * @tzp: thermal zone platform parameters * @passive_delay: number of milliseconds to wait between polls when * performing passive cooling * @polling_delay: number of milliseconds to wait between polls when checking * whether trip points have been crossed (0 for interrupt * driven systems) * * This interface function adds a new thermal zone device (sensor) to * /sys/class/thermal folder as thermal_zone[0-*]. It tries to bind all the * thermal cooling devices registered at the same time. * thermal_zone_device_unregister() must be called when the device is no * longer needed. The passive cooling depends on the .get_trend() return value. * * Return: a pointer to the created struct thermal_zone_device or an * in case of error, an ERR_PTR. Caller must check return value with * IS_ERR*() helpers. */ struct thermal_zone_device * thermal_zone_device_register_with_trips(const char *type, const struct thermal_trip *trips, int num_trips, void *devdata, const struct thermal_zone_device_ops *ops, const struct thermal_zone_params *tzp, unsigned int passive_delay, unsigned int polling_delay) { const struct thermal_trip *trip = trips; struct thermal_zone_device *tz; struct thermal_trip_desc *td; int id; int result; struct thermal_governor *governor; if (!type || strlen(type) == 0) { pr_err("No thermal zone type defined\n"); return ERR_PTR(-EINVAL); } if (strlen(type) >= THERMAL_NAME_LENGTH) { pr_err("Thermal zone name (%s) too long, should be under %d chars\n", type, THERMAL_NAME_LENGTH); return ERR_PTR(-EINVAL); } if (num_trips < 0) { pr_err("Incorrect number of thermal trips\n"); return ERR_PTR(-EINVAL); } if (!ops || !ops->get_temp) { pr_err("Thermal zone device ops not defined\n"); return ERR_PTR(-EINVAL); } if (num_trips > 0 && !trips) return ERR_PTR(-EINVAL); if (polling_delay) { if (passive_delay > polling_delay) return ERR_PTR(-EINVAL); if (!passive_delay) passive_delay = polling_delay; } if (!thermal_class) return ERR_PTR(-ENODEV); tz = kzalloc(struct_size(tz, trips, num_trips), GFP_KERNEL); if (!tz) return ERR_PTR(-ENOMEM); if (tzp) { tz->tzp = kmemdup(tzp, sizeof(*tzp), GFP_KERNEL); if (!tz->tzp) { result = -ENOMEM; goto free_tz; } } INIT_LIST_HEAD(&tz->thermal_instances); INIT_LIST_HEAD(&tz->node); ida_init(&tz->ida); mutex_init(&tz->lock); init_completion(&tz->removal); init_completion(&tz->resume); id = ida_alloc(&thermal_tz_ida, GFP_KERNEL); if (id < 0) { result = id; goto free_tzp; } tz->id = id; strscpy(tz->type, type, sizeof(tz->type)); tz->ops = *ops; if (!tz->ops.critical) tz->ops.critical = thermal_zone_device_critical; tz->device.class = thermal_class; tz->devdata = devdata; tz->num_trips = num_trips; for_each_trip_desc(tz, td) { td->trip = *trip++; /* * Mark all thresholds as invalid to start with even though * this only matters for the trips that start as invalid and * become valid later. */ td->threshold = INT_MAX; } thermal_set_delay_jiffies(&tz->passive_delay_jiffies, passive_delay); thermal_set_delay_jiffies(&tz->polling_delay_jiffies, polling_delay); tz->recheck_delay_jiffies = THERMAL_RECHECK_DELAY; /* sys I/F */ /* Add nodes that are always present via .groups */ result = thermal_zone_create_device_groups(tz); if (result) goto remove_id; /* A new thermal zone needs to be updated anyway. */ atomic_set(&tz->need_update, 1); result = dev_set_name(&tz->device, "thermal_zone%d", tz->id); if (result) { thermal_zone_destroy_device_groups(tz); goto remove_id; } result = device_register(&tz->device); if (result) goto release_device; /* Update 'this' zone's governor information */ mutex_lock(&thermal_governor_lock); if (tz->tzp) governor = __find_governor(tz->tzp->governor_name); else governor = def_governor; result = thermal_set_governor(tz, governor); if (result) { mutex_unlock(&thermal_governor_lock); goto unregister; } mutex_unlock(&thermal_governor_lock); if (!tz->tzp || !tz->tzp->no_hwmon) { result = thermal_add_hwmon_sysfs(tz); if (result) goto unregister; } mutex_lock(&thermal_list_lock); mutex_lock(&tz->lock); list_add_tail(&tz->node, &thermal_tz_list); mutex_unlock(&tz->lock); mutex_unlock(&thermal_list_lock); /* Bind cooling devices for this zone */ bind_tz(tz); thermal_zone_device_init(tz); /* Update the new thermal zone and mark it as already updated. */ if (atomic_cmpxchg(&tz->need_update, 1, 0)) thermal_zone_device_update(tz, THERMAL_EVENT_UNSPECIFIED); thermal_notify_tz_create(tz); thermal_debug_tz_add(tz); return tz; unregister: device_del(&tz->device); release_device: put_device(&tz->device); remove_id: ida_free(&thermal_tz_ida, id); free_tzp: kfree(tz->tzp); free_tz: kfree(tz); return ERR_PTR(result); } EXPORT_SYMBOL_GPL(thermal_zone_device_register_with_trips); struct thermal_zone_device *thermal_tripless_zone_device_register( const char *type, void *devdata, const struct thermal_zone_device_ops *ops, const struct thermal_zone_params *tzp) { return thermal_zone_device_register_with_trips(type, NULL, 0, devdata, ops, tzp, 0, 0); } EXPORT_SYMBOL_GPL(thermal_tripless_zone_device_register); void *thermal_zone_device_priv(struct thermal_zone_device *tzd) { return tzd->devdata; } EXPORT_SYMBOL_GPL(thermal_zone_device_priv); const char *thermal_zone_device_type(struct thermal_zone_device *tzd) { return tzd->type; } EXPORT_SYMBOL_GPL(thermal_zone_device_type); int thermal_zone_device_id(struct thermal_zone_device *tzd) { return tzd->id; } EXPORT_SYMBOL_GPL(thermal_zone_device_id); struct device *thermal_zone_device(struct thermal_zone_device *tzd) { return &tzd->device; } EXPORT_SYMBOL_GPL(thermal_zone_device); /** * thermal_zone_device_unregister - removes the registered thermal zone device * @tz: the thermal zone device to remove */ void thermal_zone_device_unregister(struct thermal_zone_device *tz) { struct thermal_cooling_device *cdev; struct thermal_zone_device *pos = NULL; if (!tz) return; thermal_debug_tz_remove(tz); mutex_lock(&thermal_list_lock); list_for_each_entry(pos, &thermal_tz_list, node) if (pos == tz) break; if (pos != tz) { /* thermal zone device not found */ mutex_unlock(&thermal_list_lock); return; } mutex_lock(&tz->lock); list_del(&tz->node); mutex_unlock(&tz->lock); /* Unbind all cdevs associated with 'this' thermal zone */ list_for_each_entry(cdev, &thermal_cdev_list, node) if (tz->ops.unbind) tz->ops.unbind(tz, cdev); mutex_unlock(&thermal_list_lock); cancel_delayed_work_sync(&tz->poll_queue); thermal_set_governor(tz, NULL); thermal_remove_hwmon_sysfs(tz); ida_free(&thermal_tz_ida, tz->id); ida_destroy(&tz->ida); device_del(&tz->device); kfree(tz->tzp); put_device(&tz->device); thermal_notify_tz_delete(tz); wait_for_completion(&tz->removal); kfree(tz); } EXPORT_SYMBOL_GPL(thermal_zone_device_unregister); /** * thermal_zone_get_zone_by_name() - search for a zone and returns its ref * @name: thermal zone name to fetch the temperature * * When only one zone is found with the passed name, returns a reference to it. * * Return: On success returns a reference to an unique thermal zone with * matching name equals to @name, an ERR_PTR otherwise (-EINVAL for invalid * paramenters, -ENODEV for not found and -EEXIST for multiple matches). */ struct thermal_zone_device *thermal_zone_get_zone_by_name(const char *name) { struct thermal_zone_device *pos = NULL, *ref = ERR_PTR(-EINVAL); unsigned int found = 0; if (!name) goto exit; mutex_lock(&thermal_list_lock); list_for_each_entry(pos, &thermal_tz_list, node) if (!strncasecmp(name, pos->type, THERMAL_NAME_LENGTH)) { found++; ref = pos; } mutex_unlock(&thermal_list_lock); /* nothing has been found, thus an error code for it */ if (found == 0) ref = ERR_PTR(-ENODEV); else if (found > 1) /* Success only when an unique zone is found */ ref = ERR_PTR(-EEXIST); exit: return ref; } EXPORT_SYMBOL_GPL(thermal_zone_get_zone_by_name); static void thermal_zone_device_resume(struct work_struct *work) { struct thermal_zone_device *tz; tz = container_of(work, struct thermal_zone_device, poll_queue.work); mutex_lock(&tz->lock); tz->suspended = false; thermal_debug_tz_resume(tz); thermal_zone_device_init(tz); thermal_governor_update_tz(tz, THERMAL_TZ_RESUME); __thermal_zone_device_update(tz, THERMAL_TZ_RESUME); complete(&tz->resume); tz->resuming = false; mutex_unlock(&tz->lock); } static int thermal_pm_notify(struct notifier_block *nb, unsigned long mode, void *_unused) { struct thermal_zone_device *tz; switch (mode) { case PM_HIBERNATION_PREPARE: case PM_RESTORE_PREPARE: case PM_SUSPEND_PREPARE: mutex_lock(&thermal_list_lock); list_for_each_entry(tz, &thermal_tz_list, node) { mutex_lock(&tz->lock); if (tz->resuming) { /* * thermal_zone_device_resume() queued up for * this zone has not acquired the lock yet, so * release it to let the function run and wait * util it has done the work. */ mutex_unlock(&tz->lock); wait_for_completion(&tz->resume); mutex_lock(&tz->lock); } tz->suspended = true; mutex_unlock(&tz->lock); } mutex_unlock(&thermal_list_lock); break; case PM_POST_HIBERNATION: case PM_POST_RESTORE: case PM_POST_SUSPEND: mutex_lock(&thermal_list_lock); list_for_each_entry(tz, &thermal_tz_list, node) { mutex_lock(&tz->lock); cancel_delayed_work(&tz->poll_queue); reinit_completion(&tz->resume); tz->resuming = true; /* * Replace the work function with the resume one, which * will restore the original work function and schedule * the polling work if needed. */ INIT_DELAYED_WORK(&tz->poll_queue, thermal_zone_device_resume); /* Queue up the work without a delay. */ mod_delayed_work(system_freezable_power_efficient_wq, &tz->poll_queue, 0); mutex_unlock(&tz->lock); } mutex_unlock(&thermal_list_lock); break; default: break; } return 0; } static struct notifier_block thermal_pm_nb = { .notifier_call = thermal_pm_notify, /* * Run at the lowest priority to avoid interference between the thermal * zone resume work items spawned by thermal_pm_notify() and the other * PM notifiers. */ .priority = INT_MIN, }; static int __init thermal_init(void) { int result; thermal_debug_init(); result = thermal_netlink_init(); if (result) goto error; result = thermal_register_governors(); if (result) goto unregister_netlink; thermal_class = kzalloc(sizeof(*thermal_class), GFP_KERNEL); if (!thermal_class) { result = -ENOMEM; goto unregister_governors; } thermal_class->name = "thermal"; thermal_class->dev_release = thermal_release; result = class_register(thermal_class); if (result) { kfree(thermal_class); thermal_class = NULL; goto unregister_governors; } result = register_pm_notifier(&thermal_pm_nb); if (result) pr_warn("Thermal: Can not register suspend notifier, return %d\n", result); return 0; unregister_governors: thermal_unregister_governors(); unregister_netlink: thermal_netlink_exit(); error: mutex_destroy(&thermal_list_lock); mutex_destroy(&thermal_governor_lock); return result; } postcore_initcall(thermal_init); |
19 3 1 2 1 13 1 12 16 794 776 23 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 | // SPDX-License-Identifier: GPL-2.0-only #include <linux/netlink.h> #include <linux/nospec.h> #include <linux/rtnetlink.h> #include <linux/types.h> #include <net/ip.h> #include <net/net_namespace.h> #include <net/tcp.h> static int ip_metrics_convert(struct nlattr *fc_mx, int fc_mx_len, u32 *metrics, struct netlink_ext_ack *extack) { bool ecn_ca = false; struct nlattr *nla; int remaining; nla_for_each_attr(nla, fc_mx, fc_mx_len, remaining) { int type = nla_type(nla); u32 val; if (!type) continue; if (type > RTAX_MAX) { NL_SET_ERR_MSG(extack, "Invalid metric type"); return -EINVAL; } type = array_index_nospec(type, RTAX_MAX + 1); if (type == RTAX_CC_ALGO) { char tmp[TCP_CA_NAME_MAX]; nla_strscpy(tmp, nla, sizeof(tmp)); val = tcp_ca_get_key_by_name(tmp, &ecn_ca); if (val == TCP_CA_UNSPEC) { NL_SET_ERR_MSG(extack, "Unknown tcp congestion algorithm"); return -EINVAL; } } else { if (nla_len(nla) != sizeof(u32)) { NL_SET_ERR_MSG_ATTR(extack, nla, "Invalid attribute in metrics"); return -EINVAL; } val = nla_get_u32(nla); } if (type == RTAX_ADVMSS && val > 65535 - 40) val = 65535 - 40; if (type == RTAX_MTU && val > 65535 - 15) val = 65535 - 15; if (type == RTAX_HOPLIMIT && val > 255) val = 255; if (type == RTAX_FEATURES && (val & ~RTAX_FEATURE_MASK)) { NL_SET_ERR_MSG(extack, "Unknown flag set in feature mask in metrics attribute"); return -EINVAL; } metrics[type - 1] = val; } if (ecn_ca) metrics[RTAX_FEATURES - 1] |= DST_FEATURE_ECN_CA; return 0; } struct dst_metrics *ip_fib_metrics_init(struct nlattr *fc_mx, int fc_mx_len, struct netlink_ext_ack *extack) { struct dst_metrics *fib_metrics; int err; if (!fc_mx) return (struct dst_metrics *)&dst_default_metrics; fib_metrics = kzalloc(sizeof(*fib_metrics), GFP_KERNEL); if (unlikely(!fib_metrics)) return ERR_PTR(-ENOMEM); err = ip_metrics_convert(fc_mx, fc_mx_len, fib_metrics->metrics, extack); if (!err) { refcount_set(&fib_metrics->refcnt, 1); } else { kfree(fib_metrics); fib_metrics = ERR_PTR(err); } return fib_metrics; } EXPORT_SYMBOL_GPL(ip_fib_metrics_init); |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 | /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * Copyright (c) 2016, Amir Vadai <amir@vadai.me> * Copyright (c) 2016, Mellanox Technologies. All rights reserved. */ #ifndef __NET_TC_TUNNEL_KEY_H #define __NET_TC_TUNNEL_KEY_H #include <net/act_api.h> #include <linux/tc_act/tc_tunnel_key.h> #include <net/dst_metadata.h> struct tcf_tunnel_key_params { struct rcu_head rcu; int tcft_action; struct metadata_dst *tcft_enc_metadata; }; struct tcf_tunnel_key { struct tc_action common; struct tcf_tunnel_key_params __rcu *params; }; #define to_tunnel_key(a) ((struct tcf_tunnel_key *)a) static inline bool is_tcf_tunnel_set(const struct tc_action *a) { #ifdef CONFIG_NET_CLS_ACT struct tcf_tunnel_key *t = to_tunnel_key(a); struct tcf_tunnel_key_params *params; params = rcu_dereference_protected(t->params, lockdep_is_held(&a->tcfa_lock)); if (a->ops && a->ops->id == TCA_ID_TUNNEL_KEY) return params->tcft_action == TCA_TUNNEL_KEY_ACT_SET; #endif return false; } static inline bool is_tcf_tunnel_release(const struct tc_action *a) { #ifdef CONFIG_NET_CLS_ACT struct tcf_tunnel_key *t = to_tunnel_key(a); struct tcf_tunnel_key_params *params; params = rcu_dereference_protected(t->params, lockdep_is_held(&a->tcfa_lock)); if (a->ops && a->ops->id == TCA_ID_TUNNEL_KEY) return params->tcft_action == TCA_TUNNEL_KEY_ACT_RELEASE; #endif return false; } static inline struct ip_tunnel_info *tcf_tunnel_info(const struct tc_action *a) { #ifdef CONFIG_NET_CLS_ACT struct tcf_tunnel_key *t = to_tunnel_key(a); struct tcf_tunnel_key_params *params; params = rcu_dereference_protected(t->params, lockdep_is_held(&a->tcfa_lock)); return ¶ms->tcft_enc_metadata->u.tun_info; #else return NULL; #endif } static inline struct ip_tunnel_info * tcf_tunnel_info_copy(const struct tc_action *a) { #ifdef CONFIG_NET_CLS_ACT struct ip_tunnel_info *tun = tcf_tunnel_info(a); if (tun) { size_t tun_size = sizeof(*tun) + tun->options_len; struct ip_tunnel_info *tun_copy = kmemdup(tun, tun_size, GFP_ATOMIC); return tun_copy; } #endif return NULL; } #endif /* __NET_TC_TUNNEL_KEY_H */ |
9 2 8 9 99 106 13 1 1 11 7 22 19 4 17 19 7 9 19 12 2 17 10 18 14 107 17 92 31 26 27 15 17 14 14 12 2 102 2 4 27 32 78 99 17 3 14 1 99 77 31 111 105 14 14 8 15 23 106 8 28 90 17 57 3 5 21 3 26 6 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 | /* * linux/fs/hfs/extent.c * * Copyright (C) 1995-1997 Paul H. Hargrove * (C) 2003 Ardis Technologies <roman@ardistech.com> * This file may be distributed under the terms of the GNU General Public License. * * This file contains the functions related to the extents B-tree. */ #include <linux/pagemap.h> #include "hfs_fs.h" #include "btree.h" /*================ File-local functions ================*/ /* * build_key */ static void hfs_ext_build_key(hfs_btree_key *key, u32 cnid, u16 block, u8 type) { key->key_len = 7; key->ext.FkType = type; key->ext.FNum = cpu_to_be32(cnid); key->ext.FABN = cpu_to_be16(block); } /* * hfs_ext_compare() * * Description: * This is the comparison function used for the extents B-tree. In * comparing extent B-tree entries, the file id is the most * significant field (compared as unsigned ints); the fork type is * the second most significant field (compared as unsigned chars); * and the allocation block number field is the least significant * (compared as unsigned ints). * Input Variable(s): * struct hfs_ext_key *key1: pointer to the first key to compare * struct hfs_ext_key *key2: pointer to the second key to compare * Output Variable(s): * NONE * Returns: * int: negative if key1<key2, positive if key1>key2, and 0 if key1==key2 * Preconditions: * key1 and key2 point to "valid" (struct hfs_ext_key)s. * Postconditions: * This function has no side-effects */ int hfs_ext_keycmp(const btree_key *key1, const btree_key *key2) { __be32 fnum1, fnum2; __be16 block1, block2; fnum1 = key1->ext.FNum; fnum2 = key2->ext.FNum; if (fnum1 != fnum2) return be32_to_cpu(fnum1) < be32_to_cpu(fnum2) ? -1 : 1; if (key1->ext.FkType != key2->ext.FkType) return key1->ext.FkType < key2->ext.FkType ? -1 : 1; block1 = key1->ext.FABN; block2 = key2->ext.FABN; if (block1 == block2) return 0; return be16_to_cpu(block1) < be16_to_cpu(block2) ? -1 : 1; } /* * hfs_ext_find_block * * Find a block within an extent record */ static u16 hfs_ext_find_block(struct hfs_extent *ext, u16 off) { int i; u16 count; for (i = 0; i < 3; ext++, i++) { count = be16_to_cpu(ext->count); if (off < count) return be16_to_cpu(ext->block) + off; off -= count; } /* panic? */ return 0; } static int hfs_ext_block_count(struct hfs_extent *ext) { int i; u16 count = 0; for (i = 0; i < 3; ext++, i++) count += be16_to_cpu(ext->count); return count; } static u16 hfs_ext_lastblock(struct hfs_extent *ext) { int i; ext += 2; for (i = 0; i < 2; ext--, i++) if (ext->count) break; return be16_to_cpu(ext->block) + be16_to_cpu(ext->count); } static int __hfs_ext_write_extent(struct inode *inode, struct hfs_find_data *fd) { int res; hfs_ext_build_key(fd->search_key, inode->i_ino, HFS_I(inode)->cached_start, HFS_IS_RSRC(inode) ? HFS_FK_RSRC : HFS_FK_DATA); res = hfs_brec_find(fd); if (HFS_I(inode)->flags & HFS_FLG_EXT_NEW) { if (res != -ENOENT) return res; /* Fail early and avoid ENOSPC during the btree operation */ res = hfs_bmap_reserve(fd->tree, fd->tree->depth + 1); if (res) return res; hfs_brec_insert(fd, HFS_I(inode)->cached_extents, sizeof(hfs_extent_rec)); HFS_I(inode)->flags &= ~(HFS_FLG_EXT_DIRTY|HFS_FLG_EXT_NEW); } else { if (res) return res; hfs_bnode_write(fd->bnode, HFS_I(inode)->cached_extents, fd->entryoffset, fd->entrylength); HFS_I(inode)->flags &= ~HFS_FLG_EXT_DIRTY; } return 0; } int hfs_ext_write_extent(struct inode *inode) { struct hfs_find_data fd; int res = 0; if (HFS_I(inode)->flags & HFS_FLG_EXT_DIRTY) { res = hfs_find_init(HFS_SB(inode->i_sb)->ext_tree, &fd); if (res) return res; res = __hfs_ext_write_extent(inode, &fd); hfs_find_exit(&fd); } return res; } static inline int __hfs_ext_read_extent(struct hfs_find_data *fd, struct hfs_extent *extent, u32 cnid, u32 block, u8 type) { int res; hfs_ext_build_key(fd->search_key, cnid, block, type); fd->key->ext.FNum = 0; res = hfs_brec_find(fd); if (res && res != -ENOENT) return res; if (fd->key->ext.FNum != fd->search_key->ext.FNum || fd->key->ext.FkType != fd->search_key->ext.FkType) return -ENOENT; if (fd->entrylength != sizeof(hfs_extent_rec)) return -EIO; hfs_bnode_read(fd->bnode, extent, fd->entryoffset, sizeof(hfs_extent_rec)); return 0; } static inline int __hfs_ext_cache_extent(struct hfs_find_data *fd, struct inode *inode, u32 block) { int res; if (HFS_I(inode)->flags & HFS_FLG_EXT_DIRTY) { res = __hfs_ext_write_extent(inode, fd); if (res) return res; } res = __hfs_ext_read_extent(fd, HFS_I(inode)->cached_extents, inode->i_ino, block, HFS_IS_RSRC(inode) ? HFS_FK_RSRC : HFS_FK_DATA); if (!res) { HFS_I(inode)->cached_start = be16_to_cpu(fd->key->ext.FABN); HFS_I(inode)->cached_blocks = hfs_ext_block_count(HFS_I(inode)->cached_extents); } else { HFS_I(inode)->cached_start = HFS_I(inode)->cached_blocks = 0; HFS_I(inode)->flags &= ~(HFS_FLG_EXT_DIRTY|HFS_FLG_EXT_NEW); } return res; } static int hfs_ext_read_extent(struct inode *inode, u16 block) { struct hfs_find_data fd; int res; if (block >= HFS_I(inode)->cached_start && block < HFS_I(inode)->cached_start + HFS_I(inode)->cached_blocks) return 0; res = hfs_find_init(HFS_SB(inode->i_sb)->ext_tree, &fd); if (!res) { res = __hfs_ext_cache_extent(&fd, inode, block); hfs_find_exit(&fd); } return res; } static void hfs_dump_extent(struct hfs_extent *extent) { int i; hfs_dbg(EXTENT, " "); for (i = 0; i < 3; i++) hfs_dbg_cont(EXTENT, " %u:%u", be16_to_cpu(extent[i].block), be16_to_cpu(extent[i].count)); hfs_dbg_cont(EXTENT, "\n"); } static int hfs_add_extent(struct hfs_extent *extent, u16 offset, u16 alloc_block, u16 block_count) { u16 count, start; int i; hfs_dump_extent(extent); for (i = 0; i < 3; extent++, i++) { count = be16_to_cpu(extent->count); if (offset == count) { start = be16_to_cpu(extent->block); if (alloc_block != start + count) { if (++i >= 3) return -ENOSPC; extent++; extent->block = cpu_to_be16(alloc_block); } else block_count += count; extent->count = cpu_to_be16(block_count); return 0; } else if (offset < count) break; offset -= count; } /* panic? */ return -EIO; } static int hfs_free_extents(struct super_block *sb, struct hfs_extent *extent, u16 offset, u16 block_nr) { u16 count, start; int i; hfs_dump_extent(extent); for (i = 0; i < 3; extent++, i++) { count = be16_to_cpu(extent->count); if (offset == count) goto found; else if (offset < count) break; offset -= count; } /* panic? */ return -EIO; found: for (;;) { start = be16_to_cpu(extent->block); if (count <= block_nr) { hfs_clear_vbm_bits(sb, start, count); extent->block = 0; extent->count = 0; block_nr -= count; } else { count -= block_nr; hfs_clear_vbm_bits(sb, start + count, block_nr); extent->count = cpu_to_be16(count); block_nr = 0; } if (!block_nr || !i) return 0; i--; extent--; count = be16_to_cpu(extent->count); } } int hfs_free_fork(struct super_block *sb, struct hfs_cat_file *file, int type) { struct hfs_find_data fd; u32 total_blocks, blocks, start; u32 cnid = be32_to_cpu(file->FlNum); struct hfs_extent *extent; int res, i; if (type == HFS_FK_DATA) { total_blocks = be32_to_cpu(file->PyLen); extent = file->ExtRec; } else { total_blocks = be32_to_cpu(file->RPyLen); extent = file->RExtRec; } total_blocks /= HFS_SB(sb)->alloc_blksz; if (!total_blocks) return 0; blocks = 0; for (i = 0; i < 3; i++) blocks += be16_to_cpu(extent[i].count); res = hfs_free_extents(sb, extent, blocks, blocks); if (res) return res; if (total_blocks == blocks) return 0; res = hfs_find_init(HFS_SB(sb)->ext_tree, &fd); if (res) return res; do { res = __hfs_ext_read_extent(&fd, extent, cnid, total_blocks, type); if (res) break; start = be16_to_cpu(fd.key->ext.FABN); hfs_free_extents(sb, extent, total_blocks - start, total_blocks); hfs_brec_remove(&fd); total_blocks = start; } while (total_blocks > blocks); hfs_find_exit(&fd); return res; } /* * hfs_get_block */ int hfs_get_block(struct inode *inode, sector_t block, struct buffer_head *bh_result, int create) { struct super_block *sb; u16 dblock, ablock; int res; sb = inode->i_sb; /* Convert inode block to disk allocation block */ ablock = (u32)block / HFS_SB(sb)->fs_div; if (block >= HFS_I(inode)->fs_blocks) { if (!create) return 0; if (block > HFS_I(inode)->fs_blocks) return -EIO; if (ablock >= HFS_I(inode)->alloc_blocks) { res = hfs_extend_file(inode); if (res) return res; } } else create = 0; if (ablock < HFS_I(inode)->first_blocks) { dblock = hfs_ext_find_block(HFS_I(inode)->first_extents, ablock); goto done; } mutex_lock(&HFS_I(inode)->extents_lock); res = hfs_ext_read_extent(inode, ablock); if (!res) dblock = hfs_ext_find_block(HFS_I(inode)->cached_extents, ablock - HFS_I(inode)->cached_start); else { mutex_unlock(&HFS_I(inode)->extents_lock); return -EIO; } mutex_unlock(&HFS_I(inode)->extents_lock); done: map_bh(bh_result, sb, HFS_SB(sb)->fs_start + dblock * HFS_SB(sb)->fs_div + (u32)block % HFS_SB(sb)->fs_div); if (create) { set_buffer_new(bh_result); HFS_I(inode)->phys_size += sb->s_blocksize; HFS_I(inode)->fs_blocks++; inode_add_bytes(inode, sb->s_blocksize); mark_inode_dirty(inode); } return 0; } int hfs_extend_file(struct inode *inode) { struct super_block *sb = inode->i_sb; u32 start, len, goal; int res; mutex_lock(&HFS_I(inode)->extents_lock); if (HFS_I(inode)->alloc_blocks == HFS_I(inode)->first_blocks) goal = hfs_ext_lastblock(HFS_I(inode)->first_extents); else { res = hfs_ext_read_extent(inode, HFS_I(inode)->alloc_blocks); if (res) goto out; goal = hfs_ext_lastblock(HFS_I(inode)->cached_extents); } len = HFS_I(inode)->clump_blocks; start = hfs_vbm_search_free(sb, goal, &len); if (!len) { res = -ENOSPC; goto out; } hfs_dbg(EXTENT, "extend %lu: %u,%u\n", inode->i_ino, start, len); if (HFS_I(inode)->alloc_blocks == HFS_I(inode)->first_blocks) { if (!HFS_I(inode)->first_blocks) { hfs_dbg(EXTENT, "first extents\n"); /* no extents yet */ HFS_I(inode)->first_extents[0].block = cpu_to_be16(start); HFS_I(inode)->first_extents[0].count = cpu_to_be16(len); res = 0; } else { /* try to append to extents in inode */ res = hfs_add_extent(HFS_I(inode)->first_extents, HFS_I(inode)->alloc_blocks, start, len); if (res == -ENOSPC) goto insert_extent; } if (!res) { hfs_dump_extent(HFS_I(inode)->first_extents); HFS_I(inode)->first_blocks += len; } } else { res = hfs_add_extent(HFS_I(inode)->cached_extents, HFS_I(inode)->alloc_blocks - HFS_I(inode)->cached_start, start, len); if (!res) { hfs_dump_extent(HFS_I(inode)->cached_extents); HFS_I(inode)->flags |= HFS_FLG_EXT_DIRTY; HFS_I(inode)->cached_blocks += len; } else if (res == -ENOSPC) goto insert_extent; } out: mutex_unlock(&HFS_I(inode)->extents_lock); if (!res) { HFS_I(inode)->alloc_blocks += len; mark_inode_dirty(inode); if (inode->i_ino < HFS_FIRSTUSER_CNID) set_bit(HFS_FLG_ALT_MDB_DIRTY, &HFS_SB(sb)->flags); set_bit(HFS_FLG_MDB_DIRTY, &HFS_SB(sb)->flags); hfs_mark_mdb_dirty(sb); } return res; insert_extent: hfs_dbg(EXTENT, "insert new extent\n"); res = hfs_ext_write_extent(inode); if (res) goto out; memset(HFS_I(inode)->cached_extents, 0, sizeof(hfs_extent_rec)); HFS_I(inode)->cached_extents[0].block = cpu_to_be16(start); HFS_I(inode)->cached_extents[0].count = cpu_to_be16(len); hfs_dump_extent(HFS_I(inode)->cached_extents); HFS_I(inode)->flags |= HFS_FLG_EXT_DIRTY|HFS_FLG_EXT_NEW; HFS_I(inode)->cached_start = HFS_I(inode)->alloc_blocks; HFS_I(inode)->cached_blocks = len; res = 0; goto out; } void hfs_file_truncate(struct inode *inode) { struct super_block *sb = inode->i_sb; struct hfs_find_data fd; u16 blk_cnt, alloc_cnt, start; u32 size; int res; hfs_dbg(INODE, "truncate: %lu, %Lu -> %Lu\n", inode->i_ino, (long long)HFS_I(inode)->phys_size, inode->i_size); if (inode->i_size > HFS_I(inode)->phys_size) { struct address_space *mapping = inode->i_mapping; void *fsdata = NULL; struct page *page; /* XXX: Can use generic_cont_expand? */ size = inode->i_size - 1; res = hfs_write_begin(NULL, mapping, size + 1, 0, &page, &fsdata); if (!res) { res = generic_write_end(NULL, mapping, size + 1, 0, 0, page, fsdata); } if (res) inode->i_size = HFS_I(inode)->phys_size; return; } else if (inode->i_size == HFS_I(inode)->phys_size) return; size = inode->i_size + HFS_SB(sb)->alloc_blksz - 1; blk_cnt = size / HFS_SB(sb)->alloc_blksz; alloc_cnt = HFS_I(inode)->alloc_blocks; if (blk_cnt == alloc_cnt) goto out; mutex_lock(&HFS_I(inode)->extents_lock); res = hfs_find_init(HFS_SB(sb)->ext_tree, &fd); if (res) { mutex_unlock(&HFS_I(inode)->extents_lock); /* XXX: We lack error handling of hfs_file_truncate() */ return; } while (1) { if (alloc_cnt == HFS_I(inode)->first_blocks) { hfs_free_extents(sb, HFS_I(inode)->first_extents, alloc_cnt, alloc_cnt - blk_cnt); hfs_dump_extent(HFS_I(inode)->first_extents); HFS_I(inode)->first_blocks = blk_cnt; break; } res = __hfs_ext_cache_extent(&fd, inode, alloc_cnt); if (res) break; start = HFS_I(inode)->cached_start; hfs_free_extents(sb, HFS_I(inode)->cached_extents, alloc_cnt - start, alloc_cnt - blk_cnt); hfs_dump_extent(HFS_I(inode)->cached_extents); if (blk_cnt > start) { HFS_I(inode)->flags |= HFS_FLG_EXT_DIRTY; break; } alloc_cnt = start; HFS_I(inode)->cached_start = HFS_I(inode)->cached_blocks = 0; HFS_I(inode)->flags &= ~(HFS_FLG_EXT_DIRTY|HFS_FLG_EXT_NEW); hfs_brec_remove(&fd); } hfs_find_exit(&fd); mutex_unlock(&HFS_I(inode)->extents_lock); HFS_I(inode)->alloc_blocks = blk_cnt; out: HFS_I(inode)->phys_size = inode->i_size; HFS_I(inode)->fs_blocks = (inode->i_size + sb->s_blocksize - 1) >> sb->s_blocksize_bits; inode_set_bytes(inode, HFS_I(inode)->fs_blocks << sb->s_blocksize_bits); mark_inode_dirty(inode); } |
1993 1990 500 63 63 63 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 | // SPDX-License-Identifier: GPL-2.0-or-later /* * vrf.c: device driver to encapsulate a VRF space * * Copyright (c) 2015 Cumulus Networks. All rights reserved. * Copyright (c) 2015 Shrijeet Mukherjee <shm@cumulusnetworks.com> * Copyright (c) 2015 David Ahern <dsa@cumulusnetworks.com> * * Based on dummy, team and ipvlan drivers */ #include <linux/ethtool.h> #include <linux/module.h> #include <linux/kernel.h> #include <linux/netdevice.h> #include <linux/etherdevice.h> #include <linux/ip.h> #include <linux/init.h> #include <linux/moduleparam.h> #include <linux/netfilter.h> #include <linux/rtnetlink.h> #include <net/rtnetlink.h> #include <linux/u64_stats_sync.h> #include <linux/hashtable.h> #include <linux/spinlock_types.h> #include <linux/inetdevice.h> #include <net/arp.h> #include <net/ip.h> #include <net/ip_fib.h> #include <net/ip6_fib.h> #include <net/ip6_route.h> #include <net/route.h> #include <net/addrconf.h> #include <net/l3mdev.h> #include <net/fib_rules.h> #include <net/sch_generic.h> #include <net/netns/generic.h> #include <net/netfilter/nf_conntrack.h> #define DRV_NAME "vrf" #define DRV_VERSION "1.1" #define FIB_RULE_PREF 1000 /* default preference for FIB rules */ #define HT_MAP_BITS 4 #define HASH_INITVAL ((u32)0xcafef00d) struct vrf_map { DECLARE_HASHTABLE(ht, HT_MAP_BITS); spinlock_t vmap_lock; /* shared_tables: * count how many distinct tables do not comply with the strict mode * requirement. * shared_tables value must be 0 in order to enable the strict mode. * * example of the evolution of shared_tables: * | time * add vrf0 --> table 100 shared_tables = 0 | t0 * add vrf1 --> table 101 shared_tables = 0 | t1 * add vrf2 --> table 100 shared_tables = 1 | t2 * add vrf3 --> table 100 shared_tables = 1 | t3 * add vrf4 --> table 101 shared_tables = 2 v t4 * * shared_tables is a "step function" (or "staircase function") * and it is increased by one when the second vrf is associated to a * table. * * at t2, vrf0 and vrf2 are bound to table 100: shared_tables = 1. * * at t3, another dev (vrf3) is bound to the same table 100 but the * value of shared_tables is still 1. * This means that no matter how many new vrfs will register on the * table 100, the shared_tables will not increase (considering only * table 100). * * at t4, vrf4 is bound to table 101, and shared_tables = 2. * * Looking at the value of shared_tables we can immediately know if * the strict_mode can or cannot be enforced. Indeed, strict_mode * can be enforced iff shared_tables = 0. * * Conversely, shared_tables is decreased when a vrf is de-associated * from a table with exactly two associated vrfs. */ u32 shared_tables; bool strict_mode; }; struct vrf_map_elem { struct hlist_node hnode; struct list_head vrf_list; /* VRFs registered to this table */ u32 table_id; int users; int ifindex; }; static unsigned int vrf_net_id; /* per netns vrf data */ struct netns_vrf { /* protected by rtnl lock */ bool add_fib_rules; struct vrf_map vmap; struct ctl_table_header *ctl_hdr; }; struct net_vrf { struct rtable __rcu *rth; struct rt6_info __rcu *rt6; #if IS_ENABLED(CONFIG_IPV6) struct fib6_table *fib6_table; #endif u32 tb_id; struct list_head me_list; /* entry in vrf_map_elem */ int ifindex; }; static void vrf_rx_stats(struct net_device *dev, int len) { struct pcpu_dstats *dstats = this_cpu_ptr(dev->dstats); u64_stats_update_begin(&dstats->syncp); u64_stats_inc(&dstats->rx_packets); u64_stats_add(&dstats->rx_bytes, len); u64_stats_update_end(&dstats->syncp); } static void vrf_tx_error(struct net_device *vrf_dev, struct sk_buff *skb) { vrf_dev->stats.tx_errors++; kfree_skb(skb); } static struct vrf_map *netns_vrf_map(struct net *net) { struct netns_vrf *nn_vrf = net_generic(net, vrf_net_id); return &nn_vrf->vmap; } static struct vrf_map *netns_vrf_map_by_dev(struct net_device *dev) { return netns_vrf_map(dev_net(dev)); } static int vrf_map_elem_get_vrf_ifindex(struct vrf_map_elem *me) { struct list_head *me_head = &me->vrf_list; struct net_vrf *vrf; if (list_empty(me_head)) return -ENODEV; vrf = list_first_entry(me_head, struct net_vrf, me_list); return vrf->ifindex; } static struct vrf_map_elem *vrf_map_elem_alloc(gfp_t flags) { struct vrf_map_elem *me; me = kmalloc(sizeof(*me), flags); if (!me) return NULL; return me; } static void vrf_map_elem_free(struct vrf_map_elem *me) { kfree(me); } static void vrf_map_elem_init(struct vrf_map_elem *me, int table_id, int ifindex, int users) { me->table_id = table_id; me->ifindex = ifindex; me->users = users; INIT_LIST_HEAD(&me->vrf_list); } static struct vrf_map_elem *vrf_map_lookup_elem(struct vrf_map *vmap, u32 table_id) { struct vrf_map_elem *me; u32 key; key = jhash_1word(table_id, HASH_INITVAL); hash_for_each_possible(vmap->ht, me, hnode, key) { if (me->table_id == table_id) return me; } return NULL; } static void vrf_map_add_elem(struct vrf_map *vmap, struct vrf_map_elem *me) { u32 table_id = me->table_id; u32 key; key = jhash_1word(table_id, HASH_INITVAL); hash_add(vmap->ht, &me->hnode, key); } static void vrf_map_del_elem(struct vrf_map_elem *me) { hash_del(&me->hnode); } static void vrf_map_lock(struct vrf_map *vmap) __acquires(&vmap->vmap_lock) { spin_lock(&vmap->vmap_lock); } static void vrf_map_unlock(struct vrf_map *vmap) __releases(&vmap->vmap_lock) { spin_unlock(&vmap->vmap_lock); } /* called with rtnl lock held */ static int vrf_map_register_dev(struct net_device *dev, struct netlink_ext_ack *extack) { struct vrf_map *vmap = netns_vrf_map_by_dev(dev); struct net_vrf *vrf = netdev_priv(dev); struct vrf_map_elem *new_me, *me; u32 table_id = vrf->tb_id; bool free_new_me = false; int users; int res; /* we pre-allocate elements used in the spin-locked section (so that we * keep the spinlock as short as possible). */ new_me = vrf_map_elem_alloc(GFP_KERNEL); if (!new_me) return -ENOMEM; |