Total coverage: 334183 (18%)of 1920689
4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 #ifndef _TCP_DCTCP_H #define _TCP_DCTCP_H static inline void dctcp_ece_ack_cwr(struct sock *sk, u32 ce_state) { struct tcp_sock *tp = tcp_sk(sk); if (ce_state == 1) tp->ecn_flags |= TCP_ECN_DEMAND_CWR; else tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR; } /* Minimal DCTP CE state machine: * * S: 0 <- last pkt was non-CE * 1 <- last pkt was CE */ static inline void dctcp_ece_ack_update(struct sock *sk, enum tcp_ca_event evt, u32 *prior_rcv_nxt, u32 *ce_state) { u32 new_ce_state = (evt == CA_EVENT_ECN_IS_CE) ? 1 : 0; if (*ce_state != new_ce_state) { /* CE state has changed, force an immediate ACK to * reflect the new CE state. If an ACK was delayed, * send that first to reflect the prior CE state. */ if (inet_csk(sk)->icsk_ack.pending & ICSK_ACK_TIMER) { dctcp_ece_ack_cwr(sk, *ce_state); __tcp_send_ack(sk, *prior_rcv_nxt, 0); } inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_NOW; } *prior_rcv_nxt = tcp_sk(sk)->rcv_nxt; *ce_state = new_ce_state; dctcp_ece_ack_cwr(sk, new_ce_state); } #endif
460 459 460 459 460 43 44 26 29 445 423 384 443 446 2 423 440 259 258 458 459 354 52 92 12 1 444 11 379 267 1 350 87 4 1 1 85 85 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/hfsplus/bfind.c * * Copyright (C) 2001 * Brad Boyer (flar@allandria.com) * (C) 2003 Ardis Technologies <roman@ardistech.com> * * Search routines for btrees */ #include <linux/slab.h> #include "hfsplus_fs.h" int hfs_find_init(struct hfs_btree *tree, struct hfs_find_data *fd) { void *ptr; fd->tree = tree; fd->bnode = NULL; ptr = kzalloc(tree->max_key_len * 2 + 4, GFP_KERNEL); if (!ptr) return -ENOMEM; fd->search_key = ptr; fd->key = ptr + tree->max_key_len + 2; hfs_dbg("cnid %d, caller %ps\n", tree->cnid, __builtin_return_address(0)); mutex_lock_nested(&tree->tree_lock, hfsplus_btree_lock_class(tree)); return 0; } void hfs_find_exit(struct hfs_find_data *fd) { hfs_bnode_put(fd->bnode); kfree(fd->search_key); hfs_dbg("cnid %d, caller %ps\n", fd->tree->cnid, __builtin_return_address(0)); mutex_unlock(&fd->tree->tree_lock); fd->tree = NULL; } int hfs_find_1st_rec_by_cnid(struct hfs_bnode *bnode, struct hfs_find_data *fd, int *begin, int *end, int *cur_rec) { __be32 cur_cnid; __be32 search_cnid; if (bnode->tree->cnid == HFSPLUS_EXT_CNID) { cur_cnid = fd->key->ext.cnid; search_cnid = fd->search_key->ext.cnid; } else if (bnode->tree->cnid == HFSPLUS_CAT_CNID) { cur_cnid = fd->key->cat.parent; search_cnid = fd->search_key->cat.parent; } else if (bnode->tree->cnid == HFSPLUS_ATTR_CNID) { cur_cnid = fd->key->attr.cnid; search_cnid = fd->search_key->attr.cnid; } else { cur_cnid = 0; /* used-uninitialized warning */ search_cnid = 0; BUG(); } if (cur_cnid == search_cnid) { (*end) = (*cur_rec); if ((*begin) == (*end)) return 1; } else { if (be32_to_cpu(cur_cnid) < be32_to_cpu(search_cnid)) (*begin) = (*cur_rec) + 1; else (*end) = (*cur_rec) - 1; } return 0; } int hfs_find_rec_by_key(struct hfs_bnode *bnode, struct hfs_find_data *fd, int *begin, int *end, int *cur_rec) { int cmpval; cmpval = bnode->tree->keycmp(fd->key, fd->search_key); if (!cmpval) { (*end) = (*cur_rec); return 1; } if (cmpval < 0) (*begin) = (*cur_rec) + 1; else *(end) = (*cur_rec) - 1; return 0; } /* Find the record in bnode that best matches key (not greater than...)*/ int __hfs_brec_find(struct hfs_bnode *bnode, struct hfs_find_data *fd, search_strategy_t rec_found) { u16 off, len, keylen; int rec; int b, e; int res; BUG_ON(!rec_found); b = 0; e = bnode->num_recs - 1; res = -ENOENT; do { rec = (e + b) / 2; len = hfs_brec_lenoff(bnode, rec, &off); keylen = hfs_brec_keylen(bnode, rec); if (keylen == 0) { res = -EINVAL; goto fail; } hfs_bnode_read(bnode, fd->key, off, keylen); if (rec_found(bnode, fd, &b, &e, &rec)) { res = 0; goto done; } } while (b <= e); if (rec != e && e >= 0) { len = hfs_brec_lenoff(bnode, e, &off); keylen = hfs_brec_keylen(bnode, e); if (keylen == 0) { res = -EINVAL; goto fail; } hfs_bnode_read(bnode, fd->key, off, keylen); } done: fd->record = e; fd->keyoffset = off; fd->keylength = keylen; fd->entryoffset = off + keylen; fd->entrylength = len - keylen; fail: return res; } /* Traverse a B*Tree from the root to a leaf finding best fit to key */ /* Return allocated copy of node found, set recnum to best record */ int hfs_brec_find(struct hfs_find_data *fd, search_strategy_t do_key_compare) { struct hfs_btree *tree; struct hfs_bnode *bnode; u32 nidx, parent; __be32 data; int height, res; fd->record = -1; fd->keyoffset = -1; fd->keylength = -1; fd->entryoffset = -1; fd->entrylength = -1; tree = fd->tree; if (fd->bnode) hfs_bnode_put(fd->bnode); fd->bnode = NULL; nidx = tree->root; if (!nidx) return -ENOENT; height = tree->depth; res = 0; parent = 0; for (;;) { bnode = hfs_bnode_find(tree, nidx); if (IS_ERR(bnode)) { res = PTR_ERR(bnode); bnode = NULL; break; } if (bnode->height != height) goto invalid; if (bnode->type != (--height ? HFS_NODE_INDEX : HFS_NODE_LEAF)) goto invalid; bnode->parent = parent; res = __hfs_brec_find(bnode, fd, do_key_compare); if (!height) break; if (fd->record < 0) goto release; parent = nidx; hfs_bnode_read(bnode, &data, fd->entryoffset, 4); nidx = be32_to_cpu(data); hfs_bnode_put(bnode); } fd->bnode = bnode; return res; invalid: pr_err("inconsistency in B*Tree (%d,%d,%d,%u,%u)\n", height, bnode->height, bnode->type, nidx, parent); res = -EIO; release: hfs_bnode_put(bnode); return res; } int hfs_brec_read(struct hfs_find_data *fd, void *rec, u32 rec_len) { int res; res = hfs_brec_find(fd, hfs_find_rec_by_key); if (res) return res; if (fd->entrylength > rec_len) return -EINVAL; hfs_bnode_read(fd->bnode, rec, fd->entryoffset, fd->entrylength); return 0; } int hfs_brec_goto(struct hfs_find_data *fd, int cnt) { struct hfs_btree *tree; struct hfs_bnode *bnode; int idx, res = 0; u16 off, len, keylen; bnode = fd->bnode; tree = bnode->tree; if (cnt < 0) { cnt = -cnt; while (cnt > fd->record) { cnt -= fd->record + 1; fd->record = bnode->num_recs - 1; idx = bnode->prev; if (!idx) { res = -ENOENT; goto out; } hfs_bnode_put(bnode); bnode = hfs_bnode_find(tree, idx); if (IS_ERR(bnode)) { res = PTR_ERR(bnode); bnode = NULL; goto out; } } fd->record -= cnt; } else { while (cnt >= bnode->num_recs - fd->record) { cnt -= bnode->num_recs - fd->record; fd->record = 0; idx = bnode->next; if (!idx) { res = -ENOENT; goto out; } hfs_bnode_put(bnode); bnode = hfs_bnode_find(tree, idx); if (IS_ERR(bnode)) { res = PTR_ERR(bnode); bnode = NULL; goto out; } } fd->record += cnt; } len = hfs_brec_lenoff(bnode, fd->record, &off); keylen = hfs_brec_keylen(bnode, fd->record); if (keylen == 0) { res = -EINVAL; goto out; } fd->keyoffset = off; fd->keylength = keylen; fd->entryoffset = off + keylen; fd->entrylength = len - keylen; hfs_bnode_read(bnode, fd->key, off, keylen); out: fd->bnode = bnode; return res; }
27 27 6 13 13 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 /* * Copyright 2011 Red Hat, Inc. * Copyright © 2014 The Chromium OS Authors * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software") * to deal in the software without restriction, including without limitation * on the rights to use, copy, modify, merge, publish, distribute, sub * license, and/or sell copies of the Software, and to permit persons to whom * them Software is furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice (including the next * paragraph) shall be included in all copies or substantial portions of the * Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTIBILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER * IN AN ACTION OF CONTRACT, TORT, OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. * * Authors: * Adam Jackson <ajax@redhat.com> * Ben Widawsky <ben@bwidawsk.net> */ /* * This is vgem, a (non-hardware-backed) GEM service. This is used by Mesa's * software renderer and the X server for efficient buffer sharing. */ #include <linux/dma-buf.h> #include <linux/module.h> #include <linux/device/faux.h> #include <linux/shmem_fs.h> #include <linux/vmalloc.h> #include <drm/drm_drv.h> #include <drm/drm_file.h> #include <drm/drm_gem_shmem_helper.h> #include <drm/drm_ioctl.h> #include <drm/drm_managed.h> #include <drm/drm_prime.h> #include "vgem_drv.h" #define DRIVER_NAME "vgem" #define DRIVER_DESC "Virtual GEM provider" #define DRIVER_MAJOR 1 #define DRIVER_MINOR 0 static struct vgem_device { struct drm_device drm; struct faux_device *faux_dev; } *vgem_device; static int vgem_open(struct drm_device *dev, struct drm_file *file) { struct vgem_file *vfile; int ret; vfile = kzalloc_obj(*vfile); if (!vfile) return -ENOMEM; file->driver_priv = vfile; ret = vgem_fence_open(vfile); if (ret) { kfree(vfile); return ret; } return 0; } static void vgem_postclose(struct drm_device *dev, struct drm_file *file) { struct vgem_file *vfile = file->driver_priv; vgem_fence_close(vfile); kfree(vfile); } static struct drm_ioctl_desc vgem_ioctls[] = { DRM_IOCTL_DEF_DRV(VGEM_FENCE_ATTACH, vgem_fence_attach_ioctl, DRM_RENDER_ALLOW), DRM_IOCTL_DEF_DRV(VGEM_FENCE_SIGNAL, vgem_fence_signal_ioctl, DRM_RENDER_ALLOW), }; DEFINE_DRM_GEM_FOPS(vgem_driver_fops); static struct drm_gem_object *vgem_gem_create_object(struct drm_device *dev, size_t size) { struct drm_gem_shmem_object *obj; obj = kzalloc_obj(*obj); if (!obj) return ERR_PTR(-ENOMEM); /* * vgem doesn't have any begin/end cpu access ioctls, therefore must use * coherent memory or dma-buf sharing just wont work. */ obj->map_wc = true; return &obj->base; } static const struct drm_driver vgem_driver = { .driver_features = DRIVER_GEM | DRIVER_RENDER, .open = vgem_open, .postclose = vgem_postclose, .ioctls = vgem_ioctls, .num_ioctls = ARRAY_SIZE(vgem_ioctls), .fops = &vgem_driver_fops, DRM_GEM_SHMEM_DRIVER_OPS, .gem_create_object = vgem_gem_create_object, .name = DRIVER_NAME, .desc = DRIVER_DESC, .major = DRIVER_MAJOR, .minor = DRIVER_MINOR, }; static int __init vgem_init(void) { int ret; struct faux_device *fdev; fdev = faux_device_create("vgem", NULL, NULL); if (!fdev) return -ENODEV; if (!devres_open_group(&fdev->dev, NULL, GFP_KERNEL)) { ret = -ENOMEM; goto out_unregister; } dma_coerce_mask_and_coherent(&fdev->dev, DMA_BIT_MASK(64)); vgem_device = devm_drm_dev_alloc(&fdev->dev, &vgem_driver, struct vgem_device, drm); if (IS_ERR(vgem_device)) { ret = PTR_ERR(vgem_device); goto out_devres; } vgem_device->faux_dev = fdev; /* Final step: expose the device/driver to userspace */ ret = drm_dev_register(&vgem_device->drm, 0); if (ret) goto out_devres; return 0; out_devres: devres_release_group(&fdev->dev, NULL); out_unregister: faux_device_destroy(fdev); return ret; } static void __exit vgem_exit(void) { struct faux_device *fdev = vgem_device->faux_dev; drm_dev_unregister(&vgem_device->drm); devres_release_group(&fdev->dev, NULL); faux_device_destroy(fdev); } module_init(vgem_init); module_exit(vgem_exit); MODULE_AUTHOR("Red Hat, Inc."); MODULE_AUTHOR("Intel Corporation"); MODULE_DESCRIPTION(DRIVER_DESC); MODULE_LICENSE("GPL and additional rights");
8 8 9 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 // SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (c) International Business Machines Corp., 2006 * * Author: Artem Bityutskiy (Битюцкий Артём) */ /* Here we keep miscellaneous functions which are used all over the UBI code */ #include "ubi.h" /** * ubi_calc_data_len - calculate how much real data is stored in a buffer. * @ubi: UBI device description object * @buf: a buffer with the contents of the physical eraseblock * @length: the buffer length * * This function calculates how much "real data" is stored in @buf and returnes * the length. Continuous 0xFF bytes at the end of the buffer are not * considered as "real data". */ int ubi_calc_data_len(const struct ubi_device *ubi, const void *buf, int length) { int i; ubi_assert(!(length & (ubi->min_io_size - 1))); for (i = length - 1; i >= 0; i--) if (((const uint8_t *)buf)[i] != 0xFF) break; /* The resulting length must be aligned to the minimum flash I/O size */ length = ALIGN(i + 1, ubi->min_io_size); return length; } /** * ubi_check_volume - check the contents of a static volume. * @ubi: UBI device description object * @vol_id: ID of the volume to check * * This function checks if static volume @vol_id is corrupted by fully reading * it and checking data CRC. This function returns %0 if the volume is not * corrupted, %1 if it is corrupted and a negative error code in case of * failure. Dynamic volumes are not checked and zero is returned immediately. */ int ubi_check_volume(struct ubi_device *ubi, int vol_id) { void *buf; int err = 0, i; struct ubi_volume *vol = ubi->volumes[vol_id]; if (vol->vol_type != UBI_STATIC_VOLUME) return 0; buf = vmalloc(vol->usable_leb_size); if (!buf) return -ENOMEM; for (i = 0; i < vol->used_ebs; i++) { int size; cond_resched(); if (i == vol->used_ebs - 1) size = vol->last_eb_bytes; else size = vol->usable_leb_size; err = ubi_eba_read_leb(ubi, vol, i, buf, 0, size, 1); if (err) { if (mtd_is_eccerr(err)) err = 1; break; } } vfree(buf); return err; } /** * ubi_update_reserved - update bad eraseblock handling accounting data. * @ubi: UBI device description object * * This function calculates the gap between current number of PEBs reserved for * bad eraseblock handling and the required level of PEBs that must be * reserved, and if necessary, reserves more PEBs to fill that gap, according * to availability. Should be called with ubi->volumes_lock held. */ void ubi_update_reserved(struct ubi_device *ubi) { int need = ubi->beb_rsvd_level - ubi->beb_rsvd_pebs; if (need <= 0 || ubi->avail_pebs == 0) return; need = min_t(int, need, ubi->avail_pebs); ubi->avail_pebs -= need; ubi->rsvd_pebs += need; ubi->beb_rsvd_pebs += need; ubi_msg(ubi, "reserved more %d PEBs for bad PEB handling", need); } /** * ubi_calculate_reserved - calculate how many PEBs must be reserved for bad * eraseblock handling. * @ubi: UBI device description object */ void ubi_calculate_reserved(struct ubi_device *ubi) { /* * Calculate the actual number of PEBs currently needed to be reserved * for future bad eraseblock handling. */ ubi->beb_rsvd_level = ubi->bad_peb_limit - ubi->bad_peb_count; if (ubi->beb_rsvd_level < 0) { ubi->beb_rsvd_level = 0; ubi_warn(ubi, "number of bad PEBs (%d) is above the expected limit (%d), not reserving any PEBs for bad PEB handling, will use available PEBs (if any)", ubi->bad_peb_count, ubi->bad_peb_limit); } } /** * ubi_check_pattern - check if buffer contains only a certain byte pattern. * @buf: buffer to check * @patt: the pattern to check * @size: buffer size in bytes * * This function returns %1 in there are only @patt bytes in @buf, and %0 if * something else was also found. */ int ubi_check_pattern(const void *buf, uint8_t patt, int size) { int i; for (i = 0; i < size; i++) if (((const uint8_t *)buf)[i] != patt) return 0; return 1; } /* Normal UBI messages */ void ubi_msg(const struct ubi_device *ubi, const char *fmt, ...) { struct va_format vaf; va_list args; va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; pr_notice(UBI_NAME_STR "%d: %pV\n", ubi->ubi_num, &vaf); va_end(args); } /* UBI warning messages */ void ubi_warn(const struct ubi_device *ubi, const char *fmt, ...) { struct va_format vaf; va_list args; va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; pr_warn(UBI_NAME_STR "%d warning: %ps: %pV\n", ubi->ubi_num, __builtin_return_address(0), &vaf); va_end(args); } /* UBI error messages */ void ubi_err(const struct ubi_device *ubi, const char *fmt, ...) { struct va_format vaf; va_list args; va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; pr_err(UBI_NAME_STR "%d error: %ps: %pV\n", ubi->ubi_num, __builtin_return_address(0), &vaf); va_end(args); }
10 10 1 10 10 10 10 6 4 10 10 10 10 10 10 10 10 18 2 16 11 11 10 2 2 2 4 4 4 3 7 7 5 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 // SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc. * All Rights Reserved. */ #include "xfs_platform.h" #include "xfs_fs.h" #include "xfs_shared.h" #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_mount.h" #include "xfs_inode.h" #include "xfs_btree.h" #include "xfs_ialloc.h" #include "xfs_ialloc_btree.h" #include "xfs_iwalk.h" #include "xfs_itable.h" #include "xfs_error.h" #include "xfs_icache.h" #include "xfs_health.h" #include "xfs_trans.h" /* * Bulk Stat * ========= * * Use the inode walking functions to fill out struct xfs_bulkstat for every * allocated inode, then pass the stat information to some externally provided * iteration function. */ struct xfs_bstat_chunk { bulkstat_one_fmt_pf formatter; struct xfs_ibulk *breq; struct xfs_bulkstat *buf; }; static inline bool want_metadir_file( struct xfs_inode *ip, struct xfs_ibulk *breq) { return xfs_is_metadir_inode(ip) && (breq->flags & XFS_IBULK_METADIR); } /* * Fill out the bulkstat info for a single inode and report it somewhere. * * bc->breq->lastino is effectively the inode cursor as we walk through the * filesystem. Therefore, we update it any time we need to move the cursor * forward, regardless of whether or not we're sending any bstat information * back to userspace. If the inode is internal metadata or, has been freed * out from under us, we just simply keep going. * * However, if any other type of error happens we want to stop right where we * are so that userspace will call back with exact number of the bad inode and * we can send back an error code. * * Note that if the formatter tells us there's no space left in the buffer we * move the cursor forward and abort the walk. */ STATIC int xfs_bulkstat_one_int( struct xfs_mount *mp, struct mnt_idmap *idmap, struct xfs_trans *tp, xfs_ino_t ino, struct xfs_bstat_chunk *bc) { struct user_namespace *sb_userns = mp->m_super->s_user_ns; struct xfs_inode *ip; /* incore inode pointer */ struct inode *inode; struct xfs_bulkstat *buf = bc->buf; xfs_extnum_t nextents; int error = -EINVAL; vfsuid_t vfsuid; vfsgid_t vfsgid; error = xfs_iget(mp, tp, ino, (XFS_IGET_DONTCACHE | XFS_IGET_UNTRUSTED), XFS_ILOCK_SHARED, &ip); if (error == -ENOENT || error == -EINVAL) goto out_advance; if (error) goto out; /* Reload the incore unlinked list to avoid failure in inodegc. */ if (xfs_inode_unlinked_incomplete(ip)) { error = xfs_inode_reload_unlinked_bucket(tp, ip); if (error) { xfs_iunlock(ip, XFS_ILOCK_SHARED); xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); xfs_irele(ip); return error; } } ASSERT(ip != NULL); ASSERT(ip->i_imap.im_blkno != 0); inode = VFS_I(ip); vfsuid = i_uid_into_vfsuid(idmap, inode); vfsgid = i_gid_into_vfsgid(idmap, inode); /* * If caller wants files from the metadata directories, push out the * bare minimum information for enabling scrub. */ if (want_metadir_file(ip, bc->breq)) { memset(buf, 0, sizeof(*buf)); buf->bs_ino = ino; buf->bs_gen = inode->i_generation; buf->bs_mode = inode->i_mode & S_IFMT; xfs_bulkstat_health(ip, buf); buf->bs_version = XFS_BULKSTAT_VERSION_V5; xfs_iunlock(ip, XFS_ILOCK_SHARED); xfs_irele(ip); error = bc->formatter(bc->breq, buf); if (!error || error == -ECANCELED) goto out_advance; goto out; } /* If this is a private inode, don't leak its details to userspace. */ if (IS_PRIVATE(inode) || xfs_is_sb_inum(mp, ino)) { xfs_iunlock(ip, XFS_ILOCK_SHARED); xfs_irele(ip); error = -EINVAL; goto out_advance; } /* xfs_iget returns the following without needing * further change. */ buf->bs_projectid = ip->i_projid; buf->bs_ino = ino; buf->bs_uid = from_kuid(sb_userns, vfsuid_into_kuid(vfsuid)); buf->bs_gid = from_kgid(sb_userns, vfsgid_into_kgid(vfsgid)); buf->bs_size = ip->i_disk_size; buf->bs_nlink = inode->i_nlink; buf->bs_atime = inode_get_atime_sec(inode); buf->bs_atime_nsec = inode_get_atime_nsec(inode); buf->bs_mtime = inode_get_mtime_sec(inode); buf->bs_mtime_nsec = inode_get_mtime_nsec(inode); buf->bs_ctime = inode_get_ctime_sec(inode); buf->bs_ctime_nsec = inode_get_ctime_nsec(inode); buf->bs_gen = inode->i_generation; buf->bs_mode = inode->i_mode; buf->bs_xflags = xfs_ip2xflags(ip); buf->bs_extsize_blks = ip->i_extsize; nextents = xfs_ifork_nextents(&ip->i_df); if (!(bc->breq->flags & XFS_IBULK_NREXT64)) buf->bs_extents = min(nextents, XFS_MAX_EXTCNT_DATA_FORK_SMALL); else buf->bs_extents64 = nextents; xfs_bulkstat_health(ip, buf); buf->bs_aextents = xfs_ifork_nextents(&ip->i_af); buf->bs_forkoff = xfs_inode_fork_boff(ip); buf->bs_version = XFS_BULKSTAT_VERSION_V5; if (xfs_has_v3inodes(mp)) { buf->bs_btime = ip->i_crtime.tv_sec; buf->bs_btime_nsec = ip->i_crtime.tv_nsec; if (ip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE) buf->bs_cowextsize_blks = ip->i_cowextsize; } switch (ip->i_df.if_format) { case XFS_DINODE_FMT_DEV: buf->bs_rdev = sysv_encode_dev(inode->i_rdev); buf->bs_blksize = BLKDEV_IOSIZE; buf->bs_blocks = 0; break; case XFS_DINODE_FMT_LOCAL: buf->bs_rdev = 0; buf->bs_blksize = mp->m_sb.sb_blocksize; buf->bs_blocks = 0; break; case XFS_DINODE_FMT_EXTENTS: case XFS_DINODE_FMT_BTREE: buf->bs_rdev = 0; buf->bs_blksize = mp->m_sb.sb_blocksize; buf->bs_blocks = ip->i_nblocks + ip->i_delayed_blks; break; } xfs_iunlock(ip, XFS_ILOCK_SHARED); xfs_irele(ip); error = bc->formatter(bc->breq, buf); if (error == -ECANCELED) goto out_advance; if (error) goto out; out_advance: /* * Advance the cursor to the inode that comes after the one we just * looked at. We want the caller to move along if the bulkstat * information was copied successfully; if we tried to grab the inode * but it's no longer allocated; or if it's internal metadata. */ bc->breq->startino = ino + 1; out: return error; } /* Bulkstat a single inode. */ int xfs_bulkstat_one( struct xfs_ibulk *breq, bulkstat_one_fmt_pf formatter) { struct xfs_bstat_chunk bc = { .formatter = formatter, .breq = breq, }; struct xfs_trans *tp; int error; if (breq->idmap != &nop_mnt_idmap) { xfs_warn_ratelimited(breq->mp, "bulkstat not supported inside of idmapped mounts."); return -EINVAL; } ASSERT(breq->icount == 1); bc.buf = kzalloc_obj(struct xfs_bulkstat, GFP_KERNEL | __GFP_RETRY_MAYFAIL); if (!bc.buf) return -ENOMEM; /* * Grab an empty transaction so that we can use its recursive buffer * locking abilities to detect cycles in the inobt without deadlocking. */ tp = xfs_trans_alloc_empty(breq->mp); error = xfs_bulkstat_one_int(breq->mp, breq->idmap, tp, breq->startino, &bc); xfs_trans_cancel(tp); kfree(bc.buf); /* * If we reported one inode to userspace then we abort because we hit * the end of the buffer. Don't leak that back to userspace. */ if (error == -ECANCELED) error = 0; return error; } static int xfs_bulkstat_iwalk( struct xfs_mount *mp, struct xfs_trans *tp, xfs_ino_t ino, void *data) { struct xfs_bstat_chunk *bc = data; int error; error = xfs_bulkstat_one_int(mp, bc->breq->idmap, tp, ino, data); /* bulkstat just skips over missing inodes */ if (error == -ENOENT || error == -EINVAL) return 0; return error; } /* * Check the incoming lastino parameter. * * We allow any inode value that could map to physical space inside the * filesystem because if there are no inodes there, bulkstat moves on to the * next chunk. In other words, the magic agino value of zero takes us to the * first chunk in the AG, and an agino value past the end of the AG takes us to * the first chunk in the next AG. * * Therefore we can end early if the requested inode is beyond the end of the * filesystem or doesn't map properly. */ static inline bool xfs_bulkstat_already_done( struct xfs_mount *mp, xfs_ino_t startino) { xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, startino); xfs_agino_t agino = XFS_INO_TO_AGINO(mp, startino); return agno >= mp->m_sb.sb_agcount || startino != XFS_AGINO_TO_INO(mp, agno, agino); } /* Return stat information in bulk (by-inode) for the filesystem. */ int xfs_bulkstat( struct xfs_ibulk *breq, bulkstat_one_fmt_pf formatter) { struct xfs_bstat_chunk bc = { .formatter = formatter, .breq = breq, }; struct xfs_trans *tp; int error; if (breq->idmap != &nop_mnt_idmap) { xfs_warn_ratelimited(breq->mp, "bulkstat not supported inside of idmapped mounts."); return -EINVAL; } if (xfs_bulkstat_already_done(breq->mp, breq->startino)) return 0; bc.buf = kzalloc_obj(struct xfs_bulkstat, GFP_KERNEL | __GFP_RETRY_MAYFAIL); if (!bc.buf) return -ENOMEM; /* * Grab an empty transaction so that we can use its recursive buffer * locking abilities to detect cycles in the inobt without deadlocking. */ tp = xfs_trans_alloc_empty(breq->mp); error = xfs_iwalk(breq->mp, tp, breq->startino, breq->iwalk_flags, xfs_bulkstat_iwalk, breq->icount, &bc); xfs_trans_cancel(tp); kfree(bc.buf); /* * We found some inodes, so clear the error status and return them. * The lastino pointer will point directly at the inode that triggered * any error that occurred, so on the next call the error will be * triggered again and propagated to userspace as there will be no * formatted inodes in the buffer. */ if (breq->ocount > 0) error = 0; return error; } /* Convert bulkstat (v5) to bstat (v1). */ void xfs_bulkstat_to_bstat( struct xfs_mount *mp, struct xfs_bstat *bs1, const struct xfs_bulkstat *bstat) { /* memset is needed here because of padding holes in the structure. */ memset(bs1, 0, sizeof(struct xfs_bstat)); bs1->bs_ino = bstat->bs_ino; bs1->bs_mode = bstat->bs_mode; bs1->bs_nlink = bstat->bs_nlink; bs1->bs_uid = bstat->bs_uid; bs1->bs_gid = bstat->bs_gid; bs1->bs_rdev = bstat->bs_rdev; bs1->bs_blksize = bstat->bs_blksize; bs1->bs_size = bstat->bs_size; bs1->bs_atime.tv_sec = bstat->bs_atime; bs1->bs_mtime.tv_sec = bstat->bs_mtime; bs1->bs_ctime.tv_sec = bstat->bs_ctime; bs1->bs_atime.tv_nsec = bstat->bs_atime_nsec; bs1->bs_mtime.tv_nsec = bstat->bs_mtime_nsec; bs1->bs_ctime.tv_nsec = bstat->bs_ctime_nsec; bs1->bs_blocks = bstat->bs_blocks; bs1->bs_xflags = bstat->bs_xflags; bs1->bs_extsize = XFS_FSB_TO_B(mp, bstat->bs_extsize_blks); bs1->bs_extents = bstat->bs_extents; bs1->bs_gen = bstat->bs_gen; bs1->bs_projid_lo = bstat->bs_projectid & 0xFFFF; bs1->bs_forkoff = bstat->bs_forkoff; bs1->bs_projid_hi = bstat->bs_projectid >> 16; bs1->bs_sick = bstat->bs_sick; bs1->bs_checked = bstat->bs_checked; bs1->bs_cowextsize = XFS_FSB_TO_B(mp, bstat->bs_cowextsize_blks); bs1->bs_dmevmask = 0; bs1->bs_dmstate = 0; bs1->bs_aextents = bstat->bs_aextents; } struct xfs_inumbers_chunk { inumbers_fmt_pf formatter; struct xfs_ibulk *breq; }; /* * INUMBERS * ======== * This is how we export inode btree records to userspace, so that XFS tools * can figure out where inodes are allocated. */ /* * Format the inode group structure and report it somewhere. * * Similar to xfs_bulkstat_one_int, lastino is the inode cursor as we walk * through the filesystem so we move it forward unless there was a runtime * error. If the formatter tells us the buffer is now full we also move the * cursor forward and abort the walk. */ STATIC int xfs_inumbers_walk( struct xfs_mount *mp, struct xfs_trans *tp, xfs_agnumber_t agno, const struct xfs_inobt_rec_incore *irec, void *data) { struct xfs_inumbers inogrp = { .xi_startino = XFS_AGINO_TO_INO(mp, agno, irec->ir_startino), .xi_alloccount = irec->ir_count - irec->ir_freecount, .xi_allocmask = ~irec->ir_free, .xi_version = XFS_INUMBERS_VERSION_V5, }; struct xfs_inumbers_chunk *ic = data; int error; error = ic->formatter(ic->breq, &inogrp); if (error && error != -ECANCELED) return error; ic->breq->startino = XFS_AGINO_TO_INO(mp, agno, irec->ir_startino) + XFS_INODES_PER_CHUNK; return error; } /* * Return inode number table for the filesystem. */ int xfs_inumbers( struct xfs_ibulk *breq, inumbers_fmt_pf formatter) { struct xfs_inumbers_chunk ic = { .formatter = formatter, .breq = breq, }; struct xfs_trans *tp; int error = 0; if (xfs_bulkstat_already_done(breq->mp, breq->startino)) return 0; /* * Grab an empty transaction so that we can use its recursive buffer * locking abilities to detect cycles in the inobt without deadlocking. */ tp = xfs_trans_alloc_empty(breq->mp); error = xfs_inobt_walk(breq->mp, tp, breq->startino, breq->iwalk_flags, xfs_inumbers_walk, breq->icount, &ic); xfs_trans_cancel(tp); /* * We found some inode groups, so clear the error status and return * them. The lastino pointer will point directly at the inode that * triggered any error that occurred, so on the next call the error * will be triggered again and propagated to userspace as there will be * no formatted inode groups in the buffer. */ if (breq->ocount > 0) error = 0; return error; } /* Convert an inumbers (v5) struct to a inogrp (v1) struct. */ void xfs_inumbers_to_inogrp( struct xfs_inogrp *ig1, const struct xfs_inumbers *ig) { /* memset is needed here because of padding holes in the structure. */ memset(ig1, 0, sizeof(struct xfs_inogrp)); ig1->xi_startino = ig->xi_startino; ig1->xi_alloccount = ig->xi_alloccount; ig1->xi_allocmask = ig->xi_allocmask; }
4 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* FS-Cache tracepoints * * Copyright (C) 2021 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ #undef TRACE_SYSTEM #define TRACE_SYSTEM fscache #if !defined(_TRACE_FSCACHE_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_FSCACHE_H #include <linux/fscache.h> #include <linux/tracepoint.h> /* * Define enums for tracing information. */ #ifndef __FSCACHE_DECLARE_TRACE_ENUMS_ONCE_ONLY #define __FSCACHE_DECLARE_TRACE_ENUMS_ONCE_ONLY enum fscache_cache_trace { fscache_cache_collision, fscache_cache_get_acquire, fscache_cache_new_acquire, fscache_cache_put_alloc_volume, fscache_cache_put_cache, fscache_cache_put_prep_failed, fscache_cache_put_relinquish, fscache_cache_put_volume, }; enum fscache_volume_trace { fscache_volume_collision, fscache_volume_get_cookie, fscache_volume_get_create_work, fscache_volume_get_hash_collision, fscache_volume_get_withdraw, fscache_volume_free, fscache_volume_new_acquire, fscache_volume_put_cookie, fscache_volume_put_create_work, fscache_volume_put_hash_collision, fscache_volume_put_relinquish, fscache_volume_put_withdraw, fscache_volume_see_create_work, fscache_volume_see_hash_wake, fscache_volume_wait_create_work, }; enum fscache_cookie_trace { fscache_cookie_collision, fscache_cookie_discard, fscache_cookie_failed, fscache_cookie_get_attach_object, fscache_cookie_get_end_access, fscache_cookie_get_hash_collision, fscache_cookie_get_inval_work, fscache_cookie_get_lru, fscache_cookie_get_use_work, fscache_cookie_new_acquire, fscache_cookie_put_hash_collision, fscache_cookie_put_lru, fscache_cookie_put_object, fscache_cookie_put_over_queued, fscache_cookie_put_relinquish, fscache_cookie_put_withdrawn, fscache_cookie_put_work, fscache_cookie_see_active, fscache_cookie_see_lru_discard, fscache_cookie_see_lru_discard_clear, fscache_cookie_see_lru_do_one, fscache_cookie_see_relinquish, fscache_cookie_see_withdraw, fscache_cookie_see_work, }; enum fscache_active_trace { fscache_active_use, fscache_active_use_modify, fscache_active_unuse, }; enum fscache_access_trace { fscache_access_acquire_volume, fscache_access_acquire_volume_end, fscache_access_cache_pin, fscache_access_cache_unpin, fscache_access_invalidate_cookie, fscache_access_invalidate_cookie_end, fscache_access_io_end, fscache_access_io_not_live, fscache_access_io_read, fscache_access_io_resize, fscache_access_io_wait, fscache_access_io_write, fscache_access_lookup_cookie, fscache_access_lookup_cookie_end, fscache_access_lookup_cookie_end_failed, fscache_access_relinquish_volume, fscache_access_relinquish_volume_end, fscache_access_unlive, }; #endif /* * Declare tracing information enums and their string mappings for display. */ #define fscache_cache_traces \ EM(fscache_cache_collision, "*COLLIDE*") \ EM(fscache_cache_get_acquire, "GET acq ") \ EM(fscache_cache_new_acquire, "NEW acq ") \ EM(fscache_cache_put_alloc_volume, "PUT alvol") \ EM(fscache_cache_put_cache, "PUT cache") \ EM(fscache_cache_put_prep_failed, "PUT pfail") \ EM(fscache_cache_put_relinquish, "PUT relnq") \ E_(fscache_cache_put_volume, "PUT vol ") #define fscache_volume_traces \ EM(fscache_volume_collision, "*COLLIDE*") \ EM(fscache_volume_get_cookie, "GET cook ") \ EM(fscache_volume_get_create_work, "GET creat") \ EM(fscache_volume_get_hash_collision, "GET hcoll") \ EM(fscache_volume_get_withdraw, "GET withd") \ EM(fscache_volume_free, "FREE ") \ EM(fscache_volume_new_acquire, "NEW acq ") \ EM(fscache_volume_put_cookie, "PUT cook ") \ EM(fscache_volume_put_create_work, "PUT creat") \ EM(fscache_volume_put_hash_collision, "PUT hcoll") \ EM(fscache_volume_put_relinquish, "PUT relnq") \ EM(fscache_volume_put_withdraw, "PUT withd") \ EM(fscache_volume_see_create_work, "SEE creat") \ EM(fscache_volume_see_hash_wake, "SEE hwake") \ E_(fscache_volume_wait_create_work, "WAIT crea") #define fscache_cookie_traces \ EM(fscache_cookie_collision, "*COLLIDE*") \ EM(fscache_cookie_discard, "DISCARD ") \ EM(fscache_cookie_failed, "FAILED ") \ EM(fscache_cookie_get_attach_object, "GET attch") \ EM(fscache_cookie_get_hash_collision, "GET hcoll") \ EM(fscache_cookie_get_end_access, "GQ endac") \ EM(fscache_cookie_get_inval_work, "GQ inval") \ EM(fscache_cookie_get_lru, "GET lru ") \ EM(fscache_cookie_get_use_work, "GQ use ") \ EM(fscache_cookie_new_acquire, "NEW acq ") \ EM(fscache_cookie_put_hash_collision, "PUT hcoll") \ EM(fscache_cookie_put_lru, "PUT lru ") \ EM(fscache_cookie_put_object, "PUT obj ") \ EM(fscache_cookie_put_over_queued, "PQ overq") \ EM(fscache_cookie_put_relinquish, "PUT relnq") \ EM(fscache_cookie_put_withdrawn, "PUT wthdn") \ EM(fscache_cookie_put_work, "PQ work ") \ EM(fscache_cookie_see_active, "- activ") \ EM(fscache_cookie_see_lru_discard, "- x-lru") \ EM(fscache_cookie_see_lru_discard_clear,"- lrudc") \ EM(fscache_cookie_see_lru_do_one, "- lrudo") \ EM(fscache_cookie_see_relinquish, "- x-rlq") \ EM(fscache_cookie_see_withdraw, "- x-wth") \ E_(fscache_cookie_see_work, "- work ") #define fscache_active_traces \ EM(fscache_active_use, "USE ") \ EM(fscache_active_use_modify, "USE-m ") \ E_(fscache_active_unuse, "UNUSE ") #define fscache_access_traces \ EM(fscache_access_acquire_volume, "BEGIN acq_vol") \ EM(fscache_access_acquire_volume_end, "END acq_vol") \ EM(fscache_access_cache_pin, "PIN cache ") \ EM(fscache_access_cache_unpin, "UNPIN cache ") \ EM(fscache_access_invalidate_cookie, "BEGIN inval ") \ EM(fscache_access_invalidate_cookie_end,"END inval ") \ EM(fscache_access_io_end, "END io ") \ EM(fscache_access_io_not_live, "END io_notl") \ EM(fscache_access_io_read, "BEGIN io_read") \ EM(fscache_access_io_resize, "BEGIN io_resz") \ EM(fscache_access_io_wait, "WAIT io ") \ EM(fscache_access_io_write, "BEGIN io_writ") \ EM(fscache_access_lookup_cookie, "BEGIN lookup ") \ EM(fscache_access_lookup_cookie_end, "END lookup ") \ EM(fscache_access_lookup_cookie_end_failed,"END lookupf") \ EM(fscache_access_relinquish_volume, "BEGIN rlq_vol") \ EM(fscache_access_relinquish_volume_end,"END rlq_vol") \ E_(fscache_access_unlive, "END unlive ") /* * Export enum symbols via userspace. */ #undef EM #undef E_ #define EM(a, b) TRACE_DEFINE_ENUM(a); #define E_(a, b) TRACE_DEFINE_ENUM(a); fscache_cache_traces; fscache_volume_traces; fscache_cookie_traces; fscache_access_traces; /* * Now redefine the EM() and E_() macros to map the enums to the strings that * will be printed in the output. */ #undef EM #undef E_ #define EM(a, b) { a, b }, #define E_(a, b) { a, b } TRACE_EVENT(fscache_cache, TP_PROTO(unsigned int cache_debug_id, int usage, enum fscache_cache_trace where), TP_ARGS(cache_debug_id, usage, where), TP_STRUCT__entry( __field(unsigned int, cache ) __field(int, usage ) __field(enum fscache_cache_trace, where ) ), TP_fast_assign( __entry->cache = cache_debug_id; __entry->usage = usage; __entry->where = where; ), TP_printk("C=%08x %s r=%d", __entry->cache, __print_symbolic(__entry->where, fscache_cache_traces), __entry->usage) ); TRACE_EVENT(fscache_volume, TP_PROTO(unsigned int volume_debug_id, int usage, enum fscache_volume_trace where), TP_ARGS(volume_debug_id, usage, where), TP_STRUCT__entry( __field(unsigned int, volume ) __field(int, usage ) __field(enum fscache_volume_trace, where ) ), TP_fast_assign( __entry->volume = volume_debug_id; __entry->usage = usage; __entry->where = where; ), TP_printk("V=%08x %s u=%d", __entry->volume, __print_symbolic(__entry->where, fscache_volume_traces), __entry->usage) ); TRACE_EVENT(fscache_cookie, TP_PROTO(unsigned int cookie_debug_id, int ref, enum fscache_cookie_trace where), TP_ARGS(cookie_debug_id, ref, where), TP_STRUCT__entry( __field(unsigned int, cookie ) __field(int, ref ) __field(enum fscache_cookie_trace, where ) ), TP_fast_assign( __entry->cookie = cookie_debug_id; __entry->ref = ref; __entry->where = where; ), TP_printk("c=%08x %s r=%d", __entry->cookie, __print_symbolic(__entry->where, fscache_cookie_traces), __entry->ref) ); TRACE_EVENT(fscache_active, TP_PROTO(unsigned int cookie_debug_id, int ref, int n_active, int n_accesses, enum fscache_active_trace why), TP_ARGS(cookie_debug_id, ref, n_active, n_accesses, why), TP_STRUCT__entry( __field(unsigned int, cookie ) __field(int, ref ) __field(int, n_active ) __field(int, n_accesses ) __field(enum fscache_active_trace, why ) ), TP_fast_assign( __entry->cookie = cookie_debug_id; __entry->ref = ref; __entry->n_active = n_active; __entry->n_accesses = n_accesses; __entry->why = why; ), TP_printk("c=%08x %s r=%d a=%d c=%d", __entry->cookie, __print_symbolic(__entry->why, fscache_active_traces), __entry->ref, __entry->n_accesses, __entry->n_active) ); TRACE_EVENT(fscache_access_cache, TP_PROTO(unsigned int cache_debug_id, int ref, int n_accesses, enum fscache_access_trace why), TP_ARGS(cache_debug_id, ref, n_accesses, why), TP_STRUCT__entry( __field(unsigned int, cache ) __field(int, ref ) __field(int, n_accesses ) __field(enum fscache_access_trace, why ) ), TP_fast_assign( __entry->cache = cache_debug_id; __entry->ref = ref; __entry->n_accesses = n_accesses; __entry->why = why; ), TP_printk("C=%08x %s r=%d a=%d", __entry->cache, __print_symbolic(__entry->why, fscache_access_traces), __entry->ref, __entry->n_accesses) ); TRACE_EVENT(fscache_access_volume, TP_PROTO(unsigned int volume_debug_id, unsigned int cookie_debug_id, int ref, int n_accesses, enum fscache_access_trace why), TP_ARGS(volume_debug_id, cookie_debug_id, ref, n_accesses, why), TP_STRUCT__entry( __field(unsigned int, volume ) __field(unsigned int, cookie ) __field(int, ref ) __field(int, n_accesses ) __field(enum fscache_access_trace, why ) ), TP_fast_assign( __entry->volume = volume_debug_id; __entry->cookie = cookie_debug_id; __entry->ref = ref; __entry->n_accesses = n_accesses; __entry->why = why; ), TP_printk("V=%08x c=%08x %s r=%d a=%d", __entry->volume, __entry->cookie, __print_symbolic(__entry->why, fscache_access_traces), __entry->ref, __entry->n_accesses) ); TRACE_EVENT(fscache_access, TP_PROTO(unsigned int cookie_debug_id, int ref, int n_accesses, enum fscache_access_trace why), TP_ARGS(cookie_debug_id, ref, n_accesses, why), TP_STRUCT__entry( __field(unsigned int, cookie ) __field(int, ref ) __field(int, n_accesses ) __field(enum fscache_access_trace, why ) ), TP_fast_assign( __entry->cookie = cookie_debug_id; __entry->ref = ref; __entry->n_accesses = n_accesses; __entry->why = why; ), TP_printk("c=%08x %s r=%d a=%d", __entry->cookie, __print_symbolic(__entry->why, fscache_access_traces), __entry->ref, __entry->n_accesses) ); TRACE_EVENT(fscache_acquire, TP_PROTO(struct fscache_cookie *cookie), TP_ARGS(cookie), TP_STRUCT__entry( __field(unsigned int, cookie ) __field(unsigned int, volume ) __field(int, v_ref ) __field(int, v_n_cookies ) ), TP_fast_assign( __entry->cookie = cookie->debug_id; __entry->volume = cookie->volume->debug_id; __entry->v_ref = refcount_read(&cookie->volume->ref); __entry->v_n_cookies = atomic_read(&cookie->volume->n_cookies); ), TP_printk("c=%08x V=%08x vr=%d vc=%d", __entry->cookie, __entry->volume, __entry->v_ref, __entry->v_n_cookies) ); TRACE_EVENT(fscache_relinquish, TP_PROTO(struct fscache_cookie *cookie, bool retire), TP_ARGS(cookie, retire), TP_STRUCT__entry( __field(unsigned int, cookie ) __field(unsigned int, volume ) __field(int, ref ) __field(int, n_active ) __field(u8, flags ) __field(bool, retire ) ), TP_fast_assign( __entry->cookie = cookie->debug_id; __entry->volume = cookie->volume->debug_id; __entry->ref = refcount_read(&cookie->ref); __entry->n_active = atomic_read(&cookie->n_active); __entry->flags = cookie->flags; __entry->retire = retire; ), TP_printk("c=%08x V=%08x r=%d U=%d f=%02x rt=%u", __entry->cookie, __entry->volume, __entry->ref, __entry->n_active, __entry->flags, __entry->retire) ); TRACE_EVENT(fscache_invalidate, TP_PROTO(struct fscache_cookie *cookie, loff_t new_size), TP_ARGS(cookie, new_size), TP_STRUCT__entry( __field(unsigned int, cookie ) __field(loff_t, new_size ) ), TP_fast_assign( __entry->cookie = cookie->debug_id; __entry->new_size = new_size; ), TP_printk("c=%08x sz=%llx", __entry->cookie, __entry->new_size) ); TRACE_EVENT(fscache_resize, TP_PROTO(struct fscache_cookie *cookie, loff_t new_size), TP_ARGS(cookie, new_size), TP_STRUCT__entry( __field(unsigned int, cookie ) __field(loff_t, old_size ) __field(loff_t, new_size ) ), TP_fast_assign( __entry->cookie = cookie->debug_id; __entry->old_size = cookie->object_size; __entry->new_size = new_size; ), TP_printk("c=%08x os=%08llx sz=%08llx", __entry->cookie, __entry->old_size, __entry->new_size) ); #endif /* _TRACE_FSCACHE_H */ /* This part must be outside protection */ #include <trace/define_trace.h>
11 15 4 10 2 4 4 4 3 4 2 2 1 1 1 3 3 4 4 15 4 11 14 1 4 15 7 2 1 3 1 2 4 6 8 6 2 3 1 4 3 3 3 7 28 28 28 28 2 27 16 6 4 2 2 1 2 17 6 12 1 18 1 17 11 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 // SPDX-License-Identifier: GPL-2.0 #include <linux/fsverity.h> #include <linux/iomap.h> #include "ctree.h" #include "delalloc-space.h" #include "direct-io.h" #include "extent-tree.h" #include "file.h" #include "fs.h" #include "transaction.h" #include "volumes.h" #include "bio.h" #include "ordered-data.h" struct btrfs_dio_data { ssize_t submitted; struct extent_changeset *data_reserved; struct btrfs_ordered_extent *ordered; bool data_space_reserved; bool nocow_done; }; struct btrfs_dio_private { /* Range of I/O */ u64 file_offset; u32 bytes; /* This must be last */ struct btrfs_bio bbio; }; static struct bio_set btrfs_dio_bioset; static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend, struct extent_state **cached_state, unsigned int iomap_flags) { const bool writing = (iomap_flags & IOMAP_WRITE); const bool nowait = (iomap_flags & IOMAP_NOWAIT); struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; struct btrfs_ordered_extent *ordered; int ret = 0; /* Direct lock must be taken before the extent lock. */ if (nowait) { if (!btrfs_try_lock_dio_extent(io_tree, lockstart, lockend, cached_state)) return -EAGAIN; } else { btrfs_lock_dio_extent(io_tree, lockstart, lockend, cached_state); } while (1) { if (nowait) { if (!btrfs_try_lock_extent(io_tree, lockstart, lockend, cached_state)) { ret = -EAGAIN; break; } } else { btrfs_lock_extent(io_tree, lockstart, lockend, cached_state); } /* * We're concerned with the entire range that we're going to be * doing DIO to, so we need to make sure there's no ordered * extents in this range. */ ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), lockstart, lockend - lockstart + 1); /* * We need to make sure there are no buffered pages in this * range either, we could have raced between the invalidate in * generic_file_direct_write and locking the extent. The * invalidate needs to happen so that reads after a write do not * get stale data. */ if (!ordered && (!writing || !filemap_range_has_page(inode->i_mapping, lockstart, lockend))) break; btrfs_unlock_extent(io_tree, lockstart, lockend, cached_state); if (ordered) { if (nowait) { btrfs_put_ordered_extent(ordered); ret = -EAGAIN; break; } /* * If we are doing a DIO read and the ordered extent we * found is for a buffered write, we can not wait for it * to complete and retry, because if we do so we can * deadlock with concurrent buffered writes on page * locks. This happens only if our DIO read covers more * than one extent map, if at this point has already * created an ordered extent for a previous extent map * and locked its range in the inode's io tree, and a * concurrent write against that previous extent map's * range and this range started (we unlock the ranges * in the io tree only when the bios complete and * buffered writes always lock pages before attempting * to lock range in the io tree). */ if (writing || test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags)) btrfs_start_ordered_extent(ordered); else ret = -ENOTBLK; btrfs_put_ordered_extent(ordered); } else { /* * We could trigger writeback for this range (and wait * for it to complete) and then invalidate the pages for * this range (through invalidate_inode_pages2_range()), * but that can lead us to a deadlock with a concurrent * call to readahead (a buffered read or a defrag call * triggered a readahead) on a page lock due to an * ordered dio extent we created before but did not have * yet a corresponding bio submitted (whence it can not * complete), which makes readahead wait for that * ordered extent to complete while holding a lock on * that page. */ ret = nowait ? -EAGAIN : -ENOTBLK; } if (ret) break; cond_resched(); } if (ret) btrfs_unlock_dio_extent(io_tree, lockstart, lockend, cached_state); return ret; } static struct extent_map *btrfs_create_dio_extent(struct btrfs_inode *inode, struct btrfs_dio_data *dio_data, const u64 start, const struct btrfs_file_extent *file_extent, const int type) { struct extent_map *em = NULL; struct btrfs_ordered_extent *ordered; if (type != BTRFS_ORDERED_NOCOW) { em = btrfs_create_io_em(inode, start, file_extent, type); if (IS_ERR(em)) goto out; } ordered = btrfs_alloc_ordered_extent(inode, start, file_extent, (1U << type) | (1U << BTRFS_ORDERED_DIRECT)); if (IS_ERR(ordered)) { if (em) { btrfs_free_extent_map(em); btrfs_drop_extent_map_range(inode, start, start + file_extent->num_bytes - 1, false); } em = ERR_CAST(ordered); } else { ASSERT(!dio_data->ordered); dio_data->ordered = ordered; } out: return em; } static struct extent_map *btrfs_new_extent_direct(struct btrfs_inode *inode, struct btrfs_dio_data *dio_data, u64 start, u64 len) { struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_file_extent file_extent; struct extent_map *em; struct btrfs_key ins; u64 alloc_hint; int ret; alloc_hint = btrfs_get_extent_allocation_hint(inode, start, len); again: ret = btrfs_reserve_extent(root, len, len, fs_info->sectorsize, 0, alloc_hint, &ins, true, true); if (ret == -EAGAIN) { ASSERT(btrfs_is_zoned(fs_info)); wait_on_bit_io(&inode->root->fs_info->flags, BTRFS_FS_NEED_ZONE_FINISH, TASK_UNINTERRUPTIBLE); goto again; } if (ret) return ERR_PTR(ret); file_extent.disk_bytenr = ins.objectid; file_extent.disk_num_bytes = ins.offset; file_extent.num_bytes = ins.offset; file_extent.ram_bytes = ins.offset; file_extent.offset = 0; file_extent.compression = BTRFS_COMPRESS_NONE; em = btrfs_create_dio_extent(inode, dio_data, start, &file_extent, BTRFS_ORDERED_REGULAR); btrfs_dec_block_group_reservations(fs_info, ins.objectid); if (IS_ERR(em)) btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, true); return em; } static int btrfs_get_blocks_direct_write(struct extent_map **map, struct inode *inode, struct btrfs_dio_data *dio_data, u64 start, u64 *lenp, unsigned int iomap_flags) { const bool nowait = (iomap_flags & IOMAP_NOWAIT); struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct btrfs_file_extent file_extent; struct extent_map *em = *map; int type; u64 block_start; struct btrfs_block_group *bg; bool can_nocow = false; bool space_reserved = false; u64 len = *lenp; u64 prev_len; int ret = 0; /* * We don't allocate a new extent in the following cases * * 1) The inode is marked as NODATACOW. In this case we'll just use the * existing extent. * 2) The extent is marked as PREALLOC. We're good to go here and can * just use the extent. * */ if ((em->flags & EXTENT_FLAG_PREALLOC) || ((BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) && em->disk_bytenr != EXTENT_MAP_HOLE)) { if (em->flags & EXTENT_FLAG_PREALLOC) type = BTRFS_ORDERED_PREALLOC; else type = BTRFS_ORDERED_NOCOW; len = min(len, em->len - (start - em->start)); block_start = btrfs_extent_map_block_start(em) + (start - em->start); if (can_nocow_extent(BTRFS_I(inode), start, &len, &file_extent, false) == 1) { bg = btrfs_inc_nocow_writers(fs_info, block_start); if (bg) can_nocow = true; } } prev_len = len; if (can_nocow) { struct extent_map *em2; /* We can NOCOW, so only need to reserve metadata space. */ ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len, len, nowait); if (ret < 0) { /* Our caller expects us to free the input extent map. */ btrfs_free_extent_map(em); *map = NULL; btrfs_dec_nocow_writers(bg); if (nowait && (ret == -ENOSPC || ret == -EDQUOT)) ret = -EAGAIN; goto out; } space_reserved = true; em2 = btrfs_create_dio_extent(BTRFS_I(inode), dio_data, start, &file_extent, type); btrfs_dec_nocow_writers(bg); if (type == BTRFS_ORDERED_PREALLOC) { btrfs_free_extent_map(em); *map = em2; em = em2; } if (IS_ERR(em2)) { ret = PTR_ERR(em2); goto out; } dio_data->nocow_done = true; } else { /* Our caller expects us to free the input extent map. */ btrfs_free_extent_map(em); *map = NULL; if (nowait) { ret = -EAGAIN; goto out; } /* * If we could not allocate data space before locking the file * range and we can't do a NOCOW write, then we have to fail. */ if (!dio_data->data_space_reserved) { ret = -ENOSPC; goto out; } /* * We have to COW and we have already reserved data space before, * so now we reserve only metadata. */ ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len, len, false); if (ret < 0) goto out; space_reserved = true; em = btrfs_new_extent_direct(BTRFS_I(inode), dio_data, start, len); if (IS_ERR(em)) { ret = PTR_ERR(em); goto out; } *map = em; len = min(len, em->len - (start - em->start)); if (len < prev_len) btrfs_delalloc_release_metadata(BTRFS_I(inode), prev_len - len, true); } /* * We have created our ordered extent, so we can now release our reservation * for an outstanding extent. */ btrfs_delalloc_release_extents(BTRFS_I(inode), prev_len); /* * Need to update the i_size under the extent lock so buffered * readers will get the updated i_size when we unlock. */ if (start + len > i_size_read(inode)) i_size_write(inode, start + len); out: if (ret && space_reserved) { btrfs_delalloc_release_extents(BTRFS_I(inode), len); btrfs_delalloc_release_metadata(BTRFS_I(inode), len, true); } *lenp = len; return ret; } static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start, loff_t length, unsigned int flags, struct iomap *iomap, struct iomap *srcmap) { struct iomap_iter *iter = container_of(iomap, struct iomap_iter, iomap); struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct extent_map *em; struct extent_state *cached_state = NULL; struct btrfs_dio_data *dio_data = iter->private; u64 lockstart, lockend; const bool write = !!(flags & IOMAP_WRITE); int ret = 0; u64 len = length; const u64 data_alloc_len = length; u32 unlock_bits = EXTENT_LOCKED; /* * We could potentially fault if we have a buffer > PAGE_SIZE, and if * we're NOWAIT we may submit a bio for a partial range and return * EIOCBQUEUED, which would result in an errant short read. * * The best way to handle this would be to allow for partial completions * of iocb's, so we could submit the partial bio, return and fault in * the rest of the pages, and then submit the io for the rest of the * range. However we don't have that currently, so simply return * -EAGAIN at this point so that the normal path is used. */ if (!write && (flags & IOMAP_NOWAIT) && length > PAGE_SIZE) return -EAGAIN; /* * Cap the size of reads to that usually seen in buffered I/O as we need * to allocate a contiguous array for the checksums. */ if (!write) len = min_t(u64, len, fs_info->sectorsize * BIO_MAX_VECS); lockstart = start; lockend = start + len - 1; /* * iomap_dio_rw() only does filemap_write_and_wait_range(), which isn't * enough if we've written compressed pages to this area, so we need to * flush the dirty pages again to make absolutely sure that any * outstanding dirty pages are on disk - the first flush only starts * compression on the data, while keeping the pages locked, so by the * time the second flush returns we know bios for the compressed pages * were submitted and finished, and the pages no longer under writeback. * * If we have a NOWAIT request and we have any pages in the range that * are locked, likely due to compression still in progress, we don't want * to block on page locks. We also don't want to block on pages marked as * dirty or under writeback (same as for the non-compression case). * iomap_dio_rw() did the same check, but after that and before we got * here, mmap'ed writes may have happened or buffered reads started * (readpage() and readahead(), which lock pages), as we haven't locked * the file range yet. */ if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, &BTRFS_I(inode)->runtime_flags)) { if (flags & IOMAP_NOWAIT) { if (filemap_range_needs_writeback(inode->i_mapping, lockstart, lockend)) return -EAGAIN; } else { ret = filemap_fdatawrite_range(inode->i_mapping, start, start + length - 1); if (ret) return ret; } } memset(dio_data, 0, sizeof(*dio_data)); /* * We always try to allocate data space and must do it before locking * the file range, to avoid deadlocks with concurrent writes to the same * range if the range has several extents and the writes don't expand the * current i_size (the inode lock is taken in shared mode). If we fail to * allocate data space here we continue and later, after locking the * file range, we fail with ENOSPC only if we figure out we can not do a * NOCOW write. */ if (write && !(flags & IOMAP_NOWAIT)) { ret = btrfs_check_data_free_space(BTRFS_I(inode), &dio_data->data_reserved, start, data_alloc_len, false); if (!ret) dio_data->data_space_reserved = true; else if (!(BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC))) goto err; } /* * If this errors out it's because we couldn't invalidate pagecache for * this range and we need to fallback to buffered IO, or we are doing a * NOWAIT read/write and we need to block. */ ret = lock_extent_direct(inode, lockstart, lockend, &cached_state, flags); if (ret < 0) goto err; em = btrfs_get_extent(BTRFS_I(inode), NULL, start, len); if (IS_ERR(em)) { ret = PTR_ERR(em); goto unlock_err; } /* * Ok for INLINE and COMPRESSED extents we need to fallback on buffered * io. INLINE is special, and we could probably kludge it in here, but * it's still buffered so for safety lets just fall back to the generic * buffered path. * * For COMPRESSED we _have_ to read the entire extent in so we can * decompress it, so there will be buffering required no matter what we * do, so go ahead and fallback to buffered. * * We return -ENOTBLK because that's what makes DIO go ahead and go back * to buffered IO. Don't blame me, this is the price we pay for using * the generic code. */ if (btrfs_extent_map_is_compressed(em) || em->disk_bytenr == EXTENT_MAP_INLINE) { btrfs_free_extent_map(em); /* * If we are in a NOWAIT context, return -EAGAIN in order to * fallback to buffered IO. This is not only because we can * block with buffered IO (no support for NOWAIT semantics at * the moment) but also to avoid returning short reads to user * space - this happens if we were able to read some data from * previous non-compressed extents and then when we fallback to * buffered IO, at btrfs_file_read_iter() by calling * filemap_read(), we fail to fault in pages for the read buffer, * in which case filemap_read() returns a short read (the number * of bytes previously read is > 0, so it does not return -EFAULT). */ ret = (flags & IOMAP_NOWAIT) ? -EAGAIN : -ENOTBLK; goto unlock_err; } len = min(len, em->len - (start - em->start)); /* * If we have a NOWAIT request and the range contains multiple extents * (or a mix of extents and holes), then we return -EAGAIN to make the * caller fallback to a context where it can do a blocking (without * NOWAIT) request. This way we avoid doing partial IO and returning * success to the caller, which is not optimal for writes and for reads * it can result in unexpected behaviour for an application. * * When doing a read, because we use IOMAP_DIO_PARTIAL when calling * iomap_dio_rw(), we can end up returning less data then what the caller * asked for, resulting in an unexpected, and incorrect, short read. * That is, the caller asked to read N bytes and we return less than that, * which is wrong unless we are crossing EOF. This happens if we get a * page fault error when trying to fault in pages for the buffer that is * associated to the struct iov_iter passed to iomap_dio_rw(), and we * have previously submitted bios for other extents in the range, in * which case iomap_dio_rw() may return us EIOCBQUEUED if not all of * those bios have completed by the time we get the page fault error, * which we return back to our caller - we should only return EIOCBQUEUED * after we have submitted bios for all the extents in the range. */ if ((flags & IOMAP_NOWAIT) && len < length) { btrfs_free_extent_map(em); ret = -EAGAIN; goto unlock_err; } if (write) { ret = btrfs_get_blocks_direct_write(&em, inode, dio_data, start, &len, flags); if (ret < 0) goto unlock_err; /* Recalc len in case the new em is smaller than requested */ len = min(len, em->len - (start - em->start)); if (dio_data->data_space_reserved) { u64 release_offset; u64 release_len = 0; if (dio_data->nocow_done) { release_offset = start; release_len = data_alloc_len; } else if (len < data_alloc_len) { release_offset = start + len; release_len = data_alloc_len - len; } if (release_len > 0) btrfs_free_reserved_data_space(BTRFS_I(inode), dio_data->data_reserved, release_offset, release_len); } } /* * Translate extent map information to iomap. * We trim the extents (and move the addr) even though iomap code does * that, since we have locked only the parts we are performing I/O in. */ if ((em->disk_bytenr == EXTENT_MAP_HOLE) || ((em->flags & EXTENT_FLAG_PREALLOC) && !write)) { iomap->addr = IOMAP_NULL_ADDR; iomap->type = IOMAP_HOLE; } else { iomap->addr = btrfs_extent_map_block_start(em) + (start - em->start); iomap->type = IOMAP_MAPPED; } iomap->offset = start; iomap->bdev = fs_info->fs_devices->latest_dev->bdev; iomap->length = len; btrfs_free_extent_map(em); /* * Reads will hold the EXTENT_DIO_LOCKED bit until the io is completed, * writes only hold it for this part. We hold the extent lock until * we're completely done with the extent map to make sure it remains * valid. */ if (write) unlock_bits |= EXTENT_DIO_LOCKED; btrfs_clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend, unlock_bits, &cached_state); /* We didn't use everything, unlock the dio extent for the remainder. */ if (!write && (start + len) < lockend) btrfs_unlock_dio_extent(&BTRFS_I(inode)->io_tree, start + len, lockend, NULL); return 0; unlock_err: /* * Don't use EXTENT_LOCK_BITS here in case we extend it later and forget * to update this, be explicit that we expect EXTENT_LOCKED and * EXTENT_DIO_LOCKED to be set here, and so that's what we're clearing. */ btrfs_clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend, EXTENT_LOCKED | EXTENT_DIO_LOCKED, &cached_state); err: if (dio_data->data_space_reserved) { btrfs_free_reserved_data_space(BTRFS_I(inode), dio_data->data_reserved, start, data_alloc_len); extent_changeset_free(dio_data->data_reserved); } return ret; } static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length, ssize_t written, unsigned int flags, struct iomap *iomap) { struct iomap_iter *iter = container_of(iomap, struct iomap_iter, iomap); struct btrfs_dio_data *dio_data = iter->private; size_t submitted = dio_data->submitted; const bool write = !!(flags & IOMAP_WRITE); int ret = 0; if (!write && (iomap->type == IOMAP_HOLE)) { /* If reading from a hole, unlock and return */ btrfs_unlock_dio_extent(&BTRFS_I(inode)->io_tree, pos, pos + length - 1, NULL); return 0; } if (submitted < length) { pos += submitted; length -= submitted; if (write) btrfs_finish_ordered_extent(dio_data->ordered, pos, length, false); else btrfs_unlock_dio_extent(&BTRFS_I(inode)->io_tree, pos, pos + length - 1, NULL); ret = -ENOTBLK; } if (write) { btrfs_put_ordered_extent(dio_data->ordered); dio_data->ordered = NULL; } if (write) extent_changeset_free(dio_data->data_reserved); return ret; } static void btrfs_dio_end_io(struct btrfs_bio *bbio) { struct btrfs_dio_private *dip = container_of(bbio, struct btrfs_dio_private, bbio); struct btrfs_inode *inode = bbio->inode; struct bio *bio = &bbio->bio; if (bio->bi_status) { btrfs_warn(inode->root->fs_info, "direct IO failed ino %llu op 0x%0x offset %#llx len %u err no %d", btrfs_ino(inode), bio->bi_opf, dip->file_offset, dip->bytes, bio->bi_status); } if (btrfs_op(bio) == BTRFS_MAP_WRITE) { btrfs_finish_ordered_extent(bbio->ordered, dip->file_offset, dip->bytes, !bio->bi_status); } else { btrfs_unlock_dio_extent(&inode->io_tree, dip->file_offset, dip->file_offset + dip->bytes - 1, NULL); } bbio->bio.bi_private = bbio->private; iomap_dio_bio_end_io(bio); } static int btrfs_extract_ordered_extent(struct btrfs_bio *bbio, struct btrfs_ordered_extent *ordered) { u64 start = (u64)bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT; u64 len = bbio->bio.bi_iter.bi_size; struct btrfs_ordered_extent *new; int ret; /* Must always be called for the beginning of an ordered extent. */ if (WARN_ON_ONCE(start != ordered->disk_bytenr)) return -EINVAL; /* No need to split if the ordered extent covers the entire bio. */ if (ordered->disk_num_bytes == len) { refcount_inc(&ordered->refs); bbio->ordered = ordered; return 0; } /* * Don't split the extent_map for NOCOW extents, as we're writing into * a pre-existing one. */ if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) { ret = btrfs_split_extent_map(bbio->inode, bbio->file_offset, ordered->num_bytes, len, ordered->disk_bytenr); if (ret) return ret; } new = btrfs_split_ordered_extent(ordered, len); if (IS_ERR(new)) return PTR_ERR(new); bbio->ordered = new; return 0; } static void btrfs_dio_submit_io(const struct iomap_iter *iter, struct bio *bio, loff_t file_offset) { struct btrfs_bio *bbio = btrfs_bio(bio); struct btrfs_dio_private *dip = container_of(bbio, struct btrfs_dio_private, bbio); struct btrfs_dio_data *dio_data = iter->private; btrfs_bio_init(bbio, BTRFS_I(iter->inode), file_offset, btrfs_dio_end_io, bio->bi_private); dip->file_offset = file_offset; dip->bytes = bio->bi_iter.bi_size; dio_data->submitted += bio->bi_iter.bi_size; /* * Check if we are doing a partial write. If we are, we need to split * the ordered extent to match the submitted bio. Hang on to the * remaining unfinishable ordered_extent in dio_data so that it can be * cancelled in iomap_end to avoid a deadlock wherein faulting the * remaining pages is blocked on the outstanding ordered extent. */ if (iter->flags & IOMAP_WRITE) { int ret; ret = btrfs_extract_ordered_extent(bbio, dio_data->ordered); if (ret) { btrfs_finish_ordered_extent(dio_data->ordered, file_offset, dip->bytes, !ret); bio->bi_status = errno_to_blk_status(ret); iomap_dio_bio_end_io(bio); return; } } btrfs_submit_bbio(bbio, 0); } static const struct iomap_ops btrfs_dio_iomap_ops = { .iomap_begin = btrfs_dio_iomap_begin, .iomap_end = btrfs_dio_iomap_end, }; static const struct iomap_dio_ops btrfs_dio_ops = { .submit_io = btrfs_dio_submit_io, .bio_set = &btrfs_dio_bioset, }; static ssize_t btrfs_dio_read(struct kiocb *iocb, struct iov_iter *iter, size_t done_before) { struct btrfs_dio_data data = { 0 }; return iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, &btrfs_dio_ops, IOMAP_DIO_PARTIAL | IOMAP_DIO_FSBLOCK_ALIGNED, &data, done_before); } static struct iomap_dio *btrfs_dio_write(struct kiocb *iocb, struct iov_iter *iter, size_t done_before) { struct btrfs_dio_data data = { 0 }; return __iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, &btrfs_dio_ops, IOMAP_DIO_PARTIAL | IOMAP_DIO_FSBLOCK_ALIGNED, &data, done_before); } static ssize_t check_direct_IO(struct btrfs_fs_info *fs_info, const struct iov_iter *iter, loff_t offset) { const u32 blocksize_mask = fs_info->sectorsize - 1; if (offset & blocksize_mask) return -EINVAL; if (iov_iter_alignment(iter) & blocksize_mask) return -EINVAL; return 0; } ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file); struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); loff_t pos; ssize_t written = 0; ssize_t written_buffered; size_t prev_left = 0; loff_t endbyte; ssize_t ret; unsigned int ilock_flags = 0; struct iomap_dio *dio; const u64 data_profile = btrfs_data_alloc_profile(fs_info) & BTRFS_BLOCK_GROUP_PROFILE_MASK; if (iocb->ki_flags & IOCB_NOWAIT) ilock_flags |= BTRFS_ILOCK_TRY; /* * If the write DIO is within EOF, use a shared lock and also only if * security bits will likely not be dropped by file_remove_privs() called * from btrfs_write_check(). Either will need to be rechecked after the * lock was acquired. */ if (iocb->ki_pos + iov_iter_count(from) <= i_size_read(inode) && IS_NOSEC(inode)) ilock_flags |= BTRFS_ILOCK_SHARED; /* * If our data profile has duplication (either extra mirrors or RAID56), * we can not trust the direct IO buffer, the content may change during * writeback and cause different contents written to different mirrors. * * Thus only RAID0 and SINGLE can go true zero-copy direct IO. */ if (data_profile != BTRFS_BLOCK_GROUP_RAID0 && data_profile != 0) goto buffered; relock: ret = btrfs_inode_lock(BTRFS_I(inode), ilock_flags); if (ret < 0) return ret; /* Shared lock cannot be used with security bits set. */ if ((ilock_flags & BTRFS_ILOCK_SHARED) && !IS_NOSEC(inode)) { btrfs_inode_unlock(BTRFS_I(inode), ilock_flags); ilock_flags &= ~BTRFS_ILOCK_SHARED; goto relock; } ret = generic_write_checks(iocb, from); if (ret <= 0) { btrfs_inode_unlock(BTRFS_I(inode), ilock_flags); return ret; } ret = btrfs_write_check(iocb, ret); if (ret < 0) { btrfs_inode_unlock(BTRFS_I(inode), ilock_flags); goto out; } pos = iocb->ki_pos; /* * Re-check since file size may have changed just before taking the * lock or pos may have changed because of O_APPEND in generic_write_check() */ if ((ilock_flags & BTRFS_ILOCK_SHARED) && pos + iov_iter_count(from) > i_size_read(inode)) { btrfs_inode_unlock(BTRFS_I(inode), ilock_flags); ilock_flags &= ~BTRFS_ILOCK_SHARED; goto relock; } if (check_direct_IO(fs_info, from, pos)) { btrfs_inode_unlock(BTRFS_I(inode), ilock_flags); goto buffered; } /* * We can't control the folios being passed in, applications can write * to them while a direct IO write is in progress. This means the * content might change after we calculated the data checksum. * Therefore we can end up storing a checksum that doesn't match the * persisted data. * * To be extra safe and avoid false data checksum mismatch, if the * inode requires data checksum, just fallback to buffered IO. * For buffered IO we have full control of page cache and can ensure * no one is modifying the content during writeback. */ if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) { btrfs_inode_unlock(BTRFS_I(inode), ilock_flags); goto buffered; } /* * The iov_iter can be mapped to the same file range we are writing to. * If that's the case, then we will deadlock in the iomap code, because * it first calls our callback btrfs_dio_iomap_begin(), which will create * an ordered extent, and after that it will fault in the pages that the * iov_iter refers to. During the fault in we end up in the readahead * pages code (starting at btrfs_readahead()), which will lock the range, * find that ordered extent and then wait for it to complete (at * btrfs_lock_and_flush_ordered_range()), resulting in a deadlock since * obviously the ordered extent can never complete as we didn't submit * yet the respective bio(s). This always happens when the buffer is * memory mapped to the same file range, since the iomap DIO code always * invalidates pages in the target file range (after starting and waiting * for any writeback). * * So here we disable page faults in the iov_iter and then retry if we * got -EFAULT, faulting in the pages before the retry. */ again: from->nofault = true; dio = btrfs_dio_write(iocb, from, written); from->nofault = false; if (IS_ERR_OR_NULL(dio)) { ret = PTR_ERR_OR_ZERO(dio); } else { /* * If we have a synchronous write, we must make sure the fsync * triggered by the iomap_dio_complete() call below doesn't * deadlock on the inode lock - we are already holding it and we * can't call it after unlocking because we may need to complete * partial writes due to the input buffer (or parts of it) not * being already faulted in. */ ASSERT(current->journal_info == NULL); current->journal_info = BTRFS_TRANS_DIO_WRITE_STUB; ret = iomap_dio_complete(dio); current->journal_info = NULL; } /* No increment (+=) because iomap returns a cumulative value. */ if (ret > 0) written = ret; if (iov_iter_count(from) > 0 && (ret == -EFAULT || ret > 0)) { const size_t left = iov_iter_count(from); /* * We have more data left to write. Try to fault in as many as * possible of the remainder pages and retry. We do this without * releasing and locking again the inode, to prevent races with * truncate. * * Also, in case the iov refers to pages in the file range of the * file we want to write to (due to a mmap), we could enter an * infinite loop if we retry after faulting the pages in, since * iomap will invalidate any pages in the range early on, before * it tries to fault in the pages of the iov. So we keep track of * how much was left of iov in the previous EFAULT and fallback * to buffered IO in case we haven't made any progress. */ if (left == prev_left) { ret = -ENOTBLK; } else { fault_in_iov_iter_readable(from, left); prev_left = left; goto again; } } btrfs_inode_unlock(BTRFS_I(inode), ilock_flags); /* * If 'ret' is -ENOTBLK or we have not written all data, then it means * we must fallback to buffered IO. */ if ((ret < 0 && ret != -ENOTBLK) || !iov_iter_count(from)) goto out; buffered: /* * If we are in a NOWAIT context, then return -EAGAIN to signal the caller * it must retry the operation in a context where blocking is acceptable, * because even if we end up not blocking during the buffered IO attempt * below, we will block when flushing and waiting for the IO. */ if (iocb->ki_flags & IOCB_NOWAIT) { ret = -EAGAIN; goto out; } pos = iocb->ki_pos; written_buffered = btrfs_buffered_write(iocb, from); if (written_buffered < 0) { ret = written_buffered; goto out; } /* * Ensure all data is persisted. We want the next direct IO read to be * able to read what was just written. */ endbyte = pos + written_buffered - 1; ret = btrfs_fdatawrite_range(BTRFS_I(inode), pos, endbyte); if (ret) goto out; ret = filemap_fdatawait_range(inode->i_mapping, pos, endbyte); if (ret) goto out; written += written_buffered; iocb->ki_pos = pos + written_buffered; invalidate_mapping_pages(file->f_mapping, pos >> PAGE_SHIFT, endbyte >> PAGE_SHIFT); out: return ret < 0 ? ret : written; } static int check_direct_read(struct btrfs_fs_info *fs_info, const struct iov_iter *iter, loff_t offset) { int ret; int i, seg; ret = check_direct_IO(fs_info, iter, offset); if (ret < 0) return ret; if (!iter_is_iovec(iter)) return 0; for (seg = 0; seg < iter->nr_segs; seg++) { for (i = seg + 1; i < iter->nr_segs; i++) { const struct iovec *iov1 = iter_iov(iter) + seg; const struct iovec *iov2 = iter_iov(iter) + i; if (iov1->iov_base == iov2->iov_base) return -EINVAL; } } return 0; } ssize_t btrfs_direct_read(struct kiocb *iocb, struct iov_iter *to) { struct inode *inode = file_inode(iocb->ki_filp); size_t prev_left = 0; ssize_t read = 0; ssize_t ret; if (fsverity_active(inode)) return 0; if (check_direct_read(inode_to_fs_info(inode), to, iocb->ki_pos)) return 0; btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_SHARED); again: /* * This is similar to what we do for direct IO writes, see the comment * at btrfs_direct_write(), but we also disable page faults in addition * to disabling them only at the iov_iter level. This is because when * reading from a hole or prealloc extent, iomap calls iov_iter_zero(), * which can still trigger page fault ins despite having set ->nofault * to true of our 'to' iov_iter. * * The difference to direct IO writes is that we deadlock when trying * to lock the extent range in the inode's tree during he page reads * triggered by the fault in (while for writes it is due to waiting for * our own ordered extent). This is because for direct IO reads, * btrfs_dio_iomap_begin() returns with the extent range locked, which * is only unlocked in the endio callback (end_bio_extent_readpage()). */ pagefault_disable(); to->nofault = true; ret = btrfs_dio_read(iocb, to, read); to->nofault = false; pagefault_enable(); /* No increment (+=) because iomap returns a cumulative value. */ if (ret > 0) read = ret; if (iov_iter_count(to) > 0 && (ret == -EFAULT || ret > 0)) { const size_t left = iov_iter_count(to); if (left == prev_left) { /* * We didn't make any progress since the last attempt, * fallback to a buffered read for the remainder of the * range. This is just to avoid any possibility of looping * for too long. */ ret = read; } else { /* * We made some progress since the last retry or this is * the first time we are retrying. Fault in as many pages * as possible and retry. */ fault_in_iov_iter_writeable(to, left); prev_left = left; goto again; } } btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_SHARED); return ret < 0 ? ret : read; } int __init btrfs_init_dio(void) { if (bioset_init(&btrfs_dio_bioset, BIO_POOL_SIZE, offsetof(struct btrfs_dio_private, bbio.bio), BIOSET_NEED_BVECS)) return -ENOMEM; return 0; } void __cold btrfs_destroy_dio(void) { bioset_exit(&btrfs_dio_bioset); }
45 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 // SPDX-License-Identifier: 0BSD /* * Branch/Call/Jump (BCJ) filter decoders * * Authors: Lasse Collin <lasse.collin@tukaani.org> * Igor Pavlov <https://7-zip.org/> */ #include "xz_private.h" /* * The rest of the file is inside this ifdef. It makes things a little more * convenient when building without support for any BCJ filters. */ #ifdef XZ_DEC_BCJ struct xz_dec_bcj { /* Type of the BCJ filter being used */ enum { BCJ_X86 = 4, /* x86 or x86-64 */ BCJ_POWERPC = 5, /* Big endian only */ BCJ_ARM = 7, /* Little endian only */ BCJ_ARMTHUMB = 8, /* Little endian only */ BCJ_SPARC = 9, /* Big or little endian */ BCJ_ARM64 = 10, /* AArch64 */ BCJ_RISCV = 11 /* RV32GQC_Zfh, RV64GQC_Zfh */ } type; /* * Return value of the next filter in the chain. We need to preserve * this information across calls, because we must not call the next * filter anymore once it has returned XZ_STREAM_END. */ enum xz_ret ret; /* True if we are operating in single-call mode. */ bool single_call; /* * Absolute position relative to the beginning of the uncompressed * data (in a single .xz Block). We care only about the lowest 32 * bits so this doesn't need to be uint64_t even with big files. */ uint32_t pos; /* x86 filter state */ uint32_t x86_prev_mask; /* Temporary space to hold the variables from struct xz_buf */ uint8_t *out; size_t out_pos; size_t out_size; struct { /* Amount of already filtered data in the beginning of buf */ size_t filtered; /* Total amount of data currently stored in buf */ size_t size; /* * Buffer to hold a mix of filtered and unfiltered data. This * needs to be big enough to hold Alignment + 2 * Look-ahead: * * Type Alignment Look-ahead * x86 1 4 * PowerPC 4 0 * IA-64 16 0 * ARM 4 0 * ARM-Thumb 2 2 * SPARC 4 0 */ uint8_t buf[16]; } temp; }; #ifdef XZ_DEC_X86 /* * This is used to test the most significant byte of a memory address * in an x86 instruction. */ static inline int bcj_x86_test_msbyte(uint8_t b) { return b == 0x00 || b == 0xFF; } static size_t bcj_x86(struct xz_dec_bcj *s, uint8_t *buf, size_t size) { static const bool mask_to_allowed_status[8] = { true, true, true, false, true, false, false, false }; static const uint8_t mask_to_bit_num[8] = { 0, 1, 2, 2, 3, 3, 3, 3 }; size_t i; size_t prev_pos = (size_t)-1; uint32_t prev_mask = s->x86_prev_mask; uint32_t src; uint32_t dest; uint32_t j; uint8_t b; if (size <= 4) return 0; size -= 4; for (i = 0; i < size; ++i) { if ((buf[i] & 0xFE) != 0xE8) continue; prev_pos = i - prev_pos; if (prev_pos > 3) { prev_mask = 0; } else { prev_mask = (prev_mask << (prev_pos - 1)) & 7; if (prev_mask != 0) { b = buf[i + 4 - mask_to_bit_num[prev_mask]]; if (!mask_to_allowed_status[prev_mask] || bcj_x86_test_msbyte(b)) { prev_pos = i; prev_mask = (prev_mask << 1) | 1; continue; } } } prev_pos = i; if (bcj_x86_test_msbyte(buf[i + 4])) { src = get_unaligned_le32(buf + i + 1); while (true) { dest = src - (s->pos + (uint32_t)i + 5); if (prev_mask == 0) break; j = mask_to_bit_num[prev_mask] * 8; b = (uint8_t)(dest >> (24 - j)); if (!bcj_x86_test_msbyte(b)) break; src = dest ^ (((uint32_t)1 << (32 - j)) - 1); } dest &= 0x01FFFFFF; dest |= (uint32_t)0 - (dest & 0x01000000); put_unaligned_le32(dest, buf + i + 1); i += 4; } else { prev_mask = (prev_mask << 1) | 1; } } prev_pos = i - prev_pos; s->x86_prev_mask = prev_pos > 3 ? 0 : prev_mask << (prev_pos - 1); return i; } #endif #ifdef XZ_DEC_POWERPC static size_t bcj_powerpc(struct xz_dec_bcj *s, uint8_t *buf, size_t size) { size_t i; uint32_t instr; size &= ~(size_t)3; for (i = 0; i < size; i += 4) { instr = get_unaligned_be32(buf + i); if ((instr & 0xFC000003) == 0x48000001) { instr &= 0x03FFFFFC; instr -= s->pos + (uint32_t)i; instr &= 0x03FFFFFC; instr |= 0x48000001; put_unaligned_be32(instr, buf + i); } } return i; } #endif #ifdef XZ_DEC_ARM static size_t bcj_arm(struct xz_dec_bcj *s, uint8_t *buf, size_t size) { size_t i; uint32_t addr; size &= ~(size_t)3; for (i = 0; i < size; i += 4) { if (buf[i + 3] == 0xEB) { addr = (uint32_t)buf[i] | ((uint32_t)buf[i + 1] << 8) | ((uint32_t)buf[i + 2] << 16); addr <<= 2; addr -= s->pos + (uint32_t)i + 8; addr >>= 2; buf[i] = (uint8_t)addr; buf[i + 1] = (uint8_t)(addr >> 8); buf[i + 2] = (uint8_t)(addr >> 16); } } return i; } #endif #ifdef XZ_DEC_ARMTHUMB static size_t bcj_armthumb(struct xz_dec_bcj *s, uint8_t *buf, size_t size) { size_t i; uint32_t addr; if (size < 4) return 0; size -= 4; for (i = 0; i <= size; i += 2) { if ((buf[i + 1] & 0xF8) == 0xF0 && (buf[i + 3] & 0xF8) == 0xF8) { addr = (((uint32_t)buf[i + 1] & 0x07) << 19) | ((uint32_t)buf[i] << 11) | (((uint32_t)buf[i + 3] & 0x07) << 8) | (uint32_t)buf[i + 2]; addr <<= 1; addr -= s->pos + (uint32_t)i + 4; addr >>= 1; buf[i + 1] = (uint8_t)(0xF0 | ((addr >> 19) & 0x07)); buf[i] = (uint8_t)(addr >> 11); buf[i + 3] = (uint8_t)(0xF8 | ((addr >> 8) & 0x07)); buf[i + 2] = (uint8_t)addr; i += 2; } } return i; } #endif #ifdef XZ_DEC_SPARC static size_t bcj_sparc(struct xz_dec_bcj *s, uint8_t *buf, size_t size) { size_t i; uint32_t instr; size &= ~(size_t)3; for (i = 0; i < size; i += 4) { instr = get_unaligned_be32(buf + i); if ((instr >> 22) == 0x100 || (instr >> 22) == 0x1FF) { instr <<= 2; instr -= s->pos + (uint32_t)i; instr >>= 2; instr = ((uint32_t)0x40000000 - (instr & 0x400000)) | 0x40000000 | (instr & 0x3FFFFF); put_unaligned_be32(instr, buf + i); } } return i; } #endif #ifdef XZ_DEC_ARM64 static size_t bcj_arm64(struct xz_dec_bcj *s, uint8_t *buf, size_t size) { size_t i; uint32_t instr; uint32_t addr; size &= ~(size_t)3; for (i = 0; i < size; i += 4) { instr = get_unaligned_le32(buf + i); if ((instr >> 26) == 0x25) { /* BL instruction */ addr = instr - ((s->pos + (uint32_t)i) >> 2); instr = 0x94000000 | (addr & 0x03FFFFFF); put_unaligned_le32(instr, buf + i); } else if ((instr & 0x9F000000) == 0x90000000) { /* ADRP instruction */ addr = ((instr >> 29) & 3) | ((instr >> 3) & 0x1FFFFC); /* Only convert values in the range +/-512 MiB. */ if ((addr + 0x020000) & 0x1C0000) continue; addr -= (s->pos + (uint32_t)i) >> 12; instr &= 0x9000001F; instr |= (addr & 3) << 29; instr |= (addr & 0x03FFFC) << 3; instr |= (0U - (addr & 0x020000)) & 0xE00000; put_unaligned_le32(instr, buf + i); } } return i; } #endif #ifdef XZ_DEC_RISCV static size_t bcj_riscv(struct xz_dec_bcj *s, uint8_t *buf, size_t size) { size_t i; uint32_t b1; uint32_t b2; uint32_t b3; uint32_t instr; uint32_t instr2; uint32_t instr2_rs1; uint32_t addr; if (size < 8) return 0; size -= 8; for (i = 0; i <= size; i += 2) { instr = buf[i]; if (instr == 0xEF) { /* JAL */ b1 = buf[i + 1]; if ((b1 & 0x0D) != 0) continue; b2 = buf[i + 2]; b3 = buf[i + 3]; addr = ((b1 & 0xF0) << 13) | (b2 << 9) | (b3 << 1); addr -= s->pos + (uint32_t)i; buf[i + 1] = (uint8_t)((b1 & 0x0F) | ((addr >> 8) & 0xF0)); buf[i + 2] = (uint8_t)(((addr >> 16) & 0x0F) | ((addr >> 7) & 0x10) | ((addr << 4) & 0xE0)); buf[i + 3] = (uint8_t)(((addr >> 4) & 0x7F) | ((addr >> 13) & 0x80)); i += 4 - 2; } else if ((instr & 0x7F) == 0x17) { /* AUIPC */ instr |= (uint32_t)buf[i + 1] << 8; instr |= (uint32_t)buf[i + 2] << 16; instr |= (uint32_t)buf[i + 3] << 24; if (instr & 0xE80) { /* AUIPC's rd doesn't equal x0 or x2. */ instr2 = get_unaligned_le32(buf + i + 4); if (((instr << 8) ^ (instr2 - 3)) & 0xF8003) { i += 6 - 2; continue; } addr = (instr & 0xFFFFF000) + (instr2 >> 20); instr = 0x17 | (2 << 7) | (instr2 << 12); instr2 = addr; } else { /* AUIPC's rd equals x0 or x2. */ instr2_rs1 = instr >> 27; if ((uint32_t)((instr - 0x3117) << 18) >= (instr2_rs1 & 0x1D)) { i += 4 - 2; continue; } addr = get_unaligned_be32(buf + i + 4); addr -= s->pos + (uint32_t)i; instr2 = (instr >> 12) | (addr << 20); instr = 0x17 | (instr2_rs1 << 7) | ((addr + 0x800) & 0xFFFFF000); } put_unaligned_le32(instr, buf + i); put_unaligned_le32(instr2, buf + i + 4); i += 8 - 2; } } return i; } #endif /* * Apply the selected BCJ filter. Update *pos and s->pos to match the amount * of data that got filtered. * * NOTE: This is implemented as a switch statement to avoid using function * pointers, which could be problematic in the kernel boot code, which must * avoid pointers to static data (at least on x86). */ static void bcj_apply(struct xz_dec_bcj *s, uint8_t *buf, size_t *pos, size_t size) { size_t filtered; buf += *pos; size -= *pos; switch (s->type) { #ifdef XZ_DEC_X86 case BCJ_X86: filtered = bcj_x86(s, buf, size); break; #endif #ifdef XZ_DEC_POWERPC case BCJ_POWERPC: filtered = bcj_powerpc(s, buf, size); break; #endif #ifdef XZ_DEC_ARM case BCJ_ARM: filtered = bcj_arm(s, buf, size); break; #endif #ifdef XZ_DEC_ARMTHUMB case BCJ_ARMTHUMB: filtered = bcj_armthumb(s, buf, size); break; #endif #ifdef XZ_DEC_SPARC case BCJ_SPARC: filtered = bcj_sparc(s, buf, size); break; #endif #ifdef XZ_DEC_ARM64 case BCJ_ARM64: filtered = bcj_arm64(s, buf, size); break; #endif #ifdef XZ_DEC_RISCV case BCJ_RISCV: filtered = bcj_riscv(s, buf, size); break; #endif default: /* Never reached but silence compiler warnings. */ filtered = 0; break; } *pos += filtered; s->pos += filtered; } /* * Flush pending filtered data from temp to the output buffer. * Move the remaining mixture of possibly filtered and unfiltered * data to the beginning of temp. */ static void bcj_flush(struct xz_dec_bcj *s, struct xz_buf *b) { size_t copy_size; copy_size = min_t(size_t, s->temp.filtered, b->out_size - b->out_pos); memcpy(b->out + b->out_pos, s->temp.buf, copy_size); b->out_pos += copy_size; s->temp.filtered -= copy_size; s->temp.size -= copy_size; memmove(s->temp.buf, s->temp.buf + copy_size, s->temp.size); } /* * The BCJ filter functions are primitive in sense that they process the * data in chunks of 1-16 bytes. To hide this issue, this function does * some buffering. */ enum xz_ret xz_dec_bcj_run(struct xz_dec_bcj *s, struct xz_dec_lzma2 *lzma2, struct xz_buf *b) { size_t out_start; /* * Flush pending already filtered data to the output buffer. Return * immediately if we couldn't flush everything, or if the next * filter in the chain had already returned XZ_STREAM_END. */ if (s->temp.filtered > 0) { bcj_flush(s, b); if (s->temp.filtered > 0) return XZ_OK; if (s->ret == XZ_STREAM_END) return XZ_STREAM_END; } /* * If we have more output space than what is currently pending in * temp, copy the unfiltered data from temp to the output buffer * and try to fill the output buffer by decoding more data from the * next filter in the chain. Apply the BCJ filter on the new data * in the output buffer. If everything cannot be filtered, copy it * to temp and rewind the output buffer position accordingly. * * This needs to be always run when temp.size == 0 to handle a special * case where the output buffer is full and the next filter has no * more output coming but hasn't returned XZ_STREAM_END yet. */ if (s->temp.size < b->out_size - b->out_pos || s->temp.size == 0) { out_start = b->out_pos; memcpy(b->out + b->out_pos, s->temp.buf, s->temp.size); b->out_pos += s->temp.size; s->ret = xz_dec_lzma2_run(lzma2, b); if (s->ret != XZ_STREAM_END && (s->ret != XZ_OK || s->single_call)) return s->ret; bcj_apply(s, b->out, &out_start, b->out_pos); /* * As an exception, if the next filter returned XZ_STREAM_END, * we can do that too, since the last few bytes that remain * unfiltered are meant to remain unfiltered. */ if (s->ret == XZ_STREAM_END) return XZ_STREAM_END; s->temp.size = b->out_pos - out_start; b->out_pos -= s->temp.size; memcpy(s->temp.buf, b->out + b->out_pos, s->temp.size); /* * If there wasn't enough input to the next filter to fill * the output buffer with unfiltered data, there's no point * to try decoding more data to temp. */ if (b->out_pos + s->temp.size < b->out_size) return XZ_OK; } /* * We have unfiltered data in temp. If the output buffer isn't full * yet, try to fill the temp buffer by decoding more data from the * next filter. Apply the BCJ filter on temp. Then we hopefully can * fill the actual output buffer by copying filtered data from temp. * A mix of filtered and unfiltered data may be left in temp; it will * be taken care on the next call to this function. */ if (b->out_pos < b->out_size) { /* Make b->out{,_pos,_size} temporarily point to s->temp. */ s->out = b->out; s->out_pos = b->out_pos; s->out_size = b->out_size; b->out = s->temp.buf; b->out_pos = s->temp.size; b->out_size = sizeof(s->temp.buf); s->ret = xz_dec_lzma2_run(lzma2, b); s->temp.size = b->out_pos; b->out = s->out; b->out_pos = s->out_pos; b->out_size = s->out_size; if (s->ret != XZ_OK && s->ret != XZ_STREAM_END) return s->ret; bcj_apply(s, s->temp.buf, &s->temp.filtered, s->temp.size); /* * If the next filter returned XZ_STREAM_END, we mark that * everything is filtered, since the last unfiltered bytes * of the stream are meant to be left as is. */ if (s->ret == XZ_STREAM_END) s->temp.filtered = s->temp.size; bcj_flush(s, b); if (s->temp.filtered > 0) return XZ_OK; } return s->ret; } struct xz_dec_bcj *xz_dec_bcj_create(bool single_call) { struct xz_dec_bcj *s = kmalloc_obj(*s); if (s != NULL) s->single_call = single_call; return s; } enum xz_ret xz_dec_bcj_reset(struct xz_dec_bcj *s, uint8_t id) { switch (id) { #ifdef XZ_DEC_X86 case BCJ_X86: #endif #ifdef XZ_DEC_POWERPC case BCJ_POWERPC: #endif #ifdef XZ_DEC_ARM case BCJ_ARM: #endif #ifdef XZ_DEC_ARMTHUMB case BCJ_ARMTHUMB: #endif #ifdef XZ_DEC_SPARC case BCJ_SPARC: #endif #ifdef XZ_DEC_ARM64 case BCJ_ARM64: #endif #ifdef XZ_DEC_RISCV case BCJ_RISCV: #endif break; default: /* Unsupported Filter ID */ return XZ_OPTIONS_ERROR; } s->type = id; s->ret = XZ_OK; s->pos = 0; s->x86_prev_mask = 0; s->temp.filtered = 0; s->temp.size = 0; return XZ_OK; } #endif
1451 1450 1448 2 1 1 6 5 5 2 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 // SPDX-License-Identifier: GPL-2.0-or-later /* * Crypto API support for SHA-1 and HMAC-SHA1 * * Copyright (c) Alan Smithee. * Copyright (c) Andrew McDonald <andrew@mcdonald.org.uk> * Copyright (c) Jean-Francois Dive <jef@linuxbe.org> * Copyright 2025 Google LLC */ #include <crypto/internal/hash.h> #include <crypto/sha1.h> #include <linux/kernel.h> #include <linux/module.h> /* * Export and import functions. crypto_shash wants a particular format that * matches that used by some legacy drivers. It currently is the same as the * library SHA context, except the value in bytecount must be block-aligned and * the remainder must be stored in an extra u8 appended to the struct. */ #define SHA1_SHASH_STATE_SIZE (sizeof(struct sha1_ctx) + 1) static_assert(sizeof(struct sha1_ctx) == sizeof(struct sha1_state)); static_assert(offsetof(struct sha1_ctx, state) == offsetof(struct sha1_state, state)); static_assert(offsetof(struct sha1_ctx, bytecount) == offsetof(struct sha1_state, count)); static_assert(offsetof(struct sha1_ctx, buf) == offsetof(struct sha1_state, buffer)); static int __crypto_sha1_export(const struct sha1_ctx *ctx0, void *out) { struct sha1_ctx ctx = *ctx0; unsigned int partial; u8 *p = out; partial = ctx.bytecount % SHA1_BLOCK_SIZE; ctx.bytecount -= partial; memcpy(p, &ctx, sizeof(ctx)); p += sizeof(ctx); *p = partial; return 0; } static int __crypto_sha1_import(struct sha1_ctx *ctx, const void *in) { const u8 *p = in; memcpy(ctx, p, sizeof(*ctx)); p += sizeof(*ctx); ctx->bytecount += *p; return 0; } static int __crypto_sha1_export_core(const struct sha1_ctx *ctx, void *out) { memcpy(out, ctx, offsetof(struct sha1_ctx, buf)); return 0; } static int __crypto_sha1_import_core(struct sha1_ctx *ctx, const void *in) { memcpy(ctx, in, offsetof(struct sha1_ctx, buf)); return 0; } const u8 sha1_zero_message_hash[SHA1_DIGEST_SIZE] = { 0xda, 0x39, 0xa3, 0xee, 0x5e, 0x6b, 0x4b, 0x0d, 0x32, 0x55, 0xbf, 0xef, 0x95, 0x60, 0x18, 0x90, 0xaf, 0xd8, 0x07, 0x09 }; EXPORT_SYMBOL_GPL(sha1_zero_message_hash); #define SHA1_CTX(desc) ((struct sha1_ctx *)shash_desc_ctx(desc)) static int crypto_sha1_init(struct shash_desc *desc) { sha1_init(SHA1_CTX(desc)); return 0; } static int crypto_sha1_update(struct shash_desc *desc, const u8 *data, unsigned int len) { sha1_update(SHA1_CTX(desc), data, len); return 0; } static int crypto_sha1_final(struct shash_desc *desc, u8 *out) { sha1_final(SHA1_CTX(desc), out); return 0; } static int crypto_sha1_digest(struct shash_desc *desc, const u8 *data, unsigned int len, u8 *out) { sha1(data, len, out); return 0; } static int crypto_sha1_export(struct shash_desc *desc, void *out) { return __crypto_sha1_export(SHA1_CTX(desc), out); } static int crypto_sha1_import(struct shash_desc *desc, const void *in) { return __crypto_sha1_import(SHA1_CTX(desc), in); } static int crypto_sha1_export_core(struct shash_desc *desc, void *out) { return __crypto_sha1_export_core(SHA1_CTX(desc), out); } static int crypto_sha1_import_core(struct shash_desc *desc, const void *in) { return __crypto_sha1_import_core(SHA1_CTX(desc), in); } #define HMAC_SHA1_KEY(tfm) ((struct hmac_sha1_key *)crypto_shash_ctx(tfm)) #define HMAC_SHA1_CTX(desc) ((struct hmac_sha1_ctx *)shash_desc_ctx(desc)) static int crypto_hmac_sha1_setkey(struct crypto_shash *tfm, const u8 *raw_key, unsigned int keylen) { hmac_sha1_preparekey(HMAC_SHA1_KEY(tfm), raw_key, keylen); return 0; } static int crypto_hmac_sha1_init(struct shash_desc *desc) { hmac_sha1_init(HMAC_SHA1_CTX(desc), HMAC_SHA1_KEY(desc->tfm)); return 0; } static int crypto_hmac_sha1_update(struct shash_desc *desc, const u8 *data, unsigned int len) { hmac_sha1_update(HMAC_SHA1_CTX(desc), data, len); return 0; } static int crypto_hmac_sha1_final(struct shash_desc *desc, u8 *out) { hmac_sha1_final(HMAC_SHA1_CTX(desc), out); return 0; } static int crypto_hmac_sha1_digest(struct shash_desc *desc, const u8 *data, unsigned int len, u8 *out) { hmac_sha1(HMAC_SHA1_KEY(desc->tfm), data, len, out); return 0; } static int crypto_hmac_sha1_export(struct shash_desc *desc, void *out) { return __crypto_sha1_export(&HMAC_SHA1_CTX(desc)->sha_ctx, out); } static int crypto_hmac_sha1_import(struct shash_desc *desc, const void *in) { struct hmac_sha1_ctx *ctx = HMAC_SHA1_CTX(desc); ctx->ostate = HMAC_SHA1_KEY(desc->tfm)->ostate; return __crypto_sha1_import(&ctx->sha_ctx, in); } static int crypto_hmac_sha1_export_core(struct shash_desc *desc, void *out) { return __crypto_sha1_export_core(&HMAC_SHA1_CTX(desc)->sha_ctx, out); } static int crypto_hmac_sha1_import_core(struct shash_desc *desc, const void *in) { struct hmac_sha1_ctx *ctx = HMAC_SHA1_CTX(desc); ctx->ostate = HMAC_SHA1_KEY(desc->tfm)->ostate; return __crypto_sha1_import_core(&ctx->sha_ctx, in); } static struct shash_alg algs[] = { { .base.cra_name = "sha1", .base.cra_driver_name = "sha1-lib", .base.cra_priority = 300, .base.cra_blocksize = SHA1_BLOCK_SIZE, .base.cra_module = THIS_MODULE, .digestsize = SHA1_DIGEST_SIZE, .init = crypto_sha1_init, .update = crypto_sha1_update, .final = crypto_sha1_final, .digest = crypto_sha1_digest, .export = crypto_sha1_export, .import = crypto_sha1_import, .export_core = crypto_sha1_export_core, .import_core = crypto_sha1_import_core, .descsize = sizeof(struct sha1_ctx), .statesize = SHA1_SHASH_STATE_SIZE, }, { .base.cra_name = "hmac(sha1)", .base.cra_driver_name = "hmac-sha1-lib", .base.cra_priority = 300, .base.cra_blocksize = SHA1_BLOCK_SIZE, .base.cra_ctxsize = sizeof(struct hmac_sha1_key), .base.cra_module = THIS_MODULE, .digestsize = SHA1_DIGEST_SIZE, .setkey = crypto_hmac_sha1_setkey, .init = crypto_hmac_sha1_init, .update = crypto_hmac_sha1_update, .final = crypto_hmac_sha1_final, .digest = crypto_hmac_sha1_digest, .export = crypto_hmac_sha1_export, .import = crypto_hmac_sha1_import, .export_core = crypto_hmac_sha1_export_core, .import_core = crypto_hmac_sha1_import_core, .descsize = sizeof(struct hmac_sha1_ctx), .statesize = SHA1_SHASH_STATE_SIZE, }, }; static int __init crypto_sha1_mod_init(void) { return crypto_register_shashes(algs, ARRAY_SIZE(algs)); } module_init(crypto_sha1_mod_init); static void __exit crypto_sha1_mod_exit(void) { crypto_unregister_shashes(algs, ARRAY_SIZE(algs)); } module_exit(crypto_sha1_mod_exit); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Crypto API support for SHA-1 and HMAC-SHA1"); MODULE_ALIAS_CRYPTO("sha1"); MODULE_ALIAS_CRYPTO("sha1-lib"); MODULE_ALIAS_CRYPTO("hmac(sha1)"); MODULE_ALIAS_CRYPTO("hmac-sha1-lib");
30 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 /* SPDX-License-Identifier: GPL-2.0-only */ /* * irq.h: in kernel interrupt controller related definitions * Copyright (c) 2007, Intel Corporation. * * Authors: * Yaozu (Eddie) Dong <Eddie.dong@intel.com> */ #ifndef __IRQ_H #define __IRQ_H #include <linux/mm_types.h> #include <linux/hrtimer.h> #include <linux/kvm_host.h> #include <linux/spinlock.h> #include <kvm/iodev.h> #include "lapic.h" #ifdef CONFIG_KVM_IOAPIC #define PIC_NUM_PINS 16 #define SELECT_PIC(irq) \ ((irq) < 8 ? KVM_IRQCHIP_PIC_MASTER : KVM_IRQCHIP_PIC_SLAVE) struct kvm; struct kvm_vcpu; struct kvm_kpic_state { u8 last_irr; /* edge detection */ u8 irr; /* interrupt request register */ u8 imr; /* interrupt mask register */ u8 isr; /* interrupt service register */ u8 priority_add; /* highest irq priority */ u8 irq_base; u8 read_reg_select; u8 poll; u8 special_mask; u8 init_state; u8 auto_eoi; u8 rotate_on_auto_eoi; u8 special_fully_nested_mode; u8 init4; /* true if 4 byte init */ u8 elcr; /* PIIX edge/trigger selection */ u8 elcr_mask; u8 isr_ack; /* interrupt ack detection */ struct kvm_pic *pics_state; }; struct kvm_pic { spinlock_t lock; bool wakeup_needed; unsigned pending_acks; struct kvm *kvm; struct kvm_kpic_state pics[2]; /* 0 is master pic, 1 is slave pic */ int output; /* intr from master PIC */ struct kvm_io_device dev_master; struct kvm_io_device dev_slave; struct kvm_io_device dev_elcr; unsigned long irq_states[PIC_NUM_PINS]; }; int kvm_pic_init(struct kvm *kvm); void kvm_pic_destroy(struct kvm *kvm); int kvm_pic_read_irq(struct kvm *kvm); void kvm_pic_update_irq(struct kvm_pic *s); int kvm_pic_set_irq(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm, int irq_source_id, int level, bool line_status); int kvm_setup_default_ioapic_and_pic_routing(struct kvm *kvm); int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip); int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip); static inline int irqchip_full(struct kvm *kvm) { int mode = kvm->arch.irqchip_mode; /* Matches smp_wmb() when setting irqchip_mode */ smp_rmb(); return mode == KVM_IRQCHIP_KERNEL; } #else /* CONFIG_KVM_IOAPIC */ static __always_inline int irqchip_full(struct kvm *kvm) { return false; } #endif static inline int pic_in_kernel(struct kvm *kvm) { return irqchip_full(kvm); } static inline int irqchip_split(struct kvm *kvm) { int mode = kvm->arch.irqchip_mode; /* Matches smp_wmb() when setting irqchip_mode */ smp_rmb(); return mode == KVM_IRQCHIP_SPLIT; } static inline int irqchip_in_kernel(struct kvm *kvm) { int mode = kvm->arch.irqchip_mode; /* Matches smp_wmb() when setting irqchip_mode */ smp_rmb(); return mode != KVM_IRQCHIP_NONE; } void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu); void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu); void kvm_apic_nmi_wd_deliver(struct kvm_vcpu *vcpu); void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu); void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu); void __kvm_migrate_timers(struct kvm_vcpu *vcpu); int apic_has_pending_timer(struct kvm_vcpu *vcpu); #endif
1478 1478 1478 726 1477 1475 1480 1478 1479 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 // SPDX-License-Identifier: GPL-2.0 /* * jump label x86 support * * Copyright (C) 2009 Jason Baron <jbaron@redhat.com> * */ #include <linux/jump_label.h> #include <linux/memory.h> #include <linux/uaccess.h> #include <linux/module.h> #include <linux/list.h> #include <linux/jhash.h> #include <linux/cpu.h> #include <asm/kprobes.h> #include <asm/alternative.h> #include <asm/text-patching.h> #include <asm/insn.h> int arch_jump_entry_size(struct jump_entry *entry) { struct insn insn = {}; insn_decode_kernel(&insn, (void *)jump_entry_code(entry)); BUG_ON(insn.length != 2 && insn.length != 5); return insn.length; } struct jump_label_patch { const void *code; int size; }; static struct jump_label_patch __jump_label_patch(struct jump_entry *entry, enum jump_label_type type) { const void *expect, *code, *nop; const void *addr, *dest; int size; addr = (void *)jump_entry_code(entry); dest = (void *)jump_entry_target(entry); size = arch_jump_entry_size(entry); switch (size) { case JMP8_INSN_SIZE: code = text_gen_insn(JMP8_INSN_OPCODE, addr, dest); nop = x86_nops[size]; break; case JMP32_INSN_SIZE: code = text_gen_insn(JMP32_INSN_OPCODE, addr, dest); nop = x86_nops[size]; break; default: BUG(); } if (type == JUMP_LABEL_JMP) expect = nop; else expect = code; if (memcmp(addr, expect, size)) { /* * The location is not an op that we were expecting. * Something went wrong. Crash the box, as something could be * corrupting the kernel. */ pr_crit("jump_label: Fatal kernel bug, unexpected op at %pS [%p] (%5ph != %5ph)) size:%d type:%d\n", addr, addr, addr, expect, size, type); BUG(); } if (type == JUMP_LABEL_NOP) code = nop; return (struct jump_label_patch){.code = code, .size = size}; } static __always_inline void __jump_label_transform(struct jump_entry *entry, enum jump_label_type type, int init) { const struct jump_label_patch jlp = __jump_label_patch(entry, type); /* * As long as only a single processor is running and the code is still * not marked as RO, text_poke_early() can be used; Checking that * system_state is SYSTEM_BOOTING guarantees it. It will be set to * SYSTEM_SCHEDULING before other cores are awaken and before the * code is write-protected. * * At the time the change is being done, just ignore whether we * are doing nop -> jump or jump -> nop transition, and assume * always nop being the 'currently valid' instruction */ if (init || system_state == SYSTEM_BOOTING) { text_poke_early((void *)jump_entry_code(entry), jlp.code, jlp.size); return; } smp_text_poke_single((void *)jump_entry_code(entry), jlp.code, jlp.size, NULL); } static void __ref jump_label_transform(struct jump_entry *entry, enum jump_label_type type, int init) { mutex_lock(&text_mutex); __jump_label_transform(entry, type, init); mutex_unlock(&text_mutex); } void arch_jump_label_transform(struct jump_entry *entry, enum jump_label_type type) { jump_label_transform(entry, type, 0); } bool arch_jump_label_transform_queue(struct jump_entry *entry, enum jump_label_type type) { struct jump_label_patch jlp; if (system_state == SYSTEM_BOOTING) { /* * Fallback to the non-batching mode. */ arch_jump_label_transform(entry, type); return true; } mutex_lock(&text_mutex); jlp = __jump_label_patch(entry, type); smp_text_poke_batch_add((void *)jump_entry_code(entry), jlp.code, jlp.size, NULL); mutex_unlock(&text_mutex); return true; } void arch_jump_label_transform_apply(void) { mutex_lock(&text_mutex); smp_text_poke_batch_finish(); mutex_unlock(&text_mutex); }
2 1 1 2 12 4 11 7 11 11 6 10 9 8 2 5 2 14 1 13 1 9 3 1 1 12 1 1 2 10 16 16 16 2 15 5 3 2 10 1 8 1 9 20 2 18 19 9 9 1 8 8 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 /* Block- or MTD-based romfs * * Copyright © 2007 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) * * Derived from: ROMFS file system, Linux implementation * * Copyright © 1997-1999 Janos Farkas <chexum@shadow.banki.hu> * * Using parts of the minix filesystem * Copyright © 1991, 1992 Linus Torvalds * * and parts of the affs filesystem additionally * Copyright © 1993 Ray Burr * Copyright © 1996 Hans-Joachim Widmaier * * Changes * Changed for 2.1.19 modules * Jan 1997 Initial release * Jun 1997 2.1.43+ changes * Proper page locking in read_folio * Changed to work with 2.1.45+ fs * Jul 1997 Fixed follow_link * 2.1.47 * lookup shouldn't return -ENOENT * from Horst von Brand: * fail on wrong checksum * double unlock_super was possible * correct namelen for statfs * spotted by Bill Hawes: * readlink shouldn't iput() * Jun 1998 2.1.106 from Avery Pennarun: glibc scandir() * exposed a problem in readdir * 2.1.107 code-freeze spellchecker run * Aug 1998 2.1.118+ VFS changes * Sep 1998 2.1.122 another VFS change (follow_link) * Apr 1999 2.2.7 no more EBADF checking in * lookup/readdir, use ERR_PTR * Jun 1999 2.3.6 d_alloc_root use changed * 2.3.9 clean up usage of ENOENT/negative * dentries in lookup * clean up page flags setting * (error, uptodate, locking) in * in read_folio * use init_special_inode for * fifos/sockets (and streamline) in * read_inode, fix _ops table order * Aug 1999 2.3.16 __initfunc() => __init change * Oct 1999 2.3.24 page->owner hack obsoleted * Nov 1999 2.3.27 2.3.25+ page->offset => index change * * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public Licence * as published by the Free Software Foundation; either version * 2 of the Licence, or (at your option) any later version. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/string.h> #include <linux/fs.h> #include <linux/time.h> #include <linux/slab.h> #include <linux/init.h> #include <linux/blkdev.h> #include <linux/fs_context.h> #include <linux/mount.h> #include <linux/namei.h> #include <linux/statfs.h> #include <linux/mtd/super.h> #include <linux/ctype.h> #include <linux/highmem.h> #include <linux/pagemap.h> #include <linux/uaccess.h> #include <linux/major.h> #include "internal.h" static struct kmem_cache *romfs_inode_cachep; static const umode_t romfs_modemap[8] = { 0, /* hard link */ S_IFDIR | 0644, /* directory */ S_IFREG | 0644, /* regular file */ S_IFLNK | 0777, /* symlink */ S_IFBLK | 0600, /* blockdev */ S_IFCHR | 0600, /* chardev */ S_IFSOCK | 0644, /* socket */ S_IFIFO | 0644 /* FIFO */ }; static const unsigned char romfs_dtype_table[] = { DT_UNKNOWN, DT_DIR, DT_REG, DT_LNK, DT_BLK, DT_CHR, DT_SOCK, DT_FIFO }; static struct inode *romfs_iget(struct super_block *sb, unsigned long pos); /* * read a page worth of data from the image */ static int romfs_read_folio(struct file *file, struct folio *folio) { struct inode *inode = folio->mapping->host; loff_t offset, size; unsigned long fillsize, pos; void *buf; int ret; buf = kmap_local_folio(folio, 0); offset = folio_pos(folio); size = i_size_read(inode); fillsize = 0; ret = 0; if (offset < size) { size -= offset; fillsize = size > PAGE_SIZE ? PAGE_SIZE : size; pos = ROMFS_I(inode)->i_dataoffset + offset; ret = romfs_dev_read(inode->i_sb, pos, buf, fillsize); if (ret < 0) { fillsize = 0; ret = -EIO; } } buf = folio_zero_tail(folio, fillsize, buf + fillsize); kunmap_local(buf); folio_end_read(folio, ret == 0); return ret; } static const struct address_space_operations romfs_aops = { .read_folio = romfs_read_folio }; /* * read the entries from a directory */ static int romfs_readdir(struct file *file, struct dir_context *ctx) { struct inode *i = file_inode(file); struct romfs_inode ri; unsigned long offset, maxoff; int j, ino, nextfh; char fsname[ROMFS_MAXFN]; /* XXX dynamic? */ int ret; maxoff = romfs_maxsize(i->i_sb); offset = ctx->pos; if (!offset) { offset = i->i_ino & ROMFH_MASK; ret = romfs_dev_read(i->i_sb, offset, &ri, ROMFH_SIZE); if (ret < 0) goto out; offset = be32_to_cpu(ri.spec) & ROMFH_MASK; } /* Not really failsafe, but we are read-only... */ for (;;) { if (!offset || offset >= maxoff) { offset = maxoff; ctx->pos = offset; goto out; } ctx->pos = offset; /* Fetch inode info */ ret = romfs_dev_read(i->i_sb, offset, &ri, ROMFH_SIZE); if (ret < 0) goto out; j = romfs_dev_strnlen(i->i_sb, offset + ROMFH_SIZE, sizeof(fsname) - 1); if (j < 0) goto out; ret = romfs_dev_read(i->i_sb, offset + ROMFH_SIZE, fsname, j); if (ret < 0) goto out; fsname[j] = '\0'; ino = offset; nextfh = be32_to_cpu(ri.next); if ((nextfh & ROMFH_TYPE) == ROMFH_HRD) ino = be32_to_cpu(ri.spec); if (!dir_emit(ctx, fsname, j, ino, romfs_dtype_table[nextfh & ROMFH_TYPE])) goto out; offset = nextfh & ROMFH_MASK; } out: return 0; } /* * look up an entry in a directory */ static struct dentry *romfs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { unsigned long offset, maxoff; struct inode *inode = NULL; struct romfs_inode ri; const char *name; /* got from dentry */ int len, ret; offset = dir->i_ino & ROMFH_MASK; ret = romfs_dev_read(dir->i_sb, offset, &ri, ROMFH_SIZE); if (ret < 0) goto error; /* search all the file entries in the list starting from the one * pointed to by the directory's special data */ maxoff = romfs_maxsize(dir->i_sb); offset = be32_to_cpu(ri.spec) & ROMFH_MASK; name = dentry->d_name.name; len = dentry->d_name.len; for (;;) { if (!offset || offset >= maxoff) break; ret = romfs_dev_read(dir->i_sb, offset, &ri, sizeof(ri)); if (ret < 0) goto error; /* try to match the first 16 bytes of name */ ret = romfs_dev_strcmp(dir->i_sb, offset + ROMFH_SIZE, name, len); if (ret < 0) goto error; if (ret == 1) { /* Hard link handling */ if ((be32_to_cpu(ri.next) & ROMFH_TYPE) == ROMFH_HRD) offset = be32_to_cpu(ri.spec) & ROMFH_MASK; inode = romfs_iget(dir->i_sb, offset); break; } /* next entry */ offset = be32_to_cpu(ri.next) & ROMFH_MASK; } return d_splice_alias(inode, dentry); error: return ERR_PTR(ret); } static const struct file_operations romfs_dir_operations = { .read = generic_read_dir, .iterate_shared = romfs_readdir, .llseek = generic_file_llseek, }; static const struct inode_operations romfs_dir_inode_operations = { .lookup = romfs_lookup, }; /* * get a romfs inode based on its position in the image (which doubles as the * inode number) */ static struct inode *romfs_iget(struct super_block *sb, unsigned long pos) { struct romfs_inode_info *inode; struct romfs_inode ri; struct inode *i; unsigned long nlen; unsigned nextfh; int ret; umode_t mode; /* we might have to traverse a chain of "hard link" file entries to get * to the actual file */ for (;;) { ret = romfs_dev_read(sb, pos, &ri, sizeof(ri)); if (ret < 0) goto error; /* XXX: do romfs_checksum here too (with name) */ nextfh = be32_to_cpu(ri.next); if ((nextfh & ROMFH_TYPE) != ROMFH_HRD) break; pos = be32_to_cpu(ri.spec) & ROMFH_MASK; } /* determine the length of the filename */ nlen = romfs_dev_strnlen(sb, pos + ROMFH_SIZE, ROMFS_MAXFN); if (IS_ERR_VALUE(nlen)) goto eio; /* get an inode for this image position */ i = iget_locked(sb, pos); if (!i) return ERR_PTR(-ENOMEM); if (!(inode_state_read_once(i) & I_NEW)) return i; /* precalculate the data offset */ inode = ROMFS_I(i); inode->i_metasize = (ROMFH_SIZE + nlen + 1 + ROMFH_PAD) & ROMFH_MASK; inode->i_dataoffset = pos + inode->i_metasize; set_nlink(i, 1); /* Hard to decide.. */ i->i_size = be32_to_cpu(ri.size); inode_set_mtime_to_ts(i, inode_set_atime_to_ts(i, inode_set_ctime(i, 0, 0))); /* set up mode and ops */ mode = romfs_modemap[nextfh & ROMFH_TYPE]; switch (nextfh & ROMFH_TYPE) { case ROMFH_DIR: i->i_size = ROMFS_I(i)->i_metasize; i->i_op = &romfs_dir_inode_operations; i->i_fop = &romfs_dir_operations; if (nextfh & ROMFH_EXEC) mode |= S_IXUGO; break; case ROMFH_REG: i->i_fop = &romfs_ro_fops; i->i_data.a_ops = &romfs_aops; if (nextfh & ROMFH_EXEC) mode |= S_IXUGO; break; case ROMFH_SYM: i->i_op = &page_symlink_inode_operations; inode_nohighmem(i); i->i_data.a_ops = &romfs_aops; mode |= S_IRWXUGO; break; default: /* depending on MBZ for sock/fifos */ nextfh = be32_to_cpu(ri.spec); init_special_inode(i, mode, MKDEV(nextfh >> 16, nextfh & 0xffff)); break; } i->i_mode = mode; i->i_blocks = (i->i_size + 511) >> 9; unlock_new_inode(i); return i; eio: ret = -EIO; error: pr_err("read error for inode 0x%lx\n", pos); return ERR_PTR(ret); } /* * allocate a new inode */ static struct inode *romfs_alloc_inode(struct super_block *sb) { struct romfs_inode_info *inode; inode = alloc_inode_sb(sb, romfs_inode_cachep, GFP_KERNEL); return inode ? &inode->vfs_inode : NULL; } /* * return a spent inode to the slab cache */ static void romfs_free_inode(struct inode *inode) { kmem_cache_free(romfs_inode_cachep, ROMFS_I(inode)); } /* * get filesystem statistics */ static int romfs_statfs(struct dentry *dentry, struct kstatfs *buf) { struct super_block *sb = dentry->d_sb; u64 id = 0; /* When calling huge_encode_dev(), * use sb->s_bdev->bd_dev when, * - CONFIG_ROMFS_ON_BLOCK defined * use sb->s_dev when, * - CONFIG_ROMFS_ON_BLOCK undefined and * - CONFIG_ROMFS_ON_MTD defined * leave id as 0 when, * - CONFIG_ROMFS_ON_BLOCK undefined and * - CONFIG_ROMFS_ON_MTD undefined */ if (sb->s_bdev) id = huge_encode_dev(sb->s_bdev->bd_dev); else if (sb->s_dev) id = huge_encode_dev(sb->s_dev); buf->f_type = ROMFS_MAGIC; buf->f_namelen = ROMFS_MAXFN; buf->f_bsize = ROMBSIZE; buf->f_bfree = buf->f_bavail = buf->f_ffree; buf->f_blocks = (romfs_maxsize(dentry->d_sb) + ROMBSIZE - 1) >> ROMBSBITS; buf->f_fsid = u64_to_fsid(id); return 0; } /* * remounting must involve read-only */ static int romfs_reconfigure(struct fs_context *fc) { sync_filesystem(fc->root->d_sb); fc->sb_flags |= SB_RDONLY; return 0; } static const struct super_operations romfs_super_ops = { .alloc_inode = romfs_alloc_inode, .free_inode = romfs_free_inode, .statfs = romfs_statfs, }; /* * checksum check on part of a romfs filesystem */ static __u32 romfs_checksum(const void *data, int size) { const __be32 *ptr = data; __u32 sum; sum = 0; size >>= 2; while (size > 0) { sum += be32_to_cpu(*ptr++); size--; } return sum; } /* * fill in the superblock */ static int romfs_fill_super(struct super_block *sb, struct fs_context *fc) { struct romfs_super_block *rsb; struct inode *root; unsigned long pos, img_size; const char *storage; size_t len; int ret; #ifdef CONFIG_BLOCK if (!sb->s_mtd) { if (!sb_set_blocksize(sb, ROMBSIZE)) { errorf(fc, "romfs: unable to set blocksize\n"); return -EINVAL; } } else { sb->s_blocksize = ROMBSIZE; sb->s_blocksize_bits = blksize_bits(ROMBSIZE); } #endif sb->s_maxbytes = 0xFFFFFFFF; sb->s_magic = ROMFS_MAGIC; sb->s_flags |= SB_RDONLY | SB_NOATIME; sb->s_time_min = 0; sb->s_time_max = 0; sb->s_op = &romfs_super_ops; #ifdef CONFIG_ROMFS_ON_MTD /* Use same dev ID from the underlying mtdblock device */ if (sb->s_mtd) sb->s_dev = MKDEV(MTD_BLOCK_MAJOR, sb->s_mtd->index); #endif /* read the image superblock and check it */ rsb = kmalloc(512, GFP_KERNEL); if (!rsb) return -ENOMEM; sb->s_fs_info = (void *) 512; ret = romfs_dev_read(sb, 0, rsb, 512); if (ret < 0) goto error_rsb; img_size = be32_to_cpu(rsb->size); if (sb->s_mtd && img_size > sb->s_mtd->size) goto error_rsb_inval; sb->s_fs_info = (void *) img_size; if (rsb->word0 != ROMSB_WORD0 || rsb->word1 != ROMSB_WORD1 || img_size < ROMFH_SIZE) { if (!(fc->sb_flags & SB_SILENT)) errorf(fc, "VFS: Can't find a romfs filesystem on dev %s.\n", sb->s_id); goto error_rsb_inval; } if (romfs_checksum(rsb, min_t(size_t, img_size, 512))) { pr_err("bad initial checksum on dev %s.\n", sb->s_id); goto error_rsb_inval; } storage = sb->s_mtd ? "MTD" : "the block layer"; len = strnlen(rsb->name, ROMFS_MAXFN); if (!(fc->sb_flags & SB_SILENT)) pr_notice("Mounting image '%*.*s' through %s\n", (unsigned) len, (unsigned) len, rsb->name, storage); kfree(rsb); rsb = NULL; /* find the root directory */ pos = (ROMFH_SIZE + len + 1 + ROMFH_PAD) & ROMFH_MASK; root = romfs_iget(sb, pos); if (IS_ERR(root)) return PTR_ERR(root); sb->s_root = d_make_root(root); if (!sb->s_root) return -ENOMEM; return 0; error_rsb_inval: ret = -EINVAL; error_rsb: kfree(rsb); return ret; } /* * get a superblock for mounting */ static int romfs_get_tree(struct fs_context *fc) { int ret = -EINVAL; #ifdef CONFIG_ROMFS_ON_MTD ret = get_tree_mtd(fc, romfs_fill_super); #endif #ifdef CONFIG_ROMFS_ON_BLOCK if (ret == -EINVAL) ret = get_tree_bdev(fc, romfs_fill_super); #endif return ret; } static const struct fs_context_operations romfs_context_ops = { .get_tree = romfs_get_tree, .reconfigure = romfs_reconfigure, }; /* * Set up the filesystem mount context. */ static int romfs_init_fs_context(struct fs_context *fc) { fc->ops = &romfs_context_ops; return 0; } /* * destroy a romfs superblock in the appropriate manner */ static void romfs_kill_sb(struct super_block *sb) { generic_shutdown_super(sb); #ifdef CONFIG_ROMFS_ON_MTD if (sb->s_mtd) { put_mtd_device(sb->s_mtd); sb->s_mtd = NULL; } #endif #ifdef CONFIG_ROMFS_ON_BLOCK if (sb->s_bdev) { sync_blockdev(sb->s_bdev); bdev_fput(sb->s_bdev_file); } #endif } static struct file_system_type romfs_fs_type = { .owner = THIS_MODULE, .name = "romfs", .init_fs_context = romfs_init_fs_context, .kill_sb = romfs_kill_sb, .fs_flags = FS_REQUIRES_DEV, }; MODULE_ALIAS_FS("romfs"); /* * inode storage initialiser */ static void romfs_i_init_once(void *_inode) { struct romfs_inode_info *inode = _inode; inode_init_once(&inode->vfs_inode); } /* * romfs module initialisation */ static int __init init_romfs_fs(void) { int ret; pr_info("ROMFS MTD (C) 2007 Red Hat, Inc.\n"); romfs_inode_cachep = kmem_cache_create("romfs_i", sizeof(struct romfs_inode_info), 0, SLAB_RECLAIM_ACCOUNT | SLAB_ACCOUNT, romfs_i_init_once); if (!romfs_inode_cachep) { pr_err("Failed to initialise inode cache\n"); return -ENOMEM; } ret = register_filesystem(&romfs_fs_type); if (ret) { pr_err("Failed to register filesystem\n"); goto error_register; } return 0; error_register: kmem_cache_destroy(romfs_inode_cachep); return ret; } /* * romfs module removal */ static void __exit exit_romfs_fs(void) { unregister_filesystem(&romfs_fs_type); /* * Make sure all delayed rcu free inodes are flushed before we * destroy cache. */ rcu_barrier(); kmem_cache_destroy(romfs_inode_cachep); } module_init(init_romfs_fs); module_exit(exit_romfs_fs); MODULE_DESCRIPTION("Direct-MTD Capable RomFS"); MODULE_AUTHOR("Red Hat, Inc."); MODULE_LICENSE("GPL"); /* Actually dual-licensed, but it doesn't matter for */
333 60 315 228 201 201 200 145 28 5 28 1 6 1 398 399 179 331 334 395 106 9 20 5 22 2 1 5 7 11 462 478 16 471 4 16 2 7 21 462 436 25 32 17 25 9 32 8 446 357 25 435 2 131 298 155 436 438 133 357 355 33 429 199 166 6 193 1 4 7 30 156 41 193 163 30 189 4 61 104 20 165 5 165 137 65 6 6 4 151 1 746 2 135 610 8 8 611 1 13 610 611 445 610 430 162 169 27 517 105 151 408 373 188 546 548 151 103 48 1 220 355 613 521 15 5 139 9 138 70 74 278 3 1 32 10 117 2 1 122 1 112 8 3 12 106 3 102 9 116 159 159 126 5 163 3 151 123 12 266 14 112 182 180 33 4 33 33 6 6 3 16 4 8 16 3 6 36 3 32 32 89 1 8 51 33 8 6 42 28 80 299 1 1 12 8 291 285 45 16 3 62 57 8 2 1 2 31 2 3 1 4 16 6 21 7 21 12 16 1 22 18 5 22 6 23 44 8 4 28 3 29 23 23 52 2 1 4 3 47 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 // SPDX-License-Identifier: GPL-2.0-only /* * "splice": joining two ropes together by interweaving their strands. * * This is the "extended pipe" functionality, where a pipe is used as * an arbitrary in-memory buffer. Think of a pipe as a small kernel * buffer that you can use to transfer data from one end to the other. * * The traditional unix read/write is extended with a "splice()" operation * that transfers data buffers to or from a pipe buffer. * * Named by Larry McVoy, original implementation from Linus, extended by * Jens to support splicing to files, network, direct splicing, etc and * fixing lots of bugs. * * Copyright (C) 2005-2006 Jens Axboe <axboe@kernel.dk> * Copyright (C) 2005-2006 Linus Torvalds <torvalds@osdl.org> * Copyright (C) 2006 Ingo Molnar <mingo@elte.hu> * */ #include <linux/bvec.h> #include <linux/fs.h> #include <linux/file.h> #include <linux/pagemap.h> #include <linux/splice.h> #include <linux/memcontrol.h> #include <linux/mm_inline.h> #include <linux/swap.h> #include <linux/writeback.h> #include <linux/export.h> #include <linux/syscalls.h> #include <linux/uio.h> #include <linux/fsnotify.h> #include <linux/security.h> #include <linux/gfp.h> #include <linux/net.h> #include <linux/socket.h> #include <linux/sched/signal.h> #include "internal.h" /* * Splice doesn't support FMODE_NOWAIT. Since pipes may set this flag to * indicate they support non-blocking reads or writes, we must clear it * here if set to avoid blocking other users of this pipe if splice is * being done on it. */ static noinline void pipe_clear_nowait(struct file *file) { fmode_t fmode = READ_ONCE(file->f_mode); do { if (!(fmode & FMODE_NOWAIT)) break; } while (!try_cmpxchg(&file->f_mode, &fmode, fmode & ~FMODE_NOWAIT)); } /* * Attempt to steal a page from a pipe buffer. This should perhaps go into * a vm helper function, it's already simplified quite a bit by the * addition of remove_mapping(). If success is returned, the caller may * attempt to reuse this page for another destination. */ static bool page_cache_pipe_buf_try_steal(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { struct folio *folio = page_folio(buf->page); struct address_space *mapping; folio_lock(folio); mapping = folio_mapping(folio); if (mapping) { WARN_ON(!folio_test_uptodate(folio)); /* * At least for ext2 with nobh option, we need to wait on * writeback completing on this folio, since we'll remove it * from the pagecache. Otherwise truncate wont wait on the * folio, allowing the disk blocks to be reused by someone else * before we actually wrote our data to them. fs corruption * ensues. */ folio_wait_writeback(folio); if (!filemap_release_folio(folio, GFP_KERNEL)) goto out_unlock; /* * If we succeeded in removing the mapping, set LRU flag * and return good. */ if (remove_mapping(mapping, folio)) { buf->flags |= PIPE_BUF_FLAG_LRU; return true; } } /* * Raced with truncate or failed to remove folio from current * address space, unlock and return failure. */ out_unlock: folio_unlock(folio); return false; } static void page_cache_pipe_buf_release(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { put_page(buf->page); buf->flags &= ~PIPE_BUF_FLAG_LRU; } /* * Check whether the contents of buf is OK to access. Since the content * is a page cache page, IO may be in flight. */ static int page_cache_pipe_buf_confirm(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { struct folio *folio = page_folio(buf->page); int err; if (!folio_test_uptodate(folio)) { folio_lock(folio); /* * Folio got truncated/unhashed. This will cause a 0-byte * splice, if this is the first page. */ if (!folio->mapping) { err = -ENODATA; goto error; } /* * Uh oh, read-error from disk. */ if (!folio_test_uptodate(folio)) { err = -EIO; goto error; } /* Folio is ok after all, we are done */ folio_unlock(folio); } return 0; error: folio_unlock(folio); return err; } const struct pipe_buf_operations page_cache_pipe_buf_ops = { .confirm = page_cache_pipe_buf_confirm, .release = page_cache_pipe_buf_release, .try_steal = page_cache_pipe_buf_try_steal, .get = generic_pipe_buf_get, }; static bool user_page_pipe_buf_try_steal(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { if (!(buf->flags & PIPE_BUF_FLAG_GIFT)) return false; buf->flags |= PIPE_BUF_FLAG_LRU; return generic_pipe_buf_try_steal(pipe, buf); } static const struct pipe_buf_operations user_page_pipe_buf_ops = { .release = page_cache_pipe_buf_release, .try_steal = user_page_pipe_buf_try_steal, .get = generic_pipe_buf_get, }; static void wakeup_pipe_readers(struct pipe_inode_info *pipe) { smp_mb(); if (waitqueue_active(&pipe->rd_wait)) wake_up_interruptible(&pipe->rd_wait); kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); } /** * splice_to_pipe - fill passed data into a pipe * @pipe: pipe to fill * @spd: data to fill * * Description: * @spd contains a map of pages and len/offset tuples, along with * the struct pipe_buf_operations associated with these pages. This * function will link that data to the pipe. * */ ssize_t splice_to_pipe(struct pipe_inode_info *pipe, struct splice_pipe_desc *spd) { unsigned int spd_pages = spd->nr_pages; unsigned int tail = pipe->tail; unsigned int head = pipe->head; ssize_t ret = 0; int page_nr = 0; if (!spd_pages) return 0; if (unlikely(!pipe->readers)) { send_sig(SIGPIPE, current, 0); ret = -EPIPE; goto out; } while (!pipe_full(head, tail, pipe->max_usage)) { struct pipe_buffer *buf = pipe_buf(pipe, head); buf->page = spd->pages[page_nr]; buf->offset = spd->partial[page_nr].offset; buf->len = spd->partial[page_nr].len; buf->private = spd->partial[page_nr].private; buf->ops = spd->ops; buf->flags = 0; head++; pipe->head = head; page_nr++; ret += buf->len; if (!--spd->nr_pages) break; } if (!ret) ret = -EAGAIN; out: while (page_nr < spd_pages) spd->spd_release(spd, page_nr++); return ret; } EXPORT_SYMBOL_GPL(splice_to_pipe); ssize_t add_to_pipe(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { unsigned int head = pipe->head; unsigned int tail = pipe->tail; int ret; if (unlikely(!pipe->readers)) { send_sig(SIGPIPE, current, 0); ret = -EPIPE; } else if (pipe_full(head, tail, pipe->max_usage)) { ret = -EAGAIN; } else { *pipe_buf(pipe, head) = *buf; pipe->head = head + 1; return buf->len; } pipe_buf_release(pipe, buf); return ret; } EXPORT_SYMBOL(add_to_pipe); /* * Check if we need to grow the arrays holding pages and partial page * descriptions. */ int splice_grow_spd(const struct pipe_inode_info *pipe, struct splice_pipe_desc *spd) { unsigned int max_usage = READ_ONCE(pipe->max_usage); spd->nr_pages_max = max_usage; if (max_usage <= PIPE_DEF_BUFFERS) return 0; spd->pages = kmalloc_objs(struct page *, max_usage); spd->partial = kmalloc_objs(struct partial_page, max_usage); if (spd->pages && spd->partial) return 0; kfree(spd->pages); kfree(spd->partial); return -ENOMEM; } void splice_shrink_spd(struct splice_pipe_desc *spd) { if (spd->nr_pages_max <= PIPE_DEF_BUFFERS) return; kfree(spd->pages); kfree(spd->partial); } /** * copy_splice_read - Copy data from a file and splice the copy into a pipe * @in: The file to read from * @ppos: Pointer to the file position to read from * @pipe: The pipe to splice into * @len: The amount to splice * @flags: The SPLICE_F_* flags * * This function allocates a bunch of pages sufficient to hold the requested * amount of data (but limited by the remaining pipe capacity), passes it to * the file's ->read_iter() to read into and then splices the used pages into * the pipe. * * Return: On success, the number of bytes read will be returned and *@ppos * will be updated if appropriate; 0 will be returned if there is no more data * to be read; -EAGAIN will be returned if the pipe had no space, and some * other negative error code will be returned on error. A short read may occur * if the pipe has insufficient space, we reach the end of the data or we hit a * hole. */ ssize_t copy_splice_read(struct file *in, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags) { struct iov_iter to; struct bio_vec *bv; struct kiocb kiocb; struct page **pages; ssize_t ret; size_t used, npages, chunk, remain, keep = 0; int i; /* Work out how much data we can actually add into the pipe */ used = pipe_buf_usage(pipe); npages = max_t(ssize_t, pipe->max_usage - used, 0); len = min_t(size_t, len, npages * PAGE_SIZE); npages = DIV_ROUND_UP(len, PAGE_SIZE); bv = kzalloc(array_size(npages, sizeof(bv[0])) + array_size(npages, sizeof(struct page *)), GFP_KERNEL); if (!bv) return -ENOMEM; pages = (struct page **)(bv + npages); npages = alloc_pages_bulk(GFP_USER, npages, pages); if (!npages) { kfree(bv); return -ENOMEM; } remain = len = min_t(size_t, len, npages * PAGE_SIZE); for (i = 0; i < npages; i++) { chunk = min_t(size_t, PAGE_SIZE, remain); bv[i].bv_page = pages[i]; bv[i].bv_offset = 0; bv[i].bv_len = chunk; remain -= chunk; } /* Do the I/O */ iov_iter_bvec(&to, ITER_DEST, bv, npages, len); init_sync_kiocb(&kiocb, in); kiocb.ki_pos = *ppos; ret = in->f_op->read_iter(&kiocb, &to); if (ret > 0) { keep = DIV_ROUND_UP(ret, PAGE_SIZE); *ppos = kiocb.ki_pos; } /* * Callers of ->splice_read() expect -EAGAIN on "can't put anything in * there", rather than -EFAULT. */ if (ret == -EFAULT) ret = -EAGAIN; /* Free any pages that didn't get touched at all. */ if (keep < npages) release_pages(pages + keep, npages - keep); /* Push the remaining pages into the pipe. */ remain = ret; for (i = 0; i < keep; i++) { struct pipe_buffer *buf = pipe_head_buf(pipe); chunk = min_t(size_t, remain, PAGE_SIZE); *buf = (struct pipe_buffer) { .ops = &default_pipe_buf_ops, .page = bv[i].bv_page, .offset = 0, .len = chunk, }; pipe->head++; remain -= chunk; } kfree(bv); return ret; } EXPORT_SYMBOL(copy_splice_read); const struct pipe_buf_operations default_pipe_buf_ops = { .release = generic_pipe_buf_release, .try_steal = generic_pipe_buf_try_steal, .get = generic_pipe_buf_get, }; /* Pipe buffer operations for a socket and similar. */ const struct pipe_buf_operations nosteal_pipe_buf_ops = { .release = generic_pipe_buf_release, .get = generic_pipe_buf_get, }; EXPORT_SYMBOL(nosteal_pipe_buf_ops); static void wakeup_pipe_writers(struct pipe_inode_info *pipe) { smp_mb(); if (waitqueue_active(&pipe->wr_wait)) wake_up_interruptible(&pipe->wr_wait); kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); } /** * splice_from_pipe_feed - feed available data from a pipe to a file * @pipe: pipe to splice from * @sd: information to @actor * @actor: handler that splices the data * * Description: * This function loops over the pipe and calls @actor to do the * actual moving of a single struct pipe_buffer to the desired * destination. It returns when there's no more buffers left in * the pipe or if the requested number of bytes (@sd->total_len) * have been copied. It returns a positive number (one) if the * pipe needs to be filled with more data, zero if the required * number of bytes have been copied and -errno on error. * * This, together with splice_from_pipe_{begin,end,next}, may be * used to implement the functionality of __splice_from_pipe() when * locking is required around copying the pipe buffers to the * destination. */ static int splice_from_pipe_feed(struct pipe_inode_info *pipe, struct splice_desc *sd, splice_actor *actor) { unsigned int head = pipe->head; unsigned int tail = pipe->tail; int ret; while (!pipe_empty(head, tail)) { struct pipe_buffer *buf = pipe_buf(pipe, tail); sd->len = buf->len; if (sd->len > sd->total_len) sd->len = sd->total_len; ret = pipe_buf_confirm(pipe, buf); if (unlikely(ret)) { if (ret == -ENODATA) ret = 0; return ret; } ret = actor(pipe, buf, sd); if (ret <= 0) return ret; buf->offset += ret; buf->len -= ret; sd->num_spliced += ret; sd->len -= ret; sd->pos += ret; sd->total_len -= ret; if (!buf->len) { pipe_buf_release(pipe, buf); tail++; pipe->tail = tail; if (pipe->files) sd->need_wakeup = true; } if (!sd->total_len) return 0; } return 1; } /* We know we have a pipe buffer, but maybe it's empty? */ static inline bool eat_empty_buffer(struct pipe_inode_info *pipe) { unsigned int tail = pipe->tail; struct pipe_buffer *buf = pipe_buf(pipe, tail); if (unlikely(!buf->len)) { pipe_buf_release(pipe, buf); pipe->tail = tail+1; return true; } return false; } /** * splice_from_pipe_next - wait for some data to splice from * @pipe: pipe to splice from * @sd: information about the splice operation * * Description: * This function will wait for some data and return a positive * value (one) if pipe buffers are available. It will return zero * or -errno if no more data needs to be spliced. */ static int splice_from_pipe_next(struct pipe_inode_info *pipe, struct splice_desc *sd) { /* * Check for signal early to make process killable when there are * always buffers available */ if (signal_pending(current)) return -ERESTARTSYS; repeat: while (pipe_is_empty(pipe)) { if (!pipe->writers) return 0; if (sd->num_spliced) return 0; if (sd->flags & SPLICE_F_NONBLOCK) return -EAGAIN; if (signal_pending(current)) return -ERESTARTSYS; if (sd->need_wakeup) { wakeup_pipe_writers(pipe); sd->need_wakeup = false; } pipe_wait_readable(pipe); } if (eat_empty_buffer(pipe)) goto repeat; return 1; } /** * splice_from_pipe_begin - start splicing from pipe * @sd: information about the splice operation * * Description: * This function should be called before a loop containing * splice_from_pipe_next() and splice_from_pipe_feed() to * initialize the necessary fields of @sd. */ static void splice_from_pipe_begin(struct splice_desc *sd) { sd->num_spliced = 0; sd->need_wakeup = false; } /** * splice_from_pipe_end - finish splicing from pipe * @pipe: pipe to splice from * @sd: information about the splice operation * * Description: * This function will wake up pipe writers if necessary. It should * be called after a loop containing splice_from_pipe_next() and * splice_from_pipe_feed(). */ static void splice_from_pipe_end(struct pipe_inode_info *pipe, struct splice_desc *sd) { if (sd->need_wakeup) wakeup_pipe_writers(pipe); } /** * __splice_from_pipe - splice data from a pipe to given actor * @pipe: pipe to splice from * @sd: information to @actor * @actor: handler that splices the data * * Description: * This function does little more than loop over the pipe and call * @actor to do the actual moving of a single struct pipe_buffer to * the desired destination. See pipe_to_file, pipe_to_sendmsg, or * pipe_to_user. * */ ssize_t __splice_from_pipe(struct pipe_inode_info *pipe, struct splice_desc *sd, splice_actor *actor) { int ret; splice_from_pipe_begin(sd); do { cond_resched(); ret = splice_from_pipe_next(pipe, sd); if (ret > 0) ret = splice_from_pipe_feed(pipe, sd, actor); } while (ret > 0); splice_from_pipe_end(pipe, sd); return sd->num_spliced ? sd->num_spliced : ret; } EXPORT_SYMBOL(__splice_from_pipe); /** * splice_from_pipe - splice data from a pipe to a file * @pipe: pipe to splice from * @out: file to splice to * @ppos: position in @out * @len: how many bytes to splice * @flags: splice modifier flags * @actor: handler that splices the data * * Description: * See __splice_from_pipe. This function locks the pipe inode, * otherwise it's identical to __splice_from_pipe(). * */ ssize_t splice_from_pipe(struct pipe_inode_info *pipe, struct file *out, loff_t *ppos, size_t len, unsigned int flags, splice_actor *actor) { ssize_t ret; struct splice_desc sd = { .total_len = len, .flags = flags, .pos = *ppos, .u.file = out, }; pipe_lock(pipe); ret = __splice_from_pipe(pipe, &sd, actor); pipe_unlock(pipe); return ret; } /** * iter_file_splice_write - splice data from a pipe to a file * @pipe: pipe info * @out: file to write to * @ppos: position in @out * @len: number of bytes to splice * @flags: splice modifier flags * * Description: * Will either move or copy pages (determined by @flags options) from * the given pipe inode to the given file. * This one is ->write_iter-based. * */ ssize_t iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out, loff_t *ppos, size_t len, unsigned int flags) { struct splice_desc sd = { .total_len = len, .flags = flags, .pos = *ppos, .u.file = out, }; int nbufs = pipe->max_usage; struct bio_vec *array; ssize_t ret; if (!out->f_op->write_iter) return -EINVAL; array = kzalloc_objs(struct bio_vec, nbufs); if (unlikely(!array)) return -ENOMEM; pipe_lock(pipe); splice_from_pipe_begin(&sd); while (sd.total_len) { struct kiocb kiocb; struct iov_iter from; unsigned int head, tail; size_t left; int n; ret = splice_from_pipe_next(pipe, &sd); if (ret <= 0) break; if (unlikely(nbufs < pipe->max_usage)) { kfree(array); nbufs = pipe->max_usage; array = kzalloc_objs(struct bio_vec, nbufs); if (!array) { ret = -ENOMEM; break; } } head = pipe->head; tail = pipe->tail; /* build the vector */ left = sd.total_len; for (n = 0; !pipe_empty(head, tail) && left && n < nbufs; tail++) { struct pipe_buffer *buf = pipe_buf(pipe, tail); size_t this_len = buf->len; /* zero-length bvecs are not supported, skip them */ if (!this_len) continue; this_len = min(this_len, left); ret = pipe_buf_confirm(pipe, buf); if (unlikely(ret)) { if (ret == -ENODATA) ret = 0; goto done; } bvec_set_page(&array[n], buf->page, this_len, buf->offset); left -= this_len; n++; } iov_iter_bvec(&from, ITER_SOURCE, array, n, sd.total_len - left); init_sync_kiocb(&kiocb, out); kiocb.ki_pos = sd.pos; ret = out->f_op->write_iter(&kiocb, &from); sd.pos = kiocb.ki_pos; if (ret <= 0) break; WARN_ONCE(ret > sd.total_len - left, "Splice Exceeded! ret=%zd tot=%zu left=%zu\n", ret, sd.total_len, left); sd.num_spliced += ret; sd.total_len -= ret; *ppos = sd.pos; /* dismiss the fully eaten buffers, adjust the partial one */ tail = pipe->tail; while (ret) { struct pipe_buffer *buf = pipe_buf(pipe, tail); if (ret >= buf->len) { ret -= buf->len; buf->len = 0; pipe_buf_release(pipe, buf); tail++; pipe->tail = tail; if (pipe->files) sd.need_wakeup = true; } else { buf->offset += ret; buf->len -= ret; ret = 0; } } } done: kfree(array); splice_from_pipe_end(pipe, &sd); pipe_unlock(pipe); if (sd.num_spliced) ret = sd.num_spliced; return ret; } EXPORT_SYMBOL(iter_file_splice_write); #ifdef CONFIG_NET /** * splice_to_socket - splice data from a pipe to a socket * @pipe: pipe to splice from * @out: socket to write to * @ppos: position in @out * @len: number of bytes to splice * @flags: splice modifier flags * * Description: * Will send @len bytes from the pipe to a network socket. No data copying * is involved. * */ ssize_t splice_to_socket(struct pipe_inode_info *pipe, struct file *out, loff_t *ppos, size_t len, unsigned int flags) { struct socket *sock = sock_from_file(out); struct bio_vec bvec[16]; struct msghdr msg = {}; ssize_t ret = 0; size_t spliced = 0; bool need_wakeup = false; pipe_lock(pipe); while (len > 0) { unsigned int head, tail, bc = 0; size_t remain = len; /* * Check for signal early to make process killable when there * are always buffers available */ ret = -ERESTARTSYS; if (signal_pending(current)) break; while (pipe_is_empty(pipe)) { ret = 0; if (!pipe->writers) goto out; if (spliced) goto out; ret = -EAGAIN; if (flags & SPLICE_F_NONBLOCK) goto out; ret = -ERESTARTSYS; if (signal_pending(current)) goto out; if (need_wakeup) { wakeup_pipe_writers(pipe); need_wakeup = false; } pipe_wait_readable(pipe); } head = pipe->head; tail = pipe->tail; while (!pipe_empty(head, tail)) { struct pipe_buffer *buf = pipe_buf(pipe, tail); size_t seg; if (!buf->len) { tail++; continue; } seg = min_t(size_t, remain, buf->len); ret = pipe_buf_confirm(pipe, buf); if (unlikely(ret)) { if (ret == -ENODATA) ret = 0; break; } bvec_set_page(&bvec[bc++], buf->page, seg, buf->offset); remain -= seg; if (remain == 0 || bc >= ARRAY_SIZE(bvec)) break; tail++; } if (!bc) break; msg.msg_flags = MSG_SPLICE_PAGES; if (flags & SPLICE_F_MORE) msg.msg_flags |= MSG_MORE; if (remain && pipe_occupancy(pipe->head, tail) > 0) msg.msg_flags |= MSG_MORE; if (out->f_flags & O_NONBLOCK) msg.msg_flags |= MSG_DONTWAIT; iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, bvec, bc, len - remain); ret = sock_sendmsg(sock, &msg); if (ret <= 0) break; spliced += ret; len -= ret; tail = pipe->tail; while (ret > 0) { struct pipe_buffer *buf = pipe_buf(pipe, tail); size_t seg = min_t(size_t, ret, buf->len); buf->offset += seg; buf->len -= seg; ret -= seg; if (!buf->len) { pipe_buf_release(pipe, buf); tail++; } } if (tail != pipe->tail) { pipe->tail = tail; if (pipe->files) need_wakeup = true; } } out: pipe_unlock(pipe); if (need_wakeup) wakeup_pipe_writers(pipe); return spliced ?: ret; } #endif static int warn_unsupported(struct file *file, const char *op) { pr_debug_ratelimited( "splice %s not supported for file %pD4 (pid: %d comm: %.20s)\n", op, file, current->pid, current->comm); return -EINVAL; } /* * Attempt to initiate a splice from pipe to file. */ static ssize_t do_splice_from(struct pipe_inode_info *pipe, struct file *out, loff_t *ppos, size_t len, unsigned int flags) { if (unlikely(!out->f_op->splice_write)) return warn_unsupported(out, "write"); return out->f_op->splice_write(pipe, out, ppos, len, flags); } /* * Indicate to the caller that there was a premature EOF when reading from the * source and the caller didn't indicate they would be sending more data after * this. */ static void do_splice_eof(struct splice_desc *sd) { if (sd->splice_eof) sd->splice_eof(sd); } /* * Callers already called rw_verify_area() on the entire range. * No need to call it for sub ranges. */ static ssize_t do_splice_read(struct file *in, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags) { unsigned int p_space; if (unlikely(!(in->f_mode & FMODE_READ))) return -EBADF; if (!len) return 0; /* Don't try to read more the pipe has space for. */ p_space = pipe->max_usage - pipe_buf_usage(pipe); len = min_t(size_t, len, p_space << PAGE_SHIFT); if (unlikely(len > MAX_RW_COUNT)) len = MAX_RW_COUNT; if (unlikely(!in->f_op->splice_read)) return warn_unsupported(in, "read"); /* * O_DIRECT and DAX don't deal with the pagecache, so we allocate a * buffer, copy into it and splice that into the pipe. */ if ((in->f_flags & O_DIRECT) || IS_DAX(in->f_mapping->host)) return copy_splice_read(in, ppos, pipe, len, flags); return in->f_op->splice_read(in, ppos, pipe, len, flags); } /** * vfs_splice_read - Read data from a file and splice it into a pipe * @in: File to splice from * @ppos: Input file offset * @pipe: Pipe to splice to * @len: Number of bytes to splice * @flags: Splice modifier flags (SPLICE_F_*) * * Splice the requested amount of data from the input file to the pipe. This * is synchronous as the caller must hold the pipe lock across the entire * operation. * * If successful, it returns the amount of data spliced, 0 if it hit the EOF or * a hole and a negative error code otherwise. */ ssize_t vfs_splice_read(struct file *in, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags) { ssize_t ret; ret = rw_verify_area(READ, in, ppos, len); if (unlikely(ret < 0)) return ret; return do_splice_read(in, ppos, pipe, len, flags); } EXPORT_SYMBOL_GPL(vfs_splice_read); /** * splice_direct_to_actor - splices data directly between two non-pipes * @in: file to splice from * @sd: actor information on where to splice to * @actor: handles the data splicing * * Description: * This is a special case helper to splice directly between two * points, without requiring an explicit pipe. Internally an allocated * pipe is cached in the process, and reused during the lifetime of * that process. * */ ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd, splice_direct_actor *actor) { struct pipe_inode_info *pipe; ssize_t ret, bytes; size_t len; int i, flags, more; /* * We require the input to be seekable, as we don't want to randomly * drop data for eg socket -> socket splicing. Use the piped splicing * for that! */ if (unlikely(!(in->f_mode & FMODE_LSEEK))) return -EINVAL; /* * neither in nor out is a pipe, setup an internal pipe attached to * 'out' and transfer the wanted data from 'in' to 'out' through that */ pipe = current->splice_pipe; if (unlikely(!pipe)) { pipe = alloc_pipe_info(); if (!pipe) return -ENOMEM; /* * We don't have an immediate reader, but we'll read the stuff * out of the pipe right after the splice_to_pipe(). So set * PIPE_READERS appropriately. */ pipe->readers = 1; current->splice_pipe = pipe; } /* * Do the splice. */ bytes = 0; len = sd->total_len; /* Don't block on output, we have to drain the direct pipe. */ flags = sd->flags; sd->flags &= ~SPLICE_F_NONBLOCK; /* * We signal MORE until we've read sufficient data to fulfill the * request and we keep signalling it if the caller set it. */ more = sd->flags & SPLICE_F_MORE; sd->flags |= SPLICE_F_MORE; WARN_ON_ONCE(!pipe_is_empty(pipe)); while (len) { size_t read_len; loff_t pos = sd->pos, prev_pos = pos; ret = do_splice_read(in, &pos, pipe, len, flags); if (unlikely(ret <= 0)) goto read_failure; read_len = ret; sd->total_len = read_len; /* * If we now have sufficient data to fulfill the request then * we clear SPLICE_F_MORE if it was not set initially. */ if (read_len >= len && !more) sd->flags &= ~SPLICE_F_MORE; /* * NOTE: nonblocking mode only applies to the input. We * must not do the output in nonblocking mode as then we * could get stuck data in the internal pipe: */ ret = actor(pipe, sd); if (unlikely(ret <= 0)) { sd->pos = prev_pos; goto out_release; } bytes += ret; len -= ret; sd->pos = pos; if (ret < read_len) { sd->pos = prev_pos + ret; goto out_release; } } done: pipe->tail = pipe->head = 0; file_accessed(in); return bytes; read_failure: /* * If the user did *not* set SPLICE_F_MORE *and* we didn't hit that * "use all of len" case that cleared SPLICE_F_MORE, *and* we did a * "->splice_in()" that returned EOF (ie zero) *and* we have sent at * least 1 byte *then* we will also do the ->splice_eof() call. */ if (ret == 0 && !more && len > 0 && bytes) do_splice_eof(sd); out_release: /* * If we did an incomplete transfer we must release * the pipe buffers in question: */ for (i = 0; i < pipe->ring_size; i++) { struct pipe_buffer *buf = &pipe->bufs[i]; if (buf->ops) pipe_buf_release(pipe, buf); } if (!bytes) bytes = ret; goto done; } EXPORT_SYMBOL(splice_direct_to_actor); static int direct_splice_actor(struct pipe_inode_info *pipe, struct splice_desc *sd) { struct file *file = sd->u.file; long ret; file_start_write(file); ret = do_splice_from(pipe, file, sd->opos, sd->total_len, sd->flags); file_end_write(file); return ret; } static int splice_file_range_actor(struct pipe_inode_info *pipe, struct splice_desc *sd) { struct file *file = sd->u.file; return do_splice_from(pipe, file, sd->opos, sd->total_len, sd->flags); } static void direct_file_splice_eof(struct splice_desc *sd) { struct file *file = sd->u.file; if (file->f_op->splice_eof) file->f_op->splice_eof(file); } static ssize_t do_splice_direct_actor(struct file *in, loff_t *ppos, struct file *out, loff_t *opos, size_t len, unsigned int flags, splice_direct_actor *actor) { struct splice_desc sd = { .len = len, .total_len = len, .flags = flags, .pos = *ppos, .u.file = out, .splice_eof = direct_file_splice_eof, .opos = opos, }; ssize_t ret; if (unlikely(!(out->f_mode & FMODE_WRITE))) return -EBADF; if (unlikely(out->f_flags & O_APPEND)) return -EINVAL; ret = splice_direct_to_actor(in, &sd, actor); if (ret > 0) *ppos = sd.pos; return ret; } /** * do_splice_direct - splices data directly between two files * @in: file to splice from * @ppos: input file offset * @out: file to splice to * @opos: output file offset * @len: number of bytes to splice * @flags: splice modifier flags * * Description: * For use by do_sendfile(). splice can easily emulate sendfile, but * doing it in the application would incur an extra system call * (splice in + splice out, as compared to just sendfile()). So this helper * can splice directly through a process-private pipe. * * Callers already called rw_verify_area() on the entire range. */ ssize_t do_splice_direct(struct file *in, loff_t *ppos, struct file *out, loff_t *opos, size_t len, unsigned int flags) { return do_splice_direct_actor(in, ppos, out, opos, len, flags, direct_splice_actor); } EXPORT_SYMBOL(do_splice_direct); /** * splice_file_range - splices data between two files for copy_file_range() * @in: file to splice from * @ppos: input file offset * @out: file to splice to * @opos: output file offset * @len: number of bytes to splice * * Description: * For use by ->copy_file_range() methods. * Like do_splice_direct(), but vfs_copy_file_range() already holds * start_file_write() on @out file. * * Callers already called rw_verify_area() on the entire range. */ ssize_t splice_file_range(struct file *in, loff_t *ppos, struct file *out, loff_t *opos, size_t len) { lockdep_assert(file_write_started(out)); return do_splice_direct_actor(in, ppos, out, opos, min_t(size_t, len, MAX_RW_COUNT), 0, splice_file_range_actor); } EXPORT_SYMBOL(splice_file_range); static int wait_for_space(struct pipe_inode_info *pipe, unsigned flags) { for (;;) { if (unlikely(!pipe->readers)) { send_sig(SIGPIPE, current, 0); return -EPIPE; } if (!pipe_is_full(pipe)) return 0; if (flags & SPLICE_F_NONBLOCK) return -EAGAIN; if (signal_pending(current)) return -ERESTARTSYS; pipe_wait_writable(pipe); } } static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe, struct pipe_inode_info *opipe, size_t len, unsigned int flags); ssize_t splice_file_to_pipe(struct file *in, struct pipe_inode_info *opipe, loff_t *offset, size_t len, unsigned int flags) { ssize_t ret; pipe_lock(opipe); ret = wait_for_space(opipe, flags); if (!ret) ret = do_splice_read(in, offset, opipe, len, flags); pipe_unlock(opipe); if (ret > 0) wakeup_pipe_readers(opipe); return ret; } /* * Determine where to splice to/from. */ ssize_t do_splice(struct file *in, loff_t *off_in, struct file *out, loff_t *off_out, size_t len, unsigned int flags) { struct pipe_inode_info *ipipe; struct pipe_inode_info *opipe; loff_t offset; ssize_t ret; if (unlikely(!(in->f_mode & FMODE_READ) || !(out->f_mode & FMODE_WRITE))) return -EBADF; ipipe = get_pipe_info(in, true); opipe = get_pipe_info(out, true); if (ipipe && opipe) { if (off_in || off_out) return -ESPIPE; /* Splicing to self would be fun, but... */ if (ipipe == opipe) return -EINVAL; if ((in->f_flags | out->f_flags) & O_NONBLOCK) flags |= SPLICE_F_NONBLOCK; ret = splice_pipe_to_pipe(ipipe, opipe, len, flags); } else if (ipipe) { if (off_in) return -ESPIPE; if (off_out) { if (!(out->f_mode & FMODE_PWRITE)) return -EINVAL; offset = *off_out; } else { offset = out->f_pos; } if (unlikely(out->f_flags & O_APPEND)) return -EINVAL; ret = rw_verify_area(WRITE, out, &offset, len); if (unlikely(ret < 0)) return ret; if (in->f_flags & O_NONBLOCK) flags |= SPLICE_F_NONBLOCK; file_start_write(out); ret = do_splice_from(ipipe, out, &offset, len, flags); file_end_write(out); if (!off_out) out->f_pos = offset; else *off_out = offset; } else if (opipe) { if (off_out) return -ESPIPE; if (off_in) { if (!(in->f_mode & FMODE_PREAD)) return -EINVAL; offset = *off_in; } else { offset = in->f_pos; } ret = rw_verify_area(READ, in, &offset, len); if (unlikely(ret < 0)) return ret; if (out->f_flags & O_NONBLOCK) flags |= SPLICE_F_NONBLOCK; ret = splice_file_to_pipe(in, opipe, &offset, len, flags); if (!off_in) in->f_pos = offset; else *off_in = offset; } else { ret = -EINVAL; } if (ret > 0) { /* * Generate modify out before access in: * do_splice_from() may've already sent modify out, * and this ensures the events get merged. */ fsnotify_modify(out); fsnotify_access(in); } return ret; } static ssize_t __do_splice(struct file *in, loff_t __user *off_in, struct file *out, loff_t __user *off_out, size_t len, unsigned int flags) { struct pipe_inode_info *ipipe; struct pipe_inode_info *opipe; loff_t offset, *__off_in = NULL, *__off_out = NULL; ssize_t ret; ipipe = get_pipe_info(in, true); opipe = get_pipe_info(out, true); if (ipipe) { if (off_in) return -ESPIPE; pipe_clear_nowait(in); } if (opipe) { if (off_out) return -ESPIPE; pipe_clear_nowait(out); } if (off_out) { if (copy_from_user(&offset, off_out, sizeof(loff_t))) return -EFAULT; __off_out = &offset; } if (off_in) { if (copy_from_user(&offset, off_in, sizeof(loff_t))) return -EFAULT; __off_in = &offset; } ret = do_splice(in, __off_in, out, __off_out, len, flags); if (ret < 0) return ret; if (__off_out && copy_to_user(off_out, __off_out, sizeof(loff_t))) return -EFAULT; if (__off_in && copy_to_user(off_in, __off_in, sizeof(loff_t))) return -EFAULT; return ret; } static ssize_t iter_to_pipe(struct iov_iter *from, struct pipe_inode_info *pipe, unsigned int flags) { struct pipe_buffer buf = { .ops = &user_page_pipe_buf_ops, .flags = flags }; size_t total = 0; ssize_t ret = 0; while (iov_iter_count(from)) { struct page *pages[16]; ssize_t left; size_t start; int i, n; left = iov_iter_get_pages2(from, pages, ~0UL, 16, &start); if (left <= 0) { ret = left; break; } n = DIV_ROUND_UP(left + start, PAGE_SIZE); for (i = 0; i < n; i++) { int size = umin(left, PAGE_SIZE - start); buf.page = pages[i]; buf.offset = start; buf.len = size; ret = add_to_pipe(pipe, &buf); if (unlikely(ret < 0)) { iov_iter_revert(from, left); // this one got dropped by add_to_pipe() while (++i < n) put_page(pages[i]); goto out; } total += ret; left -= size; start = 0; } } out: return total ? total : ret; } static int pipe_to_user(struct pipe_inode_info *pipe, struct pipe_buffer *buf, struct splice_desc *sd) { int n = copy_page_to_iter(buf->page, buf->offset, sd->len, sd->u.data); return n == sd->len ? n : -EFAULT; } /* * For lack of a better implementation, implement vmsplice() to userspace * as a simple copy of the pipe's pages to the user iov. */ static ssize_t vmsplice_to_user(struct file *file, struct iov_iter *iter, unsigned int flags) { struct pipe_inode_info *pipe = get_pipe_info(file, true); struct splice_desc sd = { .total_len = iov_iter_count(iter), .flags = flags, .u.data = iter }; ssize_t ret = 0; if (!pipe) return -EBADF; pipe_clear_nowait(file); if (sd.total_len) { pipe_lock(pipe); ret = __splice_from_pipe(pipe, &sd, pipe_to_user); pipe_unlock(pipe); } if (ret > 0) fsnotify_access(file); return ret; } /* * vmsplice splices a user address range into a pipe. It can be thought of * as splice-from-memory, where the regular splice is splice-from-file (or * to file). In both cases the output is a pipe, naturally. */ static ssize_t vmsplice_to_pipe(struct file *file, struct iov_iter *iter, unsigned int flags) { struct pipe_inode_info *pipe; ssize_t ret = 0; unsigned buf_flag = 0; if (flags & SPLICE_F_GIFT) buf_flag = PIPE_BUF_FLAG_GIFT; pipe = get_pipe_info(file, true); if (!pipe) return -EBADF; pipe_clear_nowait(file); pipe_lock(pipe); ret = wait_for_space(pipe, flags); if (!ret) ret = iter_to_pipe(iter, pipe, buf_flag); pipe_unlock(pipe); if (ret > 0) { wakeup_pipe_readers(pipe); fsnotify_modify(file); } return ret; } /* * Note that vmsplice only really supports true splicing _from_ user memory * to a pipe, not the other way around. Splicing from user memory is a simple * operation that can be supported without any funky alignment restrictions * or nasty vm tricks. We simply map in the user memory and fill them into * a pipe. The reverse isn't quite as easy, though. There are two possible * solutions for that: * * - memcpy() the data internally, at which point we might as well just * do a regular read() on the buffer anyway. * - Lots of nasty vm tricks, that are neither fast nor flexible (it * has restriction limitations on both ends of the pipe). * * Currently we punt and implement it as a normal copy, see pipe_to_user(). * */ SYSCALL_DEFINE4(vmsplice, int, fd, const struct iovec __user *, uiov, unsigned long, nr_segs, unsigned int, flags) { struct iovec iovstack[UIO_FASTIOV]; struct iovec *iov = iovstack; struct iov_iter iter; ssize_t error; int type; if (unlikely(flags & ~SPLICE_F_ALL)) return -EINVAL; CLASS(fd, f)(fd); if (fd_empty(f)) return -EBADF; if (fd_file(f)->f_mode & FMODE_WRITE) type = ITER_SOURCE; else if (fd_file(f)->f_mode & FMODE_READ) type = ITER_DEST; else return -EBADF; error = import_iovec(type, uiov, nr_segs, ARRAY_SIZE(iovstack), &iov, &iter); if (error < 0) return error; if (!iov_iter_count(&iter)) error = 0; else if (type == ITER_SOURCE) error = vmsplice_to_pipe(fd_file(f), &iter, flags); else error = vmsplice_to_user(fd_file(f), &iter, flags); kfree(iov); return error; } SYSCALL_DEFINE6(splice, int, fd_in, loff_t __user *, off_in, int, fd_out, loff_t __user *, off_out, size_t, len, unsigned int, flags) { if (unlikely(!len)) return 0; if (unlikely(flags & ~SPLICE_F_ALL)) return -EINVAL; CLASS(fd, in)(fd_in); if (fd_empty(in)) return -EBADF; CLASS(fd, out)(fd_out); if (fd_empty(out)) return -EBADF; return __do_splice(fd_file(in), off_in, fd_file(out), off_out, len, flags); } /* * Make sure there's data to read. Wait for input if we can, otherwise * return an appropriate error. */ static int ipipe_prep(struct pipe_inode_info *pipe, unsigned int flags) { int ret; /* * Check the pipe occupancy without the inode lock first. This function * is speculative anyways, so missing one is ok. */ if (!pipe_is_empty(pipe)) return 0; ret = 0; pipe_lock(pipe); while (pipe_is_empty(pipe)) { if (signal_pending(current)) { ret = -ERESTARTSYS; break; } if (!pipe->writers) break; if (flags & SPLICE_F_NONBLOCK) { ret = -EAGAIN; break; } pipe_wait_readable(pipe); } pipe_unlock(pipe); return ret; } /* * Make sure there's writeable room. Wait for room if we can, otherwise * return an appropriate error. */ static int opipe_prep(struct pipe_inode_info *pipe, unsigned int flags) { int ret; /* * Check pipe occupancy without the inode lock first. This function * is speculative anyways, so missing one is ok. */ if (!pipe_is_full(pipe)) return 0; ret = 0; pipe_lock(pipe); while (pipe_is_full(pipe)) { if (!pipe->readers) { send_sig(SIGPIPE, current, 0); ret = -EPIPE; break; } if (flags & SPLICE_F_NONBLOCK) { ret = -EAGAIN; break; } if (signal_pending(current)) { ret = -ERESTARTSYS; break; } pipe_wait_writable(pipe); } pipe_unlock(pipe); return ret; } /* * Splice contents of ipipe to opipe. */ static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe, struct pipe_inode_info *opipe, size_t len, unsigned int flags) { struct pipe_buffer *ibuf, *obuf; unsigned int i_head, o_head; unsigned int i_tail, o_tail; int ret = 0; bool input_wakeup = false; retry: ret = ipipe_prep(ipipe, flags); if (ret) return ret; ret = opipe_prep(opipe, flags); if (ret) return ret; /* * Potential ABBA deadlock, work around it by ordering lock * grabbing by pipe info address. Otherwise two different processes * could deadlock (one doing tee from A -> B, the other from B -> A). */ pipe_double_lock(ipipe, opipe); i_tail = ipipe->tail; o_head = opipe->head; do { size_t o_len; if (!opipe->readers) { send_sig(SIGPIPE, current, 0); if (!ret) ret = -EPIPE; break; } i_head = ipipe->head; o_tail = opipe->tail; if (pipe_empty(i_head, i_tail) && !ipipe->writers) break; /* * Cannot make any progress, because either the input * pipe is empty or the output pipe is full. */ if (pipe_empty(i_head, i_tail) || pipe_full(o_head, o_tail, opipe->max_usage)) { /* Already processed some buffers, break */ if (ret) break; if (flags & SPLICE_F_NONBLOCK) { ret = -EAGAIN; break; } /* * We raced with another reader/writer and haven't * managed to process any buffers. A zero return * value means EOF, so retry instead. */ pipe_unlock(ipipe); pipe_unlock(opipe); goto retry; } ibuf = pipe_buf(ipipe, i_tail); obuf = pipe_buf(opipe, o_head); if (len >= ibuf->len) { /* * Simply move the whole buffer from ipipe to opipe */ *obuf = *ibuf; ibuf->ops = NULL; i_tail++; ipipe->tail = i_tail; input_wakeup = true; o_len = obuf->len; o_head++; opipe->head = o_head; } else { /* * Get a reference to this pipe buffer, * so we can copy the contents over. */ if (!pipe_buf_get(ipipe, ibuf)) { if (ret == 0) ret = -EFAULT; break; } *obuf = *ibuf; /* * Don't inherit the gift and merge flags, we need to * prevent multiple steals of this page. */ obuf->flags &= ~PIPE_BUF_FLAG_GIFT; obuf->flags &= ~PIPE_BUF_FLAG_CAN_MERGE; obuf->len = len; ibuf->offset += len; ibuf->len -= len; o_len = len; o_head++; opipe->head = o_head; } ret += o_len; len -= o_len; } while (len); pipe_unlock(ipipe); pipe_unlock(opipe); /* * If we put data in the output pipe, wakeup any potential readers. */ if (ret > 0) wakeup_pipe_readers(opipe); if (input_wakeup) wakeup_pipe_writers(ipipe); return ret; } /* * Link contents of ipipe to opipe. */ static ssize_t link_pipe(struct pipe_inode_info *ipipe, struct pipe_inode_info *opipe, size_t len, unsigned int flags) { struct pipe_buffer *ibuf, *obuf; unsigned int i_head, o_head; unsigned int i_tail, o_tail; ssize_t ret = 0; /* * Potential ABBA deadlock, work around it by ordering lock * grabbing by pipe info address. Otherwise two different processes * could deadlock (one doing tee from A -> B, the other from B -> A). */ pipe_double_lock(ipipe, opipe); i_tail = ipipe->tail; o_head = opipe->head; do { if (!opipe->readers) { send_sig(SIGPIPE, current, 0); if (!ret) ret = -EPIPE; break; } i_head = ipipe->head; o_tail = opipe->tail; /* * If we have iterated all input buffers or run out of * output room, break. */ if (pipe_empty(i_head, i_tail) || pipe_full(o_head, o_tail, opipe->max_usage)) break; ibuf = pipe_buf(ipipe, i_tail); obuf = pipe_buf(opipe, o_head); /* * Get a reference to this pipe buffer, * so we can copy the contents over. */ if (!pipe_buf_get(ipipe, ibuf)) { if (ret == 0) ret = -EFAULT; break; } *obuf = *ibuf; /* * Don't inherit the gift and merge flag, we need to prevent * multiple steals of this page. */ obuf->flags &= ~PIPE_BUF_FLAG_GIFT; obuf->flags &= ~PIPE_BUF_FLAG_CAN_MERGE; if (obuf->len > len) obuf->len = len; ret += obuf->len; len -= obuf->len; o_head++; opipe->head = o_head; i_tail++; } while (len); pipe_unlock(ipipe); pipe_unlock(opipe); /* * If we put data in the output pipe, wakeup any potential readers. */ if (ret > 0) wakeup_pipe_readers(opipe); return ret; } /* * This is a tee(1) implementation that works on pipes. It doesn't copy * any data, it simply references the 'in' pages on the 'out' pipe. * The 'flags' used are the SPLICE_F_* variants, currently the only * applicable one is SPLICE_F_NONBLOCK. */ ssize_t do_tee(struct file *in, struct file *out, size_t len, unsigned int flags) { struct pipe_inode_info *ipipe = get_pipe_info(in, true); struct pipe_inode_info *opipe = get_pipe_info(out, true); ssize_t ret = -EINVAL; if (unlikely(!(in->f_mode & FMODE_READ) || !(out->f_mode & FMODE_WRITE))) return -EBADF; /* * Duplicate the contents of ipipe to opipe without actually * copying the data. */ if (ipipe && opipe && ipipe != opipe) { if ((in->f_flags | out->f_flags) & O_NONBLOCK) flags |= SPLICE_F_NONBLOCK; /* * Keep going, unless we encounter an error. The ipipe/opipe * ordering doesn't really matter. */ ret = ipipe_prep(ipipe, flags); if (!ret) { ret = opipe_prep(opipe, flags); if (!ret) ret = link_pipe(ipipe, opipe, len, flags); } } if (ret > 0) { fsnotify_access(in); fsnotify_modify(out); } return ret; } SYSCALL_DEFINE4(tee, int, fdin, int, fdout, size_t, len, unsigned int, flags) { if (unlikely(flags & ~SPLICE_F_ALL)) return -EINVAL; if (unlikely(!len)) return 0; CLASS(fd, in)(fdin); if (fd_empty(in)) return -EBADF; CLASS(fd, out)(fdout); if (fd_empty(out)) return -EBADF; return do_tee(fd_file(in), fd_file(out), len, flags); }
1 1 1 1 2 1 1 1 1 1 2 1 2 1 1 1 1 2 1 1 32 1 32 4 7 18 4 16 2 14 2 11 3 8 3 6 2 5 1 3 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 // SPDX-License-Identifier: GPL-2.0-or-later #include <net/genetlink.h> #include "br_private.h" #include "br_private_cfm.h" static const struct nla_policy br_cfm_mep_create_policy[IFLA_BRIDGE_CFM_MEP_CREATE_MAX + 1] = { [IFLA_BRIDGE_CFM_MEP_CREATE_UNSPEC] = { .type = NLA_REJECT }, [IFLA_BRIDGE_CFM_MEP_CREATE_INSTANCE] = { .type = NLA_U32 }, [IFLA_BRIDGE_CFM_MEP_CREATE_DOMAIN] = { .type = NLA_U32 }, [IFLA_BRIDGE_CFM_MEP_CREATE_DIRECTION] = { .type = NLA_U32 }, [IFLA_BRIDGE_CFM_MEP_CREATE_IFINDEX] = { .type = NLA_U32 }, }; static const struct nla_policy br_cfm_mep_delete_policy[IFLA_BRIDGE_CFM_MEP_DELETE_MAX + 1] = { [IFLA_BRIDGE_CFM_MEP_DELETE_UNSPEC] = { .type = NLA_REJECT }, [IFLA_BRIDGE_CFM_MEP_DELETE_INSTANCE] = { .type = NLA_U32 }, }; static const struct nla_policy br_cfm_mep_config_policy[IFLA_BRIDGE_CFM_MEP_CONFIG_MAX + 1] = { [IFLA_BRIDGE_CFM_MEP_CONFIG_UNSPEC] = { .type = NLA_REJECT }, [IFLA_BRIDGE_CFM_MEP_CONFIG_INSTANCE] = { .type = NLA_U32 }, [IFLA_BRIDGE_CFM_MEP_CONFIG_UNICAST_MAC] = NLA_POLICY_ETH_ADDR, [IFLA_BRIDGE_CFM_MEP_CONFIG_MDLEVEL] = NLA_POLICY_MAX(NLA_U32, 7), [IFLA_BRIDGE_CFM_MEP_CONFIG_MEPID] = NLA_POLICY_MAX(NLA_U32, 0x1FFF), }; static const struct nla_policy br_cfm_cc_config_policy[IFLA_BRIDGE_CFM_CC_CONFIG_MAX + 1] = { [IFLA_BRIDGE_CFM_CC_CONFIG_UNSPEC] = { .type = NLA_REJECT }, [IFLA_BRIDGE_CFM_CC_CONFIG_INSTANCE] = { .type = NLA_U32 }, [IFLA_BRIDGE_CFM_CC_CONFIG_ENABLE] = { .type = NLA_U32 }, [IFLA_BRIDGE_CFM_CC_CONFIG_EXP_INTERVAL] = { .type = NLA_U32 }, [IFLA_BRIDGE_CFM_CC_CONFIG_EXP_MAID] = { .type = NLA_BINARY, .len = CFM_MAID_LENGTH }, }; static const struct nla_policy br_cfm_cc_peer_mep_policy[IFLA_BRIDGE_CFM_CC_PEER_MEP_MAX + 1] = { [IFLA_BRIDGE_CFM_CC_PEER_MEP_UNSPEC] = { .type = NLA_REJECT }, [IFLA_BRIDGE_CFM_CC_PEER_MEP_INSTANCE] = { .type = NLA_U32 }, [IFLA_BRIDGE_CFM_CC_PEER_MEPID] = NLA_POLICY_MAX(NLA_U32, 0x1FFF), }; static const struct nla_policy br_cfm_cc_rdi_policy[IFLA_BRIDGE_CFM_CC_RDI_MAX + 1] = { [IFLA_BRIDGE_CFM_CC_RDI_UNSPEC] = { .type = NLA_REJECT }, [IFLA_BRIDGE_CFM_CC_RDI_INSTANCE] = { .type = NLA_U32 }, [IFLA_BRIDGE_CFM_CC_RDI_RDI] = { .type = NLA_U32 }, }; static const struct nla_policy br_cfm_cc_ccm_tx_policy[IFLA_BRIDGE_CFM_CC_CCM_TX_MAX + 1] = { [IFLA_BRIDGE_CFM_CC_CCM_TX_UNSPEC] = { .type = NLA_REJECT }, [IFLA_BRIDGE_CFM_CC_CCM_TX_INSTANCE] = { .type = NLA_U32 }, [IFLA_BRIDGE_CFM_CC_CCM_TX_DMAC] = NLA_POLICY_ETH_ADDR, [IFLA_BRIDGE_CFM_CC_CCM_TX_SEQ_NO_UPDATE] = { .type = NLA_U32 }, [IFLA_BRIDGE_CFM_CC_CCM_TX_PERIOD] = { .type = NLA_U32 }, [IFLA_BRIDGE_CFM_CC_CCM_TX_IF_TLV] = { .type = NLA_U32 }, [IFLA_BRIDGE_CFM_CC_CCM_TX_IF_TLV_VALUE] = { .type = NLA_U8 }, [IFLA_BRIDGE_CFM_CC_CCM_TX_PORT_TLV] = { .type = NLA_U32 }, [IFLA_BRIDGE_CFM_CC_CCM_TX_PORT_TLV_VALUE] = { .type = NLA_U8 }, }; static const struct nla_policy br_cfm_policy[IFLA_BRIDGE_CFM_MAX + 1] = { [IFLA_BRIDGE_CFM_UNSPEC] = { .type = NLA_REJECT }, [IFLA_BRIDGE_CFM_MEP_CREATE] = NLA_POLICY_NESTED(br_cfm_mep_create_policy), [IFLA_BRIDGE_CFM_MEP_DELETE] = NLA_POLICY_NESTED(br_cfm_mep_delete_policy), [IFLA_BRIDGE_CFM_MEP_CONFIG] = NLA_POLICY_NESTED(br_cfm_mep_config_policy), [IFLA_BRIDGE_CFM_CC_CONFIG] = NLA_POLICY_NESTED(br_cfm_cc_config_policy), [IFLA_BRIDGE_CFM_CC_PEER_MEP_ADD] = NLA_POLICY_NESTED(br_cfm_cc_peer_mep_policy), [IFLA_BRIDGE_CFM_CC_PEER_MEP_REMOVE] = NLA_POLICY_NESTED(br_cfm_cc_peer_mep_policy), [IFLA_BRIDGE_CFM_CC_RDI] = NLA_POLICY_NESTED(br_cfm_cc_rdi_policy), [IFLA_BRIDGE_CFM_CC_CCM_TX] = NLA_POLICY_NESTED(br_cfm_cc_ccm_tx_policy), }; static int br_mep_create_parse(struct net_bridge *br, struct nlattr *attr, struct netlink_ext_ack *extack) { struct nlattr *tb[IFLA_BRIDGE_CFM_MEP_CREATE_MAX + 1]; struct br_cfm_mep_create create; u32 instance; int err; err = nla_parse_nested(tb, IFLA_BRIDGE_CFM_MEP_CREATE_MAX, attr, br_cfm_mep_create_policy, extack); if (err) return err; if (!tb[IFLA_BRIDGE_CFM_MEP_CREATE_INSTANCE]) { NL_SET_ERR_MSG_MOD(extack, "Missing INSTANCE attribute"); return -EINVAL; } if (!tb[IFLA_BRIDGE_CFM_MEP_CREATE_DOMAIN]) { NL_SET_ERR_MSG_MOD(extack, "Missing DOMAIN attribute"); return -EINVAL; } if (!tb[IFLA_BRIDGE_CFM_MEP_CREATE_DIRECTION]) { NL_SET_ERR_MSG_MOD(extack, "Missing DIRECTION attribute"); return -EINVAL; } if (!tb[IFLA_BRIDGE_CFM_MEP_CREATE_IFINDEX]) { NL_SET_ERR_MSG_MOD(extack, "Missing IFINDEX attribute"); return -EINVAL; } memset(&create, 0, sizeof(create)); instance = nla_get_u32(tb[IFLA_BRIDGE_CFM_MEP_CREATE_INSTANCE]); create.domain = nla_get_u32(tb[IFLA_BRIDGE_CFM_MEP_CREATE_DOMAIN]); create.direction = nla_get_u32(tb[IFLA_BRIDGE_CFM_MEP_CREATE_DIRECTION]); create.ifindex = nla_get_u32(tb[IFLA_BRIDGE_CFM_MEP_CREATE_IFINDEX]); return br_cfm_mep_create(br, instance, &create, extack); } static int br_mep_delete_parse(struct net_bridge *br, struct nlattr *attr, struct netlink_ext_ack *extack) { struct nlattr *tb[IFLA_BRIDGE_CFM_MEP_DELETE_MAX + 1]; u32 instance; int err; err = nla_parse_nested(tb, IFLA_BRIDGE_CFM_MEP_DELETE_MAX, attr, br_cfm_mep_delete_policy, extack); if (err) return err; if (!tb[IFLA_BRIDGE_CFM_MEP_DELETE_INSTANCE]) { NL_SET_ERR_MSG_MOD(extack, "Missing INSTANCE attribute"); return -EINVAL; } instance = nla_get_u32(tb[IFLA_BRIDGE_CFM_MEP_DELETE_INSTANCE]); return br_cfm_mep_delete(br, instance, extack); } static int br_mep_config_parse(struct net_bridge *br, struct nlattr *attr, struct netlink_ext_ack *extack) { struct nlattr *tb[IFLA_BRIDGE_CFM_MEP_CONFIG_MAX + 1]; struct br_cfm_mep_config config; u32 instance; int err; err = nla_parse_nested(tb, IFLA_BRIDGE_CFM_MEP_CONFIG_MAX, attr, br_cfm_mep_config_policy, extack); if (err) return err; if (!tb[IFLA_BRIDGE_CFM_MEP_CONFIG_INSTANCE]) { NL_SET_ERR_MSG_MOD(extack, "Missing INSTANCE attribute"); return -EINVAL; } if (!tb[IFLA_BRIDGE_CFM_MEP_CONFIG_UNICAST_MAC]) { NL_SET_ERR_MSG_MOD(extack, "Missing UNICAST_MAC attribute"); return -EINVAL; } if (!tb[IFLA_BRIDGE_CFM_MEP_CONFIG_MDLEVEL]) { NL_SET_ERR_MSG_MOD(extack, "Missing MDLEVEL attribute"); return -EINVAL; } if (!tb[IFLA_BRIDGE_CFM_MEP_CONFIG_MEPID]) { NL_SET_ERR_MSG_MOD(extack, "Missing MEPID attribute"); return -EINVAL; } memset(&config, 0, sizeof(config)); instance = nla_get_u32(tb[IFLA_BRIDGE_CFM_MEP_CONFIG_INSTANCE]); nla_memcpy(&config.unicast_mac.addr, tb[IFLA_BRIDGE_CFM_MEP_CONFIG_UNICAST_MAC], sizeof(config.unicast_mac.addr)); config.mdlevel = nla_get_u32(tb[IFLA_BRIDGE_CFM_MEP_CONFIG_MDLEVEL]); config.mepid = nla_get_u32(tb[IFLA_BRIDGE_CFM_MEP_CONFIG_MEPID]); return br_cfm_mep_config_set(br, instance, &config, extack); } static int br_cc_config_parse(struct net_bridge *br, struct nlattr *attr, struct netlink_ext_ack *extack) { struct nlattr *tb[IFLA_BRIDGE_CFM_CC_CONFIG_MAX + 1]; struct br_cfm_cc_config config; u32 instance; int err; err = nla_parse_nested(tb, IFLA_BRIDGE_CFM_CC_CONFIG_MAX, attr, br_cfm_cc_config_policy, extack); if (err) return err; if (!tb[IFLA_BRIDGE_CFM_CC_CONFIG_INSTANCE]) { NL_SET_ERR_MSG_MOD(extack, "Missing INSTANCE attribute"); return -EINVAL; } if (!tb[IFLA_BRIDGE_CFM_CC_CONFIG_ENABLE]) { NL_SET_ERR_MSG_MOD(extack, "Missing ENABLE attribute"); return -EINVAL; } if (!tb[IFLA_BRIDGE_CFM_CC_CONFIG_EXP_INTERVAL]) { NL_SET_ERR_MSG_MOD(extack, "Missing INTERVAL attribute"); return -EINVAL; } if (!tb[IFLA_BRIDGE_CFM_CC_CONFIG_EXP_MAID]) { NL_SET_ERR_MSG_MOD(extack, "Missing MAID attribute"); return -EINVAL; } memset(&config, 0, sizeof(config)); instance = nla_get_u32(tb[IFLA_BRIDGE_CFM_CC_CONFIG_INSTANCE]); config.enable = nla_get_u32(tb[IFLA_BRIDGE_CFM_CC_CONFIG_ENABLE]); config.exp_interval = nla_get_u32(tb[IFLA_BRIDGE_CFM_CC_CONFIG_EXP_INTERVAL]); nla_memcpy(&config.exp_maid.data, tb[IFLA_BRIDGE_CFM_CC_CONFIG_EXP_MAID], sizeof(config.exp_maid.data)); return br_cfm_cc_config_set(br, instance, &config, extack); } static int br_cc_peer_mep_add_parse(struct net_bridge *br, struct nlattr *attr, struct netlink_ext_ack *extack) { struct nlattr *tb[IFLA_BRIDGE_CFM_CC_PEER_MEP_MAX + 1]; u32 instance, peer_mep_id; int err; err = nla_parse_nested(tb, IFLA_BRIDGE_CFM_CC_PEER_MEP_MAX, attr, br_cfm_cc_peer_mep_policy, extack); if (err) return err; if (!tb[IFLA_BRIDGE_CFM_CC_PEER_MEP_INSTANCE]) { NL_SET_ERR_MSG_MOD(extack, "Missing INSTANCE attribute"); return -EINVAL; } if (!tb[IFLA_BRIDGE_CFM_CC_PEER_MEPID]) { NL_SET_ERR_MSG_MOD(extack, "Missing PEER_MEP_ID attribute"); return -EINVAL; } instance = nla_get_u32(tb[IFLA_BRIDGE_CFM_CC_PEER_MEP_INSTANCE]); peer_mep_id = nla_get_u32(tb[IFLA_BRIDGE_CFM_CC_PEER_MEPID]); return br_cfm_cc_peer_mep_add(br, instance, peer_mep_id, extack); } static int br_cc_peer_mep_remove_parse(struct net_bridge *br, struct nlattr *attr, struct netlink_ext_ack *extack) { struct nlattr *tb[IFLA_BRIDGE_CFM_CC_PEER_MEP_MAX + 1]; u32 instance, peer_mep_id; int err; err = nla_parse_nested(tb, IFLA_BRIDGE_CFM_CC_PEER_MEP_MAX, attr, br_cfm_cc_peer_mep_policy, extack); if (err) return err; if (!tb[IFLA_BRIDGE_CFM_CC_PEER_MEP_INSTANCE]) { NL_SET_ERR_MSG_MOD(extack, "Missing INSTANCE attribute"); return -EINVAL; } if (!tb[IFLA_BRIDGE_CFM_CC_PEER_MEPID]) { NL_SET_ERR_MSG_MOD(extack, "Missing PEER_MEP_ID attribute"); return -EINVAL; } instance = nla_get_u32(tb[IFLA_BRIDGE_CFM_CC_PEER_MEP_INSTANCE]); peer_mep_id = nla_get_u32(tb[IFLA_BRIDGE_CFM_CC_PEER_MEPID]); return br_cfm_cc_peer_mep_remove(br, instance, peer_mep_id, extack); } static int br_cc_rdi_parse(struct net_bridge *br, struct nlattr *attr, struct netlink_ext_ack *extack) { struct nlattr *tb[IFLA_BRIDGE_CFM_CC_RDI_MAX + 1]; u32 instance, rdi; int err; err = nla_parse_nested(tb, IFLA_BRIDGE_CFM_CC_RDI_MAX, attr, br_cfm_cc_rdi_policy, extack); if (err) return err; if (!tb[IFLA_BRIDGE_CFM_CC_RDI_INSTANCE]) { NL_SET_ERR_MSG_MOD(extack, "Missing INSTANCE attribute"); return -EINVAL; } if (!tb[IFLA_BRIDGE_CFM_CC_RDI_RDI]) { NL_SET_ERR_MSG_MOD(extack, "Missing RDI attribute"); return -EINVAL; } instance = nla_get_u32(tb[IFLA_BRIDGE_CFM_CC_RDI_INSTANCE]); rdi = nla_get_u32(tb[IFLA_BRIDGE_CFM_CC_RDI_RDI]); return br_cfm_cc_rdi_set(br, instance, rdi, extack); } static int br_cc_ccm_tx_parse(struct net_bridge *br, struct nlattr *attr, struct netlink_ext_ack *extack) { struct nlattr *tb[IFLA_BRIDGE_CFM_CC_CCM_TX_MAX + 1]; struct br_cfm_cc_ccm_tx_info tx_info; u32 instance; int err; err = nla_parse_nested(tb, IFLA_BRIDGE_CFM_CC_CCM_TX_MAX, attr, br_cfm_cc_ccm_tx_policy, extack); if (err) return err; if (!tb[IFLA_BRIDGE_CFM_CC_CCM_TX_INSTANCE]) { NL_SET_ERR_MSG_MOD(extack, "Missing INSTANCE attribute"); return -EINVAL; } if (!tb[IFLA_BRIDGE_CFM_CC_CCM_TX_DMAC]) { NL_SET_ERR_MSG_MOD(extack, "Missing DMAC attribute"); return -EINVAL; } if (!tb[IFLA_BRIDGE_CFM_CC_CCM_TX_SEQ_NO_UPDATE]) { NL_SET_ERR_MSG_MOD(extack, "Missing SEQ_NO_UPDATE attribute"); return -EINVAL; } if (!tb[IFLA_BRIDGE_CFM_CC_CCM_TX_PERIOD]) { NL_SET_ERR_MSG_MOD(extack, "Missing PERIOD attribute"); return -EINVAL; } if (!tb[IFLA_BRIDGE_CFM_CC_CCM_TX_IF_TLV]) { NL_SET_ERR_MSG_MOD(extack, "Missing IF_TLV attribute"); return -EINVAL; } if (!tb[IFLA_BRIDGE_CFM_CC_CCM_TX_IF_TLV_VALUE]) { NL_SET_ERR_MSG_MOD(extack, "Missing IF_TLV_VALUE attribute"); return -EINVAL; } if (!tb[IFLA_BRIDGE_CFM_CC_CCM_TX_PORT_TLV]) { NL_SET_ERR_MSG_MOD(extack, "Missing PORT_TLV attribute"); return -EINVAL; } if (!tb[IFLA_BRIDGE_CFM_CC_CCM_TX_PORT_TLV_VALUE]) { NL_SET_ERR_MSG_MOD(extack, "Missing PORT_TLV_VALUE attribute"); return -EINVAL; } memset(&tx_info, 0, sizeof(tx_info)); instance = nla_get_u32(tb[IFLA_BRIDGE_CFM_CC_CCM_TX_INSTANCE]); nla_memcpy(&tx_info.dmac.addr, tb[IFLA_BRIDGE_CFM_CC_CCM_TX_DMAC], sizeof(tx_info.dmac.addr)); tx_info.seq_no_update = nla_get_u32(tb[IFLA_BRIDGE_CFM_CC_CCM_TX_SEQ_NO_UPDATE]); tx_info.period = nla_get_u32(tb[IFLA_BRIDGE_CFM_CC_CCM_TX_PERIOD]); tx_info.if_tlv = nla_get_u32(tb[IFLA_BRIDGE_CFM_CC_CCM_TX_IF_TLV]); tx_info.if_tlv_value = nla_get_u8(tb[IFLA_BRIDGE_CFM_CC_CCM_TX_IF_TLV_VALUE]); tx_info.port_tlv = nla_get_u32(tb[IFLA_BRIDGE_CFM_CC_CCM_TX_PORT_TLV]); tx_info.port_tlv_value = nla_get_u8(tb[IFLA_BRIDGE_CFM_CC_CCM_TX_PORT_TLV_VALUE]); return br_cfm_cc_ccm_tx(br, instance, &tx_info, extack); } int br_cfm_parse(struct net_bridge *br, struct net_bridge_port *p, struct nlattr *attr, int cmd, struct netlink_ext_ack *extack) { struct nlattr *tb[IFLA_BRIDGE_CFM_MAX + 1]; int err; /* When this function is called for a port then the br pointer is * invalid, therefor set the br to point correctly */ if (p) br = p->br; err = nla_parse_nested(tb, IFLA_BRIDGE_CFM_MAX, attr, br_cfm_policy, extack); if (err) return err; if (tb[IFLA_BRIDGE_CFM_MEP_CREATE]) { err = br_mep_create_parse(br, tb[IFLA_BRIDGE_CFM_MEP_CREATE], extack); if (err) return err; } if (tb[IFLA_BRIDGE_CFM_MEP_DELETE]) { err = br_mep_delete_parse(br, tb[IFLA_BRIDGE_CFM_MEP_DELETE], extack); if (err) return err; } if (tb[IFLA_BRIDGE_CFM_MEP_CONFIG]) { err = br_mep_config_parse(br, tb[IFLA_BRIDGE_CFM_MEP_CONFIG], extack); if (err) return err; } if (tb[IFLA_BRIDGE_CFM_CC_CONFIG]) { err = br_cc_config_parse(br, tb[IFLA_BRIDGE_CFM_CC_CONFIG], extack); if (err) return err; } if (tb[IFLA_BRIDGE_CFM_CC_PEER_MEP_ADD]) { err = br_cc_peer_mep_add_parse(br, tb[IFLA_BRIDGE_CFM_CC_PEER_MEP_ADD], extack); if (err) return err; } if (tb[IFLA_BRIDGE_CFM_CC_PEER_MEP_REMOVE]) { err = br_cc_peer_mep_remove_parse(br, tb[IFLA_BRIDGE_CFM_CC_PEER_MEP_REMOVE], extack); if (err) return err; } if (tb[IFLA_BRIDGE_CFM_CC_RDI]) { err = br_cc_rdi_parse(br, tb[IFLA_BRIDGE_CFM_CC_RDI], extack); if (err) return err; } if (tb[IFLA_BRIDGE_CFM_CC_CCM_TX]) { err = br_cc_ccm_tx_parse(br, tb[IFLA_BRIDGE_CFM_CC_CCM_TX], extack); if (err) return err; } return 0; } int br_cfm_config_fill_info(struct sk_buff *skb, struct net_bridge *br) { struct br_cfm_peer_mep *peer_mep; struct br_cfm_mep *mep; struct nlattr *tb; hlist_for_each_entry_rcu(mep, &br->mep_list, head) { tb = nla_nest_start(skb, IFLA_BRIDGE_CFM_MEP_CREATE_INFO); if (!tb) goto nla_info_failure; if (nla_put_u32(skb, IFLA_BRIDGE_CFM_MEP_CREATE_INSTANCE, mep->instance)) goto nla_put_failure; if (nla_put_u32(skb, IFLA_BRIDGE_CFM_MEP_CREATE_DOMAIN, mep->create.domain)) goto nla_put_failure; if (nla_put_u32(skb, IFLA_BRIDGE_CFM_MEP_CREATE_DIRECTION, mep->create.direction)) goto nla_put_failure; if (nla_put_u32(skb, IFLA_BRIDGE_CFM_MEP_CREATE_IFINDEX, mep->create.ifindex)) goto nla_put_failure; nla_nest_end(skb, tb); tb = nla_nest_start(skb, IFLA_BRIDGE_CFM_MEP_CONFIG_INFO); if (!tb) goto nla_info_failure; if (nla_put_u32(skb, IFLA_BRIDGE_CFM_MEP_CONFIG_INSTANCE, mep->instance)) goto nla_put_failure; if (nla_put(skb, IFLA_BRIDGE_CFM_MEP_CONFIG_UNICAST_MAC, sizeof(mep->config.unicast_mac.addr), mep->config.unicast_mac.addr)) goto nla_put_failure; if (nla_put_u32(skb, IFLA_BRIDGE_CFM_MEP_CONFIG_MDLEVEL, mep->config.mdlevel)) goto nla_put_failure; if (nla_put_u32(skb, IFLA_BRIDGE_CFM_MEP_CONFIG_MEPID, mep->config.mepid)) goto nla_put_failure; nla_nest_end(skb, tb); tb = nla_nest_start(skb, IFLA_BRIDGE_CFM_CC_CONFIG_INFO); if (!tb) goto nla_info_failure; if (nla_put_u32(skb, IFLA_BRIDGE_CFM_CC_CONFIG_INSTANCE, mep->instance)) goto nla_put_failure; if (nla_put_u32(skb, IFLA_BRIDGE_CFM_CC_CONFIG_ENABLE, mep->cc_config.enable)) goto nla_put_failure; if (nla_put_u32(skb, IFLA_BRIDGE_CFM_CC_CONFIG_EXP_INTERVAL, mep->cc_config.exp_interval)) goto nla_put_failure; if (nla_put(skb, IFLA_BRIDGE_CFM_CC_CONFIG_EXP_MAID, sizeof(mep->cc_config.exp_maid.data), mep->cc_config.exp_maid.data)) goto nla_put_failure; nla_nest_end(skb, tb); tb = nla_nest_start(skb, IFLA_BRIDGE_CFM_CC_RDI_INFO); if (!tb) goto nla_info_failure; if (nla_put_u32(skb, IFLA_BRIDGE_CFM_CC_RDI_INSTANCE, mep->instance)) goto nla_put_failure; if (nla_put_u32(skb, IFLA_BRIDGE_CFM_CC_RDI_RDI, mep->rdi)) goto nla_put_failure; nla_nest_end(skb, tb); tb = nla_nest_start(skb, IFLA_BRIDGE_CFM_CC_CCM_TX_INFO); if (!tb) goto nla_info_failure; if (nla_put_u32(skb, IFLA_BRIDGE_CFM_CC_CCM_TX_INSTANCE, mep->instance)) goto nla_put_failure; if (nla_put(skb, IFLA_BRIDGE_CFM_CC_CCM_TX_DMAC, sizeof(mep->cc_ccm_tx_info.dmac), mep->cc_ccm_tx_info.dmac.addr)) goto nla_put_failure; if (nla_put_u32(skb, IFLA_BRIDGE_CFM_CC_CCM_TX_SEQ_NO_UPDATE, mep->cc_ccm_tx_info.seq_no_update)) goto nla_put_failure; if (nla_put_u32(skb, IFLA_BRIDGE_CFM_CC_CCM_TX_PERIOD, mep->cc_ccm_tx_info.period)) goto nla_put_failure; if (nla_put_u32(skb, IFLA_BRIDGE_CFM_CC_CCM_TX_IF_TLV, mep->cc_ccm_tx_info.if_tlv)) goto nla_put_failure; if (nla_put_u8(skb, IFLA_BRIDGE_CFM_CC_CCM_TX_IF_TLV_VALUE, mep->cc_ccm_tx_info.if_tlv_value)) goto nla_put_failure; if (nla_put_u32(skb, IFLA_BRIDGE_CFM_CC_CCM_TX_PORT_TLV, mep->cc_ccm_tx_info.port_tlv)) goto nla_put_failure; if (nla_put_u8(skb, IFLA_BRIDGE_CFM_CC_CCM_TX_PORT_TLV_VALUE, mep->cc_ccm_tx_info.port_tlv_value)) goto nla_put_failure; nla_nest_end(skb, tb); hlist_for_each_entry_rcu(peer_mep, &mep->peer_mep_list, head) { tb = nla_nest_start(skb, IFLA_BRIDGE_CFM_CC_PEER_MEP_INFO); if (!tb) goto nla_info_failure; if (nla_put_u32(skb, IFLA_BRIDGE_CFM_CC_PEER_MEP_INSTANCE, mep->instance)) goto nla_put_failure; if (nla_put_u32(skb, IFLA_BRIDGE_CFM_CC_PEER_MEPID, peer_mep->mepid)) goto nla_put_failure; nla_nest_end(skb, tb); } } return 0; nla_put_failure: nla_nest_cancel(skb, tb); nla_info_failure: return -EMSGSIZE; } int br_cfm_status_fill_info(struct sk_buff *skb, struct net_bridge *br, bool getlink) { struct br_cfm_peer_mep *peer_mep; struct br_cfm_mep *mep; struct nlattr *tb; hlist_for_each_entry_rcu(mep, &br->mep_list, head) { tb = nla_nest_start(skb, IFLA_BRIDGE_CFM_MEP_STATUS_INFO); if (!tb) goto nla_info_failure; if (nla_put_u32(skb, IFLA_BRIDGE_CFM_MEP_STATUS_INSTANCE, mep->instance)) goto nla_put_failure; if (nla_put_u32(skb, IFLA_BRIDGE_CFM_MEP_STATUS_OPCODE_UNEXP_SEEN, mep->status.opcode_unexp_seen)) goto nla_put_failure; if (nla_put_u32(skb, IFLA_BRIDGE_CFM_MEP_STATUS_VERSION_UNEXP_SEEN, mep->status.version_unexp_seen)) goto nla_put_failure; if (nla_put_u32(skb, IFLA_BRIDGE_CFM_MEP_STATUS_RX_LEVEL_LOW_SEEN, mep->status.rx_level_low_seen)) goto nla_put_failure; /* Only clear if this is a GETLINK */ if (getlink) { /* Clear all 'seen' indications */ mep->status.opcode_unexp_seen = false; mep->status.version_unexp_seen = false; mep->status.rx_level_low_seen = false; } nla_nest_end(skb, tb); hlist_for_each_entry_rcu(peer_mep, &mep->peer_mep_list, head) { tb = nla_nest_start(skb, IFLA_BRIDGE_CFM_CC_PEER_STATUS_INFO); if (!tb) goto nla_info_failure; if (nla_put_u32(skb, IFLA_BRIDGE_CFM_CC_PEER_STATUS_INSTANCE, mep->instance)) goto nla_put_failure; if (nla_put_u32(skb, IFLA_BRIDGE_CFM_CC_PEER_STATUS_PEER_MEPID, peer_mep->mepid)) goto nla_put_failure; if (nla_put_u32(skb, IFLA_BRIDGE_CFM_CC_PEER_STATUS_CCM_DEFECT, peer_mep->cc_status.ccm_defect)) goto nla_put_failure; if (nla_put_u32(skb, IFLA_BRIDGE_CFM_CC_PEER_STATUS_RDI, peer_mep->cc_status.rdi)) goto nla_put_failure; if (nla_put_u8(skb, IFLA_BRIDGE_CFM_CC_PEER_STATUS_PORT_TLV_VALUE, peer_mep->cc_status.port_tlv_value)) goto nla_put_failure; if (nla_put_u8(skb, IFLA_BRIDGE_CFM_CC_PEER_STATUS_IF_TLV_VALUE, peer_mep->cc_status.if_tlv_value)) goto nla_put_failure; if (nla_put_u32(skb, IFLA_BRIDGE_CFM_CC_PEER_STATUS_SEEN, peer_mep->cc_status.seen)) goto nla_put_failure; if (nla_put_u32(skb, IFLA_BRIDGE_CFM_CC_PEER_STATUS_TLV_SEEN, peer_mep->cc_status.tlv_seen)) goto nla_put_failure; if (nla_put_u32(skb, IFLA_BRIDGE_CFM_CC_PEER_STATUS_SEQ_UNEXP_SEEN, peer_mep->cc_status.seq_unexp_seen)) goto nla_put_failure; if (getlink) { /* Only clear if this is a GETLINK */ /* Clear all 'seen' indications */ peer_mep->cc_status.seen = false; peer_mep->cc_status.tlv_seen = false; peer_mep->cc_status.seq_unexp_seen = false; } nla_nest_end(skb, tb); } } return 0; nla_put_failure: nla_nest_cancel(skb, tb); nla_info_failure: return -EMSGSIZE; }
1 31 1 1 2 1 31 1 1 29 1 6 92 38 1 4 29 3 3 1 39 21 40 14 12 42 33 22 10 15 15 6 5 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 // SPDX-License-Identifier: GPL-2.0-or-later /* * net/sched/ematch.c Extended Match API * * Authors: Thomas Graf <tgraf@suug.ch> * * ========================================================================== * * An extended match (ematch) is a small classification tool not worth * writing a full classifier for. Ematches can be interconnected to form * a logic expression and get attached to classifiers to extend their * functionatlity. * * The userspace part transforms the logic expressions into an array * consisting of multiple sequences of interconnected ematches separated * by markers. Precedence is implemented by a special ematch kind * referencing a sequence beyond the marker of the current sequence * causing the current position in the sequence to be pushed onto a stack * to allow the current position to be overwritten by the position referenced * in the special ematch. Matching continues in the new sequence until a * marker is reached causing the position to be restored from the stack. * * Example: * A AND (B1 OR B2) AND C AND D * * ------->-PUSH------- * -->-- / -->-- \ -->-- * / \ / / \ \ / \ * +-------+-------+-------+-------+-------+--------+ * | A AND | B AND | C AND | D END | B1 OR | B2 END | * +-------+-------+-------+-------+-------+--------+ * \ / * --------<-POP--------- * * where B is a virtual ematch referencing to sequence starting with B1. * * ========================================================================== * * How to write an ematch in 60 seconds * ------------------------------------ * * 1) Provide a matcher function: * static int my_match(struct sk_buff *skb, struct tcf_ematch *m, * struct tcf_pkt_info *info) * { * struct mydata *d = (struct mydata *) m->data; * * if (...matching goes here...) * return 1; * else * return 0; * } * * 2) Fill out a struct tcf_ematch_ops: * static struct tcf_ematch_ops my_ops = { * .kind = unique id, * .datalen = sizeof(struct mydata), * .match = my_match, * .owner = THIS_MODULE, * }; * * 3) Register/Unregister your ematch: * static int __init init_my_ematch(void) * { * return tcf_em_register(&my_ops); * } * * static void __exit exit_my_ematch(void) * { * tcf_em_unregister(&my_ops); * } * * module_init(init_my_ematch); * module_exit(exit_my_ematch); * * 4) By now you should have two more seconds left, barely enough to * open up a beer to watch the compilation going. */ #include <linux/module.h> #include <linux/slab.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/errno.h> #include <linux/rtnetlink.h> #include <linux/skbuff.h> #include <net/pkt_cls.h> static LIST_HEAD(ematch_ops); static DEFINE_RWLOCK(ematch_mod_lock); static struct tcf_ematch_ops *tcf_em_lookup(u16 kind) { struct tcf_ematch_ops *e = NULL; read_lock(&ematch_mod_lock); list_for_each_entry(e, &ematch_ops, link) { if (kind == e->kind) { if (!try_module_get(e->owner)) e = NULL; read_unlock(&ematch_mod_lock); return e; } } read_unlock(&ematch_mod_lock); return NULL; } /** * tcf_em_register - register an extended match * * @ops: ematch operations lookup table * * This function must be called by ematches to announce their presence. * The given @ops must have kind set to a unique identifier and the * callback match() must be implemented. All other callbacks are optional * and a fallback implementation is used instead. * * Returns -EEXISTS if an ematch of the same kind has already registered. */ int tcf_em_register(struct tcf_ematch_ops *ops) { int err = -EEXIST; struct tcf_ematch_ops *e; if (ops->match == NULL) return -EINVAL; write_lock(&ematch_mod_lock); list_for_each_entry(e, &ematch_ops, link) if (ops->kind == e->kind) goto errout; list_add_tail(&ops->link, &ematch_ops); err = 0; errout: write_unlock(&ematch_mod_lock); return err; } EXPORT_SYMBOL(tcf_em_register); /** * tcf_em_unregister - unregister and extended match * * @ops: ematch operations lookup table * * This function must be called by ematches to announce their disappearance * for examples when the module gets unloaded. The @ops parameter must be * the same as the one used for registration. * * Returns -ENOENT if no matching ematch was found. */ void tcf_em_unregister(struct tcf_ematch_ops *ops) { write_lock(&ematch_mod_lock); list_del(&ops->link); write_unlock(&ematch_mod_lock); } EXPORT_SYMBOL(tcf_em_unregister); static inline struct tcf_ematch *tcf_em_get_match(struct tcf_ematch_tree *tree, int index) { return &tree->matches[index]; } static int tcf_em_validate(struct tcf_proto *tp, struct tcf_ematch_tree_hdr *tree_hdr, struct tcf_ematch *em, struct nlattr *nla, int idx) { int err = -EINVAL; struct tcf_ematch_hdr *em_hdr = nla_data(nla); int data_len = nla_len(nla) - sizeof(*em_hdr); void *data = (void *) em_hdr + sizeof(*em_hdr); struct net *net = tp->chain->block->net; if (!TCF_EM_REL_VALID(em_hdr->flags)) goto errout; if (em_hdr->kind == TCF_EM_CONTAINER) { /* Special ematch called "container", carries an index * referencing an external ematch sequence. */ u32 ref; if (data_len < sizeof(ref)) goto errout; ref = *(u32 *) data; if (ref >= tree_hdr->nmatches) goto errout; /* We do not allow backward jumps to avoid loops and jumps * to our own position are of course illegal. */ if (ref <= idx) goto errout; em->data = ref; } else { /* Note: This lookup will increase the module refcnt * of the ematch module referenced. In case of a failure, * a destroy function is called by the underlying layer * which automatically releases the reference again, therefore * the module MUST not be given back under any circumstances * here. Be aware, the destroy function assumes that the * module is held if the ops field is non zero. */ em->ops = tcf_em_lookup(em_hdr->kind); if (em->ops == NULL) { err = -ENOENT; #ifdef CONFIG_MODULES __rtnl_unlock(); request_module("ematch-kind-%u", em_hdr->kind); rtnl_lock(); em->ops = tcf_em_lookup(em_hdr->kind); if (em->ops) { /* We dropped the RTNL mutex in order to * perform the module load. Tell the caller * to replay the request. */ module_put(em->ops->owner); em->ops = NULL; err = -EAGAIN; } #endif goto errout; } /* ematch module provides expected length of data, so we * can do a basic sanity check. */ if (em->ops->datalen && data_len < em->ops->datalen) goto errout; if (em->ops->change) { err = -EINVAL; if (em_hdr->flags & TCF_EM_SIMPLE) goto errout; err = em->ops->change(net, data, data_len, em); if (err < 0) goto errout; } else if (data_len > 0) { /* ematch module doesn't provide an own change * procedure and expects us to allocate and copy * the ematch data. * * TCF_EM_SIMPLE may be specified stating that the * data only consists of a u32 integer and the module * does not expected a memory reference but rather * the value carried. */ if (em_hdr->flags & TCF_EM_SIMPLE) { if (em->ops->datalen > 0) goto errout; if (data_len < sizeof(u32)) goto errout; em->data = *(u32 *) data; } else { void *v = kmemdup(data, data_len, GFP_KERNEL); if (v == NULL) { err = -ENOBUFS; goto errout; } em->data = (unsigned long) v; } em->datalen = data_len; } } em->matchid = em_hdr->matchid; em->flags = em_hdr->flags; em->net = net; err = 0; errout: return err; } static const struct nla_policy em_policy[TCA_EMATCH_TREE_MAX + 1] = { [TCA_EMATCH_TREE_HDR] = { .len = sizeof(struct tcf_ematch_tree_hdr) }, [TCA_EMATCH_TREE_LIST] = { .type = NLA_NESTED }, }; /** * tcf_em_tree_validate - validate ematch config TLV and build ematch tree * * @tp: classifier kind handle * @nla: ematch tree configuration TLV * @tree: destination ematch tree variable to store the resulting * ematch tree. * * This function validates the given configuration TLV @nla and builds an * ematch tree in @tree. The resulting tree must later be copied into * the private classifier data using tcf_em_tree_change(). You MUST NOT * provide the ematch tree variable of the private classifier data directly, * the changes would not be locked properly. * * Returns a negative error code if the configuration TLV contains errors. */ int tcf_em_tree_validate(struct tcf_proto *tp, struct nlattr *nla, struct tcf_ematch_tree *tree) { int idx, list_len, matches_len, err; struct nlattr *tb[TCA_EMATCH_TREE_MAX + 1]; struct nlattr *rt_match, *rt_hdr, *rt_list; struct tcf_ematch_tree_hdr *tree_hdr; struct tcf_ematch *em; memset(tree, 0, sizeof(*tree)); if (!nla) return 0; err = nla_parse_nested_deprecated(tb, TCA_EMATCH_TREE_MAX, nla, em_policy, NULL); if (err < 0) goto errout; err = -EINVAL; rt_hdr = tb[TCA_EMATCH_TREE_HDR]; rt_list = tb[TCA_EMATCH_TREE_LIST]; if (rt_hdr == NULL || rt_list == NULL) goto errout; tree_hdr = nla_data(rt_hdr); memcpy(&tree->hdr, tree_hdr, sizeof(*tree_hdr)); rt_match = nla_data(rt_list); list_len = nla_len(rt_list); matches_len = tree_hdr->nmatches * sizeof(*em); tree->matches = kzalloc(matches_len, GFP_KERNEL); if (tree->matches == NULL) goto errout; /* We do not use nla_parse_nested here because the maximum * number of attributes is unknown. This saves us the allocation * for a tb buffer which would serve no purpose at all. * * The array of rt attributes is parsed in the order as they are * provided, their type must be incremental from 1 to n. Even * if it does not serve any real purpose, a failure of sticking * to this policy will result in parsing failure. */ for (idx = 0; nla_ok(rt_match, list_len); idx++) { err = -EINVAL; if (rt_match->nla_type != (idx + 1)) goto errout_abort; if (idx >= tree_hdr->nmatches) goto errout_abort; if (nla_len(rt_match) < sizeof(struct tcf_ematch_hdr)) goto errout_abort; em = tcf_em_get_match(tree, idx); err = tcf_em_validate(tp, tree_hdr, em, rt_match, idx); if (err < 0) goto errout_abort; rt_match = nla_next(rt_match, &list_len); } /* Check if the number of matches provided by userspace actually * complies with the array of matches. The number was used for * the validation of references and a mismatch could lead to * undefined references during the matching process. */ if (idx != tree_hdr->nmatches) { err = -EINVAL; goto errout_abort; } err = 0; errout: return err; errout_abort: tcf_em_tree_destroy(tree); return err; } EXPORT_SYMBOL(tcf_em_tree_validate); /** * tcf_em_tree_destroy - destroy an ematch tree * * @tree: ematch tree to be deleted * * This functions destroys an ematch tree previously created by * tcf_em_tree_validate()/tcf_em_tree_change(). You must ensure that * the ematch tree is not in use before calling this function. */ void tcf_em_tree_destroy(struct tcf_ematch_tree *tree) { int i; if (tree->matches == NULL) return; for (i = 0; i < tree->hdr.nmatches; i++) { struct tcf_ematch *em = tcf_em_get_match(tree, i); if (em->ops) { if (em->ops->destroy) em->ops->destroy(em); else if (!tcf_em_is_simple(em)) kfree((void *) em->data); module_put(em->ops->owner); } } tree->hdr.nmatches = 0; kfree(tree->matches); tree->matches = NULL; } EXPORT_SYMBOL(tcf_em_tree_destroy); /** * tcf_em_tree_dump - dump ematch tree into a rtnl message * * @skb: skb holding the rtnl message * @tree: ematch tree to be dumped * @tlv: TLV type to be used to encapsulate the tree * * This function dumps a ematch tree into a rtnl message. It is valid to * call this function while the ematch tree is in use. * * Returns -1 if the skb tailroom is insufficient. */ int tcf_em_tree_dump(struct sk_buff *skb, struct tcf_ematch_tree *tree, int tlv) { int i; u8 *tail; struct nlattr *top_start; struct nlattr *list_start; top_start = nla_nest_start_noflag(skb, tlv); if (top_start == NULL) goto nla_put_failure; if (nla_put(skb, TCA_EMATCH_TREE_HDR, sizeof(tree->hdr), &tree->hdr)) goto nla_put_failure; list_start = nla_nest_start_noflag(skb, TCA_EMATCH_TREE_LIST); if (list_start == NULL) goto nla_put_failure; tail = skb_tail_pointer(skb); for (i = 0; i < tree->hdr.nmatches; i++) { struct nlattr *match_start = (struct nlattr *)tail; struct tcf_ematch *em = tcf_em_get_match(tree, i); struct tcf_ematch_hdr em_hdr = { .kind = em->ops ? em->ops->kind : TCF_EM_CONTAINER, .matchid = em->matchid, .flags = em->flags }; if (nla_put(skb, i + 1, sizeof(em_hdr), &em_hdr)) goto nla_put_failure; if (em->ops && em->ops->dump) { if (em->ops->dump(skb, em) < 0) goto nla_put_failure; } else if (tcf_em_is_container(em) || tcf_em_is_simple(em)) { u32 u = em->data; nla_put_nohdr(skb, sizeof(u), &u); } else if (em->datalen > 0) nla_put_nohdr(skb, em->datalen, (void *) em->data); tail = skb_tail_pointer(skb); match_start->nla_len = tail - (u8 *)match_start; } nla_nest_end(skb, list_start); nla_nest_end(skb, top_start); return 0; nla_put_failure: return -1; } EXPORT_SYMBOL(tcf_em_tree_dump); static inline int tcf_em_match(struct sk_buff *skb, struct tcf_ematch *em, struct tcf_pkt_info *info) { int r = em->ops->match(skb, em, info); return tcf_em_is_inverted(em) ? !r : r; } /* Do not use this function directly, use tcf_em_tree_match instead */ int __tcf_em_tree_match(struct sk_buff *skb, struct tcf_ematch_tree *tree, struct tcf_pkt_info *info) { int stackp = 0, match_idx = 0, res = 0; struct tcf_ematch *cur_match; int stack[CONFIG_NET_EMATCH_STACK]; proceed: while (match_idx < tree->hdr.nmatches) { cur_match = tcf_em_get_match(tree, match_idx); if (tcf_em_is_container(cur_match)) { if (unlikely(stackp >= CONFIG_NET_EMATCH_STACK)) goto stack_overflow; stack[stackp++] = match_idx; match_idx = cur_match->data; goto proceed; } res = tcf_em_match(skb, cur_match, info); if (tcf_em_early_end(cur_match, res)) break; match_idx++; } pop_stack: if (stackp > 0) { match_idx = stack[--stackp]; cur_match = tcf_em_get_match(tree, match_idx); if (tcf_em_is_inverted(cur_match)) res = !res; if (tcf_em_early_end(cur_match, res)) { goto pop_stack; } else { match_idx++; goto proceed; } } return res; stack_overflow: net_warn_ratelimited("tc ematch: local stack overflow, increase NET_EMATCH_STACK\n"); return -1; } EXPORT_SYMBOL(__tcf_em_tree_match);
2 1 2 2 3 2 2 2 1 1 2 1 1 1 2 9 9 2 3 1 3 3 3 2 1 2 2 2 2 9 7 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 // SPDX-License-Identifier: GPL-2.0-or-later /* * USB-to-WWAN Driver for Sierra Wireless modems * * Copyright (C) 2008, 2009, 2010 Paxton Smith, Matthew Safar, Rory Filer * <linux@sierrawireless.com> * * Portions of this based on the cdc_ether driver by David Brownell (2003-2005) * and Ole Andre Vadla Ravnas (ActiveSync) (2006). * * IMPORTANT DISCLAIMER: This driver is not commercially supported by * Sierra Wireless. Use at your own risk. */ #define DRIVER_AUTHOR "Paxton Smith, Matthew Safar, Rory Filer" #define DRIVER_DESC "USB-to-WWAN Driver for Sierra Wireless modems" /* if defined debug messages enabled */ /*#define DEBUG*/ #include <linux/module.h> #include <linux/etherdevice.h> #include <linux/ethtool.h> #include <linux/mii.h> #include <linux/sched.h> #include <linux/timer.h> #include <linux/usb.h> #include <linux/usb/cdc.h> #include <net/ip.h> #include <net/udp.h> #include <linux/unaligned.h> #include <linux/usb/usbnet.h> #define SWI_USB_REQUEST_GET_FW_ATTR 0x06 #define SWI_GET_FW_ATTR_MASK 0x08 /* atomic counter partially included in MAC address to make sure 2 devices * do not end up with the same MAC - concept breaks in case of > 255 ifaces */ static atomic_t iface_counter = ATOMIC_INIT(0); /* * SYNC Timer Delay definition used to set the expiry time */ #define SIERRA_NET_SYNCDELAY (2*HZ) /* Max. MTU supported. The modem buffers are limited to 1500 */ #define SIERRA_NET_MAX_SUPPORTED_MTU 1500 /* The SIERRA_NET_USBCTL_BUF_LEN defines a buffer size allocated for control * message reception ... and thus the max. received packet. * (May be the cause for parse_hip returning -EINVAL) */ #define SIERRA_NET_USBCTL_BUF_LEN 1024 /* Overriding the default usbnet rx_urb_size */ #define SIERRA_NET_RX_URB_SIZE (8 * 1024) /* Private data structure */ struct sierra_net_data { u16 link_up; /* air link up or down */ u8 tx_hdr_template[4]; /* part of HIP hdr for tx'd packets */ u8 sync_msg[4]; /* SYNC message */ u8 shdwn_msg[4]; /* Shutdown message */ /* Backpointer to the container */ struct usbnet *usbnet; u8 ifnum; /* interface number */ /* Bit masks, must be a power of 2 */ #define SIERRA_NET_EVENT_RESP_AVAIL 0x01 #define SIERRA_NET_TIMER_EXPIRY 0x02 unsigned long kevent_flags; struct work_struct sierra_net_kevent; struct timer_list sync_timer; /* For retrying SYNC sequence */ }; struct param { int is_present; union { void *ptr; u32 dword; u16 word; u8 byte; }; }; /* HIP message type */ #define SIERRA_NET_HIP_EXTENDEDID 0x7F #define SIERRA_NET_HIP_HSYNC_ID 0x60 /* Modem -> host */ #define SIERRA_NET_HIP_RESTART_ID 0x62 /* Modem -> host */ #define SIERRA_NET_HIP_MSYNC_ID 0x20 /* Host -> modem */ #define SIERRA_NET_HIP_SHUTD_ID 0x26 /* Host -> modem */ #define SIERRA_NET_HIP_EXT_IP_IN_ID 0x0202 #define SIERRA_NET_HIP_EXT_IP_OUT_ID 0x0002 /* 3G UMTS Link Sense Indication definitions */ #define SIERRA_NET_HIP_LSI_UMTSID 0x78 /* Reverse Channel Grant Indication HIP message */ #define SIERRA_NET_HIP_RCGI 0x64 /* LSI Protocol types */ #define SIERRA_NET_PROTOCOL_UMTS 0x01 #define SIERRA_NET_PROTOCOL_UMTS_DS 0x04 /* LSI Coverage */ #define SIERRA_NET_COVERAGE_NONE 0x00 #define SIERRA_NET_COVERAGE_NOPACKET 0x01 /* LSI Session */ #define SIERRA_NET_SESSION_IDLE 0x00 /* LSI Link types */ #define SIERRA_NET_AS_LINK_TYPE_IPV4 0x00 #define SIERRA_NET_AS_LINK_TYPE_IPV6 0x02 struct lsi_umts { u8 protocol; u8 unused1; __be16 length; /* eventually use a union for the rest - assume umts for now */ u8 coverage; u8 network_len; /* network name len */ u8 network[40]; /* network name (UCS2, bigendian) */ u8 session_state; u8 unused3[33]; } __packed; struct lsi_umts_single { struct lsi_umts lsi; u8 link_type; u8 pdp_addr_len; /* NW-supplied PDP address len */ u8 pdp_addr[16]; /* NW-supplied PDP address (bigendian)) */ u8 unused4[23]; u8 dns1_addr_len; /* NW-supplied 1st DNS address len (bigendian) */ u8 dns1_addr[16]; /* NW-supplied 1st DNS address */ u8 dns2_addr_len; /* NW-supplied 2nd DNS address len */ u8 dns2_addr[16]; /* NW-supplied 2nd DNS address (bigendian)*/ u8 wins1_addr_len; /* NW-supplied 1st Wins address len */ u8 wins1_addr[16]; /* NW-supplied 1st Wins address (bigendian)*/ u8 wins2_addr_len; /* NW-supplied 2nd Wins address len */ u8 wins2_addr[16]; /* NW-supplied 2nd Wins address (bigendian) */ u8 unused5[4]; u8 gw_addr_len; /* NW-supplied GW address len */ u8 gw_addr[16]; /* NW-supplied GW address (bigendian) */ u8 reserved[8]; } __packed; struct lsi_umts_dual { struct lsi_umts lsi; u8 pdp_addr4_len; /* NW-supplied PDP IPv4 address len */ u8 pdp_addr4[4]; /* NW-supplied PDP IPv4 address (bigendian)) */ u8 pdp_addr6_len; /* NW-supplied PDP IPv6 address len */ u8 pdp_addr6[16]; /* NW-supplied PDP IPv6 address (bigendian)) */ u8 unused4[23]; u8 dns1_addr4_len; /* NW-supplied 1st DNS v4 address len (bigendian) */ u8 dns1_addr4[4]; /* NW-supplied 1st DNS v4 address */ u8 dns1_addr6_len; /* NW-supplied 1st DNS v6 address len */ u8 dns1_addr6[16]; /* NW-supplied 1st DNS v6 address (bigendian)*/ u8 dns2_addr4_len; /* NW-supplied 2nd DNS v4 address len (bigendian) */ u8 dns2_addr4[4]; /* NW-supplied 2nd DNS v4 address */ u8 dns2_addr6_len; /* NW-supplied 2nd DNS v6 address len */ u8 dns2_addr6[16]; /* NW-supplied 2nd DNS v6 address (bigendian)*/ u8 unused5[68]; } __packed; #define SIERRA_NET_LSI_COMMON_LEN 4 #define SIERRA_NET_LSI_UMTS_LEN (sizeof(struct lsi_umts_single)) #define SIERRA_NET_LSI_UMTS_STATUS_LEN \ (SIERRA_NET_LSI_UMTS_LEN - SIERRA_NET_LSI_COMMON_LEN) #define SIERRA_NET_LSI_UMTS_DS_LEN (sizeof(struct lsi_umts_dual)) #define SIERRA_NET_LSI_UMTS_DS_STATUS_LEN \ (SIERRA_NET_LSI_UMTS_DS_LEN - SIERRA_NET_LSI_COMMON_LEN) /* Our own net device operations structure */ static const struct net_device_ops sierra_net_device_ops = { .ndo_open = usbnet_open, .ndo_stop = usbnet_stop, .ndo_start_xmit = usbnet_start_xmit, .ndo_tx_timeout = usbnet_tx_timeout, .ndo_change_mtu = usbnet_change_mtu, .ndo_get_stats64 = dev_get_tstats64, .ndo_set_mac_address = eth_mac_addr, .ndo_validate_addr = eth_validate_addr, }; /* get private data associated with passed in usbnet device */ static inline struct sierra_net_data *sierra_net_get_private(struct usbnet *dev) { return (struct sierra_net_data *)dev->data[0]; } /* set private data associated with passed in usbnet device */ static inline void sierra_net_set_private(struct usbnet *dev, struct sierra_net_data *priv) { dev->data[0] = (unsigned long)priv; } /* is packet IPv4/IPv6 */ static inline int is_ip(struct sk_buff *skb) { return skb->protocol == cpu_to_be16(ETH_P_IP) || skb->protocol == cpu_to_be16(ETH_P_IPV6); } /* * check passed in packet and make sure that: * - it is linear (no scatter/gather) * - it is ethernet (mac_header properly set) */ static int check_ethip_packet(struct sk_buff *skb, struct usbnet *dev) { skb_reset_mac_header(skb); /* ethernet header */ if (skb_is_nonlinear(skb)) { netdev_err(dev->net, "Non linear buffer-dropping\n"); return 0; } if (!pskb_may_pull(skb, ETH_HLEN)) return 0; skb->protocol = eth_hdr(skb)->h_proto; return 1; } static const u8 *save16bit(struct param *p, const u8 *datap) { p->is_present = 1; p->word = get_unaligned_be16(datap); return datap + sizeof(p->word); } static const u8 *save8bit(struct param *p, const u8 *datap) { p->is_present = 1; p->byte = *datap; return datap + sizeof(p->byte); } /*----------------------------------------------------------------------------* * BEGIN HIP * *----------------------------------------------------------------------------*/ /* HIP header */ #define SIERRA_NET_HIP_HDR_LEN 4 /* Extended HIP header */ #define SIERRA_NET_HIP_EXT_HDR_LEN 6 struct hip_hdr { int hdrlen; struct param payload_len; struct param msgid; struct param msgspecific; struct param extmsgid; }; static int parse_hip(const u8 *buf, const u32 buflen, struct hip_hdr *hh) { const u8 *curp = buf; int padded; if (buflen < SIERRA_NET_HIP_HDR_LEN) return -EPROTO; curp = save16bit(&hh->payload_len, curp); curp = save8bit(&hh->msgid, curp); curp = save8bit(&hh->msgspecific, curp); padded = hh->msgid.byte & 0x80; hh->msgid.byte &= 0x7F; /* 7 bits */ hh->extmsgid.is_present = (hh->msgid.byte == SIERRA_NET_HIP_EXTENDEDID); if (hh->extmsgid.is_present) { if (buflen < SIERRA_NET_HIP_EXT_HDR_LEN) return -EPROTO; hh->payload_len.word &= 0x3FFF; /* 14 bits */ curp = save16bit(&hh->extmsgid, curp); hh->extmsgid.word &= 0x03FF; /* 10 bits */ hh->hdrlen = SIERRA_NET_HIP_EXT_HDR_LEN; } else { hh->payload_len.word &= 0x07FF; /* 11 bits */ hh->hdrlen = SIERRA_NET_HIP_HDR_LEN; } if (padded) { hh->hdrlen++; hh->payload_len.word--; } /* if real packet shorter than the claimed length */ if (buflen < (hh->hdrlen + hh->payload_len.word)) return -EINVAL; return 0; } static void build_hip(u8 *buf, const u16 payloadlen, struct sierra_net_data *priv) { /* the following doesn't have the full functionality. We * currently build only one kind of header, so it is faster this way */ put_unaligned_be16(payloadlen, buf); memcpy(buf+2, priv->tx_hdr_template, sizeof(priv->tx_hdr_template)); } /*----------------------------------------------------------------------------* * END HIP * *----------------------------------------------------------------------------*/ static int sierra_net_send_cmd(struct usbnet *dev, u8 *cmd, int cmdlen, const char * cmd_name) { struct sierra_net_data *priv = sierra_net_get_private(dev); int status; status = usbnet_write_cmd(dev, USB_CDC_SEND_ENCAPSULATED_COMMAND, USB_DIR_OUT|USB_TYPE_CLASS|USB_RECIP_INTERFACE, 0, priv->ifnum, cmd, cmdlen); if (status != cmdlen && status != -ENODEV) netdev_err(dev->net, "Submit %s failed %d\n", cmd_name, status); return status; } static int sierra_net_send_sync(struct usbnet *dev) { int status; struct sierra_net_data *priv = sierra_net_get_private(dev); dev_dbg(&dev->udev->dev, "%s", __func__); status = sierra_net_send_cmd(dev, priv->sync_msg, sizeof(priv->sync_msg), "SYNC"); return status; } static void sierra_net_set_ctx_index(struct sierra_net_data *priv, u8 ctx_ix) { dev_dbg(&(priv->usbnet->udev->dev), "%s %d", __func__, ctx_ix); priv->tx_hdr_template[0] = 0x3F; priv->tx_hdr_template[1] = ctx_ix; *((__be16 *)&priv->tx_hdr_template[2]) = cpu_to_be16(SIERRA_NET_HIP_EXT_IP_OUT_ID); } static int sierra_net_parse_lsi(struct usbnet *dev, char *data, int datalen) { struct lsi_umts *lsi = (struct lsi_umts *)data; u32 expected_length; if (datalen < sizeof(struct lsi_umts_single)) { netdev_err(dev->net, "%s: Data length %d, exp >= %zu\n", __func__, datalen, sizeof(struct lsi_umts_single)); return -1; } /* Validate the session state */ if (lsi->session_state == SIERRA_NET_SESSION_IDLE) { netdev_err(dev->net, "Session idle, 0x%02x\n", lsi->session_state); return 0; } /* Validate the protocol - only support UMTS for now */ if (lsi->protocol == SIERRA_NET_PROTOCOL_UMTS) { struct lsi_umts_single *single = (struct lsi_umts_single *)lsi; /* Validate the link type */ if (single->link_type != SIERRA_NET_AS_LINK_TYPE_IPV4 && single->link_type != SIERRA_NET_AS_LINK_TYPE_IPV6) { netdev_err(dev->net, "Link type unsupported: 0x%02x\n", single->link_type); return -1; } expected_length = SIERRA_NET_LSI_UMTS_STATUS_LEN; } else if (lsi->protocol == SIERRA_NET_PROTOCOL_UMTS_DS) { expected_length = SIERRA_NET_LSI_UMTS_DS_STATUS_LEN; } else { netdev_err(dev->net, "Protocol unsupported, 0x%02x\n", lsi->protocol); return -1; } if (be16_to_cpu(lsi->length) != expected_length) { netdev_err(dev->net, "%s: LSI_UMTS_STATUS_LEN %d, exp %u\n", __func__, be16_to_cpu(lsi->length), expected_length); return -1; } /* Validate the coverage */ if (lsi->coverage == SIERRA_NET_COVERAGE_NONE || lsi->coverage == SIERRA_NET_COVERAGE_NOPACKET) { netdev_err(dev->net, "No coverage, 0x%02x\n", lsi->coverage); return 0; } /* Set link_sense true */ return 1; } static void sierra_net_handle_lsi(struct usbnet *dev, char *data, struct hip_hdr *hh) { struct sierra_net_data *priv = sierra_net_get_private(dev); int link_up; link_up = sierra_net_parse_lsi(dev, data + hh->hdrlen, hh->payload_len.word); if (link_up < 0) { netdev_err(dev->net, "Invalid LSI\n"); return; } if (link_up) { sierra_net_set_ctx_index(priv, hh->msgspecific.byte); priv->link_up = 1; } else { priv->link_up = 0; } usbnet_link_change(dev, link_up, 0); } static void sierra_net_dosync(struct usbnet *dev) { int status; struct sierra_net_data *priv = sierra_net_get_private(dev); dev_dbg(&dev->udev->dev, "%s", __func__); /* The SIERRA_NET_HIP_MSYNC_ID command appears to request that the * firmware restart itself. After restarting, the modem will respond * with the SIERRA_NET_HIP_RESTART_ID indication. The driver continues * sending MSYNC commands every few seconds until it receives the * RESTART event from the firmware */ /* tell modem we are ready */ status = sierra_net_send_sync(dev); if (status < 0) netdev_err(dev->net, "Send SYNC failed, status %d\n", status); status = sierra_net_send_sync(dev); if (status < 0) netdev_err(dev->net, "Send SYNC failed, status %d\n", status); /* Now, start a timer and make sure we get the Restart Indication */ priv->sync_timer.expires = jiffies + SIERRA_NET_SYNCDELAY; add_timer(&priv->sync_timer); } static void sierra_net_kevent(struct work_struct *work) { struct sierra_net_data *priv = container_of(work, struct sierra_net_data, sierra_net_kevent); struct usbnet *dev = priv->usbnet; int len; int err; u8 *buf; u8 ifnum; if (test_bit(SIERRA_NET_EVENT_RESP_AVAIL, &priv->kevent_flags)) { clear_bit(SIERRA_NET_EVENT_RESP_AVAIL, &priv->kevent_flags); /* Query the modem for the LSI message */ buf = kzalloc(SIERRA_NET_USBCTL_BUF_LEN, GFP_KERNEL); if (!buf) return; ifnum = priv->ifnum; len = usb_control_msg(dev->udev, usb_rcvctrlpipe(dev->udev, 0), USB_CDC_GET_ENCAPSULATED_RESPONSE, USB_DIR_IN|USB_TYPE_CLASS|USB_RECIP_INTERFACE, 0, ifnum, buf, SIERRA_NET_USBCTL_BUF_LEN, USB_CTRL_SET_TIMEOUT); if (len < 0) { netdev_err(dev->net, "usb_control_msg failed, status %d\n", len); } else { struct hip_hdr hh; dev_dbg(&dev->udev->dev, "%s: Received status message," " %04x bytes", __func__, len); err = parse_hip(buf, len, &hh); if (err) { netdev_err(dev->net, "%s: Bad packet," " parse result %d\n", __func__, err); kfree(buf); return; } /* Validate packet length */ if (len != hh.hdrlen + hh.payload_len.word) { netdev_err(dev->net, "%s: Bad packet, received" " %d, expected %d\n", __func__, len, hh.hdrlen + hh.payload_len.word); kfree(buf); return; } /* Switch on received message types */ switch (hh.msgid.byte) { case SIERRA_NET_HIP_LSI_UMTSID: dev_dbg(&dev->udev->dev, "LSI for ctx:%d", hh.msgspecific.byte); sierra_net_handle_lsi(dev, buf, &hh); break; case SIERRA_NET_HIP_RESTART_ID: dev_dbg(&dev->udev->dev, "Restart reported: %d," " stopping sync timer", hh.msgspecific.byte); /* Got sync resp - stop timer & clear mask */ timer_delete_sync(&priv->sync_timer); clear_bit(SIERRA_NET_TIMER_EXPIRY, &priv->kevent_flags); break; case SIERRA_NET_HIP_HSYNC_ID: dev_dbg(&dev->udev->dev, "SYNC received"); err = sierra_net_send_sync(dev); if (err < 0) netdev_err(dev->net, "Send SYNC failed %d\n", err); break; case SIERRA_NET_HIP_EXTENDEDID: netdev_err(dev->net, "Unrecognized HIP msg, " "extmsgid 0x%04x\n", hh.extmsgid.word); break; case SIERRA_NET_HIP_RCGI: /* Ignored */ break; default: netdev_err(dev->net, "Unrecognized HIP msg, " "msgid 0x%02x\n", hh.msgid.byte); break; } } kfree(buf); } /* The sync timer bit might be set */ if (test_bit(SIERRA_NET_TIMER_EXPIRY, &priv->kevent_flags)) { clear_bit(SIERRA_NET_TIMER_EXPIRY, &priv->kevent_flags); dev_dbg(&dev->udev->dev, "Deferred sync timer expiry"); sierra_net_dosync(priv->usbnet); } if (priv->kevent_flags) dev_dbg(&dev->udev->dev, "sierra_net_kevent done, " "kevent_flags = 0x%lx", priv->kevent_flags); } static void sierra_net_defer_kevent(struct usbnet *dev, int work) { struct sierra_net_data *priv = sierra_net_get_private(dev); set_bit(work, &priv->kevent_flags); schedule_work(&priv->sierra_net_kevent); } /* * Sync Retransmit Timer Handler. On expiry, kick the work queue */ static void sierra_sync_timer(struct timer_list *t) { struct sierra_net_data *priv = timer_container_of(priv, t, sync_timer); struct usbnet *dev = priv->usbnet; dev_dbg(&dev->udev->dev, "%s", __func__); /* Kick the tasklet */ sierra_net_defer_kevent(dev, SIERRA_NET_TIMER_EXPIRY); } static void sierra_net_status(struct usbnet *dev, struct urb *urb) { struct usb_cdc_notification *event; dev_dbg(&dev->udev->dev, "%s", __func__); if (urb->actual_length < sizeof *event) return; /* Add cases to handle other standard notifications. */ event = urb->transfer_buffer; switch (event->bNotificationType) { case USB_CDC_NOTIFY_NETWORK_CONNECTION: case USB_CDC_NOTIFY_SPEED_CHANGE: /* USB 305 sends those */ break; case USB_CDC_NOTIFY_RESPONSE_AVAILABLE: sierra_net_defer_kevent(dev, SIERRA_NET_EVENT_RESP_AVAIL); break; default: netdev_err(dev->net, ": unexpected notification %02x!\n", event->bNotificationType); break; } } static u32 sierra_net_get_link(struct net_device *net) { struct usbnet *dev = netdev_priv(net); /* Report link is down whenever the interface is down */ return sierra_net_get_private(dev)->link_up && netif_running(net); } static const struct ethtool_ops sierra_net_ethtool_ops = { .get_drvinfo = usbnet_get_drvinfo, .get_link = sierra_net_get_link, .get_msglevel = usbnet_get_msglevel, .set_msglevel = usbnet_set_msglevel, .nway_reset = usbnet_nway_reset, .get_link_ksettings = usbnet_get_link_ksettings_mii, .set_link_ksettings = usbnet_set_link_ksettings_mii, }; static int sierra_net_get_fw_attr(struct usbnet *dev, u16 *datap) { int result = 0; __le16 attrdata; result = usbnet_read_cmd(dev, /* _u8 vendor specific request */ SWI_USB_REQUEST_GET_FW_ATTR, USB_DIR_IN | USB_TYPE_VENDOR, /* __u8 request type */ 0x0000, /* __u16 value not used */ 0x0000, /* __u16 index not used */ &attrdata, /* char *data */ sizeof(attrdata) /* __u16 size */ ); if (result < 0) return -EIO; *datap = le16_to_cpu(attrdata); return result; } /* * collects the bulk endpoints, the status endpoint. */ static int sierra_net_bind(struct usbnet *dev, struct usb_interface *intf) { u8 ifacenum; u8 numendpoints; u16 fwattr = 0; int status; struct sierra_net_data *priv; static const u8 sync_tmplate[sizeof(priv->sync_msg)] = { 0x00, 0x00, SIERRA_NET_HIP_MSYNC_ID, 0x00}; static const u8 shdwn_tmplate[sizeof(priv->shdwn_msg)] = { 0x00, 0x00, SIERRA_NET_HIP_SHUTD_ID, 0x00}; u8 mod[2]; dev_dbg(&dev->udev->dev, "%s", __func__); ifacenum = intf->cur_altsetting->desc.bInterfaceNumber; numendpoints = intf->cur_altsetting->desc.bNumEndpoints; /* We have three endpoints, bulk in and out, and a status */ if (numendpoints != 3) { dev_err(&dev->udev->dev, "Expected 3 endpoints, found: %d", numendpoints); return -ENODEV; } /* Status endpoint set in usbnet_get_endpoints() */ dev->status = NULL; status = usbnet_get_endpoints(dev, intf); if (status < 0) { dev_err(&dev->udev->dev, "Error in usbnet_get_endpoints (%d)", status); return -ENODEV; } if (!dev->status) { dev_err(&dev->udev->dev, "No status endpoint found"); return -ENODEV; } /* Initialize sierra private data */ priv = kzalloc_obj(*priv); if (!priv) return -ENOMEM; priv->usbnet = dev; priv->ifnum = ifacenum; dev->net->netdev_ops = &sierra_net_device_ops; /* change MAC addr to include, ifacenum, and to be unique */ mod[0] = atomic_inc_return(&iface_counter); mod[1] = ifacenum; dev_addr_mod(dev->net, ETH_ALEN - 2, mod, 2); /* prepare shutdown message template */ memcpy(priv->shdwn_msg, shdwn_tmplate, sizeof(priv->shdwn_msg)); /* set context index initially to 0 - prepares tx hdr template */ sierra_net_set_ctx_index(priv, 0); /* prepare sync message template */ memcpy(priv->sync_msg, sync_tmplate, sizeof(priv->sync_msg)); /* decrease the rx_urb_size and max_tx_size to 4k on USB 1.1 */ dev->rx_urb_size = SIERRA_NET_RX_URB_SIZE; if (dev->udev->speed != USB_SPEED_HIGH) dev->rx_urb_size = min_t(size_t, 4096, SIERRA_NET_RX_URB_SIZE); dev->net->hard_header_len += SIERRA_NET_HIP_EXT_HDR_LEN; dev->hard_mtu = dev->net->mtu + dev->net->hard_header_len; dev->net->max_mtu = SIERRA_NET_MAX_SUPPORTED_MTU; /* Set up the netdev */ dev->net->flags |= IFF_NOARP; dev->net->ethtool_ops = &sierra_net_ethtool_ops; netif_carrier_off(dev->net); sierra_net_set_private(dev, priv); priv->kevent_flags = 0; /* Use the shared workqueue */ INIT_WORK(&priv->sierra_net_kevent, sierra_net_kevent); /* Only need to do this once */ timer_setup(&priv->sync_timer, sierra_sync_timer, 0); /* verify fw attributes */ status = sierra_net_get_fw_attr(dev, &fwattr); dev_dbg(&dev->udev->dev, "Fw attr: %x\n", fwattr); /* test whether firmware supports DHCP */ if (!(status == sizeof(fwattr) && (fwattr & SWI_GET_FW_ATTR_MASK))) { /* found incompatible firmware version */ dev_err(&dev->udev->dev, "Incompatible driver and firmware" " versions\n"); kfree(priv); return -ENODEV; } return 0; } static void sierra_net_unbind(struct usbnet *dev, struct usb_interface *intf) { int status; struct sierra_net_data *priv = sierra_net_get_private(dev); dev_dbg(&dev->udev->dev, "%s", __func__); /* kill the timer and work */ timer_shutdown_sync(&priv->sync_timer); cancel_work_sync(&priv->sierra_net_kevent); /* tell modem we are going away */ status = sierra_net_send_cmd(dev, priv->shdwn_msg, sizeof(priv->shdwn_msg), "Shutdown"); if (status < 0) netdev_err(dev->net, "usb_control_msg failed, status %d\n", status); usbnet_status_stop(dev); sierra_net_set_private(dev, NULL); kfree(priv); } static struct sk_buff *sierra_net_skb_clone(struct usbnet *dev, struct sk_buff *skb, int len) { struct sk_buff *new_skb; /* clone skb */ new_skb = skb_clone(skb, GFP_ATOMIC); /* remove len bytes from original */ skb_pull(skb, len); /* trim next packet to it's length */ if (new_skb) { skb_trim(new_skb, len); } else { if (netif_msg_rx_err(dev)) netdev_err(dev->net, "failed to get skb\n"); dev->net->stats.rx_dropped++; } return new_skb; } /* ---------------------------- Receive data path ----------------------*/ static int sierra_net_rx_fixup(struct usbnet *dev, struct sk_buff *skb) { int err; struct hip_hdr hh; struct sk_buff *new_skb; dev_dbg(&dev->udev->dev, "%s", __func__); /* could contain multiple packets */ while (likely(skb->len)) { err = parse_hip(skb->data, skb->len, &hh); if (err) { if (netif_msg_rx_err(dev)) netdev_err(dev->net, "Invalid HIP header %d\n", err); /* dev->net->stats.rx_errors incremented by caller */ dev->net->stats.rx_length_errors++; return 0; } /* Validate Extended HIP header */ if (!hh.extmsgid.is_present || hh.extmsgid.word != SIERRA_NET_HIP_EXT_IP_IN_ID) { if (netif_msg_rx_err(dev)) netdev_err(dev->net, "HIP/ETH: Invalid pkt\n"); dev->net->stats.rx_frame_errors++; /* dev->net->stats.rx_errors incremented by caller */ return 0; } skb_pull(skb, hh.hdrlen); /* We are going to accept this packet, prepare it. * In case protocol is IPv6, keep it, otherwise force IPv4. */ skb_reset_mac_header(skb); if (eth_hdr(skb)->h_proto != cpu_to_be16(ETH_P_IPV6)) eth_hdr(skb)->h_proto = cpu_to_be16(ETH_P_IP); eth_zero_addr(eth_hdr(skb)->h_source); memcpy(eth_hdr(skb)->h_dest, dev->net->dev_addr, ETH_ALEN); /* Last packet in batch handled by usbnet */ if (hh.payload_len.word == skb->len) return 1; new_skb = sierra_net_skb_clone(dev, skb, hh.payload_len.word); if (new_skb) usbnet_skb_return(dev, new_skb); } /* while */ return 0; } /* ---------------------------- Transmit data path ----------------------*/ static struct sk_buff *sierra_net_tx_fixup(struct usbnet *dev, struct sk_buff *skb, gfp_t flags) { struct sierra_net_data *priv = sierra_net_get_private(dev); u16 len; bool need_tail; BUILD_BUG_ON(sizeof_field(struct usbnet, data) < sizeof(struct cdc_state)); dev_dbg(&dev->udev->dev, "%s", __func__); if (priv->link_up && check_ethip_packet(skb, dev) && is_ip(skb)) { /* enough head room as is? */ if (SIERRA_NET_HIP_EXT_HDR_LEN <= skb_headroom(skb)) { /* Save the Eth/IP length and set up HIP hdr */ len = skb->len; skb_push(skb, SIERRA_NET_HIP_EXT_HDR_LEN); /* Handle ZLP issue */ need_tail = ((len + SIERRA_NET_HIP_EXT_HDR_LEN) % dev->maxpacket == 0); if (need_tail) { if (unlikely(skb_tailroom(skb) == 0)) { netdev_err(dev->net, "tx_fixup:" "no room for packet\n"); dev_kfree_skb_any(skb); return NULL; } else { skb->data[skb->len] = 0; __skb_put(skb, 1); len = len + 1; } } build_hip(skb->data, len, priv); return skb; } else { /* * compensate in the future if necessary */ netdev_err(dev->net, "tx_fixup: no room for HIP\n"); } /* headroom */ } if (!priv->link_up) dev->net->stats.tx_carrier_errors++; /* tx_dropped incremented by usbnet */ /* filter the packet out, release it */ dev_kfree_skb_any(skb); return NULL; } static const struct driver_info sierra_net_info_direct_ip = { .description = "Sierra Wireless USB-to-WWAN Modem", .flags = FLAG_WWAN | FLAG_SEND_ZLP, .bind = sierra_net_bind, .unbind = sierra_net_unbind, .status = sierra_net_status, .rx_fixup = sierra_net_rx_fixup, .tx_fixup = sierra_net_tx_fixup, }; static int sierra_net_probe(struct usb_interface *udev, const struct usb_device_id *prod) { int ret; ret = usbnet_probe(udev, prod); if (ret == 0) { struct usbnet *dev = usb_get_intfdata(udev); ret = usbnet_status_start(dev, GFP_KERNEL); if (ret == 0) { /* Interrupt URB now set up; initiate sync sequence */ sierra_net_dosync(dev); } } return ret; } #define DIRECT_IP_DEVICE(vend, prod) \ {USB_DEVICE_INTERFACE_NUMBER(vend, prod, 7), \ .driver_info = (unsigned long)&sierra_net_info_direct_ip}, \ {USB_DEVICE_INTERFACE_NUMBER(vend, prod, 10), \ .driver_info = (unsigned long)&sierra_net_info_direct_ip}, \ {USB_DEVICE_INTERFACE_NUMBER(vend, prod, 11), \ .driver_info = (unsigned long)&sierra_net_info_direct_ip} static const struct usb_device_id products[] = { DIRECT_IP_DEVICE(0x1199, 0x68A3), /* Sierra Wireless USB-to-WWAN modem */ DIRECT_IP_DEVICE(0x0F3D, 0x68A3), /* AT&T Direct IP modem */ DIRECT_IP_DEVICE(0x1199, 0x68AA), /* Sierra Wireless Direct IP LTE modem */ DIRECT_IP_DEVICE(0x0F3D, 0x68AA), /* AT&T Direct IP LTE modem */ {}, /* last item */ }; MODULE_DEVICE_TABLE(usb, products); /* We are based on usbnet, so let it handle the USB driver specifics */ static struct usb_driver sierra_net_driver = { .name = "sierra_net", .id_table = products, .probe = sierra_net_probe, .disconnect = usbnet_disconnect, .suspend = usbnet_suspend, .resume = usbnet_resume, .no_dynamic_id = 1, .disable_hub_initiated_lpm = 1, }; module_usb_driver(sierra_net_driver); MODULE_AUTHOR(DRIVER_AUTHOR); MODULE_DESCRIPTION(DRIVER_DESC); MODULE_LICENSE("GPL");
5 5 3 2 2 2 78 3 3 3 3 2316 2321 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 // SPDX-License-Identifier: GPL-2.0 /* drivers/net/wireless/virt_wifi.c * * A fake implementation of cfg80211_ops that can be tacked on to an ethernet * net_device to make it appear as a wireless connection. * * Copyright (C) 2018 Google, Inc. * * Author: schuffelen@google.com */ #include <net/cfg80211.h> #include <net/rtnetlink.h> #include <linux/etherdevice.h> #include <linux/math64.h> #include <linux/module.h> static struct wiphy *common_wiphy; struct virt_wifi_wiphy_priv { struct delayed_work scan_result; struct cfg80211_scan_request *scan_request; bool being_deleted; }; static struct ieee80211_channel channel_2ghz = { .band = NL80211_BAND_2GHZ, .center_freq = 2432, .hw_value = 2432, .max_power = 20, }; static struct ieee80211_rate bitrates_2ghz[] = { { .bitrate = 10 }, { .bitrate = 20 }, { .bitrate = 55 }, { .bitrate = 110 }, { .bitrate = 60 }, { .bitrate = 120 }, { .bitrate = 240 }, }; static struct ieee80211_supported_band band_2ghz = { .channels = &channel_2ghz, .bitrates = bitrates_2ghz, .band = NL80211_BAND_2GHZ, .n_channels = 1, .n_bitrates = ARRAY_SIZE(bitrates_2ghz), .ht_cap = { .ht_supported = true, .cap = IEEE80211_HT_CAP_SUP_WIDTH_20_40 | IEEE80211_HT_CAP_GRN_FLD | IEEE80211_HT_CAP_SGI_20 | IEEE80211_HT_CAP_SGI_40 | IEEE80211_HT_CAP_DSSSCCK40, .ampdu_factor = 0x3, .ampdu_density = 0x6, .mcs = { .rx_mask = {0xff, 0xff}, .tx_params = IEEE80211_HT_MCS_TX_DEFINED, }, }, }; static struct ieee80211_channel channel_5ghz = { .band = NL80211_BAND_5GHZ, .center_freq = 5240, .hw_value = 5240, .max_power = 20, }; static struct ieee80211_rate bitrates_5ghz[] = { { .bitrate = 60 }, { .bitrate = 120 }, { .bitrate = 240 }, }; #define RX_MCS_MAP (IEEE80211_VHT_MCS_SUPPORT_0_9 << 0 | \ IEEE80211_VHT_MCS_SUPPORT_0_9 << 2 | \ IEEE80211_VHT_MCS_SUPPORT_0_9 << 4 | \ IEEE80211_VHT_MCS_SUPPORT_0_9 << 6 | \ IEEE80211_VHT_MCS_SUPPORT_0_9 << 8 | \ IEEE80211_VHT_MCS_SUPPORT_0_9 << 10 | \ IEEE80211_VHT_MCS_SUPPORT_0_9 << 12 | \ IEEE80211_VHT_MCS_SUPPORT_0_9 << 14) #define TX_MCS_MAP (IEEE80211_VHT_MCS_SUPPORT_0_9 << 0 | \ IEEE80211_VHT_MCS_SUPPORT_0_9 << 2 | \ IEEE80211_VHT_MCS_SUPPORT_0_9 << 4 | \ IEEE80211_VHT_MCS_SUPPORT_0_9 << 6 | \ IEEE80211_VHT_MCS_SUPPORT_0_9 << 8 | \ IEEE80211_VHT_MCS_SUPPORT_0_9 << 10 | \ IEEE80211_VHT_MCS_SUPPORT_0_9 << 12 | \ IEEE80211_VHT_MCS_SUPPORT_0_9 << 14) static struct ieee80211_supported_band band_5ghz = { .channels = &channel_5ghz, .bitrates = bitrates_5ghz, .band = NL80211_BAND_5GHZ, .n_channels = 1, .n_bitrates = ARRAY_SIZE(bitrates_5ghz), .ht_cap = { .ht_supported = true, .cap = IEEE80211_HT_CAP_SUP_WIDTH_20_40 | IEEE80211_HT_CAP_GRN_FLD | IEEE80211_HT_CAP_SGI_20 | IEEE80211_HT_CAP_SGI_40 | IEEE80211_HT_CAP_DSSSCCK40, .ampdu_factor = 0x3, .ampdu_density = 0x6, .mcs = { .rx_mask = {0xff, 0xff}, .tx_params = IEEE80211_HT_MCS_TX_DEFINED, }, }, .vht_cap = { .vht_supported = true, .cap = IEEE80211_VHT_CAP_MAX_MPDU_LENGTH_11454 | IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160_80PLUS80MHZ | IEEE80211_VHT_CAP_RXLDPC | IEEE80211_VHT_CAP_SHORT_GI_80 | IEEE80211_VHT_CAP_SHORT_GI_160 | IEEE80211_VHT_CAP_TXSTBC | IEEE80211_VHT_CAP_RXSTBC_1 | IEEE80211_VHT_CAP_RXSTBC_2 | IEEE80211_VHT_CAP_RXSTBC_3 | IEEE80211_VHT_CAP_RXSTBC_4 | IEEE80211_VHT_CAP_MAX_A_MPDU_LENGTH_EXPONENT_MASK, .vht_mcs = { .rx_mcs_map = cpu_to_le16(RX_MCS_MAP), .tx_mcs_map = cpu_to_le16(TX_MCS_MAP), } }, }; /* Assigned at module init. Guaranteed locally-administered and unicast. */ static u8 fake_router_bssid[ETH_ALEN] __ro_after_init = {}; #define VIRT_WIFI_SSID "VirtWifi" #define VIRT_WIFI_SSID_LEN 8 static void virt_wifi_inform_bss(struct wiphy *wiphy) { u64 tsf = div_u64(ktime_get_boottime_ns(), 1000); struct cfg80211_bss *informed_bss; static const struct { u8 tag; u8 len; u8 ssid[8] __nonstring; } __packed ssid = { .tag = WLAN_EID_SSID, .len = VIRT_WIFI_SSID_LEN, .ssid = VIRT_WIFI_SSID, }; informed_bss = cfg80211_inform_bss(wiphy, &channel_5ghz, CFG80211_BSS_FTYPE_PRESP, fake_router_bssid, tsf, WLAN_CAPABILITY_ESS, 0, (void *)&ssid, sizeof(ssid), DBM_TO_MBM(-50), GFP_KERNEL); cfg80211_put_bss(wiphy, informed_bss); } /* Called with the rtnl lock held. */ static int virt_wifi_scan(struct wiphy *wiphy, struct cfg80211_scan_request *request) { struct virt_wifi_wiphy_priv *priv = wiphy_priv(wiphy); wiphy_debug(wiphy, "scan\n"); if (priv->scan_request || priv->being_deleted) return -EBUSY; priv->scan_request = request; schedule_delayed_work(&priv->scan_result, HZ * 2); return 0; } /* Acquires and releases the rdev BSS lock. */ static void virt_wifi_scan_result(struct work_struct *work) { struct virt_wifi_wiphy_priv *priv = container_of(work, struct virt_wifi_wiphy_priv, scan_result.work); struct wiphy *wiphy = priv_to_wiphy(priv); struct cfg80211_scan_info scan_info = { .aborted = false }; virt_wifi_inform_bss(wiphy); /* Schedules work which acquires and releases the rtnl lock. */ cfg80211_scan_done(priv->scan_request, &scan_info); priv->scan_request = NULL; } /* May acquire and release the rdev BSS lock. */ static void virt_wifi_cancel_scan(struct wiphy *wiphy) { struct virt_wifi_wiphy_priv *priv = wiphy_priv(wiphy); cancel_delayed_work_sync(&priv->scan_result); /* Clean up dangling callbacks if necessary. */ if (priv->scan_request) { struct cfg80211_scan_info scan_info = { .aborted = true }; /* Schedules work which acquires and releases the rtnl lock. */ cfg80211_scan_done(priv->scan_request, &scan_info); priv->scan_request = NULL; } } struct virt_wifi_netdev_priv { struct delayed_work connect; struct net_device *lowerdev; struct net_device *upperdev; u32 tx_packets; u32 tx_failed; u32 connect_requested_ssid_len; u8 connect_requested_ssid[IEEE80211_MAX_SSID_LEN]; u8 connect_requested_bss[ETH_ALEN]; bool is_up; bool is_connected; bool being_deleted; }; /* Called with the rtnl lock held. */ static int virt_wifi_connect(struct wiphy *wiphy, struct net_device *netdev, struct cfg80211_connect_params *sme) { struct virt_wifi_netdev_priv *priv = netdev_priv(netdev); bool could_schedule; if (priv->being_deleted || !priv->is_up) return -EBUSY; if (!sme->ssid) return -EINVAL; priv->connect_requested_ssid_len = sme->ssid_len; memcpy(priv->connect_requested_ssid, sme->ssid, sme->ssid_len); could_schedule = schedule_delayed_work(&priv->connect, HZ * 2); if (!could_schedule) return -EBUSY; if (sme->bssid) { ether_addr_copy(priv->connect_requested_bss, sme->bssid); } else { virt_wifi_inform_bss(wiphy); eth_zero_addr(priv->connect_requested_bss); } wiphy_debug(wiphy, "connect\n"); return 0; } /* Acquires and releases the rdev event lock. */ static void virt_wifi_connect_complete(struct work_struct *work) { struct virt_wifi_netdev_priv *priv = container_of(work, struct virt_wifi_netdev_priv, connect.work); u8 *requested_bss = priv->connect_requested_bss; bool right_addr = ether_addr_equal(requested_bss, fake_router_bssid); bool right_ssid = priv->connect_requested_ssid_len == VIRT_WIFI_SSID_LEN && !memcmp(priv->connect_requested_ssid, VIRT_WIFI_SSID, priv->connect_requested_ssid_len); u16 status = WLAN_STATUS_SUCCESS; if (is_zero_ether_addr(requested_bss)) requested_bss = NULL; if (!priv->is_up || (requested_bss && !right_addr) || !right_ssid) status = WLAN_STATUS_UNSPECIFIED_FAILURE; else priv->is_connected = true; /* Schedules an event that acquires the rtnl lock. */ cfg80211_connect_result(priv->upperdev, priv->is_connected ? fake_router_bssid : NULL, NULL, 0, NULL, 0, status, GFP_KERNEL); netif_carrier_on(priv->upperdev); } /* May acquire and release the rdev event lock. */ static void virt_wifi_cancel_connect(struct net_device *netdev) { struct virt_wifi_netdev_priv *priv = netdev_priv(netdev); /* If there is work pending, clean up dangling callbacks. */ if (cancel_delayed_work_sync(&priv->connect)) { /* Schedules an event that acquires the rtnl lock. */ cfg80211_connect_result(priv->upperdev, priv->connect_requested_bss, NULL, 0, NULL, 0, WLAN_STATUS_UNSPECIFIED_FAILURE, GFP_KERNEL); } } /* Called with the rtnl lock held. Acquires the rdev event lock. */ static int virt_wifi_disconnect(struct wiphy *wiphy, struct net_device *netdev, u16 reason_code) { struct virt_wifi_netdev_priv *priv = netdev_priv(netdev); if (priv->being_deleted) return -EBUSY; wiphy_debug(wiphy, "disconnect\n"); virt_wifi_cancel_connect(netdev); cfg80211_disconnected(netdev, reason_code, NULL, 0, true, GFP_KERNEL); priv->is_connected = false; netif_carrier_off(netdev); return 0; } /* Called with the rtnl lock held. */ static int virt_wifi_get_station(struct wiphy *wiphy, struct wireless_dev *wdev, const u8 *mac, struct station_info *sinfo) { struct net_device *dev = wdev->netdev; struct virt_wifi_netdev_priv *priv = netdev_priv(dev); wiphy_debug(wiphy, "get_station\n"); if (!priv->is_connected || !ether_addr_equal(mac, fake_router_bssid)) return -ENOENT; sinfo->filled = BIT_ULL(NL80211_STA_INFO_TX_PACKETS) | BIT_ULL(NL80211_STA_INFO_TX_FAILED) | BIT_ULL(NL80211_STA_INFO_SIGNAL) | BIT_ULL(NL80211_STA_INFO_TX_BITRATE); sinfo->tx_packets = priv->tx_packets; sinfo->tx_failed = priv->tx_failed; /* For CFG80211_SIGNAL_TYPE_MBM, value is expressed in _dBm_ */ sinfo->signal = -50; sinfo->txrate = (struct rate_info) { .legacy = 10, /* units are 100kbit/s */ }; return 0; } /* Called with the rtnl lock held. */ static int virt_wifi_dump_station(struct wiphy *wiphy, struct wireless_dev *wdev, int idx, u8 *mac, struct station_info *sinfo) { struct virt_wifi_netdev_priv *priv = netdev_priv(wdev->netdev); wiphy_debug(wiphy, "dump_station\n"); if (idx != 0 || !priv->is_connected) return -ENOENT; ether_addr_copy(mac, fake_router_bssid); return virt_wifi_get_station(wiphy, wdev, fake_router_bssid, sinfo); } static const struct cfg80211_ops virt_wifi_cfg80211_ops = { .scan = virt_wifi_scan, .connect = virt_wifi_connect, .disconnect = virt_wifi_disconnect, .get_station = virt_wifi_get_station, .dump_station = virt_wifi_dump_station, }; /* Acquires and releases the rtnl lock. */ static struct wiphy *virt_wifi_make_wiphy(void) { struct wiphy *wiphy; struct virt_wifi_wiphy_priv *priv; int err; wiphy = wiphy_new(&virt_wifi_cfg80211_ops, sizeof(*priv)); if (!wiphy) return NULL; wiphy->max_scan_ssids = 4; wiphy->max_scan_ie_len = 1000; wiphy->signal_type = CFG80211_SIGNAL_TYPE_MBM; wiphy->bands[NL80211_BAND_2GHZ] = &band_2ghz; wiphy->bands[NL80211_BAND_5GHZ] = &band_5ghz; wiphy->bands[NL80211_BAND_60GHZ] = NULL; wiphy->interface_modes = BIT(NL80211_IFTYPE_STATION); priv = wiphy_priv(wiphy); priv->being_deleted = false; priv->scan_request = NULL; INIT_DELAYED_WORK(&priv->scan_result, virt_wifi_scan_result); err = wiphy_register(wiphy); if (err < 0) { wiphy_free(wiphy); return NULL; } return wiphy; } /* Acquires and releases the rtnl lock. */ static void virt_wifi_destroy_wiphy(struct wiphy *wiphy) { struct virt_wifi_wiphy_priv *priv; WARN(!wiphy, "%s called with null wiphy", __func__); if (!wiphy) return; priv = wiphy_priv(wiphy); priv->being_deleted = true; virt_wifi_cancel_scan(wiphy); if (wiphy->registered) wiphy_unregister(wiphy); wiphy_free(wiphy); } /* Enters and exits a RCU-bh critical section. */ static netdev_tx_t virt_wifi_start_xmit(struct sk_buff *skb, struct net_device *dev) { struct virt_wifi_netdev_priv *priv = netdev_priv(dev); priv->tx_packets++; if (!priv->is_connected) { priv->tx_failed++; return NET_XMIT_DROP; } skb->dev = priv->lowerdev; return dev_queue_xmit(skb); } /* Called with rtnl lock held. */ static int virt_wifi_net_device_open(struct net_device *dev) { struct virt_wifi_netdev_priv *priv = netdev_priv(dev); priv->is_up = true; return 0; } /* Called with rtnl lock held. */ static int virt_wifi_net_device_stop(struct net_device *dev) { struct virt_wifi_netdev_priv *n_priv = netdev_priv(dev); n_priv->is_up = false; if (!dev->ieee80211_ptr) return 0; virt_wifi_cancel_scan(dev->ieee80211_ptr->wiphy); virt_wifi_cancel_connect(dev); netif_carrier_off(dev); return 0; } static int virt_wifi_net_device_get_iflink(const struct net_device *dev) { struct virt_wifi_netdev_priv *priv = netdev_priv(dev); return READ_ONCE(priv->lowerdev->ifindex); } static const struct net_device_ops virt_wifi_ops = { .ndo_start_xmit = virt_wifi_start_xmit, .ndo_open = virt_wifi_net_device_open, .ndo_stop = virt_wifi_net_device_stop, .ndo_get_iflink = virt_wifi_net_device_get_iflink, }; /* Invoked as part of rtnl lock release. */ static void virt_wifi_net_device_destructor(struct net_device *dev) { /* Delayed past dellink to allow nl80211 to react to the device being * deleted. */ kfree(dev->ieee80211_ptr); dev->ieee80211_ptr = NULL; } /* No lock interaction. */ static void virt_wifi_setup(struct net_device *dev) { ether_setup(dev); dev->netdev_ops = &virt_wifi_ops; dev->needs_free_netdev = true; } /* Called in a RCU read critical section from netif_receive_skb */ static rx_handler_result_t virt_wifi_rx_handler(struct sk_buff **pskb) { struct sk_buff *skb = *pskb; struct virt_wifi_netdev_priv *priv = rcu_dereference(skb->dev->rx_handler_data); if (!priv->is_connected) return RX_HANDLER_PASS; /* GFP_ATOMIC because this is a packet interrupt handler. */ skb = skb_share_check(skb, GFP_ATOMIC); if (!skb) { dev_err(&priv->upperdev->dev, "can't skb_share_check\n"); return RX_HANDLER_CONSUMED; } *pskb = skb; skb->dev = priv->upperdev; skb->pkt_type = PACKET_HOST; return RX_HANDLER_ANOTHER; } /* Called with rtnl lock held. */ static int virt_wifi_newlink(struct net_device *dev, struct rtnl_newlink_params *params, struct netlink_ext_ack *extack) { struct virt_wifi_netdev_priv *priv = netdev_priv(dev); struct net *link_net = rtnl_newlink_link_net(params); struct nlattr **tb = params->tb; int err; if (!tb[IFLA_LINK]) return -EINVAL; netif_carrier_off(dev); priv->upperdev = dev; priv->lowerdev = __dev_get_by_index(link_net, nla_get_u32(tb[IFLA_LINK])); if (!priv->lowerdev) return -ENODEV; if (!tb[IFLA_MTU]) dev->mtu = priv->lowerdev->mtu; else if (dev->mtu > priv->lowerdev->mtu) return -EINVAL; err = netdev_rx_handler_register(priv->lowerdev, virt_wifi_rx_handler, priv); if (err) { dev_err(&priv->lowerdev->dev, "can't netdev_rx_handler_register: %d\n", err); return err; } eth_hw_addr_inherit(dev, priv->lowerdev); netif_stacked_transfer_operstate(priv->lowerdev, dev); SET_NETDEV_DEV(dev, &priv->lowerdev->dev); dev->ieee80211_ptr = kzalloc_obj(*dev->ieee80211_ptr); if (!dev->ieee80211_ptr) { err = -ENOMEM; goto remove_handler; } dev->ieee80211_ptr->iftype = NL80211_IFTYPE_STATION; dev->ieee80211_ptr->wiphy = common_wiphy; err = register_netdevice(dev); if (err) { dev_err(&priv->lowerdev->dev, "can't register_netdevice: %d\n", err); goto free_wireless_dev; } err = netdev_upper_dev_link(priv->lowerdev, dev, extack); if (err) { dev_err(&priv->lowerdev->dev, "can't netdev_upper_dev_link: %d\n", err); goto unregister_netdev; } dev->priv_destructor = virt_wifi_net_device_destructor; priv->being_deleted = false; priv->is_connected = false; priv->is_up = false; INIT_DELAYED_WORK(&priv->connect, virt_wifi_connect_complete); __module_get(THIS_MODULE); return 0; unregister_netdev: unregister_netdevice(dev); free_wireless_dev: kfree(dev->ieee80211_ptr); dev->ieee80211_ptr = NULL; remove_handler: netdev_rx_handler_unregister(priv->lowerdev); return err; } /* Called with rtnl lock held. */ static void virt_wifi_dellink(struct net_device *dev, struct list_head *head) { struct virt_wifi_netdev_priv *priv = netdev_priv(dev); if (dev->ieee80211_ptr) virt_wifi_cancel_scan(dev->ieee80211_ptr->wiphy); priv->being_deleted = true; virt_wifi_cancel_connect(dev); netif_carrier_off(dev); netdev_rx_handler_unregister(priv->lowerdev); netdev_upper_dev_unlink(priv->lowerdev, dev); unregister_netdevice_queue(dev, head); module_put(THIS_MODULE); /* Deleting the wiphy is handled in the module destructor. */ } static struct rtnl_link_ops virt_wifi_link_ops = { .kind = "virt_wifi", .setup = virt_wifi_setup, .newlink = virt_wifi_newlink, .dellink = virt_wifi_dellink, .priv_size = sizeof(struct virt_wifi_netdev_priv), }; static bool netif_is_virt_wifi_dev(const struct net_device *dev) { return rcu_access_pointer(dev->rx_handler) == virt_wifi_rx_handler; } static int virt_wifi_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *lower_dev = netdev_notifier_info_to_dev(ptr); struct virt_wifi_netdev_priv *priv; struct net_device *upper_dev; LIST_HEAD(list_kill); if (!netif_is_virt_wifi_dev(lower_dev)) return NOTIFY_DONE; switch (event) { case NETDEV_UNREGISTER: priv = rtnl_dereference(lower_dev->rx_handler_data); if (!priv) return NOTIFY_DONE; upper_dev = priv->upperdev; upper_dev->rtnl_link_ops->dellink(upper_dev, &list_kill); unregister_netdevice_many(&list_kill); break; } return NOTIFY_DONE; } static struct notifier_block virt_wifi_notifier = { .notifier_call = virt_wifi_event, }; /* Acquires and releases the rtnl lock. */ static int __init virt_wifi_init_module(void) { int err; /* Guaranteed to be locally-administered and not multicast. */ eth_random_addr(fake_router_bssid); err = register_netdevice_notifier(&virt_wifi_notifier); if (err) return err; err = -ENOMEM; common_wiphy = virt_wifi_make_wiphy(); if (!common_wiphy) goto notifier; err = rtnl_link_register(&virt_wifi_link_ops); if (err) goto destroy_wiphy; return 0; destroy_wiphy: virt_wifi_destroy_wiphy(common_wiphy); notifier: unregister_netdevice_notifier(&virt_wifi_notifier); return err; } /* Acquires and releases the rtnl lock. */ static void __exit virt_wifi_cleanup_module(void) { /* Will delete any devices that depend on the wiphy. */ rtnl_link_unregister(&virt_wifi_link_ops); virt_wifi_destroy_wiphy(common_wiphy); unregister_netdevice_notifier(&virt_wifi_notifier); } module_init(virt_wifi_init_module); module_exit(virt_wifi_cleanup_module); MODULE_LICENSE("GPL v2"); MODULE_AUTHOR("Cody Schuffelen <schuffelen@google.com>"); MODULE_DESCRIPTION("Driver for a wireless wrapper of ethernet devices"); MODULE_ALIAS_RTNL_LINK("virt_wifi");
222 222 2 2 2 2 2 2 1 1 1 11 2 1 7 1 1 2 2 8 6 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 // SPDX-License-Identifier: GPL-2.0-only /* * scsi_error.c Copyright (C) 1997 Eric Youngdale * * SCSI error/timeout handling * Initial versions: Eric Youngdale. Based upon conversations with * Leonard Zubkoff and David Miller at Linux Expo, * ideas originating from all over the place. * * Restructured scsi_unjam_host and associated functions. * September 04, 2002 Mike Anderson (andmike@us.ibm.com) * * Forward port of Russell King's (rmk@arm.linux.org.uk) changes and * minor cleanups. * September 30, 2002 Mike Anderson (andmike@us.ibm.com) */ #include <linux/module.h> #include <linux/sched.h> #include <linux/gfp.h> #include <linux/timer.h> #include <linux/string.h> #include <linux/kernel.h> #include <linux/freezer.h> #include <linux/kthread.h> #include <linux/interrupt.h> #include <linux/blkdev.h> #include <linux/delay.h> #include <linux/jiffies.h> #include <scsi/scsi.h> #include <scsi/scsi_cmnd.h> #include <scsi/scsi_dbg.h> #include <scsi/scsi_device.h> #include <scsi/scsi_driver.h> #include <scsi/scsi_eh.h> #include <scsi/scsi_common.h> #include <scsi/scsi_transport.h> #include <scsi/scsi_host.h> #include <scsi/scsi_ioctl.h> #include <scsi/scsi_dh.h> #include <scsi/scsi_devinfo.h> #include <scsi/sg.h> #include "scsi_priv.h" #include "scsi_logging.h" #include "scsi_transport_api.h" #include <trace/events/scsi.h> #include <linux/unaligned.h> /* * These should *probably* be handled by the host itself. * Since it is allowed to sleep, it probably should. */ #define BUS_RESET_SETTLE_TIME (10) #define HOST_RESET_SETTLE_TIME (10) static int scsi_eh_try_stu(struct scsi_cmnd *scmd); static enum scsi_disposition scsi_try_to_abort_cmd(const struct scsi_host_template *, struct scsi_cmnd *); void scsi_eh_wakeup(struct Scsi_Host *shost, unsigned int busy) { lockdep_assert_held(shost->host_lock); if (busy == shost->host_failed) { trace_scsi_eh_wakeup(shost); wake_up_process(shost->ehandler); SCSI_LOG_ERROR_RECOVERY(5, shost_printk(KERN_INFO, shost, "Waking error handler thread\n")); } } /** * scsi_schedule_eh - schedule EH for SCSI host * @shost: SCSI host to invoke error handling on. * * Schedule SCSI EH without scmd. */ void scsi_schedule_eh(struct Scsi_Host *shost) { unsigned long flags; spin_lock_irqsave(shost->host_lock, flags); if (scsi_host_set_state(shost, SHOST_RECOVERY) == 0 || scsi_host_set_state(shost, SHOST_CANCEL_RECOVERY) == 0) { shost->host_eh_scheduled++; scsi_eh_wakeup(shost, scsi_host_busy(shost)); } spin_unlock_irqrestore(shost->host_lock, flags); } EXPORT_SYMBOL_GPL(scsi_schedule_eh); static int scsi_host_eh_past_deadline(struct Scsi_Host *shost) { if (!shost->last_reset || shost->eh_deadline == -1) return 0; /* * 32bit accesses are guaranteed to be atomic * (on all supported architectures), so instead * of using a spinlock we can as well double check * if eh_deadline has been set to 'off' during the * time_before call. */ if (time_before(jiffies, shost->last_reset + shost->eh_deadline) && shost->eh_deadline > -1) return 0; return 1; } static bool scsi_cmd_retry_allowed(struct scsi_cmnd *cmd) { if (cmd->allowed == SCSI_CMD_RETRIES_NO_LIMIT) return true; return ++cmd->retries <= cmd->allowed; } static bool scsi_eh_should_retry_cmd(struct scsi_cmnd *cmd) { struct scsi_device *sdev = cmd->device; struct Scsi_Host *host = sdev->host; if (host->hostt->eh_should_retry_cmd) return host->hostt->eh_should_retry_cmd(cmd); return true; } /** * scmd_eh_abort_handler - Handle command aborts * @work: command to be aborted. * * Note: this function must be called only for a command that has timed out. * Because the block layer marks a request as complete before it calls * scsi_timeout(), a .scsi_done() call from the LLD for a command that has * timed out do not have any effect. Hence it is safe to call * scsi_finish_command() from this function. */ void scmd_eh_abort_handler(struct work_struct *work) { struct scsi_cmnd *scmd = container_of(work, struct scsi_cmnd, abort_work.work); struct scsi_device *sdev = scmd->device; struct Scsi_Host *shost = sdev->host; enum scsi_disposition rtn; unsigned long flags; if (scsi_host_eh_past_deadline(shost)) { SCSI_LOG_ERROR_RECOVERY(3, scmd_printk(KERN_INFO, scmd, "eh timeout, not aborting\n")); goto out; } SCSI_LOG_ERROR_RECOVERY(3, scmd_printk(KERN_INFO, scmd, "aborting command\n")); rtn = scsi_try_to_abort_cmd(shost->hostt, scmd); if (rtn != SUCCESS) { SCSI_LOG_ERROR_RECOVERY(3, scmd_printk(KERN_INFO, scmd, "cmd abort %s\n", (rtn == FAST_IO_FAIL) ? "not send" : "failed")); goto out; } set_host_byte(scmd, DID_TIME_OUT); if (scsi_host_eh_past_deadline(shost)) { SCSI_LOG_ERROR_RECOVERY(3, scmd_printk(KERN_INFO, scmd, "eh timeout, not retrying " "aborted command\n")); goto out; } spin_lock_irqsave(shost->host_lock, flags); list_del_init(&scmd->eh_entry); /* * If the abort succeeds, and there is no further * EH action, clear the ->last_reset time. */ if (list_empty(&shost->eh_abort_list) && list_empty(&shost->eh_cmd_q)) if (shost->eh_deadline != -1) shost->last_reset = 0; spin_unlock_irqrestore(shost->host_lock, flags); if (!scsi_noretry_cmd(scmd) && scsi_cmd_retry_allowed(scmd) && scsi_eh_should_retry_cmd(scmd)) { SCSI_LOG_ERROR_RECOVERY(3, scmd_printk(KERN_WARNING, scmd, "retry aborted command\n")); scsi_queue_insert(scmd, SCSI_MLQUEUE_EH_RETRY); } else { SCSI_LOG_ERROR_RECOVERY(3, scmd_printk(KERN_WARNING, scmd, "finish aborted command\n")); scsi_finish_command(scmd); } return; out: spin_lock_irqsave(shost->host_lock, flags); list_del_init(&scmd->eh_entry); spin_unlock_irqrestore(shost->host_lock, flags); scsi_eh_scmd_add(scmd); } /** * scsi_abort_command - schedule a command abort * @scmd: scmd to abort. * * We only need to abort commands after a command timeout */ static int scsi_abort_command(struct scsi_cmnd *scmd) { struct scsi_device *sdev = scmd->device; struct Scsi_Host *shost = sdev->host; unsigned long flags; if (!shost->hostt->eh_abort_handler) { /* No abort handler, fail command directly */ return FAILED; } if (scmd->eh_eflags & SCSI_EH_ABORT_SCHEDULED) { /* * Retry after abort failed, escalate to next level. */ SCSI_LOG_ERROR_RECOVERY(3, scmd_printk(KERN_INFO, scmd, "previous abort failed\n")); BUG_ON(delayed_work_pending(&scmd->abort_work)); return FAILED; } spin_lock_irqsave(shost->host_lock, flags); if (shost->eh_deadline != -1 && !shost->last_reset) shost->last_reset = jiffies; BUG_ON(!list_empty(&scmd->eh_entry)); list_add_tail(&scmd->eh_entry, &shost->eh_abort_list); spin_unlock_irqrestore(shost->host_lock, flags); scmd->eh_eflags |= SCSI_EH_ABORT_SCHEDULED; SCSI_LOG_ERROR_RECOVERY(3, scmd_printk(KERN_INFO, scmd, "abort scheduled\n")); queue_delayed_work(shost->tmf_work_q, &scmd->abort_work, HZ / 100); return SUCCESS; } /** * scsi_eh_reset - call into ->eh_action to reset internal counters * @scmd: scmd to run eh on. * * The scsi driver might be carrying internal state about the * devices, so we need to call into the driver to reset the * internal state once the error handler is started. */ static void scsi_eh_reset(struct scsi_cmnd *scmd) { if (!blk_rq_is_passthrough(scsi_cmd_to_rq(scmd))) { struct scsi_driver *sdrv = scsi_cmd_to_driver(scmd); if (sdrv->eh_reset) sdrv->eh_reset(scmd); } } static void scsi_eh_inc_host_failed(struct rcu_head *head) { struct scsi_cmnd *scmd = container_of(head, typeof(*scmd), rcu); struct Scsi_Host *shost = scmd->device->host; unsigned int busy; unsigned long flags; spin_lock_irqsave(shost->host_lock, flags); shost->host_failed++; spin_unlock_irqrestore(shost->host_lock, flags); /* * The counting of busy requests needs to occur after adding to * host_failed or after the lock acquire for adding to host_failed * to prevent a race with host unbusy and missing an eh wakeup. */ busy = scsi_host_busy(shost); spin_lock_irqsave(shost->host_lock, flags); scsi_eh_wakeup(shost, busy); spin_unlock_irqrestore(shost->host_lock, flags); } /** * scsi_eh_scmd_add - add scsi cmd to error handling. * @scmd: scmd to run eh on. */ void scsi_eh_scmd_add(struct scsi_cmnd *scmd) { struct Scsi_Host *shost = scmd->device->host; unsigned long flags; int ret; WARN_ON_ONCE(!shost->ehandler); WARN_ON_ONCE(!test_bit(SCMD_STATE_INFLIGHT, &scmd->state)); spin_lock_irqsave(shost->host_lock, flags); if (scsi_host_set_state(shost, SHOST_RECOVERY)) { ret = scsi_host_set_state(shost, SHOST_CANCEL_RECOVERY); WARN_ON_ONCE(ret); } if (shost->eh_deadline != -1 && !shost->last_reset) shost->last_reset = jiffies; scsi_eh_reset(scmd); list_add_tail(&scmd->eh_entry, &shost->eh_cmd_q); spin_unlock_irqrestore(shost->host_lock, flags); /* * Ensure that all tasks observe the host state change before the * host_failed change. */ call_rcu_hurry(&scmd->rcu, scsi_eh_inc_host_failed); } /** * scsi_timeout - Timeout function for normal scsi commands. * @req: request that is timing out. * * Notes: * We do not need to lock this. There is the potential for a race * only in that the normal completion handling might run, but if the * normal completion function determines that the timer has already * fired, then it mustn't do anything. */ enum blk_eh_timer_return scsi_timeout(struct request *req) { struct scsi_cmnd *scmd = blk_mq_rq_to_pdu(req); struct Scsi_Host *host = scmd->device->host; trace_scsi_dispatch_cmd_timeout(scmd); scsi_log_completion(scmd, TIMEOUT_ERROR); atomic_inc(&scmd->device->iotmo_cnt); if (host->eh_deadline != -1 && !host->last_reset) host->last_reset = jiffies; if (host->hostt->eh_timed_out) { switch (host->hostt->eh_timed_out(scmd)) { case SCSI_EH_DONE: return BLK_EH_DONE; case SCSI_EH_RESET_TIMER: return BLK_EH_RESET_TIMER; case SCSI_EH_NOT_HANDLED: break; } } /* * If scsi_done() has already set SCMD_STATE_COMPLETE, do not modify * *scmd. */ if (test_and_set_bit(SCMD_STATE_COMPLETE, &scmd->state)) return BLK_EH_DONE; atomic_inc(&scmd->device->iodone_cnt); if (scsi_abort_command(scmd) != SUCCESS) { set_host_byte(scmd, DID_TIME_OUT); scsi_eh_scmd_add(scmd); } return BLK_EH_DONE; } /** * scsi_block_when_processing_errors - Prevent cmds from being queued. * @sdev: Device on which we are performing recovery. * * Description: * We block until the host is out of error recovery, and then check to * see whether the host or the device is offline. * * Return value: * 0 when dev was taken offline by error recovery. 1 OK to proceed. */ int scsi_block_when_processing_errors(struct scsi_device *sdev) { int online; wait_event(sdev->host->host_wait, !scsi_host_in_recovery(sdev->host)); online = scsi_device_online(sdev); return online; } EXPORT_SYMBOL(scsi_block_when_processing_errors); #ifdef CONFIG_SCSI_LOGGING /** * scsi_eh_prt_fail_stats - Log info on failures. * @shost: scsi host being recovered. * @work_q: Queue of scsi cmds to process. */ static inline void scsi_eh_prt_fail_stats(struct Scsi_Host *shost, struct list_head *work_q) { struct scsi_cmnd *scmd; struct scsi_device *sdev; int total_failures = 0; int cmd_failed = 0; int cmd_cancel = 0; int devices_failed = 0; shost_for_each_device(sdev, shost) { list_for_each_entry(scmd, work_q, eh_entry) { if (scmd->device == sdev) { ++total_failures; if (scmd->eh_eflags & SCSI_EH_ABORT_SCHEDULED) ++cmd_cancel; else ++cmd_failed; } } if (cmd_cancel || cmd_failed) { SCSI_LOG_ERROR_RECOVERY(3, shost_printk(KERN_INFO, shost, "%s: cmds failed: %d, cancel: %d\n", __func__, cmd_failed, cmd_cancel)); cmd_cancel = 0; cmd_failed = 0; ++devices_failed; } } SCSI_LOG_ERROR_RECOVERY(2, shost_printk(KERN_INFO, shost, "Total of %d commands on %d" " devices require eh work\n", total_failures, devices_failed)); } #endif /** * scsi_report_lun_change - Set flag on all *other* devices on the same target * to indicate that a UNIT ATTENTION is expected. * @sdev: Device reporting the UNIT ATTENTION */ static void scsi_report_lun_change(struct scsi_device *sdev) { sdev->sdev_target->expecting_lun_change = 1; } /** * scsi_report_sense - Examine scsi sense information and log messages for * certain conditions, also issue uevents for some of them. * @sdev: Device reporting the sense code * @sshdr: sshdr to be examined */ static void scsi_report_sense(struct scsi_device *sdev, struct scsi_sense_hdr *sshdr) { enum scsi_device_event evt_type = SDEV_EVT_MAXBITS; /* i.e. none */ if (sshdr->sense_key == UNIT_ATTENTION) { if (sshdr->asc == 0x3f && sshdr->ascq == 0x03) { evt_type = SDEV_EVT_INQUIRY_CHANGE_REPORTED; sdev_printk(KERN_WARNING, sdev, "Inquiry data has changed"); } else if (sshdr->asc == 0x3f && sshdr->ascq == 0x0e) { evt_type = SDEV_EVT_LUN_CHANGE_REPORTED; scsi_report_lun_change(sdev); sdev_printk(KERN_WARNING, sdev, "LUN assignments on this target have " "changed. The Linux SCSI layer does not " "automatically remap LUN assignments.\n"); } else if (sshdr->asc == 0x3f) sdev_printk(KERN_WARNING, sdev, "Operating parameters on this target have " "changed. The Linux SCSI layer does not " "automatically adjust these parameters.\n"); if (sshdr->asc == 0x38 && sshdr->ascq == 0x07) { evt_type = SDEV_EVT_SOFT_THRESHOLD_REACHED_REPORTED; sdev_printk(KERN_WARNING, sdev, "Warning! Received an indication that the " "LUN reached a thin provisioning soft " "threshold.\n"); } if (sshdr->asc == 0x29) { evt_type = SDEV_EVT_POWER_ON_RESET_OCCURRED; /* * Do not print message if it is an expected side-effect * of runtime PM. */ if (!sdev->silence_suspend) sdev_printk(KERN_WARNING, sdev, "Power-on or device reset occurred\n"); } if (sshdr->asc == 0x2a && sshdr->ascq == 0x01) { evt_type = SDEV_EVT_MODE_PARAMETER_CHANGE_REPORTED; sdev_printk(KERN_WARNING, sdev, "Mode parameters changed"); } else if (sshdr->asc == 0x2a && sshdr->ascq == 0x06) { evt_type = SDEV_EVT_ALUA_STATE_CHANGE_REPORTED; sdev_printk(KERN_WARNING, sdev, "Asymmetric access state changed"); } else if (sshdr->asc == 0x2a && sshdr->ascq == 0x09) { evt_type = SDEV_EVT_CAPACITY_CHANGE_REPORTED; sdev_printk(KERN_WARNING, sdev, "Capacity data has changed"); } else if (sshdr->asc == 0x2a) sdev_printk(KERN_WARNING, sdev, "Parameters changed"); } if (evt_type != SDEV_EVT_MAXBITS) { set_bit(evt_type, sdev->pending_events); schedule_work(&sdev->event_work); } } static inline void set_scsi_ml_byte(struct scsi_cmnd *cmd, u8 status) { cmd->result = (cmd->result & 0xffff00ff) | (status << 8); } /** * scsi_check_sense - Examine scsi cmd sense * @scmd: Cmd to have sense checked. * * Return value: * SUCCESS or FAILED or NEEDS_RETRY or ADD_TO_MLQUEUE * * Notes: * When a deferred error is detected the current command has * not been executed and needs retrying. */ enum scsi_disposition scsi_check_sense(struct scsi_cmnd *scmd) { struct request *req = scsi_cmd_to_rq(scmd); struct scsi_device *sdev = scmd->device; struct scsi_sense_hdr sshdr; if (! scsi_command_normalize_sense(scmd, &sshdr)) return FAILED; /* no valid sense data */ scsi_report_sense(sdev, &sshdr); if (sshdr.sense_key == UNIT_ATTENTION) { /* * Increment the counters for Power on/Reset or New Media so * that all ULDs interested in these can see that those have * happened, even if someone else gets the sense data. */ if (sshdr.asc == 0x28) atomic_inc(&sdev->ua_new_media_ctr); else if (sshdr.asc == 0x29) atomic_inc(&sdev->ua_por_ctr); } if (scsi_sense_is_deferred(&sshdr)) return NEEDS_RETRY; if (sdev->handler && sdev->handler->check_sense) { enum scsi_disposition rc; rc = sdev->handler->check_sense(sdev, &sshdr); if (rc != SCSI_RETURN_NOT_HANDLED) return rc; /* handler does not care. Drop down to default handling */ } if (scmd->cmnd[0] == TEST_UNIT_READY && scmd->submitter != SUBMITTED_BY_SCSI_ERROR_HANDLER) /* * nasty: for mid-layer issued TURs, we need to return the * actual sense data without any recovery attempt. For eh * issued ones, we need to try to recover and interpret */ return SUCCESS; /* * Previous logic looked for FILEMARK, EOM or ILI which are * mainly associated with tapes and returned SUCCESS. */ if (sshdr.response_code == 0x70) { /* fixed format */ if (scmd->sense_buffer[2] & 0xe0) return SUCCESS; } else { /* * descriptor format: look for "stream commands sense data * descriptor" (see SSC-3). Assume single sense data * descriptor. Ignore ILI from SBC-2 READ LONG and WRITE LONG. */ if ((sshdr.additional_length > 3) && (scmd->sense_buffer[8] == 0x4) && (scmd->sense_buffer[11] & 0xe0)) return SUCCESS; } switch (sshdr.sense_key) { case NO_SENSE: return SUCCESS; case RECOVERED_ERROR: return /* soft_error */ SUCCESS; case ABORTED_COMMAND: if (sshdr.asc == 0x10) /* DIF */ return SUCCESS; /* * Check aborts due to command duration limit policy: * ABORTED COMMAND additional sense code with the * COMMAND TIMEOUT BEFORE PROCESSING or * COMMAND TIMEOUT DURING PROCESSING or * COMMAND TIMEOUT DURING PROCESSING DUE TO ERROR RECOVERY * additional sense code qualifiers. */ if (sshdr.asc == 0x2e && sshdr.ascq >= 0x01 && sshdr.ascq <= 0x03) { set_scsi_ml_byte(scmd, SCSIML_STAT_DL_TIMEOUT); req->cmd_flags |= REQ_FAILFAST_DEV; req->rq_flags |= RQF_QUIET; return SUCCESS; } if (sshdr.asc == 0x44 && sdev->sdev_bflags & BLIST_RETRY_ITF) return ADD_TO_MLQUEUE; if (sshdr.asc == 0xc1 && sshdr.ascq == 0x01 && sdev->sdev_bflags & BLIST_RETRY_ASC_C1) return ADD_TO_MLQUEUE; return NEEDS_RETRY; case NOT_READY: case UNIT_ATTENTION: /* * if we are expecting a cc/ua because of a bus reset that we * performed, treat this just as a retry. otherwise this is * information that we should pass up to the upper-level driver * so that we can deal with it there. */ if (scmd->device->expecting_cc_ua) { /* * Because some device does not queue unit * attentions correctly, we carefully check * additional sense code and qualifier so as * not to squash media change unit attention. */ if (sshdr.asc != 0x28 || sshdr.ascq != 0x00) { scmd->device->expecting_cc_ua = 0; return NEEDS_RETRY; } } /* * we might also expect a cc/ua if another LUN on the target * reported a UA with an ASC/ASCQ of 3F 0E - * REPORTED LUNS DATA HAS CHANGED. */ if (scmd->device->sdev_target->expecting_lun_change && sshdr.asc == 0x3f && sshdr.ascq == 0x0e) return NEEDS_RETRY; /* * if the device is in the process of becoming ready, we * should retry. */ if ((sshdr.asc == 0x04) && (sshdr.ascq == 0x01 || sshdr.ascq == 0x0a)) return NEEDS_RETRY; /* * if the device is not started, we need to wake * the error handler to start the motor */ if (scmd->device->allow_restart && (sshdr.asc == 0x04) && (sshdr.ascq == 0x02)) return FAILED; /* * Pass the UA upwards for a determination in the completion * functions. */ return SUCCESS; /* these are not supported */ case DATA_PROTECT: if (sshdr.asc == 0x27 && sshdr.ascq == 0x07) { /* Thin provisioning hard threshold reached */ set_scsi_ml_byte(scmd, SCSIML_STAT_NOSPC); return SUCCESS; } fallthrough; case COPY_ABORTED: case VOLUME_OVERFLOW: case MISCOMPARE: case BLANK_CHECK: set_scsi_ml_byte(scmd, SCSIML_STAT_TGT_FAILURE); return SUCCESS; case MEDIUM_ERROR: if (sshdr.asc == 0x11 || /* UNRECOVERED READ ERR */ sshdr.asc == 0x13 || /* AMNF DATA FIELD */ sshdr.asc == 0x14) { /* RECORD NOT FOUND */ set_scsi_ml_byte(scmd, SCSIML_STAT_MED_ERROR); return SUCCESS; } return NEEDS_RETRY; case HARDWARE_ERROR: if (scmd->device->retry_hwerror) return ADD_TO_MLQUEUE; else set_scsi_ml_byte(scmd, SCSIML_STAT_TGT_FAILURE); fallthrough; case ILLEGAL_REQUEST: if (sshdr.asc == 0x20 || /* Invalid command operation code */ sshdr.asc == 0x21 || /* Logical block address out of range */ sshdr.asc == 0x22 || /* Invalid function */ sshdr.asc == 0x24 || /* Invalid field in cdb */ sshdr.asc == 0x26 || /* Parameter value invalid */ sshdr.asc == 0x27) { /* Write protected */ set_scsi_ml_byte(scmd, SCSIML_STAT_TGT_FAILURE); } return SUCCESS; case COMPLETED: /* * A command using command duration limits (CDL) with a * descriptor set with policy 0xD may be completed with success * and the sense data DATA CURRENTLY UNAVAILABLE, indicating * that the command was in fact aborted because it exceeded its * duration limit. Never retry these commands. */ if (sshdr.asc == 0x55 && sshdr.ascq == 0x0a) { set_scsi_ml_byte(scmd, SCSIML_STAT_DL_TIMEOUT); req->cmd_flags |= REQ_FAILFAST_DEV; req->rq_flags |= RQF_QUIET; } return SUCCESS; default: return SUCCESS; } } EXPORT_SYMBOL_GPL(scsi_check_sense); static void scsi_handle_queue_ramp_up(struct scsi_device *sdev) { const struct scsi_host_template *sht = sdev->host->hostt; struct scsi_device *tmp_sdev; if (!sdev->budget_map.map) return; if (!sht->track_queue_depth || sdev->queue_depth >= sdev->max_queue_depth) return; if (time_before(jiffies, sdev->last_queue_ramp_up + sdev->queue_ramp_up_period)) return; if (time_before(jiffies, sdev->last_queue_full_time + sdev->queue_ramp_up_period)) return; /* * Walk all devices of a target and do * ramp up on them. */ shost_for_each_device(tmp_sdev, sdev->host) { if (tmp_sdev->channel != sdev->channel || tmp_sdev->id != sdev->id || tmp_sdev->queue_depth == sdev->max_queue_depth) continue; scsi_change_queue_depth(tmp_sdev, tmp_sdev->queue_depth + 1); sdev->last_queue_ramp_up = jiffies; } } static void scsi_handle_queue_full(struct scsi_device *sdev) { const struct scsi_host_template *sht = sdev->host->hostt; struct scsi_device *tmp_sdev; if (!sht->track_queue_depth) return; shost_for_each_device(tmp_sdev, sdev->host) { if (tmp_sdev->channel != sdev->channel || tmp_sdev->id != sdev->id) continue; /* * We do not know the number of commands that were at * the device when we got the queue full so we start * from the highest possible value and work our way down. */ scsi_track_queue_full(tmp_sdev, tmp_sdev->queue_depth - 1); } } /** * scsi_eh_completed_normally - Disposition a eh cmd on return from LLD. * @scmd: SCSI cmd to examine. * * Notes: * This is *only* called when we are examining the status of commands * queued during error recovery. the main difference here is that we * don't allow for the possibility of retries here, and we are a lot * more restrictive about what we consider acceptable. */ static enum scsi_disposition scsi_eh_completed_normally(struct scsi_cmnd *scmd) { /* * first check the host byte, to see if there is anything in there * that would indicate what we need to do. */ if (host_byte(scmd->result) == DID_RESET) { /* * rats. we are already in the error handler, so we now * get to try and figure out what to do next. if the sense * is valid, we have a pretty good idea of what to do. * if not, we mark it as FAILED. */ return scsi_check_sense(scmd); } if (host_byte(scmd->result) != DID_OK) return FAILED; /* * now, check the status byte to see if this indicates * anything special. */ switch (get_status_byte(scmd)) { case SAM_STAT_GOOD: scsi_handle_queue_ramp_up(scmd->device); if (scmd->sense_buffer && SCSI_SENSE_VALID(scmd)) /* * If we have sense data, call scsi_check_sense() in * order to set the correct SCSI ML byte (if any). * No point in checking the return value, since the * command has already completed successfully. */ scsi_check_sense(scmd); fallthrough; case SAM_STAT_COMMAND_TERMINATED: return SUCCESS; case SAM_STAT_CHECK_CONDITION: return scsi_check_sense(scmd); case SAM_STAT_CONDITION_MET: case SAM_STAT_INTERMEDIATE: case SAM_STAT_INTERMEDIATE_CONDITION_MET: /* * who knows? FIXME(eric) */ return SUCCESS; case SAM_STAT_RESERVATION_CONFLICT: if (scmd->cmnd[0] == TEST_UNIT_READY) /* it is a success, we probed the device and * found it */ return SUCCESS; /* otherwise, we failed to send the command */ return FAILED; case SAM_STAT_TASK_SET_FULL: scsi_handle_queue_full(scmd->device); fallthrough; case SAM_STAT_BUSY: return NEEDS_RETRY; default: return FAILED; } return FAILED; } /** * scsi_eh_done - Completion function for error handling. * @scmd: Cmd that is done. */ void scsi_eh_done(struct scsi_cmnd *scmd) { struct completion *eh_action; SCSI_LOG_ERROR_RECOVERY(3, scmd_printk(KERN_INFO, scmd, "%s result: %x\n", __func__, scmd->result)); eh_action = scmd->device->host->eh_action; if (eh_action) complete(eh_action); } /** * scsi_try_host_reset - ask host adapter to reset itself * @scmd: SCSI cmd to send host reset. */ static enum scsi_disposition scsi_try_host_reset(struct scsi_cmnd *scmd) { unsigned long flags; enum scsi_disposition rtn; struct Scsi_Host *host = scmd->device->host; const struct scsi_host_template *hostt = host->hostt; SCSI_LOG_ERROR_RECOVERY(3, shost_printk(KERN_INFO, host, "Snd Host RST\n")); if (!hostt->eh_host_reset_handler) return FAILED; rtn = hostt->eh_host_reset_handler(scmd); if (rtn == SUCCESS) { if (!hostt->skip_settle_delay) ssleep(HOST_RESET_SETTLE_TIME); spin_lock_irqsave(host->host_lock, flags); scsi_report_bus_reset(host, scmd_channel(scmd)); spin_unlock_irqrestore(host->host_lock, flags); } return rtn; } /** * scsi_try_bus_reset - ask host to perform a bus reset * @scmd: SCSI cmd to send bus reset. */ static enum scsi_disposition scsi_try_bus_reset(struct scsi_cmnd *scmd) { unsigned long flags; enum scsi_disposition rtn; struct Scsi_Host *host = scmd->device->host; const struct scsi_host_template *hostt = host->hostt; SCSI_LOG_ERROR_RECOVERY(3, scmd_printk(KERN_INFO, scmd, "%s: Snd Bus RST\n", __func__)); if (!hostt->eh_bus_reset_handler) return FAILED; rtn = hostt->eh_bus_reset_handler(scmd); if (rtn == SUCCESS) { if (!hostt->skip_settle_delay) ssleep(BUS_RESET_SETTLE_TIME); spin_lock_irqsave(host->host_lock, flags); scsi_report_bus_reset(host, scmd_channel(scmd)); spin_unlock_irqrestore(host->host_lock, flags); } return rtn; } static void __scsi_report_device_reset(struct scsi_device *sdev, void *data) { sdev->was_reset = 1; sdev->expecting_cc_ua = 1; } /** * scsi_try_target_reset - Ask host to perform a target reset * @scmd: SCSI cmd used to send a target reset * * Notes: * There is no timeout for this operation. if this operation is * unreliable for a given host, then the host itself needs to put a * timer on it, and set the host back to a consistent state prior to * returning. */ static enum scsi_disposition scsi_try_target_reset(struct scsi_cmnd *scmd) { unsigned long flags; enum scsi_disposition rtn; struct Scsi_Host *host = scmd->device->host; const struct scsi_host_template *hostt = host->hostt; if (!hostt->eh_target_reset_handler) return FAILED; rtn = hostt->eh_target_reset_handler(scmd); if (rtn == SUCCESS) { spin_lock_irqsave(host->host_lock, flags); __starget_for_each_device(scsi_target(scmd->device), NULL, __scsi_report_device_reset); spin_unlock_irqrestore(host->host_lock, flags); } return rtn; } /** * scsi_try_bus_device_reset - Ask host to perform a BDR on a dev * @scmd: SCSI cmd used to send BDR * * Notes: * There is no timeout for this operation. if this operation is * unreliable for a given host, then the host itself needs to put a * timer on it, and set the host back to a consistent state prior to * returning. */ static enum scsi_disposition scsi_try_bus_device_reset(struct scsi_cmnd *scmd) { enum scsi_disposition rtn; const struct scsi_host_template *hostt = scmd->device->host->hostt; if (!hostt->eh_device_reset_handler) return FAILED; rtn = hostt->eh_device_reset_handler(scmd); if (rtn == SUCCESS) __scsi_report_device_reset(scmd->device, NULL); return rtn; } /** * scsi_try_to_abort_cmd - Ask host to abort a SCSI command * @hostt: SCSI driver host template * @scmd: SCSI cmd used to send a target reset * * Return value: * SUCCESS, FAILED, or FAST_IO_FAIL * * Notes: * SUCCESS does not necessarily indicate that the command * has been aborted; it only indicates that the LLDDs * has cleared all references to that command. * LLDDs should return FAILED only if an abort was required * but could not be executed. LLDDs should return FAST_IO_FAIL * if the device is temporarily unavailable (eg due to a * link down on FibreChannel) */ static enum scsi_disposition scsi_try_to_abort_cmd(const struct scsi_host_template *hostt, struct scsi_cmnd *scmd) { if (!hostt->eh_abort_handler) return FAILED; return hostt->eh_abort_handler(scmd); } static void scsi_abort_eh_cmnd(struct scsi_cmnd *scmd) { if (scsi_try_to_abort_cmd(scmd->device->host->hostt, scmd) != SUCCESS) if (scsi_try_bus_device_reset(scmd) != SUCCESS) if (scsi_try_target_reset(scmd) != SUCCESS) if (scsi_try_bus_reset(scmd) != SUCCESS) scsi_try_host_reset(scmd); } /** * scsi_eh_prep_cmnd - Save a scsi command info as part of error recovery * @scmd: SCSI command structure to hijack * @ses: structure to save restore information * @cmnd: CDB to send. Can be NULL if no new cmnd is needed * @cmnd_size: size in bytes of @cmnd (must be <= MAX_COMMAND_SIZE) * @sense_bytes: size of sense data to copy. or 0 (if != 0 @cmnd is ignored) * * This function is used to save a scsi command information before re-execution * as part of the error recovery process. If @sense_bytes is 0 the command * sent must be one that does not transfer any data. If @sense_bytes != 0 * @cmnd is ignored and this functions sets up a REQUEST_SENSE command * and cmnd buffers to read @sense_bytes into @scmd->sense_buffer. */ void scsi_eh_prep_cmnd(struct scsi_cmnd *scmd, struct scsi_eh_save *ses, unsigned char *cmnd, int cmnd_size, unsigned sense_bytes) { struct scsi_device *sdev = scmd->device; #ifdef CONFIG_BLK_INLINE_ENCRYPTION struct request *rq = scsi_cmd_to_rq(scmd); #endif /* * We need saved copies of a number of fields - this is because * error handling may need to overwrite these with different values * to run different commands, and once error handling is complete, * we will need to restore these values prior to running the actual * command. */ ses->cmd_len = scmd->cmd_len; ses->data_direction = scmd->sc_data_direction; ses->sdb = scmd->sdb; ses->result = scmd->result; ses->resid_len = scmd->resid_len; ses->underflow = scmd->underflow; ses->prot_op = scmd->prot_op; ses->eh_eflags = scmd->eh_eflags; scmd->prot_op = SCSI_PROT_NORMAL; scmd->eh_eflags = 0; memcpy(ses->cmnd, scmd->cmnd, sizeof(ses->cmnd)); memset(scmd->cmnd, 0, sizeof(scmd->cmnd)); memset(&scmd->sdb, 0, sizeof(scmd->sdb)); scmd->result = 0; scmd->resid_len = 0; if (sense_bytes) { scmd->sdb.length = min_t(unsigned, SCSI_SENSE_BUFFERSIZE, sense_bytes); sg_init_one(&ses->sense_sgl, scmd->sense_buffer, scmd->sdb.length); scmd->sdb.table.sgl = &ses->sense_sgl; scmd->sc_data_direction = DMA_FROM_DEVICE; scmd->sdb.table.nents = scmd->sdb.table.orig_nents = 1; scmd->cmnd[0] = REQUEST_SENSE; scmd->cmnd[4] = scmd->sdb.length; scmd->cmd_len = COMMAND_SIZE(scmd->cmnd[0]); } else { scmd->sc_data_direction = DMA_NONE; if (cmnd) { BUG_ON(cmnd_size > sizeof(scmd->cmnd)); memcpy(scmd->cmnd, cmnd, cmnd_size); scmd->cmd_len = COMMAND_SIZE(scmd->cmnd[0]); } } scmd->underflow = 0; if (sdev->scsi_level <= SCSI_2 && sdev->scsi_level != SCSI_UNKNOWN) scmd->cmnd[1] = (scmd->cmnd[1] & 0x1f) | (sdev->lun << 5 & 0xe0); /* * Encryption must be disabled for the commands submitted by the error handler. * Hence, clear the encryption context information. */ #ifdef CONFIG_BLK_INLINE_ENCRYPTION ses->rq_crypt_keyslot = rq->crypt_keyslot; ses->rq_crypt_ctx = rq->crypt_ctx; rq->crypt_keyslot = NULL; rq->crypt_ctx = NULL; #endif /* * Zero the sense buffer. The scsi spec mandates that any * untransferred sense data should be interpreted as being zero. */ memset(scmd->sense_buffer, 0, SCSI_SENSE_BUFFERSIZE); } EXPORT_SYMBOL(scsi_eh_prep_cmnd); /** * scsi_eh_restore_cmnd - Restore a scsi command info as part of error recovery * @scmd: SCSI command structure to restore * @ses: saved information from a coresponding call to scsi_eh_prep_cmnd * * Undo any damage done by above scsi_eh_prep_cmnd(). */ void scsi_eh_restore_cmnd(struct scsi_cmnd* scmd, struct scsi_eh_save *ses) { #ifdef CONFIG_BLK_INLINE_ENCRYPTION struct request *rq = scsi_cmd_to_rq(scmd); #endif /* * Restore original data */ scmd->cmd_len = ses->cmd_len; memcpy(scmd->cmnd, ses->cmnd, sizeof(ses->cmnd)); scmd->sc_data_direction = ses->data_direction; scmd->sdb = ses->sdb; scmd->result = ses->result; scmd->resid_len = ses->resid_len; scmd->underflow = ses->underflow; scmd->prot_op = ses->prot_op; scmd->eh_eflags = ses->eh_eflags; #ifdef CONFIG_BLK_INLINE_ENCRYPTION rq->crypt_keyslot = ses->rq_crypt_keyslot; rq->crypt_ctx = ses->rq_crypt_ctx; #endif } EXPORT_SYMBOL(scsi_eh_restore_cmnd); /** * scsi_send_eh_cmnd - submit a scsi command as part of error recovery * @scmd: SCSI command structure to hijack * @cmnd: CDB to send * @cmnd_size: size in bytes of @cmnd * @timeout: timeout for this request * @sense_bytes: size of sense data to copy or 0 * * This function is used to send a scsi command down to a target device * as part of the error recovery process. See also scsi_eh_prep_cmnd() above. * * Return value: * SUCCESS or FAILED or NEEDS_RETRY */ static enum scsi_disposition scsi_send_eh_cmnd(struct scsi_cmnd *scmd, unsigned char *cmnd, int cmnd_size, int timeout, unsigned sense_bytes) { struct scsi_device *sdev = scmd->device; struct Scsi_Host *shost = sdev->host; DECLARE_COMPLETION_ONSTACK(done); unsigned long timeleft = timeout, delay; struct scsi_eh_save ses; const unsigned long stall_for = msecs_to_jiffies(100); int rtn; retry: scsi_eh_prep_cmnd(scmd, &ses, cmnd, cmnd_size, sense_bytes); shost->eh_action = &done; scsi_log_send(scmd); scmd->submitter = SUBMITTED_BY_SCSI_ERROR_HANDLER; scmd->flags |= SCMD_LAST; /* * Lock sdev->state_mutex to avoid that scsi_device_quiesce() can * change the SCSI device state after we have examined it and before * .queuecommand() is called. */ mutex_lock(&sdev->state_mutex); while (sdev->sdev_state == SDEV_BLOCK && timeleft > 0) { mutex_unlock(&sdev->state_mutex); SCSI_LOG_ERROR_RECOVERY(5, sdev_printk(KERN_DEBUG, sdev, "%s: state %d <> %d\n", __func__, sdev->sdev_state, SDEV_BLOCK)); delay = min(timeleft, stall_for); timeleft -= delay; msleep(jiffies_to_msecs(delay)); mutex_lock(&sdev->state_mutex); } if (sdev->sdev_state != SDEV_BLOCK) rtn = shost->hostt->queuecommand(shost, scmd); else rtn = FAILED; mutex_unlock(&sdev->state_mutex); if (rtn) { if (timeleft > stall_for) { scsi_eh_restore_cmnd(scmd, &ses); timeleft -= stall_for; msleep(jiffies_to_msecs(stall_for)); goto retry; } /* signal not to enter either branch of the if () below */ timeleft = 0; rtn = FAILED; } else { timeleft = wait_for_completion_timeout(&done, timeout); rtn = SUCCESS; } shost->eh_action = NULL; scsi_log_completion(scmd, rtn); SCSI_LOG_ERROR_RECOVERY(3, scmd_printk(KERN_INFO, scmd, "%s timeleft: %ld\n", __func__, timeleft)); /* * If there is time left scsi_eh_done got called, and we will examine * the actual status codes to see whether the command actually did * complete normally, else if we have a zero return and no time left, * the command must still be pending, so abort it and return FAILED. * If we never actually managed to issue the command, because * ->queuecommand() kept returning non zero, use the rtn = FAILED * value above (so don't execute either branch of the if) */ if (timeleft) { rtn = scsi_eh_completed_normally(scmd); SCSI_LOG_ERROR_RECOVERY(3, scmd_printk(KERN_INFO, scmd, "%s: scsi_eh_completed_normally %x\n", __func__, rtn)); switch (rtn) { case SUCCESS: case NEEDS_RETRY: case FAILED: break; case ADD_TO_MLQUEUE: rtn = NEEDS_RETRY; break; default: rtn = FAILED; break; } } else if (rtn != FAILED) { scsi_abort_eh_cmnd(scmd); rtn = FAILED; } scsi_eh_restore_cmnd(scmd, &ses); return rtn; } /** * scsi_request_sense - Request sense data from a particular target. * @scmd: SCSI cmd for request sense. * * Notes: * Some hosts automatically obtain this information, others require * that we obtain it on our own. This function will *not* return until * the command either times out, or it completes. */ static enum scsi_disposition scsi_request_sense(struct scsi_cmnd *scmd) { return scsi_send_eh_cmnd(scmd, NULL, 0, scmd->device->eh_timeout, ~0); } static enum scsi_disposition scsi_eh_action(struct scsi_cmnd *scmd, enum scsi_disposition rtn) { if (!blk_rq_is_passthrough(scsi_cmd_to_rq(scmd))) { struct scsi_driver *sdrv = scsi_cmd_to_driver(scmd); if (sdrv->eh_action) rtn = sdrv->eh_action(scmd, rtn); } return rtn; } /** * scsi_eh_finish_cmd - Handle a cmd that eh is finished with. * @scmd: Original SCSI cmd that eh has finished. * @done_q: Queue for processed commands. * * Notes: * We don't want to use the normal command completion while we are are * still handling errors - it may cause other commands to be queued, * and that would disturb what we are doing. Thus we really want to * keep a list of pending commands for final completion, and once we * are ready to leave error handling we handle completion for real. */ void scsi_eh_finish_cmd(struct scsi_cmnd *scmd, struct list_head *done_q) { list_move_tail(&scmd->eh_entry, done_q); } EXPORT_SYMBOL(scsi_eh_finish_cmd); /** * scsi_eh_get_sense - Get device sense data. * @work_q: Queue of commands to process. * @done_q: Queue of processed commands. * * Description: * See if we need to request sense information. if so, then get it * now, so we have a better idea of what to do. * * Notes: * This has the unfortunate side effect that if a shost adapter does * not automatically request sense information, we end up shutting * it down before we request it. * * All drivers should request sense information internally these days, * so for now all I have to say is tough noogies if you end up in here. * * XXX: Long term this code should go away, but that needs an audit of * all LLDDs first. */ int scsi_eh_get_sense(struct list_head *work_q, struct list_head *done_q) { struct scsi_cmnd *scmd, *next; struct Scsi_Host *shost; enum scsi_disposition rtn; /* * If SCSI_EH_ABORT_SCHEDULED has been set, it is timeout IO, * should not get sense. */ list_for_each_entry_safe(scmd, next, work_q, eh_entry) { if ((scmd->eh_eflags & SCSI_EH_ABORT_SCHEDULED) || SCSI_SENSE_VALID(scmd)) continue; shost = scmd->device->host; if (scsi_host_eh_past_deadline(shost)) { SCSI_LOG_ERROR_RECOVERY(3, scmd_printk(KERN_INFO, scmd, "%s: skip request sense, past eh deadline\n", current->comm)); break; } if (!scsi_status_is_check_condition(scmd->result)) /* * don't request sense if there's no check condition * status because the error we're processing isn't one * that has a sense code (and some devices get * confused by sense requests out of the blue) */ continue; SCSI_LOG_ERROR_RECOVERY(2, scmd_printk(KERN_INFO, scmd, "%s: requesting sense\n", current->comm)); rtn = scsi_request_sense(scmd); if (rtn != SUCCESS) continue; SCSI_LOG_ERROR_RECOVERY(3, scmd_printk(KERN_INFO, scmd, "sense requested, result %x\n", scmd->result)); SCSI_LOG_ERROR_RECOVERY(3, scsi_print_sense(scmd)); rtn = scsi_decide_disposition(scmd); /* * if the result was normal, then just pass it along to the * upper level. */ if (rtn == SUCCESS) /* * We don't want this command reissued, just finished * with the sense data, so set retries to the max * allowed to ensure it won't get reissued. If the user * has requested infinite retries, we also want to * finish this command, so force completion by setting * retries and allowed to the same value. */ if (scmd->allowed == SCSI_CMD_RETRIES_NO_LIMIT) scmd->retries = scmd->allowed = 1; else scmd->retries = scmd->allowed; else if (rtn != NEEDS_RETRY) continue; scsi_eh_finish_cmd(scmd, done_q); } return list_empty(work_q); } EXPORT_SYMBOL_GPL(scsi_eh_get_sense); /** * scsi_eh_tur - Send TUR to device. * @scmd: &scsi_cmnd to send TUR * * Return value: * 0 - Device is ready. 1 - Device NOT ready. */ static int scsi_eh_tur(struct scsi_cmnd *scmd) { static unsigned char tur_command[6] = {TEST_UNIT_READY, 0, 0, 0, 0, 0}; int retry_cnt = 1; enum scsi_disposition rtn; retry_tur: rtn = scsi_send_eh_cmnd(scmd, tur_command, 6, scmd->device->eh_timeout, 0); SCSI_LOG_ERROR_RECOVERY(3, scmd_printk(KERN_INFO, scmd, "%s return: %x\n", __func__, rtn)); switch (rtn) { case NEEDS_RETRY: if (retry_cnt--) goto retry_tur; fallthrough; case SUCCESS: return 0; default: return 1; } } /** * scsi_eh_test_devices - check if devices are responding from error recovery. * @cmd_list: scsi commands in error recovery. * @work_q: queue for commands which still need more error recovery * @done_q: queue for commands which are finished * @try_stu: boolean on if a STU command should be tried in addition to TUR. * * Decription: * Tests if devices are in a working state. Commands to devices now in * a working state are sent to the done_q while commands to devices which * are still failing to respond are returned to the work_q for more * processing. **/ static int scsi_eh_test_devices(struct list_head *cmd_list, struct list_head *work_q, struct list_head *done_q, int try_stu) { struct scsi_cmnd *scmd, *next; struct scsi_device *sdev; int finish_cmds; while (!list_empty(cmd_list)) { scmd = list_entry(cmd_list->next, struct scsi_cmnd, eh_entry); sdev = scmd->device; if (!try_stu) { if (scsi_host_eh_past_deadline(sdev->host)) { /* Push items back onto work_q */ list_splice_init(cmd_list, work_q); SCSI_LOG_ERROR_RECOVERY(3, sdev_printk(KERN_INFO, sdev, "%s: skip test device, past eh deadline", current->comm)); break; } } finish_cmds = !scsi_device_online(scmd->device) || (try_stu && !scsi_eh_try_stu(scmd) && !scsi_eh_tur(scmd)) || !scsi_eh_tur(scmd); list_for_each_entry_safe(scmd, next, cmd_list, eh_entry) if (scmd->device == sdev) { if (finish_cmds && (try_stu || scsi_eh_action(scmd, SUCCESS) == SUCCESS)) scsi_eh_finish_cmd(scmd, done_q); else list_move_tail(&scmd->eh_entry, work_q); } } return list_empty(work_q); } /** * scsi_eh_try_stu - Send START_UNIT to device. * @scmd: &scsi_cmnd to send START_UNIT * * Return value: * 0 - Device is ready. 1 - Device NOT ready. */ static int scsi_eh_try_stu(struct scsi_cmnd *scmd) { static unsigned char stu_command[6] = {START_STOP, 0, 0, 0, 1, 0}; if (scmd->device->allow_restart) { int i; enum scsi_disposition rtn = NEEDS_RETRY; for (i = 0; rtn == NEEDS_RETRY && i < 2; i++) rtn = scsi_send_eh_cmnd(scmd, stu_command, 6, scmd->device->eh_timeout, 0); if (rtn == SUCCESS) return 0; } return 1; } /** * scsi_eh_stu - send START_UNIT if needed * @shost: &scsi host being recovered. * @work_q: &list_head for pending commands. * @done_q: &list_head for processed commands. * * Notes: * If commands are failing due to not ready, initializing command required, * try revalidating the device, which will end up sending a start unit. */ static int scsi_eh_stu(struct Scsi_Host *shost, struct list_head *work_q, struct list_head *done_q) { struct scsi_cmnd *scmd, *stu_scmd, *next; struct scsi_device *sdev; shost_for_each_device(sdev, shost) { if (scsi_host_eh_past_deadline(shost)) { SCSI_LOG_ERROR_RECOVERY(3, sdev_printk(KERN_INFO, sdev, "%s: skip START_UNIT, past eh deadline\n", current->comm)); scsi_device_put(sdev); break; } stu_scmd = NULL; list_for_each_entry(scmd, work_q, eh_entry) if (scmd->device == sdev && SCSI_SENSE_VALID(scmd) && scsi_check_sense(scmd) == FAILED ) { stu_scmd = scmd; break; } if (!stu_scmd) continue; SCSI_LOG_ERROR_RECOVERY(3, sdev_printk(KERN_INFO, sdev, "%s: Sending START_UNIT\n", current->comm)); if (!scsi_eh_try_stu(stu_scmd)) { if (!scsi_device_online(sdev) || !scsi_eh_tur(stu_scmd)) { list_for_each_entry_safe(scmd, next, work_q, eh_entry) { if (scmd->device == sdev && scsi_eh_action(scmd, SUCCESS) == SUCCESS) scsi_eh_finish_cmd(scmd, done_q); } } } else { SCSI_LOG_ERROR_RECOVERY(3, sdev_printk(KERN_INFO, sdev, "%s: START_UNIT failed\n", current->comm)); } } return list_empty(work_q); } /** * scsi_eh_bus_device_reset - send bdr if needed * @shost: scsi host being recovered. * @work_q: &list_head for pending commands. * @done_q: &list_head for processed commands. * * Notes: * Try a bus device reset. Still, look to see whether we have multiple * devices that are jammed or not - if we have multiple devices, it * makes no sense to try bus_device_reset - we really would need to try * a bus_reset instead. */ static int scsi_eh_bus_device_reset(struct Scsi_Host *shost, struct list_head *work_q, struct list_head *done_q) { struct scsi_cmnd *scmd, *bdr_scmd, *next; struct scsi_device *sdev; enum scsi_disposition rtn; shost_for_each_device(sdev, shost) { if (scsi_host_eh_past_deadline(shost)) { SCSI_LOG_ERROR_RECOVERY(3, sdev_printk(KERN_INFO, sdev, "%s: skip BDR, past eh deadline\n", current->comm)); scsi_device_put(sdev); break; } bdr_scmd = NULL; list_for_each_entry(scmd, work_q, eh_entry) if (scmd->device == sdev) { bdr_scmd = scmd; break; } if (!bdr_scmd) continue; SCSI_LOG_ERROR_RECOVERY(3, sdev_printk(KERN_INFO, sdev, "%s: Sending BDR\n", current->comm)); rtn = scsi_try_bus_device_reset(bdr_scmd); if (rtn == SUCCESS || rtn == FAST_IO_FAIL) { if (!scsi_device_online(sdev) || rtn == FAST_IO_FAIL || !scsi_eh_tur(bdr_scmd)) { list_for_each_entry_safe(scmd, next, work_q, eh_entry) { if (scmd->device == sdev && scsi_eh_action(scmd, rtn) != FAILED) scsi_eh_finish_cmd(scmd, done_q); } } } else { SCSI_LOG_ERROR_RECOVERY(3, sdev_printk(KERN_INFO, sdev, "%s: BDR failed\n", current->comm)); } } return list_empty(work_q); } /** * scsi_eh_target_reset - send target reset if needed * @shost: scsi host being recovered. * @work_q: &list_head for pending commands. * @done_q: &list_head for processed commands. * * Notes: * Try a target reset. */ static int scsi_eh_target_reset(struct Scsi_Host *shost, struct list_head *work_q, struct list_head *done_q) { LIST_HEAD(tmp_list); LIST_HEAD(check_list); list_splice_init(work_q, &tmp_list); while (!list_empty(&tmp_list)) { struct scsi_cmnd *next, *scmd; enum scsi_disposition rtn; unsigned int id; if (scsi_host_eh_past_deadline(shost)) { /* push back on work queue for further processing */ list_splice_init(&check_list, work_q); list_splice_init(&tmp_list, work_q); SCSI_LOG_ERROR_RECOVERY(3, shost_printk(KERN_INFO, shost, "%s: Skip target reset, past eh deadline\n", current->comm)); return list_empty(work_q); } scmd = list_entry(tmp_list.next, struct scsi_cmnd, eh_entry); id = scmd_id(scmd); SCSI_LOG_ERROR_RECOVERY(3, shost_printk(KERN_INFO, shost, "%s: Sending target reset to target %d\n", current->comm, id)); rtn = scsi_try_target_reset(scmd); if (rtn != SUCCESS && rtn != FAST_IO_FAIL) SCSI_LOG_ERROR_RECOVERY(3, shost_printk(KERN_INFO, shost, "%s: Target reset failed" " target: %d\n", current->comm, id)); list_for_each_entry_safe(scmd, next, &tmp_list, eh_entry) { if (scmd_id(scmd) != id) continue; if (rtn == SUCCESS) list_move_tail(&scmd->eh_entry, &check_list); else if (rtn == FAST_IO_FAIL) scsi_eh_finish_cmd(scmd, done_q); else /* push back on work queue for further processing */ list_move(&scmd->eh_entry, work_q); } } return scsi_eh_test_devices(&check_list, work_q, done_q, 0); } /** * scsi_eh_bus_reset - send a bus reset * @shost: &scsi host being recovered. * @work_q: &list_head for pending commands. * @done_q: &list_head for processed commands. */ static int scsi_eh_bus_reset(struct Scsi_Host *shost, struct list_head *work_q, struct list_head *done_q) { struct scsi_cmnd *scmd, *chan_scmd, *next; LIST_HEAD(check_list); unsigned int channel; enum scsi_disposition rtn; /* * we really want to loop over the various channels, and do this on * a channel by channel basis. we should also check to see if any * of the failed commands are on soft_reset devices, and if so, skip * the reset. */ for (channel = 0; channel <= shost->max_channel; channel++) { if (scsi_host_eh_past_deadline(shost)) { list_splice_init(&check_list, work_q); SCSI_LOG_ERROR_RECOVERY(3, shost_printk(KERN_INFO, shost, "%s: skip BRST, past eh deadline\n", current->comm)); return list_empty(work_q); } chan_scmd = NULL; list_for_each_entry(scmd, work_q, eh_entry) { if (channel == scmd_channel(scmd)) { chan_scmd = scmd; break; /* * FIXME add back in some support for * soft_reset devices. */ } } if (!chan_scmd) continue; SCSI_LOG_ERROR_RECOVERY(3, shost_printk(KERN_INFO, shost, "%s: Sending BRST chan: %d\n", current->comm, channel)); rtn = scsi_try_bus_reset(chan_scmd); if (rtn == SUCCESS || rtn == FAST_IO_FAIL) { list_for_each_entry_safe(scmd, next, work_q, eh_entry) { if (channel == scmd_channel(scmd)) { if (rtn == FAST_IO_FAIL) scsi_eh_finish_cmd(scmd, done_q); else list_move_tail(&scmd->eh_entry, &check_list); } } } else { SCSI_LOG_ERROR_RECOVERY(3, shost_printk(KERN_INFO, shost, "%s: BRST failed chan: %d\n", current->comm, channel)); } } return scsi_eh_test_devices(&check_list, work_q, done_q, 0); } /** * scsi_eh_host_reset - send a host reset * @shost: host to be reset. * @work_q: &list_head for pending commands. * @done_q: &list_head for processed commands. */ static int scsi_eh_host_reset(struct Scsi_Host *shost, struct list_head *work_q, struct list_head *done_q) { struct scsi_cmnd *scmd, *next; LIST_HEAD(check_list); enum scsi_disposition rtn; if (!list_empty(work_q)) { scmd = list_entry(work_q->next, struct scsi_cmnd, eh_entry); SCSI_LOG_ERROR_RECOVERY(3, shost_printk(KERN_INFO, shost, "%s: Sending HRST\n", current->comm)); rtn = scsi_try_host_reset(scmd); if (rtn == SUCCESS) { list_splice_init(work_q, &check_list); } else if (rtn == FAST_IO_FAIL) { list_for_each_entry_safe(scmd, next, work_q, eh_entry) { scsi_eh_finish_cmd(scmd, done_q); } } else { SCSI_LOG_ERROR_RECOVERY(3, shost_printk(KERN_INFO, shost, "%s: HRST failed\n", current->comm)); } } return scsi_eh_test_devices(&check_list, work_q, done_q, 1); } /** * scsi_eh_offline_sdevs - offline scsi devices that fail to recover * @work_q: &list_head for pending commands. * @done_q: &list_head for processed commands. */ static void scsi_eh_offline_sdevs(struct list_head *work_q, struct list_head *done_q) { struct scsi_cmnd *scmd, *next; struct scsi_device *sdev; list_for_each_entry_safe(scmd, next, work_q, eh_entry) { sdev_printk(KERN_INFO, scmd->device, "Device offlined - " "not ready after error recovery\n"); sdev = scmd->device; mutex_lock(&sdev->state_mutex); scsi_device_set_state(sdev, SDEV_OFFLINE); mutex_unlock(&sdev->state_mutex); scsi_eh_finish_cmd(scmd, done_q); } return; } /** * scsi_noretry_cmd - determine if command should be failed fast * @scmd: SCSI cmd to examine. */ bool scsi_noretry_cmd(struct scsi_cmnd *scmd) { struct request *req = scsi_cmd_to_rq(scmd); switch (host_byte(scmd->result)) { case DID_OK: break; case DID_TIME_OUT: goto check_type; case DID_BUS_BUSY: return !!(req->cmd_flags & REQ_FAILFAST_TRANSPORT); case DID_PARITY: return !!(req->cmd_flags & REQ_FAILFAST_DEV); case DID_ERROR: if (get_status_byte(scmd) == SAM_STAT_RESERVATION_CONFLICT) return false; fallthrough; case DID_SOFT_ERROR: return !!(req->cmd_flags & REQ_FAILFAST_DRIVER); } /* Never retry commands aborted due to a duration limit timeout */ if (scsi_ml_byte(scmd->result) == SCSIML_STAT_DL_TIMEOUT) return true; if (!scsi_status_is_check_condition(scmd->result)) return false; check_type: /* * assume caller has checked sense and determined * the check condition was retryable. */ if (req->cmd_flags & REQ_FAILFAST_DEV || blk_rq_is_passthrough(req)) return true; return false; } /** * scsi_decide_disposition - Disposition a cmd on return from LLD. * @scmd: SCSI cmd to examine. * * Notes: * This is *only* called when we are examining the status after sending * out the actual data command. any commands that are queued for error * recovery (e.g. test_unit_ready) do *not* come through here. * * When this routine returns failed, it means the error handler thread * is woken. In cases where the error code indicates an error that * doesn't require the error handler read (i.e. we don't need to * abort/reset), this function should return SUCCESS. */ enum scsi_disposition scsi_decide_disposition(struct scsi_cmnd *scmd) { enum scsi_disposition rtn; /* * if the device is offline, then we clearly just pass the result back * up to the top level. */ if (!scsi_device_online(scmd->device)) { SCSI_LOG_ERROR_RECOVERY(5, scmd_printk(KERN_INFO, scmd, "%s: device offline - report as SUCCESS\n", __func__)); return SUCCESS; } /* * first check the host byte, to see if there is anything in there * that would indicate what we need to do. */ switch (host_byte(scmd->result)) { case DID_PASSTHROUGH: /* * no matter what, pass this through to the upper layer. * nuke this special code so that it looks like we are saying * did_ok. */ scmd->result &= 0xff00ffff; return SUCCESS; case DID_OK: /* * looks good. drop through, and check the next byte. */ break; case DID_ABORT: if (scmd->eh_eflags & SCSI_EH_ABORT_SCHEDULED) { set_host_byte(scmd, DID_TIME_OUT); return SUCCESS; } fallthrough; case DID_NO_CONNECT: case DID_BAD_TARGET: /* * note - this means that we just report the status back * to the top level driver, not that we actually think * that it indicates SUCCESS. */ return SUCCESS; case DID_SOFT_ERROR: /* * when the low level driver returns did_soft_error, * it is responsible for keeping an internal retry counter * in order to avoid endless loops (db) */ goto maybe_retry; case DID_IMM_RETRY: return NEEDS_RETRY; case DID_REQUEUE: return ADD_TO_MLQUEUE; case DID_TRANSPORT_DISRUPTED: /* * LLD/transport was disrupted during processing of the IO. * The transport class is now blocked/blocking, * and the transport will decide what to do with the IO * based on its timers and recovery capablilities if * there are enough retries. */ goto maybe_retry; case DID_TRANSPORT_FAILFAST: /* * The transport decided to failfast the IO (most likely * the fast io fail tmo fired), so send IO directly upwards. */ return SUCCESS; case DID_TRANSPORT_MARGINAL: /* * caller has decided not to do retries on * abort success, so send IO directly upwards */ return SUCCESS; case DID_ERROR: if (get_status_byte(scmd) == SAM_STAT_RESERVATION_CONFLICT) /* * execute reservation conflict processing code * lower down */ break; fallthrough; case DID_BUS_BUSY: case DID_PARITY: goto maybe_retry; case DID_TIME_OUT: /* * when we scan the bus, we get timeout messages for * these commands if there is no device available. * other hosts report did_no_connect for the same thing. */ if ((scmd->cmnd[0] == TEST_UNIT_READY || scmd->cmnd[0] == INQUIRY)) { return SUCCESS; } else { return FAILED; } case DID_RESET: return SUCCESS; default: return FAILED; } /* * check the status byte to see if this indicates anything special. */ switch (get_status_byte(scmd)) { case SAM_STAT_TASK_SET_FULL: scsi_handle_queue_full(scmd->device); /* * the case of trying to send too many commands to a * tagged queueing device. */ fallthrough; case SAM_STAT_BUSY: /* * device can't talk to us at the moment. Should only * occur (SAM-3) when the task queue is empty, so will cause * the empty queue handling to trigger a stall in the * device. */ return ADD_TO_MLQUEUE; case SAM_STAT_GOOD: if (scmd->cmnd[0] == REPORT_LUNS) scmd->device->sdev_target->expecting_lun_change = 0; scsi_handle_queue_ramp_up(scmd->device); if (scmd->sense_buffer && SCSI_SENSE_VALID(scmd)) /* * If we have sense data, call scsi_check_sense() in * order to set the correct SCSI ML byte (if any). * No point in checking the return value, since the * command has already completed successfully. */ scsi_check_sense(scmd); fallthrough; case SAM_STAT_COMMAND_TERMINATED: return SUCCESS; case SAM_STAT_TASK_ABORTED: goto maybe_retry; case SAM_STAT_CHECK_CONDITION: rtn = scsi_check_sense(scmd); if (rtn == NEEDS_RETRY) goto maybe_retry; /* if rtn == FAILED, we have no sense information; * returning FAILED will wake the error handler thread * to collect the sense and redo the decide * disposition */ return rtn; case SAM_STAT_CONDITION_MET: case SAM_STAT_INTERMEDIATE: case SAM_STAT_INTERMEDIATE_CONDITION_MET: case SAM_STAT_ACA_ACTIVE: /* * who knows? FIXME(eric) */ return SUCCESS; case SAM_STAT_RESERVATION_CONFLICT: sdev_printk(KERN_INFO, scmd->device, "reservation conflict\n"); set_scsi_ml_byte(scmd, SCSIML_STAT_RESV_CONFLICT); return SUCCESS; /* causes immediate i/o error */ } return FAILED; maybe_retry: /* we requeue for retry because the error was retryable, and * the request was not marked fast fail. Note that above, * even if the request is marked fast fail, we still requeue * for queue congestion conditions (QUEUE_FULL or BUSY) */ if (scsi_cmd_retry_allowed(scmd) && !scsi_noretry_cmd(scmd)) { return NEEDS_RETRY; } else { /* * no more retries - report this one back to upper level. */ return SUCCESS; } } static enum rq_end_io_ret eh_lock_door_done(struct request *req, blk_status_t status, const struct io_comp_batch *iob) { blk_mq_free_request(req); return RQ_END_IO_NONE; } /** * scsi_eh_lock_door - Prevent medium removal for the specified device * @sdev: SCSI device to prevent medium removal * * Locking: * We must be called from process context. * * Notes: * We queue up an asynchronous "ALLOW MEDIUM REMOVAL" request on the * head of the devices request queue, and continue. */ static void scsi_eh_lock_door(struct scsi_device *sdev) { struct scsi_cmnd *scmd; struct request *req; req = scsi_alloc_request(sdev->request_queue, REQ_OP_DRV_IN, 0); if (IS_ERR(req)) return; scmd = blk_mq_rq_to_pdu(req); scmd->cmnd[0] = ALLOW_MEDIUM_REMOVAL; scmd->cmnd[1] = 0; scmd->cmnd[2] = 0; scmd->cmnd[3] = 0; scmd->cmnd[4] = SCSI_REMOVAL_PREVENT; scmd->cmnd[5] = 0; scmd->cmd_len = COMMAND_SIZE(scmd->cmnd[0]); scmd->allowed = 5; req->rq_flags |= RQF_QUIET; req->timeout = 10 * HZ; req->end_io = eh_lock_door_done; blk_execute_rq_nowait(req, true); } /** * scsi_restart_operations - restart io operations to the specified host. * @shost: Host we are restarting. * * Notes: * When we entered the error handler, we blocked all further i/o to * this device. we need to 'reverse' this process. */ static void scsi_restart_operations(struct Scsi_Host *shost) { struct scsi_device *sdev; unsigned long flags; /* * If the door was locked, we need to insert a door lock request * onto the head of the SCSI request queue for the device. There * is no point trying to lock the door of an off-line device. */ shost_for_each_device(sdev, shost) { if (scsi_device_online(sdev) && sdev->was_reset && sdev->locked) { scsi_eh_lock_door(sdev); sdev->was_reset = 0; } } /* * next free up anything directly waiting upon the host. this * will be requests for character device operations, and also for * ioctls to queued block devices. */ SCSI_LOG_ERROR_RECOVERY(3, shost_printk(KERN_INFO, shost, "waking up host to restart\n")); spin_lock_irqsave(shost->host_lock, flags); if (scsi_host_set_state(shost, SHOST_RUNNING)) if (scsi_host_set_state(shost, SHOST_CANCEL)) BUG_ON(scsi_host_set_state(shost, SHOST_DEL)); spin_unlock_irqrestore(shost->host_lock, flags); wake_up(&shost->host_wait); /* * finally we need to re-initiate requests that may be pending. we will * have had everything blocked while error handling is taking place, and * now that error recovery is done, we will need to ensure that these * requests are started. */ scsi_run_host_queues(shost); /* * if eh is active and host_eh_scheduled is pending we need to re-run * recovery. we do this check after scsi_run_host_queues() to allow * everything pent up since the last eh run a chance to make forward * progress before we sync again. Either we'll immediately re-run * recovery or scsi_device_unbusy() will wake us again when these * pending commands complete. */ spin_lock_irqsave(shost->host_lock, flags); if (shost->host_eh_scheduled) if (scsi_host_set_state(shost, SHOST_RECOVERY)) WARN_ON(scsi_host_set_state(shost, SHOST_CANCEL_RECOVERY)); spin_unlock_irqrestore(shost->host_lock, flags); } /** * scsi_eh_ready_devs - check device ready state and recover if not. * @shost: host to be recovered. * @work_q: &list_head for pending commands. * @done_q: &list_head for processed commands. */ void scsi_eh_ready_devs(struct Scsi_Host *shost, struct list_head *work_q, struct list_head *done_q) { if (!scsi_eh_stu(shost, work_q, done_q)) if (!scsi_eh_bus_device_reset(shost, work_q, done_q)) if (!scsi_eh_target_reset(shost, work_q, done_q)) if (!scsi_eh_bus_reset(shost, work_q, done_q)) if (!scsi_eh_host_reset(shost, work_q, done_q)) scsi_eh_offline_sdevs(work_q, done_q); } EXPORT_SYMBOL_GPL(scsi_eh_ready_devs); /** * scsi_eh_flush_done_q - finish processed commands or retry them. * @done_q: list_head of processed commands. */ void scsi_eh_flush_done_q(struct list_head *done_q) { struct scsi_cmnd *scmd, *next; list_for_each_entry_safe(scmd, next, done_q, eh_entry) { struct scsi_device *sdev = scmd->device; list_del_init(&scmd->eh_entry); if (scsi_device_online(sdev) && !scsi_noretry_cmd(scmd) && scsi_cmd_retry_allowed(scmd) && scsi_eh_should_retry_cmd(scmd)) { SCSI_LOG_ERROR_RECOVERY(3, scmd_printk(KERN_INFO, scmd, "%s: flush retry cmd\n", current->comm)); scsi_queue_insert(scmd, SCSI_MLQUEUE_EH_RETRY); blk_mq_kick_requeue_list(sdev->request_queue); } else { /* * If just we got sense for the device (called * scsi_eh_get_sense), scmd->result is already * set, do not set DID_TIME_OUT. */ if (!scmd->result && !(scmd->flags & SCMD_FORCE_EH_SUCCESS)) scmd->result |= (DID_TIME_OUT << 16); SCSI_LOG_ERROR_RECOVERY(3, scmd_printk(KERN_INFO, scmd, "%s: flush finish cmd\n", current->comm)); scsi_finish_command(scmd); } } } EXPORT_SYMBOL(scsi_eh_flush_done_q); /** * scsi_unjam_host - Attempt to fix a host which has a cmd that failed. * @shost: Host to unjam. * * Notes: * When we come in here, we *know* that all commands on the bus have * either completed, failed or timed out. we also know that no further * commands are being sent to the host, so things are relatively quiet * and we have freedom to fiddle with things as we wish. * * This is only the *default* implementation. it is possible for * individual drivers to supply their own version of this function, and * if the maintainer wishes to do this, it is strongly suggested that * this function be taken as a template and modified. this function * was designed to correctly handle problems for about 95% of the * different cases out there, and it should always provide at least a * reasonable amount of error recovery. * * Any command marked 'failed' or 'timeout' must eventually have * scsi_finish_cmd() called for it. we do all of the retry stuff * here, so when we restart the host after we return it should have an * empty queue. */ static void scsi_unjam_host(struct Scsi_Host *shost) { unsigned long flags; LIST_HEAD(eh_work_q); LIST_HEAD(eh_done_q); spin_lock_irqsave(shost->host_lock, flags); list_splice_init(&shost->eh_cmd_q, &eh_work_q); spin_unlock_irqrestore(shost->host_lock, flags); SCSI_LOG_ERROR_RECOVERY(1, scsi_eh_prt_fail_stats(shost, &eh_work_q)); if (!scsi_eh_get_sense(&eh_work_q, &eh_done_q)) scsi_eh_ready_devs(shost, &eh_work_q, &eh_done_q); spin_lock_irqsave(shost->host_lock, flags); if (shost->eh_deadline != -1) shost->last_reset = 0; spin_unlock_irqrestore(shost->host_lock, flags); scsi_eh_flush_done_q(&eh_done_q); } /** * scsi_error_handler - SCSI error handler thread * @data: Host for which we are running. * * Notes: * This is the main error handling loop. This is run as a kernel thread * for every SCSI host and handles all error handling activity. */ int scsi_error_handler(void *data) { struct Scsi_Host *shost = data; /* * We use TASK_INTERRUPTIBLE so that the thread is not * counted against the load average as a running process. * We never actually get interrupted because kthread_run * disables signal delivery for the created thread. */ while (true) { /* * The sequence in kthread_stop() sets the stop flag first * then wakes the process. To avoid missed wakeups, the task * should always be in a non running state before the stop * flag is checked */ set_current_state(TASK_INTERRUPTIBLE); if (kthread_should_stop()) break; if ((shost->host_failed == 0 && shost->host_eh_scheduled == 0) || shost->host_failed != scsi_host_busy(shost)) { SCSI_LOG_ERROR_RECOVERY(1, shost_printk(KERN_INFO, shost, "scsi_eh_%d: sleeping\n", shost->host_no)); schedule(); continue; } __set_current_state(TASK_RUNNING); SCSI_LOG_ERROR_RECOVERY(1, shost_printk(KERN_INFO, shost, "scsi_eh_%d: waking up %d/%d/%d\n", shost->host_no, shost->host_eh_scheduled, shost->host_failed, scsi_host_busy(shost))); /* * We have a host that is failing for some reason. Figure out * what we need to do to get it up and online again (if we can). * If we fail, we end up taking the thing offline. */ if (!shost->eh_noresume && scsi_autopm_get_host(shost) != 0) { SCSI_LOG_ERROR_RECOVERY(1, shost_printk(KERN_ERR, shost, "scsi_eh_%d: unable to autoresume\n", shost->host_no)); continue; } if (shost->transportt->eh_strategy_handler) shost->transportt->eh_strategy_handler(shost); else scsi_unjam_host(shost); /* All scmds have been handled */ shost->host_failed = 0; /* * Note - if the above fails completely, the action is to take * individual devices offline and flush the queue of any * outstanding requests that may have been pending. When we * restart, we restart any I/O to any other devices on the bus * which are still online. */ scsi_restart_operations(shost); if (!shost->eh_noresume) scsi_autopm_put_host(shost); } __set_current_state(TASK_RUNNING); SCSI_LOG_ERROR_RECOVERY(1, shost_printk(KERN_INFO, shost, "Error handler scsi_eh_%d exiting\n", shost->host_no)); shost->ehandler = NULL; return 0; } /** * scsi_report_bus_reset() - report bus reset observed * * Utility function used by low-level drivers to report that * they have observed a bus reset on the bus being handled. * * @shost: Host in question * @channel: channel on which reset was observed. * * Returns: Nothing * * Lock status: Host lock must be held. * * Notes: This only needs to be called if the reset is one which * originates from an unknown location. Resets originated * by the mid-level itself don't need to call this, but there * should be no harm. * * The main purpose of this is to make sure that a CHECK_CONDITION * is properly treated. */ void scsi_report_bus_reset(struct Scsi_Host *shost, int channel) { struct scsi_device *sdev; __shost_for_each_device(sdev, shost) { if (channel == sdev_channel(sdev)) __scsi_report_device_reset(sdev, NULL); } } EXPORT_SYMBOL(scsi_report_bus_reset); /** * scsi_report_device_reset() - report device reset observed * * Utility function used by low-level drivers to report that * they have observed a device reset on the device being handled. * * @shost: Host in question * @channel: channel on which reset was observed * @target: target on which reset was observed * * Returns: Nothing * * Lock status: Host lock must be held * * Notes: This only needs to be called if the reset is one which * originates from an unknown location. Resets originated * by the mid-level itself don't need to call this, but there * should be no harm. * * The main purpose of this is to make sure that a CHECK_CONDITION * is properly treated. */ void scsi_report_device_reset(struct Scsi_Host *shost, int channel, int target) { struct scsi_device *sdev; __shost_for_each_device(sdev, shost) { if (channel == sdev_channel(sdev) && target == sdev_id(sdev)) __scsi_report_device_reset(sdev, NULL); } } EXPORT_SYMBOL(scsi_report_device_reset); /** * scsi_ioctl_reset: explicitly reset a host/bus/target/device * @dev: scsi_device to operate on * @arg: reset type (see sg.h) */ int scsi_ioctl_reset(struct scsi_device *dev, int __user *arg) { struct scsi_cmnd *scmd; struct Scsi_Host *shost = dev->host; struct request *rq; unsigned long flags; int error = 0, val; enum scsi_disposition rtn; if (!capable(CAP_SYS_ADMIN) || !capable(CAP_SYS_RAWIO)) return -EACCES; error = get_user(val, arg); if (error) return error; if (scsi_autopm_get_host(shost) < 0) return -EIO; error = -EIO; rq = kzalloc(sizeof(struct request) + sizeof(struct scsi_cmnd) + shost->hostt->cmd_size, GFP_KERNEL); if (!rq) goto out_put_autopm_host; blk_rq_init(NULL, rq); scmd = (struct scsi_cmnd *)(rq + 1); scsi_init_command(dev, scmd); scmd->submitter = SUBMITTED_BY_SCSI_RESET_IOCTL; scmd->flags |= SCMD_LAST; memset(&scmd->sdb, 0, sizeof(scmd->sdb)); scmd->cmd_len = 0; scmd->sc_data_direction = DMA_BIDIRECTIONAL; spin_lock_irqsave(shost->host_lock, flags); shost->tmf_in_progress = 1; spin_unlock_irqrestore(shost->host_lock, flags); switch (val & ~SG_SCSI_RESET_NO_ESCALATE) { case SG_SCSI_RESET_NOTHING: rtn = SUCCESS; break; case SG_SCSI_RESET_DEVICE: rtn = scsi_try_bus_device_reset(scmd); if (rtn == SUCCESS || (val & SG_SCSI_RESET_NO_ESCALATE)) break; fallthrough; case SG_SCSI_RESET_TARGET: rtn = scsi_try_target_reset(scmd); if (rtn == SUCCESS || (val & SG_SCSI_RESET_NO_ESCALATE)) break; fallthrough; case SG_SCSI_RESET_BUS: rtn = scsi_try_bus_reset(scmd); if (rtn == SUCCESS || (val & SG_SCSI_RESET_NO_ESCALATE)) break; fallthrough; case SG_SCSI_RESET_HOST: rtn = scsi_try_host_reset(scmd); if (rtn == SUCCESS) break; fallthrough; default: rtn = FAILED; break; } error = (rtn == SUCCESS) ? 0 : -EIO; spin_lock_irqsave(shost->host_lock, flags); shost->tmf_in_progress = 0; spin_unlock_irqrestore(shost->host_lock, flags); /* * be sure to wake up anyone who was sleeping or had their queue * suspended while we performed the TMF. */ SCSI_LOG_ERROR_RECOVERY(3, shost_printk(KERN_INFO, shost, "waking up host to restart after TMF\n")); wake_up(&shost->host_wait); scsi_run_host_queues(shost); kfree(rq); out_put_autopm_host: scsi_autopm_put_host(shost); return error; } bool scsi_command_normalize_sense(const struct scsi_cmnd *cmd, struct scsi_sense_hdr *sshdr) { return scsi_normalize_sense(cmd->sense_buffer, SCSI_SENSE_BUFFERSIZE, sshdr); } EXPORT_SYMBOL(scsi_command_normalize_sense); /** * scsi_get_sense_info_fld - get information field from sense data (either fixed or descriptor format) * @sense_buffer: byte array of sense data * @sb_len: number of valid bytes in sense_buffer * @info_out: pointer to 64 integer where 8 or 4 byte information * field will be placed if found. * * Return value: * true if information field found, false if not found. */ bool scsi_get_sense_info_fld(const u8 *sense_buffer, int sb_len, u64 *info_out) { const u8 * ucp; if (sb_len < 7) return false; switch (sense_buffer[0] & 0x7f) { case 0x70: case 0x71: if (sense_buffer[0] & 0x80) { *info_out = get_unaligned_be32(&sense_buffer[3]); return true; } return false; case 0x72: case 0x73: ucp = scsi_sense_desc_find(sense_buffer, sb_len, 0 /* info desc */); if (ucp && (0xa == ucp[1])) { *info_out = get_unaligned_be64(&ucp[4]); return true; } return false; default: return false; } } EXPORT_SYMBOL(scsi_get_sense_info_fld);
11 71 67 11 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 /* * DRBG based on NIST SP800-90A * * Copyright Stephan Mueller <smueller@chronox.de>, 2014 * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, and the entire permission notice in its entirety, * including the disclaimer of warranties. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote * products derived from this software without specific prior * written permission. * * ALTERNATIVELY, this product may be distributed under the terms of * the GNU General Public License, in which case the provisions of the GPL are * required INSTEAD OF the above restrictions. (This clause is * necessary due to a potential bad interaction between the GPL and * the restrictions contained in a BSD-style copyright.) * * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ALL OF * WHICH ARE HEREBY DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE * USE OF THIS SOFTWARE, EVEN IF NOT ADVISED OF THE POSSIBILITY OF SUCH * DAMAGE. */ #ifndef _DRBG_H #define _DRBG_H #include <linux/random.h> #include <linux/scatterlist.h> #include <crypto/hash.h> #include <crypto/skcipher.h> #include <linux/module.h> #include <linux/crypto.h> #include <linux/slab.h> #include <crypto/internal/drbg.h> #include <crypto/internal/rng.h> #include <crypto/rng.h> #include <linux/fips.h> #include <linux/mutex.h> #include <linux/list.h> #include <linux/workqueue.h> struct drbg_state; typedef uint32_t drbg_flag_t; struct drbg_core { drbg_flag_t flags; /* flags for the cipher */ __u8 statelen; /* maximum state length */ __u8 blocklen_bytes; /* block size of output in bytes */ char cra_name[CRYPTO_MAX_ALG_NAME]; /* mapping to kernel crypto API */ /* kernel crypto API backend cipher name */ char backend_cra_name[CRYPTO_MAX_ALG_NAME]; }; struct drbg_state_ops { int (*update)(struct drbg_state *drbg, struct list_head *seed, int reseed); int (*generate)(struct drbg_state *drbg, unsigned char *buf, unsigned int buflen, struct list_head *addtl); int (*crypto_init)(struct drbg_state *drbg); int (*crypto_fini)(struct drbg_state *drbg); }; struct drbg_test_data { struct drbg_string *testentropy; /* TEST PARAMETER: test entropy */ }; enum drbg_seed_state { DRBG_SEED_STATE_UNSEEDED, DRBG_SEED_STATE_PARTIAL, /* Seeded with !rng_is_initialized() */ DRBG_SEED_STATE_FULL, }; struct drbg_state { struct mutex drbg_mutex; /* lock around DRBG */ unsigned char *V; /* internal state 10.1.1.1 1a) */ unsigned char *Vbuf; /* hash: static value 10.1.1.1 1b) hmac / ctr: key */ unsigned char *C; unsigned char *Cbuf; /* Number of RNG requests since last reseed -- 10.1.1.1 1c) */ size_t reseed_ctr; size_t reseed_threshold; /* some memory the DRBG can use for its operation */ unsigned char *scratchpad; unsigned char *scratchpadbuf; void *priv_data; /* Cipher handle */ struct crypto_skcipher *ctr_handle; /* CTR mode cipher handle */ struct skcipher_request *ctr_req; /* CTR mode request handle */ __u8 *outscratchpadbuf; /* CTR mode output scratchpad */ __u8 *outscratchpad; /* CTR mode aligned outbuf */ struct crypto_wait ctr_wait; /* CTR mode async wait obj */ struct scatterlist sg_in, sg_out; /* CTR mode SGLs */ enum drbg_seed_state seeded; /* DRBG fully seeded? */ unsigned long last_seed_time; bool pr; /* Prediction resistance enabled? */ bool fips_primed; /* Continuous test primed? */ unsigned char *prev; /* FIPS 140-2 continuous test value */ struct crypto_rng *jent; const struct drbg_state_ops *d_ops; const struct drbg_core *core; struct drbg_string test_data; }; static inline __u8 drbg_statelen(struct drbg_state *drbg) { if (drbg && drbg->core) return drbg->core->statelen; return 0; } static inline __u8 drbg_blocklen(struct drbg_state *drbg) { if (drbg && drbg->core) return drbg->core->blocklen_bytes; return 0; } static inline __u8 drbg_keylen(struct drbg_state *drbg) { if (drbg && drbg->core) return (drbg->core->statelen - drbg->core->blocklen_bytes); return 0; } static inline size_t drbg_max_request_bytes(struct drbg_state *drbg) { /* SP800-90A requires the limit 2**19 bits, but we return bytes */ return (1 << 16); } static inline size_t drbg_max_addtl(struct drbg_state *drbg) { /* SP800-90A requires 2**35 bytes additional info str / pers str */ #if (__BITS_PER_LONG == 32) /* * SP800-90A allows smaller maximum numbers to be returned -- we * return SIZE_MAX - 1 to allow the verification of the enforcement * of this value in drbg_healthcheck_sanity. */ return (SIZE_MAX - 1); #else return (1UL<<35); #endif } static inline size_t drbg_max_requests(struct drbg_state *drbg) { /* SP800-90A requires 2**48 maximum requests before reseeding */ return (1<<20); } /* * This is a wrapper to the kernel crypto API function of * crypto_rng_generate() to allow the caller to provide additional data. * * @drng DRBG handle -- see crypto_rng_get_bytes * @outbuf output buffer -- see crypto_rng_get_bytes * @outlen length of output buffer -- see crypto_rng_get_bytes * @addtl_input additional information string input buffer * @addtllen length of additional information string buffer * * return * see crypto_rng_get_bytes */ static inline int crypto_drbg_get_bytes_addtl(struct crypto_rng *drng, unsigned char *outbuf, unsigned int outlen, struct drbg_string *addtl) { return crypto_rng_generate(drng, addtl->buf, addtl->len, outbuf, outlen); } /* * TEST code * * This is a wrapper to the kernel crypto API function of * crypto_rng_generate() to allow the caller to provide additional data and * allow furnishing of test_data * * @drng DRBG handle -- see crypto_rng_get_bytes * @outbuf output buffer -- see crypto_rng_get_bytes * @outlen length of output buffer -- see crypto_rng_get_bytes * @addtl_input additional information string input buffer * @addtllen length of additional information string buffer * @test_data filled test data * * return * see crypto_rng_get_bytes */ static inline int crypto_drbg_get_bytes_addtl_test(struct crypto_rng *drng, unsigned char *outbuf, unsigned int outlen, struct drbg_string *addtl, struct drbg_test_data *test_data) { crypto_rng_set_entropy(drng, test_data->testentropy->buf, test_data->testentropy->len); return crypto_rng_generate(drng, addtl->buf, addtl->len, outbuf, outlen); } /* * TEST code * * This is a wrapper to the kernel crypto API function of * crypto_rng_reset() to allow the caller to provide test_data * * @drng DRBG handle -- see crypto_rng_reset * @pers personalization string input buffer * @perslen length of additional information string buffer * @test_data filled test data * * return * see crypto_rng_reset */ static inline int crypto_drbg_reset_test(struct crypto_rng *drng, struct drbg_string *pers, struct drbg_test_data *test_data) { crypto_rng_set_entropy(drng, test_data->testentropy->buf, test_data->testentropy->len); return crypto_rng_reset(drng, pers->buf, pers->len); } /* DRBG type flags */ #define DRBG_CTR ((drbg_flag_t)1<<0) #define DRBG_HMAC ((drbg_flag_t)1<<1) #define DRBG_HASH ((drbg_flag_t)1<<2) #define DRBG_TYPE_MASK (DRBG_CTR | DRBG_HMAC | DRBG_HASH) /* DRBG strength flags */ #define DRBG_STRENGTH128 ((drbg_flag_t)1<<3) #define DRBG_STRENGTH192 ((drbg_flag_t)1<<4) #define DRBG_STRENGTH256 ((drbg_flag_t)1<<5) #define DRBG_STRENGTH_MASK (DRBG_STRENGTH128 | DRBG_STRENGTH192 | \ DRBG_STRENGTH256) enum drbg_prefixes { DRBG_PREFIX0 = 0x00, DRBG_PREFIX1, DRBG_PREFIX2, DRBG_PREFIX3 }; #endif /* _DRBG_H */
9 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_PART_STAT_H #define _LINUX_PART_STAT_H #include <linux/blkdev.h> #include <asm/local.h> struct disk_stats { u64 nsecs[NR_STAT_GROUPS]; unsigned long sectors[NR_STAT_GROUPS]; unsigned long ios[NR_STAT_GROUPS]; unsigned long merges[NR_STAT_GROUPS]; unsigned long io_ticks; local_t in_flight[2]; }; /* * Macros to operate on percpu disk statistics: * * part_stat_{add|sub|inc|dec}() modify the stat counters and should * be called between part_stat_lock() and part_stat_unlock(). * * part_stat_read() can be called at any time. */ #define part_stat_lock() preempt_disable() #define part_stat_unlock() preempt_enable() #define part_stat_get_cpu(part, field, cpu) \ (per_cpu_ptr((part)->bd_stats, (cpu))->field) #define part_stat_get(part, field) \ part_stat_get_cpu(part, field, smp_processor_id()) #define part_stat_read(part, field) \ ({ \ TYPEOF_UNQUAL((part)->bd_stats->field) res = 0; \ unsigned int _cpu; \ for_each_possible_cpu(_cpu) \ res += per_cpu_ptr((part)->bd_stats, _cpu)->field; \ res; \ }) static inline void part_stat_set_all(struct block_device *part, int value) { int i; for_each_possible_cpu(i) memset(per_cpu_ptr(part->bd_stats, i), value, sizeof(struct disk_stats)); } #define part_stat_read_accum(part, field) \ (part_stat_read(part, field[STAT_READ]) + \ part_stat_read(part, field[STAT_WRITE]) + \ part_stat_read(part, field[STAT_DISCARD])) #define __part_stat_add(part, field, addnd) \ __this_cpu_add((part)->bd_stats->field, addnd) #define part_stat_add(part, field, addnd) do { \ __part_stat_add((part), field, addnd); \ if (bdev_is_partition(part)) \ __part_stat_add(bdev_whole(part), field, addnd); \ } while (0) #define part_stat_dec(part, field) \ part_stat_add(part, field, -1) #define part_stat_inc(part, field) \ part_stat_add(part, field, 1) #define part_stat_sub(part, field, subnd) \ part_stat_add(part, field, -subnd) #define part_stat_local_dec(part, field) \ local_dec(&(part_stat_get(part, field))) #define part_stat_local_inc(part, field) \ local_inc(&(part_stat_get(part, field))) #define part_stat_local_read(part, field) \ local_read(&(part_stat_get(part, field))) #define part_stat_local_read_cpu(part, field, cpu) \ local_read(&(part_stat_get_cpu(part, field, cpu))) unsigned int bdev_count_inflight(struct block_device *part); #endif /* _LINUX_PART_STAT_H */
56 37 48 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 #ifndef LLC_H #define LLC_H /* * Copyright (c) 1997 by Procom Technology, Inc. * 2001-2003 by Arnaldo Carvalho de Melo <acme@conectiva.com.br> * * This program can be redistributed or modified under the terms of the * GNU General Public License as published by the Free Software Foundation. * This program is distributed without any warranty or implied warranty * of merchantability or fitness for a particular purpose. * * See the GNU General Public License for more details. */ #include <linux/if.h> #include <linux/if_ether.h> #include <linux/list.h> #include <linux/spinlock.h> #include <linux/rculist_nulls.h> #include <linux/hash.h> #include <linux/jhash.h> #include <linux/atomic.h> struct net_device; struct packet_type; struct sk_buff; struct llc_addr { unsigned char lsap; unsigned char mac[IFHWADDRLEN]; }; #define LLC_SAP_STATE_INACTIVE 1 #define LLC_SAP_STATE_ACTIVE 2 #define LLC_SK_DEV_HASH_BITS 6 #define LLC_SK_DEV_HASH_ENTRIES (1<<LLC_SK_DEV_HASH_BITS) #define LLC_SK_LADDR_HASH_BITS 6 #define LLC_SK_LADDR_HASH_ENTRIES (1<<LLC_SK_LADDR_HASH_BITS) /** * struct llc_sap - Defines the SAP component * * @station - station this sap belongs to * @state - sap state * @p_bit - only lowest-order bit used * @f_bit - only lowest-order bit used * @laddr - SAP value in this 'lsap' * @node - entry in station sap_list * @sk_list - LLC sockets this one manages */ struct llc_sap { unsigned char state; unsigned char p_bit; unsigned char f_bit; refcount_t refcnt; int (*rcv_func)(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev); struct llc_addr laddr; struct list_head node; spinlock_t sk_lock; int sk_count; struct hlist_nulls_head sk_laddr_hash[LLC_SK_LADDR_HASH_ENTRIES]; struct hlist_head sk_dev_hash[LLC_SK_DEV_HASH_ENTRIES]; struct rcu_head rcu; }; static inline struct hlist_head *llc_sk_dev_hash(struct llc_sap *sap, int ifindex) { u32 bucket = hash_32(ifindex, LLC_SK_DEV_HASH_BITS); return &sap->sk_dev_hash[bucket]; } static inline u32 llc_sk_laddr_hashfn(struct llc_sap *sap, const struct llc_addr *laddr) { return hash_32(jhash(laddr->mac, sizeof(laddr->mac), 0), LLC_SK_LADDR_HASH_BITS); } static inline struct hlist_nulls_head *llc_sk_laddr_hash(struct llc_sap *sap, const struct llc_addr *laddr) { return &sap->sk_laddr_hash[llc_sk_laddr_hashfn(sap, laddr)]; } #define LLC_DEST_INVALID 0 /* Invalid LLC PDU type */ #define LLC_DEST_SAP 1 /* Type 1 goes here */ #define LLC_DEST_CONN 2 /* Type 2 goes here */ extern struct list_head llc_sap_list; int llc_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev); int llc_mac_hdr_init(struct sk_buff *skb, const unsigned char *sa, const unsigned char *da); void llc_add_pack(int type, void (*handler)(struct llc_sap *sap, struct sk_buff *skb)); void llc_remove_pack(int type); void llc_set_station_handler(void (*handler)(struct sk_buff *skb)); struct llc_sap *llc_sap_open(unsigned char lsap, int (*rcv)(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)); static inline void llc_sap_hold(struct llc_sap *sap) { refcount_inc(&sap->refcnt); } static inline bool llc_sap_hold_safe(struct llc_sap *sap) { return refcount_inc_not_zero(&sap->refcnt); } void llc_sap_close(struct llc_sap *sap); static inline void llc_sap_put(struct llc_sap *sap) { if (refcount_dec_and_test(&sap->refcnt)) llc_sap_close(sap); } struct llc_sap *llc_sap_find(unsigned char sap_value); int llc_build_and_send_ui_pkt(struct llc_sap *sap, struct sk_buff *skb, const unsigned char *dmac, unsigned char dsap); void llc_sap_handler(struct llc_sap *sap, struct sk_buff *skb); void llc_conn_handler(struct llc_sap *sap, struct sk_buff *skb); void llc_station_init(void); void llc_station_exit(void); #ifdef CONFIG_PROC_FS int llc_proc_init(void); void llc_proc_exit(void); #else #define llc_proc_init() (0) #define llc_proc_exit() do { } while(0) #endif /* CONFIG_PROC_FS */ #ifdef CONFIG_SYSCTL int llc_sysctl_init(void); void llc_sysctl_exit(void); extern int sysctl_llc2_ack_timeout; extern int sysctl_llc2_busy_timeout; extern int sysctl_llc2_p_timeout; extern int sysctl_llc2_rej_timeout; #else #define llc_sysctl_init() (0) #define llc_sysctl_exit() do { } while(0) #endif /* CONFIG_SYSCTL */ #endif /* LLC_H */
10 9 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 // SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (c) 2001 Jean-Fredric Clere, Nikolas Zimmermann, Georg Acher * Mark Cave-Ayland, Carlo E Prelz, Dick Streefland * Copyright (c) 2002, 2003 Tuukka Toivonen * Copyright (c) 2008 Erik Andrén * * P/N 861037: Sensor HDCS1000 ASIC STV0600 * P/N 861050-0010: Sensor HDCS1000 ASIC STV0600 * P/N 861050-0020: Sensor Photobit PB100 ASIC STV0600-1 - QuickCam Express * P/N 861055: Sensor ST VV6410 ASIC STV0610 - LEGO cam * P/N 861075-0040: Sensor HDCS1000 ASIC * P/N 961179-0700: Sensor ST VV6410 ASIC STV0602 - Dexxa WebCam USB * P/N 861040-0000: Sensor ST VV6410 ASIC STV0610 - QuickCam Web */ /* * The spec file for the PB-0100 suggests the following for best quality * images after the sensor has been reset : * * PB_ADCGAINL = R60 = 0x03 (3 dec) : sets low reference of ADC to produce good black level * PB_PREADCTRL = R32 = 0x1400 (5120 dec) : Enables global gain changes through R53 * PB_ADCMINGAIN = R52 = 0x10 (16 dec) : Sets the minimum gain for auto-exposure * PB_ADCGLOBALGAIN = R53 = 0x10 (16 dec) : Sets the global gain * PB_EXPGAIN = R14 = 0x11 (17 dec) : Sets the auto-exposure value * PB_UPDATEINT = R23 = 0x02 (2 dec) : Sets the speed on auto-exposure routine * PB_CFILLIN = R5 = 0x0E (14 dec) : Sets the frame rate */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include "stv06xx_pb0100.h" struct pb0100_ctrls { struct { /* one big happy control cluster... */ struct v4l2_ctrl *autogain; struct v4l2_ctrl *gain; struct v4l2_ctrl *exposure; struct v4l2_ctrl *red; struct v4l2_ctrl *blue; struct v4l2_ctrl *natural; }; struct v4l2_ctrl *target; }; static struct v4l2_pix_format pb0100_mode[] = { /* low res / subsample modes disabled as they are only half res horizontal, halving the vertical resolution does not seem to work */ { 320, 240, V4L2_PIX_FMT_SGRBG8, V4L2_FIELD_NONE, .sizeimage = 320 * 240, .bytesperline = 320, .colorspace = V4L2_COLORSPACE_SRGB, .priv = PB0100_CROP_TO_VGA }, { 352, 288, V4L2_PIX_FMT_SGRBG8, V4L2_FIELD_NONE, .sizeimage = 352 * 288, .bytesperline = 352, .colorspace = V4L2_COLORSPACE_SRGB, .priv = 0 } }; static int pb0100_s_ctrl(struct v4l2_ctrl *ctrl) { struct gspca_dev *gspca_dev = container_of(ctrl->handler, struct gspca_dev, ctrl_handler); struct sd *sd = (struct sd *)gspca_dev; struct pb0100_ctrls *ctrls = sd->sensor_priv; int err = -EINVAL; switch (ctrl->id) { case V4L2_CID_AUTOGAIN: err = pb0100_set_autogain(gspca_dev, ctrl->val); if (err) break; if (ctrl->val) break; err = pb0100_set_gain(gspca_dev, ctrls->gain->val); if (err) break; err = pb0100_set_exposure(gspca_dev, ctrls->exposure->val); break; case V4L2_CTRL_CLASS_USER + 0x1001: err = pb0100_set_autogain_target(gspca_dev, ctrl->val); break; } return err; } static const struct v4l2_ctrl_ops pb0100_ctrl_ops = { .s_ctrl = pb0100_s_ctrl, }; static int pb0100_init_controls(struct sd *sd) { struct v4l2_ctrl_handler *hdl = &sd->gspca_dev.ctrl_handler; struct pb0100_ctrls *ctrls; static const struct v4l2_ctrl_config autogain_target = { .ops = &pb0100_ctrl_ops, .id = V4L2_CTRL_CLASS_USER + 0x1000, .type = V4L2_CTRL_TYPE_INTEGER, .name = "Automatic Gain Target", .max = 255, .step = 1, .def = 128, }; static const struct v4l2_ctrl_config natural_light = { .ops = &pb0100_ctrl_ops, .id = V4L2_CTRL_CLASS_USER + 0x1001, .type = V4L2_CTRL_TYPE_BOOLEAN, .name = "Natural Light Source", .max = 1, .step = 1, .def = 1, }; ctrls = kzalloc_obj(*ctrls); if (!ctrls) return -ENOMEM; v4l2_ctrl_handler_init(hdl, 6); ctrls->autogain = v4l2_ctrl_new_std(hdl, &pb0100_ctrl_ops, V4L2_CID_AUTOGAIN, 0, 1, 1, 1); ctrls->exposure = v4l2_ctrl_new_std(hdl, &pb0100_ctrl_ops, V4L2_CID_EXPOSURE, 0, 511, 1, 12); ctrls->gain = v4l2_ctrl_new_std(hdl, &pb0100_ctrl_ops, V4L2_CID_GAIN, 0, 255, 1, 128); ctrls->red = v4l2_ctrl_new_std(hdl, &pb0100_ctrl_ops, V4L2_CID_RED_BALANCE, -255, 255, 1, 0); ctrls->blue = v4l2_ctrl_new_std(hdl, &pb0100_ctrl_ops, V4L2_CID_BLUE_BALANCE, -255, 255, 1, 0); ctrls->natural = v4l2_ctrl_new_custom(hdl, &natural_light, NULL); ctrls->target = v4l2_ctrl_new_custom(hdl, &autogain_target, NULL); if (hdl->error) { kfree(ctrls); return hdl->error; } sd->sensor_priv = ctrls; v4l2_ctrl_auto_cluster(5, &ctrls->autogain, 0, false); return 0; } static int pb0100_probe(struct sd *sd) { u16 sensor; int err; err = stv06xx_read_sensor(sd, PB_IDENT, &sensor); if (err < 0) return -ENODEV; if ((sensor >> 8) != 0x64) return -ENODEV; pr_info("Photobit pb0100 sensor detected\n"); sd->gspca_dev.cam.cam_mode = pb0100_mode; sd->gspca_dev.cam.nmodes = ARRAY_SIZE(pb0100_mode); return 0; } static int pb0100_start(struct sd *sd) { int err, packet_size, max_packet_size; struct usb_host_interface *alt; struct usb_interface *intf; struct gspca_dev *gspca_dev = (struct gspca_dev *)sd; struct cam *cam = &sd->gspca_dev.cam; u32 mode = cam->cam_mode[sd->gspca_dev.curr_mode].priv; intf = usb_ifnum_to_if(sd->gspca_dev.dev, sd->gspca_dev.iface); alt = usb_altnum_to_altsetting(intf, sd->gspca_dev.alt); if (!alt) return -ENODEV; if (alt->desc.bNumEndpoints < 1) return -ENODEV; packet_size = le16_to_cpu(alt->endpoint[0].desc.wMaxPacketSize); /* If we don't have enough bandwidth use a lower framerate */ max_packet_size = sd->sensor->max_packet_size[sd->gspca_dev.curr_mode]; if (packet_size < max_packet_size) stv06xx_write_sensor(sd, PB_ROWSPEED, BIT(4)|BIT(3)|BIT(1)); else stv06xx_write_sensor(sd, PB_ROWSPEED, BIT(5)|BIT(3)|BIT(1)); /* Setup sensor window */ if (mode & PB0100_CROP_TO_VGA) { stv06xx_write_sensor(sd, PB_RSTART, 30); stv06xx_write_sensor(sd, PB_CSTART, 20); stv06xx_write_sensor(sd, PB_RWSIZE, 240 - 1); stv06xx_write_sensor(sd, PB_CWSIZE, 320 - 1); } else { stv06xx_write_sensor(sd, PB_RSTART, 8); stv06xx_write_sensor(sd, PB_CSTART, 4); stv06xx_write_sensor(sd, PB_RWSIZE, 288 - 1); stv06xx_write_sensor(sd, PB_CWSIZE, 352 - 1); } if (mode & PB0100_SUBSAMPLE) { stv06xx_write_bridge(sd, STV_Y_CTRL, 0x02); /* Wrong, FIXME */ stv06xx_write_bridge(sd, STV_X_CTRL, 0x06); stv06xx_write_bridge(sd, STV_SCAN_RATE, 0x10); } else { stv06xx_write_bridge(sd, STV_Y_CTRL, 0x01); stv06xx_write_bridge(sd, STV_X_CTRL, 0x0a); /* larger -> slower */ stv06xx_write_bridge(sd, STV_SCAN_RATE, 0x20); } err = stv06xx_write_sensor(sd, PB_CONTROL, BIT(5)|BIT(3)|BIT(1)); gspca_dbg(gspca_dev, D_STREAM, "Started stream, status: %d\n", err); return (err < 0) ? err : 0; } static int pb0100_stop(struct sd *sd) { struct gspca_dev *gspca_dev = (struct gspca_dev *)sd; int err; err = stv06xx_write_sensor(sd, PB_ABORTFRAME, 1); if (err < 0) goto out; /* Set bit 1 to zero */ err = stv06xx_write_sensor(sd, PB_CONTROL, BIT(5)|BIT(3)); gspca_dbg(gspca_dev, D_STREAM, "Halting stream\n"); out: return (err < 0) ? err : 0; } /* FIXME: Sort the init commands out and put them into tables, this is only for getting the camera to work */ /* FIXME: No error handling for now, add this once the init has been converted to proper tables */ static int pb0100_init(struct sd *sd) { stv06xx_write_bridge(sd, STV_REG00, 1); stv06xx_write_bridge(sd, STV_SCAN_RATE, 0); /* Reset sensor */ stv06xx_write_sensor(sd, PB_RESET, 1); stv06xx_write_sensor(sd, PB_RESET, 0); /* Disable chip */ stv06xx_write_sensor(sd, PB_CONTROL, BIT(5)|BIT(3)); /* Gain stuff...*/ stv06xx_write_sensor(sd, PB_PREADCTRL, BIT(12)|BIT(10)|BIT(6)); stv06xx_write_sensor(sd, PB_ADCGLOBALGAIN, 12); /* Set up auto-exposure */ /* ADC VREF_HI new setting for a transition from the Expose1 to the Expose2 setting */ stv06xx_write_sensor(sd, PB_R28, 12); /* gain max for autoexposure */ stv06xx_write_sensor(sd, PB_ADCMAXGAIN, 180); /* gain min for autoexposure */ stv06xx_write_sensor(sd, PB_ADCMINGAIN, 12); /* Maximum frame integration time (programmed into R8) allowed for auto-exposure routine */ stv06xx_write_sensor(sd, PB_R54, 3); /* Minimum frame integration time (programmed into R8) allowed for auto-exposure routine */ stv06xx_write_sensor(sd, PB_R55, 0); stv06xx_write_sensor(sd, PB_UPDATEINT, 1); /* R15 Expose0 (maximum that auto-exposure may use) */ stv06xx_write_sensor(sd, PB_R15, 800); /* R17 Expose2 (minimum that auto-exposure may use) */ stv06xx_write_sensor(sd, PB_R17, 10); stv06xx_write_sensor(sd, PB_EXPGAIN, 0); /* 0x14 */ stv06xx_write_sensor(sd, PB_VOFFSET, 0); /* 0x0D */ stv06xx_write_sensor(sd, PB_ADCGAINH, 11); /* Set black level (important!) */ stv06xx_write_sensor(sd, PB_ADCGAINL, 0); /* ??? */ stv06xx_write_bridge(sd, STV_REG00, 0x11); stv06xx_write_bridge(sd, STV_REG03, 0x45); stv06xx_write_bridge(sd, STV_REG04, 0x07); /* Scan/timing for the sensor */ stv06xx_write_sensor(sd, PB_ROWSPEED, BIT(4)|BIT(3)|BIT(1)); stv06xx_write_sensor(sd, PB_CFILLIN, 14); stv06xx_write_sensor(sd, PB_VBL, 0); stv06xx_write_sensor(sd, PB_FINTTIME, 0); stv06xx_write_sensor(sd, PB_RINTTIME, 123); stv06xx_write_bridge(sd, STV_REG01, 0xc2); stv06xx_write_bridge(sd, STV_REG02, 0xb0); return 0; } static int pb0100_dump(struct sd *sd) { return 0; } static int pb0100_set_gain(struct gspca_dev *gspca_dev, __s32 val) { int err; struct sd *sd = (struct sd *) gspca_dev; struct pb0100_ctrls *ctrls = sd->sensor_priv; err = stv06xx_write_sensor(sd, PB_G1GAIN, val); if (!err) err = stv06xx_write_sensor(sd, PB_G2GAIN, val); gspca_dbg(gspca_dev, D_CONF, "Set green gain to %d, status: %d\n", val, err); if (!err) err = pb0100_set_red_balance(gspca_dev, ctrls->red->val); if (!err) err = pb0100_set_blue_balance(gspca_dev, ctrls->blue->val); return err; } static int pb0100_set_red_balance(struct gspca_dev *gspca_dev, __s32 val) { int err; struct sd *sd = (struct sd *) gspca_dev; struct pb0100_ctrls *ctrls = sd->sensor_priv; val += ctrls->gain->val; if (val < 0) val = 0; else if (val > 255) val = 255; err = stv06xx_write_sensor(sd, PB_RGAIN, val); gspca_dbg(gspca_dev, D_CONF, "Set red gain to %d, status: %d\n", val, err); return err; } static int pb0100_set_blue_balance(struct gspca_dev *gspca_dev, __s32 val) { int err; struct sd *sd = (struct sd *) gspca_dev; struct pb0100_ctrls *ctrls = sd->sensor_priv; val += ctrls->gain->val; if (val < 0) val = 0; else if (val > 255) val = 255; err = stv06xx_write_sensor(sd, PB_BGAIN, val); gspca_dbg(gspca_dev, D_CONF, "Set blue gain to %d, status: %d\n", val, err); return err; } static int pb0100_set_exposure(struct gspca_dev *gspca_dev, __s32 val) { struct sd *sd = (struct sd *) gspca_dev; int err; err = stv06xx_write_sensor(sd, PB_RINTTIME, val); gspca_dbg(gspca_dev, D_CONF, "Set exposure to %d, status: %d\n", val, err); return err; } static int pb0100_set_autogain(struct gspca_dev *gspca_dev, __s32 val) { int err; struct sd *sd = (struct sd *) gspca_dev; struct pb0100_ctrls *ctrls = sd->sensor_priv; if (val) { if (ctrls->natural->val) val = BIT(6)|BIT(4)|BIT(0); else val = BIT(4)|BIT(0); } else val = 0; err = stv06xx_write_sensor(sd, PB_EXPGAIN, val); gspca_dbg(gspca_dev, D_CONF, "Set autogain to %d (natural: %d), status: %d\n", val, ctrls->natural->val, err); return err; } static int pb0100_set_autogain_target(struct gspca_dev *gspca_dev, __s32 val) { int err, totalpixels, brightpixels, darkpixels; struct sd *sd = (struct sd *) gspca_dev; /* Number of pixels counted by the sensor when subsampling the pixels. * Slightly larger than the real value to avoid oscillation */ totalpixels = gspca_dev->pixfmt.width * gspca_dev->pixfmt.height; totalpixels = totalpixels/(8*8) + totalpixels/(64*64); brightpixels = (totalpixels * val) >> 8; darkpixels = totalpixels - brightpixels; err = stv06xx_write_sensor(sd, PB_R21, brightpixels); if (!err) err = stv06xx_write_sensor(sd, PB_R22, darkpixels); gspca_dbg(gspca_dev, D_CONF, "Set autogain target to %d, status: %d\n", val, err); return err; }
9 9 17 15 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 // SPDX-License-Identifier: GPL-2.0-only /* * Landlock LSM - Object management * * Copyright © 2016-2020 Mickaël Salaün <mic@digikod.net> * Copyright © 2018-2020 ANSSI */ #include <linux/bug.h> #include <linux/compiler_types.h> #include <linux/err.h> #include <linux/kernel.h> #include <linux/rcupdate.h> #include <linux/refcount.h> #include <linux/slab.h> #include <linux/spinlock.h> #include "object.h" struct landlock_object * landlock_create_object(const struct landlock_object_underops *const underops, void *const underobj) { struct landlock_object *new_object; if (WARN_ON_ONCE(!underops || !underobj)) return ERR_PTR(-ENOENT); new_object = kzalloc_obj(*new_object, GFP_KERNEL_ACCOUNT); if (!new_object) return ERR_PTR(-ENOMEM); refcount_set(&new_object->usage, 1); spin_lock_init(&new_object->lock); new_object->underops = underops; new_object->underobj = underobj; return new_object; } /* * The caller must own the object (i.e. thanks to object->usage) to safely put * it. */ void landlock_put_object(struct landlock_object *const object) { /* * The call to @object->underops->release(object) might sleep, e.g. * because of iput(). */ might_sleep(); if (!object) return; /* * If the @object's refcount cannot drop to zero, we can just decrement * the refcount without holding a lock. Otherwise, the decrement must * happen under @object->lock for synchronization with things like * get_inode_object(). */ if (refcount_dec_and_lock(&object->usage, &object->lock)) { __acquire(&object->lock); /* * With @object->lock initially held, remove the reference from * @object->underobj to @object (if it still exists). */ object->underops->release(object); kfree_rcu(object, rcu_free); } }
7 7 7 7 7 7 3 15 15 14 10 14 14 10 4 2 10 10 8 10 10 4 15 4 14 15 4 7 4 1 3 14 14 14 14 4 10 10 15 15 4 11 11 11 4 7 10 1 14 15 7 3 15 15 9 6 9 10 10 200 200 200 200 200 5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 // SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (C) 2019 Oracle. All Rights Reserved. * Author: Darrick J. Wong <darrick.wong@oracle.com> */ #include "xfs_platform.h" #include "xfs_fs.h" #include "xfs_shared.h" #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_mount.h" #include "xfs_inode.h" #include "xfs_btree.h" #include "xfs_ialloc.h" #include "xfs_ialloc_btree.h" #include "xfs_iwalk.h" #include "xfs_error.h" #include "xfs_trace.h" #include "xfs_icache.h" #include "xfs_health.h" #include "xfs_trans.h" #include "xfs_pwork.h" #include "xfs_ag.h" #include "xfs_bit.h" /* * Walking Inodes in the Filesystem * ================================ * * This iterator function walks a subset of filesystem inodes in increasing * order from @startino until there are no more inodes. For each allocated * inode it finds, it calls a walk function with the relevant inode number and * a pointer to caller-provided data. The walk function can return the usual * negative error code to stop the iteration; 0 to continue the iteration; or * -ECANCELED to stop the iteration. This return value is returned to the * caller. * * Internally, we allow the walk function to do anything, which means that we * cannot maintain the inobt cursor or our lock on the AGI buffer. We * therefore cache the inobt records in kernel memory and only call the walk * function when our memory buffer is full. @nr_recs is the number of records * that we've cached, and @sz_recs is the size of our cache. * * It is the responsibility of the walk function to ensure it accesses * allocated inodes, as the inobt records may be stale by the time they are * acted upon. */ struct xfs_iwalk_ag { /* parallel work control data; will be null if single threaded */ struct xfs_pwork pwork; struct xfs_mount *mp; struct xfs_trans *tp; struct xfs_perag *pag; /* Where do we start the traversal? */ xfs_ino_t startino; /* What was the last inode number we saw when iterating the inobt? */ xfs_ino_t lastino; /* Array of inobt records we cache. */ struct xfs_inobt_rec_incore *recs; /* Number of entries allocated for the @recs array. */ unsigned int sz_recs; /* Number of entries in the @recs array that are in use. */ unsigned int nr_recs; /* Inode walk function and data pointer. */ xfs_iwalk_fn iwalk_fn; xfs_inobt_walk_fn inobt_walk_fn; void *data; /* * Make it look like the inodes up to startino are free so that * bulkstat can start its inode iteration at the correct place without * needing to special case everywhere. */ unsigned int trim_start:1; /* Skip empty inobt records? */ unsigned int skip_empty:1; /* Drop the (hopefully empty) transaction when calling iwalk_fn. */ unsigned int drop_trans:1; }; /* * Loop over all clusters in a chunk for a given incore inode allocation btree * record. Do a readahead if there are any allocated inodes in that cluster. */ STATIC void xfs_iwalk_ichunk_ra( struct xfs_mount *mp, struct xfs_perag *pag, struct xfs_inobt_rec_incore *irec) { struct xfs_ino_geometry *igeo = M_IGEO(mp); xfs_agblock_t agbno; struct blk_plug plug; int i; /* inode chunk index */ agbno = XFS_AGINO_TO_AGBNO(mp, irec->ir_startino); blk_start_plug(&plug); for (i = 0; i < XFS_INODES_PER_CHUNK; i += igeo->inodes_per_cluster) { xfs_inofree_t imask; imask = xfs_inobt_maskn(i, igeo->inodes_per_cluster); if (imask & ~irec->ir_free) { xfs_buf_readahead(mp->m_ddev_targp, xfs_agbno_to_daddr(pag, agbno), igeo->blocks_per_cluster * mp->m_bsize, &xfs_inode_buf_ops); } agbno += igeo->blocks_per_cluster; } blk_finish_plug(&plug); } /* * Set the bits in @irec's free mask that correspond to the inodes before * @agino so that we skip them. This is how we restart an inode walk that was * interrupted in the middle of an inode record. */ STATIC void xfs_iwalk_adjust_start( xfs_agino_t agino, /* starting inode of chunk */ struct xfs_inobt_rec_incore *irec) /* btree record */ { int idx; /* index into inode chunk */ idx = agino - irec->ir_startino; irec->ir_free |= xfs_inobt_maskn(0, idx); irec->ir_freecount = hweight64(irec->ir_free); } /* Allocate memory for a walk. */ STATIC int xfs_iwalk_alloc( struct xfs_iwalk_ag *iwag) { size_t size; ASSERT(iwag->recs == NULL); iwag->nr_recs = 0; /* Allocate a prefetch buffer for inobt records. */ size = iwag->sz_recs * sizeof(struct xfs_inobt_rec_incore); iwag->recs = kmalloc(size, GFP_KERNEL | __GFP_RETRY_MAYFAIL); if (iwag->recs == NULL) return -ENOMEM; return 0; } /* Free memory we allocated for a walk. */ STATIC void xfs_iwalk_free( struct xfs_iwalk_ag *iwag) { kfree(iwag->recs); iwag->recs = NULL; } /* For each inuse inode in each cached inobt record, call our function. */ STATIC int xfs_iwalk_ag_recs( struct xfs_iwalk_ag *iwag) { struct xfs_mount *mp = iwag->mp; struct xfs_trans *tp = iwag->tp; struct xfs_perag *pag = iwag->pag; unsigned int i, j; int error; for (i = 0; i < iwag->nr_recs; i++) { struct xfs_inobt_rec_incore *irec = &iwag->recs[i]; trace_xfs_iwalk_ag_rec(pag, irec); if (xfs_pwork_want_abort(&iwag->pwork)) return 0; if (iwag->inobt_walk_fn) { error = iwag->inobt_walk_fn(mp, tp, pag_agno(pag), irec, iwag->data); if (error) return error; } if (!iwag->iwalk_fn) continue; for (j = 0; j < XFS_INODES_PER_CHUNK; j++) { if (xfs_pwork_want_abort(&iwag->pwork)) return 0; /* Skip if this inode is free */ if (XFS_INOBT_MASK(j) & irec->ir_free) continue; /* Otherwise call our function. */ error = iwag->iwalk_fn(mp, tp, xfs_agino_to_ino(pag, irec->ir_startino + j), iwag->data); if (error) return error; } } return 0; } /* Delete cursor and let go of AGI. */ static inline void xfs_iwalk_del_inobt( struct xfs_trans *tp, struct xfs_btree_cur **curpp, struct xfs_buf **agi_bpp, int error) { if (*curpp) { xfs_btree_del_cursor(*curpp, error); *curpp = NULL; } if (*agi_bpp) { xfs_trans_brelse(tp, *agi_bpp); *agi_bpp = NULL; } } /* * Set ourselves up for walking inobt records starting from a given point in * the filesystem. * * If caller passed in a nonzero start inode number, load the record from the * inobt and make the record look like all the inodes before agino are free so * that we skip them, and then move the cursor to the next inobt record. This * is how we support starting an iwalk in the middle of an inode chunk. * * If the caller passed in a start number of zero, move the cursor to the first * inobt record. * * The caller is responsible for cleaning up the cursor and buffer pointer * regardless of the error status. */ STATIC int xfs_iwalk_ag_start( struct xfs_iwalk_ag *iwag, xfs_agino_t agino, struct xfs_btree_cur **curpp, struct xfs_buf **agi_bpp, int *has_more) { struct xfs_mount *mp = iwag->mp; struct xfs_trans *tp = iwag->tp; struct xfs_perag *pag = iwag->pag; struct xfs_inobt_rec_incore *irec; int error; /* Set up a fresh cursor and empty the inobt cache. */ iwag->nr_recs = 0; error = xfs_ialloc_read_agi(pag, tp, 0, agi_bpp); if (error) return error; *curpp = xfs_inobt_init_cursor(pag, tp, *agi_bpp); /* Starting at the beginning of the AG? That's easy! */ if (agino == 0) return xfs_inobt_lookup(*curpp, 0, XFS_LOOKUP_GE, has_more); /* * Otherwise, we have to grab the inobt record where we left off, stuff * the record into our cache, and then see if there are more records. * We require a lookup cache of at least two elements so that the * caller doesn't have to deal with tearing down the cursor to walk the * records. */ error = xfs_inobt_lookup(*curpp, agino, XFS_LOOKUP_LE, has_more); if (error) return error; /* * If the LE lookup at @agino yields no records, jump ahead to the * inobt cursor increment to see if there are more records to process. */ if (!*has_more) goto out_advance; /* Get the record, should always work */ irec = &iwag->recs[iwag->nr_recs]; error = xfs_inobt_get_rec(*curpp, irec, has_more); if (error) return error; if (XFS_IS_CORRUPT(mp, *has_more != 1)) { xfs_btree_mark_sick(*curpp); return -EFSCORRUPTED; } iwag->lastino = xfs_agino_to_ino(pag, irec->ir_startino + XFS_INODES_PER_CHUNK - 1); /* * If the LE lookup yielded an inobt record before the cursor position, * skip it and see if there's another one after it. */ if (irec->ir_startino + XFS_INODES_PER_CHUNK <= agino) goto out_advance; /* * If agino fell in the middle of the inode record, make it look like * the inodes up to agino are free so that we don't return them again. */ if (iwag->trim_start) xfs_iwalk_adjust_start(agino, irec); /* * The prefetch calculation is supposed to give us a large enough inobt * record cache that grab_ichunk can stage a partial first record and * the loop body can cache a record without having to check for cache * space until after it reads an inobt record. */ iwag->nr_recs++; ASSERT(iwag->nr_recs < iwag->sz_recs); out_advance: return xfs_btree_increment(*curpp, 0, has_more); } /* * The inobt record cache is full, so preserve the inobt cursor state and * run callbacks on the cached inobt records. When we're done, restore the * cursor state to wherever the cursor would have been had the cache not been * full (and therefore we could've just incremented the cursor) if *@has_more * is true. On exit, *@has_more will indicate whether or not the caller should * try for more inode records. */ STATIC int xfs_iwalk_run_callbacks( struct xfs_iwalk_ag *iwag, struct xfs_btree_cur **curpp, struct xfs_buf **agi_bpp, int *has_more) { struct xfs_mount *mp = iwag->mp; xfs_agino_t next_agino; int error; next_agino = XFS_INO_TO_AGINO(mp, iwag->lastino) + 1; ASSERT(iwag->nr_recs > 0); /* Delete cursor but remember the last record we cached... */ xfs_iwalk_del_inobt(iwag->tp, curpp, agi_bpp, 0); ASSERT(next_agino >= iwag->recs[iwag->nr_recs - 1].ir_startino + XFS_INODES_PER_CHUNK); if (iwag->drop_trans) { xfs_trans_cancel(iwag->tp); iwag->tp = NULL; } error = xfs_iwalk_ag_recs(iwag); if (error) return error; /* ...empty the cache... */ iwag->nr_recs = 0; if (!has_more) return 0; if (iwag->drop_trans) iwag->tp = xfs_trans_alloc_empty(mp); /* ...and recreate the cursor just past where we left off. */ error = xfs_ialloc_read_agi(iwag->pag, iwag->tp, 0, agi_bpp); if (error) return error; *curpp = xfs_inobt_init_cursor(iwag->pag, iwag->tp, *agi_bpp); return xfs_inobt_lookup(*curpp, next_agino, XFS_LOOKUP_GE, has_more); } /* Walk all inodes in a single AG, from @iwag->startino to the end of the AG. */ STATIC int xfs_iwalk_ag( struct xfs_iwalk_ag *iwag) { struct xfs_mount *mp = iwag->mp; struct xfs_perag *pag = iwag->pag; struct xfs_buf *agi_bp = NULL; struct xfs_btree_cur *cur = NULL; xfs_agino_t agino; int has_more; int error = 0; /* Set up our cursor at the right place in the inode btree. */ ASSERT(pag_agno(pag) == XFS_INO_TO_AGNO(mp, iwag->startino)); agino = XFS_INO_TO_AGINO(mp, iwag->startino); error = xfs_iwalk_ag_start(iwag, agino, &cur, &agi_bp, &has_more); while (!error && has_more) { struct xfs_inobt_rec_incore *irec; xfs_ino_t rec_fsino; cond_resched(); if (xfs_pwork_want_abort(&iwag->pwork)) goto out; /* Fetch the inobt record. */ irec = &iwag->recs[iwag->nr_recs]; error = xfs_inobt_get_rec(cur, irec, &has_more); if (error || !has_more) break; /* Make sure that we always move forward. */ rec_fsino = xfs_agino_to_ino(pag, irec->ir_startino); if (iwag->lastino != NULLFSINO && XFS_IS_CORRUPT(mp, iwag->lastino >= rec_fsino)) { xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto out; } iwag->lastino = rec_fsino + XFS_INODES_PER_CHUNK - 1; /* No allocated inodes in this chunk; skip it. */ if (iwag->skip_empty && irec->ir_freecount == irec->ir_count) { error = xfs_btree_increment(cur, 0, &has_more); if (error) break; continue; } /* * Start readahead for this inode chunk in anticipation of * walking the inodes. */ if (iwag->iwalk_fn) xfs_iwalk_ichunk_ra(mp, pag, irec); /* * If there's space in the buffer for more records, increment * the btree cursor and grab more. */ if (++iwag->nr_recs < iwag->sz_recs) { error = xfs_btree_increment(cur, 0, &has_more); if (error || !has_more) break; continue; } /* * Otherwise, we need to save cursor state and run the callback * function on the cached records. The run_callbacks function * is supposed to return a cursor pointing to the record where * we would be if we had been able to increment like above. */ ASSERT(has_more); error = xfs_iwalk_run_callbacks(iwag, &cur, &agi_bp, &has_more); } if (iwag->nr_recs == 0 || error) goto out; /* Walk the unprocessed records in the cache. */ error = xfs_iwalk_run_callbacks(iwag, &cur, &agi_bp, &has_more); out: xfs_iwalk_del_inobt(iwag->tp, &cur, &agi_bp, error); return error; } /* * We experimentally determined that the reduction in ioctl call overhead * diminishes when userspace asks for more than 2048 inodes, so we'll cap * prefetch at this point. */ #define IWALK_MAX_INODE_PREFETCH (2048U) /* * Given the number of inodes to prefetch, set the number of inobt records that * we cache in memory, which controls the number of inodes we try to read * ahead. Set the maximum if @inodes == 0. */ static inline unsigned int xfs_iwalk_prefetch( unsigned int inodes) { unsigned int inobt_records; /* * If the caller didn't tell us the number of inodes they wanted, * assume the maximum prefetch possible for best performance. * Otherwise, cap prefetch at that maximum so that we don't start an * absurd amount of prefetch. */ if (inodes == 0) inodes = IWALK_MAX_INODE_PREFETCH; inodes = min(inodes, IWALK_MAX_INODE_PREFETCH); /* Round the inode count up to a full chunk. */ inodes = round_up(inodes, XFS_INODES_PER_CHUNK); /* * In order to convert the number of inodes to prefetch into an * estimate of the number of inobt records to cache, we require a * conversion factor that reflects our expectations of the average * loading factor of an inode chunk. Based on data gathered, most * (but not all) filesystems manage to keep the inode chunks totally * full, so we'll underestimate slightly so that our readahead will * still deliver the performance we want on aging filesystems: * * inobt = inodes / (INODES_PER_CHUNK * (4 / 5)); * * The funny math is to avoid integer division. */ inobt_records = (inodes * 5) / (4 * XFS_INODES_PER_CHUNK); /* * Allocate enough space to prefetch at least two inobt records so that * we can cache both the record where the iwalk started and the next * record. This simplifies the AG inode walk loop setup code. */ return max(inobt_records, 2U); } static int xfs_iwalk_args( struct xfs_iwalk_ag *iwag, unsigned int flags) { struct xfs_mount *mp = iwag->mp; xfs_agnumber_t start_agno; int error; start_agno = XFS_INO_TO_AGNO(iwag->mp, iwag->startino); ASSERT(start_agno < iwag->mp->m_sb.sb_agcount); ASSERT(!(flags & ~XFS_IWALK_FLAGS_ALL)); error = xfs_iwalk_alloc(iwag); if (error) return error; while ((iwag->pag = xfs_perag_next_from(mp, iwag->pag, start_agno))) { error = xfs_iwalk_ag(iwag); if (error || (flags & XFS_IWALK_SAME_AG)) { xfs_perag_rele(iwag->pag); break; } iwag->startino = XFS_AGINO_TO_INO(mp, pag_agno(iwag->pag) + 1, 0); } xfs_iwalk_free(iwag); return error; } /* * Walk all inodes in the filesystem starting from @startino. The @iwalk_fn * will be called for each allocated inode, being passed the inode's number and * @data. @max_prefetch controls how many inobt records' worth of inodes we * try to readahead. */ int xfs_iwalk( struct xfs_mount *mp, struct xfs_trans *tp, xfs_ino_t startino, unsigned int flags, xfs_iwalk_fn iwalk_fn, unsigned int inode_records, void *data) { struct xfs_iwalk_ag iwag = { .mp = mp, .tp = tp, .iwalk_fn = iwalk_fn, .data = data, .startino = startino, .sz_recs = xfs_iwalk_prefetch(inode_records), .trim_start = 1, .skip_empty = 1, .pwork = XFS_PWORK_SINGLE_THREADED, .lastino = NULLFSINO, }; return xfs_iwalk_args(&iwag, flags); } /* Run per-thread iwalk work. */ static int xfs_iwalk_ag_work( struct xfs_mount *mp, struct xfs_pwork *pwork) { struct xfs_iwalk_ag *iwag; int error = 0; iwag = container_of(pwork, struct xfs_iwalk_ag, pwork); if (xfs_pwork_want_abort(pwork)) goto out; error = xfs_iwalk_alloc(iwag); if (error) goto out; /* * Grab an empty transaction so that we can use its recursive buffer * locking abilities to detect cycles in the inobt without deadlocking. */ iwag->tp = xfs_trans_alloc_empty(mp); iwag->drop_trans = 1; error = xfs_iwalk_ag(iwag); if (iwag->tp) xfs_trans_cancel(iwag->tp); xfs_iwalk_free(iwag); out: xfs_perag_put(iwag->pag); kfree(iwag); return error; } /* * Walk all the inodes in the filesystem using multiple threads to process each * AG. */ int xfs_iwalk_threaded( struct xfs_mount *mp, xfs_ino_t startino, unsigned int flags, xfs_iwalk_fn iwalk_fn, unsigned int inode_records, bool polled, void *data) { xfs_agnumber_t start_agno = XFS_INO_TO_AGNO(mp, startino); struct xfs_pwork_ctl pctl; struct xfs_perag *pag = NULL; int error; ASSERT(start_agno < mp->m_sb.sb_agcount); ASSERT(!(flags & ~XFS_IWALK_FLAGS_ALL)); error = xfs_pwork_init(mp, &pctl, xfs_iwalk_ag_work, "xfs_iwalk"); if (error) return error; while ((pag = xfs_perag_next_from(mp, pag, start_agno))) { struct xfs_iwalk_ag *iwag; if (xfs_pwork_ctl_want_abort(&pctl)) break; iwag = kzalloc_obj(struct xfs_iwalk_ag, GFP_KERNEL | __GFP_NOFAIL); iwag->mp = mp; /* * perag is being handed off to async work, so take a passive * reference for the async work to release. */ iwag->pag = xfs_perag_hold(pag); iwag->iwalk_fn = iwalk_fn; iwag->data = data; iwag->startino = startino; iwag->sz_recs = xfs_iwalk_prefetch(inode_records); iwag->lastino = NULLFSINO; xfs_pwork_queue(&pctl, &iwag->pwork); startino = XFS_AGINO_TO_INO(mp, pag_agno(pag) + 1, 0); if (flags & XFS_IWALK_SAME_AG) break; } if (pag) xfs_perag_rele(pag); if (polled) xfs_pwork_poll(&pctl); return xfs_pwork_destroy(&pctl); } /* * Allow callers to cache up to a page's worth of inobt records. This reflects * the existing inumbers prefetching behavior. Since the inobt walk does not * itself do anything with the inobt records, we can set a fairly high limit * here. */ #define MAX_INOBT_WALK_PREFETCH \ (PAGE_SIZE / sizeof(struct xfs_inobt_rec_incore)) /* * Given the number of records that the user wanted, set the number of inobt * records that we buffer in memory. Set the maximum if @inobt_records == 0. */ static inline unsigned int xfs_inobt_walk_prefetch( unsigned int inobt_records) { /* * If the caller didn't tell us the number of inobt records they * wanted, assume the maximum prefetch possible for best performance. */ if (inobt_records == 0) inobt_records = MAX_INOBT_WALK_PREFETCH; /* * Allocate enough space to prefetch at least two inobt records so that * we can cache both the record where the iwalk started and the next * record. This simplifies the AG inode walk loop setup code. */ inobt_records = max(inobt_records, 2U); /* * Cap prefetch at that maximum so that we don't use an absurd amount * of memory. */ return min_t(unsigned int, inobt_records, MAX_INOBT_WALK_PREFETCH); } /* * Walk all inode btree records in the filesystem starting from @startino. The * @inobt_walk_fn will be called for each btree record, being passed the incore * record and @data. @max_prefetch controls how many inobt records we try to * cache ahead of time. */ int xfs_inobt_walk( struct xfs_mount *mp, struct xfs_trans *tp, xfs_ino_t startino, unsigned int flags, xfs_inobt_walk_fn inobt_walk_fn, unsigned int inobt_records, void *data) { struct xfs_iwalk_ag iwag = { .mp = mp, .tp = tp, .inobt_walk_fn = inobt_walk_fn, .data = data, .startino = startino, .sz_recs = xfs_inobt_walk_prefetch(inobt_records), .pwork = XFS_PWORK_SINGLE_THREADED, .lastino = NULLFSINO, }; return xfs_iwalk_args(&iwag, flags); }
1 2 14 16 16 13 2 11 6 6 2 4 4 32 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 /* * algif_rng: User-space interface for random number generators * * This file provides the user-space API for random number generators. * * Copyright (C) 2014, Stephan Mueller <smueller@chronox.de> * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, and the entire permission notice in its entirety, * including the disclaimer of warranties. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote * products derived from this software without specific prior * written permission. * * ALTERNATIVELY, this product may be distributed under the terms of * the GNU General Public License, in which case the provisions of the GPL2 * are required INSTEAD OF the above restrictions. (This clause is * necessary due to a potential bad interaction between the GPL and * the restrictions contained in a BSD-style copyright.) * * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ALL OF * WHICH ARE HEREBY DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE * USE OF THIS SOFTWARE, EVEN IF NOT ADVISED OF THE POSSIBILITY OF SUCH * DAMAGE. */ #include <linux/capability.h> #include <linux/module.h> #include <crypto/rng.h> #include <linux/random.h> #include <crypto/if_alg.h> #include <linux/net.h> #include <net/sock.h> MODULE_LICENSE("GPL"); MODULE_AUTHOR("Stephan Mueller <smueller@chronox.de>"); MODULE_DESCRIPTION("User-space interface for random number generators"); struct rng_ctx { #define MAXSIZE 128 unsigned int len; struct crypto_rng *drng; u8 *addtl; size_t addtl_len; }; struct rng_parent_ctx { struct crypto_rng *drng; u8 *entropy; }; static void rng_reset_addtl(struct rng_ctx *ctx) { kfree_sensitive(ctx->addtl); ctx->addtl = NULL; ctx->addtl_len = 0; } static int _rng_recvmsg(struct crypto_rng *drng, struct msghdr *msg, size_t len, u8 *addtl, size_t addtl_len) { int err = 0; int genlen = 0; u8 result[MAXSIZE]; if (len == 0) return 0; if (len > MAXSIZE) len = MAXSIZE; /* * although not strictly needed, this is a precaution against coding * errors */ memset(result, 0, len); /* * The enforcement of a proper seeding of an RNG is done within an * RNG implementation. Some RNGs (DRBG, krng) do not need specific * seeding as they automatically seed. The X9.31 DRNG will return * an error if it was not seeded properly. */ genlen = crypto_rng_generate(drng, addtl, addtl_len, result, len); if (genlen < 0) return genlen; err = memcpy_to_msg(msg, result, len); memzero_explicit(result, len); return err ? err : len; } static int rng_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, int flags) { struct sock *sk = sock->sk; struct alg_sock *ask = alg_sk(sk); struct rng_ctx *ctx = ask->private; return _rng_recvmsg(ctx->drng, msg, len, NULL, 0); } static int rng_test_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, int flags) { struct sock *sk = sock->sk; struct alg_sock *ask = alg_sk(sk); struct rng_ctx *ctx = ask->private; int ret; lock_sock(sock->sk); ret = _rng_recvmsg(ctx->drng, msg, len, ctx->addtl, ctx->addtl_len); rng_reset_addtl(ctx); release_sock(sock->sk); return ret; } static int rng_test_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) { int err; struct alg_sock *ask = alg_sk(sock->sk); struct rng_ctx *ctx = ask->private; lock_sock(sock->sk); if (len > MAXSIZE) { err = -EMSGSIZE; goto unlock; } rng_reset_addtl(ctx); ctx->addtl = kmalloc(len, GFP_KERNEL); if (!ctx->addtl) { err = -ENOMEM; goto unlock; } err = memcpy_from_msg(ctx->addtl, msg, len); if (err) { rng_reset_addtl(ctx); goto unlock; } ctx->addtl_len = len; unlock: release_sock(sock->sk); return err ? err : len; } static struct proto_ops algif_rng_ops = { .family = PF_ALG, .connect = sock_no_connect, .socketpair = sock_no_socketpair, .getname = sock_no_getname, .ioctl = sock_no_ioctl, .listen = sock_no_listen, .shutdown = sock_no_shutdown, .mmap = sock_no_mmap, .bind = sock_no_bind, .accept = sock_no_accept, .sendmsg = sock_no_sendmsg, .release = af_alg_release, .recvmsg = rng_recvmsg, }; static struct proto_ops __maybe_unused algif_rng_test_ops = { .family = PF_ALG, .connect = sock_no_connect, .socketpair = sock_no_socketpair, .getname = sock_no_getname, .ioctl = sock_no_ioctl, .listen = sock_no_listen, .shutdown = sock_no_shutdown, .mmap = sock_no_mmap, .bind = sock_no_bind, .accept = sock_no_accept, .release = af_alg_release, .recvmsg = rng_test_recvmsg, .sendmsg = rng_test_sendmsg, }; static void *rng_bind(const char *name, u32 type, u32 mask) { struct rng_parent_ctx *pctx; struct crypto_rng *rng; pctx = kzalloc_obj(*pctx); if (!pctx) return ERR_PTR(-ENOMEM); rng = crypto_alloc_rng(name, type, mask); if (IS_ERR(rng)) { kfree(pctx); return ERR_CAST(rng); } pctx->drng = rng; return pctx; } static void rng_release(void *private) { struct rng_parent_ctx *pctx = private; if (unlikely(!pctx)) return; crypto_free_rng(pctx->drng); kfree_sensitive(pctx->entropy); kfree_sensitive(pctx); } static void rng_sock_destruct(struct sock *sk) { struct alg_sock *ask = alg_sk(sk); struct rng_ctx *ctx = ask->private; rng_reset_addtl(ctx); sock_kfree_s(sk, ctx, ctx->len); af_alg_release_parent(sk); } static int rng_accept_parent(void *private, struct sock *sk) { struct rng_ctx *ctx; struct rng_parent_ctx *pctx = private; struct alg_sock *ask = alg_sk(sk); unsigned int len = sizeof(*ctx); ctx = sock_kmalloc(sk, len, GFP_KERNEL); if (!ctx) return -ENOMEM; memset(ctx, 0, len); ctx->len = len; /* * No seeding done at that point -- if multiple accepts are * done on one RNG instance, each resulting FD points to the same * state of the RNG. */ ctx->drng = pctx->drng; ask->private = ctx; sk->sk_destruct = rng_sock_destruct; /* * Non NULL pctx->entropy means that CAVP test has been initiated on * this socket, replace proto_ops algif_rng_ops with algif_rng_test_ops. */ if (IS_ENABLED(CONFIG_CRYPTO_USER_API_RNG_CAVP) && pctx->entropy) sk->sk_socket->ops = &algif_rng_test_ops; return 0; } static int rng_setkey(void *private, const u8 *seed, unsigned int seedlen) { struct rng_parent_ctx *pctx = private; /* * Check whether seedlen is of sufficient size is done in RNG * implementations. */ return crypto_rng_reset(pctx->drng, seed, seedlen); } static int __maybe_unused rng_setentropy(void *private, sockptr_t entropy, unsigned int len) { struct rng_parent_ctx *pctx = private; u8 *kentropy = NULL; if (!capable(CAP_SYS_ADMIN)) return -EACCES; if (pctx->entropy) return -EINVAL; if (len > MAXSIZE) return -EMSGSIZE; if (len) { kentropy = memdup_sockptr(entropy, len); if (IS_ERR(kentropy)) return PTR_ERR(kentropy); } crypto_rng_alg(pctx->drng)->set_ent(pctx->drng, kentropy, len); /* * Since rng doesn't perform any memory management for the entropy * buffer, save kentropy pointer to pctx now to free it after use. */ pctx->entropy = kentropy; return 0; } static const struct af_alg_type algif_type_rng = { .bind = rng_bind, .release = rng_release, .accept = rng_accept_parent, .setkey = rng_setkey, #ifdef CONFIG_CRYPTO_USER_API_RNG_CAVP .setentropy = rng_setentropy, #endif .ops = &algif_rng_ops, .name = "rng", .owner = THIS_MODULE }; static int __init rng_init(void) { return af_alg_register_type(&algif_type_rng); } static void __exit rng_exit(void) { int err = af_alg_unregister_type(&algif_type_rng); BUG_ON(err); } module_init(rng_init); module_exit(rng_exit);
714 76 386 4520 46 9 1347 5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_WAIT_BIT_H #define _LINUX_WAIT_BIT_H /* * Linux wait-bit related types and methods: */ #include <linux/wait.h> struct wait_bit_key { unsigned long *flags; int bit_nr; unsigned long timeout; }; struct wait_bit_queue_entry { struct wait_bit_key key; struct wait_queue_entry wq_entry; }; #define __WAIT_BIT_KEY_INITIALIZER(word, bit) \ { .flags = word, .bit_nr = bit, } typedef int wait_bit_action_f(struct wait_bit_key *key, int mode); void __wake_up_bit(struct wait_queue_head *wq_head, unsigned long *word, int bit); int __wait_on_bit(struct wait_queue_head *wq_head, struct wait_bit_queue_entry *wbq_entry, wait_bit_action_f *action, unsigned int mode); int __wait_on_bit_lock(struct wait_queue_head *wq_head, struct wait_bit_queue_entry *wbq_entry, wait_bit_action_f *action, unsigned int mode); void wake_up_bit(unsigned long *word, int bit); int out_of_line_wait_on_bit(unsigned long *word, int, wait_bit_action_f *action, unsigned int mode); int out_of_line_wait_on_bit_timeout(unsigned long *word, int, wait_bit_action_f *action, unsigned int mode, unsigned long timeout); int out_of_line_wait_on_bit_lock(unsigned long *word, int, wait_bit_action_f *action, unsigned int mode); struct wait_queue_head *bit_waitqueue(unsigned long *word, int bit); extern void __init wait_bit_init(void); int wake_bit_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync, void *key); #define DEFINE_WAIT_BIT(name, word, bit) \ struct wait_bit_queue_entry name = { \ .key = __WAIT_BIT_KEY_INITIALIZER(word, bit), \ .wq_entry = { \ .private = current, \ .func = wake_bit_function, \ .entry = \ LIST_HEAD_INIT((name).wq_entry.entry), \ }, \ } extern int bit_wait(struct wait_bit_key *key, int mode); extern int bit_wait_io(struct wait_bit_key *key, int mode); extern int bit_wait_timeout(struct wait_bit_key *key, int mode); /** * wait_on_bit - wait for a bit to be cleared * @word: the address containing the bit being waited on * @bit: the bit at that address being waited on * @mode: the task state to sleep in * * Wait for the given bit in an unsigned long or bitmap (see DECLARE_BITMAP()) * to be cleared. The clearing of the bit must be signalled with * wake_up_bit(), often as clear_and_wake_up_bit(). * * The process will wait on a waitqueue selected by hash from a shared * pool. It will only be woken on a wake_up for the target bit, even * if other processes on the same queue are waiting for other bits. * * Returned value will be zero if the bit was cleared in which case the * call has ACQUIRE semantics, or %-EINTR if the process received a * signal and the mode permitted wake up on that signal. */ static inline int wait_on_bit(unsigned long *word, int bit, unsigned mode) { might_sleep(); if (!test_bit_acquire(bit, word)) return 0; return out_of_line_wait_on_bit(word, bit, bit_wait, mode); } /** * wait_on_bit_io - wait for a bit to be cleared * @word: the address containing the bit being waited on * @bit: the bit at that address being waited on * @mode: the task state to sleep in * * Wait for the given bit in an unsigned long or bitmap (see DECLARE_BITMAP()) * to be cleared. The clearing of the bit must be signalled with * wake_up_bit(), often as clear_and_wake_up_bit(). * * This is similar to wait_on_bit(), but calls io_schedule() instead of * schedule() for the actual waiting. * * Returned value will be zero if the bit was cleared in which case the * call has ACQUIRE semantics, or %-EINTR if the process received a * signal and the mode permitted wake up on that signal. */ static inline int wait_on_bit_io(unsigned long *word, int bit, unsigned mode) { might_sleep(); if (!test_bit_acquire(bit, word)) return 0; return out_of_line_wait_on_bit(word, bit, bit_wait_io, mode); } /** * wait_on_bit_timeout - wait for a bit to be cleared or a timeout to elapse * @word: the address containing the bit being waited on * @bit: the bit at that address being waited on * @mode: the task state to sleep in * @timeout: timeout, in jiffies * * Wait for the given bit in an unsigned long or bitmap (see * DECLARE_BITMAP()) to be cleared, or for a timeout to expire. The * clearing of the bit must be signalled with wake_up_bit(), often as * clear_and_wake_up_bit(). * * This is similar to wait_on_bit(), except it also takes a timeout * parameter. * * Returned value will be zero if the bit was cleared in which case the * call has ACQUIRE semantics, or %-EINTR if the process received a * signal and the mode permitted wake up on that signal, or %-EAGAIN if the * timeout elapsed. */ static inline int wait_on_bit_timeout(unsigned long *word, int bit, unsigned mode, unsigned long timeout) { might_sleep(); if (!test_bit_acquire(bit, word)) return 0; return out_of_line_wait_on_bit_timeout(word, bit, bit_wait_timeout, mode, timeout); } /** * wait_on_bit_action - wait for a bit to be cleared * @word: the address containing the bit waited on * @bit: the bit at that address being waited on * @action: the function used to sleep, which may take special actions * @mode: the task state to sleep in * * Wait for the given bit in an unsigned long or bitmap (see DECLARE_BITMAP()) * to be cleared. The clearing of the bit must be signalled with * wake_up_bit(), often as clear_and_wake_up_bit(). * * This is similar to wait_on_bit(), but calls @action() instead of * schedule() for the actual waiting. * * Returned value will be zero if the bit was cleared in which case the * call has ACQUIRE semantics, or the error code returned by @action if * that call returned non-zero. */ static inline int wait_on_bit_action(unsigned long *word, int bit, wait_bit_action_f *action, unsigned mode) { might_sleep(); if (!test_bit_acquire(bit, word)) return 0; return out_of_line_wait_on_bit(word, bit, action, mode); } /** * wait_on_bit_lock - wait for a bit to be cleared, then set it * @word: the address containing the bit being waited on * @bit: the bit of the word being waited on and set * @mode: the task state to sleep in * * Wait for the given bit in an unsigned long or bitmap (see * DECLARE_BITMAP()) to be cleared. The clearing of the bit must be * signalled with wake_up_bit(), often as clear_and_wake_up_bit(). As * soon as it is clear, atomically set it and return. * * This is similar to wait_on_bit(), but sets the bit before returning. * * Returned value will be zero if the bit was successfully set in which * case the call has the same memory sequencing semantics as * test_and_clear_bit(), or %-EINTR if the process received a signal and * the mode permitted wake up on that signal. */ static inline int wait_on_bit_lock(unsigned long *word, int bit, unsigned mode) { might_sleep(); if (!test_and_set_bit(bit, word)) return 0; return out_of_line_wait_on_bit_lock(word, bit, bit_wait, mode); } /** * wait_on_bit_lock_io - wait for a bit to be cleared, then set it * @word: the address containing the bit being waited on * @bit: the bit of the word being waited on and set * @mode: the task state to sleep in * * Wait for the given bit in an unsigned long or bitmap (see * DECLARE_BITMAP()) to be cleared. The clearing of the bit must be * signalled with wake_up_bit(), often as clear_and_wake_up_bit(). As * soon as it is clear, atomically set it and return. * * This is similar to wait_on_bit_lock(), but calls io_schedule() instead * of schedule(). * * Returns zero if the bit was (eventually) found to be clear and was * set. Returns non-zero if a signal was delivered to the process and * the @mode allows that signal to wake the process. */ static inline int wait_on_bit_lock_io(unsigned long *word, int bit, unsigned mode) { might_sleep(); if (!test_and_set_bit(bit, word)) return 0; return out_of_line_wait_on_bit_lock(word, bit, bit_wait_io, mode); } /** * wait_on_bit_lock_action - wait for a bit to be cleared, then set it * @word: the address containing the bit being waited on * @bit: the bit of the word being waited on and set * @action: the function used to sleep, which may take special actions * @mode: the task state to sleep in * * This is similar to wait_on_bit_lock(), but calls @action() instead of * schedule() for the actual waiting. * * Returned value will be zero if the bit was successfully set in which * case the call has the same memory sequencing semantics as * test_and_clear_bit(), or the error code returned by @action if that * call returned non-zero. */ static inline int wait_on_bit_lock_action(unsigned long *word, int bit, wait_bit_action_f *action, unsigned mode) { might_sleep(); if (!test_and_set_bit(bit, word)) return 0; return out_of_line_wait_on_bit_lock(word, bit, action, mode); } extern void init_wait_var_entry(struct wait_bit_queue_entry *wbq_entry, void *var, int flags); extern void wake_up_var(void *var); extern wait_queue_head_t *__var_waitqueue(void *p); #define ___wait_var_event(var, condition, state, exclusive, ret, cmd) \ ({ \ __label__ __out; \ struct wait_queue_head *__wq_head = __var_waitqueue(var); \ struct wait_bit_queue_entry __wbq_entry; \ long __ret = ret; /* explicit shadow */ \ \ init_wait_var_entry(&__wbq_entry, var, \ exclusive ? WQ_FLAG_EXCLUSIVE : 0); \ for (;;) { \ long __int = prepare_to_wait_event(__wq_head, \ &__wbq_entry.wq_entry, \ state); \ if (condition) \ break; \ \ if (___wait_is_interruptible(state) && __int) { \ __ret = __int; \ goto __out; \ } \ \ cmd; \ } \ finish_wait(__wq_head, &__wbq_entry.wq_entry); \ __out: __ret; \ }) #define __wait_var_event(var, condition) \ ___wait_var_event(var, condition, TASK_UNINTERRUPTIBLE, 0, 0, \ schedule()) #define __wait_var_event_io(var, condition) \ ___wait_var_event(var, condition, TASK_UNINTERRUPTIBLE, 0, 0, \ io_schedule()) /** * wait_var_event - wait for a variable to be updated and notified * @var: the address of variable being waited on * @condition: the condition to wait for * * Wait for a @condition to be true, only re-checking when a wake up is * received for the given @var (an arbitrary kernel address which need * not be directly related to the given condition, but usually is). * * The process will wait on a waitqueue selected by hash from a shared * pool. It will only be woken on a wake_up for the given address. * * The condition should normally use smp_load_acquire() or a similarly * ordered access to ensure that any changes to memory made before the * condition became true will be visible after the wait completes. */ #define wait_var_event(var, condition) \ do { \ might_sleep(); \ if (condition) \ break; \ __wait_var_event(var, condition); \ } while (0) /** * wait_var_event_io - wait for a variable to be updated and notified * @var: the address of variable being waited on * @condition: the condition to wait for * * Wait for an IO related @condition to be true, only re-checking when a * wake up is received for the given @var (an arbitrary kernel address * which need not be directly related to the given condition, but * usually is). * * The process will wait on a waitqueue selected by hash from a shared * pool. It will only be woken on a wake_up for the given address. * * This is similar to wait_var_event(), but calls io_schedule() instead * of schedule(). * * The condition should normally use smp_load_acquire() or a similarly * ordered access to ensure that any changes to memory made before the * condition became true will be visible after the wait completes. */ #define wait_var_event_io(var, condition) \ do { \ might_sleep(); \ if (condition) \ break; \ __wait_var_event_io(var, condition); \ } while (0) #define __wait_var_event_killable(var, condition) \ ___wait_var_event(var, condition, TASK_KILLABLE, 0, 0, \ schedule()) /** * wait_var_event_killable - wait for a variable to be updated and notified * @var: the address of variable being waited on * @condition: the condition to wait for * * Wait for a @condition to be true or a fatal signal to be received, * only re-checking the condition when a wake up is received for the given * @var (an arbitrary kernel address which need not be directly related * to the given condition, but usually is). * * This is similar to wait_var_event() but returns a value which is * 0 if the condition became true, or %-ERESTARTSYS if a fatal signal * was received. * * The condition should normally use smp_load_acquire() or a similarly * ordered access to ensure that any changes to memory made before the * condition became true will be visible after the wait completes. */ #define wait_var_event_killable(var, condition) \ ({ \ int __ret = 0; \ might_sleep(); \ if (!(condition)) \ __ret = __wait_var_event_killable(var, condition); \ __ret; \ }) #define __wait_var_event_timeout(var, condition, timeout) \ ___wait_var_event(var, ___wait_cond_timeout(condition), \ TASK_UNINTERRUPTIBLE, 0, timeout, \ __ret = schedule_timeout(__ret)) /** * wait_var_event_timeout - wait for a variable to be updated or a timeout to expire * @var: the address of variable being waited on * @condition: the condition to wait for * @timeout: maximum time to wait in jiffies * * Wait for a @condition to be true or a timeout to expire, only * re-checking the condition when a wake up is received for the given * @var (an arbitrary kernel address which need not be directly related * to the given condition, but usually is). * * This is similar to wait_var_event() but returns a value which is 0 if * the timeout expired and the condition was still false, or the * remaining time left in the timeout (but at least 1) if the condition * was found to be true. * * The condition should normally use smp_load_acquire() or a similarly * ordered access to ensure that any changes to memory made before the * condition became true will be visible after the wait completes. */ #define wait_var_event_timeout(var, condition, timeout) \ ({ \ long __ret = timeout; \ might_sleep(); \ if (!___wait_cond_timeout(condition)) \ __ret = __wait_var_event_timeout(var, condition, timeout); \ __ret; \ }) #define __wait_var_event_interruptible(var, condition) \ ___wait_var_event(var, condition, TASK_INTERRUPTIBLE, 0, 0, \ schedule()) /** * wait_var_event_killable - wait for a variable to be updated and notified * @var: the address of variable being waited on * @condition: the condition to wait for * * Wait for a @condition to be true or a signal to be received, only * re-checking the condition when a wake up is received for the given * @var (an arbitrary kernel address which need not be directly related * to the given condition, but usually is). * * This is similar to wait_var_event() but returns a value which is 0 if * the condition became true, or %-ERESTARTSYS if a signal was received. * * The condition should normally use smp_load_acquire() or a similarly * ordered access to ensure that any changes to memory made before the * condition became true will be visible after the wait completes. */ #define wait_var_event_interruptible(var, condition) \ ({ \ int __ret = 0; \ might_sleep(); \ if (!(condition)) \ __ret = __wait_var_event_interruptible(var, condition); \ __ret; \ }) /** * wait_var_event_any_lock - wait for a variable to be updated under a lock * @var: the address of the variable being waited on * @condition: condition to wait for * @lock: the object that is locked to protect updates to the variable * @type: prefix on lock and unlock operations * @state: waiting state, %TASK_UNINTERRUPTIBLE etc. * * Wait for a condition which can only be reliably tested while holding * a lock. The variables assessed in the condition will normal be updated * under the same lock, and the wake up should be signalled with * wake_up_var_locked() under the same lock. * * This is similar to wait_var_event(), but assumes a lock is held * while calling this function and while updating the variable. * * This must be called while the given lock is held and the lock will be * dropped when schedule() is called to wait for a wake up, and will be * reclaimed before testing the condition again. The functions used to * unlock and lock the object are constructed by appending _unlock and _lock * to @type. * * Return %-ERESTARTSYS if a signal arrives which is allowed to interrupt * the wait according to @state. */ #define wait_var_event_any_lock(var, condition, lock, type, state) \ ({ \ int __ret = 0; \ if (!(condition)) \ __ret = ___wait_var_event(var, condition, state, 0, 0, \ type ## _unlock(lock); \ schedule(); \ type ## _lock(lock)); \ __ret; \ }) /** * wait_var_event_spinlock - wait for a variable to be updated under a spinlock * @var: the address of the variable being waited on * @condition: condition to wait for * @lock: the spinlock which protects updates to the variable * * Wait for a condition which can only be reliably tested while holding * a spinlock. The variables assessed in the condition will normal be updated * under the same spinlock, and the wake up should be signalled with * wake_up_var_locked() under the same spinlock. * * This is similar to wait_var_event(), but assumes a spinlock is held * while calling this function and while updating the variable. * * This must be called while the given lock is held and the lock will be * dropped when schedule() is called to wait for a wake up, and will be * reclaimed before testing the condition again. */ #define wait_var_event_spinlock(var, condition, lock) \ wait_var_event_any_lock(var, condition, lock, spin, TASK_UNINTERRUPTIBLE) /** * wait_var_event_mutex - wait for a variable to be updated under a mutex * @var: the address of the variable being waited on * @condition: condition to wait for * @mutex: the mutex which protects updates to the variable * * Wait for a condition which can only be reliably tested while holding * a mutex. The variables assessed in the condition will normal be * updated under the same mutex, and the wake up should be signalled * with wake_up_var_locked() under the same mutex. * * This is similar to wait_var_event(), but assumes a mutex is held * while calling this function and while updating the variable. * * This must be called while the given mutex is held and the mutex will be * dropped when schedule() is called to wait for a wake up, and will be * reclaimed before testing the condition again. */ #define wait_var_event_mutex(var, condition, lock) \ wait_var_event_any_lock(var, condition, lock, mutex, TASK_UNINTERRUPTIBLE) /** * wake_up_var_protected - wake up waiters for a variable asserting that it is safe * @var: the address of the variable being waited on * @cond: the condition which afirms this is safe * * When waking waiters which use wait_var_event_any_lock() the waker must be * holding the reelvant lock to avoid races. This version of wake_up_var() * asserts that the relevant lock is held and so no barrier is needed. * The @cond is only tested when CONFIG_LOCKDEP is enabled. */ #define wake_up_var_protected(var, cond) \ do { \ lockdep_assert(cond); \ wake_up_var(var); \ } while (0) /** * wake_up_var_locked - wake up waiters for a variable while holding a spinlock or mutex * @var: the address of the variable being waited on * @lock: The spinlock or mutex what protects the variable * * Send a wake up for the given variable which should be waited for with * wait_var_event_spinlock() or wait_var_event_mutex(). Unlike wake_up_var(), * no extra barriers are needed as the locking provides sufficient sequencing. */ #define wake_up_var_locked(var, lock) \ wake_up_var_protected(var, lockdep_is_held(lock)) /** * clear_and_wake_up_bit - clear a bit and wake up anyone waiting on that bit * @bit: the bit of the word being waited on * @word: the address containing the bit being waited on * * The designated bit is cleared and any tasks waiting in wait_on_bit() * or similar will be woken. This call has RELEASE semantics so that * any changes to memory made before this call are guaranteed to be visible * after the corresponding wait_on_bit() completes. */ static inline void clear_and_wake_up_bit(int bit, unsigned long *word) { clear_bit_unlock(bit, word); /* See wake_up_bit() for which memory barrier you need to use. */ smp_mb__after_atomic(); wake_up_bit(word, bit); } /** * test_and_clear_wake_up_bit - clear a bit if it was set: wake up anyone waiting on that bit * @bit: the bit of the word being waited on * @word: the address of memory containing that bit * * If the bit is set and can be atomically cleared, any tasks waiting in * wait_on_bit() or similar will be woken. This call has the same * complete ordering semantics as test_and_clear_bit(). Any changes to * memory made before this call are guaranteed to be visible after the * corresponding wait_on_bit() completes. * * Returns %true if the bit was successfully set and the wake up was sent. */ static inline bool test_and_clear_wake_up_bit(int bit, unsigned long *word) { if (!test_and_clear_bit(bit, word)) return false; /* no extra barrier required */ wake_up_bit(word, bit); return true; } /** * atomic_dec_and_wake_up - decrement an atomic_t and if zero, wake up waiters * @var: the variable to dec and test * * Decrements the atomic variable and if it reaches zero, send a wake_up to any * processes waiting on the variable. * * This function has the same complete ordering semantics as atomic_dec_and_test. * * Returns %true is the variable reaches zero and the wake up was sent. */ static inline bool atomic_dec_and_wake_up(atomic_t *var) { if (!atomic_dec_and_test(var)) return false; /* No extra barrier required */ wake_up_var(var); return true; } /** * store_release_wake_up - update a variable and send a wake_up * @var: the address of the variable to be updated and woken * @val: the value to store in the variable. * * Store the given value in the variable send a wake up to any tasks * waiting on the variable. All necessary barriers are included to ensure * the task calling wait_var_event() sees the new value and all values * written to memory before this call. */ #define store_release_wake_up(var, val) \ do { \ smp_store_release(var, val); \ smp_mb(); \ wake_up_var(var); \ } while (0) #endif /* _LINUX_WAIT_BIT_H */
254 256 255 77 252 254 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 // SPDX-License-Identifier: GPL-2.0 #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/once.h> #include <linux/random.h> #include <linux/module.h> struct once_work { struct work_struct work; struct static_key_true *key; struct module *module; }; static void once_deferred(struct work_struct *w) { struct once_work *work; work = container_of(w, struct once_work, work); BUG_ON(!static_key_enabled(work->key)); static_branch_disable(work->key); module_put(work->module); kfree(work); } static void once_disable_jump(struct static_key_true *key, struct module *mod) { struct once_work *w; w = kmalloc_obj(*w, GFP_ATOMIC); if (!w) return; INIT_WORK(&w->work, once_deferred); w->key = key; w->module = mod; __module_get(mod); schedule_work(&w->work); } static DEFINE_SPINLOCK(once_lock); bool __do_once_start(bool *done, unsigned long *flags) __acquires(once_lock) { spin_lock_irqsave(&once_lock, *flags); if (*done) { spin_unlock_irqrestore(&once_lock, *flags); /* Keep sparse happy by restoring an even lock count on * this lock. In case we return here, we don't call into * __do_once_done but return early in the DO_ONCE() macro. */ __acquire(once_lock); return false; } return true; } EXPORT_SYMBOL(__do_once_start); void __do_once_done(bool *done, struct static_key_true *once_key, unsigned long *flags, struct module *mod) __releases(once_lock) { *done = true; spin_unlock_irqrestore(&once_lock, *flags); once_disable_jump(once_key, mod); } EXPORT_SYMBOL(__do_once_done); static DEFINE_MUTEX(once_mutex); bool __do_once_sleepable_start(bool *done) __acquires(once_mutex) { mutex_lock(&once_mutex); if (*done) { mutex_unlock(&once_mutex); /* Keep sparse happy by restoring an even lock count on * this mutex. In case we return here, we don't call into * __do_once_done but return early in the DO_ONCE_SLEEPABLE() macro. */ __acquire(once_mutex); return false; } return true; } EXPORT_SYMBOL(__do_once_sleepable_start); void __do_once_sleepable_done(bool *done, struct static_key_true *once_key, struct module *mod) __releases(once_mutex) { *done = true; mutex_unlock(&once_mutex); static_branch_disable(once_key); } EXPORT_SYMBOL(__do_once_sleepable_done);
51 51 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _SCSI_DISK_H #define _SCSI_DISK_H /* * More than enough for everybody ;) The huge number of majors * is a leftover from 16bit dev_t days, we don't really need that * much numberspace. */ #define SD_MAJORS 16 /* * Time out in seconds for disks and Magneto-opticals (which are slower). */ #define SD_TIMEOUT (30 * HZ) #define SD_MOD_TIMEOUT (75 * HZ) /* * Flush timeout is a multiplier over the standard device timeout which is * user modifiable via sysfs but initially set to SD_TIMEOUT */ #define SD_FLUSH_TIMEOUT_MULTIPLIER 2 #define SD_WRITE_SAME_TIMEOUT (120 * HZ) /* * Number of allowed retries */ #define SD_MAX_RETRIES 5 #define SD_PASSTHROUGH_RETRIES 1 #define SD_MAX_MEDIUM_TIMEOUTS 2 /* * Size of the initial data buffer for mode and read capacity data */ #define SD_BUF_SIZE 512 /* * Number of sectors at the end of the device to avoid multi-sector * accesses to in the case of last_sector_bug */ #define SD_LAST_BUGGY_SECTORS 8 enum { SD_EXT_CDB_SIZE = 32, /* Extended CDB size */ SD_MEMPOOL_SIZE = 2, /* CDB pool size */ }; enum { SD_DEF_XFER_BLOCKS = 0xffff, SD_MAX_XFER_BLOCKS = 0xffffffff, SD_MAX_WS10_BLOCKS = 0xffff, SD_MAX_WS16_BLOCKS = 0x7fffff, }; enum { SD_LBP_FULL = 0, /* Full logical block provisioning */ SD_LBP_UNMAP, /* Use UNMAP command */ SD_LBP_WS16, /* Use WRITE SAME(16) with UNMAP bit */ SD_LBP_WS10, /* Use WRITE SAME(10) with UNMAP bit */ SD_LBP_ZERO, /* Use WRITE SAME(10) with zero payload */ SD_LBP_DISABLE, /* Discard disabled due to failed cmd */ }; enum { SD_ZERO_WRITE = 0, /* Use WRITE(10/16) command */ SD_ZERO_WS, /* Use WRITE SAME(10/16) command */ SD_ZERO_WS16_UNMAP, /* Use WRITE SAME(16) with UNMAP */ SD_ZERO_WS10_UNMAP, /* Use WRITE SAME(10) with UNMAP */ }; /** * struct zoned_disk_info - Specific properties of a ZBC SCSI device. * @nr_zones: number of zones. * @zone_blocks: number of logical blocks per zone. * * This data structure holds the ZBC SCSI device properties that are retrieved * twice: a first time before the gendisk capacity is known and a second time * after the gendisk capacity is known. */ struct zoned_disk_info { u32 nr_zones; u32 zone_blocks; }; struct scsi_disk { struct scsi_device *device; /* * disk_dev is used to show attributes in /sys/class/scsi_disk/, * but otherwise not really needed. Do not use for refcounting. */ struct device disk_dev; struct gendisk *disk; struct opal_dev *opal_dev; #ifdef CONFIG_BLK_DEV_ZONED /* Updated during revalidation before the gendisk capacity is known. */ struct zoned_disk_info early_zone_info; /* Updated during revalidation after the gendisk capacity is known. */ struct zoned_disk_info zone_info; u32 zones_optimal_open; u32 zones_optimal_nonseq; u32 zones_max_open; /* * Either zero or a power of two. If not zero it means that the offset * between zone starting LBAs is constant. */ u32 zone_starting_lba_gran; #endif atomic_t openers; sector_t capacity; /* size in logical blocks */ int max_retries; u32 min_xfer_blocks; u32 max_xfer_blocks; u32 opt_xfer_blocks; u32 max_ws_blocks; u32 max_unmap_blocks; u32 unmap_granularity; u32 unmap_alignment; u32 max_atomic; u32 atomic_alignment; u32 atomic_granularity; u32 max_atomic_with_boundary; u32 max_atomic_boundary; u32 index; unsigned int physical_block_size; unsigned int max_medium_access_timeouts; unsigned int medium_access_timed_out; /* number of permanent streams */ u16 permanent_stream_count; u8 media_present; u8 write_prot; u8 protection_type;/* Data Integrity Field */ u8 provisioning_mode; u8 zeroing_mode; u8 nr_actuators; /* Number of actuators */ bool suspended; /* Disk is suspended (stopped) */ unsigned ATO : 1; /* state of disk ATO bit */ unsigned cache_override : 1; /* temp override of WCE,RCD */ unsigned WCE : 1; /* state of disk WCE bit */ unsigned RCD : 1; /* state of disk RCD bit, unused */ unsigned DPOFUA : 1; /* state of disk DPOFUA bit */ unsigned first_scan : 1; unsigned lbpme : 1; unsigned lbprz : 1; unsigned lbpu : 1; unsigned lbpws : 1; unsigned lbpws10 : 1; unsigned lbpvpd : 1; unsigned ws10 : 1; unsigned ws16 : 1; unsigned rc_basis: 2; unsigned zoned: 2; unsigned urswrz : 1; unsigned security : 1; unsigned ignore_medium_access_errors : 1; unsigned rscs : 1; /* reduced stream control support */ unsigned use_atomic_write_boundary : 1; }; #define to_scsi_disk(obj) container_of(obj, struct scsi_disk, disk_dev) static inline struct scsi_disk *scsi_disk(struct gendisk *disk) { return disk->private_data; } #define sd_printk(prefix, sdsk, fmt, a...) \ (sdsk)->disk ? \ sdev_prefix_printk(prefix, (sdsk)->device, \ (sdsk)->disk->disk_name, fmt, ##a) : \ sdev_printk(prefix, (sdsk)->device, fmt, ##a) #define sd_first_printk(prefix, sdsk, fmt, a...) \ do { \ if ((sdsk)->first_scan) \ sd_printk(prefix, sdsk, fmt, ##a); \ } while (0) static inline int scsi_medium_access_command(struct scsi_cmnd *scmd) { switch (scmd->cmnd[0]) { case READ_6: case READ_10: case READ_12: case READ_16: case SYNCHRONIZE_CACHE: case VERIFY: case VERIFY_12: case VERIFY_16: case WRITE_6: case WRITE_10: case WRITE_12: case WRITE_16: case WRITE_SAME: case WRITE_SAME_16: case UNMAP: return 1; case VARIABLE_LENGTH_CMD: switch (scmd->cmnd[9]) { case READ_32: case VERIFY_32: case WRITE_32: case WRITE_SAME_32: return 1; } } return 0; } static inline sector_t logical_to_sectors(struct scsi_device *sdev, sector_t blocks) { return blocks << (ilog2(sdev->sector_size) - 9); } static inline unsigned int logical_to_bytes(struct scsi_device *sdev, sector_t blocks) { return blocks * sdev->sector_size; } static inline sector_t bytes_to_logical(struct scsi_device *sdev, unsigned int bytes) { return bytes >> ilog2(sdev->sector_size); } static inline sector_t sectors_to_logical(struct scsi_device *sdev, sector_t sector) { return sector >> (ilog2(sdev->sector_size) - 9); } void sd_dif_config_host(struct scsi_disk *sdkp, struct queue_limits *lim); #ifdef CONFIG_BLK_DEV_ZONED int sd_zbc_read_zones(struct scsi_disk *sdkp, struct queue_limits *lim, u8 buf[SD_BUF_SIZE]); int sd_zbc_revalidate_zones(struct scsi_disk *sdkp); blk_status_t sd_zbc_setup_zone_mgmt_cmnd(struct scsi_cmnd *cmd, unsigned char op, bool all); unsigned int sd_zbc_complete(struct scsi_cmnd *cmd, unsigned int good_bytes, struct scsi_sense_hdr *sshdr); int sd_zbc_report_zones(struct gendisk *disk, sector_t sector, unsigned int nr_zones, struct blk_report_zones_args *args); #else /* CONFIG_BLK_DEV_ZONED */ static inline int sd_zbc_read_zones(struct scsi_disk *sdkp, struct queue_limits *lim, u8 buf[SD_BUF_SIZE]) { return 0; } static inline int sd_zbc_revalidate_zones(struct scsi_disk *sdkp) { return 0; } static inline blk_status_t sd_zbc_setup_zone_mgmt_cmnd(struct scsi_cmnd *cmd, unsigned char op, bool all) { return BLK_STS_TARGET; } static inline unsigned int sd_zbc_complete(struct scsi_cmnd *cmd, unsigned int good_bytes, struct scsi_sense_hdr *sshdr) { return good_bytes; } #define sd_zbc_report_zones NULL #endif /* CONFIG_BLK_DEV_ZONED */ void sd_print_sense_hdr(struct scsi_disk *sdkp, struct scsi_sense_hdr *sshdr); void sd_print_result(const struct scsi_disk *sdkp, const char *msg, int result); #endif /* _SCSI_DISK_H */
5806 346 26 251 9870 3953 85 12 4 19988 27 4505 4264 3808 4263 400 689 744 12 176 9999 9990 9991 10004 6702 6589 221 2445 1 1266 843 257 588 3379 9802 6552 6555 2433 1446 1794 776 783 120 18 16 519 158 3970 641 2 1 1 2 106 2 100 6 104 104 98 106 7 103 14905 73 56 14923 4 578 9253 9734 4075 9420 578 575 578 578 574 14 5 14 85 85 3953 3921 80 81 3918 4 3915 3918 85 3897 3934 3949 3955 5 5 253 3997 55 403 103 94 64 102 102 103 103 82 19 3985 3925 261 3984 3986 3985 205 205 205 16 15 16 16 16 4049 4053 3960 3975 560 4055 243 4093 244 5 4 3 3815 3819 3816 4260 4263 4027 3917 113 4029 3960 1977 113 113 113 574 354 232 573 2 568 574 328 242 42 249 10 247 8 630 630 629 1 630 7 7 616 4496 4501 4402 4398 638 640 639 635 9 9 625 14 18357 18360 17387 4495 20873 14271 18350 3 19971 19988 100 52 31 27 5 5 5 22 22 223 29 224 223 224 25 223 224 209 224 224 3 2 3 2 4238 4239 4237 4239 4205 374 4184 416 416 416 348 35 49 414 358 57 413 9 405 414 414 414 358 57 414 414 414 9 405 31 383 2 7 9 82 355 414 414 1 5 3 65 1 4 3 56 9 6 12 33 27 5 5 5 2 3 5 149 846 1 26 1 73 80 80 73 1 15 79 92 91 244 27 88 88 225 225 225 225 39 168 190 186 6 32 165 186 184 2 35 153 17 17 17 14 8 346 200 72 24 31 31 310 4 299 81 7 306 306 19 20 10 7 4 92 25 84 19 3816 3815 3819 3815 4 1 3 3817 3796 78 3797 78 3 3 3805 15 15 3818 3815 3816 3816 3818 2 3819 3819 3814 2 15 3819 1 13 3945 3 16 3954 3952 1 3955 3946 26 26 3955 5 86 3946 3 3940 3946 26 3951 3950 75 3950 55 3 74 3806 2 40 20 1 17 7 24 86 4 8 7 102 4 100 35 65 60 1 59 11 1 4 50 7 59 7 36 12 32 39 39 12 32 31 32 6 6 2 4 1 4 1 5 6 6 2 5 64 1 1 62 61 12 6 39 6 65 65 1 2 369 14 2 2 13 4057 657 3322 172 172 6 8 14 14 13 45 91 325 15 1 76 244 8 8 1 1 3 1 1 1 4 5 2 2 24 11 1 1 4 2 5 18 15 3 51 12 4 3 3 1 14 3 1 18 19 10 1 24 22 18 37 8 1 7 75 3832 4 3835 4 3833 5661 1974 1 3852 5658 6081 5900 196 6080 1373 4780 357 5665 6 6 6 6 7 7 7 27 2 27 27 27 640 6010 2 1 1942 6449 569 6000 6 104 3 5613 844 6294 140 20 333 60 40 8 6086 187 6452 374 345 28 372 567 567 902 756 137 19 153 155 153 155 152 155 155 153 155 306 1 305 6528 446 6203 1836 4904 6513 6012 6515 84 1 1 11 71 7 3 1 7 1 63 2 65 65 65 62 76 63 3 16 44 85 1 1 82 2 17 53 11 70 2 14 17 49 14 26 17 15 28 1 2 4 2 1 1 23 5 7 2 1 6 9 9 27 37 8 3 29 30 16 15 23 31 30 31 14 12 5 6 7 6 27 21 27 27 3 24 26 40 3 38 6 21 11 38 31 27 6 24 10 41 10 4 1 3 2 2 2 6 3 2 2 2 16 3 40 55 43 43 79 1 2 1 65 53 16 68 81 1 37 43 35 4 40 43 109 107 109 1 955 957 208 135 135 85 127 127 144 32 31 32 2 2 1 2 2 3911 3910 2 2 12 12 19 2 17 19 31 2 2 2 25 25 164 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 5321 5322 5323 5324 5325 5326 5327 5328 5329 5330 5331 5332 5333 5334 5335 5336 5337 5338 5339 5340 5341 5342 5343 5344 5345 5346 5347 5348 5349 5350 5351 5352 5353 5354 5355 5356 5357 5358 5359 5360 5361 5362 5363 5364 5365 5366 5367 5368 5369 5370 5371 5372 5373 5374 5375 5376 5377 5378 5379 5380 5381 5382 5383 5384 5385 5386 5387 5388 5389 5390 5391 5392 5393 5394 5395 5396 5397 5398 5399 5400 5401 5402 5403 5404 5405 5406 5407 5408 5409 5410 5411 5412 5413 5414 5415 5416 5417 5418 5419 5420 5421 5422 5423 5424 5425 5426 5427 5428 5429 5430 5431 5432 5433 5434 5435 5436 5437 5438 5439 5440 5441 5442 5443 5444 5445 5446 5447 5448 5449 5450 5451 5452 5453 5454 5455 5456 5457 5458 5459 5460 5461 5462 5463 5464 5465 5466 5467 5468 5469 5470 5471 5472 5473 5474 5475 5476 5477 5478 5479 5480 5481 5482 5483 5484 5485 5486 5487 5488 5489 5490 5491 5492 5493 5494 5495 5496 5497 5498 5499 5500 5501 5502 5503 5504 5505 5506 5507 5508 5509 5510 5511 5512 5513 5514 5515 5516 5517 5518 5519 5520 5521 5522 5523 5524 5525 5526 5527 5528 5529 5530 5531 5532 5533 5534 5535 5536 5537 5538 5539 5540 5541 5542 5543 5544 5545 5546 5547 5548 5549 5550 5551 5552 5553 5554 5555 5556 5557 5558 5559 5560 5561 5562 5563 5564 5565 5566 5567 5568 5569 5570 5571 5572 5573 5574 5575 5576 5577 5578 5579 5580 5581 5582 5583 5584 5585 5586 5587 5588 5589 5590 5591 5592 5593 5594 5595 5596 5597 5598 5599 5600 5601 5602 5603 5604 5605 5606 5607 5608 5609 5610 5611 5612 5613 5614 5615 5616 5617 5618 5619 5620 5621 5622 5623 5624 5625 5626 5627 5628 5629 5630 5631 5632 5633 5634 5635 5636 5637 5638 5639 5640 5641 5642 5643 5644 5645 5646 5647 5648 5649 5650 5651 5652 5653 5654 5655 5656 5657 5658 5659 5660 5661 5662 5663 5664 5665 5666 5667 5668 5669 5670 5671 5672 5673 5674 5675 5676 5677 5678 5679 5680 5681 5682 5683 5684 5685 5686 5687 5688 5689 5690 5691 5692 5693 5694 5695 5696 5697 5698 5699 5700 5701 5702 5703 5704 5705 5706 5707 5708 5709 5710 5711 5712 5713 5714 5715 5716 5717 5718 5719 5720 5721 5722 5723 5724 5725 5726 5727 5728 5729 5730 5731 5732 5733 5734 5735 5736 5737 5738 5739 5740 5741 5742 5743 5744 5745 5746 5747 5748 5749 5750 5751 5752 5753 5754 5755 5756 5757 5758 5759 5760 5761 5762 5763 5764 5765 5766 5767 5768 5769 5770 5771 5772 5773 5774 5775 5776 5777 5778 5779 5780 5781 5782 5783 5784 5785 5786 5787 5788 5789 5790 5791 5792 5793 5794 5795 5796 5797 5798 5799 5800 5801 5802 5803 5804 5805 5806 5807 5808 5809 5810 5811 5812 5813 5814 5815 5816 5817 5818 5819 5820 5821 5822 5823 5824 5825 5826 5827 5828 5829 5830 5831 5832 5833 5834 5835 5836 5837 5838 5839 5840 5841 5842 5843 5844 5845 5846 5847 5848 5849 5850 5851 5852 5853 5854 5855 5856 5857 5858 5859 5860 5861 5862 5863 5864 5865 5866 5867 5868 5869 5870 5871 5872 5873 5874 5875 5876 5877 5878 5879 5880 5881 5882 5883 5884 5885 5886 5887 5888 5889 5890 5891 5892 5893 5894 5895 5896 5897 5898 5899 5900 5901 5902 5903 5904 5905 5906 5907 5908 5909 5910 5911 5912 5913 5914 5915 5916 5917 5918 5919 5920 5921 5922 5923 5924 5925 5926 5927 5928 5929 5930 5931 5932 5933 5934 5935 5936 5937 5938 5939 5940 5941 5942 5943 5944 5945 5946 5947 5948 5949 5950 5951 5952 5953 5954 5955 5956 5957 5958 5959 5960 5961 5962 5963 5964 5965 5966 5967 5968 5969 5970 5971 5972 5973 5974 5975 5976 5977 5978 5979 5980 5981 5982 5983 5984 5985 5986 5987 5988 5989 5990 5991 5992 5993 5994 5995 5996 5997 5998 5999 6000 6001 6002 6003 6004 6005 6006 6007 6008 6009 6010 6011 6012 6013 6014 6015 6016 6017 6018 6019 6020 6021 6022 6023 6024 6025 6026 6027 6028 6029 6030 6031 6032 6033 6034 6035 6036 6037 6038 6039 6040 6041 6042 6043 6044 6045 6046 6047 6048 6049 6050 6051 6052 6053 6054 6055 6056 6057 6058 6059 6060 6061 6062 6063 6064 6065 6066 6067 6068 6069 6070 6071 6072 6073 6074 6075 6076 6077 6078 6079 6080 6081 6082 6083 6084 6085 6086 6087 6088 6089 6090 6091 6092 6093 6094 6095 6096 6097 6098 6099 6100 6101 6102 6103 6104 6105 6106 6107 6108 6109 6110 6111 6112 6113 6114 6115 6116 6117 6118 6119 6120 6121 6122 6123 6124 6125 6126 6127 6128 6129 6130 6131 6132 6133 6134 6135 6136 6137 6138 6139 6140 6141 6142 6143 6144 6145 6146 6147 6148 6149 6150 6151 6152 6153 6154 6155 6156 6157 6158 6159 6160 6161 6162 6163 6164 6165 6166 6167 6168 6169 6170 6171 6172 6173 6174 6175 6176 6177 6178 6179 6180 6181 6182 6183 6184 6185 6186 6187 6188 6189 6190 6191 6192 6193 6194 6195 6196 6197 6198 6199 6200 6201 6202 6203 6204 6205 6206 6207 6208 6209 6210 6211 6212 6213 6214 6215 6216 6217 6218 6219 6220 6221 6222 6223 6224 6225 6226 6227 6228 6229 6230 6231 6232 6233 6234 6235 6236 6237 6238 6239 6240 6241 6242 6243 6244 6245 6246 6247 6248 6249 6250 6251 6252 6253 6254 6255 6256 6257 6258 6259 6260 6261 6262 6263 6264 6265 6266 6267 6268 6269 6270 6271 6272 6273 6274 6275 6276 6277 6278 6279 6280 6281 6282 6283 6284 6285 6286 6287 6288 6289 6290 6291 6292 6293 6294 6295 6296 6297 6298 6299 6300 6301 6302 6303 6304 6305 6306 6307 6308 6309 6310 6311 6312 6313 6314 6315 6316 6317 6318 6319 6320 6321 6322 6323 6324 6325 6326 6327 6328 6329 6330 6331 6332 6333 6334 6335 6336 6337 6338 6339 6340 6341 6342 6343 6344 6345 6346 6347 6348 6349 6350 6351 6352 6353 6354 6355 6356 6357 6358 6359 6360 6361 6362 6363 6364 6365 6366 6367 6368 6369 6370 6371 6372 6373 6374 6375 6376 6377 6378 6379 6380 6381 6382 6383 6384 6385 6386 6387 6388 6389 6390 6391 6392 6393 6394 6395 6396 6397 6398 6399 6400 6401 6402 6403 6404 6405 6406 6407 6408 6409 6410 6411 6412 6413 6414 6415 6416 6417 6418 6419 6420 6421 6422 6423 6424 6425 6426 6427 6428 6429 6430 6431 6432 6433 6434 6435 6436 6437 6438 6439 6440 6441 6442 6443 6444 6445 6446 6447 6448 6449 6450 6451 6452 6453 6454 6455 6456 6457 6458 6459 6460 6461 6462 6463 6464 6465 6466 6467 6468 6469 6470 6471 6472 6473 6474 6475 6476 6477 6478 6479 6480 6481 6482 6483 6484 6485 6486 6487 6488 6489 6490 6491 6492 6493 6494 6495 6496 6497 6498 6499 6500 6501 // SPDX-License-Identifier: GPL-2.0-only /* * linux/fs/namespace.c * * (C) Copyright Al Viro 2000, 2001 * * Based on code from fs/super.c, copyright Linus Torvalds and others. * Heavily rewritten. */ #include <linux/syscalls.h> #include <linux/export.h> #include <linux/capability.h> #include <linux/mnt_namespace.h> #include <linux/user_namespace.h> #include <linux/namei.h> #include <linux/security.h> #include <linux/cred.h> #include <linux/idr.h> #include <linux/init.h> /* init_rootfs */ #include <linux/fs_struct.h> /* get_fs_root et.al. */ #include <linux/fsnotify.h> /* fsnotify_vfsmount_delete */ #include <linux/file.h> #include <linux/uaccess.h> #include <linux/proc_ns.h> #include <linux/magic.h> #include <linux/memblock.h> #include <linux/proc_fs.h> #include <linux/task_work.h> #include <linux/sched/task.h> #include <uapi/linux/mount.h> #include <linux/fs_context.h> #include <linux/shmem_fs.h> #include <linux/mnt_idmapping.h> #include <linux/pidfs.h> #include <linux/nstree.h> #include "pnode.h" #include "internal.h" /* Maximum number of mounts in a mount namespace */ static unsigned int sysctl_mount_max __read_mostly = 100000; static unsigned int m_hash_mask __ro_after_init; static unsigned int m_hash_shift __ro_after_init; static unsigned int mp_hash_mask __ro_after_init; static unsigned int mp_hash_shift __ro_after_init; static __initdata unsigned long mhash_entries; static int __init set_mhash_entries(char *str) { return kstrtoul(str, 0, &mhash_entries) == 0; } __setup("mhash_entries=", set_mhash_entries); static __initdata unsigned long mphash_entries; static int __init set_mphash_entries(char *str) { return kstrtoul(str, 0, &mphash_entries) == 0; } __setup("mphash_entries=", set_mphash_entries); static char * __initdata initramfs_options; static int __init initramfs_options_setup(char *str) { initramfs_options = str; return 1; } __setup("initramfs_options=", initramfs_options_setup); static u64 event; static DEFINE_XARRAY_FLAGS(mnt_id_xa, XA_FLAGS_ALLOC); static DEFINE_IDA(mnt_group_ida); /* Don't allow confusion with old 32bit mount ID */ #define MNT_UNIQUE_ID_OFFSET (1ULL << 31) static u64 mnt_id_ctr = MNT_UNIQUE_ID_OFFSET; static struct hlist_head *mount_hashtable __ro_after_init; static struct hlist_head *mountpoint_hashtable __ro_after_init; static struct kmem_cache *mnt_cache __ro_after_init; static DECLARE_RWSEM(namespace_sem); static HLIST_HEAD(unmounted); /* protected by namespace_sem */ static LIST_HEAD(ex_mountpoints); /* protected by namespace_sem */ static struct mnt_namespace *emptied_ns; /* protected by namespace_sem */ static inline void namespace_lock(void); static void namespace_unlock(void); DEFINE_LOCK_GUARD_0(namespace_excl, namespace_lock(), namespace_unlock()) DEFINE_LOCK_GUARD_0(namespace_shared, down_read(&namespace_sem), up_read(&namespace_sem)) DEFINE_FREE(mntput, struct vfsmount *, if (!IS_ERR(_T)) mntput(_T)) #ifdef CONFIG_FSNOTIFY LIST_HEAD(notify_list); /* protected by namespace_sem */ #endif enum mount_kattr_flags_t { MOUNT_KATTR_RECURSE = (1 << 0), MOUNT_KATTR_IDMAP_REPLACE = (1 << 1), }; struct mount_kattr { unsigned int attr_set; unsigned int attr_clr; unsigned int propagation; unsigned int lookup_flags; enum mount_kattr_flags_t kflags; struct user_namespace *mnt_userns; struct mnt_idmap *mnt_idmap; }; /* /sys/fs */ struct kobject *fs_kobj __ro_after_init; EXPORT_SYMBOL_GPL(fs_kobj); /* * vfsmount lock may be taken for read to prevent changes to the * vfsmount hash, ie. during mountpoint lookups or walking back * up the tree. * * It should be taken for write in all cases where the vfsmount * tree or hash is modified or when a vfsmount structure is modified. */ __cacheline_aligned_in_smp DEFINE_SEQLOCK(mount_lock); static void mnt_ns_release(struct mnt_namespace *ns) { /* keep alive for {list,stat}mount() */ if (ns && refcount_dec_and_test(&ns->passive)) { fsnotify_mntns_delete(ns); put_user_ns(ns->user_ns); kfree(ns); } } DEFINE_FREE(mnt_ns_release, struct mnt_namespace *, if (!IS_ERR(_T)) mnt_ns_release(_T)) static void mnt_ns_release_rcu(struct rcu_head *rcu) { mnt_ns_release(container_of(rcu, struct mnt_namespace, ns.ns_rcu)); } static void mnt_ns_tree_remove(struct mnt_namespace *ns) { /* remove from global mount namespace list */ if (ns_tree_active(ns)) ns_tree_remove(ns); call_rcu(&ns->ns.ns_rcu, mnt_ns_release_rcu); } /* * Lookup a mount namespace by id and take a passive reference count. Taking a * passive reference means the mount namespace can be emptied if e.g., the last * task holding an active reference exits. To access the mounts of the * namespace the @namespace_sem must first be acquired. If the namespace has * already shut down before acquiring @namespace_sem, {list,stat}mount() will * see that the mount rbtree of the namespace is empty. * * Note the lookup is lockless protected by a sequence counter. We only * need to guard against false negatives as false positives aren't * possible. So if we didn't find a mount namespace and the sequence * counter has changed we need to retry. If the sequence counter is * still the same we know the search actually failed. */ static struct mnt_namespace *lookup_mnt_ns(u64 mnt_ns_id) { struct mnt_namespace *mnt_ns; struct ns_common *ns; guard(rcu)(); ns = ns_tree_lookup_rcu(mnt_ns_id, CLONE_NEWNS); if (!ns) return NULL; /* * The last reference count is put with RCU delay so we can * unconditonally acquire a reference here. */ mnt_ns = container_of(ns, struct mnt_namespace, ns); refcount_inc(&mnt_ns->passive); return mnt_ns; } static inline void lock_mount_hash(void) { write_seqlock(&mount_lock); } static inline void unlock_mount_hash(void) { write_sequnlock(&mount_lock); } static inline struct hlist_head *m_hash(struct vfsmount *mnt, struct dentry *dentry) { unsigned long tmp = ((unsigned long)mnt / L1_CACHE_BYTES); tmp += ((unsigned long)dentry / L1_CACHE_BYTES); tmp = tmp + (tmp >> m_hash_shift); return &mount_hashtable[tmp & m_hash_mask]; } static inline struct hlist_head *mp_hash(struct dentry *dentry) { unsigned long tmp = ((unsigned long)dentry / L1_CACHE_BYTES); tmp = tmp + (tmp >> mp_hash_shift); return &mountpoint_hashtable[tmp & mp_hash_mask]; } static int mnt_alloc_id(struct mount *mnt) { int res; xa_lock(&mnt_id_xa); res = __xa_alloc(&mnt_id_xa, &mnt->mnt_id, mnt, xa_limit_31b, GFP_KERNEL); if (!res) mnt->mnt_id_unique = ++mnt_id_ctr; xa_unlock(&mnt_id_xa); return res; } static void mnt_free_id(struct mount *mnt) { xa_erase(&mnt_id_xa, mnt->mnt_id); } /* * Allocate a new peer group ID */ static int mnt_alloc_group_id(struct mount *mnt) { int res = ida_alloc_min(&mnt_group_ida, 1, GFP_KERNEL); if (res < 0) return res; mnt->mnt_group_id = res; return 0; } /* * Release a peer group ID */ void mnt_release_group_id(struct mount *mnt) { ida_free(&mnt_group_ida, mnt->mnt_group_id); mnt->mnt_group_id = 0; } /* * vfsmount lock must be held for read */ static inline void mnt_add_count(struct mount *mnt, int n) { #ifdef CONFIG_SMP this_cpu_add(mnt->mnt_pcp->mnt_count, n); #else preempt_disable(); mnt->mnt_count += n; preempt_enable(); #endif } /* * vfsmount lock must be held for write */ int mnt_get_count(struct mount *mnt) { #ifdef CONFIG_SMP int count = 0; int cpu; for_each_possible_cpu(cpu) { count += per_cpu_ptr(mnt->mnt_pcp, cpu)->mnt_count; } return count; #else return mnt->mnt_count; #endif } static struct mount *alloc_vfsmnt(const char *name) { struct mount *mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL); if (mnt) { int err; err = mnt_alloc_id(mnt); if (err) goto out_free_cache; if (name) mnt->mnt_devname = kstrdup_const(name, GFP_KERNEL_ACCOUNT); else mnt->mnt_devname = "none"; if (!mnt->mnt_devname) goto out_free_id; #ifdef CONFIG_SMP mnt->mnt_pcp = alloc_percpu(struct mnt_pcp); if (!mnt->mnt_pcp) goto out_free_devname; this_cpu_add(mnt->mnt_pcp->mnt_count, 1); #else mnt->mnt_count = 1; mnt->mnt_writers = 0; #endif INIT_HLIST_NODE(&mnt->mnt_hash); INIT_LIST_HEAD(&mnt->mnt_child); INIT_LIST_HEAD(&mnt->mnt_mounts); INIT_LIST_HEAD(&mnt->mnt_list); INIT_LIST_HEAD(&mnt->mnt_expire); INIT_LIST_HEAD(&mnt->mnt_share); INIT_HLIST_HEAD(&mnt->mnt_slave_list); INIT_HLIST_NODE(&mnt->mnt_slave); INIT_HLIST_NODE(&mnt->mnt_mp_list); INIT_HLIST_HEAD(&mnt->mnt_stuck_children); RB_CLEAR_NODE(&mnt->mnt_node); mnt->mnt.mnt_idmap = &nop_mnt_idmap; } return mnt; #ifdef CONFIG_SMP out_free_devname: kfree_const(mnt->mnt_devname); #endif out_free_id: mnt_free_id(mnt); out_free_cache: kmem_cache_free(mnt_cache, mnt); return NULL; } /* * Most r/o checks on a fs are for operations that take * discrete amounts of time, like a write() or unlink(). * We must keep track of when those operations start * (for permission checks) and when they end, so that * we can determine when writes are able to occur to * a filesystem. */ /* * __mnt_is_readonly: check whether a mount is read-only * @mnt: the mount to check for its write status * * This shouldn't be used directly ouside of the VFS. * It does not guarantee that the filesystem will stay * r/w, just that it is right *now*. This can not and * should not be used in place of IS_RDONLY(inode). * mnt_want/drop_write() will _keep_ the filesystem * r/w. */ bool __mnt_is_readonly(const struct vfsmount *mnt) { return (mnt->mnt_flags & MNT_READONLY) || sb_rdonly(mnt->mnt_sb); } EXPORT_SYMBOL_GPL(__mnt_is_readonly); static inline void mnt_inc_writers(struct mount *mnt) { #ifdef CONFIG_SMP this_cpu_inc(mnt->mnt_pcp->mnt_writers); #else mnt->mnt_writers++; #endif } static inline void mnt_dec_writers(struct mount *mnt) { #ifdef CONFIG_SMP this_cpu_dec(mnt->mnt_pcp->mnt_writers); #else mnt->mnt_writers--; #endif } static unsigned int mnt_get_writers(struct mount *mnt) { #ifdef CONFIG_SMP unsigned int count = 0; int cpu; for_each_possible_cpu(cpu) { count += per_cpu_ptr(mnt->mnt_pcp, cpu)->mnt_writers; } return count; #else return mnt->mnt_writers; #endif } static int mnt_is_readonly(const struct vfsmount *mnt) { if (READ_ONCE(mnt->mnt_sb->s_readonly_remount)) return 1; /* * The barrier pairs with the barrier in sb_start_ro_state_change() * making sure if we don't see s_readonly_remount set yet, we also will * not see any superblock / mount flag changes done by remount. * It also pairs with the barrier in sb_end_ro_state_change() * assuring that if we see s_readonly_remount already cleared, we will * see the values of superblock / mount flags updated by remount. */ smp_rmb(); return __mnt_is_readonly(mnt); } /* * Most r/o & frozen checks on a fs are for operations that take discrete * amounts of time, like a write() or unlink(). We must keep track of when * those operations start (for permission checks) and when they end, so that we * can determine when writes are able to occur to a filesystem. */ /** * mnt_get_write_access - get write access to a mount without freeze protection * @m: the mount on which to take a write * * This tells the low-level filesystem that a write is about to be performed to * it, and makes sure that writes are allowed (mnt it read-write) before * returning success. This operation does not protect against filesystem being * frozen. When the write operation is finished, mnt_put_write_access() must be * called. This is effectively a refcount. */ int mnt_get_write_access(struct vfsmount *m) { struct mount *mnt = real_mount(m); int ret = 0; preempt_disable(); mnt_inc_writers(mnt); /* * The store to mnt_inc_writers must be visible before we pass * WRITE_HOLD loop below, so that the slowpath can see our * incremented count after it has set WRITE_HOLD. */ smp_mb(); might_lock(&mount_lock.lock); while (__test_write_hold(READ_ONCE(mnt->mnt_pprev_for_sb))) { if (!IS_ENABLED(CONFIG_PREEMPT_RT)) { cpu_relax(); } else { /* * This prevents priority inversion, if the task * setting WRITE_HOLD got preempted on a remote * CPU, and it prevents life lock if the task setting * WRITE_HOLD has a lower priority and is bound to * the same CPU as the task that is spinning here. */ preempt_enable(); read_seqlock_excl(&mount_lock); read_sequnlock_excl(&mount_lock); preempt_disable(); } } /* * The barrier pairs with the barrier sb_start_ro_state_change() making * sure that if we see WRITE_HOLD cleared, we will also see * s_readonly_remount set (or even SB_RDONLY / MNT_READONLY flags) in * mnt_is_readonly() and bail in case we are racing with remount * read-only. */ smp_rmb(); if (mnt_is_readonly(m)) { mnt_dec_writers(mnt); ret = -EROFS; } preempt_enable(); return ret; } EXPORT_SYMBOL_GPL(mnt_get_write_access); /** * mnt_want_write - get write access to a mount * @m: the mount on which to take a write * * This tells the low-level filesystem that a write is about to be performed to * it, and makes sure that writes are allowed (mount is read-write, filesystem * is not frozen) before returning success. When the write operation is * finished, mnt_drop_write() must be called. This is effectively a refcount. */ int mnt_want_write(struct vfsmount *m) { int ret; sb_start_write(m->mnt_sb); ret = mnt_get_write_access(m); if (ret) sb_end_write(m->mnt_sb); return ret; } EXPORT_SYMBOL_GPL(mnt_want_write); /** * mnt_get_write_access_file - get write access to a file's mount * @file: the file who's mount on which to take a write * * This is like mnt_get_write_access, but if @file is already open for write it * skips incrementing mnt_writers (since the open file already has a reference) * and instead only does the check for emergency r/o remounts. This must be * paired with mnt_put_write_access_file. */ int mnt_get_write_access_file(struct file *file) { if (file->f_mode & FMODE_WRITER) { /* * Superblock may have become readonly while there are still * writable fd's, e.g. due to a fs error with errors=remount-ro */ if (__mnt_is_readonly(file->f_path.mnt)) return -EROFS; return 0; } return mnt_get_write_access(file->f_path.mnt); } /** * mnt_want_write_file - get write access to a file's mount * @file: the file who's mount on which to take a write * * This is like mnt_want_write, but if the file is already open for writing it * skips incrementing mnt_writers (since the open file already has a reference) * and instead only does the freeze protection and the check for emergency r/o * remounts. This must be paired with mnt_drop_write_file. */ int mnt_want_write_file(struct file *file) { int ret; sb_start_write(file_inode(file)->i_sb); ret = mnt_get_write_access_file(file); if (ret) sb_end_write(file_inode(file)->i_sb); return ret; } EXPORT_SYMBOL_GPL(mnt_want_write_file); /** * mnt_put_write_access - give up write access to a mount * @mnt: the mount on which to give up write access * * Tells the low-level filesystem that we are done * performing writes to it. Must be matched with * mnt_get_write_access() call above. */ void mnt_put_write_access(struct vfsmount *mnt) { preempt_disable(); mnt_dec_writers(real_mount(mnt)); preempt_enable(); } EXPORT_SYMBOL_GPL(mnt_put_write_access); /** * mnt_drop_write - give up write access to a mount * @mnt: the mount on which to give up write access * * Tells the low-level filesystem that we are done performing writes to it and * also allows filesystem to be frozen again. Must be matched with * mnt_want_write() call above. */ void mnt_drop_write(struct vfsmount *mnt) { mnt_put_write_access(mnt); sb_end_write(mnt->mnt_sb); } EXPORT_SYMBOL_GPL(mnt_drop_write); void mnt_put_write_access_file(struct file *file) { if (!(file->f_mode & FMODE_WRITER)) mnt_put_write_access(file->f_path.mnt); } void mnt_drop_write_file(struct file *file) { mnt_put_write_access_file(file); sb_end_write(file_inode(file)->i_sb); } EXPORT_SYMBOL(mnt_drop_write_file); /** * mnt_hold_writers - prevent write access to the given mount * @mnt: mnt to prevent write access to * * Prevents write access to @mnt if there are no active writers for @mnt. * This function needs to be called and return successfully before changing * properties of @mnt that need to remain stable for callers with write access * to @mnt. * * After this functions has been called successfully callers must pair it with * a call to mnt_unhold_writers() in order to stop preventing write access to * @mnt. * * Context: This function expects to be in mount_locked_reader scope serializing * setting WRITE_HOLD. * Return: On success 0 is returned. * On error, -EBUSY is returned. */ static inline int mnt_hold_writers(struct mount *mnt) { set_write_hold(mnt); /* * After storing WRITE_HOLD, we'll read the counters. This store * should be visible before we do. */ smp_mb(); /* * With writers on hold, if this value is zero, then there are * definitely no active writers (although held writers may subsequently * increment the count, they'll have to wait, and decrement it after * seeing MNT_READONLY). * * It is OK to have counter incremented on one CPU and decremented on * another: the sum will add up correctly. The danger would be when we * sum up each counter, if we read a counter before it is incremented, * but then read another CPU's count which it has been subsequently * decremented from -- we would see more decrements than we should. * WRITE_HOLD protects against this scenario, because * mnt_want_write first increments count, then smp_mb, then spins on * WRITE_HOLD, so it can't be decremented by another CPU while * we're counting up here. */ if (mnt_get_writers(mnt) > 0) return -EBUSY; return 0; } /** * mnt_unhold_writers - stop preventing write access to the given mount * @mnt: mnt to stop preventing write access to * * Stop preventing write access to @mnt allowing callers to gain write access * to @mnt again. * * This function can only be called after a call to mnt_hold_writers(). * * Context: This function expects to be in the same mount_locked_reader scope * as the matching mnt_hold_writers(). */ static inline void mnt_unhold_writers(struct mount *mnt) { if (!test_write_hold(mnt)) return; /* * MNT_READONLY must become visible before ~WRITE_HOLD, so writers * that become unheld will see MNT_READONLY. */ smp_wmb(); clear_write_hold(mnt); } static inline void mnt_del_instance(struct mount *m) { struct mount **p = m->mnt_pprev_for_sb; struct mount *next = m->mnt_next_for_sb; if (next) next->mnt_pprev_for_sb = p; *p = next; } static inline void mnt_add_instance(struct mount *m, struct super_block *s) { struct mount *first = s->s_mounts; if (first) first->mnt_pprev_for_sb = &m->mnt_next_for_sb; m->mnt_next_for_sb = first; m->mnt_pprev_for_sb = &s->s_mounts; s->s_mounts = m; } static int mnt_make_readonly(struct mount *mnt) { int ret; ret = mnt_hold_writers(mnt); if (!ret) mnt->mnt.mnt_flags |= MNT_READONLY; mnt_unhold_writers(mnt); return ret; } int sb_prepare_remount_readonly(struct super_block *sb) { int err = 0; /* Racy optimization. Recheck the counter under WRITE_HOLD */ if (atomic_long_read(&sb->s_remove_count)) return -EBUSY; guard(mount_locked_reader)(); for (struct mount *m = sb->s_mounts; m; m = m->mnt_next_for_sb) { if (!(m->mnt.mnt_flags & MNT_READONLY)) { err = mnt_hold_writers(m); if (err) break; } } if (!err && atomic_long_read(&sb->s_remove_count)) err = -EBUSY; if (!err) sb_start_ro_state_change(sb); for (struct mount *m = sb->s_mounts; m; m = m->mnt_next_for_sb) { if (test_write_hold(m)) clear_write_hold(m); } return err; } static void free_vfsmnt(struct mount *mnt) { mnt_idmap_put(mnt_idmap(&mnt->mnt)); kfree_const(mnt->mnt_devname); #ifdef CONFIG_SMP free_percpu(mnt->mnt_pcp); #endif kmem_cache_free(mnt_cache, mnt); } static void delayed_free_vfsmnt(struct rcu_head *head) { free_vfsmnt(container_of(head, struct mount, mnt_rcu)); } /* call under rcu_read_lock */ int __legitimize_mnt(struct vfsmount *bastard, unsigned seq) { struct mount *mnt; if (read_seqretry(&mount_lock, seq)) return 1; if (bastard == NULL) return 0; mnt = real_mount(bastard); mnt_add_count(mnt, 1); smp_mb(); // see mntput_no_expire() and do_umount() if (likely(!read_seqretry(&mount_lock, seq))) return 0; lock_mount_hash(); if (unlikely(bastard->mnt_flags & (MNT_SYNC_UMOUNT | MNT_DOOMED))) { mnt_add_count(mnt, -1); unlock_mount_hash(); return 1; } unlock_mount_hash(); /* caller will mntput() */ return -1; } /* call under rcu_read_lock */ static bool legitimize_mnt(struct vfsmount *bastard, unsigned seq) { int res = __legitimize_mnt(bastard, seq); if (likely(!res)) return true; if (unlikely(res < 0)) { rcu_read_unlock(); mntput(bastard); rcu_read_lock(); } return false; } /** * __lookup_mnt - mount hash lookup * @mnt: parent mount * @dentry: dentry of mountpoint * * If @mnt has a child mount @c mounted on @dentry find and return it. * Caller must either hold the spinlock component of @mount_lock or * hold rcu_read_lock(), sample the seqcount component before the call * and recheck it afterwards. * * Return: The child of @mnt mounted on @dentry or %NULL. */ struct mount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry) { struct hlist_head *head = m_hash(mnt, dentry); struct mount *p; hlist_for_each_entry_rcu(p, head, mnt_hash) if (&p->mnt_parent->mnt == mnt && p->mnt_mountpoint == dentry) return p; return NULL; } /** * lookup_mnt - Return the child mount mounted at given location * @path: location in the namespace * * Acquires and returns a new reference to mount at given location * or %NULL if nothing is mounted there. */ struct vfsmount *lookup_mnt(const struct path *path) { struct mount *child_mnt; struct vfsmount *m; unsigned seq; rcu_read_lock(); do { seq = read_seqbegin(&mount_lock); child_mnt = __lookup_mnt(path->mnt, path->dentry); m = child_mnt ? &child_mnt->mnt : NULL; } while (!legitimize_mnt(m, seq)); rcu_read_unlock(); return m; } /* * __is_local_mountpoint - Test to see if dentry is a mountpoint in the * current mount namespace. * * The common case is dentries are not mountpoints at all and that * test is handled inline. For the slow case when we are actually * dealing with a mountpoint of some kind, walk through all of the * mounts in the current mount namespace and test to see if the dentry * is a mountpoint. * * The mount_hashtable is not usable in the context because we * need to identify all mounts that may be in the current mount * namespace not just a mount that happens to have some specified * parent mount. */ bool __is_local_mountpoint(const struct dentry *dentry) { struct mnt_namespace *ns = current->nsproxy->mnt_ns; struct mount *mnt, *n; guard(namespace_shared)(); rbtree_postorder_for_each_entry_safe(mnt, n, &ns->mounts, mnt_node) if (mnt->mnt_mountpoint == dentry) return true; return false; } struct pinned_mountpoint { struct hlist_node node; struct mountpoint *mp; struct mount *parent; }; static bool lookup_mountpoint(struct dentry *dentry, struct pinned_mountpoint *m) { struct hlist_head *chain = mp_hash(dentry); struct mountpoint *mp; hlist_for_each_entry(mp, chain, m_hash) { if (mp->m_dentry == dentry) { hlist_add_head(&m->node, &mp->m_list); m->mp = mp; return true; } } return false; } static int get_mountpoint(struct dentry *dentry, struct pinned_mountpoint *m) { struct mountpoint *mp __free(kfree) = NULL; bool found; int ret; if (d_mountpoint(dentry)) { /* might be worth a WARN_ON() */ if (d_unlinked(dentry)) return -ENOENT; mountpoint: read_seqlock_excl(&mount_lock); found = lookup_mountpoint(dentry, m); read_sequnlock_excl(&mount_lock); if (found) return 0; } if (!mp) mp = kmalloc_obj(struct mountpoint); if (!mp) return -ENOMEM; /* Exactly one processes may set d_mounted */ ret = d_set_mounted(dentry); /* Someone else set d_mounted? */ if (ret == -EBUSY) goto mountpoint; /* The dentry is not available as a mountpoint? */ if (ret) return ret; /* Add the new mountpoint to the hash table */ read_seqlock_excl(&mount_lock); mp->m_dentry = dget(dentry); hlist_add_head(&mp->m_hash, mp_hash(dentry)); INIT_HLIST_HEAD(&mp->m_list); hlist_add_head(&m->node, &mp->m_list); m->mp = no_free_ptr(mp); read_sequnlock_excl(&mount_lock); return 0; } /* * vfsmount lock must be held. Additionally, the caller is responsible * for serializing calls for given disposal list. */ static void maybe_free_mountpoint(struct mountpoint *mp, struct list_head *list) { if (hlist_empty(&mp->m_list)) { struct dentry *dentry = mp->m_dentry; spin_lock(&dentry->d_lock); dentry->d_flags &= ~DCACHE_MOUNTED; spin_unlock(&dentry->d_lock); dput_to_list(dentry, list); hlist_del(&mp->m_hash); kfree(mp); } } /* * locks: mount_lock [read_seqlock_excl], namespace_sem [excl] */ static void unpin_mountpoint(struct pinned_mountpoint *m) { if (m->mp) { hlist_del(&m->node); maybe_free_mountpoint(m->mp, &ex_mountpoints); } } static inline int check_mnt(const struct mount *mnt) { return mnt->mnt_ns == current->nsproxy->mnt_ns; } static inline bool check_anonymous_mnt(struct mount *mnt) { u64 seq; if (!is_anon_ns(mnt->mnt_ns)) return false; seq = mnt->mnt_ns->seq_origin; return !seq || (seq == current->nsproxy->mnt_ns->ns.ns_id); } /* * vfsmount lock must be held for write */ static void touch_mnt_namespace(struct mnt_namespace *ns) { if (ns) { ns->event = ++event; wake_up_interruptible(&ns->poll); } } /* * vfsmount lock must be held for write */ static void __touch_mnt_namespace(struct mnt_namespace *ns) { if (ns && ns->event != event) { ns->event = event; wake_up_interruptible(&ns->poll); } } /* * locks: mount_lock[write_seqlock] */ static void __umount_mnt(struct mount *mnt, struct list_head *shrink_list) { struct mountpoint *mp; struct mount *parent = mnt->mnt_parent; if (unlikely(parent->overmount == mnt)) parent->overmount = NULL; mnt->mnt_parent = mnt; mnt->mnt_mountpoint = mnt->mnt.mnt_root; list_del_init(&mnt->mnt_child); hlist_del_init_rcu(&mnt->mnt_hash); hlist_del_init(&mnt->mnt_mp_list); mp = mnt->mnt_mp; mnt->mnt_mp = NULL; maybe_free_mountpoint(mp, shrink_list); } /* * locks: mount_lock[write_seqlock], namespace_sem[excl] (for ex_mountpoints) */ static void umount_mnt(struct mount *mnt) { __umount_mnt(mnt, &ex_mountpoints); } /* * vfsmount lock must be held for write */ void mnt_set_mountpoint(struct mount *mnt, struct mountpoint *mp, struct mount *child_mnt) { child_mnt->mnt_mountpoint = mp->m_dentry; child_mnt->mnt_parent = mnt; child_mnt->mnt_mp = mp; hlist_add_head(&child_mnt->mnt_mp_list, &mp->m_list); } static void make_visible(struct mount *mnt) { struct mount *parent = mnt->mnt_parent; if (unlikely(mnt->mnt_mountpoint == parent->mnt.mnt_root)) parent->overmount = mnt; hlist_add_head_rcu(&mnt->mnt_hash, m_hash(&parent->mnt, mnt->mnt_mountpoint)); list_add_tail(&mnt->mnt_child, &parent->mnt_mounts); } /** * attach_mnt - mount a mount, attach to @mount_hashtable and parent's * list of child mounts * @parent: the parent * @mnt: the new mount * @mp: the new mountpoint * * Mount @mnt at @mp on @parent. Then attach @mnt * to @parent's child mount list and to @mount_hashtable. * * Note, when make_visible() is called @mnt->mnt_parent already points * to the correct parent. * * Context: This function expects namespace_lock() and lock_mount_hash() * to have been acquired in that order. */ static void attach_mnt(struct mount *mnt, struct mount *parent, struct mountpoint *mp) { mnt_set_mountpoint(parent, mp, mnt); make_visible(mnt); } void mnt_change_mountpoint(struct mount *parent, struct mountpoint *mp, struct mount *mnt) { struct mountpoint *old_mp = mnt->mnt_mp; list_del_init(&mnt->mnt_child); hlist_del_init(&mnt->mnt_mp_list); hlist_del_init_rcu(&mnt->mnt_hash); attach_mnt(mnt, parent, mp); maybe_free_mountpoint(old_mp, &ex_mountpoints); } static inline struct mount *node_to_mount(struct rb_node *node) { return node ? rb_entry(node, struct mount, mnt_node) : NULL; } static void mnt_add_to_ns(struct mnt_namespace *ns, struct mount *mnt) { struct rb_node **link = &ns->mounts.rb_node; struct rb_node *parent = NULL; bool mnt_first_node = true, mnt_last_node = true; WARN_ON(mnt_ns_attached(mnt)); mnt->mnt_ns = ns; while (*link) { parent = *link; if (mnt->mnt_id_unique < node_to_mount(parent)->mnt_id_unique) { link = &parent->rb_left; mnt_last_node = false; } else { link = &parent->rb_right; mnt_first_node = false; } } if (mnt_last_node) ns->mnt_last_node = &mnt->mnt_node; if (mnt_first_node) ns->mnt_first_node = &mnt->mnt_node; rb_link_node(&mnt->mnt_node, parent, link); rb_insert_color(&mnt->mnt_node, &ns->mounts); mnt_notify_add(mnt); } static struct mount *next_mnt(struct mount *p, struct mount *root) { struct list_head *next = p->mnt_mounts.next; if (next == &p->mnt_mounts) { while (1) { if (p == root) return NULL; next = p->mnt_child.next; if (next != &p->mnt_parent->mnt_mounts) break; p = p->mnt_parent; } } return list_entry(next, struct mount, mnt_child); } static struct mount *skip_mnt_tree(struct mount *p) { struct list_head *prev = p->mnt_mounts.prev; while (prev != &p->mnt_mounts) { p = list_entry(prev, struct mount, mnt_child); prev = p->mnt_mounts.prev; } return p; } /* * vfsmount lock must be held for write */ static void commit_tree(struct mount *mnt) { struct mnt_namespace *n = mnt->mnt_parent->mnt_ns; if (!mnt_ns_attached(mnt)) { for (struct mount *m = mnt; m; m = next_mnt(m, mnt)) mnt_add_to_ns(n, m); n->nr_mounts += n->pending_mounts; n->pending_mounts = 0; } make_visible(mnt); touch_mnt_namespace(n); } static void setup_mnt(struct mount *m, struct dentry *root) { struct super_block *s = root->d_sb; atomic_inc(&s->s_active); m->mnt.mnt_sb = s; m->mnt.mnt_root = dget(root); m->mnt_mountpoint = m->mnt.mnt_root; m->mnt_parent = m; guard(mount_locked_reader)(); mnt_add_instance(m, s); } /** * vfs_create_mount - Create a mount for a configured superblock * @fc: The configuration context with the superblock attached * * Create a mount to an already configured superblock. If necessary, the * caller should invoke vfs_get_tree() before calling this. * * Note that this does not attach the mount to anything. */ struct vfsmount *vfs_create_mount(struct fs_context *fc) { struct mount *mnt; if (!fc->root) return ERR_PTR(-EINVAL); mnt = alloc_vfsmnt(fc->source); if (!mnt) return ERR_PTR(-ENOMEM); if (fc->sb_flags & SB_KERNMOUNT) mnt->mnt.mnt_flags = MNT_INTERNAL; setup_mnt(mnt, fc->root); return &mnt->mnt; } EXPORT_SYMBOL(vfs_create_mount); struct vfsmount *fc_mount(struct fs_context *fc) { int err = vfs_get_tree(fc); if (!err) { up_write(&fc->root->d_sb->s_umount); return vfs_create_mount(fc); } return ERR_PTR(err); } EXPORT_SYMBOL(fc_mount); struct vfsmount *fc_mount_longterm(struct fs_context *fc) { struct vfsmount *mnt = fc_mount(fc); if (!IS_ERR(mnt)) real_mount(mnt)->mnt_ns = MNT_NS_INTERNAL; return mnt; } EXPORT_SYMBOL(fc_mount_longterm); struct vfsmount *vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void *data) { struct fs_context *fc; struct vfsmount *mnt; int ret = 0; if (!type) return ERR_PTR(-EINVAL); fc = fs_context_for_mount(type, flags); if (IS_ERR(fc)) return ERR_CAST(fc); if (name) ret = vfs_parse_fs_string(fc, "source", name); if (!ret) ret = parse_monolithic_mount_data(fc, data); if (!ret) mnt = fc_mount(fc); else mnt = ERR_PTR(ret); put_fs_context(fc); return mnt; } EXPORT_SYMBOL_GPL(vfs_kern_mount); static struct mount *clone_mnt(struct mount *old, struct dentry *root, int flag) { struct mount *mnt; int err; mnt = alloc_vfsmnt(old->mnt_devname); if (!mnt) return ERR_PTR(-ENOMEM); mnt->mnt.mnt_flags = READ_ONCE(old->mnt.mnt_flags) & ~MNT_INTERNAL_FLAGS; if (flag & (CL_SLAVE | CL_PRIVATE)) mnt->mnt_group_id = 0; /* not a peer of original */ else mnt->mnt_group_id = old->mnt_group_id; if ((flag & CL_MAKE_SHARED) && !mnt->mnt_group_id) { err = mnt_alloc_group_id(mnt); if (err) goto out_free; } if (mnt->mnt_group_id) set_mnt_shared(mnt); mnt->mnt.mnt_idmap = mnt_idmap_get(mnt_idmap(&old->mnt)); setup_mnt(mnt, root); if (flag & CL_PRIVATE) // we are done with it return mnt; if (peers(mnt, old)) list_add(&mnt->mnt_share, &old->mnt_share); if ((flag & CL_SLAVE) && old->mnt_group_id) { hlist_add_head(&mnt->mnt_slave, &old->mnt_slave_list); mnt->mnt_master = old; } else if (IS_MNT_SLAVE(old)) { hlist_add_behind(&mnt->mnt_slave, &old->mnt_slave); mnt->mnt_master = old->mnt_master; } return mnt; out_free: mnt_free_id(mnt); free_vfsmnt(mnt); return ERR_PTR(err); } static void cleanup_mnt(struct mount *mnt) { struct hlist_node *p; struct mount *m; /* * The warning here probably indicates that somebody messed * up a mnt_want/drop_write() pair. If this happens, the * filesystem was probably unable to make r/w->r/o transitions. * The locking used to deal with mnt_count decrement provides barriers, * so mnt_get_writers() below is safe. */ WARN_ON(mnt_get_writers(mnt)); if (unlikely(mnt->mnt_pins.first)) mnt_pin_kill(mnt); hlist_for_each_entry_safe(m, p, &mnt->mnt_stuck_children, mnt_umount) { hlist_del(&m->mnt_umount); mntput(&m->mnt); } fsnotify_vfsmount_delete(&mnt->mnt); dput(mnt->mnt.mnt_root); deactivate_super(mnt->mnt.mnt_sb); mnt_free_id(mnt); call_rcu(&mnt->mnt_rcu, delayed_free_vfsmnt); } static void __cleanup_mnt(struct rcu_head *head) { cleanup_mnt(container_of(head, struct mount, mnt_rcu)); } static LLIST_HEAD(delayed_mntput_list); static void delayed_mntput(struct work_struct *unused) { struct llist_node *node = llist_del_all(&delayed_mntput_list); struct mount *m, *t; llist_for_each_entry_safe(m, t, node, mnt_llist) cleanup_mnt(m); } static DECLARE_DELAYED_WORK(delayed_mntput_work, delayed_mntput); static void noinline mntput_no_expire_slowpath(struct mount *mnt) { LIST_HEAD(list); int count; VFS_BUG_ON(mnt->mnt_ns); lock_mount_hash(); /* * make sure that if __legitimize_mnt() has not seen us grab * mount_lock, we'll see their refcount increment here. */ smp_mb(); mnt_add_count(mnt, -1); count = mnt_get_count(mnt); if (count != 0) { WARN_ON(count < 0); rcu_read_unlock(); unlock_mount_hash(); return; } if (unlikely(mnt->mnt.mnt_flags & MNT_DOOMED)) { rcu_read_unlock(); unlock_mount_hash(); return; } mnt->mnt.mnt_flags |= MNT_DOOMED; rcu_read_unlock(); mnt_del_instance(mnt); if (unlikely(!list_empty(&mnt->mnt_expire))) list_del(&mnt->mnt_expire); if (unlikely(!list_empty(&mnt->mnt_mounts))) { struct mount *p, *tmp; list_for_each_entry_safe(p, tmp, &mnt->mnt_mounts, mnt_child) { __umount_mnt(p, &list); hlist_add_head(&p->mnt_umount, &mnt->mnt_stuck_children); } } unlock_mount_hash(); shrink_dentry_list(&list); if (likely(!(mnt->mnt.mnt_flags & MNT_INTERNAL))) { struct task_struct *task = current; if (likely(!(task->flags & PF_KTHREAD))) { init_task_work(&mnt->mnt_rcu, __cleanup_mnt); if (!task_work_add(task, &mnt->mnt_rcu, TWA_RESUME)) return; } if (llist_add(&mnt->mnt_llist, &delayed_mntput_list)) schedule_delayed_work(&delayed_mntput_work, 1); return; } cleanup_mnt(mnt); } static void mntput_no_expire(struct mount *mnt) { rcu_read_lock(); if (likely(READ_ONCE(mnt->mnt_ns))) { /* * Since we don't do lock_mount_hash() here, * ->mnt_ns can change under us. However, if it's * non-NULL, then there's a reference that won't * be dropped until after an RCU delay done after * turning ->mnt_ns NULL. So if we observe it * non-NULL under rcu_read_lock(), the reference * we are dropping is not the final one. */ mnt_add_count(mnt, -1); rcu_read_unlock(); return; } mntput_no_expire_slowpath(mnt); } void mntput(struct vfsmount *mnt) { if (mnt) { struct mount *m = real_mount(mnt); /* avoid cacheline pingpong */ if (unlikely(m->mnt_expiry_mark)) WRITE_ONCE(m->mnt_expiry_mark, 0); mntput_no_expire(m); } } EXPORT_SYMBOL(mntput); struct vfsmount *mntget(struct vfsmount *mnt) { if (mnt) mnt_add_count(real_mount(mnt), 1); return mnt; } EXPORT_SYMBOL(mntget); /* * Make a mount point inaccessible to new lookups. * Because there may still be current users, the caller MUST WAIT * for an RCU grace period before destroying the mount point. */ void mnt_make_shortterm(struct vfsmount *mnt) { if (mnt) real_mount(mnt)->mnt_ns = NULL; } /** * path_is_mountpoint() - Check if path is a mount in the current namespace. * @path: path to check * * d_mountpoint() can only be used reliably to establish if a dentry is * not mounted in any namespace and that common case is handled inline. * d_mountpoint() isn't aware of the possibility there may be multiple * mounts using a given dentry in a different namespace. This function * checks if the passed in path is a mountpoint rather than the dentry * alone. */ bool path_is_mountpoint(const struct path *path) { unsigned seq; bool res; if (!d_mountpoint(path->dentry)) return false; rcu_read_lock(); do { seq = read_seqbegin(&mount_lock); res = __path_is_mountpoint(path); } while (read_seqretry(&mount_lock, seq)); rcu_read_unlock(); return res; } EXPORT_SYMBOL(path_is_mountpoint); struct vfsmount *mnt_clone_internal(const struct path *path) { struct mount *p; p = clone_mnt(real_mount(path->mnt), path->dentry, CL_PRIVATE); if (IS_ERR(p)) return ERR_CAST(p); p->mnt.mnt_flags |= MNT_INTERNAL; return &p->mnt; } /* * Returns the mount which either has the specified mnt_id, or has the next * smallest id afer the specified one. */ static struct mount *mnt_find_id_at(struct mnt_namespace *ns, u64 mnt_id) { struct rb_node *node = ns->mounts.rb_node; struct mount *ret = NULL; while (node) { struct mount *m = node_to_mount(node); if (mnt_id <= m->mnt_id_unique) { ret = node_to_mount(node); if (mnt_id == m->mnt_id_unique) break; node = node->rb_left; } else { node = node->rb_right; } } return ret; } /* * Returns the mount which either has the specified mnt_id, or has the next * greater id before the specified one. */ static struct mount *mnt_find_id_at_reverse(struct mnt_namespace *ns, u64 mnt_id) { struct rb_node *node = ns->mounts.rb_node; struct mount *ret = NULL; while (node) { struct mount *m = node_to_mount(node); if (mnt_id >= m->mnt_id_unique) { ret = node_to_mount(node); if (mnt_id == m->mnt_id_unique) break; node = node->rb_right; } else { node = node->rb_left; } } return ret; } #ifdef CONFIG_PROC_FS /* iterator; we want it to have access to namespace_sem, thus here... */ static void *m_start(struct seq_file *m, loff_t *pos) { struct proc_mounts *p = m->private; struct mount *mnt; down_read(&namespace_sem); mnt = mnt_find_id_at(p->ns, *pos); if (mnt) *pos = mnt->mnt_id_unique; return mnt; } static void *m_next(struct seq_file *m, void *v, loff_t *pos) { struct mount *mnt = v; struct rb_node *node = rb_next(&mnt->mnt_node); if (node) { struct mount *next = node_to_mount(node); *pos = next->mnt_id_unique; return next; } /* * No more mounts. Set pos past current mount's ID so that if * iteration restarts, mnt_find_id_at() returns NULL. */ *pos = mnt->mnt_id_unique + 1; return NULL; } static void m_stop(struct seq_file *m, void *v) { up_read(&namespace_sem); } static int m_show(struct seq_file *m, void *v) { struct proc_mounts *p = m->private; struct mount *r = v; return p->show(m, &r->mnt); } const struct seq_operations mounts_op = { .start = m_start, .next = m_next, .stop = m_stop, .show = m_show, }; #endif /* CONFIG_PROC_FS */ /** * may_umount_tree - check if a mount tree is busy * @m: root of mount tree * * This is called to check if a tree of mounts has any * open files, pwds, chroots or sub mounts that are * busy. */ int may_umount_tree(struct vfsmount *m) { struct mount *mnt = real_mount(m); bool busy = false; /* write lock needed for mnt_get_count */ lock_mount_hash(); for (struct mount *p = mnt; p; p = next_mnt(p, mnt)) { if (mnt_get_count(p) > (p == mnt ? 2 : 1)) { busy = true; break; } } unlock_mount_hash(); return !busy; } EXPORT_SYMBOL(may_umount_tree); /** * may_umount - check if a mount point is busy * @mnt: root of mount * * This is called to check if a mount point has any * open files, pwds, chroots or sub mounts. If the * mount has sub mounts this will return busy * regardless of whether the sub mounts are busy. * * Doesn't take quota and stuff into account. IOW, in some cases it will * give false negatives. The main reason why it's here is that we need * a non-destructive way to look for easily umountable filesystems. */ int may_umount(struct vfsmount *mnt) { int ret = 1; down_read(&namespace_sem); lock_mount_hash(); if (propagate_mount_busy(real_mount(mnt), 2)) ret = 0; unlock_mount_hash(); up_read(&namespace_sem); return ret; } EXPORT_SYMBOL(may_umount); #ifdef CONFIG_FSNOTIFY static void mnt_notify(struct mount *p) { if (!p->prev_ns && p->mnt_ns) { fsnotify_mnt_attach(p->mnt_ns, &p->mnt); } else if (p->prev_ns && !p->mnt_ns) { fsnotify_mnt_detach(p->prev_ns, &p->mnt); } else if (p->prev_ns == p->mnt_ns) { fsnotify_mnt_move(p->mnt_ns, &p->mnt); } else { fsnotify_mnt_detach(p->prev_ns, &p->mnt); fsnotify_mnt_attach(p->mnt_ns, &p->mnt); } p->prev_ns = p->mnt_ns; } static void notify_mnt_list(void) { struct mount *m, *tmp; /* * Notify about mounts that were added/reparented/detached/remain * connected after unmount. */ list_for_each_entry_safe(m, tmp, &notify_list, to_notify) { mnt_notify(m); list_del_init(&m->to_notify); } } static bool need_notify_mnt_list(void) { return !list_empty(&notify_list); } #else static void notify_mnt_list(void) { } static bool need_notify_mnt_list(void) { return false; } #endif static void free_mnt_ns(struct mnt_namespace *); static void namespace_unlock(void) { struct hlist_head head; struct hlist_node *p; struct mount *m; struct mnt_namespace *ns = emptied_ns; LIST_HEAD(list); hlist_move_list(&unmounted, &head); list_splice_init(&ex_mountpoints, &list); emptied_ns = NULL; if (need_notify_mnt_list()) { /* * No point blocking out concurrent readers while notifications * are sent. This will also allow statmount()/listmount() to run * concurrently. */ downgrade_write(&namespace_sem); notify_mnt_list(); up_read(&namespace_sem); } else { up_write(&namespace_sem); } if (unlikely(ns)) { /* Make sure we notice when we leak mounts. */ VFS_WARN_ON_ONCE(!mnt_ns_empty(ns)); free_mnt_ns(ns); } shrink_dentry_list(&list); if (likely(hlist_empty(&head))) return; synchronize_rcu_expedited(); hlist_for_each_entry_safe(m, p, &head, mnt_umount) { hlist_del(&m->mnt_umount); mntput(&m->mnt); } } static inline void namespace_lock(void) { down_write(&namespace_sem); } enum umount_tree_flags { UMOUNT_SYNC = 1, UMOUNT_PROPAGATE = 2, UMOUNT_CONNECTED = 4, }; static bool disconnect_mount(struct mount *mnt, enum umount_tree_flags how) { /* Leaving mounts connected is only valid for lazy umounts */ if (how & UMOUNT_SYNC) return true; /* A mount without a parent has nothing to be connected to */ if (!mnt_has_parent(mnt)) return true; /* Because the reference counting rules change when mounts are * unmounted and connected, umounted mounts may not be * connected to mounted mounts. */ if (!(mnt->mnt_parent->mnt.mnt_flags & MNT_UMOUNT)) return true; /* Has it been requested that the mount remain connected? */ if (how & UMOUNT_CONNECTED) return false; /* Is the mount locked such that it needs to remain connected? */ if (IS_MNT_LOCKED(mnt)) return false; /* By default disconnect the mount */ return true; } /* * mount_lock must be held * namespace_sem must be held for write */ static void umount_tree(struct mount *mnt, enum umount_tree_flags how) { LIST_HEAD(tmp_list); struct mount *p; if (how & UMOUNT_PROPAGATE) propagate_mount_unlock(mnt); /* Gather the mounts to umount */ for (p = mnt; p; p = next_mnt(p, mnt)) { p->mnt.mnt_flags |= MNT_UMOUNT; if (mnt_ns_attached(p)) move_from_ns(p); list_add_tail(&p->mnt_list, &tmp_list); } /* Hide the mounts from mnt_mounts */ list_for_each_entry(p, &tmp_list, mnt_list) { list_del_init(&p->mnt_child); } /* Add propagated mounts to the tmp_list */ if (how & UMOUNT_PROPAGATE) propagate_umount(&tmp_list); bulk_make_private(&tmp_list); while (!list_empty(&tmp_list)) { struct mnt_namespace *ns; bool disconnect; p = list_first_entry(&tmp_list, struct mount, mnt_list); list_del_init(&p->mnt_expire); list_del_init(&p->mnt_list); ns = p->mnt_ns; if (ns) { ns->nr_mounts--; __touch_mnt_namespace(ns); } p->mnt_ns = NULL; if (how & UMOUNT_SYNC) p->mnt.mnt_flags |= MNT_SYNC_UMOUNT; disconnect = disconnect_mount(p, how); if (mnt_has_parent(p)) { if (!disconnect) { /* Don't forget about p */ list_add_tail(&p->mnt_child, &p->mnt_parent->mnt_mounts); } else { umount_mnt(p); } } if (disconnect) hlist_add_head(&p->mnt_umount, &unmounted); /* * At this point p->mnt_ns is NULL, notification will be queued * only if * * - p->prev_ns is non-NULL *and* * - p->prev_ns->n_fsnotify_marks is non-NULL * * This will preclude queuing the mount if this is a cleanup * after a failed copy_tree() or destruction of an anonymous * namespace, etc. */ mnt_notify_add(p); } } static void shrink_submounts(struct mount *mnt); static int do_umount_root(struct super_block *sb) { int ret = 0; down_write(&sb->s_umount); if (!sb_rdonly(sb)) { struct fs_context *fc; fc = fs_context_for_reconfigure(sb->s_root, SB_RDONLY, SB_RDONLY); if (IS_ERR(fc)) { ret = PTR_ERR(fc); } else { ret = parse_monolithic_mount_data(fc, NULL); if (!ret) ret = reconfigure_super(fc); put_fs_context(fc); } } up_write(&sb->s_umount); return ret; } static int do_umount(struct mount *mnt, int flags) { struct super_block *sb = mnt->mnt.mnt_sb; int retval; retval = security_sb_umount(&mnt->mnt, flags); if (retval) return retval; /* * Allow userspace to request a mountpoint be expired rather than * unmounting unconditionally. Unmount only happens if: * (1) the mark is already set (the mark is cleared by mntput()) * (2) the usage count == 1 [parent vfsmount] + 1 [sys_umount] */ if (flags & MNT_EXPIRE) { if (&mnt->mnt == current->fs->root.mnt || flags & (MNT_FORCE | MNT_DETACH)) return -EINVAL; /* * probably don't strictly need the lock here if we examined * all race cases, but it's a slowpath. */ lock_mount_hash(); if (!list_empty(&mnt->mnt_mounts) || mnt_get_count(mnt) != 2) { unlock_mount_hash(); return -EBUSY; } unlock_mount_hash(); if (!xchg(&mnt->mnt_expiry_mark, 1)) return -EAGAIN; } /* * If we may have to abort operations to get out of this * mount, and they will themselves hold resources we must * allow the fs to do things. In the Unix tradition of * 'Gee thats tricky lets do it in userspace' the umount_begin * might fail to complete on the first run through as other tasks * must return, and the like. Thats for the mount program to worry * about for the moment. */ if (flags & MNT_FORCE && sb->s_op->umount_begin) { sb->s_op->umount_begin(sb); } /* * No sense to grab the lock for this test, but test itself looks * somewhat bogus. Suggestions for better replacement? * Ho-hum... In principle, we might treat that as umount + switch * to rootfs. GC would eventually take care of the old vfsmount. * Actually it makes sense, especially if rootfs would contain a * /reboot - static binary that would close all descriptors and * call reboot(9). Then init(8) could umount root and exec /reboot. */ if (&mnt->mnt == current->fs->root.mnt && !(flags & MNT_DETACH)) { /* * Special case for "unmounting" root ... * we just try to remount it readonly. */ if (!ns_capable(sb->s_user_ns, CAP_SYS_ADMIN)) return -EPERM; return do_umount_root(sb); } namespace_lock(); lock_mount_hash(); /* Repeat the earlier racy checks, now that we are holding the locks */ retval = -EINVAL; if (!check_mnt(mnt)) goto out; if (mnt->mnt.mnt_flags & MNT_LOCKED) goto out; if (!mnt_has_parent(mnt)) /* not the absolute root */ goto out; event++; if (flags & MNT_DETACH) { umount_tree(mnt, UMOUNT_PROPAGATE); retval = 0; } else { smp_mb(); // paired with __legitimize_mnt() shrink_submounts(mnt); retval = -EBUSY; if (!propagate_mount_busy(mnt, 2)) { umount_tree(mnt, UMOUNT_PROPAGATE|UMOUNT_SYNC); retval = 0; } } out: unlock_mount_hash(); namespace_unlock(); return retval; } /* * __detach_mounts - lazily unmount all mounts on the specified dentry * * During unlink, rmdir, and d_drop it is possible to loose the path * to an existing mountpoint, and wind up leaking the mount. * detach_mounts allows lazily unmounting those mounts instead of * leaking them. * * The caller may hold dentry->d_inode->i_rwsem. */ void __detach_mounts(struct dentry *dentry) { struct pinned_mountpoint mp = {}; struct mount *mnt; guard(namespace_excl)(); guard(mount_writer)(); if (!lookup_mountpoint(dentry, &mp)) return; event++; while (mp.node.next) { mnt = hlist_entry(mp.node.next, struct mount, mnt_mp_list); if (mnt->mnt.mnt_flags & MNT_UMOUNT) { umount_mnt(mnt); hlist_add_head(&mnt->mnt_umount, &unmounted); } else umount_tree(mnt, UMOUNT_CONNECTED); } unpin_mountpoint(&mp); } /* * Is the caller allowed to modify his namespace? */ bool may_mount(void) { return ns_capable(current->nsproxy->mnt_ns->user_ns, CAP_SYS_ADMIN); } static void warn_mandlock(void) { pr_warn_once("=======================================================\n" "WARNING: The mand mount option has been deprecated and\n" " and is ignored by this kernel. Remove the mand\n" " option from the mount to silence this warning.\n" "=======================================================\n"); } static int can_umount(const struct path *path, int flags) { struct mount *mnt = real_mount(path->mnt); struct super_block *sb = path->dentry->d_sb; if (!may_mount()) return -EPERM; if (!path_mounted(path)) return -EINVAL; if (!check_mnt(mnt)) return -EINVAL; if (mnt->mnt.mnt_flags & MNT_LOCKED) /* Check optimistically */ return -EINVAL; if (flags & MNT_FORCE && !ns_capable(sb->s_user_ns, CAP_SYS_ADMIN)) return -EPERM; return 0; } // caller is responsible for flags being sane int path_umount(const struct path *path, int flags) { struct mount *mnt = real_mount(path->mnt); int ret; ret = can_umount(path, flags); if (!ret) ret = do_umount(mnt, flags); /* we mustn't call path_put() as that would clear mnt_expiry_mark */ dput(path->dentry); mntput_no_expire(mnt); return ret; } static int ksys_umount(char __user *name, int flags) { int lookup_flags = LOOKUP_MOUNTPOINT; struct path path; int ret; // basic validity checks done first if (flags & ~(MNT_FORCE | MNT_DETACH | MNT_EXPIRE | UMOUNT_NOFOLLOW)) return -EINVAL; if (!(flags & UMOUNT_NOFOLLOW)) lookup_flags |= LOOKUP_FOLLOW; ret = user_path_at(AT_FDCWD, name, lookup_flags, &path); if (ret) return ret; return path_umount(&path, flags); } SYSCALL_DEFINE2(umount, char __user *, name, int, flags) { return ksys_umount(name, flags); } #ifdef __ARCH_WANT_SYS_OLDUMOUNT /* * The 2.0 compatible umount. No flags. */ SYSCALL_DEFINE1(oldumount, char __user *, name) { return ksys_umount(name, 0); } #endif static bool is_mnt_ns_file(struct dentry *dentry) { struct ns_common *ns; /* Is this a proxy for a mount namespace? */ if (dentry->d_op != &ns_dentry_operations) return false; ns = d_inode(dentry)->i_private; return ns->ops == &mntns_operations; } struct ns_common *from_mnt_ns(struct mnt_namespace *mnt) { return &mnt->ns; } struct mnt_namespace *get_sequential_mnt_ns(struct mnt_namespace *mntns, bool previous) { struct ns_common *ns; guard(rcu)(); for (;;) { ns = ns_tree_adjoined_rcu(mntns, previous); if (IS_ERR(ns)) return ERR_CAST(ns); mntns = to_mnt_ns(ns); /* * The last passive reference count is put with RCU * delay so accessing the mount namespace is not just * safe but all relevant members are still valid. */ if (!ns_capable_noaudit(mntns->user_ns, CAP_SYS_ADMIN)) continue; /* * We need an active reference count as we're persisting * the mount namespace and it might already be on its * deathbed. */ if (!ns_ref_get(mntns)) continue; return mntns; } } struct mnt_namespace *mnt_ns_from_dentry(struct dentry *dentry) { if (!is_mnt_ns_file(dentry)) return NULL; return to_mnt_ns(get_proc_ns(dentry->d_inode)); } static bool mnt_ns_loop(struct dentry *dentry) { /* Could bind mounting the mount namespace inode cause a * mount namespace loop? */ struct mnt_namespace *mnt_ns = mnt_ns_from_dentry(dentry); if (!mnt_ns) return false; return current->nsproxy->mnt_ns->ns.ns_id >= mnt_ns->ns.ns_id; } struct mount *copy_tree(struct mount *src_root, struct dentry *dentry, int flag) { struct mount *res, *src_parent, *src_root_child, *src_mnt, *dst_parent, *dst_mnt; if (!(flag & CL_COPY_UNBINDABLE) && IS_MNT_UNBINDABLE(src_root)) return ERR_PTR(-EINVAL); if (!(flag & CL_COPY_MNT_NS_FILE) && is_mnt_ns_file(dentry)) return ERR_PTR(-EINVAL); res = dst_mnt = clone_mnt(src_root, dentry, flag); if (IS_ERR(dst_mnt)) return dst_mnt; src_parent = src_root; list_for_each_entry(src_root_child, &src_root->mnt_mounts, mnt_child) { if (!is_subdir(src_root_child->mnt_mountpoint, dentry)) continue; for (src_mnt = src_root_child; src_mnt; src_mnt = next_mnt(src_mnt, src_root_child)) { if (!(flag & CL_COPY_UNBINDABLE) && IS_MNT_UNBINDABLE(src_mnt)) { if (src_mnt->mnt.mnt_flags & MNT_LOCKED) { /* Both unbindable and locked. */ dst_mnt = ERR_PTR(-EPERM); goto out; } else { src_mnt = skip_mnt_tree(src_mnt); continue; } } if (!(flag & CL_COPY_MNT_NS_FILE) && is_mnt_ns_file(src_mnt->mnt.mnt_root)) { src_mnt = skip_mnt_tree(src_mnt); continue; } while (src_parent != src_mnt->mnt_parent) { src_parent = src_parent->mnt_parent; dst_mnt = dst_mnt->mnt_parent; } src_parent = src_mnt; dst_parent = dst_mnt; dst_mnt = clone_mnt(src_mnt, src_mnt->mnt.mnt_root, flag); if (IS_ERR(dst_mnt)) goto out; lock_mount_hash(); if (src_mnt->mnt.mnt_flags & MNT_LOCKED) dst_mnt->mnt.mnt_flags |= MNT_LOCKED; if (unlikely(flag & CL_EXPIRE)) { /* stick the duplicate mount on the same expiry * list as the original if that was on one */ if (!list_empty(&src_mnt->mnt_expire)) list_add(&dst_mnt->mnt_expire, &src_mnt->mnt_expire); } attach_mnt(dst_mnt, dst_parent, src_parent->mnt_mp); unlock_mount_hash(); } } return res; out: if (res) { lock_mount_hash(); umount_tree(res, UMOUNT_SYNC); unlock_mount_hash(); } return dst_mnt; } static inline bool extend_array(struct path **res, struct path **to_free, unsigned n, unsigned *count, unsigned new_count) { struct path *p; if (likely(n < *count)) return true; p = kmalloc_objs(struct path, new_count); if (p && *count) memcpy(p, *res, *count * sizeof(struct path)); *count = new_count; kfree(*to_free); *to_free = *res = p; return p; } const struct path *collect_paths(const struct path *path, struct path *prealloc, unsigned count) { struct mount *root = real_mount(path->mnt); struct mount *child; struct path *res = prealloc, *to_free = NULL; unsigned n = 0; guard(namespace_shared)(); if (!check_mnt(root)) return ERR_PTR(-EINVAL); if (!extend_array(&res, &to_free, 0, &count, 32)) return ERR_PTR(-ENOMEM); res[n++] = *path; list_for_each_entry(child, &root->mnt_mounts, mnt_child) { if (!is_subdir(child->mnt_mountpoint, path->dentry)) continue; for (struct mount *m = child; m; m = next_mnt(m, child)) { if (!extend_array(&res, &to_free, n, &count, 2 * count)) return ERR_PTR(-ENOMEM); res[n].mnt = &m->mnt; res[n].dentry = m->mnt.mnt_root; n++; } } if (!extend_array(&res, &to_free, n, &count, count + 1)) return ERR_PTR(-ENOMEM); memset(res + n, 0, (count - n) * sizeof(struct path)); for (struct path *p = res; p->mnt; p++) path_get(p); return res; } void drop_collected_paths(const struct path *paths, const struct path *prealloc) { for (const struct path *p = paths; p->mnt; p++) path_put(p); if (paths != prealloc) kfree(paths); } static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *, bool); void dissolve_on_fput(struct vfsmount *mnt) { struct mount *m = real_mount(mnt); /* * m used to be the root of anon namespace; if it still is one, * we need to dissolve the mount tree and free that namespace. * Let's try to avoid taking namespace_sem if we can determine * that there's nothing to do without it - rcu_read_lock() is * enough to make anon_ns_root() memory-safe and once m has * left its namespace, it's no longer our concern, since it will * never become a root of anon ns again. */ scoped_guard(rcu) { if (!anon_ns_root(m)) return; } scoped_guard(namespace_excl) { if (!anon_ns_root(m)) return; emptied_ns = m->mnt_ns; lock_mount_hash(); umount_tree(m, UMOUNT_CONNECTED); unlock_mount_hash(); } } /* locks: namespace_shared && pinned(mnt) || mount_locked_reader */ static bool __has_locked_children(struct mount *mnt, struct dentry *dentry) { struct mount *child; list_for_each_entry(child, &mnt->mnt_mounts, mnt_child) { if (!is_subdir(child->mnt_mountpoint, dentry)) continue; if (child->mnt.mnt_flags & MNT_LOCKED) return true; } return false; } bool has_locked_children(struct mount *mnt, struct dentry *dentry) { guard(mount_locked_reader)(); return __has_locked_children(mnt, dentry); } /* * Check that there aren't references to earlier/same mount namespaces in the * specified subtree. Such references can act as pins for mount namespaces * that aren't checked by the mount-cycle checking code, thereby allowing * cycles to be made. * * locks: mount_locked_reader || namespace_shared && pinned(subtree) */ static bool check_for_nsfs_mounts(struct mount *subtree) { for (struct mount *p = subtree; p; p = next_mnt(p, subtree)) if (mnt_ns_loop(p->mnt.mnt_root)) return false; return true; } /** * clone_private_mount - create a private clone of a path * @path: path to clone * * This creates a new vfsmount, which will be the clone of @path. The new mount * will not be attached anywhere in the namespace and will be private (i.e. * changes to the originating mount won't be propagated into this). * * This assumes caller has called or done the equivalent of may_mount(). * * Release with mntput(). */ struct vfsmount *clone_private_mount(const struct path *path) { struct mount *old_mnt = real_mount(path->mnt); struct mount *new_mnt; guard(namespace_shared)(); if (IS_MNT_UNBINDABLE(old_mnt)) return ERR_PTR(-EINVAL); /* * Make sure the source mount is acceptable. * Anything mounted in our mount namespace is allowed. * Otherwise, it must be the root of an anonymous mount * namespace, and we need to make sure no namespace * loops get created. */ if (!check_mnt(old_mnt)) { if (!anon_ns_root(old_mnt)) return ERR_PTR(-EINVAL); if (!check_for_nsfs_mounts(old_mnt)) return ERR_PTR(-EINVAL); } if (!ns_capable(old_mnt->mnt_ns->user_ns, CAP_SYS_ADMIN)) return ERR_PTR(-EPERM); if (__has_locked_children(old_mnt, path->dentry)) return ERR_PTR(-EINVAL); new_mnt = clone_mnt(old_mnt, path->dentry, CL_PRIVATE); if (IS_ERR(new_mnt)) return ERR_PTR(-EINVAL); /* Longterm mount to be removed by kern_unmount*() */ new_mnt->mnt_ns = MNT_NS_INTERNAL; return &new_mnt->mnt; } EXPORT_SYMBOL_GPL(clone_private_mount); static void lock_mnt_tree(struct mount *mnt) { struct mount *p; for (p = mnt; p; p = next_mnt(p, mnt)) { int flags = p->mnt.mnt_flags; /* Don't allow unprivileged users to change mount flags */ flags |= MNT_LOCK_ATIME; if (flags & MNT_READONLY) flags |= MNT_LOCK_READONLY; if (flags & MNT_NODEV) flags |= MNT_LOCK_NODEV; if (flags & MNT_NOSUID) flags |= MNT_LOCK_NOSUID; if (flags & MNT_NOEXEC) flags |= MNT_LOCK_NOEXEC; /* Don't allow unprivileged users to reveal what is under a mount */ if (list_empty(&p->mnt_expire) && p != mnt) flags |= MNT_LOCKED; p->mnt.mnt_flags = flags; } } static void cleanup_group_ids(struct mount *mnt, struct mount *end) { struct mount *p; for (p = mnt; p != end; p = next_mnt(p, mnt)) { if (p->mnt_group_id && !IS_MNT_SHARED(p)) mnt_release_group_id(p); } } static int invent_group_ids(struct mount *mnt, bool recurse) { struct mount *p; for (p = mnt; p; p = recurse ? next_mnt(p, mnt) : NULL) { if (!p->mnt_group_id) { int err = mnt_alloc_group_id(p); if (err) { cleanup_group_ids(mnt, p); return err; } } } return 0; } int count_mounts(struct mnt_namespace *ns, struct mount *mnt) { unsigned int max = READ_ONCE(sysctl_mount_max); unsigned int mounts = 0; struct mount *p; if (ns->nr_mounts >= max) return -ENOSPC; max -= ns->nr_mounts; if (ns->pending_mounts >= max) return -ENOSPC; max -= ns->pending_mounts; for (p = mnt; p; p = next_mnt(p, mnt)) mounts++; if (mounts > max) return -ENOSPC; ns->pending_mounts += mounts; return 0; } enum mnt_tree_flags_t { MNT_TREE_BENEATH = BIT(0), MNT_TREE_PROPAGATION = BIT(1), }; /** * attach_recursive_mnt - attach a source mount tree * @source_mnt: mount tree to be attached * @dest: the context for mounting at the place where the tree should go * * NOTE: in the table below explains the semantics when a source mount * of a given type is attached to a destination mount of a given type. * --------------------------------------------------------------------------- * | BIND MOUNT OPERATION | * |************************************************************************** * | source-->| shared | private | slave | unbindable | * | dest | | | | | * | | | | | | | * | v | | | | | * |************************************************************************** * | shared | shared (++) | shared (+) | shared(+++)| invalid | * | | | | | | * |non-shared| shared (+) | private | slave (*) | invalid | * *************************************************************************** * A bind operation clones the source mount and mounts the clone on the * destination mount. * * (++) the cloned mount is propagated to all the mounts in the propagation * tree of the destination mount and the cloned mount is added to * the peer group of the source mount. * (+) the cloned mount is created under the destination mount and is marked * as shared. The cloned mount is added to the peer group of the source * mount. * (+++) the mount is propagated to all the mounts in the propagation tree * of the destination mount and the cloned mount is made slave * of the same master as that of the source mount. The cloned mount * is marked as 'shared and slave'. * (*) the cloned mount is made a slave of the same master as that of the * source mount. * * --------------------------------------------------------------------------- * | MOVE MOUNT OPERATION | * |************************************************************************** * | source-->| shared | private | slave | unbindable | * | dest | | | | | * | | | | | | | * | v | | | | | * |************************************************************************** * | shared | shared (+) | shared (+) | shared(+++) | invalid | * | | | | | | * |non-shared| shared (+*) | private | slave (*) | unbindable | * *************************************************************************** * * (+) the mount is moved to the destination. And is then propagated to * all the mounts in the propagation tree of the destination mount. * (+*) the mount is moved to the destination. * (+++) the mount is moved to the destination and is then propagated to * all the mounts belonging to the destination mount's propagation tree. * the mount is marked as 'shared and slave'. * (*) the mount continues to be a slave at the new location. * * if the source mount is a tree, the operations explained above is * applied to each mount in the tree. * Must be called without spinlocks held, since this function can sleep * in allocations. * * Context: The function expects namespace_lock() to be held. * Return: If @source_mnt was successfully attached 0 is returned. * Otherwise a negative error code is returned. */ static int attach_recursive_mnt(struct mount *source_mnt, const struct pinned_mountpoint *dest) { struct user_namespace *user_ns = current->nsproxy->mnt_ns->user_ns; struct mount *dest_mnt = dest->parent; struct mountpoint *dest_mp = dest->mp; HLIST_HEAD(tree_list); struct mnt_namespace *ns = dest_mnt->mnt_ns; struct pinned_mountpoint root = {}; struct mountpoint *shorter = NULL; struct mount *child, *p; struct mount *top; struct hlist_node *n; int err = 0; bool moving = mnt_has_parent(source_mnt); /* * Preallocate a mountpoint in case the new mounts need to be * mounted beneath mounts on the same mountpoint. */ for (top = source_mnt; unlikely(top->overmount); top = top->overmount) { if (!shorter && is_mnt_ns_file(top->mnt.mnt_root)) shorter = top->mnt_mp; } err = get_mountpoint(top->mnt.mnt_root, &root); if (err) return err; /* Is there space to add these mounts to the mount namespace? */ if (!moving) { err = count_mounts(ns, source_mnt); if (err) goto out; } if (IS_MNT_SHARED(dest_mnt)) { err = invent_group_ids(source_mnt, true); if (err) goto out; err = propagate_mnt(dest_mnt, dest_mp, source_mnt, &tree_list); } lock_mount_hash(); if (err) goto out_cleanup_ids; if (IS_MNT_SHARED(dest_mnt)) { for (p = source_mnt; p; p = next_mnt(p, source_mnt)) set_mnt_shared(p); } if (moving) { umount_mnt(source_mnt); mnt_notify_add(source_mnt); /* if the mount is moved, it should no longer be expired * automatically */ list_del_init(&source_mnt->mnt_expire); } else { if (source_mnt->mnt_ns) { /* move from anon - the caller will destroy */ emptied_ns = source_mnt->mnt_ns; for (p = source_mnt; p; p = next_mnt(p, source_mnt)) move_from_ns(p); } } mnt_set_mountpoint(dest_mnt, dest_mp, source_mnt); /* * Now the original copy is in the same state as the secondaries - * its root attached to mountpoint, but not hashed and all mounts * in it are either in our namespace or in no namespace at all. * Add the original to the list of copies and deal with the * rest of work for all of them uniformly. */ hlist_add_head(&source_mnt->mnt_hash, &tree_list); hlist_for_each_entry_safe(child, n, &tree_list, mnt_hash) { struct mount *q; hlist_del_init(&child->mnt_hash); /* Notice when we are propagating across user namespaces */ if (child->mnt_parent->mnt_ns->user_ns != user_ns) lock_mnt_tree(child); q = __lookup_mnt(&child->mnt_parent->mnt, child->mnt_mountpoint); commit_tree(child); if (q) { struct mount *r = topmost_overmount(child); struct mountpoint *mp = root.mp; if (unlikely(shorter) && child != source_mnt) mp = shorter; mnt_change_mountpoint(r, mp, q); } } unpin_mountpoint(&root); unlock_mount_hash(); return 0; out_cleanup_ids: while (!hlist_empty(&tree_list)) { child = hlist_entry(tree_list.first, struct mount, mnt_hash); child->mnt_parent->mnt_ns->pending_mounts = 0; umount_tree(child, UMOUNT_SYNC); } unlock_mount_hash(); cleanup_group_ids(source_mnt, NULL); out: ns->pending_mounts = 0; read_seqlock_excl(&mount_lock); unpin_mountpoint(&root); read_sequnlock_excl(&mount_lock); return err; } static inline struct mount *where_to_mount(const struct path *path, struct dentry **dentry, bool beneath) { struct mount *m; if (unlikely(beneath)) { m = topmost_overmount(real_mount(path->mnt)); *dentry = m->mnt_mountpoint; return m->mnt_parent; } m = __lookup_mnt(path->mnt, path->dentry); if (unlikely(m)) { m = topmost_overmount(m); *dentry = m->mnt.mnt_root; return m; } *dentry = path->dentry; return real_mount(path->mnt); } /** * do_lock_mount - acquire environment for mounting * @path: target path * @res: context to set up * @beneath: whether the intention is to mount beneath @path * * To mount something at given location, we need * namespace_sem locked exclusive * inode of dentry we are mounting on locked exclusive * struct mountpoint for that dentry * struct mount we are mounting on * * Results are stored in caller-supplied context (pinned_mountpoint); * on success we have res->parent and res->mp pointing to parent and * mountpoint respectively and res->node inserted into the ->m_list * of the mountpoint, making sure the mountpoint won't disappear. * On failure we have res->parent set to ERR_PTR(-E...), res->mp * left NULL, res->node - empty. * In case of success do_lock_mount returns with locks acquired (in * proper order - inode lock nests outside of namespace_sem). * * Request to mount on overmounted location is treated as "mount on * top of whatever's overmounting it"; request to mount beneath * a location - "mount immediately beneath the topmost mount at that * place". * * In all cases the location must not have been unmounted and the * chosen mountpoint must be allowed to be mounted on. For "beneath" * case we also require the location to be at the root of a mount * that has a parent (i.e. is not a root of some namespace). */ static void do_lock_mount(const struct path *path, struct pinned_mountpoint *res, bool beneath) { int err; if (unlikely(beneath) && !path_mounted(path)) { res->parent = ERR_PTR(-EINVAL); return; } do { struct dentry *dentry, *d; struct mount *m, *n; scoped_guard(mount_locked_reader) { m = where_to_mount(path, &dentry, beneath); if (&m->mnt != path->mnt) { mntget(&m->mnt); dget(dentry); } } inode_lock(dentry->d_inode); namespace_lock(); // check if the chain of mounts (if any) has changed. scoped_guard(mount_locked_reader) n = where_to_mount(path, &d, beneath); if (unlikely(n != m || dentry != d)) err = -EAGAIN; // something moved, retry else if (unlikely(cant_mount(dentry) || !is_mounted(path->mnt))) err = -ENOENT; // not to be mounted on else if (beneath && &m->mnt == path->mnt && !m->overmount) err = -EINVAL; else err = get_mountpoint(dentry, res); if (unlikely(err)) { res->parent = ERR_PTR(err); namespace_unlock(); inode_unlock(dentry->d_inode); } else { res->parent = m; } /* * Drop the temporary references. This is subtle - on success * we are doing that under namespace_sem, which would normally * be forbidden. However, in that case we are guaranteed that * refcounts won't reach zero, since we know that path->mnt * is mounted and thus all mounts reachable from it are pinned * and stable, along with their mountpoints and roots. */ if (&m->mnt != path->mnt) { dput(dentry); mntput(&m->mnt); } } while (err == -EAGAIN); } static void __unlock_mount(struct pinned_mountpoint *m) { inode_unlock(m->mp->m_dentry->d_inode); read_seqlock_excl(&mount_lock); unpin_mountpoint(m); read_sequnlock_excl(&mount_lock); namespace_unlock(); } static inline void unlock_mount(struct pinned_mountpoint *m) { if (!IS_ERR(m->parent)) __unlock_mount(m); } static void lock_mount_exact(const struct path *path, struct pinned_mountpoint *mp, bool copy_mount, unsigned int copy_flags); #define LOCK_MOUNT_MAYBE_BENEATH(mp, path, beneath) \ struct pinned_mountpoint mp __cleanup(unlock_mount) = {}; \ do_lock_mount((path), &mp, (beneath)) #define LOCK_MOUNT(mp, path) LOCK_MOUNT_MAYBE_BENEATH(mp, (path), false) #define LOCK_MOUNT_EXACT(mp, path) \ struct pinned_mountpoint mp __cleanup(unlock_mount) = {}; \ lock_mount_exact((path), &mp, false, 0) #define LOCK_MOUNT_EXACT_COPY(mp, path, copy_flags) \ struct pinned_mountpoint mp __cleanup(unlock_mount) = {}; \ lock_mount_exact((path), &mp, true, (copy_flags)) static int graft_tree(struct mount *mnt, const struct pinned_mountpoint *mp) { if (mnt->mnt.mnt_sb->s_flags & SB_NOUSER) return -EINVAL; if (d_is_dir(mp->mp->m_dentry) != d_is_dir(mnt->mnt.mnt_root)) return -ENOTDIR; return attach_recursive_mnt(mnt, mp); } static int may_change_propagation(const struct mount *m) { struct mnt_namespace *ns = m->mnt_ns; // it must be mounted in some namespace if (IS_ERR_OR_NULL(ns)) // is_mounted() return -EINVAL; // and the caller must be admin in userns of that namespace if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN)) return -EPERM; return 0; } /* * Sanity check the flags to change_mnt_propagation. */ static int flags_to_propagation_type(int ms_flags) { int type = ms_flags & ~(MS_REC | MS_SILENT); /* Fail if any non-propagation flags are set */ if (type & ~(MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE)) return 0; /* Only one propagation flag should be set */ if (!is_power_of_2(type)) return 0; return type; } /* * recursively change the type of the mountpoint. */ static int do_change_type(const struct path *path, int ms_flags) { struct mount *m; struct mount *mnt = real_mount(path->mnt); int recurse = ms_flags & MS_REC; int type; int err; if (!path_mounted(path)) return -EINVAL; type = flags_to_propagation_type(ms_flags); if (!type) return -EINVAL; guard(namespace_excl)(); err = may_change_propagation(mnt); if (err) return err; if (type == MS_SHARED) { err = invent_group_ids(mnt, recurse); if (err) return err; } for (m = mnt; m; m = (recurse ? next_mnt(m, mnt) : NULL)) change_mnt_propagation(m, type); return 0; } /* may_copy_tree() - check if a mount tree can be copied * @path: path to the mount tree to be copied * * This helper checks if the caller may copy the mount tree starting * from @path->mnt. The caller may copy the mount tree under the * following circumstances: * * (1) The caller is located in the mount namespace of the mount tree. * This also implies that the mount does not belong to an anonymous * mount namespace. * (2) The caller tries to copy an nfs mount referring to a mount * namespace, i.e., the caller is trying to copy a mount namespace * entry from nsfs. * (3) The caller tries to copy a pidfs mount referring to a pidfd. * (4) The caller is trying to copy a mount tree that belongs to an * anonymous mount namespace. * * For that to be safe, this helper enforces that the origin mount * namespace the anonymous mount namespace was created from is the * same as the caller's mount namespace by comparing the sequence * numbers. * * This is not strictly necessary. The current semantics of the new * mount api enforce that the caller must be located in the same * mount namespace as the mount tree it interacts with. Using the * origin sequence number preserves these semantics even for * anonymous mount namespaces. However, one could envision extending * the api to directly operate across mount namespace if needed. * * The ownership of a non-anonymous mount namespace such as the * caller's cannot change. * => We know that the caller's mount namespace is stable. * * If the origin sequence number of the anonymous mount namespace is * the same as the sequence number of the caller's mount namespace. * => The owning namespaces are the same. * * ==> The earlier capability check on the owning namespace of the * caller's mount namespace ensures that the caller has the * ability to copy the mount tree. * * Returns true if the mount tree can be copied, false otherwise. */ static inline bool may_copy_tree(const struct path *path) { struct mount *mnt = real_mount(path->mnt); const struct dentry_operations *d_op; if (check_mnt(mnt)) return true; d_op = path->dentry->d_op; if (d_op == &ns_dentry_operations) return true; if (d_op == &pidfs_dentry_operations) return true; if (!is_mounted(path->mnt)) return false; return check_anonymous_mnt(mnt); } static struct mount *__do_loopback(const struct path *old_path, unsigned int flags, unsigned int copy_flags) { struct mount *old = real_mount(old_path->mnt); bool recurse = flags & AT_RECURSIVE; if (IS_MNT_UNBINDABLE(old)) return ERR_PTR(-EINVAL); if (!may_copy_tree(old_path)) return ERR_PTR(-EINVAL); if (!recurse && __has_locked_children(old, old_path->dentry)) return ERR_PTR(-EINVAL); /* * When creating a new mount namespace we don't want to copy over * mounts of mount namespaces to avoid the risk of cycles and also to * minimize the default complex interdependencies between mount * namespaces. * * We could ofc just check whether all mount namespace files aren't * creating cycles but really let's keep this simple. */ if (!(flags & OPEN_TREE_NAMESPACE)) copy_flags |= CL_COPY_MNT_NS_FILE; if (recurse) return copy_tree(old, old_path->dentry, copy_flags); return clone_mnt(old, old_path->dentry, copy_flags); } /* * do loopback mount. */ static int do_loopback(const struct path *path, const char *old_name, int recurse) { struct path old_path __free(path_put) = {}; struct mount *mnt = NULL; unsigned int flags = recurse ? AT_RECURSIVE : 0; int err; if (!old_name || !*old_name) return -EINVAL; err = kern_path(old_name, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &old_path); if (err) return err; if (mnt_ns_loop(old_path.dentry)) return -EINVAL; LOCK_MOUNT(mp, path); if (IS_ERR(mp.parent)) return PTR_ERR(mp.parent); if (!check_mnt(mp.parent)) return -EINVAL; mnt = __do_loopback(&old_path, flags, 0); if (IS_ERR(mnt)) return PTR_ERR(mnt); err = graft_tree(mnt, &mp); if (err) { lock_mount_hash(); umount_tree(mnt, UMOUNT_SYNC); unlock_mount_hash(); } return err; } static struct mnt_namespace *get_detached_copy(const struct path *path, unsigned int flags) { struct mnt_namespace *ns, *mnt_ns = current->nsproxy->mnt_ns, *src_mnt_ns; struct user_namespace *user_ns = mnt_ns->user_ns; struct mount *mnt, *p; ns = alloc_mnt_ns(user_ns, true); if (IS_ERR(ns)) return ns; guard(namespace_excl)(); /* * Record the sequence number of the source mount namespace. * This needs to hold namespace_sem to ensure that the mount * doesn't get attached. */ if (is_mounted(path->mnt)) { src_mnt_ns = real_mount(path->mnt)->mnt_ns; if (is_anon_ns(src_mnt_ns)) ns->seq_origin = src_mnt_ns->seq_origin; else ns->seq_origin = src_mnt_ns->ns.ns_id; } mnt = __do_loopback(path, flags, 0); if (IS_ERR(mnt)) { emptied_ns = ns; return ERR_CAST(mnt); } for (p = mnt; p; p = next_mnt(p, mnt)) { mnt_add_to_ns(ns, p); ns->nr_mounts++; } ns->root = mnt; return ns; } static struct file *open_detached_copy(struct path *path, unsigned int flags) { struct mnt_namespace *ns = get_detached_copy(path, flags); struct file *file; if (IS_ERR(ns)) return ERR_CAST(ns); mntput(path->mnt); path->mnt = mntget(&ns->root->mnt); file = dentry_open(path, O_PATH, current_cred()); if (IS_ERR(file)) dissolve_on_fput(path->mnt); else file->f_mode |= FMODE_NEED_UNMOUNT; return file; } static struct mnt_namespace *create_new_namespace(struct path *path, unsigned int flags) { struct mnt_namespace *ns = current->nsproxy->mnt_ns; struct user_namespace *user_ns = current_user_ns(); struct mnt_namespace *new_ns; struct mount *new_ns_root, *old_ns_root; struct path to_path; struct mount *mnt; unsigned int copy_flags = 0; bool locked = false; if (user_ns != ns->user_ns) copy_flags |= CL_SLAVE; new_ns = alloc_mnt_ns(user_ns, false); if (IS_ERR(new_ns)) return ERR_CAST(new_ns); old_ns_root = ns->root; to_path.mnt = &old_ns_root->mnt; to_path.dentry = old_ns_root->mnt.mnt_root; VFS_WARN_ON_ONCE(old_ns_root->mnt.mnt_sb->s_type != &nullfs_fs_type); LOCK_MOUNT_EXACT_COPY(mp, &to_path, copy_flags); if (IS_ERR(mp.parent)) { free_mnt_ns(new_ns); return ERR_CAST(mp.parent); } new_ns_root = mp.parent; /* * If the real rootfs had a locked mount on top of it somewhere * in the stack, lock the new mount tree as well so it can't be * exposed. */ mnt = old_ns_root; while (mnt->overmount) { mnt = mnt->overmount; if (mnt->mnt.mnt_flags & MNT_LOCKED) locked = true; } /* * We don't emulate unshare()ing a mount namespace. We stick * to the restrictions of creating detached bind-mounts. It * has a lot saner and simpler semantics. */ mnt = __do_loopback(path, flags, copy_flags); scoped_guard(mount_writer) { if (IS_ERR(mnt)) { emptied_ns = new_ns; umount_tree(new_ns_root, 0); return ERR_CAST(mnt); } if (locked) mnt->mnt.mnt_flags |= MNT_LOCKED; /* * now mount the detached tree on top of the copy * of the real rootfs we created. */ attach_mnt(mnt, new_ns_root, mp.mp); if (user_ns != ns->user_ns) lock_mnt_tree(new_ns_root); } for (mnt = new_ns_root; mnt; mnt = next_mnt(mnt, new_ns_root)) { mnt_add_to_ns(new_ns, mnt); new_ns->nr_mounts++; } new_ns->root = new_ns_root; ns_tree_add_raw(new_ns); return new_ns; } static struct file *open_new_namespace(struct path *path, unsigned int flags) { struct mnt_namespace *new_ns; new_ns = create_new_namespace(path, flags); if (IS_ERR(new_ns)) return ERR_CAST(new_ns); return open_namespace_file(to_ns_common(new_ns)); } static struct file *vfs_open_tree(int dfd, const char __user *filename, unsigned int flags) { int ret; struct path path __free(path_put) = {}; int lookup_flags = LOOKUP_AUTOMOUNT | LOOKUP_FOLLOW; BUILD_BUG_ON(OPEN_TREE_CLOEXEC != O_CLOEXEC); if (flags & ~(AT_EMPTY_PATH | AT_NO_AUTOMOUNT | AT_RECURSIVE | AT_SYMLINK_NOFOLLOW | OPEN_TREE_CLONE | OPEN_TREE_CLOEXEC | OPEN_TREE_NAMESPACE)) return ERR_PTR(-EINVAL); if ((flags & (AT_RECURSIVE | OPEN_TREE_CLONE | OPEN_TREE_NAMESPACE)) == AT_RECURSIVE) return ERR_PTR(-EINVAL); if (hweight32(flags & (OPEN_TREE_CLONE | OPEN_TREE_NAMESPACE)) > 1) return ERR_PTR(-EINVAL); if (flags & AT_NO_AUTOMOUNT) lookup_flags &= ~LOOKUP_AUTOMOUNT; if (flags & AT_SYMLINK_NOFOLLOW) lookup_flags &= ~LOOKUP_FOLLOW; /* * If we create a new mount namespace with the cloned mount tree we * just care about being privileged over our current user namespace. * The new mount namespace will be owned by it. */ if ((flags & OPEN_TREE_NAMESPACE) && !ns_capable(current_user_ns(), CAP_SYS_ADMIN)) return ERR_PTR(-EPERM); if ((flags & OPEN_TREE_CLONE) && !may_mount()) return ERR_PTR(-EPERM); CLASS(filename_uflags, name)(filename, flags); ret = filename_lookup(dfd, name, lookup_flags, &path, NULL); if (unlikely(ret)) return ERR_PTR(ret); if (flags & OPEN_TREE_NAMESPACE) return open_new_namespace(&path, flags); if (flags & OPEN_TREE_CLONE) return open_detached_copy(&path, flags); return dentry_open(&path, O_PATH, current_cred()); } SYSCALL_DEFINE3(open_tree, int, dfd, const char __user *, filename, unsigned, flags) { return FD_ADD(flags, vfs_open_tree(dfd, filename, flags)); } /* * Don't allow locked mount flags to be cleared. * * No locks need to be held here while testing the various MNT_LOCK * flags because those flags can never be cleared once they are set. */ static bool can_change_locked_flags(struct mount *mnt, unsigned int mnt_flags) { unsigned int fl = mnt->mnt.mnt_flags; if ((fl & MNT_LOCK_READONLY) && !(mnt_flags & MNT_READONLY)) return false; if ((fl & MNT_LOCK_NODEV) && !(mnt_flags & MNT_NODEV)) return false; if ((fl & MNT_LOCK_NOSUID) && !(mnt_flags & MNT_NOSUID)) return false; if ((fl & MNT_LOCK_NOEXEC) && !(mnt_flags & MNT_NOEXEC)) return false; if ((fl & MNT_LOCK_ATIME) && ((fl & MNT_ATIME_MASK) != (mnt_flags & MNT_ATIME_MASK))) return false; return true; } static int change_mount_ro_state(struct mount *mnt, unsigned int mnt_flags) { bool readonly_request = (mnt_flags & MNT_READONLY); if (readonly_request == __mnt_is_readonly(&mnt->mnt)) return 0; if (readonly_request) return mnt_make_readonly(mnt); mnt->mnt.mnt_flags &= ~MNT_READONLY; return 0; } static void set_mount_attributes(struct mount *mnt, unsigned int mnt_flags) { mnt_flags |= mnt->mnt.mnt_flags & ~MNT_USER_SETTABLE_MASK; mnt->mnt.mnt_flags = mnt_flags; touch_mnt_namespace(mnt->mnt_ns); } static void mnt_warn_timestamp_expiry(const struct path *mountpoint, struct vfsmount *mnt) { struct super_block *sb = mnt->mnt_sb; if (!__mnt_is_readonly(mnt) && (!(sb->s_iflags & SB_I_TS_EXPIRY_WARNED)) && (ktime_get_real_seconds() + TIME_UPTIME_SEC_MAX > sb->s_time_max)) { char *buf, *mntpath; buf = (char *)__get_free_page(GFP_KERNEL); if (buf) mntpath = d_path(mountpoint, buf, PAGE_SIZE); else mntpath = ERR_PTR(-ENOMEM); if (IS_ERR(mntpath)) mntpath = "(unknown)"; pr_warn("%s filesystem being %s at %s supports timestamps until %ptTd (0x%llx)\n", sb->s_type->name, is_mounted(mnt) ? "remounted" : "mounted", mntpath, &sb->s_time_max, (unsigned long long)sb->s_time_max); sb->s_iflags |= SB_I_TS_EXPIRY_WARNED; if (buf) free_page((unsigned long)buf); } } /* * Handle reconfiguration of the mountpoint only without alteration of the * superblock it refers to. This is triggered by specifying MS_REMOUNT|MS_BIND * to mount(2). */ static int do_reconfigure_mnt(const struct path *path, unsigned int mnt_flags) { struct super_block *sb = path->mnt->mnt_sb; struct mount *mnt = real_mount(path->mnt); int ret; if (!check_mnt(mnt)) return -EINVAL; if (!path_mounted(path)) return -EINVAL; if (!can_change_locked_flags(mnt, mnt_flags)) return -EPERM; /* * We're only checking whether the superblock is read-only not * changing it, so only take down_read(&sb->s_umount). */ down_read(&sb->s_umount); lock_mount_hash(); ret = change_mount_ro_state(mnt, mnt_flags); if (ret == 0) set_mount_attributes(mnt, mnt_flags); unlock_mount_hash(); up_read(&sb->s_umount); mnt_warn_timestamp_expiry(path, &mnt->mnt); return ret; } /* * change filesystem flags. dir should be a physical root of filesystem. * If you've mounted a non-root directory somewhere and want to do remount * on it - tough luck. */ static int do_remount(const struct path *path, int sb_flags, int mnt_flags, void *data) { int err; struct super_block *sb = path->mnt->mnt_sb; struct mount *mnt = real_mount(path->mnt); struct fs_context *fc; if (!check_mnt(mnt)) return -EINVAL; if (!path_mounted(path)) return -EINVAL; if (!can_change_locked_flags(mnt, mnt_flags)) return -EPERM; fc = fs_context_for_reconfigure(path->dentry, sb_flags, MS_RMT_MASK); if (IS_ERR(fc)) return PTR_ERR(fc); /* * Indicate to the filesystem that the remount request is coming * from the legacy mount system call. */ fc->oldapi = true; err = parse_monolithic_mount_data(fc, data); if (!err) { down_write(&sb->s_umount); err = -EPERM; if (ns_capable(sb->s_user_ns, CAP_SYS_ADMIN)) { err = reconfigure_super(fc); if (!err) { lock_mount_hash(); set_mount_attributes(mnt, mnt_flags); unlock_mount_hash(); } } up_write(&sb->s_umount); } mnt_warn_timestamp_expiry(path, &mnt->mnt); put_fs_context(fc); return err; } static inline int tree_contains_unbindable(struct mount *mnt) { struct mount *p; for (p = mnt; p; p = next_mnt(p, mnt)) { if (IS_MNT_UNBINDABLE(p)) return 1; } return 0; } static int do_set_group(const struct path *from_path, const struct path *to_path) { struct mount *from = real_mount(from_path->mnt); struct mount *to = real_mount(to_path->mnt); int err; guard(namespace_excl)(); err = may_change_propagation(from); if (err) return err; err = may_change_propagation(to); if (err) return err; /* To and From paths should be mount roots */ if (!path_mounted(from_path)) return -EINVAL; if (!path_mounted(to_path)) return -EINVAL; /* Setting sharing groups is only allowed across same superblock */ if (from->mnt.mnt_sb != to->mnt.mnt_sb) return -EINVAL; /* From mount root should be wider than To mount root */ if (!is_subdir(to->mnt.mnt_root, from->mnt.mnt_root)) return -EINVAL; /* From mount should not have locked children in place of To's root */ if (__has_locked_children(from, to->mnt.mnt_root)) return -EINVAL; /* Setting sharing groups is only allowed on private mounts */ if (IS_MNT_SHARED(to) || IS_MNT_SLAVE(to)) return -EINVAL; /* From should not be private */ if (!IS_MNT_SHARED(from) && !IS_MNT_SLAVE(from)) return -EINVAL; if (IS_MNT_SLAVE(from)) { hlist_add_behind(&to->mnt_slave, &from->mnt_slave); to->mnt_master = from->mnt_master; } if (IS_MNT_SHARED(from)) { to->mnt_group_id = from->mnt_group_id; list_add(&to->mnt_share, &from->mnt_share); set_mnt_shared(to); } return 0; } /** * path_overmounted - check if path is overmounted * @path: path to check * * Check if path is overmounted, i.e., if there's a mount on top of * @path->mnt with @path->dentry as mountpoint. * * Context: namespace_sem must be held at least shared. * MUST NOT be called under lock_mount_hash() (there one should just * call __lookup_mnt() and check if it returns NULL). * Return: If path is overmounted true is returned, false if not. */ static inline bool path_overmounted(const struct path *path) { unsigned seq = read_seqbegin(&mount_lock); bool no_child; rcu_read_lock(); no_child = !__lookup_mnt(path->mnt, path->dentry); rcu_read_unlock(); if (need_seqretry(&mount_lock, seq)) { read_seqlock_excl(&mount_lock); no_child = !__lookup_mnt(path->mnt, path->dentry); read_sequnlock_excl(&mount_lock); } return unlikely(!no_child); } /* * Check if there is a possibly empty chain of descent from p1 to p2. * Locks: namespace_sem (shared) or mount_lock (read_seqlock_excl). */ static bool mount_is_ancestor(const struct mount *p1, const struct mount *p2) { while (p2 != p1 && mnt_has_parent(p2)) p2 = p2->mnt_parent; return p2 == p1; } /** * can_move_mount_beneath - check that we can mount beneath the top mount * @mnt_from: mount we are trying to move * @mnt_to: mount under which to mount * @mp: mountpoint of @mnt_to * * - Make sure that nothing can be mounted beneath the caller's current * root or the rootfs of the namespace. * - Make sure that the caller can unmount the topmost mount ensuring * that the caller could reveal the underlying mountpoint. * - Ensure that nothing has been mounted on top of @mnt_from before we * grabbed @namespace_sem to avoid creating pointless shadow mounts. * - Prevent mounting beneath a mount if the propagation relationship * between the source mount, parent mount, and top mount would lead to * nonsensical mount trees. * * Context: This function expects namespace_lock() to be held. * Return: On success 0, and on error a negative error code is returned. */ static int can_move_mount_beneath(const struct mount *mnt_from, const struct mount *mnt_to, const struct mountpoint *mp) { struct mount *parent_mnt_to = mnt_to->mnt_parent; if (IS_MNT_LOCKED(mnt_to)) return -EINVAL; /* Avoid creating shadow mounts during mount propagation. */ if (mnt_from->overmount) return -EINVAL; /* * Mounting beneath the rootfs only makes sense when the * semantics of pivot_root(".", ".") are used. */ if (&mnt_to->mnt == current->fs->root.mnt) return -EINVAL; if (parent_mnt_to == current->nsproxy->mnt_ns->root) return -EINVAL; if (mount_is_ancestor(mnt_to, mnt_from)) return -EINVAL; /* * If the parent mount propagates to the child mount this would * mean mounting @mnt_from on @mnt_to->mnt_parent and then * propagating a copy @c of @mnt_from on top of @mnt_to. This * defeats the whole purpose of mounting beneath another mount. */ if (propagation_would_overmount(parent_mnt_to, mnt_to, mp)) return -EINVAL; /* * If @mnt_to->mnt_parent propagates to @mnt_from this would * mean propagating a copy @c of @mnt_from on top of @mnt_from. * Afterwards @mnt_from would be mounted on top of * @mnt_to->mnt_parent and @mnt_to would be unmounted from * @mnt->mnt_parent and remounted on @mnt_from. But since @c is * already mounted on @mnt_from, @mnt_to would ultimately be * remounted on top of @c. Afterwards, @mnt_from would be * covered by a copy @c of @mnt_from and @c would be covered by * @mnt_from itself. This defeats the whole purpose of mounting * @mnt_from beneath @mnt_to. */ if (check_mnt(mnt_from) && propagation_would_overmount(parent_mnt_to, mnt_from, mp)) return -EINVAL; return 0; } /* may_use_mount() - check if a mount tree can be used * @mnt: vfsmount to be used * * This helper checks if the caller may use the mount tree starting * from @path->mnt. The caller may use the mount tree under the * following circumstances: * * (1) The caller is located in the mount namespace of the mount tree. * This also implies that the mount does not belong to an anonymous * mount namespace. * (2) The caller is trying to use a mount tree that belongs to an * anonymous mount namespace. * * For that to be safe, this helper enforces that the origin mount * namespace the anonymous mount namespace was created from is the * same as the caller's mount namespace by comparing the sequence * numbers. * * The ownership of a non-anonymous mount namespace such as the * caller's cannot change. * => We know that the caller's mount namespace is stable. * * If the origin sequence number of the anonymous mount namespace is * the same as the sequence number of the caller's mount namespace. * => The owning namespaces are the same. * * ==> The earlier capability check on the owning namespace of the * caller's mount namespace ensures that the caller has the * ability to use the mount tree. * * Returns true if the mount tree can be used, false otherwise. */ static inline bool may_use_mount(struct mount *mnt) { if (check_mnt(mnt)) return true; /* * Make sure that noone unmounted the target path or somehow * managed to get their hands on something purely kernel * internal. */ if (!is_mounted(&mnt->mnt)) return false; return check_anonymous_mnt(mnt); } static int do_move_mount(const struct path *old_path, const struct path *new_path, enum mnt_tree_flags_t flags) { struct mount *old = real_mount(old_path->mnt); int err; bool beneath = flags & MNT_TREE_BENEATH; if (!path_mounted(old_path)) return -EINVAL; if (d_is_dir(new_path->dentry) != d_is_dir(old_path->dentry)) return -EINVAL; LOCK_MOUNT_MAYBE_BENEATH(mp, new_path, beneath); if (IS_ERR(mp.parent)) return PTR_ERR(mp.parent); if (check_mnt(old)) { /* if the source is in our namespace... */ /* ... it should be detachable from parent */ if (!mnt_has_parent(old) || IS_MNT_LOCKED(old)) return -EINVAL; /* ... which should not be shared */ if (IS_MNT_SHARED(old->mnt_parent)) return -EINVAL; /* ... and the target should be in our namespace */ if (!check_mnt(mp.parent)) return -EINVAL; } else { /* * otherwise the source must be the root of some anon namespace. */ if (!anon_ns_root(old)) return -EINVAL; /* * Bail out early if the target is within the same namespace - * subsequent checks would've rejected that, but they lose * some corner cases if we check it early. */ if (old->mnt_ns == mp.parent->mnt_ns) return -EINVAL; /* * Target should be either in our namespace or in an acceptable * anon namespace, sensu check_anonymous_mnt(). */ if (!may_use_mount(mp.parent)) return -EINVAL; } if (beneath) { struct mount *over = real_mount(new_path->mnt); if (mp.parent != over->mnt_parent) over = mp.parent->overmount; err = can_move_mount_beneath(old, over, mp.mp); if (err) return err; } /* * Don't move a mount tree containing unbindable mounts to a destination * mount which is shared. */ if (IS_MNT_SHARED(mp.parent) && tree_contains_unbindable(old)) return -EINVAL; if (!check_for_nsfs_mounts(old)) return -ELOOP; if (mount_is_ancestor(old, mp.parent)) return -ELOOP; return attach_recursive_mnt(old, &mp); } static int do_move_mount_old(const struct path *path, const char *old_name) { struct path old_path __free(path_put) = {}; int err; if (!old_name || !*old_name) return -EINVAL; err = kern_path(old_name, LOOKUP_FOLLOW, &old_path); if (err) return err; return do_move_mount(&old_path, path, 0); } /* * add a mount into a namespace's mount tree */ static int do_add_mount(struct mount *newmnt, const struct pinned_mountpoint *mp, int mnt_flags) { struct mount *parent = mp->parent; if (IS_ERR(parent)) return PTR_ERR(parent); mnt_flags &= ~MNT_INTERNAL_FLAGS; if (unlikely(!check_mnt(parent))) { /* that's acceptable only for automounts done in private ns */ if (!(mnt_flags & MNT_SHRINKABLE)) return -EINVAL; /* ... and for those we'd better have mountpoint still alive */ if (!parent->mnt_ns) return -EINVAL; } /* Refuse the same filesystem on the same mount point */ if (parent->mnt.mnt_sb == newmnt->mnt.mnt_sb && parent->mnt.mnt_root == mp->mp->m_dentry) return -EBUSY; if (d_is_symlink(newmnt->mnt.mnt_root)) return -EINVAL; newmnt->mnt.mnt_flags = mnt_flags; return graft_tree(newmnt, mp); } static bool mount_too_revealing(const struct super_block *sb, int *new_mnt_flags); /* * Create a new mount using a superblock configuration and request it * be added to the namespace tree. */ static int do_new_mount_fc(struct fs_context *fc, const struct path *mountpoint, unsigned int mnt_flags) { struct super_block *sb; struct vfsmount *mnt __free(mntput) = fc_mount(fc); int error; if (IS_ERR(mnt)) return PTR_ERR(mnt); sb = fc->root->d_sb; error = security_sb_kern_mount(sb); if (unlikely(error)) return error; if (unlikely(mount_too_revealing(sb, &mnt_flags))) { errorfcp(fc, "VFS", "Mount too revealing"); return -EPERM; } mnt_warn_timestamp_expiry(mountpoint, mnt); LOCK_MOUNT(mp, mountpoint); error = do_add_mount(real_mount(mnt), &mp, mnt_flags); if (!error) retain_and_null_ptr(mnt); // consumed on success return error; } /* * create a new mount for userspace and request it to be added into the * namespace's tree */ static int do_new_mount(const struct path *path, const char *fstype, int sb_flags, int mnt_flags, const char *name, void *data) { struct file_system_type *type; struct fs_context *fc; const char *subtype = NULL; int err = 0; if (!fstype) return -EINVAL; type = get_fs_type(fstype); if (!type) return -ENODEV; if (type->fs_flags & FS_HAS_SUBTYPE) { subtype = strchr(fstype, '.'); if (subtype) { subtype++; if (!*subtype) { put_filesystem(type); return -EINVAL; } } } fc = fs_context_for_mount(type, sb_flags); put_filesystem(type); if (IS_ERR(fc)) return PTR_ERR(fc); /* * Indicate to the filesystem that the mount request is coming * from the legacy mount system call. */ fc->oldapi = true; if (subtype) err = vfs_parse_fs_string(fc, "subtype", subtype); if (!err && name) err = vfs_parse_fs_string(fc, "source", name); if (!err) err = parse_monolithic_mount_data(fc, data); if (!err && !mount_capable(fc)) err = -EPERM; if (!err) err = do_new_mount_fc(fc, path, mnt_flags); put_fs_context(fc); return err; } static void lock_mount_exact(const struct path *path, struct pinned_mountpoint *mp, bool copy_mount, unsigned int copy_flags) { struct dentry *dentry = path->dentry; int err; /* Assert that inode_lock() locked the correct inode. */ VFS_WARN_ON_ONCE(copy_mount && !path_mounted(path)); inode_lock(dentry->d_inode); namespace_lock(); if (unlikely(cant_mount(dentry))) err = -ENOENT; else if (!copy_mount && path_overmounted(path)) err = -EBUSY; else err = get_mountpoint(dentry, mp); if (unlikely(err)) { namespace_unlock(); inode_unlock(dentry->d_inode); mp->parent = ERR_PTR(err); return; } if (copy_mount) mp->parent = clone_mnt(real_mount(path->mnt), dentry, copy_flags); else mp->parent = real_mount(path->mnt); if (unlikely(IS_ERR(mp->parent))) __unlock_mount(mp); } int finish_automount(struct vfsmount *__m, const struct path *path) { struct vfsmount *m __free(mntput) = __m; struct mount *mnt; int err; if (!m) return 0; if (IS_ERR(m)) return PTR_ERR(m); mnt = real_mount(m); if (m->mnt_root == path->dentry) return -ELOOP; /* * we don't want to use LOCK_MOUNT() - in this case finding something * that overmounts our mountpoint to be means "quitely drop what we've * got", not "try to mount it on top". */ LOCK_MOUNT_EXACT(mp, path); if (mp.parent == ERR_PTR(-EBUSY)) return 0; err = do_add_mount(mnt, &mp, path->mnt->mnt_flags | MNT_SHRINKABLE); if (likely(!err)) retain_and_null_ptr(m); return err; } /** * mnt_set_expiry - Put a mount on an expiration list * @mnt: The mount to list. * @expiry_list: The list to add the mount to. */ void mnt_set_expiry(struct vfsmount *mnt, struct list_head *expiry_list) { guard(mount_locked_reader)(); list_add_tail(&real_mount(mnt)->mnt_expire, expiry_list); } EXPORT_SYMBOL(mnt_set_expiry); /* * process a list of expirable mountpoints with the intent of discarding any * mountpoints that aren't in use and haven't been touched since last we came * here */ void mark_mounts_for_expiry(struct list_head *mounts) { struct mount *mnt, *next; LIST_HEAD(graveyard); if (list_empty(mounts)) return; guard(namespace_excl)(); guard(mount_writer)(); /* extract from the expiration list every vfsmount that matches the * following criteria: * - already mounted * - only referenced by its parent vfsmount * - still marked for expiry (marked on the last call here; marks are * cleared by mntput()) */ list_for_each_entry_safe(mnt, next, mounts, mnt_expire) { if (!is_mounted(&mnt->mnt)) continue; if (!xchg(&mnt->mnt_expiry_mark, 1) || propagate_mount_busy(mnt, 1)) continue; list_move(&mnt->mnt_expire, &graveyard); } while (!list_empty(&graveyard)) { mnt = list_first_entry(&graveyard, struct mount, mnt_expire); touch_mnt_namespace(mnt->mnt_ns); umount_tree(mnt, UMOUNT_PROPAGATE|UMOUNT_SYNC); } } EXPORT_SYMBOL_GPL(mark_mounts_for_expiry); /* * Ripoff of 'select_parent()' * * search the list of submounts for a given mountpoint, and move any * shrinkable submounts to the 'graveyard' list. */ static int select_submounts(struct mount *parent, struct list_head *graveyard) { struct mount *this_parent = parent; struct list_head *next; int found = 0; repeat: next = this_parent->mnt_mounts.next; resume: while (next != &this_parent->mnt_mounts) { struct list_head *tmp = next; struct mount *mnt = list_entry(tmp, struct mount, mnt_child); next = tmp->next; if (!(mnt->mnt.mnt_flags & MNT_SHRINKABLE)) continue; /* * Descend a level if the d_mounts list is non-empty. */ if (!list_empty(&mnt->mnt_mounts)) { this_parent = mnt; goto repeat; } if (!propagate_mount_busy(mnt, 1)) { list_move_tail(&mnt->mnt_expire, graveyard); found++; } } /* * All done at this level ... ascend and resume the search */ if (this_parent != parent) { next = this_parent->mnt_child.next; this_parent = this_parent->mnt_parent; goto resume; } return found; } /* * process a list of expirable mountpoints with the intent of discarding any * submounts of a specific parent mountpoint * * mount_lock must be held for write */ static void shrink_submounts(struct mount *mnt) { LIST_HEAD(graveyard); struct mount *m; /* extract submounts of 'mountpoint' from the expiration list */ while (select_submounts(mnt, &graveyard)) { while (!list_empty(&graveyard)) { m = list_first_entry(&graveyard, struct mount, mnt_expire); touch_mnt_namespace(m->mnt_ns); umount_tree(m, UMOUNT_PROPAGATE|UMOUNT_SYNC); } } } static void *copy_mount_options(const void __user * data) { char *copy; unsigned left, offset; if (!data) return NULL; copy = kmalloc(PAGE_SIZE, GFP_KERNEL); if (!copy) return ERR_PTR(-ENOMEM); left = copy_from_user(copy, data, PAGE_SIZE); /* * Not all architectures have an exact copy_from_user(). Resort to * byte at a time. */ offset = PAGE_SIZE - left; while (left) { char c; if (get_user(c, (const char __user *)data + offset)) break; copy[offset] = c; left--; offset++; } if (left == PAGE_SIZE) { kfree(copy); return ERR_PTR(-EFAULT); } return copy; } static char *copy_mount_string(const void __user *data) { return data ? strndup_user(data, PATH_MAX) : NULL; } /* * Flags is a 32-bit value that allows up to 31 non-fs dependent flags to * be given to the mount() call (ie: read-only, no-dev, no-suid etc). * * data is a (void *) that can point to any structure up to * PAGE_SIZE-1 bytes, which can contain arbitrary fs-dependent * information (or be NULL). * * Pre-0.97 versions of mount() didn't have a flags word. * When the flags word was introduced its top half was required * to have the magic value 0xC0ED, and this remained so until 2.4.0-test9. * Therefore, if this magic number is present, it carries no information * and must be discarded. */ int path_mount(const char *dev_name, const struct path *path, const char *type_page, unsigned long flags, void *data_page) { unsigned int mnt_flags = 0, sb_flags; int ret; /* Discard magic */ if ((flags & MS_MGC_MSK) == MS_MGC_VAL) flags &= ~MS_MGC_MSK; /* Basic sanity checks */ if (data_page) ((char *)data_page)[PAGE_SIZE - 1] = 0; if (flags & MS_NOUSER) return -EINVAL; ret = security_sb_mount(dev_name, path, type_page, flags, data_page); if (ret) return ret; if (!may_mount()) return -EPERM; if (flags & SB_MANDLOCK) warn_mandlock(); /* Default to relatime unless overriden */ if (!(flags & MS_NOATIME)) mnt_flags |= MNT_RELATIME; /* Separate the per-mountpoint flags */ if (flags & MS_NOSUID) mnt_flags |= MNT_NOSUID; if (flags & MS_NODEV) mnt_flags |= MNT_NODEV; if (flags & MS_NOEXEC) mnt_flags |= MNT_NOEXEC; if (flags & MS_NOATIME) mnt_flags |= MNT_NOATIME; if (flags & MS_NODIRATIME) mnt_flags |= MNT_NODIRATIME; if (flags & MS_STRICTATIME) mnt_flags &= ~(MNT_RELATIME | MNT_NOATIME); if (flags & MS_RDONLY) mnt_flags |= MNT_READONLY; if (flags & MS_NOSYMFOLLOW) mnt_flags |= MNT_NOSYMFOLLOW; /* The default atime for remount is preservation */ if ((flags & MS_REMOUNT) && ((flags & (MS_NOATIME | MS_NODIRATIME | MS_RELATIME | MS_STRICTATIME)) == 0)) { mnt_flags &= ~MNT_ATIME_MASK; mnt_flags |= path->mnt->mnt_flags & MNT_ATIME_MASK; } sb_flags = flags & (SB_RDONLY | SB_SYNCHRONOUS | SB_MANDLOCK | SB_DIRSYNC | SB_SILENT | SB_POSIXACL | SB_LAZYTIME | SB_I_VERSION); if ((flags & (MS_REMOUNT | MS_BIND)) == (MS_REMOUNT | MS_BIND)) return do_reconfigure_mnt(path, mnt_flags); if (flags & MS_REMOUNT) return do_remount(path, sb_flags, mnt_flags, data_page); if (flags & MS_BIND) return do_loopback(path, dev_name, flags & MS_REC); if (flags & (MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE)) return do_change_type(path, flags); if (flags & MS_MOVE) return do_move_mount_old(path, dev_name); return do_new_mount(path, type_page, sb_flags, mnt_flags, dev_name, data_page); } int do_mount(const char *dev_name, const char __user *dir_name, const char *type_page, unsigned long flags, void *data_page) { struct path path __free(path_put) = {}; int ret; ret = user_path_at(AT_FDCWD, dir_name, LOOKUP_FOLLOW, &path); if (ret) return ret; return path_mount(dev_name, &path, type_page, flags, data_page); } static struct ucounts *inc_mnt_namespaces(struct user_namespace *ns) { return inc_ucount(ns, current_euid(), UCOUNT_MNT_NAMESPACES); } static void dec_mnt_namespaces(struct ucounts *ucounts) { dec_ucount(ucounts, UCOUNT_MNT_NAMESPACES); } static void free_mnt_ns(struct mnt_namespace *ns) { if (!is_anon_ns(ns)) ns_common_free(ns); dec_mnt_namespaces(ns->ucounts); mnt_ns_tree_remove(ns); } static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns, bool anon) { struct mnt_namespace *new_ns; struct ucounts *ucounts; int ret; ucounts = inc_mnt_namespaces(user_ns); if (!ucounts) return ERR_PTR(-ENOSPC); new_ns = kzalloc_obj(struct mnt_namespace, GFP_KERNEL_ACCOUNT); if (!new_ns) { dec_mnt_namespaces(ucounts); return ERR_PTR(-ENOMEM); } if (anon) ret = ns_common_init_inum(new_ns, MNT_NS_ANON_INO); else ret = ns_common_init(new_ns); if (ret) { kfree(new_ns); dec_mnt_namespaces(ucounts); return ERR_PTR(ret); } ns_tree_gen_id(new_ns); new_ns->is_anon = anon; refcount_set(&new_ns->passive, 1); new_ns->mounts = RB_ROOT; init_waitqueue_head(&new_ns->poll); new_ns->user_ns = get_user_ns(user_ns); new_ns->ucounts = ucounts; return new_ns; } __latent_entropy struct mnt_namespace *copy_mnt_ns(u64 flags, struct mnt_namespace *ns, struct user_namespace *user_ns, struct fs_struct *new_fs) { struct mnt_namespace *new_ns; struct vfsmount *rootmnt __free(mntput) = NULL; struct vfsmount *pwdmnt __free(mntput) = NULL; struct mount *p, *q; struct mount *old; struct mount *new; int copy_flags; BUG_ON(!ns); if (likely(!(flags & CLONE_NEWNS))) { get_mnt_ns(ns); return ns; } old = ns->root; new_ns = alloc_mnt_ns(user_ns, false); if (IS_ERR(new_ns)) return new_ns; guard(namespace_excl)(); /* First pass: copy the tree topology */ copy_flags = CL_COPY_UNBINDABLE | CL_EXPIRE; if (user_ns != ns->user_ns) copy_flags |= CL_SLAVE; new = copy_tree(old, old->mnt.mnt_root, copy_flags); if (IS_ERR(new)) { emptied_ns = new_ns; return ERR_CAST(new); } if (user_ns != ns->user_ns) { guard(mount_writer)(); lock_mnt_tree(new); } new_ns->root = new; /* * Second pass: switch the tsk->fs->* elements and mark new vfsmounts * as belonging to new namespace. We have already acquired a private * fs_struct, so tsk->fs->lock is not needed. */ p = old; q = new; while (p) { mnt_add_to_ns(new_ns, q); new_ns->nr_mounts++; if (new_fs) { if (&p->mnt == new_fs->root.mnt) { new_fs->root.mnt = mntget(&q->mnt); rootmnt = &p->mnt; } if (&p->mnt == new_fs->pwd.mnt) { new_fs->pwd.mnt = mntget(&q->mnt); pwdmnt = &p->mnt; } } p = next_mnt(p, old); q = next_mnt(q, new); if (!q) break; // an mntns binding we'd skipped? while (p->mnt.mnt_root != q->mnt.mnt_root) p = next_mnt(skip_mnt_tree(p), old); } ns_tree_add_raw(new_ns); return new_ns; } struct dentry *mount_subtree(struct vfsmount *m, const char *name) { struct mount *mnt = real_mount(m); struct mnt_namespace *ns; struct super_block *s; struct path path; int err; ns = alloc_mnt_ns(&init_user_ns, true); if (IS_ERR(ns)) { mntput(m); return ERR_CAST(ns); } ns->root = mnt; ns->nr_mounts++; mnt_add_to_ns(ns, mnt); err = vfs_path_lookup(m->mnt_root, m, name, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &path); put_mnt_ns(ns); if (err) return ERR_PTR(err); /* trade a vfsmount reference for active sb one */ s = path.mnt->mnt_sb; atomic_inc(&s->s_active); mntput(path.mnt); /* lock the sucker */ down_write(&s->s_umount); /* ... and return the root of (sub)tree on it */ return path.dentry; } EXPORT_SYMBOL(mount_subtree); SYSCALL_DEFINE5(mount, char __user *, dev_name, char __user *, dir_name, char __user *, type, unsigned long, flags, void __user *, data) { int ret; char *kernel_type; char *kernel_dev; void *options; kernel_type = copy_mount_string(type); ret = PTR_ERR(kernel_type); if (IS_ERR(kernel_type)) goto out_type; kernel_dev = copy_mount_string(dev_name); ret = PTR_ERR(kernel_dev); if (IS_ERR(kernel_dev)) goto out_dev; options = copy_mount_options(data); ret = PTR_ERR(options); if (IS_ERR(options)) goto out_data; ret = do_mount(kernel_dev, dir_name, kernel_type, flags, options); kfree(options); out_data: kfree(kernel_dev); out_dev: kfree(kernel_type); out_type: return ret; } #define FSMOUNT_VALID_FLAGS \ (MOUNT_ATTR_RDONLY | MOUNT_ATTR_NOSUID | MOUNT_ATTR_NODEV | \ MOUNT_ATTR_NOEXEC | MOUNT_ATTR__ATIME | MOUNT_ATTR_NODIRATIME | \ MOUNT_ATTR_NOSYMFOLLOW) #define MOUNT_SETATTR_VALID_FLAGS (FSMOUNT_VALID_FLAGS | MOUNT_ATTR_IDMAP) #define MOUNT_SETATTR_PROPAGATION_FLAGS \ (MS_UNBINDABLE | MS_PRIVATE | MS_SLAVE | MS_SHARED) static unsigned int attr_flags_to_mnt_flags(u64 attr_flags) { unsigned int mnt_flags = 0; if (attr_flags & MOUNT_ATTR_RDONLY) mnt_flags |= MNT_READONLY; if (attr_flags & MOUNT_ATTR_NOSUID) mnt_flags |= MNT_NOSUID; if (attr_flags & MOUNT_ATTR_NODEV) mnt_flags |= MNT_NODEV; if (attr_flags & MOUNT_ATTR_NOEXEC) mnt_flags |= MNT_NOEXEC; if (attr_flags & MOUNT_ATTR_NODIRATIME) mnt_flags |= MNT_NODIRATIME; if (attr_flags & MOUNT_ATTR_NOSYMFOLLOW) mnt_flags |= MNT_NOSYMFOLLOW; return mnt_flags; } /* * Create a kernel mount representation for a new, prepared superblock * (specified by fs_fd) and attach to an open_tree-like file descriptor. */ SYSCALL_DEFINE3(fsmount, int, fs_fd, unsigned int, flags, unsigned int, attr_flags) { struct path new_path __free(path_put) = {}; struct mnt_namespace *ns; struct fs_context *fc; struct vfsmount *new_mnt; struct mount *mnt; unsigned int mnt_flags = 0; long ret; if (!may_mount()) return -EPERM; if ((flags & ~(FSMOUNT_CLOEXEC)) != 0) return -EINVAL; if (attr_flags & ~FSMOUNT_VALID_FLAGS) return -EINVAL; mnt_flags = attr_flags_to_mnt_flags(attr_flags); switch (attr_flags & MOUNT_ATTR__ATIME) { case MOUNT_ATTR_STRICTATIME: break; case MOUNT_ATTR_NOATIME: mnt_flags |= MNT_NOATIME; break; case MOUNT_ATTR_RELATIME: mnt_flags |= MNT_RELATIME; break; default: return -EINVAL; } CLASS(fd, f)(fs_fd); if (fd_empty(f)) return -EBADF; if (fd_file(f)->f_op != &fscontext_fops) return -EINVAL; fc = fd_file(f)->private_data; ACQUIRE(mutex_intr, uapi_mutex)(&fc->uapi_mutex); ret = ACQUIRE_ERR(mutex_intr, &uapi_mutex); if (ret) return ret; /* There must be a valid superblock or we can't mount it */ ret = -EINVAL; if (!fc->root) return ret; ret = -EPERM; if (mount_too_revealing(fc->root->d_sb, &mnt_flags)) { errorfcp(fc, "VFS", "Mount too revealing"); return ret; } ret = -EBUSY; if (fc->phase != FS_CONTEXT_AWAITING_MOUNT) return ret; if (fc->sb_flags & SB_MANDLOCK) warn_mandlock(); new_mnt = vfs_create_mount(fc); if (IS_ERR(new_mnt)) return PTR_ERR(new_mnt); new_mnt->mnt_flags = mnt_flags; new_path.dentry = dget(fc->root); new_path.mnt = new_mnt; /* We've done the mount bit - now move the file context into more or * less the same state as if we'd done an fspick(). We don't want to * do any memory allocation or anything like that at this point as we * don't want to have to handle any errors incurred. */ vfs_clean_context(fc); ns = alloc_mnt_ns(current->nsproxy->mnt_ns->user_ns, true); if (IS_ERR(ns)) return PTR_ERR(ns); mnt = real_mount(new_path.mnt); ns->root = mnt; ns->nr_mounts = 1; mnt_add_to_ns(ns, mnt); mntget(new_path.mnt); FD_PREPARE(fdf, (flags & FSMOUNT_CLOEXEC) ? O_CLOEXEC : 0, dentry_open(&new_path, O_PATH, fc->cred)); if (fdf.err) { dissolve_on_fput(new_path.mnt); return fdf.err; } /* * Attach to an apparent O_PATH fd with a note that we * need to unmount it, not just simply put it. */ fd_prepare_file(fdf)->f_mode |= FMODE_NEED_UNMOUNT; return fd_publish(fdf); } static inline int vfs_move_mount(const struct path *from_path, const struct path *to_path, enum mnt_tree_flags_t mflags) { int ret; ret = security_move_mount(from_path, to_path); if (ret) return ret; if (mflags & MNT_TREE_PROPAGATION) return do_set_group(from_path, to_path); return do_move_mount(from_path, to_path, mflags); } /* * Move a mount from one place to another. In combination with * fsopen()/fsmount() this is used to install a new mount and in combination * with open_tree(OPEN_TREE_CLONE [| AT_RECURSIVE]) it can be used to copy * a mount subtree. * * Note the flags value is a combination of MOVE_MOUNT_* flags. */ SYSCALL_DEFINE5(move_mount, int, from_dfd, const char __user *, from_pathname, int, to_dfd, const char __user *, to_pathname, unsigned int, flags) { struct path to_path __free(path_put) = {}; struct path from_path __free(path_put) = {}; unsigned int lflags, uflags; enum mnt_tree_flags_t mflags = 0; int ret = 0; if (!may_mount()) return -EPERM; if (flags & ~MOVE_MOUNT__MASK) return -EINVAL; if ((flags & (MOVE_MOUNT_BENEATH | MOVE_MOUNT_SET_GROUP)) == (MOVE_MOUNT_BENEATH | MOVE_MOUNT_SET_GROUP)) return -EINVAL; if (flags & MOVE_MOUNT_SET_GROUP) mflags |= MNT_TREE_PROPAGATION; if (flags & MOVE_MOUNT_BENEATH) mflags |= MNT_TREE_BENEATH; uflags = 0; if (flags & MOVE_MOUNT_T_EMPTY_PATH) uflags = AT_EMPTY_PATH; CLASS(filename_maybe_null,to_name)(to_pathname, uflags); if (!to_name && to_dfd >= 0) { CLASS(fd_raw, f_to)(to_dfd); if (fd_empty(f_to)) return -EBADF; to_path = fd_file(f_to)->f_path; path_get(&to_path); } else { lflags = 0; if (flags & MOVE_MOUNT_T_SYMLINKS) lflags |= LOOKUP_FOLLOW; if (flags & MOVE_MOUNT_T_AUTOMOUNTS) lflags |= LOOKUP_AUTOMOUNT; ret = filename_lookup(to_dfd, to_name, lflags, &to_path, NULL); if (ret) return ret; } uflags = 0; if (flags & MOVE_MOUNT_F_EMPTY_PATH) uflags = AT_EMPTY_PATH; CLASS(filename_maybe_null,from_name)(from_pathname, uflags); if (!from_name && from_dfd >= 0) { CLASS(fd_raw, f_from)(from_dfd); if (fd_empty(f_from)) return -EBADF; return vfs_move_mount(&fd_file(f_from)->f_path, &to_path, mflags); } lflags = 0; if (flags & MOVE_MOUNT_F_SYMLINKS) lflags |= LOOKUP_FOLLOW; if (flags & MOVE_MOUNT_F_AUTOMOUNTS) lflags |= LOOKUP_AUTOMOUNT; ret = filename_lookup(from_dfd, from_name, lflags, &from_path, NULL); if (ret) return ret; return vfs_move_mount(&from_path, &to_path, mflags); } /* * Return true if path is reachable from root * * locks: mount_locked_reader || namespace_shared && is_mounted(mnt) */ bool is_path_reachable(struct mount *mnt, struct dentry *dentry, const struct path *root) { while (&mnt->mnt != root->mnt && mnt_has_parent(mnt)) { dentry = mnt->mnt_mountpoint; mnt = mnt->mnt_parent; } return &mnt->mnt == root->mnt && is_subdir(dentry, root->dentry); } bool path_is_under(const struct path *path1, const struct path *path2) { guard(mount_locked_reader)(); return is_path_reachable(real_mount(path1->mnt), path1->dentry, path2); } EXPORT_SYMBOL(path_is_under); int path_pivot_root(struct path *new, struct path *old) { struct path root __free(path_put) = {}; struct mount *new_mnt, *root_mnt, *old_mnt, *root_parent, *ex_parent; int error; if (!may_mount()) return -EPERM; error = security_sb_pivotroot(old, new); if (error) return error; get_fs_root(current->fs, &root); LOCK_MOUNT(old_mp, old); old_mnt = old_mp.parent; if (IS_ERR(old_mnt)) return PTR_ERR(old_mnt); new_mnt = real_mount(new->mnt); root_mnt = real_mount(root.mnt); ex_parent = new_mnt->mnt_parent; root_parent = root_mnt->mnt_parent; if (IS_MNT_SHARED(old_mnt) || IS_MNT_SHARED(ex_parent) || IS_MNT_SHARED(root_parent)) return -EINVAL; if (!check_mnt(root_mnt) || !check_mnt(new_mnt)) return -EINVAL; if (new_mnt->mnt.mnt_flags & MNT_LOCKED) return -EINVAL; if (d_unlinked(new->dentry)) return -ENOENT; if (new_mnt == root_mnt || old_mnt == root_mnt) return -EBUSY; /* loop, on the same file system */ if (!path_mounted(&root)) return -EINVAL; /* not a mountpoint */ if (!mnt_has_parent(root_mnt)) return -EINVAL; /* absolute root */ if (!path_mounted(new)) return -EINVAL; /* not a mountpoint */ if (!mnt_has_parent(new_mnt)) return -EINVAL; /* absolute root */ /* make sure we can reach put_old from new_root */ if (!is_path_reachable(old_mnt, old_mp.mp->m_dentry, new)) return -EINVAL; /* make certain new is below the root */ if (!is_path_reachable(new_mnt, new->dentry, &root)) return -EINVAL; lock_mount_hash(); umount_mnt(new_mnt); if (root_mnt->mnt.mnt_flags & MNT_LOCKED) { new_mnt->mnt.mnt_flags |= MNT_LOCKED; root_mnt->mnt.mnt_flags &= ~MNT_LOCKED; } /* mount new_root on / */ attach_mnt(new_mnt, root_parent, root_mnt->mnt_mp); umount_mnt(root_mnt); /* mount old root on put_old */ attach_mnt(root_mnt, old_mnt, old_mp.mp); touch_mnt_namespace(current->nsproxy->mnt_ns); /* A moved mount should not expire automatically */ list_del_init(&new_mnt->mnt_expire); unlock_mount_hash(); mnt_notify_add(root_mnt); mnt_notify_add(new_mnt); chroot_fs_refs(&root, new); return 0; } /* * pivot_root Semantics: * Moves the root file system of the current process to the directory put_old, * makes new_root as the new root file system of the current process, and sets * root/cwd of all processes which had them on the current root to new_root. * * Restrictions: * The new_root and put_old must be directories, and must not be on the * same file system as the current process root. The put_old must be * underneath new_root, i.e. adding a non-zero number of /.. to the string * pointed to by put_old must yield the same directory as new_root. No other * file system may be mounted on put_old. After all, new_root is a mountpoint. * * The immutable nullfs filesystem is mounted as the true root of the VFS * hierarchy. The mutable rootfs (tmpfs/ramfs) is layered on top of this, * allowing pivot_root() to work normally from initramfs. * * Notes: * - we don't move root/cwd if they are not at the root (reason: if something * cared enough to change them, it's probably wrong to force them elsewhere) * - it's okay to pick a root that isn't the root of a file system, e.g. * /nfs/my_root where /nfs is the mount point. It must be a mountpoint, * though, so you may need to say mount --bind /nfs/my_root /nfs/my_root * first. */ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root, const char __user *, put_old) { struct path new __free(path_put) = {}; struct path old __free(path_put) = {}; int error; error = user_path_at(AT_FDCWD, new_root, LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &new); if (error) return error; error = user_path_at(AT_FDCWD, put_old, LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &old); if (error) return error; return path_pivot_root(&new, &old); } static unsigned int recalc_flags(struct mount_kattr *kattr, struct mount *mnt) { unsigned int flags = mnt->mnt.mnt_flags; /* flags to clear */ flags &= ~kattr->attr_clr; /* flags to raise */ flags |= kattr->attr_set; return flags; } static int can_idmap_mount(const struct mount_kattr *kattr, struct mount *mnt) { struct vfsmount *m = &mnt->mnt; struct user_namespace *fs_userns = m->mnt_sb->s_user_ns; if (!kattr->mnt_idmap) return 0; /* * Creating an idmapped mount with the filesystem wide idmapping * doesn't make sense so block that. We don't allow mushy semantics. */ if (kattr->mnt_userns == m->mnt_sb->s_user_ns) return -EINVAL; /* * We only allow an mount to change it's idmapping if it has * never been accessible to userspace. */ if (!(kattr->kflags & MOUNT_KATTR_IDMAP_REPLACE) && is_idmapped_mnt(m)) return -EPERM; /* The underlying filesystem doesn't support idmapped mounts yet. */ if (!(m->mnt_sb->s_type->fs_flags & FS_ALLOW_IDMAP)) return -EINVAL; /* The filesystem has turned off idmapped mounts. */ if (m->mnt_sb->s_iflags & SB_I_NOIDMAP) return -EINVAL; /* We're not controlling the superblock. */ if (!ns_capable(fs_userns, CAP_SYS_ADMIN)) return -EPERM; /* Mount has already been visible in the filesystem hierarchy. */ if (!is_anon_ns(mnt->mnt_ns)) return -EINVAL; return 0; } /** * mnt_allow_writers() - check whether the attribute change allows writers * @kattr: the new mount attributes * @mnt: the mount to which @kattr will be applied * * Check whether thew new mount attributes in @kattr allow concurrent writers. * * Return: true if writers need to be held, false if not */ static inline bool mnt_allow_writers(const struct mount_kattr *kattr, const struct mount *mnt) { return (!(kattr->attr_set & MNT_READONLY) || (mnt->mnt.mnt_flags & MNT_READONLY)) && !kattr->mnt_idmap; } static int mount_setattr_prepare(struct mount_kattr *kattr, struct mount *mnt) { struct mount *m; int err; for (m = mnt; m; m = next_mnt(m, mnt)) { if (!can_change_locked_flags(m, recalc_flags(kattr, m))) { err = -EPERM; break; } err = can_idmap_mount(kattr, m); if (err) break; if (!mnt_allow_writers(kattr, m)) { err = mnt_hold_writers(m); if (err) { m = next_mnt(m, mnt); break; } } if (!(kattr->kflags & MOUNT_KATTR_RECURSE)) return 0; } if (err) { /* undo all mnt_hold_writers() we'd done */ for (struct mount *p = mnt; p != m; p = next_mnt(p, mnt)) mnt_unhold_writers(p); } return err; } static void do_idmap_mount(const struct mount_kattr *kattr, struct mount *mnt) { struct mnt_idmap *old_idmap; if (!kattr->mnt_idmap) return; old_idmap = mnt_idmap(&mnt->mnt); /* Pairs with smp_load_acquire() in mnt_idmap(). */ smp_store_release(&mnt->mnt.mnt_idmap, mnt_idmap_get(kattr->mnt_idmap)); mnt_idmap_put(old_idmap); } static void mount_setattr_commit(struct mount_kattr *kattr, struct mount *mnt) { struct mount *m; for (m = mnt; m; m = next_mnt(m, mnt)) { unsigned int flags; do_idmap_mount(kattr, m); flags = recalc_flags(kattr, m); WRITE_ONCE(m->mnt.mnt_flags, flags); /* If we had to hold writers unblock them. */ mnt_unhold_writers(m); if (kattr->propagation) change_mnt_propagation(m, kattr->propagation); if (!(kattr->kflags & MOUNT_KATTR_RECURSE)) break; } touch_mnt_namespace(mnt->mnt_ns); } static int do_mount_setattr(const struct path *path, struct mount_kattr *kattr) { struct mount *mnt = real_mount(path->mnt); int err = 0; if (!path_mounted(path)) return -EINVAL; if (kattr->mnt_userns) { struct mnt_idmap *mnt_idmap; mnt_idmap = alloc_mnt_idmap(kattr->mnt_userns); if (IS_ERR(mnt_idmap)) return PTR_ERR(mnt_idmap); kattr->mnt_idmap = mnt_idmap; } if (kattr->propagation) { /* * Only take namespace_lock() if we're actually changing * propagation. */ namespace_lock(); if (kattr->propagation == MS_SHARED) { err = invent_group_ids(mnt, kattr->kflags & MOUNT_KATTR_RECURSE); if (err) { namespace_unlock(); return err; } } } err = -EINVAL; lock_mount_hash(); if (!anon_ns_root(mnt) && !check_mnt(mnt)) goto out; /* * First, we get the mount tree in a shape where we can change mount * properties without failure. If we succeeded to do so we commit all * changes and if we failed we clean up. */ err = mount_setattr_prepare(kattr, mnt); if (!err) mount_setattr_commit(kattr, mnt); out: unlock_mount_hash(); if (kattr->propagation) { if (err) cleanup_group_ids(mnt, NULL); namespace_unlock(); } return err; } static int build_mount_idmapped(const struct mount_attr *attr, size_t usize, struct mount_kattr *kattr) { struct ns_common *ns; struct user_namespace *mnt_userns; if (!((attr->attr_set | attr->attr_clr) & MOUNT_ATTR_IDMAP)) return 0; if (attr->attr_clr & MOUNT_ATTR_IDMAP) { /* * We can only remove an idmapping if it's never been * exposed to userspace. */ if (!(kattr->kflags & MOUNT_KATTR_IDMAP_REPLACE)) return -EINVAL; /* * Removal of idmappings is equivalent to setting * nop_mnt_idmap. */ if (!(attr->attr_set & MOUNT_ATTR_IDMAP)) { kattr->mnt_idmap = &nop_mnt_idmap; return 0; } } if (attr->userns_fd > INT_MAX) return -EINVAL; CLASS(fd, f)(attr->userns_fd); if (fd_empty(f)) return -EBADF; if (!proc_ns_file(fd_file(f))) return -EINVAL; ns = get_proc_ns(file_inode(fd_file(f))); if (ns->ns_type != CLONE_NEWUSER) return -EINVAL; /* * The initial idmapping cannot be used to create an idmapped * mount. We use the initial idmapping as an indicator of a mount * that is not idmapped. It can simply be passed into helpers that * are aware of idmapped mounts as a convenient shortcut. A user * can just create a dedicated identity mapping to achieve the same * result. */ mnt_userns = container_of(ns, struct user_namespace, ns); if (mnt_userns == &init_user_ns) return -EPERM; /* We're not controlling the target namespace. */ if (!ns_capable(mnt_userns, CAP_SYS_ADMIN)) return -EPERM; kattr->mnt_userns = get_user_ns(mnt_userns); return 0; } static int build_mount_kattr(const struct mount_attr *attr, size_t usize, struct mount_kattr *kattr) { if (attr->propagation & ~MOUNT_SETATTR_PROPAGATION_FLAGS) return -EINVAL; if (hweight32(attr->propagation & MOUNT_SETATTR_PROPAGATION_FLAGS) > 1) return -EINVAL; kattr->propagation = attr->propagation; if ((attr->attr_set | attr->attr_clr) & ~MOUNT_SETATTR_VALID_FLAGS) return -EINVAL; kattr->attr_set = attr_flags_to_mnt_flags(attr->attr_set); kattr->attr_clr = attr_flags_to_mnt_flags(attr->attr_clr); /* * Since the MOUNT_ATTR_<atime> values are an enum, not a bitmap, * users wanting to transition to a different atime setting cannot * simply specify the atime setting in @attr_set, but must also * specify MOUNT_ATTR__ATIME in the @attr_clr field. * So ensure that MOUNT_ATTR__ATIME can't be partially set in * @attr_clr and that @attr_set can't have any atime bits set if * MOUNT_ATTR__ATIME isn't set in @attr_clr. */ if (attr->attr_clr & MOUNT_ATTR__ATIME) { if ((attr->attr_clr & MOUNT_ATTR__ATIME) != MOUNT_ATTR__ATIME) return -EINVAL; /* * Clear all previous time settings as they are mutually * exclusive. */ kattr->attr_clr |= MNT_RELATIME | MNT_NOATIME; switch (attr->attr_set & MOUNT_ATTR__ATIME) { case MOUNT_ATTR_RELATIME: kattr->attr_set |= MNT_RELATIME; break; case MOUNT_ATTR_NOATIME: kattr->attr_set |= MNT_NOATIME; break; case MOUNT_ATTR_STRICTATIME: break; default: return -EINVAL; } } else { if (attr->attr_set & MOUNT_ATTR__ATIME) return -EINVAL; } return build_mount_idmapped(attr, usize, kattr); } static void finish_mount_kattr(struct mount_kattr *kattr) { if (kattr->mnt_userns) { put_user_ns(kattr->mnt_userns); kattr->mnt_userns = NULL; } if (kattr->mnt_idmap) mnt_idmap_put(kattr->mnt_idmap); } static int wants_mount_setattr(struct mount_attr __user *uattr, size_t usize, struct mount_kattr *kattr) { int ret; struct mount_attr attr; BUILD_BUG_ON(sizeof(struct mount_attr) != MOUNT_ATTR_SIZE_VER0); if (unlikely(usize > PAGE_SIZE)) return -E2BIG; if (unlikely(usize < MOUNT_ATTR_SIZE_VER0)) return -EINVAL; if (!may_mount()) return -EPERM; ret = copy_struct_from_user(&attr, sizeof(attr), uattr, usize); if (ret) return ret; /* Don't bother walking through the mounts if this is a nop. */ if (attr.attr_set == 0 && attr.attr_clr == 0 && attr.propagation == 0) return 0; /* Tell caller to not bother. */ ret = build_mount_kattr(&attr, usize, kattr); if (ret < 0) return ret; return 1; } SYSCALL_DEFINE5(mount_setattr, int, dfd, const char __user *, path, unsigned int, flags, struct mount_attr __user *, uattr, size_t, usize) { int err; struct path target; struct mount_kattr kattr; unsigned int lookup_flags = LOOKUP_AUTOMOUNT | LOOKUP_FOLLOW; if (flags & ~(AT_EMPTY_PATH | AT_RECURSIVE | AT_SYMLINK_NOFOLLOW | AT_NO_AUTOMOUNT)) return -EINVAL; if (flags & AT_NO_AUTOMOUNT) lookup_flags &= ~LOOKUP_AUTOMOUNT; if (flags & AT_SYMLINK_NOFOLLOW) lookup_flags &= ~LOOKUP_FOLLOW; kattr = (struct mount_kattr) { .lookup_flags = lookup_flags, }; if (flags & AT_RECURSIVE) kattr.kflags |= MOUNT_KATTR_RECURSE; err = wants_mount_setattr(uattr, usize, &kattr); if (err <= 0) return err; CLASS(filename_uflags, name)(path, flags); err = filename_lookup(dfd, name, kattr.lookup_flags, &target, NULL); if (!err) { err = do_mount_setattr(&target, &kattr); path_put(&target); } finish_mount_kattr(&kattr); return err; } SYSCALL_DEFINE5(open_tree_attr, int, dfd, const char __user *, filename, unsigned, flags, struct mount_attr __user *, uattr, size_t, usize) { if (!uattr && usize) return -EINVAL; FD_PREPARE(fdf, flags, vfs_open_tree(dfd, filename, flags)); if (fdf.err) return fdf.err; if (uattr) { struct mount_kattr kattr = {}; struct file *file = fd_prepare_file(fdf); int ret; if (flags & OPEN_TREE_CLONE) kattr.kflags = MOUNT_KATTR_IDMAP_REPLACE; if (flags & AT_RECURSIVE) kattr.kflags |= MOUNT_KATTR_RECURSE; ret = wants_mount_setattr(uattr, usize, &kattr); if (ret > 0) { ret = do_mount_setattr(&file->f_path, &kattr); finish_mount_kattr(&kattr); } if (ret) return ret; } return fd_publish(fdf); } int show_path(struct seq_file *m, struct dentry *root) { if (root->d_sb->s_op->show_path) return root->d_sb->s_op->show_path(m, root); seq_dentry(m, root, " \t\n\\"); return 0; } static struct vfsmount *lookup_mnt_in_ns(u64 id, struct mnt_namespace *ns) { struct mount *mnt = mnt_find_id_at(ns, id); if (!mnt || mnt->mnt_id_unique != id) return NULL; return &mnt->mnt; } struct kstatmount { struct statmount __user *buf; size_t bufsize; struct vfsmount *mnt; struct mnt_idmap *idmap; u64 mask; struct path root; struct seq_file seq; /* Must be last --ends in a flexible-array member. */ struct statmount sm; }; static u64 mnt_to_attr_flags(struct vfsmount *mnt) { unsigned int mnt_flags = READ_ONCE(mnt->mnt_flags); u64 attr_flags = 0; if (mnt_flags & MNT_READONLY) attr_flags |= MOUNT_ATTR_RDONLY; if (mnt_flags & MNT_NOSUID) attr_flags |= MOUNT_ATTR_NOSUID; if (mnt_flags & MNT_NODEV) attr_flags |= MOUNT_ATTR_NODEV; if (mnt_flags & MNT_NOEXEC) attr_flags |= MOUNT_ATTR_NOEXEC; if (mnt_flags & MNT_NODIRATIME) attr_flags |= MOUNT_ATTR_NODIRATIME; if (mnt_flags & MNT_NOSYMFOLLOW) attr_flags |= MOUNT_ATTR_NOSYMFOLLOW; if (mnt_flags & MNT_NOATIME) attr_flags |= MOUNT_ATTR_NOATIME; else if (mnt_flags & MNT_RELATIME) attr_flags |= MOUNT_ATTR_RELATIME; else attr_flags |= MOUNT_ATTR_STRICTATIME; if (is_idmapped_mnt(mnt)) attr_flags |= MOUNT_ATTR_IDMAP; return attr_flags; } static u64 mnt_to_propagation_flags(struct mount *m) { u64 propagation = 0; if (IS_MNT_SHARED(m)) propagation |= MS_SHARED; if (IS_MNT_SLAVE(m)) propagation |= MS_SLAVE; if (IS_MNT_UNBINDABLE(m)) propagation |= MS_UNBINDABLE; if (!propagation) propagation |= MS_PRIVATE; return propagation; } u64 vfsmount_to_propagation_flags(struct vfsmount *mnt) { return mnt_to_propagation_flags(real_mount(mnt)); } EXPORT_SYMBOL_GPL(vfsmount_to_propagation_flags); static void statmount_sb_basic(struct kstatmount *s) { struct super_block *sb = s->mnt->mnt_sb; s->sm.mask |= STATMOUNT_SB_BASIC; s->sm.sb_dev_major = MAJOR(sb->s_dev); s->sm.sb_dev_minor = MINOR(sb->s_dev); s->sm.sb_magic = sb->s_magic; s->sm.sb_flags = sb->s_flags & (SB_RDONLY|SB_SYNCHRONOUS|SB_DIRSYNC|SB_LAZYTIME); } static void statmount_mnt_basic(struct kstatmount *s) { struct mount *m = real_mount(s->mnt); s->sm.mask |= STATMOUNT_MNT_BASIC; s->sm.mnt_id = m->mnt_id_unique; s->sm.mnt_parent_id = m->mnt_parent->mnt_id_unique; s->sm.mnt_id_old = m->mnt_id; s->sm.mnt_parent_id_old = m->mnt_parent->mnt_id; s->sm.mnt_attr = mnt_to_attr_flags(&m->mnt); s->sm.mnt_propagation = mnt_to_propagation_flags(m); s->sm.mnt_peer_group = m->mnt_group_id; s->sm.mnt_master = IS_MNT_SLAVE(m) ? m->mnt_master->mnt_group_id : 0; } static void statmount_propagate_from(struct kstatmount *s) { struct mount *m = real_mount(s->mnt); s->sm.mask |= STATMOUNT_PROPAGATE_FROM; if (IS_MNT_SLAVE(m)) s->sm.propagate_from = get_dominating_id(m, &current->fs->root); } static int statmount_mnt_root(struct kstatmount *s, struct seq_file *seq) { int ret; size_t start = seq->count; ret = show_path(seq, s->mnt->mnt_root); if (ret) return ret; if (unlikely(seq_has_overflowed(seq))) return -EAGAIN; /* * Unescape the result. It would be better if supplied string was not * escaped in the first place, but that's a pretty invasive change. */ seq->buf[seq->count] = '\0'; seq->count = start; seq_commit(seq, string_unescape_inplace(seq->buf + start, UNESCAPE_OCTAL)); return 0; } static int statmount_mnt_point(struct kstatmount *s, struct seq_file *seq) { struct vfsmount *mnt = s->mnt; struct path mnt_path = { .dentry = mnt->mnt_root, .mnt = mnt }; int err; err = seq_path_root(seq, &mnt_path, &s->root, ""); return err == SEQ_SKIP ? 0 : err; } static int statmount_fs_type(struct kstatmount *s, struct seq_file *seq) { struct super_block *sb = s->mnt->mnt_sb; seq_puts(seq, sb->s_type->name); return 0; } static void statmount_fs_subtype(struct kstatmount *s, struct seq_file *seq) { struct super_block *sb = s->mnt->mnt_sb; if (sb->s_subtype) seq_puts(seq, sb->s_subtype); } static int statmount_sb_source(struct kstatmount *s, struct seq_file *seq) { struct super_block *sb = s->mnt->mnt_sb; struct mount *r = real_mount(s->mnt); if (sb->s_op->show_devname) { size_t start = seq->count; int ret; ret = sb->s_op->show_devname(seq, s->mnt->mnt_root); if (ret) return ret; if (unlikely(seq_has_overflowed(seq))) return -EAGAIN; /* Unescape the result */ seq->buf[seq->count] = '\0'; seq->count = start; seq_commit(seq, string_unescape_inplace(seq->buf + start, UNESCAPE_OCTAL)); } else { seq_puts(seq, r->mnt_devname); } return 0; } static void statmount_mnt_ns_id(struct kstatmount *s, struct mnt_namespace *ns) { s->sm.mask |= STATMOUNT_MNT_NS_ID; s->sm.mnt_ns_id = ns->ns.ns_id; } static int statmount_mnt_opts(struct kstatmount *s, struct seq_file *seq) { struct vfsmount *mnt = s->mnt; struct super_block *sb = mnt->mnt_sb; size_t start = seq->count; int err; err = security_sb_show_options(seq, sb); if (err) return err; if (sb->s_op->show_options) { err = sb->s_op->show_options(seq, mnt->mnt_root); if (err) return err; } if (unlikely(seq_has_overflowed(seq))) return -EAGAIN; if (seq->count == start) return 0; /* skip leading comma */ memmove(seq->buf + start, seq->buf + start + 1, seq->count - start - 1); seq->count--; return 0; } static inline int statmount_opt_process(struct seq_file *seq, size_t start) { char *buf_end, *opt_end, *src, *dst; int count = 0; if (unlikely(seq_has_overflowed(seq))) return -EAGAIN; buf_end = seq->buf + seq->count; dst = seq->buf + start; src = dst + 1; /* skip initial comma */ if (src >= buf_end) { seq->count = start; return 0; } *buf_end = '\0'; for (; src < buf_end; src = opt_end + 1) { opt_end = strchrnul(src, ','); *opt_end = '\0'; dst += string_unescape(src, dst, 0, UNESCAPE_OCTAL) + 1; if (WARN_ON_ONCE(++count == INT_MAX)) return -EOVERFLOW; } seq->count = dst - 1 - seq->buf; return count; } static int statmount_opt_array(struct kstatmount *s, struct seq_file *seq) { struct vfsmount *mnt = s->mnt; struct super_block *sb = mnt->mnt_sb; size_t start = seq->count; int err; if (!sb->s_op->show_options) return 0; err = sb->s_op->show_options(seq, mnt->mnt_root); if (err) return err; err = statmount_opt_process(seq, start); if (err < 0) return err; s->sm.opt_num = err; return 0; } static int statmount_opt_sec_array(struct kstatmount *s, struct seq_file *seq) { struct vfsmount *mnt = s->mnt; struct super_block *sb = mnt->mnt_sb; size_t start = seq->count; int err; err = security_sb_show_options(seq, sb); if (err) return err; err = statmount_opt_process(seq, start); if (err < 0) return err; s->sm.opt_sec_num = err; return 0; } static inline int statmount_mnt_uidmap(struct kstatmount *s, struct seq_file *seq) { int ret; ret = statmount_mnt_idmap(s->idmap, seq, true); if (ret < 0) return ret; s->sm.mnt_uidmap_num = ret; /* * Always raise STATMOUNT_MNT_UIDMAP even if there are no valid * mappings. This allows userspace to distinguish between a * non-idmapped mount and an idmapped mount where none of the * individual mappings are valid in the caller's idmapping. */ if (is_valid_mnt_idmap(s->idmap)) s->sm.mask |= STATMOUNT_MNT_UIDMAP; return 0; } static inline int statmount_mnt_gidmap(struct kstatmount *s, struct seq_file *seq) { int ret; ret = statmount_mnt_idmap(s->idmap, seq, false); if (ret < 0) return ret; s->sm.mnt_gidmap_num = ret; /* * Always raise STATMOUNT_MNT_GIDMAP even if there are no valid * mappings. This allows userspace to distinguish between a * non-idmapped mount and an idmapped mount where none of the * individual mappings are valid in the caller's idmapping. */ if (is_valid_mnt_idmap(s->idmap)) s->sm.mask |= STATMOUNT_MNT_GIDMAP; return 0; } static int statmount_string(struct kstatmount *s, u64 flag) { int ret = 0; size_t kbufsize; struct seq_file *seq = &s->seq; struct statmount *sm = &s->sm; u32 start, *offp; /* Reserve an empty string at the beginning for any unset offsets */ if (!seq->count) seq_putc(seq, 0); start = seq->count; switch (flag) { case STATMOUNT_FS_TYPE: offp = &sm->fs_type; ret = statmount_fs_type(s, seq); break; case STATMOUNT_MNT_ROOT: offp = &sm->mnt_root; ret = statmount_mnt_root(s, seq); break; case STATMOUNT_MNT_POINT: offp = &sm->mnt_point; ret = statmount_mnt_point(s, seq); break; case STATMOUNT_MNT_OPTS: offp = &sm->mnt_opts; ret = statmount_mnt_opts(s, seq); break; case STATMOUNT_OPT_ARRAY: offp = &sm->opt_array; ret = statmount_opt_array(s, seq); break; case STATMOUNT_OPT_SEC_ARRAY: offp = &sm->opt_sec_array; ret = statmount_opt_sec_array(s, seq); break; case STATMOUNT_FS_SUBTYPE: offp = &sm->fs_subtype; statmount_fs_subtype(s, seq); break; case STATMOUNT_SB_SOURCE: offp = &sm->sb_source; ret = statmount_sb_source(s, seq); break; case STATMOUNT_MNT_UIDMAP: offp = &sm->mnt_uidmap; ret = statmount_mnt_uidmap(s, seq); break; case STATMOUNT_MNT_GIDMAP: offp = &sm->mnt_gidmap; ret = statmount_mnt_gidmap(s, seq); break; default: WARN_ON_ONCE(true); return -EINVAL; } /* * If nothing was emitted, return to avoid setting the flag * and terminating the buffer. */ if (seq->count == start) return ret; if (unlikely(check_add_overflow(sizeof(*sm), seq->count, &kbufsize))) return -EOVERFLOW; if (kbufsize >= s->bufsize) return -EOVERFLOW; /* signal a retry */ if (unlikely(seq_has_overflowed(seq))) return -EAGAIN; if (ret) return ret; seq->buf[seq->count++] = '\0'; sm->mask |= flag; *offp = start; return 0; } static int copy_statmount_to_user(struct kstatmount *s) { struct statmount *sm = &s->sm; struct seq_file *seq = &s->seq; char __user *str = ((char __user *)s->buf) + sizeof(*sm); size_t copysize = min_t(size_t, s->bufsize, sizeof(*sm)); if (seq->count && copy_to_user(str, seq->buf, seq->count)) return -EFAULT; /* Return the number of bytes copied to the buffer */ sm->size = copysize + seq->count; if (copy_to_user(s->buf, sm, copysize)) return -EFAULT; return 0; } static struct mount *listmnt_next(struct mount *curr, bool reverse) { struct rb_node *node; if (reverse) node = rb_prev(&curr->mnt_node); else node = rb_next(&curr->mnt_node); return node_to_mount(node); } static int grab_requested_root(struct mnt_namespace *ns, struct path *root) { struct mount *first, *child; rwsem_assert_held(&namespace_sem); /* We're looking at our own ns, just use get_fs_root. */ if (ns == current->nsproxy->mnt_ns) { get_fs_root(current->fs, root); return 0; } /* * We have to find the first mount in our ns and use that, however it * may not exist, so handle that properly. */ if (mnt_ns_empty(ns)) return -ENOENT; first = child = ns->root; for (;;) { child = listmnt_next(child, false); if (!child) return -ENOENT; if (child->mnt_parent == first) break; } root->mnt = mntget(&child->mnt); root->dentry = dget(root->mnt->mnt_root); return 0; } /* This must be updated whenever a new flag is added */ #define STATMOUNT_SUPPORTED (STATMOUNT_SB_BASIC | \ STATMOUNT_MNT_BASIC | \ STATMOUNT_PROPAGATE_FROM | \ STATMOUNT_MNT_ROOT | \ STATMOUNT_MNT_POINT | \ STATMOUNT_FS_TYPE | \ STATMOUNT_MNT_NS_ID | \ STATMOUNT_MNT_OPTS | \ STATMOUNT_FS_SUBTYPE | \ STATMOUNT_SB_SOURCE | \ STATMOUNT_OPT_ARRAY | \ STATMOUNT_OPT_SEC_ARRAY | \ STATMOUNT_SUPPORTED_MASK | \ STATMOUNT_MNT_UIDMAP | \ STATMOUNT_MNT_GIDMAP) /* locks: namespace_shared */ static int do_statmount(struct kstatmount *s, u64 mnt_id, u64 mnt_ns_id, struct file *mnt_file, struct mnt_namespace *ns) { int err; if (mnt_file) { WARN_ON_ONCE(ns != NULL); s->mnt = mnt_file->f_path.mnt; ns = real_mount(s->mnt)->mnt_ns; if (IS_ERR(ns)) return PTR_ERR(ns); if (!ns) /* * We can't set mount point and mnt_ns_id since we don't have a * ns for the mount. This can happen if the mount is unmounted * with MNT_DETACH. */ s->mask &= ~(STATMOUNT_MNT_POINT | STATMOUNT_MNT_NS_ID); } else { /* Has the namespace already been emptied? */ if (mnt_ns_id && mnt_ns_empty(ns)) return -ENOENT; s->mnt = lookup_mnt_in_ns(mnt_id, ns); if (!s->mnt) return -ENOENT; } if (ns) { err = grab_requested_root(ns, &s->root); if (err) return err; if (!mnt_file) { struct mount *m; /* * Don't trigger audit denials. We just want to determine what * mounts to show users. */ m = real_mount(s->mnt); if (!is_path_reachable(m, m->mnt.mnt_root, &s->root) && !ns_capable_noaudit(ns->user_ns, CAP_SYS_ADMIN)) return -EPERM; } } err = security_sb_statfs(s->mnt->mnt_root); if (err) return err; /* * Note that mount properties in mnt->mnt_flags, mnt->mnt_idmap * can change concurrently as we only hold the read-side of the * namespace semaphore and mount properties may change with only * the mount lock held. * * We could sample the mount lock sequence counter to detect * those changes and retry. But it's not worth it. Worst that * happens is that the mnt->mnt_idmap pointer is already changed * while mnt->mnt_flags isn't or vica versa. So what. * * Both mnt->mnt_flags and mnt->mnt_idmap are set and retrieved * via READ_ONCE()/WRITE_ONCE() and guard against theoretical * torn read/write. That's all we care about right now. */ s->idmap = mnt_idmap(s->mnt); if (s->mask & STATMOUNT_MNT_BASIC) statmount_mnt_basic(s); if (s->mask & STATMOUNT_SB_BASIC) statmount_sb_basic(s); if (s->mask & STATMOUNT_PROPAGATE_FROM) statmount_propagate_from(s); if (s->mask & STATMOUNT_FS_TYPE) err = statmount_string(s, STATMOUNT_FS_TYPE); if (!err && s->mask & STATMOUNT_MNT_ROOT) err = statmount_string(s, STATMOUNT_MNT_ROOT); if (!err && s->mask & STATMOUNT_MNT_POINT) err = statmount_string(s, STATMOUNT_MNT_POINT); if (!err && s->mask & STATMOUNT_MNT_OPTS) err = statmount_string(s, STATMOUNT_MNT_OPTS); if (!err && s->mask & STATMOUNT_OPT_ARRAY) err = statmount_string(s, STATMOUNT_OPT_ARRAY); if (!err && s->mask & STATMOUNT_OPT_SEC_ARRAY) err = statmount_string(s, STATMOUNT_OPT_SEC_ARRAY); if (!err && s->mask & STATMOUNT_FS_SUBTYPE) err = statmount_string(s, STATMOUNT_FS_SUBTYPE); if (!err && s->mask & STATMOUNT_SB_SOURCE) err = statmount_string(s, STATMOUNT_SB_SOURCE); if (!err && s->mask & STATMOUNT_MNT_UIDMAP) err = statmount_string(s, STATMOUNT_MNT_UIDMAP); if (!err && s->mask & STATMOUNT_MNT_GIDMAP) err = statmount_string(s, STATMOUNT_MNT_GIDMAP); if (!err && s->mask & STATMOUNT_MNT_NS_ID) statmount_mnt_ns_id(s, ns); if (!err && s->mask & STATMOUNT_SUPPORTED_MASK) { s->sm.mask |= STATMOUNT_SUPPORTED_MASK; s->sm.supported_mask = STATMOUNT_SUPPORTED; } if (err) return err; /* Are there bits in the return mask not present in STATMOUNT_SUPPORTED? */ WARN_ON_ONCE(~STATMOUNT_SUPPORTED & s->sm.mask); return 0; } static inline bool retry_statmount(const long ret, size_t *seq_size) { if (likely(ret != -EAGAIN)) return false; if (unlikely(check_mul_overflow(*seq_size, 2, seq_size))) return false; if (unlikely(*seq_size > MAX_RW_COUNT)) return false; return true; } #define STATMOUNT_STRING_REQ (STATMOUNT_MNT_ROOT | STATMOUNT_MNT_POINT | \ STATMOUNT_FS_TYPE | STATMOUNT_MNT_OPTS | \ STATMOUNT_FS_SUBTYPE | STATMOUNT_SB_SOURCE | \ STATMOUNT_OPT_ARRAY | STATMOUNT_OPT_SEC_ARRAY | \ STATMOUNT_MNT_UIDMAP | STATMOUNT_MNT_GIDMAP) static int prepare_kstatmount(struct kstatmount *ks, struct mnt_id_req *kreq, struct statmount __user *buf, size_t bufsize, size_t seq_size) { if (!access_ok(buf, bufsize)) return -EFAULT; memset(ks, 0, sizeof(*ks)); ks->mask = kreq->param; ks->buf = buf; ks->bufsize = bufsize; if (ks->mask & STATMOUNT_STRING_REQ) { if (bufsize == sizeof(ks->sm)) return -EOVERFLOW; ks->seq.buf = kvmalloc(seq_size, GFP_KERNEL_ACCOUNT); if (!ks->seq.buf) return -ENOMEM; ks->seq.size = seq_size; } return 0; } static int copy_mnt_id_req(const struct mnt_id_req __user *req, struct mnt_id_req *kreq, unsigned int flags) { int ret; size_t usize; BUILD_BUG_ON(sizeof(struct mnt_id_req) != MNT_ID_REQ_SIZE_VER1); ret = get_user(usize, &req->size); if (ret) return -EFAULT; if (unlikely(usize > PAGE_SIZE)) return -E2BIG; if (unlikely(usize < MNT_ID_REQ_SIZE_VER0)) return -EINVAL; memset(kreq, 0, sizeof(*kreq)); ret = copy_struct_from_user(kreq, sizeof(*kreq), req, usize); if (ret) return ret; if (flags & STATMOUNT_BY_FD) { if (kreq->mnt_id || kreq->mnt_ns_id) return -EINVAL; } else { if (kreq->mnt_ns_fd != 0 && kreq->mnt_ns_id) return -EINVAL; /* The first valid unique mount id is MNT_UNIQUE_ID_OFFSET + 1. */ if (kreq->mnt_id <= MNT_UNIQUE_ID_OFFSET) return -EINVAL; } return 0; } /* * If the user requested a specific mount namespace id, look that up and return * that, or if not simply grab a passive reference on our mount namespace and * return that. */ static struct mnt_namespace *grab_requested_mnt_ns(const struct mnt_id_req *kreq) { struct mnt_namespace *mnt_ns; if (kreq->mnt_ns_id) { mnt_ns = lookup_mnt_ns(kreq->mnt_ns_id); if (!mnt_ns) return ERR_PTR(-ENOENT); } else if (kreq->mnt_ns_fd) { struct ns_common *ns; CLASS(fd, f)(kreq->mnt_ns_fd); if (fd_empty(f)) return ERR_PTR(-EBADF); if (!proc_ns_file(fd_file(f))) return ERR_PTR(-EINVAL); ns = get_proc_ns(file_inode(fd_file(f))); if (ns->ns_type != CLONE_NEWNS) return ERR_PTR(-EINVAL); mnt_ns = to_mnt_ns(ns); refcount_inc(&mnt_ns->passive); } else { mnt_ns = current->nsproxy->mnt_ns; refcount_inc(&mnt_ns->passive); } return mnt_ns; } SYSCALL_DEFINE4(statmount, const struct mnt_id_req __user *, req, struct statmount __user *, buf, size_t, bufsize, unsigned int, flags) { struct mnt_namespace *ns __free(mnt_ns_release) = NULL; struct kstatmount *ks __free(kfree) = NULL; struct file *mnt_file __free(fput) = NULL; struct mnt_id_req kreq; /* We currently support retrieval of 3 strings. */ size_t seq_size = 3 * PATH_MAX; int ret; if (flags & ~STATMOUNT_BY_FD) return -EINVAL; ret = copy_mnt_id_req(req, &kreq, flags); if (ret) return ret; if (flags & STATMOUNT_BY_FD) { mnt_file = fget_raw(kreq.mnt_fd); if (!mnt_file) return -EBADF; /* do_statmount sets ns in case of STATMOUNT_BY_FD */ } else { ns = grab_requested_mnt_ns(&kreq); if (IS_ERR(ns)) return PTR_ERR(ns); if (kreq.mnt_ns_id && (ns != current->nsproxy->mnt_ns) && !ns_capable_noaudit(ns->user_ns, CAP_SYS_ADMIN)) return -EPERM; } ks = kmalloc(sizeof(*ks), GFP_KERNEL_ACCOUNT); if (!ks) return -ENOMEM; retry: ret = prepare_kstatmount(ks, &kreq, buf, bufsize, seq_size); if (ret) return ret; scoped_guard(namespace_shared) ret = do_statmount(ks, kreq.mnt_id, kreq.mnt_ns_id, mnt_file, ns); if (!ret) ret = copy_statmount_to_user(ks); kvfree(ks->seq.buf); path_put(&ks->root); if (retry_statmount(ret, &seq_size)) goto retry; return ret; } struct klistmount { u64 last_mnt_id; u64 mnt_parent_id; u64 *kmnt_ids; u32 nr_mnt_ids; struct mnt_namespace *ns; struct path root; }; /* locks: namespace_shared */ static ssize_t do_listmount(struct klistmount *kls, bool reverse) { struct mnt_namespace *ns = kls->ns; u64 mnt_parent_id = kls->mnt_parent_id; u64 last_mnt_id = kls->last_mnt_id; u64 *mnt_ids = kls->kmnt_ids; size_t nr_mnt_ids = kls->nr_mnt_ids; struct path orig; struct mount *r, *first; ssize_t ret; rwsem_assert_held(&namespace_sem); ret = grab_requested_root(ns, &kls->root); if (ret) return ret; if (mnt_parent_id == LSMT_ROOT) { orig = kls->root; } else { orig.mnt = lookup_mnt_in_ns(mnt_parent_id, ns); if (!orig.mnt) return -ENOENT; orig.dentry = orig.mnt->mnt_root; } /* * Don't trigger audit denials. We just want to determine what * mounts to show users. */ if (!is_path_reachable(real_mount(orig.mnt), orig.dentry, &kls->root) && !ns_capable_noaudit(ns->user_ns, CAP_SYS_ADMIN)) return -EPERM; ret = security_sb_statfs(orig.dentry); if (ret) return ret; if (!last_mnt_id) { if (reverse) first = node_to_mount(ns->mnt_last_node); else first = node_to_mount(ns->mnt_first_node); } else { if (reverse) first = mnt_find_id_at_reverse(ns, last_mnt_id - 1); else first = mnt_find_id_at(ns, last_mnt_id + 1); } for (ret = 0, r = first; r && nr_mnt_ids; r = listmnt_next(r, reverse)) { if (r->mnt_id_unique == mnt_parent_id) continue; if (!is_path_reachable(r, r->mnt.mnt_root, &orig)) continue; *mnt_ids = r->mnt_id_unique; mnt_ids++; nr_mnt_ids--; ret++; } return ret; } static void __free_klistmount_free(const struct klistmount *kls) { path_put(&kls->root); kvfree(kls->kmnt_ids); mnt_ns_release(kls->ns); } static inline int prepare_klistmount(struct klistmount *kls, struct mnt_id_req *kreq, size_t nr_mnt_ids) { u64 last_mnt_id = kreq->param; struct mnt_namespace *ns; /* The first valid unique mount id is MNT_UNIQUE_ID_OFFSET + 1. */ if (last_mnt_id != 0 && last_mnt_id <= MNT_UNIQUE_ID_OFFSET) return -EINVAL; kls->last_mnt_id = last_mnt_id; kls->nr_mnt_ids = nr_mnt_ids; kls->kmnt_ids = kvmalloc_array(nr_mnt_ids, sizeof(*kls->kmnt_ids), GFP_KERNEL_ACCOUNT); if (!kls->kmnt_ids) return -ENOMEM; ns = grab_requested_mnt_ns(kreq); if (IS_ERR(ns)) return PTR_ERR(ns); kls->ns = ns; kls->mnt_parent_id = kreq->mnt_id; return 0; } SYSCALL_DEFINE4(listmount, const struct mnt_id_req __user *, req, u64 __user *, mnt_ids, size_t, nr_mnt_ids, unsigned int, flags) { struct klistmount kls __free(klistmount_free) = {}; const size_t maxcount = 1000000; struct mnt_id_req kreq; ssize_t ret; if (flags & ~LISTMOUNT_REVERSE) return -EINVAL; /* * If the mount namespace really has more than 1 million mounts the * caller must iterate over the mount namespace (and reconsider their * system design...). */ if (unlikely(nr_mnt_ids > maxcount)) return -EOVERFLOW; if (!access_ok(mnt_ids, nr_mnt_ids * sizeof(*mnt_ids))) return -EFAULT; ret = copy_mnt_id_req(req, &kreq, 0); if (ret) return ret; ret = prepare_klistmount(&kls, &kreq, nr_mnt_ids); if (ret) return ret; if (kreq.mnt_ns_id && (kls.ns != current->nsproxy->mnt_ns) && !ns_capable_noaudit(kls.ns->user_ns, CAP_SYS_ADMIN)) return -ENOENT; /* * We only need to guard against mount topology changes as * listmount() doesn't care about any mount properties. */ scoped_guard(namespace_shared) ret = do_listmount(&kls, (flags & LISTMOUNT_REVERSE)); if (ret <= 0) return ret; if (copy_to_user(mnt_ids, kls.kmnt_ids, ret * sizeof(*mnt_ids))) return -EFAULT; return ret; } struct mnt_namespace init_mnt_ns = { .ns = NS_COMMON_INIT(init_mnt_ns), .user_ns = &init_user_ns, .passive = REFCOUNT_INIT(1), .mounts = RB_ROOT, .poll = __WAIT_QUEUE_HEAD_INITIALIZER(init_mnt_ns.poll), }; static void __init init_mount_tree(void) { struct vfsmount *mnt, *nullfs_mnt; struct mount *mnt_root; struct path root; /* * We create two mounts: * * (1) nullfs with mount id 1 * (2) mutable rootfs with mount id 2 * * with (2) mounted on top of (1). */ nullfs_mnt = vfs_kern_mount(&nullfs_fs_type, 0, "nullfs", NULL); if (IS_ERR(nullfs_mnt)) panic("VFS: Failed to create nullfs"); mnt = vfs_kern_mount(&rootfs_fs_type, 0, "rootfs", initramfs_options); if (IS_ERR(mnt)) panic("Can't create rootfs"); VFS_WARN_ON_ONCE(real_mount(nullfs_mnt)->mnt_id != 1); VFS_WARN_ON_ONCE(real_mount(mnt)->mnt_id != 2); /* The namespace root is the nullfs mnt. */ mnt_root = real_mount(nullfs_mnt); init_mnt_ns.root = mnt_root; /* Mount mutable rootfs on top of nullfs. */ root.mnt = nullfs_mnt; root.dentry = nullfs_mnt->mnt_root; LOCK_MOUNT_EXACT(mp, &root); if (unlikely(IS_ERR(mp.parent))) panic("VFS: Failed to mount rootfs on nullfs"); scoped_guard(mount_writer) attach_mnt(real_mount(mnt), mp.parent, mp.mp); pr_info("VFS: Finished mounting rootfs on nullfs\n"); /* * We've dropped all locks here but that's fine. Not just are we * the only task that's running, there's no other mount * namespace in existence and the initial mount namespace is * completely empty until we add the mounts we just created. */ for (struct mount *p = mnt_root; p; p = next_mnt(p, mnt_root)) { mnt_add_to_ns(&init_mnt_ns, p); init_mnt_ns.nr_mounts++; } init_task.nsproxy->mnt_ns = &init_mnt_ns; get_mnt_ns(&init_mnt_ns); /* The root and pwd always point to the mutable rootfs. */ root.mnt = mnt; root.dentry = mnt->mnt_root; set_fs_pwd(current->fs, &root); set_fs_root(current->fs, &root); ns_tree_add(&init_mnt_ns); } void __init mnt_init(void) { int err; mnt_cache = kmem_cache_create("mnt_cache", sizeof(struct mount), 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, NULL); mount_hashtable = alloc_large_system_hash("Mount-cache", sizeof(struct hlist_head), mhash_entries, 19, HASH_ZERO, &m_hash_shift, &m_hash_mask, 0, 0); mountpoint_hashtable = alloc_large_system_hash("Mountpoint-cache", sizeof(struct hlist_head), mphash_entries, 19, HASH_ZERO, &mp_hash_shift, &mp_hash_mask, 0, 0); if (!mount_hashtable || !mountpoint_hashtable) panic("Failed to allocate mount hash table\n"); kernfs_init(); err = sysfs_init(); if (err) printk(KERN_WARNING "%s: sysfs_init error: %d\n", __func__, err); fs_kobj = kobject_create_and_add("fs", NULL); if (!fs_kobj) printk(KERN_WARNING "%s: kobj create error\n", __func__); shmem_init(); init_rootfs(); init_mount_tree(); } void put_mnt_ns(struct mnt_namespace *ns) { if (!ns_ref_put(ns)) return; guard(namespace_excl)(); emptied_ns = ns; guard(mount_writer)(); umount_tree(ns->root, 0); } struct vfsmount *kern_mount(struct file_system_type *type) { struct vfsmount *mnt; mnt = vfs_kern_mount(type, SB_KERNMOUNT, type->name, NULL); if (!IS_ERR(mnt)) { /* * it is a longterm mount, don't release mnt until * we unmount before file sys is unregistered */ real_mount(mnt)->mnt_ns = MNT_NS_INTERNAL; } return mnt; } EXPORT_SYMBOL_GPL(kern_mount); void kern_unmount(struct vfsmount *mnt) { /* release long term mount so mount point can be released */ if (!IS_ERR(mnt)) { mnt_make_shortterm(mnt); synchronize_rcu(); /* yecchhh... */ mntput(mnt); } } EXPORT_SYMBOL(kern_unmount); void kern_unmount_array(struct vfsmount *mnt[], unsigned int num) { unsigned int i; for (i = 0; i < num; i++) mnt_make_shortterm(mnt[i]); synchronize_rcu_expedited(); for (i = 0; i < num; i++) mntput(mnt[i]); } EXPORT_SYMBOL(kern_unmount_array); bool our_mnt(struct vfsmount *mnt) { return check_mnt(real_mount(mnt)); } bool current_chrooted(void) { /* Does the current process have a non-standard root */ struct path fs_root __free(path_put) = {}; struct mount *root; get_fs_root(current->fs, &fs_root); /* Find the namespace root */ guard(mount_locked_reader)(); root = topmost_overmount(current->nsproxy->mnt_ns->root); return fs_root.mnt != &root->mnt || !path_mounted(&fs_root); } static bool mnt_already_visible(struct mnt_namespace *ns, const struct super_block *sb, int *new_mnt_flags) { int new_flags = *new_mnt_flags; struct mount *mnt, *n; guard(namespace_shared)(); rbtree_postorder_for_each_entry_safe(mnt, n, &ns->mounts, mnt_node) { struct mount *child; int mnt_flags; if (mnt->mnt.mnt_sb->s_type != sb->s_type) continue; /* This mount is not fully visible if it's root directory * is not the root directory of the filesystem. */ if (mnt->mnt.mnt_root != mnt->mnt.mnt_sb->s_root) continue; /* A local view of the mount flags */ mnt_flags = mnt->mnt.mnt_flags; /* Don't miss readonly hidden in the superblock flags */ if (sb_rdonly(mnt->mnt.mnt_sb)) mnt_flags |= MNT_LOCK_READONLY; /* Verify the mount flags are equal to or more permissive * than the proposed new mount. */ if ((mnt_flags & MNT_LOCK_READONLY) && !(new_flags & MNT_READONLY)) continue; if ((mnt_flags & MNT_LOCK_ATIME) && ((mnt_flags & MNT_ATIME_MASK) != (new_flags & MNT_ATIME_MASK))) continue; /* This mount is not fully visible if there are any * locked child mounts that cover anything except for * empty directories. */ list_for_each_entry(child, &mnt->mnt_mounts, mnt_child) { struct inode *inode = child->mnt_mountpoint->d_inode; /* Only worry about locked mounts */ if (!(child->mnt.mnt_flags & MNT_LOCKED)) continue; /* Is the directory permanently empty? */ if (!is_empty_dir_inode(inode)) goto next; } /* Preserve the locked attributes */ *new_mnt_flags |= mnt_flags & (MNT_LOCK_READONLY | \ MNT_LOCK_ATIME); return true; next: ; } return false; } static bool mount_too_revealing(const struct super_block *sb, int *new_mnt_flags) { const unsigned long required_iflags = SB_I_NOEXEC | SB_I_NODEV; struct mnt_namespace *ns = current->nsproxy->mnt_ns; unsigned long s_iflags; if (ns->user_ns == &init_user_ns) return false; /* Can this filesystem be too revealing? */ s_iflags = sb->s_iflags; if (!(s_iflags & SB_I_USERNS_VISIBLE)) return false; if ((s_iflags & required_iflags) != required_iflags) { WARN_ONCE(1, "Expected s_iflags to contain 0x%lx\n", required_iflags); return true; } return !mnt_already_visible(ns, sb, new_mnt_flags); } bool mnt_may_suid(struct vfsmount *mnt) { /* * Foreign mounts (accessed via fchdir or through /proc * symlinks) are always treated as if they are nosuid. This * prevents namespaces from trusting potentially unsafe * suid/sgid bits, file caps, or security labels that originate * in other namespaces. */ return !(mnt->mnt_flags & MNT_NOSUID) && check_mnt(real_mount(mnt)) && current_in_userns(mnt->mnt_sb->s_user_ns); } static struct ns_common *mntns_get(struct task_struct *task) { struct ns_common *ns = NULL; struct nsproxy *nsproxy; task_lock(task); nsproxy = task->nsproxy; if (nsproxy) { ns = &nsproxy->mnt_ns->ns; get_mnt_ns(to_mnt_ns(ns)); } task_unlock(task); return ns; } static void mntns_put(struct ns_common *ns) { put_mnt_ns(to_mnt_ns(ns)); } static int mntns_install(struct nsset *nsset, struct ns_common *ns) { struct nsproxy *nsproxy = nsset->nsproxy; struct fs_struct *fs = nsset->fs; struct mnt_namespace *mnt_ns = to_mnt_ns(ns), *old_mnt_ns; struct user_namespace *user_ns = nsset->cred->user_ns; struct path root; int err; if (!ns_capable(mnt_ns->user_ns, CAP_SYS_ADMIN) || !ns_capable(user_ns, CAP_SYS_CHROOT) || !ns_capable(user_ns, CAP_SYS_ADMIN)) return -EPERM; if (is_anon_ns(mnt_ns)) return -EINVAL; if (fs->users != 1) return -EINVAL; get_mnt_ns(mnt_ns); old_mnt_ns = nsproxy->mnt_ns; nsproxy->mnt_ns = mnt_ns; /* Find the root */ err = vfs_path_lookup(mnt_ns->root->mnt.mnt_root, &mnt_ns->root->mnt, "/", LOOKUP_DOWN, &root); if (err) { /* revert to old namespace */ nsproxy->mnt_ns = old_mnt_ns; put_mnt_ns(mnt_ns); return err; } put_mnt_ns(old_mnt_ns); /* Update the pwd and root */ set_fs_pwd(fs, &root); set_fs_root(fs, &root); path_put(&root); return 0; } static struct user_namespace *mntns_owner(struct ns_common *ns) { return to_mnt_ns(ns)->user_ns; } const struct proc_ns_operations mntns_operations = { .name = "mnt", .get = mntns_get, .put = mntns_put, .install = mntns_install, .owner = mntns_owner, }; #ifdef CONFIG_SYSCTL static const struct ctl_table fs_namespace_sysctls[] = { { .procname = "mount-max", .data = &sysctl_mount_max, .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ONE, }, }; static int __init init_fs_namespace_sysctls(void) { register_sysctl_init("fs", fs_namespace_sysctls); return 0; } fs_initcall(init_fs_namespace_sysctls); #endif /* CONFIG_SYSCTL */
5614 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 // SPDX-License-Identifier: GPL-2.0-only /* * Dynamic DMA mapping support. * * This implementation is a fallback for platforms that do not support * I/O TLBs (aka DMA address translation hardware). * Copyright (C) 2000 Asit Mallick <Asit.K.Mallick@intel.com> * Copyright (C) 2000 Goutham Rao <goutham.rao@intel.com> * Copyright (C) 2000, 2003 Hewlett-Packard Co * David Mosberger-Tang <davidm@hpl.hp.com> * * 03/05/07 davidm Switch from PCI-DMA to generic device DMA API. * 00/12/13 davidm Rename to swiotlb.c and add mark_clean() to avoid * unnecessary i-cache flushing. * 04/07/.. ak Better overflow handling. Assorted fixes. * 05/09/10 linville Add support for syncing ranges, support syncing for * DMA_BIDIRECTIONAL mappings, miscellaneous cleanup. * 08/12/11 beckyb Add highmem support */ #define pr_fmt(fmt) "software IO TLB: " fmt #include <linux/cache.h> #include <linux/cc_platform.h> #include <linux/ctype.h> #include <linux/debugfs.h> #include <linux/dma-direct.h> #include <linux/dma-map-ops.h> #include <linux/export.h> #include <linux/gfp.h> #include <linux/highmem.h> #include <linux/io.h> #include <linux/iommu-helper.h> #include <linux/init.h> #include <linux/memblock.h> #include <linux/mm.h> #include <linux/pfn.h> #include <linux/rculist.h> #include <linux/scatterlist.h> #include <linux/set_memory.h> #include <linux/spinlock.h> #include <linux/string.h> #include <linux/swiotlb.h> #include <linux/types.h> #ifdef CONFIG_DMA_RESTRICTED_POOL #include <linux/of.h> #include <linux/of_fdt.h> #include <linux/of_reserved_mem.h> #include <linux/slab.h> #endif #define CREATE_TRACE_POINTS #include <trace/events/swiotlb.h> #define SLABS_PER_PAGE (1 << (PAGE_SHIFT - IO_TLB_SHIFT)) /* * Minimum IO TLB size to bother booting with. Systems with mainly * 64bit capable cards will only lightly use the swiotlb. If we can't * allocate a contiguous 1MB, we're probably in trouble anyway. */ #define IO_TLB_MIN_SLABS ((1<<20) >> IO_TLB_SHIFT) /** * struct io_tlb_slot - IO TLB slot descriptor * @orig_addr: The original address corresponding to a mapped entry. * @alloc_size: Size of the allocated buffer. * @list: The free list describing the number of free entries available * from each index. * @pad_slots: Number of preceding padding slots. Valid only in the first * allocated non-padding slot. */ struct io_tlb_slot { phys_addr_t orig_addr; size_t alloc_size; unsigned short list; unsigned short pad_slots; }; static bool swiotlb_force_bounce; static bool swiotlb_force_disable; #ifdef CONFIG_SWIOTLB_DYNAMIC static void swiotlb_dyn_alloc(struct work_struct *work); static struct io_tlb_mem io_tlb_default_mem = { .lock = __SPIN_LOCK_UNLOCKED(io_tlb_default_mem.lock), .pools = LIST_HEAD_INIT(io_tlb_default_mem.pools), .dyn_alloc = __WORK_INITIALIZER(io_tlb_default_mem.dyn_alloc, swiotlb_dyn_alloc), }; #else /* !CONFIG_SWIOTLB_DYNAMIC */ static struct io_tlb_mem io_tlb_default_mem; #endif /* CONFIG_SWIOTLB_DYNAMIC */ static unsigned long default_nslabs = IO_TLB_DEFAULT_SIZE >> IO_TLB_SHIFT; static unsigned long default_nareas; /** * struct io_tlb_area - IO TLB memory area descriptor * * This is a single area with a single lock. * * @used: The number of used IO TLB block. * @index: The slot index to start searching in this area for next round. * @lock: The lock to protect the above data structures in the map and * unmap calls. */ struct io_tlb_area { unsigned long used; unsigned int index; spinlock_t lock; }; /* * Round up number of slabs to the next power of 2. The last area is going * be smaller than the rest if default_nslabs is not power of two. * The number of slot in an area should be a multiple of IO_TLB_SEGSIZE, * otherwise a segment may span two or more areas. It conflicts with free * contiguous slots tracking: free slots are treated contiguous no matter * whether they cross an area boundary. * * Return true if default_nslabs is rounded up. */ static bool round_up_default_nslabs(void) { if (!default_nareas) return false; if (default_nslabs < IO_TLB_SEGSIZE * default_nareas) default_nslabs = IO_TLB_SEGSIZE * default_nareas; else if (is_power_of_2(default_nslabs)) return false; default_nslabs = roundup_pow_of_two(default_nslabs); return true; } /** * swiotlb_adjust_nareas() - adjust the number of areas and slots * @nareas: Desired number of areas. Zero is treated as 1. * * Adjust the default number of areas in a memory pool. * The default size of the memory pool may also change to meet minimum area * size requirements. */ static void swiotlb_adjust_nareas(unsigned int nareas) { if (!nareas) nareas = 1; else if (!is_power_of_2(nareas)) nareas = roundup_pow_of_two(nareas); default_nareas = nareas; pr_info("area num %d.\n", nareas); if (round_up_default_nslabs()) pr_info("SWIOTLB bounce buffer size roundup to %luMB", (default_nslabs << IO_TLB_SHIFT) >> 20); } /** * limit_nareas() - get the maximum number of areas for a given memory pool size * @nareas: Desired number of areas. * @nslots: Total number of slots in the memory pool. * * Limit the number of areas to the maximum possible number of areas in * a memory pool of the given size. * * Return: Maximum possible number of areas. */ static unsigned int limit_nareas(unsigned int nareas, unsigned long nslots) { if (nslots < nareas * IO_TLB_SEGSIZE) return nslots / IO_TLB_SEGSIZE; return nareas; } static int __init setup_io_tlb_npages(char *str) { if (isdigit(*str)) { /* avoid tail segment of size < IO_TLB_SEGSIZE */ default_nslabs = ALIGN(simple_strtoul(str, &str, 0), IO_TLB_SEGSIZE); } if (*str == ',') ++str; if (isdigit(*str)) swiotlb_adjust_nareas(simple_strtoul(str, &str, 0)); if (*str == ',') ++str; if (!strcmp(str, "force")) swiotlb_force_bounce = true; else if (!strcmp(str, "noforce")) swiotlb_force_disable = true; return 0; } early_param("swiotlb", setup_io_tlb_npages); unsigned long swiotlb_size_or_default(void) { return default_nslabs << IO_TLB_SHIFT; } void __init swiotlb_adjust_size(unsigned long size) { /* * If swiotlb parameter has not been specified, give a chance to * architectures such as those supporting memory encryption to * adjust/expand SWIOTLB size for their use. */ if (default_nslabs != IO_TLB_DEFAULT_SIZE >> IO_TLB_SHIFT) return; size = ALIGN(size, IO_TLB_SIZE); default_nslabs = ALIGN(size >> IO_TLB_SHIFT, IO_TLB_SEGSIZE); if (round_up_default_nslabs()) size = default_nslabs << IO_TLB_SHIFT; pr_info("SWIOTLB bounce buffer size adjusted to %luMB", size >> 20); } void swiotlb_print_info(void) { struct io_tlb_pool *mem = &io_tlb_default_mem.defpool; if (!mem->nslabs) { pr_warn("No low mem\n"); return; } pr_info("mapped [mem %pa-%pa] (%luMB)\n", &mem->start, &mem->end, (mem->nslabs << IO_TLB_SHIFT) >> 20); } static inline unsigned long io_tlb_offset(unsigned long val) { return val & (IO_TLB_SEGSIZE - 1); } static inline unsigned long nr_slots(u64 val) { return DIV_ROUND_UP(val, IO_TLB_SIZE); } /* * Early SWIOTLB allocation may be too early to allow an architecture to * perform the desired operations. This function allows the architecture to * call SWIOTLB when the operations are possible. It needs to be called * before the SWIOTLB memory is used. */ void __init swiotlb_update_mem_attributes(void) { struct io_tlb_pool *mem = &io_tlb_default_mem.defpool; unsigned long bytes; if (!mem->nslabs || mem->late_alloc) return; bytes = PAGE_ALIGN(mem->nslabs << IO_TLB_SHIFT); set_memory_decrypted((unsigned long)mem->vaddr, bytes >> PAGE_SHIFT); } static void swiotlb_init_io_tlb_pool(struct io_tlb_pool *mem, phys_addr_t start, unsigned long nslabs, bool late_alloc, unsigned int nareas) { void *vaddr = phys_to_virt(start); unsigned long bytes = nslabs << IO_TLB_SHIFT, i; mem->nslabs = nslabs; mem->start = start; mem->end = mem->start + bytes; mem->late_alloc = late_alloc; mem->nareas = nareas; mem->area_nslabs = nslabs / mem->nareas; for (i = 0; i < mem->nareas; i++) { spin_lock_init(&mem->areas[i].lock); mem->areas[i].index = 0; mem->areas[i].used = 0; } for (i = 0; i < mem->nslabs; i++) { mem->slots[i].list = min(IO_TLB_SEGSIZE - io_tlb_offset(i), mem->nslabs - i); mem->slots[i].orig_addr = INVALID_PHYS_ADDR; mem->slots[i].alloc_size = 0; mem->slots[i].pad_slots = 0; } memset(vaddr, 0, bytes); mem->vaddr = vaddr; return; } /** * add_mem_pool() - add a memory pool to the allocator * @mem: Software IO TLB allocator. * @pool: Memory pool to be added. */ static void add_mem_pool(struct io_tlb_mem *mem, struct io_tlb_pool *pool) { #ifdef CONFIG_SWIOTLB_DYNAMIC spin_lock(&mem->lock); list_add_rcu(&pool->node, &mem->pools); mem->nslabs += pool->nslabs; spin_unlock(&mem->lock); #else mem->nslabs = pool->nslabs; #endif } static void __init *swiotlb_memblock_alloc(unsigned long nslabs, unsigned int flags, int (*remap)(void *tlb, unsigned long nslabs)) { size_t bytes = PAGE_ALIGN(nslabs << IO_TLB_SHIFT); void *tlb; /* * By default allocate the bounce buffer memory from low memory, but * allow to pick a location everywhere for hypervisors with guest * memory encryption. */ if (flags & SWIOTLB_ANY) tlb = memblock_alloc(bytes, PAGE_SIZE); else tlb = memblock_alloc_low(bytes, PAGE_SIZE); if (!tlb) { pr_warn("%s: Failed to allocate %zu bytes tlb structure\n", __func__, bytes); return NULL; } if (remap && remap(tlb, nslabs) < 0) { memblock_free(tlb, PAGE_ALIGN(bytes)); pr_warn("%s: Failed to remap %zu bytes\n", __func__, bytes); return NULL; } return tlb; } /* * Statically reserve bounce buffer space and initialize bounce buffer data * structures for the software IO TLB used to implement the DMA API. */ void __init swiotlb_init_remap(bool addressing_limit, unsigned int flags, int (*remap)(void *tlb, unsigned long nslabs)) { struct io_tlb_pool *mem = &io_tlb_default_mem.defpool; unsigned long nslabs; unsigned int nareas; size_t alloc_size; void *tlb; if (!addressing_limit && !swiotlb_force_bounce) return; if (swiotlb_force_disable) return; io_tlb_default_mem.force_bounce = swiotlb_force_bounce || (flags & SWIOTLB_FORCE); #ifdef CONFIG_SWIOTLB_DYNAMIC if (!remap) io_tlb_default_mem.can_grow = true; if (flags & SWIOTLB_ANY) io_tlb_default_mem.phys_limit = virt_to_phys(high_memory - 1); else io_tlb_default_mem.phys_limit = ARCH_LOW_ADDRESS_LIMIT; #endif if (!default_nareas) swiotlb_adjust_nareas(num_possible_cpus()); nslabs = default_nslabs; nareas = limit_nareas(default_nareas, nslabs); while ((tlb = swiotlb_memblock_alloc(nslabs, flags, remap)) == NULL) { if (nslabs <= IO_TLB_MIN_SLABS) return; nslabs = ALIGN(nslabs >> 1, IO_TLB_SEGSIZE); nareas = limit_nareas(nareas, nslabs); } if (default_nslabs != nslabs) { pr_info("SWIOTLB bounce buffer size adjusted %lu -> %lu slabs", default_nslabs, nslabs); default_nslabs = nslabs; } alloc_size = PAGE_ALIGN(array_size(sizeof(*mem->slots), nslabs)); mem->slots = memblock_alloc(alloc_size, PAGE_SIZE); if (!mem->slots) { pr_warn("%s: Failed to allocate %zu bytes align=0x%lx\n", __func__, alloc_size, PAGE_SIZE); return; } mem->areas = memblock_alloc(array_size(sizeof(struct io_tlb_area), nareas), SMP_CACHE_BYTES); if (!mem->areas) { pr_warn("%s: Failed to allocate mem->areas.\n", __func__); return; } swiotlb_init_io_tlb_pool(mem, __pa(tlb), nslabs, false, nareas); add_mem_pool(&io_tlb_default_mem, mem); if (flags & SWIOTLB_VERBOSE) swiotlb_print_info(); } void __init swiotlb_init(bool addressing_limit, unsigned int flags) { swiotlb_init_remap(addressing_limit, flags, NULL); } /* * Systems with larger DMA zones (those that don't support ISA) can * initialize the swiotlb later using the slab allocator if needed. * This should be just like above, but with some error catching. */ int swiotlb_init_late(size_t size, gfp_t gfp_mask, int (*remap)(void *tlb, unsigned long nslabs)) { struct io_tlb_pool *mem = &io_tlb_default_mem.defpool; unsigned long nslabs = ALIGN(size >> IO_TLB_SHIFT, IO_TLB_SEGSIZE); unsigned int nareas; unsigned char *vstart = NULL; unsigned int order, area_order; bool retried = false; int rc = 0; if (io_tlb_default_mem.nslabs) return 0; if (swiotlb_force_disable) return 0; io_tlb_default_mem.force_bounce = swiotlb_force_bounce; #ifdef CONFIG_SWIOTLB_DYNAMIC if (!remap) io_tlb_default_mem.can_grow = true; if (IS_ENABLED(CONFIG_ZONE_DMA) && (gfp_mask & __GFP_DMA)) io_tlb_default_mem.phys_limit = zone_dma_limit; else if (IS_ENABLED(CONFIG_ZONE_DMA32) && (gfp_mask & __GFP_DMA32)) io_tlb_default_mem.phys_limit = max(DMA_BIT_MASK(32), zone_dma_limit); else io_tlb_default_mem.phys_limit = virt_to_phys(high_memory - 1); #endif if (!default_nareas) swiotlb_adjust_nareas(num_possible_cpus()); retry: order = get_order(nslabs << IO_TLB_SHIFT); nslabs = SLABS_PER_PAGE << order; while ((SLABS_PER_PAGE << order) > IO_TLB_MIN_SLABS) { vstart = (void *)__get_free_pages(gfp_mask | __GFP_NOWARN, order); if (vstart) break; order--; nslabs = SLABS_PER_PAGE << order; retried = true; } if (!vstart) return -ENOMEM; if (remap) rc = remap(vstart, nslabs); if (rc) { free_pages((unsigned long)vstart, order); nslabs = ALIGN(nslabs >> 1, IO_TLB_SEGSIZE); if (nslabs < IO_TLB_MIN_SLABS) return rc; retried = true; goto retry; } if (retried) { pr_warn("only able to allocate %ld MB\n", (PAGE_SIZE << order) >> 20); } nareas = limit_nareas(default_nareas, nslabs); area_order = get_order(array_size(sizeof(*mem->areas), nareas)); mem->areas = (struct io_tlb_area *) __get_free_pages(GFP_KERNEL | __GFP_ZERO, area_order); if (!mem->areas) goto error_area; mem->slots = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, get_order(array_size(sizeof(*mem->slots), nslabs))); if (!mem->slots) goto error_slots; set_memory_decrypted((unsigned long)vstart, (nslabs << IO_TLB_SHIFT) >> PAGE_SHIFT); swiotlb_init_io_tlb_pool(mem, virt_to_phys(vstart), nslabs, true, nareas); add_mem_pool(&io_tlb_default_mem, mem); swiotlb_print_info(); return 0; error_slots: free_pages((unsigned long)mem->areas, area_order); error_area: free_pages((unsigned long)vstart, order); return -ENOMEM; } void __init swiotlb_exit(void) { struct io_tlb_pool *mem = &io_tlb_default_mem.defpool; unsigned long tbl_vaddr; size_t tbl_size, slots_size; unsigned int area_order; if (swiotlb_force_bounce) return; if (!mem->nslabs) return; pr_info("tearing down default memory pool\n"); tbl_vaddr = (unsigned long)phys_to_virt(mem->start); tbl_size = PAGE_ALIGN(mem->end - mem->start); slots_size = PAGE_ALIGN(array_size(sizeof(*mem->slots), mem->nslabs)); set_memory_encrypted(tbl_vaddr, tbl_size >> PAGE_SHIFT); if (mem->late_alloc) { area_order = get_order(array_size(sizeof(*mem->areas), mem->nareas)); free_pages((unsigned long)mem->areas, area_order); free_pages(tbl_vaddr, get_order(tbl_size)); free_pages((unsigned long)mem->slots, get_order(slots_size)); } else { memblock_free_late(__pa(mem->areas), array_size(sizeof(*mem->areas), mem->nareas)); memblock_free_late(mem->start, tbl_size); memblock_free_late(__pa(mem->slots), slots_size); } memset(mem, 0, sizeof(*mem)); } #ifdef CONFIG_SWIOTLB_DYNAMIC /** * alloc_dma_pages() - allocate pages to be used for DMA * @gfp: GFP flags for the allocation. * @bytes: Size of the buffer. * @phys_limit: Maximum allowed physical address of the buffer. * * Allocate pages from the buddy allocator. If successful, make the allocated * pages decrypted that they can be used for DMA. * * Return: Decrypted pages, %NULL on allocation failure, or ERR_PTR(-EAGAIN) * if the allocated physical address was above @phys_limit. */ static struct page *alloc_dma_pages(gfp_t gfp, size_t bytes, u64 phys_limit) { unsigned int order = get_order(bytes); struct page *page; phys_addr_t paddr; void *vaddr; page = alloc_pages(gfp, order); if (!page) return NULL; paddr = page_to_phys(page); if (paddr + bytes - 1 > phys_limit) { __free_pages(page, order); return ERR_PTR(-EAGAIN); } vaddr = phys_to_virt(paddr); if (set_memory_decrypted((unsigned long)vaddr, PFN_UP(bytes))) goto error; return page; error: /* Intentional leak if pages cannot be encrypted again. */ if (!set_memory_encrypted((unsigned long)vaddr, PFN_UP(bytes))) __free_pages(page, order); return NULL; } /** * swiotlb_alloc_tlb() - allocate a dynamic IO TLB buffer * @dev: Device for which a memory pool is allocated. * @bytes: Size of the buffer. * @phys_limit: Maximum allowed physical address of the buffer. * @gfp: GFP flags for the allocation. * * Return: Allocated pages, or %NULL on allocation failure. */ static struct page *swiotlb_alloc_tlb(struct device *dev, size_t bytes, u64 phys_limit, gfp_t gfp) { struct page *page; /* * Allocate from the atomic pools if memory is encrypted and * the allocation is atomic, because decrypting may block. */ if (!gfpflags_allow_blocking(gfp) && dev && force_dma_unencrypted(dev)) { void *vaddr; if (!IS_ENABLED(CONFIG_DMA_COHERENT_POOL)) return NULL; return dma_alloc_from_pool(dev, bytes, &vaddr, gfp, dma_coherent_ok); } gfp &= ~GFP_ZONEMASK; if (phys_limit <= zone_dma_limit) gfp |= __GFP_DMA; else if (phys_limit <= DMA_BIT_MASK(32)) gfp |= __GFP_DMA32; while (IS_ERR(page = alloc_dma_pages(gfp, bytes, phys_limit))) { if (IS_ENABLED(CONFIG_ZONE_DMA32) && phys_limit < DMA_BIT_MASK(64) && !(gfp & (__GFP_DMA32 | __GFP_DMA))) gfp |= __GFP_DMA32; else if (IS_ENABLED(CONFIG_ZONE_DMA) && !(gfp & __GFP_DMA)) gfp = (gfp & ~__GFP_DMA32) | __GFP_DMA; else return NULL; } return page; } /** * swiotlb_free_tlb() - free a dynamically allocated IO TLB buffer * @vaddr: Virtual address of the buffer. * @bytes: Size of the buffer. */ static void swiotlb_free_tlb(void *vaddr, size_t bytes) { if (IS_ENABLED(CONFIG_DMA_COHERENT_POOL) && dma_free_from_pool(NULL, vaddr, bytes)) return; /* Intentional leak if pages cannot be encrypted again. */ if (!set_memory_encrypted((unsigned long)vaddr, PFN_UP(bytes))) __free_pages(virt_to_page(vaddr), get_order(bytes)); } /** * swiotlb_alloc_pool() - allocate a new IO TLB memory pool * @dev: Device for which a memory pool is allocated. * @minslabs: Minimum number of slabs. * @nslabs: Desired (maximum) number of slabs. * @nareas: Number of areas. * @phys_limit: Maximum DMA buffer physical address. * @gfp: GFP flags for the allocations. * * Allocate and initialize a new IO TLB memory pool. The actual number of * slabs may be reduced if allocation of @nslabs fails. If even * @minslabs cannot be allocated, this function fails. * * Return: New memory pool, or %NULL on allocation failure. */ static struct io_tlb_pool *swiotlb_alloc_pool(struct device *dev, unsigned long minslabs, unsigned long nslabs, unsigned int nareas, u64 phys_limit, gfp_t gfp) { struct io_tlb_pool *pool; unsigned int slot_order; struct page *tlb; size_t pool_size; size_t tlb_size; if (nslabs > SLABS_PER_PAGE << MAX_PAGE_ORDER) { nslabs = SLABS_PER_PAGE << MAX_PAGE_ORDER; nareas = limit_nareas(nareas, nslabs); } pool_size = sizeof(*pool) + array_size(sizeof(*pool->areas), nareas); pool = kzalloc(pool_size, gfp); if (!pool) goto error; pool->areas = (void *)pool + sizeof(*pool); tlb_size = nslabs << IO_TLB_SHIFT; while (!(tlb = swiotlb_alloc_tlb(dev, tlb_size, phys_limit, gfp))) { if (nslabs <= minslabs) goto error_tlb; nslabs = ALIGN(nslabs >> 1, IO_TLB_SEGSIZE); nareas = limit_nareas(nareas, nslabs); tlb_size = nslabs << IO_TLB_SHIFT; } slot_order = get_order(array_size(sizeof(*pool->slots), nslabs)); pool->slots = (struct io_tlb_slot *) __get_free_pages(gfp, slot_order); if (!pool->slots) goto error_slots; swiotlb_init_io_tlb_pool(pool, page_to_phys(tlb), nslabs, true, nareas); return pool; error_slots: swiotlb_free_tlb(page_address(tlb), tlb_size); error_tlb: kfree(pool); error: return NULL; } /** * swiotlb_dyn_alloc() - dynamic memory pool allocation worker * @work: Pointer to dyn_alloc in struct io_tlb_mem. */ static void swiotlb_dyn_alloc(struct work_struct *work) { struct io_tlb_mem *mem = container_of(work, struct io_tlb_mem, dyn_alloc); struct io_tlb_pool *pool; pool = swiotlb_alloc_pool(NULL, IO_TLB_MIN_SLABS, default_nslabs, default_nareas, mem->phys_limit, GFP_KERNEL); if (!pool) { pr_warn_ratelimited("Failed to allocate new pool"); return; } add_mem_pool(mem, pool); } /** * swiotlb_dyn_free() - RCU callback to free a memory pool * @rcu: RCU head in the corresponding struct io_tlb_pool. */ static void swiotlb_dyn_free(struct rcu_head *rcu) { struct io_tlb_pool *pool = container_of(rcu, struct io_tlb_pool, rcu); size_t slots_size = array_size(sizeof(*pool->slots), pool->nslabs); size_t tlb_size = pool->end - pool->start; free_pages((unsigned long)pool->slots, get_order(slots_size)); swiotlb_free_tlb(pool->vaddr, tlb_size); kfree(pool); } /** * __swiotlb_find_pool() - find the IO TLB pool for a physical address * @dev: Device which has mapped the DMA buffer. * @paddr: Physical address within the DMA buffer. * * Find the IO TLB memory pool descriptor which contains the given physical * address, if any. This function is for use only when the dev is known to * be using swiotlb. Use swiotlb_find_pool() for the more general case * when this condition is not met. * * Return: Memory pool which contains @paddr, or %NULL if none. */ struct io_tlb_pool *__swiotlb_find_pool(struct device *dev, phys_addr_t paddr) { struct io_tlb_mem *mem = dev->dma_io_tlb_mem; struct io_tlb_pool *pool; rcu_read_lock(); list_for_each_entry_rcu(pool, &mem->pools, node) { if (paddr >= pool->start && paddr < pool->end) goto out; } list_for_each_entry_rcu(pool, &dev->dma_io_tlb_pools, node) { if (paddr >= pool->start && paddr < pool->end) goto out; } pool = NULL; out: rcu_read_unlock(); return pool; } /** * swiotlb_del_pool() - remove an IO TLB pool from a device * @dev: Owning device. * @pool: Memory pool to be removed. */ static void swiotlb_del_pool(struct device *dev, struct io_tlb_pool *pool) { unsigned long flags; spin_lock_irqsave(&dev->dma_io_tlb_lock, flags); list_del_rcu(&pool->node); spin_unlock_irqrestore(&dev->dma_io_tlb_lock, flags); call_rcu(&pool->rcu, swiotlb_dyn_free); } #endif /* CONFIG_SWIOTLB_DYNAMIC */ /** * swiotlb_dev_init() - initialize swiotlb fields in &struct device * @dev: Device to be initialized. */ void swiotlb_dev_init(struct device *dev) { dev->dma_io_tlb_mem = &io_tlb_default_mem; #ifdef CONFIG_SWIOTLB_DYNAMIC INIT_LIST_HEAD(&dev->dma_io_tlb_pools); spin_lock_init(&dev->dma_io_tlb_lock); dev->dma_uses_io_tlb = false; #endif } /** * swiotlb_align_offset() - Get required offset into an IO TLB allocation. * @dev: Owning device. * @align_mask: Allocation alignment mask. * @addr: DMA address. * * Return the minimum offset from the start of an IO TLB allocation which is * required for a given buffer address and allocation alignment to keep the * device happy. * * First, the address bits covered by min_align_mask must be identical in the * original address and the bounce buffer address. High bits are preserved by * choosing a suitable IO TLB slot, but bits below IO_TLB_SHIFT require extra * padding bytes before the bounce buffer. * * Second, @align_mask specifies which bits of the first allocated slot must * be zero. This may require allocating additional padding slots, and then the * offset (in bytes) from the first such padding slot is returned. */ static unsigned int swiotlb_align_offset(struct device *dev, unsigned int align_mask, u64 addr) { return addr & dma_get_min_align_mask(dev) & (align_mask | (IO_TLB_SIZE - 1)); } /* * Bounce: copy the swiotlb buffer from or back to the original dma location */ static void swiotlb_bounce(struct device *dev, phys_addr_t tlb_addr, size_t size, enum dma_data_direction dir, struct io_tlb_pool *mem) { int index = (tlb_addr - mem->start) >> IO_TLB_SHIFT; phys_addr_t orig_addr = mem->slots[index].orig_addr; size_t alloc_size = mem->slots[index].alloc_size; unsigned long pfn = PFN_DOWN(orig_addr); unsigned char *vaddr = mem->vaddr + tlb_addr - mem->start; int tlb_offset; if (orig_addr == INVALID_PHYS_ADDR) return; /* * It's valid for tlb_offset to be negative. This can happen when the * "offset" returned by swiotlb_align_offset() is non-zero, and the * tlb_addr is pointing within the first "offset" bytes of the second * or subsequent slots of the allocated swiotlb area. While it's not * valid for tlb_addr to be pointing within the first "offset" bytes * of the first slot, there's no way to check for such an error since * this function can't distinguish the first slot from the second and * subsequent slots. */ tlb_offset = (tlb_addr & (IO_TLB_SIZE - 1)) - swiotlb_align_offset(dev, 0, orig_addr); orig_addr += tlb_offset; alloc_size -= tlb_offset; if (size > alloc_size) { dev_WARN_ONCE(dev, 1, "Buffer overflow detected. Allocation size: %zu. Mapping size: %zu.\n", alloc_size, size); size = alloc_size; } if (PageHighMem(pfn_to_page(pfn))) { unsigned int offset = orig_addr & ~PAGE_MASK; struct page *page; unsigned int sz = 0; unsigned long flags; while (size) { sz = min_t(size_t, PAGE_SIZE - offset, size); local_irq_save(flags); page = pfn_to_page(pfn); if (dir == DMA_TO_DEVICE) memcpy_from_page(vaddr, page, offset, sz); else memcpy_to_page(page, offset, vaddr, sz); local_irq_restore(flags); size -= sz; pfn++; vaddr += sz; offset = 0; } } else if (dir == DMA_TO_DEVICE) { memcpy(vaddr, phys_to_virt(orig_addr), size); } else { memcpy(phys_to_virt(orig_addr), vaddr, size); } } static inline phys_addr_t slot_addr(phys_addr_t start, phys_addr_t idx) { return start + (idx << IO_TLB_SHIFT); } /* * Carefully handle integer overflow which can occur when boundary_mask == ~0UL. */ static inline unsigned long get_max_slots(unsigned long boundary_mask) { return (boundary_mask >> IO_TLB_SHIFT) + 1; } static unsigned int wrap_area_index(struct io_tlb_pool *mem, unsigned int index) { if (index >= mem->area_nslabs) return 0; return index; } /* * Track the total used slots with a global atomic value in order to have * correct information to determine the high water mark. The mem_used() * function gives imprecise results because there's no locking across * multiple areas. */ #ifdef CONFIG_DEBUG_FS static void inc_used_and_hiwater(struct io_tlb_mem *mem, unsigned int nslots) { unsigned long old_hiwater, new_used; new_used = atomic_long_add_return(nslots, &mem->total_used); old_hiwater = atomic_long_read(&mem->used_hiwater); do { if (new_used <= old_hiwater) break; } while (!atomic_long_try_cmpxchg(&mem->used_hiwater, &old_hiwater, new_used)); } static void dec_used(struct io_tlb_mem *mem, unsigned int nslots) { atomic_long_sub(nslots, &mem->total_used); } #else /* !CONFIG_DEBUG_FS */ static void inc_used_and_hiwater(struct io_tlb_mem *mem, unsigned int nslots) { } static void dec_used(struct io_tlb_mem *mem, unsigned int nslots) { } #endif /* CONFIG_DEBUG_FS */ #ifdef CONFIG_SWIOTLB_DYNAMIC #ifdef CONFIG_DEBUG_FS static void inc_transient_used(struct io_tlb_mem *mem, unsigned int nslots) { atomic_long_add(nslots, &mem->transient_nslabs); } static void dec_transient_used(struct io_tlb_mem *mem, unsigned int nslots) { atomic_long_sub(nslots, &mem->transient_nslabs); } #else /* !CONFIG_DEBUG_FS */ static void inc_transient_used(struct io_tlb_mem *mem, unsigned int nslots) { } static void dec_transient_used(struct io_tlb_mem *mem, unsigned int nslots) { } #endif /* CONFIG_DEBUG_FS */ #endif /* CONFIG_SWIOTLB_DYNAMIC */ /** * swiotlb_search_pool_area() - search one memory area in one pool * @dev: Device which maps the buffer. * @pool: Memory pool to be searched. * @area_index: Index of the IO TLB memory area to be searched. * @orig_addr: Original (non-bounced) IO buffer address. * @alloc_size: Total requested size of the bounce buffer, * including initial alignment padding. * @alloc_align_mask: Required alignment of the allocated buffer. * * Find a suitable sequence of IO TLB entries for the request and allocate * a buffer from the given IO TLB memory area. * This function takes care of locking. * * Return: Index of the first allocated slot, or -1 on error. */ static int swiotlb_search_pool_area(struct device *dev, struct io_tlb_pool *pool, int area_index, phys_addr_t orig_addr, size_t alloc_size, unsigned int alloc_align_mask) { struct io_tlb_area *area = pool->areas + area_index; unsigned long boundary_mask = dma_get_seg_boundary(dev); dma_addr_t tbl_dma_addr = phys_to_dma_unencrypted(dev, pool->start) & boundary_mask; unsigned long max_slots = get_max_slots(boundary_mask); unsigned int iotlb_align_mask = dma_get_min_align_mask(dev); unsigned int nslots = nr_slots(alloc_size), stride; unsigned int offset = swiotlb_align_offset(dev, 0, orig_addr); unsigned int index, slots_checked, count = 0, i; unsigned long flags; unsigned int slot_base; unsigned int slot_index; BUG_ON(!nslots); BUG_ON(area_index >= pool->nareas); /* * Historically, swiotlb allocations >= PAGE_SIZE were guaranteed to be * page-aligned in the absence of any other alignment requirements. * 'alloc_align_mask' was later introduced to specify the alignment * explicitly, however this is passed as zero for streaming mappings * and so we preserve the old behaviour there in case any drivers are * relying on it. */ if (!alloc_align_mask && !iotlb_align_mask && alloc_size >= PAGE_SIZE) alloc_align_mask = PAGE_SIZE - 1; /* * Ensure that the allocation is at least slot-aligned and update * 'iotlb_align_mask' to ignore bits that will be preserved when * offsetting into the allocation. */ alloc_align_mask |= (IO_TLB_SIZE - 1); iotlb_align_mask &= ~alloc_align_mask; /* * For mappings with an alignment requirement don't bother looping to * unaligned slots once we found an aligned one. */ stride = get_max_slots(max(alloc_align_mask, iotlb_align_mask)); spin_lock_irqsave(&area->lock, flags); if (unlikely(nslots > pool->area_nslabs - area->used)) goto not_found; slot_base = area_index * pool->area_nslabs; index = area->index; for (slots_checked = 0; slots_checked < pool->area_nslabs; ) { phys_addr_t tlb_addr; slot_index = slot_base + index; tlb_addr = slot_addr(tbl_dma_addr, slot_index); if ((tlb_addr & alloc_align_mask) || (orig_addr && (tlb_addr & iotlb_align_mask) != (orig_addr & iotlb_align_mask))) { index = wrap_area_index(pool, index + 1); slots_checked++; continue; } if (!iommu_is_span_boundary(slot_index, nslots, nr_slots(tbl_dma_addr), max_slots)) { if (pool->slots[slot_index].list >= nslots) goto found; } index = wrap_area_index(pool, index + stride); slots_checked += stride; } not_found: spin_unlock_irqrestore(&area->lock, flags); return -1; found: /* * If we find a slot that indicates we have 'nslots' number of * contiguous buffers, we allocate the buffers from that slot onwards * and set the list of free entries to '0' indicating unavailable. */ for (i = slot_index; i < slot_index + nslots; i++) { pool->slots[i].list = 0; pool->slots[i].alloc_size = alloc_size - (offset + ((i - slot_index) << IO_TLB_SHIFT)); } for (i = slot_index - 1; io_tlb_offset(i) != IO_TLB_SEGSIZE - 1 && pool->slots[i].list; i--) pool->slots[i].list = ++count; /* * Update the indices to avoid searching in the next round. */ area->index = wrap_area_index(pool, index + nslots); area->used += nslots; spin_unlock_irqrestore(&area->lock, flags); inc_used_and_hiwater(dev->dma_io_tlb_mem, nslots); return slot_index; } #ifdef CONFIG_SWIOTLB_DYNAMIC /** * swiotlb_search_area() - search one memory area in all pools * @dev: Device which maps the buffer. * @start_cpu: Start CPU number. * @cpu_offset: Offset from @start_cpu. * @orig_addr: Original (non-bounced) IO buffer address. * @alloc_size: Total requested size of the bounce buffer, * including initial alignment padding. * @alloc_align_mask: Required alignment of the allocated buffer. * @retpool: Used memory pool, updated on return. * * Search one memory area in all pools for a sequence of slots that match the * allocation constraints. * * Return: Index of the first allocated slot, or -1 on error. */ static int swiotlb_search_area(struct device *dev, int start_cpu, int cpu_offset, phys_addr_t orig_addr, size_t alloc_size, unsigned int alloc_align_mask, struct io_tlb_pool **retpool) { struct io_tlb_mem *mem = dev->dma_io_tlb_mem; struct io_tlb_pool *pool; int area_index; int index = -1; rcu_read_lock(); list_for_each_entry_rcu(pool, &mem->pools, node) { if (cpu_offset >= pool->nareas) continue; area_index = (start_cpu + cpu_offset) & (pool->nareas - 1); index = swiotlb_search_pool_area(dev, pool, area_index, orig_addr, alloc_size, alloc_align_mask); if (index >= 0) { *retpool = pool; break; } } rcu_read_unlock(); return index; } /** * swiotlb_find_slots() - search for slots in the whole swiotlb * @dev: Device which maps the buffer. * @orig_addr: Original (non-bounced) IO buffer address. * @alloc_size: Total requested size of the bounce buffer, * including initial alignment padding. * @alloc_align_mask: Required alignment of the allocated buffer. * @retpool: Used memory pool, updated on return. * * Search through the whole software IO TLB to find a sequence of slots that * match the allocation constraints. * * Return: Index of the first allocated slot, or -1 on error. */ static int swiotlb_find_slots(struct device *dev, phys_addr_t orig_addr, size_t alloc_size, unsigned int alloc_align_mask, struct io_tlb_pool **retpool) { struct io_tlb_mem *mem = dev->dma_io_tlb_mem; struct io_tlb_pool *pool; unsigned long nslabs; unsigned long flags; u64 phys_limit; int cpu, i; int index; if (alloc_size > IO_TLB_SEGSIZE * IO_TLB_SIZE) return -1; cpu = raw_smp_processor_id(); for (i = 0; i < default_nareas; ++i) { index = swiotlb_search_area(dev, cpu, i, orig_addr, alloc_size, alloc_align_mask, &pool); if (index >= 0) goto found; } if (!mem->can_grow) return -1; schedule_work(&mem->dyn_alloc); nslabs = nr_slots(alloc_size); phys_limit = min_not_zero(*dev->dma_mask, dev->bus_dma_limit); pool = swiotlb_alloc_pool(dev, nslabs, nslabs, 1, phys_limit, GFP_NOWAIT); if (!pool) return -1; index = swiotlb_search_pool_area(dev, pool, 0, orig_addr, alloc_size, alloc_align_mask); if (index < 0) { swiotlb_dyn_free(&pool->rcu); return -1; } pool->transient = true; spin_lock_irqsave(&dev->dma_io_tlb_lock, flags); list_add_rcu(&pool->node, &dev->dma_io_tlb_pools); spin_unlock_irqrestore(&dev->dma_io_tlb_lock, flags); inc_transient_used(mem, pool->nslabs); found: WRITE_ONCE(dev->dma_uses_io_tlb, true); /* * The general barrier orders reads and writes against a presumed store * of the SWIOTLB buffer address by a device driver (to a driver private * data structure). It serves two purposes. * * First, the store to dev->dma_uses_io_tlb must be ordered before the * presumed store. This guarantees that the returned buffer address * cannot be passed to another CPU before updating dev->dma_uses_io_tlb. * * Second, the load from mem->pools must be ordered before the same * presumed store. This guarantees that the returned buffer address * cannot be observed by another CPU before an update of the RCU list * that was made by swiotlb_dyn_alloc() on a third CPU (cf. multicopy * atomicity). * * See also the comment in swiotlb_find_pool(). */ smp_mb(); *retpool = pool; return index; } #else /* !CONFIG_SWIOTLB_DYNAMIC */ static int swiotlb_find_slots(struct device *dev, phys_addr_t orig_addr, size_t alloc_size, unsigned int alloc_align_mask, struct io_tlb_pool **retpool) { struct io_tlb_pool *pool; int start, i; int index; *retpool = pool = &dev->dma_io_tlb_mem->defpool; i = start = raw_smp_processor_id() & (pool->nareas - 1); do { index = swiotlb_search_pool_area(dev, pool, i, orig_addr, alloc_size, alloc_align_mask); if (index >= 0) return index; if (++i >= pool->nareas) i = 0; } while (i != start); return -1; } #endif /* CONFIG_SWIOTLB_DYNAMIC */ #ifdef CONFIG_DEBUG_FS /** * mem_used() - get number of used slots in an allocator * @mem: Software IO TLB allocator. * * The result is accurate in this version of the function, because an atomic * counter is available if CONFIG_DEBUG_FS is set. * * Return: Number of used slots. */ static unsigned long mem_used(struct io_tlb_mem *mem) { return atomic_long_read(&mem->total_used); } #else /* !CONFIG_DEBUG_FS */ /** * mem_pool_used() - get number of used slots in a memory pool * @pool: Software IO TLB memory pool. * * The result is not accurate, see mem_used(). * * Return: Approximate number of used slots. */ static unsigned long mem_pool_used(struct io_tlb_pool *pool) { int i; unsigned long used = 0; for (i = 0; i < pool->nareas; i++) used += pool->areas[i].used; return used; } /** * mem_used() - get number of used slots in an allocator * @mem: Software IO TLB allocator. * * The result is not accurate, because there is no locking of individual * areas. * * Return: Approximate number of used slots. */ static unsigned long mem_used(struct io_tlb_mem *mem) { #ifdef CONFIG_SWIOTLB_DYNAMIC struct io_tlb_pool *pool; unsigned long used = 0; rcu_read_lock(); list_for_each_entry_rcu(pool, &mem->pools, node) used += mem_pool_used(pool); rcu_read_unlock(); return used; #else return mem_pool_used(&mem->defpool); #endif } #endif /* CONFIG_DEBUG_FS */ /** * swiotlb_tbl_map_single() - bounce buffer map a single contiguous physical area * @dev: Device which maps the buffer. * @orig_addr: Original (non-bounced) physical IO buffer address * @mapping_size: Requested size of the actual bounce buffer, excluding * any pre- or post-padding for alignment * @alloc_align_mask: Required start and end alignment of the allocated buffer * @dir: DMA direction * @attrs: Optional DMA attributes for the map operation * * Find and allocate a suitable sequence of IO TLB slots for the request. * The allocated space starts at an alignment specified by alloc_align_mask, * and the size of the allocated space is rounded up so that the total amount * of allocated space is a multiple of (alloc_align_mask + 1). If * alloc_align_mask is zero, the allocated space may be at any alignment and * the size is not rounded up. * * The returned address is within the allocated space and matches the bits * of orig_addr that are specified in the DMA min_align_mask for the device. As * such, this returned address may be offset from the beginning of the allocated * space. The bounce buffer space starting at the returned address for * mapping_size bytes is initialized to the contents of the original IO buffer * area. Any pre-padding (due to an offset) and any post-padding (due to * rounding-up the size) is not initialized. */ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr, size_t mapping_size, unsigned int alloc_align_mask, enum dma_data_direction dir, unsigned long attrs) { struct io_tlb_mem *mem = dev->dma_io_tlb_mem; unsigned int offset; struct io_tlb_pool *pool; unsigned int i; size_t size; int index; phys_addr_t tlb_addr; unsigned short pad_slots; if (!mem || !mem->nslabs) { dev_warn_ratelimited(dev, "Can not allocate SWIOTLB buffer earlier and can't now provide you with the DMA bounce buffer"); return (phys_addr_t)DMA_MAPPING_ERROR; } if (cc_platform_has(CC_ATTR_MEM_ENCRYPT)) pr_warn_once("Memory encryption is active and system is using DMA bounce buffers\n"); /* * The default swiotlb memory pool is allocated with PAGE_SIZE * alignment. If a mapping is requested with larger alignment, * the mapping may be unable to use the initial slot(s) in all * sets of IO_TLB_SEGSIZE slots. In such case, a mapping request * of or near the maximum mapping size would always fail. */ dev_WARN_ONCE(dev, alloc_align_mask > ~PAGE_MASK, "Alloc alignment may prevent fulfilling requests with max mapping_size\n"); offset = swiotlb_align_offset(dev, alloc_align_mask, orig_addr); size = ALIGN(mapping_size + offset, alloc_align_mask + 1); index = swiotlb_find_slots(dev, orig_addr, size, alloc_align_mask, &pool); if (index == -1) { if (!(attrs & DMA_ATTR_NO_WARN)) dev_warn_ratelimited(dev, "swiotlb buffer is full (sz: %zd bytes), total %lu (slots), used %lu (slots)\n", size, mem->nslabs, mem_used(mem)); return (phys_addr_t)DMA_MAPPING_ERROR; } /* * If dma_skip_sync was set, reset it on first SWIOTLB buffer * mapping to always sync SWIOTLB buffers. */ dma_reset_need_sync(dev); /* * Save away the mapping from the original address to the DMA address. * This is needed when we sync the memory. Then we sync the buffer if * needed. */ pad_slots = offset >> IO_TLB_SHIFT; offset &= (IO_TLB_SIZE - 1); index += pad_slots; pool->slots[index].pad_slots = pad_slots; for (i = 0; i < (nr_slots(size) - pad_slots); i++) pool->slots[index + i].orig_addr = slot_addr(orig_addr, i); tlb_addr = slot_addr(pool->start, index) + offset; /* * When the device is writing memory, i.e. dir == DMA_FROM_DEVICE, copy * the original buffer to the TLB buffer before initiating DMA in order * to preserve the original's data if the device does a partial write, * i.e. if the device doesn't overwrite the entire buffer. Preserving * the original data, even if it's garbage, is necessary to match * hardware behavior. Use of swiotlb is supposed to be transparent, * i.e. swiotlb must not corrupt memory by clobbering unwritten bytes. */ swiotlb_bounce(dev, tlb_addr, mapping_size, DMA_TO_DEVICE, pool); return tlb_addr; } static void swiotlb_release_slots(struct device *dev, phys_addr_t tlb_addr, struct io_tlb_pool *mem) { unsigned long flags; unsigned int offset = swiotlb_align_offset(dev, 0, tlb_addr); int index, nslots, aindex; struct io_tlb_area *area; int count, i; index = (tlb_addr - offset - mem->start) >> IO_TLB_SHIFT; index -= mem->slots[index].pad_slots; nslots = nr_slots(mem->slots[index].alloc_size + offset); aindex = index / mem->area_nslabs; area = &mem->areas[aindex]; /* * Return the buffer to the free list by setting the corresponding * entries to indicate the number of contiguous entries available. * While returning the entries to the free list, we merge the entries * with slots below and above the pool being returned. */ BUG_ON(aindex >= mem->nareas); spin_lock_irqsave(&area->lock, flags); if (index + nslots < ALIGN(index + 1, IO_TLB_SEGSIZE)) count = mem->slots[index + nslots].list; else count = 0; /* * Step 1: return the slots to the free list, merging the slots with * superceeding slots */ for (i = index + nslots - 1; i >= index; i--) { mem->slots[i].list = ++count; mem->slots[i].orig_addr = INVALID_PHYS_ADDR; mem->slots[i].alloc_size = 0; mem->slots[i].pad_slots = 0; } /* * Step 2: merge the returned slots with the preceding slots, if * available (non zero) */ for (i = index - 1; io_tlb_offset(i) != IO_TLB_SEGSIZE - 1 && mem->slots[i].list; i--) mem->slots[i].list = ++count; area->used -= nslots; spin_unlock_irqrestore(&area->lock, flags); dec_used(dev->dma_io_tlb_mem, nslots); } #ifdef CONFIG_SWIOTLB_DYNAMIC /** * swiotlb_del_transient() - delete a transient memory pool * @dev: Device which mapped the buffer. * @tlb_addr: Physical address within a bounce buffer. * @pool: Pointer to the transient memory pool to be checked and deleted. * * Check whether the address belongs to a transient SWIOTLB memory pool. * If yes, then delete the pool. * * Return: %true if @tlb_addr belonged to a transient pool that was released. */ static bool swiotlb_del_transient(struct device *dev, phys_addr_t tlb_addr, struct io_tlb_pool *pool) { if (!pool->transient) return false; dec_used(dev->dma_io_tlb_mem, pool->nslabs); swiotlb_del_pool(dev, pool); dec_transient_used(dev->dma_io_tlb_mem, pool->nslabs); return true; } #else /* !CONFIG_SWIOTLB_DYNAMIC */ static inline bool swiotlb_del_transient(struct device *dev, phys_addr_t tlb_addr, struct io_tlb_pool *pool) { return false; } #endif /* CONFIG_SWIOTLB_DYNAMIC */ /* * tlb_addr is the physical address of the bounce buffer to unmap. */ void __swiotlb_tbl_unmap_single(struct device *dev, phys_addr_t tlb_addr, size_t mapping_size, enum dma_data_direction dir, unsigned long attrs, struct io_tlb_pool *pool) { /* * First, sync the memory before unmapping the entry */ if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) && (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)) swiotlb_bounce(dev, tlb_addr, mapping_size, DMA_FROM_DEVICE, pool); if (swiotlb_del_transient(dev, tlb_addr, pool)) return; swiotlb_release_slots(dev, tlb_addr, pool); } void __swiotlb_sync_single_for_device(struct device *dev, phys_addr_t tlb_addr, size_t size, enum dma_data_direction dir, struct io_tlb_pool *pool) { if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL) swiotlb_bounce(dev, tlb_addr, size, DMA_TO_DEVICE, pool); else BUG_ON(dir != DMA_FROM_DEVICE); } void __swiotlb_sync_single_for_cpu(struct device *dev, phys_addr_t tlb_addr, size_t size, enum dma_data_direction dir, struct io_tlb_pool *pool) { if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL) swiotlb_bounce(dev, tlb_addr, size, DMA_FROM_DEVICE, pool); else BUG_ON(dir != DMA_TO_DEVICE); } /* * Create a swiotlb mapping for the buffer at @paddr, and in case of DMAing * to the device copy the data into it as well. */ dma_addr_t swiotlb_map(struct device *dev, phys_addr_t paddr, size_t size, enum dma_data_direction dir, unsigned long attrs) { phys_addr_t swiotlb_addr; dma_addr_t dma_addr; trace_swiotlb_bounced(dev, phys_to_dma(dev, paddr), size); swiotlb_addr = swiotlb_tbl_map_single(dev, paddr, size, 0, dir, attrs); if (swiotlb_addr == (phys_addr_t)DMA_MAPPING_ERROR) return DMA_MAPPING_ERROR; /* Ensure that the address returned is DMA'ble */ dma_addr = phys_to_dma_unencrypted(dev, swiotlb_addr); if (unlikely(!dma_capable(dev, dma_addr, size, true))) { __swiotlb_tbl_unmap_single(dev, swiotlb_addr, size, dir, attrs | DMA_ATTR_SKIP_CPU_SYNC, swiotlb_find_pool(dev, swiotlb_addr)); dev_WARN_ONCE(dev, 1, "swiotlb addr %pad+%zu overflow (mask %llx, bus limit %llx).\n", &dma_addr, size, *dev->dma_mask, dev->bus_dma_limit); return DMA_MAPPING_ERROR; } if (!dev_is_dma_coherent(dev) && !(attrs & DMA_ATTR_SKIP_CPU_SYNC)) arch_sync_dma_for_device(swiotlb_addr, size, dir); return dma_addr; } size_t swiotlb_max_mapping_size(struct device *dev) { int min_align_mask = dma_get_min_align_mask(dev); int min_align = 0; /* * swiotlb_find_slots() skips slots according to * min align mask. This affects max mapping size. * Take it into acount here. */ if (min_align_mask) min_align = roundup(min_align_mask, IO_TLB_SIZE); return ((size_t)IO_TLB_SIZE) * IO_TLB_SEGSIZE - min_align; } /** * is_swiotlb_allocated() - check if the default software IO TLB is initialized */ bool is_swiotlb_allocated(void) { return io_tlb_default_mem.nslabs; } bool is_swiotlb_active(struct device *dev) { struct io_tlb_mem *mem = dev->dma_io_tlb_mem; return mem && mem->nslabs; } /** * default_swiotlb_base() - get the base address of the default SWIOTLB * * Get the lowest physical address used by the default software IO TLB pool. */ phys_addr_t default_swiotlb_base(void) { #ifdef CONFIG_SWIOTLB_DYNAMIC io_tlb_default_mem.can_grow = false; #endif return io_tlb_default_mem.defpool.start; } /** * default_swiotlb_limit() - get the address limit of the default SWIOTLB * * Get the highest physical address used by the default software IO TLB pool. */ phys_addr_t default_swiotlb_limit(void) { #ifdef CONFIG_SWIOTLB_DYNAMIC return io_tlb_default_mem.phys_limit; #else return io_tlb_default_mem.defpool.end - 1; #endif } #ifdef CONFIG_DEBUG_FS #ifdef CONFIG_SWIOTLB_DYNAMIC static unsigned long mem_transient_used(struct io_tlb_mem *mem) { return atomic_long_read(&mem->transient_nslabs); } static int io_tlb_transient_used_get(void *data, u64 *val) { struct io_tlb_mem *mem = data; *val = mem_transient_used(mem); return 0; } DEFINE_DEBUGFS_ATTRIBUTE(fops_io_tlb_transient_used, io_tlb_transient_used_get, NULL, "%llu\n"); #endif /* CONFIG_SWIOTLB_DYNAMIC */ static int io_tlb_used_get(void *data, u64 *val) { struct io_tlb_mem *mem = data; *val = mem_used(mem); return 0; } static int io_tlb_hiwater_get(void *data, u64 *val) { struct io_tlb_mem *mem = data; *val = atomic_long_read(&mem->used_hiwater); return 0; } static int io_tlb_hiwater_set(void *data, u64 val) { struct io_tlb_mem *mem = data; /* Only allow setting to zero */ if (val != 0) return -EINVAL; atomic_long_set(&mem->used_hiwater, val); return 0; } DEFINE_DEBUGFS_ATTRIBUTE(fops_io_tlb_used, io_tlb_used_get, NULL, "%llu\n"); DEFINE_DEBUGFS_ATTRIBUTE(fops_io_tlb_hiwater, io_tlb_hiwater_get, io_tlb_hiwater_set, "%llu\n"); static void swiotlb_create_debugfs_files(struct io_tlb_mem *mem, const char *dirname) { mem->debugfs = debugfs_create_dir(dirname, io_tlb_default_mem.debugfs); if (!mem->nslabs) return; debugfs_create_ulong("io_tlb_nslabs", 0400, mem->debugfs, &mem->nslabs); debugfs_create_file("io_tlb_used", 0400, mem->debugfs, mem, &fops_io_tlb_used); debugfs_create_file("io_tlb_used_hiwater", 0600, mem->debugfs, mem, &fops_io_tlb_hiwater); #ifdef CONFIG_SWIOTLB_DYNAMIC debugfs_create_file("io_tlb_transient_nslabs", 0400, mem->debugfs, mem, &fops_io_tlb_transient_used); #endif } static int __init swiotlb_create_default_debugfs(void) { swiotlb_create_debugfs_files(&io_tlb_default_mem, "swiotlb"); return 0; } late_initcall(swiotlb_create_default_debugfs); #else /* !CONFIG_DEBUG_FS */ static inline void swiotlb_create_debugfs_files(struct io_tlb_mem *mem, const char *dirname) { } #endif /* CONFIG_DEBUG_FS */ #ifdef CONFIG_DMA_RESTRICTED_POOL struct page *swiotlb_alloc(struct device *dev, size_t size) { struct io_tlb_mem *mem = dev->dma_io_tlb_mem; struct io_tlb_pool *pool; phys_addr_t tlb_addr; unsigned int align; int index; if (!mem) return NULL; align = (1 << (get_order(size) + PAGE_SHIFT)) - 1; index = swiotlb_find_slots(dev, 0, size, align, &pool); if (index == -1) return NULL; tlb_addr = slot_addr(pool->start, index); if (unlikely(!PAGE_ALIGNED(tlb_addr))) { dev_WARN_ONCE(dev, 1, "Cannot allocate pages from non page-aligned swiotlb addr 0x%pa.\n", &tlb_addr); swiotlb_release_slots(dev, tlb_addr, pool); return NULL; } return pfn_to_page(PFN_DOWN(tlb_addr)); } bool swiotlb_free(struct device *dev, struct page *page, size_t size) { phys_addr_t tlb_addr = page_to_phys(page); struct io_tlb_pool *pool; pool = swiotlb_find_pool(dev, tlb_addr); if (!pool) return false; swiotlb_release_slots(dev, tlb_addr, pool); return true; } static int rmem_swiotlb_device_init(struct reserved_mem *rmem, struct device *dev) { struct io_tlb_mem *mem = rmem->priv; unsigned long nslabs = rmem->size >> IO_TLB_SHIFT; /* Set Per-device io tlb area to one */ unsigned int nareas = 1; if (PageHighMem(pfn_to_page(PHYS_PFN(rmem->base)))) { dev_err(dev, "Restricted DMA pool must be accessible within the linear mapping."); return -EINVAL; } /* * Since multiple devices can share the same pool, the private data, * io_tlb_mem struct, will be initialized by the first device attached * to it. */ if (!mem) { struct io_tlb_pool *pool; mem = kzalloc_obj(*mem); if (!mem) return -ENOMEM; pool = &mem->defpool; pool->slots = kzalloc_objs(*pool->slots, nslabs); if (!pool->slots) { kfree(mem); return -ENOMEM; } pool->areas = kzalloc_objs(*pool->areas, nareas); if (!pool->areas) { kfree(pool->slots); kfree(mem); return -ENOMEM; } set_memory_decrypted((unsigned long)phys_to_virt(rmem->base), rmem->size >> PAGE_SHIFT); swiotlb_init_io_tlb_pool(pool, rmem->base, nslabs, false, nareas); mem->force_bounce = true; mem->for_alloc = true; #ifdef CONFIG_SWIOTLB_DYNAMIC spin_lock_init(&mem->lock); INIT_LIST_HEAD_RCU(&mem->pools); #endif add_mem_pool(mem, pool); rmem->priv = mem; swiotlb_create_debugfs_files(mem, rmem->name); } dev->dma_io_tlb_mem = mem; return 0; } static void rmem_swiotlb_device_release(struct reserved_mem *rmem, struct device *dev) { dev->dma_io_tlb_mem = &io_tlb_default_mem; } static const struct reserved_mem_ops rmem_swiotlb_ops = { .device_init = rmem_swiotlb_device_init, .device_release = rmem_swiotlb_device_release, }; static int __init rmem_swiotlb_setup(struct reserved_mem *rmem) { unsigned long node = rmem->fdt_node; if (of_get_flat_dt_prop(node, "reusable", NULL) || of_get_flat_dt_prop(node, "linux,cma-default", NULL) || of_get_flat_dt_prop(node, "linux,dma-default", NULL) || of_get_flat_dt_prop(node, "no-map", NULL)) return -EINVAL; rmem->ops = &rmem_swiotlb_ops; pr_info("Reserved memory: created restricted DMA pool at %pa, size %ld MiB\n", &rmem->base, (unsigned long)rmem->size / SZ_1M); return 0; } RESERVEDMEM_OF_DECLARE(dma, "restricted-dma-pool", rmem_swiotlb_setup); #endif /* CONFIG_DMA_RESTRICTED_POOL */
6 6 6 6 1 3 2 5 2 2 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 // SPDX-License-Identifier: GPL-2.0 /* * BlueZ - Bluetooth protocol stack for Linux * * Copyright (C) 2022 Intel Corporation * Copyright 2023-2024 NXP */ #include <linux/module.h> #include <linux/debugfs.h> #include <linux/seq_file.h> #include <linux/sched/signal.h> #include <net/bluetooth/bluetooth.h> #include <net/bluetooth/hci_core.h> #include <net/bluetooth/iso.h> #include "eir.h" static const struct proto_ops iso_sock_ops; static struct bt_sock_list iso_sk_list = { .lock = __RW_LOCK_UNLOCKED(iso_sk_list.lock) }; /* ---- ISO connections ---- */ struct iso_conn { struct hci_conn *hcon; /* @lock: spinlock protecting changes to iso_conn fields */ spinlock_t lock; struct sock *sk; struct delayed_work timeout_work; struct sk_buff *rx_skb; __u32 rx_len; __u16 tx_sn; struct kref ref; }; #define iso_conn_lock(c) spin_lock(&(c)->lock) #define iso_conn_unlock(c) spin_unlock(&(c)->lock) static void iso_sock_close(struct sock *sk); static void iso_sock_kill(struct sock *sk); /* ----- ISO socket info ----- */ #define iso_pi(sk) ((struct iso_pinfo *)sk) #define EIR_SERVICE_DATA_LENGTH 4 #define BASE_MAX_LENGTH (HCI_MAX_PER_AD_LENGTH - EIR_SERVICE_DATA_LENGTH) #define EIR_BAA_SERVICE_UUID 0x1851 /* iso_pinfo flags values */ enum { BT_SK_BIG_SYNC, BT_SK_PA_SYNC, }; struct iso_pinfo { struct bt_sock bt; bdaddr_t src; __u8 src_type; bdaddr_t dst; __u8 dst_type; __u8 bc_sid; __u8 bc_num_bis; __u8 bc_bis[ISO_MAX_NUM_BIS]; __u16 sync_handle; unsigned long flags; struct bt_iso_qos qos; bool qos_user_set; __u8 base_len; __u8 base[BASE_MAX_LENGTH]; struct iso_conn *conn; }; static struct bt_iso_qos default_qos; static bool check_ucast_qos(struct bt_iso_qos *qos); static bool check_bcast_qos(struct bt_iso_qos *qos); static bool iso_match_sid(struct sock *sk, void *data); static bool iso_match_sid_past(struct sock *sk, void *data); static bool iso_match_sync_handle(struct sock *sk, void *data); static bool iso_match_sync_handle_pa_report(struct sock *sk, void *data); static void iso_sock_disconn(struct sock *sk); typedef bool (*iso_sock_match_t)(struct sock *sk, void *data); static struct sock *iso_get_sock(struct hci_dev *hdev, bdaddr_t *src, bdaddr_t *dst, enum bt_sock_state state, iso_sock_match_t match, void *data); /* ---- ISO timers ---- */ #define ISO_CONN_TIMEOUT secs_to_jiffies(20) #define ISO_DISCONN_TIMEOUT secs_to_jiffies(2) static void iso_conn_free(struct kref *ref) { struct iso_conn *conn = container_of(ref, struct iso_conn, ref); BT_DBG("conn %p", conn); if (conn->sk) iso_pi(conn->sk)->conn = NULL; if (conn->hcon) { conn->hcon->iso_data = NULL; hci_conn_drop(conn->hcon); } /* Ensure no more work items will run since hci_conn has been dropped */ disable_delayed_work_sync(&conn->timeout_work); kfree_skb(conn->rx_skb); kfree(conn); } static void iso_conn_put(struct iso_conn *conn) { if (!conn) return; BT_DBG("conn %p refcnt %d", conn, kref_read(&conn->ref)); kref_put(&conn->ref, iso_conn_free); } static struct iso_conn *iso_conn_hold_unless_zero(struct iso_conn *conn) { if (!conn) return NULL; BT_DBG("conn %p refcnt %u", conn, kref_read(&conn->ref)); if (!kref_get_unless_zero(&conn->ref)) return NULL; return conn; } static struct sock *iso_sock_hold(struct iso_conn *conn) { if (!conn || !bt_sock_linked(&iso_sk_list, conn->sk)) return NULL; sock_hold(conn->sk); return conn->sk; } static void iso_sock_timeout(struct work_struct *work) { struct iso_conn *conn = container_of(work, struct iso_conn, timeout_work.work); struct sock *sk; conn = iso_conn_hold_unless_zero(conn); if (!conn) return; iso_conn_lock(conn); sk = iso_sock_hold(conn); iso_conn_unlock(conn); iso_conn_put(conn); if (!sk) return; BT_DBG("sock %p state %d", sk, sk->sk_state); lock_sock(sk); sk->sk_err = ETIMEDOUT; sk->sk_state_change(sk); release_sock(sk); sock_put(sk); } static void iso_sock_set_timer(struct sock *sk, long timeout) { if (!iso_pi(sk)->conn) return; BT_DBG("sock %p state %d timeout %ld", sk, sk->sk_state, timeout); cancel_delayed_work(&iso_pi(sk)->conn->timeout_work); schedule_delayed_work(&iso_pi(sk)->conn->timeout_work, timeout); } static void iso_sock_clear_timer(struct sock *sk) { if (!iso_pi(sk)->conn) return; BT_DBG("sock %p state %d", sk, sk->sk_state); cancel_delayed_work(&iso_pi(sk)->conn->timeout_work); } /* ---- ISO connections ---- */ static struct iso_conn *iso_conn_add(struct hci_conn *hcon) { struct iso_conn *conn = hcon->iso_data; conn = iso_conn_hold_unless_zero(conn); if (conn) { if (!conn->hcon) { iso_conn_lock(conn); conn->hcon = hcon; iso_conn_unlock(conn); } iso_conn_put(conn); return conn; } conn = kzalloc_obj(*conn); if (!conn) return NULL; kref_init(&conn->ref); spin_lock_init(&conn->lock); INIT_DELAYED_WORK(&conn->timeout_work, iso_sock_timeout); hcon->iso_data = conn; conn->hcon = hcon; conn->tx_sn = 0; BT_DBG("hcon %p conn %p", hcon, conn); return conn; } /* Delete channel. Must be called on the locked socket. */ static void iso_chan_del(struct sock *sk, int err) { struct iso_conn *conn; struct sock *parent; conn = iso_pi(sk)->conn; iso_pi(sk)->conn = NULL; BT_DBG("sk %p, conn %p, err %d", sk, conn, err); if (conn) { iso_conn_lock(conn); conn->sk = NULL; iso_conn_unlock(conn); iso_conn_put(conn); } sk->sk_state = BT_CLOSED; sk->sk_err = err; parent = bt_sk(sk)->parent; if (parent) { bt_accept_unlink(sk); parent->sk_data_ready(parent); } else { sk->sk_state_change(sk); } sock_set_flag(sk, SOCK_ZAPPED); } static void iso_conn_del(struct hci_conn *hcon, int err) { struct iso_conn *conn = hcon->iso_data; struct sock *sk; conn = iso_conn_hold_unless_zero(conn); if (!conn) return; BT_DBG("hcon %p conn %p, err %d", hcon, conn, err); /* Kill socket */ iso_conn_lock(conn); sk = iso_sock_hold(conn); iso_conn_unlock(conn); iso_conn_put(conn); if (!sk) { iso_conn_put(conn); return; } lock_sock(sk); iso_sock_clear_timer(sk); iso_chan_del(sk, err); release_sock(sk); sock_put(sk); } static int __iso_chan_add(struct iso_conn *conn, struct sock *sk, struct sock *parent) { BT_DBG("conn %p", conn); if (iso_pi(sk)->conn == conn && conn->sk == sk) return 0; if (conn->sk) { BT_ERR("conn->sk already set"); return -EBUSY; } iso_pi(sk)->conn = conn; conn->sk = sk; if (parent) bt_accept_enqueue(parent, sk, true); return 0; } static int iso_chan_add(struct iso_conn *conn, struct sock *sk, struct sock *parent) { int err; iso_conn_lock(conn); err = __iso_chan_add(conn, sk, parent); iso_conn_unlock(conn); return err; } static inline u8 le_addr_type(u8 bdaddr_type) { if (bdaddr_type == BDADDR_LE_PUBLIC) return ADDR_LE_DEV_PUBLIC; else return ADDR_LE_DEV_RANDOM; } static int iso_connect_bis(struct sock *sk) { struct iso_conn *conn; struct hci_conn *hcon; struct hci_dev *hdev; int err; BT_DBG("%pMR (SID 0x%2.2x)", &iso_pi(sk)->src, iso_pi(sk)->bc_sid); hdev = hci_get_route(&iso_pi(sk)->dst, &iso_pi(sk)->src, iso_pi(sk)->src_type); if (!hdev) return -EHOSTUNREACH; hci_dev_lock(hdev); if (!bis_capable(hdev)) { err = -EOPNOTSUPP; goto unlock; } /* Fail if user set invalid QoS */ if (iso_pi(sk)->qos_user_set && !check_bcast_qos(&iso_pi(sk)->qos)) { iso_pi(sk)->qos = default_qos; err = -EINVAL; goto unlock; } /* Fail if out PHYs are marked as disabled */ if (!iso_pi(sk)->qos.bcast.out.phys) { err = -EINVAL; goto unlock; } /* Just bind if DEFER_SETUP has been set */ if (test_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags)) { hcon = hci_bind_bis(hdev, &iso_pi(sk)->dst, iso_pi(sk)->bc_sid, &iso_pi(sk)->qos, iso_pi(sk)->base_len, iso_pi(sk)->base, READ_ONCE(sk->sk_sndtimeo)); if (IS_ERR(hcon)) { err = PTR_ERR(hcon); goto unlock; } } else { hcon = hci_connect_bis(hdev, &iso_pi(sk)->dst, le_addr_type(iso_pi(sk)->dst_type), iso_pi(sk)->bc_sid, &iso_pi(sk)->qos, iso_pi(sk)->base_len, iso_pi(sk)->base, READ_ONCE(sk->sk_sndtimeo)); if (IS_ERR(hcon)) { err = PTR_ERR(hcon); goto unlock; } /* Update SID if it was not set */ if (iso_pi(sk)->bc_sid == HCI_SID_INVALID) iso_pi(sk)->bc_sid = hcon->sid; } conn = iso_conn_add(hcon); if (!conn) { hci_conn_drop(hcon); err = -ENOMEM; goto unlock; } lock_sock(sk); err = iso_chan_add(conn, sk, NULL); if (err) { release_sock(sk); goto unlock; } /* Update source addr of the socket */ bacpy(&iso_pi(sk)->src, &hcon->src); if (hcon->state == BT_CONNECTED) { iso_sock_clear_timer(sk); sk->sk_state = BT_CONNECTED; } else if (test_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags)) { iso_sock_clear_timer(sk); sk->sk_state = BT_CONNECT; } else { sk->sk_state = BT_CONNECT; iso_sock_set_timer(sk, READ_ONCE(sk->sk_sndtimeo)); } release_sock(sk); unlock: hci_dev_unlock(hdev); hci_dev_put(hdev); return err; } static int iso_connect_cis(struct sock *sk) { struct iso_conn *conn; struct hci_conn *hcon; struct hci_dev *hdev; int err; BT_DBG("%pMR -> %pMR", &iso_pi(sk)->src, &iso_pi(sk)->dst); hdev = hci_get_route(&iso_pi(sk)->dst, &iso_pi(sk)->src, iso_pi(sk)->src_type); if (!hdev) return -EHOSTUNREACH; hci_dev_lock(hdev); if (!cis_central_capable(hdev)) { err = -EOPNOTSUPP; goto unlock; } /* Fail if user set invalid QoS */ if (iso_pi(sk)->qos_user_set && !check_ucast_qos(&iso_pi(sk)->qos)) { iso_pi(sk)->qos = default_qos; err = -EINVAL; goto unlock; } /* Fail if either PHYs are marked as disabled */ if (!iso_pi(sk)->qos.ucast.in.phys && !iso_pi(sk)->qos.ucast.out.phys) { err = -EINVAL; goto unlock; } /* Check if there are available buffers for output/TX. */ if (iso_pi(sk)->qos.ucast.out.sdu && !hci_iso_count(hdev) && (hdev->iso_pkts && !hdev->iso_cnt)) { err = -ENOBUFS; goto unlock; } /* Just bind if DEFER_SETUP has been set */ if (test_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags)) { hcon = hci_bind_cis(hdev, &iso_pi(sk)->dst, le_addr_type(iso_pi(sk)->dst_type), &iso_pi(sk)->qos, READ_ONCE(sk->sk_sndtimeo)); if (IS_ERR(hcon)) { err = PTR_ERR(hcon); goto unlock; } } else { hcon = hci_connect_cis(hdev, &iso_pi(sk)->dst, le_addr_type(iso_pi(sk)->dst_type), &iso_pi(sk)->qos, READ_ONCE(sk->sk_sndtimeo)); if (IS_ERR(hcon)) { err = PTR_ERR(hcon); goto unlock; } } conn = iso_conn_add(hcon); if (!conn) { hci_conn_drop(hcon); err = -ENOMEM; goto unlock; } lock_sock(sk); err = iso_chan_add(conn, sk, NULL); if (err) { release_sock(sk); goto unlock; } /* Update source addr of the socket */ bacpy(&iso_pi(sk)->src, &hcon->src); if (hcon->state == BT_CONNECTED) { iso_sock_clear_timer(sk); sk->sk_state = BT_CONNECTED; } else if (test_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags)) { iso_sock_clear_timer(sk); sk->sk_state = BT_CONNECT; } else { sk->sk_state = BT_CONNECT; iso_sock_set_timer(sk, READ_ONCE(sk->sk_sndtimeo)); } release_sock(sk); unlock: hci_dev_unlock(hdev); hci_dev_put(hdev); return err; } static struct bt_iso_qos *iso_sock_get_qos(struct sock *sk) { if (sk->sk_state == BT_CONNECTED || sk->sk_state == BT_CONNECT2) return &iso_pi(sk)->conn->hcon->iso_qos; return &iso_pi(sk)->qos; } static int iso_send_frame(struct sock *sk, struct sk_buff *skb, const struct sockcm_cookie *sockc) { struct iso_conn *conn = iso_pi(sk)->conn; struct bt_iso_qos *qos = iso_sock_get_qos(sk); struct hci_iso_data_hdr *hdr; int len = 0; BT_DBG("sk %p len %d", sk, skb->len); if (skb->len > qos->ucast.out.sdu) return -EMSGSIZE; len = skb->len; /* Push ISO data header */ hdr = skb_push(skb, HCI_ISO_DATA_HDR_SIZE); hdr->sn = cpu_to_le16(conn->tx_sn++); hdr->slen = cpu_to_le16(hci_iso_data_len_pack(len, HCI_ISO_STATUS_VALID)); if (sk->sk_state == BT_CONNECTED) { hci_setup_tx_timestamp(skb, 1, sockc); hci_send_iso(conn->hcon, skb); } else { len = -ENOTCONN; } return len; } static void iso_recv_frame(struct iso_conn *conn, struct sk_buff *skb) { struct sock *sk; iso_conn_lock(conn); sk = conn->sk; iso_conn_unlock(conn); if (!sk) goto drop; BT_DBG("sk %p len %d", sk, skb->len); if (sk->sk_state != BT_CONNECTED) goto drop; if (!sock_queue_rcv_skb(sk, skb)) return; drop: kfree_skb(skb); } /* -------- Socket interface ---------- */ static struct sock *__iso_get_sock_listen_by_addr(bdaddr_t *src, bdaddr_t *dst) { struct sock *sk; sk_for_each(sk, &iso_sk_list.head) { if (sk->sk_state != BT_LISTEN) continue; if (bacmp(&iso_pi(sk)->dst, dst)) continue; if (!bacmp(&iso_pi(sk)->src, src)) return sk; } return NULL; } static struct sock *__iso_get_sock_listen_by_sid(bdaddr_t *ba, bdaddr_t *bc, __u8 sid) { struct sock *sk; sk_for_each(sk, &iso_sk_list.head) { if (sk->sk_state != BT_LISTEN) continue; if (bacmp(&iso_pi(sk)->src, ba)) continue; if (bacmp(&iso_pi(sk)->dst, bc)) continue; if (iso_pi(sk)->bc_sid == sid) return sk; } return NULL; } /* Find socket in given state: * source bdaddr (Unicast) * destination bdaddr (Broadcast only) * match func - pass NULL to ignore * match func data - pass -1 to ignore * Returns closest match. */ static struct sock *iso_get_sock(struct hci_dev *hdev, bdaddr_t *src, bdaddr_t *dst, enum bt_sock_state state, iso_sock_match_t match, void *data) { struct sock *sk = NULL, *sk1 = NULL; read_lock(&iso_sk_list.lock); sk_for_each(sk, &iso_sk_list.head) { if (sk->sk_state != state) continue; /* Match Broadcast destination */ if (bacmp(dst, BDADDR_ANY) && bacmp(&iso_pi(sk)->dst, dst)) { struct smp_irk *irk1, *irk2; /* Check if destination is an RPA that we can resolve */ irk1 = hci_find_irk_by_rpa(hdev, dst); if (!irk1) continue; /* Match with identity address */ if (bacmp(&iso_pi(sk)->dst, &irk1->bdaddr)) { /* Check if socket destination address is also * an RPA and if the IRK matches. */ irk2 = hci_find_irk_by_rpa(hdev, &iso_pi(sk)->dst); if (!irk2 || irk1 != irk2) continue; } } /* Use Match function if provided */ if (match && !match(sk, data)) continue; /* Exact match. */ if (!bacmp(&iso_pi(sk)->src, src)) { sock_hold(sk); break; } /* Closest match */ if (!bacmp(&iso_pi(sk)->src, BDADDR_ANY)) { if (sk1) sock_put(sk1); sk1 = sk; sock_hold(sk1); } } if (sk && sk1) sock_put(sk1); read_unlock(&iso_sk_list.lock); return sk ? sk : sk1; } static struct sock *iso_get_sock_big(struct sock *match_sk, bdaddr_t *src, bdaddr_t *dst, uint8_t big) { struct sock *sk = NULL; read_lock(&iso_sk_list.lock); sk_for_each(sk, &iso_sk_list.head) { if (match_sk == sk) continue; /* Look for sockets that have already been * connected to the BIG */ if (sk->sk_state != BT_CONNECTED && sk->sk_state != BT_CONNECT) continue; /* Match Broadcast destination */ if (bacmp(&iso_pi(sk)->dst, dst)) continue; /* Match BIG handle */ if (iso_pi(sk)->qos.bcast.big != big) continue; /* Match source address */ if (bacmp(&iso_pi(sk)->src, src)) continue; sock_hold(sk); break; } read_unlock(&iso_sk_list.lock); return sk; } static void iso_sock_destruct(struct sock *sk) { BT_DBG("sk %p", sk); iso_conn_put(iso_pi(sk)->conn); skb_queue_purge(&sk->sk_receive_queue); skb_queue_purge(&sk->sk_write_queue); skb_queue_purge(&sk->sk_error_queue); } static void iso_sock_cleanup_listen(struct sock *parent) { struct sock *sk; BT_DBG("parent %p", parent); /* Close not yet accepted channels */ while ((sk = bt_accept_dequeue(parent, NULL))) { iso_sock_close(sk); iso_sock_kill(sk); } /* If listening socket has a hcon, properly disconnect it */ if (iso_pi(parent)->conn && iso_pi(parent)->conn->hcon) { iso_sock_disconn(parent); return; } parent->sk_state = BT_CLOSED; sock_set_flag(parent, SOCK_ZAPPED); } /* Kill socket (only if zapped and orphan) * Must be called on unlocked socket. */ static void iso_sock_kill(struct sock *sk) { if (!sock_flag(sk, SOCK_ZAPPED) || sk->sk_socket || sock_flag(sk, SOCK_DEAD)) return; BT_DBG("sk %p state %d", sk, sk->sk_state); /* Sock is dead, so set conn->sk to NULL to avoid possible UAF */ if (iso_pi(sk)->conn) { iso_conn_lock(iso_pi(sk)->conn); iso_pi(sk)->conn->sk = NULL; iso_conn_unlock(iso_pi(sk)->conn); } /* Kill poor orphan */ bt_sock_unlink(&iso_sk_list, sk); sock_set_flag(sk, SOCK_DEAD); sock_put(sk); } static void iso_sock_disconn(struct sock *sk) { struct sock *bis_sk; struct hci_conn *hcon = iso_pi(sk)->conn->hcon; if (test_bit(HCI_CONN_BIG_CREATED, &hcon->flags)) { bis_sk = iso_get_sock_big(sk, &iso_pi(sk)->src, &iso_pi(sk)->dst, iso_pi(sk)->qos.bcast.big); /* If there are any other connected sockets for the * same BIG, just delete the sk and leave the bis * hcon active, in case later rebinding is needed. */ if (bis_sk) { hcon->state = BT_OPEN; hcon->iso_data = NULL; iso_pi(sk)->conn->hcon = NULL; iso_sock_clear_timer(sk); iso_chan_del(sk, bt_to_errno(hcon->abort_reason)); sock_put(bis_sk); return; } } sk->sk_state = BT_DISCONN; iso_conn_lock(iso_pi(sk)->conn); hci_conn_drop(iso_pi(sk)->conn->hcon); iso_pi(sk)->conn->hcon = NULL; iso_conn_unlock(iso_pi(sk)->conn); } static void __iso_sock_close(struct sock *sk) { BT_DBG("sk %p state %d socket %p", sk, sk->sk_state, sk->sk_socket); switch (sk->sk_state) { case BT_LISTEN: iso_sock_cleanup_listen(sk); break; case BT_CONNECT: case BT_CONNECTED: case BT_CONFIG: if (iso_pi(sk)->conn->hcon) iso_sock_disconn(sk); else iso_chan_del(sk, ECONNRESET); break; case BT_CONNECT2: if (iso_pi(sk)->conn->hcon && (test_bit(HCI_CONN_PA_SYNC, &iso_pi(sk)->conn->hcon->flags) || test_bit(HCI_CONN_PA_SYNC_FAILED, &iso_pi(sk)->conn->hcon->flags))) iso_sock_disconn(sk); else iso_chan_del(sk, ECONNRESET); break; case BT_DISCONN: iso_chan_del(sk, ECONNRESET); break; default: sock_set_flag(sk, SOCK_ZAPPED); break; } } /* Must be called on unlocked socket. */ static void iso_sock_close(struct sock *sk) { iso_sock_clear_timer(sk); lock_sock(sk); __iso_sock_close(sk); release_sock(sk); iso_sock_kill(sk); } static void iso_sock_init(struct sock *sk, struct sock *parent) { BT_DBG("sk %p", sk); if (parent) { sk->sk_type = parent->sk_type; bt_sk(sk)->flags = bt_sk(parent)->flags; security_sk_clone(parent, sk); } } static struct proto iso_proto = { .name = "ISO", .owner = THIS_MODULE, .obj_size = sizeof(struct iso_pinfo) }; #define DEFAULT_IO_QOS \ { \ .interval = 10000u, \ .latency = 10u, \ .sdu = 40u, \ .phys = BT_ISO_PHY_2M, \ .rtn = 2u, \ } static struct bt_iso_qos default_qos = { .bcast = { .big = BT_ISO_QOS_BIG_UNSET, .bis = BT_ISO_QOS_BIS_UNSET, .sync_factor = 0x01, .packing = 0x00, .framing = 0x00, .in = DEFAULT_IO_QOS, .out = DEFAULT_IO_QOS, .encryption = 0x00, .bcode = {0x00}, .options = 0x00, .skip = 0x0000, .sync_timeout = BT_ISO_SYNC_TIMEOUT, .sync_cte_type = 0x00, .mse = 0x00, .timeout = BT_ISO_SYNC_TIMEOUT, }, }; static struct sock *iso_sock_alloc(struct net *net, struct socket *sock, int proto, gfp_t prio, int kern) { struct sock *sk; sk = bt_sock_alloc(net, sock, &iso_proto, proto, prio, kern); if (!sk) return NULL; sk->sk_destruct = iso_sock_destruct; sk->sk_sndtimeo = ISO_CONN_TIMEOUT; /* Set address type as public as default src address is BDADDR_ANY */ iso_pi(sk)->src_type = BDADDR_LE_PUBLIC; iso_pi(sk)->qos = default_qos; iso_pi(sk)->sync_handle = -1; bt_sock_link(&iso_sk_list, sk); return sk; } static int iso_sock_create(struct net *net, struct socket *sock, int protocol, int kern) { struct sock *sk; BT_DBG("sock %p", sock); sock->state = SS_UNCONNECTED; if (sock->type != SOCK_SEQPACKET) return -ESOCKTNOSUPPORT; sock->ops = &iso_sock_ops; sk = iso_sock_alloc(net, sock, protocol, GFP_ATOMIC, kern); if (!sk) return -ENOMEM; iso_sock_init(sk, NULL); return 0; } static int iso_sock_bind_bc(struct socket *sock, struct sockaddr_unsized *addr, int addr_len) { struct sockaddr_iso *sa = (struct sockaddr_iso *)addr; struct sock *sk = sock->sk; int i; BT_DBG("sk %p bc_sid %u bc_num_bis %u", sk, sa->iso_bc->bc_sid, sa->iso_bc->bc_num_bis); if (addr_len != sizeof(*sa) + sizeof(*sa->iso_bc)) return -EINVAL; bacpy(&iso_pi(sk)->dst, &sa->iso_bc->bc_bdaddr); /* Check if the address type is of LE type */ if (!bdaddr_type_is_le(sa->iso_bc->bc_bdaddr_type)) return -EINVAL; iso_pi(sk)->dst_type = sa->iso_bc->bc_bdaddr_type; if (sa->iso_bc->bc_sid > 0x0f && sa->iso_bc->bc_sid != HCI_SID_INVALID) return -EINVAL; iso_pi(sk)->bc_sid = sa->iso_bc->bc_sid; if (sa->iso_bc->bc_num_bis > ISO_MAX_NUM_BIS) return -EINVAL; iso_pi(sk)->bc_num_bis = sa->iso_bc->bc_num_bis; for (i = 0; i < iso_pi(sk)->bc_num_bis; i++) if (sa->iso_bc->bc_bis[i] < 0x01 || sa->iso_bc->bc_bis[i] > 0x1f) return -EINVAL; memcpy(iso_pi(sk)->bc_bis, sa->iso_bc->bc_bis, iso_pi(sk)->bc_num_bis); return 0; } /* Must be called on the locked socket. */ static int iso_sock_rebind_bis(struct sock *sk, struct sockaddr_iso *sa, int addr_len) { int err = 0; if (!test_bit(BT_SK_PA_SYNC, &iso_pi(sk)->flags)) return -EBADFD; if (sa->iso_bc->bc_num_bis > ISO_MAX_NUM_BIS) { err = -EINVAL; goto done; } iso_pi(sk)->bc_num_bis = sa->iso_bc->bc_num_bis; for (int i = 0; i < iso_pi(sk)->bc_num_bis; i++) if (sa->iso_bc->bc_bis[i] < 0x01 || sa->iso_bc->bc_bis[i] > 0x1f) { err = -EINVAL; goto done; } memcpy(iso_pi(sk)->bc_bis, sa->iso_bc->bc_bis, iso_pi(sk)->bc_num_bis); done: return err; } static struct hci_dev *iso_conn_get_hdev(struct iso_conn *conn) { struct hci_dev *hdev = NULL; iso_conn_lock(conn); if (conn->hcon) hdev = hci_dev_hold(conn->hcon->hdev); iso_conn_unlock(conn); return hdev; } /* Must be called on the locked socket. */ static int iso_sock_rebind_bc(struct sock *sk, struct sockaddr_iso *sa, int addr_len) { struct hci_dev *hdev; struct hci_conn *bis; int err; if (sk->sk_type != SOCK_SEQPACKET || !iso_pi(sk)->conn) return -EINVAL; /* Check if it is really a Broadcast address being requested */ if (addr_len != sizeof(*sa) + sizeof(*sa->iso_bc)) return -EINVAL; /* Check if the address hasn't changed then perhaps only the number of * bis has changed. */ if (!bacmp(&iso_pi(sk)->dst, &sa->iso_bc->bc_bdaddr) || !bacmp(&sa->iso_bc->bc_bdaddr, BDADDR_ANY)) return iso_sock_rebind_bis(sk, sa, addr_len); /* Check if the address type is of LE type */ if (!bdaddr_type_is_le(sa->iso_bc->bc_bdaddr_type)) return -EINVAL; hdev = iso_conn_get_hdev(iso_pi(sk)->conn); if (!hdev) return -EINVAL; bis = iso_pi(sk)->conn->hcon; /* Release the socket before lookups since that requires hci_dev_lock * which shall not be acquired while holding sock_lock for proper * ordering. */ release_sock(sk); hci_dev_lock(bis->hdev); lock_sock(sk); if (!iso_pi(sk)->conn || iso_pi(sk)->conn->hcon != bis) { /* raced with iso_conn_del() or iso_disconn_sock() */ err = -ENOTCONN; goto unlock; } BT_DBG("sk %p %pMR type %u", sk, &sa->iso_bc->bc_bdaddr, sa->iso_bc->bc_bdaddr_type); err = hci_past_bis(bis, &sa->iso_bc->bc_bdaddr, le_addr_type(sa->iso_bc->bc_bdaddr_type)); unlock: hci_dev_unlock(hdev); hci_dev_put(hdev); return err; } static int iso_sock_bind(struct socket *sock, struct sockaddr_unsized *addr, int addr_len) { struct sockaddr_iso *sa = (struct sockaddr_iso *)addr; struct sock *sk = sock->sk; int err = 0; BT_DBG("sk %p %pMR type %u", sk, &sa->iso_bdaddr, sa->iso_bdaddr_type); if (!addr || addr_len < sizeof(struct sockaddr_iso) || addr->sa_family != AF_BLUETOOTH) return -EINVAL; lock_sock(sk); if ((sk->sk_state == BT_CONNECT2 || sk->sk_state == BT_CONNECTED) && addr_len > sizeof(*sa)) { /* Allow the user to rebind to a different address using * PAST procedures. */ err = iso_sock_rebind_bc(sk, sa, addr_len); goto done; } if (sk->sk_state != BT_OPEN) { err = -EBADFD; goto done; } if (sk->sk_type != SOCK_SEQPACKET) { err = -EINVAL; goto done; } /* Check if the address type is of LE type */ if (!bdaddr_type_is_le(sa->iso_bdaddr_type)) { err = -EINVAL; goto done; } bacpy(&iso_pi(sk)->src, &sa->iso_bdaddr); iso_pi(sk)->src_type = sa->iso_bdaddr_type; /* Check for Broadcast address */ if (addr_len > sizeof(*sa)) { err = iso_sock_bind_bc(sock, addr, addr_len); if (err) goto done; } sk->sk_state = BT_BOUND; done: release_sock(sk); return err; } static int iso_sock_connect(struct socket *sock, struct sockaddr_unsized *addr, int alen, int flags) { struct sockaddr_iso *sa = (struct sockaddr_iso *)addr; struct sock *sk = sock->sk; int err; BT_DBG("sk %p", sk); if (alen < sizeof(struct sockaddr_iso) || addr->sa_family != AF_BLUETOOTH) return -EINVAL; if (sk->sk_state != BT_OPEN && sk->sk_state != BT_BOUND) return -EBADFD; if (sk->sk_type != SOCK_SEQPACKET) return -EINVAL; /* Check if the address type is of LE type */ if (!bdaddr_type_is_le(sa->iso_bdaddr_type)) return -EINVAL; lock_sock(sk); bacpy(&iso_pi(sk)->dst, &sa->iso_bdaddr); iso_pi(sk)->dst_type = sa->iso_bdaddr_type; release_sock(sk); if (bacmp(&iso_pi(sk)->dst, BDADDR_ANY)) err = iso_connect_cis(sk); else err = iso_connect_bis(sk); if (err) return err; lock_sock(sk); if (!test_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags)) { err = bt_sock_wait_state(sk, BT_CONNECTED, sock_sndtimeo(sk, flags & O_NONBLOCK)); } release_sock(sk); return err; } static int iso_listen_bis(struct sock *sk) { struct hci_dev *hdev; int err = 0; struct iso_conn *conn; struct hci_conn *hcon; BT_DBG("%pMR -> %pMR (SID 0x%2.2x)", &iso_pi(sk)->src, &iso_pi(sk)->dst, iso_pi(sk)->bc_sid); write_lock(&iso_sk_list.lock); if (__iso_get_sock_listen_by_sid(&iso_pi(sk)->src, &iso_pi(sk)->dst, iso_pi(sk)->bc_sid)) err = -EADDRINUSE; write_unlock(&iso_sk_list.lock); if (err) return err; hdev = hci_get_route(&iso_pi(sk)->dst, &iso_pi(sk)->src, iso_pi(sk)->src_type); if (!hdev) return -EHOSTUNREACH; hci_dev_lock(hdev); lock_sock(sk); /* Fail if user set invalid QoS */ if (iso_pi(sk)->qos_user_set && !check_bcast_qos(&iso_pi(sk)->qos)) { iso_pi(sk)->qos = default_qos; err = -EINVAL; goto unlock; } hcon = hci_pa_create_sync(hdev, &iso_pi(sk)->dst, le_addr_type(iso_pi(sk)->dst_type), iso_pi(sk)->bc_sid, &iso_pi(sk)->qos); if (IS_ERR(hcon)) { err = PTR_ERR(hcon); goto unlock; } conn = iso_conn_add(hcon); if (!conn) { hci_conn_drop(hcon); err = -ENOMEM; goto unlock; } err = iso_chan_add(conn, sk, NULL); if (err) { hci_conn_drop(hcon); goto unlock; } unlock: release_sock(sk); hci_dev_unlock(hdev); hci_dev_put(hdev); return err; } static int iso_listen_cis(struct sock *sk) { int err = 0; BT_DBG("%pMR", &iso_pi(sk)->src); write_lock(&iso_sk_list.lock); if (__iso_get_sock_listen_by_addr(&iso_pi(sk)->src, &iso_pi(sk)->dst)) err = -EADDRINUSE; write_unlock(&iso_sk_list.lock); return err; } static int iso_sock_listen(struct socket *sock, int backlog) { struct sock *sk = sock->sk; int err = 0; BT_DBG("sk %p backlog %d", sk, backlog); sock_hold(sk); lock_sock(sk); if (sk->sk_state != BT_BOUND) { err = -EBADFD; goto done; } if (sk->sk_type != SOCK_SEQPACKET) { err = -EINVAL; goto done; } if (!bacmp(&iso_pi(sk)->dst, BDADDR_ANY)) { err = iso_listen_cis(sk); } else { /* Drop sock lock to avoid potential * deadlock with the hdev lock. */ release_sock(sk); err = iso_listen_bis(sk); lock_sock(sk); } if (err) goto done; sk->sk_max_ack_backlog = backlog; sk->sk_ack_backlog = 0; sk->sk_state = BT_LISTEN; done: release_sock(sk); sock_put(sk); return err; } static int iso_sock_accept(struct socket *sock, struct socket *newsock, struct proto_accept_arg *arg) { DEFINE_WAIT_FUNC(wait, woken_wake_function); struct sock *sk = sock->sk, *ch; long timeo; int err = 0; /* Use explicit nested locking to avoid lockdep warnings generated * because the parent socket and the child socket are locked on the * same thread. */ lock_sock_nested(sk, SINGLE_DEPTH_NESTING); timeo = sock_rcvtimeo(sk, arg->flags & O_NONBLOCK); BT_DBG("sk %p timeo %ld", sk, timeo); /* Wait for an incoming connection. (wake-one). */ add_wait_queue_exclusive(sk_sleep(sk), &wait); while (1) { if (sk->sk_state != BT_LISTEN) { err = -EBADFD; break; } ch = bt_accept_dequeue(sk, newsock); if (ch) break; if (!timeo) { err = -EAGAIN; break; } if (signal_pending(current)) { err = sock_intr_errno(timeo); break; } release_sock(sk); timeo = wait_woken(&wait, TASK_INTERRUPTIBLE, timeo); lock_sock_nested(sk, SINGLE_DEPTH_NESTING); } remove_wait_queue(sk_sleep(sk), &wait); if (err) goto done; newsock->state = SS_CONNECTED; BT_DBG("new socket %p", ch); /* A Broadcast Sink might require BIG sync to be terminated * and re-established multiple times, while keeping the same * PA sync handle active. To allow this, once all BIS * connections have been accepted on a PA sync parent socket, * "reset" socket state, to allow future BIG re-sync procedures. */ if (test_bit(BT_SK_PA_SYNC, &iso_pi(sk)->flags)) { /* Iterate through the list of bound BIS indices * and clear each BIS as they are accepted by the * user space, one by one. */ for (int i = 0; i < iso_pi(sk)->bc_num_bis; i++) { if (iso_pi(sk)->bc_bis[i] > 0) { iso_pi(sk)->bc_bis[i] = 0; iso_pi(sk)->bc_num_bis--; break; } } if (iso_pi(sk)->bc_num_bis == 0) { /* Once the last BIS was accepted, reset parent * socket parameters to mark that the listening * process for BIS connections has been completed: * * 1. Reset the DEFER setup flag on the parent sk. * 2. Clear the flag marking that the BIG create * sync command is pending. * 3. Transition socket state from BT_LISTEN to * BT_CONNECTED. */ set_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags); clear_bit(BT_SK_BIG_SYNC, &iso_pi(sk)->flags); sk->sk_state = BT_CONNECTED; } } done: release_sock(sk); return err; } static int iso_sock_getname(struct socket *sock, struct sockaddr *addr, int peer) { struct sockaddr_iso *sa = (struct sockaddr_iso *)addr; struct sock *sk = sock->sk; int len = sizeof(struct sockaddr_iso); BT_DBG("sock %p, sk %p", sock, sk); addr->sa_family = AF_BLUETOOTH; if (peer) { struct hci_conn *hcon = iso_pi(sk)->conn ? iso_pi(sk)->conn->hcon : NULL; bacpy(&sa->iso_bdaddr, &iso_pi(sk)->dst); sa->iso_bdaddr_type = iso_pi(sk)->dst_type; if (hcon && (hcon->type == BIS_LINK || hcon->type == PA_LINK)) { sa->iso_bc->bc_sid = iso_pi(sk)->bc_sid; sa->iso_bc->bc_num_bis = iso_pi(sk)->bc_num_bis; memcpy(sa->iso_bc->bc_bis, iso_pi(sk)->bc_bis, ISO_MAX_NUM_BIS); len += sizeof(struct sockaddr_iso_bc); } } else { bacpy(&sa->iso_bdaddr, &iso_pi(sk)->src); sa->iso_bdaddr_type = iso_pi(sk)->src_type; } return len; } static int iso_sock_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) { struct sock *sk = sock->sk; struct sk_buff *skb, **frag; struct sockcm_cookie sockc; size_t mtu; int err; BT_DBG("sock %p, sk %p", sock, sk); err = sock_error(sk); if (err) return err; if (msg->msg_flags & MSG_OOB) return -EOPNOTSUPP; hci_sockcm_init(&sockc, sk); if (msg->msg_controllen) { err = sock_cmsg_send(sk, msg, &sockc); if (err) return err; } lock_sock(sk); if (sk->sk_state != BT_CONNECTED) { release_sock(sk); return -ENOTCONN; } mtu = iso_pi(sk)->conn->hcon->mtu; release_sock(sk); skb = bt_skb_sendmsg(sk, msg, len, mtu, HCI_ISO_DATA_HDR_SIZE, 0); if (IS_ERR(skb)) return PTR_ERR(skb); len -= skb->len; BT_DBG("skb %p len %d", sk, skb->len); /* Continuation fragments */ frag = &skb_shinfo(skb)->frag_list; while (len) { struct sk_buff *tmp; tmp = bt_skb_sendmsg(sk, msg, len, mtu, 0, 0); if (IS_ERR(tmp)) { kfree_skb(skb); return PTR_ERR(tmp); } *frag = tmp; len -= tmp->len; skb->len += tmp->len; skb->data_len += tmp->len; BT_DBG("frag %p len %d", *frag, tmp->len); frag = &(*frag)->next; } lock_sock(sk); if (sk->sk_state == BT_CONNECTED) err = iso_send_frame(sk, skb, &sockc); else err = -ENOTCONN; release_sock(sk); if (err < 0) kfree_skb(skb); return err; } static void iso_conn_defer_accept(struct hci_conn *conn) { struct hci_cp_le_accept_cis cp; struct hci_dev *hdev = conn->hdev; BT_DBG("conn %p", conn); conn->state = BT_CONFIG; cp.handle = cpu_to_le16(conn->handle); hci_send_cmd(hdev, HCI_OP_LE_ACCEPT_CIS, sizeof(cp), &cp); } static void iso_conn_big_sync(struct sock *sk) { int err; struct hci_dev *hdev; hdev = hci_get_route(&iso_pi(sk)->dst, &iso_pi(sk)->src, iso_pi(sk)->src_type); if (!hdev) return; /* hci_le_big_create_sync requires hdev lock to be held, since * it enqueues the HCI LE BIG Create Sync command via * hci_cmd_sync_queue_once, which checks hdev flags that might * change. */ hci_dev_lock(hdev); lock_sock(sk); if (!test_and_set_bit(BT_SK_BIG_SYNC, &iso_pi(sk)->flags)) { err = hci_conn_big_create_sync(hdev, iso_pi(sk)->conn->hcon, &iso_pi(sk)->qos, iso_pi(sk)->sync_handle, iso_pi(sk)->bc_num_bis, iso_pi(sk)->bc_bis); if (err) bt_dev_err(hdev, "hci_big_create_sync: %d", err); } release_sock(sk); hci_dev_unlock(hdev); } static int iso_sock_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, int flags) { struct sock *sk = sock->sk; struct iso_pinfo *pi = iso_pi(sk); bool early_ret = false; int err = 0; BT_DBG("sk %p", sk); if (unlikely(flags & MSG_ERRQUEUE)) return sock_recv_errqueue(sk, msg, len, SOL_BLUETOOTH, BT_SCM_ERROR); if (test_and_clear_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags)) { sock_hold(sk); lock_sock(sk); switch (sk->sk_state) { case BT_CONNECT2: if (test_bit(BT_SK_PA_SYNC, &pi->flags)) { release_sock(sk); iso_conn_big_sync(sk); lock_sock(sk); sk->sk_state = BT_LISTEN; } else { iso_conn_defer_accept(pi->conn->hcon); sk->sk_state = BT_CONFIG; } early_ret = true; break; case BT_CONNECTED: if (test_bit(BT_SK_PA_SYNC, &iso_pi(sk)->flags)) { release_sock(sk); iso_conn_big_sync(sk); lock_sock(sk); sk->sk_state = BT_LISTEN; early_ret = true; } break; case BT_CONNECT: release_sock(sk); err = iso_connect_cis(sk); lock_sock(sk); early_ret = true; break; default: break; } release_sock(sk); sock_put(sk); if (early_ret) return err; } return bt_sock_recvmsg(sock, msg, len, flags); } static bool check_io_qos(struct bt_iso_io_qos *qos) { /* If no PHY is enable SDU must be 0 */ if (!qos->phys && qos->sdu) return false; if (qos->interval && (qos->interval < 0xff || qos->interval > 0xfffff)) return false; if (qos->latency && (qos->latency < 0x05 || qos->latency > 0xfa0)) return false; if (qos->phys > BT_ISO_PHY_ANY) return false; return true; } static bool check_ucast_qos(struct bt_iso_qos *qos) { if (qos->ucast.cig > 0xef && qos->ucast.cig != BT_ISO_QOS_CIG_UNSET) return false; if (qos->ucast.cis > 0xef && qos->ucast.cis != BT_ISO_QOS_CIS_UNSET) return false; if (qos->ucast.sca > 0x07) return false; if (qos->ucast.packing > 0x01) return false; if (qos->ucast.framing > 0x01) return false; if (!check_io_qos(&qos->ucast.in)) return false; if (!check_io_qos(&qos->ucast.out)) return false; return true; } static bool check_bcast_qos(struct bt_iso_qos *qos) { if (!qos->bcast.sync_factor) qos->bcast.sync_factor = 0x01; if (qos->bcast.packing > 0x01) return false; if (qos->bcast.framing > 0x01) return false; if (!check_io_qos(&qos->bcast.in)) return false; if (!check_io_qos(&qos->bcast.out)) return false; if (qos->bcast.encryption > 0x01) return false; if (qos->bcast.options > 0x07) return false; if (qos->bcast.skip > 0x01f3) return false; if (!qos->bcast.sync_timeout) qos->bcast.sync_timeout = BT_ISO_SYNC_TIMEOUT; if (qos->bcast.sync_timeout < 0x000a || qos->bcast.sync_timeout > 0x4000) return false; if (qos->bcast.sync_cte_type > 0x1f) return false; if (qos->bcast.mse > 0x1f) return false; if (!qos->bcast.timeout) qos->bcast.sync_timeout = BT_ISO_SYNC_TIMEOUT; if (qos->bcast.timeout < 0x000a || qos->bcast.timeout > 0x4000) return false; return true; } static int iso_sock_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval, unsigned int optlen) { struct sock *sk = sock->sk; int err = 0; struct bt_iso_qos qos = default_qos; u32 opt; BT_DBG("sk %p", sk); lock_sock(sk); switch (optname) { case BT_DEFER_SETUP: if (sk->sk_state != BT_BOUND && sk->sk_state != BT_LISTEN) { err = -EINVAL; break; } err = copy_safe_from_sockptr(&opt, sizeof(opt), optval, optlen); if (err) break; if (opt) set_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags); else clear_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags); break; case BT_PKT_STATUS: err = copy_safe_from_sockptr(&opt, sizeof(opt), optval, optlen); if (err) break; if (opt) set_bit(BT_SK_PKT_STATUS, &bt_sk(sk)->flags); else clear_bit(BT_SK_PKT_STATUS, &bt_sk(sk)->flags); break; case BT_PKT_SEQNUM: err = copy_safe_from_sockptr(&opt, sizeof(opt), optval, optlen); if (err) break; if (opt) set_bit(BT_SK_PKT_SEQNUM, &bt_sk(sk)->flags); else clear_bit(BT_SK_PKT_SEQNUM, &bt_sk(sk)->flags); break; case BT_ISO_QOS: if (sk->sk_state != BT_OPEN && sk->sk_state != BT_BOUND && sk->sk_state != BT_CONNECT2 && (!test_bit(BT_SK_PA_SYNC, &iso_pi(sk)->flags) || sk->sk_state != BT_CONNECTED)) { err = -EINVAL; break; } err = copy_safe_from_sockptr(&qos, sizeof(qos), optval, optlen); if (err) break; iso_pi(sk)->qos = qos; iso_pi(sk)->qos_user_set = true; break; case BT_ISO_BASE: if (sk->sk_state != BT_OPEN && sk->sk_state != BT_BOUND && sk->sk_state != BT_CONNECT2) { err = -EINVAL; break; } if (optlen > sizeof(iso_pi(sk)->base)) { err = -EINVAL; break; } err = copy_safe_from_sockptr(iso_pi(sk)->base, optlen, optval, optlen); if (err) break; iso_pi(sk)->base_len = optlen; break; default: err = -ENOPROTOOPT; break; } release_sock(sk); return err; } static int iso_sock_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen) { struct sock *sk = sock->sk; int len, err = 0; struct bt_iso_qos *qos; u8 base_len; u8 *base; BT_DBG("sk %p", sk); if (get_user(len, optlen)) return -EFAULT; lock_sock(sk); switch (optname) { case BT_DEFER_SETUP: if (sk->sk_state == BT_CONNECTED) { err = -EINVAL; break; } if (put_user(test_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags), (u32 __user *)optval)) err = -EFAULT; break; case BT_PKT_STATUS: if (put_user(test_bit(BT_SK_PKT_STATUS, &bt_sk(sk)->flags), (int __user *)optval)) err = -EFAULT; break; case BT_ISO_QOS: qos = iso_sock_get_qos(sk); len = min_t(unsigned int, len, sizeof(*qos)); if (copy_to_user(optval, qos, len)) err = -EFAULT; break; case BT_ISO_BASE: if (sk->sk_state == BT_CONNECTED && !bacmp(&iso_pi(sk)->dst, BDADDR_ANY)) { base_len = iso_pi(sk)->conn->hcon->le_per_adv_data_len; base = iso_pi(sk)->conn->hcon->le_per_adv_data; } else { base_len = iso_pi(sk)->base_len; base = iso_pi(sk)->base; } len = min_t(unsigned int, len, base_len); if (copy_to_user(optval, base, len)) err = -EFAULT; if (put_user(len, optlen)) err = -EFAULT; break; default: err = -ENOPROTOOPT; break; } release_sock(sk); return err; } static int iso_sock_shutdown(struct socket *sock, int how) { struct sock *sk = sock->sk; int err = 0; BT_DBG("sock %p, sk %p, how %d", sock, sk, how); if (!sk) return 0; sock_hold(sk); lock_sock(sk); switch (how) { case SHUT_RD: if (sk->sk_shutdown & RCV_SHUTDOWN) goto unlock; sk->sk_shutdown |= RCV_SHUTDOWN; break; case SHUT_WR: if (sk->sk_shutdown & SEND_SHUTDOWN) goto unlock; sk->sk_shutdown |= SEND_SHUTDOWN; break; case SHUT_RDWR: if (sk->sk_shutdown & SHUTDOWN_MASK) goto unlock; sk->sk_shutdown |= SHUTDOWN_MASK; break; } iso_sock_clear_timer(sk); __iso_sock_close(sk); if (sock_flag(sk, SOCK_LINGER) && sk->sk_lingertime && !(current->flags & PF_EXITING)) err = bt_sock_wait_state(sk, BT_CLOSED, sk->sk_lingertime); unlock: release_sock(sk); sock_put(sk); return err; } static int iso_sock_release(struct socket *sock) { struct sock *sk = sock->sk; int err = 0; BT_DBG("sock %p, sk %p", sock, sk); if (!sk) return 0; iso_sock_close(sk); if (sock_flag(sk, SOCK_LINGER) && READ_ONCE(sk->sk_lingertime) && !(current->flags & PF_EXITING)) { lock_sock(sk); err = bt_sock_wait_state(sk, BT_CLOSED, sk->sk_lingertime); release_sock(sk); } sock_orphan(sk); iso_sock_kill(sk); return err; } static void iso_sock_ready(struct sock *sk) { BT_DBG("sk %p", sk); if (!sk) return; lock_sock(sk); iso_sock_clear_timer(sk); sk->sk_state = BT_CONNECTED; sk->sk_state_change(sk); release_sock(sk); } static bool iso_match_big(struct sock *sk, void *data) { struct hci_evt_le_big_sync_established *ev = data; return ev->handle == iso_pi(sk)->qos.bcast.big; } static bool iso_match_big_hcon(struct sock *sk, void *data) { struct hci_conn *hcon = data; return hcon->iso_qos.bcast.big == iso_pi(sk)->qos.bcast.big; } static bool iso_match_pa_sync_flag(struct sock *sk, void *data) { return test_bit(BT_SK_PA_SYNC, &iso_pi(sk)->flags); } static bool iso_match_dst(struct sock *sk, void *data) { return !bacmp(&iso_pi(sk)->dst, (bdaddr_t *)data); } static void iso_conn_ready(struct iso_conn *conn) { struct sock *parent = NULL; struct sock *sk = conn->sk; struct hci_ev_le_big_sync_established *ev = NULL; struct hci_ev_le_pa_sync_established *ev2 = NULL; struct hci_ev_le_per_adv_report *ev3 = NULL; struct hci_conn *hcon; struct hci_dev *hdev; BT_DBG("conn %p", conn); if (sk) { /* Attempt to update source address in case of BIS Sender if * the advertisement is using a random address. */ if (conn->hcon->type == BIS_LINK && conn->hcon->role == HCI_ROLE_MASTER && !bacmp(&conn->hcon->dst, BDADDR_ANY)) { struct hci_conn *bis = conn->hcon; struct adv_info *adv; adv = hci_find_adv_instance(bis->hdev, bis->iso_qos.bcast.bis); if (adv && bacmp(&adv->random_addr, BDADDR_ANY)) { lock_sock(sk); iso_pi(sk)->src_type = BDADDR_LE_RANDOM; bacpy(&iso_pi(sk)->src, &adv->random_addr); release_sock(sk); } } iso_sock_ready(conn->sk); } else { hcon = conn->hcon; if (!hcon) return; hdev = hcon->hdev; if (test_bit(HCI_CONN_BIG_SYNC, &hcon->flags)) { /* A BIS slave hcon is notified to the ISO layer * after the Command Complete for the LE Setup * ISO Data Path command is received. Get the * parent socket that matches the hcon BIG handle. */ parent = iso_get_sock(hdev, &hcon->src, &hcon->dst, BT_LISTEN, iso_match_big_hcon, hcon); } else if (test_bit(HCI_CONN_BIG_SYNC_FAILED, &hcon->flags)) { ev = hci_recv_event_data(hcon->hdev, HCI_EVT_LE_BIG_SYNC_ESTABLISHED); /* Get reference to PA sync parent socket, if it exists */ parent = iso_get_sock(hdev, &hcon->src, &hcon->dst, BT_LISTEN, iso_match_pa_sync_flag, NULL); if (!parent && ev) parent = iso_get_sock(hdev, &hcon->src, &hcon->dst, BT_LISTEN, iso_match_big, ev); } else if (test_bit(HCI_CONN_PA_SYNC_FAILED, &hcon->flags)) { ev2 = hci_recv_event_data(hcon->hdev, HCI_EV_LE_PA_SYNC_ESTABLISHED); if (ev2) parent = iso_get_sock(hdev, &hcon->src, &hcon->dst, BT_LISTEN, iso_match_sid, ev2); } else if (test_bit(HCI_CONN_PA_SYNC, &hcon->flags)) { ev3 = hci_recv_event_data(hcon->hdev, HCI_EV_LE_PER_ADV_REPORT); if (ev3) parent = iso_get_sock(hdev, &hcon->src, &hcon->dst, BT_LISTEN, iso_match_sync_handle_pa_report, ev3); } if (!parent) parent = iso_get_sock(hdev, &hcon->src, BDADDR_ANY, BT_LISTEN, iso_match_dst, BDADDR_ANY); if (!parent) return; lock_sock(parent); sk = iso_sock_alloc(sock_net(parent), NULL, BTPROTO_ISO, GFP_ATOMIC, 0); if (!sk) { release_sock(parent); return; } iso_sock_init(sk, parent); bacpy(&iso_pi(sk)->src, &hcon->src); /* Convert from HCI to three-value type */ if (hcon->src_type == ADDR_LE_DEV_PUBLIC) iso_pi(sk)->src_type = BDADDR_LE_PUBLIC; else iso_pi(sk)->src_type = BDADDR_LE_RANDOM; /* If hcon has no destination address (BDADDR_ANY) it means it * was created by HCI_EV_LE_BIG_SYNC_ESTABILISHED or * HCI_EV_LE_PA_SYNC_ESTABLISHED so we need to initialize using * the parent socket destination address. */ if (!bacmp(&hcon->dst, BDADDR_ANY)) { bacpy(&hcon->dst, &iso_pi(parent)->dst); hcon->dst_type = le_addr_type(iso_pi(parent)->dst_type); } if (test_bit(HCI_CONN_PA_SYNC, &hcon->flags)) { iso_pi(sk)->qos = iso_pi(parent)->qos; hcon->iso_qos = iso_pi(sk)->qos; iso_pi(sk)->bc_sid = iso_pi(parent)->bc_sid; iso_pi(sk)->bc_num_bis = iso_pi(parent)->bc_num_bis; memcpy(iso_pi(sk)->bc_bis, iso_pi(parent)->bc_bis, ISO_MAX_NUM_BIS); set_bit(BT_SK_PA_SYNC, &iso_pi(sk)->flags); } bacpy(&iso_pi(sk)->dst, &hcon->dst); /* Convert from HCI to three-value type */ if (hcon->dst_type == ADDR_LE_DEV_PUBLIC) iso_pi(sk)->dst_type = BDADDR_LE_PUBLIC; else iso_pi(sk)->dst_type = BDADDR_LE_RANDOM; iso_pi(sk)->sync_handle = iso_pi(parent)->sync_handle; memcpy(iso_pi(sk)->base, iso_pi(parent)->base, iso_pi(parent)->base_len); iso_pi(sk)->base_len = iso_pi(parent)->base_len; hci_conn_hold(hcon); iso_chan_add(conn, sk, parent); if ((ev && ((struct hci_evt_le_big_sync_established *)ev)->status) || (ev2 && ev2->status)) { /* Trigger error signal on child socket */ sk->sk_err = ECONNREFUSED; sk->sk_error_report(sk); } if (test_bit(BT_SK_DEFER_SETUP, &bt_sk(parent)->flags)) sk->sk_state = BT_CONNECT2; else sk->sk_state = BT_CONNECTED; /* Wake up parent */ parent->sk_data_ready(parent); release_sock(parent); sock_put(parent); } } static bool iso_match_sid(struct sock *sk, void *data) { struct hci_ev_le_pa_sync_established *ev = data; if (iso_pi(sk)->bc_sid == HCI_SID_INVALID) return true; return ev->sid == iso_pi(sk)->bc_sid; } static bool iso_match_sid_past(struct sock *sk, void *data) { struct hci_ev_le_past_received *ev = data; if (iso_pi(sk)->bc_sid == HCI_SID_INVALID) return true; return ev->sid == iso_pi(sk)->bc_sid; } static bool iso_match_sync_handle(struct sock *sk, void *data) { struct hci_evt_le_big_info_adv_report *ev = data; return le16_to_cpu(ev->sync_handle) == iso_pi(sk)->sync_handle; } static bool iso_match_sync_handle_pa_report(struct sock *sk, void *data) { struct hci_ev_le_per_adv_report *ev = data; return le16_to_cpu(ev->sync_handle) == iso_pi(sk)->sync_handle; } /* ----- ISO interface with lower layer (HCI) ----- */ int iso_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr, __u8 *flags) { struct hci_ev_le_pa_sync_established *ev1; struct hci_ev_le_past_received *ev1a; struct hci_evt_le_big_info_adv_report *ev2; struct hci_ev_le_per_adv_report *ev3; struct sock *sk; bt_dev_dbg(hdev, "bdaddr %pMR", bdaddr); /* Broadcast receiver requires handling of some events before it can * proceed to establishing a BIG sync: * * 1. HCI_EV_LE_PA_SYNC_ESTABLISHED: The socket may specify a specific * SID to listen to and once sync is established its handle needs to * be stored in iso_pi(sk)->sync_handle so it can be matched once * receiving the BIG Info. * 1a. HCI_EV_LE_PAST_RECEIVED: alternative to 1. * 2. HCI_EVT_LE_BIG_INFO_ADV_REPORT: When connect_ind is triggered by a * a BIG Info it attempts to check if there any listening socket with * the same sync_handle and if it does then attempt to create a sync. * 3. HCI_EV_LE_PER_ADV_REPORT: When a PA report is received, it is stored * in iso_pi(sk)->base so it can be passed up to user, in the case of a * broadcast sink. */ ev1 = hci_recv_event_data(hdev, HCI_EV_LE_PA_SYNC_ESTABLISHED); if (ev1) { sk = iso_get_sock(hdev, &hdev->bdaddr, bdaddr, BT_LISTEN, iso_match_sid, ev1); if (sk && !ev1->status) { iso_pi(sk)->sync_handle = le16_to_cpu(ev1->handle); iso_pi(sk)->bc_sid = ev1->sid; } goto done; } ev1a = hci_recv_event_data(hdev, HCI_EV_LE_PAST_RECEIVED); if (ev1a) { sk = iso_get_sock(hdev, &hdev->bdaddr, bdaddr, BT_LISTEN, iso_match_sid_past, ev1a); if (sk && !ev1a->status) { iso_pi(sk)->sync_handle = le16_to_cpu(ev1a->sync_handle); iso_pi(sk)->bc_sid = ev1a->sid; } goto done; } ev2 = hci_recv_event_data(hdev, HCI_EVT_LE_BIG_INFO_ADV_REPORT); if (ev2) { /* Check if BIGInfo report has already been handled */ sk = iso_get_sock(hdev, &hdev->bdaddr, bdaddr, BT_CONNECTED, iso_match_sync_handle, ev2); if (sk) { sock_put(sk); sk = NULL; goto done; } /* Try to get PA sync socket, if it exists */ sk = iso_get_sock(hdev, &hdev->bdaddr, bdaddr, BT_CONNECT2, iso_match_sync_handle, ev2); if (!sk) sk = iso_get_sock(hdev, &hdev->bdaddr, bdaddr, BT_LISTEN, iso_match_sync_handle, ev2); if (sk) { int err; struct hci_conn *hcon = iso_pi(sk)->conn->hcon; iso_pi(sk)->qos.bcast.encryption = ev2->encryption; if (ev2->num_bis < iso_pi(sk)->bc_num_bis) iso_pi(sk)->bc_num_bis = ev2->num_bis; if (!test_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags) && !test_and_set_bit(BT_SK_BIG_SYNC, &iso_pi(sk)->flags)) { err = hci_conn_big_create_sync(hdev, hcon, &iso_pi(sk)->qos, iso_pi(sk)->sync_handle, iso_pi(sk)->bc_num_bis, iso_pi(sk)->bc_bis); if (err) { bt_dev_err(hdev, "hci_le_big_create_sync: %d", err); sock_put(sk); sk = NULL; } } } goto done; } ev3 = hci_recv_event_data(hdev, HCI_EV_LE_PER_ADV_REPORT); if (ev3) { size_t base_len = 0; u8 *base; struct hci_conn *hcon; sk = iso_get_sock(hdev, &hdev->bdaddr, bdaddr, BT_LISTEN, iso_match_sync_handle_pa_report, ev3); if (!sk) goto done; hcon = iso_pi(sk)->conn->hcon; if (!hcon) goto done; if (ev3->data_status == LE_PA_DATA_TRUNCATED) { /* The controller was unable to retrieve PA data. */ memset(hcon->le_per_adv_data, 0, HCI_MAX_PER_AD_TOT_LEN); hcon->le_per_adv_data_len = 0; hcon->le_per_adv_data_offset = 0; goto done; } if (hcon->le_per_adv_data_offset + ev3->length > HCI_MAX_PER_AD_TOT_LEN) goto done; memcpy(hcon->le_per_adv_data + hcon->le_per_adv_data_offset, ev3->data, ev3->length); hcon->le_per_adv_data_offset += ev3->length; if (ev3->data_status == LE_PA_DATA_COMPLETE) { /* All PA data has been received. */ hcon->le_per_adv_data_len = hcon->le_per_adv_data_offset; hcon->le_per_adv_data_offset = 0; /* Extract BASE */ base = eir_get_service_data(hcon->le_per_adv_data, hcon->le_per_adv_data_len, EIR_BAA_SERVICE_UUID, &base_len); if (!base || base_len > BASE_MAX_LENGTH) goto done; memcpy(iso_pi(sk)->base, base, base_len); iso_pi(sk)->base_len = base_len; } else { /* This is a PA data fragment. Keep pa_data_len set to 0 * until all data has been reassembled. */ hcon->le_per_adv_data_len = 0; } } else { sk = iso_get_sock(hdev, &hdev->bdaddr, BDADDR_ANY, BT_LISTEN, iso_match_dst, BDADDR_ANY); } done: if (!sk) return 0; if (test_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags)) *flags |= HCI_PROTO_DEFER; sock_put(sk); return HCI_LM_ACCEPT; } static void iso_connect_cfm(struct hci_conn *hcon, __u8 status) { if (hcon->type != CIS_LINK && hcon->type != BIS_LINK && hcon->type != PA_LINK) { if (hcon->type != LE_LINK) return; /* Check if LE link has failed */ if (status) { struct hci_link *link, *t; list_for_each_entry_safe(link, t, &hcon->link_list, list) iso_conn_del(link->conn, bt_to_errno(status)); return; } /* Create CIS if pending */ hci_le_create_cis_pending(hcon->hdev); return; } BT_DBG("hcon %p bdaddr %pMR status %d", hcon, &hcon->dst, status); /* Similar to the success case, if HCI_CONN_BIG_SYNC_FAILED or * HCI_CONN_PA_SYNC_FAILED is set, queue the failed connection * into the accept queue of the listening socket and wake up * userspace, to inform the user about the event. */ if (!status || test_bit(HCI_CONN_BIG_SYNC_FAILED, &hcon->flags) || test_bit(HCI_CONN_PA_SYNC_FAILED, &hcon->flags)) { struct iso_conn *conn; conn = iso_conn_add(hcon); if (conn) iso_conn_ready(conn); } else { iso_conn_del(hcon, bt_to_errno(status)); } } static void iso_disconn_cfm(struct hci_conn *hcon, __u8 reason) { if (hcon->type != CIS_LINK && hcon->type != BIS_LINK && hcon->type != PA_LINK) return; BT_DBG("hcon %p reason %d", hcon, reason); iso_conn_del(hcon, bt_to_errno(reason)); } int iso_recv(struct hci_dev *hdev, u16 handle, struct sk_buff *skb, u16 flags) { struct hci_conn *hcon; struct iso_conn *conn; struct skb_shared_hwtstamps *hwts; __u16 pb, ts, len, sn; hci_dev_lock(hdev); hcon = hci_conn_hash_lookup_handle(hdev, handle); if (!hcon) { hci_dev_unlock(hdev); kfree_skb(skb); return -ENOENT; } conn = iso_conn_hold_unless_zero(hcon->iso_data); hcon = NULL; hci_dev_unlock(hdev); if (!conn) { kfree_skb(skb); return -EINVAL; } pb = hci_iso_flags_pb(flags); ts = hci_iso_flags_ts(flags); BT_DBG("conn %p len %d pb 0x%x ts 0x%x", conn, skb->len, pb, ts); switch (pb) { case ISO_START: case ISO_SINGLE: if (conn->rx_len) { BT_ERR("Unexpected start frame (len %d)", skb->len); kfree_skb(conn->rx_skb); conn->rx_skb = NULL; conn->rx_len = 0; } if (ts) { struct hci_iso_ts_data_hdr *hdr; hdr = skb_pull_data(skb, HCI_ISO_TS_DATA_HDR_SIZE); if (!hdr) { BT_ERR("Frame is too short (len %d)", skb->len); goto drop; } /* Record the timestamp to skb */ hwts = skb_hwtstamps(skb); hwts->hwtstamp = us_to_ktime(le32_to_cpu(hdr->ts)); sn = __le16_to_cpu(hdr->sn); len = __le16_to_cpu(hdr->slen); } else { struct hci_iso_data_hdr *hdr; hdr = skb_pull_data(skb, HCI_ISO_DATA_HDR_SIZE); if (!hdr) { BT_ERR("Frame is too short (len %d)", skb->len); goto drop; } sn = __le16_to_cpu(hdr->sn); len = __le16_to_cpu(hdr->slen); } flags = hci_iso_data_flags(len); len = hci_iso_data_len(len); BT_DBG("Start: total len %d, frag len %d flags 0x%4.4x sn %d", len, skb->len, flags, sn); if (len == skb->len) { /* Complete frame received */ hci_skb_pkt_status(skb) = flags & 0x03; hci_skb_pkt_seqnum(skb) = sn; iso_recv_frame(conn, skb); goto done; } if (pb == ISO_SINGLE) { BT_ERR("Frame malformed (len %d, expected len %d)", skb->len, len); goto drop; } if (skb->len > len) { BT_ERR("Frame is too long (len %d, expected len %d)", skb->len, len); goto drop; } /* Allocate skb for the complete frame (with header) */ conn->rx_skb = bt_skb_alloc(len, GFP_KERNEL); if (!conn->rx_skb) goto drop; hci_skb_pkt_status(conn->rx_skb) = flags & 0x03; hci_skb_pkt_seqnum(conn->rx_skb) = sn; skb_copy_from_linear_data(skb, skb_put(conn->rx_skb, skb->len), skb->len); conn->rx_len = len - skb->len; /* Copy hw timestamp from skb to rx_skb if present */ if (ts) { hwts = skb_hwtstamps(conn->rx_skb); hwts->hwtstamp = skb_hwtstamps(skb)->hwtstamp; } break; case ISO_CONT: BT_DBG("Cont: frag len %d (expecting %d)", skb->len, conn->rx_len); if (!conn->rx_len) { BT_ERR("Unexpected continuation frame (len %d)", skb->len); goto drop; } if (skb->len > conn->rx_len) { BT_ERR("Fragment is too long (len %d, expected %d)", skb->len, conn->rx_len); kfree_skb(conn->rx_skb); conn->rx_skb = NULL; conn->rx_len = 0; goto drop; } skb_copy_from_linear_data(skb, skb_put(conn->rx_skb, skb->len), skb->len); conn->rx_len -= skb->len; break; case ISO_END: skb_copy_from_linear_data(skb, skb_put(conn->rx_skb, skb->len), skb->len); conn->rx_len -= skb->len; if (!conn->rx_len) { struct sk_buff *rx_skb = conn->rx_skb; /* Complete frame received. iso_recv_frame * takes ownership of the skb so set the global * rx_skb pointer to NULL first. */ conn->rx_skb = NULL; iso_recv_frame(conn, rx_skb); } break; } drop: kfree_skb(skb); done: iso_conn_put(conn); return 0; } static struct hci_cb iso_cb = { .name = "ISO", .connect_cfm = iso_connect_cfm, .disconn_cfm = iso_disconn_cfm, }; static int iso_debugfs_show(struct seq_file *f, void *p) { struct sock *sk; read_lock(&iso_sk_list.lock); sk_for_each(sk, &iso_sk_list.head) { seq_printf(f, "%pMR %pMR %d\n", &iso_pi(sk)->src, &iso_pi(sk)->dst, sk->sk_state); } read_unlock(&iso_sk_list.lock); return 0; } DEFINE_SHOW_ATTRIBUTE(iso_debugfs); static struct dentry *iso_debugfs; static const struct proto_ops iso_sock_ops = { .family = PF_BLUETOOTH, .owner = THIS_MODULE, .release = iso_sock_release, .bind = iso_sock_bind, .connect = iso_sock_connect, .listen = iso_sock_listen, .accept = iso_sock_accept, .getname = iso_sock_getname, .sendmsg = iso_sock_sendmsg, .recvmsg = iso_sock_recvmsg, .poll = bt_sock_poll, .ioctl = bt_sock_ioctl, .mmap = sock_no_mmap, .socketpair = sock_no_socketpair, .shutdown = iso_sock_shutdown, .setsockopt = iso_sock_setsockopt, .getsockopt = iso_sock_getsockopt }; static const struct net_proto_family iso_sock_family_ops = { .family = PF_BLUETOOTH, .owner = THIS_MODULE, .create = iso_sock_create, }; static bool inited; bool iso_inited(void) { return inited; } int iso_init(void) { int err; BUILD_BUG_ON(sizeof(struct sockaddr_iso) > sizeof(struct sockaddr)); if (inited) return -EALREADY; err = proto_register(&iso_proto, 0); if (err < 0) return err; err = bt_sock_register(BTPROTO_ISO, &iso_sock_family_ops); if (err < 0) { BT_ERR("ISO socket registration failed"); goto error; } err = bt_procfs_init(&init_net, "iso", &iso_sk_list, NULL); if (err < 0) { BT_ERR("Failed to create ISO proc file"); bt_sock_unregister(BTPROTO_ISO); goto error; } BT_INFO("ISO socket layer initialized"); hci_register_cb(&iso_cb); if (!IS_ERR_OR_NULL(bt_debugfs)) iso_debugfs = debugfs_create_file("iso", 0444, bt_debugfs, NULL, &iso_debugfs_fops); inited = true; return 0; error: proto_unregister(&iso_proto); return err; } int iso_exit(void) { if (!inited) return -EALREADY; bt_procfs_cleanup(&init_net, "iso"); debugfs_remove(iso_debugfs); iso_debugfs = NULL; hci_unregister_cb(&iso_cb); bt_sock_unregister(BTPROTO_ISO); proto_unregister(&iso_proto); inited = false; return 0; }
1388 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_PGTABLE_DEFS_H #define _ASM_X86_PGTABLE_DEFS_H #include <linux/const.h> #include <linux/mem_encrypt.h> #include <asm/page_types.h> #define _PAGE_BIT_PRESENT 0 /* is present */ #define _PAGE_BIT_RW 1 /* writeable */ #define _PAGE_BIT_USER 2 /* userspace addressable */ #define _PAGE_BIT_PWT 3 /* page write through */ #define _PAGE_BIT_PCD 4 /* page cache disabled */ #define _PAGE_BIT_ACCESSED 5 /* was accessed (raised by CPU) */ #define _PAGE_BIT_DIRTY 6 /* was written to (raised by CPU) */ #define _PAGE_BIT_PSE 7 /* 4 MB (or 2MB) page */ #define _PAGE_BIT_PAT 7 /* on 4KB pages */ #define _PAGE_BIT_GLOBAL 8 /* Global TLB entry PPro+ */ #define _PAGE_BIT_SOFTW1 9 /* available for programmer */ #define _PAGE_BIT_SOFTW2 10 /* " */ #define _PAGE_BIT_SOFTW3 11 /* " */ #define _PAGE_BIT_PAT_LARGE 12 /* On 2MB or 1GB pages */ #define _PAGE_BIT_SOFTW4 57 /* available for programmer */ #define _PAGE_BIT_SOFTW5 58 /* available for programmer */ #define _PAGE_BIT_PKEY_BIT0 59 /* Protection Keys, bit 1/4 */ #define _PAGE_BIT_PKEY_BIT1 60 /* Protection Keys, bit 2/4 */ #define _PAGE_BIT_PKEY_BIT2 61 /* Protection Keys, bit 3/4 */ #define _PAGE_BIT_PKEY_BIT3 62 /* Protection Keys, bit 4/4 */ #define _PAGE_BIT_NX 63 /* No execute: only valid after cpuid check */ #define _PAGE_BIT_SPECIAL _PAGE_BIT_SOFTW1 #define _PAGE_BIT_CPA_TEST _PAGE_BIT_SOFTW1 #define _PAGE_BIT_UFFD_WP _PAGE_BIT_SOFTW2 /* userfaultfd wrprotected */ #define _PAGE_BIT_SOFT_DIRTY _PAGE_BIT_SOFTW3 /* software dirty tracking */ #define _PAGE_BIT_KERNEL_4K _PAGE_BIT_SOFTW3 /* page must not be converted to large */ #ifdef CONFIG_X86_64 #define _PAGE_BIT_SAVED_DIRTY _PAGE_BIT_SOFTW5 /* Saved Dirty bit (leaf) */ #define _PAGE_BIT_NOPTISHADOW _PAGE_BIT_SOFTW5 /* No PTI shadow (root PGD) */ #else /* Shared with _PAGE_BIT_UFFD_WP which is not supported on 32 bit */ #define _PAGE_BIT_SAVED_DIRTY _PAGE_BIT_SOFTW2 /* Saved Dirty bit (leaf) */ #define _PAGE_BIT_NOPTISHADOW _PAGE_BIT_SOFTW2 /* No PTI shadow (root PGD) */ #endif /* If _PAGE_BIT_PRESENT is clear, we use these: */ /* - if the user mapped it with PROT_NONE; pte_present gives true */ #define _PAGE_BIT_PROTNONE _PAGE_BIT_GLOBAL #define _PAGE_PRESENT (_AT(pteval_t, 1) << _PAGE_BIT_PRESENT) #define _PAGE_RW (_AT(pteval_t, 1) << _PAGE_BIT_RW) #define _PAGE_USER (_AT(pteval_t, 1) << _PAGE_BIT_USER) #define _PAGE_PWT (_AT(pteval_t, 1) << _PAGE_BIT_PWT) #define _PAGE_PCD (_AT(pteval_t, 1) << _PAGE_BIT_PCD) #define _PAGE_ACCESSED (_AT(pteval_t, 1) << _PAGE_BIT_ACCESSED) #define _PAGE_DIRTY (_AT(pteval_t, 1) << _PAGE_BIT_DIRTY) #define _PAGE_PSE (_AT(pteval_t, 1) << _PAGE_BIT_PSE) #define _PAGE_GLOBAL (_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL) #define _PAGE_SOFTW1 (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW1) #define _PAGE_SOFTW2 (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW2) #define _PAGE_SOFTW3 (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW3) #define _PAGE_PAT (_AT(pteval_t, 1) << _PAGE_BIT_PAT) #define _PAGE_PAT_LARGE (_AT(pteval_t, 1) << _PAGE_BIT_PAT_LARGE) #define _PAGE_SPECIAL (_AT(pteval_t, 1) << _PAGE_BIT_SPECIAL) #define _PAGE_CPA_TEST (_AT(pteval_t, 1) << _PAGE_BIT_CPA_TEST) #define _PAGE_KERNEL_4K (_AT(pteval_t, 1) << _PAGE_BIT_KERNEL_4K) #ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS #define _PAGE_PKEY_BIT0 (_AT(pteval_t, 1) << _PAGE_BIT_PKEY_BIT0) #define _PAGE_PKEY_BIT1 (_AT(pteval_t, 1) << _PAGE_BIT_PKEY_BIT1) #define _PAGE_PKEY_BIT2 (_AT(pteval_t, 1) << _PAGE_BIT_PKEY_BIT2) #define _PAGE_PKEY_BIT3 (_AT(pteval_t, 1) << _PAGE_BIT_PKEY_BIT3) #else #define _PAGE_PKEY_BIT0 (_AT(pteval_t, 0)) #define _PAGE_PKEY_BIT1 (_AT(pteval_t, 0)) #define _PAGE_PKEY_BIT2 (_AT(pteval_t, 0)) #define _PAGE_PKEY_BIT3 (_AT(pteval_t, 0)) #endif #define _PAGE_PKEY_MASK (_PAGE_PKEY_BIT0 | \ _PAGE_PKEY_BIT1 | \ _PAGE_PKEY_BIT2 | \ _PAGE_PKEY_BIT3) #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) #define _PAGE_KNL_ERRATUM_MASK (_PAGE_DIRTY | _PAGE_ACCESSED) #else #define _PAGE_KNL_ERRATUM_MASK 0 #endif #ifdef CONFIG_MEM_SOFT_DIRTY #define _PAGE_SOFT_DIRTY (_AT(pteval_t, 1) << _PAGE_BIT_SOFT_DIRTY) #else #define _PAGE_SOFT_DIRTY (_AT(pteval_t, 0)) #endif /* * Tracking soft dirty bit when a page goes to a swap is tricky. * We need a bit which can be stored in pte _and_ not conflict * with swap entry format. On x86 bits 1-4 are *not* involved * into swap entry computation, but bit 7 is used for thp migration, * so we borrow bit 1 for soft dirty tracking. * * Please note that this bit must be treated as swap dirty page * mark if and only if the PTE/PMD has present bit clear! */ #ifdef CONFIG_MEM_SOFT_DIRTY #define _PAGE_SWP_SOFT_DIRTY _PAGE_RW #else #define _PAGE_SWP_SOFT_DIRTY (_AT(pteval_t, 0)) #endif #ifdef CONFIG_HAVE_ARCH_USERFAULTFD_WP #define _PAGE_UFFD_WP (_AT(pteval_t, 1) << _PAGE_BIT_UFFD_WP) #define _PAGE_SWP_UFFD_WP _PAGE_USER #else #define _PAGE_UFFD_WP (_AT(pteval_t, 0)) #define _PAGE_SWP_UFFD_WP (_AT(pteval_t, 0)) #endif #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) #define _PAGE_NX (_AT(pteval_t, 1) << _PAGE_BIT_NX) #define _PAGE_SOFTW4 (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW4) #else #define _PAGE_NX (_AT(pteval_t, 0)) #define _PAGE_SOFTW4 (_AT(pteval_t, 0)) #endif /* * The hardware requires shadow stack to be Write=0,Dirty=1. However, * there are valid cases where the kernel might create read-only PTEs that * are dirty (e.g., fork(), mprotect(), uffd-wp(), soft-dirty tracking). In * this case, the _PAGE_SAVED_DIRTY bit is used instead of the HW-dirty bit, * to avoid creating a wrong "shadow stack" PTEs. Such PTEs have * (Write=0,SavedDirty=1,Dirty=0) set. */ #define _PAGE_SAVED_DIRTY (_AT(pteval_t, 1) << _PAGE_BIT_SAVED_DIRTY) #define _PAGE_DIRTY_BITS (_PAGE_DIRTY | _PAGE_SAVED_DIRTY) #define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE) #define _PAGE_NOPTISHADOW (_AT(pteval_t, 1) << _PAGE_BIT_NOPTISHADOW) /* * Set of bits not changed in pte_modify. The pte's * protection key is treated like _PAGE_RW, for * instance, and is *not* included in this mask since * pte_modify() does modify it. */ #define _COMMON_PAGE_CHG_MASK (PTE_PFN_MASK | _PAGE_PCD | _PAGE_PWT | \ _PAGE_SPECIAL | _PAGE_ACCESSED | \ _PAGE_DIRTY_BITS | _PAGE_SOFT_DIRTY | \ _PAGE_CC | _PAGE_UFFD_WP) #define _PAGE_CHG_MASK (_COMMON_PAGE_CHG_MASK | _PAGE_PAT) #define _HPAGE_CHG_MASK (_COMMON_PAGE_CHG_MASK | _PAGE_PSE | _PAGE_PAT_LARGE) /* * The cache modes defined here are used to translate between pure SW usage * and the HW defined cache mode bits and/or PAT entries. * * The resulting bits for PWT, PCD and PAT should be chosen in a way * to have the WB mode at index 0 (all bits clear). This is the default * right now and likely would break too much if changed. */ #ifndef __ASSEMBLER__ enum page_cache_mode { _PAGE_CACHE_MODE_WB = 0, _PAGE_CACHE_MODE_WC = 1, _PAGE_CACHE_MODE_UC_MINUS = 2, _PAGE_CACHE_MODE_UC = 3, _PAGE_CACHE_MODE_WT = 4, _PAGE_CACHE_MODE_WP = 5, _PAGE_CACHE_MODE_NUM = 8 }; #endif #define _PAGE_CC (_AT(pteval_t, cc_get_mask())) #define _PAGE_ENC (_AT(pteval_t, sme_me_mask)) #define _PAGE_CACHE_MASK (_PAGE_PWT | _PAGE_PCD | _PAGE_PAT) #define _PAGE_LARGE_CACHE_MASK (_PAGE_PWT | _PAGE_PCD | _PAGE_PAT_LARGE) #define _PAGE_NOCACHE (cachemode2protval(_PAGE_CACHE_MODE_UC)) #define _PAGE_CACHE_WP (cachemode2protval(_PAGE_CACHE_MODE_WP)) #define __PP _PAGE_PRESENT #define __RW _PAGE_RW #define _USR _PAGE_USER #define ___A _PAGE_ACCESSED #define ___D _PAGE_DIRTY #define ___G _PAGE_GLOBAL #define __NX _PAGE_NX #define _ENC _PAGE_ENC #define __WP _PAGE_CACHE_WP #define __NC _PAGE_NOCACHE #define _PSE _PAGE_PSE #define pgprot_val(x) ((x).pgprot) #define __pgprot(x) ((pgprot_t) { (x) } ) #define __pg(x) __pgprot(x) #define PAGE_NONE __pg( 0| 0| 0|___A| 0| 0| 0|___G) #define PAGE_SHARED __pg(__PP|__RW|_USR|___A|__NX| 0| 0| 0) #define PAGE_SHARED_EXEC __pg(__PP|__RW|_USR|___A| 0| 0| 0| 0) #define PAGE_COPY_NOEXEC __pg(__PP| 0|_USR|___A|__NX| 0| 0| 0) #define PAGE_COPY_EXEC __pg(__PP| 0|_USR|___A| 0| 0| 0| 0) #define PAGE_COPY __pg(__PP| 0|_USR|___A|__NX| 0| 0| 0) #define PAGE_READONLY __pg(__PP| 0|_USR|___A|__NX| 0| 0| 0) #define PAGE_READONLY_EXEC __pg(__PP| 0|_USR|___A| 0| 0| 0| 0) /* * Page tables needs to have Write=1 in order for any lower PTEs to be * writable. This includes shadow stack memory (Write=0, Dirty=1) */ #define _KERNPG_TABLE_NOENC (__PP|__RW| 0|___A| 0|___D| 0| 0) #define _KERNPG_TABLE (__PP|__RW| 0|___A| 0|___D| 0| 0| _ENC) #define _PAGE_TABLE_NOENC (__PP|__RW|_USR|___A| 0|___D| 0| 0) #define _PAGE_TABLE (__PP|__RW|_USR|___A| 0|___D| 0| 0| _ENC) #define __PAGE_KERNEL_RO (__PP| 0| 0|___A|__NX| 0| 0|___G) #define __PAGE_KERNEL_ROX (__PP| 0| 0|___A| 0| 0| 0|___G) #define __PAGE_KERNEL (__PP|__RW| 0|___A|__NX|___D| 0|___G) #define __PAGE_KERNEL_EXEC (__PP|__RW| 0|___A| 0|___D| 0|___G) #define __PAGE_KERNEL_NOCACHE (__PP|__RW| 0|___A|__NX|___D| 0|___G| __NC) #define __PAGE_KERNEL_VVAR (__PP| 0|_USR|___A|__NX| 0| 0|___G) #define __PAGE_KERNEL_LARGE (__PP|__RW| 0|___A|__NX|___D|_PSE|___G) #define __PAGE_KERNEL_LARGE_EXEC (__PP|__RW| 0|___A| 0|___D|_PSE|___G) #define __PAGE_KERNEL_WP (__PP|__RW| 0|___A|__NX|___D| 0|___G| __WP) #define __PAGE_KERNEL_IO __PAGE_KERNEL #define __PAGE_KERNEL_IO_NOCACHE __PAGE_KERNEL_NOCACHE #ifndef __ASSEMBLER__ #define __PAGE_KERNEL_ENC (__PAGE_KERNEL | _ENC) #define __PAGE_KERNEL_ENC_WP (__PAGE_KERNEL_WP | _ENC) #define __PAGE_KERNEL_NOENC (__PAGE_KERNEL | 0) #define __PAGE_KERNEL_NOENC_WP (__PAGE_KERNEL_WP | 0) #define __pgprot_mask(x) __pgprot((x) & __default_kernel_pte_mask) #define PAGE_KERNEL __pgprot_mask(__PAGE_KERNEL | _ENC) #define PAGE_KERNEL_NOENC __pgprot_mask(__PAGE_KERNEL | 0) #define PAGE_KERNEL_RO __pgprot_mask(__PAGE_KERNEL_RO | _ENC) #define PAGE_KERNEL_EXEC __pgprot_mask(__PAGE_KERNEL_EXEC | _ENC) #define PAGE_KERNEL_EXEC_NOENC __pgprot_mask(__PAGE_KERNEL_EXEC | 0) #define PAGE_KERNEL_ROX __pgprot_mask(__PAGE_KERNEL_ROX | _ENC) #define PAGE_KERNEL_NOCACHE __pgprot_mask(__PAGE_KERNEL_NOCACHE | _ENC) #define PAGE_KERNEL_LARGE __pgprot_mask(__PAGE_KERNEL_LARGE | _ENC) #define PAGE_KERNEL_LARGE_EXEC __pgprot_mask(__PAGE_KERNEL_LARGE_EXEC | _ENC) #define PAGE_KERNEL_VVAR __pgprot_mask(__PAGE_KERNEL_VVAR | _ENC) #define PAGE_KERNEL_IO __pgprot_mask(__PAGE_KERNEL_IO) #define PAGE_KERNEL_IO_NOCACHE __pgprot_mask(__PAGE_KERNEL_IO_NOCACHE) #endif /* __ASSEMBLER__ */ /* * early identity mapping pte attrib macros. */ #ifdef CONFIG_X86_64 #define __PAGE_KERNEL_IDENT_LARGE_EXEC __PAGE_KERNEL_LARGE_EXEC #else #define PTE_IDENT_ATTR 0x003 /* PRESENT+RW */ #define PDE_IDENT_ATTR 0x063 /* PRESENT+RW+DIRTY+ACCESSED */ #define PGD_IDENT_ATTR 0x001 /* PRESENT (no other attributes) */ #endif #ifdef CONFIG_X86_32 # include <asm/pgtable_32_types.h> #else # include <asm/pgtable_64_types.h> #endif #ifndef __ASSEMBLER__ #include <linux/types.h> /* Extracts the PFN from a (pte|pmd|pud|pgd)val_t of a 4KB page */ #define PTE_PFN_MASK ((pteval_t)PHYSICAL_PAGE_MASK) /* * Extracts the flags from a (pte|pmd|pud|pgd)val_t * This includes the protection key value. */ #define PTE_FLAGS_MASK (~PTE_PFN_MASK) typedef struct pgprot { pgprotval_t pgprot; } pgprot_t; typedef struct { pgdval_t pgd; } pgd_t; static inline pgprot_t pgprot_nx(pgprot_t prot) { return __pgprot(pgprot_val(prot) | _PAGE_NX); } #define pgprot_nx pgprot_nx #ifdef CONFIG_X86_PAE /* * PHYSICAL_PAGE_MASK might be non-constant when SME is compiled in, so we can't * use it here. */ #define PGD_PAE_PAGE_MASK ((signed long)PAGE_MASK) #define PGD_PAE_PHYS_MASK (((1ULL << __PHYSICAL_MASK_SHIFT)-1) & PGD_PAE_PAGE_MASK) /* * PAE allows Base Address, P, PWT, PCD and AVL bits to be set in PGD entries. * All other bits are Reserved MBZ */ #define PGD_ALLOWED_BITS (PGD_PAE_PHYS_MASK | _PAGE_PRESENT | \ _PAGE_PWT | _PAGE_PCD | \ _PAGE_SOFTW1 | _PAGE_SOFTW2 | _PAGE_SOFTW3) #else /* No need to mask any bits for !PAE */ #define PGD_ALLOWED_BITS (~0ULL) #endif static inline pgd_t native_make_pgd(pgdval_t val) { return (pgd_t) { val & PGD_ALLOWED_BITS }; } static inline pgdval_t native_pgd_val(pgd_t pgd) { return pgd.pgd & PGD_ALLOWED_BITS; } static inline pgdval_t pgd_flags(pgd_t pgd) { return native_pgd_val(pgd) & PTE_FLAGS_MASK; } #if CONFIG_PGTABLE_LEVELS > 4 typedef struct { p4dval_t p4d; } p4d_t; static inline p4d_t native_make_p4d(pudval_t val) { return (p4d_t) { val }; } static inline p4dval_t native_p4d_val(p4d_t p4d) { return p4d.p4d; } #else #include <asm-generic/pgtable-nop4d.h> static inline p4d_t native_make_p4d(pudval_t val) { return (p4d_t) { .pgd = native_make_pgd((pgdval_t)val) }; } static inline p4dval_t native_p4d_val(p4d_t p4d) { return native_pgd_val(p4d.pgd); } #endif #if CONFIG_PGTABLE_LEVELS > 3 typedef struct { pudval_t pud; } pud_t; static inline pud_t native_make_pud(pmdval_t val) { return (pud_t) { val }; } static inline pudval_t native_pud_val(pud_t pud) { return pud.pud; } #else #include <asm-generic/pgtable-nopud.h> static inline pud_t native_make_pud(pudval_t val) { return (pud_t) { .p4d.pgd = native_make_pgd(val) }; } static inline pudval_t native_pud_val(pud_t pud) { return native_pgd_val(pud.p4d.pgd); } #endif #if CONFIG_PGTABLE_LEVELS > 2 static inline pmd_t native_make_pmd(pmdval_t val) { return (pmd_t) { .pmd = val }; } static inline pmdval_t native_pmd_val(pmd_t pmd) { return pmd.pmd; } #else #include <asm-generic/pgtable-nopmd.h> static inline pmd_t native_make_pmd(pmdval_t val) { return (pmd_t) { .pud.p4d.pgd = native_make_pgd(val) }; } static inline pmdval_t native_pmd_val(pmd_t pmd) { return native_pgd_val(pmd.pud.p4d.pgd); } #endif static inline p4dval_t p4d_pfn_mask(p4d_t p4d) { /* No 512 GiB huge pages yet */ return PTE_PFN_MASK; } static inline p4dval_t p4d_flags_mask(p4d_t p4d) { return ~p4d_pfn_mask(p4d); } static inline p4dval_t p4d_flags(p4d_t p4d) { return native_p4d_val(p4d) & p4d_flags_mask(p4d); } static inline pudval_t pud_pfn_mask(pud_t pud) { if (native_pud_val(pud) & _PAGE_PSE) return PHYSICAL_PUD_PAGE_MASK; else return PTE_PFN_MASK; } static inline pudval_t pud_flags_mask(pud_t pud) { return ~pud_pfn_mask(pud); } static inline pudval_t pud_flags(pud_t pud) { return native_pud_val(pud) & pud_flags_mask(pud); } static inline pmdval_t pmd_pfn_mask(pmd_t pmd) { if (native_pmd_val(pmd) & _PAGE_PSE) return PHYSICAL_PMD_PAGE_MASK; else return PTE_PFN_MASK; } static inline pmdval_t pmd_flags_mask(pmd_t pmd) { return ~pmd_pfn_mask(pmd); } static inline pmdval_t pmd_flags(pmd_t pmd) { return native_pmd_val(pmd) & pmd_flags_mask(pmd); } static inline pte_t native_make_pte(pteval_t val) { return (pte_t) { .pte = val }; } static inline pteval_t native_pte_val(pte_t pte) { return pte.pte; } static inline pteval_t pte_flags(pte_t pte) { return native_pte_val(pte) & PTE_FLAGS_MASK; } #define __pte2cm_idx(cb) \ ((((cb) >> (_PAGE_BIT_PAT - 2)) & 4) | \ (((cb) >> (_PAGE_BIT_PCD - 1)) & 2) | \ (((cb) >> _PAGE_BIT_PWT) & 1)) #define __cm_idx2pte(i) \ ((((i) & 4) << (_PAGE_BIT_PAT - 2)) | \ (((i) & 2) << (_PAGE_BIT_PCD - 1)) | \ (((i) & 1) << _PAGE_BIT_PWT)) unsigned long cachemode2protval(enum page_cache_mode pcm); static inline pgprotval_t protval_4k_2_large(pgprotval_t val) { return (val & ~(_PAGE_PAT | _PAGE_PAT_LARGE)) | ((val & _PAGE_PAT) << (_PAGE_BIT_PAT_LARGE - _PAGE_BIT_PAT)); } static inline pgprot_t pgprot_4k_2_large(pgprot_t pgprot) { return __pgprot(protval_4k_2_large(pgprot_val(pgprot))); } static inline pgprotval_t protval_large_2_4k(pgprotval_t val) { return (val & ~(_PAGE_PAT | _PAGE_PAT_LARGE)) | ((val & _PAGE_PAT_LARGE) >> (_PAGE_BIT_PAT_LARGE - _PAGE_BIT_PAT)); } static inline pgprot_t pgprot_large_2_4k(pgprot_t pgprot) { return __pgprot(protval_large_2_4k(pgprot_val(pgprot))); } typedef struct page *pgtable_t; extern pteval_t __supported_pte_mask; extern pteval_t __default_kernel_pte_mask; #define pgprot_writecombine pgprot_writecombine extern pgprot_t pgprot_writecombine(pgprot_t prot); #define pgprot_writethrough pgprot_writethrough extern pgprot_t pgprot_writethrough(pgprot_t prot); /* Indicate that x86 has its own track and untrack pfn vma functions */ #define __HAVE_PFNMAP_TRACKING #define __HAVE_PHYS_MEM_ACCESS_PROT struct file; pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, unsigned long size, pgprot_t vma_prot); /* Install a pte for a particular vaddr in kernel space. */ void set_pte_vaddr(unsigned long vaddr, pte_t pte); #ifdef CONFIG_X86_32 extern void native_pagetable_init(void); #else #define native_pagetable_init paging_init #endif enum pg_level { PG_LEVEL_NONE, PG_LEVEL_4K, PG_LEVEL_2M, PG_LEVEL_1G, PG_LEVEL_512G, PG_LEVEL_256T, PG_LEVEL_NUM }; #ifdef CONFIG_PROC_FS extern void update_page_count(int level, unsigned long pages); #else static inline void update_page_count(int level, unsigned long pages) { } #endif /* * Helper function that returns the kernel pagetable entry controlling * the virtual address 'address'. NULL means no pagetable entry present. * NOTE: the return type is pte_t but if the pmd is PSE then we return it * as a pte too. */ extern pte_t *lookup_address(unsigned long address, unsigned int *level); extern pte_t *lookup_address_in_pgd(pgd_t *pgd, unsigned long address, unsigned int *level); pte_t *lookup_address_in_pgd_attr(pgd_t *pgd, unsigned long address, unsigned int *level, bool *nx, bool *rw); extern pmd_t *lookup_pmd_address(unsigned long address); extern phys_addr_t slow_virt_to_phys(void *__address); extern int __init kernel_map_pages_in_pgd(pgd_t *pgd, u64 pfn, unsigned long address, unsigned numpages, unsigned long page_flags); extern int __init kernel_unmap_pages_in_pgd(pgd_t *pgd, unsigned long address, unsigned long numpages); #endif /* !__ASSEMBLER__ */ #endif /* _ASM_X86_PGTABLE_DEFS_H */
223 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * acpi.h - ACPI Interface * * Copyright (C) 2001 Paul Diefenbaugh <paul.s.diefenbaugh@intel.com> */ #ifndef _LINUX_ACPI_H #define _LINUX_ACPI_H #include <linux/cleanup.h> #include <linux/errno.h> #include <linux/ioport.h> /* for struct resource */ #include <linux/resource_ext.h> #include <linux/device.h> #include <linux/mod_devicetable.h> #include <linux/property.h> #include <linux/uuid.h> #include <linux/node.h> struct irq_domain; struct irq_domain_ops; #ifndef _LINUX #define _LINUX #endif #include <acpi/acpi.h> #include <acpi/acpi_numa.h> #ifdef CONFIG_ACPI #include <linux/list.h> #include <linux/dynamic_debug.h> #include <linux/module.h> #include <linux/mutex.h> #include <linux/fw_table.h> #include <acpi/acpi_bus.h> #include <acpi/acpi_drivers.h> #include <acpi/acpi_io.h> #include <asm/acpi.h> #ifdef CONFIG_ACPI_TABLE_LIB #define EXPORT_SYMBOL_ACPI_LIB(x) EXPORT_SYMBOL_NS_GPL(x, "ACPI") #define __init_or_acpilib #define __initdata_or_acpilib #else #define EXPORT_SYMBOL_ACPI_LIB(x) #define __init_or_acpilib __init #define __initdata_or_acpilib __initdata #endif static inline acpi_handle acpi_device_handle(struct acpi_device *adev) { return adev ? adev->handle : NULL; } #define ACPI_COMPANION(dev) to_acpi_device_node((dev)->fwnode) #define ACPI_COMPANION_SET(dev, adev) set_primary_fwnode(dev, (adev) ? \ acpi_fwnode_handle(adev) : NULL) #define ACPI_HANDLE(dev) acpi_device_handle(ACPI_COMPANION(dev)) #define ACPI_HANDLE_FWNODE(fwnode) \ acpi_device_handle(to_acpi_device_node(fwnode)) static inline struct fwnode_handle *acpi_alloc_fwnode_static(void) { struct fwnode_handle *fwnode; fwnode = kzalloc_obj(struct fwnode_handle); if (!fwnode) return NULL; fwnode_init(fwnode, &acpi_static_fwnode_ops); return fwnode; } static inline void acpi_free_fwnode_static(struct fwnode_handle *fwnode) { if (WARN_ON(!is_acpi_static_node(fwnode))) return; kfree(fwnode); } static inline bool has_acpi_companion(struct device *dev) { return is_acpi_device_node(dev->fwnode); } static inline void acpi_preset_companion(struct device *dev, struct acpi_device *parent, u64 addr) { ACPI_COMPANION_SET(dev, acpi_find_child_device(parent, addr, false)); } static inline const char *acpi_dev_name(struct acpi_device *adev) { return dev_name(&adev->dev); } struct device *acpi_get_first_physical_node(struct acpi_device *adev); enum acpi_irq_model_id { ACPI_IRQ_MODEL_PIC = 0, ACPI_IRQ_MODEL_IOAPIC, ACPI_IRQ_MODEL_IOSAPIC, ACPI_IRQ_MODEL_PLATFORM, ACPI_IRQ_MODEL_GIC, ACPI_IRQ_MODEL_GIC_V5, ACPI_IRQ_MODEL_LPIC, ACPI_IRQ_MODEL_RINTC, ACPI_IRQ_MODEL_COUNT }; extern enum acpi_irq_model_id acpi_irq_model; enum acpi_interrupt_id { ACPI_INTERRUPT_PMI = 1, ACPI_INTERRUPT_INIT, ACPI_INTERRUPT_CPEI, ACPI_INTERRUPT_COUNT }; #define ACPI_SPACE_MEM 0 enum acpi_address_range_id { ACPI_ADDRESS_RANGE_MEMORY = 1, ACPI_ADDRESS_RANGE_RESERVED = 2, ACPI_ADDRESS_RANGE_ACPI = 3, ACPI_ADDRESS_RANGE_NVS = 4, ACPI_ADDRESS_RANGE_COUNT }; /* Table Handlers */ typedef int (*acpi_tbl_table_handler)(struct acpi_table_header *table); /* Debugger support */ struct acpi_debugger_ops { int (*create_thread)(acpi_osd_exec_callback function, void *context); ssize_t (*write_log)(const char *msg); ssize_t (*read_cmd)(char *buffer, size_t length); int (*wait_command_ready)(bool single_step, char *buffer, size_t length); int (*notify_command_complete)(void); }; struct acpi_debugger { const struct acpi_debugger_ops *ops; struct module *owner; struct mutex lock; }; #ifdef CONFIG_ACPI_DEBUGGER int __init acpi_debugger_init(void); int acpi_register_debugger(struct module *owner, const struct acpi_debugger_ops *ops); void acpi_unregister_debugger(const struct acpi_debugger_ops *ops); int acpi_debugger_create_thread(acpi_osd_exec_callback function, void *context); ssize_t acpi_debugger_write_log(const char *msg); ssize_t acpi_debugger_read_cmd(char *buffer, size_t buffer_length); int acpi_debugger_wait_command_ready(void); int acpi_debugger_notify_command_complete(void); #else static inline int acpi_debugger_init(void) { return -ENODEV; } static inline int acpi_register_debugger(struct module *owner, const struct acpi_debugger_ops *ops) { return -ENODEV; } static inline void acpi_unregister_debugger(const struct acpi_debugger_ops *ops) { } static inline int acpi_debugger_create_thread(acpi_osd_exec_callback function, void *context) { return -ENODEV; } static inline int acpi_debugger_write_log(const char *msg) { return -ENODEV; } static inline int acpi_debugger_read_cmd(char *buffer, u32 buffer_length) { return -ENODEV; } static inline int acpi_debugger_wait_command_ready(void) { return -ENODEV; } static inline int acpi_debugger_notify_command_complete(void) { return -ENODEV; } #endif #define BAD_MADT_ENTRY(entry, end) ( \ (!entry) || (unsigned long)entry + sizeof(*entry) > end || \ ((struct acpi_subtable_header *)entry)->length < sizeof(*entry)) void __iomem *__acpi_map_table(unsigned long phys, unsigned long size); void __acpi_unmap_table(void __iomem *map, unsigned long size); int early_acpi_boot_init(void); int acpi_boot_init (void); void acpi_boot_table_prepare (void); void acpi_boot_table_init (void); int acpi_mps_check (void); int acpi_numa_init (void); int acpi_locate_initial_tables (void); void acpi_reserve_initial_tables (void); void acpi_table_init_complete (void); int acpi_table_init (void); static inline struct acpi_table_header *acpi_get_table_pointer(char *signature, u32 instance) { struct acpi_table_header *table; int status = acpi_get_table(signature, instance, &table); if (ACPI_FAILURE(status)) return ERR_PTR(-ENOENT); return table; } DEFINE_FREE(acpi_put_table, struct acpi_table_header *, if (!IS_ERR_OR_NULL(_T)) acpi_put_table(_T)) int acpi_table_parse(char *id, acpi_tbl_table_handler handler); int __init_or_acpilib acpi_table_parse_entries(char *id, unsigned long table_size, int entry_id, acpi_tbl_entry_handler handler, unsigned int max_entries); int __init_or_acpilib acpi_table_parse_entries_array(char *id, unsigned long table_size, struct acpi_subtable_proc *proc, int proc_num, unsigned int max_entries); int acpi_table_parse_madt(enum acpi_madt_type id, acpi_tbl_entry_handler handler, unsigned int max_entries); int __init_or_acpilib acpi_table_parse_cedt(enum acpi_cedt_type id, acpi_tbl_entry_handler_arg handler_arg, void *arg); int acpi_parse_mcfg (struct acpi_table_header *header); void acpi_table_print_madt_entry (struct acpi_subtable_header *madt); #if defined(CONFIG_X86) || defined(CONFIG_LOONGARCH) void acpi_numa_processor_affinity_init (struct acpi_srat_cpu_affinity *pa); #else static inline void acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa) { } #endif void acpi_numa_x2apic_affinity_init(struct acpi_srat_x2apic_cpu_affinity *pa); #if defined(CONFIG_ARM64) || defined(CONFIG_LOONGARCH) void acpi_arch_dma_setup(struct device *dev); #else static inline void acpi_arch_dma_setup(struct device *dev) { } #endif #ifdef CONFIG_ARM64 void acpi_numa_gicc_affinity_init(struct acpi_srat_gicc_affinity *pa); #else static inline void acpi_numa_gicc_affinity_init(struct acpi_srat_gicc_affinity *pa) { } #endif #ifdef CONFIG_RISCV void acpi_numa_rintc_affinity_init(struct acpi_srat_rintc_affinity *pa); #else static inline void acpi_numa_rintc_affinity_init(struct acpi_srat_rintc_affinity *pa) { } #endif #ifndef PHYS_CPUID_INVALID typedef u32 phys_cpuid_t; #define PHYS_CPUID_INVALID (phys_cpuid_t)(-1) #endif static inline bool invalid_logical_cpuid(u32 cpuid) { return (int)cpuid < 0; } static inline bool invalid_phys_cpuid(phys_cpuid_t phys_id) { return phys_id == PHYS_CPUID_INVALID; } int __init acpi_get_madt_revision(void); /* Validate the processor object's proc_id */ bool acpi_duplicate_processor_id(int proc_id); /* Processor _CTS control */ struct acpi_processor_power; #ifdef CONFIG_ACPI_PROCESSOR_CSTATE bool acpi_processor_claim_cst_control(void); int acpi_processor_evaluate_cst(acpi_handle handle, u32 cpu, struct acpi_processor_power *info); #else static inline bool acpi_processor_claim_cst_control(void) { return false; } static inline int acpi_processor_evaluate_cst(acpi_handle handle, u32 cpu, struct acpi_processor_power *info) { return -ENODEV; } #endif #ifdef CONFIG_ACPI_HOTPLUG_CPU /* Arch dependent functions for cpu hotplug support */ int acpi_map_cpu(acpi_handle handle, phys_cpuid_t physid, u32 acpi_id, int *pcpu); int acpi_unmap_cpu(int cpu); #endif /* CONFIG_ACPI_HOTPLUG_CPU */ acpi_handle acpi_get_processor_handle(int cpu); #ifdef CONFIG_ACPI_HOTPLUG_IOAPIC int acpi_get_ioapic_id(acpi_handle handle, u32 gsi_base, u64 *phys_addr); #endif int acpi_register_ioapic(acpi_handle handle, u64 phys_addr, u32 gsi_base); int acpi_unregister_ioapic(acpi_handle handle, u32 gsi_base); int acpi_ioapic_registered(acpi_handle handle, u32 gsi_base); void acpi_irq_stats_init(void); extern u32 acpi_irq_handled; extern u32 acpi_irq_not_handled; extern unsigned int acpi_sci_irq; extern bool acpi_no_s5; #define INVALID_ACPI_IRQ ((unsigned)-1) static inline bool acpi_sci_irq_valid(void) { return acpi_sci_irq != INVALID_ACPI_IRQ; } extern int sbf_port; int acpi_register_gsi (struct device *dev, u32 gsi, int triggering, int polarity); int acpi_gsi_to_irq (u32 gsi, unsigned int *irq); int acpi_isa_irq_to_gsi (unsigned isa_irq, u32 *gsi); typedef struct fwnode_handle *(*acpi_gsi_domain_disp_fn)(u32); void acpi_set_irq_model(enum acpi_irq_model_id model, acpi_gsi_domain_disp_fn fn); acpi_gsi_domain_disp_fn acpi_get_gsi_dispatcher(void); void acpi_set_gsi_to_irq_fallback(u32 (*)(u32)); struct irq_domain *acpi_irq_create_hierarchy(unsigned int flags, unsigned int size, struct fwnode_handle *fwnode, const struct irq_domain_ops *ops, void *host_data); #ifdef CONFIG_X86_IO_APIC extern int acpi_get_override_irq(u32 gsi, int *trigger, int *polarity); #else static inline int acpi_get_override_irq(u32 gsi, int *trigger, int *polarity) { return -1; } #endif /* * This function undoes the effect of one call to acpi_register_gsi(). * If this matches the last registration, any IRQ resources for gsi * are freed. */ void acpi_unregister_gsi (u32 gsi); struct pci_dev; struct acpi_prt_entry *acpi_pci_irq_lookup(struct pci_dev *dev, int pin); int acpi_pci_irq_enable (struct pci_dev *dev); void acpi_penalize_isa_irq(int irq, int active); bool acpi_isa_irq_available(int irq); #ifdef CONFIG_PCI void acpi_penalize_sci_irq(int irq, int trigger, int polarity); #else static inline void acpi_penalize_sci_irq(int irq, int trigger, int polarity) { } #endif void acpi_pci_irq_disable (struct pci_dev *dev); extern int ec_read(u8 addr, u8 *val); extern int ec_write(u8 addr, u8 val); extern int ec_transaction(u8 command, const u8 *wdata, unsigned wdata_len, u8 *rdata, unsigned rdata_len); extern acpi_handle ec_get_handle(void); extern bool acpi_is_pnp_device(struct acpi_device *); #if defined(CONFIG_ACPI_WMI) || defined(CONFIG_ACPI_WMI_MODULE) typedef void (*wmi_notify_handler) (union acpi_object *data, void *context); int wmi_instance_count(const char *guid); extern acpi_status wmi_evaluate_method(const char *guid, u8 instance, u32 method_id, const struct acpi_buffer *in, struct acpi_buffer *out); extern acpi_status wmi_query_block(const char *guid, u8 instance, struct acpi_buffer *out); extern acpi_status wmi_set_block(const char *guid, u8 instance, const struct acpi_buffer *in); extern acpi_status wmi_install_notify_handler(const char *guid, wmi_notify_handler handler, void *data); extern acpi_status wmi_remove_notify_handler(const char *guid); extern bool wmi_has_guid(const char *guid); extern char *wmi_get_acpi_device_uid(const char *guid); #endif /* CONFIG_ACPI_WMI */ #define ACPI_VIDEO_OUTPUT_SWITCHING 0x0001 #define ACPI_VIDEO_DEVICE_POSTING 0x0002 #define ACPI_VIDEO_ROM_AVAILABLE 0x0004 #define ACPI_VIDEO_BACKLIGHT 0x0008 #define ACPI_VIDEO_BACKLIGHT_FORCE_VENDOR 0x0010 #define ACPI_VIDEO_BACKLIGHT_FORCE_VIDEO 0x0020 #define ACPI_VIDEO_OUTPUT_SWITCHING_FORCE_VENDOR 0x0040 #define ACPI_VIDEO_OUTPUT_SWITCHING_FORCE_VIDEO 0x0080 #define ACPI_VIDEO_BACKLIGHT_DMI_VENDOR 0x0100 #define ACPI_VIDEO_BACKLIGHT_DMI_VIDEO 0x0200 #define ACPI_VIDEO_OUTPUT_SWITCHING_DMI_VENDOR 0x0400 #define ACPI_VIDEO_OUTPUT_SWITCHING_DMI_VIDEO 0x0800 extern char acpi_video_backlight_string[]; extern long acpi_is_video_device(acpi_handle handle); extern void acpi_osi_setup(char *str); extern bool acpi_osi_is_win8(void); #ifdef CONFIG_ACPI_THERMAL_LIB int thermal_acpi_active_trip_temp(struct acpi_device *adev, int id, int *ret_temp); int thermal_acpi_passive_trip_temp(struct acpi_device *adev, int *ret_temp); int thermal_acpi_hot_trip_temp(struct acpi_device *adev, int *ret_temp); int thermal_acpi_critical_trip_temp(struct acpi_device *adev, int *ret_temp); #endif #ifdef CONFIG_ACPI_HMAT int acpi_get_genport_coordinates(u32 uid, struct access_coordinate *coord); #else static inline int acpi_get_genport_coordinates(u32 uid, struct access_coordinate *coord) { return -EOPNOTSUPP; } #endif #ifdef CONFIG_ACPI_NUMA int acpi_map_pxm_to_node(int pxm); int acpi_get_node(acpi_handle handle); /** * pxm_to_online_node - Map proximity ID to online node * @pxm: ACPI proximity ID * * This is similar to pxm_to_node(), but always returns an online * node. When the mapped node from a given proximity ID is offline, it * looks up the node distance table and returns the nearest online node. * * ACPI device drivers, which are called after the NUMA initialization has * completed in the kernel, can call this interface to obtain their device * NUMA topology from ACPI tables. Such drivers do not have to deal with * offline nodes. A node may be offline when SRAT memory entry does not exist, * or NUMA is disabled, ex. "numa=off" on x86. */ static inline int pxm_to_online_node(int pxm) { int node = pxm_to_node(pxm); return numa_map_to_online_node(node); } #else static inline int pxm_to_online_node(int pxm) { return 0; } static inline int acpi_map_pxm_to_node(int pxm) { return 0; } static inline int acpi_get_node(acpi_handle handle) { return 0; } #endif extern int pnpacpi_disabled; #define PXM_INVAL (-1) bool acpi_dev_resource_memory(struct acpi_resource *ares, struct resource *res); bool acpi_dev_resource_io(struct acpi_resource *ares, struct resource *res); bool acpi_dev_resource_address_space(struct acpi_resource *ares, struct resource_win *win); bool acpi_dev_resource_ext_address_space(struct acpi_resource *ares, struct resource_win *win); unsigned long acpi_dev_irq_flags(u8 triggering, u8 polarity, u8 shareable, u8 wake_capable); unsigned int acpi_dev_get_irq_type(int triggering, int polarity); bool acpi_dev_resource_interrupt(struct acpi_resource *ares, int index, struct resource *res); void acpi_dev_free_resource_list(struct list_head *list); int acpi_dev_get_resources(struct acpi_device *adev, struct list_head *list, int (*preproc)(struct acpi_resource *, void *), void *preproc_data); int acpi_dev_get_dma_resources(struct acpi_device *adev, struct list_head *list); int acpi_dev_get_memory_resources(struct acpi_device *adev, struct list_head *list); int acpi_dev_filter_resource_type(struct acpi_resource *ares, unsigned long types); static inline int acpi_dev_filter_resource_type_cb(struct acpi_resource *ares, void *arg) { return acpi_dev_filter_resource_type(ares, (unsigned long)arg); } struct acpi_device *acpi_resource_consumer(struct resource *res); int acpi_check_resource_conflict(const struct resource *res); int acpi_check_region(resource_size_t start, resource_size_t n, const char *name); int acpi_resources_are_enforced(void); #ifdef CONFIG_HIBERNATION extern int acpi_check_s4_hw_signature; #endif #ifdef CONFIG_PM_SLEEP void __init acpi_old_suspend_ordering(void); void __init acpi_nvs_nosave(void); void __init acpi_nvs_nosave_s3(void); void __init acpi_sleep_no_blacklist(void); #endif /* CONFIG_PM_SLEEP */ int acpi_register_wakeup_handler( int wake_irq, bool (*wakeup)(void *context), void *context); void acpi_unregister_wakeup_handler( bool (*wakeup)(void *context), void *context); struct acpi_osc_context { char *uuid_str; /* UUID string */ int rev; struct acpi_buffer cap; /* list of DWORD capabilities */ struct acpi_buffer ret; /* free by caller if success */ }; acpi_status acpi_run_osc(acpi_handle handle, struct acpi_osc_context *context); /* Number of _OSC capability DWORDS depends on bridge type */ #define OSC_PCI_CAPABILITY_DWORDS 3 #define OSC_CXL_CAPABILITY_DWORDS 5 /* Indexes into _OSC Capabilities Buffer (DWORDs 2 to 5 are device-specific) */ #define OSC_QUERY_DWORD 0 /* DWORD 1 */ #define OSC_SUPPORT_DWORD 1 /* DWORD 2 */ #define OSC_CONTROL_DWORD 2 /* DWORD 3 */ #define OSC_EXT_SUPPORT_DWORD 3 /* DWORD 4 */ #define OSC_EXT_CONTROL_DWORD 4 /* DWORD 5 */ /* _OSC Capabilities DWORD 1: Query/Control and Error Returns (generic) */ #define OSC_QUERY_ENABLE 0x00000001 /* input */ #define OSC_REQUEST_ERROR 0x00000002 /* return */ #define OSC_INVALID_UUID_ERROR 0x00000004 /* return */ #define OSC_INVALID_REVISION_ERROR 0x00000008 /* return */ #define OSC_CAPABILITIES_MASK_ERROR 0x00000010 /* return */ /* Platform-Wide Capabilities _OSC: Capabilities DWORD 2: Support Field */ #define OSC_SB_PAD_SUPPORT 0x00000001 #define OSC_SB_PPC_OST_SUPPORT 0x00000002 #define OSC_SB_PR3_SUPPORT 0x00000004 #define OSC_SB_HOTPLUG_OST_SUPPORT 0x00000008 #define OSC_SB_APEI_SUPPORT 0x00000010 #define OSC_SB_CPC_SUPPORT 0x00000020 #define OSC_SB_CPCV2_SUPPORT 0x00000040 #define OSC_SB_PCLPI_SUPPORT 0x00000080 #define OSC_SB_OSLPI_SUPPORT 0x00000100 #define OSC_SB_FAST_THERMAL_SAMPLING_SUPPORT 0x00000200 #define OSC_SB_OVER_16_PSTATES_SUPPORT 0x00000400 #define OSC_SB_GED_SUPPORT 0x00000800 #define OSC_SB_CPC_DIVERSE_HIGH_SUPPORT 0x00001000 #define OSC_SB_IRQ_RESOURCE_SOURCE_SUPPORT 0x00002000 #define OSC_SB_CPC_FLEXIBLE_ADR_SPACE 0x00004000 #define OSC_SB_GENERIC_INITIATOR_SUPPORT 0x00020000 #define OSC_SB_NATIVE_USB4_SUPPORT 0x00040000 #define OSC_SB_BATTERY_CHARGE_LIMITING_SUPPORT 0x00080000 #define OSC_SB_PRM_SUPPORT 0x00200000 #define OSC_SB_FFH_OPR_SUPPORT 0x00400000 extern bool osc_sb_apei_support_acked; extern bool osc_pc_lpi_support_confirmed; extern bool osc_sb_native_usb4_support_confirmed; extern bool osc_sb_cppc2_support_acked; extern bool osc_cpc_flexible_adr_space_confirmed; /* USB4 Capabilities */ #define OSC_USB_USB3_TUNNELING 0x00000001 #define OSC_USB_DP_TUNNELING 0x00000002 #define OSC_USB_PCIE_TUNNELING 0x00000004 #define OSC_USB_XDOMAIN 0x00000008 extern u32 osc_sb_native_usb4_control; /* PCI Host Bridge _OSC: Capabilities DWORD 2: Support Field */ #define OSC_PCI_EXT_CONFIG_SUPPORT 0x00000001 #define OSC_PCI_ASPM_SUPPORT 0x00000002 #define OSC_PCI_CLOCK_PM_SUPPORT 0x00000004 #define OSC_PCI_SEGMENT_GROUPS_SUPPORT 0x00000008 #define OSC_PCI_MSI_SUPPORT 0x00000010 #define OSC_PCI_EDR_SUPPORT 0x00000080 #define OSC_PCI_HPX_TYPE_3_SUPPORT 0x00000100 /* PCI Host Bridge _OSC: Capabilities DWORD 3: Control Field */ #define OSC_PCI_EXPRESS_NATIVE_HP_CONTROL 0x00000001 #define OSC_PCI_SHPC_NATIVE_HP_CONTROL 0x00000002 #define OSC_PCI_EXPRESS_PME_CONTROL 0x00000004 #define OSC_PCI_EXPRESS_AER_CONTROL 0x00000008 #define OSC_PCI_EXPRESS_CAPABILITY_CONTROL 0x00000010 #define OSC_PCI_EXPRESS_LTR_CONTROL 0x00000020 #define OSC_PCI_EXPRESS_DPC_CONTROL 0x00000080 /* CXL _OSC: Capabilities DWORD 4: Support Field */ #define OSC_CXL_1_1_PORT_REG_ACCESS_SUPPORT 0x00000001 #define OSC_CXL_2_0_PORT_DEV_REG_ACCESS_SUPPORT 0x00000002 #define OSC_CXL_PROTOCOL_ERR_REPORTING_SUPPORT 0x00000004 #define OSC_CXL_NATIVE_HP_SUPPORT 0x00000008 /* CXL _OSC: Capabilities DWORD 5: Control Field */ #define OSC_CXL_ERROR_REPORTING_CONTROL 0x00000001 static inline u32 acpi_osc_ctx_get_pci_control(struct acpi_osc_context *context) { u32 *ret = context->ret.pointer; return ret[OSC_CONTROL_DWORD]; } static inline u32 acpi_osc_ctx_get_cxl_control(struct acpi_osc_context *context) { u32 *ret = context->ret.pointer; return ret[OSC_EXT_CONTROL_DWORD]; } #define ACPI_GSB_ACCESS_ATTRIB_QUICK 0x00000002 #define ACPI_GSB_ACCESS_ATTRIB_SEND_RCV 0x00000004 #define ACPI_GSB_ACCESS_ATTRIB_BYTE 0x00000006 #define ACPI_GSB_ACCESS_ATTRIB_WORD 0x00000008 #define ACPI_GSB_ACCESS_ATTRIB_BLOCK 0x0000000A #define ACPI_GSB_ACCESS_ATTRIB_MULTIBYTE 0x0000000B #define ACPI_GSB_ACCESS_ATTRIB_WORD_CALL 0x0000000C #define ACPI_GSB_ACCESS_ATTRIB_BLOCK_CALL 0x0000000D #define ACPI_GSB_ACCESS_ATTRIB_RAW_BYTES 0x0000000E #define ACPI_GSB_ACCESS_ATTRIB_RAW_PROCESS 0x0000000F /* Enable _OST when all relevant hotplug operations are enabled */ #if defined(CONFIG_ACPI_HOTPLUG_CPU) && \ defined(CONFIG_ACPI_HOTPLUG_MEMORY) && \ defined(CONFIG_ACPI_CONTAINER) #define ACPI_HOTPLUG_OST #endif /* _OST Source Event Code (OSPM Action) */ #define ACPI_OST_EC_OSPM_SHUTDOWN 0x100 #define ACPI_OST_EC_OSPM_EJECT 0x103 #define ACPI_OST_EC_OSPM_INSERTION 0x200 /* _OST General Processing Status Code */ #define ACPI_OST_SC_SUCCESS 0x0 #define ACPI_OST_SC_NON_SPECIFIC_FAILURE 0x1 #define ACPI_OST_SC_UNRECOGNIZED_NOTIFY 0x2 /* _OST OS Shutdown Processing (0x100) Status Code */ #define ACPI_OST_SC_OS_SHUTDOWN_DENIED 0x80 #define ACPI_OST_SC_OS_SHUTDOWN_IN_PROGRESS 0x81 #define ACPI_OST_SC_OS_SHUTDOWN_COMPLETED 0x82 #define ACPI_OST_SC_OS_SHUTDOWN_NOT_SUPPORTED 0x83 /* _OST Ejection Request (0x3, 0x103) Status Code */ #define ACPI_OST_SC_EJECT_NOT_SUPPORTED 0x80 #define ACPI_OST_SC_DEVICE_IN_USE 0x81 #define ACPI_OST_SC_DEVICE_BUSY 0x82 #define ACPI_OST_SC_EJECT_DEPENDENCY_BUSY 0x83 #define ACPI_OST_SC_EJECT_IN_PROGRESS 0x84 /* _OST Insertion Request (0x200) Status Code */ #define ACPI_OST_SC_INSERT_IN_PROGRESS 0x80 #define ACPI_OST_SC_DRIVER_LOAD_FAILURE 0x81 #define ACPI_OST_SC_INSERT_NOT_SUPPORTED 0x82 enum acpi_predicate { all_versions, less_than_or_equal, equal, greater_than_or_equal, }; /* Table must be terminted by a NULL entry */ struct acpi_platform_list { char oem_id[ACPI_OEM_ID_SIZE+1]; char oem_table_id[ACPI_OEM_TABLE_ID_SIZE+1]; u32 oem_revision; char *table; enum acpi_predicate pred; char *reason; u32 data; }; int acpi_match_platform_list(const struct acpi_platform_list *plat); extern void acpi_early_init(void); extern void acpi_subsystem_init(void); extern int acpi_nvs_register(__u64 start, __u64 size); extern int acpi_nvs_for_each_region(int (*func)(__u64, __u64, void *), void *data); const struct acpi_device_id *acpi_match_acpi_device(const struct acpi_device_id *ids, const struct acpi_device *adev); const struct acpi_device_id *acpi_match_device(const struct acpi_device_id *ids, const struct device *dev); const void *acpi_device_get_match_data(const struct device *dev); extern bool acpi_driver_match_device(struct device *dev, const struct device_driver *drv); int acpi_device_uevent_modalias(const struct device *, struct kobj_uevent_env *); int acpi_device_modalias(struct device *, char *, int); struct platform_device *acpi_create_platform_device(struct acpi_device *, const struct property_entry *); #define ACPI_PTR(_ptr) (_ptr) static inline void acpi_device_set_enumerated(struct acpi_device *adev) { adev->flags.visited = true; } static inline void acpi_device_clear_enumerated(struct acpi_device *adev) { adev->flags.visited = false; } enum acpi_reconfig_event { ACPI_RECONFIG_DEVICE_ADD = 0, ACPI_RECONFIG_DEVICE_REMOVE, }; int acpi_reconfig_notifier_register(struct notifier_block *nb); int acpi_reconfig_notifier_unregister(struct notifier_block *nb); #ifdef CONFIG_ACPI_GTDT int acpi_gtdt_init(struct acpi_table_header *table, int *platform_timer_count); int acpi_gtdt_map_ppi(int type); bool acpi_gtdt_c3stop(int type); #endif #ifndef ACPI_HAVE_ARCH_SET_ROOT_POINTER static __always_inline void acpi_arch_set_root_pointer(u64 addr) { } #endif #ifndef ACPI_HAVE_ARCH_GET_ROOT_POINTER static __always_inline u64 acpi_arch_get_root_pointer(void) { return 0; } #endif int acpi_get_local_u64_address(acpi_handle handle, u64 *addr); int acpi_get_local_address(acpi_handle handle, u32 *addr); const char *acpi_get_subsystem_id(acpi_handle handle); #ifdef CONFIG_ACPI_MRRM int acpi_mrrm_max_mem_region(void); #endif #define ACPI_CMOS_RTC_IDS \ { "PNP0B00", }, \ { "PNP0B01", }, \ { "PNP0B02", }, \ { "", } extern bool cmos_rtc_platform_device_present; #else /* !CONFIG_ACPI */ #define acpi_disabled 1 #define ACPI_COMPANION(dev) (NULL) #define ACPI_COMPANION_SET(dev, adev) do { } while (0) #define ACPI_HANDLE(dev) (NULL) #define ACPI_HANDLE_FWNODE(fwnode) (NULL) /* Get rid of the -Wunused-variable for adev */ #define acpi_dev_uid_match(adev, uid2) (adev && false) #define acpi_dev_hid_uid_match(adev, hid2, uid2) (adev && false) struct fwnode_handle; static inline bool acpi_dev_found(const char *hid) { return false; } static inline bool acpi_dev_present(const char *hid, const char *uid, s64 hrv) { return false; } struct acpi_device; static inline int acpi_dev_uid_to_integer(struct acpi_device *adev, u64 *integer) { return -ENODEV; } static inline struct acpi_device * acpi_dev_get_first_match_dev(const char *hid, const char *uid, s64 hrv) { return NULL; } static inline bool acpi_reduced_hardware(void) { return false; } static inline void acpi_dev_put(struct acpi_device *adev) {} static inline bool is_acpi_node(const struct fwnode_handle *fwnode) { return false; } static inline bool is_acpi_device_node(const struct fwnode_handle *fwnode) { return false; } static inline struct acpi_device *to_acpi_device_node(const struct fwnode_handle *fwnode) { return NULL; } static inline bool is_acpi_data_node(const struct fwnode_handle *fwnode) { return false; } static inline struct acpi_data_node *to_acpi_data_node(const struct fwnode_handle *fwnode) { return NULL; } static inline bool acpi_data_node_match(const struct fwnode_handle *fwnode, const char *name) { return false; } static inline struct fwnode_handle *acpi_fwnode_handle(struct acpi_device *adev) { return NULL; } static inline acpi_handle acpi_device_handle(struct acpi_device *adev) { return NULL; } static inline bool has_acpi_companion(struct device *dev) { return false; } static inline void acpi_preset_companion(struct device *dev, struct acpi_device *parent, u64 addr) { } static inline const char *acpi_dev_name(struct acpi_device *adev) { return NULL; } static inline struct device *acpi_get_first_physical_node(struct acpi_device *adev) { return NULL; } static inline void acpi_early_init(void) { } static inline void acpi_subsystem_init(void) { } static inline int early_acpi_boot_init(void) { return 0; } static inline int acpi_boot_init(void) { return 0; } static inline void acpi_boot_table_prepare(void) { } static inline void acpi_boot_table_init(void) { } static inline int acpi_mps_check(void) { return 0; } static inline int acpi_check_resource_conflict(struct resource *res) { return 0; } static inline int acpi_check_region(resource_size_t start, resource_size_t n, const char *name) { return 0; } struct acpi_table_header; static inline int acpi_table_parse(char *id, int (*handler)(struct acpi_table_header *)) { return -ENODEV; } static inline int acpi_nvs_register(__u64 start, __u64 size) { return 0; } static inline int acpi_nvs_for_each_region(int (*func)(__u64, __u64, void *), void *data) { return 0; } struct acpi_device_id; static inline const struct acpi_device_id *acpi_match_acpi_device( const struct acpi_device_id *ids, const struct acpi_device *adev) { return NULL; } static inline const struct acpi_device_id *acpi_match_device( const struct acpi_device_id *ids, const struct device *dev) { return NULL; } static inline const void *acpi_device_get_match_data(const struct device *dev) { return NULL; } static inline bool acpi_driver_match_device(struct device *dev, const struct device_driver *drv) { return false; } static inline bool acpi_check_dsm(acpi_handle handle, const guid_t *guid, u64 rev, u64 funcs) { return false; } static inline union acpi_object *acpi_evaluate_dsm(acpi_handle handle, const guid_t *guid, u64 rev, u64 func, union acpi_object *argv4) { return NULL; } static inline union acpi_object *acpi_evaluate_dsm_typed(acpi_handle handle, const guid_t *guid, u64 rev, u64 func, union acpi_object *argv4, acpi_object_type type) { return NULL; } static inline int acpi_device_uevent_modalias(const struct device *dev, struct kobj_uevent_env *env) { return -ENODEV; } static inline int acpi_device_modalias(struct device *dev, char *buf, int size) { return -ENODEV; } static inline struct platform_device * acpi_create_platform_device(struct acpi_device *adev, const struct property_entry *properties) { return NULL; } static inline bool acpi_dma_supported(const struct acpi_device *adev) { return false; } static inline enum dev_dma_attr acpi_get_dma_attr(struct acpi_device *adev) { return DEV_DMA_NOT_SUPPORTED; } static inline int acpi_dma_get_range(struct device *dev, const struct bus_dma_region **map) { return -ENODEV; } static inline int acpi_dma_configure(struct device *dev, enum dev_dma_attr attr) { return 0; } static inline int acpi_dma_configure_id(struct device *dev, enum dev_dma_attr attr, const u32 *input_id) { return 0; } #define ACPI_PTR(_ptr) (NULL) static inline void acpi_device_set_enumerated(struct acpi_device *adev) { } static inline void acpi_device_clear_enumerated(struct acpi_device *adev) { } static inline int acpi_reconfig_notifier_register(struct notifier_block *nb) { return -EINVAL; } static inline int acpi_reconfig_notifier_unregister(struct notifier_block *nb) { return -EINVAL; } static inline struct acpi_device *acpi_resource_consumer(struct resource *res) { return NULL; } static inline int acpi_get_local_address(acpi_handle handle, u32 *addr) { return -ENODEV; } static inline const char *acpi_get_subsystem_id(acpi_handle handle) { return ERR_PTR(-ENODEV); } static inline int acpi_register_wakeup_handler(int wake_irq, bool (*wakeup)(void *context), void *context) { return -ENXIO; } static inline void acpi_unregister_wakeup_handler( bool (*wakeup)(void *context), void *context) { } struct acpi_osc_context; static inline u32 acpi_osc_ctx_get_pci_control(struct acpi_osc_context *context) { return 0; } static inline u32 acpi_osc_ctx_get_cxl_control(struct acpi_osc_context *context) { return 0; } static inline bool acpi_sleep_state_supported(u8 sleep_state) { return false; } static inline acpi_handle acpi_get_processor_handle(int cpu) { return NULL; } static inline int acpi_mrrm_max_mem_region(void) { return 1; } #define cmos_rtc_platform_device_present false #endif /* !CONFIG_ACPI */ #ifdef CONFIG_ACPI_HMAT int hmat_get_extended_linear_cache_size(struct resource *backing_res, int nid, resource_size_t *size); #else static inline int hmat_get_extended_linear_cache_size(struct resource *backing_res, int nid, resource_size_t *size) { return -EOPNOTSUPP; } #endif extern void arch_post_acpi_subsys_init(void); #ifdef CONFIG_ACPI_HOTPLUG_IOAPIC int acpi_ioapic_add(acpi_handle root); #else static inline int acpi_ioapic_add(acpi_handle root) { return 0; } #endif #ifdef CONFIG_ACPI void acpi_os_set_prepare_sleep(int (*func)(u8 sleep_state, u32 pm1a_ctrl, u32 pm1b_ctrl)); acpi_status acpi_os_prepare_sleep(u8 sleep_state, u32 pm1a_control, u32 pm1b_control); void acpi_os_set_prepare_extended_sleep(int (*func)(u8 sleep_state, u32 val_a, u32 val_b)); acpi_status acpi_os_prepare_extended_sleep(u8 sleep_state, u32 val_a, u32 val_b); struct acpi_s2idle_dev_ops { struct list_head list_node; void (*prepare)(void); void (*check)(void); void (*restore)(void); }; #if defined(CONFIG_SUSPEND) && defined(CONFIG_X86) int acpi_register_lps0_dev(struct acpi_s2idle_dev_ops *arg); void acpi_unregister_lps0_dev(struct acpi_s2idle_dev_ops *arg); #else /* CONFIG_SUSPEND && CONFIG_X86 */ static inline int acpi_register_lps0_dev(struct acpi_s2idle_dev_ops *arg) { return -ENODEV; } static inline void acpi_unregister_lps0_dev(struct acpi_s2idle_dev_ops *arg) { } #endif /* CONFIG_SUSPEND && CONFIG_X86 */ void arch_reserve_mem_area(acpi_physical_address addr, size_t size); #else #define acpi_os_set_prepare_sleep(func, pm1a_ctrl, pm1b_ctrl) do { } while (0) #endif #if defined(CONFIG_ACPI) && defined(CONFIG_PM) int acpi_dev_suspend(struct device *dev, bool wakeup); int acpi_dev_resume(struct device *dev); int acpi_subsys_runtime_suspend(struct device *dev); int acpi_subsys_runtime_resume(struct device *dev); int acpi_dev_pm_attach(struct device *dev, bool power_on); bool acpi_storage_d3(struct device *dev); bool acpi_dev_state_d0(struct device *dev); #else static inline int acpi_subsys_runtime_suspend(struct device *dev) { return 0; } static inline int acpi_subsys_runtime_resume(struct device *dev) { return 0; } static inline int acpi_dev_pm_attach(struct device *dev, bool power_on) { return 0; } static inline bool acpi_storage_d3(struct device *dev) { return false; } static inline bool acpi_dev_state_d0(struct device *dev) { return true; } #endif #if defined(CONFIG_ACPI) && defined(CONFIG_PM_SLEEP) int acpi_subsys_prepare(struct device *dev); void acpi_subsys_complete(struct device *dev); int acpi_subsys_suspend_late(struct device *dev); int acpi_subsys_suspend_noirq(struct device *dev); int acpi_subsys_suspend(struct device *dev); int acpi_subsys_freeze(struct device *dev); int acpi_subsys_poweroff(struct device *dev); int acpi_subsys_restore_early(struct device *dev); #else static inline int acpi_subsys_prepare(struct device *dev) { return 0; } static inline void acpi_subsys_complete(struct device *dev) {} static inline int acpi_subsys_suspend_late(struct device *dev) { return 0; } static inline int acpi_subsys_suspend_noirq(struct device *dev) { return 0; } static inline int acpi_subsys_suspend(struct device *dev) { return 0; } static inline int acpi_subsys_freeze(struct device *dev) { return 0; } static inline int acpi_subsys_poweroff(struct device *dev) { return 0; } static inline int acpi_subsys_restore_early(struct device *dev) { return 0; } #endif #if defined(CONFIG_ACPI_EC) && defined(CONFIG_PM_SLEEP) void acpi_ec_mark_gpe_for_wake(void); void acpi_ec_set_gpe_wake_mask(u8 action); #else static inline void acpi_ec_mark_gpe_for_wake(void) {} static inline void acpi_ec_set_gpe_wake_mask(u8 action) {} #endif #ifdef CONFIG_ACPI char *acpi_handle_path(acpi_handle handle); __printf(3, 4) void acpi_handle_printk(const char *level, acpi_handle handle, const char *fmt, ...); void acpi_evaluation_failure_warn(acpi_handle handle, const char *name, acpi_status status); #else /* !CONFIG_ACPI */ static inline __printf(3, 4) void acpi_handle_printk(const char *level, void *handle, const char *fmt, ...) {} static inline void acpi_evaluation_failure_warn(acpi_handle handle, const char *name, acpi_status status) {} #endif /* !CONFIG_ACPI */ #if defined(CONFIG_ACPI) && defined(CONFIG_DYNAMIC_DEBUG) __printf(3, 4) void __acpi_handle_debug(struct _ddebug *descriptor, acpi_handle handle, const char *fmt, ...); #endif /* * acpi_handle_<level>: Print message with ACPI prefix and object path * * These interfaces acquire the global namespace mutex to obtain an object * path. In interrupt context, it shows the object path as <n/a>. */ #define acpi_handle_emerg(handle, fmt, ...) \ acpi_handle_printk(KERN_EMERG, handle, fmt, ##__VA_ARGS__) #define acpi_handle_alert(handle, fmt, ...) \ acpi_handle_printk(KERN_ALERT, handle, fmt, ##__VA_ARGS__) #define acpi_handle_crit(handle, fmt, ...) \ acpi_handle_printk(KERN_CRIT, handle, fmt, ##__VA_ARGS__) #define acpi_handle_err(handle, fmt, ...) \ acpi_handle_printk(KERN_ERR, handle, fmt, ##__VA_ARGS__) #define acpi_handle_warn(handle, fmt, ...) \ acpi_handle_printk(KERN_WARNING, handle, fmt, ##__VA_ARGS__) #define acpi_handle_notice(handle, fmt, ...) \ acpi_handle_printk(KERN_NOTICE, handle, fmt, ##__VA_ARGS__) #define acpi_handle_info(handle, fmt, ...) \ acpi_handle_printk(KERN_INFO, handle, fmt, ##__VA_ARGS__) #if defined(DEBUG) #define acpi_handle_debug(handle, fmt, ...) \ acpi_handle_printk(KERN_DEBUG, handle, fmt, ##__VA_ARGS__) #else #if defined(CONFIG_DYNAMIC_DEBUG) #define acpi_handle_debug(handle, fmt, ...) \ _dynamic_func_call(fmt, __acpi_handle_debug, \ handle, pr_fmt(fmt), ##__VA_ARGS__) #else #define acpi_handle_debug(handle, fmt, ...) \ ({ \ if (0) \ acpi_handle_printk(KERN_DEBUG, handle, fmt, ##__VA_ARGS__); \ 0; \ }) #endif #endif #if defined(CONFIG_ACPI) && defined(CONFIG_GPIOLIB) bool acpi_gpio_get_irq_resource(struct acpi_resource *ares, struct acpi_resource_gpio **agpio); bool acpi_gpio_get_io_resource(struct acpi_resource *ares, struct acpi_resource_gpio **agpio); int acpi_dev_gpio_irq_wake_get_by(struct acpi_device *adev, const char *con_id, int index, bool *wake_capable); #else static inline bool acpi_gpio_get_irq_resource(struct acpi_resource *ares, struct acpi_resource_gpio **agpio) { return false; } static inline bool acpi_gpio_get_io_resource(struct acpi_resource *ares, struct acpi_resource_gpio **agpio) { return false; } static inline int acpi_dev_gpio_irq_wake_get_by(struct acpi_device *adev, const char *con_id, int index, bool *wake_capable) { return -ENXIO; } #endif static inline int acpi_dev_gpio_irq_wake_get(struct acpi_device *adev, int index, bool *wake_capable) { return acpi_dev_gpio_irq_wake_get_by(adev, NULL, index, wake_capable); } static inline int acpi_dev_gpio_irq_get_by(struct acpi_device *adev, const char *con_id, int index) { return acpi_dev_gpio_irq_wake_get_by(adev, con_id, index, NULL); } static inline int acpi_dev_gpio_irq_get(struct acpi_device *adev, int index) { return acpi_dev_gpio_irq_wake_get_by(adev, NULL, index, NULL); } /* Device properties */ #ifdef CONFIG_ACPI int acpi_dev_get_property(const struct acpi_device *adev, const char *name, acpi_object_type type, const union acpi_object **obj); int __acpi_node_get_property_reference(const struct fwnode_handle *fwnode, const char *name, size_t index, size_t num_args, struct fwnode_reference_args *args); static inline int acpi_node_get_property_reference( const struct fwnode_handle *fwnode, const char *name, size_t index, struct fwnode_reference_args *args) { return __acpi_node_get_property_reference(fwnode, name, index, NR_FWNODE_REFERENCE_ARGS, args); } static inline bool acpi_dev_has_props(const struct acpi_device *adev) { return !list_empty(&adev->data.properties); } struct acpi_device_properties * acpi_data_add_props(struct acpi_device_data *data, const guid_t *guid, union acpi_object *properties); int acpi_node_prop_get(const struct fwnode_handle *fwnode, const char *propname, void **valptr); struct acpi_probe_entry; typedef bool (*acpi_probe_entry_validate_subtbl)(struct acpi_subtable_header *, struct acpi_probe_entry *); #define ACPI_TABLE_ID_LEN 5 /** * struct acpi_probe_entry - boot-time probing entry * @id: ACPI table name * @type: Optional subtable type to match * (if @id contains subtables) * @subtable_valid: Optional callback to check the validity of * the subtable * @probe_table: Callback to the driver being probed when table * match is successful * @probe_subtbl: Callback to the driver being probed when table and * subtable match (and optional callback is successful) * @driver_data: Sideband data provided back to the driver */ struct acpi_probe_entry { __u8 id[ACPI_TABLE_ID_LEN]; __u8 type; acpi_probe_entry_validate_subtbl subtable_valid; union { acpi_tbl_table_handler probe_table; acpi_tbl_entry_handler probe_subtbl; }; kernel_ulong_t driver_data; }; void arch_sort_irqchip_probe(struct acpi_probe_entry *ap_head, int nr); #define ACPI_DECLARE_PROBE_ENTRY(table, name, table_id, subtable, \ valid, data, fn) \ static const struct acpi_probe_entry __acpi_probe_##name \ __used __section("__" #table "_acpi_probe_table") = { \ .id = table_id, \ .type = subtable, \ .subtable_valid = valid, \ .probe_table = fn, \ .driver_data = data, \ } #define ACPI_DECLARE_SUBTABLE_PROBE_ENTRY(table, name, table_id, \ subtable, valid, data, fn) \ static const struct acpi_probe_entry __acpi_probe_##name \ __used __section("__" #table "_acpi_probe_table") = { \ .id = table_id, \ .type = subtable, \ .subtable_valid = valid, \ .probe_subtbl = fn, \ .driver_data = data, \ } #define ACPI_PROBE_TABLE(name) __##name##_acpi_probe_table #define ACPI_PROBE_TABLE_END(name) __##name##_acpi_probe_table_end int __acpi_probe_device_table(struct acpi_probe_entry *start, int nr); #define acpi_probe_device_table(t) \ ({ \ extern struct acpi_probe_entry ACPI_PROBE_TABLE(t), \ ACPI_PROBE_TABLE_END(t); \ __acpi_probe_device_table(&ACPI_PROBE_TABLE(t), \ (&ACPI_PROBE_TABLE_END(t) - \ &ACPI_PROBE_TABLE(t))); \ }) #else static inline int acpi_dev_get_property(struct acpi_device *adev, const char *name, acpi_object_type type, const union acpi_object **obj) { return -ENXIO; } static inline int __acpi_node_get_property_reference(const struct fwnode_handle *fwnode, const char *name, size_t index, size_t num_args, struct fwnode_reference_args *args) { return -ENXIO; } static inline int acpi_node_get_property_reference(const struct fwnode_handle *fwnode, const char *name, size_t index, struct fwnode_reference_args *args) { return -ENXIO; } static inline int acpi_node_prop_get(const struct fwnode_handle *fwnode, const char *propname, void **valptr) { return -ENXIO; } static inline struct fwnode_handle * acpi_graph_get_next_endpoint(const struct fwnode_handle *fwnode, struct fwnode_handle *prev) { return ERR_PTR(-ENXIO); } static inline int acpi_graph_get_remote_endpoint(const struct fwnode_handle *fwnode, struct fwnode_handle **remote, struct fwnode_handle **port, struct fwnode_handle **endpoint) { return -ENXIO; } #define ACPI_DECLARE_PROBE_ENTRY(table, name, table_id, subtable, valid, data, fn) \ static const void * __acpi_table_##name[] \ __attribute__((unused)) \ = { (void *) table_id, \ (void *) subtable, \ (void *) valid, \ (void *) fn, \ (void *) data } #define acpi_probe_device_table(t) ({ int __r = 0; __r;}) #endif #ifdef CONFIG_ACPI_TABLE_UPGRADE void acpi_table_upgrade(void); #else static inline void acpi_table_upgrade(void) { } #endif #if defined(CONFIG_ACPI) && defined(CONFIG_ACPI_WATCHDOG) extern bool acpi_has_watchdog(void); #else static inline bool acpi_has_watchdog(void) { return false; } #endif #ifdef CONFIG_ACPI_SPCR_TABLE extern bool qdf2400_e44_present; int acpi_parse_spcr(bool enable_earlycon, bool enable_console); #else static inline int acpi_parse_spcr(bool enable_earlycon, bool enable_console) { return -ENODEV; } #endif #if IS_ENABLED(CONFIG_ACPI_GENERIC_GSI) int acpi_irq_get(acpi_handle handle, unsigned int index, struct resource *res); const struct cpumask *acpi_irq_get_affinity(acpi_handle handle, unsigned int index); #else static inline int acpi_irq_get(acpi_handle handle, unsigned int index, struct resource *res) { return -EINVAL; } static inline const struct cpumask *acpi_irq_get_affinity(acpi_handle handle, unsigned int index) { return NULL; } #endif #ifdef CONFIG_ACPI_LPIT int lpit_read_residency_count_address(u64 *address); #else static inline int lpit_read_residency_count_address(u64 *address) { return -EINVAL; } #endif #ifdef CONFIG_ACPI_PROCESSOR_IDLE #ifndef arch_get_idle_state_flags static inline unsigned int arch_get_idle_state_flags(u32 arch_flags) { return 0; } #endif #endif /* CONFIG_ACPI_PROCESSOR_IDLE */ #ifdef CONFIG_ACPI_PPTT int acpi_pptt_cpu_is_thread(unsigned int cpu); int find_acpi_cpu_topology(unsigned int cpu, int level); int find_acpi_cpu_topology_cluster(unsigned int cpu); int find_acpi_cpu_topology_package(unsigned int cpu); int find_acpi_cpu_topology_hetero_id(unsigned int cpu); void acpi_pptt_get_cpus_from_container(u32 acpi_cpu_id, cpumask_t *cpus); int find_acpi_cache_level_from_id(u32 cache_id); int acpi_pptt_get_cpumask_from_cache_id(u32 cache_id, cpumask_t *cpus); #else static inline int acpi_pptt_cpu_is_thread(unsigned int cpu) { return -EINVAL; } static inline int find_acpi_cpu_topology(unsigned int cpu, int level) { return -EINVAL; } static inline int find_acpi_cpu_topology_cluster(unsigned int cpu) { return -EINVAL; } static inline int find_acpi_cpu_topology_package(unsigned int cpu) { return -EINVAL; } static inline int find_acpi_cpu_topology_hetero_id(unsigned int cpu) { return -EINVAL; } static inline void acpi_pptt_get_cpus_from_container(u32 acpi_cpu_id, cpumask_t *cpus) { } static inline int find_acpi_cache_level_from_id(u32 cache_id) { return -ENOENT; } static inline int acpi_pptt_get_cpumask_from_cache_id(u32 cache_id, cpumask_t *cpus) { return -ENOENT; } #endif void acpi_arch_init(void); #ifdef CONFIG_ACPI_PCC void acpi_init_pcc(void); #else static inline void acpi_init_pcc(void) { } #endif #ifdef CONFIG_ACPI_FFH void acpi_init_ffh(void); extern int acpi_ffh_address_space_arch_setup(void *handler_ctxt, void **region_ctxt); extern int acpi_ffh_address_space_arch_handler(acpi_integer *value, void *region_context); #else static inline void acpi_init_ffh(void) { } #endif #ifdef CONFIG_ACPI extern void acpi_device_notify(struct device *dev); extern void acpi_device_notify_remove(struct device *dev); #else static inline void acpi_device_notify(struct device *dev) { } static inline void acpi_device_notify_remove(struct device *dev) { } #endif static inline void acpi_use_parent_companion(struct device *dev) { ACPI_COMPANION_SET(dev, ACPI_COMPANION(dev->parent)); } #ifdef CONFIG_ACPI_NUMA bool acpi_node_backed_by_real_pxm(int nid); #else static inline bool acpi_node_backed_by_real_pxm(int nid) { return false; } #endif #endif /*_LINUX_ACPI_H*/
4 4 4 4 6 2 3 1 3 1 4 4 6 6 8 8 8 8 7 1 6 2 6 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 // SPDX-License-Identifier: GPL-2.0-only /* * Hauppauge HD PVR USB driver * * Copyright (C) 2001-2004 Greg Kroah-Hartman (greg@kroah.com) * Copyright (C) 2008 Janne Grunau (j@jannau.net) * Copyright (C) 2008 John Poet */ #include <linux/kernel.h> #include <linux/errno.h> #include <linux/init.h> #include <linux/slab.h> #include <linux/module.h> #include <linux/uaccess.h> #include <linux/atomic.h> #include <linux/usb.h> #include <linux/mutex.h> #include <linux/i2c.h> #include <linux/videodev2.h> #include <media/v4l2-dev.h> #include <media/v4l2-common.h> #include "hdpvr.h" static int video_nr[HDPVR_MAX] = {[0 ... (HDPVR_MAX - 1)] = UNSET}; module_param_array(video_nr, int, NULL, 0); MODULE_PARM_DESC(video_nr, "video device number (-1=Auto)"); /* holds the number of currently registered devices */ static atomic_t dev_nr = ATOMIC_INIT(-1); int hdpvr_debug; module_param(hdpvr_debug, int, S_IRUGO|S_IWUSR); MODULE_PARM_DESC(hdpvr_debug, "enable debugging output"); static uint default_video_input = HDPVR_VIDEO_INPUTS; module_param(default_video_input, uint, S_IRUGO|S_IWUSR); MODULE_PARM_DESC(default_video_input, "default video input: 0=Component / 1=S-Video / 2=Composite"); static uint default_audio_input = HDPVR_AUDIO_INPUTS; module_param(default_audio_input, uint, S_IRUGO|S_IWUSR); MODULE_PARM_DESC(default_audio_input, "default audio input: 0=RCA back / 1=RCA front / 2=S/PDIF"); static bool boost_audio; module_param(boost_audio, bool, S_IRUGO|S_IWUSR); MODULE_PARM_DESC(boost_audio, "boost the audio signal"); /* table of devices that work with this driver */ static const struct usb_device_id hdpvr_table[] = { { USB_DEVICE(HD_PVR_VENDOR_ID, HD_PVR_PRODUCT_ID) }, { USB_DEVICE(HD_PVR_VENDOR_ID, HD_PVR_PRODUCT_ID1) }, { USB_DEVICE(HD_PVR_VENDOR_ID, HD_PVR_PRODUCT_ID2) }, { USB_DEVICE(HD_PVR_VENDOR_ID, HD_PVR_PRODUCT_ID3) }, { USB_DEVICE(HD_PVR_VENDOR_ID, HD_PVR_PRODUCT_ID4) }, { } /* Terminating entry */ }; MODULE_DEVICE_TABLE(usb, hdpvr_table); void hdpvr_delete(struct hdpvr_device *dev) { hdpvr_free_buffers(dev); usb_put_dev(dev->udev); } static void challenge(u8 *bytes) { __le64 *i64P; u64 tmp64; uint i, idx; for (idx = 0; idx < 32; ++idx) { if (idx & 0x3) bytes[(idx >> 3) + 3] = bytes[(idx >> 2) & 0x3]; switch (idx & 0x3) { case 0x3: bytes[2] += bytes[3] * 4 + bytes[4] + bytes[5]; bytes[4] += bytes[(idx & 0x1) * 2] * 9 + 9; break; case 0x1: bytes[0] *= 8; bytes[0] += 7*idx + 4; bytes[6] += bytes[3] * 3; break; case 0x0: bytes[3 - (idx >> 3)] = bytes[idx >> 2]; bytes[5] += bytes[6] * 3; for (i = 0; i < 3; i++) bytes[3] *= bytes[3] + 1; break; case 0x2: for (i = 0; i < 3; i++) bytes[1] *= bytes[6] + 1; for (i = 0; i < 3; i++) { i64P = (__le64 *)bytes; tmp64 = le64_to_cpup(i64P); tmp64 = tmp64 + (tmp64 << (bytes[7] & 0x0f)); *i64P = cpu_to_le64(tmp64); } break; } } } /* try to init the device like the windows driver */ static int device_authorization(struct hdpvr_device *dev) { int ret, retval = -ENOMEM; char request_type = 0x38, rcv_request = 0x81; char *response; mutex_lock(&dev->usbc_mutex); ret = usb_control_msg(dev->udev, usb_rcvctrlpipe(dev->udev, 0), rcv_request, 0x80 | request_type, 0x0400, 0x0003, dev->usbc_buf, 46, 10000); if (ret != 46) { v4l2_err(&dev->v4l2_dev, "unexpected answer of status request, len %d\n", ret); goto unlock; } #ifdef HDPVR_DEBUG else { v4l2_dbg(MSG_INFO, hdpvr_debug, &dev->v4l2_dev, "Status request returned, len %d: %46ph\n", ret, dev->usbc_buf); } #endif dev->fw_ver = dev->usbc_buf[1]; dev->usbc_buf[46] = '\0'; v4l2_info(&dev->v4l2_dev, "firmware version 0x%x dated %s\n", dev->fw_ver, &dev->usbc_buf[2]); if (dev->fw_ver > 0x15) { dev->options.brightness = 0x80; dev->options.contrast = 0x40; dev->options.hue = 0xf; dev->options.saturation = 0x40; dev->options.sharpness = 0x80; } switch (dev->fw_ver) { case HDPVR_FIRMWARE_VERSION: dev->flags &= ~HDPVR_FLAG_AC3_CAP; break; case HDPVR_FIRMWARE_VERSION_AC3: case HDPVR_FIRMWARE_VERSION_0X12: case HDPVR_FIRMWARE_VERSION_0X15: case HDPVR_FIRMWARE_VERSION_0X1E: dev->flags |= HDPVR_FLAG_AC3_CAP; break; default: v4l2_info(&dev->v4l2_dev, "untested firmware, the driver might not work.\n"); if (dev->fw_ver >= HDPVR_FIRMWARE_VERSION_AC3) dev->flags |= HDPVR_FLAG_AC3_CAP; else dev->flags &= ~HDPVR_FLAG_AC3_CAP; } response = dev->usbc_buf+38; #ifdef HDPVR_DEBUG v4l2_dbg(MSG_INFO, hdpvr_debug, &dev->v4l2_dev, "challenge: %8ph\n", response); #endif challenge(response); #ifdef HDPVR_DEBUG v4l2_dbg(MSG_INFO, hdpvr_debug, &dev->v4l2_dev, " response: %8ph\n", response); #endif msleep(100); ret = usb_control_msg(dev->udev, usb_sndctrlpipe(dev->udev, 0), 0xd1, 0x00 | request_type, 0x0000, 0x0000, response, 8, 10000); v4l2_dbg(MSG_INFO, hdpvr_debug, &dev->v4l2_dev, "magic request returned %d\n", ret); retval = ret != 8; unlock: mutex_unlock(&dev->usbc_mutex); return retval; } static int hdpvr_device_init(struct hdpvr_device *dev) { int ret; u8 *buf; if (device_authorization(dev)) return -EACCES; /* default options for init */ hdpvr_set_options(dev); /* set filter options */ mutex_lock(&dev->usbc_mutex); buf = dev->usbc_buf; buf[0] = 0x03; buf[1] = 0x03; buf[2] = 0x00; buf[3] = 0x00; ret = usb_control_msg(dev->udev, usb_sndctrlpipe(dev->udev, 0), 0x01, 0x38, CTRL_LOW_PASS_FILTER_VALUE, CTRL_DEFAULT_INDEX, buf, 4, 1000); v4l2_dbg(MSG_INFO, hdpvr_debug, &dev->v4l2_dev, "control request returned %d\n", ret); mutex_unlock(&dev->usbc_mutex); /* enable fan and bling leds */ mutex_lock(&dev->usbc_mutex); buf[0] = 0x1; ret = usb_control_msg(dev->udev, usb_sndctrlpipe(dev->udev, 0), 0xd4, 0x38, 0, 0, buf, 1, 1000); v4l2_dbg(MSG_INFO, hdpvr_debug, &dev->v4l2_dev, "control request returned %d\n", ret); /* boost analog audio */ buf[0] = boost_audio; ret = usb_control_msg(dev->udev, usb_sndctrlpipe(dev->udev, 0), 0xd5, 0x38, 0, 0, buf, 1, 1000); v4l2_dbg(MSG_INFO, hdpvr_debug, &dev->v4l2_dev, "control request returned %d\n", ret); mutex_unlock(&dev->usbc_mutex); dev->status = STATUS_IDLE; return 0; } static const struct hdpvr_options hdpvr_default_options = { .video_std = HDPVR_60HZ, .video_input = HDPVR_COMPONENT, .audio_input = HDPVR_RCA_BACK, .bitrate = 65, /* 6 mbps */ .peak_bitrate = 90, /* 9 mbps */ .bitrate_mode = HDPVR_CONSTANT, .gop_mode = HDPVR_SIMPLE_IDR_GOP, .audio_codec = V4L2_MPEG_AUDIO_ENCODING_AAC, /* original picture controls for firmware version <= 0x15 */ /* updated in device_authorization() for newer firmware */ .brightness = 0x86, .contrast = 0x80, .hue = 0x80, .saturation = 0x80, .sharpness = 0x80, }; static int hdpvr_probe(struct usb_interface *interface, const struct usb_device_id *id) { struct hdpvr_device *dev; struct usb_host_interface *iface_desc; struct usb_endpoint_descriptor *endpoint; #if IS_ENABLED(CONFIG_I2C) struct i2c_client *client; #endif size_t buffer_size; int i; int dev_num; int retval = -ENOMEM; /* allocate memory for our device state and initialize it */ dev = kzalloc_obj(*dev); if (!dev) { dev_err(&interface->dev, "Out of memory\n"); goto error; } /* init video transfer queues first of all */ /* to prevent oops in hdpvr_delete() on error paths */ INIT_LIST_HEAD(&dev->free_buff_list); INIT_LIST_HEAD(&dev->rec_buff_list); /* register v4l2_device early so it can be used for printks */ if (v4l2_device_register(&interface->dev, &dev->v4l2_dev)) { dev_err(&interface->dev, "v4l2_device_register failed\n"); goto error_free_dev; } mutex_init(&dev->io_mutex); mutex_init(&dev->i2c_mutex); mutex_init(&dev->usbc_mutex); dev->usbc_buf = kmalloc(64, GFP_KERNEL); if (!dev->usbc_buf) { v4l2_err(&dev->v4l2_dev, "Out of memory\n"); goto error_v4l2_unregister; } init_waitqueue_head(&dev->wait_buffer); init_waitqueue_head(&dev->wait_data); dev->options = hdpvr_default_options; if (default_video_input < HDPVR_VIDEO_INPUTS) dev->options.video_input = default_video_input; if (default_audio_input < HDPVR_AUDIO_INPUTS) { dev->options.audio_input = default_audio_input; if (default_audio_input == HDPVR_SPDIF) dev->options.audio_codec = V4L2_MPEG_AUDIO_ENCODING_AC3; } dev->udev = usb_get_dev(interface_to_usbdev(interface)); /* set up the endpoint information */ /* use only the first bulk-in and bulk-out endpoints */ iface_desc = interface->cur_altsetting; for (i = 0; i < iface_desc->desc.bNumEndpoints; ++i) { endpoint = &iface_desc->endpoint[i].desc; if (!dev->bulk_in_endpointAddr && usb_endpoint_is_bulk_in(endpoint)) { /* USB interface description is buggy, reported max * packet size is 512 bytes, windows driver uses 8192 */ buffer_size = 8192; dev->bulk_in_size = buffer_size; dev->bulk_in_endpointAddr = endpoint->bEndpointAddress; } } if (!dev->bulk_in_endpointAddr) { v4l2_err(&dev->v4l2_dev, "Could not find bulk-in endpoint\n"); goto error_put_usb; } /* init the device */ if (hdpvr_device_init(dev)) { v4l2_err(&dev->v4l2_dev, "device init failed\n"); goto error_put_usb; } mutex_lock(&dev->io_mutex); if (hdpvr_alloc_buffers(dev, NUM_BUFFERS)) { mutex_unlock(&dev->io_mutex); v4l2_err(&dev->v4l2_dev, "allocating transfer buffers failed\n"); goto error_put_usb; } mutex_unlock(&dev->io_mutex); #if IS_ENABLED(CONFIG_I2C) retval = hdpvr_register_i2c_adapter(dev); if (retval < 0) { v4l2_err(&dev->v4l2_dev, "i2c adapter register failed\n"); goto error_free_buffers; } client = hdpvr_register_ir_i2c(dev); if (IS_ERR(client)) { v4l2_err(&dev->v4l2_dev, "i2c IR device register failed\n"); retval = PTR_ERR(client); goto reg_fail; } #endif dev_num = atomic_inc_return(&dev_nr); if (dev_num >= HDPVR_MAX) { v4l2_err(&dev->v4l2_dev, "max device number reached, device register failed\n"); atomic_dec(&dev_nr); retval = -ENODEV; goto reg_fail; } retval = hdpvr_register_videodev(dev, &interface->dev, video_nr[dev_num]); if (retval < 0) { v4l2_err(&dev->v4l2_dev, "registering videodev failed\n"); goto reg_fail; } /* let the user know what node this device is now attached to */ v4l2_info(&dev->v4l2_dev, "device now attached to %s\n", video_device_node_name(&dev->video_dev)); return 0; reg_fail: #if IS_ENABLED(CONFIG_I2C) i2c_del_adapter(&dev->i2c_adapter); error_free_buffers: #endif hdpvr_free_buffers(dev); error_put_usb: usb_put_dev(dev->udev); kfree(dev->usbc_buf); error_v4l2_unregister: v4l2_device_unregister(&dev->v4l2_dev); error_free_dev: kfree(dev); error: return retval; } static void hdpvr_disconnect(struct usb_interface *interface) { struct hdpvr_device *dev = to_hdpvr_dev(usb_get_intfdata(interface)); v4l2_info(&dev->v4l2_dev, "device %s disconnected\n", video_device_node_name(&dev->video_dev)); /* prevent more I/O from starting and stop any ongoing */ mutex_lock(&dev->io_mutex); dev->status = STATUS_DISCONNECTED; wake_up_interruptible(&dev->wait_data); wake_up_interruptible(&dev->wait_buffer); mutex_unlock(&dev->io_mutex); v4l2_device_disconnect(&dev->v4l2_dev); msleep(100); flush_work(&dev->worker); mutex_lock(&dev->io_mutex); hdpvr_cancel_queue(dev); mutex_unlock(&dev->io_mutex); #if IS_ENABLED(CONFIG_I2C) i2c_del_adapter(&dev->i2c_adapter); #endif video_unregister_device(&dev->video_dev); atomic_dec(&dev_nr); } static struct usb_driver hdpvr_usb_driver = { .name = "hdpvr", .probe = hdpvr_probe, .disconnect = hdpvr_disconnect, .id_table = hdpvr_table, }; module_usb_driver(hdpvr_usb_driver); MODULE_LICENSE("GPL"); MODULE_VERSION("0.2.1"); MODULE_AUTHOR("Janne Grunau"); MODULE_DESCRIPTION("Hauppauge HD PVR driver");
78 32 75 94 6 13 4 169 1 158 158 12 9 1 7 2 23 8 1 1 1 2 1 1 1 1 3 6 1 6 4 2 2 1 1 2 2 2 2 2 2 169 23 17 23 21 6 6 6 6 169 169 6 6 6 6 6 6 6 6 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 // SPDX-License-Identifier: GPL-2.0-or-later /************************************************************ * EFI GUID Partition Table handling * * http://www.uefi.org/specs/ * http://www.intel.com/technology/efi/ * * efi.[ch] by Matt Domsch <Matt_Domsch@dell.com> * Copyright 2000,2001,2002,2004 Dell Inc. * * TODO: * * Changelog: * Mon August 5th, 2013 Davidlohr Bueso <davidlohr@hp.com> * - detect hybrid MBRs, tighter pMBR checking & cleanups. * * Mon Nov 09 2004 Matt Domsch <Matt_Domsch@dell.com> * - test for valid PMBR and valid PGPT before ever reading * AGPT, allow override with 'gpt' kernel command line option. * - check for first/last_usable_lba outside of size of disk * * Tue Mar 26 2002 Matt Domsch <Matt_Domsch@dell.com> * - Ported to 2.5.7-pre1 and 2.5.7-dj2 * - Applied patch to avoid fault in alternate header handling * - cleaned up find_valid_gpt * - On-disk structure and copy in memory is *always* LE now - * swab fields as needed * - remove print_gpt_header() * - only use first max_p partition entries, to keep the kernel minor number * and partition numbers tied. * * Mon Feb 04 2002 Matt Domsch <Matt_Domsch@dell.com> * - Removed __PRIPTR_PREFIX - not being used * * Mon Jan 14 2002 Matt Domsch <Matt_Domsch@dell.com> * - Ported to 2.5.2-pre11 + library crc32 patch Linus applied * * Thu Dec 6 2001 Matt Domsch <Matt_Domsch@dell.com> * - Added compare_gpts(). * - moved le_efi_guid_to_cpus() back into this file. GPT is the only * thing that keeps EFI GUIDs on disk. * - Changed gpt structure names and members to be simpler and more Linux-like. * * Wed Oct 17 2001 Matt Domsch <Matt_Domsch@dell.com> * - Removed CONFIG_DEVFS_VOLUMES_UUID code entirely per Martin Wilck * * Wed Oct 10 2001 Matt Domsch <Matt_Domsch@dell.com> * - Changed function comments to DocBook style per Andreas Dilger suggestion. * * Mon Oct 08 2001 Matt Domsch <Matt_Domsch@dell.com> * - Change read_lba() to use the page cache per Al Viro's work. * - print u64s properly on all architectures * - fixed debug_printk(), now Dprintk() * * Mon Oct 01 2001 Matt Domsch <Matt_Domsch@dell.com> * - Style cleanups * - made most functions static * - Endianness addition * - remove test for second alternate header, as it's not per spec, * and is unnecessary. There's now a method to read/write the last * sector of an odd-sized disk from user space. No tools have ever * been released which used this code, so it's effectively dead. * - Per Asit Mallick of Intel, added a test for a valid PMBR. * - Added kernel command line option 'gpt' to override valid PMBR test. * * Wed Jun 6 2001 Martin Wilck <Martin.Wilck@Fujitsu-Siemens.com> * - added devfs volume UUID support (/dev/volumes/uuids) for * mounting file systems by the partition GUID. * * Tue Dec 5 2000 Matt Domsch <Matt_Domsch@dell.com> * - Moved crc32() to linux/lib, added efi_crc32(). * * Thu Nov 30 2000 Matt Domsch <Matt_Domsch@dell.com> * - Replaced Intel's CRC32 function with an equivalent * non-license-restricted version. * * Wed Oct 25 2000 Matt Domsch <Matt_Domsch@dell.com> * - Fixed the last_lba() call to return the proper last block * * Thu Oct 12 2000 Matt Domsch <Matt_Domsch@dell.com> * - Thanks to Andries Brouwer for his debugging assistance. * - Code works, detects all the partitions. * ************************************************************/ #include <linux/kernel.h> #include <linux/crc32.h> #include <linux/ctype.h> #include <linux/math64.h> #include <linux/slab.h> #include "check.h" #include "efi.h" /* This allows a kernel command line option 'gpt' to override * the test for invalid PMBR. Not __initdata because reloading * the partition tables happens after init too. */ static int force_gpt; static int __init force_gpt_fn(char *str) { force_gpt = 1; return 1; } __setup("gpt", force_gpt_fn); /** * efi_crc32() - EFI version of crc32 function * @buf: buffer to calculate crc32 of * @len: length of buf * * Description: Returns EFI-style CRC32 value for @buf * * This function uses the little endian Ethernet polynomial * but seeds the function with ~0, and xor's with ~0 at the end. * Note, the EFI Specification, v1.02, has a reference to * Dr. Dobbs Journal, May 1994 (actually it's in May 1992). */ static inline u32 efi_crc32(const void *buf, unsigned long len) { return (crc32(~0L, buf, len) ^ ~0L); } /** * last_lba(): return number of last logical block of device * @disk: block device * * Description: Returns last LBA value on success, 0 on error. * This is stored (by sd and ide-geometry) in * the part[0] entry for this disk, and is the number of * physical sectors available on the disk. */ static u64 last_lba(struct gendisk *disk) { return div_u64(bdev_nr_bytes(disk->part0), queue_logical_block_size(disk->queue)) - 1ULL; } static inline int pmbr_part_valid(gpt_mbr_record *part) { if (part->os_type != EFI_PMBR_OSTYPE_EFI_GPT) goto invalid; /* set to 0x00000001 (i.e., the LBA of the GPT Partition Header) */ if (le32_to_cpu(part->starting_lba) != GPT_PRIMARY_PARTITION_TABLE_LBA) goto invalid; return GPT_MBR_PROTECTIVE; invalid: return 0; } /** * is_pmbr_valid(): test Protective MBR for validity * @mbr: pointer to a legacy mbr structure * @total_sectors: amount of sectors in the device * * Description: Checks for a valid protective or hybrid * master boot record (MBR). The validity of a pMBR depends * on all of the following properties: * 1) MSDOS signature is in the last two bytes of the MBR * 2) One partition of type 0xEE is found * * In addition, a hybrid MBR will have up to three additional * primary partitions, which point to the same space that's * marked out by up to three GPT partitions. * * Returns 0 upon invalid MBR, or GPT_MBR_PROTECTIVE or * GPT_MBR_HYBRID depending on the device layout. */ static int is_pmbr_valid(legacy_mbr *mbr, sector_t total_sectors) { uint32_t sz = 0; int i, part = 0, ret = 0; /* invalid by default */ if (!mbr || le16_to_cpu(mbr->signature) != MSDOS_MBR_SIGNATURE) goto done; for (i = 0; i < 4; i++) { ret = pmbr_part_valid(&mbr->partition_record[i]); if (ret == GPT_MBR_PROTECTIVE) { part = i; /* * Ok, we at least know that there's a protective MBR, * now check if there are other partition types for * hybrid MBR. */ goto check_hybrid; } } if (ret != GPT_MBR_PROTECTIVE) goto done; check_hybrid: for (i = 0; i < 4; i++) if ((mbr->partition_record[i].os_type != EFI_PMBR_OSTYPE_EFI_GPT) && (mbr->partition_record[i].os_type != 0x00)) ret = GPT_MBR_HYBRID; /* * Protective MBRs take up the lesser of the whole disk * or 2 TiB (32bit LBA), ignoring the rest of the disk. * Some partitioning programs, nonetheless, choose to set * the size to the maximum 32-bit limitation, disregarding * the disk size. * * Hybrid MBRs do not necessarily comply with this. * * Consider a bad value here to be a warning to support dd'ing * an image from a smaller disk to a larger disk. */ if (ret == GPT_MBR_PROTECTIVE) { sz = le32_to_cpu(mbr->partition_record[part].size_in_lba); if (sz != (uint32_t) total_sectors - 1 && sz != 0xFFFFFFFF) pr_debug("GPT: mbr size in lba (%u) different than whole disk (%u).\n", sz, (uint32_t)min(total_sectors - 1, 0xFFFFFFFF)); } done: return ret; } /** * read_lba(): Read bytes from disk, starting at given LBA * @state: disk parsed partitions * @lba: the Logical Block Address of the partition table * @buffer: destination buffer * @count: bytes to read * * Description: Reads @count bytes from @state->disk into @buffer. * Returns number of bytes read on success, 0 on error. */ static size_t read_lba(struct parsed_partitions *state, u64 lba, u8 *buffer, size_t count) { size_t totalreadcount = 0; sector_t n = lba * (queue_logical_block_size(state->disk->queue) / 512); if (!buffer || lba > last_lba(state->disk)) return 0; while (count) { int copied = 512; Sector sect; unsigned char *data = read_part_sector(state, n++, &sect); if (!data) break; if (copied > count) copied = count; memcpy(buffer, data, copied); put_dev_sector(sect); buffer += copied; totalreadcount +=copied; count -= copied; } return totalreadcount; } /** * alloc_read_gpt_entries(): reads partition entries from disk * @state: disk parsed partitions * @gpt: GPT header * * Description: Returns ptes on success, NULL on error. * Allocates space for PTEs based on information found in @gpt. * Notes: remember to free pte when you're done! */ static gpt_entry *alloc_read_gpt_entries(struct parsed_partitions *state, gpt_header *gpt) { size_t count; gpt_entry *pte; if (!gpt) return NULL; count = (size_t)le32_to_cpu(gpt->num_partition_entries) * le32_to_cpu(gpt->sizeof_partition_entry); if (!count) return NULL; pte = kmalloc(count, GFP_KERNEL); if (!pte) return NULL; if (read_lba(state, le64_to_cpu(gpt->partition_entry_lba), (u8 *) pte, count) < count) { kfree(pte); pte=NULL; return NULL; } return pte; } /** * alloc_read_gpt_header(): Allocates GPT header, reads into it from disk * @state: disk parsed partitions * @lba: the Logical Block Address of the partition table * * Description: returns GPT header on success, NULL on error. Allocates * and fills a GPT header starting at @ from @state->disk. * Note: remember to free gpt when finished with it. */ static gpt_header *alloc_read_gpt_header(struct parsed_partitions *state, u64 lba) { gpt_header *gpt; unsigned ssz = queue_logical_block_size(state->disk->queue); gpt = kmalloc(ssz, GFP_KERNEL); if (!gpt) return NULL; if (read_lba(state, lba, (u8 *) gpt, ssz) < ssz) { kfree(gpt); gpt=NULL; return NULL; } return gpt; } /** * is_gpt_valid() - tests one GPT header and PTEs for validity * @state: disk parsed partitions * @lba: logical block address of the GPT header to test * @gpt: GPT header ptr, filled on return. * @ptes: PTEs ptr, filled on return. * * Description: returns 1 if valid, 0 on error. * If valid, returns pointers to newly allocated GPT header and PTEs. */ static int is_gpt_valid(struct parsed_partitions *state, u64 lba, gpt_header **gpt, gpt_entry **ptes) { u32 crc, origcrc; u64 lastlba, pt_size; if (!ptes) return 0; if (!(*gpt = alloc_read_gpt_header(state, lba))) return 0; /* Check the GUID Partition Table signature */ if (le64_to_cpu((*gpt)->signature) != GPT_HEADER_SIGNATURE) { pr_debug("GUID Partition Table Header signature is wrong:" "%lld != %lld\n", (unsigned long long)le64_to_cpu((*gpt)->signature), (unsigned long long)GPT_HEADER_SIGNATURE); goto fail; } /* Check the GUID Partition Table header size is too big */ if (le32_to_cpu((*gpt)->header_size) > queue_logical_block_size(state->disk->queue)) { pr_debug("GUID Partition Table Header size is too large: %u > %u\n", le32_to_cpu((*gpt)->header_size), queue_logical_block_size(state->disk->queue)); goto fail; } /* Check the GUID Partition Table header size is too small */ if (le32_to_cpu((*gpt)->header_size) < sizeof(gpt_header)) { pr_debug("GUID Partition Table Header size is too small: %u < %zu\n", le32_to_cpu((*gpt)->header_size), sizeof(gpt_header)); goto fail; } /* Check the GUID Partition Table CRC */ origcrc = le32_to_cpu((*gpt)->header_crc32); (*gpt)->header_crc32 = 0; crc = efi_crc32((const unsigned char *) (*gpt), le32_to_cpu((*gpt)->header_size)); if (crc != origcrc) { pr_debug("GUID Partition Table Header CRC is wrong: %x != %x\n", crc, origcrc); goto fail; } (*gpt)->header_crc32 = cpu_to_le32(origcrc); /* Check that the my_lba entry points to the LBA that contains * the GUID Partition Table */ if (le64_to_cpu((*gpt)->my_lba) != lba) { pr_debug("GPT my_lba incorrect: %lld != %lld\n", (unsigned long long)le64_to_cpu((*gpt)->my_lba), (unsigned long long)lba); goto fail; } /* Check the first_usable_lba and last_usable_lba are * within the disk. */ lastlba = last_lba(state->disk); if (le64_to_cpu((*gpt)->first_usable_lba) > lastlba) { pr_debug("GPT: first_usable_lba incorrect: %lld > %lld\n", (unsigned long long)le64_to_cpu((*gpt)->first_usable_lba), (unsigned long long)lastlba); goto fail; } if (le64_to_cpu((*gpt)->last_usable_lba) > lastlba) { pr_debug("GPT: last_usable_lba incorrect: %lld > %lld\n", (unsigned long long)le64_to_cpu((*gpt)->last_usable_lba), (unsigned long long)lastlba); goto fail; } if (le64_to_cpu((*gpt)->last_usable_lba) < le64_to_cpu((*gpt)->first_usable_lba)) { pr_debug("GPT: last_usable_lba incorrect: %lld > %lld\n", (unsigned long long)le64_to_cpu((*gpt)->last_usable_lba), (unsigned long long)le64_to_cpu((*gpt)->first_usable_lba)); goto fail; } /* Check that sizeof_partition_entry has the correct value */ if (le32_to_cpu((*gpt)->sizeof_partition_entry) != sizeof(gpt_entry)) { pr_debug("GUID Partition Entry Size check failed.\n"); goto fail; } /* Sanity check partition table size */ pt_size = (u64)le32_to_cpu((*gpt)->num_partition_entries) * le32_to_cpu((*gpt)->sizeof_partition_entry); if (pt_size > KMALLOC_MAX_SIZE) { pr_debug("GUID Partition Table is too large: %llu > %lu bytes\n", (unsigned long long)pt_size, KMALLOC_MAX_SIZE); goto fail; } if (!(*ptes = alloc_read_gpt_entries(state, *gpt))) goto fail; /* Check the GUID Partition Entry Array CRC */ crc = efi_crc32((const unsigned char *) (*ptes), pt_size); if (crc != le32_to_cpu((*gpt)->partition_entry_array_crc32)) { pr_debug("GUID Partition Entry Array CRC check failed.\n"); goto fail_ptes; } /* We're done, all's well */ return 1; fail_ptes: kfree(*ptes); *ptes = NULL; fail: kfree(*gpt); *gpt = NULL; return 0; } /** * is_pte_valid() - tests one PTE for validity * @pte:pte to check * @lastlba: last lba of the disk * * Description: returns 1 if valid, 0 on error. */ static inline int is_pte_valid(const gpt_entry *pte, const u64 lastlba) { if ((!efi_guidcmp(pte->partition_type_guid, NULL_GUID)) || le64_to_cpu(pte->starting_lba) > lastlba || le64_to_cpu(pte->ending_lba) > lastlba) return 0; return 1; } /** * compare_gpts() - Search disk for valid GPT headers and PTEs * @pgpt: primary GPT header * @agpt: alternate GPT header * @lastlba: last LBA number * * Description: Returns nothing. Sanity checks pgpt and agpt fields * and prints warnings on discrepancies. * */ static void compare_gpts(gpt_header *pgpt, gpt_header *agpt, u64 lastlba) { int error_found = 0; if (!pgpt || !agpt) return; if (le64_to_cpu(pgpt->my_lba) != le64_to_cpu(agpt->alternate_lba)) { pr_warn("GPT:Primary header LBA != Alt. header alternate_lba\n"); pr_warn("GPT:%lld != %lld\n", (unsigned long long)le64_to_cpu(pgpt->my_lba), (unsigned long long)le64_to_cpu(agpt->alternate_lba)); error_found++; } if (le64_to_cpu(pgpt->alternate_lba) != le64_to_cpu(agpt->my_lba)) { pr_warn("GPT:Primary header alternate_lba != Alt. header my_lba\n"); pr_warn("GPT:%lld != %lld\n", (unsigned long long)le64_to_cpu(pgpt->alternate_lba), (unsigned long long)le64_to_cpu(agpt->my_lba)); error_found++; } if (le64_to_cpu(pgpt->first_usable_lba) != le64_to_cpu(agpt->first_usable_lba)) { pr_warn("GPT:first_usable_lbas don't match.\n"); pr_warn("GPT:%lld != %lld\n", (unsigned long long)le64_to_cpu(pgpt->first_usable_lba), (unsigned long long)le64_to_cpu(agpt->first_usable_lba)); error_found++; } if (le64_to_cpu(pgpt->last_usable_lba) != le64_to_cpu(agpt->last_usable_lba)) { pr_warn("GPT:last_usable_lbas don't match.\n"); pr_warn("GPT:%lld != %lld\n", (unsigned long long)le64_to_cpu(pgpt->last_usable_lba), (unsigned long long)le64_to_cpu(agpt->last_usable_lba)); error_found++; } if (efi_guidcmp(pgpt->disk_guid, agpt->disk_guid)) { pr_warn("GPT:disk_guids don't match.\n"); error_found++; } if (le32_to_cpu(pgpt->num_partition_entries) != le32_to_cpu(agpt->num_partition_entries)) { pr_warn("GPT:num_partition_entries don't match: " "0x%x != 0x%x\n", le32_to_cpu(pgpt->num_partition_entries), le32_to_cpu(agpt->num_partition_entries)); error_found++; } if (le32_to_cpu(pgpt->sizeof_partition_entry) != le32_to_cpu(agpt->sizeof_partition_entry)) { pr_warn("GPT:sizeof_partition_entry values don't match: " "0x%x != 0x%x\n", le32_to_cpu(pgpt->sizeof_partition_entry), le32_to_cpu(agpt->sizeof_partition_entry)); error_found++; } if (le32_to_cpu(pgpt->partition_entry_array_crc32) != le32_to_cpu(agpt->partition_entry_array_crc32)) { pr_warn("GPT:partition_entry_array_crc32 values don't match: " "0x%x != 0x%x\n", le32_to_cpu(pgpt->partition_entry_array_crc32), le32_to_cpu(agpt->partition_entry_array_crc32)); error_found++; } if (le64_to_cpu(pgpt->alternate_lba) != lastlba) { pr_warn("GPT:Primary header thinks Alt. header is not at the end of the disk.\n"); pr_warn("GPT:%lld != %lld\n", (unsigned long long)le64_to_cpu(pgpt->alternate_lba), (unsigned long long)lastlba); error_found++; } if (le64_to_cpu(agpt->my_lba) != lastlba) { pr_warn("GPT:Alternate GPT header not at the end of the disk.\n"); pr_warn("GPT:%lld != %lld\n", (unsigned long long)le64_to_cpu(agpt->my_lba), (unsigned long long)lastlba); error_found++; } if (error_found) pr_warn("GPT: Use GNU Parted to correct GPT errors.\n"); return; } /** * find_valid_gpt() - Search disk for valid GPT headers and PTEs * @state: disk parsed partitions * @gpt: GPT header ptr, filled on return. * @ptes: PTEs ptr, filled on return. * * Description: Returns 1 if valid, 0 on error. * If valid, returns pointers to newly allocated GPT header and PTEs. * Validity depends on PMBR being valid (or being overridden by the * 'gpt' kernel command line option) and finding either the Primary * GPT header and PTEs valid, or the Alternate GPT header and PTEs * valid. If the Primary GPT header is not valid, the Alternate GPT header * is not checked unless the 'gpt' kernel command line option is passed. * This protects against devices which misreport their size, and forces * the user to decide to use the Alternate GPT. */ static int find_valid_gpt(struct parsed_partitions *state, gpt_header **gpt, gpt_entry **ptes) { int good_pgpt = 0, good_agpt = 0, good_pmbr = 0; gpt_header *pgpt = NULL, *agpt = NULL; gpt_entry *pptes = NULL, *aptes = NULL; legacy_mbr *legacymbr; struct gendisk *disk = state->disk; const struct block_device_operations *fops = disk->fops; sector_t total_sectors = get_capacity(state->disk); u64 lastlba; if (!ptes) return 0; lastlba = last_lba(state->disk); if (!force_gpt) { /* This will be added to the EFI Spec. per Intel after v1.02. */ legacymbr = kzalloc_obj(*legacymbr); if (!legacymbr) goto fail; read_lba(state, 0, (u8 *)legacymbr, sizeof(*legacymbr)); good_pmbr = is_pmbr_valid(legacymbr, total_sectors); kfree(legacymbr); if (!good_pmbr) goto fail; pr_debug("Device has a %s MBR\n", good_pmbr == GPT_MBR_PROTECTIVE ? "protective" : "hybrid"); } good_pgpt = is_gpt_valid(state, GPT_PRIMARY_PARTITION_TABLE_LBA, &pgpt, &pptes); if (good_pgpt) good_agpt = is_gpt_valid(state, le64_to_cpu(pgpt->alternate_lba), &agpt, &aptes); if (!good_agpt && force_gpt) good_agpt = is_gpt_valid(state, lastlba, &agpt, &aptes); if (!good_agpt && force_gpt && fops->alternative_gpt_sector) { sector_t agpt_sector; int err; err = fops->alternative_gpt_sector(disk, &agpt_sector); if (!err) good_agpt = is_gpt_valid(state, agpt_sector, &agpt, &aptes); } /* The obviously unsuccessful case */ if (!good_pgpt && !good_agpt) goto fail; compare_gpts(pgpt, agpt, lastlba); /* The good cases */ if (good_pgpt) { *gpt = pgpt; *ptes = pptes; kfree(agpt); kfree(aptes); if (!good_agpt) pr_warn("Alternate GPT is invalid, using primary GPT.\n"); return 1; } else if (good_agpt) { *gpt = agpt; *ptes = aptes; kfree(pgpt); kfree(pptes); pr_warn("Primary GPT is invalid, using alternate GPT.\n"); return 1; } fail: kfree(pgpt); kfree(agpt); kfree(pptes); kfree(aptes); *gpt = NULL; *ptes = NULL; return 0; } /** * utf16_le_to_7bit(): Naively converts a UTF-16LE string to 7-bit ASCII characters * @in: input UTF-16LE string * @size: size of the input string * @out: output string ptr, should be capable to store @size+1 characters * * Description: Converts @size UTF16-LE symbols from @in string to 7-bit * ASCII characters and stores them to @out. Adds trailing zero to @out array. */ static void utf16_le_to_7bit(const __le16 *in, unsigned int size, u8 *out) { unsigned int i = 0; out[size] = 0; while (i < size) { u8 c = le16_to_cpu(in[i]) & 0x7f; if (c && !isprint(c)) c = '!'; out[i] = c; i++; } } /** * efi_partition - scan for GPT partitions * @state: disk parsed partitions * * Description: called from check.c, if the disk contains GPT * partitions, sets up partition entries in the kernel. * * If the first block on the disk is a legacy MBR, * it will get handled by msdos_partition(). * If it's a Protective MBR, we'll handle it here. * * We do not create a Linux partition for GPT, but * only for the actual data partitions. * Returns: * -1 if unable to read the partition table * 0 if this isn't our partition table * 1 if successful * */ int efi_partition(struct parsed_partitions *state) { gpt_header *gpt = NULL; gpt_entry *ptes = NULL; u32 i; unsigned ssz = queue_logical_block_size(state->disk->queue) / 512; if (!find_valid_gpt(state, &gpt, &ptes) || !gpt || !ptes) { kfree(gpt); kfree(ptes); return 0; } pr_debug("GUID Partition Table is valid! Yea!\n"); for (i = 0; i < le32_to_cpu(gpt->num_partition_entries) && i < state->limit-1; i++) { struct partition_meta_info *info; unsigned label_max; u64 start = le64_to_cpu(ptes[i].starting_lba); u64 size = le64_to_cpu(ptes[i].ending_lba) - le64_to_cpu(ptes[i].starting_lba) + 1ULL; if (!is_pte_valid(&ptes[i], last_lba(state->disk))) continue; put_partition(state, i+1, start * ssz, size * ssz); /* If this is a RAID volume, tell md */ if (!efi_guidcmp(ptes[i].partition_type_guid, PARTITION_LINUX_RAID_GUID)) state->parts[i + 1].flags = ADDPART_FLAG_RAID; info = &state->parts[i + 1].info; efi_guid_to_str(&ptes[i].unique_partition_guid, info->uuid); /* Naively convert UTF16-LE to 7 bits. */ label_max = min(ARRAY_SIZE(info->volname) - 1, ARRAY_SIZE(ptes[i].partition_name)); utf16_le_to_7bit(ptes[i].partition_name, label_max, info->volname); state->parts[i + 1].has_info = true; } kfree(ptes); kfree(gpt); strlcat(state->pp_buf, "\n", PAGE_SIZE); return 1; }
44 30 30 30 6 6 9 33 2 25 25 25 16 16 127 127 16 16 6 6 796 793 2 32 25 1 7 1 1 790 704 619 1 3 1 1 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 // SPDX-License-Identifier: GPL-2.0 /* * This contains functions for filename crypto management * * Copyright (C) 2015, Google, Inc. * Copyright (C) 2015, Motorola Mobility * * Written by Uday Savagaonkar, 2014. * Modified by Jaegeuk Kim, 2015. * * This has not yet undergone a rigorous security audit. */ #include <crypto/sha2.h> #include <crypto/skcipher.h> #include <linux/export.h> #include <linux/namei.h> #include <linux/scatterlist.h> #include <linux/base64.h> #include "fscrypt_private.h" /* * The minimum message length (input and output length), in bytes, for all * filenames encryption modes. Filenames shorter than this will be zero-padded * before being encrypted. */ #define FSCRYPT_FNAME_MIN_MSG_LEN 16 /* * struct fscrypt_nokey_name - identifier for directory entry when key is absent * * When userspace lists an encrypted directory without access to the key, the * filesystem must present a unique "no-key name" for each filename that allows * it to find the directory entry again if requested. Naively, that would just * mean using the ciphertext filenames. However, since the ciphertext filenames * can contain illegal characters ('\0' and '/'), they must be encoded in some * way. We use base64url. But that can cause names to exceed NAME_MAX (255 * bytes), so we also need to use a strong hash to abbreviate long names. * * The filesystem may also need another kind of hash, the "dirhash", to quickly * find the directory entry. Since filesystems normally compute the dirhash * over the on-disk filename (i.e. the ciphertext), it's not computable from * no-key names that abbreviate the ciphertext using the strong hash to fit in * NAME_MAX. It's also not computable if it's a keyed hash taken over the * plaintext (but it may still be available in the on-disk directory entry); * casefolded directories use this type of dirhash. At least in these cases, * each no-key name must include the name's dirhash too. * * To meet all these requirements, we base64url-encode the following * variable-length structure. It contains the dirhash, or 0's if the filesystem * didn't provide one; up to 149 bytes of the ciphertext name; and for * ciphertexts longer than 149 bytes, also the SHA-256 of the remaining bytes. * * This ensures that each no-key name contains everything needed to find the * directory entry again, contains only legal characters, doesn't exceed * NAME_MAX, is unambiguous unless there's a SHA-256 collision, and that we only * take the performance hit of SHA-256 on very long filenames (which are rare). */ struct fscrypt_nokey_name { u32 dirhash[2]; u8 bytes[149]; u8 sha256[SHA256_DIGEST_SIZE]; }; /* 189 bytes => 252 bytes base64url-encoded, which is <= NAME_MAX (255) */ /* * Decoded size of max-size no-key name, i.e. a name that was abbreviated using * the strong hash and thus includes the 'sha256' field. This isn't simply * sizeof(struct fscrypt_nokey_name), as the padding at the end isn't included. */ #define FSCRYPT_NOKEY_NAME_MAX offsetofend(struct fscrypt_nokey_name, sha256) /* Encoded size of max-size no-key name */ #define FSCRYPT_NOKEY_NAME_MAX_ENCODED \ BASE64_CHARS(FSCRYPT_NOKEY_NAME_MAX) static inline bool fscrypt_is_dot_dotdot(const struct qstr *str) { return name_is_dot_dotdot(str->name, str->len); } /** * fscrypt_fname_encrypt() - encrypt a filename * @inode: inode of the parent directory (for regular filenames) * or of the symlink (for symlink targets). Key must already be * set up. * @iname: the filename to encrypt * @out: (output) the encrypted filename * @olen: size of the encrypted filename. It must be at least @iname->len. * Any extra space is filled with NUL padding before encryption. * * Return: 0 on success, -errno on failure */ int fscrypt_fname_encrypt(const struct inode *inode, const struct qstr *iname, u8 *out, unsigned int olen) { const struct fscrypt_inode_info *ci = fscrypt_get_inode_info_raw(inode); struct crypto_sync_skcipher *tfm = ci->ci_enc_key.tfm; SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm); union fscrypt_iv iv; struct scatterlist sg; int err; /* * Copy the filename to the output buffer for encrypting in-place and * pad it with the needed number of NUL bytes. */ if (WARN_ON_ONCE(olen < iname->len)) return -ENOBUFS; memcpy(out, iname->name, iname->len); memset(out + iname->len, 0, olen - iname->len); fscrypt_generate_iv(&iv, 0, ci); skcipher_request_set_callback( req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, NULL, NULL); sg_init_one(&sg, out, olen); skcipher_request_set_crypt(req, &sg, &sg, olen, &iv); err = crypto_skcipher_encrypt(req); if (err) fscrypt_err(inode, "Filename encryption failed: %d", err); return err; } EXPORT_SYMBOL_GPL(fscrypt_fname_encrypt); /** * fname_decrypt() - decrypt a filename * @inode: inode of the parent directory (for regular filenames) * or of the symlink (for symlink targets) * @iname: the encrypted filename to decrypt * @oname: (output) the decrypted filename. The caller must have allocated * enough space for this, e.g. using fscrypt_fname_alloc_buffer(). * * Return: 0 on success, -errno on failure */ static int fname_decrypt(const struct inode *inode, const struct fscrypt_str *iname, struct fscrypt_str *oname) { const struct fscrypt_inode_info *ci = fscrypt_get_inode_info_raw(inode); struct crypto_sync_skcipher *tfm = ci->ci_enc_key.tfm; SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm); union fscrypt_iv iv; struct scatterlist src_sg, dst_sg; int err; fscrypt_generate_iv(&iv, 0, ci); skcipher_request_set_callback( req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, NULL, NULL); sg_init_one(&src_sg, iname->name, iname->len); sg_init_one(&dst_sg, oname->name, oname->len); skcipher_request_set_crypt(req, &src_sg, &dst_sg, iname->len, &iv); err = crypto_skcipher_decrypt(req); if (err) { fscrypt_err(inode, "Filename decryption failed: %d", err); return err; } oname->len = strnlen(oname->name, iname->len); return 0; } bool __fscrypt_fname_encrypted_size(const union fscrypt_policy *policy, u32 orig_len, u32 max_len, u32 *encrypted_len_ret) { int padding = 4 << (fscrypt_policy_flags(policy) & FSCRYPT_POLICY_FLAGS_PAD_MASK); u32 encrypted_len; if (orig_len > max_len) return false; encrypted_len = max_t(u32, orig_len, FSCRYPT_FNAME_MIN_MSG_LEN); encrypted_len = round_up(encrypted_len, padding); *encrypted_len_ret = min(encrypted_len, max_len); return true; } /** * fscrypt_fname_encrypted_size() - calculate length of encrypted filename * @inode: parent inode of dentry name being encrypted. Key must * already be set up. * @orig_len: length of the original filename * @max_len: maximum length to return * @encrypted_len_ret: where calculated length should be returned (on success) * * Filenames that are shorter than the maximum length may have their lengths * increased slightly by encryption, due to padding that is applied. * * Return: false if the orig_len is greater than max_len. Otherwise, true and * fill out encrypted_len_ret with the length (up to max_len). */ bool fscrypt_fname_encrypted_size(const struct inode *inode, u32 orig_len, u32 max_len, u32 *encrypted_len_ret) { const struct fscrypt_inode_info *ci = fscrypt_get_inode_info_raw(inode); return __fscrypt_fname_encrypted_size(&ci->ci_policy, orig_len, max_len, encrypted_len_ret); } EXPORT_SYMBOL_GPL(fscrypt_fname_encrypted_size); /** * fscrypt_fname_alloc_buffer() - allocate a buffer for presented filenames * @max_encrypted_len: maximum length of encrypted filenames the buffer will be * used to present * @crypto_str: (output) buffer to allocate * * Allocate a buffer that is large enough to hold any decrypted or encoded * filename (null-terminated), for the given maximum encrypted filename length. * * Return: 0 on success, -errno on failure */ int fscrypt_fname_alloc_buffer(u32 max_encrypted_len, struct fscrypt_str *crypto_str) { u32 max_presented_len = max_t(u32, FSCRYPT_NOKEY_NAME_MAX_ENCODED, max_encrypted_len); crypto_str->name = kmalloc(max_presented_len + 1, GFP_NOFS); if (!crypto_str->name) return -ENOMEM; crypto_str->len = max_presented_len; return 0; } EXPORT_SYMBOL(fscrypt_fname_alloc_buffer); /** * fscrypt_fname_free_buffer() - free a buffer for presented filenames * @crypto_str: the buffer to free * * Free a buffer that was allocated by fscrypt_fname_alloc_buffer(). */ void fscrypt_fname_free_buffer(struct fscrypt_str *crypto_str) { if (!crypto_str) return; kfree(crypto_str->name); crypto_str->name = NULL; } EXPORT_SYMBOL(fscrypt_fname_free_buffer); /** * fscrypt_fname_disk_to_usr() - convert an encrypted filename to * user-presentable form * @inode: inode of the parent directory (for regular filenames) * or of the symlink (for symlink targets) * @hash: first part of the name's dirhash, if applicable. This only needs to * be provided if the filename is located in an indexed directory whose * encryption key may be unavailable. Not needed for symlink targets. * @minor_hash: second part of the name's dirhash, if applicable * @iname: encrypted filename to convert. May also be "." or "..", which * aren't actually encrypted. * @oname: output buffer for the user-presentable filename. The caller must * have allocated enough space for this, e.g. using * fscrypt_fname_alloc_buffer(). * * If the key is available, we'll decrypt the disk name. Otherwise, we'll * encode it for presentation in fscrypt_nokey_name format. * See struct fscrypt_nokey_name for details. * * Return: 0 on success, -errno on failure */ int fscrypt_fname_disk_to_usr(const struct inode *inode, u32 hash, u32 minor_hash, const struct fscrypt_str *iname, struct fscrypt_str *oname) { const struct qstr qname = FSTR_TO_QSTR(iname); struct fscrypt_nokey_name nokey_name; u32 size; /* size of the unencoded no-key name */ if (fscrypt_is_dot_dotdot(&qname)) { oname->name[0] = '.'; oname->name[iname->len - 1] = '.'; oname->len = iname->len; return 0; } if (iname->len < FSCRYPT_FNAME_MIN_MSG_LEN) return -EUCLEAN; if (fscrypt_has_encryption_key(inode)) return fname_decrypt(inode, iname, oname); /* * Sanity check that struct fscrypt_nokey_name doesn't have padding * between fields and that its encoded size never exceeds NAME_MAX. */ BUILD_BUG_ON(offsetofend(struct fscrypt_nokey_name, dirhash) != offsetof(struct fscrypt_nokey_name, bytes)); BUILD_BUG_ON(offsetofend(struct fscrypt_nokey_name, bytes) != offsetof(struct fscrypt_nokey_name, sha256)); BUILD_BUG_ON(FSCRYPT_NOKEY_NAME_MAX_ENCODED > NAME_MAX); nokey_name.dirhash[0] = hash; nokey_name.dirhash[1] = minor_hash; if (iname->len <= sizeof(nokey_name.bytes)) { memcpy(nokey_name.bytes, iname->name, iname->len); size = offsetof(struct fscrypt_nokey_name, bytes[iname->len]); } else { memcpy(nokey_name.bytes, iname->name, sizeof(nokey_name.bytes)); /* Compute strong hash of remaining part of name. */ sha256(&iname->name[sizeof(nokey_name.bytes)], iname->len - sizeof(nokey_name.bytes), nokey_name.sha256); size = FSCRYPT_NOKEY_NAME_MAX; } oname->len = base64_encode((const u8 *)&nokey_name, size, oname->name, false, BASE64_URLSAFE); return 0; } EXPORT_SYMBOL(fscrypt_fname_disk_to_usr); /** * fscrypt_setup_filename() - prepare to search a possibly encrypted directory * @dir: the directory that will be searched * @iname: the user-provided filename being searched for * @lookup: 1 if we're allowed to proceed without the key because it's * ->lookup() or we're finding the dir_entry for deletion; 0 if we cannot * proceed without the key because we're going to create the dir_entry. * @fname: the filename information to be filled in * * Given a user-provided filename @iname, this function sets @fname->disk_name * to the name that would be stored in the on-disk directory entry, if possible. * If the directory is unencrypted this is simply @iname. Else, if we have the * directory's encryption key, then @iname is the plaintext, so we encrypt it to * get the disk_name. * * Else, for keyless @lookup operations, @iname should be a no-key name, so we * decode it to get the struct fscrypt_nokey_name. Non-@lookup operations will * be impossible in this case, so we fail them with ENOKEY. * * If successful, fscrypt_free_filename() must be called later to clean up. * * Return: 0 on success, -errno on failure */ int fscrypt_setup_filename(struct inode *dir, const struct qstr *iname, int lookup, struct fscrypt_name *fname) { struct fscrypt_nokey_name *nokey_name; int ret; memset(fname, 0, sizeof(struct fscrypt_name)); fname->usr_fname = iname; if (!IS_ENCRYPTED(dir) || fscrypt_is_dot_dotdot(iname)) { fname->disk_name.name = (unsigned char *)iname->name; fname->disk_name.len = iname->len; return 0; } ret = fscrypt_get_encryption_info(dir, lookup); if (ret) return ret; if (fscrypt_has_encryption_key(dir)) { if (!fscrypt_fname_encrypted_size(dir, iname->len, NAME_MAX, &fname->crypto_buf.len)) return -ENAMETOOLONG; fname->crypto_buf.name = kmalloc(fname->crypto_buf.len, GFP_NOFS); if (!fname->crypto_buf.name) return -ENOMEM; ret = fscrypt_fname_encrypt(dir, iname, fname->crypto_buf.name, fname->crypto_buf.len); if (ret) goto errout; fname->disk_name.name = fname->crypto_buf.name; fname->disk_name.len = fname->crypto_buf.len; return 0; } if (!lookup) return -ENOKEY; fname->is_nokey_name = true; /* * We don't have the key and we are doing a lookup; decode the * user-supplied name */ if (iname->len > FSCRYPT_NOKEY_NAME_MAX_ENCODED) return -ENOENT; fname->crypto_buf.name = kmalloc(FSCRYPT_NOKEY_NAME_MAX, GFP_KERNEL); if (fname->crypto_buf.name == NULL) return -ENOMEM; ret = base64_decode(iname->name, iname->len, fname->crypto_buf.name, false, BASE64_URLSAFE); if (ret < (int)offsetof(struct fscrypt_nokey_name, bytes[1]) || (ret > offsetof(struct fscrypt_nokey_name, sha256) && ret != FSCRYPT_NOKEY_NAME_MAX)) { ret = -ENOENT; goto errout; } fname->crypto_buf.len = ret; nokey_name = (void *)fname->crypto_buf.name; fname->hash = nokey_name->dirhash[0]; fname->minor_hash = nokey_name->dirhash[1]; if (ret != FSCRYPT_NOKEY_NAME_MAX) { /* The full ciphertext filename is available. */ fname->disk_name.name = nokey_name->bytes; fname->disk_name.len = ret - offsetof(struct fscrypt_nokey_name, bytes); } return 0; errout: kfree(fname->crypto_buf.name); return ret; } EXPORT_SYMBOL(fscrypt_setup_filename); /** * fscrypt_match_name() - test whether the given name matches a directory entry * @fname: the name being searched for * @de_name: the name from the directory entry * @de_name_len: the length of @de_name in bytes * * Normally @fname->disk_name will be set, and in that case we simply compare * that to the name stored in the directory entry. The only exception is that * if we don't have the key for an encrypted directory and the name we're * looking for is very long, then we won't have the full disk_name and instead * we'll need to match against a fscrypt_nokey_name that includes a strong hash. * * Return: %true if the name matches, otherwise %false. */ bool fscrypt_match_name(const struct fscrypt_name *fname, const u8 *de_name, u32 de_name_len) { const struct fscrypt_nokey_name *nokey_name = (const void *)fname->crypto_buf.name; u8 digest[SHA256_DIGEST_SIZE]; if (likely(fname->disk_name.name)) { if (de_name_len != fname->disk_name.len) return false; return !memcmp(de_name, fname->disk_name.name, de_name_len); } if (de_name_len <= sizeof(nokey_name->bytes)) return false; if (memcmp(de_name, nokey_name->bytes, sizeof(nokey_name->bytes))) return false; sha256(&de_name[sizeof(nokey_name->bytes)], de_name_len - sizeof(nokey_name->bytes), digest); return !memcmp(digest, nokey_name->sha256, sizeof(digest)); } EXPORT_SYMBOL_GPL(fscrypt_match_name); /** * fscrypt_fname_siphash() - calculate the SipHash of a filename * @dir: the parent directory * @name: the filename to calculate the SipHash of * * Given a plaintext filename @name and a directory @dir which uses SipHash as * its dirhash method and has had its fscrypt key set up, this function * calculates the SipHash of that name using the directory's secret dirhash key. * * Return: the SipHash of @name using the hash key of @dir */ u64 fscrypt_fname_siphash(const struct inode *dir, const struct qstr *name) { const struct fscrypt_inode_info *ci = fscrypt_get_inode_info_raw(dir); WARN_ON_ONCE(!ci->ci_dirhash_key_initialized); return siphash(name->name, name->len, &ci->ci_dirhash_key); } EXPORT_SYMBOL_GPL(fscrypt_fname_siphash); /* * Validate dentries in encrypted directories to make sure we aren't potentially * caching stale dentries after a key has been added. */ int fscrypt_d_revalidate(struct inode *dir, const struct qstr *name, struct dentry *dentry, unsigned int flags) { int err; /* * Plaintext names are always valid, since fscrypt doesn't support * reverting to no-key names without evicting the directory's inode * -- which implies eviction of the dentries in the directory. */ if (!(dentry->d_flags & DCACHE_NOKEY_NAME)) return 1; /* * No-key name; valid if the directory's key is still unavailable. * * Note in RCU mode we have to bail if we get here - * fscrypt_get_encryption_info() may block. */ if (flags & LOOKUP_RCU) return -ECHILD; /* * Pass allow_unsupported=true, so that files with an unsupported * encryption policy can be deleted. */ err = fscrypt_get_encryption_info(dir, true); if (err < 0) return err; return !fscrypt_has_encryption_key(dir); } EXPORT_SYMBOL_GPL(fscrypt_d_revalidate);
52 10 16 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 #ifndef LLC_PDU_H #define LLC_PDU_H /* * Copyright (c) 1997 by Procom Technology,Inc. * 2001-2003 by Arnaldo Carvalho de Melo <acme@conectiva.com.br> * * This program can be redistributed or modified under the terms of the * GNU General Public License as published by the Free Software Foundation. * This program is distributed without any warranty or implied warranty * of merchantability or fitness for a particular purpose. * * See the GNU General Public License for more details. */ #include <linux/if_ether.h> /* Lengths of frame formats */ #define LLC_PDU_LEN_I 4 /* header and 2 control bytes */ #define LLC_PDU_LEN_S 4 #define LLC_PDU_LEN_U 3 /* header and 1 control byte */ /* header and 1 control byte and XID info */ #define LLC_PDU_LEN_U_XID (LLC_PDU_LEN_U + sizeof(struct llc_xid_info)) /* Known SAP addresses */ #define LLC_GLOBAL_SAP 0xFF #define LLC_NULL_SAP 0x00 /* not network-layer visible */ #define LLC_MGMT_INDIV 0x02 /* station LLC mgmt indiv addr */ #define LLC_MGMT_GRP 0x03 /* station LLC mgmt group addr */ #define LLC_RDE_SAP 0xA6 /* route ... */ /* SAP field bit masks */ #define LLC_ISO_RESERVED_SAP 0x02 #define LLC_SAP_GROUP_DSAP 0x01 #define LLC_SAP_RESP_SSAP 0x01 /* Group/individual DSAP indicator is DSAP field */ #define LLC_PDU_GROUP_DSAP_MASK 0x01 #define LLC_PDU_IS_GROUP_DSAP(pdu) \ ((pdu->dsap & LLC_PDU_GROUP_DSAP_MASK) ? 0 : 1) #define LLC_PDU_IS_INDIV_DSAP(pdu) \ (!(pdu->dsap & LLC_PDU_GROUP_DSAP_MASK) ? 0 : 1) /* Command/response PDU indicator in SSAP field */ #define LLC_PDU_CMD_RSP_MASK 0x01 #define LLC_PDU_CMD 0 #define LLC_PDU_RSP 1 #define LLC_PDU_IS_CMD(pdu) ((pdu->ssap & LLC_PDU_RSP) ? 0 : 1) #define LLC_PDU_IS_RSP(pdu) ((pdu->ssap & LLC_PDU_RSP) ? 1 : 0) /* Get PDU type from 2 lowest-order bits of control field first byte */ #define LLC_PDU_TYPE_I_MASK 0x01 /* 16-bit control field */ #define LLC_PDU_TYPE_S_MASK 0x03 #define LLC_PDU_TYPE_U_MASK 0x03 /* 8-bit control field */ #define LLC_PDU_TYPE_MASK 0x03 #define LLC_PDU_TYPE_I 0 /* first bit */ #define LLC_PDU_TYPE_S 1 /* first two bits */ #define LLC_PDU_TYPE_U 3 /* first two bits */ #define LLC_PDU_TYPE_U_XID 4 /* private type for detecting XID commands */ #define LLC_PDU_TYPE_IS_I(pdu) \ ((!(pdu->ctrl_1 & LLC_PDU_TYPE_I_MASK)) ? 1 : 0) #define LLC_PDU_TYPE_IS_U(pdu) \ (((pdu->ctrl_1 & LLC_PDU_TYPE_U_MASK) == LLC_PDU_TYPE_U) ? 1 : 0) #define LLC_PDU_TYPE_IS_S(pdu) \ (((pdu->ctrl_1 & LLC_PDU_TYPE_S_MASK) == LLC_PDU_TYPE_S) ? 1 : 0) /* U-format PDU control field masks */ #define LLC_U_PF_BIT_MASK 0x10 /* P/F bit mask */ #define LLC_U_PF_IS_1(pdu) ((pdu->ctrl_1 & LLC_U_PF_BIT_MASK) ? 1 : 0) #define LLC_U_PF_IS_0(pdu) ((!(pdu->ctrl_1 & LLC_U_PF_BIT_MASK)) ? 1 : 0) #define LLC_U_PDU_CMD_MASK 0xEC /* cmd/rsp mask */ #define LLC_U_PDU_CMD(pdu) (pdu->ctrl_1 & LLC_U_PDU_CMD_MASK) #define LLC_U_PDU_RSP(pdu) (pdu->ctrl_1 & LLC_U_PDU_CMD_MASK) #define LLC_1_PDU_CMD_UI 0x00 /* Type 1 cmds/rsps */ #define LLC_1_PDU_CMD_XID 0xAC #define LLC_1_PDU_CMD_TEST 0xE0 #define LLC_2_PDU_CMD_SABME 0x6C /* Type 2 cmds/rsps */ #define LLC_2_PDU_CMD_DISC 0x40 #define LLC_2_PDU_RSP_UA 0x60 #define LLC_2_PDU_RSP_DM 0x0C #define LLC_2_PDU_RSP_FRMR 0x84 /* Type 1 operations */ /* XID information field bit masks */ /* LLC format identifier (byte 1) */ #define LLC_XID_FMT_ID 0x81 /* first byte must be this */ /* LLC types/classes identifier (byte 2) */ #define LLC_XID_CLASS_ZEROS_MASK 0xE0 /* these must be zeros */ #define LLC_XID_CLASS_MASK 0x1F /* AND with byte to get below */ #define LLC_XID_NULL_CLASS_1 0x01 /* if NULL LSAP...use these */ #define LLC_XID_NULL_CLASS_2 0x03 #define LLC_XID_NULL_CLASS_3 0x05 #define LLC_XID_NULL_CLASS_4 0x07 #define LLC_XID_NNULL_TYPE_1 0x01 /* if non-NULL LSAP...use these */ #define LLC_XID_NNULL_TYPE_2 0x02 #define LLC_XID_NNULL_TYPE_3 0x04 #define LLC_XID_NNULL_TYPE_1_2 0x03 #define LLC_XID_NNULL_TYPE_1_3 0x05 #define LLC_XID_NNULL_TYPE_2_3 0x06 #define LLC_XID_NNULL_ALL 0x07 /* Sender Receive Window (byte 3) */ #define LLC_XID_RW_MASK 0xFE /* AND with value to get below */ #define LLC_XID_MIN_RW 0x02 /* lowest-order bit always zero */ /* Type 2 operations */ #define LLC_2_SEQ_NBR_MODULO ((u8) 128) /* I-PDU masks ('ctrl' is I-PDU control word) */ #define LLC_I_GET_NS(pdu) (u8)((pdu->ctrl_1 & 0xFE) >> 1) #define LLC_I_GET_NR(pdu) (u8)((pdu->ctrl_2 & 0xFE) >> 1) #define LLC_I_PF_BIT_MASK 0x01 #define LLC_I_PF_IS_0(pdu) ((!(pdu->ctrl_2 & LLC_I_PF_BIT_MASK)) ? 1 : 0) #define LLC_I_PF_IS_1(pdu) ((pdu->ctrl_2 & LLC_I_PF_BIT_MASK) ? 1 : 0) /* S-PDU supervisory commands and responses */ #define LLC_S_PDU_CMD_MASK 0x0C #define LLC_S_PDU_CMD(pdu) (pdu->ctrl_1 & LLC_S_PDU_CMD_MASK) #define LLC_S_PDU_RSP(pdu) (pdu->ctrl_1 & LLC_S_PDU_CMD_MASK) #define LLC_2_PDU_CMD_RR 0x00 /* rx ready cmd */ #define LLC_2_PDU_RSP_RR 0x00 /* rx ready rsp */ #define LLC_2_PDU_CMD_REJ 0x08 /* reject PDU cmd */ #define LLC_2_PDU_RSP_REJ 0x08 /* reject PDU rsp */ #define LLC_2_PDU_CMD_RNR 0x04 /* rx not ready cmd */ #define LLC_2_PDU_RSP_RNR 0x04 /* rx not ready rsp */ #define LLC_S_PF_BIT_MASK 0x01 #define LLC_S_PF_IS_0(pdu) ((!(pdu->ctrl_2 & LLC_S_PF_BIT_MASK)) ? 1 : 0) #define LLC_S_PF_IS_1(pdu) ((pdu->ctrl_2 & LLC_S_PF_BIT_MASK) ? 1 : 0) #define PDU_SUPV_GET_Nr(pdu) ((pdu->ctrl_2 & 0xFE) >> 1) #define PDU_GET_NEXT_Vr(sn) (((sn) + 1) & ~LLC_2_SEQ_NBR_MODULO) /* FRMR information field macros */ #define FRMR_INFO_LENGTH 5 /* 5 bytes of information */ /* * info is pointer to FRMR info field structure; 'rej_ctrl' is byte pointer * (if U-PDU) or word pointer to rejected PDU control field */ #define FRMR_INFO_SET_REJ_CNTRL(info,rej_ctrl) \ info->rej_pdu_ctrl = ((*((u8 *) rej_ctrl) & \ LLC_PDU_TYPE_U) != LLC_PDU_TYPE_U ? \ (u16)*((u16 *) rej_ctrl) : \ (((u16) *((u8 *) rej_ctrl)) & 0x00FF)) /* * Info is pointer to FRMR info field structure; 'vs' is a byte containing * send state variable value in low-order 7 bits (insure the lowest-order * bit remains zero (0)) */ #define FRMR_INFO_SET_Vs(info,vs) (info->curr_ssv = (((u8) vs) << 1)) #define FRMR_INFO_SET_Vr(info,vr) (info->curr_rsv = (((u8) vr) << 1)) /* * Info is pointer to FRMR info field structure; 'cr' is a byte containing * the C/R bit value in the low-order bit */ #define FRMR_INFO_SET_C_R_BIT(info, cr) (info->curr_rsv |= (((u8) cr) & 0x01)) /* * In the remaining five macros, 'info' is pointer to FRMR info field * structure; 'ind' is a byte containing the bit value to set in the * lowest-order bit) */ #define FRMR_INFO_SET_INVALID_PDU_CTRL_IND(info, ind) \ (info->ind_bits = ((info->ind_bits & 0xFE) | (((u8) ind) & 0x01))) #define FRMR_INFO_SET_INVALID_PDU_INFO_IND(info, ind) \ (info->ind_bits = ( (info->ind_bits & 0xFD) | (((u8) ind) & 0x02))) #define FRMR_INFO_SET_PDU_INFO_2LONG_IND(info, ind) \ (info->ind_bits = ( (info->ind_bits & 0xFB) | (((u8) ind) & 0x04))) #define FRMR_INFO_SET_PDU_INVALID_Nr_IND(info, ind) \ (info->ind_bits = ( (info->ind_bits & 0xF7) | (((u8) ind) & 0x08))) #define FRMR_INFO_SET_PDU_INVALID_Ns_IND(info, ind) \ (info->ind_bits = ( (info->ind_bits & 0xEF) | (((u8) ind) & 0x10))) /* Sequence-numbered PDU format (4 bytes in length) */ struct llc_pdu_sn { u8 dsap; u8 ssap; u8 ctrl_1; u8 ctrl_2; } __packed; static inline struct llc_pdu_sn *llc_pdu_sn_hdr(struct sk_buff *skb) { return (struct llc_pdu_sn *)skb_network_header(skb); } /* Un-numbered PDU format (3 bytes in length) */ struct llc_pdu_un { u8 dsap; u8 ssap; u8 ctrl_1; } __packed; static inline struct llc_pdu_un *llc_pdu_un_hdr(struct sk_buff *skb) { return (struct llc_pdu_un *)skb_network_header(skb); } /** * llc_pdu_header_init - initializes pdu header * @skb: input skb that header must be set into it. * @type: type of PDU (U, I or S). * @ssap: source sap. * @dsap: destination sap. * @cr: command/response bit (0 or 1). * * This function sets DSAP, SSAP and command/Response bit in LLC header. */ static inline void llc_pdu_header_init(struct sk_buff *skb, u8 type, u8 ssap, u8 dsap, u8 cr) { int hlen = 4; /* default value for I and S types */ struct llc_pdu_un *pdu; switch (type) { case LLC_PDU_TYPE_U: hlen = 3; break; case LLC_PDU_TYPE_U_XID: hlen = 6; break; } skb_push(skb, hlen); skb_reset_network_header(skb); pdu = llc_pdu_un_hdr(skb); pdu->dsap = dsap; pdu->ssap = ssap; pdu->ssap |= cr; } /** * llc_pdu_decode_sa - extracts, source address (MAC) of input frame * @skb: input skb that source address must be extracted from it. * @sa: pointer to source address (6 byte array). * * This function extracts source address(MAC) of input frame. */ static inline void llc_pdu_decode_sa(struct sk_buff *skb, u8 *sa) { memcpy(sa, eth_hdr(skb)->h_source, ETH_ALEN); } /** * llc_pdu_decode_da - extracts dest address of input frame * @skb: input skb that destination address must be extracted from it * @da: pointer to destination address (6 byte array). * * This function extracts destination address(MAC) of input frame. */ static inline void llc_pdu_decode_da(struct sk_buff *skb, u8 *da) { memcpy(da, eth_hdr(skb)->h_dest, ETH_ALEN); } /** * llc_pdu_decode_ssap - extracts source SAP of input frame * @skb: input skb that source SAP must be extracted from it. * @ssap: source SAP (output argument). * * This function extracts source SAP of input frame. Right bit of SSAP is * command/response bit. */ static inline void llc_pdu_decode_ssap(struct sk_buff *skb, u8 *ssap) { *ssap = llc_pdu_un_hdr(skb)->ssap & 0xFE; } /** * llc_pdu_decode_dsap - extracts dest SAP of input frame * @skb: input skb that destination SAP must be extracted from it. * @dsap: destination SAP (output argument). * * This function extracts destination SAP of input frame. right bit of * DSAP designates individual/group SAP. */ static inline void llc_pdu_decode_dsap(struct sk_buff *skb, u8 *dsap) { *dsap = llc_pdu_un_hdr(skb)->dsap & 0xFE; } /** * llc_pdu_init_as_ui_cmd - sets LLC header as UI PDU * @skb: input skb that header must be set into it. * * This function sets third byte of LLC header as a UI PDU. */ static inline void llc_pdu_init_as_ui_cmd(struct sk_buff *skb) { struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb); pdu->ctrl_1 = LLC_PDU_TYPE_U; pdu->ctrl_1 |= LLC_1_PDU_CMD_UI; } /** * llc_pdu_init_as_test_cmd - sets PDU as TEST * @skb: Address of the skb to build * * Sets a PDU as TEST */ static inline void llc_pdu_init_as_test_cmd(struct sk_buff *skb) { struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb); pdu->ctrl_1 = LLC_PDU_TYPE_U; pdu->ctrl_1 |= LLC_1_PDU_CMD_TEST; pdu->ctrl_1 |= LLC_U_PF_BIT_MASK; } /** * llc_pdu_init_as_test_rsp - build TEST response PDU * @skb: Address of the skb to build * @ev_skb: The received TEST command PDU frame * * Builds a pdu frame as a TEST response. */ static inline void llc_pdu_init_as_test_rsp(struct sk_buff *skb, struct sk_buff *ev_skb) { struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb); pdu->ctrl_1 = LLC_PDU_TYPE_U; pdu->ctrl_1 |= LLC_1_PDU_CMD_TEST; pdu->ctrl_1 |= LLC_U_PF_BIT_MASK; if (ev_skb->protocol == htons(ETH_P_802_2)) { struct llc_pdu_un *ev_pdu = llc_pdu_un_hdr(ev_skb); int dsize; dsize = ntohs(eth_hdr(ev_skb)->h_proto) - 3; memcpy(((u8 *)pdu) + 3, ((u8 *)ev_pdu) + 3, dsize); skb_put(skb, dsize); } } /* LLC Type 1 XID command/response information fields format */ struct llc_xid_info { u8 fmt_id; /* always 0x81 for LLC */ u8 type; /* different if NULL/non-NULL LSAP */ u8 rw; /* sender receive window */ } __packed; /** * llc_pdu_init_as_xid_cmd - sets bytes 3, 4 & 5 of LLC header as XID * @skb: input skb that header must be set into it. * @svcs_supported: The class of the LLC (I or II) * @rx_window: The size of the receive window of the LLC * * This function sets third,fourth,fifth and sixth bytes of LLC header as * a XID PDU. */ static inline void llc_pdu_init_as_xid_cmd(struct sk_buff *skb, u8 svcs_supported, u8 rx_window) { struct llc_xid_info *xid_info; struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb); pdu->ctrl_1 = LLC_PDU_TYPE_U; pdu->ctrl_1 |= LLC_1_PDU_CMD_XID; pdu->ctrl_1 |= LLC_U_PF_BIT_MASK; xid_info = (struct llc_xid_info *)(((u8 *)&pdu->ctrl_1) + 1); xid_info->fmt_id = LLC_XID_FMT_ID; /* 0x81 */ xid_info->type = svcs_supported; xid_info->rw = rx_window << 1; /* size of receive window */ /* no need to push/put since llc_pdu_header_init() has already * pushed 3 + 3 bytes */ } /** * llc_pdu_init_as_xid_rsp - builds XID response PDU * @skb: Address of the skb to build * @svcs_supported: The class of the LLC (I or II) * @rx_window: The size of the receive window of the LLC * * Builds a pdu frame as an XID response. */ static inline void llc_pdu_init_as_xid_rsp(struct sk_buff *skb, u8 svcs_supported, u8 rx_window) { struct llc_xid_info *xid_info; struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb); pdu->ctrl_1 = LLC_PDU_TYPE_U; pdu->ctrl_1 |= LLC_1_PDU_CMD_XID; pdu->ctrl_1 |= LLC_U_PF_BIT_MASK; xid_info = (struct llc_xid_info *)(((u8 *)&pdu->ctrl_1) + 1); xid_info->fmt_id = LLC_XID_FMT_ID; xid_info->type = svcs_supported; xid_info->rw = rx_window << 1; skb_put(skb, sizeof(struct llc_xid_info)); } /* LLC Type 2 FRMR response information field format */ struct llc_frmr_info { u16 rej_pdu_ctrl; /* bits 1-8 if U-PDU */ u8 curr_ssv; /* current send state variable val */ u8 curr_rsv; /* current receive state variable */ u8 ind_bits; /* indicator bits set with macro */ } __packed; void llc_pdu_set_cmd_rsp(struct sk_buff *skb, u8 type); void llc_pdu_set_pf_bit(struct sk_buff *skb, u8 bit_value); void llc_pdu_decode_pf_bit(struct sk_buff *skb, u8 *pf_bit); void llc_pdu_init_as_disc_cmd(struct sk_buff *skb, u8 p_bit); void llc_pdu_init_as_i_cmd(struct sk_buff *skb, u8 p_bit, u8 ns, u8 nr); void llc_pdu_init_as_rej_cmd(struct sk_buff *skb, u8 p_bit, u8 nr); void llc_pdu_init_as_rnr_cmd(struct sk_buff *skb, u8 p_bit, u8 nr); void llc_pdu_init_as_rr_cmd(struct sk_buff *skb, u8 p_bit, u8 nr); void llc_pdu_init_as_sabme_cmd(struct sk_buff *skb, u8 p_bit); void llc_pdu_init_as_dm_rsp(struct sk_buff *skb, u8 f_bit); void llc_pdu_init_as_frmr_rsp(struct sk_buff *skb, struct llc_pdu_sn *prev_pdu, u8 f_bit, u8 vs, u8 vr, u8 vzyxw); void llc_pdu_init_as_rr_rsp(struct sk_buff *skb, u8 f_bit, u8 nr); void llc_pdu_init_as_rej_rsp(struct sk_buff *skb, u8 f_bit, u8 nr); void llc_pdu_init_as_rnr_rsp(struct sk_buff *skb, u8 f_bit, u8 nr); void llc_pdu_init_as_ua_rsp(struct sk_buff *skb, u8 f_bit); #endif /* LLC_PDU_H */
14 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 /* SPDX-License-Identifier: GPL-2.0-only */ /* * linux/fs/pnode.h * * (C) Copyright IBM Corporation 2005. */ #ifndef _LINUX_PNODE_H #define _LINUX_PNODE_H #include <linux/list.h> #include "mount.h" #define IS_MNT_SHARED(m) ((m)->mnt_t_flags & T_SHARED) #define IS_MNT_SLAVE(m) ((m)->mnt_master) #define IS_MNT_NEW(m) (!(m)->mnt_ns) #define CLEAR_MNT_SHARED(m) ((m)->mnt_t_flags &= ~T_SHARED) #define IS_MNT_UNBINDABLE(m) ((m)->mnt_t_flags & T_UNBINDABLE) #define IS_MNT_MARKED(m) ((m)->mnt_t_flags & T_MARKED) #define SET_MNT_MARK(m) ((m)->mnt_t_flags |= T_MARKED) #define CLEAR_MNT_MARK(m) ((m)->mnt_t_flags &= ~T_MARKED) #define IS_MNT_LOCKED(m) ((m)->mnt.mnt_flags & MNT_LOCKED) #define CL_EXPIRE 0x01 #define CL_SLAVE 0x02 #define CL_COPY_UNBINDABLE 0x04 #define CL_MAKE_SHARED 0x08 #define CL_PRIVATE 0x10 #define CL_COPY_MNT_NS_FILE 0x40 /* * EXCL[namespace_sem] */ static inline void set_mnt_shared(struct mount *mnt) { mnt->mnt_t_flags &= ~T_SHARED_MASK; mnt->mnt_t_flags |= T_SHARED; } static inline bool peers(const struct mount *m1, const struct mount *m2) { return m1->mnt_group_id == m2->mnt_group_id && m1->mnt_group_id; } void change_mnt_propagation(struct mount *, int); void bulk_make_private(struct list_head *); int propagate_mnt(struct mount *, struct mountpoint *, struct mount *, struct hlist_head *); void propagate_umount(struct list_head *); int propagate_mount_busy(struct mount *, int); void propagate_mount_unlock(struct mount *); void mnt_release_group_id(struct mount *); int get_dominating_id(struct mount *mnt, const struct path *root); int mnt_get_count(struct mount *mnt); void mnt_set_mountpoint(struct mount *, struct mountpoint *, struct mount *); void mnt_change_mountpoint(struct mount *parent, struct mountpoint *mp, struct mount *mnt); struct mount *copy_tree(struct mount *, struct dentry *, int); bool is_path_reachable(struct mount *, struct dentry *, const struct path *root); int count_mounts(struct mnt_namespace *ns, struct mount *mnt); bool propagation_would_overmount(const struct mount *from, const struct mount *to, const struct mountpoint *mp); #endif /* _LINUX_PNODE_H */
1 1 1 1 617 622 508 1 107 614 1 613 3 1 619 619 621 622 618 614 648 615 34 141 66 72 3 3 756 760 756 13 757 37 400 165 153 761 750 20 20 724 583 141 651 140 159 200 156 156 86 707 706 134 611 821 24 21 785 802 7 789 761 1 2 759 1 756 29 740 43 708 57 57 48 48 11 16 30 57 57 57 22 48 12 48 48 15 48 57 57 57 87 86 87 32 31 57 57 32 57 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 // SPDX-License-Identifier: GPL-2.0-or-later /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * The Internet Protocol (IP) module. * * Authors: Ross Biro * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> * Donald Becker, <becker@super.org> * Alan Cox, <alan@lxorguk.ukuu.org.uk> * Richard Underwood * Stefan Becker, <stefanb@yello.ping.de> * Jorge Cwik, <jorge@laser.satlink.net> * Arnt Gulbrandsen, <agulbra@nvg.unit.no> * * Fixes: * Alan Cox : Commented a couple of minor bits of surplus code * Alan Cox : Undefining IP_FORWARD doesn't include the code * (just stops a compiler warning). * Alan Cox : Frames with >=MAX_ROUTE record routes, strict routes or loose routes * are junked rather than corrupting things. * Alan Cox : Frames to bad broadcast subnets are dumped * We used to process them non broadcast and * boy could that cause havoc. * Alan Cox : ip_forward sets the free flag on the * new frame it queues. Still crap because * it copies the frame but at least it * doesn't eat memory too. * Alan Cox : Generic queue code and memory fixes. * Fred Van Kempen : IP fragment support (borrowed from NET2E) * Gerhard Koerting: Forward fragmented frames correctly. * Gerhard Koerting: Fixes to my fix of the above 8-). * Gerhard Koerting: IP interface addressing fix. * Linus Torvalds : More robustness checks * Alan Cox : Even more checks: Still not as robust as it ought to be * Alan Cox : Save IP header pointer for later * Alan Cox : ip option setting * Alan Cox : Use ip_tos/ip_ttl settings * Alan Cox : Fragmentation bogosity removed * (Thanks to Mark.Bush@prg.ox.ac.uk) * Dmitry Gorodchanin : Send of a raw packet crash fix. * Alan Cox : Silly ip bug when an overlength * fragment turns up. Now frees the * queue. * Linus Torvalds/ : Memory leakage on fragmentation * Alan Cox : handling. * Gerhard Koerting: Forwarding uses IP priority hints * Teemu Rantanen : Fragment problems. * Alan Cox : General cleanup, comments and reformat * Alan Cox : SNMP statistics * Alan Cox : BSD address rule semantics. Also see * UDP as there is a nasty checksum issue * if you do things the wrong way. * Alan Cox : Always defrag, moved IP_FORWARD to the config.in file * Alan Cox : IP options adjust sk->priority. * Pedro Roque : Fix mtu/length error in ip_forward. * Alan Cox : Avoid ip_chk_addr when possible. * Richard Underwood : IP multicasting. * Alan Cox : Cleaned up multicast handlers. * Alan Cox : RAW sockets demultiplex in the BSD style. * Gunther Mayer : Fix the SNMP reporting typo * Alan Cox : Always in group 224.0.0.1 * Pauline Middelink : Fast ip_checksum update when forwarding * Masquerading support. * Alan Cox : Multicast loopback error for 224.0.0.1 * Alan Cox : IP_MULTICAST_LOOP option. * Alan Cox : Use notifiers. * Bjorn Ekwall : Removed ip_csum (from slhc.c too) * Bjorn Ekwall : Moved ip_fast_csum to ip.h (inline!) * Stefan Becker : Send out ICMP HOST REDIRECT * Arnt Gulbrandsen : ip_build_xmit * Alan Cox : Per socket routing cache * Alan Cox : Fixed routing cache, added header cache. * Alan Cox : Loopback didn't work right in original ip_build_xmit - fixed it. * Alan Cox : Only send ICMP_REDIRECT if src/dest are the same net. * Alan Cox : Incoming IP option handling. * Alan Cox : Set saddr on raw output frames as per BSD. * Alan Cox : Stopped broadcast source route explosions. * Alan Cox : Can disable source routing * Takeshi Sone : Masquerading didn't work. * Dave Bonn,Alan Cox : Faster IP forwarding whenever possible. * Alan Cox : Memory leaks, tramples, misc debugging. * Alan Cox : Fixed multicast (by popular demand 8)) * Alan Cox : Fixed forwarding (by even more popular demand 8)) * Alan Cox : Fixed SNMP statistics [I think] * Gerhard Koerting : IP fragmentation forwarding fix * Alan Cox : Device lock against page fault. * Alan Cox : IP_HDRINCL facility. * Werner Almesberger : Zero fragment bug * Alan Cox : RAW IP frame length bug * Alan Cox : Outgoing firewall on build_xmit * A.N.Kuznetsov : IP_OPTIONS support throughout the kernel * Alan Cox : Multicast routing hooks * Jos Vos : Do accounting *before* call_in_firewall * Willy Konynenberg : Transparent proxying support * * To Fix: * IP fragmentation wants rewriting cleanly. The RFC815 algorithm is much more efficient * and could be made very efficient with the addition of some virtual memory hacks to permit * the allocation of a buffer that can then be 'grown' by twiddling page tables. * Output fragmentation wants updating along with the buffer management to use a single * interleaved copy algorithm so that fragmenting has a one copy overhead. Actual packet * output should probably do its own fragmentation at the UDP/RAW layer. TCP shouldn't cause * fragmentation anyway. */ #define pr_fmt(fmt) "IPv4: " fmt #include <linux/module.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/string.h> #include <linux/errno.h> #include <linux/slab.h> #include <linux/net.h> #include <linux/socket.h> #include <linux/sockios.h> #include <linux/in.h> #include <linux/inet.h> #include <linux/inetdevice.h> #include <linux/netdevice.h> #include <linux/etherdevice.h> #include <linux/indirect_call_wrapper.h> #include <net/snmp.h> #include <net/ip.h> #include <net/protocol.h> #include <net/route.h> #include <linux/skbuff.h> #include <net/sock.h> #include <net/arp.h> #include <net/icmp.h> #include <net/raw.h> #include <net/checksum.h> #include <net/inet_ecn.h> #include <linux/netfilter_ipv4.h> #include <net/xfrm.h> #include <linux/mroute.h> #include <linux/netlink.h> #include <net/dst_metadata.h> #include <net/udp.h> #include <net/tcp.h> /* * Process Router Attention IP option (RFC 2113) */ bool ip_call_ra_chain(struct sk_buff *skb) { struct ip_ra_chain *ra; u8 protocol = ip_hdr(skb)->protocol; struct sock *last = NULL; struct net_device *dev = skb->dev; struct net *net = dev_net(dev); for (ra = rcu_dereference(net->ipv4.ra_chain); ra; ra = rcu_dereference(ra->next)) { struct sock *sk = ra->sk; /* If socket is bound to an interface, only report * the packet if it came from that interface. */ if (sk && inet_sk(sk)->inet_num == protocol && (!sk->sk_bound_dev_if || sk->sk_bound_dev_if == dev->ifindex)) { if (ip_is_fragment(ip_hdr(skb))) { if (ip_defrag(net, skb, IP_DEFRAG_CALL_RA_CHAIN)) return true; } if (last) { struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); if (skb2) raw_rcv(last, skb2); } last = sk; } } if (last) { raw_rcv(last, skb); return true; } return false; } INDIRECT_CALLABLE_DECLARE(int udp_rcv(struct sk_buff *)); INDIRECT_CALLABLE_DECLARE(int tcp_v4_rcv(struct sk_buff *)); void ip_protocol_deliver_rcu(struct net *net, struct sk_buff *skb, int protocol) { const struct net_protocol *ipprot; int raw, ret; resubmit: raw = raw_local_deliver(skb, protocol); ipprot = rcu_dereference(inet_protos[protocol]); if (ipprot) { if (!ipprot->no_policy) { if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { kfree_skb_reason(skb, SKB_DROP_REASON_XFRM_POLICY); return; } nf_reset_ct(skb); } ret = INDIRECT_CALL_2(ipprot->handler, tcp_v4_rcv, udp_rcv, skb); if (ret < 0) { protocol = -ret; goto resubmit; } __IP_INC_STATS(net, IPSTATS_MIB_INDELIVERS); } else { if (!raw) { if (xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { __IP_INC_STATS(net, IPSTATS_MIB_INUNKNOWNPROTOS); icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PROT_UNREACH, 0); } kfree_skb_reason(skb, SKB_DROP_REASON_IP_NOPROTO); } else { __IP_INC_STATS(net, IPSTATS_MIB_INDELIVERS); consume_skb(skb); } } } static int ip_local_deliver_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC))) { __IP_INC_STATS(net, IPSTATS_MIB_INDISCARDS); kfree_skb_reason(skb, SKB_DROP_REASON_NOMEM); return 0; } skb_clear_delivery_time(skb); __skb_pull(skb, skb_network_header_len(skb)); rcu_read_lock(); ip_protocol_deliver_rcu(net, skb, ip_hdr(skb)->protocol); rcu_read_unlock(); return 0; } /* * Deliver IP Packets to the higher protocol layers. */ int ip_local_deliver(struct sk_buff *skb) { /* * Reassemble IP fragments. */ struct net *net = dev_net(skb->dev); if (ip_is_fragment(ip_hdr(skb))) { if (ip_defrag(net, skb, IP_DEFRAG_LOCAL_DELIVER)) return 0; } return NF_HOOK(NFPROTO_IPV4, NF_INET_LOCAL_IN, net, NULL, skb, skb->dev, NULL, ip_local_deliver_finish); } EXPORT_SYMBOL(ip_local_deliver); static inline enum skb_drop_reason ip_rcv_options(struct sk_buff *skb, struct net_device *dev) { const struct iphdr *iph; struct ip_options *opt; /* It looks as overkill, because not all IP options require packet mangling. But it is the easiest for now, especially taking into account that combination of IP options and running sniffer is extremely rare condition. --ANK (980813) */ if (skb_cow(skb, skb_headroom(skb))) { __IP_INC_STATS(dev_net(dev), IPSTATS_MIB_INDISCARDS); return SKB_DROP_REASON_NOMEM; } iph = ip_hdr(skb); opt = &(IPCB(skb)->opt); opt->optlen = iph->ihl*4 - sizeof(struct iphdr); if (ip_options_compile(dev_net(dev), opt, skb)) { __IP_INC_STATS(dev_net(dev), IPSTATS_MIB_INHDRERRORS); return SKB_DROP_REASON_IP_INHDR; } if (unlikely(opt->srr)) { struct in_device *in_dev = __in_dev_get_rcu(dev); if (in_dev) { if (!IN_DEV_SOURCE_ROUTE(in_dev)) { if (IN_DEV_LOG_MARTIANS(in_dev)) net_info_ratelimited("source route option %pI4 -> %pI4\n", &iph->saddr, &iph->daddr); return SKB_DROP_REASON_NOT_SPECIFIED; } } if (ip_options_rcv_srr(skb, dev)) return SKB_DROP_REASON_NOT_SPECIFIED; } return SKB_NOT_DROPPED_YET; } static bool ip_can_use_hint(const struct sk_buff *skb, const struct iphdr *iph, const struct sk_buff *hint) { return hint && !skb_dst(skb) && ip_hdr(hint)->daddr == iph->daddr && ip_hdr(hint)->tos == iph->tos; } static int ip_rcv_finish_core(struct net *net, struct sk_buff *skb, struct net_device *dev, const struct sk_buff *hint) { const struct iphdr *iph = ip_hdr(skb); struct rtable *rt; int drop_reason; if (ip_can_use_hint(skb, iph, hint)) { drop_reason = ip_route_use_hint(skb, iph->daddr, iph->saddr, ip4h_dscp(iph), dev, hint); if (unlikely(drop_reason)) goto drop_error; } if (READ_ONCE(net->ipv4.sysctl_ip_early_demux) && !skb_dst(skb) && !skb->sk && !ip_is_fragment(iph)) { switch (iph->protocol) { case IPPROTO_TCP: if (READ_ONCE(net->ipv4.sysctl_tcp_early_demux)) { tcp_v4_early_demux(skb); /* must reload iph, skb->head might have changed */ iph = ip_hdr(skb); } break; case IPPROTO_UDP: if (READ_ONCE(net->ipv4.sysctl_udp_early_demux)) { drop_reason = udp_v4_early_demux(skb); if (unlikely(drop_reason)) goto drop_error; /* must reload iph, skb->head might have changed */ iph = ip_hdr(skb); } break; } } /* * Initialise the virtual path cache for the packet. It describes * how the packet travels inside Linux networking. */ if (!skb_valid_dst(skb)) { drop_reason = ip_route_input_noref(skb, iph->daddr, iph->saddr, ip4h_dscp(iph), dev); if (unlikely(drop_reason)) goto drop_error; } else { struct in_device *in_dev = __in_dev_get_rcu(dev); if (in_dev && IN_DEV_ORCONF(in_dev, NOPOLICY)) IPCB(skb)->flags |= IPSKB_NOPOLICY; } #ifdef CONFIG_IP_ROUTE_CLASSID if (unlikely(skb_dst(skb)->tclassid)) { struct ip_rt_acct *st = this_cpu_ptr(ip_rt_acct); u32 idx = skb_dst(skb)->tclassid; st[idx&0xFF].o_packets++; st[idx&0xFF].o_bytes += skb->len; st[(idx>>16)&0xFF].i_packets++; st[(idx>>16)&0xFF].i_bytes += skb->len; } #endif if (iph->ihl > 5) { drop_reason = ip_rcv_options(skb, dev); if (drop_reason) goto drop; } rt = skb_rtable(skb); if (rt->rt_type == RTN_MULTICAST) { __IP_UPD_PO_STATS(net, IPSTATS_MIB_INMCAST, skb->len); } else if (rt->rt_type == RTN_BROADCAST) { __IP_UPD_PO_STATS(net, IPSTATS_MIB_INBCAST, skb->len); } else if (skb->pkt_type == PACKET_BROADCAST || skb->pkt_type == PACKET_MULTICAST) { struct in_device *in_dev = __in_dev_get_rcu(dev); /* RFC 1122 3.3.6: * * When a host sends a datagram to a link-layer broadcast * address, the IP destination address MUST be a legal IP * broadcast or IP multicast address. * * A host SHOULD silently discard a datagram that is received * via a link-layer broadcast (see Section 2.4) but does not * specify an IP multicast or broadcast destination address. * * This doesn't explicitly say L2 *broadcast*, but broadcast is * in a way a form of multicast and the most common use case for * this is 802.11 protecting against cross-station spoofing (the * so-called "hole-196" attack) so do it for both. */ if (in_dev && IN_DEV_ORCONF(in_dev, DROP_UNICAST_IN_L2_MULTICAST)) { drop_reason = SKB_DROP_REASON_UNICAST_IN_L2_MULTICAST; goto drop; } } return NET_RX_SUCCESS; drop: kfree_skb_reason(skb, drop_reason); return NET_RX_DROP; drop_error: if (drop_reason == SKB_DROP_REASON_IP_RPFILTER) __NET_INC_STATS(net, LINUX_MIB_IPRPFILTER); goto drop; } static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { struct net_device *dev = skb->dev; int ret; /* if ingress device is enslaved to an L3 master device pass the * skb to its handler for processing */ skb = l3mdev_ip_rcv(skb); if (!skb) return NET_RX_SUCCESS; ret = ip_rcv_finish_core(net, skb, dev, NULL); if (ret != NET_RX_DROP) ret = dst_input(skb); return ret; } /* * Main IP Receive routine. */ static struct sk_buff *ip_rcv_core(struct sk_buff *skb, struct net *net) { const struct iphdr *iph; int drop_reason; u32 len; /* When the interface is in promisc. mode, drop all the crap * that it receives, do not try to analyse it. */ if (skb->pkt_type == PACKET_OTHERHOST) { dev_core_stats_rx_otherhost_dropped_inc(skb->dev); drop_reason = SKB_DROP_REASON_OTHERHOST; goto drop; } __IP_UPD_PO_STATS(net, IPSTATS_MIB_IN, skb->len); skb = skb_share_check(skb, GFP_ATOMIC); if (!skb) { __IP_INC_STATS(net, IPSTATS_MIB_INDISCARDS); goto out; } drop_reason = SKB_DROP_REASON_NOT_SPECIFIED; if (!pskb_may_pull(skb, sizeof(struct iphdr))) goto inhdr_error; iph = ip_hdr(skb); /* * RFC1122: 3.2.1.2 MUST silently discard any IP frame that fails the checksum. * * Is the datagram acceptable? * * 1. Length at least the size of an ip header * 2. Version of 4 * 3. Checksums correctly. [Speed optimisation for later, skip loopback checksums] * 4. Doesn't have a bogus length */ if (iph->ihl < 5 || iph->version != 4) goto inhdr_error; BUILD_BUG_ON(IPSTATS_MIB_ECT1PKTS != IPSTATS_MIB_NOECTPKTS + INET_ECN_ECT_1); BUILD_BUG_ON(IPSTATS_MIB_ECT0PKTS != IPSTATS_MIB_NOECTPKTS + INET_ECN_ECT_0); BUILD_BUG_ON(IPSTATS_MIB_CEPKTS != IPSTATS_MIB_NOECTPKTS + INET_ECN_CE); __IP_ADD_STATS(net, IPSTATS_MIB_NOECTPKTS + (iph->tos & INET_ECN_MASK), max_t(unsigned short, 1, skb_shinfo(skb)->gso_segs)); if (!pskb_may_pull(skb, iph->ihl*4)) goto inhdr_error; iph = ip_hdr(skb); if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl))) goto csum_error; len = iph_totlen(skb, iph); if (skb->len < len) { drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL; __IP_INC_STATS(net, IPSTATS_MIB_INTRUNCATEDPKTS); goto drop; } else if (len < (iph->ihl*4)) goto inhdr_error; /* Our transport medium may have padded the buffer out. Now we know it * is IP we can trim to the true length of the frame. * Note this now means skb->len holds ntohs(iph->tot_len). */ if (pskb_trim_rcsum(skb, len)) { __IP_INC_STATS(net, IPSTATS_MIB_INDISCARDS); goto drop; } iph = ip_hdr(skb); skb->transport_header = skb->network_header + iph->ihl*4; /* Remove any debris in the socket control block */ memset(IPCB(skb), 0, sizeof(struct inet_skb_parm)); IPCB(skb)->iif = skb->skb_iif; /* Must drop socket now because of tproxy. */ if (!skb_sk_is_prefetched(skb)) skb_orphan(skb); return skb; csum_error: drop_reason = SKB_DROP_REASON_IP_CSUM; __IP_INC_STATS(net, IPSTATS_MIB_CSUMERRORS); inhdr_error: if (drop_reason == SKB_DROP_REASON_NOT_SPECIFIED) drop_reason = SKB_DROP_REASON_IP_INHDR; __IP_INC_STATS(net, IPSTATS_MIB_INHDRERRORS); drop: kfree_skb_reason(skb, drop_reason); out: return NULL; } /* * IP receive entry point */ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { struct net *net = dev_net(dev); skb = ip_rcv_core(skb, net); if (skb == NULL) return NET_RX_DROP; return NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING, net, NULL, skb, dev, NULL, ip_rcv_finish); } static void ip_sublist_rcv_finish(struct list_head *head) { struct sk_buff *skb, *next; list_for_each_entry_safe(skb, next, head, list) { skb_list_del_init(skb); dst_input(skb); } } static struct sk_buff *ip_extract_route_hint(const struct net *net, struct sk_buff *skb) { const struct iphdr *iph = ip_hdr(skb); if (fib4_has_custom_rules(net) || ipv4_is_lbcast(iph->daddr) || ipv4_is_zeronet(iph->daddr) || IPCB(skb)->flags & IPSKB_MULTIPATH) return NULL; return skb; } static void ip_list_rcv_finish(struct net *net, struct list_head *head) { struct sk_buff *skb, *next, *hint = NULL; struct dst_entry *curr_dst = NULL; LIST_HEAD(sublist); list_for_each_entry_safe(skb, next, head, list) { struct net_device *dev = skb->dev; struct dst_entry *dst; skb_list_del_init(skb); /* if ingress device is enslaved to an L3 master device pass the * skb to its handler for processing */ skb = l3mdev_ip_rcv(skb); if (!skb) continue; if (ip_rcv_finish_core(net, skb, dev, hint) == NET_RX_DROP) continue; dst = skb_dst(skb); if (curr_dst != dst) { hint = ip_extract_route_hint(net, skb); /* dispatch old sublist */ if (!list_empty(&sublist)) ip_sublist_rcv_finish(&sublist); /* start new sublist */ INIT_LIST_HEAD(&sublist); curr_dst = dst; } list_add_tail(&skb->list, &sublist); } /* dispatch final sublist */ ip_sublist_rcv_finish(&sublist); } static void ip_sublist_rcv(struct list_head *head, struct net_device *dev, struct net *net) { NF_HOOK_LIST(NFPROTO_IPV4, NF_INET_PRE_ROUTING, net, NULL, head, dev, NULL, ip_rcv_finish); ip_list_rcv_finish(net, head); } /* Receive a list of IP packets */ void ip_list_rcv(struct list_head *head, struct packet_type *pt, struct net_device *orig_dev) { struct net_device *curr_dev = NULL; struct net *curr_net = NULL; struct sk_buff *skb, *next; LIST_HEAD(sublist); list_for_each_entry_safe(skb, next, head, list) { struct net_device *dev = skb->dev; struct net *net = dev_net(dev); skb_list_del_init(skb); skb = ip_rcv_core(skb, net); if (skb == NULL) continue; if (curr_dev != dev || curr_net != net) { /* dispatch old sublist */ if (!list_empty(&sublist)) ip_sublist_rcv(&sublist, curr_dev, curr_net); /* start new sublist */ INIT_LIST_HEAD(&sublist); curr_dev = dev; curr_net = net; } list_add_tail(&skb->list, &sublist); } /* dispatch final sublist */ if (!list_empty(&sublist)) ip_sublist_rcv(&sublist, curr_dev, curr_net); }
6 1 2 3 6 4 1 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 // SPDX-License-Identifier: GPL-2.0-only /* Masquerade. Simple mapping which alters range to a local IP address (depending on route). */ /* (C) 1999-2001 Paul `Rusty' Russell * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/netfilter/x_tables.h> #include <net/netfilter/nf_nat.h> #include <net/netfilter/nf_nat_masquerade.h> MODULE_LICENSE("GPL"); MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>"); MODULE_DESCRIPTION("Xtables: automatic-address SNAT"); /* FIXME: Multiple targets. --RR */ static int masquerade_tg_check(const struct xt_tgchk_param *par) { const struct nf_nat_ipv4_multi_range_compat *mr = par->targinfo; if (mr->range[0].flags & NF_NAT_RANGE_MAP_IPS) { pr_debug("bad MAP_IPS.\n"); return -EINVAL; } if (mr->rangesize != 1) { pr_debug("bad rangesize %u\n", mr->rangesize); return -EINVAL; } return nf_ct_netns_get(par->net, par->family); } static unsigned int masquerade_tg(struct sk_buff *skb, const struct xt_action_param *par) { struct nf_nat_range2 range; const struct nf_nat_ipv4_multi_range_compat *mr; mr = par->targinfo; range.flags = mr->range[0].flags; range.min_proto = mr->range[0].min; range.max_proto = mr->range[0].max; return nf_nat_masquerade_ipv4(skb, xt_hooknum(par), &range, xt_out(par)); } static void masquerade_tg_destroy(const struct xt_tgdtor_param *par) { nf_ct_netns_put(par->net, par->family); } #if IS_ENABLED(CONFIG_IPV6) static unsigned int masquerade_tg6(struct sk_buff *skb, const struct xt_action_param *par) { return nf_nat_masquerade_ipv6(skb, par->targinfo, xt_out(par)); } static int masquerade_tg6_checkentry(const struct xt_tgchk_param *par) { const struct nf_nat_range2 *range = par->targinfo; if (range->flags & NF_NAT_RANGE_MAP_IPS) return -EINVAL; return nf_ct_netns_get(par->net, par->family); } #endif static struct xt_target masquerade_tg_reg[] __read_mostly = { { #if IS_ENABLED(CONFIG_IPV6) .name = "MASQUERADE", .family = NFPROTO_IPV6, .target = masquerade_tg6, .targetsize = sizeof(struct nf_nat_range), .table = "nat", .hooks = 1 << NF_INET_POST_ROUTING, .checkentry = masquerade_tg6_checkentry, .destroy = masquerade_tg_destroy, .me = THIS_MODULE, }, { #endif .name = "MASQUERADE", .family = NFPROTO_IPV4, .target = masquerade_tg, .targetsize = sizeof(struct nf_nat_ipv4_multi_range_compat), .table = "nat", .hooks = 1 << NF_INET_POST_ROUTING, .checkentry = masquerade_tg_check, .destroy = masquerade_tg_destroy, .me = THIS_MODULE, } }; static int __init masquerade_tg_init(void) { int ret; ret = xt_register_targets(masquerade_tg_reg, ARRAY_SIZE(masquerade_tg_reg)); if (ret) return ret; ret = nf_nat_masquerade_inet_register_notifiers(); if (ret) { xt_unregister_targets(masquerade_tg_reg, ARRAY_SIZE(masquerade_tg_reg)); return ret; } return ret; } static void __exit masquerade_tg_exit(void) { xt_unregister_targets(masquerade_tg_reg, ARRAY_SIZE(masquerade_tg_reg)); nf_nat_masquerade_inet_unregister_notifiers(); } module_init(masquerade_tg_init); module_exit(masquerade_tg_exit); #if IS_ENABLED(CONFIG_IPV6) MODULE_ALIAS("ip6t_MASQUERADE"); #endif MODULE_ALIAS("ipt_MASQUERADE");
1 2 1 3 3 38 9 57 53 6 57 57 1701 1702 85 85 85 29 68 1753 88 1704 69 8 8 8 8 51 52 9 9 3 8 8 8 9 54 54 54 36 2 54 54 53 4 52 3 3 8 54 52 3 54 51 51 51 35 5 5 51 3 49 51 51 51 3 3 51 51 51 57 61 61 55 61 57 1 57 57 57 30 57 3 57 54 7 7 9 2 5 7 9 61 61 61 60 61 56 56 5 56 52 9 9 54 54 2 54 54 54 54 9 9 51 51 51 54 51 14 56 57 57 56 57 1 37 9 1 54 54 53 4 54 7 1 2 53 56 57 10 56 57 10 10 2 8 7 2 2 1 1 1 1 7 10 10 10 10 11 11 11 1 1 4 4 4 4 2 4 4 4 11 2 10 10 11 4 7 44 69 8 58 63 51 3 50 63 11 11 11 37 61 63 32 37 64 64 21 47 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 // SPDX-License-Identifier: GPL-2.0 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/mm.h> #include <linux/sched.h> #include <linux/sched/mm.h> #include <linux/mmu_notifier.h> #include <linux/rmap.h> #include <linux/swap.h> #include <linux/mm_inline.h> #include <linux/kthread.h> #include <linux/khugepaged.h> #include <linux/freezer.h> #include <linux/mman.h> #include <linux/hashtable.h> #include <linux/userfaultfd_k.h> #include <linux/page_idle.h> #include <linux/page_table_check.h> #include <linux/rcupdate_wait.h> #include <linux/leafops.h> #include <linux/shmem_fs.h> #include <linux/dax.h> #include <linux/ksm.h> #include <linux/pgalloc.h> #include <linux/backing-dev.h> #include <asm/tlb.h> #include "internal.h" #include "mm_slot.h" enum scan_result { SCAN_FAIL, SCAN_SUCCEED, SCAN_NO_PTE_TABLE, SCAN_PMD_MAPPED, SCAN_EXCEED_NONE_PTE, SCAN_EXCEED_SWAP_PTE, SCAN_EXCEED_SHARED_PTE, SCAN_PTE_NON_PRESENT, SCAN_PTE_UFFD_WP, SCAN_PTE_MAPPED_HUGEPAGE, SCAN_LACK_REFERENCED_PAGE, SCAN_PAGE_NULL, SCAN_SCAN_ABORT, SCAN_PAGE_COUNT, SCAN_PAGE_LRU, SCAN_PAGE_LOCK, SCAN_PAGE_ANON, SCAN_PAGE_LAZYFREE, SCAN_PAGE_COMPOUND, SCAN_ANY_PROCESS, SCAN_VMA_NULL, SCAN_VMA_CHECK, SCAN_ADDRESS_RANGE, SCAN_DEL_PAGE_LRU, SCAN_ALLOC_HUGE_PAGE_FAIL, SCAN_CGROUP_CHARGE_FAIL, SCAN_TRUNCATED, SCAN_PAGE_HAS_PRIVATE, SCAN_STORE_FAILED, SCAN_COPY_MC, SCAN_PAGE_FILLED, SCAN_PAGE_DIRTY_OR_WRITEBACK, }; #define CREATE_TRACE_POINTS #include <trace/events/huge_memory.h> static struct task_struct *khugepaged_thread __read_mostly; static DEFINE_MUTEX(khugepaged_mutex); /* * default scan 8*HPAGE_PMD_NR ptes, pte_mapped_hugepage, pmd_mapped, * no_pte_table or vmas every 10 second. */ static unsigned int khugepaged_pages_to_scan __read_mostly; static unsigned int khugepaged_pages_collapsed; static unsigned int khugepaged_full_scans; static unsigned int khugepaged_scan_sleep_millisecs __read_mostly = 10000; /* during fragmentation poll the hugepage allocator once every minute */ static unsigned int khugepaged_alloc_sleep_millisecs __read_mostly = 60000; static unsigned long khugepaged_sleep_expire; static DEFINE_SPINLOCK(khugepaged_mm_lock); static DECLARE_WAIT_QUEUE_HEAD(khugepaged_wait); /* * default collapse hugepages if there is at least one pte mapped like * it would have happened if the vma was large enough during page * fault. * * Note that these are only respected if collapse was initiated by khugepaged. */ unsigned int khugepaged_max_ptes_none __read_mostly; static unsigned int khugepaged_max_ptes_swap __read_mostly; static unsigned int khugepaged_max_ptes_shared __read_mostly; #define MM_SLOTS_HASH_BITS 10 static DEFINE_READ_MOSTLY_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS); static struct kmem_cache *mm_slot_cache __ro_after_init; struct collapse_control { bool is_khugepaged; /* Num pages scanned per node */ u32 node_load[MAX_NUMNODES]; /* Num pages scanned (see khugepaged_pages_to_scan) */ unsigned int progress; /* nodemask for allocation fallback */ nodemask_t alloc_nmask; }; /** * struct khugepaged_scan - cursor for scanning * @mm_head: the head of the mm list to scan * @mm_slot: the current mm_slot we are scanning * @address: the next address inside that to be scanned * * There is only the one khugepaged_scan instance of this cursor structure. */ struct khugepaged_scan { struct list_head mm_head; struct mm_slot *mm_slot; unsigned long address; }; static struct khugepaged_scan khugepaged_scan = { .mm_head = LIST_HEAD_INIT(khugepaged_scan.mm_head), }; #ifdef CONFIG_SYSFS static ssize_t scan_sleep_millisecs_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%u\n", khugepaged_scan_sleep_millisecs); } static ssize_t __sleep_millisecs_store(const char *buf, size_t count, unsigned int *millisecs) { unsigned int msecs; int err; err = kstrtouint(buf, 10, &msecs); if (err) return -EINVAL; *millisecs = msecs; khugepaged_sleep_expire = 0; wake_up_interruptible(&khugepaged_wait); return count; } static ssize_t scan_sleep_millisecs_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { return __sleep_millisecs_store(buf, count, &khugepaged_scan_sleep_millisecs); } static struct kobj_attribute scan_sleep_millisecs_attr = __ATTR_RW(scan_sleep_millisecs); static ssize_t alloc_sleep_millisecs_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%u\n", khugepaged_alloc_sleep_millisecs); } static ssize_t alloc_sleep_millisecs_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { return __sleep_millisecs_store(buf, count, &khugepaged_alloc_sleep_millisecs); } static struct kobj_attribute alloc_sleep_millisecs_attr = __ATTR_RW(alloc_sleep_millisecs); static ssize_t pages_to_scan_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%u\n", khugepaged_pages_to_scan); } static ssize_t pages_to_scan_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { unsigned int pages; int err; err = kstrtouint(buf, 10, &pages); if (err || !pages) return -EINVAL; khugepaged_pages_to_scan = pages; return count; } static struct kobj_attribute pages_to_scan_attr = __ATTR_RW(pages_to_scan); static ssize_t pages_collapsed_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%u\n", khugepaged_pages_collapsed); } static struct kobj_attribute pages_collapsed_attr = __ATTR_RO(pages_collapsed); static ssize_t full_scans_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%u\n", khugepaged_full_scans); } static struct kobj_attribute full_scans_attr = __ATTR_RO(full_scans); static ssize_t defrag_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return single_hugepage_flag_show(kobj, attr, buf, TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG); } static ssize_t defrag_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { return single_hugepage_flag_store(kobj, attr, buf, count, TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG); } static struct kobj_attribute khugepaged_defrag_attr = __ATTR_RW(defrag); /* * max_ptes_none controls if khugepaged should collapse hugepages over * any unmapped ptes in turn potentially increasing the memory * footprint of the vmas. When max_ptes_none is 0 khugepaged will not * reduce the available free memory in the system as it * runs. Increasing max_ptes_none will instead potentially reduce the * free memory in the system during the khugepaged scan. */ static ssize_t max_ptes_none_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%u\n", khugepaged_max_ptes_none); } static ssize_t max_ptes_none_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { int err; unsigned long max_ptes_none; err = kstrtoul(buf, 10, &max_ptes_none); if (err || max_ptes_none > HPAGE_PMD_NR - 1) return -EINVAL; khugepaged_max_ptes_none = max_ptes_none; return count; } static struct kobj_attribute khugepaged_max_ptes_none_attr = __ATTR_RW(max_ptes_none); static ssize_t max_ptes_swap_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%u\n", khugepaged_max_ptes_swap); } static ssize_t max_ptes_swap_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { int err; unsigned long max_ptes_swap; err = kstrtoul(buf, 10, &max_ptes_swap); if (err || max_ptes_swap > HPAGE_PMD_NR - 1) return -EINVAL; khugepaged_max_ptes_swap = max_ptes_swap; return count; } static struct kobj_attribute khugepaged_max_ptes_swap_attr = __ATTR_RW(max_ptes_swap); static ssize_t max_ptes_shared_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%u\n", khugepaged_max_ptes_shared); } static ssize_t max_ptes_shared_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { int err; unsigned long max_ptes_shared; err = kstrtoul(buf, 10, &max_ptes_shared); if (err || max_ptes_shared > HPAGE_PMD_NR - 1) return -EINVAL; khugepaged_max_ptes_shared = max_ptes_shared; return count; } static struct kobj_attribute khugepaged_max_ptes_shared_attr = __ATTR_RW(max_ptes_shared); static struct attribute *khugepaged_attr[] = { &khugepaged_defrag_attr.attr, &khugepaged_max_ptes_none_attr.attr, &khugepaged_max_ptes_swap_attr.attr, &khugepaged_max_ptes_shared_attr.attr, &pages_to_scan_attr.attr, &pages_collapsed_attr.attr, &full_scans_attr.attr, &scan_sleep_millisecs_attr.attr, &alloc_sleep_millisecs_attr.attr, NULL, }; struct attribute_group khugepaged_attr_group = { .attrs = khugepaged_attr, .name = "khugepaged", }; #endif /* CONFIG_SYSFS */ static bool pte_none_or_zero(pte_t pte) { if (pte_none(pte)) return true; return pte_present(pte) && is_zero_pfn(pte_pfn(pte)); } int hugepage_madvise(struct vm_area_struct *vma, vm_flags_t *vm_flags, int advice) { switch (advice) { case MADV_HUGEPAGE: *vm_flags &= ~VM_NOHUGEPAGE; *vm_flags |= VM_HUGEPAGE; /* * If the vma become good for khugepaged to scan, * register it here without waiting a page fault that * may not happen any time soon. */ khugepaged_enter_vma(vma, *vm_flags); break; case MADV_NOHUGEPAGE: *vm_flags &= ~VM_HUGEPAGE; *vm_flags |= VM_NOHUGEPAGE; /* * Setting VM_NOHUGEPAGE will prevent khugepaged from scanning * this vma even if we leave the mm registered in khugepaged if * it got registered before VM_NOHUGEPAGE was set. */ break; } return 0; } int __init khugepaged_init(void) { mm_slot_cache = KMEM_CACHE(mm_slot, 0); if (!mm_slot_cache) return -ENOMEM; khugepaged_pages_to_scan = HPAGE_PMD_NR * 8; khugepaged_max_ptes_none = HPAGE_PMD_NR - 1; khugepaged_max_ptes_swap = HPAGE_PMD_NR / 8; khugepaged_max_ptes_shared = HPAGE_PMD_NR / 2; return 0; } void __init khugepaged_destroy(void) { kmem_cache_destroy(mm_slot_cache); } static inline int hpage_collapse_test_exit(struct mm_struct *mm) { return atomic_read(&mm->mm_users) == 0; } static inline int hpage_collapse_test_exit_or_disable(struct mm_struct *mm) { return hpage_collapse_test_exit(mm) || mm_flags_test(MMF_DISABLE_THP_COMPLETELY, mm); } static bool hugepage_pmd_enabled(void) { /* * We cover the anon, shmem and the file-backed case here; file-backed * hugepages, when configured in, are determined by the global control. * Anon pmd-sized hugepages are determined by the pmd-size control. * Shmem pmd-sized hugepages are also determined by its pmd-size control, * except when the global shmem_huge is set to SHMEM_HUGE_DENY. */ if (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && hugepage_global_enabled()) return true; if (test_bit(PMD_ORDER, &huge_anon_orders_always)) return true; if (test_bit(PMD_ORDER, &huge_anon_orders_madvise)) return true; if (test_bit(PMD_ORDER, &huge_anon_orders_inherit) && hugepage_global_enabled()) return true; if (IS_ENABLED(CONFIG_SHMEM) && shmem_hpage_pmd_enabled()) return true; return false; } void __khugepaged_enter(struct mm_struct *mm) { struct mm_slot *slot; int wakeup; /* __khugepaged_exit() must not run from under us */ VM_BUG_ON_MM(hpage_collapse_test_exit(mm), mm); if (unlikely(mm_flags_test_and_set(MMF_VM_HUGEPAGE, mm))) return; slot = mm_slot_alloc(mm_slot_cache); if (!slot) return; spin_lock(&khugepaged_mm_lock); mm_slot_insert(mm_slots_hash, mm, slot); /* * Insert just behind the scanning cursor, to let the area settle * down a little. */ wakeup = list_empty(&khugepaged_scan.mm_head); list_add_tail(&slot->mm_node, &khugepaged_scan.mm_head); spin_unlock(&khugepaged_mm_lock); mmgrab(mm); if (wakeup) wake_up_interruptible(&khugepaged_wait); } void khugepaged_enter_vma(struct vm_area_struct *vma, vm_flags_t vm_flags) { if (!mm_flags_test(MMF_VM_HUGEPAGE, vma->vm_mm) && hugepage_pmd_enabled()) { if (thp_vma_allowable_order(vma, vm_flags, TVA_KHUGEPAGED, PMD_ORDER)) __khugepaged_enter(vma->vm_mm); } } void __khugepaged_exit(struct mm_struct *mm) { struct mm_slot *slot; int free = 0; spin_lock(&khugepaged_mm_lock); slot = mm_slot_lookup(mm_slots_hash, mm); if (slot && khugepaged_scan.mm_slot != slot) { hash_del(&slot->hash); list_del(&slot->mm_node); free = 1; } spin_unlock(&khugepaged_mm_lock); if (free) { mm_flags_clear(MMF_VM_HUGEPAGE, mm); mm_slot_free(mm_slot_cache, slot); mmdrop(mm); } else if (slot) { /* * This is required to serialize against * hpage_collapse_test_exit() (which is guaranteed to run * under mmap sem read mode). Stop here (after we return all * pagetables will be destroyed) until khugepaged has finished * working on the pagetables under the mmap_lock. */ mmap_write_lock(mm); mmap_write_unlock(mm); } } static void release_pte_folio(struct folio *folio) { node_stat_mod_folio(folio, NR_ISOLATED_ANON + folio_is_file_lru(folio), -folio_nr_pages(folio)); folio_unlock(folio); folio_putback_lru(folio); } static void release_pte_pages(pte_t *pte, pte_t *_pte, struct list_head *compound_pagelist) { struct folio *folio, *tmp; while (--_pte >= pte) { pte_t pteval = ptep_get(_pte); unsigned long pfn; if (pte_none(pteval)) continue; VM_WARN_ON_ONCE(!pte_present(pteval)); pfn = pte_pfn(pteval); if (is_zero_pfn(pfn)) continue; folio = pfn_folio(pfn); if (folio_test_large(folio)) continue; release_pte_folio(folio); } list_for_each_entry_safe(folio, tmp, compound_pagelist, lru) { list_del(&folio->lru); release_pte_folio(folio); } } static enum scan_result __collapse_huge_page_isolate(struct vm_area_struct *vma, unsigned long start_addr, pte_t *pte, struct collapse_control *cc, struct list_head *compound_pagelist) { struct page *page = NULL; struct folio *folio = NULL; unsigned long addr = start_addr; pte_t *_pte; int none_or_zero = 0, shared = 0, referenced = 0; enum scan_result result = SCAN_FAIL; for (_pte = pte; _pte < pte + HPAGE_PMD_NR; _pte++, addr += PAGE_SIZE) { pte_t pteval = ptep_get(_pte); if (pte_none_or_zero(pteval)) { ++none_or_zero; if (!userfaultfd_armed(vma) && (!cc->is_khugepaged || none_or_zero <= khugepaged_max_ptes_none)) { continue; } else { result = SCAN_EXCEED_NONE_PTE; count_vm_event(THP_SCAN_EXCEED_NONE_PTE); goto out; } } if (!pte_present(pteval)) { result = SCAN_PTE_NON_PRESENT; goto out; } if (pte_uffd_wp(pteval)) { result = SCAN_PTE_UFFD_WP; goto out; } page = vm_normal_page(vma, addr, pteval); if (unlikely(!page) || unlikely(is_zone_device_page(page))) { result = SCAN_PAGE_NULL; goto out; } folio = page_folio(page); VM_BUG_ON_FOLIO(!folio_test_anon(folio), folio); if (cc->is_khugepaged && !(vma->vm_flags & VM_DROPPABLE) && folio_test_lazyfree(folio) && !pte_dirty(pteval)) { result = SCAN_PAGE_LAZYFREE; goto out; } /* See hpage_collapse_scan_pmd(). */ if (folio_maybe_mapped_shared(folio)) { ++shared; if (cc->is_khugepaged && shared > khugepaged_max_ptes_shared) { result = SCAN_EXCEED_SHARED_PTE; count_vm_event(THP_SCAN_EXCEED_SHARED_PTE); goto out; } } if (folio_test_large(folio)) { struct folio *f; /* * Check if we have dealt with the compound page * already */ list_for_each_entry(f, compound_pagelist, lru) { if (folio == f) goto next; } } /* * We can do it before folio_isolate_lru because the * folio can't be freed from under us. NOTE: PG_lock * is needed to serialize against split_huge_page * when invoked from the VM. */ if (!folio_trylock(folio)) { result = SCAN_PAGE_LOCK; goto out; } /* * Check if the page has any GUP (or other external) pins. * * The page table that maps the page has been already unlinked * from the page table tree and this process cannot get * an additional pin on the page. * * New pins can come later if the page is shared across fork, * but not from this process. The other process cannot write to * the page, only trigger CoW. */ if (folio_expected_ref_count(folio) != folio_ref_count(folio)) { folio_unlock(folio); result = SCAN_PAGE_COUNT; goto out; } /* * Isolate the page to avoid collapsing an hugepage * currently in use by the VM. */ if (!folio_isolate_lru(folio)) { folio_unlock(folio); result = SCAN_DEL_PAGE_LRU; goto out; } node_stat_mod_folio(folio, NR_ISOLATED_ANON + folio_is_file_lru(folio), folio_nr_pages(folio)); VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); VM_BUG_ON_FOLIO(folio_test_lru(folio), folio); if (folio_test_large(folio)) list_add_tail(&folio->lru, compound_pagelist); next: /* * If collapse was initiated by khugepaged, check that there is * enough young pte to justify collapsing the page */ if (cc->is_khugepaged && (pte_young(pteval) || folio_test_young(folio) || folio_test_referenced(folio) || mmu_notifier_test_young(vma->vm_mm, addr))) referenced++; } if (unlikely(cc->is_khugepaged && !referenced)) { result = SCAN_LACK_REFERENCED_PAGE; } else { result = SCAN_SUCCEED; trace_mm_collapse_huge_page_isolate(folio, none_or_zero, referenced, result); return result; } out: release_pte_pages(pte, _pte, compound_pagelist); trace_mm_collapse_huge_page_isolate(folio, none_or_zero, referenced, result); return result; } static void __collapse_huge_page_copy_succeeded(pte_t *pte, struct vm_area_struct *vma, unsigned long address, spinlock_t *ptl, struct list_head *compound_pagelist) { unsigned long end = address + HPAGE_PMD_SIZE; struct folio *src, *tmp; pte_t pteval; pte_t *_pte; unsigned int nr_ptes; for (_pte = pte; _pte < pte + HPAGE_PMD_NR; _pte += nr_ptes, address += nr_ptes * PAGE_SIZE) { nr_ptes = 1; pteval = ptep_get(_pte); if (pte_none_or_zero(pteval)) { add_mm_counter(vma->vm_mm, MM_ANONPAGES, 1); if (pte_none(pteval)) continue; /* * ptl mostly unnecessary. */ spin_lock(ptl); ptep_clear(vma->vm_mm, address, _pte); spin_unlock(ptl); ksm_might_unmap_zero_page(vma->vm_mm, pteval); } else { struct page *src_page = pte_page(pteval); src = page_folio(src_page); if (folio_test_large(src)) { unsigned int max_nr_ptes = (end - address) >> PAGE_SHIFT; nr_ptes = folio_pte_batch(src, _pte, pteval, max_nr_ptes); } else { release_pte_folio(src); } /* * ptl mostly unnecessary, but preempt has to * be disabled to update the per-cpu stats * inside folio_remove_rmap_pte(). */ spin_lock(ptl); clear_ptes(vma->vm_mm, address, _pte, nr_ptes); folio_remove_rmap_ptes(src, src_page, nr_ptes, vma); spin_unlock(ptl); free_swap_cache(src); folio_put_refs(src, nr_ptes); } } list_for_each_entry_safe(src, tmp, compound_pagelist, lru) { list_del(&src->lru); node_stat_sub_folio(src, NR_ISOLATED_ANON + folio_is_file_lru(src)); folio_unlock(src); free_swap_cache(src); folio_putback_lru(src); } } static void __collapse_huge_page_copy_failed(pte_t *pte, pmd_t *pmd, pmd_t orig_pmd, struct vm_area_struct *vma, struct list_head *compound_pagelist) { spinlock_t *pmd_ptl; /* * Re-establish the PMD to point to the original page table * entry. Restoring PMD needs to be done prior to releasing * pages. Since pages are still isolated and locked here, * acquiring anon_vma_lock_write is unnecessary. */ pmd_ptl = pmd_lock(vma->vm_mm, pmd); pmd_populate(vma->vm_mm, pmd, pmd_pgtable(orig_pmd)); spin_unlock(pmd_ptl); /* * Release both raw and compound pages isolated * in __collapse_huge_page_isolate. */ release_pte_pages(pte, pte + HPAGE_PMD_NR, compound_pagelist); } /* * __collapse_huge_page_copy - attempts to copy memory contents from raw * pages to a hugepage. Cleans up the raw pages if copying succeeds; * otherwise restores the original page table and releases isolated raw pages. * Returns SCAN_SUCCEED if copying succeeds, otherwise returns SCAN_COPY_MC. * * @pte: starting of the PTEs to copy from * @folio: the new hugepage to copy contents to * @pmd: pointer to the new hugepage's PMD * @orig_pmd: the original raw pages' PMD * @vma: the original raw pages' virtual memory area * @address: starting address to copy * @ptl: lock on raw pages' PTEs * @compound_pagelist: list that stores compound pages */ static enum scan_result __collapse_huge_page_copy(pte_t *pte, struct folio *folio, pmd_t *pmd, pmd_t orig_pmd, struct vm_area_struct *vma, unsigned long address, spinlock_t *ptl, struct list_head *compound_pagelist) { unsigned int i; enum scan_result result = SCAN_SUCCEED; /* * Copying pages' contents is subject to memory poison at any iteration. */ for (i = 0; i < HPAGE_PMD_NR; i++) { pte_t pteval = ptep_get(pte + i); struct page *page = folio_page(folio, i); unsigned long src_addr = address + i * PAGE_SIZE; struct page *src_page; if (pte_none_or_zero(pteval)) { clear_user_highpage(page, src_addr); continue; } src_page = pte_page(pteval); if (copy_mc_user_highpage(page, src_page, src_addr, vma) > 0) { result = SCAN_COPY_MC; break; } } if (likely(result == SCAN_SUCCEED)) __collapse_huge_page_copy_succeeded(pte, vma, address, ptl, compound_pagelist); else __collapse_huge_page_copy_failed(pte, pmd, orig_pmd, vma, compound_pagelist); return result; } static void khugepaged_alloc_sleep(void) { DEFINE_WAIT(wait); add_wait_queue(&khugepaged_wait, &wait); __set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE); schedule_timeout(msecs_to_jiffies(khugepaged_alloc_sleep_millisecs)); remove_wait_queue(&khugepaged_wait, &wait); } static struct collapse_control khugepaged_collapse_control = { .is_khugepaged = true, }; static bool hpage_collapse_scan_abort(int nid, struct collapse_control *cc) { int i; /* * If node_reclaim_mode is disabled, then no extra effort is made to * allocate memory locally. */ if (!node_reclaim_enabled()) return false; /* If there is a count for this node already, it must be acceptable */ if (cc->node_load[nid]) return false; for (i = 0; i < MAX_NUMNODES; i++) { if (!cc->node_load[i]) continue; if (node_distance(nid, i) > node_reclaim_distance) return true; } return false; } #define khugepaged_defrag() \ (transparent_hugepage_flags & \ (1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG)) /* Defrag for khugepaged will enter direct reclaim/compaction if necessary */ static inline gfp_t alloc_hugepage_khugepaged_gfpmask(void) { return khugepaged_defrag() ? GFP_TRANSHUGE : GFP_TRANSHUGE_LIGHT; } #ifdef CONFIG_NUMA static int hpage_collapse_find_target_node(struct collapse_control *cc) { int nid, target_node = 0, max_value = 0; /* find first node with max normal pages hit */ for (nid = 0; nid < MAX_NUMNODES; nid++) if (cc->node_load[nid] > max_value) { max_value = cc->node_load[nid]; target_node = nid; } for_each_online_node(nid) { if (max_value == cc->node_load[nid]) node_set(nid, cc->alloc_nmask); } return target_node; } #else static int hpage_collapse_find_target_node(struct collapse_control *cc) { return 0; } #endif /* * If mmap_lock temporarily dropped, revalidate vma * before taking mmap_lock. * Returns enum scan_result value. */ static enum scan_result hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address, bool expect_anon, struct vm_area_struct **vmap, struct collapse_control *cc) { struct vm_area_struct *vma; enum tva_type type = cc->is_khugepaged ? TVA_KHUGEPAGED : TVA_FORCED_COLLAPSE; if (unlikely(hpage_collapse_test_exit_or_disable(mm))) return SCAN_ANY_PROCESS; *vmap = vma = find_vma(mm, address); if (!vma) return SCAN_VMA_NULL; if (!thp_vma_suitable_order(vma, address, PMD_ORDER)) return SCAN_ADDRESS_RANGE; if (!thp_vma_allowable_order(vma, vma->vm_flags, type, PMD_ORDER)) return SCAN_VMA_CHECK; /* * Anon VMA expected, the address may be unmapped then * remapped to file after khugepaged reaquired the mmap_lock. * * thp_vma_allowable_order may return true for qualified file * vmas. */ if (expect_anon && (!(*vmap)->anon_vma || !vma_is_anonymous(*vmap))) return SCAN_PAGE_ANON; return SCAN_SUCCEED; } static inline enum scan_result check_pmd_state(pmd_t *pmd) { pmd_t pmde = pmdp_get_lockless(pmd); if (pmd_none(pmde)) return SCAN_NO_PTE_TABLE; /* * The folio may be under migration when khugepaged is trying to * collapse it. Migration success or failure will eventually end * up with a present PMD mapping a folio again. */ if (pmd_is_migration_entry(pmde)) return SCAN_PMD_MAPPED; if (!pmd_present(pmde)) return SCAN_NO_PTE_TABLE; if (pmd_trans_huge(pmde)) return SCAN_PMD_MAPPED; if (pmd_bad(pmde)) return SCAN_NO_PTE_TABLE; return SCAN_SUCCEED; } static enum scan_result find_pmd_or_thp_or_none(struct mm_struct *mm, unsigned long address, pmd_t **pmd) { *pmd = mm_find_pmd(mm, address); if (!*pmd) return SCAN_NO_PTE_TABLE; return check_pmd_state(*pmd); } static enum scan_result check_pmd_still_valid(struct mm_struct *mm, unsigned long address, pmd_t *pmd) { pmd_t *new_pmd; enum scan_result result = find_pmd_or_thp_or_none(mm, address, &new_pmd); if (result != SCAN_SUCCEED) return result; if (new_pmd != pmd) return SCAN_FAIL; return SCAN_SUCCEED; } /* * Bring missing pages in from swap, to complete THP collapse. * Only done if hpage_collapse_scan_pmd believes it is worthwhile. * * Called and returns without pte mapped or spinlocks held. * Returns result: if not SCAN_SUCCEED, mmap_lock has been released. */ static enum scan_result __collapse_huge_page_swapin(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long start_addr, pmd_t *pmd, int referenced) { int swapped_in = 0; vm_fault_t ret = 0; unsigned long addr, end = start_addr + (HPAGE_PMD_NR * PAGE_SIZE); enum scan_result result; pte_t *pte = NULL; spinlock_t *ptl; for (addr = start_addr; addr < end; addr += PAGE_SIZE) { struct vm_fault vmf = { .vma = vma, .address = addr, .pgoff = linear_page_index(vma, addr), .flags = FAULT_FLAG_ALLOW_RETRY, .pmd = pmd, }; if (!pte++) { /* * Here the ptl is only used to check pte_same() in * do_swap_page(), so readonly version is enough. */ pte = pte_offset_map_ro_nolock(mm, pmd, addr, &ptl); if (!pte) { mmap_read_unlock(mm); result = SCAN_NO_PTE_TABLE; goto out; } } vmf.orig_pte = ptep_get_lockless(pte); if (pte_none(vmf.orig_pte) || pte_present(vmf.orig_pte)) continue; vmf.pte = pte; vmf.ptl = ptl; ret = do_swap_page(&vmf); /* Which unmaps pte (after perhaps re-checking the entry) */ pte = NULL; /* * do_swap_page returns VM_FAULT_RETRY with released mmap_lock. * Note we treat VM_FAULT_RETRY as VM_FAULT_ERROR here because * we do not retry here and swap entry will remain in pagetable * resulting in later failure. */ if (ret & VM_FAULT_RETRY) { /* Likely, but not guaranteed, that page lock failed */ result = SCAN_PAGE_LOCK; goto out; } if (ret & VM_FAULT_ERROR) { mmap_read_unlock(mm); result = SCAN_FAIL; goto out; } swapped_in++; } if (pte) pte_unmap(pte); /* Drain LRU cache to remove extra pin on the swapped in pages */ if (swapped_in) lru_add_drain(); result = SCAN_SUCCEED; out: trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, result); return result; } static enum scan_result alloc_charge_folio(struct folio **foliop, struct mm_struct *mm, struct collapse_control *cc) { gfp_t gfp = (cc->is_khugepaged ? alloc_hugepage_khugepaged_gfpmask() : GFP_TRANSHUGE); int node = hpage_collapse_find_target_node(cc); struct folio *folio; folio = __folio_alloc(gfp, HPAGE_PMD_ORDER, node, &cc->alloc_nmask); if (!folio) { *foliop = NULL; count_vm_event(THP_COLLAPSE_ALLOC_FAILED); return SCAN_ALLOC_HUGE_PAGE_FAIL; } count_vm_event(THP_COLLAPSE_ALLOC); if (unlikely(mem_cgroup_charge(folio, mm, gfp))) { folio_put(folio); *foliop = NULL; return SCAN_CGROUP_CHARGE_FAIL; } count_memcg_folio_events(folio, THP_COLLAPSE_ALLOC, 1); *foliop = folio; return SCAN_SUCCEED; } static enum scan_result collapse_huge_page(struct mm_struct *mm, unsigned long address, int referenced, int unmapped, struct collapse_control *cc) { LIST_HEAD(compound_pagelist); pmd_t *pmd, _pmd; pte_t *pte; pgtable_t pgtable; struct folio *folio; spinlock_t *pmd_ptl, *pte_ptl; enum scan_result result = SCAN_FAIL; struct vm_area_struct *vma; struct mmu_notifier_range range; VM_BUG_ON(address & ~HPAGE_PMD_MASK); /* * Before allocating the hugepage, release the mmap_lock read lock. * The allocation can take potentially a long time if it involves * sync compaction, and we do not need to hold the mmap_lock during * that. We will recheck the vma after taking it again in write mode. */ mmap_read_unlock(mm); result = alloc_charge_folio(&folio, mm, cc); if (result != SCAN_SUCCEED) goto out_nolock; mmap_read_lock(mm); result = hugepage_vma_revalidate(mm, address, true, &vma, cc); if (result != SCAN_SUCCEED) { mmap_read_unlock(mm); goto out_nolock; } result = find_pmd_or_thp_or_none(mm, address, &pmd); if (result != SCAN_SUCCEED) { mmap_read_unlock(mm); goto out_nolock; } if (unmapped) { /* * __collapse_huge_page_swapin will return with mmap_lock * released when it fails. So we jump out_nolock directly in * that case. Continuing to collapse causes inconsistency. */ result = __collapse_huge_page_swapin(mm, vma, address, pmd, referenced); if (result != SCAN_SUCCEED) goto out_nolock; } mmap_read_unlock(mm); /* * Prevent all access to pagetables with the exception of * gup_fast later handled by the ptep_clear_flush and the VM * handled by the anon_vma lock + PG_lock. * * UFFDIO_MOVE is prevented to race as well thanks to the * mmap_lock. */ mmap_write_lock(mm); result = hugepage_vma_revalidate(mm, address, true, &vma, cc); if (result != SCAN_SUCCEED) goto out_up_write; /* check if the pmd is still valid */ vma_start_write(vma); result = check_pmd_still_valid(mm, address, pmd); if (result != SCAN_SUCCEED) goto out_up_write; anon_vma_lock_write(vma->anon_vma); mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, address, address + HPAGE_PMD_SIZE); mmu_notifier_invalidate_range_start(&range); pmd_ptl = pmd_lock(mm, pmd); /* probably unnecessary */ /* * This removes any huge TLB entry from the CPU so we won't allow * huge and small TLB entries for the same virtual address to * avoid the risk of CPU bugs in that area. * * Parallel GUP-fast is fine since GUP-fast will back off when * it detects PMD is changed. */ _pmd = pmdp_collapse_flush(vma, address, pmd); spin_unlock(pmd_ptl); mmu_notifier_invalidate_range_end(&range); tlb_remove_table_sync_one(); pte = pte_offset_map_lock(mm, &_pmd, address, &pte_ptl); if (pte) { result = __collapse_huge_page_isolate(vma, address, pte, cc, &compound_pagelist); spin_unlock(pte_ptl); } else { result = SCAN_NO_PTE_TABLE; } if (unlikely(result != SCAN_SUCCEED)) { if (pte) pte_unmap(pte); spin_lock(pmd_ptl); BUG_ON(!pmd_none(*pmd)); /* * We can only use set_pmd_at when establishing * hugepmds and never for establishing regular pmds that * points to regular pagetables. Use pmd_populate for that */ pmd_populate(mm, pmd, pmd_pgtable(_pmd)); spin_unlock(pmd_ptl); anon_vma_unlock_write(vma->anon_vma); goto out_up_write; } /* * All pages are isolated and locked so anon_vma rmap * can't run anymore. */ anon_vma_unlock_write(vma->anon_vma); result = __collapse_huge_page_copy(pte, folio, pmd, _pmd, vma, address, pte_ptl, &compound_pagelist); pte_unmap(pte); if (unlikely(result != SCAN_SUCCEED)) goto out_up_write; /* * The smp_wmb() inside __folio_mark_uptodate() ensures the * copy_huge_page writes become visible before the set_pmd_at() * write. */ __folio_mark_uptodate(folio); pgtable = pmd_pgtable(_pmd); spin_lock(pmd_ptl); BUG_ON(!pmd_none(*pmd)); pgtable_trans_huge_deposit(mm, pmd, pgtable); map_anon_folio_pmd_nopf(folio, pmd, vma, address); spin_unlock(pmd_ptl); folio = NULL; result = SCAN_SUCCEED; out_up_write: mmap_write_unlock(mm); out_nolock: if (folio) folio_put(folio); trace_mm_collapse_huge_page(mm, result == SCAN_SUCCEED, result); return result; } static enum scan_result hpage_collapse_scan_pmd(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long start_addr, bool *mmap_locked, struct collapse_control *cc) { pmd_t *pmd; pte_t *pte, *_pte; int none_or_zero = 0, shared = 0, referenced = 0; enum scan_result result = SCAN_FAIL; struct page *page = NULL; struct folio *folio = NULL; unsigned long addr; spinlock_t *ptl; int node = NUMA_NO_NODE, unmapped = 0; VM_BUG_ON(start_addr & ~HPAGE_PMD_MASK); result = find_pmd_or_thp_or_none(mm, start_addr, &pmd); if (result != SCAN_SUCCEED) { cc->progress++; goto out; } memset(cc->node_load, 0, sizeof(cc->node_load)); nodes_clear(cc->alloc_nmask); pte = pte_offset_map_lock(mm, pmd, start_addr, &ptl); if (!pte) { cc->progress++; result = SCAN_NO_PTE_TABLE; goto out; } for (addr = start_addr, _pte = pte; _pte < pte + HPAGE_PMD_NR; _pte++, addr += PAGE_SIZE) { cc->progress++; pte_t pteval = ptep_get(_pte); if (pte_none_or_zero(pteval)) { ++none_or_zero; if (!userfaultfd_armed(vma) && (!cc->is_khugepaged || none_or_zero <= khugepaged_max_ptes_none)) { continue; } else { result = SCAN_EXCEED_NONE_PTE; count_vm_event(THP_SCAN_EXCEED_NONE_PTE); goto out_unmap; } } if (!pte_present(pteval)) { ++unmapped; if (!cc->is_khugepaged || unmapped <= khugepaged_max_ptes_swap) { /* * Always be strict with uffd-wp * enabled swap entries. Please see * comment below for pte_uffd_wp(). */ if (pte_swp_uffd_wp_any(pteval)) { result = SCAN_PTE_UFFD_WP; goto out_unmap; } continue; } else { result = SCAN_EXCEED_SWAP_PTE; count_vm_event(THP_SCAN_EXCEED_SWAP_PTE); goto out_unmap; } } if (pte_uffd_wp(pteval)) { /* * Don't collapse the page if any of the small * PTEs are armed with uffd write protection. * Here we can also mark the new huge pmd as * write protected if any of the small ones is * marked but that could bring unknown * userfault messages that falls outside of * the registered range. So, just be simple. */ result = SCAN_PTE_UFFD_WP; goto out_unmap; } page = vm_normal_page(vma, addr, pteval); if (unlikely(!page) || unlikely(is_zone_device_page(page))) { result = SCAN_PAGE_NULL; goto out_unmap; } folio = page_folio(page); if (cc->is_khugepaged && !(vma->vm_flags & VM_DROPPABLE) && folio_test_lazyfree(folio) && !pte_dirty(pteval)) { result = SCAN_PAGE_LAZYFREE; goto out_unmap; } if (!folio_test_anon(folio)) { result = SCAN_PAGE_ANON; goto out_unmap; } /* * We treat a single page as shared if any part of the THP * is shared. */ if (folio_maybe_mapped_shared(folio)) { ++shared; if (cc->is_khugepaged && shared > khugepaged_max_ptes_shared) { result = SCAN_EXCEED_SHARED_PTE; count_vm_event(THP_SCAN_EXCEED_SHARED_PTE); goto out_unmap; } } /* * Record which node the original page is from and save this * information to cc->node_load[]. * Khugepaged will allocate hugepage from the node has the max * hit record. */ node = folio_nid(folio); if (hpage_collapse_scan_abort(node, cc)) { result = SCAN_SCAN_ABORT; goto out_unmap; } cc->node_load[node]++; if (!folio_test_lru(folio)) { result = SCAN_PAGE_LRU; goto out_unmap; } if (folio_test_locked(folio)) { result = SCAN_PAGE_LOCK; goto out_unmap; } /* * Check if the page has any GUP (or other external) pins. * * Here the check may be racy: * it may see folio_mapcount() > folio_ref_count(). * But such case is ephemeral we could always retry collapse * later. However it may report false positive if the page * has excessive GUP pins (i.e. 512). Anyway the same check * will be done again later the risk seems low. */ if (folio_expected_ref_count(folio) != folio_ref_count(folio)) { result = SCAN_PAGE_COUNT; goto out_unmap; } /* * If collapse was initiated by khugepaged, check that there is * enough young pte to justify collapsing the page */ if (cc->is_khugepaged && (pte_young(pteval) || folio_test_young(folio) || folio_test_referenced(folio) || mmu_notifier_test_young(vma->vm_mm, addr))) referenced++; } if (cc->is_khugepaged && (!referenced || (unmapped && referenced < HPAGE_PMD_NR / 2))) { result = SCAN_LACK_REFERENCED_PAGE; } else { result = SCAN_SUCCEED; } out_unmap: pte_unmap_unlock(pte, ptl); if (result == SCAN_SUCCEED) { result = collapse_huge_page(mm, start_addr, referenced, unmapped, cc); /* collapse_huge_page will return with the mmap_lock released */ *mmap_locked = false; } out: trace_mm_khugepaged_scan_pmd(mm, folio, referenced, none_or_zero, result, unmapped); return result; } static void collect_mm_slot(struct mm_slot *slot) { struct mm_struct *mm = slot->mm; lockdep_assert_held(&khugepaged_mm_lock); if (hpage_collapse_test_exit(mm)) { /* free mm_slot */ hash_del(&slot->hash); list_del(&slot->mm_node); /* * Not strictly needed because the mm exited already. * * mm_flags_clear(MMF_VM_HUGEPAGE, mm); */ /* khugepaged_mm_lock actually not necessary for the below */ mm_slot_free(mm_slot_cache, slot); mmdrop(mm); } } /* folio must be locked, and mmap_lock must be held */ static enum scan_result set_huge_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmdp, struct folio *folio, struct page *page) { struct mm_struct *mm = vma->vm_mm; struct vm_fault vmf = { .vma = vma, .address = addr, .flags = 0, }; pgd_t *pgdp; p4d_t *p4dp; pud_t *pudp; mmap_assert_locked(vma->vm_mm); if (!pmdp) { pgdp = pgd_offset(mm, addr); p4dp = p4d_alloc(mm, pgdp, addr); if (!p4dp) return SCAN_FAIL; pudp = pud_alloc(mm, p4dp, addr); if (!pudp) return SCAN_FAIL; pmdp = pmd_alloc(mm, pudp, addr); if (!pmdp) return SCAN_FAIL; } vmf.pmd = pmdp; if (do_set_pmd(&vmf, folio, page)) return SCAN_FAIL; folio_get(folio); return SCAN_SUCCEED; } static enum scan_result try_collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr, bool install_pmd) { enum scan_result result = SCAN_FAIL; int nr_mapped_ptes = 0; unsigned int nr_batch_ptes; struct mmu_notifier_range range; bool notified = false; unsigned long haddr = addr & HPAGE_PMD_MASK; unsigned long end = haddr + HPAGE_PMD_SIZE; struct vm_area_struct *vma = vma_lookup(mm, haddr); struct folio *folio; pte_t *start_pte, *pte; pmd_t *pmd, pgt_pmd; spinlock_t *pml = NULL, *ptl; int i; mmap_assert_locked(mm); /* First check VMA found, in case page tables are being torn down */ if (!vma || !vma->vm_file || !range_in_vma(vma, haddr, haddr + HPAGE_PMD_SIZE)) return SCAN_VMA_CHECK; /* Fast check before locking page if already PMD-mapped */ result = find_pmd_or_thp_or_none(mm, haddr, &pmd); if (result == SCAN_PMD_MAPPED) return result; /* * If we are here, we've succeeded in replacing all the native pages * in the page cache with a single hugepage. If a mm were to fault-in * this memory (mapped by a suitably aligned VMA), we'd get the hugepage * and map it by a PMD, regardless of sysfs THP settings. As such, let's * analogously elide sysfs THP settings here and force collapse. */ if (!thp_vma_allowable_order(vma, vma->vm_flags, TVA_FORCED_COLLAPSE, PMD_ORDER)) return SCAN_VMA_CHECK; /* Keep pmd pgtable for uffd-wp; see comment in retract_page_tables() */ if (userfaultfd_wp(vma)) return SCAN_PTE_UFFD_WP; folio = filemap_lock_folio(vma->vm_file->f_mapping, linear_page_index(vma, haddr)); if (IS_ERR(folio)) return SCAN_PAGE_NULL; if (folio_order(folio) != HPAGE_PMD_ORDER) { result = SCAN_PAGE_COMPOUND; goto drop_folio; } result = find_pmd_or_thp_or_none(mm, haddr, &pmd); switch (result) { case SCAN_SUCCEED: break; case SCAN_NO_PTE_TABLE: /* * All pte entries have been removed and pmd cleared. * Skip all the pte checks and just update the pmd mapping. */ goto maybe_install_pmd; default: goto drop_folio; } result = SCAN_FAIL; start_pte = pte_offset_map_lock(mm, pmd, haddr, &ptl); if (!start_pte) /* mmap_lock + page lock should prevent this */ goto drop_folio; /* step 1: check all mapped PTEs are to the right huge page */ for (i = 0, addr = haddr, pte = start_pte; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE, pte++) { struct page *page; pte_t ptent = ptep_get(pte); /* empty pte, skip */ if (pte_none(ptent)) continue; /* page swapped out, abort */ if (!pte_present(ptent)) { result = SCAN_PTE_NON_PRESENT; goto abort; } page = vm_normal_page(vma, addr, ptent); if (WARN_ON_ONCE(page && is_zone_device_page(page))) page = NULL; /* * Note that uprobe, debugger, or MAP_PRIVATE may change the * page table, but the new page will not be a subpage of hpage. */ if (folio_page(folio, i) != page) goto abort; } pte_unmap_unlock(start_pte, ptl); mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, haddr, haddr + HPAGE_PMD_SIZE); mmu_notifier_invalidate_range_start(&range); notified = true; /* * pmd_lock covers a wider range than ptl, and (if split from mm's * page_table_lock) ptl nests inside pml. The less time we hold pml, * the better; but userfaultfd's mfill_atomic_pte() on a private VMA * inserts a valid as-if-COWed PTE without even looking up page cache. * So page lock of folio does not protect from it, so we must not drop * ptl before pgt_pmd is removed, so uffd private needs pml taken now. */ if (userfaultfd_armed(vma) && !(vma->vm_flags & VM_SHARED)) pml = pmd_lock(mm, pmd); start_pte = pte_offset_map_rw_nolock(mm, pmd, haddr, &pgt_pmd, &ptl); if (!start_pte) /* mmap_lock + page lock should prevent this */ goto abort; if (!pml) spin_lock(ptl); else if (ptl != pml) spin_lock_nested(ptl, SINGLE_DEPTH_NESTING); if (unlikely(!pmd_same(pgt_pmd, pmdp_get_lockless(pmd)))) goto abort; /* step 2: clear page table and adjust rmap */ for (i = 0, addr = haddr, pte = start_pte; i < HPAGE_PMD_NR; i += nr_batch_ptes, addr += nr_batch_ptes * PAGE_SIZE, pte += nr_batch_ptes) { unsigned int max_nr_batch_ptes = (end - addr) >> PAGE_SHIFT; struct page *page; pte_t ptent = ptep_get(pte); nr_batch_ptes = 1; if (pte_none(ptent)) continue; /* * We dropped ptl after the first scan, to do the mmu_notifier: * page lock stops more PTEs of the folio being faulted in, but * does not stop write faults COWing anon copies from existing * PTEs; and does not stop those being swapped out or migrated. */ if (!pte_present(ptent)) { result = SCAN_PTE_NON_PRESENT; goto abort; } page = vm_normal_page(vma, addr, ptent); if (folio_page(folio, i) != page) goto abort; nr_batch_ptes = folio_pte_batch(folio, pte, ptent, max_nr_batch_ptes); /* * Must clear entry, or a racing truncate may re-remove it. * TLB flush can be left until pmdp_collapse_flush() does it. * PTE dirty? Shmem page is already dirty; file is read-only. */ clear_ptes(mm, addr, pte, nr_batch_ptes); folio_remove_rmap_ptes(folio, page, nr_batch_ptes, vma); nr_mapped_ptes += nr_batch_ptes; } if (!pml) spin_unlock(ptl); /* step 3: set proper refcount and mm_counters. */ if (nr_mapped_ptes) { folio_ref_sub(folio, nr_mapped_ptes); add_mm_counter(mm, mm_counter_file(folio), -nr_mapped_ptes); } /* step 4: remove empty page table */ if (!pml) { pml = pmd_lock(mm, pmd); if (ptl != pml) { spin_lock_nested(ptl, SINGLE_DEPTH_NESTING); if (unlikely(!pmd_same(pgt_pmd, pmdp_get_lockless(pmd)))) { flush_tlb_mm(mm); goto unlock; } } } pgt_pmd = pmdp_collapse_flush(vma, haddr, pmd); pmdp_get_lockless_sync(); pte_unmap_unlock(start_pte, ptl); if (ptl != pml) spin_unlock(pml); mmu_notifier_invalidate_range_end(&range); mm_dec_nr_ptes(mm); page_table_check_pte_clear_range(mm, haddr, pgt_pmd); pte_free_defer(mm, pmd_pgtable(pgt_pmd)); maybe_install_pmd: /* step 5: install pmd entry */ result = install_pmd ? set_huge_pmd(vma, haddr, pmd, folio, &folio->page) : SCAN_SUCCEED; goto drop_folio; abort: if (nr_mapped_ptes) { flush_tlb_mm(mm); folio_ref_sub(folio, nr_mapped_ptes); add_mm_counter(mm, mm_counter_file(folio), -nr_mapped_ptes); } unlock: if (start_pte) pte_unmap_unlock(start_pte, ptl); if (pml && pml != ptl) spin_unlock(pml); if (notified) mmu_notifier_invalidate_range_end(&range); drop_folio: folio_unlock(folio); folio_put(folio); return result; } /** * collapse_pte_mapped_thp - Try to collapse a pte-mapped THP for mm at * address haddr. * * @mm: process address space where collapse happens * @addr: THP collapse address * @install_pmd: If a huge PMD should be installed * * This function checks whether all the PTEs in the PMD are pointing to the * right THP. If so, retract the page table so the THP can refault in with * as pmd-mapped. Possibly install a huge PMD mapping the THP. */ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr, bool install_pmd) { try_collapse_pte_mapped_thp(mm, addr, install_pmd); } /* Can we retract page tables for this file-backed VMA? */ static bool file_backed_vma_is_retractable(struct vm_area_struct *vma) { /* * Check vma->anon_vma to exclude MAP_PRIVATE mappings that * got written to. These VMAs are likely not worth removing * page tables from, as PMD-mapping is likely to be split later. */ if (READ_ONCE(vma->anon_vma)) return false; /* * When a vma is registered with uffd-wp, we cannot recycle * the page table because there may be pte markers installed. * Other vmas can still have the same file mapped hugely, but * skip this one: it will always be mapped in small page size * for uffd-wp registered ranges. */ if (userfaultfd_wp(vma)) return false; /* * If the VMA contains guard regions then we can't collapse it. * * This is set atomically on guard marker installation under mmap/VMA * read lock, and here we may not hold any VMA or mmap lock at all. * * This is therefore serialised on the PTE page table lock, which is * obtained on guard region installation after the flag is set, so this * check being performed under this lock excludes races. */ if (vma_test_atomic_flag(vma, VMA_MAYBE_GUARD_BIT)) return false; return true; } static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) { struct vm_area_struct *vma; i_mmap_lock_read(mapping); vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { struct mmu_notifier_range range; struct mm_struct *mm; unsigned long addr; pmd_t *pmd, pgt_pmd; spinlock_t *pml; spinlock_t *ptl; bool success = false; addr = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); if (addr & ~HPAGE_PMD_MASK || vma->vm_end < addr + HPAGE_PMD_SIZE) continue; mm = vma->vm_mm; if (find_pmd_or_thp_or_none(mm, addr, &pmd) != SCAN_SUCCEED) continue; if (hpage_collapse_test_exit(mm)) continue; if (!file_backed_vma_is_retractable(vma)) continue; /* PTEs were notified when unmapped; but now for the PMD? */ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, addr, addr + HPAGE_PMD_SIZE); mmu_notifier_invalidate_range_start(&range); pml = pmd_lock(mm, pmd); /* * The lock of new_folio is still held, we will be blocked in * the page fault path, which prevents the pte entries from * being set again. So even though the old empty PTE page may be * concurrently freed and a new PTE page is filled into the pmd * entry, it is still empty and can be removed. * * So here we only need to recheck if the state of pmd entry * still meets our requirements, rather than checking pmd_same() * like elsewhere. */ if (check_pmd_state(pmd) != SCAN_SUCCEED) goto drop_pml; ptl = pte_lockptr(mm, pmd); if (ptl != pml) spin_lock_nested(ptl, SINGLE_DEPTH_NESTING); /* * Huge page lock is still held, so normally the page table must * remain empty; and we have already skipped anon_vma and * userfaultfd_wp() vmas. But since the mmap_lock is not held, * it is still possible for a racing userfaultfd_ioctl() or * madvise() to have inserted ptes or markers. Now that we hold * ptlock, repeating the retractable checks protects us from * races against the prior checks. */ if (likely(file_backed_vma_is_retractable(vma))) { pgt_pmd = pmdp_collapse_flush(vma, addr, pmd); pmdp_get_lockless_sync(); success = true; } if (ptl != pml) spin_unlock(ptl); drop_pml: spin_unlock(pml); mmu_notifier_invalidate_range_end(&range); if (success) { mm_dec_nr_ptes(mm); page_table_check_pte_clear_range(mm, addr, pgt_pmd); pte_free_defer(mm, pmd_pgtable(pgt_pmd)); } } i_mmap_unlock_read(mapping); } /** * collapse_file - collapse filemap/tmpfs/shmem pages into huge one. * * @mm: process address space where collapse happens * @addr: virtual collapse start address * @file: file that collapse on * @start: collapse start address * @cc: collapse context and scratchpad * * Basic scheme is simple, details are more complex: * - allocate and lock a new huge page; * - scan page cache, locking old pages * + swap/gup in pages if necessary; * - copy data to new page * - handle shmem holes * + re-validate that holes weren't filled by someone else * + check for userfaultfd * - finalize updates to the page cache; * - if replacing succeeds: * + unlock huge page; * + free old pages; * - if replacing failed; * + unlock old pages * + unlock and free huge page; */ static enum scan_result collapse_file(struct mm_struct *mm, unsigned long addr, struct file *file, pgoff_t start, struct collapse_control *cc) { struct address_space *mapping = file->f_mapping; struct page *dst; struct folio *folio, *tmp, *new_folio; pgoff_t index = 0, end = start + HPAGE_PMD_NR; LIST_HEAD(pagelist); XA_STATE_ORDER(xas, &mapping->i_pages, start, HPAGE_PMD_ORDER); enum scan_result result = SCAN_SUCCEED; int nr_none = 0; bool is_shmem = shmem_file(file); VM_BUG_ON(!IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && !is_shmem); VM_BUG_ON(start & (HPAGE_PMD_NR - 1)); result = alloc_charge_folio(&new_folio, mm, cc); if (result != SCAN_SUCCEED) goto out; mapping_set_update(&xas, mapping); __folio_set_locked(new_folio); if (is_shmem) __folio_set_swapbacked(new_folio); new_folio->index = start; new_folio->mapping = mapping; /* * Ensure we have slots for all the pages in the range. This is * almost certainly a no-op because most of the pages must be present */ do { xas_lock_irq(&xas); xas_create_range(&xas); if (!xas_error(&xas)) break; xas_unlock_irq(&xas); if (!xas_nomem(&xas, GFP_KERNEL)) { result = SCAN_FAIL; goto rollback; } } while (1); for (index = start; index < end;) { xas_set(&xas, index); folio = xas_load(&xas); VM_BUG_ON(index != xas.xa_index); if (is_shmem) { if (!folio) { /* * Stop if extent has been truncated or * hole-punched, and is now completely * empty. */ if (index == start) { if (!xas_next_entry(&xas, end - 1)) { result = SCAN_TRUNCATED; goto xa_locked; } } nr_none++; index++; continue; } if (xa_is_value(folio) || !folio_test_uptodate(folio)) { xas_unlock_irq(&xas); /* swap in or instantiate fallocated page */ if (shmem_get_folio(mapping->host, index, 0, &folio, SGP_NOALLOC)) { result = SCAN_FAIL; goto xa_unlocked; } /* drain lru cache to help folio_isolate_lru() */ lru_add_drain(); } else if (folio_trylock(folio)) { folio_get(folio); xas_unlock_irq(&xas); } else { result = SCAN_PAGE_LOCK; goto xa_locked; } } else { /* !is_shmem */ if (!folio || xa_is_value(folio)) { xas_unlock_irq(&xas); page_cache_sync_readahead(mapping, &file->f_ra, file, index, end - index); /* drain lru cache to help folio_isolate_lru() */ lru_add_drain(); folio = filemap_lock_folio(mapping, index); if (IS_ERR(folio)) { result = SCAN_FAIL; goto xa_unlocked; } } else if (folio_test_dirty(folio)) { /* * khugepaged only works on read-only fd, * so this page is dirty because it hasn't * been flushed since first write. There * won't be new dirty pages. * * Trigger async flush here and hope the * writeback is done when khugepaged * revisits this page. * * This is a one-off situation. We are not * forcing writeback in loop. */ xas_unlock_irq(&xas); filemap_flush(mapping); result = SCAN_PAGE_DIRTY_OR_WRITEBACK; goto xa_unlocked; } else if (folio_test_writeback(folio)) { xas_unlock_irq(&xas); result = SCAN_PAGE_DIRTY_OR_WRITEBACK; goto xa_unlocked; } else if (folio_trylock(folio)) { folio_get(folio); xas_unlock_irq(&xas); } else { result = SCAN_PAGE_LOCK; goto xa_locked; } } /* * The folio must be locked, so we can drop the i_pages lock * without racing with truncate. */ VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); /* make sure the folio is up to date */ if (unlikely(!folio_test_uptodate(folio))) { result = SCAN_FAIL; goto out_unlock; } /* * If file was truncated then extended, or hole-punched, before * we locked the first folio, then a THP might be there already. * This will be discovered on the first iteration. */ if (folio_order(folio) == HPAGE_PMD_ORDER && folio->index == start) { /* Maybe PMD-mapped */ result = SCAN_PTE_MAPPED_HUGEPAGE; goto out_unlock; } if (folio_mapping(folio) != mapping) { result = SCAN_TRUNCATED; goto out_unlock; } if (!is_shmem && (folio_test_dirty(folio) || folio_test_writeback(folio))) { /* * khugepaged only works on read-only fd, so this * folio is dirty because it hasn't been flushed * since first write. */ result = SCAN_PAGE_DIRTY_OR_WRITEBACK; goto out_unlock; } if (!folio_isolate_lru(folio)) { result = SCAN_DEL_PAGE_LRU; goto out_unlock; } if (!filemap_release_folio(folio, GFP_KERNEL)) { result = SCAN_PAGE_HAS_PRIVATE; folio_putback_lru(folio); goto out_unlock; } if (folio_mapped(folio)) try_to_unmap(folio, TTU_IGNORE_MLOCK | TTU_BATCH_FLUSH); xas_lock_irq(&xas); VM_BUG_ON_FOLIO(folio != xa_load(xas.xa, index), folio); /* * We control 2 + nr_pages references to the folio: * - we hold a pin on it; * - nr_pages reference from page cache; * - one from lru_isolate_folio; * If those are the only references, then any new usage * of the folio will have to fetch it from the page * cache. That requires locking the folio to handle * truncate, so any new usage will be blocked until we * unlock folio after collapse/during rollback. */ if (folio_ref_count(folio) != 2 + folio_nr_pages(folio)) { result = SCAN_PAGE_COUNT; xas_unlock_irq(&xas); folio_putback_lru(folio); goto out_unlock; } /* * Accumulate the folios that are being collapsed. */ list_add_tail(&folio->lru, &pagelist); index += folio_nr_pages(folio); continue; out_unlock: folio_unlock(folio); folio_put(folio); goto xa_unlocked; } if (!is_shmem) { filemap_nr_thps_inc(mapping); /* * Paired with the fence in do_dentry_open() -> get_write_access() * to ensure i_writecount is up to date and the update to nr_thps * is visible. Ensures the page cache will be truncated if the * file is opened writable. */ smp_mb(); if (inode_is_open_for_write(mapping->host)) { result = SCAN_FAIL; filemap_nr_thps_dec(mapping); } } xa_locked: xas_unlock_irq(&xas); xa_unlocked: /* * If collapse is successful, flush must be done now before copying. * If collapse is unsuccessful, does flush actually need to be done? * Do it anyway, to clear the state. */ try_to_unmap_flush(); if (result == SCAN_SUCCEED && nr_none && !shmem_charge(mapping->host, nr_none)) result = SCAN_FAIL; if (result != SCAN_SUCCEED) { nr_none = 0; goto rollback; } /* * The old folios are locked, so they won't change anymore. */ index = start; dst = folio_page(new_folio, 0); list_for_each_entry(folio, &pagelist, lru) { int i, nr_pages = folio_nr_pages(folio); while (index < folio->index) { clear_highpage(dst); index++; dst++; } for (i = 0; i < nr_pages; i++) { if (copy_mc_highpage(dst, folio_page(folio, i)) > 0) { result = SCAN_COPY_MC; goto rollback; } index++; dst++; } } while (index < end) { clear_highpage(dst); index++; dst++; } if (nr_none) { struct vm_area_struct *vma; int nr_none_check = 0; i_mmap_lock_read(mapping); xas_lock_irq(&xas); xas_set(&xas, start); for (index = start; index < end; index++) { if (!xas_next(&xas)) { xas_store(&xas, XA_RETRY_ENTRY); if (xas_error(&xas)) { result = SCAN_STORE_FAILED; goto immap_locked; } nr_none_check++; } } if (nr_none != nr_none_check) { result = SCAN_PAGE_FILLED; goto immap_locked; } /* * If userspace observed a missing page in a VMA with * a MODE_MISSING userfaultfd, then it might expect a * UFFD_EVENT_PAGEFAULT for that page. If so, we need to * roll back to avoid suppressing such an event. Since * wp/minor userfaultfds don't give userspace any * guarantees that the kernel doesn't fill a missing * page with a zero page, so they don't matter here. * * Any userfaultfds registered after this point will * not be able to observe any missing pages due to the * previously inserted retry entries. */ vma_interval_tree_foreach(vma, &mapping->i_mmap, start, end) { if (userfaultfd_missing(vma)) { result = SCAN_EXCEED_NONE_PTE; goto immap_locked; } } immap_locked: i_mmap_unlock_read(mapping); if (result != SCAN_SUCCEED) { xas_set(&xas, start); for (index = start; index < end; index++) { if (xas_next(&xas) == XA_RETRY_ENTRY) xas_store(&xas, NULL); } xas_unlock_irq(&xas); goto rollback; } } else { xas_lock_irq(&xas); } if (is_shmem) { lruvec_stat_mod_folio(new_folio, NR_SHMEM, HPAGE_PMD_NR); lruvec_stat_mod_folio(new_folio, NR_SHMEM_THPS, HPAGE_PMD_NR); } else { lruvec_stat_mod_folio(new_folio, NR_FILE_THPS, HPAGE_PMD_NR); } lruvec_stat_mod_folio(new_folio, NR_FILE_PAGES, HPAGE_PMD_NR); /* * Mark new_folio as uptodate before inserting it into the * page cache so that it isn't mistaken for an fallocated but * unwritten page. */ folio_mark_uptodate(new_folio); folio_ref_add(new_folio, HPAGE_PMD_NR - 1); if (is_shmem) folio_mark_dirty(new_folio); folio_add_lru(new_folio); /* Join all the small entries into a single multi-index entry. */ xas_set_order(&xas, start, HPAGE_PMD_ORDER); xas_store(&xas, new_folio); WARN_ON_ONCE(xas_error(&xas)); xas_unlock_irq(&xas); /* * Remove pte page tables, so we can re-fault the page as huge. * If MADV_COLLAPSE, adjust result to call try_collapse_pte_mapped_thp(). */ retract_page_tables(mapping, start); if (cc && !cc->is_khugepaged) result = SCAN_PTE_MAPPED_HUGEPAGE; folio_unlock(new_folio); /* * The collapse has succeeded, so free the old folios. */ list_for_each_entry_safe(folio, tmp, &pagelist, lru) { list_del(&folio->lru); lruvec_stat_mod_folio(folio, NR_FILE_PAGES, -folio_nr_pages(folio)); if (is_shmem) lruvec_stat_mod_folio(folio, NR_SHMEM, -folio_nr_pages(folio)); folio->mapping = NULL; folio_clear_active(folio); folio_clear_unevictable(folio); folio_unlock(folio); folio_put_refs(folio, 2 + folio_nr_pages(folio)); } goto out; rollback: /* Something went wrong: roll back page cache changes */ if (nr_none) { xas_lock_irq(&xas); mapping->nrpages -= nr_none; xas_unlock_irq(&xas); shmem_uncharge(mapping->host, nr_none); } list_for_each_entry_safe(folio, tmp, &pagelist, lru) { list_del(&folio->lru); folio_unlock(folio); folio_putback_lru(folio); folio_put(folio); } /* * Undo the updates of filemap_nr_thps_inc for non-SHMEM * file only. This undo is not needed unless failure is * due to SCAN_COPY_MC. */ if (!is_shmem && result == SCAN_COPY_MC) { filemap_nr_thps_dec(mapping); /* * Paired with the fence in do_dentry_open() -> get_write_access() * to ensure the update to nr_thps is visible. */ smp_mb(); } new_folio->mapping = NULL; folio_unlock(new_folio); folio_put(new_folio); out: VM_BUG_ON(!list_empty(&pagelist)); trace_mm_khugepaged_collapse_file(mm, new_folio, index, addr, is_shmem, file, HPAGE_PMD_NR, result); return result; } static enum scan_result hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr, struct file *file, pgoff_t start, struct collapse_control *cc) { struct folio *folio = NULL; struct address_space *mapping = file->f_mapping; XA_STATE(xas, &mapping->i_pages, start); int present, swap; int node = NUMA_NO_NODE; enum scan_result result = SCAN_SUCCEED; present = 0; swap = 0; memset(cc->node_load, 0, sizeof(cc->node_load)); nodes_clear(cc->alloc_nmask); rcu_read_lock(); xas_for_each(&xas, folio, start + HPAGE_PMD_NR - 1) { if (xas_retry(&xas, folio)) continue; if (xa_is_value(folio)) { swap += 1 << xas_get_order(&xas); if (cc->is_khugepaged && swap > khugepaged_max_ptes_swap) { result = SCAN_EXCEED_SWAP_PTE; count_vm_event(THP_SCAN_EXCEED_SWAP_PTE); break; } continue; } if (!folio_try_get(folio)) { xas_reset(&xas); continue; } if (unlikely(folio != xas_reload(&xas))) { folio_put(folio); xas_reset(&xas); continue; } if (folio_order(folio) == HPAGE_PMD_ORDER && folio->index == start) { /* Maybe PMD-mapped */ result = SCAN_PTE_MAPPED_HUGEPAGE; /* * For SCAN_PTE_MAPPED_HUGEPAGE, further processing * by the caller won't touch the page cache, and so * it's safe to skip LRU and refcount checks before * returning. */ folio_put(folio); break; } node = folio_nid(folio); if (hpage_collapse_scan_abort(node, cc)) { result = SCAN_SCAN_ABORT; folio_put(folio); break; } cc->node_load[node]++; if (!folio_test_lru(folio)) { result = SCAN_PAGE_LRU; folio_put(folio); break; } if (folio_expected_ref_count(folio) + 1 != folio_ref_count(folio)) { result = SCAN_PAGE_COUNT; folio_put(folio); break; } /* * We probably should check if the folio is referenced * here, but nobody would transfer pte_young() to * folio_test_referenced() for us. And rmap walk here * is just too costly... */ present += folio_nr_pages(folio); folio_put(folio); if (need_resched()) { xas_pause(&xas); cond_resched_rcu(); } } rcu_read_unlock(); if (result == SCAN_PTE_MAPPED_HUGEPAGE) cc->progress++; else cc->progress += HPAGE_PMD_NR; if (result == SCAN_SUCCEED) { if (cc->is_khugepaged && present < HPAGE_PMD_NR - khugepaged_max_ptes_none) { result = SCAN_EXCEED_NONE_PTE; count_vm_event(THP_SCAN_EXCEED_NONE_PTE); } else { result = collapse_file(mm, addr, file, start, cc); } } trace_mm_khugepaged_scan_file(mm, folio, file, present, swap, result); return result; } static void khugepaged_scan_mm_slot(unsigned int progress_max, enum scan_result *result, struct collapse_control *cc) __releases(&khugepaged_mm_lock) __acquires(&khugepaged_mm_lock) { struct vma_iterator vmi; struct mm_slot *slot; struct mm_struct *mm; struct vm_area_struct *vma; unsigned int progress_prev = cc->progress; lockdep_assert_held(&khugepaged_mm_lock); *result = SCAN_FAIL; if (khugepaged_scan.mm_slot) { slot = khugepaged_scan.mm_slot; } else { slot = list_first_entry(&khugepaged_scan.mm_head, struct mm_slot, mm_node); khugepaged_scan.address = 0; khugepaged_scan.mm_slot = slot; } spin_unlock(&khugepaged_mm_lock); mm = slot->mm; /* * Don't wait for semaphore (to avoid long wait times). Just move to * the next mm on the list. */ vma = NULL; if (unlikely(!mmap_read_trylock(mm))) goto breakouterloop_mmap_lock; cc->progress++; if (unlikely(hpage_collapse_test_exit_or_disable(mm))) goto breakouterloop; vma_iter_init(&vmi, mm, khugepaged_scan.address); for_each_vma(vmi, vma) { unsigned long hstart, hend; cond_resched(); if (unlikely(hpage_collapse_test_exit_or_disable(mm))) { cc->progress++; break; } if (!thp_vma_allowable_order(vma, vma->vm_flags, TVA_KHUGEPAGED, PMD_ORDER)) { cc->progress++; continue; } hstart = round_up(vma->vm_start, HPAGE_PMD_SIZE); hend = round_down(vma->vm_end, HPAGE_PMD_SIZE); if (khugepaged_scan.address > hend) { cc->progress++; continue; } if (khugepaged_scan.address < hstart) khugepaged_scan.address = hstart; VM_BUG_ON(khugepaged_scan.address & ~HPAGE_PMD_MASK); while (khugepaged_scan.address < hend) { bool mmap_locked = true; cond_resched(); if (unlikely(hpage_collapse_test_exit_or_disable(mm))) goto breakouterloop; VM_BUG_ON(khugepaged_scan.address < hstart || khugepaged_scan.address + HPAGE_PMD_SIZE > hend); if (!vma_is_anonymous(vma)) { struct file *file = get_file(vma->vm_file); pgoff_t pgoff = linear_page_index(vma, khugepaged_scan.address); mmap_read_unlock(mm); mmap_locked = false; *result = hpage_collapse_scan_file(mm, khugepaged_scan.address, file, pgoff, cc); fput(file); if (*result == SCAN_PTE_MAPPED_HUGEPAGE) { mmap_read_lock(mm); if (hpage_collapse_test_exit_or_disable(mm)) goto breakouterloop; *result = try_collapse_pte_mapped_thp(mm, khugepaged_scan.address, false); if (*result == SCAN_PMD_MAPPED) *result = SCAN_SUCCEED; mmap_read_unlock(mm); } } else { *result = hpage_collapse_scan_pmd(mm, vma, khugepaged_scan.address, &mmap_locked, cc); } if (*result == SCAN_SUCCEED) ++khugepaged_pages_collapsed; /* move to next address */ khugepaged_scan.address += HPAGE_PMD_SIZE; if (!mmap_locked) /* * We released mmap_lock so break loop. Note * that we drop mmap_lock before all hugepage * allocations, so if allocation fails, we are * guaranteed to break here and report the * correct result back to caller. */ goto breakouterloop_mmap_lock; if (cc->progress >= progress_max) goto breakouterloop; } } breakouterloop: mmap_read_unlock(mm); /* exit_mmap will destroy ptes after this */ breakouterloop_mmap_lock: spin_lock(&khugepaged_mm_lock); VM_BUG_ON(khugepaged_scan.mm_slot != slot); /* * Release the current mm_slot if this mm is about to die, or * if we scanned all vmas of this mm, or THP got disabled. */ if (hpage_collapse_test_exit_or_disable(mm) || !vma) { /* * Make sure that if mm_users is reaching zero while * khugepaged runs here, khugepaged_exit will find * mm_slot not pointing to the exiting mm. */ if (!list_is_last(&slot->mm_node, &khugepaged_scan.mm_head)) { khugepaged_scan.mm_slot = list_next_entry(slot, mm_node); khugepaged_scan.address = 0; } else { khugepaged_scan.mm_slot = NULL; khugepaged_full_scans++; } collect_mm_slot(slot); } trace_mm_khugepaged_scan(mm, cc->progress - progress_prev, khugepaged_scan.mm_slot == NULL); } static int khugepaged_has_work(void) { return !list_empty(&khugepaged_scan.mm_head) && hugepage_pmd_enabled(); } static int khugepaged_wait_event(void) { return !list_empty(&khugepaged_scan.mm_head) || kthread_should_stop(); } static void khugepaged_do_scan(struct collapse_control *cc) { const unsigned int progress_max = READ_ONCE(khugepaged_pages_to_scan); unsigned int pass_through_head = 0; bool wait = true; enum scan_result result = SCAN_SUCCEED; lru_add_drain_all(); cc->progress = 0; while (true) { cond_resched(); if (unlikely(kthread_should_stop())) break; spin_lock(&khugepaged_mm_lock); if (!khugepaged_scan.mm_slot) pass_through_head++; if (khugepaged_has_work() && pass_through_head < 2) khugepaged_scan_mm_slot(progress_max, &result, cc); else cc->progress = progress_max; spin_unlock(&khugepaged_mm_lock); if (cc->progress >= progress_max) break; if (result == SCAN_ALLOC_HUGE_PAGE_FAIL) { /* * If fail to allocate the first time, try to sleep for * a while. When hit again, cancel the scan. */ if (!wait) break; wait = false; khugepaged_alloc_sleep(); } } } static bool khugepaged_should_wakeup(void) { return kthread_should_stop() || time_after_eq(jiffies, khugepaged_sleep_expire); } static void khugepaged_wait_work(void) { if (khugepaged_has_work()) { const unsigned long scan_sleep_jiffies = msecs_to_jiffies(khugepaged_scan_sleep_millisecs); if (!scan_sleep_jiffies) return; khugepaged_sleep_expire = jiffies + scan_sleep_jiffies; wait_event_freezable_timeout(khugepaged_wait, khugepaged_should_wakeup(), scan_sleep_jiffies); return; } if (hugepage_pmd_enabled()) wait_event_freezable(khugepaged_wait, khugepaged_wait_event()); } static int khugepaged(void *none) { struct mm_slot *slot; set_freezable(); set_user_nice(current, MAX_NICE); while (!kthread_should_stop()) { khugepaged_do_scan(&khugepaged_collapse_control); khugepaged_wait_work(); } spin_lock(&khugepaged_mm_lock); slot = khugepaged_scan.mm_slot; khugepaged_scan.mm_slot = NULL; if (slot) collect_mm_slot(slot); spin_unlock(&khugepaged_mm_lock); return 0; } static void set_recommended_min_free_kbytes(void) { struct zone *zone; int nr_zones = 0; unsigned long recommended_min; if (!hugepage_pmd_enabled()) { calculate_min_free_kbytes(); goto update_wmarks; } for_each_populated_zone(zone) { /* * We don't need to worry about fragmentation of * ZONE_MOVABLE since it only has movable pages. */ if (zone_idx(zone) > gfp_zone(GFP_USER)) continue; nr_zones++; } /* Ensure 2 pageblocks are free to assist fragmentation avoidance */ recommended_min = pageblock_nr_pages * nr_zones * 2; /* * Make sure that on average at least two pageblocks are almost free * of another type, one for a migratetype to fall back to and a * second to avoid subsequent fallbacks of other types There are 3 * MIGRATE_TYPES we care about. */ recommended_min += pageblock_nr_pages * nr_zones * MIGRATE_PCPTYPES * MIGRATE_PCPTYPES; /* don't ever allow to reserve more than 5% of the lowmem */ recommended_min = min(recommended_min, (unsigned long) nr_free_buffer_pages() / 20); recommended_min <<= (PAGE_SHIFT-10); if (recommended_min > min_free_kbytes) { if (user_min_free_kbytes >= 0) pr_info("raising min_free_kbytes from %d to %lu to help transparent hugepage allocations\n", min_free_kbytes, recommended_min); min_free_kbytes = recommended_min; } update_wmarks: setup_per_zone_wmarks(); } int start_stop_khugepaged(void) { int err = 0; mutex_lock(&khugepaged_mutex); if (hugepage_pmd_enabled()) { if (!khugepaged_thread) khugepaged_thread = kthread_run(khugepaged, NULL, "khugepaged"); if (IS_ERR(khugepaged_thread)) { pr_err("khugepaged: kthread_run(khugepaged) failed\n"); err = PTR_ERR(khugepaged_thread); khugepaged_thread = NULL; goto fail; } if (!list_empty(&khugepaged_scan.mm_head)) wake_up_interruptible(&khugepaged_wait); } else if (khugepaged_thread) { kthread_stop(khugepaged_thread); khugepaged_thread = NULL; } set_recommended_min_free_kbytes(); fail: mutex_unlock(&khugepaged_mutex); return err; } void khugepaged_min_free_kbytes_update(void) { mutex_lock(&khugepaged_mutex); if (hugepage_pmd_enabled() && khugepaged_thread) set_recommended_min_free_kbytes(); mutex_unlock(&khugepaged_mutex); } bool current_is_khugepaged(void) { return kthread_func(current) == khugepaged; } static int madvise_collapse_errno(enum scan_result r) { /* * MADV_COLLAPSE breaks from existing madvise(2) conventions to provide * actionable feedback to caller, so they may take an appropriate * fallback measure depending on the nature of the failure. */ switch (r) { case SCAN_ALLOC_HUGE_PAGE_FAIL: return -ENOMEM; case SCAN_CGROUP_CHARGE_FAIL: case SCAN_EXCEED_NONE_PTE: return -EBUSY; /* Resource temporary unavailable - trying again might succeed */ case SCAN_PAGE_COUNT: case SCAN_PAGE_LOCK: case SCAN_PAGE_LRU: case SCAN_DEL_PAGE_LRU: case SCAN_PAGE_FILLED: case SCAN_PAGE_DIRTY_OR_WRITEBACK: return -EAGAIN; /* * Other: Trying again likely not to succeed / error intrinsic to * specified memory range. khugepaged likely won't be able to collapse * either. */ default: return -EINVAL; } } int madvise_collapse(struct vm_area_struct *vma, unsigned long start, unsigned long end, bool *lock_dropped) { struct collapse_control *cc; struct mm_struct *mm = vma->vm_mm; unsigned long hstart, hend, addr; enum scan_result last_fail = SCAN_FAIL; int thps = 0; bool mmap_locked = true; BUG_ON(vma->vm_start > start); BUG_ON(vma->vm_end < end); if (!thp_vma_allowable_order(vma, vma->vm_flags, TVA_FORCED_COLLAPSE, PMD_ORDER)) return -EINVAL; cc = kmalloc_obj(*cc); if (!cc) return -ENOMEM; cc->is_khugepaged = false; cc->progress = 0; mmgrab(mm); lru_add_drain_all(); hstart = (start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; hend = end & HPAGE_PMD_MASK; for (addr = hstart; addr < hend; addr += HPAGE_PMD_SIZE) { enum scan_result result = SCAN_FAIL; bool triggered_wb = false; retry: if (!mmap_locked) { cond_resched(); mmap_read_lock(mm); mmap_locked = true; result = hugepage_vma_revalidate(mm, addr, false, &vma, cc); if (result != SCAN_SUCCEED) { last_fail = result; goto out_nolock; } hend = min(hend, vma->vm_end & HPAGE_PMD_MASK); } mmap_assert_locked(mm); if (!vma_is_anonymous(vma)) { struct file *file = get_file(vma->vm_file); pgoff_t pgoff = linear_page_index(vma, addr); mmap_read_unlock(mm); mmap_locked = false; *lock_dropped = true; result = hpage_collapse_scan_file(mm, addr, file, pgoff, cc); if (result == SCAN_PAGE_DIRTY_OR_WRITEBACK && !triggered_wb && mapping_can_writeback(file->f_mapping)) { loff_t lstart = (loff_t)pgoff << PAGE_SHIFT; loff_t lend = lstart + HPAGE_PMD_SIZE - 1; filemap_write_and_wait_range(file->f_mapping, lstart, lend); triggered_wb = true; fput(file); goto retry; } fput(file); } else { result = hpage_collapse_scan_pmd(mm, vma, addr, &mmap_locked, cc); } if (!mmap_locked) *lock_dropped = true; handle_result: switch (result) { case SCAN_SUCCEED: case SCAN_PMD_MAPPED: ++thps; break; case SCAN_PTE_MAPPED_HUGEPAGE: BUG_ON(mmap_locked); mmap_read_lock(mm); result = try_collapse_pte_mapped_thp(mm, addr, true); mmap_read_unlock(mm); goto handle_result; /* Whitelisted set of results where continuing OK */ case SCAN_NO_PTE_TABLE: case SCAN_PTE_NON_PRESENT: case SCAN_PTE_UFFD_WP: case SCAN_LACK_REFERENCED_PAGE: case SCAN_PAGE_NULL: case SCAN_PAGE_COUNT: case SCAN_PAGE_LOCK: case SCAN_PAGE_COMPOUND: case SCAN_PAGE_LRU: case SCAN_DEL_PAGE_LRU: last_fail = result; break; default: last_fail = result; /* Other error, exit */ goto out_maybelock; } } out_maybelock: /* Caller expects us to hold mmap_lock on return */ if (!mmap_locked) mmap_read_lock(mm); out_nolock: mmap_assert_locked(mm); mmdrop(mm); kfree(cc); return thps == ((hend - hstart) >> HPAGE_PMD_SHIFT) ? 0 : madvise_collapse_errno(last_fail); }
23 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * OSS compatible sequencer driver * * Copyright (C) 1998,99 Takashi Iwai <tiwai@suse.de> */ #ifndef __SEQ_OSS_DEVICE_H #define __SEQ_OSS_DEVICE_H #include <linux/time.h> #include <linux/wait.h> #include <linux/slab.h> #include <linux/sched/signal.h> #include <sound/core.h> #include <sound/seq_oss.h> #include <sound/rawmidi.h> #include <sound/seq_kernel.h> #include <sound/info.h> #include "../seq_clientmgr.h" /* max. applications */ #define SNDRV_SEQ_OSS_MAX_CLIENTS 16 #define SNDRV_SEQ_OSS_MAX_SYNTH_DEVS 16 #define SNDRV_SEQ_OSS_MAX_MIDI_DEVS 32 /* version */ #define SNDRV_SEQ_OSS_MAJOR_VERSION 0 #define SNDRV_SEQ_OSS_MINOR_VERSION 1 #define SNDRV_SEQ_OSS_TINY_VERSION 8 #define SNDRV_SEQ_OSS_VERSION_STR "0.1.8" /* device and proc interface name */ #define SNDRV_SEQ_OSS_PROCNAME "oss" /* * type definitions */ typedef unsigned int reltime_t; typedef unsigned int abstime_t; /* * synthesizer channel information */ struct seq_oss_chinfo { int note, vel; }; /* * synthesizer information */ struct seq_oss_synthinfo { struct snd_seq_oss_arg arg; struct seq_oss_chinfo *ch; int nr_voices; int opened; int is_midi; int midi_mapped; }; /* * sequencer client information */ struct seq_oss_devinfo { int index; /* application index */ int cseq; /* sequencer client number */ int port; /* sequencer port number */ int queue; /* sequencer queue number */ struct snd_seq_addr addr; /* address of this device */ int seq_mode; /* sequencer mode */ int file_mode; /* file access */ /* midi device table */ int max_mididev; /* synth device table */ int max_synthdev; struct seq_oss_synthinfo synths[SNDRV_SEQ_OSS_MAX_SYNTH_DEVS]; int synth_opened; /* output queue */ struct seq_oss_writeq *writeq; /* midi input queue */ struct seq_oss_readq *readq; /* timer */ struct seq_oss_timer *timer; }; /* * function prototypes */ /* create/delete OSS sequencer client */ int snd_seq_oss_create_client(void); int snd_seq_oss_delete_client(void); /* device file interface */ int snd_seq_oss_open(struct file *file, int level); void snd_seq_oss_release(struct seq_oss_devinfo *dp); int snd_seq_oss_ioctl(struct seq_oss_devinfo *dp, unsigned int cmd, unsigned long arg); int snd_seq_oss_read(struct seq_oss_devinfo *dev, char __user *buf, int count); int snd_seq_oss_write(struct seq_oss_devinfo *dp, const char __user *buf, int count, struct file *opt); __poll_t snd_seq_oss_poll(struct seq_oss_devinfo *dp, struct file *file, poll_table * wait); void snd_seq_oss_reset(struct seq_oss_devinfo *dp); /* proc interface */ void snd_seq_oss_system_info_read(struct snd_info_buffer *buf); void snd_seq_oss_midi_info_read(struct snd_info_buffer *buf); void snd_seq_oss_synth_info_read(struct snd_info_buffer *buf); void snd_seq_oss_readq_info_read(struct seq_oss_readq *q, struct snd_info_buffer *buf); /* file mode macros */ #define is_read_mode(mode) ((mode) & SNDRV_SEQ_OSS_FILE_READ) #define is_write_mode(mode) ((mode) & SNDRV_SEQ_OSS_FILE_WRITE) #define is_nonblock_mode(mode) ((mode) & SNDRV_SEQ_OSS_FILE_NONBLOCK) /* dispatch event */ static inline int snd_seq_oss_dispatch(struct seq_oss_devinfo *dp, struct snd_seq_event *ev, int atomic, int hop) { return snd_seq_kernel_client_dispatch(dp->cseq, ev, atomic, hop); } /* ioctl for writeq */ static inline int snd_seq_oss_control(struct seq_oss_devinfo *dp, unsigned int type, void *arg) { return snd_seq_kernel_client_ioctl(dp->cseq, type, arg); } /* fill the addresses in header */ static inline void snd_seq_oss_fill_addr(struct seq_oss_devinfo *dp, struct snd_seq_event *ev, int dest_client, int dest_port) { ev->queue = dp->queue; ev->source = dp->addr; ev->dest.client = dest_client; ev->dest.port = dest_port; } #endif /* __SEQ_OSS_DEVICE_H */
17 17 17 17 17 17 17 17 17 17 9 9 9 15 15 15 15 17 17 17 15 15 9 8 9 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2017 Facebook */ #include <linux/kernel.h> #include <linux/blkdev.h> #include <linux/build_bug.h> #include <linux/debugfs.h> #include "blk.h" #include "blk-mq.h" #include "blk-mq-debugfs.h" #include "blk-mq-sched.h" #include "blk-rq-qos.h" static int queue_poll_stat_show(void *data, struct seq_file *m) { return 0; } static void *queue_requeue_list_start(struct seq_file *m, loff_t *pos) __acquires(&q->requeue_lock) { struct request_queue *q = m->private; spin_lock_irq(&q->requeue_lock); return seq_list_start(&q->requeue_list, *pos); } static void *queue_requeue_list_next(struct seq_file *m, void *v, loff_t *pos) { struct request_queue *q = m->private; return seq_list_next(v, &q->requeue_list, pos); } static void queue_requeue_list_stop(struct seq_file *m, void *v) __releases(&q->requeue_lock) { struct request_queue *q = m->private; spin_unlock_irq(&q->requeue_lock); } static const struct seq_operations queue_requeue_list_seq_ops = { .start = queue_requeue_list_start, .next = queue_requeue_list_next, .stop = queue_requeue_list_stop, .show = blk_mq_debugfs_rq_show, }; static int blk_flags_show(struct seq_file *m, const unsigned long flags, const char *const *flag_name, int flag_name_count) { bool sep = false; int i; for (i = 0; i < sizeof(flags) * BITS_PER_BYTE; i++) { if (!(flags & BIT(i))) continue; if (sep) seq_puts(m, "|"); sep = true; if (i < flag_name_count && flag_name[i]) seq_puts(m, flag_name[i]); else seq_printf(m, "%d", i); } return 0; } static int queue_pm_only_show(void *data, struct seq_file *m) { struct request_queue *q = data; seq_printf(m, "%d\n", atomic_read(&q->pm_only)); return 0; } #define QUEUE_FLAG_NAME(name) [QUEUE_FLAG_##name] = #name static const char *const blk_queue_flag_name[] = { QUEUE_FLAG_NAME(DYING), QUEUE_FLAG_NAME(NOMERGES), QUEUE_FLAG_NAME(SAME_COMP), QUEUE_FLAG_NAME(FAIL_IO), QUEUE_FLAG_NAME(NOXMERGES), QUEUE_FLAG_NAME(SAME_FORCE), QUEUE_FLAG_NAME(INIT_DONE), QUEUE_FLAG_NAME(STATS), QUEUE_FLAG_NAME(REGISTERED), QUEUE_FLAG_NAME(QUIESCED), QUEUE_FLAG_NAME(RQ_ALLOC_TIME), QUEUE_FLAG_NAME(HCTX_ACTIVE), QUEUE_FLAG_NAME(SQ_SCHED), QUEUE_FLAG_NAME(DISABLE_WBT_DEF), QUEUE_FLAG_NAME(NO_ELV_SWITCH), QUEUE_FLAG_NAME(QOS_ENABLED), QUEUE_FLAG_NAME(BIO_ISSUE_TIME), QUEUE_FLAG_NAME(ZONED_QD1_WRITES), }; #undef QUEUE_FLAG_NAME static int queue_state_show(void *data, struct seq_file *m) { struct request_queue *q = data; BUILD_BUG_ON(ARRAY_SIZE(blk_queue_flag_name) != QUEUE_FLAG_MAX); blk_flags_show(m, q->queue_flags, blk_queue_flag_name, ARRAY_SIZE(blk_queue_flag_name)); seq_puts(m, "\n"); return 0; } static ssize_t queue_state_write(void *data, const char __user *buf, size_t count, loff_t *ppos) { struct request_queue *q = data; char opbuf[16] = { }, *op; /* * The "state" attribute is removed when the queue is removed. Don't * allow setting the state on a dying queue to avoid a use-after-free. */ if (blk_queue_dying(q)) return -ENOENT; if (count >= sizeof(opbuf)) { pr_err("%s: operation too long\n", __func__); goto inval; } if (copy_from_user(opbuf, buf, count)) return -EFAULT; op = strstrip(opbuf); if (strcmp(op, "run") == 0) { blk_mq_run_hw_queues(q, true); } else if (strcmp(op, "start") == 0) { blk_mq_start_stopped_hw_queues(q, true); } else if (strcmp(op, "kick") == 0) { blk_mq_kick_requeue_list(q); } else { pr_err("%s: unsupported operation '%s'\n", __func__, op); inval: pr_err("%s: use 'run', 'start' or 'kick'\n", __func__); return -EINVAL; } return count; } static const struct blk_mq_debugfs_attr blk_mq_debugfs_queue_attrs[] = { { "poll_stat", 0400, queue_poll_stat_show }, { "requeue_list", 0400, .seq_ops = &queue_requeue_list_seq_ops }, { "pm_only", 0600, queue_pm_only_show, NULL }, { "state", 0600, queue_state_show, queue_state_write }, { "zone_wplugs", 0400, queue_zone_wplugs_show, NULL }, { }, }; #define HCTX_STATE_NAME(name) [BLK_MQ_S_##name] = #name static const char *const hctx_state_name[] = { HCTX_STATE_NAME(STOPPED), HCTX_STATE_NAME(TAG_ACTIVE), HCTX_STATE_NAME(SCHED_RESTART), HCTX_STATE_NAME(INACTIVE), }; #undef HCTX_STATE_NAME static int hctx_state_show(void *data, struct seq_file *m) { struct blk_mq_hw_ctx *hctx = data; BUILD_BUG_ON(ARRAY_SIZE(hctx_state_name) != BLK_MQ_S_MAX); blk_flags_show(m, hctx->state, hctx_state_name, ARRAY_SIZE(hctx_state_name)); seq_puts(m, "\n"); return 0; } #define HCTX_FLAG_NAME(name) [ilog2(BLK_MQ_F_##name)] = #name static const char *const hctx_flag_name[] = { HCTX_FLAG_NAME(TAG_QUEUE_SHARED), HCTX_FLAG_NAME(STACKING), HCTX_FLAG_NAME(TAG_HCTX_SHARED), HCTX_FLAG_NAME(BLOCKING), HCTX_FLAG_NAME(TAG_RR), HCTX_FLAG_NAME(NO_SCHED_BY_DEFAULT), }; #undef HCTX_FLAG_NAME static int hctx_flags_show(void *data, struct seq_file *m) { struct blk_mq_hw_ctx *hctx = data; BUILD_BUG_ON(ARRAY_SIZE(hctx_flag_name) != ilog2(BLK_MQ_F_MAX)); blk_flags_show(m, hctx->flags, hctx_flag_name, ARRAY_SIZE(hctx_flag_name)); seq_puts(m, "\n"); return 0; } #define CMD_FLAG_NAME(name) [__REQ_##name] = #name static const char *const cmd_flag_name[] = { CMD_FLAG_NAME(FAILFAST_DEV), CMD_FLAG_NAME(FAILFAST_TRANSPORT), CMD_FLAG_NAME(FAILFAST_DRIVER), CMD_FLAG_NAME(SYNC), CMD_FLAG_NAME(META), CMD_FLAG_NAME(PRIO), CMD_FLAG_NAME(NOMERGE), CMD_FLAG_NAME(IDLE), CMD_FLAG_NAME(INTEGRITY), CMD_FLAG_NAME(FUA), CMD_FLAG_NAME(PREFLUSH), CMD_FLAG_NAME(RAHEAD), CMD_FLAG_NAME(BACKGROUND), CMD_FLAG_NAME(NOWAIT), CMD_FLAG_NAME(POLLED), CMD_FLAG_NAME(ALLOC_CACHE), CMD_FLAG_NAME(SWAP), CMD_FLAG_NAME(DRV), CMD_FLAG_NAME(FS_PRIVATE), CMD_FLAG_NAME(ATOMIC), CMD_FLAG_NAME(NOUNMAP), }; #undef CMD_FLAG_NAME #define RQF_NAME(name) [__RQF_##name] = #name static const char *const rqf_name[] = { RQF_NAME(STARTED), RQF_NAME(FLUSH_SEQ), RQF_NAME(MIXED_MERGE), RQF_NAME(DONTPREP), RQF_NAME(SCHED_TAGS), RQF_NAME(USE_SCHED), RQF_NAME(FAILED), RQF_NAME(QUIET), RQF_NAME(IO_STAT), RQF_NAME(PM), RQF_NAME(HASHED), RQF_NAME(STATS), RQF_NAME(SPECIAL_PAYLOAD), RQF_NAME(ZONE_WRITE_PLUGGING), RQF_NAME(TIMED_OUT), RQF_NAME(RESV), }; #undef RQF_NAME static const char *const blk_mq_rq_state_name_array[] = { [MQ_RQ_IDLE] = "idle", [MQ_RQ_IN_FLIGHT] = "in_flight", [MQ_RQ_COMPLETE] = "complete", }; static const char *blk_mq_rq_state_name(enum mq_rq_state rq_state) { if (WARN_ON_ONCE((unsigned int)rq_state >= ARRAY_SIZE(blk_mq_rq_state_name_array))) return "(?)"; return blk_mq_rq_state_name_array[rq_state]; } int __blk_mq_debugfs_rq_show(struct seq_file *m, struct request *rq) { const struct blk_mq_ops *const mq_ops = rq->q->mq_ops; const enum req_op op = req_op(rq); const char *op_str = blk_op_str(op); BUILD_BUG_ON(ARRAY_SIZE(cmd_flag_name) != __REQ_NR_BITS); BUILD_BUG_ON(ARRAY_SIZE(rqf_name) != __RQF_BITS); seq_printf(m, "%p {.op=", rq); if (strcmp(op_str, "UNKNOWN") == 0) seq_printf(m, "%u", op); else seq_printf(m, "%s", op_str); seq_puts(m, ", .cmd_flags="); blk_flags_show(m, (__force unsigned int)(rq->cmd_flags & ~REQ_OP_MASK), cmd_flag_name, ARRAY_SIZE(cmd_flag_name)); seq_puts(m, ", .rq_flags="); blk_flags_show(m, (__force unsigned int)rq->rq_flags, rqf_name, ARRAY_SIZE(rqf_name)); seq_printf(m, ", .state=%s", blk_mq_rq_state_name(blk_mq_rq_state(rq))); seq_printf(m, ", .tag=%d, .internal_tag=%d", rq->tag, rq->internal_tag); if (mq_ops->show_rq) mq_ops->show_rq(m, rq); seq_puts(m, "}\n"); return 0; } EXPORT_SYMBOL_GPL(__blk_mq_debugfs_rq_show); int blk_mq_debugfs_rq_show(struct seq_file *m, void *v) { return __blk_mq_debugfs_rq_show(m, list_entry_rq(v)); } EXPORT_SYMBOL_GPL(blk_mq_debugfs_rq_show); static void *hctx_dispatch_start(struct seq_file *m, loff_t *pos) __acquires(&hctx->lock) { struct blk_mq_hw_ctx *hctx = m->private; spin_lock(&hctx->lock); return seq_list_start(&hctx->dispatch, *pos); } static void *hctx_dispatch_next(struct seq_file *m, void *v, loff_t *pos) { struct blk_mq_hw_ctx *hctx = m->private; return seq_list_next(v, &hctx->dispatch, pos); } static void hctx_dispatch_stop(struct seq_file *m, void *v) __releases(&hctx->lock) { struct blk_mq_hw_ctx *hctx = m->private; spin_unlock(&hctx->lock); } static const struct seq_operations hctx_dispatch_seq_ops = { .start = hctx_dispatch_start, .next = hctx_dispatch_next, .stop = hctx_dispatch_stop, .show = blk_mq_debugfs_rq_show, }; struct show_busy_params { struct seq_file *m; struct blk_mq_hw_ctx *hctx; }; /* * Note: the state of a request may change while this function is in progress, * e.g. due to a concurrent blk_mq_finish_request() call. Returns true to * keep iterating requests. */ static bool hctx_show_busy_rq(struct request *rq, void *data) { const struct show_busy_params *params = data; if (rq->mq_hctx == params->hctx) __blk_mq_debugfs_rq_show(params->m, rq); return true; } static int hctx_busy_show(void *data, struct seq_file *m) { struct blk_mq_hw_ctx *hctx = data; struct show_busy_params params = { .m = m, .hctx = hctx }; int res; res = mutex_lock_interruptible(&hctx->queue->elevator_lock); if (res) return res; blk_mq_tagset_busy_iter(hctx->queue->tag_set, hctx_show_busy_rq, &params); mutex_unlock(&hctx->queue->elevator_lock); return 0; } static const char *const hctx_types[] = { [HCTX_TYPE_DEFAULT] = "default", [HCTX_TYPE_READ] = "read", [HCTX_TYPE_POLL] = "poll", }; static int hctx_type_show(void *data, struct seq_file *m) { struct blk_mq_hw_ctx *hctx = data; BUILD_BUG_ON(ARRAY_SIZE(hctx_types) != HCTX_MAX_TYPES); seq_printf(m, "%s\n", hctx_types[hctx->type]); return 0; } static int hctx_ctx_map_show(void *data, struct seq_file *m) { struct blk_mq_hw_ctx *hctx = data; sbitmap_bitmap_show(&hctx->ctx_map, m); return 0; } static void blk_mq_debugfs_tags_show(struct seq_file *m, struct blk_mq_tags *tags) { seq_printf(m, "nr_tags=%u\n", tags->nr_tags); seq_printf(m, "nr_reserved_tags=%u\n", tags->nr_reserved_tags); seq_printf(m, "active_queues=%d\n", READ_ONCE(tags->active_queues)); seq_puts(m, "\nbitmap_tags:\n"); sbitmap_queue_show(&tags->bitmap_tags, m); if (tags->nr_reserved_tags) { seq_puts(m, "\nbreserved_tags:\n"); sbitmap_queue_show(&tags->breserved_tags, m); } } static int hctx_tags_show(void *data, struct seq_file *m) { struct blk_mq_hw_ctx *hctx = data; struct request_queue *q = hctx->queue; int res; res = mutex_lock_interruptible(&q->elevator_lock); if (res) return res; if (hctx->tags) blk_mq_debugfs_tags_show(m, hctx->tags); mutex_unlock(&q->elevator_lock); return 0; } static int hctx_tags_bitmap_show(void *data, struct seq_file *m) { struct blk_mq_hw_ctx *hctx = data; struct request_queue *q = hctx->queue; int res; res = mutex_lock_interruptible(&q->elevator_lock); if (res) return res; if (hctx->tags) sbitmap_bitmap_show(&hctx->tags->bitmap_tags.sb, m); mutex_unlock(&q->elevator_lock); return 0; } static int hctx_sched_tags_show(void *data, struct seq_file *m) { struct blk_mq_hw_ctx *hctx = data; struct request_queue *q = hctx->queue; int res; res = mutex_lock_interruptible(&q->elevator_lock); if (res) return res; if (hctx->sched_tags) blk_mq_debugfs_tags_show(m, hctx->sched_tags); mutex_unlock(&q->elevator_lock); return 0; } static int hctx_sched_tags_bitmap_show(void *data, struct seq_file *m) { struct blk_mq_hw_ctx *hctx = data; struct request_queue *q = hctx->queue; int res; res = mutex_lock_interruptible(&q->elevator_lock); if (res) return res; if (hctx->sched_tags) sbitmap_bitmap_show(&hctx->sched_tags->bitmap_tags.sb, m); mutex_unlock(&q->elevator_lock); return 0; } static int hctx_active_show(void *data, struct seq_file *m) { struct blk_mq_hw_ctx *hctx = data; seq_printf(m, "%d\n", __blk_mq_active_requests(hctx)); return 0; } static int hctx_dispatch_busy_show(void *data, struct seq_file *m) { struct blk_mq_hw_ctx *hctx = data; seq_printf(m, "%u\n", hctx->dispatch_busy); return 0; } #define CTX_RQ_SEQ_OPS(name, type) \ static void *ctx_##name##_rq_list_start(struct seq_file *m, loff_t *pos) \ __acquires(&ctx->lock) \ { \ struct blk_mq_ctx *ctx = m->private; \ \ spin_lock(&ctx->lock); \ return seq_list_start(&ctx->rq_lists[type], *pos); \ } \ \ static void *ctx_##name##_rq_list_next(struct seq_file *m, void *v, \ loff_t *pos) \ { \ struct blk_mq_ctx *ctx = m->private; \ \ return seq_list_next(v, &ctx->rq_lists[type], pos); \ } \ \ static void ctx_##name##_rq_list_stop(struct seq_file *m, void *v) \ __releases(&ctx->lock) \ { \ struct blk_mq_ctx *ctx = m->private; \ \ spin_unlock(&ctx->lock); \ } \ \ static const struct seq_operations ctx_##name##_rq_list_seq_ops = { \ .start = ctx_##name##_rq_list_start, \ .next = ctx_##name##_rq_list_next, \ .stop = ctx_##name##_rq_list_stop, \ .show = blk_mq_debugfs_rq_show, \ } CTX_RQ_SEQ_OPS(default, HCTX_TYPE_DEFAULT); CTX_RQ_SEQ_OPS(read, HCTX_TYPE_READ); CTX_RQ_SEQ_OPS(poll, HCTX_TYPE_POLL); static int blk_mq_debugfs_show(struct seq_file *m, void *v) { const struct blk_mq_debugfs_attr *attr = m->private; void *data = debugfs_get_aux(m->file); return attr->show(data, m); } static ssize_t blk_mq_debugfs_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { struct seq_file *m = file->private_data; const struct blk_mq_debugfs_attr *attr = m->private; void *data = debugfs_get_aux(file); /* * Attributes that only implement .seq_ops are read-only and 'attr' is * the same with 'data' in this case. */ if (attr == data || !attr->write) return -EPERM; return attr->write(data, buf, count, ppos); } static int blk_mq_debugfs_open(struct inode *inode, struct file *file) { const struct blk_mq_debugfs_attr *attr = inode->i_private; void *data = debugfs_get_aux(file); struct seq_file *m; int ret; if (attr->seq_ops) { ret = seq_open(file, attr->seq_ops); if (!ret) { m = file->private_data; m->private = data; } return ret; } if (WARN_ON_ONCE(!attr->show)) return -EPERM; return single_open(file, blk_mq_debugfs_show, inode->i_private); } static int blk_mq_debugfs_release(struct inode *inode, struct file *file) { const struct blk_mq_debugfs_attr *attr = inode->i_private; if (attr->show) return single_release(inode, file); return seq_release(inode, file); } static const struct file_operations blk_mq_debugfs_fops = { .open = blk_mq_debugfs_open, .read = seq_read, .write = blk_mq_debugfs_write, .llseek = seq_lseek, .release = blk_mq_debugfs_release, }; static const struct blk_mq_debugfs_attr blk_mq_debugfs_hctx_attrs[] = { {"state", 0400, hctx_state_show}, {"flags", 0400, hctx_flags_show}, {"dispatch", 0400, .seq_ops = &hctx_dispatch_seq_ops}, {"busy", 0400, hctx_busy_show}, {"ctx_map", 0400, hctx_ctx_map_show}, {"tags", 0400, hctx_tags_show}, {"tags_bitmap", 0400, hctx_tags_bitmap_show}, {"sched_tags", 0400, hctx_sched_tags_show}, {"sched_tags_bitmap", 0400, hctx_sched_tags_bitmap_show}, {"active", 0400, hctx_active_show}, {"dispatch_busy", 0400, hctx_dispatch_busy_show}, {"type", 0400, hctx_type_show}, {}, }; static const struct blk_mq_debugfs_attr blk_mq_debugfs_ctx_attrs[] = { {"default_rq_list", 0400, .seq_ops = &ctx_default_rq_list_seq_ops}, {"read_rq_list", 0400, .seq_ops = &ctx_read_rq_list_seq_ops}, {"poll_rq_list", 0400, .seq_ops = &ctx_poll_rq_list_seq_ops}, {}, }; static void debugfs_create_files(struct request_queue *q, struct dentry *parent, void *data, const struct blk_mq_debugfs_attr *attr) { lockdep_assert_held(&q->debugfs_mutex); /* * debugfs_mutex should not be nested under other locks that can be * grabbed while queue is frozen. */ lockdep_assert_not_held(&q->elevator_lock); lockdep_assert_not_held(&q->rq_qos_mutex); if (IS_ERR_OR_NULL(parent)) return; for (; attr->name; attr++) debugfs_create_file_aux(attr->name, attr->mode, parent, (void *)attr, data, &blk_mq_debugfs_fops); } void blk_mq_debugfs_register(struct request_queue *q) { struct blk_mq_hw_ctx *hctx; unsigned long i; debugfs_create_files(q, q->debugfs_dir, q, blk_mq_debugfs_queue_attrs); queue_for_each_hw_ctx(q, hctx, i) { if (!hctx->debugfs_dir) blk_mq_debugfs_register_hctx(q, hctx); } blk_mq_debugfs_register_rq_qos(q); } static void blk_mq_debugfs_register_ctx(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx) { struct dentry *ctx_dir; char name[20]; snprintf(name, sizeof(name), "cpu%u", ctx->cpu); ctx_dir = debugfs_create_dir(name, hctx->debugfs_dir); debugfs_create_files(hctx->queue, ctx_dir, ctx, blk_mq_debugfs_ctx_attrs); } void blk_mq_debugfs_register_hctx(struct request_queue *q, struct blk_mq_hw_ctx *hctx) { struct blk_mq_ctx *ctx; char name[20]; int i; if (!q->debugfs_dir) return; snprintf(name, sizeof(name), "hctx%u", hctx->queue_num); hctx->debugfs_dir = debugfs_create_dir(name, q->debugfs_dir); debugfs_create_files(q, hctx->debugfs_dir, hctx, blk_mq_debugfs_hctx_attrs); hctx_for_each_ctx(hctx, ctx, i) blk_mq_debugfs_register_ctx(hctx, ctx); } void blk_mq_debugfs_unregister_hctx(struct blk_mq_hw_ctx *hctx) { if (!hctx->queue->debugfs_dir) return; debugfs_remove_recursive(hctx->debugfs_dir); hctx->sched_debugfs_dir = NULL; hctx->debugfs_dir = NULL; } void blk_mq_debugfs_register_hctxs(struct request_queue *q) { struct blk_mq_hw_ctx *hctx; unsigned int memflags; unsigned long i; memflags = blk_debugfs_lock(q); queue_for_each_hw_ctx(q, hctx, i) blk_mq_debugfs_register_hctx(q, hctx); blk_debugfs_unlock(q, memflags); } void blk_mq_debugfs_unregister_hctxs(struct request_queue *q) { struct blk_mq_hw_ctx *hctx; unsigned long i; queue_for_each_hw_ctx(q, hctx, i) blk_mq_debugfs_unregister_hctx(hctx); } void blk_mq_debugfs_register_sched(struct request_queue *q) { struct elevator_type *e = q->elevator->type; lockdep_assert_held(&q->debugfs_mutex); /* * If the parent directory has not been created yet, return, we will be * called again later on and the directory/files will be created then. */ if (!q->debugfs_dir) return; if (!e->queue_debugfs_attrs) return; q->sched_debugfs_dir = debugfs_create_dir("sched", q->debugfs_dir); debugfs_create_files(q, q->sched_debugfs_dir, q, e->queue_debugfs_attrs); } void blk_mq_debugfs_unregister_sched(struct request_queue *q) { lockdep_assert_held(&q->debugfs_mutex); debugfs_remove_recursive(q->sched_debugfs_dir); q->sched_debugfs_dir = NULL; } static const char *rq_qos_id_to_name(enum rq_qos_id id) { switch (id) { case RQ_QOS_WBT: return "wbt"; case RQ_QOS_LATENCY: return "latency"; case RQ_QOS_COST: return "cost"; } return "unknown"; } static void blk_mq_debugfs_register_rqos(struct rq_qos *rqos) { struct request_queue *q = rqos->disk->queue; const char *dir_name = rq_qos_id_to_name(rqos->id); lockdep_assert_held(&q->debugfs_mutex); if (rqos->debugfs_dir || !rqos->ops->debugfs_attrs) return; if (!q->rqos_debugfs_dir) q->rqos_debugfs_dir = debugfs_create_dir("rqos", q->debugfs_dir); rqos->debugfs_dir = debugfs_create_dir(dir_name, q->rqos_debugfs_dir); debugfs_create_files(q, rqos->debugfs_dir, rqos, rqos->ops->debugfs_attrs); } void blk_mq_debugfs_register_rq_qos(struct request_queue *q) { lockdep_assert_held(&q->debugfs_mutex); if (q->rq_qos) { struct rq_qos *rqos = q->rq_qos; while (rqos) { blk_mq_debugfs_register_rqos(rqos); rqos = rqos->next; } } } void blk_mq_debugfs_register_sched_hctx(struct request_queue *q, struct blk_mq_hw_ctx *hctx) { struct elevator_type *e = q->elevator->type; lockdep_assert_held(&q->debugfs_mutex); /* * If the parent debugfs directory has not been created yet, return; * We will be called again later on with appropriate parent debugfs * directory from blk_register_queue() */ if (!hctx->debugfs_dir) return; if (!e->hctx_debugfs_attrs) return; hctx->sched_debugfs_dir = debugfs_create_dir("sched", hctx->debugfs_dir); debugfs_create_files(q, hctx->sched_debugfs_dir, hctx, e->hctx_debugfs_attrs); } void blk_mq_debugfs_unregister_sched_hctx(struct blk_mq_hw_ctx *hctx) { lockdep_assert_held(&hctx->queue->debugfs_mutex); if (!hctx->queue->debugfs_dir) return; debugfs_remove_recursive(hctx->sched_debugfs_dir); hctx->sched_debugfs_dir = NULL; }
3 16 131 338 338 330 332 28 61 61 22 4 25 10 10 32 10 98 89 9 46 20 26 46 42 4 21 21 63 69 69 63 4 15 4 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 /* SPDX-License-Identifier: GPL-1.0+ */ /* * Bond several ethernet interfaces into a Cisco, running 'Etherchannel'. * * Portions are (c) Copyright 1995 Simon "Guru Aleph-Null" Janes * NCM: Network and Communications Management, Inc. * * BUT, I'm the one who modified it for ethernet, so: * (c) Copyright 1999, Thomas Davis, tadavis@lbl.gov * */ #ifndef _NET_BONDING_H #define _NET_BONDING_H #include <linux/timer.h> #include <linux/proc_fs.h> #include <linux/if_bonding.h> #include <linux/cpumask.h> #include <linux/in6.h> #include <linux/netpoll.h> #include <linux/inetdevice.h> #include <linux/etherdevice.h> #include <linux/reciprocal_div.h> #include <linux/if_link.h> #include <net/bond_3ad.h> #include <net/bond_alb.h> #include <net/bond_options.h> #include <net/ipv6.h> #include <net/addrconf.h> #define BOND_MAX_ARP_TARGETS 16 #define BOND_MAX_NS_TARGETS BOND_MAX_ARP_TARGETS #define BOND_DEFAULT_MIIMON 100 #ifndef __long_aligned #define __long_aligned __attribute__((aligned((sizeof(long))))) #endif #define slave_info(bond_dev, slave_dev, fmt, ...) \ netdev_info(bond_dev, "(slave %s): " fmt, (slave_dev)->name, ##__VA_ARGS__) #define slave_warn(bond_dev, slave_dev, fmt, ...) \ netdev_warn(bond_dev, "(slave %s): " fmt, (slave_dev)->name, ##__VA_ARGS__) #define slave_dbg(bond_dev, slave_dev, fmt, ...) \ netdev_dbg(bond_dev, "(slave %s): " fmt, (slave_dev)->name, ##__VA_ARGS__) #define slave_err(bond_dev, slave_dev, fmt, ...) \ netdev_err(bond_dev, "(slave %s): " fmt, (slave_dev)->name, ##__VA_ARGS__) #define BOND_MODE(bond) ((bond)->params.mode) /* slave list primitives */ #define bond_slave_list(bond) (&(bond)->dev->adj_list.lower) #define bond_has_slaves(bond) !list_empty(bond_slave_list(bond)) /* IMPORTANT: bond_first/last_slave can return NULL in case of an empty list */ #define bond_first_slave(bond) \ (bond_has_slaves(bond) ? \ netdev_adjacent_get_private(bond_slave_list(bond)->next) : \ NULL) #define bond_last_slave(bond) \ (bond_has_slaves(bond) ? \ netdev_adjacent_get_private(bond_slave_list(bond)->prev) : \ NULL) /* Caller must have rcu_read_lock */ #define bond_first_slave_rcu(bond) \ netdev_lower_get_first_private_rcu(bond->dev) #define bond_is_first_slave(bond, pos) (pos == bond_first_slave(bond)) #define bond_is_last_slave(bond, pos) (pos == bond_last_slave(bond)) /** * bond_for_each_slave - iterate over all slaves * @bond: the bond holding this list * @pos: current slave * @iter: list_head * iterator * * Caller must hold RTNL */ #define bond_for_each_slave(bond, pos, iter) \ netdev_for_each_lower_private((bond)->dev, pos, iter) /* Caller must have rcu_read_lock */ #define bond_for_each_slave_rcu(bond, pos, iter) \ netdev_for_each_lower_private_rcu((bond)->dev, pos, iter) #define BOND_XFRM_FEATURES (NETIF_F_HW_ESP | NETIF_F_HW_ESP_TX_CSUM | \ NETIF_F_GSO_ESP) #ifdef CONFIG_NET_POLL_CONTROLLER DECLARE_STATIC_KEY_FALSE(netpoll_block_tx); static inline void block_netpoll_tx(void) { static_branch_inc(&netpoll_block_tx); } static inline void unblock_netpoll_tx(void) { static_branch_dec(&netpoll_block_tx); } static inline int is_netpoll_tx_blocked(struct net_device *dev) { if (static_branch_unlikely(&netpoll_block_tx)) return netpoll_tx_running(dev); return 0; } #else #define block_netpoll_tx() #define unblock_netpoll_tx() #define is_netpoll_tx_blocked(dev) (0) #endif DECLARE_STATIC_KEY_FALSE(bond_bcast_neigh_enabled); struct bond_params { int mode; int xmit_policy; int miimon; u8 num_peer_notif; u8 missed_max; int arp_interval; int arp_validate; int arp_all_targets; int fail_over_mac; int updelay; int downdelay; int peer_notif_delay; int lacp_active; int lacp_fast; unsigned int min_links; int ad_select; char primary[IFNAMSIZ]; int primary_reselect; __be32 arp_targets[BOND_MAX_ARP_TARGETS]; int tx_queues; int all_slaves_active; int resend_igmp; int lp_interval; int packets_per_slave; int tlb_dynamic_lb; struct reciprocal_value reciprocal_packets_per_slave; u16 ad_actor_sys_prio; u16 ad_user_port_key; #if IS_ENABLED(CONFIG_IPV6) struct in6_addr ns_targets[BOND_MAX_NS_TARGETS]; #endif int coupled_control; int broadcast_neighbor; /* 2 bytes of padding : see ether_addr_equal_64bits() */ u8 ad_actor_system[ETH_ALEN + 2]; }; struct slave { struct net_device *dev; /* first - useful for panic debug */ struct bonding *bond; /* our master */ int delay; /* all 4 in jiffies */ unsigned long last_link_up; unsigned long last_tx; unsigned long last_rx; unsigned long target_last_arp_rx[BOND_MAX_ARP_TARGETS]; s8 link; /* one of BOND_LINK_XXXX */ s8 link_new_state; /* one of BOND_LINK_XXXX */ u8 backup:1, /* indicates backup slave. Value corresponds with BOND_STATE_ACTIVE and BOND_STATE_BACKUP */ inactive:1, /* indicates inactive slave */ rx_disabled:1, /* indicates whether slave's Rx is disabled */ should_notify:1, /* indicates whether the state changed */ should_notify_link:1; /* indicates whether the link changed */ u8 duplex; u32 original_mtu; u32 link_failure_count; u32 speed; u16 queue_id; u8 perm_hwaddr[MAX_ADDR_LEN]; int prio; struct ad_slave_info *ad_info; struct tlb_slave_info tlb_info; #ifdef CONFIG_NET_POLL_CONTROLLER struct netpoll *np; #endif struct delayed_work notify_work; struct kobject kobj; struct rtnl_link_stats64 slave_stats; }; static inline struct slave *to_slave(struct kobject *kobj) { return container_of(kobj, struct slave, kobj); } struct bond_up_slave { unsigned int count; struct rcu_head rcu; struct slave *arr[]; }; /* * Link pseudo-state only used internally by monitors */ #define BOND_LINK_NOCHANGE -1 struct bond_ipsec { struct list_head list; struct xfrm_state *xs; }; /* * Here are the locking policies for the two bonding locks: * Get rcu_read_lock when reading or RTNL when writing slave list. */ struct bonding { struct net_device *dev; /* first - useful for panic debug */ struct slave __rcu *curr_active_slave; struct slave __rcu *current_arp_slave; struct slave __rcu *primary_slave; struct bond_up_slave __rcu *usable_slaves; struct bond_up_slave __rcu *all_slaves; bool force_primary; bool notifier_ctx; s32 slave_cnt; /* never change this value outside the attach/detach wrappers */ int (*recv_probe)(const struct sk_buff *, struct bonding *, struct slave *); /* mode_lock is used for mode-specific locking needs, currently used by: * 3ad mode (4) - protect against running bond_3ad_unbind_slave() and * bond_3ad_state_machine_handler() concurrently and also * the access to the state machine shared variables. * TLB mode (5) - to sync the use and modifications of its hash table * ALB mode (6) - to sync the use and modifications of its hash table */ spinlock_t mode_lock; spinlock_t stats_lock; u32 send_peer_notif; u8 igmp_retrans; #ifdef CONFIG_PROC_FS struct proc_dir_entry *proc_entry; char proc_file_name[IFNAMSIZ]; #endif /* CONFIG_PROC_FS */ struct list_head bond_list; u32 __percpu *rr_tx_counter; struct ad_bond_info ad_info; struct alb_bond_info alb_info; struct bond_params params; struct workqueue_struct *wq; struct delayed_work mii_work; struct delayed_work arp_work; struct delayed_work alb_work; struct delayed_work ad_work; struct delayed_work mcast_work; struct delayed_work slave_arr_work; struct delayed_work peer_notify_work; #ifdef CONFIG_DEBUG_FS /* debugging support via debugfs */ struct dentry *debug_dir; #endif /* CONFIG_DEBUG_FS */ struct rtnl_link_stats64 bond_stats; #ifdef CONFIG_XFRM_OFFLOAD struct list_head ipsec_list; /* protecting ipsec_list */ struct mutex ipsec_lock; #endif /* CONFIG_XFRM_OFFLOAD */ struct bpf_prog *xdp_prog; }; #define bond_slave_get_rcu(dev) \ ((struct slave *) rcu_dereference(dev->rx_handler_data)) #define bond_slave_get_rtnl(dev) \ ((struct slave *) rtnl_dereference(dev->rx_handler_data)) void bond_queue_slave_event(struct slave *slave); void bond_lower_state_changed(struct slave *slave); struct bond_vlan_tag { __be16 vlan_proto; unsigned short vlan_id; }; /* * Returns NULL if the net_device does not belong to any of the bond's slaves * * Caller must hold bond lock for read */ static inline struct slave *bond_get_slave_by_dev(struct bonding *bond, struct net_device *slave_dev) { return netdev_lower_dev_get_private(bond->dev, slave_dev); } static inline struct bonding *bond_get_bond_by_slave(struct slave *slave) { return slave->bond; } static inline bool bond_should_override_tx_queue(struct bonding *bond) { return BOND_MODE(bond) == BOND_MODE_ACTIVEBACKUP || BOND_MODE(bond) == BOND_MODE_ROUNDROBIN; } static inline bool bond_is_lb(const struct bonding *bond) { return BOND_MODE(bond) == BOND_MODE_TLB || BOND_MODE(bond) == BOND_MODE_ALB; } static inline bool bond_needs_speed_duplex(const struct bonding *bond) { return BOND_MODE(bond) == BOND_MODE_8023AD || bond_is_lb(bond); } static inline bool bond_is_nondyn_tlb(const struct bonding *bond) { return (bond_is_lb(bond) && bond->params.tlb_dynamic_lb == 0); } static inline bool bond_mode_can_use_xmit_hash(const struct bonding *bond) { return (BOND_MODE(bond) == BOND_MODE_8023AD || BOND_MODE(bond) == BOND_MODE_XOR || BOND_MODE(bond) == BOND_MODE_TLB || BOND_MODE(bond) == BOND_MODE_ALB); } static inline bool bond_mode_uses_xmit_hash(const struct bonding *bond) { return (BOND_MODE(bond) == BOND_MODE_8023AD || BOND_MODE(bond) == BOND_MODE_XOR || bond_is_nondyn_tlb(bond)); } static inline bool bond_mode_uses_arp(int mode) { return mode != BOND_MODE_8023AD && mode != BOND_MODE_TLB && mode != BOND_MODE_ALB; } static inline bool bond_mode_uses_primary(int mode) { return mode == BOND_MODE_ACTIVEBACKUP || mode == BOND_MODE_TLB || mode == BOND_MODE_ALB; } static inline bool bond_uses_primary(struct bonding *bond) { return bond_mode_uses_primary(BOND_MODE(bond)); } static inline struct net_device *bond_option_active_slave_get_rcu(struct bonding *bond) { struct slave *slave = rcu_dereference_rtnl(bond->curr_active_slave); return bond_uses_primary(bond) && slave ? slave->dev : NULL; } static inline bool bond_slave_is_up(struct slave *slave) { return netif_running(slave->dev) && netif_carrier_ok(slave->dev); } static inline void bond_set_active_slave(struct slave *slave) { if (slave->backup) { slave->backup = 0; bond_queue_slave_event(slave); bond_lower_state_changed(slave); } } static inline void bond_set_backup_slave(struct slave *slave) { if (!slave->backup) { slave->backup = 1; bond_queue_slave_event(slave); bond_lower_state_changed(slave); } } static inline void bond_set_slave_state(struct slave *slave, int slave_state, bool notify) { if (slave->backup == slave_state) return; slave->backup = slave_state; if (notify) { bond_lower_state_changed(slave); bond_queue_slave_event(slave); slave->should_notify = 0; } else { if (slave->should_notify) slave->should_notify = 0; else slave->should_notify = 1; } } static inline void bond_slave_state_change(struct bonding *bond) { struct list_head *iter; struct slave *tmp; bond_for_each_slave(bond, tmp, iter) { if (tmp->link == BOND_LINK_UP) bond_set_active_slave(tmp); else if (tmp->link == BOND_LINK_DOWN) bond_set_backup_slave(tmp); } } static inline void bond_slave_state_notify(struct bonding *bond) { struct list_head *iter; struct slave *tmp; bond_for_each_slave(bond, tmp, iter) { if (tmp->should_notify) { bond_lower_state_changed(tmp); tmp->should_notify = 0; } } } static inline int bond_slave_state(struct slave *slave) { return slave->backup; } static inline bool bond_is_active_slave(struct slave *slave) { return !bond_slave_state(slave); } static inline bool bond_slave_can_tx(struct slave *slave) { return bond_slave_is_up(slave) && slave->link == BOND_LINK_UP && bond_is_active_slave(slave); } static inline bool bond_is_active_slave_dev(const struct net_device *slave_dev) { struct slave *slave; bool active; rcu_read_lock(); slave = bond_slave_get_rcu(slave_dev); active = bond_is_active_slave(slave); rcu_read_unlock(); return active; } static inline void bond_hw_addr_copy(u8 *dst, const u8 *src, unsigned int len) { if (len == ETH_ALEN) { ether_addr_copy(dst, src); return; } memcpy(dst, src, len); } #define BOND_PRI_RESELECT_ALWAYS 0 #define BOND_PRI_RESELECT_BETTER 1 #define BOND_PRI_RESELECT_FAILURE 2 #define BOND_FOM_NONE 0 #define BOND_FOM_ACTIVE 1 #define BOND_FOM_FOLLOW 2 #define BOND_ARP_TARGETS_ANY 0 #define BOND_ARP_TARGETS_ALL 1 #define BOND_ARP_VALIDATE_NONE 0 #define BOND_ARP_VALIDATE_ACTIVE (1 << BOND_STATE_ACTIVE) #define BOND_ARP_VALIDATE_BACKUP (1 << BOND_STATE_BACKUP) #define BOND_ARP_VALIDATE_ALL (BOND_ARP_VALIDATE_ACTIVE | \ BOND_ARP_VALIDATE_BACKUP) #define BOND_ARP_FILTER (BOND_ARP_VALIDATE_ALL + 1) #define BOND_ARP_FILTER_ACTIVE (BOND_ARP_VALIDATE_ACTIVE | \ BOND_ARP_FILTER) #define BOND_ARP_FILTER_BACKUP (BOND_ARP_VALIDATE_BACKUP | \ BOND_ARP_FILTER) #define BOND_SLAVE_NOTIFY_NOW true #define BOND_SLAVE_NOTIFY_LATER false static inline int slave_do_arp_validate(struct bonding *bond, struct slave *slave) { return bond->params.arp_validate & (1 << bond_slave_state(slave)); } static inline int slave_do_arp_validate_only(struct bonding *bond) { return bond->params.arp_validate & BOND_ARP_FILTER; } static inline int bond_is_ip_target_ok(__be32 addr) { return !ipv4_is_lbcast(addr) && !ipv4_is_zeronet(addr); } #if IS_ENABLED(CONFIG_IPV6) static inline int bond_is_ip6_target_ok(struct in6_addr *addr) { return !ipv6_addr_any(addr) && !ipv6_addr_loopback(addr) && !ipv6_addr_is_multicast(addr); } #endif /* Get the oldest arp which we've received on this slave for bond's * arp_targets. */ static inline unsigned long slave_oldest_target_arp_rx(struct bonding *bond, struct slave *slave) { unsigned long tmp, ret = READ_ONCE(slave->target_last_arp_rx[0]); int i = 1; for (; (i < BOND_MAX_ARP_TARGETS) && bond->params.arp_targets[i]; i++) { tmp = READ_ONCE(slave->target_last_arp_rx[i]); if (time_before(tmp, ret)) ret = tmp; } return ret; } static inline unsigned long slave_last_rx(struct bonding *bond, struct slave *slave) { if (bond->params.arp_all_targets == BOND_ARP_TARGETS_ALL) return slave_oldest_target_arp_rx(bond, slave); return READ_ONCE(slave->last_rx); } static inline void slave_update_last_tx(struct slave *slave) { WRITE_ONCE(slave->last_tx, jiffies); } static inline unsigned long slave_last_tx(struct slave *slave) { return READ_ONCE(slave->last_tx); } #ifdef CONFIG_NET_POLL_CONTROLLER static inline netdev_tx_t bond_netpoll_send_skb(const struct slave *slave, struct sk_buff *skb) { return netpoll_send_skb(slave->np, skb); } #else static inline netdev_tx_t bond_netpoll_send_skb(const struct slave *slave, struct sk_buff *skb) { BUG(); return NETDEV_TX_OK; } #endif static inline void bond_set_slave_inactive_flags(struct slave *slave, bool notify) { if (!bond_is_lb(slave->bond)) bond_set_slave_state(slave, BOND_STATE_BACKUP, notify); if (!slave->bond->params.all_slaves_active) slave->inactive = 1; if (BOND_MODE(slave->bond) == BOND_MODE_8023AD) slave->rx_disabled = 1; } static inline void bond_set_slave_tx_disabled_flags(struct slave *slave, bool notify) { bond_set_slave_state(slave, BOND_STATE_BACKUP, notify); } static inline void bond_set_slave_active_flags(struct slave *slave, bool notify) { bond_set_slave_state(slave, BOND_STATE_ACTIVE, notify); slave->inactive = 0; if (BOND_MODE(slave->bond) == BOND_MODE_8023AD) slave->rx_disabled = 0; } static inline void bond_set_slave_rx_enabled_flags(struct slave *slave, bool notify) { slave->rx_disabled = 0; } static inline bool bond_is_slave_inactive(struct slave *slave) { return slave->inactive; } static inline bool bond_is_slave_rx_disabled(struct slave *slave) { return slave->rx_disabled; } static inline void bond_propose_link_state(struct slave *slave, int state) { slave->link_new_state = state; } static inline void bond_commit_link_state(struct slave *slave, bool notify) { if (slave->link_new_state == BOND_LINK_NOCHANGE) return; slave->link = slave->link_new_state; if (notify) { bond_queue_slave_event(slave); bond_lower_state_changed(slave); slave->should_notify_link = 0; } else { if (slave->should_notify_link) slave->should_notify_link = 0; else slave->should_notify_link = 1; } } static inline void bond_set_slave_link_state(struct slave *slave, int state, bool notify) { bond_propose_link_state(slave, state); bond_commit_link_state(slave, notify); } static inline void bond_slave_link_notify(struct bonding *bond) { struct list_head *iter; struct slave *tmp; bond_for_each_slave(bond, tmp, iter) { if (tmp->should_notify_link) { bond_queue_slave_event(tmp); bond_lower_state_changed(tmp); tmp->should_notify_link = 0; } } } static inline __be32 bond_confirm_addr(struct net_device *dev, __be32 dst, __be32 local) { struct in_device *in_dev; __be32 addr = 0; rcu_read_lock(); in_dev = __in_dev_get_rcu(dev); if (in_dev) addr = inet_confirm_addr(dev_net(dev), in_dev, dst, local, RT_SCOPE_HOST); rcu_read_unlock(); return addr; } struct bond_net { struct net *net; /* Associated network namespace */ struct list_head dev_list; #ifdef CONFIG_PROC_FS struct proc_dir_entry *proc_dir; #endif struct class_attribute class_attr_bonding_masters; }; int bond_rcv_validate(const struct sk_buff *skb, struct bonding *bond, struct slave *slave); netdev_tx_t bond_dev_queue_xmit(struct bonding *bond, struct sk_buff *skb, struct net_device *slave_dev); int bond_create(struct net *net, const char *name); int bond_create_sysfs(struct bond_net *net); void bond_destroy_sysfs(struct bond_net *net); void bond_prepare_sysfs_group(struct bonding *bond); int bond_sysfs_slave_add(struct slave *slave); void bond_sysfs_slave_del(struct slave *slave); void bond_xdp_set_features(struct net_device *bond_dev); int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev, struct netlink_ext_ack *extack); int bond_release(struct net_device *bond_dev, struct net_device *slave_dev); u32 bond_xmit_hash(struct bonding *bond, struct sk_buff *skb); int bond_set_carrier(struct bonding *bond); void bond_select_active_slave(struct bonding *bond); void bond_change_active_slave(struct bonding *bond, struct slave *new_active); void bond_create_debugfs(void); void bond_destroy_debugfs(void); void bond_debug_register(struct bonding *bond); void bond_debug_unregister(struct bonding *bond); void bond_debug_reregister(struct bonding *bond); const char *bond_mode_name(int mode); bool bond_xdp_check(struct bonding *bond, int mode); void bond_setup(struct net_device *bond_dev); unsigned int bond_get_num_tx_queues(void); int bond_netlink_init(void); void bond_netlink_fini(void); struct net_device *bond_option_active_slave_get_rcu(struct bonding *bond); const char *bond_slave_link_status(s8 link); struct bond_vlan_tag *bond_verify_device_path(struct net_device *start_dev, struct net_device *end_dev, int level); int bond_update_slave_arr(struct bonding *bond, struct slave *skipslave); void bond_slave_arr_work_rearm(struct bonding *bond, unsigned long delay); void bond_peer_notify_work_rearm(struct bonding *bond, unsigned long delay); void bond_work_init_all(struct bonding *bond); void bond_work_cancel_all(struct bonding *bond); #ifdef CONFIG_PROC_FS void bond_create_proc_entry(struct bonding *bond); void bond_remove_proc_entry(struct bonding *bond); void bond_create_proc_dir(struct bond_net *bn); void bond_destroy_proc_dir(struct bond_net *bn); #else static inline void bond_create_proc_entry(struct bonding *bond) { } static inline void bond_remove_proc_entry(struct bonding *bond) { } static inline void bond_create_proc_dir(struct bond_net *bn) { } static inline void bond_destroy_proc_dir(struct bond_net *bn) { } #endif static inline struct slave *bond_slave_has_mac(struct bonding *bond, const u8 *mac) { struct list_head *iter; struct slave *tmp; bond_for_each_slave(bond, tmp, iter) if (ether_addr_equal_64bits(mac, tmp->dev->dev_addr)) return tmp; return NULL; } /* Caller must hold rcu_read_lock() for read */ static inline bool bond_slave_has_mac_rcu(struct bonding *bond, const u8 *mac) { struct list_head *iter; struct slave *tmp; bond_for_each_slave_rcu(bond, tmp, iter) if (ether_addr_equal_64bits(mac, tmp->dev->dev_addr)) return true; return false; } /* Check if the ip is present in arp ip list, or first free slot if ip == 0 * Returns -1 if not found, index if found */ static inline int bond_get_targets_ip(__be32 *targets, __be32 ip) { int i; for (i = 0; i < BOND_MAX_ARP_TARGETS; i++) if (targets[i] == ip) return i; else if (targets[i] == 0) break; return -1; } #if IS_ENABLED(CONFIG_IPV6) static inline int bond_get_targets_ip6(struct in6_addr *targets, struct in6_addr *ip) { struct in6_addr mcaddr; int i; for (i = 0; i < BOND_MAX_NS_TARGETS; i++) { addrconf_addr_solict_mult(&targets[i], &mcaddr); if ((ipv6_addr_equal(&targets[i], ip)) || (ipv6_addr_equal(&mcaddr, ip))) return i; else if (ipv6_addr_any(&targets[i])) break; } return -1; } #endif /* exported from bond_main.c */ extern unsigned int bond_net_id; /* exported from bond_netlink.c */ extern struct rtnl_link_ops bond_link_ops; /* exported from bond_sysfs_slave.c */ extern const struct sysfs_ops slave_sysfs_ops; /* exported from bond_3ad.c */ extern const u8 lacpdu_mcast_addr[]; static inline netdev_tx_t bond_tx_drop(struct net_device *dev, struct sk_buff *skb) { dev_core_stats_tx_dropped_inc(dev); dev_kfree_skb_any(skb); return NET_XMIT_DROP; } #endif /* _NET_BONDING_H */
16 3 7 9 4 2 4 4 7 45 36 9 3 45 4 54 58 58 2 11 9 29 34 1 1 32 1 1 28 24 4 20 1 17 1 57 57 56 14 3 2 39 38 1 50 30 2 8 38 25 1033 1037 162 886 80 2 2 2 3 2 2 2 2 2 2 2 1 1 3 1 3 3 4 1 4 2 2 1 4 2 2 1 3 1 16 2 1 2 4 3 1 1 2 1 1 1 1 576 3 1 2 7 4 4 3 1 2 3 5 1 1 2 2 2 3 1 1 1 2 1 3 1 1 2 3 2 2 1 2 2 1 2 1 1 2 2 1 1 2 2 1 3 2 3 3 4 2 4 1 2 1 1 3 2 5 1 1 21 6 2 1 3 3 2 1 39 13 3 24 32 1 6 7 126 1 42 2 1129 34 62 681 356 10 2 1 2 2 3 2 5 15 1 14 1 5 7 162 164 7 156 41 6 4 2 15 1 24 1 10 23 10 26 8 25 9 24 10 29 5 3 1 2 1 1 1 1 1 1 10 2 8 1 1 1 1 1 1 1 1 1 5 2 4 2 1 3 1 1 1 1 1 2 1 1 1 1 1 2 1 1 1 2 1 1 1 1 3 1 3 1 1 1 2 1 1 2 1 43 43 187 4 21 131 1 22 9 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 // SPDX-License-Identifier: GPL-2.0-or-later /* * IPv6 BSD socket options interface * Linux INET6 implementation * * Authors: * Pedro Roque <roque@di.fc.ul.pt> * * Based on linux/net/ipv4/ip_sockglue.c * * FIXME: Make the setsockopt code POSIX compliant: That is * * o Truncate getsockopt returns * o Return an optlen of the truncated length if need be * * Changes: * David L Stevens <dlstevens@us.ibm.com>: * - added multicast source filtering API for MLDv2 */ #include <linux/module.h> #include <linux/capability.h> #include <linux/errno.h> #include <linux/types.h> #include <linux/socket.h> #include <linux/sockios.h> #include <linux/net.h> #include <linux/in6.h> #include <linux/mroute6.h> #include <linux/netdevice.h> #include <linux/if_arp.h> #include <linux/init.h> #include <linux/sysctl.h> #include <linux/netfilter.h> #include <linux/slab.h> #include <net/sock.h> #include <net/snmp.h> #include <net/ipv6.h> #include <net/ndisc.h> #include <net/protocol.h> #include <net/transp_v6.h> #include <net/ip6_route.h> #include <net/addrconf.h> #include <net/inet_common.h> #include <net/tcp.h> #include <net/udp.h> #include <net/udplite.h> #include <net/xfrm.h> #include <net/compat.h> #include <net/seg6.h> #include <net/psp.h> #include <linux/uaccess.h> struct ip6_ra_chain *ip6_ra_chain; DEFINE_RWLOCK(ip6_ra_lock); DEFINE_STATIC_KEY_FALSE(ip6_min_hopcount); int ip6_ra_control(struct sock *sk, int sel) { struct ip6_ra_chain *ra, *new_ra, **rap; /* RA packet may be delivered ONLY to IPPROTO_RAW socket */ if (sk->sk_type != SOCK_RAW || inet_sk(sk)->inet_num != IPPROTO_RAW) return -ENOPROTOOPT; new_ra = (sel >= 0) ? kmalloc_obj(*new_ra) : NULL; if (sel >= 0 && !new_ra) return -ENOMEM; write_lock_bh(&ip6_ra_lock); for (rap = &ip6_ra_chain; (ra = *rap) != NULL; rap = &ra->next) { if (ra->sk == sk) { if (sel >= 0) { write_unlock_bh(&ip6_ra_lock); kfree(new_ra); return -EADDRINUSE; } *rap = ra->next; write_unlock_bh(&ip6_ra_lock); sock_put(sk); kfree(ra); return 0; } } if (!new_ra) { write_unlock_bh(&ip6_ra_lock); return -ENOBUFS; } new_ra->sk = sk; new_ra->sel = sel; new_ra->next = ra; *rap = new_ra; sock_hold(sk); write_unlock_bh(&ip6_ra_lock); return 0; } struct ipv6_txoptions *ipv6_update_options(struct sock *sk, struct ipv6_txoptions *opt) { if (inet_test_bit(IS_ICSK, sk)) { if (opt && !((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE)) && inet_sk(sk)->inet_daddr != LOOPBACK4_IPV6) { struct inet_connection_sock *icsk = inet_csk(sk); icsk->icsk_ext_hdr_len = psp_sk_overhead(sk) + opt->opt_flen + opt->opt_nflen; icsk->icsk_sync_mss(sk, icsk->icsk_pmtu_cookie); } } opt = unrcu_pointer(xchg(&inet6_sk(sk)->opt, RCU_INITIALIZER(opt))); sk_dst_reset(sk); return opt; } static int copy_group_source_from_sockptr(struct group_source_req *greqs, sockptr_t optval, int optlen) { if (in_compat_syscall()) { struct compat_group_source_req gr32; if (optlen < sizeof(gr32)) return -EINVAL; if (copy_from_sockptr(&gr32, optval, sizeof(gr32))) return -EFAULT; greqs->gsr_interface = gr32.gsr_interface; greqs->gsr_group = gr32.gsr_group; greqs->gsr_source = gr32.gsr_source; } else { if (optlen < sizeof(*greqs)) return -EINVAL; if (copy_from_sockptr(greqs, optval, sizeof(*greqs))) return -EFAULT; } return 0; } static int do_ipv6_mcast_group_source(struct sock *sk, int optname, sockptr_t optval, int optlen) { struct group_source_req greqs; int omode, add; int ret; ret = copy_group_source_from_sockptr(&greqs, optval, optlen); if (ret) return ret; if (greqs.gsr_group.ss_family != AF_INET6 || greqs.gsr_source.ss_family != AF_INET6) return -EADDRNOTAVAIL; if (optname == MCAST_BLOCK_SOURCE) { omode = MCAST_EXCLUDE; add = 1; } else if (optname == MCAST_UNBLOCK_SOURCE) { omode = MCAST_EXCLUDE; add = 0; } else if (optname == MCAST_JOIN_SOURCE_GROUP) { struct sockaddr_in6 *psin6; int retv; psin6 = (struct sockaddr_in6 *)&greqs.gsr_group; retv = ipv6_sock_mc_join_ssm(sk, greqs.gsr_interface, &psin6->sin6_addr, MCAST_INCLUDE); /* prior join w/ different source is ok */ if (retv && retv != -EADDRINUSE) return retv; omode = MCAST_INCLUDE; add = 1; } else /* MCAST_LEAVE_SOURCE_GROUP */ { omode = MCAST_INCLUDE; add = 0; } return ip6_mc_source(add, omode, sk, &greqs); } static int ipv6_set_mcast_msfilter(struct sock *sk, sockptr_t optval, int optlen) { struct group_filter *gsf; int ret; if (optlen < GROUP_FILTER_SIZE(0)) return -EINVAL; if (optlen > READ_ONCE(sock_net(sk)->core.sysctl_optmem_max)) return -ENOBUFS; gsf = memdup_sockptr(optval, optlen); if (IS_ERR(gsf)) return PTR_ERR(gsf); /* numsrc >= (4G-140)/128 overflow in 32 bits */ ret = -ENOBUFS; if (gsf->gf_numsrc >= 0x1ffffffU || gsf->gf_numsrc > sysctl_mld_max_msf) goto out_free_gsf; ret = -EINVAL; if (GROUP_FILTER_SIZE(gsf->gf_numsrc) > optlen) goto out_free_gsf; ret = ip6_mc_msfilter(sk, gsf, gsf->gf_slist_flex); out_free_gsf: kfree(gsf); return ret; } static int compat_ipv6_set_mcast_msfilter(struct sock *sk, sockptr_t optval, int optlen) { const int size0 = offsetof(struct compat_group_filter, gf_slist_flex); struct compat_group_filter *gf32; void *p; int ret; int n; if (optlen < size0) return -EINVAL; if (optlen > READ_ONCE(sock_net(sk)->core.sysctl_optmem_max) - 4) return -ENOBUFS; p = kmalloc(optlen + 4, GFP_KERNEL); if (!p) return -ENOMEM; gf32 = p + 4; /* we want ->gf_group and ->gf_slist_flex aligned */ ret = -EFAULT; if (copy_from_sockptr(gf32, optval, optlen)) goto out_free_p; /* numsrc >= (4G-140)/128 overflow in 32 bits */ ret = -ENOBUFS; n = gf32->gf_numsrc; if (n >= 0x1ffffffU || n > sysctl_mld_max_msf) goto out_free_p; ret = -EINVAL; if (offsetof(struct compat_group_filter, gf_slist_flex[n]) > optlen) goto out_free_p; ret = ip6_mc_msfilter(sk, &(struct group_filter){ .gf_interface = gf32->gf_interface, .gf_group = gf32->gf_group, .gf_fmode = gf32->gf_fmode, .gf_numsrc = gf32->gf_numsrc}, gf32->gf_slist_flex); out_free_p: kfree(p); return ret; } static int ipv6_mcast_join_leave(struct sock *sk, int optname, sockptr_t optval, int optlen) { struct sockaddr_in6 *psin6; struct group_req greq; if (optlen < sizeof(greq)) return -EINVAL; if (copy_from_sockptr(&greq, optval, sizeof(greq))) return -EFAULT; if (greq.gr_group.ss_family != AF_INET6) return -EADDRNOTAVAIL; psin6 = (struct sockaddr_in6 *)&greq.gr_group; if (optname == MCAST_JOIN_GROUP) return ipv6_sock_mc_join(sk, greq.gr_interface, &psin6->sin6_addr); return ipv6_sock_mc_drop(sk, greq.gr_interface, &psin6->sin6_addr); } static int compat_ipv6_mcast_join_leave(struct sock *sk, int optname, sockptr_t optval, int optlen) { struct compat_group_req gr32; struct sockaddr_in6 *psin6; if (optlen < sizeof(gr32)) return -EINVAL; if (copy_from_sockptr(&gr32, optval, sizeof(gr32))) return -EFAULT; if (gr32.gr_group.ss_family != AF_INET6) return -EADDRNOTAVAIL; psin6 = (struct sockaddr_in6 *)&gr32.gr_group; if (optname == MCAST_JOIN_GROUP) return ipv6_sock_mc_join(sk, gr32.gr_interface, &psin6->sin6_addr); return ipv6_sock_mc_drop(sk, gr32.gr_interface, &psin6->sin6_addr); } static int ipv6_set_opt_hdr(struct sock *sk, int optname, sockptr_t optval, int optlen) { struct ipv6_pinfo *np = inet6_sk(sk); struct ipv6_opt_hdr *new = NULL; struct net *net = sock_net(sk); struct ipv6_txoptions *opt; int err; /* hop-by-hop / destination options are privileged option */ if (optname != IPV6_RTHDR && !sockopt_ns_capable(net->user_ns, CAP_NET_RAW)) return -EPERM; /* remove any sticky options header with a zero option * length, per RFC3542. */ if (optlen > 0) { if (sockptr_is_null(optval)) return -EINVAL; if (optlen < sizeof(struct ipv6_opt_hdr) || optlen & 0x7 || optlen > 8 * 255) return -EINVAL; new = memdup_sockptr(optval, optlen); if (IS_ERR(new)) return PTR_ERR(new); if (unlikely(ipv6_optlen(new) > optlen)) { kfree(new); return -EINVAL; } } opt = rcu_dereference_protected(np->opt, lockdep_sock_is_held(sk)); opt = ipv6_renew_options(sk, opt, optname, new); kfree(new); if (IS_ERR(opt)) return PTR_ERR(opt); /* routing header option needs extra check */ err = -EINVAL; if (optname == IPV6_RTHDR && opt && opt->srcrt) { struct ipv6_rt_hdr *rthdr = opt->srcrt; switch (rthdr->type) { #if IS_ENABLED(CONFIG_IPV6_MIP6) case IPV6_SRCRT_TYPE_2: if (rthdr->hdrlen != 2 || rthdr->segments_left != 1) goto sticky_done; break; #endif case IPV6_SRCRT_TYPE_4: { struct ipv6_sr_hdr *srh = (struct ipv6_sr_hdr *)opt->srcrt; if (!seg6_validate_srh(srh, optlen, false)) goto sticky_done; break; } default: goto sticky_done; } } err = 0; opt = ipv6_update_options(sk, opt); sticky_done: if (opt) { atomic_sub(opt->tot_len, &sk->sk_omem_alloc); txopt_put(opt); } return err; } int do_ipv6_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen) { struct ipv6_pinfo *np = inet6_sk(sk); struct net *net = sock_net(sk); int retv = -ENOPROTOOPT; int val, valbool; if (sockptr_is_null(optval)) val = 0; else { if (optlen >= sizeof(int)) { if (copy_from_sockptr(&val, optval, sizeof(val))) return -EFAULT; } else val = 0; } valbool = (val != 0); if (ip6_mroute_opt(optname)) return ip6_mroute_setsockopt(sk, optname, optval, optlen); /* Handle options that can be set without locking the socket. */ switch (optname) { case IPV6_UNICAST_HOPS: if (optlen < sizeof(int)) return -EINVAL; if (val > 255 || val < -1) return -EINVAL; WRITE_ONCE(np->hop_limit, val); return 0; case IPV6_MULTICAST_LOOP: if (optlen < sizeof(int)) return -EINVAL; if (val != valbool) return -EINVAL; inet6_assign_bit(MC6_LOOP, sk, valbool); return 0; case IPV6_MULTICAST_HOPS: if (sk->sk_type == SOCK_STREAM) return retv; if (optlen < sizeof(int)) return -EINVAL; if (val > 255 || val < -1) return -EINVAL; WRITE_ONCE(np->mcast_hops, val == -1 ? IPV6_DEFAULT_MCASTHOPS : val); return 0; case IPV6_MTU: if (optlen < sizeof(int)) return -EINVAL; if (val && val < IPV6_MIN_MTU) return -EINVAL; WRITE_ONCE(np->frag_size, val); return 0; case IPV6_MINHOPCOUNT: if (optlen < sizeof(int)) return -EINVAL; if (val < 0 || val > 255) return -EINVAL; if (val) static_branch_enable(&ip6_min_hopcount); /* tcp_v6_err() and tcp_v6_rcv() might read min_hopcount * while we are changing it. */ WRITE_ONCE(np->min_hopcount, val); return 0; case IPV6_RECVERR_RFC4884: if (optlen < sizeof(int)) return -EINVAL; if (val < 0 || val > 1) return -EINVAL; inet6_assign_bit(RECVERR6_RFC4884, sk, valbool); return 0; case IPV6_MULTICAST_ALL: if (optlen < sizeof(int)) return -EINVAL; inet6_assign_bit(MC6_ALL, sk, valbool); return 0; case IPV6_AUTOFLOWLABEL: inet6_assign_bit(AUTOFLOWLABEL, sk, valbool); inet6_set_bit(AUTOFLOWLABEL_SET, sk); return 0; case IPV6_DONTFRAG: inet6_assign_bit(DONTFRAG, sk, valbool); return 0; case IPV6_RECVERR: if (optlen < sizeof(int)) return -EINVAL; inet6_assign_bit(RECVERR6, sk, valbool); if (!val) skb_errqueue_purge(&sk->sk_error_queue); return 0; case IPV6_ROUTER_ALERT_ISOLATE: if (optlen < sizeof(int)) return -EINVAL; inet6_assign_bit(RTALERT_ISOLATE, sk, valbool); return 0; case IPV6_MTU_DISCOVER: if (optlen < sizeof(int)) return -EINVAL; if (val < IPV6_PMTUDISC_DONT || val > IPV6_PMTUDISC_OMIT) return -EINVAL; WRITE_ONCE(np->pmtudisc, val); return 0; case IPV6_FLOWINFO_SEND: if (optlen < sizeof(int)) return -EINVAL; inet6_assign_bit(SNDFLOW, sk, valbool); return 0; case IPV6_ADDR_PREFERENCES: if (optlen < sizeof(int)) return -EINVAL; return ip6_sock_set_addr_preferences(sk, val); case IPV6_MULTICAST_IF: if (sk->sk_type == SOCK_STREAM) return -ENOPROTOOPT; if (optlen < sizeof(int)) return -EINVAL; if (val) { struct net_device *dev; int bound_dev_if, midx; rcu_read_lock(); dev = dev_get_by_index_rcu(net, val); if (!dev) { rcu_read_unlock(); return -ENODEV; } midx = l3mdev_master_ifindex_rcu(dev); rcu_read_unlock(); bound_dev_if = READ_ONCE(sk->sk_bound_dev_if); if (bound_dev_if && bound_dev_if != val && (!midx || midx != bound_dev_if)) return -EINVAL; } WRITE_ONCE(np->mcast_oif, val); return 0; case IPV6_UNICAST_IF: { struct net_device *dev; int ifindex; if (optlen != sizeof(int)) return -EINVAL; ifindex = (__force int)ntohl((__force __be32)val); if (!ifindex) { WRITE_ONCE(np->ucast_oif, 0); return 0; } dev = dev_get_by_index(net, ifindex); if (!dev) return -EADDRNOTAVAIL; dev_put(dev); if (READ_ONCE(sk->sk_bound_dev_if)) return -EINVAL; WRITE_ONCE(np->ucast_oif, ifindex); return 0; } } sockopt_lock_sock(sk); /* Another thread has converted the socket into IPv4 with * IPV6_ADDRFORM concurrently. */ if (unlikely(sk->sk_family != AF_INET6)) goto unlock; switch (optname) { case IPV6_ADDRFORM: if (optlen < sizeof(int)) goto e_inval; if (val == PF_INET) { if (sk->sk_type == SOCK_RAW) break; if (sk->sk_protocol == IPPROTO_UDP || sk->sk_protocol == IPPROTO_UDPLITE) { struct udp_sock *up = udp_sk(sk); if (up->pending == AF_INET6) { retv = -EBUSY; break; } } else if (sk->sk_protocol == IPPROTO_TCP) { if (sk->sk_prot != &tcpv6_prot) { retv = -EBUSY; break; } } else { break; } if (sk->sk_state != TCP_ESTABLISHED) { retv = -ENOTCONN; break; } if (ipv6_only_sock(sk) || !ipv6_addr_v4mapped(&sk->sk_v6_daddr)) { retv = -EADDRNOTAVAIL; break; } __ipv6_sock_mc_close(sk); __ipv6_sock_ac_close(sk); if (sk->sk_protocol == IPPROTO_TCP) { struct inet_connection_sock *icsk = inet_csk(sk); sock_prot_inuse_add(net, sk->sk_prot, -1); sock_prot_inuse_add(net, &tcp_prot, 1); /* Paired with READ_ONCE(sk->sk_prot) in inet6_stream_ops */ WRITE_ONCE(sk->sk_prot, &tcp_prot); /* Paired with READ_ONCE() in tcp_(get|set)sockopt() */ WRITE_ONCE(icsk->icsk_af_ops, &ipv4_specific); WRITE_ONCE(sk->sk_socket->ops, &inet_stream_ops); WRITE_ONCE(sk->sk_family, PF_INET); tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); } else { struct proto *prot = &udp_prot; if (sk->sk_protocol == IPPROTO_UDPLITE) prot = &udplite_prot; sock_prot_inuse_add(net, sk->sk_prot, -1); sock_prot_inuse_add(net, prot, 1); /* Paired with READ_ONCE(sk->sk_prot) in inet6_dgram_ops */ WRITE_ONCE(sk->sk_prot, prot); WRITE_ONCE(sk->sk_socket->ops, &inet_dgram_ops); WRITE_ONCE(sk->sk_family, PF_INET); } /* Disable all options not to allocate memory anymore, * but there is still a race. See the lockless path * in udpv6_sendmsg() and ipv6_local_rxpmtu(). */ np->rxopt.all = 0; inet6_cleanup_sock(sk); module_put(THIS_MODULE); retv = 0; break; } goto e_inval; case IPV6_V6ONLY: if (optlen < sizeof(int) || inet_sk(sk)->inet_num) goto e_inval; sk->sk_ipv6only = valbool; retv = 0; break; case IPV6_RECVPKTINFO: if (optlen < sizeof(int)) goto e_inval; np->rxopt.bits.rxinfo = valbool; retv = 0; break; case IPV6_2292PKTINFO: if (optlen < sizeof(int)) goto e_inval; np->rxopt.bits.rxoinfo = valbool; retv = 0; break; case IPV6_RECVHOPLIMIT: if (optlen < sizeof(int)) goto e_inval; np->rxopt.bits.rxhlim = valbool; retv = 0; break; case IPV6_2292HOPLIMIT: if (optlen < sizeof(int)) goto e_inval; np->rxopt.bits.rxohlim = valbool; retv = 0; break; case IPV6_RECVRTHDR: if (optlen < sizeof(int)) goto e_inval; np->rxopt.bits.srcrt = valbool; retv = 0; break; case IPV6_2292RTHDR: if (optlen < sizeof(int)) goto e_inval; np->rxopt.bits.osrcrt = valbool; retv = 0; break; case IPV6_RECVHOPOPTS: if (optlen < sizeof(int)) goto e_inval; np->rxopt.bits.hopopts = valbool; retv = 0; break; case IPV6_2292HOPOPTS: if (optlen < sizeof(int)) goto e_inval; np->rxopt.bits.ohopopts = valbool; retv = 0; break; case IPV6_RECVDSTOPTS: if (optlen < sizeof(int)) goto e_inval; np->rxopt.bits.dstopts = valbool; retv = 0; break; case IPV6_2292DSTOPTS: if (optlen < sizeof(int)) goto e_inval; np->rxopt.bits.odstopts = valbool; retv = 0; break; case IPV6_TCLASS: if (optlen < sizeof(int)) goto e_inval; if (val < -1 || val > 0xff) goto e_inval; /* RFC 3542, 6.5: default traffic class of 0x0 */ if (val == -1) val = 0; if (sk->sk_type == SOCK_STREAM) { val &= ~INET_ECN_MASK; val |= np->tclass & INET_ECN_MASK; } if (np->tclass != val) { np->tclass = val; sk_dst_reset(sk); } retv = 0; break; case IPV6_RECVTCLASS: if (optlen < sizeof(int)) goto e_inval; np->rxopt.bits.rxtclass = valbool; retv = 0; break; case IPV6_FLOWINFO: if (optlen < sizeof(int)) goto e_inval; np->rxopt.bits.rxflow = valbool; retv = 0; break; case IPV6_RECVPATHMTU: if (optlen < sizeof(int)) goto e_inval; np->rxopt.bits.rxpmtu = valbool; retv = 0; break; case IPV6_TRANSPARENT: if (valbool && !sockopt_ns_capable(net->user_ns, CAP_NET_RAW) && !sockopt_ns_capable(net->user_ns, CAP_NET_ADMIN)) { retv = -EPERM; break; } if (optlen < sizeof(int)) goto e_inval; /* we don't have a separate transparent bit for IPV6 we use the one in the IPv4 socket */ inet_assign_bit(TRANSPARENT, sk, valbool); retv = 0; break; case IPV6_FREEBIND: if (optlen < sizeof(int)) goto e_inval; /* we also don't have a separate freebind bit for IPV6 */ inet_assign_bit(FREEBIND, sk, valbool); retv = 0; break; case IPV6_RECVORIGDSTADDR: if (optlen < sizeof(int)) goto e_inval; np->rxopt.bits.rxorigdstaddr = valbool; retv = 0; break; case IPV6_HOPOPTS: case IPV6_RTHDRDSTOPTS: case IPV6_RTHDR: case IPV6_DSTOPTS: retv = ipv6_set_opt_hdr(sk, optname, optval, optlen); break; case IPV6_PKTINFO: { struct in6_pktinfo pkt; if (optlen == 0) goto e_inval; else if (optlen < sizeof(struct in6_pktinfo) || sockptr_is_null(optval)) goto e_inval; if (copy_from_sockptr(&pkt, optval, sizeof(pkt))) { retv = -EFAULT; break; } if (!sk_dev_equal_l3scope(sk, pkt.ipi6_ifindex)) goto e_inval; np->sticky_pktinfo.ipi6_ifindex = pkt.ipi6_ifindex; np->sticky_pktinfo.ipi6_addr = pkt.ipi6_addr; retv = 0; break; } case IPV6_2292PKTOPTIONS: { struct ipv6_txoptions *opt = NULL; struct msghdr msg; struct flowi6 fl6; struct ipcm6_cookie ipc6; memset(&fl6, 0, sizeof(fl6)); fl6.flowi6_oif = sk->sk_bound_dev_if; fl6.flowi6_mark = sk->sk_mark; if (optlen == 0) goto update; /* 1K is probably excessive * 1K is surely not enough, 2K per standard header is 16K. */ retv = -EINVAL; if (optlen > 64*1024) break; opt = sock_kmalloc(sk, sizeof(*opt) + optlen, GFP_KERNEL); retv = -ENOBUFS; if (!opt) break; memset(opt, 0, sizeof(*opt)); refcount_set(&opt->refcnt, 1); opt->tot_len = sizeof(*opt) + optlen; retv = -EFAULT; if (copy_from_sockptr(opt + 1, optval, optlen)) goto done; msg.msg_controllen = optlen; msg.msg_control_is_user = false; msg.msg_control = (void *)(opt+1); ipc6.opt = opt; retv = ip6_datagram_send_ctl(net, sk, &msg, &fl6, &ipc6); if (retv) goto done; update: retv = 0; opt = ipv6_update_options(sk, opt); done: if (opt) { atomic_sub(opt->tot_len, &sk->sk_omem_alloc); txopt_put(opt); } break; } case IPV6_ADD_MEMBERSHIP: case IPV6_DROP_MEMBERSHIP: { struct ipv6_mreq mreq; if (optlen < sizeof(struct ipv6_mreq)) goto e_inval; retv = -EPROTO; if (inet_test_bit(IS_ICSK, sk)) break; retv = -EFAULT; if (copy_from_sockptr(&mreq, optval, sizeof(struct ipv6_mreq))) break; if (optname == IPV6_ADD_MEMBERSHIP) retv = ipv6_sock_mc_join(sk, mreq.ipv6mr_ifindex, &mreq.ipv6mr_multiaddr); else retv = ipv6_sock_mc_drop(sk, mreq.ipv6mr_ifindex, &mreq.ipv6mr_multiaddr); break; } case IPV6_JOIN_ANYCAST: case IPV6_LEAVE_ANYCAST: { struct ipv6_mreq mreq; if (optlen < sizeof(struct ipv6_mreq)) goto e_inval; retv = -EFAULT; if (copy_from_sockptr(&mreq, optval, sizeof(struct ipv6_mreq))) break; if (optname == IPV6_JOIN_ANYCAST) retv = ipv6_sock_ac_join(sk, mreq.ipv6mr_ifindex, &mreq.ipv6mr_acaddr); else retv = ipv6_sock_ac_drop(sk, mreq.ipv6mr_ifindex, &mreq.ipv6mr_acaddr); break; } case MCAST_JOIN_GROUP: case MCAST_LEAVE_GROUP: if (in_compat_syscall()) retv = compat_ipv6_mcast_join_leave(sk, optname, optval, optlen); else retv = ipv6_mcast_join_leave(sk, optname, optval, optlen); break; case MCAST_JOIN_SOURCE_GROUP: case MCAST_LEAVE_SOURCE_GROUP: case MCAST_BLOCK_SOURCE: case MCAST_UNBLOCK_SOURCE: retv = do_ipv6_mcast_group_source(sk, optname, optval, optlen); break; case MCAST_MSFILTER: if (in_compat_syscall()) retv = compat_ipv6_set_mcast_msfilter(sk, optval, optlen); else retv = ipv6_set_mcast_msfilter(sk, optval, optlen); break; case IPV6_ROUTER_ALERT: if (optlen < sizeof(int)) goto e_inval; retv = ip6_ra_control(sk, val); if (retv == 0) inet6_assign_bit(RTALERT, sk, valbool); break; case IPV6_FLOWLABEL_MGR: retv = ipv6_flowlabel_opt(sk, optval, optlen); break; case IPV6_IPSEC_POLICY: case IPV6_XFRM_POLICY: retv = -EPERM; if (!sockopt_ns_capable(net->user_ns, CAP_NET_ADMIN)) break; retv = xfrm_user_policy(sk, optname, optval, optlen); break; case IPV6_RECVFRAGSIZE: np->rxopt.bits.recvfragsize = valbool; retv = 0; break; } unlock: sockopt_release_sock(sk); return retv; e_inval: retv = -EINVAL; goto unlock; } int ipv6_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen) { int err; if (level == SOL_IP && sk->sk_type != SOCK_RAW) return ip_setsockopt(sk, level, optname, optval, optlen); if (level != SOL_IPV6) return -ENOPROTOOPT; err = do_ipv6_setsockopt(sk, level, optname, optval, optlen); #ifdef CONFIG_NETFILTER /* we need to exclude all possible ENOPROTOOPTs except default case */ if (err == -ENOPROTOOPT && optname != IPV6_IPSEC_POLICY && optname != IPV6_XFRM_POLICY) err = nf_setsockopt(sk, PF_INET6, optname, optval, optlen); #endif return err; } EXPORT_SYMBOL(ipv6_setsockopt); static int ipv6_getsockopt_sticky(struct sock *sk, struct ipv6_txoptions *opt, int optname, sockptr_t optval, int len) { struct ipv6_opt_hdr *hdr; if (!opt) return 0; switch (optname) { case IPV6_HOPOPTS: hdr = opt->hopopt; break; case IPV6_RTHDRDSTOPTS: hdr = opt->dst0opt; break; case IPV6_RTHDR: hdr = (struct ipv6_opt_hdr *)opt->srcrt; break; case IPV6_DSTOPTS: hdr = opt->dst1opt; break; default: return -EINVAL; /* should not happen */ } if (!hdr) return 0; len = min_t(unsigned int, len, ipv6_optlen(hdr)); if (copy_to_sockptr(optval, hdr, len)) return -EFAULT; return len; } static int ipv6_get_msfilter(struct sock *sk, sockptr_t optval, sockptr_t optlen, int len) { const int size0 = offsetof(struct group_filter, gf_slist_flex); struct group_filter gsf; int num; int err; if (len < size0) return -EINVAL; if (copy_from_sockptr(&gsf, optval, size0)) return -EFAULT; if (gsf.gf_group.ss_family != AF_INET6) return -EADDRNOTAVAIL; num = gsf.gf_numsrc; sockopt_lock_sock(sk); err = ip6_mc_msfget(sk, &gsf, optval, size0); if (!err) { if (num > gsf.gf_numsrc) num = gsf.gf_numsrc; len = GROUP_FILTER_SIZE(num); if (copy_to_sockptr(optlen, &len, sizeof(int)) || copy_to_sockptr(optval, &gsf, size0)) err = -EFAULT; } sockopt_release_sock(sk); return err; } static int compat_ipv6_get_msfilter(struct sock *sk, sockptr_t optval, sockptr_t optlen, int len) { const int size0 = offsetof(struct compat_group_filter, gf_slist_flex); struct compat_group_filter gf32; struct group_filter gf; int err; int num; if (len < size0) return -EINVAL; if (copy_from_sockptr(&gf32, optval, size0)) return -EFAULT; gf.gf_interface = gf32.gf_interface; gf.gf_fmode = gf32.gf_fmode; num = gf.gf_numsrc = gf32.gf_numsrc; gf.gf_group = gf32.gf_group; if (gf.gf_group.ss_family != AF_INET6) return -EADDRNOTAVAIL; sockopt_lock_sock(sk); err = ip6_mc_msfget(sk, &gf, optval, size0); sockopt_release_sock(sk); if (err) return err; if (num > gf.gf_numsrc) num = gf.gf_numsrc; len = GROUP_FILTER_SIZE(num) - (sizeof(gf)-sizeof(gf32)); if (copy_to_sockptr(optlen, &len, sizeof(int)) || copy_to_sockptr_offset(optval, offsetof(struct compat_group_filter, gf_fmode), &gf.gf_fmode, sizeof(gf32.gf_fmode)) || copy_to_sockptr_offset(optval, offsetof(struct compat_group_filter, gf_numsrc), &gf.gf_numsrc, sizeof(gf32.gf_numsrc))) return -EFAULT; return 0; } int do_ipv6_getsockopt(struct sock *sk, int level, int optname, sockptr_t optval, sockptr_t optlen) { struct ipv6_pinfo *np = inet6_sk(sk); int len; int val; if (ip6_mroute_opt(optname)) return ip6_mroute_getsockopt(sk, optname, optval, optlen); if (copy_from_sockptr(&len, optlen, sizeof(int))) return -EFAULT; switch (optname) { case IPV6_ADDRFORM: if (sk->sk_protocol != IPPROTO_UDP && sk->sk_protocol != IPPROTO_UDPLITE && sk->sk_protocol != IPPROTO_TCP) return -ENOPROTOOPT; if (sk->sk_state != TCP_ESTABLISHED) return -ENOTCONN; val = sk->sk_family; break; case MCAST_MSFILTER: if (in_compat_syscall()) return compat_ipv6_get_msfilter(sk, optval, optlen, len); return ipv6_get_msfilter(sk, optval, optlen, len); case IPV6_2292PKTOPTIONS: { struct msghdr msg; struct sk_buff *skb; if (sk->sk_type != SOCK_STREAM) return -ENOPROTOOPT; if (optval.is_kernel) { msg.msg_control_is_user = false; msg.msg_control = optval.kernel; } else { msg.msg_control_is_user = true; msg.msg_control_user = optval.user; } msg.msg_controllen = len; msg.msg_flags = 0; sockopt_lock_sock(sk); skb = np->pktoptions; if (skb) ip6_datagram_recv_ctl(sk, &msg, skb); sockopt_release_sock(sk); if (!skb) { if (np->rxopt.bits.rxinfo) { int mcast_oif = READ_ONCE(np->mcast_oif); struct in6_pktinfo src_info; src_info.ipi6_ifindex = mcast_oif ? : np->sticky_pktinfo.ipi6_ifindex; src_info.ipi6_addr = mcast_oif ? sk->sk_v6_daddr : np->sticky_pktinfo.ipi6_addr; put_cmsg(&msg, SOL_IPV6, IPV6_PKTINFO, sizeof(src_info), &src_info); } if (np->rxopt.bits.rxhlim) { int hlim = READ_ONCE(np->mcast_hops); put_cmsg(&msg, SOL_IPV6, IPV6_HOPLIMIT, sizeof(hlim), &hlim); } if (np->rxopt.bits.rxtclass) { int tclass = (int)ip6_tclass(np->rcv_flowinfo); put_cmsg(&msg, SOL_IPV6, IPV6_TCLASS, sizeof(tclass), &tclass); } if (np->rxopt.bits.rxoinfo) { int mcast_oif = READ_ONCE(np->mcast_oif); struct in6_pktinfo src_info; src_info.ipi6_ifindex = mcast_oif ? : np->sticky_pktinfo.ipi6_ifindex; src_info.ipi6_addr = mcast_oif ? sk->sk_v6_daddr : np->sticky_pktinfo.ipi6_addr; put_cmsg(&msg, SOL_IPV6, IPV6_2292PKTINFO, sizeof(src_info), &src_info); } if (np->rxopt.bits.rxohlim) { int hlim = READ_ONCE(np->mcast_hops); put_cmsg(&msg, SOL_IPV6, IPV6_2292HOPLIMIT, sizeof(hlim), &hlim); } if (np->rxopt.bits.rxflow) { __be32 flowinfo = np->rcv_flowinfo; put_cmsg(&msg, SOL_IPV6, IPV6_FLOWINFO, sizeof(flowinfo), &flowinfo); } } len -= msg.msg_controllen; return copy_to_sockptr(optlen, &len, sizeof(int)); } case IPV6_MTU: { struct dst_entry *dst; val = 0; rcu_read_lock(); dst = __sk_dst_get(sk); if (dst) val = dst6_mtu(dst); rcu_read_unlock(); if (!val) return -ENOTCONN; break; } case IPV6_V6ONLY: val = sk->sk_ipv6only; break; case IPV6_RECVPKTINFO: val = np->rxopt.bits.rxinfo; break; case IPV6_2292PKTINFO: val = np->rxopt.bits.rxoinfo; break; case IPV6_RECVHOPLIMIT: val = np->rxopt.bits.rxhlim; break; case IPV6_2292HOPLIMIT: val = np->rxopt.bits.rxohlim; break; case IPV6_RECVRTHDR: val = np->rxopt.bits.srcrt; break; case IPV6_2292RTHDR: val = np->rxopt.bits.osrcrt; break; case IPV6_HOPOPTS: case IPV6_RTHDRDSTOPTS: case IPV6_RTHDR: case IPV6_DSTOPTS: { struct ipv6_txoptions *opt; sockopt_lock_sock(sk); opt = rcu_derefer