Total coverage: 111044 (7%) of 1824589
1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 // SPDX-License-Identifier: LGPL-2.1 /* * A V4L2 frontend for the FWHT codec * * Copyright 2018 Cisco Systems, Inc. and/or its affiliates. All rights reserved. 
*/ #include <linux/errno.h> #include <linux/string.h> #include <linux/videodev2.h> #include "codec-v4l2-fwht.h" static const struct v4l2_fwht_pixfmt_info v4l2_fwht_pixfmts[] = { { V4L2_PIX_FMT_YUV420, 1, 3, 2, 1, 1, 2, 2, 3, 3, V4L2_FWHT_FL_PIXENC_YUV}, { V4L2_PIX_FMT_YVU420, 1, 3, 2, 1, 1, 2, 2, 3, 3, V4L2_FWHT_FL_PIXENC_YUV}, { V4L2_PIX_FMT_YUV422P, 1, 2, 1, 1, 1, 2, 1, 3, 3, V4L2_FWHT_FL_PIXENC_YUV}, { V4L2_PIX_FMT_NV12, 1, 3, 2, 1, 2, 2, 2, 3, 2, V4L2_FWHT_FL_PIXENC_YUV}, { V4L2_PIX_FMT_NV21, 1, 3, 2, 1, 2, 2, 2, 3, 2, V4L2_FWHT_FL_PIXENC_YUV}, { V4L2_PIX_FMT_NV16, 1, 2, 1, 1, 2, 2, 1, 3, 2, V4L2_FWHT_FL_PIXENC_YUV}, { V4L2_PIX_FMT_NV61, 1, 2, 1, 1, 2, 2, 1, 3, 2, V4L2_FWHT_FL_PIXENC_YUV}, { V4L2_PIX_FMT_NV24, 1, 3, 1, 1, 2, 1, 1, 3, 2, V4L2_FWHT_FL_PIXENC_YUV}, { V4L2_PIX_FMT_NV42, 1, 3, 1, 1, 2, 1, 1, 3, 2, V4L2_FWHT_FL_PIXENC_YUV}, { V4L2_PIX_FMT_YUYV, 2, 2, 1, 2, 4, 2, 1, 3, 1, V4L2_FWHT_FL_PIXENC_YUV}, { V4L2_PIX_FMT_YVYU, 2, 2, 1, 2, 4, 2, 1, 3, 1, V4L2_FWHT_FL_PIXENC_YUV}, { V4L2_PIX_FMT_UYVY, 2, 2, 1, 2, 4, 2, 1, 3, 1, V4L2_FWHT_FL_PIXENC_YUV}, { V4L2_PIX_FMT_VYUY, 2, 2, 1, 2, 4, 2, 1, 3, 1, V4L2_FWHT_FL_PIXENC_YUV}, { V4L2_PIX_FMT_BGR24, 3, 3, 1, 3, 3, 1, 1, 3, 1, V4L2_FWHT_FL_PIXENC_RGB}, { V4L2_PIX_FMT_RGB24, 3, 3, 1, 3, 3, 1, 1, 3, 1, V4L2_FWHT_FL_PIXENC_RGB}, { V4L2_PIX_FMT_HSV24, 3, 3, 1, 3, 3, 1, 1, 3, 1, V4L2_FWHT_FL_PIXENC_HSV}, { V4L2_PIX_FMT_BGR32, 4, 4, 1, 4, 4, 1, 1, 4, 1, V4L2_FWHT_FL_PIXENC_RGB}, { V4L2_PIX_FMT_XBGR32, 4, 4, 1, 4, 4, 1, 1, 4, 1, V4L2_FWHT_FL_PIXENC_RGB}, { V4L2_PIX_FMT_ABGR32, 4, 4, 1, 4, 4, 1, 1, 4, 1, V4L2_FWHT_FL_PIXENC_RGB}, { V4L2_PIX_FMT_RGB32, 4, 4, 1, 4, 4, 1, 1, 4, 1, V4L2_FWHT_FL_PIXENC_RGB}, { V4L2_PIX_FMT_XRGB32, 4, 4, 1, 4, 4, 1, 1, 4, 1, V4L2_FWHT_FL_PIXENC_RGB}, { V4L2_PIX_FMT_ARGB32, 4, 4, 1, 4, 4, 1, 1, 4, 1, V4L2_FWHT_FL_PIXENC_RGB}, { V4L2_PIX_FMT_BGRX32, 4, 4, 1, 4, 4, 1, 1, 4, 1, V4L2_FWHT_FL_PIXENC_RGB}, { V4L2_PIX_FMT_BGRA32, 4, 4, 1, 4, 4, 1, 1, 4, 1, V4L2_FWHT_FL_PIXENC_RGB}, { 
V4L2_PIX_FMT_RGBX32, 4, 4, 1, 4, 4, 1, 1, 4, 1, V4L2_FWHT_FL_PIXENC_RGB}, { V4L2_PIX_FMT_RGBA32, 4, 4, 1, 4, 4, 1, 1, 4, 1, V4L2_FWHT_FL_PIXENC_RGB}, { V4L2_PIX_FMT_HSV32, 4, 4, 1, 4, 4, 1, 1, 4, 1, V4L2_FWHT_FL_PIXENC_HSV}, { V4L2_PIX_FMT_GREY, 1, 1, 1, 1, 0, 1, 1, 1, 1, V4L2_FWHT_FL_PIXENC_RGB}, }; bool v4l2_fwht_validate_fmt(const struct v4l2_fwht_pixfmt_info *info, u32 width_div, u32 height_div, u32 components_num, u32 pixenc) { if (info->width_div == width_div && info->height_div == height_div && (!pixenc || info->pixenc == pixenc) && info->components_num == components_num) return true; return false; } const struct v4l2_fwht_pixfmt_info *v4l2_fwht_find_nth_fmt(u32 width_div, u32 height_div, u32 components_num, u32 pixenc, unsigned int start_idx) { unsigned int i; for (i = 0; i < ARRAY_SIZE(v4l2_fwht_pixfmts); i++) { bool is_valid = v4l2_fwht_validate_fmt(&v4l2_fwht_pixfmts[i], width_div, height_div, components_num, pixenc); if (is_valid) { if (start_idx == 0) return v4l2_fwht_pixfmts + i; start_idx--; } } return NULL; } const struct v4l2_fwht_pixfmt_info *v4l2_fwht_find_pixfmt(u32 pixelformat) { unsigned int i; for (i = 0; i < ARRAY_SIZE(v4l2_fwht_pixfmts); i++) if (v4l2_fwht_pixfmts[i].id == pixelformat) return v4l2_fwht_pixfmts + i; return NULL; } const struct v4l2_fwht_pixfmt_info *v4l2_fwht_get_pixfmt(u32 idx) { if (idx >= ARRAY_SIZE(v4l2_fwht_pixfmts)) return NULL; return v4l2_fwht_pixfmts + idx; } static int prepare_raw_frame(struct fwht_raw_frame *rf, const struct v4l2_fwht_pixfmt_info *info, u8 *buf, unsigned int size) { rf->luma = buf; rf->width_div = info->width_div; rf->height_div = info->height_div; rf->luma_alpha_step = info->luma_alpha_step; rf->chroma_step = info->chroma_step; rf->alpha = NULL; rf->components_num = info->components_num; /* * The buffer is NULL if it is the reference * frame of an I-frame in the stateless decoder */ if (!buf) { rf->luma = NULL; rf->cb = NULL; rf->cr = NULL; rf->alpha = NULL; return 0; } switch (info->id) { case 
V4L2_PIX_FMT_GREY: rf->cb = NULL; rf->cr = NULL; break; case V4L2_PIX_FMT_YUV420: rf->cb = rf->luma + size; rf->cr = rf->cb + size / 4; break; case V4L2_PIX_FMT_YVU420: rf->cr = rf->luma + size; rf->cb = rf->cr + size / 4; break; case V4L2_PIX_FMT_YUV422P: rf->cb = rf->luma + size; rf->cr = rf->cb + size / 2; break; case V4L2_PIX_FMT_NV12: case V4L2_PIX_FMT_NV16: case V4L2_PIX_FMT_NV24: rf->cb = rf->luma + size; rf->cr = rf->cb + 1; break; case V4L2_PIX_FMT_NV21: case V4L2_PIX_FMT_NV61: case V4L2_PIX_FMT_NV42: rf->cr = rf->luma + size; rf->cb = rf->cr + 1; break; case V4L2_PIX_FMT_YUYV: rf->cb = rf->luma + 1; rf->cr = rf->cb + 2; break; case V4L2_PIX_FMT_YVYU: rf->cr = rf->luma + 1; rf->cb = rf->cr + 2; break; case V4L2_PIX_FMT_UYVY: rf->cb = rf->luma; rf->cr = rf->cb + 2; rf->luma++; break; case V4L2_PIX_FMT_VYUY: rf->cr = rf->luma; rf->cb = rf->cr + 2; rf->luma++; break; case V4L2_PIX_FMT_RGB24: case V4L2_PIX_FMT_HSV24: rf->cr = rf->luma; rf->cb = rf->cr + 2; rf->luma++; break; case V4L2_PIX_FMT_BGR24: rf->cb = rf->luma; rf->cr = rf->cb + 2; rf->luma++; break; case V4L2_PIX_FMT_RGB32: case V4L2_PIX_FMT_XRGB32: case V4L2_PIX_FMT_HSV32: case V4L2_PIX_FMT_ARGB32: rf->alpha = rf->luma; rf->cr = rf->luma + 1; rf->cb = rf->cr + 2; rf->luma += 2; break; case V4L2_PIX_FMT_BGR32: case V4L2_PIX_FMT_XBGR32: case V4L2_PIX_FMT_ABGR32: rf->cb = rf->luma; rf->cr = rf->cb + 2; rf->luma++; rf->alpha = rf->cr + 1; break; case V4L2_PIX_FMT_BGRX32: case V4L2_PIX_FMT_BGRA32: rf->alpha = rf->luma; rf->cb = rf->luma + 1; rf->cr = rf->cb + 2; rf->luma += 2; break; case V4L2_PIX_FMT_RGBX32: case V4L2_PIX_FMT_RGBA32: rf->alpha = rf->luma + 3; rf->cr = rf->luma; rf->cb = rf->cr + 2; rf->luma++; break; default: return -EINVAL; } return 0; } int v4l2_fwht_encode(struct v4l2_fwht_state *state, u8 *p_in, u8 *p_out) { unsigned int size = state->stride * state->coded_height; unsigned int chroma_stride = state->stride; const struct v4l2_fwht_pixfmt_info *info = state->info; struct fwht_cframe_hdr 
*p_hdr; struct fwht_cframe cf; struct fwht_raw_frame rf; u32 encoding; u32 flags = 0; if (!info) return -EINVAL; if (prepare_raw_frame(&rf, info, p_in, size)) return -EINVAL; if (info->planes_num == 3) chroma_stride /= 2; if (info->id == V4L2_PIX_FMT_NV24 || info->id == V4L2_PIX_FMT_NV42) chroma_stride *= 2; cf.i_frame_qp = state->i_frame_qp; cf.p_frame_qp = state->p_frame_qp; cf.rlc_data = (__be16 *)(p_out + sizeof(*p_hdr)); encoding = fwht_encode_frame(&rf, &state->ref_frame, &cf, !state->gop_cnt, state->gop_cnt == state->gop_size - 1, state->visible_width, state->visible_height, state->stride, chroma_stride); if (!(encoding & FWHT_FRAME_PCODED)) state->gop_cnt = 0; if (++state->gop_cnt >= state->gop_size) state->gop_cnt = 0; p_hdr = (struct fwht_cframe_hdr *)p_out; p_hdr->magic1 = FWHT_MAGIC1; p_hdr->magic2 = FWHT_MAGIC2; p_hdr->version = htonl(V4L2_FWHT_VERSION); p_hdr->width = htonl(state->visible_width); p_hdr->height = htonl(state->visible_height); flags |= (info->components_num - 1) << V4L2_FWHT_FL_COMPONENTS_NUM_OFFSET; flags |= info->pixenc; if (encoding & FWHT_LUMA_UNENCODED) flags |= V4L2_FWHT_FL_LUMA_IS_UNCOMPRESSED; if (encoding & FWHT_CB_UNENCODED) flags |= V4L2_FWHT_FL_CB_IS_UNCOMPRESSED; if (encoding & FWHT_CR_UNENCODED) flags |= V4L2_FWHT_FL_CR_IS_UNCOMPRESSED; if (encoding & FWHT_ALPHA_UNENCODED) flags |= V4L2_FWHT_FL_ALPHA_IS_UNCOMPRESSED; if (!(encoding & FWHT_FRAME_PCODED)) flags |= V4L2_FWHT_FL_I_FRAME; if (rf.height_div == 1) flags |= V4L2_FWHT_FL_CHROMA_FULL_HEIGHT; if (rf.width_div == 1) flags |= V4L2_FWHT_FL_CHROMA_FULL_WIDTH; p_hdr->flags = htonl(flags); p_hdr->colorspace = htonl(state->colorspace); p_hdr->xfer_func = htonl(state->xfer_func); p_hdr->ycbcr_enc = htonl(state->ycbcr_enc); p_hdr->quantization = htonl(state->quantization); p_hdr->size = htonl(cf.size); return cf.size + sizeof(*p_hdr); } int v4l2_fwht_decode(struct v4l2_fwht_state *state, u8 *p_in, u8 *p_out) { u32 flags; struct fwht_cframe cf; unsigned int components_num = 3; 
unsigned int version; const struct v4l2_fwht_pixfmt_info *info; unsigned int hdr_width_div, hdr_height_div; struct fwht_raw_frame dst_rf; unsigned int dst_chroma_stride = state->stride; unsigned int ref_chroma_stride = state->ref_stride; unsigned int dst_size = state->stride * state->coded_height; unsigned int ref_size; if (!state->info) return -EINVAL; info = state->info; version = ntohl(state->header.version); if (!version || version > V4L2_FWHT_VERSION) { pr_err("version %d is not supported, current version is %d\n", version, V4L2_FWHT_VERSION); return -EINVAL; } if (state->header.magic1 != FWHT_MAGIC1 || state->header.magic2 != FWHT_MAGIC2) return -EINVAL; /* TODO: support resolution changes */ if (ntohl(state->header.width) != state->visible_width || ntohl(state->header.height) != state->visible_height) return -EINVAL; flags = ntohl(state->header.flags); if (version >= 2) { if ((flags & V4L2_FWHT_FL_PIXENC_MSK) != info->pixenc) return -EINVAL; components_num = 1 + ((flags & V4L2_FWHT_FL_COMPONENTS_NUM_MSK) >> V4L2_FWHT_FL_COMPONENTS_NUM_OFFSET); } if (components_num != info->components_num) return -EINVAL; state->colorspace = ntohl(state->header.colorspace); state->xfer_func = ntohl(state->header.xfer_func); state->ycbcr_enc = ntohl(state->header.ycbcr_enc); state->quantization = ntohl(state->header.quantization); cf.rlc_data = (__be16 *)p_in; cf.size = ntohl(state->header.size); hdr_width_div = (flags & V4L2_FWHT_FL_CHROMA_FULL_WIDTH) ? 1 : 2; hdr_height_div = (flags & V4L2_FWHT_FL_CHROMA_FULL_HEIGHT) ? 
1 : 2; if (hdr_width_div != info->width_div || hdr_height_div != info->height_div) return -EINVAL; if (prepare_raw_frame(&dst_rf, info, p_out, dst_size)) return -EINVAL; if (info->planes_num == 3) { dst_chroma_stride /= 2; ref_chroma_stride /= 2; } if (info->id == V4L2_PIX_FMT_NV24 || info->id == V4L2_PIX_FMT_NV42) { dst_chroma_stride *= 2; ref_chroma_stride *= 2; } ref_size = state->ref_stride * state->coded_height; if (prepare_raw_frame(&state->ref_frame, info, state->ref_frame.buf, ref_size)) return -EINVAL; if (!fwht_decode_frame(&cf, flags, components_num, state->visible_width, state->visible_height, &state->ref_frame, state->ref_stride, ref_chroma_stride, &dst_rf, state->stride, dst_chroma_stride)) return -EINVAL; return 0; }
46 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 /* SPDX-License-Identifier: GPL-2.0 */ /* * Mutexes: blocking mutual exclusion locks * * started by Ingo Molnar: * * Copyright (C) 2004, 2005, 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> * * This file contains the main data structure and API definitions. */ #ifndef __LINUX_MUTEX_H #define __LINUX_MUTEX_H #include <asm/current.h> #include <linux/list.h> #include <linux/spinlock_types.h> #include <linux/lockdep.h> #include <linux/atomic.h> #include <asm/processor.h> #include <linux/osq_lock.h> #include <linux/debug_locks.h> #include <linux/cleanup.h> #include <linux/mutex_types.h> struct device; #ifdef CONFIG_DEBUG_LOCK_ALLOC # define __DEP_MAP_MUTEX_INITIALIZER(lockname) \ , .dep_map = { \ .name = #lockname, \ .wait_type_inner = LD_WAIT_SLEEP, \ } #else # define __DEP_MAP_MUTEX_INITIALIZER(lockname) #endif #ifdef CONFIG_DEBUG_MUTEXES # define __DEBUG_MUTEX_INITIALIZER(lockname) \ , .magic = &lockname extern void mutex_destroy(struct mutex *lock); #else # define __DEBUG_MUTEX_INITIALIZER(lockname) static inline void mutex_destroy(struct mutex *lock) {} #endif /** * mutex_init - initialize the mutex * @mutex: the mutex to be initialized * * Initialize the mutex to unlocked state. * * It is not allowed to initialize an already locked mutex. 
*/ #define mutex_init(mutex) \ do { \ static struct lock_class_key __key; \ \ __mutex_init((mutex), #mutex, &__key); \ } while (0) /** * mutex_init_with_key - initialize a mutex with a given lockdep key * @mutex: the mutex to be initialized * @key: the lockdep key to be associated with the mutex * * Initialize the mutex to the unlocked state. * * It is not allowed to initialize an already locked mutex. */ #define mutex_init_with_key(mutex, key) __mutex_init((mutex), #mutex, (key)) #ifndef CONFIG_PREEMPT_RT #define __MUTEX_INITIALIZER(lockname) \ { .owner = ATOMIC_LONG_INIT(0) \ , .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(lockname.wait_lock) \ , .wait_list = LIST_HEAD_INIT(lockname.wait_list) \ __DEBUG_MUTEX_INITIALIZER(lockname) \ __DEP_MAP_MUTEX_INITIALIZER(lockname) } #define DEFINE_MUTEX(mutexname) \ struct mutex mutexname = __MUTEX_INITIALIZER(mutexname) extern void __mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key); /** * mutex_is_locked - is the mutex locked * @lock: the mutex to be queried * * Returns true if the mutex is locked, false if unlocked. */ extern bool mutex_is_locked(struct mutex *lock); #else /* !CONFIG_PREEMPT_RT */ /* * Preempt-RT variant based on rtmutexes. 
*/ #define __MUTEX_INITIALIZER(mutexname) \ { \ .rtmutex = __RT_MUTEX_BASE_INITIALIZER(mutexname.rtmutex) \ __DEP_MAP_MUTEX_INITIALIZER(mutexname) \ } #define DEFINE_MUTEX(mutexname) \ struct mutex mutexname = __MUTEX_INITIALIZER(mutexname) extern void __mutex_rt_init(struct mutex *lock, const char *name, struct lock_class_key *key); #define mutex_is_locked(l) rt_mutex_base_is_locked(&(l)->rtmutex) #define __mutex_init(mutex, name, key) \ do { \ rt_mutex_base_init(&(mutex)->rtmutex); \ __mutex_rt_init((mutex), name, key); \ } while (0) #endif /* CONFIG_PREEMPT_RT */ #ifdef CONFIG_DEBUG_MUTEXES int __devm_mutex_init(struct device *dev, struct mutex *lock); #else static inline int __devm_mutex_init(struct device *dev, struct mutex *lock) { /* * When CONFIG_DEBUG_MUTEXES is off mutex_destroy() is just a nop so * no really need to register it in the devm subsystem. */ return 0; } #endif #define devm_mutex_init(dev, mutex) \ ({ \ typeof(mutex) mutex_ = (mutex); \ \ mutex_init(mutex_); \ __devm_mutex_init(dev, mutex_); \ }) /* * See kernel/locking/mutex.c for detailed documentation of these APIs. * Also see Documentation/locking/mutex-design.rst. 
*/ #ifdef CONFIG_DEBUG_LOCK_ALLOC extern void mutex_lock_nested(struct mutex *lock, unsigned int subclass); extern void _mutex_lock_nest_lock(struct mutex *lock, struct lockdep_map *nest_lock); extern int __must_check mutex_lock_interruptible_nested(struct mutex *lock, unsigned int subclass); extern int __must_check mutex_lock_killable_nested(struct mutex *lock, unsigned int subclass); extern void mutex_lock_io_nested(struct mutex *lock, unsigned int subclass); #define mutex_lock(lock) mutex_lock_nested(lock, 0) #define mutex_lock_interruptible(lock) mutex_lock_interruptible_nested(lock, 0) #define mutex_lock_killable(lock) mutex_lock_killable_nested(lock, 0) #define mutex_lock_io(lock) mutex_lock_io_nested(lock, 0) #define mutex_lock_nest_lock(lock, nest_lock) \ do { \ typecheck(struct lockdep_map *, &(nest_lock)->dep_map); \ _mutex_lock_nest_lock(lock, &(nest_lock)->dep_map); \ } while (0) #else extern void mutex_lock(struct mutex *lock); extern int __must_check mutex_lock_interruptible(struct mutex *lock); extern int __must_check mutex_lock_killable(struct mutex *lock); extern void mutex_lock_io(struct mutex *lock); # define mutex_lock_nested(lock, subclass) mutex_lock(lock) # define mutex_lock_interruptible_nested(lock, subclass) mutex_lock_interruptible(lock) # define mutex_lock_killable_nested(lock, subclass) mutex_lock_killable(lock) # define mutex_lock_nest_lock(lock, nest_lock) mutex_lock(lock) # define mutex_lock_io_nested(lock, subclass) mutex_lock_io(lock) #endif /* * NOTE: mutex_trylock() follows the spin_trylock() convention, * not the down_trylock() convention! * * Returns 1 if the mutex has been acquired successfully, and 0 on contention. 
*/ extern int mutex_trylock(struct mutex *lock); extern void mutex_unlock(struct mutex *lock); extern int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock); DEFINE_GUARD(mutex, struct mutex *, mutex_lock(_T), mutex_unlock(_T)) DEFINE_GUARD_COND(mutex, _try, mutex_trylock(_T)) DEFINE_GUARD_COND(mutex, _intr, mutex_lock_interruptible(_T) == 0) #endif /* __LINUX_MUTEX_H */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __NET_UDP_TUNNEL_H #define __NET_UDP_TUNNEL_H #include <net/ip_tunnels.h> #include <net/udp.h> #if IS_ENABLED(CONFIG_IPV6) #include <net/ipv6.h> #include <net/ipv6_stubs.h> #endif struct udp_port_cfg { u8 family; /* Used only for kernel-created sockets */ union { struct in_addr local_ip; #if IS_ENABLED(CONFIG_IPV6) struct in6_addr local_ip6; #endif }; union { struct in_addr peer_ip; #if IS_ENABLED(CONFIG_IPV6) struct in6_addr peer_ip6; #endif }; __be16 local_udp_port; 
__be16 peer_udp_port; int bind_ifindex; unsigned int use_udp_checksums:1, use_udp6_tx_checksums:1, use_udp6_rx_checksums:1, ipv6_v6only:1; }; int udp_sock_create4(struct net *net, struct udp_port_cfg *cfg, struct socket **sockp); #if IS_ENABLED(CONFIG_IPV6) int udp_sock_create6(struct net *net, struct udp_port_cfg *cfg, struct socket **sockp); #else static inline int udp_sock_create6(struct net *net, struct udp_port_cfg *cfg, struct socket **sockp) { return 0; } #endif static inline int udp_sock_create(struct net *net, struct udp_port_cfg *cfg, struct socket **sockp) { if (cfg->family == AF_INET) return udp_sock_create4(net, cfg, sockp); if (cfg->family == AF_INET6) return udp_sock_create6(net, cfg, sockp); return -EPFNOSUPPORT; } typedef int (*udp_tunnel_encap_rcv_t)(struct sock *sk, struct sk_buff *skb); typedef int (*udp_tunnel_encap_err_lookup_t)(struct sock *sk, struct sk_buff *skb); typedef void (*udp_tunnel_encap_err_rcv_t)(struct sock *sk, struct sk_buff *skb, int err, __be16 port, u32 info, u8 *payload); typedef void (*udp_tunnel_encap_destroy_t)(struct sock *sk); typedef struct sk_buff *(*udp_tunnel_gro_receive_t)(struct sock *sk, struct list_head *head, struct sk_buff *skb); typedef int (*udp_tunnel_gro_complete_t)(struct sock *sk, struct sk_buff *skb, int nhoff); struct udp_tunnel_sock_cfg { void *sk_user_data; /* user data used by encap_rcv call back */ /* Used for setting up udp_sock fields, see udp.h for details */ __u8 encap_type; udp_tunnel_encap_rcv_t encap_rcv; udp_tunnel_encap_err_lookup_t encap_err_lookup; udp_tunnel_encap_err_rcv_t encap_err_rcv; udp_tunnel_encap_destroy_t encap_destroy; udp_tunnel_gro_receive_t gro_receive; udp_tunnel_gro_complete_t gro_complete; }; /* Setup the given (UDP) sock to receive UDP encapsulated packets */ void setup_udp_tunnel_sock(struct net *net, struct socket *sock, struct udp_tunnel_sock_cfg *sock_cfg); /* -- List of parsable UDP tunnel types -- * * Adding to this list will result in serious debate. 
The main issue is * that this list is essentially a list of workarounds for either poorly * designed tunnels, or poorly designed device offloads. * * The parsing supported via these types should really be used for Rx * traffic only as the network stack will have already inserted offsets for * the location of the headers in the skb. In addition any ports that are * pushed should be kept within the namespace without leaking to other * devices such as VFs or other ports on the same device. * * It is strongly encouraged to use CHECKSUM_COMPLETE for Rx to avoid the * need to use this for Rx checksum offload. It should not be necessary to * call this function to perform Tx offloads on outgoing traffic. */ enum udp_parsable_tunnel_type { UDP_TUNNEL_TYPE_VXLAN = BIT(0), /* RFC 7348 */ UDP_TUNNEL_TYPE_GENEVE = BIT(1), /* draft-ietf-nvo3-geneve */ UDP_TUNNEL_TYPE_VXLAN_GPE = BIT(2), /* draft-ietf-nvo3-vxlan-gpe */ }; struct udp_tunnel_info { unsigned short type; sa_family_t sa_family; __be16 port; u8 hw_priv; }; /* Notify network devices of offloadable types */ void udp_tunnel_push_rx_port(struct net_device *dev, struct socket *sock, unsigned short type); void udp_tunnel_drop_rx_port(struct net_device *dev, struct socket *sock, unsigned short type); void udp_tunnel_notify_add_rx_port(struct socket *sock, unsigned short type); void udp_tunnel_notify_del_rx_port(struct socket *sock, unsigned short type); static inline void udp_tunnel_get_rx_info(struct net_device *dev) { ASSERT_RTNL(); if (!(dev->features & NETIF_F_RX_UDP_TUNNEL_PORT)) return; call_netdevice_notifiers(NETDEV_UDP_TUNNEL_PUSH_INFO, dev); } static inline void udp_tunnel_drop_rx_info(struct net_device *dev) { ASSERT_RTNL(); if (!(dev->features & NETIF_F_RX_UDP_TUNNEL_PORT)) return; call_netdevice_notifiers(NETDEV_UDP_TUNNEL_DROP_INFO, dev); } /* Transmit the skb using UDP encapsulation. 
*/ void udp_tunnel_xmit_skb(struct rtable *rt, struct sock *sk, struct sk_buff *skb, __be32 src, __be32 dst, __u8 tos, __u8 ttl, __be16 df, __be16 src_port, __be16 dst_port, bool xnet, bool nocheck); int udp_tunnel6_xmit_skb(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb, struct net_device *dev, const struct in6_addr *saddr, const struct in6_addr *daddr, __u8 prio, __u8 ttl, __be32 label, __be16 src_port, __be16 dst_port, bool nocheck); void udp_tunnel_sock_release(struct socket *sock); struct rtable *udp_tunnel_dst_lookup(struct sk_buff *skb, struct net_device *dev, struct net *net, int oif, __be32 *saddr, const struct ip_tunnel_key *key, __be16 sport, __be16 dport, u8 tos, struct dst_cache *dst_cache); struct dst_entry *udp_tunnel6_dst_lookup(struct sk_buff *skb, struct net_device *dev, struct net *net, struct socket *sock, int oif, struct in6_addr *saddr, const struct ip_tunnel_key *key, __be16 sport, __be16 dport, u8 dsfield, struct dst_cache *dst_cache); struct metadata_dst *udp_tun_rx_dst(struct sk_buff *skb, unsigned short family, const unsigned long *flags, __be64 tunnel_id, int md_size); #ifdef CONFIG_INET static inline int udp_tunnel_handle_offloads(struct sk_buff *skb, bool udp_csum) { int type = udp_csum ? SKB_GSO_UDP_TUNNEL_CSUM : SKB_GSO_UDP_TUNNEL; return iptunnel_handle_offloads(skb, type); } #endif static inline void udp_tunnel_encap_enable(struct sock *sk) { if (udp_test_and_set_bit(ENCAP_ENABLED, sk)) return; #if IS_ENABLED(CONFIG_IPV6) if (READ_ONCE(sk->sk_family) == PF_INET6) ipv6_stub->udpv6_encap_enable(); #endif udp_encap_enable(); } #define UDP_TUNNEL_NIC_MAX_TABLES 4 enum udp_tunnel_nic_info_flags { /* Device callbacks may sleep */ UDP_TUNNEL_NIC_INFO_MAY_SLEEP = BIT(0), /* Device only supports offloads when it's open, all ports * will be removed before close and re-added after open. 
*/ UDP_TUNNEL_NIC_INFO_OPEN_ONLY = BIT(1), /* Device supports only IPv4 tunnels */ UDP_TUNNEL_NIC_INFO_IPV4_ONLY = BIT(2), /* Device has hard-coded the IANA VXLAN port (4789) as VXLAN. * This port must not be counted towards n_entries of any table. * Driver will not receive any callback associated with port 4789. */ UDP_TUNNEL_NIC_INFO_STATIC_IANA_VXLAN = BIT(3), }; struct udp_tunnel_nic; #define UDP_TUNNEL_NIC_MAX_SHARING_DEVICES (U16_MAX / 2) struct udp_tunnel_nic_shared { struct udp_tunnel_nic *udp_tunnel_nic_info; struct list_head devices; }; struct udp_tunnel_nic_shared_node { struct net_device *dev; struct list_head list; }; /** * struct udp_tunnel_nic_info - driver UDP tunnel offload information * @set_port: callback for adding a new port * @unset_port: callback for removing a port * @sync_table: callback for syncing the entire port table at once * @shared: reference to device global state (optional) * @flags: device flags from enum udp_tunnel_nic_info_flags * @tables: UDP port tables this device has * @tables.n_entries: number of entries in this table * @tables.tunnel_types: types of tunnels this table accepts * * Drivers are expected to provide either @set_port and @unset_port callbacks * or the @sync_table callback. Callbacks are invoked with rtnl lock held. * * Devices which (misguidedly) share the UDP tunnel port table across multiple * netdevs should allocate an instance of struct udp_tunnel_nic_shared and * point @shared at it. * There must never be more than %UDP_TUNNEL_NIC_MAX_SHARING_DEVICES devices * sharing a table. * * Known limitations: * - UDP tunnel port notifications are fundamentally best-effort - * it is likely the driver will both see skbs which use a UDP tunnel port, * while not being a tunneled skb, and tunnel skbs from other ports - * drivers should only use these ports for non-critical RX-side offloads, * e.g. the checksum offload; * - none of the devices care about the socket family at present, so we don't * track it. 
Please extend this code if you care. */ struct udp_tunnel_nic_info { /* one-by-one */ int (*set_port)(struct net_device *dev, unsigned int table, unsigned int entry, struct udp_tunnel_info *ti); int (*unset_port)(struct net_device *dev, unsigned int table, unsigned int entry, struct udp_tunnel_info *ti); /* all at once */ int (*sync_table)(struct net_device *dev, unsigned int table); struct udp_tunnel_nic_shared *shared; unsigned int flags; struct udp_tunnel_nic_table_info { unsigned int n_entries; unsigned int tunnel_types; } tables[UDP_TUNNEL_NIC_MAX_TABLES]; }; /* UDP tunnel module dependencies * * Tunnel drivers are expected to have a hard dependency on the udp_tunnel * module. NIC drivers are not, they just attach their * struct udp_tunnel_nic_info to the netdev and wait for callbacks to come. * Loading a tunnel driver will cause the udp_tunnel module to be loaded * and only then will all the required state structures be allocated. * Since we want a weak dependency from the drivers and the core to udp_tunnel * we call things through the following stubs. 
*/ struct udp_tunnel_nic_ops { void (*get_port)(struct net_device *dev, unsigned int table, unsigned int idx, struct udp_tunnel_info *ti); void (*set_port_priv)(struct net_device *dev, unsigned int table, unsigned int idx, u8 priv); void (*add_port)(struct net_device *dev, struct udp_tunnel_info *ti); void (*del_port)(struct net_device *dev, struct udp_tunnel_info *ti); void (*reset_ntf)(struct net_device *dev); size_t (*dump_size)(struct net_device *dev, unsigned int table); int (*dump_write)(struct net_device *dev, unsigned int table, struct sk_buff *skb); }; #ifdef CONFIG_INET extern const struct udp_tunnel_nic_ops *udp_tunnel_nic_ops; #else #define udp_tunnel_nic_ops ((struct udp_tunnel_nic_ops *)NULL) #endif static inline void udp_tunnel_nic_get_port(struct net_device *dev, unsigned int table, unsigned int idx, struct udp_tunnel_info *ti) { /* This helper is used from .sync_table, we indicate empty entries * by zero'ed @ti. Drivers which need to know the details of a port * when it gets deleted should use the .set_port / .unset_port * callbacks. * Zero out here, otherwise !CONFIG_INET causes uninitilized warnings. 
*/ memset(ti, 0, sizeof(*ti)); if (udp_tunnel_nic_ops) udp_tunnel_nic_ops->get_port(dev, table, idx, ti); } static inline void udp_tunnel_nic_set_port_priv(struct net_device *dev, unsigned int table, unsigned int idx, u8 priv) { if (udp_tunnel_nic_ops) udp_tunnel_nic_ops->set_port_priv(dev, table, idx, priv); } static inline void udp_tunnel_nic_add_port(struct net_device *dev, struct udp_tunnel_info *ti) { if (!(dev->features & NETIF_F_RX_UDP_TUNNEL_PORT)) return; if (udp_tunnel_nic_ops) udp_tunnel_nic_ops->add_port(dev, ti); } static inline void udp_tunnel_nic_del_port(struct net_device *dev, struct udp_tunnel_info *ti) { if (!(dev->features & NETIF_F_RX_UDP_TUNNEL_PORT)) return; if (udp_tunnel_nic_ops) udp_tunnel_nic_ops->del_port(dev, ti); } /** * udp_tunnel_nic_reset_ntf() - device-originating reset notification * @dev: network interface device structure * * Called by the driver to inform the core that the entire UDP tunnel port * state has been lost, usually due to device reset. Core will assume device * forgot all the ports and issue .set_port and .sync_table callbacks as * necessary. * * This function must be called with rtnl lock held, and will issue all * the callbacks before returning. */ static inline void udp_tunnel_nic_reset_ntf(struct net_device *dev) { if (udp_tunnel_nic_ops) udp_tunnel_nic_ops->reset_ntf(dev); } static inline size_t udp_tunnel_nic_dump_size(struct net_device *dev, unsigned int table) { if (!udp_tunnel_nic_ops) return 0; return udp_tunnel_nic_ops->dump_size(dev, table); } static inline int udp_tunnel_nic_dump_write(struct net_device *dev, unsigned int table, struct sk_buff *skb) { if (!udp_tunnel_nic_ops) return 0; return udp_tunnel_nic_ops->dump_write(dev, table, skb); } #endif
144 131 145 145 31 1 11 100 30 33 26 12 12 11 1 142 126 133 141 2 2 142 141 2 92 3 44 21 1 142 140 2 63 12 13 13 7 73 73 2 141 144 1 144 144 137 41 2 46 141 141 8 133 21 120 13 2 1 13 131 131 44 43 32 3 18 15 9 7 3 8 7 3 3 1 1 11 11 4 3 4 4 39 38 30 31 37 3 17 5 5 5 4 5 2 2 1 1 2 2 11 11 2 11 8 8 3 14 14 5 1 3 27 9 8 12 29 31 29 10 38 12 12 5 11 1 11 11 11 14 14 39 31 29 39 31 29 30 31 39 39 39 39 12 11 6 9 10 1 5 1 1 1 1 3 2 1 4 4 4 12 4 3 11 11 1 3 11 4 11 5 12 12 7 7 13 13 6 12 12 12 16 16 16 13 4 3 12 1 12 13 5 3 2 2 5 5 25 13 10 13 16 24 30 30 30 30 24 24 24 24 13 16 24 10 10 10 1 9 13 25 30 10 24 30 30 7 7 2 7 5 7 7 5 7 7 5 7 9 5 5 5 7 9 9 7 5 5 9 9 9 9 9 9 9 7 7 7 7 7 7 98 73 73 73 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 
353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 
853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 
1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 
1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 
2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 // SPDX-License-Identifier: GPL-2.0+ /* * NILFS B-tree. * * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. * * Written by Koji Sato. 
*/ #include <linux/slab.h> #include <linux/string.h> #include <linux/errno.h> #include <linux/pagevec.h> #include "nilfs.h" #include "page.h" #include "btnode.h" #include "btree.h" #include "alloc.h" #include "dat.h" static void __nilfs_btree_init(struct nilfs_bmap *bmap); static struct nilfs_btree_path *nilfs_btree_alloc_path(void) { struct nilfs_btree_path *path; int level = NILFS_BTREE_LEVEL_DATA; path = kmem_cache_alloc(nilfs_btree_path_cache, GFP_NOFS); if (path == NULL) goto out; for (; level < NILFS_BTREE_LEVEL_MAX; level++) { path[level].bp_bh = NULL; path[level].bp_sib_bh = NULL; path[level].bp_index = 0; path[level].bp_oldreq.bpr_ptr = NILFS_BMAP_INVALID_PTR; path[level].bp_newreq.bpr_ptr = NILFS_BMAP_INVALID_PTR; path[level].bp_op = NULL; } out: return path; } static void nilfs_btree_free_path(struct nilfs_btree_path *path) { int level = NILFS_BTREE_LEVEL_DATA; for (; level < NILFS_BTREE_LEVEL_MAX; level++) brelse(path[level].bp_bh); kmem_cache_free(nilfs_btree_path_cache, path); } /* * B-tree node operations */ static int nilfs_btree_get_new_block(const struct nilfs_bmap *btree, __u64 ptr, struct buffer_head **bhp) { struct inode *btnc_inode = NILFS_BMAP_I(btree)->i_assoc_inode; struct address_space *btnc = btnc_inode->i_mapping; struct buffer_head *bh; bh = nilfs_btnode_create_block(btnc, ptr); if (IS_ERR(bh)) return PTR_ERR(bh); set_buffer_nilfs_volatile(bh); *bhp = bh; return 0; } static int nilfs_btree_node_get_flags(const struct nilfs_btree_node *node) { return node->bn_flags; } static void nilfs_btree_node_set_flags(struct nilfs_btree_node *node, int flags) { node->bn_flags = flags; } static int nilfs_btree_node_root(const struct nilfs_btree_node *node) { return nilfs_btree_node_get_flags(node) & NILFS_BTREE_NODE_ROOT; } static int nilfs_btree_node_get_level(const struct nilfs_btree_node *node) { return node->bn_level; } static void nilfs_btree_node_set_level(struct nilfs_btree_node *node, int level) { node->bn_level = level; } static int 
nilfs_btree_node_get_nchildren(const struct nilfs_btree_node *node) { return le16_to_cpu(node->bn_nchildren); } static void nilfs_btree_node_set_nchildren(struct nilfs_btree_node *node, int nchildren) { node->bn_nchildren = cpu_to_le16(nchildren); } static int nilfs_btree_node_size(const struct nilfs_bmap *btree) { return i_blocksize(btree->b_inode); } static int nilfs_btree_nchildren_per_block(const struct nilfs_bmap *btree) { return btree->b_nchildren_per_block; } static __le64 * nilfs_btree_node_dkeys(const struct nilfs_btree_node *node) { return (__le64 *)((char *)(node + 1) + (nilfs_btree_node_root(node) ? 0 : NILFS_BTREE_NODE_EXTRA_PAD_SIZE)); } static __le64 * nilfs_btree_node_dptrs(const struct nilfs_btree_node *node, int ncmax) { return (__le64 *)(nilfs_btree_node_dkeys(node) + ncmax); } static __u64 nilfs_btree_node_get_key(const struct nilfs_btree_node *node, int index) { return le64_to_cpu(*(nilfs_btree_node_dkeys(node) + index)); } static void nilfs_btree_node_set_key(struct nilfs_btree_node *node, int index, __u64 key) { *(nilfs_btree_node_dkeys(node) + index) = cpu_to_le64(key); } static __u64 nilfs_btree_node_get_ptr(const struct nilfs_btree_node *node, int index, int ncmax) { return le64_to_cpu(*(nilfs_btree_node_dptrs(node, ncmax) + index)); } static void nilfs_btree_node_set_ptr(struct nilfs_btree_node *node, int index, __u64 ptr, int ncmax) { *(nilfs_btree_node_dptrs(node, ncmax) + index) = cpu_to_le64(ptr); } static void nilfs_btree_node_init(struct nilfs_btree_node *node, int flags, int level, int nchildren, int ncmax, const __u64 *keys, const __u64 *ptrs) { __le64 *dkeys; __le64 *dptrs; int i; nilfs_btree_node_set_flags(node, flags); nilfs_btree_node_set_level(node, level); nilfs_btree_node_set_nchildren(node, nchildren); dkeys = nilfs_btree_node_dkeys(node); dptrs = nilfs_btree_node_dptrs(node, ncmax); for (i = 0; i < nchildren; i++) { dkeys[i] = cpu_to_le64(keys[i]); dptrs[i] = cpu_to_le64(ptrs[i]); } } /* Assume the buffer heads 
corresponding to left and right are locked. */
/*
 * Move the first @n entries (keys and pointers) of @right to the end of
 * @left, then close the gap in @right and update both child counts.
 * @lncmax and @rncmax are the key-slot counts used to locate the pointer
 * arrays of @left and @right respectively.
 */
static void nilfs_btree_node_move_left(struct nilfs_btree_node *left,
				       struct nilfs_btree_node *right,
				       int n, int lncmax, int rncmax)
{
	__le64 *ldkeys, *rdkeys;
	__le64 *ldptrs, *rdptrs;
	int lnchildren, rnchildren;

	ldkeys = nilfs_btree_node_dkeys(left);
	ldptrs = nilfs_btree_node_dptrs(left, lncmax);
	lnchildren = nilfs_btree_node_get_nchildren(left);

	rdkeys = nilfs_btree_node_dkeys(right);
	rdptrs = nilfs_btree_node_dptrs(right, rncmax);
	rnchildren = nilfs_btree_node_get_nchildren(right);

	/* Append the head of right to the tail of left (distinct nodes). */
	memcpy(ldkeys + lnchildren, rdkeys, n * sizeof(*rdkeys));
	memcpy(ldptrs + lnchildren, rdptrs, n * sizeof(*rdptrs));
	/* Slide the remaining entries of right down; ranges may overlap. */
	memmove(rdkeys, rdkeys + n, (rnchildren - n) * sizeof(*rdkeys));
	memmove(rdptrs, rdptrs + n, (rnchildren - n) * sizeof(*rdptrs));

	lnchildren += n;
	rnchildren -= n;
	nilfs_btree_node_set_nchildren(left, lnchildren);
	nilfs_btree_node_set_nchildren(right, rnchildren);
}

/*
 * Move the last @n entries (keys and pointers) of @left to the front of
 * @right and update both child counts.  @lncmax and @rncmax are the
 * key-slot counts used to locate the pointer arrays of @left and @right.
 *
 * Assume that the buffer heads corresponding to left and right are locked.
 */
static void nilfs_btree_node_move_right(struct nilfs_btree_node *left,
					struct nilfs_btree_node *right,
					int n, int lncmax, int rncmax)
{
	__le64 *ldkeys, *rdkeys;
	__le64 *ldptrs, *rdptrs;
	int lnchildren, rnchildren;

	ldkeys = nilfs_btree_node_dkeys(left);
	ldptrs = nilfs_btree_node_dptrs(left, lncmax);
	lnchildren = nilfs_btree_node_get_nchildren(left);

	rdkeys = nilfs_btree_node_dkeys(right);
	rdptrs = nilfs_btree_node_dptrs(right, rncmax);
	rnchildren = nilfs_btree_node_get_nchildren(right);

	/* Open an n-entry gap at the front of right; ranges may overlap. */
	memmove(rdkeys + n, rdkeys, rnchildren * sizeof(*rdkeys));
	memmove(rdptrs + n, rdptrs, rnchildren * sizeof(*rdptrs));
	/* Fill the gap with the last n entries of left. */
	memcpy(rdkeys, ldkeys + lnchildren - n, n * sizeof(*rdkeys));
	memcpy(rdptrs, ldptrs + lnchildren - n, n * sizeof(*rdptrs));

	lnchildren -= n;
	rnchildren += n;
	nilfs_btree_node_set_nchildren(left, lnchildren);
	nilfs_btree_node_set_nchildren(right, rnchildren);
}

/* Assume that the buffer head corresponding to node is locked.
*/
/*
 * Insert the pair (@key, @ptr) at position @index of @node, shifting the
 * entries at @index and beyond up by one slot, and increment the child
 * count.  @ncmax is the key-slot count used to locate the pointer array.
 */
static void nilfs_btree_node_insert(struct nilfs_btree_node *node, int index,
				    __u64 key, __u64 ptr, int ncmax)
{
	__le64 *dkeys;
	__le64 *dptrs;
	int nchildren;

	dkeys = nilfs_btree_node_dkeys(node);
	dptrs = nilfs_btree_node_dptrs(node, ncmax);
	nchildren = nilfs_btree_node_get_nchildren(node);
	/* Nothing to shift when appending at the end (index == nchildren). */
	if (index < nchildren) {
		memmove(dkeys + index + 1, dkeys + index,
			(nchildren - index) * sizeof(*dkeys));
		memmove(dptrs + index + 1, dptrs + index,
			(nchildren - index) * sizeof(*dptrs));
	}
	dkeys[index] = cpu_to_le64(key);
	dptrs[index] = cpu_to_le64(ptr);
	nchildren++;
	nilfs_btree_node_set_nchildren(node, nchildren);
}

/*
 * Remove the entry at position @index of @node, shifting later entries
 * down by one slot, and decrement the child count.  The removed key and
 * pointer are stored through @keyp and @ptrp when those are non-NULL.
 *
 * Assume that the buffer head corresponding to node is locked.
 */
static void nilfs_btree_node_delete(struct nilfs_btree_node *node, int index,
				    __u64 *keyp, __u64 *ptrp, int ncmax)
{
	__u64 key;
	__u64 ptr;
	__le64 *dkeys;
	__le64 *dptrs;
	int nchildren;

	dkeys = nilfs_btree_node_dkeys(node);
	dptrs = nilfs_btree_node_dptrs(node, ncmax);
	/* Capture the entry before it is overwritten by the shift below. */
	key = le64_to_cpu(dkeys[index]);
	ptr = le64_to_cpu(dptrs[index]);
	nchildren = nilfs_btree_node_get_nchildren(node);
	if (keyp != NULL)
		*keyp = key;
	if (ptrp != NULL)
		*ptrp = ptr;

	/* Nothing to shift when the last entry is being removed. */
	if (index < nchildren - 1) {
		memmove(dkeys + index, dkeys + index + 1,
			(nchildren - index - 1) * sizeof(*dkeys));
		memmove(dptrs + index, dptrs + index + 1,
			(nchildren - index - 1) * sizeof(*dptrs));
	}
	nchildren--;
	nilfs_btree_node_set_nchildren(node, nchildren);
}

/*
 * Binary-search the keys of @node for @key and store the resulting index
 * through @indexp.  Returns 1 when a key equal to @key was found and 0
 * otherwise.
 *
 * On a miss the index is adjusted according to the node level: above
 * NILFS_BTREE_LEVEL_NODE_MIN it steps back one slot when the last probed
 * key was greater than @key (and index > 0); at the lowest node level it
 * steps forward one slot when the last probed key was smaller than @key.
 */
static int nilfs_btree_node_lookup(const struct nilfs_btree_node *node,
				   __u64 key, int *indexp)
{
	__u64 nkey;
	int index, low, high, s;

	/* binary search */
	low = 0;
	high = nilfs_btree_node_get_nchildren(node) - 1;
	index = 0;
	/* s records the last comparison: 0 equal, -1 nkey < key, 1 nkey > key */
	s = 0;
	while (low <= high) {
		index = (low + high) / 2;
		nkey = nilfs_btree_node_get_key(node, index);
		if (nkey == key) {
			s = 0;
			goto out;
		} else if (nkey < key) {
			low = index + 1;
			s = -1;
		} else {
			high = index - 1;
			s = 1;
		}
	}

	/* adjust index */
	if (nilfs_btree_node_get_level(node) > NILFS_BTREE_LEVEL_NODE_MIN) {
		if (s > 0 && index > 0)
			index--;
	} else if (s < 0)
		index++;

 out:
	*indexp = index;

	return s == 0;
}

/**
 * nilfs_btree_node_broken - verify consistency of btree node
 * @node: btree node block to be examined
 * @size: node size (in bytes)
 * @inode: host inode of btree
 * @blocknr: block number
 *
 * A node is reported as broken when its level is outside
 * [NILFS_BTREE_LEVEL_NODE_MIN, NILFS_BTREE_LEVEL_MAX), when it carries the
 * root flag, or when its child count is not in
 * 1..NILFS_BTREE_NODE_NCHILDREN_MAX(@size).  A critical message is logged
 * in that case.
 *
 * Return: 0 if normal, 1 if the node is broken.
 */
static int nilfs_btree_node_broken(const struct nilfs_btree_node *node,
				   size_t size, struct inode *inode,
				   sector_t blocknr)
{
	int level, flags, nchildren;
	int ret = 0;

	level = nilfs_btree_node_get_level(node);
	flags = nilfs_btree_node_get_flags(node);
	nchildren = nilfs_btree_node_get_nchildren(node);

	if (unlikely(level < NILFS_BTREE_LEVEL_NODE_MIN ||
		     level >= NILFS_BTREE_LEVEL_MAX ||
		     (flags & NILFS_BTREE_NODE_ROOT) ||
		     nchildren <= 0 ||
		     nchildren > NILFS_BTREE_NODE_NCHILDREN_MAX(size))) {
		nilfs_crit(inode->i_sb,
			   "bad btree node (ino=%lu, blocknr=%llu): level = %d, flags = 0x%x, nchildren = %d",
			   inode->i_ino, (unsigned long long)blocknr, level,
			   flags, nchildren);
		ret = 1;
	}
	return ret;
}

/**
 * nilfs_btree_root_broken - verify consistency of btree root node
 * @node: btree root node to be examined
 * @inode: host inode of btree
 *
 * Return: 0 if normal, 1 if the root node is broken.
*/
static int nilfs_btree_root_broken(const struct nilfs_btree_node *node,
				   struct inode *inode)
{
	int level, flags, nchildren;
	int ret = 0;

	level = nilfs_btree_node_get_level(node);
	flags = nilfs_btree_node_get_flags(node);
	nchildren = nilfs_btree_node_get_nchildren(node);

	/*
	 * Unlike non-root nodes, a root may have zero children, but only at
	 * the lowest node level.  The flags are logged but not checked here.
	 */
	if (unlikely(level < NILFS_BTREE_LEVEL_NODE_MIN ||
		     level >= NILFS_BTREE_LEVEL_MAX ||
		     nchildren < 0 ||
		     nchildren > NILFS_BTREE_ROOT_NCHILDREN_MAX ||
		     (nchildren == 0 && level > NILFS_BTREE_LEVEL_NODE_MIN))) {
		nilfs_crit(inode->i_sb,
			   "bad btree root (ino=%lu): level = %d, flags = 0x%x, nchildren = %d",
			   inode->i_ino, level, flags, nchildren);
		ret = 1;
	}
	return ret;
}

/*
 * Validate the btree node stored in buffer @bh with
 * nilfs_btree_node_broken(), using the host inode of the buffer's mapping.
 * Buffers already flagged as nilfs-checked are accepted without
 * re-validation; a buffer that passes is flagged as checked.
 * Returns 0 if the node is fine, non-zero if it is broken.
 */
int nilfs_btree_broken_node_block(struct buffer_head *bh)
{
	struct inode *inode;
	int ret;

	if (buffer_nilfs_checked(bh))
		return 0;

	inode = bh->b_folio->mapping->host;
	ret = nilfs_btree_node_broken((struct nilfs_btree_node *)bh->b_data,
				      bh->b_size, inode, bh->b_blocknr);
	if (likely(!ret))
		set_buffer_nilfs_checked(bh);
	return ret;
}

/* The root node lives in the bmap's own b_u.u_data area. */
static struct nilfs_btree_node *
nilfs_btree_get_root(const struct nilfs_bmap *btree)
{
	return (struct nilfs_btree_node *)btree->b_u.u_data;
}

/* Node held in the bp_bh buffer of @path at @level. */
static struct nilfs_btree_node *
nilfs_btree_get_nonroot_node(const struct nilfs_btree_path *path, int level)
{
	return (struct nilfs_btree_node *)path[level].bp_bh->b_data;
}

/* Node held in the bp_sib_bh (sibling) buffer of @path at @level. */
static struct nilfs_btree_node *
nilfs_btree_get_sib_node(const struct nilfs_btree_path *path, int level)
{
	return (struct nilfs_btree_node *)path[level].bp_sib_bh->b_data;
}

/* Tree height: the level stored in the root node plus one. */
static int nilfs_btree_height(const struct nilfs_bmap *btree)
{
	return nilfs_btree_node_get_level(nilfs_btree_get_root(btree)) + 1;
}

/*
 * Return the node at @level: the root when @level is the top level
 * (height - 1), otherwise the node held in @path.  The matching key-slot
 * count (NILFS_BTREE_ROOT_NCHILDREN_MAX for the root, the per-block count
 * otherwise) is stored through @ncmaxp.
 */
static struct nilfs_btree_node *
nilfs_btree_get_node(const struct nilfs_bmap *btree,
		     const struct nilfs_btree_path *path,
		     int level, int *ncmaxp)
{
	struct nilfs_btree_node *node;

	if (level == nilfs_btree_height(btree) - 1) {
		node = nilfs_btree_get_root(btree);
		*ncmaxp = NILFS_BTREE_ROOT_NCHILDREN_MAX;
	} else {
		node = nilfs_btree_get_nonroot_node(path, level);
		*ncmaxp = nilfs_btree_nchildren_per_block(btree);
	}
	return node;
}

/*
 * Check that the level recorded in @node equals the expected @level.
 * On mismatch, dump the stack, log a critical message and return 1;
 * otherwise return 0.
 */
static int nilfs_btree_bad_node(const struct nilfs_bmap *btree,
				struct nilfs_btree_node *node, int level)
{
	if (unlikely(nilfs_btree_node_get_level(node) != level)) {
		dump_stack();
		nilfs_crit(btree->b_inode->i_sb,
			   "btree level mismatch (ino=%lu): %d != %d",
			   btree->b_inode->i_ino,
			   nilfs_btree_node_get_level(node), level);
		return 1;
	}
	return 0;
}

/* Read-ahead hints passed to __nilfs_btree_get_block(). */
struct nilfs_btree_readahead_info {
	struct nilfs_btree_node *node;	/* parent node */
	int max_ra_blocks;		/* max nof blocks to read ahead */
	int index;			/* current index on the parent node */
	int ncmax;			/* nof children in the parent node */
};

/*
 * Read the btree node block addressed by @ptr and return its buffer head
 * through @bhp.  When @ra is non-NULL, read-ahead requests are also
 * submitted for up to @ra->max_ra_blocks following siblings taken from
 * the parent node described by @ra.
 *
 * Returns 0 on success, -EIO if the buffer is not up to date after the
 * read, -EINVAL if @ptr could not be translated (-ENOENT from the submit
 * call) or the node fails validation, or another error returned by
 * nilfs_btnode_submit_block().
 */
static int __nilfs_btree_get_block(const struct nilfs_bmap *btree, __u64 ptr,
				   struct buffer_head **bhp,
				   const struct nilfs_btree_readahead_info *ra)
{
	struct inode *btnc_inode = NILFS_BMAP_I(btree)->i_assoc_inode;
	struct address_space *btnc = btnc_inode->i_mapping;
	struct buffer_head *bh, *ra_bh;
	sector_t submit_ptr = 0;
	int ret;

	ret = nilfs_btnode_submit_block(btnc, ptr, 0, REQ_OP_READ, &bh,
					&submit_ptr);
	if (ret) {
		/* -EEXIST: no read was submitted; go straight to validation */
		if (likely(ret == -EEXIST))
			goto out_check;
		if (ret == -ENOENT) {
			/*
			 * Block address translation failed due to invalid
			 * value of 'ptr'.  In this case, return internal code
			 * -EINVAL (broken bmap) to notify bmap layer of fatal
			 * metadata corruption.
			 */
			ret = -EINVAL;
		}
		return ret;
	}

	if (ra) {
		int i, n;
		__u64 ptr2;

		/* read ahead sibling nodes */
		for (n = ra->max_ra_blocks, i = ra->index + 1;
		     n > 0 && i < ra->ncmax; n--, i++) {
			ptr2 = nilfs_btree_node_get_ptr(ra->node, i, ra->ncmax);

			ret = nilfs_btnode_submit_block(btnc, ptr2, 0,
						REQ_OP_READ | REQ_RAHEAD,
						&ra_bh, &submit_ptr);
			/* drop the read-ahead buffer reference right away */
			if (likely(!ret || ret == -EEXIST))
				brelse(ra_bh);
			else if (ret != -EBUSY)
				break;
			/* main buffer is no longer locked: skip the wait */
			if (!buffer_locked(bh))
				goto out_no_wait;
		}
	}

	wait_on_buffer(bh);

 out_no_wait:
	if (!buffer_uptodate(bh)) {
		nilfs_err(btree->b_inode->i_sb,
			  "I/O error reading b-tree node block (ino=%lu, blocknr=%llu)",
			  btree->b_inode->i_ino, (unsigned long long)ptr);
		brelse(bh);
		return -EIO;
	}

 out_check:
	if (nilfs_btree_broken_node_block(bh)) {
		clear_buffer_uptodate(bh);
		brelse(bh);
		return -EINVAL;
	}

	*bhp = bh;
	return 0;
}

/* Read the node block at @ptr without any read-ahead. */
static int nilfs_btree_get_block(const struct nilfs_bmap *btree, __u64 ptr,
				   struct buffer_head **bhp)
{
	return __nilfs_btree_get_block(btree, ptr, bhp, NULL);
}

/*
 * Walk from the root down to @minlevel looking for @key, recording the
 * buffer head and index visited at each level in @path.  When @readahead
 * is non-zero, sibling read-ahead (up to 7 blocks) is requested while
 * reading nodes at NILFS_BTREE_LEVEL_NODE_MIN.
 *
 * Returns 0 and stores the pointer found through @ptrp (if non-NULL) when
 * @key exists; -ENOENT when the root is lower than @minlevel, has no
 * children, or @key is absent; -EINVAL on a node level mismatch; or an
 * error from __nilfs_btree_get_block().
 */
static int nilfs_btree_do_lookup(const struct nilfs_bmap *btree,
				 struct nilfs_btree_path *path,
				 __u64 key, __u64 *ptrp, int minlevel,
				 int readahead)
{
	struct nilfs_btree_node *node;
	struct nilfs_btree_readahead_info p, *ra;
	__u64 ptr;
	int level, index, found, ncmax, ret;

	node = nilfs_btree_get_root(btree);
	level = nilfs_btree_node_get_level(node);
	if (level < minlevel || nilfs_btree_node_get_nchildren(node) <= 0)
		return -ENOENT;

	found = nilfs_btree_node_lookup(node, key, &index);
	ptr = nilfs_btree_node_get_ptr(node, index,
				       NILFS_BTREE_ROOT_NCHILDREN_MAX);
	/* the root has no buffer head of its own */
	path[level].bp_bh = NULL;
	path[level].bp_index = index;

	ncmax = nilfs_btree_nchildren_per_block(btree);

	while (--level >= minlevel) {
		ra = NULL;
		if (level == NILFS_BTREE_LEVEL_NODE_MIN && readahead) {
			p.node = nilfs_btree_get_node(btree, path, level + 1,
						      &p.ncmax);
			p.index = index;
			p.max_ra_blocks = 7;
			ra = &p;
		}
		ret = __nilfs_btree_get_block(btree, ptr, &path[level].bp_bh,
					      ra);
		if (ret < 0)
			return ret;

		node = nilfs_btree_get_nonroot_node(path, level);
		if (nilfs_btree_bad_node(btree, node, level))
			return -EINVAL;
		/* after an exact match above, always follow the first entry */
		if (!found)
			found = nilfs_btree_node_lookup(node, key, &index);
		else
			index = 0;
		if (index < ncmax) {
			ptr = nilfs_btree_node_get_ptr(node, index, ncmax);
		} else {
			WARN_ON(found || level != NILFS_BTREE_LEVEL_NODE_MIN);
			/* insert */
			ptr = NILFS_BMAP_INVALID_PTR;
		}
		path[level].bp_index = index;
	}
	if (!found)
		return -ENOENT;

	if (ptrp != NULL)
		*ptrp = ptr;

	return 0;
}

/*
 * Follow the last entry of every node from the root downwards, recording
 * the visited buffer heads and indices in @path.  The key and pointer of
 * the final entry reached are stored through @keyp and @ptrp when those
 * are non-NULL.
 *
 * Returns 0 on success, -ENOENT when the root has no children, -EINVAL on
 * a node level mismatch, or an error from nilfs_btree_get_block().
 */
static int nilfs_btree_do_lookup_last(const struct nilfs_bmap *btree,
				      struct nilfs_btree_path *path,
				      __u64 *keyp, __u64 *ptrp)
{
	struct nilfs_btree_node *node;
	__u64 ptr;
	int index, level, ncmax, ret;

	node = nilfs_btree_get_root(btree);
	index = nilfs_btree_node_get_nchildren(node) - 1;
	if (index < 0)
		return -ENOENT;
	level = nilfs_btree_node_get_level(node);
	ptr = nilfs_btree_node_get_ptr(node, index,
				       NILFS_BTREE_ROOT_NCHILDREN_MAX);
	/* the root has no buffer head of its own */
	path[level].bp_bh = NULL;
	path[level].bp_index = index;
	ncmax = nilfs_btree_nchildren_per_block(btree);

	for (level--; level > 0; level--) {
		ret = nilfs_btree_get_block(btree, ptr, &path[level].bp_bh);
		if (ret < 0)
			return ret;
		node = nilfs_btree_get_nonroot_node(path, level);
		if (nilfs_btree_bad_node(btree, node, level))
			return -EINVAL;
		index = nilfs_btree_node_get_nchildren(node) - 1;
		ptr = nilfs_btree_node_get_ptr(node, index, ncmax);
		path[level].bp_index = index;
	}

	if (keyp != NULL)
		*keyp = nilfs_btree_node_get_key(node, index);
	if (ptrp != NULL)
		*ptrp = ptr;

	return 0;
}

/**
 * nilfs_btree_get_next_key - get next valid key from btree path array
 * @btree: bmap struct of btree
 * @path: array of nilfs_btree_path struct
 * @minlevel: start level
 * @nextkey: place to store the next valid key
 *
 * Return: 0 if the next key was found, %-ENOENT if not found.
*/ static int nilfs_btree_get_next_key(const struct nilfs_bmap *btree, const struct nilfs_btree_path *path, int minlevel, __u64 *nextkey) { struct nilfs_btree_node *node; int maxlevel = nilfs_btree_height(btree) - 1; int index, next_adj, level; /* Next index is already set to bp_index for leaf nodes. */ next_adj = 0; for (level = minlevel; level <= maxlevel; level++) { if (level == maxlevel) node = nilfs_btree_get_root(btree); else node = nilfs_btree_get_nonroot_node(path, level); index = path[level].bp_index + next_adj; if (index < nilfs_btree_node_get_nchildren(node)) { /* Next key is in this node */ *nextkey = nilfs_btree_node_get_key(node, index); return 0; } /* For non-leaf nodes, next index is stored at bp_index + 1. */ next_adj = 1; } return -ENOENT; } static int nilfs_btree_lookup(const struct nilfs_bmap *btree, __u64 key, int level, __u64 *ptrp) { struct nilfs_btree_path *path; int ret; path = nilfs_btree_alloc_path(); if (path == NULL) return -ENOMEM; ret = nilfs_btree_do_lookup(btree, path, key, ptrp, level, 0); nilfs_btree_free_path(path); return ret; } static int nilfs_btree_lookup_contig(const struct nilfs_bmap *btree, __u64 key, __u64 *ptrp, unsigned int maxblocks) { struct nilfs_btree_path *path; struct nilfs_btree_node *node; struct inode *dat = NULL; __u64 ptr, ptr2; sector_t blocknr; int level = NILFS_BTREE_LEVEL_NODE_MIN; int ret, cnt, index, maxlevel, ncmax; struct nilfs_btree_readahead_info p; path = nilfs_btree_alloc_path(); if (path == NULL) return -ENOMEM; ret = nilfs_btree_do_lookup(btree, path, key, &ptr, level, 1); if (ret < 0) goto out; if (NILFS_BMAP_USE_VBN(btree)) { dat = nilfs_bmap_get_dat(btree); ret = nilfs_dat_translate(dat, ptr, &blocknr); if (ret < 0) goto dat_error; ptr = blocknr; } cnt = 1; if (cnt == maxblocks) goto end; maxlevel = nilfs_btree_height(btree) - 1; node = nilfs_btree_get_node(btree, path, level, &ncmax); index = path[level].bp_index + 1; for (;;) { while (index < nilfs_btree_node_get_nchildren(node)) { if 
(nilfs_btree_node_get_key(node, index) != key + cnt) goto end; ptr2 = nilfs_btree_node_get_ptr(node, index, ncmax); if (dat) { ret = nilfs_dat_translate(dat, ptr2, &blocknr); if (ret < 0) goto dat_error; ptr2 = blocknr; } if (ptr2 != ptr + cnt || ++cnt == maxblocks) goto end; index++; } if (level == maxlevel) break; /* look-up right sibling node */ p.node = nilfs_btree_get_node(btree, path, level + 1, &p.ncmax); p.index = path[level + 1].bp_index + 1; p.max_ra_blocks = 7; if (p.index >= nilfs_btree_node_get_nchildren(p.node) || nilfs_btree_node_get_key(p.node, p.index) != key + cnt) break; ptr2 = nilfs_btree_node_get_ptr(p.node, p.index, p.ncmax); path[level + 1].bp_index = p.index; brelse(path[level].bp_bh); path[level].bp_bh = NULL; ret = __nilfs_btree_get_block(btree, ptr2, &path[level].bp_bh, &p); if (ret < 0) goto out; node = nilfs_btree_get_nonroot_node(path, level); ncmax = nilfs_btree_nchildren_per_block(btree); index = 0; path[level].bp_index = index; } end: *ptrp = ptr; ret = cnt; out: nilfs_btree_free_path(path); return ret; dat_error: if (ret == -ENOENT) ret = -EINVAL; /* Notify bmap layer of metadata corruption */ goto out; } static void nilfs_btree_promote_key(struct nilfs_bmap *btree, struct nilfs_btree_path *path, int level, __u64 key) { if (level < nilfs_btree_height(btree) - 1) { do { nilfs_btree_node_set_key( nilfs_btree_get_nonroot_node(path, level), path[level].bp_index, key); if (!buffer_dirty(path[level].bp_bh)) mark_buffer_dirty(path[level].bp_bh); } while ((path[level].bp_index == 0) && (++level < nilfs_btree_height(btree) - 1)); } /* root */ if (level == nilfs_btree_height(btree) - 1) { nilfs_btree_node_set_key(nilfs_btree_get_root(btree), path[level].bp_index, key); } } static void nilfs_btree_do_insert(struct nilfs_bmap *btree, struct nilfs_btree_path *path, int level, __u64 *keyp, __u64 *ptrp) { struct nilfs_btree_node *node; int ncblk; if (level < nilfs_btree_height(btree) - 1) { node = nilfs_btree_get_nonroot_node(path, level); ncblk 
= nilfs_btree_nchildren_per_block(btree); nilfs_btree_node_insert(node, path[level].bp_index, *keyp, *ptrp, ncblk); if (!buffer_dirty(path[level].bp_bh)) mark_buffer_dirty(path[level].bp_bh); if (path[level].bp_index == 0) nilfs_btree_promote_key(btree, path, level + 1, nilfs_btree_node_get_key(node, 0)); } else { node = nilfs_btree_get_root(btree); nilfs_btree_node_insert(node, path[level].bp_index, *keyp, *ptrp, NILFS_BTREE_ROOT_NCHILDREN_MAX); } } static void nilfs_btree_carry_left(struct nilfs_bmap *btree, struct nilfs_btree_path *path, int level, __u64 *keyp, __u64 *ptrp) { struct nilfs_btree_node *node, *left; int nchildren, lnchildren, n, move, ncblk; node = nilfs_btree_get_nonroot_node(path, level); left = nilfs_btree_get_sib_node(path, level); nchildren = nilfs_btree_node_get_nchildren(node); lnchildren = nilfs_btree_node_get_nchildren(left); ncblk = nilfs_btree_nchildren_per_block(btree); move = 0; n = (nchildren + lnchildren + 1) / 2 - lnchildren; if (n > path[level].bp_index) { /* move insert point */ n--; move = 1; } nilfs_btree_node_move_left(left, node, n, ncblk, ncblk); if (!buffer_dirty(path[level].bp_bh)) mark_buffer_dirty(path[level].bp_bh); if (!buffer_dirty(path[level].bp_sib_bh)) mark_buffer_dirty(path[level].bp_sib_bh); nilfs_btree_promote_key(btree, path, level + 1, nilfs_btree_node_get_key(node, 0)); if (move) { brelse(path[level].bp_bh); path[level].bp_bh = path[level].bp_sib_bh; path[level].bp_sib_bh = NULL; path[level].bp_index += lnchildren; path[level + 1].bp_index--; } else { brelse(path[level].bp_sib_bh); path[level].bp_sib_bh = NULL; path[level].bp_index -= n; } nilfs_btree_do_insert(btree, path, level, keyp, ptrp); } static void nilfs_btree_carry_right(struct nilfs_bmap *btree, struct nilfs_btree_path *path, int level, __u64 *keyp, __u64 *ptrp) { struct nilfs_btree_node *node, *right; int nchildren, rnchildren, n, move, ncblk; node = nilfs_btree_get_nonroot_node(path, level); right = nilfs_btree_get_sib_node(path, level); nchildren 
= nilfs_btree_node_get_nchildren(node);
	rnchildren = nilfs_btree_node_get_nchildren(right);
	ncblk = nilfs_btree_nchildren_per_block(btree);
	move = 0;

	/* Number of trailing entries handed to the right sibling. */
	n = (nchildren + rnchildren + 1) / 2 - rnchildren;
	if (n > nchildren - path[level].bp_index) {
		/* move insert point */
		n--;
		move = 1;
	}

	nilfs_btree_node_move_right(node, right, n, ncblk, ncblk);

	if (!buffer_dirty(path[level].bp_bh))
		mark_buffer_dirty(path[level].bp_bh);
	if (!buffer_dirty(path[level].bp_sib_bh))
		mark_buffer_dirty(path[level].bp_sib_bh);

	/*
	 * The parent index is bumped for the duration of the call and then
	 * restored -- presumably so the promoted key is written to the right
	 * sibling's slot (nilfs_btree_promote_key() is defined outside this
	 * view; confirm there).
	 */
	path[level + 1].bp_index++;
	nilfs_btree_promote_key(btree, path, level + 1,
				nilfs_btree_node_get_key(right, 0));
	path[level + 1].bp_index--;

	if (move) {
		/* Continue in the right sibling. */
		brelse(path[level].bp_bh);
		path[level].bp_bh = path[level].bp_sib_bh;
		path[level].bp_sib_bh = NULL;
		path[level].bp_index -= nilfs_btree_node_get_nchildren(node);
		path[level + 1].bp_index++;
	} else {
		brelse(path[level].bp_sib_bh);
		path[level].bp_sib_bh = NULL;
	}

	nilfs_btree_do_insert(btree, path, level, keyp, ptrp);
}

/*
 * nilfs_btree_split - split a node into itself and a sibling, then insert
 * @btree: bmap the b-tree belongs to
 * @path:  lookup path; path[level].bp_sib_bh holds the sibling buffer that
 *         receives the upper part of the node
 * @level: level of the node being split
 * @keyp:  in: key to insert; out: first key of the sibling
 * @ptrp:  in: pointer to insert; out: path[level].bp_newreq.bpr_ptr
 *
 * About half of the entries ((nchildren + 1) / 2, one fewer when the insert
 * position falls in the moved range) are moved to the sibling.  On return
 * *keyp / *ptrp describe the sibling so the caller can insert it one level
 * up, and the parent index is advanced by one.
 */
static void nilfs_btree_split(struct nilfs_bmap *btree,
			      struct nilfs_btree_path *path,
			      int level, __u64 *keyp, __u64 *ptrp)
{
	struct nilfs_btree_node *node, *right;
	int nchildren, n, move, ncblk;

	node = nilfs_btree_get_nonroot_node(path, level);
	right = nilfs_btree_get_sib_node(path, level);
	nchildren = nilfs_btree_node_get_nchildren(node);
	ncblk = nilfs_btree_nchildren_per_block(btree);
	move = 0;

	n = (nchildren + 1) / 2;
	if (n > nchildren - path[level].bp_index) {
		n--;
		move = 1;
	}

	nilfs_btree_node_move_right(node, right, n, ncblk, ncblk);

	if (!buffer_dirty(path[level].bp_bh))
		mark_buffer_dirty(path[level].bp_bh);
	if (!buffer_dirty(path[level].bp_sib_bh))
		mark_buffer_dirty(path[level].bp_sib_bh);

	if (move) {
		/* The new entry belongs in the sibling: insert it there. */
		path[level].bp_index -= nilfs_btree_node_get_nchildren(node);
		nilfs_btree_node_insert(right, path[level].bp_index,
					*keyp, *ptrp, ncblk);

		*keyp = nilfs_btree_node_get_key(right, 0);
		*ptrp = path[level].bp_newreq.bpr_ptr;

		brelse(path[level].bp_bh);
		path[level].bp_bh = path[level].bp_sib_bh;
path[level].bp_sib_bh = NULL;
	} else {
		/* The new entry stays in the original node. */
		nilfs_btree_do_insert(btree, path, level, keyp, ptrp);

		*keyp = nilfs_btree_node_get_key(right, 0);
		*ptrp = path[level].bp_newreq.bpr_ptr;

		brelse(path[level].bp_sib_bh);
		path[level].bp_sib_bh = NULL;
	}

	path[level + 1].bp_index++;
}

/*
 * nilfs_btree_grow - push the root's entries down into a new child node
 * @btree: bmap the b-tree belongs to
 * @path:  lookup path; path[level].bp_sib_bh holds the new child buffer
 * @level: level the new child occupies
 * @keyp:  in: key to insert; out: first key of the child
 * @ptrp:  in: pointer to insert; out: path[level].bp_newreq.bpr_ptr
 *
 * All entries of the root are moved to the child, the root's level is set
 * to level + 1, and the pending entry is inserted into the child through
 * nilfs_btree_do_insert().
 */
static void nilfs_btree_grow(struct nilfs_bmap *btree,
			     struct nilfs_btree_path *path,
			     int level, __u64 *keyp, __u64 *ptrp)
{
	struct nilfs_btree_node *root, *child;
	int n, ncblk;

	root = nilfs_btree_get_root(btree);
	child = nilfs_btree_get_sib_node(path, level);
	ncblk = nilfs_btree_nchildren_per_block(btree);

	n = nilfs_btree_node_get_nchildren(root);

	nilfs_btree_node_move_right(root, child, n,
				    NILFS_BTREE_ROOT_NCHILDREN_MAX, ncblk);
	nilfs_btree_node_set_level(root, level + 1);

	if (!buffer_dirty(path[level].bp_sib_bh))
		mark_buffer_dirty(path[level].bp_sib_bh);

	/* The child becomes the node on the path at this level. */
	path[level].bp_bh = path[level].bp_sib_bh;
	path[level].bp_sib_bh = NULL;

	nilfs_btree_do_insert(btree, path, level, keyp, ptrp);

	*keyp = nilfs_btree_node_get_key(child, 0);
	*ptrp = path[level].bp_newreq.bpr_ptr;
}

/*
 * nilfs_btree_find_near - pick a pointer stored close to the lookup position
 * @btree: bmap the b-tree belongs to
 * @path:  lookup path, may be NULL
 *
 * Return: the pointer in the slot left of the lookup position at the lowest
 * node level if there is one, otherwise the pointer at the path position one
 * level up when that level exists, otherwise NILFS_BMAP_INVALID_PTR (also
 * returned for a NULL @path).
 */
static __u64 nilfs_btree_find_near(const struct nilfs_bmap *btree,
				   const struct nilfs_btree_path *path)
{
	struct nilfs_btree_node *node;
	int level, ncmax;

	if (path == NULL)
		return NILFS_BMAP_INVALID_PTR;

	/* left sibling */
	level = NILFS_BTREE_LEVEL_NODE_MIN;
	if (path[level].bp_index > 0) {
		node = nilfs_btree_get_node(btree, path, level, &ncmax);
		return nilfs_btree_node_get_ptr(node,
						path[level].bp_index - 1,
						ncmax);
	}

	/* parent */
	level = NILFS_BTREE_LEVEL_NODE_MIN + 1;
	if (level <= nilfs_btree_height(btree) - 1) {
		node = nilfs_btree_get_node(btree, path, level, &ncmax);
		return nilfs_btree_node_get_ptr(node, path[level].bp_index,
						ncmax);
	}

	return NILFS_BMAP_INVALID_PTR;
}

/*
 * nilfs_btree_find_target_v - choose a target pointer for a new allocation
 *
 * Tries, in order, nilfs_bmap_find_target_seq(), nilfs_btree_find_near() and
 * nilfs_bmap_find_target_in_group(); the first result that is not
 * NILFS_BMAP_INVALID_PTR wins.
 */
static __u64 nilfs_btree_find_target_v(const struct nilfs_bmap *btree,
				       const struct nilfs_btree_path *path,
				       __u64 key)
{
	__u64 ptr;

	ptr = nilfs_bmap_find_target_seq(btree, key);
	if (ptr != NILFS_BMAP_INVALID_PTR)
		/* sequential access */
		return ptr;

	ptr =
nilfs_btree_find_near(btree, path);
	if (ptr != NILFS_BMAP_INVALID_PTR)
		/* near */
		return ptr;

	/* block group */
	return nilfs_bmap_find_target_in_group(btree);
}

/*
 * nilfs_btree_prepare_insert - reserve resources and plan an insertion
 * @btree:  bmap the b-tree belongs to
 * @path:   lookup path pointing at the insert position
 * @levelp: out: highest level for which an operation was planned
 * @key:    key to be inserted
 * @ptr:    not referenced in this function
 * @stats:  out: bs_nblocks is set to the number of blocks counted while
 *          planning, or 0 on failure
 *
 * A pointer for the data block is prepared first.  Then, from the lowest
 * node level upward, path[level].bp_op is set to the operation the commit
 * phase will run: nilfs_btree_do_insert() when the node has room,
 * nilfs_btree_carry_left()/nilfs_btree_carry_right() when a sibling has
 * room, otherwise nilfs_btree_split() with a newly obtained sibling block.
 * When every non-root level is full the root is handled: plain insert if it
 * has room, otherwise nilfs_btree_grow().
 *
 * Return: a non-negative value on success, or the negative error code of
 * the helper that failed; on failure the prepared pointers and new blocks of
 * the levels below are rolled back.
 */
static int nilfs_btree_prepare_insert(struct nilfs_bmap *btree,
				      struct nilfs_btree_path *path,
				      int *levelp, __u64 key, __u64 ptr,
				      struct nilfs_bmap_stats *stats)
{
	struct buffer_head *bh;
	struct nilfs_btree_node *node, *parent, *sib;
	__u64 sibptr;
	int pindex, level, ncmax, ncblk, ret;
	struct inode *dat = NULL;

	stats->bs_nblocks = 0;
	level = NILFS_BTREE_LEVEL_DATA;

	/* allocate a new ptr for data block */
	if (NILFS_BMAP_USE_VBN(btree)) {
		path[level].bp_newreq.bpr_ptr =
			nilfs_btree_find_target_v(btree, path, key);
		dat = nilfs_bmap_get_dat(btree);
	}

	ret = nilfs_bmap_prepare_alloc_ptr(btree, &path[level].bp_newreq, dat);
	if (ret < 0)
		goto err_out_data;

	ncblk = nilfs_btree_nchildren_per_block(btree);

	for (level = NILFS_BTREE_LEVEL_NODE_MIN;
	     level < nilfs_btree_height(btree) - 1;
	     level++) {
		node = nilfs_btree_get_nonroot_node(path, level);
		/* Room in this node: nothing above it needs to change. */
		if (nilfs_btree_node_get_nchildren(node) < ncblk) {
			path[level].bp_op = nilfs_btree_do_insert;
			stats->bs_nblocks++;
			goto out;
		}

		parent = nilfs_btree_get_node(btree, path, level + 1, &ncmax);
		pindex = path[level + 1].bp_index;

		/* left sibling */
		if (pindex > 0) {
			sibptr = nilfs_btree_node_get_ptr(parent, pindex - 1,
							  ncmax);
			ret = nilfs_btree_get_block(btree, sibptr, &bh);
			if (ret < 0)
				goto err_out_child_node;
			sib = (struct nilfs_btree_node *)bh->b_data;
			if (nilfs_btree_node_get_nchildren(sib) < ncblk) {
				/* bh stays referenced via bp_sib_bh. */
				path[level].bp_sib_bh = bh;
				path[level].bp_op = nilfs_btree_carry_left;
				stats->bs_nblocks++;
				goto out;
			} else {
				brelse(bh);
			}
		}

		/* right sibling */
		if (pindex < nilfs_btree_node_get_nchildren(parent) - 1) {
			sibptr = nilfs_btree_node_get_ptr(parent, pindex + 1,
							  ncmax);
			ret = nilfs_btree_get_block(btree, sibptr, &bh);
			if (ret < 0)
				goto err_out_child_node;
			sib = (struct nilfs_btree_node *)bh->b_data;
			if (nilfs_btree_node_get_nchildren(sib) < ncblk) {
				path[level].bp_sib_bh = bh;
path[level].bp_op = nilfs_btree_carry_right;
				stats->bs_nblocks++;
				goto out;
			} else {
				brelse(bh);
			}
		}

		/* split */
		/* The new node's pointer hint follows the level below. */
		path[level].bp_newreq.bpr_ptr =
			path[level - 1].bp_newreq.bpr_ptr + 1;
		ret = nilfs_bmap_prepare_alloc_ptr(btree,
						   &path[level].bp_newreq, dat);
		if (ret < 0)
			goto err_out_child_node;
		ret = nilfs_btree_get_new_block(btree,
						path[level].bp_newreq.bpr_ptr,
						&bh);
		if (ret < 0)
			goto err_out_curr_node;

		stats->bs_nblocks++;

		/* Start the sibling as an empty node of this level. */
		sib = (struct nilfs_btree_node *)bh->b_data;
		nilfs_btree_node_init(sib, 0, level, 0, ncblk, NULL, NULL);
		path[level].bp_sib_bh = bh;
		path[level].bp_op = nilfs_btree_split;
	}

	/* root */
	node = nilfs_btree_get_root(btree);
	if (nilfs_btree_node_get_nchildren(node) <
	    NILFS_BTREE_ROOT_NCHILDREN_MAX) {
		path[level].bp_op = nilfs_btree_do_insert;
		stats->bs_nblocks++;
		goto out;
	}

	/* grow */
	path[level].bp_newreq.bpr_ptr = path[level - 1].bp_newreq.bpr_ptr + 1;
	ret = nilfs_bmap_prepare_alloc_ptr(btree, &path[level].bp_newreq, dat);
	if (ret < 0)
		goto err_out_child_node;
	ret = nilfs_btree_get_new_block(btree, path[level].bp_newreq.bpr_ptr,
					&bh);
	if (ret < 0)
		goto err_out_curr_node;

	nilfs_btree_node_init((struct nilfs_btree_node *)bh->b_data,
			      0, level, 0, ncblk, NULL, NULL);
	path[level].bp_sib_bh = bh;
	path[level].bp_op = nilfs_btree_grow;

	/* After growing, the root sits one level higher. */
	level++;
	path[level].bp_op = nilfs_btree_do_insert;

	/* a newly-created node block and a data block are added */
	stats->bs_nblocks += 2;

	/* success */
 out:
	*levelp = level;
	return ret;

	/* error */
 err_out_curr_node:
	/* Undo the pointer prepared for the current level ... */
	nilfs_bmap_abort_alloc_ptr(btree, &path[level].bp_newreq, dat);
 err_out_child_node:
	/* ... then the new sibling block and pointer of each level below. */
	for (level--; level > NILFS_BTREE_LEVEL_DATA; level--) {
		nilfs_btnode_delete(path[level].bp_sib_bh);
		nilfs_bmap_abort_alloc_ptr(btree, &path[level].bp_newreq, dat);

	}

	/* level is NILFS_BTREE_LEVEL_DATA here: undo the data pointer. */
	nilfs_bmap_abort_alloc_ptr(btree, &path[level].bp_newreq, dat);
 err_out_data:
	*levelp = level;
	stats->bs_nblocks = 0;
	return ret;
}

/*
 * nilfs_btree_commit_insert - run the operations planned by prepare_insert
 * @btree:    bmap the b-tree belongs to
 * @path:     path filled in by nilfs_btree_prepare_insert()
 * @maxlevel: highest level with a planned operation
 * @key:      key being inserted
 * @ptr:      on entry an address that is cast to a struct buffer_head *
 */
static void nilfs_btree_commit_insert(struct nilfs_bmap *btree,
				      struct nilfs_btree_path *path,
				      int maxlevel, __u64 key, __u64 ptr)
{
struct inode *dat = NULL;
	int level;

	/* ptr carries a buffer head address here; flag that buffer. */
	set_buffer_nilfs_volatile((struct buffer_head *)((unsigned long)ptr));
	/* From now on ptr is the pointer prepared for the data block. */
	ptr = path[NILFS_BTREE_LEVEL_DATA].bp_newreq.bpr_ptr;
	if (NILFS_BMAP_USE_VBN(btree)) {
		nilfs_bmap_set_target_v(btree, key, ptr);
		dat = nilfs_bmap_get_dat(btree);
	}

	/*
	 * Each step commits the pointer prepared for the level below and
	 * then runs the operation planned for this level; the operation may
	 * rewrite key/ptr for the next level up.
	 */
	for (level = NILFS_BTREE_LEVEL_NODE_MIN; level <= maxlevel; level++) {
		nilfs_bmap_commit_alloc_ptr(btree,
					    &path[level - 1].bp_newreq, dat);
		path[level].bp_op(btree, path, level, &key, &ptr);
	}

	if (!nilfs_bmap_dirty(btree))
		nilfs_bmap_set_dirty(btree);
}

/*
 * nilfs_btree_insert - insert a key/pointer pair into the b-tree
 * @btree: bmap the b-tree belongs to
 * @key:   key to insert
 * @ptr:   value handed to the prepare and commit phases
 *
 * Return: the non-negative result of the prepare phase on success, -ENOMEM
 * if no path could be allocated, -EEXIST if the lookup found @key, or
 * another negative error code from the lookup or the prepare phase.
 */
static int nilfs_btree_insert(struct nilfs_bmap *btree, __u64 key, __u64 ptr)
{
	struct nilfs_btree_path *path;
	struct nilfs_bmap_stats stats;
	int level, ret;

	path = nilfs_btree_alloc_path();
	if (path == NULL)
		return -ENOMEM;

	/* Only -ENOENT (key absent) lets the insertion proceed. */
	ret = nilfs_btree_do_lookup(btree, path, key, NULL,
				    NILFS_BTREE_LEVEL_NODE_MIN, 0);
	if (ret != -ENOENT) {
		if (ret == 0)
			ret = -EEXIST;
		goto out;
	}

	ret = nilfs_btree_prepare_insert(btree, path, &level, key, ptr, &stats);
	if (ret < 0)
		goto out;
	nilfs_btree_commit_insert(btree, path, level, key, ptr);
	nilfs_inode_add_blocks(btree->b_inode, stats.bs_nblocks);

 out:
	nilfs_btree_free_path(path);
	return ret;
}

/*
 * nilfs_btree_do_delete - remove the entry at the path position of a level
 * @btree: bmap the b-tree belongs to
 * @path:  lookup path
 * @level: level of the node to modify
 * @keyp:  passed to nilfs_btree_node_delete()
 * @ptrp:  passed to nilfs_btree_node_delete()
 *
 * For a non-root node the buffer is marked dirty and, if slot 0 was
 * removed, the node's new first key is promoted to level + 1.  For the root
 * only the entry is removed.
 */
static void nilfs_btree_do_delete(struct nilfs_bmap *btree,
				  struct nilfs_btree_path *path,
				  int level, __u64 *keyp, __u64 *ptrp)
{
	struct nilfs_btree_node *node;
	int ncblk;

	if (level < nilfs_btree_height(btree) - 1) {
		node = nilfs_btree_get_nonroot_node(path, level);
		ncblk = nilfs_btree_nchildren_per_block(btree);
		nilfs_btree_node_delete(node, path[level].bp_index,
					keyp, ptrp, ncblk);
		if (!buffer_dirty(path[level].bp_bh))
			mark_buffer_dirty(path[level].bp_bh);
		if (path[level].bp_index == 0)
			nilfs_btree_promote_key(btree, path, level + 1,
				nilfs_btree_node_get_key(node, 0));
	} else {
		node = nilfs_btree_get_root(btree);
		nilfs_btree_node_delete(node, path[level].bp_index,
					keyp, ptrp,
					NILFS_BTREE_ROOT_NCHILDREN_MAX);
	}
}

/*
 * nilfs_btree_borrow_left - delete an entry, then refill from the left
 * sibling held in path[level].bp_sib_bh
 */
static void nilfs_btree_borrow_left(struct nilfs_bmap *btree,
				    struct nilfs_btree_path *path,
				    int level, __u64 *keyp,
__u64 *ptrp)
{
	struct nilfs_btree_node *node, *left;
	int nchildren, lnchildren, n, ncblk;

	nilfs_btree_do_delete(btree, path, level, keyp, ptrp);

	node = nilfs_btree_get_nonroot_node(path, level);
	left = nilfs_btree_get_sib_node(path, level);
	nchildren = nilfs_btree_node_get_nchildren(node);
	lnchildren = nilfs_btree_node_get_nchildren(left);
	ncblk = nilfs_btree_nchildren_per_block(btree);

	/* Entries needed to bring the node up to half of the combined count. */
	n = (nchildren + lnchildren) / 2 - nchildren;

	nilfs_btree_node_move_right(left, node, n, ncblk, ncblk);

	if (!buffer_dirty(path[level].bp_bh))
		mark_buffer_dirty(path[level].bp_bh);
	if (!buffer_dirty(path[level].bp_sib_bh))
		mark_buffer_dirty(path[level].bp_sib_bh);

	/* The node has a new first key after receiving entries. */
	nilfs_btree_promote_key(btree, path, level + 1,
				nilfs_btree_node_get_key(node, 0));

	brelse(path[level].bp_sib_bh);
	path[level].bp_sib_bh = NULL;
	/* n entries were placed ahead of the old position. */
	path[level].bp_index += n;
}

/*
 * nilfs_btree_borrow_right - delete an entry, then refill from the right
 * sibling
 * @btree: bmap the b-tree belongs to
 * @path:  lookup path; path[level].bp_sib_bh holds the right sibling
 * @level: level of the node to modify
 * @keyp:  passed to nilfs_btree_do_delete()
 * @ptrp:  passed to nilfs_btree_do_delete()
 *
 * After the deletion, n = (nchildren + rnchildren) / 2 - nchildren entries
 * are moved from the right sibling into the node, both buffers are marked
 * dirty and the sibling's new first key is promoted.
 */
static void nilfs_btree_borrow_right(struct nilfs_bmap *btree,
				     struct nilfs_btree_path *path,
				     int level, __u64 *keyp, __u64 *ptrp)
{
	struct nilfs_btree_node *node, *right;
	int nchildren, rnchildren, n, ncblk;

	nilfs_btree_do_delete(btree, path, level, keyp, ptrp);

	node = nilfs_btree_get_nonroot_node(path, level);
	right = nilfs_btree_get_sib_node(path, level);
	nchildren = nilfs_btree_node_get_nchildren(node);
	rnchildren = nilfs_btree_node_get_nchildren(right);
	ncblk = nilfs_btree_nchildren_per_block(btree);

	n = (nchildren + rnchildren) / 2 - nchildren;

	nilfs_btree_node_move_left(node, right, n, ncblk, ncblk);

	if (!buffer_dirty(path[level].bp_bh))
		mark_buffer_dirty(path[level].bp_bh);
	if (!buffer_dirty(path[level].bp_sib_bh))
		mark_buffer_dirty(path[level].bp_sib_bh);

	/*
	 * The parent index is bumped only around the promote call and then
	 * restored -- presumably to address the right sibling's slot; confirm
	 * against nilfs_btree_promote_key(), which is outside this view.
	 */
	path[level + 1].bp_index++;
	nilfs_btree_promote_key(btree, path, level + 1,
				nilfs_btree_node_get_key(right, 0));
	path[level + 1].bp_index--;

	brelse(path[level].bp_sib_bh);
	path[level].bp_sib_bh = NULL;
}

/*
 * nilfs_btree_concat_left - delete an entry, then merge the node into its
 * left sibling held in path[level].bp_sib_bh
 */
static void nilfs_btree_concat_left(struct nilfs_bmap *btree,
				    struct nilfs_btree_path *path,
				    int level, __u64 *keyp, __u64 *ptrp)
{
	struct nilfs_btree_node *node, *left;
	int n, ncblk;

nilfs_btree_do_delete(btree, path, level, keyp, ptrp);

	node = nilfs_btree_get_nonroot_node(path, level);
	left = nilfs_btree_get_sib_node(path, level);
	ncblk = nilfs_btree_nchildren_per_block(btree);

	/* Every remaining entry of the node goes to the left sibling. */
	n = nilfs_btree_node_get_nchildren(node);

	nilfs_btree_node_move_left(left, node, n, ncblk, ncblk);

	if (!buffer_dirty(path[level].bp_sib_bh))
		mark_buffer_dirty(path[level].bp_sib_bh);

	/* The emptied node is deleted; the sibling replaces it on the path. */
	nilfs_btnode_delete(path[level].bp_bh);
	path[level].bp_bh = path[level].bp_sib_bh;
	path[level].bp_sib_bh = NULL;
	path[level].bp_index += nilfs_btree_node_get_nchildren(left);
}

/*
 * nilfs_btree_concat_right - delete an entry, then absorb the right sibling
 * @btree: bmap the b-tree belongs to
 * @path:  lookup path; path[level].bp_sib_bh holds the right sibling
 * @level: level of the node to modify
 * @keyp:  passed to nilfs_btree_do_delete()
 * @ptrp:  passed to nilfs_btree_do_delete()
 *
 * All entries of the right sibling are moved into the node, the sibling
 * block is deleted and the parent index is advanced by one.
 */
static void nilfs_btree_concat_right(struct nilfs_bmap *btree,
				     struct nilfs_btree_path *path,
				     int level, __u64 *keyp, __u64 *ptrp)
{
	struct nilfs_btree_node *node, *right;
	int n, ncblk;

	nilfs_btree_do_delete(btree, path, level, keyp, ptrp);

	node = nilfs_btree_get_nonroot_node(path, level);
	right = nilfs_btree_get_sib_node(path, level);
	ncblk = nilfs_btree_nchildren_per_block(btree);

	n = nilfs_btree_node_get_nchildren(right);

	nilfs_btree_node_move_left(node, right, n, ncblk, ncblk);

	if (!buffer_dirty(path[level].bp_bh))
		mark_buffer_dirty(path[level].bp_bh);

	nilfs_btnode_delete(path[level].bp_sib_bh);
	path[level].bp_sib_bh = NULL;
	/* Advance the parent index by one slot, to the sibling's position. */
	path[level + 1].bp_index++;
}

/*
 * nilfs_btree_shrink - delete an entry, then pull the child's entries into
 * the root
 * @btree: bmap the b-tree belongs to
 * @path:  lookup path; path[level].bp_bh holds the child node
 * @level: level of the child node
 * @keyp:  passed to nilfs_btree_do_delete()
 * @ptrp:  passed to nilfs_btree_do_delete()
 *
 * The root's entry at slot 0 is removed, the root takes over @level, all
 * remaining entries of the child are moved into the root and the child block
 * is deleted.
 */
static void nilfs_btree_shrink(struct nilfs_bmap *btree,
			       struct nilfs_btree_path *path,
			       int level, __u64 *keyp, __u64 *ptrp)
{
	struct nilfs_btree_node *root, *child;
	int n, ncblk;

	nilfs_btree_do_delete(btree, path, level, keyp, ptrp);

	root = nilfs_btree_get_root(btree);
	child = nilfs_btree_get_nonroot_node(path, level);
	ncblk = nilfs_btree_nchildren_per_block(btree);

	nilfs_btree_node_delete(root, 0, NULL, NULL,
				NILFS_BTREE_ROOT_NCHILDREN_MAX);
	nilfs_btree_node_set_level(root, level);
	n = nilfs_btree_node_get_nchildren(child);
	nilfs_btree_node_move_left(root, child, n,
				   NILFS_BTREE_ROOT_NCHILDREN_MAX, ncblk);

	nilfs_btnode_delete(path[level].bp_bh);
	path[level].bp_bh = NULL;
}

/*
 * nilfs_btree_nop - operation with an empty body, used as a bp_op value for
 * levels that need no work
 */
static void nilfs_btree_nop(struct nilfs_bmap *btree,
			    struct nilfs_btree_path *path,
			    int
level, __u64 *keyp, __u64 *ptrp)
{
}

/*
 * nilfs_btree_prepare_delete - plan a deletion along a lookup path
 * @btree:  bmap the b-tree belongs to
 * @path:   lookup path pointing at the entry to delete
 * @levelp: out: highest level for which an operation was planned
 * @stats:  out: bs_nblocks is set to the number of blocks counted while
 *          planning, or 0 on failure
 * @dat:    inode passed to the nilfs_bmap_*_end_ptr() helpers
 *
 * From the lowest node level upward the pointer that will be released is
 * recorded in path[level].bp_oldreq and prepared for termination, and
 * path[level].bp_op is set to the operation the commit phase will run:
 * plain delete when the node keeps more than ncmin children, borrow from a
 * sibling that has more than ncmin children, concatenate with a sibling
 * otherwise (which continues one level up), or shrink/delete when the node
 * is the root's only child.
 *
 * Return: a non-negative value on success or a negative error code.
 */
static int nilfs_btree_prepare_delete(struct nilfs_bmap *btree,
				      struct nilfs_btree_path *path,
				      int *levelp,
				      struct nilfs_bmap_stats *stats,
				      struct inode *dat)
{
	struct buffer_head *bh;
	struct nilfs_btree_node *node, *parent, *sib;
	__u64 sibptr;
	int pindex, dindex, level, ncmin, ncmax, ncblk, ret;

	ret = 0;
	stats->bs_nblocks = 0;
	ncmin = NILFS_BTREE_NODE_NCHILDREN_MIN(nilfs_btree_node_size(btree));
	ncblk = nilfs_btree_nchildren_per_block(btree);

	/* dindex: slot in the current level's node whose pointer is ended. */
	for (level = NILFS_BTREE_LEVEL_NODE_MIN, dindex = path[level].bp_index;
	     level < nilfs_btree_height(btree) - 1;
	     level++) {
		node = nilfs_btree_get_nonroot_node(path, level);
		path[level].bp_oldreq.bpr_ptr =
			nilfs_btree_node_get_ptr(node, dindex, ncblk);
		ret = nilfs_bmap_prepare_end_ptr(btree,
						 &path[level].bp_oldreq, dat);
		if (ret < 0)
			goto err_out_child_node;

		/* Enough children remain: a plain delete is sufficient. */
		if (nilfs_btree_node_get_nchildren(node) > ncmin) {
			path[level].bp_op = nilfs_btree_do_delete;
			stats->bs_nblocks++;
			goto out;
		}

		parent = nilfs_btree_get_node(btree, path, level + 1, &ncmax);
		pindex = path[level + 1].bp_index;
		dindex = pindex;

		if (pindex > 0) {
			/* left sibling */
			sibptr = nilfs_btree_node_get_ptr(parent, pindex - 1,
							  ncmax);
			ret = nilfs_btree_get_block(btree, sibptr, &bh);
			if (ret < 0)
				goto err_out_curr_node;
			sib = (struct nilfs_btree_node *)bh->b_data;
			if (nilfs_btree_node_get_nchildren(sib) > ncmin) {
				path[level].bp_sib_bh = bh;
				path[level].bp_op = nilfs_btree_borrow_left;
				stats->bs_nblocks++;
				goto out;
			} else {
				/* Merge; the loop goes on to the parent. */
				path[level].bp_sib_bh = bh;
				path[level].bp_op = nilfs_btree_concat_left;
				stats->bs_nblocks++;
				/* continue; */
			}
		} else if (pindex <
			   nilfs_btree_node_get_nchildren(parent) - 1) {
			/* right sibling */
			sibptr = nilfs_btree_node_get_ptr(parent, pindex + 1,
							  ncmax);
			ret = nilfs_btree_get_block(btree, sibptr, &bh);
			if (ret < 0)
				goto err_out_curr_node;
			sib = (struct nilfs_btree_node *)bh->b_data;
			if (nilfs_btree_node_get_nchildren(sib) > ncmin) {
				path[level].bp_sib_bh = bh;
				path[level].bp_op = nilfs_btree_borrow_right;
stats->bs_nblocks++;
				goto out;
			} else {
				path[level].bp_sib_bh = bh;
				path[level].bp_op = nilfs_btree_concat_right;
				stats->bs_nblocks++;
				/*
				 * When merging right sibling node
				 * into the current node, pointer to
				 * the right sibling node must be
				 * terminated instead.  The adjustment
				 * below is required for that.
				 */
				dindex = pindex + 1;
				/* continue; */
			}
		} else {
			/* no siblings */
			/* the only child of the root node */
			WARN_ON(level != nilfs_btree_height(btree) - 2);
			/* Shrink only if what remains fits into the root. */
			if (nilfs_btree_node_get_nchildren(node) - 1 <=
			    NILFS_BTREE_ROOT_NCHILDREN_MAX) {
				path[level].bp_op = nilfs_btree_shrink;
				stats->bs_nblocks += 2;
				level++;
				path[level].bp_op = nilfs_btree_nop;
				goto shrink_root_child;
			} else {
				path[level].bp_op = nilfs_btree_do_delete;
				stats->bs_nblocks++;
				goto out;
			}
		}
	}

	/* child of the root node is deleted */
	path[level].bp_op = nilfs_btree_do_delete;
	stats->bs_nblocks++;

shrink_root_child:
	/* Prepare the end of the root's pointer at slot dindex. */
	node = nilfs_btree_get_root(btree);
	path[level].bp_oldreq.bpr_ptr =
		nilfs_btree_node_get_ptr(node, dindex,
					 NILFS_BTREE_ROOT_NCHILDREN_MAX);

	ret = nilfs_bmap_prepare_end_ptr(btree, &path[level].bp_oldreq, dat);
	if (ret < 0)
		goto err_out_child_node;

	/* success */
 out:
	*levelp = level;
	return ret;

	/* error */
 err_out_curr_node:
	nilfs_bmap_abort_end_ptr(btree, &path[level].bp_oldreq, dat);
 err_out_child_node:
	/* Release the sibling buffer and pointer prepared at each lower level. */
	for (level--; level >= NILFS_BTREE_LEVEL_NODE_MIN; level--) {
		brelse(path[level].bp_sib_bh);
		nilfs_bmap_abort_end_ptr(btree, &path[level].bp_oldreq, dat);
	}
	*levelp = level;
	stats->bs_nblocks = 0;
	return ret;
}

/*
 * nilfs_btree_commit_delete - run the operations planned by prepare_delete
 * @btree:    bmap the b-tree belongs to
 * @path:     path filled in by nilfs_btree_prepare_delete()
 * @maxlevel: highest level with a planned operation
 * @dat:      inode passed to nilfs_bmap_commit_end_ptr()
 *
 * For every level up to @maxlevel the prepared pointer termination is
 * committed and the planned operation is called with NULL key and pointer
 * arguments; finally the bmap is marked dirty if it is not already.
 */
static void nilfs_btree_commit_delete(struct nilfs_bmap *btree,
				      struct nilfs_btree_path *path,
				      int maxlevel, struct inode *dat)
{
	int level;

	for (level = NILFS_BTREE_LEVEL_NODE_MIN; level <= maxlevel; level++) {
		nilfs_bmap_commit_end_ptr(btree, &path[level].bp_oldreq, dat);
		path[level].bp_op(btree, path, level, NULL, NULL);
	}

	if (!nilfs_bmap_dirty(btree))
		nilfs_bmap_set_dirty(btree);
}

/*
 * nilfs_btree_delete - delete the entry for @key from the b-tree
 *
 * Return: the non-negative result of the prepare phase on success, -ENOMEM
 * if no path could be allocated, or a negative error code from the lookup
 * or the prepare phase.
 */
static int nilfs_btree_delete(struct nilfs_bmap *btree, __u64 key)

{
	struct nilfs_btree_path
*path;
	struct nilfs_bmap_stats stats;
	struct inode *dat;
	int level, ret;

	path = nilfs_btree_alloc_path();
	if (path == NULL)
		return -ENOMEM;

	ret = nilfs_btree_do_lookup(btree, path, key, NULL,
				    NILFS_BTREE_LEVEL_NODE_MIN, 0);
	if (ret < 0)
		goto out;


	dat = NILFS_BMAP_USE_VBN(btree) ? nilfs_bmap_get_dat(btree) : NULL;

	ret = nilfs_btree_prepare_delete(btree, path, &level, &stats, dat);
	if (ret < 0)
		goto out;
	nilfs_btree_commit_delete(btree, path, level, dat);
	nilfs_inode_sub_blocks(btree->b_inode, stats.bs_nblocks);

out:
	nilfs_btree_free_path(path);
	return ret;
}

/*
 * nilfs_btree_seek_key - find the first key at or after @start
 * @btree: bmap the b-tree belongs to
 * @start: key to start from
 * @keyp:  out: @start itself if it exists, otherwise the key produced by
 *         nilfs_btree_get_next_key()
 *
 * Return: 0 if @start was found, the result of nilfs_btree_get_next_key()
 * if the lookup reported -ENOENT, -ENOMEM if no path could be allocated, or
 * any other lookup error unchanged.
 */
static int nilfs_btree_seek_key(const struct nilfs_bmap *btree, __u64 start,
				__u64 *keyp)
{
	struct nilfs_btree_path *path;
	const int minlevel = NILFS_BTREE_LEVEL_NODE_MIN;
	int ret;

	path = nilfs_btree_alloc_path();
	if (!path)
		return -ENOMEM;

	ret = nilfs_btree_do_lookup(btree, path, start, NULL, minlevel, 0);
	if (!ret)
		*keyp = start;
	else if (ret == -ENOENT)
		ret = nilfs_btree_get_next_key(btree, path, minlevel, keyp);

	nilfs_btree_free_path(path);
	return ret;
}

/*
 * nilfs_btree_last_key - look up the last key of the b-tree
 * @btree: bmap the b-tree belongs to
 * @keyp:  out: filled in by nilfs_btree_do_lookup_last()
 *
 * Return: the result of nilfs_btree_do_lookup_last(), or -ENOMEM if no path
 * could be allocated.
 */
static int nilfs_btree_last_key(const struct nilfs_bmap *btree, __u64 *keyp)
{
	struct nilfs_btree_path *path;
	int ret;

	path = nilfs_btree_alloc_path();
	if (path == NULL)
		return -ENOMEM;

	ret = nilfs_btree_do_lookup_last(btree, path, keyp, NULL);

	nilfs_btree_free_path(path);

	return ret;
}

/*
 * nilfs_btree_check_delete - test whether @key is the largest key and the
 * next-largest key is below NILFS_BMAP_LARGE_LOW
 * @btree: bmap the b-tree belongs to
 * @key:   key about to be deleted
 *
 * Only trees of height 2, or of height 3 whose root has a single child, are
 * examined; every other shape and an empty root yield 0.
 *
 * Return: 1 if both conditions hold, 0 if not, or a negative error code if
 * the child block cannot be read.
 */
static int nilfs_btree_check_delete(struct nilfs_bmap *btree, __u64 key)
{
	struct buffer_head *bh;
	struct nilfs_btree_node *root, *node;
	__u64 maxkey, nextmaxkey;
	__u64 ptr;
	int nchildren, ret;

	root = nilfs_btree_get_root(btree);
	nchildren = nilfs_btree_node_get_nchildren(root);
	if (unlikely(nchildren == 0))
		return 0;

	switch (nilfs_btree_height(btree)) {
	case 2:
		/* The root itself holds the keys to examine. */
		bh = NULL;
		node = root;
		break;
	case 3:
		if (nchildren > 1)
			return 0;
		/* Examine the root's only child instead. */
		ptr = nilfs_btree_node_get_ptr(root, nchildren - 1,
					       NILFS_BTREE_ROOT_NCHILDREN_MAX);
		ret = nilfs_btree_get_block(btree, ptr, &bh);
		if (ret < 0)
			return ret;
		node = (struct nilfs_btree_node *)bh->b_data;
		nchildren = nilfs_btree_node_get_nchildren(node);
break;
	default:
		return 0;
	}

	maxkey = nilfs_btree_node_get_key(node, nchildren - 1);
	/* With a single entry there is no second-largest key; 0 is used. */
	nextmaxkey = (nchildren > 1) ?
		nilfs_btree_node_get_key(node, nchildren - 2) : 0;
	/* bh is NULL when the root was examined. */
	brelse(bh);

	return (maxkey == key) && (nextmaxkey < NILFS_BMAP_LARGE_LOW);
}

/*
 * nilfs_btree_gather_data - copy keys and pointers out of the lowest node
 * @btree:  bmap the b-tree belongs to
 * @keys:   out: array receiving up to @nitems keys in CPU byte order
 * @ptrs:   out: array receiving up to @nitems pointers in CPU byte order
 * @nitems: capacity of @keys and @ptrs
 *
 * The entries are read from the root for a tree of height 2 and from the
 * child addressed by the root's last pointer for a tree of height 3.
 *
 * Return: the number of entries copied (at most @nitems), -EINVAL for any
 * other tree height, or a negative error code if the child block cannot be
 * read.
 */
static int nilfs_btree_gather_data(struct nilfs_bmap *btree,
				   __u64 *keys, __u64 *ptrs, int nitems)
{
	struct buffer_head *bh;
	struct nilfs_btree_node *node, *root;
	__le64 *dkeys;
	__le64 *dptrs;
	__u64 ptr;
	int nchildren, ncmax, i, ret;

	root = nilfs_btree_get_root(btree);
	switch (nilfs_btree_height(btree)) {
	case 2:
		bh = NULL;
		node = root;
		ncmax = NILFS_BTREE_ROOT_NCHILDREN_MAX;
		break;
	case 3:
		nchildren = nilfs_btree_node_get_nchildren(root);
		WARN_ON(nchildren > 1);
		/* NOTE(review): nchildren == 0 would index slot -1 here. */
		ptr = nilfs_btree_node_get_ptr(root, nchildren - 1,
					       NILFS_BTREE_ROOT_NCHILDREN_MAX);
		ret = nilfs_btree_get_block(btree, ptr, &bh);
		if (ret < 0)
			return ret;
		node = (struct nilfs_btree_node *)bh->b_data;
		ncmax = nilfs_btree_nchildren_per_block(btree);
		break;
	default:
		node = NULL;
		return -EINVAL;
	}

	/* Never copy more entries than the node holds. */
	nchildren = nilfs_btree_node_get_nchildren(node);
	if (nchildren < nitems)
		nitems = nchildren;
	dkeys = nilfs_btree_node_dkeys(node);
	dptrs = nilfs_btree_node_dptrs(node, ncmax);
	for (i = 0; i < nitems; i++) {
		keys[i] = le64_to_cpu(dkeys[i]);
		ptrs[i] = le64_to_cpu(dptrs[i]);
	}

	brelse(bh);

	return nitems;
}

/*
 * nilfs_btree_prepare_convert_and_insert - reserve what the conversion needs
 * @btree: bmap being converted
 * @key:   key of the entry that will be inserted
 * @dreq:  pointer request for the data block
 * @nreq:  pointer request for a new node block, or NULL if none is needed
 * @bhp:   out: buffer of the new node block, NULL when @nreq is NULL
 * @stats: out: bs_nblocks counts the prepared blocks, 0 on failure after
 *         the data pointer has been prepared
 *
 * Return: 0 on success or a negative error code.
 */
static int
nilfs_btree_prepare_convert_and_insert(struct nilfs_bmap *btree, __u64 key,
				       union nilfs_bmap_ptr_req *dreq,
				       union nilfs_bmap_ptr_req *nreq,
				       struct buffer_head **bhp,
				       struct nilfs_bmap_stats *stats)
{
	struct buffer_head *bh;
	struct inode *dat = NULL;
	int ret;

	stats->bs_nblocks = 0;

	/* for data */
	/* cannot find near ptr */
	if (NILFS_BMAP_USE_VBN(btree)) {
		dreq->bpr_ptr = nilfs_btree_find_target_v(btree, NULL, key);
		dat = nilfs_bmap_get_dat(btree);
	}

	ret = nilfs_attach_btree_node_cache(&NILFS_BMAP_I(btree)->vfs_inode);
	if (ret < 0)
		return ret;

	ret = nilfs_bmap_prepare_alloc_ptr(btree, dreq, dat);
	if (ret < 0)
		return ret;

	*bhp = NULL;
	stats->bs_nblocks++;
	if (nreq !=
NULL) {
		/* The node block's pointer hint follows the data pointer. */
		nreq->bpr_ptr = dreq->bpr_ptr + 1;
		ret = nilfs_bmap_prepare_alloc_ptr(btree, nreq, dat);
		if (ret < 0)
			goto err_out_dreq;

		ret = nilfs_btree_get_new_block(btree, nreq->bpr_ptr, &bh);
		if (ret < 0)
			goto err_out_nreq;

		*bhp = bh;
		stats->bs_nblocks++;
	}

	/* success */
	return 0;

	/* error */
 err_out_nreq:
	nilfs_bmap_abort_alloc_ptr(btree, nreq, dat);
 err_out_dreq:
	nilfs_bmap_abort_alloc_ptr(btree, dreq, dat);
	stats->bs_nblocks = 0;
	return ret;

}

/*
 * nilfs_btree_commit_convert_and_insert - build the b-tree and insert
 * @btree: bmap being converted
 * @key:   key of the new entry
 * @ptr:   address that is cast to a struct buffer_head *
 * @keys:  keys of the existing entries
 * @ptrs:  pointers of the existing entries
 * @n:     number of entries in @keys and @ptrs
 * @dreq:  prepared pointer request for the data block
 * @nreq:  prepared pointer request for the child node, or NULL
 * @bh:    buffer of the child node; used only when @nreq is not NULL
 *
 * With @nreq the result is a level-1 child node holding the @n entries plus
 * the new one and a level-2 root with a single entry referring to it;
 * without @nreq everything is stored in a level-1 root.
 */
static void
nilfs_btree_commit_convert_and_insert(struct nilfs_bmap *btree,
				      __u64 key, __u64 ptr,
				      const __u64 *keys, const __u64 *ptrs,
				      int n,
				      union nilfs_bmap_ptr_req *dreq,
				      union nilfs_bmap_ptr_req *nreq,
				      struct buffer_head *bh)
{
	struct nilfs_btree_node *node;
	struct inode *dat;
	__u64 tmpptr;
	int ncblk;

	/* free resources */
	if (btree->b_ops->bop_clear != NULL)
		btree->b_ops->bop_clear(btree);

	/* ptr must be a pointer to a buffer head. */
	set_buffer_nilfs_volatile((struct buffer_head *)((unsigned long)ptr));

	/* convert and insert */
	dat = NILFS_BMAP_USE_VBN(btree) ?
nilfs_bmap_get_dat(btree) : NULL;
	__nilfs_btree_init(btree);
	if (nreq != NULL) {
		nilfs_bmap_commit_alloc_ptr(btree, dreq, dat);
		nilfs_bmap_commit_alloc_ptr(btree, nreq, dat);

		/* create child node at level 1 */
		node = (struct nilfs_btree_node *)bh->b_data;
		ncblk = nilfs_btree_nchildren_per_block(btree);
		nilfs_btree_node_init(node, 0, 1, n, ncblk, keys, ptrs);
		/* The new entry is placed after the n existing ones. */
		nilfs_btree_node_insert(node, n, key, dreq->bpr_ptr, ncblk);
		if (!buffer_dirty(bh))
			mark_buffer_dirty(bh);
		if (!nilfs_bmap_dirty(btree))
			nilfs_bmap_set_dirty(btree);

		brelse(bh);

		/* create root node at level 2 */
		node = nilfs_btree_get_root(btree);
		tmpptr = nreq->bpr_ptr;
		/* Single root entry: keys[0] mapped to the child's pointer. */
		nilfs_btree_node_init(node, NILFS_BTREE_NODE_ROOT, 2, 1,
				      NILFS_BTREE_ROOT_NCHILDREN_MAX,
				      &keys[0], &tmpptr);
	} else {
		nilfs_bmap_commit_alloc_ptr(btree, dreq, dat);

		/* create root node at level 1 */
		node = nilfs_btree_get_root(btree);
		nilfs_btree_node_init(node, NILFS_BTREE_NODE_ROOT, 1, n,
				      NILFS_BTREE_ROOT_NCHILDREN_MAX,
				      keys, ptrs);
		nilfs_btree_node_insert(node, n, key, dreq->bpr_ptr,
					NILFS_BTREE_ROOT_NCHILDREN_MAX);
		if (!nilfs_bmap_dirty(btree))
			nilfs_bmap_set_dirty(btree);
	}

	if (NILFS_BMAP_USE_VBN(btree))
		nilfs_bmap_set_target_v(btree, key, dreq->bpr_ptr);
}

/**
 * nilfs_btree_convert_and_insert - Convert and insert entries into a B-tree
 * @btree: NILFS B-tree structure
 * @key:   Key of the new entry to be inserted
 * @ptr:   Pointer (block number) associated with the key to be inserted
 * @keys:  Array of keys to be inserted in addition to @key
 * @ptrs:  Array of pointers associated with @keys
 * @n:     Number of keys and pointers in @keys and @ptrs
 *
 * This function is used to insert a new entry specified by @key and @ptr,
 * along with additional entries specified by @keys and @ptrs arrays, into a
 * NILFS B-tree.
 * It prepares the necessary changes by allocating the required blocks and any
 * necessary intermediate nodes. It converts configurations from other forms of
 * block mapping (the one that currently exists is direct mapping) to a B-tree.
 *
 * Return: 0 on success or a negative error code on failure.
 */
int nilfs_btree_convert_and_insert(struct nilfs_bmap *btree,
				   __u64 key, __u64 ptr,
				   const __u64 *keys, const __u64 *ptrs, int n)
{
	struct buffer_head *bh = NULL;
	union nilfs_bmap_ptr_req dreq, nreq, *di, *ni;
	struct nilfs_bmap_stats stats;
	int ret;

	if (n + 1 <= NILFS_BTREE_ROOT_NCHILDREN_MAX) {
		/* Everything fits into the root: no node block needed. */
		di = &dreq;
		ni = NULL;
	} else if ((n + 1) <= NILFS_BTREE_NODE_NCHILDREN_MAX(
				   nilfs_btree_node_size(btree))) {
		/* One child node block is needed below the root. */
		di = &dreq;
		ni = &nreq;
	} else {
		/* More entries than a single node can hold. */
		di = NULL;
		ni = NULL;
		BUG();
	}

	ret = nilfs_btree_prepare_convert_and_insert(btree, key, di, ni, &bh,
						     &stats);
	if (ret < 0)
		return ret;
	nilfs_btree_commit_convert_and_insert(btree, key, ptr, keys, ptrs, n,
					      di, ni, bh);
	nilfs_inode_add_blocks(btree->b_inode, stats.bs_nblocks);
	return 0;
}

/*
 * nilfs_btree_propagate_p - mark ancestor node buffers dirty
 *
 * Starting one level above @level, marks each non-root node buffer on the
 * path dirty and stops at the first one that is already dirty.  @bh is not
 * referenced.
 *
 * Return: always 0.
 */
static int nilfs_btree_propagate_p(struct nilfs_bmap *btree,
				   struct nilfs_btree_path *path,
				   int level,
				   struct buffer_head *bh)
{
	while ((++level < nilfs_btree_height(btree) - 1) &&
	       !buffer_dirty(path[level].bp_bh))
		mark_buffer_dirty(path[level].bp_bh);

	return 0;
}

/*
 * nilfs_btree_prepare_update_v - prepare the pointer update of one level
 * @btree: bmap the b-tree belongs to
 * @path:  lookup path
 * @level: level whose block gets a new pointer
 * @dat:   inode passed to the nilfs_dat_*_update() helpers
 *
 * The current pointer is read from the parent slot into bp_oldreq, the new
 * request starts from old + 1, and nilfs_dat_prepare_update() is called.
 * For node buffers a key change from the old to the new pointer is prepared
 * as well; if that fails the DAT update is aborted again.
 *
 * Return: 0 on success or a negative error code.
 */
static int nilfs_btree_prepare_update_v(struct nilfs_bmap *btree,
					struct nilfs_btree_path *path,
					int level, struct inode *dat)
{
	struct nilfs_btree_node *parent;
	int ncmax, ret;

	parent = nilfs_btree_get_node(btree, path, level + 1, &ncmax);
	path[level].bp_oldreq.bpr_ptr =
		nilfs_btree_node_get_ptr(parent, path[level + 1].bp_index,
					 ncmax);
	path[level].bp_newreq.bpr_ptr = path[level].bp_oldreq.bpr_ptr + 1;
	ret = nilfs_dat_prepare_update(dat, &path[level].bp_oldreq.bpr_req,
				       &path[level].bp_newreq.bpr_req);
	if (ret < 0)
		return ret;

	if (buffer_nilfs_node(path[level].bp_bh)) {
		path[level].bp_ctxt.oldkey = path[level].bp_oldreq.bpr_ptr;
		path[level].bp_ctxt.newkey = path[level].bp_newreq.bpr_ptr;
		path[level].bp_ctxt.bh = path[level].bp_bh;
		ret = nilfs_btnode_prepare_change_key(
			NILFS_BMAP_I(btree)->i_assoc_inode->i_mapping,
			&path[level].bp_ctxt);
		if (ret < 0) {
			nilfs_dat_abort_update(dat,
					       &path[level].bp_oldreq.bpr_req,
&path[level].bp_newreq.bpr_req);
			return ret;
		}
	}

	return 0;
}

/*
 * nilfs_btree_commit_update_v - commit the pointer update of one level
 * @btree: bmap the b-tree belongs to
 * @path:  path prepared by nilfs_btree_prepare_update_v()
 * @level: level whose block gets the new pointer
 * @dat:   inode passed to nilfs_dat_commit_update()
 *
 * Commits the DAT update and, for node buffers, the key change (which may
 * replace bp_bh with the buffer in the change-key context), flags the
 * buffer with set_buffer_nilfs_volatile() and stores the new pointer in the
 * parent slot.
 */
static void nilfs_btree_commit_update_v(struct nilfs_bmap *btree,
					struct nilfs_btree_path *path,
					int level, struct inode *dat)
{
	struct nilfs_btree_node *parent;
	int ncmax;

	nilfs_dat_commit_update(dat, &path[level].bp_oldreq.bpr_req,
				&path[level].bp_newreq.bpr_req,
				btree->b_ptr_type == NILFS_BMAP_PTR_VS);

	if (buffer_nilfs_node(path[level].bp_bh)) {
		nilfs_btnode_commit_change_key(
			NILFS_BMAP_I(btree)->i_assoc_inode->i_mapping,
			&path[level].bp_ctxt);
		path[level].bp_bh = path[level].bp_ctxt.bh;
	}
	set_buffer_nilfs_volatile(path[level].bp_bh);

	parent = nilfs_btree_get_node(btree, path, level + 1, &ncmax);
	nilfs_btree_node_set_ptr(parent, path[level + 1].bp_index,
				 path[level].bp_newreq.bpr_ptr, ncmax);
}

/*
 * nilfs_btree_abort_update_v - undo nilfs_btree_prepare_update_v()
 *
 * Aborts the DAT update and, for node buffers, the prepared key change.
 */
static void nilfs_btree_abort_update_v(struct nilfs_bmap *btree,
				       struct nilfs_btree_path *path,
				       int level, struct inode *dat)
{
	nilfs_dat_abort_update(dat, &path[level].bp_oldreq.bpr_req,
			       &path[level].bp_newreq.bpr_req);
	if (buffer_nilfs_node(path[level].bp_bh))
		nilfs_btnode_abort_change_key(
			NILFS_BMAP_I(btree)->i_assoc_inode->i_mapping,
			&path[level].bp_ctxt);
}

/*
 * nilfs_btree_prepare_propagate_v - prepare pointer updates along the path
 * @btree:     bmap the b-tree belongs to
 * @path:      lookup path
 * @minlevel:  level of the block being propagated
 * @maxlevelp: out: highest level for which an update was prepared
 * @dat:       inode passed to the update helpers
 *
 * The block at @minlevel is prepared unless its buffer already has the
 * nilfs_volatile flag.  Above it, every non-root node buffer that is not
 * dirty yet is prepared, stopping at the first dirty one.  On failure all
 * updates prepared so far are aborted.
 *
 * Return: 0 on success or a negative error code.
 */
static int nilfs_btree_prepare_propagate_v(struct nilfs_bmap *btree,
					   struct nilfs_btree_path *path,
					   int minlevel, int *maxlevelp,
					   struct inode *dat)
{
	int level, ret;

	level = minlevel;
	if (!buffer_nilfs_volatile(path[level].bp_bh)) {
		ret = nilfs_btree_prepare_update_v(btree, path, level, dat);
		if (ret < 0)
			return ret;
	}
	while ((++level < nilfs_btree_height(btree) - 1) &&
	       !buffer_dirty(path[level].bp_bh)) {

		WARN_ON(buffer_nilfs_volatile(path[level].bp_bh));
		ret = nilfs_btree_prepare_update_v(btree, path, level, dat);
		if (ret < 0)
			goto out;
	}

	/* success */
	*maxlevelp = level - 1;
	return 0;

	/* error */
 out:
	/* Levels above minlevel were always prepared ... */
	while (--level > minlevel)
		nilfs_btree_abort_update_v(btree, path, level, dat);
	/* ... minlevel only when its buffer was not volatile. */
	if (!buffer_nilfs_volatile(path[level].bp_bh))
		nilfs_btree_abort_update_v(btree, path, level, dat);
	return ret;
}

/*
 * nilfs_btree_commit_propagate_v - commit the updates prepared by
 * nilfs_btree_prepare_propagate_v() for levels minlevel..maxlevel
 */
static void nilfs_btree_commit_propagate_v(struct
nilfs_bmap *btree,
					   struct nilfs_btree_path *path,
					   int minlevel, int maxlevel,
					   struct buffer_head *bh,
					   struct inode *dat)
{
	int level;

	/* minlevel was prepared only if its buffer is not volatile. */
	if (!buffer_nilfs_volatile(path[minlevel].bp_bh))
		nilfs_btree_commit_update_v(btree, path, minlevel, dat);

	for (level = minlevel + 1; level <= maxlevel; level++)
		nilfs_btree_commit_update_v(btree, path, level, dat);
}

/*
 * nilfs_btree_propagate_v - propagate a dirty block when virtual block
 * numbers are in use
 * @btree: bmap the b-tree belongs to
 * @path:  lookup path leading to the block's parent
 * @level: level of the block in @bh
 * @bh:    buffer being propagated; a reference is held for the duration of
 *         the call
 *
 * Return: 0 or the non-negative result of the last helper on success, or a
 * negative error code.
 *
 * NOTE(review): when nilfs_dat_mark_dirty() fails after a successful
 * prepare step, the function leaves through "out" without calling
 * nilfs_btree_abort_update_v() for the prepared levels -- confirm whether
 * that is intended.
 */
static int nilfs_btree_propagate_v(struct nilfs_bmap *btree,
				   struct nilfs_btree_path *path,
				   int level, struct buffer_head *bh)
{
	int maxlevel = 0, ret;
	struct nilfs_btree_node *parent;
	struct inode *dat = nilfs_bmap_get_dat(btree);
	__u64 ptr;
	int ncmax;

	get_bh(bh);
	path[level].bp_bh = bh;
	ret = nilfs_btree_prepare_propagate_v(btree, path, level, &maxlevel,
					      dat);
	if (ret < 0)
		goto out;

	/* A volatile buffer was not prepared above: mark its DAT entry. */
	if (buffer_nilfs_volatile(path[level].bp_bh)) {
		parent = nilfs_btree_get_node(btree, path, level + 1, &ncmax);
		ptr = nilfs_btree_node_get_ptr(parent,
					       path[level + 1].bp_index,
					       ncmax);
		ret = nilfs_dat_mark_dirty(dat, ptr);
		if (ret < 0)
			goto out;
	}

	nilfs_btree_commit_propagate_v(btree, path, level, maxlevel, bh, dat);

 out:
	/* Drop the reference held in bp_bh (taken by get_bh() above). */
	brelse(path[level].bp_bh);
	path[level].bp_bh = NULL;
	return ret;
}

/*
 * nilfs_btree_propagate - propagate the dirty state of a block up the tree
 * @btree: bmap the b-tree belongs to
 * @bh:    dirty node or data buffer
 *
 * The key and level are taken from the node header for node buffers and
 * from nilfs_bmap_data_get_key() / NILFS_BTREE_LEVEL_DATA for data buffers.
 * After looking up the parent path the work is handed to
 * nilfs_btree_propagate_v() or nilfs_btree_propagate_p().
 *
 * Return: the result of the selected propagate helper, -ENOMEM if no path
 * could be allocated, or the lookup error.
 */
static int nilfs_btree_propagate(struct nilfs_bmap *btree,
				 struct buffer_head *bh)
{
	struct nilfs_btree_path *path;
	struct nilfs_btree_node *node;
	__u64 key;
	int level, ret;

	WARN_ON(!buffer_dirty(bh));

	path = nilfs_btree_alloc_path();
	if (path == NULL)
		return -ENOMEM;

	if (buffer_nilfs_node(bh)) {
		node = (struct nilfs_btree_node *)bh->b_data;
		key = nilfs_btree_node_get_key(node, 0);
		level = nilfs_btree_node_get_level(node);
	} else {
		key = nilfs_bmap_data_get_key(btree, bh);
		level = NILFS_BTREE_LEVEL_DATA;
	}

	ret = nilfs_btree_do_lookup(btree, path, key, NULL, level + 1, 0);
	if (ret < 0) {
		if (unlikely(ret == -ENOENT))
			nilfs_crit(btree->b_inode->i_sb,
				   "writing node/leaf block does not appear in b-tree (ino=%lu) at key=%llu, level=%d",
				   btree->b_inode->i_ino,
				   (unsigned long long)key, level);
		goto out;
	}

	ret = NILFS_BMAP_USE_VBN(btree) ?
nilfs_btree_propagate_v(btree, path, level, bh) : nilfs_btree_propagate_p(btree, path, level, bh); out: nilfs_btree_free_path(path); return ret; } static int nilfs_btree_propagate_gc(struct nilfs_bmap *btree, struct buffer_head *bh) { return nilfs_dat_mark_dirty(nilfs_bmap_get_dat(btree), bh->b_blocknr); } static void nilfs_btree_add_dirty_buffer(struct nilfs_bmap *btree, struct list_head *lists, struct buffer_head *bh) { struct list_head *head; struct buffer_head *cbh; struct nilfs_btree_node *node, *cnode; __u64 key, ckey; int level; get_bh(bh); node = (struct nilfs_btree_node *)bh->b_data; key = nilfs_btree_node_get_key(node, 0); level = nilfs_btree_node_get_level(node); if (level < NILFS_BTREE_LEVEL_NODE_MIN || level >= NILFS_BTREE_LEVEL_MAX) { dump_stack(); nilfs_warn(btree->b_inode->i_sb, "invalid btree level: %d (key=%llu, ino=%lu, blocknr=%llu)", level, (unsigned long long)key, btree->b_inode->i_ino, (unsigned long long)bh->b_blocknr); return; } list_for_each(head, &lists[level]) { cbh = list_entry(head, struct buffer_head, b_assoc_buffers); cnode = (struct nilfs_btree_node *)cbh->b_data; ckey = nilfs_btree_node_get_key(cnode, 0); if (key < ckey) break; } list_add_tail(&bh->b_assoc_buffers, head); } static void nilfs_btree_lookup_dirty_buffers(struct nilfs_bmap *btree, struct list_head *listp) { struct inode *btnc_inode = NILFS_BMAP_I(btree)->i_assoc_inode; struct address_space *btcache = btnc_inode->i_mapping; struct list_head lists[NILFS_BTREE_LEVEL_MAX]; struct folio_batch fbatch; struct buffer_head *bh, *head; pgoff_t index = 0; int level, i; for (level = NILFS_BTREE_LEVEL_NODE_MIN; level < NILFS_BTREE_LEVEL_MAX; level++) INIT_LIST_HEAD(&lists[level]); folio_batch_init(&fbatch); while (filemap_get_folios_tag(btcache, &index, (pgoff_t)-1, PAGECACHE_TAG_DIRTY, &fbatch)) { for (i = 0; i < folio_batch_count(&fbatch); i++) { bh = head = folio_buffers(fbatch.folios[i]); do { if (buffer_dirty(bh)) nilfs_btree_add_dirty_buffer(btree, lists, bh); } while ((bh = 
bh->b_this_page) != head); } folio_batch_release(&fbatch); cond_resched(); } for (level = NILFS_BTREE_LEVEL_NODE_MIN; level < NILFS_BTREE_LEVEL_MAX; level++) list_splice_tail(&lists[level], listp); } static int nilfs_btree_assign_p(struct nilfs_bmap *btree, struct nilfs_btree_path *path, int level, struct buffer_head **bh, sector_t blocknr, union nilfs_binfo *binfo) { struct nilfs_btree_node *parent; __u64 key; __u64 ptr; int ncmax, ret; parent = nilfs_btree_get_node(btree, path, level + 1, &ncmax); ptr = nilfs_btree_node_get_ptr(parent, path[level + 1].bp_index, ncmax); if (buffer_nilfs_node(*bh)) { path[level].bp_ctxt.oldkey = ptr; path[level].bp_ctxt.newkey = blocknr; path[level].bp_ctxt.bh = *bh; ret = nilfs_btnode_prepare_change_key( NILFS_BMAP_I(btree)->i_assoc_inode->i_mapping, &path[level].bp_ctxt); if (ret < 0) return ret; nilfs_btnode_commit_change_key( NILFS_BMAP_I(btree)->i_assoc_inode->i_mapping, &path[level].bp_ctxt); *bh = path[level].bp_ctxt.bh; } nilfs_btree_node_set_ptr(parent, path[level + 1].bp_index, blocknr, ncmax); key = nilfs_btree_node_get_key(parent, path[level + 1].bp_index); /* on-disk format */ binfo->bi_dat.bi_blkoff = cpu_to_le64(key); binfo->bi_dat.bi_level = level; memset(binfo->bi_dat.bi_pad, 0, sizeof(binfo->bi_dat.bi_pad)); return 0; } static int nilfs_btree_assign_v(struct nilfs_bmap *btree, struct nilfs_btree_path *path, int level, struct buffer_head **bh, sector_t blocknr, union nilfs_binfo *binfo) { struct nilfs_btree_node *parent; struct inode *dat = nilfs_bmap_get_dat(btree); __u64 key; __u64 ptr; union nilfs_bmap_ptr_req req; int ncmax, ret; parent = nilfs_btree_get_node(btree, path, level + 1, &ncmax); ptr = nilfs_btree_node_get_ptr(parent, path[level + 1].bp_index, ncmax); req.bpr_ptr = ptr; ret = nilfs_dat_prepare_start(dat, &req.bpr_req); if (ret < 0) return ret; nilfs_dat_commit_start(dat, &req.bpr_req, blocknr); key = nilfs_btree_node_get_key(parent, path[level + 1].bp_index); /* on-disk format */ 
binfo->bi_v.bi_vblocknr = cpu_to_le64(ptr); binfo->bi_v.bi_blkoff = cpu_to_le64(key); return 0; } static int nilfs_btree_assign(struct nilfs_bmap *btree, struct buffer_head **bh, sector_t blocknr, union nilfs_binfo *binfo) { struct nilfs_btree_path *path; struct nilfs_btree_node *node; __u64 key; int level, ret; path = nilfs_btree_alloc_path(); if (path == NULL) return -ENOMEM; if (buffer_nilfs_node(*bh)) { node = (struct nilfs_btree_node *)(*bh)->b_data; key = nilfs_btree_node_get_key(node, 0); level = nilfs_btree_node_get_level(node); } else { key = nilfs_bmap_data_get_key(btree, *bh); level = NILFS_BTREE_LEVEL_DATA; } ret = nilfs_btree_do_lookup(btree, path, key, NULL, level + 1, 0); if (ret < 0) { WARN_ON(ret == -ENOENT); goto out; } ret = NILFS_BMAP_USE_VBN(btree) ? nilfs_btree_assign_v(btree, path, level, bh, blocknr, binfo) : nilfs_btree_assign_p(btree, path, level, bh, blocknr, binfo); out: nilfs_btree_free_path(path); return ret; } static int nilfs_btree_assign_gc(struct nilfs_bmap *btree, struct buffer_head **bh, sector_t blocknr, union nilfs_binfo *binfo) { struct nilfs_btree_node *node; __u64 key; int ret; ret = nilfs_dat_move(nilfs_bmap_get_dat(btree), (*bh)->b_blocknr, blocknr); if (ret < 0) return ret; if (buffer_nilfs_node(*bh)) { node = (struct nilfs_btree_node *)(*bh)->b_data; key = nilfs_btree_node_get_key(node, 0); } else key = nilfs_bmap_data_get_key(btree, *bh); /* on-disk format */ binfo->bi_v.bi_vblocknr = cpu_to_le64((*bh)->b_blocknr); binfo->bi_v.bi_blkoff = cpu_to_le64(key); return 0; } static int nilfs_btree_mark(struct nilfs_bmap *btree, __u64 key, int level) { struct buffer_head *bh; struct nilfs_btree_path *path; __u64 ptr; int ret; path = nilfs_btree_alloc_path(); if (path == NULL) return -ENOMEM; ret = nilfs_btree_do_lookup(btree, path, key, &ptr, level + 1, 0); if (ret < 0) { WARN_ON(ret == -ENOENT); goto out; } ret = nilfs_btree_get_block(btree, ptr, &bh); if (ret < 0) { WARN_ON(ret == -ENOENT); goto out; } if (!buffer_dirty(bh)) 
mark_buffer_dirty(bh); brelse(bh); if (!nilfs_bmap_dirty(btree)) nilfs_bmap_set_dirty(btree); out: nilfs_btree_free_path(path); return ret; } static const struct nilfs_bmap_operations nilfs_btree_ops = { .bop_lookup = nilfs_btree_lookup, .bop_lookup_contig = nilfs_btree_lookup_contig, .bop_insert = nilfs_btree_insert, .bop_delete = nilfs_btree_delete, .bop_clear = NULL, .bop_propagate = nilfs_btree_propagate, .bop_lookup_dirty_buffers = nilfs_btree_lookup_dirty_buffers, .bop_assign = nilfs_btree_assign, .bop_mark = nilfs_btree_mark, .bop_seek_key = nilfs_btree_seek_key, .bop_last_key = nilfs_btree_last_key, .bop_check_insert = NULL, .bop_check_delete = nilfs_btree_check_delete, .bop_gather_data = nilfs_btree_gather_data, }; static const struct nilfs_bmap_operations nilfs_btree_ops_gc = { .bop_lookup = NULL, .bop_lookup_contig = NULL, .bop_insert = NULL, .bop_delete = NULL, .bop_clear = NULL, .bop_propagate = nilfs_btree_propagate_gc, .bop_lookup_dirty_buffers = nilfs_btree_lookup_dirty_buffers, .bop_assign = nilfs_btree_assign_gc, .bop_mark = NULL, .bop_seek_key = NULL, .bop_last_key = NULL, .bop_check_insert = NULL, .bop_check_delete = NULL, .bop_gather_data = NULL, }; static void __nilfs_btree_init(struct nilfs_bmap *bmap) { bmap->b_ops = &nilfs_btree_ops; bmap->b_nchildren_per_block = NILFS_BTREE_NODE_NCHILDREN_MAX(nilfs_btree_node_size(bmap)); } int nilfs_btree_init(struct nilfs_bmap *bmap) { int ret = 0; __nilfs_btree_init(bmap); if (nilfs_btree_root_broken(nilfs_btree_get_root(bmap), bmap->b_inode)) ret = -EIO; else ret = nilfs_attach_btree_node_cache( &NILFS_BMAP_I(bmap)->vfs_inode); return ret; } void nilfs_btree_init_gc(struct nilfs_bmap *bmap) { bmap->b_ops = &nilfs_btree_ops_gc; bmap->b_nchildren_per_block = NILFS_BTREE_NODE_NCHILDREN_MAX(nilfs_btree_node_size(bmap)); }
876 1 1 869 38 832 833 831 243 4 240 241 2 3 839 8 838 834 180 837 839 837 837 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 
508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 
1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 
1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 
1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 // SPDX-License-Identifier: GPL-2.0 /* * drivers/base/power/main.c - Where the driver meets power management. * * Copyright (c) 2003 Patrick Mochel * Copyright (c) 2003 Open Source Development Lab * * The driver model core calls device_pm_add() when a device is registered. * This will initialize the embedded device_pm_info object in the device * and add it to the list of power-controlled devices. sysfs entries for * controlling device power management will also be added. * * A separate list is used for keeping track of power info, because the power * domain dependencies may differ from the ancestral dependencies that the * subsystem list maintains. 
*/

/* Prefix every pr_*() and dev_*() message emitted from this file. */
#define pr_fmt(fmt) "PM: " fmt
#define dev_fmt pr_fmt

#include <linux/device.h>
#include <linux/export.h>
#include <linux/mutex.h>
#include <linux/pm.h>
#include <linux/pm_runtime.h>
#include <linux/pm-trace.h>
#include <linux/pm_wakeirq.h>
#include <linux/interrupt.h>
#include <linux/sched.h>
#include <linux/sched/debug.h>
#include <linux/async.h>
#include <linux/suspend.h>
#include <trace/events/power.h>
#include <linux/cpufreq.h>
#include <linux/devfreq.h>
#include <linux/timer.h>

#include "../base.h"
#include "power.h"

/* Signature shared by all per-device PM callbacks selected in this file. */
typedef int (*pm_callback_t)(struct device *);

/*
 * list_for_each_entry_rcu() with device_links_read_lock_held() supplied as
 * the lockdep condition.
 */
#define list_for_each_entry_rcu_locked(pos, head, member) \
	list_for_each_entry_rcu(pos, head, member, \
			device_links_read_lock_held())

/*
 * The entries in the dpm_list list are in a depth first order, simply
 * because children are guaranteed to be discovered after parents, and
 * are inserted at the back of the list on discovery.
 *
 * Since device_pm_add() may be called with a device lock held,
 * we must never try to acquire a device lock while holding
 * dpm_list_mutex.
 */

LIST_HEAD(dpm_list);
static LIST_HEAD(dpm_prepared_list);
static LIST_HEAD(dpm_suspended_list);
static LIST_HEAD(dpm_late_early_list);
static LIST_HEAD(dpm_noirq_list);

/* Mutex taken around the dpm_list manipulations in this file. */
static DEFINE_MUTEX(dpm_list_mtx);
/* Transition recorded by the phase drivers for use by async callbacks. */
static pm_message_t pm_transition;

/* Error recorded by a device callback during the current phase. */
static int async_error;

/*
 * pm_verb - Map a PM_EVENT_* value to a lower-case verb for log messages.
 * @event: PM event code.
 *
 * Returns a constant string; values not listed below yield
 * "(unknown PM event)".
 */
static const char *pm_verb(int event)
{
	switch (event) {
	case PM_EVENT_SUSPEND:
		return "suspend";
	case PM_EVENT_RESUME:
		return "resume";
	case PM_EVENT_FREEZE:
		return "freeze";
	case PM_EVENT_QUIESCE:
		return "quiesce";
	case PM_EVENT_HIBERNATE:
		return "hibernate";
	case PM_EVENT_THAW:
		return "thaw";
	case PM_EVENT_RESTORE:
		return "restore";
	case PM_EVENT_RECOVER:
		return "recover";
	default:
		return "(unknown PM event)";
	}
}

/**
 * device_pm_sleep_init - Initialize system suspend-related device fields.
 * @dev: Device object being initialized.
*/
void device_pm_sleep_init(struct device *dev)
{
	dev->power.is_prepared = false;
	dev->power.is_suspended = false;
	dev->power.is_noirq_suspended = false;
	dev->power.is_late_suspended = false;
	/*
	 * Leave the completion in the "done" state so that a wait on a
	 * freshly initialized device returns immediately.
	 */
	init_completion(&dev->power.completion);
	complete_all(&dev->power.completion);
	dev->power.wakeup = NULL;
	INIT_LIST_HEAD(&dev->power.entry);
}

/**
 * device_pm_lock - Lock the list of active devices used by the PM core.
 */
void device_pm_lock(void)
{
	mutex_lock(&dpm_list_mtx);
}

/**
 * device_pm_unlock - Unlock the list of active devices used by the PM core.
 */
void device_pm_unlock(void)
{
	mutex_unlock(&dpm_list_mtx);
}

/**
 * device_pm_add - Add a device to the PM core's list of active devices.
 * @dev: Device to add to the list.
 *
 * Does nothing for devices for which device_pm_not_required() is true.
 * Otherwise the device is appended to the tail of dpm_list under
 * dpm_list_mtx; a warning is logged if its parent is already marked as
 * prepared.
 */
void device_pm_add(struct device *dev)
{
	/* Skip PM setup/initialization. */
	if (device_pm_not_required(dev))
		return;

	pr_debug("Adding info for %s:%s\n",
		 dev->bus ? dev->bus->name : "No Bus", dev_name(dev));
	device_pm_check_callbacks(dev);
	mutex_lock(&dpm_list_mtx);
	if (dev->parent && dev->parent->power.is_prepared)
		dev_warn(dev, "parent %s should not be sleeping\n",
			dev_name(dev->parent));
	list_add_tail(&dev->power.entry, &dpm_list);
	dev->power.in_dpm_list = true;
	mutex_unlock(&dpm_list_mtx);
}

/**
 * device_pm_remove - Remove a device from the PM core's list of active devices.
 * @dev: Device to be removed from the list.
 *
 * Does nothing for devices for which device_pm_not_required() is true.
 */
void device_pm_remove(struct device *dev)
{
	if (device_pm_not_required(dev))
		return;

	pr_debug("Removing info for %s:%s\n",
		 dev->bus ? dev->bus->name : "No Bus", dev_name(dev));
	/* Release anyone blocked in wait_for_completion() on this device. */
	complete_all(&dev->power.completion);
	mutex_lock(&dpm_list_mtx);
	list_del_init(&dev->power.entry);
	dev->power.in_dpm_list = false;
	mutex_unlock(&dpm_list_mtx);
	device_wakeup_disable(dev);
	pm_runtime_remove(dev);
	device_pm_check_callbacks(dev);
}

/**
 * device_pm_move_before - Move device in the PM core's list of active devices.
 * @deva: Device to move in dpm_list.
 * @devb: Device @deva should come before.
*/ void device_pm_move_before(struct device *deva, struct device *devb) { pr_debug("Moving %s:%s before %s:%s\n", deva->bus ? deva->bus->name : "No Bus", dev_name(deva), devb->bus ? devb->bus->name : "No Bus", dev_name(devb)); /* Delete deva from dpm_list and reinsert before devb. */ list_move_tail(&deva->power.entry, &devb->power.entry); } /** * device_pm_move_after - Move device in the PM core's list of active devices. * @deva: Device to move in dpm_list. * @devb: Device @deva should come after. */ void device_pm_move_after(struct device *deva, struct device *devb) { pr_debug("Moving %s:%s after %s:%s\n", deva->bus ? deva->bus->name : "No Bus", dev_name(deva), devb->bus ? devb->bus->name : "No Bus", dev_name(devb)); /* Delete deva from dpm_list and reinsert after devb. */ list_move(&deva->power.entry, &devb->power.entry); } /** * device_pm_move_last - Move device to end of the PM core's list of devices. * @dev: Device to move in dpm_list. */ void device_pm_move_last(struct device *dev) { pr_debug("Moving %s:%s to end of list\n", dev->bus ? dev->bus->name : "No Bus", dev_name(dev)); list_move_tail(&dev->power.entry, &dpm_list); } static ktime_t initcall_debug_start(struct device *dev, void *cb) { if (!pm_print_times_enabled) return 0; dev_info(dev, "calling %ps @ %i, parent: %s\n", cb, task_pid_nr(current), dev->parent ? dev_name(dev->parent) : "none"); return ktime_get(); } static void initcall_debug_report(struct device *dev, ktime_t calltime, void *cb, int error) { ktime_t rettime; if (!pm_print_times_enabled) return; rettime = ktime_get(); dev_info(dev, "%ps returned %d after %Ld usecs\n", cb, error, (unsigned long long)ktime_us_delta(rettime, calltime)); } /** * dpm_wait - Wait for a PM operation to complete. * @dev: Device to wait for. * @async: If unset, wait only if the device's power.async_suspend flag is set. 
*/
static void dpm_wait(struct device *dev, bool async)
{
	/* A NULL device (e.g. no parent) means there is nothing to wait for. */
	if (!dev)
		return;

	if (async || (pm_async_enabled && dev->power.async_suspend))
		wait_for_completion(&dev->power.completion);
}

/*
 * device_for_each_child() adapter for dpm_wait(); @async_ptr points to the
 * bool passed on as the async argument.  Always returns 0 so the iteration
 * visits every child.
 */
static int dpm_wait_fn(struct device *dev, void *async_ptr)
{
	dpm_wait(dev, *((bool *)async_ptr));
	return 0;
}

/* Apply dpm_wait() to each child of @dev. */
static void dpm_wait_for_children(struct device *dev, bool async)
{
       device_for_each_child(dev, &async, dpm_wait_fn);
}

/*
 * Apply dpm_wait() to the supplier of every device link of @dev whose
 * status is not DL_STATE_DORMANT, under the device links read lock.
 */
static void dpm_wait_for_suppliers(struct device *dev, bool async)
{
	struct device_link *link;
	int idx;

	idx = device_links_read_lock();

	/*
	 * If the supplier goes away right after we've checked the link to it,
	 * we'll wait for its completion to change the state, but that's fine,
	 * because the only things that will block as a result are the SRCU
	 * callbacks freeing the link objects for the links in the list we're
	 * walking.
	 */
	list_for_each_entry_rcu_locked(link, &dev->links.suppliers, c_node)
		if (READ_ONCE(link->status) != DL_STATE_DORMANT)
			dpm_wait(link->supplier, async);

	device_links_read_unlock(idx);
}

/*
 * Wait for the parent and the suppliers of @dev.
 *
 * Returns false if device_pm_initialized() reports @dev as uninitialized
 * either before or after the waits, true otherwise.
 */
static bool dpm_wait_for_superior(struct device *dev, bool async)
{
	struct device *parent;

	/*
	 * If the device is resumed asynchronously and the parent's callback
	 * deletes both the device and the parent itself, the parent object may
	 * be freed while this function is running, so avoid that by reference
	 * counting the parent once more unless the device has been deleted
	 * already (in which case return right away).
	 */
	mutex_lock(&dpm_list_mtx);

	if (!device_pm_initialized(dev)) {
		mutex_unlock(&dpm_list_mtx);
		return false;
	}

	parent = get_device(dev->parent);

	mutex_unlock(&dpm_list_mtx);

	dpm_wait(parent, async);
	put_device(parent);

	dpm_wait_for_suppliers(dev, async);

	/*
	 * If the parent's callback has deleted the device, attempting to resume
	 * it would be invalid, so avoid doing that then.
	 */
	return device_pm_initialized(dev);
}

/*
 * Apply dpm_wait() to the consumer of every device link of @dev whose
 * status is not DL_STATE_DORMANT, under the device links read lock.
 */
static void dpm_wait_for_consumers(struct device *dev, bool async)
{
	struct device_link *link;
	int idx;

	idx = device_links_read_lock();

	/*
	 * The status of a device link can only be changed from "dormant" by a
	 * probe, but that cannot happen during system suspend/resume.  In
	 * theory it can change to "dormant" at that time, but then it is
	 * reasonable to wait for the target device anyway (eg. if it goes
	 * away, it's better to wait for it to go away completely and then
	 * continue instead of trying to continue in parallel with its
	 * unregistration).
	 */
	list_for_each_entry_rcu_locked(link, &dev->links.consumers, s_node)
		if (READ_ONCE(link->status) != DL_STATE_DORMANT)
			dpm_wait(link->consumer, async);

	device_links_read_unlock(idx);
}

/* Wait for the children of @dev first, then for its link consumers. */
static void dpm_wait_for_subordinate(struct device *dev, bool async)
{
	dpm_wait_for_children(dev, async);
	dpm_wait_for_consumers(dev, async);
}

/**
 * pm_op - Return the PM operation appropriate for given PM event.
 * @ops: PM operations to choose from.
 * @state: PM transition of the system being carried out.
 *
 * Returns NULL for events not handled below, including events whose cases
 * are compiled out by CONFIG_SUSPEND / CONFIG_HIBERNATE_CALLBACKS.
 */
static pm_callback_t pm_op(const struct dev_pm_ops *ops, pm_message_t state)
{
	switch (state.event) {
#ifdef CONFIG_SUSPEND
	case PM_EVENT_SUSPEND:
		return ops->suspend;
	case PM_EVENT_RESUME:
		return ops->resume;
#endif /* CONFIG_SUSPEND */
#ifdef CONFIG_HIBERNATE_CALLBACKS
	case PM_EVENT_FREEZE:
	case PM_EVENT_QUIESCE:
		return ops->freeze;
	case PM_EVENT_HIBERNATE:
		return ops->poweroff;
	case PM_EVENT_THAW:
	case PM_EVENT_RECOVER:
		return ops->thaw;
	case PM_EVENT_RESTORE:
		return ops->restore;
#endif /* CONFIG_HIBERNATE_CALLBACKS */
	}

	return NULL;
}

/**
 * pm_late_early_op - Return the PM operation appropriate for given PM event.
 * @ops: PM operations to choose from.
 * @state: PM transition of the system being carried out.
 *
 * Runtime PM is disabled for @dev while this function is being executed.
*/
static pm_callback_t pm_late_early_op(const struct dev_pm_ops *ops,
				      pm_message_t state)
{
	switch (state.event) {
#ifdef CONFIG_SUSPEND
	case PM_EVENT_SUSPEND:
		return ops->suspend_late;
	case PM_EVENT_RESUME:
		return ops->resume_early;
#endif /* CONFIG_SUSPEND */
#ifdef CONFIG_HIBERNATE_CALLBACKS
	case PM_EVENT_FREEZE:
	case PM_EVENT_QUIESCE:
		return ops->freeze_late;
	case PM_EVENT_HIBERNATE:
		return ops->poweroff_late;
	case PM_EVENT_THAW:
	case PM_EVENT_RECOVER:
		return ops->thaw_early;
	case PM_EVENT_RESTORE:
		return ops->restore_early;
#endif /* CONFIG_HIBERNATE_CALLBACKS */
	}

	/* Event not handled above (or its cases are compiled out). */
	return NULL;
}

/**
 * pm_noirq_op - Return the PM operation appropriate for given PM event.
 * @ops: PM operations to choose from.
 * @state: PM transition of the system being carried out.
 *
 * The driver of @dev will not receive interrupts while this function is being
 * executed.
 */
static pm_callback_t pm_noirq_op(const struct dev_pm_ops *ops, pm_message_t state)
{
	switch (state.event) {
#ifdef CONFIG_SUSPEND
	case PM_EVENT_SUSPEND:
		return ops->suspend_noirq;
	case PM_EVENT_RESUME:
		return ops->resume_noirq;
#endif /* CONFIG_SUSPEND */
#ifdef CONFIG_HIBERNATE_CALLBACKS
	case PM_EVENT_FREEZE:
	case PM_EVENT_QUIESCE:
		return ops->freeze_noirq;
	case PM_EVENT_HIBERNATE:
		return ops->poweroff_noirq;
	case PM_EVENT_THAW:
	case PM_EVENT_RECOVER:
		return ops->thaw_noirq;
	case PM_EVENT_RESTORE:
		return ops->restore_noirq;
#endif /* CONFIG_HIBERNATE_CALLBACKS */
	}

	/* Event not handled above (or its cases are compiled out). */
	return NULL;
}

/*
 * Emit a debug line naming the callback kind (@info), the transition verb
 * and the device's driver flags; ", may wakeup" is appended for sleep
 * events on devices for which device_may_wakeup() is true.
 */
static void pm_dev_dbg(struct device *dev, pm_message_t state, const char *info)
{
	dev_dbg(dev, "%s%s%s driver flags: %x\n", info, pm_verb(state.event),
		((state.event & PM_EVENT_SLEEP) && device_may_wakeup(dev)) ?
		", may wakeup" : "", dev->power.driver_flags);
}

/* Log a failed transition for @dev together with its error code. */
static void pm_dev_err(struct device *dev, pm_message_t state, const char *info,
			int error)
{
	dev_err(dev, "failed to %s%s: error %d\n", pm_verb(state.event), info,
		error);
}

/*
 * Report, via pm_pr_dbg(), how long a phase took since @starttime and
 * whether it completed or was aborted (@error non-zero).  @info may be
 * NULL.
 */
static void dpm_show_time(ktime_t starttime, pm_message_t state, int error,
			  const char *info)
{
	ktime_t calltime;
	u64 usecs64;
	int usecs;

	calltime = ktime_get();
	usecs64 = ktime_to_ns(ktime_sub(calltime, starttime));
	do_div(usecs64, NSEC_PER_USEC);
	usecs = usecs64;
	/* Never report an elapsed time of zero. */
	if (usecs == 0)
		usecs = 1;

	pm_pr_dbg("%s%s%s of devices %s after %ld.%03ld msecs\n",
		  info ?: "", info ? " " : "", pm_verb(state.event),
		  error ? "aborted" : "complete",
		  usecs / USEC_PER_MSEC, usecs % USEC_PER_MSEC);
}

/*
 * Invoke @cb on @dev, bracketed by the timing, debug and trace hooks.
 *
 * Returns 0 when @cb is NULL, otherwise the value returned by @cb.
 */
static int dpm_run_callback(pm_callback_t cb, struct device *dev,
			    pm_message_t state, const char *info)
{
	ktime_t calltime;
	int error;

	if (!cb)
		return 0;

	calltime = initcall_debug_start(dev, cb);

	pm_dev_dbg(dev, state, info);
	trace_device_pm_callback_start(dev, info, state.event);
	error = cb(dev);
	trace_device_pm_callback_end(dev, error);
	suspend_report_result(dev, cb, error);

	initcall_debug_report(dev, calltime, cb, error);

	return error;
}

#ifdef CONFIG_DPM_WATCHDOG
/* State of one armed suspend/resume watchdog. */
struct dpm_watchdog {
	struct device		*dev;	/* device being handled */
	struct task_struct	*tsk;	/* task that armed the watchdog */
	struct timer_list	timer;
	bool			fatal;	/* next expiry panics instead of warning */
};

#define DECLARE_DPM_WATCHDOG_ON_STACK(wd) \
	struct dpm_watchdog wd

/**
 * dpm_watchdog_handler - Driver suspend / resume watchdog handler.
 * @t: The timer that PM watchdog depends on.
 *
 * Called when a driver has timed out suspending or resuming.
 * There's not much we can do here to recover so panic() to
 * capture a crash-dump in pstore.
*/
static void dpm_watchdog_handler(struct timer_list *t)
{
	struct dpm_watchdog *wd = from_timer(wd, t, timer);
	struct timer_list *timer = &wd->timer;
	unsigned int time_left;

	if (wd->fatal) {
		dev_emerg(wd->dev, "**** DPM device timeout ****\n");
		show_stack(wd->tsk, NULL, KERN_EMERG);
		panic("%s %s: unrecoverable failure\n",
			dev_driver_string(wd->dev), dev_name(wd->dev));
	}

	/* First expiry: warn only, then re-arm for the remaining time. */
	time_left = CONFIG_DPM_WATCHDOG_TIMEOUT - CONFIG_DPM_WATCHDOG_WARNING_TIMEOUT;
	dev_warn(wd->dev, "**** DPM device timeout after %u seconds; %u seconds until panic ****\n",
		 CONFIG_DPM_WATCHDOG_WARNING_TIMEOUT, time_left);
	show_stack(wd->tsk, NULL, KERN_WARNING);

	wd->fatal = true;
	mod_timer(timer, jiffies + HZ * time_left);
}

/**
 * dpm_watchdog_set - Enable pm watchdog for given device.
 * @wd: Watchdog. Must be allocated on the stack.
 * @dev: Device to handle.
 */
static void dpm_watchdog_set(struct dpm_watchdog *wd, struct device *dev)
{
	struct timer_list *timer = &wd->timer;

	wd->dev = dev;
	wd->tsk = current;
	/* With equal timeouts there is no warning stage: panic on first expiry. */
	wd->fatal = CONFIG_DPM_WATCHDOG_TIMEOUT == CONFIG_DPM_WATCHDOG_WARNING_TIMEOUT;

	timer_setup_on_stack(timer, dpm_watchdog_handler, 0);
	/* use same timeout value for both suspend and resume */
	timer->expires = jiffies + HZ * CONFIG_DPM_WATCHDOG_WARNING_TIMEOUT;
	add_timer(timer);
}

/**
 * dpm_watchdog_clear - Disable suspend/resume watchdog.
 * @wd: Watchdog to disable.
 */
static void dpm_watchdog_clear(struct dpm_watchdog *wd)
{
	struct timer_list *timer = &wd->timer;

	del_timer_sync(timer);
	destroy_timer_on_stack(timer);
}
#else
/* Without CONFIG_DPM_WATCHDOG the watchdog helpers expand to nothing. */
#define DECLARE_DPM_WATCHDOG_ON_STACK(wd)
#define dpm_watchdog_set(x, y)
#define dpm_watchdog_clear(x)
#endif

/*------------------------- Resume routines -------------------------*/

/**
 * dev_pm_skip_resume - System-wide device resume optimization check.
 * @dev: Target device.
 *
 * Return:
 * - %false if the transition under way is RESTORE.
 * - Return value of dev_pm_skip_suspend() if the transition under way is THAW.
* - The logical negation of %power.must_resume otherwise (that is, when the
 *   transition under way is RESUME).
 */
bool dev_pm_skip_resume(struct device *dev)
{
	if (pm_transition.event == PM_EVENT_RESTORE)
		return false;

	if (pm_transition.event == PM_EVENT_THAW)
		return dev_pm_skip_suspend(dev);

	return !dev->power.must_resume;
}

/*
 * is_async - Check whether @dev may be handled asynchronously.
 *
 * True only if the device has power.async_suspend set, pm_async_enabled is
 * set and PM tracing is not enabled.
 */
static bool is_async(struct device *dev)
{
	return dev->power.async_suspend && pm_async_enabled
		&& !pm_trace_is_enabled();
}

/*
 * dpm_async_fn - Try to schedule @func to run asynchronously for @dev.
 *
 * Reinitializes the device's completion first.  If the device qualifies for
 * async handling, a device reference is taken and @func is scheduled; the
 * reference is dropped again here if scheduling fails.
 *
 * Returns true if @func has been scheduled (power.async_in_progress is then
 * left set), false otherwise (power.async_in_progress is cleared).
 */
static bool dpm_async_fn(struct device *dev, async_func_t func)
{
	reinit_completion(&dev->power.completion);

	if (is_async(dev)) {
		dev->power.async_in_progress = true;

		get_device(dev);

		if (async_schedule_dev_nocall(func, dev))
			return true;

		put_device(dev);
	}
	/*
	 * Because async_schedule_dev_nocall() above has returned false or it
	 * has not been called at all, func() is not running and it is safe to
	 * update the async_in_progress flag without extra synchronization.
	 */
	dev->power.async_in_progress = false;
	return false;
}

/**
 * device_resume_noirq - Execute a "noirq resume" callback for given device.
 * @dev: Device to handle.
 * @state: PM transition of the system being carried out.
 * @async: If true, the device is being resumed asynchronously.
 *
 * The driver of @dev will not receive interrupts while this function is being
 * executed.
 */
static void device_resume_noirq(struct device *dev, pm_message_t state, bool async)
{
	pm_callback_t callback = NULL;
	const char *info = NULL;
	bool skip_resume;
	int error = 0;

	TRACE_DEVICE(dev);
	TRACE_RESUME(0);

	/* Nothing to do for syscore or direct-complete devices. */
	if (dev->power.syscore || dev->power.direct_complete)
		goto Out;

	/* Only devices that went through the noirq suspend phase are handled. */
	if (!dev->power.is_noirq_suspended)
		goto Out;

	if (!dpm_wait_for_superior(dev, async))
		goto Out;

	skip_resume = dev_pm_skip_resume(dev);
	/*
	 * If the driver callback is skipped below or by the middle layer
	 * callback and device_resume_early() also skips the driver callback for
	 * this device later, it needs to appear as "suspended" to PM-runtime,
	 * so change its status accordingly.
	 *
	 * Otherwise, the device is going to be resumed, so set its PM-runtime
	 * status to "active" unless its power.set_active flag is clear, in
	 * which case it is not necessary to update its PM-runtime status.
	 */
	if (skip_resume) {
		pm_runtime_set_suspended(dev);
	} else if (dev->power.set_active) {
		pm_runtime_set_active(dev);
		dev->power.set_active = false;
	}

	/* Pick the first available ops source: PM domain, type, class, bus. */
	if (dev->pm_domain) {
		info = "noirq power domain ";
		callback = pm_noirq_op(&dev->pm_domain->ops, state);
	} else if (dev->type && dev->type->pm) {
		info = "noirq type ";
		callback = pm_noirq_op(dev->type->pm, state);
	} else if (dev->class && dev->class->pm) {
		info = "noirq class ";
		callback = pm_noirq_op(dev->class->pm, state);
	} else if (dev->bus && dev->bus->pm) {
		info = "noirq bus ";
		callback = pm_noirq_op(dev->bus->pm, state);
	}
	if (callback)
		goto Run;

	/* No subsystem-level callback: the driver callback may be skipped. */
	if (skip_resume)
		goto Skip;

	if (dev->driver && dev->driver->pm) {
		info = "noirq driver ";
		callback = pm_noirq_op(dev->driver->pm, state);
	}

Run:
	error = dpm_run_callback(callback, dev, state, info);

Skip:
	dev->power.is_noirq_suspended = false;

Out:
	/* Always signal completion, whichever path was taken. */
	complete_all(&dev->power.completion);
	TRACE_RESUME(error);

	if (error) {
		async_error = error;
		dpm_save_failed_dev(dev_name(dev));
		pm_dev_err(dev, state, async ? " async noirq" : " noirq", error);
	}
}

/*
 * Asynchronous entry point for the "noirq resume" phase: handles the device
 * passed in @data using the global pm_transition and then drops a device
 * reference (dpm_async_fn() takes one before scheduling).
 */
static void async_resume_noirq(void *data, async_cookie_t cookie)
{
	struct device *dev = data;

	device_resume_noirq(dev, pm_transition, true);
	put_device(dev);
}

/*
 * dpm_noirq_resume_devices - Run the "noirq resume" phase over dpm_noirq_list.
 *
 * Every device is moved from dpm_noirq_list to dpm_late_early_list.  Devices
 * without async handling in progress are resumed synchronously, with
 * dpm_list_mtx dropped around the call.  A failure is recorded via
 * dpm_save_failed_step() if async_error is set once all async work is done.
 */
static void dpm_noirq_resume_devices(pm_message_t state)
{
	struct device *dev;
	ktime_t starttime = ktime_get();

	trace_suspend_resume(TPS("dpm_resume_noirq"), state.event, true);

	async_error = 0;
	pm_transition = state;

	mutex_lock(&dpm_list_mtx);

	/*
	 * Trigger the resume of "async" devices upfront so they don't have to
	 * wait for the "non-async" ones they don't depend on.
	 */
	list_for_each_entry(dev, &dpm_noirq_list, power.entry)
		dpm_async_fn(dev, async_resume_noirq);

	while (!list_empty(&dpm_noirq_list)) {
		dev = to_device(dpm_noirq_list.next);
		list_move_tail(&dev->power.entry, &dpm_late_early_list);

		if (!dev->power.async_in_progress) {
			/* Hold a reference while the list mutex is dropped. */
			get_device(dev);

			mutex_unlock(&dpm_list_mtx);

			device_resume_noirq(dev, state, false);

			put_device(dev);

			mutex_lock(&dpm_list_mtx);
		}
	}
	mutex_unlock(&dpm_list_mtx);
	async_synchronize_full();
	dpm_show_time(starttime, state, 0, "noirq");
	if (async_error)
		dpm_save_failed_step(SUSPEND_RESUME_NOIRQ);

	trace_suspend_resume(TPS("dpm_resume_noirq"), state.event, false);
}

/**
 * dpm_resume_noirq - Execute "noirq resume" callbacks for all devices.
 * @state: PM transition of the system being carried out.
 *
 * Invoke the "noirq" resume callbacks for all devices in dpm_noirq_list and
 * allow device drivers' interrupt handlers to be called.
 */
void dpm_resume_noirq(pm_message_t state)
{
	dpm_noirq_resume_devices(state);

	resume_device_irqs();
	device_wakeup_disarm_wake_irqs();
}

/**
 * device_resume_early - Execute an "early resume" callback for given device.
 * @dev: Device to handle.
 * @state: PM transition of the system being carried out.
 * @async: If true, the device is being resumed asynchronously.
 *
 * Runtime PM is disabled for @dev while this function is being executed.
*/
static void device_resume_early(struct device *dev, pm_message_t state, bool async)
{
	pm_callback_t callback = NULL;
	const char *info = NULL;
	int error = 0;

	TRACE_DEVICE(dev);
	TRACE_RESUME(0);

	/* Nothing to do for syscore or direct-complete devices. */
	if (dev->power.syscore || dev->power.direct_complete)
		goto Out;

	/* Only devices that went through the late suspend phase are handled. */
	if (!dev->power.is_late_suspended)
		goto Out;

	if (!dpm_wait_for_superior(dev, async))
		goto Out;

	/* Pick the first available ops source: PM domain, type, class, bus. */
	if (dev->pm_domain) {
		info = "early power domain ";
		callback = pm_late_early_op(&dev->pm_domain->ops, state);
	} else if (dev->type && dev->type->pm) {
		info = "early type ";
		callback = pm_late_early_op(dev->type->pm, state);
	} else if (dev->class && dev->class->pm) {
		info = "early class ";
		callback = pm_late_early_op(dev->class->pm, state);
	} else if (dev->bus && dev->bus->pm) {
		info = "early bus ";
		callback = pm_late_early_op(dev->bus->pm, state);
	}
	if (callback)
		goto Run;

	/* No subsystem-level callback: the driver callback may be skipped. */
	if (dev_pm_skip_resume(dev))
		goto Skip;

	if (dev->driver && dev->driver->pm) {
		info = "early driver ";
		callback = pm_late_early_op(dev->driver->pm, state);
	}

Run:
	error = dpm_run_callback(callback, dev, state, info);

Skip:
	dev->power.is_late_suspended = false;

Out:
	TRACE_RESUME(error);

	/* Runtime PM is re-enabled on every path through this function. */
	pm_runtime_enable(dev);
	complete_all(&dev->power.completion);

	if (error) {
		async_error = error;
		dpm_save_failed_dev(dev_name(dev));
		pm_dev_err(dev, state, async ? " async early" : " early", error);
	}
}

/*
 * Asynchronous entry point for the "early resume" phase: handles the device
 * passed in @data using the global pm_transition and then drops a device
 * reference (dpm_async_fn() takes one before scheduling).
 */
static void async_resume_early(void *data, async_cookie_t cookie)
{
	struct device *dev = data;

	device_resume_early(dev, pm_transition, true);
	put_device(dev);
}

/**
 * dpm_resume_early - Execute "early resume" callbacks for all devices.
 * @state: PM transition of the system being carried out.
 *
 * Every device is moved from dpm_late_early_list to dpm_suspended_list.
 */
void dpm_resume_early(pm_message_t state)
{
	struct device *dev;
	ktime_t starttime = ktime_get();

	trace_suspend_resume(TPS("dpm_resume_early"), state.event, true);

	async_error = 0;
	pm_transition = state;

	mutex_lock(&dpm_list_mtx);

	/*
	 * Trigger the resume of "async" devices upfront so they don't have to
	 * wait for the "non-async" ones they don't depend on.
	 */
	list_for_each_entry(dev, &dpm_late_early_list, power.entry)
		dpm_async_fn(dev, async_resume_early);

	while (!list_empty(&dpm_late_early_list)) {
		dev = to_device(dpm_late_early_list.next);
		list_move_tail(&dev->power.entry, &dpm_suspended_list);

		if (!dev->power.async_in_progress) {
			/* Hold a reference while the list mutex is dropped. */
			get_device(dev);

			mutex_unlock(&dpm_list_mtx);

			device_resume_early(dev, state, false);

			put_device(dev);

			mutex_lock(&dpm_list_mtx);
		}
	}
	mutex_unlock(&dpm_list_mtx);
	async_synchronize_full();
	dpm_show_time(starttime, state, 0, "early");
	if (async_error)
		dpm_save_failed_step(SUSPEND_RESUME_EARLY);

	trace_suspend_resume(TPS("dpm_resume_early"), state.event, false);
}

/**
 * dpm_resume_start - Execute "noirq" and "early" device callbacks.
 * @state: PM transition of the system being carried out.
 */
void dpm_resume_start(pm_message_t state)
{
	dpm_resume_noirq(state);
	dpm_resume_early(state);
}
EXPORT_SYMBOL_GPL(dpm_resume_start);

/**
 * device_resume - Execute "resume" callbacks for given device.
 * @dev: Device to handle.
 * @state: PM transition of the system being carried out.
 * @async: If true, the device is being resumed asynchronously.
 */
static void device_resume(struct device *dev, pm_message_t state, bool async)
{
	pm_callback_t callback = NULL;
	const char *info = NULL;
	int error = 0;
	DECLARE_DPM_WATCHDOG_ON_STACK(wd);

	TRACE_DEVICE(dev);
	TRACE_RESUME(0);

	if (dev->power.syscore)
		goto Complete;

	if (dev->power.direct_complete) {
		/* Match the pm_runtime_disable() in device_suspend(). */
		pm_runtime_enable(dev);
		goto Complete;
	}

	if (!dpm_wait_for_superior(dev, async))
		goto Complete;

	/* The watchdog covers the locked section below. */
	dpm_watchdog_set(&wd, dev);
	device_lock(dev);

	/*
	 * This is a fib.  But we'll allow new children to be added below
	 * a resumed device, even if the device hasn't been completed yet.
	 */
	dev->power.is_prepared = false;

	if (!dev->power.is_suspended)
		goto Unlock;

	/*
	 * The first ops source present (PM domain, type, class) decides the
	 * callback; the driver's own callback is used below only if that
	 * source provides none.
	 */
	if (dev->pm_domain) {
		info = "power domain ";
		callback = pm_op(&dev->pm_domain->ops, state);
		goto Driver;
	}

	if (dev->type && dev->type->pm) {
		info = "type ";
		callback = pm_op(dev->type->pm, state);
		goto Driver;
	}

	if (dev->class && dev->class->pm) {
		info = "class ";
		callback = pm_op(dev->class->pm, state);
		goto Driver;
	}

	if (dev->bus) {
		if (dev->bus->pm) {
			info = "bus ";
			callback = pm_op(dev->bus->pm, state);
		} else if (dev->bus->resume) {
			/* Legacy bus callback: skips the driver fallback. */
			info = "legacy bus ";
			callback = dev->bus->resume;
			goto End;
		}
	}

Driver:
	if (!callback && dev->driver && dev->driver->pm) {
		info = "driver ";
		callback = pm_op(dev->driver->pm, state);
	}

End:
	error = dpm_run_callback(callback, dev, state, info);
	dev->power.is_suspended = false;

Unlock:
	device_unlock(dev);
	dpm_watchdog_clear(&wd);

Complete:
	/* Always signal completion, whichever path was taken. */
	complete_all(&dev->power.completion);

	TRACE_RESUME(error);

	if (error) {
		async_error = error;
		dpm_save_failed_dev(dev_name(dev));
		pm_dev_err(dev, state, async ? " async" : "", error);
	}
}

/*
 * Asynchronous entry point for the "resume" phase: handles the device passed
 * in @data using the global pm_transition and then drops a device reference
 * (dpm_async_fn() takes one before scheduling).
 */
static void async_resume(void *data, async_cookie_t cookie)
{
	struct device *dev = data;

	device_resume(dev, pm_transition, true);
	put_device(dev);
}

/**
 * dpm_resume - Execute "resume" callbacks for non-sysdev devices.
 * @state: PM transition of the system being carried out.
 *
 * Execute the appropriate "resume" callback for all devices whose status
 * indicates that they are suspended.
 */
void dpm_resume(pm_message_t state)
{
	struct device *dev;
	ktime_t starttime = ktime_get();

	trace_suspend_resume(TPS("dpm_resume"), state.event, true);
	might_sleep();

	pm_transition = state;
	async_error = 0;

	mutex_lock(&dpm_list_mtx);

	/*
	 * Trigger the resume of "async" devices upfront so they don't have to
	 * wait for the "non-async" ones they don't depend on.
	 */
	list_for_each_entry(dev, &dpm_suspended_list, power.entry)
		dpm_async_fn(dev, async_resume);

	while (!list_empty(&dpm_suspended_list)) {
		dev = to_device(dpm_suspended_list.next);
		list_move_tail(&dev->power.entry, &dpm_prepared_list);

		if (!dev->power.async_in_progress) {
			/* Hold a reference while the list mutex is dropped. */
			get_device(dev);

			mutex_unlock(&dpm_list_mtx);

			device_resume(dev, state, false);

			put_device(dev);

			mutex_lock(&dpm_list_mtx);
		}
	}
	mutex_unlock(&dpm_list_mtx);
	async_synchronize_full();
	dpm_show_time(starttime, state, 0, NULL);
	if (async_error)
		dpm_save_failed_step(SUSPEND_RESUME);

	cpufreq_resume();
	devfreq_resume();
	trace_suspend_resume(TPS("dpm_resume"), state.event, false);
}

/**
 * device_complete - Complete a PM transition for given device.
 * @dev: Device to handle.
 * @state: PM transition of the system being carried out.
 */
static void device_complete(struct device *dev, pm_message_t state)
{
	void (*callback)(struct device *) = NULL;
	const char *info = NULL;

	if (dev->power.syscore)
		goto out;

	device_lock(dev);

	/* Pick the first available ops source: PM domain, type, class, bus. */
	if (dev->pm_domain) {
		info = "completing power domain ";
		callback = dev->pm_domain->ops.complete;
	} else if (dev->type && dev->type->pm) {
		info = "completing type ";
		callback = dev->type->pm->complete;
	} else if (dev->class && dev->class->pm) {
		info = "completing class ";
		callback = dev->class->pm->complete;
	} else if (dev->bus && dev->bus->pm) {
		info = "completing bus ";
		callback = dev->bus->pm->complete;
	}

	/* Fall back to the driver's ->complete() if nothing was found above. */
	if (!callback && dev->driver && dev->driver->pm) {
		info = "completing driver ";
		callback = dev->driver->pm->complete;
	}

	if (callback) {
		pm_dev_dbg(dev, state, info);
		callback(dev);
	}

	device_unlock(dev);

out:
	/* Executed for syscore devices too. */
	pm_runtime_put(dev);
}

/**
 * dpm_complete - Complete a PM transition for all non-sysdev devices.
 * @state: PM transition of the system being carried out.
 *
 * Execute the ->complete() callbacks for all devices whose PM status is not
 * DPM_ON (this allows new devices to be registered).
*/ void dpm_complete(pm_message_t state) { struct list_head list; trace_suspend_resume(TPS("dpm_complete"), state.event, true); might_sleep(); INIT_LIST_HEAD(&list); mutex_lock(&dpm_list_mtx); while (!list_empty(&dpm_prepared_list)) { struct device *dev = to_device(dpm_prepared_list.prev); get_device(dev); dev->power.is_prepared = false; list_move(&dev->power.entry, &list); mutex_unlock(&dpm_list_mtx); trace_device_pm_callback_start(dev, "", state.event); device_complete(dev, state); trace_device_pm_callback_end(dev, 0); put_device(dev); mutex_lock(&dpm_list_mtx); } list_splice(&list, &dpm_list); mutex_unlock(&dpm_list_mtx); /* Allow device probing and trigger re-probing of deferred devices */ device_unblock_probing(); trace_suspend_resume(TPS("dpm_complete"), state.event, false); } /** * dpm_resume_end - Execute "resume" callbacks and complete system transition. * @state: PM transition of the system being carried out. * * Execute "resume" callbacks for all devices and complete the PM transition of * the system. */ void dpm_resume_end(pm_message_t state) { dpm_resume(state); dpm_complete(state); } EXPORT_SYMBOL_GPL(dpm_resume_end); /*------------------------- Suspend routines -------------------------*/ /** * resume_event - Return a "resume" message for given "suspend" sleep state. * @sleep_state: PM message representing a sleep state. * * Return a PM message representing the resume event corresponding to given * sleep state. 
*/
static pm_message_t resume_event(pm_message_t sleep_state)
{
	switch (sleep_state.event) {
	case PM_EVENT_SUSPEND:
		return PMSG_RESUME;
	case PM_EVENT_FREEZE:
	case PM_EVENT_QUIESCE:
		return PMSG_RECOVER;
	case PM_EVENT_HIBERNATE:
		return PMSG_RESTORE;
	default:
		/* Any other event maps to PMSG_ON. */
		return PMSG_ON;
	}
}

/*
 * dpm_superior_set_must_resume - Flag the parent and suppliers of @dev.
 *
 * Sets power.must_resume on the parent (if any) and on every supplier found
 * on @dev's supplier links.  When @set_active is true, power.set_active is
 * set on each of them as well.
 */
static void dpm_superior_set_must_resume(struct device *dev, bool set_active)
{
	struct device *parent = dev->parent;
	struct device_link *link;
	int lock_idx;

	if (parent) {
		parent->power.must_resume = true;
		if (set_active)
			parent->power.set_active = true;
	}

	lock_idx = device_links_read_lock();

	list_for_each_entry_rcu_locked(link, &dev->links.suppliers, c_node) {
		struct device *supplier = link->supplier;

		supplier->power.must_resume = true;
		if (set_active)
			supplier->power.set_active = true;
	}

	device_links_read_unlock(lock_idx);
}

/**
 * device_suspend_noirq - Execute a "noirq suspend" callback for given device.
 * @dev: Device to handle.
 * @state: PM transition of the system being carried out.
 * @async: If true, the device is being suspended asynchronously.
 *
 * The driver of @dev will not receive interrupts while this function is being
 * executed.
*/
static int device_suspend_noirq(struct device *dev, pm_message_t state, bool async)
{
	pm_callback_t callback = NULL;
	const char *info = NULL;
	int error = 0;

	TRACE_DEVICE(dev);
	TRACE_SUSPEND(0);

	dpm_wait_for_subordinate(dev, async);

	/* Bail out if an error has already been recorded in async_error. */
	if (async_error)
		goto Complete;

	if (dev->power.syscore || dev->power.direct_complete)
		goto Complete;

	/* Pick the first available ops source: PM domain, type, class, bus. */
	if (dev->pm_domain) {
		info = "noirq power domain ";
		callback = pm_noirq_op(&dev->pm_domain->ops, state);
	} else if (dev->type && dev->type->pm) {
		info = "noirq type ";
		callback = pm_noirq_op(dev->type->pm, state);
	} else if (dev->class && dev->class->pm) {
		info = "noirq class ";
		callback = pm_noirq_op(dev->class->pm, state);
	} else if (dev->bus && dev->bus->pm) {
		info = "noirq bus ";
		callback = pm_noirq_op(dev->bus->pm, state);
	}
	if (callback)
		goto Run;

	/* No subsystem-level callback: the driver callback may be skipped. */
	if (dev_pm_skip_suspend(dev))
		goto Skip;

	if (dev->driver && dev->driver->pm) {
		info = "noirq driver ";
		callback = pm_noirq_op(dev->driver->pm, state);
	}

Run:
	error = dpm_run_callback(callback, dev, state, info);
	if (error) {
		async_error = error;
		dpm_save_failed_dev(dev_name(dev));
		pm_dev_err(dev, state, async ? " async noirq" : " noirq", error);
		goto Complete;
	}

Skip:
	dev->power.is_noirq_suspended = true;

	/*
	 * Skipping the resume of devices that were in use right before the
	 * system suspend (as indicated by their PM-runtime usage counters)
	 * would be suboptimal.  Also resume them if doing that is not allowed
	 * to be skipped.
	 */
	if (atomic_read(&dev->power.usage_count) > 1 ||
	    !(dev_pm_test_driver_flags(dev, DPM_FLAG_MAY_SKIP_RESUME) &&
	      dev->power.may_skip_resume))
		dev->power.must_resume = true;

	/* Propagate must_resume (and set_active, if set) to parent and suppliers. */
	if (dev->power.must_resume) {
		dev->power.set_active = dev->power.set_active ||
			dev_pm_test_driver_flags(dev, DPM_FLAG_SMART_SUSPEND);
		dpm_superior_set_must_resume(dev, dev->power.set_active);
	}

Complete:
	/* Always signal completion, whichever path was taken. */
	complete_all(&dev->power.completion);
	TRACE_SUSPEND(error);
	return error;
}

/*
 * Asynchronous entry point for the "noirq suspend" phase: handles the device
 * passed in @data using the global pm_transition and then drops a device
 * reference (dpm_async_fn() takes one before scheduling).  The return value
 * of device_suspend_noirq() is not used here.
 */
static void async_suspend_noirq(void *data, async_cookie_t cookie)
{
	struct device *dev = data;

	device_suspend_noirq(dev, pm_transition, true);
	put_device(dev);
}

/*
 * dpm_noirq_suspend_devices - Run the "noirq suspend" phase.
 *
 * Walks dpm_late_early_list from its tail, moving each device to
 * dpm_noirq_list and suspending it either asynchronously or synchronously
 * (with dpm_list_mtx dropped around the synchronous call).  Stops at the
 * first synchronous error or once async_error is set.
 *
 * Returns 0 on success or the first error seen (synchronous error first,
 * otherwise async_error).
 */
static int dpm_noirq_suspend_devices(pm_message_t state)
{
	ktime_t starttime = ktime_get();
	int error = 0;

	trace_suspend_resume(TPS("dpm_suspend_noirq"), state.event, true);

	pm_transition = state;
	async_error = 0;

	mutex_lock(&dpm_list_mtx);

	while (!list_empty(&dpm_late_early_list)) {
		struct device *dev = to_device(dpm_late_early_list.prev);

		list_move(&dev->power.entry, &dpm_noirq_list);

		if (dpm_async_fn(dev, async_suspend_noirq))
			continue;

		/* Hold a reference while the list mutex is dropped. */
		get_device(dev);

		mutex_unlock(&dpm_list_mtx);

		error = device_suspend_noirq(dev, state, false);

		put_device(dev);

		mutex_lock(&dpm_list_mtx);

		if (error || async_error)
			break;
	}

	mutex_unlock(&dpm_list_mtx);

	async_synchronize_full();
	if (!error)
		error = async_error;

	if (error)
		dpm_save_failed_step(SUSPEND_SUSPEND_NOIRQ);

	dpm_show_time(starttime, state, error, "noirq");
	trace_suspend_resume(TPS("dpm_suspend_noirq"), state.event, false);
	return error;
}

/**
 * dpm_suspend_noirq - Execute "noirq suspend" callbacks for all devices.
 * @state: PM transition of the system being carried out.
 *
 * Prevent device drivers' interrupt handlers from being called and invoke
 * "noirq" suspend callbacks for all non-sysdev devices.
 *
 * If suspending any device fails, dpm_resume_noirq() is invoked with the
 * matching resume event before the error is returned.
*/
int dpm_suspend_noirq(pm_message_t state)
{
	int ret;

	device_wakeup_arm_wake_irqs();
	suspend_device_irqs();

	ret = dpm_noirq_suspend_devices(state);
	if (ret)
		dpm_resume_noirq(resume_event(state));

	return ret;
}

/*
 * dpm_propagate_wakeup_to_parent - Mark the parent as being on a wakeup path.
 *
 * Under the parent's power.lock, sets the parent's power.wakeup_path if @dev
 * is on a wakeup path and the parent does not have power.ignore_children set.
 * Does nothing for devices without a parent.
 */
static void dpm_propagate_wakeup_to_parent(struct device *dev)
{
	struct device *parent = dev->parent;

	if (!parent)
		return;

	spin_lock_irq(&parent->power.lock);

	if (device_wakeup_path(dev) && !parent->power.ignore_children)
		parent->power.wakeup_path = true;

	spin_unlock_irq(&parent->power.lock);
}

/**
 * device_suspend_late - Execute a "late suspend" callback for given device.
 * @dev: Device to handle.
 * @state: PM transition of the system being carried out.
 * @async: If true, the device is being suspended asynchronously.
 *
 * Runtime PM is disabled for @dev while this function is being executed.
 */
static int device_suspend_late(struct device *dev, pm_message_t state, bool async)
{
	pm_callback_t callback = NULL;
	const char *info = NULL;
	int error = 0;

	TRACE_DEVICE(dev);
	TRACE_SUSPEND(0);

	__pm_runtime_disable(dev, false);

	dpm_wait_for_subordinate(dev, async);

	/* Bail out if an error has already been recorded in async_error. */
	if (async_error)
		goto Complete;

	/* A pending wakeup is recorded in async_error as -EBUSY. */
	if (pm_wakeup_pending()) {
		async_error = -EBUSY;
		goto Complete;
	}

	if (dev->power.syscore || dev->power.direct_complete)
		goto Complete;

	/* Pick the first available ops source: PM domain, type, class, bus. */
	if (dev->pm_domain) {
		info = "late power domain ";
		callback = pm_late_early_op(&dev->pm_domain->ops, state);
	} else if (dev->type && dev->type->pm) {
		info = "late type ";
		callback = pm_late_early_op(dev->type->pm, state);
	} else if (dev->class && dev->class->pm) {
		info = "late class ";
		callback = pm_late_early_op(dev->class->pm, state);
	} else if (dev->bus && dev->bus->pm) {
		info = "late bus ";
		callback = pm_late_early_op(dev->bus->pm, state);
	}
	if (callback)
		goto Run;

	/* No subsystem-level callback: the driver callback may be skipped. */
	if (dev_pm_skip_suspend(dev))
		goto Skip;

	if (dev->driver && dev->driver->pm) {
		info = "late driver ";
		callback = pm_late_early_op(dev->driver->pm, state);
	}

Run:
	error = dpm_run_callback(callback, dev, state, info);
	if (error) {
		async_error = error;
		dpm_save_failed_dev(dev_name(dev));
		pm_dev_err(dev, state, async ? " async late" : " late", error);
		goto Complete;
	}
	/* Not reached on the Skip path. */
	dpm_propagate_wakeup_to_parent(dev);

Skip:
	dev->power.is_late_suspended = true;

Complete:
	TRACE_SUSPEND(error);
	/* Always signal completion, whichever path was taken. */
	complete_all(&dev->power.completion);
	return error;
}

/*
 * Asynchronous entry point for the "late suspend" phase: handles the device
 * passed in @data using the global pm_transition and then drops a device
 * reference (dpm_async_fn() takes one before scheduling).  The return value
 * of device_suspend_late() is not used here.
 */
static void async_suspend_late(void *data, async_cookie_t cookie)
{
	struct device *dev = data;

	device_suspend_late(dev, pm_transition, true);
	put_device(dev);
}

/**
 * dpm_suspend_late - Execute "late suspend" callbacks for all devices.
 * @state: PM transition of the system being carried out.
 *
 * Walks dpm_suspended_list from its tail, moving each device to
 * dpm_late_early_list.  On failure, dpm_resume_early() is invoked with the
 * matching resume event before the error is returned.
 */
int dpm_suspend_late(pm_message_t state)
{
	ktime_t starttime = ktime_get();
	int error = 0;

	trace_suspend_resume(TPS("dpm_suspend_late"), state.event, true);

	pm_transition = state;
	async_error = 0;

	wake_up_all_idle_cpus();

	mutex_lock(&dpm_list_mtx);

	while (!list_empty(&dpm_suspended_list)) {
		struct device *dev = to_device(dpm_suspended_list.prev);

		list_move(&dev->power.entry, &dpm_late_early_list);

		if (dpm_async_fn(dev, async_suspend_late))
			continue;

		/* Hold a reference while the list mutex is dropped. */
		get_device(dev);

		mutex_unlock(&dpm_list_mtx);

		error = device_suspend_late(dev, state, false);

		put_device(dev);

		mutex_lock(&dpm_list_mtx);

		if (error || async_error)
			break;
	}

	mutex_unlock(&dpm_list_mtx);

	async_synchronize_full();
	if (!error)
		error = async_error;

	if (error) {
		dpm_save_failed_step(SUSPEND_SUSPEND_LATE);
		dpm_resume_early(resume_event(state));
	}
	dpm_show_time(starttime, state, error, "late");
	trace_suspend_resume(TPS("dpm_suspend_late"), state.event, false);
	return error;
}

/**
 * dpm_suspend_end - Execute "late" and "noirq" device suspend callbacks.
 * @state: PM transition of the system being carried out.
*/ int dpm_suspend_end(pm_message_t state) { ktime_t starttime = ktime_get(); int error; error = dpm_suspend_late(state); if (error) goto out; error = dpm_suspend_noirq(state); if (error) dpm_resume_early(resume_event(state)); out: dpm_show_time(starttime, state, error, "end"); return error; } EXPORT_SYMBOL_GPL(dpm_suspend_end); /** * legacy_suspend - Execute a legacy (bus or class) suspend callback for device. * @dev: Device to suspend. * @state: PM transition of the system being carried out. * @cb: Suspend callback to execute. * @info: string description of caller. */ static int legacy_suspend(struct device *dev, pm_message_t state, int (*cb)(struct device *dev, pm_message_t state), const char *info) { int error; ktime_t calltime; calltime = initcall_debug_start(dev, cb); trace_device_pm_callback_start(dev, info, state.event); error = cb(dev, state); trace_device_pm_callback_end(dev, error); suspend_report_result(dev, cb, error); initcall_debug_report(dev, calltime, cb, error); return error; } static void dpm_clear_superiors_direct_complete(struct device *dev) { struct device_link *link; int idx; if (dev->parent) { spin_lock_irq(&dev->parent->power.lock); dev->parent->power.direct_complete = false; spin_unlock_irq(&dev->parent->power.lock); } idx = device_links_read_lock(); list_for_each_entry_rcu_locked(link, &dev->links.suppliers, c_node) { spin_lock_irq(&link->supplier->power.lock); link->supplier->power.direct_complete = false; spin_unlock_irq(&link->supplier->power.lock); } device_links_read_unlock(idx); } /** * device_suspend - Execute "suspend" callbacks for given device. * @dev: Device to handle. * @state: PM transition of the system being carried out. * @async: If true, the device is being suspended asynchronously. 
 *
 * Return: 0 on success or when the device is skipped, otherwise the error
 * returned by the callback that was run.
*/
static int device_suspend(struct device *dev, pm_message_t state, bool async)
{
	pm_callback_t callback = NULL;
	const char *info = NULL;
	int error = 0;
	DECLARE_DPM_WATCHDOG_ON_STACK(wd);

	TRACE_DEVICE(dev);
	TRACE_SUSPEND(0);

	dpm_wait_for_subordinate(dev, async);

	/* An error has already been recorded: give up on direct-complete too. */
	if (async_error) {
		dev->power.direct_complete = false;
		goto Complete;
	}

	/*
	 * Wait for possible runtime PM transitions of the device in progress
	 * to complete and if there's a runtime resume request pending for it,
	 * resume it before proceeding with invoking the system-wide suspend
	 * callbacks for it.
	 *
	 * If the system-wide suspend callbacks below change the configuration
	 * of the device, they must disable runtime PM for it or otherwise
	 * ensure that its runtime-resume callbacks will not be confused by that
	 * change in case they are invoked going forward.
	 */
	pm_runtime_barrier(dev);

	/* A pending wakeup is recorded in async_error as -EBUSY. */
	if (pm_wakeup_pending()) {
		dev->power.direct_complete = false;
		async_error = -EBUSY;
		goto Complete;
	}

	if (dev->power.syscore)
		goto Complete;

	/* Avoid direct_complete to let wakeup_path propagate. */
	if (device_may_wakeup(dev) || device_wakeup_path(dev))
		dev->power.direct_complete = false;

	if (dev->power.direct_complete) {
		if (pm_runtime_status_suspended(dev)) {
			pm_runtime_disable(dev);
			/* Check the status again now that runtime PM is disabled. */
			if (pm_runtime_status_suspended(dev)) {
				pm_dev_dbg(dev, state, "direct-complete ");
				goto Complete;
			}

			pm_runtime_enable(dev);
		}
		dev->power.direct_complete = false;
	}

	dev->power.may_skip_resume = true;
	dev->power.must_resume = !dev_pm_test_driver_flags(dev, DPM_FLAG_MAY_SKIP_RESUME);

	/* The watchdog covers the locked section below. */
	dpm_watchdog_set(&wd, dev);
	device_lock(dev);

	/*
	 * The first ops source present (PM domain, type, class) decides the
	 * callback; the driver's own callback is used below only if that
	 * source provides none.
	 */
	if (dev->pm_domain) {
		info = "power domain ";
		callback = pm_op(&dev->pm_domain->ops, state);
		goto Run;
	}

	if (dev->type && dev->type->pm) {
		info = "type ";
		callback = pm_op(dev->type->pm, state);
		goto Run;
	}

	if (dev->class && dev->class->pm) {
		info = "class ";
		callback = pm_op(dev->class->pm, state);
		goto Run;
	}

	if (dev->bus) {
		if (dev->bus->pm) {
			info = "bus ";
			callback = pm_op(dev->bus->pm, state);
		} else if (dev->bus->suspend) {
			/* Legacy bus callback: skips the driver fallback. */
			pm_dev_dbg(dev, state, "legacy bus ");
			error = legacy_suspend(dev, state, dev->bus->suspend,
						"legacy bus ");
			goto End;
		}
	}

Run:
	if (!callback && dev->driver && dev->driver->pm) {
		info = "driver ";
		callback = pm_op(dev->driver->pm, state);
	}

	error = dpm_run_callback(callback, dev, state, info);

End:
	if (!error) {
		dev->power.is_suspended = true;
		if (device_may_wakeup(dev))
			dev->power.wakeup_path = true;

		dpm_propagate_wakeup_to_parent(dev);
		dpm_clear_superiors_direct_complete(dev);
	}

	device_unlock(dev);
	dpm_watchdog_clear(&wd);

Complete:
	if (error) {
		async_error = error;
		dpm_save_failed_dev(dev_name(dev));
		pm_dev_err(dev, state, async ? " async" : "", error);
	}

	/* Always signal completion, whichever path was taken. */
	complete_all(&dev->power.completion);
	TRACE_SUSPEND(error);
	return error;
}

/*
 * Asynchronous entry point for the "suspend" phase: handles the device passed
 * in @data using the global pm_transition and then drops a device reference
 * (dpm_async_fn() takes one before scheduling).  The return value of
 * device_suspend() is not used here.
 */
static void async_suspend(void *data, async_cookie_t cookie)
{
	struct device *dev = data;

	device_suspend(dev, pm_transition, true);
	put_device(dev);
}

/**
 * dpm_suspend - Execute "suspend" callbacks for all non-sysdev devices.
 * @state: PM transition of the system being carried out.
 *
 * Walks dpm_prepared_list from its tail, moving each device to
 * dpm_suspended_list.
 *
 * Return: 0 on success or the first error seen (synchronous error first,
 * otherwise async_error).
*/
int dpm_suspend(pm_message_t state)
{
	ktime_t starttime = ktime_get();
	int error = 0;

	trace_suspend_resume(TPS("dpm_suspend"), state.event, true);
	might_sleep();

	devfreq_suspend();
	cpufreq_suspend();

	pm_transition = state;
	async_error = 0;

	mutex_lock(&dpm_list_mtx);

	while (!list_empty(&dpm_prepared_list)) {
		struct device *dev = to_device(dpm_prepared_list.prev);

		list_move(&dev->power.entry, &dpm_suspended_list);

		if (dpm_async_fn(dev, async_suspend))
			continue;

		/* Hold a reference while the list mutex is dropped. */
		get_device(dev);

		mutex_unlock(&dpm_list_mtx);

		error = device_suspend(dev, state, false);

		put_device(dev);

		mutex_lock(&dpm_list_mtx);

		if (error || async_error)
			break;
	}

	mutex_unlock(&dpm_list_mtx);

	async_synchronize_full();
	if (!error)
		error = async_error;

	if (error)
		dpm_save_failed_step(SUSPEND_SUSPEND);

	dpm_show_time(starttime, state, error, NULL);
	trace_suspend_resume(TPS("dpm_suspend"), state.event, false);
	return error;
}

/**
 * device_prepare - Prepare a device for system power transition.
 * @dev: Device to handle.
 * @state: PM transition of the system being carried out.
 *
 * Execute the ->prepare() callback(s) for given device.  No new children of the
 * device may be registered after this function has returned.
 *
 * Return: 0 on success, or the negative value returned by ->prepare().
 */
static int device_prepare(struct device *dev, pm_message_t state)
{
	int (*callback)(struct device *) = NULL;
	int ret = 0;

	/*
	 * If a device's parent goes into runtime suspend at the wrong time,
	 * it won't be possible to resume the device.  To prevent this we
	 * block runtime suspend here, during the prepare phase, and allow
	 * it again during the complete phase.
	 */
	pm_runtime_get_noresume(dev);

	if (dev->power.syscore)
		return 0;

	device_lock(dev);

	dev->power.wakeup_path = false;

	if (dev->power.no_pm_callbacks)
		goto unlock;

	/* Pick the first available ops source: PM domain, type, class, bus. */
	if (dev->pm_domain)
		callback = dev->pm_domain->ops.prepare;
	else if (dev->type && dev->type->pm)
		callback = dev->type->pm->prepare;
	else if (dev->class && dev->class->pm)
		callback = dev->class->pm->prepare;
	else if (dev->bus && dev->bus->pm)
		callback = dev->bus->pm->prepare;

	/* Fall back to the driver's ->prepare() if nothing was found above. */
	if (!callback && dev->driver && dev->driver->pm)
		callback = dev->driver->pm->prepare;

	if (callback)
		ret = callback(dev);

unlock:
	device_unlock(dev);

	if (ret < 0) {
		suspend_report_result(dev, callback, ret);
		/* Undo the pm_runtime_get_noresume() above on failure. */
		pm_runtime_put(dev);
		return ret;
	}
	/*
	 * A positive return value from ->prepare() means "this device appears
	 * to be runtime-suspended and its state is fine, so if it really is
	 * runtime-suspended, you can leave it in that state provided that you
	 * will do the same thing with all of its descendants".  This only
	 * applies to suspend transitions, however.
	 */
	spin_lock_irq(&dev->power.lock);
	dev->power.direct_complete = state.event == PM_EVENT_SUSPEND &&
		(ret > 0 || dev->power.no_pm_callbacks) &&
		!dev_pm_test_driver_flags(dev, DPM_FLAG_NO_DIRECT_COMPLETE);
	spin_unlock_irq(&dev->power.lock);
	return 0;
}

/**
 * dpm_prepare - Prepare all non-sysdev devices for a system PM transition.
 * @state: PM transition of the system being carried out.
 *
 * Execute the ->prepare() callback(s) for all devices.
 *
 * Return: 0 on success, or the first error from device_prepare() other than
 * -EAGAIN.
 */
int dpm_prepare(pm_message_t state)
{
	int error = 0;

	trace_suspend_resume(TPS("dpm_prepare"), state.event, true);
	might_sleep();

	/*
	 * Give a chance for the known devices to complete their probes, before
	 * disabling probing of devices. This sync point is important at least
	 * at boot time + hibernation restore.
	 */
	wait_for_device_probe();
	/*
	 * It is unsafe if probing of devices will happen during suspend or
	 * hibernation and system behavior will be unpredictable in this case.
	 * So, let's prohibit device's probing here and defer their probes
	 * instead. The normal behavior will be restored in dpm_complete().
	 */
	device_block_probing();

	mutex_lock(&dpm_list_mtx);
	while (!list_empty(&dpm_list) && !error) {
		struct device *dev = to_device(dpm_list.next);

		/* Hold a reference while the list mutex is dropped. */
		get_device(dev);

		mutex_unlock(&dpm_list_mtx);

		trace_device_pm_callback_start(dev, "", state.event);
		error = device_prepare(dev, state);
		trace_device_pm_callback_end(dev, error);

		mutex_lock(&dpm_list_mtx);

		if (!error) {
			dev->power.is_prepared = true;
			if (!list_empty(&dev->power.entry))
				list_move_tail(&dev->power.entry, &dpm_prepared_list);
		} else if (error == -EAGAIN) {
			/* -EAGAIN is not fatal: clear it and keep looping. */
			error = 0;
		} else {
			dev_info(dev, "not prepared for power transition: code %d\n",
				 error);
		}

		/* The reference is dropped with dpm_list_mtx released. */
		mutex_unlock(&dpm_list_mtx);

		put_device(dev);

		mutex_lock(&dpm_list_mtx);
	}
	mutex_unlock(&dpm_list_mtx);
	trace_suspend_resume(TPS("dpm_prepare"), state.event, false);
	return error;
}

/**
 * dpm_suspend_start - Prepare devices for PM transition and suspend them.
 * @state: PM transition of the system being carried out.
 *
 * Prepare all non-sysdev devices for system PM transition and execute "suspend"
 * callbacks for them.
 */
int dpm_suspend_start(pm_message_t state)
{
	ktime_t starttime = ktime_get();
	int error;

	error = dpm_prepare(state);
	if (error)
		dpm_save_failed_step(SUSPEND_PREPARE);
	else
		error = dpm_suspend(state);

	dpm_show_time(starttime, state, error, "start");
	return error;
}
EXPORT_SYMBOL_GPL(dpm_suspend_start);

/*
 * __suspend_report_result - Log a non-zero result of a PM callback.
 * @function: Name printed before "()" in the message.
 * @dev: Device the message is logged for.
 * @fn: Callback address, printed with %ps.
 * @ret: Callback return value; nothing is logged when it is 0.
 */
void __suspend_report_result(const char *function, struct device *dev, void *fn, int ret)
{
	if (ret)
		dev_err(dev, "%s(): %ps returns %d\n", function, fn, ret);
}
EXPORT_SYMBOL_GPL(__suspend_report_result);

/**
 * device_pm_wait_for_dev - Wait for suspend/resume of a device to complete.
 * @subordinate: Device that needs to wait for @dev.
 * @dev: Device to wait for.
 *
 * Return: the current value of async_error.
 */
int device_pm_wait_for_dev(struct device *subordinate, struct device *dev)
{
	dpm_wait(dev, subordinate->power.async_suspend);
	return async_error;
}
EXPORT_SYMBOL_GPL(device_pm_wait_for_dev);

/**
 * dpm_for_each_dev - device iterator.
* @data: data for the callback. * @fn: function to be called for each device. * * Iterate over devices in dpm_list, and call @fn for each device, * passing it @data. */ void dpm_for_each_dev(void *data, void (*fn)(struct device *, void *)) { struct device *dev; if (!fn) return; device_pm_lock(); list_for_each_entry(dev, &dpm_list, power.entry) fn(dev, data); device_pm_unlock(); } EXPORT_SYMBOL_GPL(dpm_for_each_dev); static bool pm_ops_is_empty(const struct dev_pm_ops *ops) { if (!ops) return true; return !ops->prepare && !ops->suspend && !ops->suspend_late && !ops->suspend_noirq && !ops->resume_noirq && !ops->resume_early && !ops->resume && !ops->complete; } void device_pm_check_callbacks(struct device *dev) { unsigned long flags; spin_lock_irqsave(&dev->power.lock, flags); dev->power.no_pm_callbacks = (!dev->bus || (pm_ops_is_empty(dev->bus->pm) && !dev->bus->suspend && !dev->bus->resume)) && (!dev->class || pm_ops_is_empty(dev->class->pm)) && (!dev->type || pm_ops_is_empty(dev->type->pm)) && (!dev->pm_domain || pm_ops_is_empty(&dev->pm_domain->ops)) && (!dev->driver || (pm_ops_is_empty(dev->driver->pm) && !dev->driver->suspend && !dev->driver->resume)); spin_unlock_irqrestore(&dev->power.lock, flags); } bool dev_pm_skip_suspend(struct device *dev) { return dev_pm_test_driver_flags(dev, DPM_FLAG_SMART_SUSPEND) && pm_runtime_status_suspended(dev); }
937 936 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_BSEARCH_H
#define _LINUX_BSEARCH_H

#include <linux/types.h>

/*
 * __inline_bsearch - Binary search over an array of fixed-size elements.
 * @key: Item being searched for; passed as the first argument of @cmp.
 * @base: Start of the array.
 * @num: Number of elements in the array.
 * @size: Size of each element in bytes.
 * @cmp: Comparison function, called as cmp(key, element).
 *
 * The search continues in the upper half when @cmp returns a positive value
 * and in the lower half when it returns a negative one.
 *
 * Return: pointer to an element for which @cmp returns 0, or NULL if there
 * is none.
 */
static __always_inline
void *__inline_bsearch(const void *key, const void *base, size_t num, size_t size, cmp_func_t cmp)
{
	const char *lo = base;

	for (; num > 0; num >>= 1) {
		const char *mid = lo + (num >> 1) * size;
		int order = cmp(key, mid);

		if (order == 0)
			return (void *)mid;

		if (order > 0) {
			/* Continue above the probed element, which is dropped. */
			lo = mid + size;
			num--;
		}
	}

	return NULL;
}

extern void *bsearch(const void *key, const void *base, size_t num, size_t size, cmp_func_t cmp);

#endif /* _LINUX_BSEARCH_H */
2023 2023 2017 2025 2023 2023 2020 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 
519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 
1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 
1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 
1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 
2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 
2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 
3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 
3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 
3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 
4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 
4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 
5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 5321 5322 5323 5324 5325 5326 5327 5328 5329 5330 5331 5332 5333 5334 5335 5336 5337 5338 5339 5340 5341 5342 5343 5344 5345 5346 5347 5348 5349 5350 5351 5352 5353 5354 5355 5356 5357 5358 5359 5360 5361 5362 5363 5364 5365 5366 5367 5368 5369 5370 5371 5372 5373 5374 5375 5376 5377 5378 5379 5380 5381 5382 5383 5384 5385 5386 5387 5388 5389 5390 5391 5392 5393 5394 5395 5396 5397 5398 5399 5400 5401 5402 5403 5404 5405 5406 5407 5408 5409 5410 5411 5412 5413 5414 
5415 5416 5417 5418 5419 5420 5421 5422 5423 5424 5425 5426 5427 5428 5429 5430 5431 5432 5433 5434 5435 5436 5437 5438 5439 5440 5441 5442 5443 5444 5445 5446 5447 5448 5449 5450 5451 5452 5453 5454 5455 5456 5457 5458 5459 5460 5461 5462 5463 5464 5465 5466 5467 5468 5469 5470 5471 5472 5473 5474 5475 5476 5477 5478 5479 5480 5481 5482 5483 5484 5485 5486 5487 5488 5489 5490 5491 5492 5493 5494 5495 5496 5497 5498 5499 5500 5501 5502 5503 5504 5505 5506 5507 5508 5509 5510 5511 5512 5513 5514 5515 5516 5517 5518 5519 5520 5521 5522 5523 5524 5525 5526 5527 5528 5529 5530 5531 5532 5533 5534 5535 5536 5537 5538 5539 5540 5541 5542 5543 5544 5545 5546 5547 5548 5549 5550 5551 5552 5553 5554 5555 5556 5557 5558 5559 5560 5561 5562 5563 5564 5565 5566 5567 5568 5569 5570 5571 5572 5573 5574 5575 5576 5577 5578 5579 5580 5581 5582 5583 5584 5585 5586 5587 5588 5589 5590 5591 5592 5593 5594 5595 5596 5597 5598 5599 5600 5601 5602 5603 5604 5605 5606 5607 5608 5609 5610 5611 5612 5613 5614 5615 5616 5617 5618 5619 5620 5621 5622 5623 5624 5625 5626 5627 5628 5629 5630 5631 5632 5633 5634 5635 5636 5637 5638 5639 5640 5641 5642 5643 5644 5645 5646 5647 5648 5649 5650 5651 5652 5653 5654 5655 5656 5657 5658 5659 5660 5661 5662 5663 5664 5665 5666 5667 5668 5669 5670 5671 5672 5673 5674 5675 5676 5677 5678 5679 5680 5681 5682 5683 5684 5685 5686 5687 5688 5689 5690 5691 5692 5693 5694 5695 5696 5697 5698 5699 5700 5701 5702 5703 5704 5705 5706 5707 5708 5709 5710 5711 5712 5713 5714 5715 5716 5717 5718 5719 5720 5721 5722 5723 5724 5725 5726 5727 5728 5729 5730 5731 5732 5733 5734 5735 5736 5737 5738 5739 5740 5741 5742 5743 5744 5745 5746 5747 5748 5749 5750 5751 5752 5753 5754 5755 5756 5757 5758 5759 5760 5761 5762 5763 5764 5765 5766 5767 5768 5769 5770 5771 5772 5773 5774 5775 5776 5777 5778 5779 5780 5781 5782 5783 5784 5785 5786 5787 5788 5789 5790 5791 5792 5793 5794 5795 5796 5797 5798 5799 5800 5801 5802 5803 5804 5805 5806 5807 5808 5809 5810 5811 5812 5813 5814 
5815 5816 5817 5818 5819 5820 5821 5822 5823 5824 5825 5826 5827 5828 5829 5830 5831 5832 5833 5834 5835 5836 5837 5838 5839 5840 5841 5842 5843 5844 5845 5846 5847 5848 5849 5850 5851 5852 5853 5854 5855 5856 5857 5858 5859 5860 5861 5862 5863 5864 5865 5866 5867 5868 5869 5870 5871 5872 5873 5874 5875 5876 5877 5878 5879 5880 5881 5882 5883 5884 5885 5886 5887 5888 5889 5890 5891 5892 5893 5894 5895 5896 5897 5898 5899 5900 5901 5902 5903 5904 5905 5906 5907 5908 5909 5910 5911 5912 5913 5914 5915 5916 5917 5918 5919 5920 5921 5922 5923 5924 5925 5926 5927 5928 5929 5930 5931 5932 5933 5934 5935 5936 5937 5938 5939 5940 5941 5942 5943 5944 5945 5946 5947 5948 5949 5950 5951 5952 5953 5954 5955 5956 5957 5958 5959 5960 5961 5962 5963 5964 5965 5966 5967 5968 5969 5970 5971 5972 5973 5974 5975 5976 5977 5978 5979 5980 5981 5982 5983 5984 5985 5986 5987 5988 5989 5990 5991 5992 5993 5994 5995 5996 5997 5998 5999 6000 6001 6002 6003 6004 6005 6006 6007 6008 6009 6010 6011 6012 6013 6014 6015 6016 6017 6018 6019 6020 6021 6022 6023 6024 6025 6026 6027 6028 6029 6030 6031 6032 6033 6034 6035 6036 6037 6038 6039 6040 6041 6042 6043 6044 6045 6046 6047 6048 6049 6050 6051 6052 6053 6054 6055 6056 6057 6058 6059 6060 6061 6062 6063 6064 6065 6066 6067 6068 6069 6070 6071 6072 6073 6074 6075 6076 6077 6078 6079 6080 6081 6082 6083 6084 6085 6086 6087 6088 6089 6090 6091 6092 6093 6094 6095 6096 6097 6098 6099 6100 6101 6102 6103 6104 6105 6106 6107 6108 6109 6110 6111 6112 6113 6114 6115 6116 6117 6118 6119 6120 6121 6122 6123 6124 6125 6126 6127 6128 6129 6130 6131 6132 6133 6134 6135 6136 6137 6138 6139 6140 6141 6142 6143 6144 6145 6146 6147 6148 6149 6150 6151 6152 6153 6154 6155 6156 6157 6158 6159 6160 6161 6162 6163 6164 6165 6166 6167 6168 6169 6170 6171 6172 6173 6174 6175 6176 6177 6178 6179 6180 6181 6182 6183 6184 6185 6186 6187 6188 6189 6190 6191 6192 6193 6194 6195 6196 6197 6198 6199 6200 6201 6202 6203 6204 6205 6206 6207 6208 6209 6210 6211 6212 6213 6214 
6215 6216 6217 6218 6219 6220 6221 6222 6223 6224 6225 6226 6227 6228 6229 6230 6231 6232 6233 6234 6235 6236 6237 6238 6239 6240 6241 6242 6243 6244 6245 6246 6247 6248 6249 6250 6251 6252 6253 6254 6255 6256 6257 6258 6259 6260 6261 6262 6263 6264 6265 6266 6267 6268 6269 6270 6271 6272 6273 6274 6275 6276 6277 6278 6279 6280 6281 6282 6283 6284 6285 6286 6287 6288 6289 6290 6291 6292 6293 6294 6295 6296 6297 6298 6299 6300 6301 6302 6303 6304 6305 6306 6307 6308 6309 6310 6311 6312 6313 6314 6315 6316 6317 6318 6319 6320 6321 6322 6323 6324 6325 6326 6327 6328 6329 6330 6331 6332 6333 6334 6335 6336 6337 6338 6339 6340 6341 6342 6343 6344 6345 6346 6347 6348 6349 6350 6351 6352 6353 6354 6355 6356 6357 6358 6359 6360 6361 6362 6363 6364 6365 6366 6367 6368 6369 6370 6371 6372 6373 6374 6375 6376 6377 6378 6379 6380 6381 6382 6383 6384 6385 6386 6387 6388 6389 6390 6391 6392 6393 6394 6395 6396 6397 6398 6399 6400 6401 6402 6403 6404 6405 6406 6407 6408 6409 6410 6411 6412 6413 6414 6415 6416 6417 6418 6419 6420 6421 6422 6423 6424 6425 6426 6427 6428 6429 6430 6431 6432 6433 6434 6435 6436 6437 6438 6439 6440 6441 6442 6443 6444 6445 6446 6447 6448 6449 6450 6451 6452 6453 6454 6455 6456 6457 6458 6459 6460 6461 6462 6463 6464 6465 6466 6467 6468 6469 6470 6471 6472 6473 6474 6475 6476 6477 6478 6479 6480 6481 6482 6483 6484 6485 6486 6487 6488 6489 6490 6491 6492 6493 6494 6495 6496 6497 6498 6499 6500 6501 6502 6503 6504 6505 6506 6507 6508 6509 6510 6511 6512 6513 6514 6515 6516 6517 6518 6519 6520 6521 6522 6523 6524 6525 6526 6527 6528 6529 6530 6531 6532 6533 6534 6535 6536 6537 6538 6539 6540 6541 6542 6543 6544 6545 6546 6547 6548 6549 6550 6551 6552 6553 6554 6555 6556 6557 6558 6559 6560 6561 6562 6563 6564 6565 6566 6567 6568 6569 6570 6571 6572 6573 6574 6575 6576 6577 6578 6579 6580 6581 6582 6583 6584 6585 6586 6587 6588 6589 6590 6591 6592 6593 6594 6595 6596 6597 6598 6599 6600 6601 6602 6603 6604 6605 6606 6607 6608 6609 6610 6611 6612 6613 6614 
6615 6616 6617 6618 6619 6620 6621 6622 6623 6624 6625 6626 6627 6628 6629 6630 6631 6632 6633 6634 6635 6636 6637 6638 6639 6640 6641 6642 6643 6644 6645 6646 6647 6648 6649 6650 6651 6652 6653 6654 6655 6656 6657 6658 6659 6660 6661 6662 6663 6664 6665 6666 6667 6668 6669 6670 6671 6672 6673 6674 6675 6676 6677 6678 6679 6680 6681 6682 6683 6684 6685 6686 6687 6688 6689 6690 6691 6692 6693 6694 6695 6696 6697 6698 6699 6700 6701 6702 6703 6704 6705 6706 6707 6708 6709 6710 6711 6712 6713 6714 6715 6716 6717 6718 6719 6720 6721 6722 6723 6724 6725 6726 6727 6728 6729 6730 6731 6732 6733 6734 6735 6736 6737 6738 6739 6740 6741 6742 6743 6744 6745 6746 6747 6748 6749 6750 6751 6752 6753 6754 6755 6756 6757 6758 6759 6760 6761 6762 6763 6764 6765 6766 6767 6768 6769 6770 6771 6772 6773 6774 6775 6776 6777 6778 6779 6780 6781 6782 6783 6784 6785 6786 6787 6788 6789 6790 6791 6792 6793 6794 6795 6796 6797 6798 6799 6800 6801 6802 6803 6804 6805 6806 6807 6808 6809 6810 6811 6812 6813 6814 6815 6816 6817 6818 6819 6820 6821 6822 6823 6824 6825 6826 6827 6828 6829 6830 6831 6832 6833 6834 6835 6836 6837 6838 6839 6840 6841 6842 6843 6844 6845 6846 6847 6848 6849 6850 6851 6852 6853 6854 6855 6856 6857 6858 6859 6860 6861 6862 6863 6864 6865 6866 6867 6868 6869 6870 6871 6872 6873 6874 6875 6876 6877 6878 6879 6880 6881 6882 6883 6884 6885 6886 6887 6888 6889 6890 6891 6892 6893 6894 6895 6896 6897 6898 6899 6900 6901 6902 6903 6904 6905 6906 6907 6908 6909 6910 6911 6912 6913 6914 6915 6916 6917 6918 6919 6920 6921 6922 6923 6924 6925 6926 6927 6928 6929 6930 6931 6932 6933 6934 6935 6936 6937 6938 6939 6940 6941 6942 6943 6944 6945 6946 6947 6948 6949 6950 6951 6952 6953 6954 6955 6956 6957 6958 6959 6960 6961 6962 6963 6964 6965 6966 6967 6968 6969 6970 6971 6972 6973 6974 6975 6976 6977 6978 6979 6980 6981 6982 6983 6984 6985 6986 6987 6988 6989 6990 6991 6992 6993 6994 6995 6996 6997 6998 6999 7000 7001 7002 7003 7004 7005 7006 7007 7008 7009 7010 7011 7012 7013 7014 
7015 7016 7017 7018 7019 7020 7021 7022 7023 7024 7025 7026 7027 7028 7029 7030 7031 7032 7033 7034 7035 7036 7037 7038 7039 7040 7041 7042 7043 7044 7045 7046 7047 7048 7049 7050 7051 7052 7053 7054 7055 7056 7057 7058 7059 7060 7061 7062 7063 7064 7065 7066 7067 7068 7069 7070 7071 7072 7073 7074 7075 7076 7077 7078 7079 7080 7081 7082 7083 7084 7085 7086 7087 7088 7089 7090 7091 7092 7093 7094 7095 7096 7097 7098 7099 7100 7101 7102 7103 7104 7105 7106 7107 7108 7109 7110 7111 7112 7113 7114 7115 7116 7117 7118 7119 7120 7121 7122 7123 7124 7125 7126 7127 7128 7129 7130 7131 7132 7133 7134 7135 7136 7137 7138 7139 7140 7141 7142 7143 7144 7145 7146 7147 7148 7149 7150 7151 7152 7153 7154 7155 7156 7157 7158 7159 7160 7161 7162 7163 7164 7165 7166 7167 7168 7169 7170 7171 7172 7173 7174 7175 7176 7177 7178 7179 7180 7181 7182 7183 7184 7185 7186 7187 7188 7189 7190 7191 7192 7193 7194 7195 7196 7197 7198 7199 7200 7201 7202 7203 7204 7205 7206 7207 7208 7209 7210 7211 7212 7213 7214 7215 7216 7217 7218 7219 7220 7221 7222 7223 7224 7225 7226 7227 7228 7229 7230 7231 7232 7233 7234 7235 7236 7237 7238 7239 7240 7241 7242 7243 7244 7245 7246 7247 7248 7249 7250 7251 7252 7253 7254 7255 7256 7257 7258 7259 7260 7261 7262 7263 7264 7265 7266 7267 7268 7269 7270 7271 7272 7273 7274 7275 7276 7277 7278 7279 7280 7281 7282 7283 7284 7285 7286 7287 7288 7289 7290 7291 7292 7293 7294 7295 7296 7297 7298 7299 7300 7301 7302 7303 7304 7305 7306 7307 7308 7309 7310 7311 7312 7313 7314 7315 7316 7317 7318 7319 7320 7321 7322 7323 7324 7325 7326 7327 7328 7329 7330 7331 7332 7333 7334 7335 7336 7337 7338 7339 7340 7341 7342 7343 7344 7345 7346 7347 7348 7349 7350 7351 7352 7353 7354 7355 7356 7357 7358 7359 7360 7361 7362 7363 7364 7365 7366 7367 7368 7369 7370 7371 7372 7373 7374 7375 7376 7377 7378 7379 7380 7381 7382 7383 7384 7385 7386 7387 7388 7389 7390 7391 7392 7393 7394 7395 7396 7397 7398 7399 7400 7401 7402 7403 7404 7405 7406 7407 7408 7409 7410 7411 7412 7413 7414 
7415 7416 7417 7418 7419 7420 7421 7422 7423 7424 7425 7426 7427 7428 7429 7430 7431 7432 7433 7434 7435 7436 7437 7438 7439 7440 7441 7442 7443 7444 7445 7446 7447 7448 7449 7450 7451 7452 7453 7454 7455 7456 7457 7458 7459 7460 7461 7462 7463 7464 7465 7466 7467 7468 7469 7470 7471 7472 7473 7474 7475 7476 7477 7478 7479 7480 7481 7482 7483 7484 7485 7486 7487 7488 7489 7490 7491 7492 7493 7494 7495 7496 7497 7498 7499 7500 7501 7502 7503 7504 7505 7506 7507 7508 7509 7510 7511 7512 7513 7514 7515 7516 7517 7518 7519 7520 7521 7522 7523 7524 7525 7526 7527 7528 7529 7530 7531 7532 7533 7534 7535 7536 7537 7538 7539 7540 7541 7542 7543 7544 7545 7546 7547 7548 7549 7550 7551 7552 7553 7554 7555 7556 7557 7558 7559 7560 7561 7562 7563 7564 7565 7566 7567 7568 7569 7570 7571 7572 7573 7574 7575 7576 7577 7578 7579 7580 7581 7582 7583 7584 7585 7586 7587 7588 7589 7590 7591 7592 7593 7594 7595 7596 7597 7598 7599 7600 7601 7602 7603 7604 7605 7606 7607 7608 7609 7610 7611 7612 7613 7614 7615 7616 7617 7618 7619 7620 7621 7622 7623 7624 7625 7626 7627 7628 7629 7630 7631 7632 7633 7634 7635 7636 7637 7638 7639 7640 7641 7642 7643 7644 7645 7646 7647 7648 7649 7650 7651 7652 7653 7654 7655 7656 7657 7658 7659 7660 7661 7662 7663 7664 7665 7666 7667 7668 7669 7670 7671 7672 7673 7674 7675 7676 7677 7678 7679 7680 7681 7682 7683 7684 7685 7686 7687 7688 7689 7690 7691 7692 7693 7694 7695 7696 7697 7698 7699 7700 7701 7702 7703 7704 7705 7706 7707 7708 7709 7710 7711 7712 7713 7714 7715 7716 7717 7718 7719 7720 7721 7722 7723 7724 7725 7726 7727 7728 7729 7730 7731 7732 7733 7734 7735 7736 7737 7738 7739 7740 7741 7742 7743 7744 7745 7746 7747 7748 7749 7750 7751 7752 7753 7754 7755 7756 7757 7758 7759 7760 7761 7762 7763 7764 7765 7766 7767 7768 7769 7770 7771 7772 7773 7774 7775 7776 7777 7778 7779 7780 7781 7782 7783 7784 7785 7786 7787 7788 7789 7790 7791 7792 7793 7794 7795 7796 7797 7798 7799 7800 7801 7802 7803 7804 7805 7806 7807 7808 7809 7810 7811 7812 7813 7814 
7815 7816 7817 7818 7819 7820 7821 7822 7823 7824 7825 7826 7827 7828 7829 7830 7831 7832 7833 7834 7835 7836 7837 7838 7839 7840 7841 7842 7843 7844 7845 7846 7847 7848 7849 7850 7851 7852 7853 7854 7855 7856 7857 7858 7859 7860 7861 7862 7863 7864 7865 7866 7867 7868 7869 7870 7871 7872 7873 7874 7875 7876 7877 7878 7879 7880 7881 7882 7883 7884 7885 7886 7887 7888 7889 7890 7891 7892 7893 7894 7895 7896 7897 7898 7899 7900 7901 7902 7903 7904 7905 7906 7907 7908 7909 7910 7911 7912 7913 7914 7915 7916 7917 7918 7919 7920 7921 7922 7923 7924 7925 7926 7927 7928 7929 7930 7931 7932 7933 7934 7935 7936 7937 7938 7939 7940 7941 7942 7943 7944 7945 7946 7947 7948 7949 7950 7951 7952 7953 7954 7955 7956 7957 7958 7959 7960 7961 7962 7963 7964 7965 7966 7967 7968 7969 7970 7971 7972 7973 7974 7975 7976 7977 7978 7979 7980 7981 7982 7983 7984 7985 7986 7987 7988 7989 7990 7991 7992 7993 7994 7995 7996 7997 7998 7999 8000 8001 8002 8003 8004 8005 8006 8007 8008 8009 8010 8011 8012 8013 8014 8015 8016 8017 8018 8019 8020 8021 8022 8023 8024 8025 8026 8027 8028 8029 8030 8031 8032 8033 8034 8035 8036 8037 8038 8039 8040 8041 8042 8043 8044 8045 8046 8047 8048 8049 8050 8051 8052 8053 8054 8055 8056 8057 8058 8059 8060 8061 8062 8063 8064 8065 8066 8067 8068 8069 8070 8071 8072 8073 8074 8075 8076 8077 8078 8079 8080 8081 8082 8083 8084 8085 8086 8087 8088 8089 8090 8091 8092 8093 8094 8095 8096 8097 8098 8099 8100 8101 8102 8103 8104 8105 8106 8107 8108 8109 8110 8111 8112 8113 8114 8115 8116 8117 8118 8119 8120 8121 8122 8123 8124 8125 8126 8127 8128 8129 8130 8131 8132 8133 8134 8135 8136 8137 8138 8139 8140 8141 8142 8143 8144 8145 8146 8147 8148 8149 8150 8151 8152 8153 8154 8155 8156 8157 8158 8159 8160 8161 8162 8163 8164 8165 8166 8167 8168 8169 8170 8171 8172 8173 8174 8175 8176 8177 8178 8179 8180 8181 8182 8183 8184 8185 8186 8187 8188 8189 8190 8191 8192 8193 8194 8195 8196 8197 8198 8199 8200 8201 8202 8203 8204 8205 8206 8207 8208 8209 8210 8211 8212 8213 8214 
8215 8216 8217 8218 8219 8220 8221 8222 8223 8224 8225 8226 8227 8228 8229 8230 8231 8232 8233 8234 8235 8236 8237 8238 8239 8240 8241 8242 8243 8244 8245 8246 8247 8248 8249 8250 8251 8252 8253 8254 8255 8256 8257 8258 8259 8260 8261 8262 8263 8264 8265 8266 8267 8268 8269 8270 8271 8272 8273 8274 8275 8276 8277 8278 8279 8280 8281 8282 8283 8284 8285 8286 8287 8288 8289 8290 8291 8292 8293 8294 8295 8296 8297 8298 8299 8300 8301 8302 8303 8304 8305 8306 8307 8308 8309 8310 8311 8312 8313 8314 8315 8316 8317 8318 8319 8320 8321 8322 8323 8324 8325 8326 8327 8328 8329 8330 8331 8332 8333 8334 8335 8336 8337 8338 8339 8340 8341 8342 8343 8344 8345 8346 8347 8348 8349 8350 8351 8352 8353 8354 8355 8356 8357 8358 8359 8360 8361 8362 8363 8364 8365 8366 8367 8368 8369 8370 8371 8372 8373 8374 8375 8376 8377 8378 8379 8380 8381 8382 8383 8384 8385 8386 8387 8388 8389 8390 8391 8392 8393 8394 8395 8396 8397 8398 8399 8400 8401 8402 8403 8404 8405 8406 8407 8408 8409 8410 8411 8412 8413 8414 8415 8416 8417 8418 8419 8420 8421 8422 8423 8424 8425 8426 8427 8428 8429 8430 8431 8432 8433 8434 8435 8436 8437 8438 8439 8440 8441 8442 8443 8444 8445 8446 8447 8448 8449 8450 8451 8452 8453 8454 8455 8456 8457 8458 8459 8460 8461 8462 8463 8464 8465 8466 8467 8468 8469 8470 8471 8472 8473 8474 8475 8476 8477 8478 8479 8480 8481 8482 8483 8484 8485 8486 8487 8488 8489 8490 8491 8492 8493 8494 8495 8496 8497 8498 8499 8500 8501 8502 8503 8504 8505 8506 8507 8508 8509 8510 8511 8512 8513 8514 8515 8516 8517 8518 8519 8520 8521 8522 8523 8524 8525 8526 8527 8528 8529 8530 8531 8532 8533 8534 8535 8536 8537 8538 8539 8540 8541 8542 8543 8544 8545 8546 8547 8548 8549 8550 8551 8552 8553 8554 8555 8556 8557 8558 8559 8560 8561 8562 8563 8564 8565 8566 8567 8568 8569 8570 8571 8572 8573 8574 8575 8576 8577 8578 8579 8580 8581 8582 8583 8584 8585 8586 8587 8588 8589 8590 8591 8592 8593 8594 8595 8596 8597 8598 8599 8600 8601 8602 8603 8604 8605 8606 8607 8608 8609 8610 8611 8612 8613 8614 
8615 8616 8617 8618 8619 8620 8621 8622 8623 8624 8625 8626 8627 8628 8629 8630 8631 8632 8633 8634 8635 8636 8637 8638 8639 8640 8641 8642 8643 8644 8645 8646 8647 8648 8649 8650 8651 8652 8653 8654 8655 8656 8657 8658 8659 8660 8661 8662 8663 8664 8665 8666 8667 8668 8669 8670 8671 8672 8673 8674 8675 8676 8677 8678 8679 8680 8681 8682 8683 8684 8685 8686 8687 8688 8689 8690 8691 8692 8693 8694 8695 8696 8697 8698 8699 8700 8701 8702 8703 8704 8705 8706 8707 8708 8709 8710 8711 8712 8713 8714 8715 8716 8717 8718 8719 8720 8721 8722 8723 8724 8725 8726 8727 8728 8729 8730 8731 8732 8733 8734 8735 8736 8737 8738 8739 8740 8741 8742 8743 8744 8745 8746 8747 8748 8749 8750 8751 8752 8753 8754 8755 8756 8757 8758 8759 8760 8761 8762 8763 8764 8765 8766 8767 8768 8769 8770 8771 8772 8773 8774 8775 8776 8777 8778 8779 8780 8781 8782 8783 8784 8785 8786 8787 8788 8789 8790 8791 8792 8793 8794 8795 8796 8797 8798 8799 8800 8801 8802 8803 8804 8805 8806 8807 8808 8809 8810 8811 8812 8813 8814 8815 8816 8817 8818 8819 8820 8821 8822 8823 8824 8825 8826 8827 8828 8829 8830 8831 8832 8833 8834 8835 8836 8837 8838 8839 8840 8841 8842 8843 8844 8845 8846 8847 8848 8849 8850 8851 8852 8853 8854 8855 8856 8857 8858 8859 8860 8861 8862 8863 8864 8865 8866 8867 8868 8869 8870 8871 8872 8873 8874 8875 8876 8877 8878 8879 8880 8881 8882 8883 8884 8885 8886 8887 8888 8889 8890 8891 8892 8893 8894 8895 8896 8897 8898 8899 8900 8901 8902 8903 8904 8905 8906 8907 8908 8909 8910 8911 8912 8913 8914 8915 8916 8917 8918 8919 8920 8921 8922 8923 8924 8925 8926 8927 8928 8929 8930 8931 8932 8933 8934 8935 8936 8937 8938 8939 8940 8941 8942 8943 8944 8945 8946 8947 8948 8949 8950 8951 8952 8953 8954 8955 8956 8957 8958 8959 8960 8961 8962 8963 8964 8965 8966 8967 8968 8969 8970 8971 8972 8973 8974 8975 8976 8977 8978 8979 8980 8981 8982 8983 8984 8985 8986 8987 8988 8989 8990 8991 8992 8993 8994 8995 8996 8997 8998 8999 9000 9001 9002 9003 9004 9005 9006 9007 9008 9009 9010 9011 9012 9013 9014 
9015 9016 9017 9018 9019 9020 9021 9022 9023 9024 9025 9026 9027 9028 9029 9030 9031 9032 9033 9034 9035 9036 9037 9038 9039 9040 9041 9042 9043 9044 9045 9046 9047 9048 9049 9050 9051 9052 9053 9054 9055 9056 9057 9058 9059 9060 9061 9062 9063 9064 9065 9066 9067 9068 9069 9070 9071 9072 9073 9074 9075 9076 9077 9078 9079 9080 9081 9082 9083 9084 9085 9086 9087 9088 9089 9090 9091 9092 9093 9094 9095 9096 9097 9098 9099 9100 9101 9102 9103 9104 9105 9106 9107 9108 9109 9110 9111 9112 9113 9114 9115 9116 9117 9118 9119 9120 9121 9122 9123 9124 9125 9126 9127 9128 9129 9130 9131 9132 9133 9134 9135 9136 9137 9138 9139 9140 9141 9142 9143 9144 9145 9146 9147 9148 9149 9150 9151 9152 9153 9154 9155 9156 9157 9158 9159 9160 9161 9162 9163 9164 9165 9166 9167 9168 9169 9170 9171 9172 9173 9174 9175 9176 9177 9178 9179 9180 9181 9182 9183 9184 9185 9186 9187 9188 9189 9190 9191 9192 9193 9194 9195 9196 9197 9198 9199 9200 9201 9202 9203 9204 9205 9206 9207 9208 9209 9210 9211 9212 9213 9214 9215 9216 9217 9218 9219 9220 9221 9222 9223 9224 9225 9226 9227 9228 9229 9230 9231 9232 9233 9234 9235 9236 9237 9238 9239 9240 9241 9242 9243 9244 9245 9246 9247 9248 9249 9250 9251 9252 9253 9254 9255 9256 9257 9258 9259 9260 9261 9262 9263 9264 9265 9266 9267 9268 9269 9270 9271 9272 9273 9274 9275 9276 9277 9278 9279 9280 9281 9282 9283 9284 9285 9286 9287 9288 9289 9290 9291 9292 9293 9294 9295 9296 9297 9298 9299 9300 9301 9302 9303 9304 9305 9306 9307 9308 9309 9310 9311 9312 9313 9314 9315 9316 9317 9318 9319 9320 9321 9322 9323 9324 9325 9326 9327 9328 9329 9330 9331 9332 9333 9334 9335 9336 9337 9338 9339 9340 9341 9342 9343 9344 9345 9346 9347 9348 9349 9350 9351 9352 9353 9354 9355 9356 9357 9358 9359 9360 9361 9362 9363 9364 9365 9366 9367 9368 9369 9370 9371 9372 9373 9374 9375 9376 9377 9378 9379 9380 9381 9382 9383 9384 9385 9386 9387 9388 9389 9390 9391 9392 9393 9394 9395 9396 9397 9398 9399 9400 9401 9402 9403 9404 9405 9406 9407 9408 9409 9410 9411 9412 9413 9414 
9415 9416 9417 9418 9419 9420 9421 9422 9423 9424 9425 9426 9427 9428 9429 9430 9431 9432 9433 9434 9435 9436 9437 9438 9439 9440 9441 9442 9443 9444 9445 9446 9447 9448 9449 9450 9451 9452 9453 9454 9455 9456 9457 9458 9459 9460 9461 9462 9463 9464 9465 9466 9467 9468 9469 9470 9471 9472 9473 9474 9475 9476 9477 9478 9479 9480 9481 9482 9483 9484 9485 9486 9487 9488 9489 9490 9491 9492 9493 9494 9495 9496 9497 9498 9499 9500 9501 9502 9503 9504 9505 9506 9507 9508 9509 9510 9511 9512 9513 9514 9515 9516 9517 9518 9519 9520 9521 9522 9523 9524 9525 9526 9527 9528 9529 9530 9531 9532 9533 9534 9535 9536 9537 9538 9539 9540 9541 9542 9543 9544 9545 9546 9547 9548 9549 9550 9551 9552 9553 9554 9555 9556 9557 9558 9559 9560 9561 9562 9563 9564 9565 9566 9567 9568 9569 9570 9571 9572 9573 9574 9575 9576 9577 9578 9579 9580 9581 9582 9583 9584 9585 9586 9587 9588 9589 9590 9591 9592 9593 9594 9595 9596 9597 9598 9599 9600 9601 9602 9603 9604 9605 9606 9607 9608 9609 9610 9611 9612 9613 9614 9615 9616 9617 9618 9619 9620 9621 9622 9623 9624 9625 9626 9627 9628 9629 9630 9631 9632 9633 9634 9635 9636 9637 9638 9639 9640 9641 9642 9643 9644 9645 9646 9647 9648 9649 9650 9651 9652 9653 9654 9655 9656 9657 9658 9659 9660 9661 9662 9663 9664 9665 9666 9667 9668 9669 9670 9671 9672 9673 9674 9675 9676 9677 9678 9679 9680 9681 9682 9683 9684 9685 9686 9687 9688 9689 9690 9691 9692 9693 9694 9695 9696 9697 9698 9699 9700 9701 9702 9703 9704 9705 9706 9707 9708 9709 9710 9711 9712 9713 9714 9715 9716 9717 9718 9719 9720 9721 9722 9723 9724 9725 9726 9727 9728 9729 9730 9731 9732 9733 9734 9735 9736 9737 9738 9739 9740 9741 9742 9743 9744 9745 9746 9747 9748 9749 9750 9751 9752 9753 9754 9755 9756 9757 9758 9759 9760 9761 9762 9763 9764 9765 9766 9767 9768 9769 9770 9771 9772 9773 9774 9775 9776 9777 9778 9779 9780 9781 9782 9783 9784 9785 9786 9787 9788 9789 9790 9791 9792 9793 9794 9795 9796 9797 9798 9799 9800 9801 9802 9803 9804 9805 9806 9807 9808 9809 9810 9811 9812 9813 9814 
9815 9816 9817 9818 9819 9820 9821 9822 9823 9824 9825 9826 9827 9828 9829 9830 9831 9832 9833 9834 9835 9836 9837 9838 9839 9840 9841 9842 9843 9844 9845 9846 9847 9848 9849 9850 9851 9852 9853 9854 9855 9856 9857 9858 9859 9860 9861 9862 9863 9864 9865 9866 9867 9868 9869 9870 9871 9872 9873 9874 9875 9876 9877 9878 9879 9880 9881 9882 9883 9884 9885 9886 9887 9888 9889 9890 9891 9892 9893 9894 9895 9896 9897 9898 9899 9900 9901 9902 9903 9904 9905 9906 9907 9908 9909 9910 9911 9912 9913 9914 9915 9916 9917 9918 9919 9920 9921 9922 9923 9924 9925 9926 9927 9928 9929 9930 9931 9932 9933 9934 9935 9936 9937 9938 9939 9940 9941 9942 9943 9944 9945 9946 9947 9948 9949 9950 9951 9952 9953 9954 9955 9956 9957 9958 9959 9960 9961 9962 9963 9964 9965 9966 9967 9968 9969 9970 9971 9972 9973 9974 9975 9976 9977 9978 9979 9980 9981 9982 9983 9984 9985 9986 9987 9988 9989 9990 9991 9992 9993 9994 9995 9996 9997 9998 9999 10000 10001 10002 10003 10004 10005 10006 10007 10008 10009 10010 10011 10012 10013 10014 10015 10016 10017 10018 10019 10020 10021 10022 10023 10024 10025 10026 10027 10028 10029 10030 10031 10032 10033 10034 10035 10036 10037 10038 10039 10040 10041 10042 10043 10044 10045 10046 10047 10048 10049 10050 10051 10052 10053 10054 10055 10056 10057 10058 10059 10060 10061 10062 10063 10064 10065 10066 10067 10068 10069 10070 10071 10072 10073 10074 10075 10076 10077 10078 10079 10080 10081 10082 10083 10084 10085 10086 10087 10088 10089 10090 10091 10092 10093 10094 10095 10096 10097 10098 10099 10100 10101 10102 10103 10104 10105 10106 10107 10108 10109 10110 10111 10112 10113 10114 10115 10116 10117 10118 10119 10120 10121 10122 10123 10124 10125 10126 10127 10128 10129 10130 10131 10132 10133 10134 10135 10136 10137 10138 10139 10140 10141 10142 10143 10144 10145 10146 10147 10148 10149 10150 10151 10152 10153 10154 10155 10156 10157 10158 10159 10160 10161 10162 10163 10164 10165 10166 10167 10168 10169 10170 10171 10172 10173 10174 10175 10176 10177 10178 
10179 10180 10181 10182 10183 10184 10185 10186 10187 10188 10189 10190 10191 10192 10193 10194 10195 10196 10197 10198 10199 10200 10201 10202 10203 10204 10205 10206 10207 10208 10209 10210 10211 10212 10213 10214 10215 10216 10217 10218 10219 10220 10221 10222 10223 10224 10225 10226 10227 10228 10229 10230 10231 10232 10233 10234 10235 10236 10237 10238 10239 10240 10241 10242 10243 10244 10245 10246 10247 10248 10249 10250 10251 10252 10253 10254 10255 10256 10257 10258 10259 10260 10261 10262 10263 10264 10265 10266 10267 10268 10269 10270 10271 10272 10273 10274 10275 10276 10277 10278 10279 10280 10281 10282 10283 10284 10285 10286 10287 10288 10289 10290 10291 10292 10293 10294 10295 10296 10297 10298 10299 10300 10301 10302 10303 10304 10305 10306 10307 10308 10309 10310 10311 10312 10313 10314 10315 10316 10317 10318 10319 10320 10321 10322 10323 10324 10325 10326 10327 10328 10329 10330 10331 10332 10333 10334 10335 10336 10337 10338 10339 10340 10341 10342 10343 10344 10345 10346 10347 10348 10349 10350 10351 10352 10353 10354 10355 10356 10357 10358 10359 10360 10361 10362 10363 10364 10365 10366 10367 10368 10369 10370 10371 10372 10373 10374 10375 10376 10377 10378 10379 10380 10381 10382 10383 10384 10385 10386 10387 10388 10389 10390 10391 10392 10393 10394 10395 10396 10397 10398 10399 10400 10401 10402 10403 10404 10405 10406 10407 10408 10409 10410 10411 10412 10413 10414 10415 10416 10417 10418 10419 10420 10421 10422 10423 10424 10425 10426 10427 10428 10429 10430 10431 10432 10433 10434 10435 10436 10437 10438 10439 10440 10441 10442 10443 10444 10445 10446 10447 10448 10449 10450 10451 10452 10453 10454 10455 10456 10457 10458 10459 10460 10461 10462 10463 10464 10465 10466 10467 10468 10469 10470 10471 10472 10473 10474 10475 10476 10477 10478 10479 10480 10481 10482 10483 10484 10485 10486 10487 10488 10489 10490 10491 10492 10493 10494 10495 10496 10497 10498 10499 10500 10501 10502 10503 10504 10505 10506 10507 10508 10509 10510 10511 
10512 10513 10514 10515 10516 10517 10518 10519 10520 10521 10522 10523 10524 10525 10526 10527 10528 10529 10530 10531 10532 10533 10534 10535 10536 10537 10538 10539 10540 10541 10542 10543 10544 10545 10546 10547 10548 10549 10550 10551 10552 10553 10554 10555 10556 10557 10558 10559 10560 10561 10562 10563 10564 10565 10566 10567 10568 10569 10570 10571 10572 10573 10574 10575 10576 10577 10578 10579 10580 10581 10582 10583 10584 10585 10586 10587 10588 10589 10590 10591 10592 10593 10594 10595 10596 10597 10598 10599 10600 10601 10602 10603 10604 10605 10606 10607 10608 10609 10610 10611 10612 10613 10614 10615 10616 10617 10618 10619 10620 10621 10622 10623 10624 10625 10626 10627 10628 10629 10630 10631 10632 10633 10634 10635 10636 10637 10638 10639 10640 10641 10642 10643 10644 10645 10646 10647 10648 10649 10650 10651 10652 10653 10654 10655 10656 10657 10658 10659 10660 10661 10662 10663 10664 10665 10666 10667 10668 10669 10670 10671 10672 10673 10674 10675 10676 10677 10678 10679 10680 10681 10682 10683 10684 10685 10686 10687 10688 10689 10690 10691 10692 10693 10694 10695 10696 10697 10698 10699 10700 10701 10702 10703 10704 10705 10706 10707 10708 10709 10710 10711 10712 10713 10714 10715 10716 10717 10718 10719 10720 10721 10722 10723 10724 10725 10726 10727 10728 10729 10730 10731 10732 10733 10734 10735 10736 10737 10738 10739 10740 10741 10742 10743 10744 10745 10746 10747 10748 10749 10750 10751 10752 10753 10754 10755 10756 10757 10758 10759 10760 10761 10762 10763 10764 10765 10766 10767 10768 10769 10770 10771 10772 10773 10774 10775 10776 10777 10778 10779 10780 10781 10782 10783 10784 10785 10786 10787 10788 10789 10790 10791 10792 10793 10794 10795 10796 10797 10798 10799 10800 10801 10802 10803 10804 10805 10806 10807 10808 10809 10810 10811 10812 10813 10814 10815 10816 10817 10818 10819 10820 10821 10822 10823 10824 10825 10826 10827 10828 10829 10830 10831 10832 10833 10834 10835 10836 10837 10838 10839 10840 10841 10842 10843 10844 
10845 10846 10847 10848 10849 10850 10851 10852 10853 10854 10855 10856 10857 10858 10859 10860 10861 10862 10863 10864 10865 10866 10867 10868 10869 10870 10871 10872 10873 10874 10875 10876 10877 10878 10879 10880 10881 10882 10883 10884 10885 10886 10887 10888 10889 10890 10891 10892 10893 10894 10895 10896 10897 10898 10899 10900 10901 10902 10903 10904 10905 10906 10907 10908 10909 10910 10911 10912 10913 10914 10915 10916 10917 10918 10919 10920 10921 10922 10923 10924 10925 10926 10927 10928 10929 10930 10931 10932 10933 10934 10935 10936 10937 10938 10939 10940 10941 10942 10943 10944 10945 10946 10947 10948 10949 10950 10951 10952 10953 10954 10955 10956 10957 10958 10959 10960 10961 10962 10963 10964 10965 10966 10967 10968 10969 10970 10971 10972 10973 10974 10975 10976 10977 10978 10979 10980 10981 10982 10983 10984 10985 10986 10987 10988 10989 10990 10991 10992 10993 10994 10995 10996 10997 10998 10999 11000 11001 11002 11003 11004 11005 11006 11007 11008 11009 11010 11011 11012 11013 11014 11015 11016 11017 11018 11019 11020 11021 11022 11023 11024 11025 11026 11027 11028 11029 11030 11031 11032 11033 11034 11035 11036 11037 11038 11039 11040 11041 11042 11043 11044 11045 11046 11047 11048 11049 11050 11051 11052 11053 11054 11055 11056 11057 11058 11059 11060 11061 11062 11063 11064 11065 11066 11067 11068 11069 11070 11071 11072 11073 11074 11075 11076 11077 11078 11079 11080 11081 11082 11083 11084 11085 11086 11087 11088 11089 11090 11091 11092 11093 11094 11095 11096 11097 11098 11099 11100 11101 11102 11103 11104 11105 11106 11107 11108 11109 11110 11111 11112 11113 11114 11115 11116 11117 11118 11119 11120 11121 11122 11123 11124 11125 11126 11127 11128 11129 11130 11131 11132 11133 11134 11135 11136 11137 11138 11139 11140 11141 11142 11143 11144 11145 11146 11147 11148 11149 11150 11151 11152 11153 11154 11155 11156 11157 11158 11159 11160 11161 11162 11163 11164 11165 11166 11167 11168 11169 11170 11171 11172 11173 11174 11175 11176 11177 
11178 11179 11180 11181 11182 11183 11184 11185 11186 11187 11188 11189 11190 11191 11192 11193 11194 11195 11196 11197 11198 11199 11200 11201 11202 11203 11204 11205 11206 11207 11208 11209 11210 11211 11212 11213 11214 11215 11216 11217 11218 11219 11220 11221 11222 11223 11224 11225 11226 11227 11228 11229 11230 11231 11232 11233 11234 11235 11236 11237 11238 11239 11240 11241 11242 11243 11244 11245 11246 11247 11248 11249 11250 11251 11252 11253 11254 11255 11256 11257 11258 11259 11260 11261 11262 11263 11264 11265 11266 11267 11268 11269 11270 11271 11272 11273 11274 11275 11276 11277 11278 11279 11280 11281 11282 11283 11284 11285 11286 11287 11288 11289 11290 11291 11292 11293 11294 11295 11296 11297 11298 11299 11300 11301 11302 11303 11304 11305 11306 11307 11308 11309 11310 11311 11312 11313 11314 11315 11316 11317 11318 11319 11320 11321 11322 11323 11324 11325 11326 11327 11328 11329 11330 11331 11332 11333 11334 11335 11336 11337 11338 11339 11340 11341 11342 11343 11344 11345 11346 11347 11348 11349 11350 11351 11352 11353 11354 11355 11356 11357 11358 11359 11360 11361 11362 11363 11364 11365 11366 11367 11368 11369 11370 11371 11372 11373 11374 11375 11376 11377 11378 11379 11380 11381 11382 11383 11384 11385 11386 11387 11388 11389 11390 11391 11392 11393 11394 11395 11396 11397 11398 11399 11400 11401 11402 11403 11404 11405 11406 11407 11408 11409 11410 11411 11412 11413 11414 11415 11416 11417 11418 11419 11420 11421 11422 11423 11424 11425 11426 11427 11428 11429 11430 11431 11432 11433 11434 11435 11436 11437 11438 11439 11440 11441 11442 11443 11444 11445 11446 11447 11448 11449 11450 11451 11452 11453 11454 11455 11456 11457 11458 11459 11460 11461 11462 11463 11464 11465 11466 11467 11468 11469 11470 11471 11472 11473 11474 11475 11476 11477 11478 11479 11480 11481 11482 11483 11484 11485 11486 11487 11488 11489 11490 11491 11492 11493 11494 11495 11496 11497 11498 11499 11500 11501 11502 11503 11504 11505 11506 11507 11508 11509 11510 
11511 11512 11513 11514 11515 11516 11517 11518 11519 11520 11521 11522 11523 11524 11525 11526 11527 11528 11529 11530 11531 11532 11533 11534 11535 11536 11537 11538 11539 11540 11541 11542 11543 11544 11545 11546 11547 11548 11549 11550 11551 11552 11553 11554 11555 11556 11557 11558 11559 11560 11561 11562 11563 11564 11565 11566 11567 11568 11569 11570 11571 11572 11573 11574 11575 11576 11577 11578 11579 11580 11581 11582 11583 11584 11585 11586 11587 11588 11589 11590 11591 11592 11593 11594 11595 11596 11597 11598 11599 11600 11601 11602 11603 11604 11605 11606 11607 11608 11609 11610 11611 11612 11613 11614 11615 11616 11617 11618 11619 11620 11621 11622 11623 11624 11625 11626 11627 11628 11629 11630 11631 11632 11633 11634 11635 11636 11637 11638 11639 11640 11641 11642 11643 11644 11645 11646 11647 11648 11649 11650 11651 11652 11653 11654 11655 11656 11657 11658 11659 11660 11661 11662 11663 11664 11665 11666 11667 11668 11669 11670 11671 11672 11673 11674 11675 11676 11677 11678 11679 11680 11681 11682 11683 11684 11685 11686 11687 11688 11689 11690 11691 11692 11693 11694 11695 11696 11697 11698 11699 11700 11701 11702 11703 11704 11705 11706 11707 11708 11709 11710 11711 11712 11713 11714 11715 11716 11717 11718 11719 11720 11721 11722 11723 11724 11725 11726 11727 11728 11729 11730 11731 11732 11733 11734 11735 11736 11737 11738 11739 11740 11741 11742 11743 11744 11745 11746 11747 11748 11749 11750 11751 11752 11753 11754 11755 11756 11757 11758 11759 11760 11761 11762 11763 11764 11765 11766 11767 11768 11769 11770 11771 11772 11773 11774 11775 11776 11777 11778 11779 11780 11781 11782 11783 11784 11785 11786 11787 11788 11789 11790 11791 11792 11793 11794 11795 11796 11797 11798 11799 11800 11801 11802 11803 11804 11805 11806 11807 11808 11809 11810 11811 11812 11813 11814 11815 11816 11817 11818 11819 11820 11821 11822 11823 11824 11825 11826 11827 11828 11829 11830 11831 11832 11833 11834 11835 11836 11837 11838 11839 11840 11841 11842 11843 
11844 11845 11846 11847 11848 11849 11850 11851 11852 11853 11854 11855 11856 11857 11858 11859 11860 11861 11862 11863 11864 11865 11866 11867 11868 11869 11870 11871 11872 11873 11874 11875 11876 11877 11878 11879 11880 11881 11882 11883 11884 11885 11886 11887 11888 11889 11890 11891 11892 11893 11894 11895 11896 11897 11898 11899 11900 11901 11902 11903 11904 11905 11906 11907 11908 11909 11910 11911 11912 11913 11914 11915 11916 11917 11918 11919 11920 11921 11922 11923 11924 11925 11926 11927 11928 11929 11930 11931 11932 11933 11934 11935 11936 11937 11938 11939 11940 11941 11942 11943 11944 11945 11946 11947 11948 11949 11950 11951 11952 11953 11954 11955 11956 11957 11958 11959 11960 11961 11962 11963 11964 11965 11966 11967 11968 11969 11970 11971 11972 11973 11974 11975 11976 11977 11978 11979 11980 11981 11982 11983 11984 11985 11986 11987 11988 11989 11990 11991 11992 11993 11994 11995 11996 11997 11998 11999 12000 12001 12002 12003 12004 12005 12006 12007 12008 12009 12010 12011 12012 12013 12014 12015 12016 12017 12018 12019 12020 12021 12022 12023 12024 12025 12026 12027 12028 12029 12030 12031 12032 12033 12034 12035 12036 12037 12038 12039 12040 12041 12042 12043 12044 12045 12046 12047 12048 12049 12050 12051 12052 12053 12054 12055 12056 12057 12058 12059 12060 12061 12062 12063 12064 12065 12066 12067 12068 12069 12070 12071 12072 12073 12074 12075 12076 12077 12078 12079 12080 12081 12082 12083 12084 12085 12086 12087 12088 12089 12090 12091 12092 12093 12094 12095 12096 12097 12098 12099 12100 12101 12102 12103 12104 12105 12106 12107 12108 12109 12110 12111 12112 12113 12114 12115 12116 12117 12118 12119 12120 12121 12122 12123 12124 12125 12126 12127 12128 12129 12130 12131 12132 12133 12134 12135 12136 12137 12138 12139 12140 12141 12142 12143 12144 12145 12146 12147 12148 12149 12150 12151 12152 12153 12154 12155 12156 12157 12158 12159 12160 12161 12162 12163 12164 12165 12166 12167 12168 12169 12170 12171 12172 12173 12174 12175 12176 
12177 12178 12179 12180 12181 12182 12183 12184 12185 12186 12187 12188 12189 12190 12191 12192 12193 12194 12195 12196 12197 12198 // SPDX-License-Identifier: GPL-2.0-or-later /* * Linux Socket Filter - Kernel level socket filtering * * Based on the design of the Berkeley Packet Filter. The new * internal format has been designed by PLUMgrid: * * Copyright (c) 2011 - 2014 PLUMgrid, http://plumgrid.com * * Authors: * * Jay Schulist <jschlst@samba.org> * Alexei Starovoitov <ast@plumgrid.com> * Daniel Borkmann <dborkman@redhat.com> * * Andi Kleen - Fix a few bad bugs and races. * Kris Katterjohn - Added many additional checks in bpf_check_classic() */ #include <linux/atomic.h> #include <linux/bpf_verifier.h> #include <linux/module.h> #include <linux/types.h> #include <linux/mm.h> #include <linux/fcntl.h> #include <linux/socket.h> #include <linux/sock_diag.h> #include <linux/in.h> #include <linux/inet.h> #include <linux/netdevice.h> #include <linux/if_packet.h> #include <linux/if_arp.h> #include <linux/gfp.h> #include <net/inet_common.h> #include <net/ip.h> #include <net/protocol.h> #include <net/netlink.h> #include <linux/skbuff.h> #include <linux/skmsg.h> #include <net/sock.h> #include <net/flow_dissector.h> #include <linux/errno.h> #include <linux/timer.h> #include <linux/uaccess.h> #include <linux/unaligned.h> #include <linux/filter.h> #include <linux/ratelimit.h> #include <linux/seccomp.h> #include <linux/if_vlan.h> #include <linux/bpf.h> #include <linux/btf.h> #include <net/sch_generic.h> #include <net/cls_cgroup.h> #include <net/dst_metadata.h> #include <net/dst.h> #include <net/sock_reuseport.h> #include <net/busy_poll.h> #include <net/tcp.h> #include <net/xfrm.h> #include <net/udp.h> #include <linux/bpf_trace.h> #include <net/xdp_sock.h> #include <linux/inetdevice.h> #include <net/inet_hashtables.h> #include <net/inet6_hashtables.h> #include <net/ip_fib.h> #include <net/nexthop.h> #include <net/flow.h> #include <net/arp.h> #include <net/ipv6.h> #include 
<net/net_namespace.h> #include <linux/seg6_local.h> #include <net/seg6.h> #include <net/seg6_local.h> #include <net/lwtunnel.h> #include <net/ipv6_stubs.h> #include <net/bpf_sk_storage.h> #include <net/transp_v6.h> #include <linux/btf_ids.h> #include <net/tls.h> #include <net/xdp.h> #include <net/mptcp.h> #include <net/netfilter/nf_conntrack_bpf.h> #include <net/netkit.h> #include <linux/un.h> #include <net/xdp_sock_drv.h> #include <net/inet_dscp.h> #include "dev.h" /* Keep the struct bpf_fib_lookup small so that it fits into a cacheline */ static_assert(sizeof(struct bpf_fib_lookup) == 64, "struct bpf_fib_lookup size check"); static const struct bpf_func_proto * bpf_sk_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog); int copy_bpf_fprog_from_user(struct sock_fprog *dst, sockptr_t src, int len) { if (in_compat_syscall()) { struct compat_sock_fprog f32; if (len != sizeof(f32)) return -EINVAL; if (copy_from_sockptr(&f32, src, sizeof(f32))) return -EFAULT; memset(dst, 0, sizeof(*dst)); dst->len = f32.len; dst->filter = compat_ptr(f32.filter); } else { if (len != sizeof(*dst)) return -EINVAL; if (copy_from_sockptr(dst, src, sizeof(*dst))) return -EFAULT; } return 0; } EXPORT_SYMBOL_GPL(copy_bpf_fprog_from_user); /** * sk_filter_trim_cap - run a packet through a socket filter * @sk: sock associated with &sk_buff * @skb: buffer to filter * @cap: limit on how short the eBPF program may trim the packet * * Run the eBPF program and then cut skb->data to correct size returned by * the program. If pkt_len is 0 we toss packet. If skb->len is smaller * than pkt_len we keep whole skb->data. This is the socket level * wrapper to bpf_prog_run. It returns 0 if the packet should * be accepted or -EPERM if the packet should be tossed. 
* */ int sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb, unsigned int cap) { int err; struct sk_filter *filter; /* * If the skb was allocated from pfmemalloc reserves, only * allow SOCK_MEMALLOC sockets to use it as this socket is * helping free memory */ if (skb_pfmemalloc(skb) && !sock_flag(sk, SOCK_MEMALLOC)) { NET_INC_STATS(sock_net(sk), LINUX_MIB_PFMEMALLOCDROP); return -ENOMEM; } err = BPF_CGROUP_RUN_PROG_INET_INGRESS(sk, skb); if (err) return err; err = security_sock_rcv_skb(sk, skb); if (err) return err; rcu_read_lock(); filter = rcu_dereference(sk->sk_filter); if (filter) { struct sock *save_sk = skb->sk; unsigned int pkt_len; skb->sk = sk; pkt_len = bpf_prog_run_save_cb(filter->prog, skb); skb->sk = save_sk; err = pkt_len ? pskb_trim(skb, max(cap, pkt_len)) : -EPERM; } rcu_read_unlock(); return err; } EXPORT_SYMBOL(sk_filter_trim_cap); BPF_CALL_1(bpf_skb_get_pay_offset, struct sk_buff *, skb) { return skb_get_poff(skb); } BPF_CALL_3(bpf_skb_get_nlattr, struct sk_buff *, skb, u32, a, u32, x) { struct nlattr *nla; if (skb_is_nonlinear(skb)) return 0; if (skb->len < sizeof(struct nlattr)) return 0; if (a > skb->len - sizeof(struct nlattr)) return 0; nla = nla_find((struct nlattr *) &skb->data[a], skb->len - a, x); if (nla) return (void *) nla - (void *) skb->data; return 0; } BPF_CALL_3(bpf_skb_get_nlattr_nest, struct sk_buff *, skb, u32, a, u32, x) { struct nlattr *nla; if (skb_is_nonlinear(skb)) return 0; if (skb->len < sizeof(struct nlattr)) return 0; if (a > skb->len - sizeof(struct nlattr)) return 0; nla = (struct nlattr *) &skb->data[a]; if (!nla_ok(nla, skb->len - a)) return 0; nla = nla_find_nested(nla, x); if (nla) return (void *) nla - (void *) skb->data; return 0; } BPF_CALL_4(bpf_skb_load_helper_8, const struct sk_buff *, skb, const void *, data, int, headlen, int, offset) { u8 tmp, *ptr; const int len = sizeof(tmp); if (offset >= 0) { if (headlen - offset >= len) return *(u8 *)(data + offset); if (!skb_copy_bits(skb, offset, &tmp, 
sizeof(tmp))) return tmp; } else { ptr = bpf_internal_load_pointer_neg_helper(skb, offset, len); if (likely(ptr)) return *(u8 *)ptr; } return -EFAULT; } BPF_CALL_2(bpf_skb_load_helper_8_no_cache, const struct sk_buff *, skb, int, offset) { return ____bpf_skb_load_helper_8(skb, skb->data, skb->len - skb->data_len, offset); } BPF_CALL_4(bpf_skb_load_helper_16, const struct sk_buff *, skb, const void *, data, int, headlen, int, offset) { __be16 tmp, *ptr; const int len = sizeof(tmp); if (offset >= 0) { if (headlen - offset >= len) return get_unaligned_be16(data + offset); if (!skb_copy_bits(skb, offset, &tmp, sizeof(tmp))) return be16_to_cpu(tmp); } else { ptr = bpf_internal_load_pointer_neg_helper(skb, offset, len); if (likely(ptr)) return get_unaligned_be16(ptr); } return -EFAULT; } BPF_CALL_2(bpf_skb_load_helper_16_no_cache, const struct sk_buff *, skb, int, offset) { return ____bpf_skb_load_helper_16(skb, skb->data, skb->len - skb->data_len, offset); } BPF_CALL_4(bpf_skb_load_helper_32, const struct sk_buff *, skb, const void *, data, int, headlen, int, offset) { __be32 tmp, *ptr; const int len = sizeof(tmp); if (likely(offset >= 0)) { if (headlen - offset >= len) return get_unaligned_be32(data + offset); if (!skb_copy_bits(skb, offset, &tmp, sizeof(tmp))) return be32_to_cpu(tmp); } else { ptr = bpf_internal_load_pointer_neg_helper(skb, offset, len); if (likely(ptr)) return get_unaligned_be32(ptr); } return -EFAULT; } BPF_CALL_2(bpf_skb_load_helper_32_no_cache, const struct sk_buff *, skb, int, offset) { return ____bpf_skb_load_helper_32(skb, skb->data, skb->len - skb->data_len, offset); } static u32 convert_skb_access(int skb_field, int dst_reg, int src_reg, struct bpf_insn *insn_buf) { struct bpf_insn *insn = insn_buf; switch (skb_field) { case SKF_AD_MARK: BUILD_BUG_ON(sizeof_field(struct sk_buff, mark) != 4); *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg, offsetof(struct sk_buff, mark)); break; case SKF_AD_PKTTYPE: *insn++ = BPF_LDX_MEM(BPF_B, dst_reg, 
src_reg, PKT_TYPE_OFFSET); *insn++ = BPF_ALU32_IMM(BPF_AND, dst_reg, PKT_TYPE_MAX); #ifdef __BIG_ENDIAN_BITFIELD *insn++ = BPF_ALU32_IMM(BPF_RSH, dst_reg, 5); #endif break; case SKF_AD_QUEUE: BUILD_BUG_ON(sizeof_field(struct sk_buff, queue_mapping) != 2); *insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg, offsetof(struct sk_buff, queue_mapping)); break; case SKF_AD_VLAN_TAG: BUILD_BUG_ON(sizeof_field(struct sk_buff, vlan_tci) != 2); /* dst_reg = *(u16 *) (src_reg + offsetof(vlan_tci)) */ *insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg, offsetof(struct sk_buff, vlan_tci)); break; case SKF_AD_VLAN_TAG_PRESENT: BUILD_BUG_ON(sizeof_field(struct sk_buff, vlan_all) != 4); *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg, offsetof(struct sk_buff, vlan_all)); *insn++ = BPF_JMP_IMM(BPF_JEQ, dst_reg, 0, 1); *insn++ = BPF_ALU32_IMM(BPF_MOV, dst_reg, 1); break; } return insn - insn_buf; } static bool convert_bpf_extensions(struct sock_filter *fp, struct bpf_insn **insnp) { struct bpf_insn *insn = *insnp; u32 cnt; switch (fp->k) { case SKF_AD_OFF + SKF_AD_PROTOCOL: BUILD_BUG_ON(sizeof_field(struct sk_buff, protocol) != 2); /* A = *(u16 *) (CTX + offsetof(protocol)) */ *insn++ = BPF_LDX_MEM(BPF_H, BPF_REG_A, BPF_REG_CTX, offsetof(struct sk_buff, protocol)); /* A = ntohs(A) [emitting a nop or swap16] */ *insn = BPF_ENDIAN(BPF_FROM_BE, BPF_REG_A, 16); break; case SKF_AD_OFF + SKF_AD_PKTTYPE: cnt = convert_skb_access(SKF_AD_PKTTYPE, BPF_REG_A, BPF_REG_CTX, insn); insn += cnt - 1; break; case SKF_AD_OFF + SKF_AD_IFINDEX: case SKF_AD_OFF + SKF_AD_HATYPE: BUILD_BUG_ON(sizeof_field(struct net_device, ifindex) != 4); BUILD_BUG_ON(sizeof_field(struct net_device, type) != 2); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, dev), BPF_REG_TMP, BPF_REG_CTX, offsetof(struct sk_buff, dev)); /* if (tmp != 0) goto pc + 1 */ *insn++ = BPF_JMP_IMM(BPF_JNE, BPF_REG_TMP, 0, 1); *insn++ = BPF_EXIT_INSN(); if (fp->k == SKF_AD_OFF + SKF_AD_IFINDEX) *insn = BPF_LDX_MEM(BPF_W, BPF_REG_A, BPF_REG_TMP, 
offsetof(struct net_device, ifindex)); else *insn = BPF_LDX_MEM(BPF_H, BPF_REG_A, BPF_REG_TMP, offsetof(struct net_device, type)); break; case SKF_AD_OFF + SKF_AD_MARK: cnt = convert_skb_access(SKF_AD_MARK, BPF_REG_A, BPF_REG_CTX, insn); insn += cnt - 1; break; case SKF_AD_OFF + SKF_AD_RXHASH: BUILD_BUG_ON(sizeof_field(struct sk_buff, hash) != 4); *insn = BPF_LDX_MEM(BPF_W, BPF_REG_A, BPF_REG_CTX, offsetof(struct sk_buff, hash)); break; case SKF_AD_OFF + SKF_AD_QUEUE: cnt = convert_skb_access(SKF_AD_QUEUE, BPF_REG_A, BPF_REG_CTX, insn); insn += cnt - 1; break; case SKF_AD_OFF + SKF_AD_VLAN_TAG: cnt = convert_skb_access(SKF_AD_VLAN_TAG, BPF_REG_A, BPF_REG_CTX, insn); insn += cnt - 1; break; case SKF_AD_OFF + SKF_AD_VLAN_TAG_PRESENT: cnt = convert_skb_access(SKF_AD_VLAN_TAG_PRESENT, BPF_REG_A, BPF_REG_CTX, insn); insn += cnt - 1; break; case SKF_AD_OFF + SKF_AD_VLAN_TPID: BUILD_BUG_ON(sizeof_field(struct sk_buff, vlan_proto) != 2); /* A = *(u16 *) (CTX + offsetof(vlan_proto)) */ *insn++ = BPF_LDX_MEM(BPF_H, BPF_REG_A, BPF_REG_CTX, offsetof(struct sk_buff, vlan_proto)); /* A = ntohs(A) [emitting a nop or swap16] */ *insn = BPF_ENDIAN(BPF_FROM_BE, BPF_REG_A, 16); break; case SKF_AD_OFF + SKF_AD_PAY_OFFSET: case SKF_AD_OFF + SKF_AD_NLATTR: case SKF_AD_OFF + SKF_AD_NLATTR_NEST: case SKF_AD_OFF + SKF_AD_CPU: case SKF_AD_OFF + SKF_AD_RANDOM: /* arg1 = CTX */ *insn++ = BPF_MOV64_REG(BPF_REG_ARG1, BPF_REG_CTX); /* arg2 = A */ *insn++ = BPF_MOV64_REG(BPF_REG_ARG2, BPF_REG_A); /* arg3 = X */ *insn++ = BPF_MOV64_REG(BPF_REG_ARG3, BPF_REG_X); /* Emit call(arg1=CTX, arg2=A, arg3=X) */ switch (fp->k) { case SKF_AD_OFF + SKF_AD_PAY_OFFSET: *insn = BPF_EMIT_CALL(bpf_skb_get_pay_offset); break; case SKF_AD_OFF + SKF_AD_NLATTR: *insn = BPF_EMIT_CALL(bpf_skb_get_nlattr); break; case SKF_AD_OFF + SKF_AD_NLATTR_NEST: *insn = BPF_EMIT_CALL(bpf_skb_get_nlattr_nest); break; case SKF_AD_OFF + SKF_AD_CPU: *insn = BPF_EMIT_CALL(bpf_get_raw_cpu_id); break; case SKF_AD_OFF + SKF_AD_RANDOM: *insn 
= BPF_EMIT_CALL(bpf_user_rnd_u32);
			bpf_user_rnd_init_once();
			break;
		}
		break;
	case SKF_AD_OFF + SKF_AD_ALU_XOR_X:
		/* A ^= X */
		*insn = BPF_ALU32_REG(BPF_XOR, BPF_REG_A, BPF_REG_X);
		break;
	default:
		/* This is just a dummy call to avoid letting the compiler
		 * evict __bpf_call_base() as an optimization. Placed here
		 * where no-one bothers.
		 */
		BUG_ON(__bpf_call_base(0, 0, 0, 0, 0) != 0);
		return false;
	}

	*insnp = insn;
	return true;
}

/*
 * Emit the eBPF sequence for one classic LD|ABS or LD|IND instruction.
 *
 * @fp:    classic instruction; BPF_SIZE(fp->code) selects B/H/W, fp->k is
 *         the offset (added to X for the BPF_IND form).
 * @insnp: in: first free slot of the output buffer; out: the LAST slot
 *         written (the final store below is not post-incremented).
 *
 * Returns false, leaving *insnp untouched, when the size is not B, H or W.
 *
 * Layout: an optional inline load from BPF_REG_D guarded by a bounds check
 * against BPF_REG_H, followed by a call to the matching
 * bpf_skb_load_helper_*() and an "A = 0; exit" tail taken when the helper
 * result is negative.
 */
static bool convert_bpf_ld_abs(struct sock_filter *fp, struct bpf_insn **insnp)
{
	const bool unaligned_ok = IS_BUILTIN(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS);
	int size = bpf_size_to_bytes(BPF_SIZE(fp->code));
	/* Only 16- and 32-bit loads get a BPF_FROM_BE conversion. */
	bool endian = BPF_SIZE(fp->code) == BPF_H ||
		      BPF_SIZE(fp->code) == BPF_W;
	bool indirect = BPF_MODE(fp->code) == BPF_IND;
	const int ip_align = NET_IP_ALIGN;
	struct bpf_insn *insn = *insnp;
	int offset = fp->k;

	/* NOTE(review): '%' binds tighter than '+', so the last term below is
	 * evaluated as offset + (ip_align % size) == 0, not
	 * (offset + ip_align) % size == 0. As written, the inline path on
	 * !unaligned_ok builds can only be chosen when that sum is zero.
	 * Presumably the parenthesised form was intended - confirm before
	 * changing, since widening the inline path relies on the alignment
	 * of the cached data pointer, which is not established here.
	 */
	if (!indirect &&
	    ((unaligned_ok && offset >= 0) ||
	     (!unaligned_ok && offset >= 0 &&
	      offset + ip_align >= 0 &&
	      offset + ip_align % size == 0))) {
		/* Can the constant offset be encoded directly in the load? */
		bool ldx_off_ok = offset <= S16_MAX;

		/* TMP = H - offset; fewer than 'size' bytes left means the
		 * inline load is skipped and the helper path runs instead.
		 */
		*insn++ = BPF_MOV64_REG(BPF_REG_TMP, BPF_REG_H);
		if (offset)
			*insn++ = BPF_ALU64_IMM(BPF_SUB, BPF_REG_TMP, offset);
		/* Jump distance covers the load (1 or 3 insns), the optional
		 * byte swap and the trailing BPF_JMP_A.
		 */
		*insn++ = BPF_JMP_IMM(BPF_JSLT, BPF_REG_TMP,
				      size, 2 + endian + (!ldx_off_ok * 2));
		if (ldx_off_ok) {
			*insn++ = BPF_LDX_MEM(BPF_SIZE(fp->code), BPF_REG_A,
					      BPF_REG_D, offset);
		} else {
			/* Offset too large for the insn: add it in TMP first. */
			*insn++ = BPF_MOV64_REG(BPF_REG_TMP, BPF_REG_D);
			*insn++ = BPF_ALU64_IMM(BPF_ADD, BPF_REG_TMP, offset);
			*insn++ = BPF_LDX_MEM(BPF_SIZE(fp->code), BPF_REG_A,
					      BPF_REG_TMP, 0);
		}
		if (endian)
			*insn++ = BPF_ENDIAN(BPF_FROM_BE, BPF_REG_A, size * 8);
		/* Skip the 8 helper-path insns emitted below for the
		 * non-indirect form (4 argument moves, call, 3-insn tail).
		 */
		*insn++ = BPF_JMP_A(8);
	}

	/* Helper path: helper(CTX, D, H, offset) */
	*insn++ = BPF_MOV64_REG(BPF_REG_ARG1, BPF_REG_CTX);
	*insn++ = BPF_MOV64_REG(BPF_REG_ARG2, BPF_REG_D);
	*insn++ = BPF_MOV64_REG(BPF_REG_ARG3, BPF_REG_H);
	if (!indirect) {
		*insn++ = BPF_MOV64_IMM(BPF_REG_ARG4, offset);
	} else {
		/* Indirect form: offset argument is X, plus K when non-zero. */
		*insn++ = BPF_MOV64_REG(BPF_REG_ARG4, BPF_REG_X);
		if (fp->k)
			*insn++ = BPF_ALU64_IMM(BPF_ADD, BPF_REG_ARG4, offset);
	}

	switch (BPF_SIZE(fp->code)) {
	case BPF_B:
		*insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_8);
		break;
	case BPF_H:
		*insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_16);
		break;
	case BPF_W:
		*insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_32);
		break;
	default:
		return false;
	}

	/* A non-negative result jumps past the tail; otherwise A = 0; exit. */
	*insn++ = BPF_JMP_IMM(BPF_JSGE, BPF_REG_A, 0, 2);
	*insn++ = BPF_ALU32_REG(BPF_XOR, BPF_REG_A, BPF_REG_A);
	*insn   = BPF_EXIT_INSN();

	*insnp = insn;
	return true;
}

/**
 *	bpf_convert_filter - convert filter program
 *	@prog: the user passed filter program
 *	@len: the length of the user passed filter program
 *	@new_prog: allocated 'struct bpf_prog' or NULL
 *	@new_len: pointer to store length of converted program
 *	@seen_ld_abs: bool whether we've seen ld_abs/ind
 *
 * Remap 'sock_filter' style classic BPF (cBPF) instruction set to 'bpf_insn'
 * style extended BPF (eBPF).
 * Conversion workflow:
 *
 * 1) First pass for calculating the new program length:
 *   bpf_convert_filter(old_prog, old_len, NULL, &new_len, &seen_ld_abs)
 *
 * 2) 2nd pass to remap in two passes: 1st pass finds new
 *    jump offsets, 2nd pass remapping:
 *   bpf_convert_filter(old_prog, old_len, new_prog, &new_len, &seen_ld_abs)
 */
static int bpf_convert_filter(struct sock_filter *prog, int len,
			      struct bpf_prog *new_prog, int *new_len,
			      bool *seen_ld_abs)
{
	int new_flen = 0, pass = 0, target, i, stack_off;
	struct bpf_insn *new_insn, *first_insn = NULL;
	struct sock_filter *fp;
	int *addrs = NULL;
	u8 bpf_src;

	BUILD_BUG_ON(BPF_MEMWORDS * sizeof(u32) > MAX_BPF_STACK);
	BUILD_BUG_ON(BPF_REG_FP + 1 != MAX_BPF_REG);

	if (len <= 0 || len > BPF_MAXINSNS)
		return -EINVAL;

	/* The address table is only allocated when a target program exists. */
	if (new_prog) {
		first_insn = new_prog->insnsi;
		addrs = kcalloc(len, sizeof(*addrs),
				GFP_KERNEL | __GFP_NOWARN);
		if (!addrs)
			return -ENOMEM;
	}

do_pass:
	new_insn = first_insn;
	fp = prog;

	/* Classic BPF related prologue emission. */
	if (new_prog) {
		/* Classic BPF expects A and X to be reset first. These need
		 * to be guaranteed to be the first two instructions.
*/ *new_insn++ = BPF_ALU32_REG(BPF_XOR, BPF_REG_A, BPF_REG_A); *new_insn++ = BPF_ALU32_REG(BPF_XOR, BPF_REG_X, BPF_REG_X); /* All programs must keep CTX in callee saved BPF_REG_CTX. * In eBPF case it's done by the compiler, here we need to * do this ourself. Initial CTX is present in BPF_REG_ARG1. */ *new_insn++ = BPF_MOV64_REG(BPF_REG_CTX, BPF_REG_ARG1); if (*seen_ld_abs) { /* For packet access in classic BPF, cache skb->data * in callee-saved BPF R8 and skb->len - skb->data_len * (headlen) in BPF R9. Since classic BPF is read-only * on CTX, we only need to cache it once. */ *new_insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, data), BPF_REG_D, BPF_REG_CTX, offsetof(struct sk_buff, data)); *new_insn++ = BPF_LDX_MEM(BPF_W, BPF_REG_H, BPF_REG_CTX, offsetof(struct sk_buff, len)); *new_insn++ = BPF_LDX_MEM(BPF_W, BPF_REG_TMP, BPF_REG_CTX, offsetof(struct sk_buff, data_len)); *new_insn++ = BPF_ALU32_REG(BPF_SUB, BPF_REG_H, BPF_REG_TMP); } } else { new_insn += 3; } for (i = 0; i < len; fp++, i++) { struct bpf_insn tmp_insns[32] = { }; struct bpf_insn *insn = tmp_insns; if (addrs) addrs[i] = new_insn - first_insn; switch (fp->code) { /* All arithmetic insns and skb loads map as-is. 
*/ case BPF_ALU | BPF_ADD | BPF_X: case BPF_ALU | BPF_ADD | BPF_K: case BPF_ALU | BPF_SUB | BPF_X: case BPF_ALU | BPF_SUB | BPF_K: case BPF_ALU | BPF_AND | BPF_X: case BPF_ALU | BPF_AND | BPF_K: case BPF_ALU | BPF_OR | BPF_X: case BPF_ALU | BPF_OR | BPF_K: case BPF_ALU | BPF_LSH | BPF_X: case BPF_ALU | BPF_LSH | BPF_K: case BPF_ALU | BPF_RSH | BPF_X: case BPF_ALU | BPF_RSH | BPF_K: case BPF_ALU | BPF_XOR | BPF_X: case BPF_ALU | BPF_XOR | BPF_K: case BPF_ALU | BPF_MUL | BPF_X: case BPF_ALU | BPF_MUL | BPF_K: case BPF_ALU | BPF_DIV | BPF_X: case BPF_ALU | BPF_DIV | BPF_K: case BPF_ALU | BPF_MOD | BPF_X: case BPF_ALU | BPF_MOD | BPF_K: case BPF_ALU | BPF_NEG: case BPF_LD | BPF_ABS | BPF_W: case BPF_LD | BPF_ABS | BPF_H: case BPF_LD | BPF_ABS | BPF_B: case BPF_LD | BPF_IND | BPF_W: case BPF_LD | BPF_IND | BPF_H: case BPF_LD | BPF_IND | BPF_B: /* Check for overloaded BPF extension and * directly convert it if found, otherwise * just move on with mapping. */ if (BPF_CLASS(fp->code) == BPF_LD && BPF_MODE(fp->code) == BPF_ABS && convert_bpf_extensions(fp, &insn)) break; if (BPF_CLASS(fp->code) == BPF_LD && convert_bpf_ld_abs(fp, &insn)) { *seen_ld_abs = true; break; } if (fp->code == (BPF_ALU | BPF_DIV | BPF_X) || fp->code == (BPF_ALU | BPF_MOD | BPF_X)) { *insn++ = BPF_MOV32_REG(BPF_REG_X, BPF_REG_X); /* Error with exception code on div/mod by 0. * For cBPF programs, this was always return 0. */ *insn++ = BPF_JMP_IMM(BPF_JNE, BPF_REG_X, 0, 2); *insn++ = BPF_ALU32_REG(BPF_XOR, BPF_REG_A, BPF_REG_A); *insn++ = BPF_EXIT_INSN(); } *insn = BPF_RAW_INSN(fp->code, BPF_REG_A, BPF_REG_X, 0, fp->k); break; /* Jump transformation cannot use BPF block macros * everywhere as offset calculation and target updates * require a bit more work than the rest, i.e. jump * opcodes map as-is, but offsets need adjustment. */ #define BPF_EMIT_JMP \ do { \ const s32 off_min = S16_MIN, off_max = S16_MAX; \ s32 off; \ \ if (target >= len || target < 0) \ goto err; \ off = addrs ? 
addrs[target] - addrs[i] - 1 : 0; \ /* Adjust pc relative offset for 2nd or 3rd insn. */ \ off -= insn - tmp_insns; \ /* Reject anything not fitting into insn->off. */ \ if (off < off_min || off > off_max) \ goto err; \ insn->off = off; \ } while (0) case BPF_JMP | BPF_JA: target = i + fp->k + 1; insn->code = fp->code; BPF_EMIT_JMP; break; case BPF_JMP | BPF_JEQ | BPF_K: case BPF_JMP | BPF_JEQ | BPF_X: case BPF_JMP | BPF_JSET | BPF_K: case BPF_JMP | BPF_JSET | BPF_X: case BPF_JMP | BPF_JGT | BPF_K: case BPF_JMP | BPF_JGT | BPF_X: case BPF_JMP | BPF_JGE | BPF_K: case BPF_JMP | BPF_JGE | BPF_X: if (BPF_SRC(fp->code) == BPF_K && (int) fp->k < 0) { /* BPF immediates are signed, zero extend * immediate into tmp register and use it * in compare insn. */ *insn++ = BPF_MOV32_IMM(BPF_REG_TMP, fp->k); insn->dst_reg = BPF_REG_A; insn->src_reg = BPF_REG_TMP; bpf_src = BPF_X; } else { insn->dst_reg = BPF_REG_A; insn->imm = fp->k; bpf_src = BPF_SRC(fp->code); insn->src_reg = bpf_src == BPF_X ? BPF_REG_X : 0; } /* Common case where 'jump_false' is next insn. */ if (fp->jf == 0) { insn->code = BPF_JMP | BPF_OP(fp->code) | bpf_src; target = i + fp->jt + 1; BPF_EMIT_JMP; break; } /* Convert some jumps when 'jump_true' is next insn. */ if (fp->jt == 0) { switch (BPF_OP(fp->code)) { case BPF_JEQ: insn->code = BPF_JMP | BPF_JNE | bpf_src; break; case BPF_JGT: insn->code = BPF_JMP | BPF_JLE | bpf_src; break; case BPF_JGE: insn->code = BPF_JMP | BPF_JLT | bpf_src; break; default: goto jmp_rest; } target = i + fp->jf + 1; BPF_EMIT_JMP; break; } jmp_rest: /* Other jumps are mapped into two insns: Jxx and JA. */ target = i + fp->jt + 1; insn->code = BPF_JMP | BPF_OP(fp->code) | bpf_src; BPF_EMIT_JMP; insn++; insn->code = BPF_JMP | BPF_JA; target = i + fp->jf + 1; BPF_EMIT_JMP; break; /* ldxb 4 * ([14] & 0xf) is remapped into 6 insns. 
*/ case BPF_LDX | BPF_MSH | BPF_B: { struct sock_filter tmp = { .code = BPF_LD | BPF_ABS | BPF_B, .k = fp->k, }; *seen_ld_abs = true; /* X = A */ *insn++ = BPF_MOV64_REG(BPF_REG_X, BPF_REG_A); /* A = BPF_R0 = *(u8 *) (skb->data + K) */ convert_bpf_ld_abs(&tmp, &insn); insn++; /* A &= 0xf */ *insn++ = BPF_ALU32_IMM(BPF_AND, BPF_REG_A, 0xf); /* A <<= 2 */ *insn++ = BPF_ALU32_IMM(BPF_LSH, BPF_REG_A, 2); /* tmp = X */ *insn++ = BPF_MOV64_REG(BPF_REG_TMP, BPF_REG_X); /* X = A */ *insn++ = BPF_MOV64_REG(BPF_REG_X, BPF_REG_A); /* A = tmp */ *insn = BPF_MOV64_REG(BPF_REG_A, BPF_REG_TMP); break; } /* RET_K is remapped into 2 insns. RET_A case doesn't need an * extra mov as BPF_REG_0 is already mapped into BPF_REG_A. */ case BPF_RET | BPF_A: case BPF_RET | BPF_K: if (BPF_RVAL(fp->code) == BPF_K) *insn++ = BPF_MOV32_RAW(BPF_K, BPF_REG_0, 0, fp->k); *insn = BPF_EXIT_INSN(); break; /* Store to stack. */ case BPF_ST: case BPF_STX: stack_off = fp->k * 4 + 4; *insn = BPF_STX_MEM(BPF_W, BPF_REG_FP, BPF_CLASS(fp->code) == BPF_ST ? BPF_REG_A : BPF_REG_X, -stack_off); /* check_load_and_stores() verifies that classic BPF can * load from stack only after write, so tracking * stack_depth for ST|STX insns is enough */ if (new_prog && new_prog->aux->stack_depth < stack_off) new_prog->aux->stack_depth = stack_off; break; /* Load from stack. */ case BPF_LD | BPF_MEM: case BPF_LDX | BPF_MEM: stack_off = fp->k * 4 + 4; *insn = BPF_LDX_MEM(BPF_W, BPF_CLASS(fp->code) == BPF_LD ? BPF_REG_A : BPF_REG_X, BPF_REG_FP, -stack_off); break; /* A = K or X = K */ case BPF_LD | BPF_IMM: case BPF_LDX | BPF_IMM: *insn = BPF_MOV32_IMM(BPF_CLASS(fp->code) == BPF_LD ? 
BPF_REG_A : BPF_REG_X, fp->k); break; /* X = A */ case BPF_MISC | BPF_TAX: *insn = BPF_MOV64_REG(BPF_REG_X, BPF_REG_A); break; /* A = X */ case BPF_MISC | BPF_TXA: *insn = BPF_MOV64_REG(BPF_REG_A, BPF_REG_X); break; /* A = skb->len or X = skb->len */ case BPF_LD | BPF_W | BPF_LEN: case BPF_LDX | BPF_W | BPF_LEN: *insn = BPF_LDX_MEM(BPF_W, BPF_CLASS(fp->code) == BPF_LD ? BPF_REG_A : BPF_REG_X, BPF_REG_CTX, offsetof(struct sk_buff, len)); break; /* Access seccomp_data fields. */ case BPF_LDX | BPF_ABS | BPF_W: /* A = *(u32 *) (ctx + K) */ *insn = BPF_LDX_MEM(BPF_W, BPF_REG_A, BPF_REG_CTX, fp->k); break; /* Unknown instruction. */ default: goto err; } insn++; if (new_prog) memcpy(new_insn, tmp_insns, sizeof(*insn) * (insn - tmp_insns)); new_insn += insn - tmp_insns; } if (!new_prog) { /* Only calculating new length. */ *new_len = new_insn - first_insn; if (*seen_ld_abs) *new_len += 4; /* Prologue bits. */ return 0; } pass++; if (new_flen != new_insn - first_insn) { new_flen = new_insn - first_insn; if (pass > 2) goto err; goto do_pass; } kfree(addrs); BUG_ON(*new_len != new_flen); return 0; err: kfree(addrs); return -EINVAL; } /* Security: * * As we dont want to clear mem[] array for each packet going through * __bpf_prog_run(), we check that filter loaded by user never try to read * a cell if not previously written, and we check all branches to be sure * a malicious user doesn't try to abuse us. 
*/ static int check_load_and_stores(const struct sock_filter *filter, int flen) { u16 *masks, memvalid = 0; /* One bit per cell, 16 cells */ int pc, ret = 0; BUILD_BUG_ON(BPF_MEMWORDS > 16); masks = kmalloc_array(flen, sizeof(*masks), GFP_KERNEL); if (!masks) return -ENOMEM; memset(masks, 0xff, flen * sizeof(*masks)); for (pc = 0; pc < flen; pc++) { memvalid &= masks[pc]; switch (filter[pc].code) { case BPF_ST: case BPF_STX: memvalid |= (1 << filter[pc].k); break; case BPF_LD | BPF_MEM: case BPF_LDX | BPF_MEM: if (!(memvalid & (1 << filter[pc].k))) { ret = -EINVAL; goto error; } break; case BPF_JMP | BPF_JA: /* A jump must set masks on target */ masks[pc + 1 + filter[pc].k] &= memvalid; memvalid = ~0; break; case BPF_JMP | BPF_JEQ | BPF_K: case BPF_JMP | BPF_JEQ | BPF_X: case BPF_JMP | BPF_JGE | BPF_K: case BPF_JMP | BPF_JGE | BPF_X: case BPF_JMP | BPF_JGT | BPF_K: case BPF_JMP | BPF_JGT | BPF_X: case BPF_JMP | BPF_JSET | BPF_K: case BPF_JMP | BPF_JSET | BPF_X: /* A jump must set masks on targets */ masks[pc + 1 + filter[pc].jt] &= memvalid; masks[pc + 1 + filter[pc].jf] &= memvalid; memvalid = ~0; break; } } error: kfree(masks); return ret; } static bool chk_code_allowed(u16 code_to_probe) { static const bool codes[] = { /* 32 bit ALU operations */ [BPF_ALU | BPF_ADD | BPF_K] = true, [BPF_ALU | BPF_ADD | BPF_X] = true, [BPF_ALU | BPF_SUB | BPF_K] = true, [BPF_ALU | BPF_SUB | BPF_X] = true, [BPF_ALU | BPF_MUL | BPF_K] = true, [BPF_ALU | BPF_MUL | BPF_X] = true, [BPF_ALU | BPF_DIV | BPF_K] = true, [BPF_ALU | BPF_DIV | BPF_X] = true, [BPF_ALU | BPF_MOD | BPF_K] = true, [BPF_ALU | BPF_MOD | BPF_X] = true, [BPF_ALU | BPF_AND | BPF_K] = true, [BPF_ALU | BPF_AND | BPF_X] = true, [BPF_ALU | BPF_OR | BPF_K] = true, [BPF_ALU | BPF_OR | BPF_X] = true, [BPF_ALU | BPF_XOR | BPF_K] = true, [BPF_ALU | BPF_XOR | BPF_X] = true, [BPF_ALU | BPF_LSH | BPF_K] = true, [BPF_ALU | BPF_LSH | BPF_X] = true, [BPF_ALU | BPF_RSH | BPF_K] = true, [BPF_ALU | BPF_RSH | BPF_X] = true, [BPF_ALU | 
BPF_NEG] = true, /* Load instructions */ [BPF_LD | BPF_W | BPF_ABS] = true, [BPF_LD | BPF_H | BPF_ABS] = true, [BPF_LD | BPF_B | BPF_ABS] = true, [BPF_LD | BPF_W | BPF_LEN] = true, [BPF_LD | BPF_W | BPF_IND] = true, [BPF_LD | BPF_H | BPF_IND] = true, [BPF_LD | BPF_B | BPF_IND] = true, [BPF_LD | BPF_IMM] = true, [BPF_LD | BPF_MEM] = true, [BPF_LDX | BPF_W | BPF_LEN] = true, [BPF_LDX | BPF_B | BPF_MSH] = true, [BPF_LDX | BPF_IMM] = true, [BPF_LDX | BPF_MEM] = true, /* Store instructions */ [BPF_ST] = true, [BPF_STX] = true, /* Misc instructions */ [BPF_MISC | BPF_TAX] = true, [BPF_MISC | BPF_TXA] = true, /* Return instructions */ [BPF_RET | BPF_K] = true, [BPF_RET | BPF_A] = true, /* Jump instructions */ [BPF_JMP | BPF_JA] = true, [BPF_JMP | BPF_JEQ | BPF_K] = true, [BPF_JMP | BPF_JEQ | BPF_X] = true, [BPF_JMP | BPF_JGE | BPF_K] = true, [BPF_JMP | BPF_JGE | BPF_X] = true, [BPF_JMP | BPF_JGT | BPF_K] = true, [BPF_JMP | BPF_JGT | BPF_X] = true, [BPF_JMP | BPF_JSET | BPF_K] = true, [BPF_JMP | BPF_JSET | BPF_X] = true, }; if (code_to_probe >= ARRAY_SIZE(codes)) return false; return codes[code_to_probe]; } static bool bpf_check_basics_ok(const struct sock_filter *filter, unsigned int flen) { if (filter == NULL) return false; if (flen == 0 || flen > BPF_MAXINSNS) return false; return true; } /** * bpf_check_classic - verify socket filter code * @filter: filter to verify * @flen: length of filter * * Check the user's filter code. If we let some ugly * filter code slip through kaboom! The filter must contain * no references or jumps that are out of range, no illegal * instructions, and must end with a RET instruction. * * All jumps are forward as they are not signed. * * Returns 0 if the rule set is legal or -EINVAL if not. 
*/ static int bpf_check_classic(const struct sock_filter *filter, unsigned int flen) { bool anc_found; int pc; /* Check the filter code now */ for (pc = 0; pc < flen; pc++) { const struct sock_filter *ftest = &filter[pc]; /* May we actually operate on this code? */ if (!chk_code_allowed(ftest->code)) return -EINVAL; /* Some instructions need special checks */ switch (ftest->code) { case BPF_ALU | BPF_DIV | BPF_K: case BPF_ALU | BPF_MOD | BPF_K: /* Check for division by zero */ if (ftest->k == 0) return -EINVAL; break; case BPF_ALU | BPF_LSH | BPF_K: case BPF_ALU | BPF_RSH | BPF_K: if (ftest->k >= 32) return -EINVAL; break; case BPF_LD | BPF_MEM: case BPF_LDX | BPF_MEM: case BPF_ST: case BPF_STX: /* Check for invalid memory addresses */ if (ftest->k >= BPF_MEMWORDS) return -EINVAL; break; case BPF_JMP | BPF_JA: /* Note, the large ftest->k might cause loops. * Compare this with conditional jumps below, * where offsets are limited. --ANK (981016) */ if (ftest->k >= (unsigned int)(flen - pc - 1)) return -EINVAL; break; case BPF_JMP | BPF_JEQ | BPF_K: case BPF_JMP | BPF_JEQ | BPF_X: case BPF_JMP | BPF_JGE | BPF_K: case BPF_JMP | BPF_JGE | BPF_X: case BPF_JMP | BPF_JGT | BPF_K: case BPF_JMP | BPF_JGT | BPF_X: case BPF_JMP | BPF_JSET | BPF_K: case BPF_JMP | BPF_JSET | BPF_X: /* Both conditionals must be safe */ if (pc + ftest->jt + 1 >= flen || pc + ftest->jf + 1 >= flen) return -EINVAL; break; case BPF_LD | BPF_W | BPF_ABS: case BPF_LD | BPF_H | BPF_ABS: case BPF_LD | BPF_B | BPF_ABS: anc_found = false; if (bpf_anc_helper(ftest) & BPF_ANC) anc_found = true; /* Ancillary operation unknown or unsupported */ if (anc_found == false && ftest->k >= SKF_AD_OFF) return -EINVAL; } } /* Last instruction must be a RET code */ switch (filter[flen - 1].code) { case BPF_RET | BPF_K: case BPF_RET | BPF_A: return check_load_and_stores(filter, flen); } return -EINVAL; } static int bpf_prog_store_orig_filter(struct bpf_prog *fp, const struct sock_fprog *fprog) { unsigned int fsize = 
bpf_classic_proglen(fprog); struct sock_fprog_kern *fkprog; fp->orig_prog = kmalloc(sizeof(*fkprog), GFP_KERNEL); if (!fp->orig_prog) return -ENOMEM; fkprog = fp->orig_prog; fkprog->len = fprog->len; fkprog->filter = kmemdup(fp->insns, fsize, GFP_KERNEL | __GFP_NOWARN); if (!fkprog->filter) { kfree(fp->orig_prog); return -ENOMEM; } return 0; } static void bpf_release_orig_filter(struct bpf_prog *fp) { struct sock_fprog_kern *fprog = fp->orig_prog; if (fprog) { kfree(fprog->filter); kfree(fprog); } } static void __bpf_prog_release(struct bpf_prog *prog) { if (prog->type == BPF_PROG_TYPE_SOCKET_FILTER) { bpf_prog_put(prog); } else { bpf_release_orig_filter(prog); bpf_prog_free(prog); } } static void __sk_filter_release(struct sk_filter *fp) { __bpf_prog_release(fp->prog); kfree(fp); } /** * sk_filter_release_rcu - Release a socket filter by rcu_head * @rcu: rcu_head that contains the sk_filter to free */ static void sk_filter_release_rcu(struct rcu_head *rcu) { struct sk_filter *fp = container_of(rcu, struct sk_filter, rcu); __sk_filter_release(fp); } /** * sk_filter_release - release a socket filter * @fp: filter to remove * * Remove a filter from a socket and release its resources. 
*/ static void sk_filter_release(struct sk_filter *fp) { if (refcount_dec_and_test(&fp->refcnt)) call_rcu(&fp->rcu, sk_filter_release_rcu); } void sk_filter_uncharge(struct sock *sk, struct sk_filter *fp) { u32 filter_size = bpf_prog_size(fp->prog->len); atomic_sub(filter_size, &sk->sk_omem_alloc); sk_filter_release(fp); } /* try to charge the socket memory if there is space available * return true on success */ static bool __sk_filter_charge(struct sock *sk, struct sk_filter *fp) { int optmem_max = READ_ONCE(sock_net(sk)->core.sysctl_optmem_max); u32 filter_size = bpf_prog_size(fp->prog->len); /* same check as in sock_kmalloc() */ if (filter_size <= optmem_max && atomic_read(&sk->sk_omem_alloc) + filter_size < optmem_max) { atomic_add(filter_size, &sk->sk_omem_alloc); return true; } return false; } bool sk_filter_charge(struct sock *sk, struct sk_filter *fp) { if (!refcount_inc_not_zero(&fp->refcnt)) return false; if (!__sk_filter_charge(sk, fp)) { sk_filter_release(fp); return false; } return true; } static struct bpf_prog *bpf_migrate_filter(struct bpf_prog *fp) { struct sock_filter *old_prog; struct bpf_prog *old_fp; int err, new_len, old_len = fp->len; bool seen_ld_abs = false; /* We are free to overwrite insns et al right here as it won't be used at * this point in time anymore internally after the migration to the eBPF * instruction representation. */ BUILD_BUG_ON(sizeof(struct sock_filter) != sizeof(struct bpf_insn)); /* Conversion cannot happen on overlapping memory areas, * so we need to keep the user BPF around until the 2nd * pass. At this time, the user BPF is stored in fp->insns. */ old_prog = kmemdup_array(fp->insns, old_len, sizeof(struct sock_filter), GFP_KERNEL | __GFP_NOWARN); if (!old_prog) { err = -ENOMEM; goto out_err; } /* 1st pass: calculate the new program length. */ err = bpf_convert_filter(old_prog, old_len, NULL, &new_len, &seen_ld_abs); if (err) goto out_err_free; /* Expand fp for appending the new filter representation. 
*/ old_fp = fp; fp = bpf_prog_realloc(old_fp, bpf_prog_size(new_len), 0); if (!fp) { /* The old_fp is still around in case we couldn't * allocate new memory, so uncharge on that one. */ fp = old_fp; err = -ENOMEM; goto out_err_free; } fp->len = new_len; /* 2nd pass: remap sock_filter insns into bpf_insn insns. */ err = bpf_convert_filter(old_prog, old_len, fp, &new_len, &seen_ld_abs); if (err) /* 2nd bpf_convert_filter() can fail only if it fails * to allocate memory, remapping must succeed. Note, * that at this time old_fp has already been released * by krealloc(). */ goto out_err_free; fp = bpf_prog_select_runtime(fp, &err); if (err) goto out_err_free; kfree(old_prog); return fp; out_err_free: kfree(old_prog); out_err: __bpf_prog_release(fp); return ERR_PTR(err); } static struct bpf_prog *bpf_prepare_filter(struct bpf_prog *fp, bpf_aux_classic_check_t trans) { int err; fp->bpf_func = NULL; fp->jited = 0; err = bpf_check_classic(fp->insns, fp->len); if (err) { __bpf_prog_release(fp); return ERR_PTR(err); } /* There might be additional checks and transformations * needed on classic filters, f.e. in case of seccomp. */ if (trans) { err = trans(fp->insns, fp->len); if (err) { __bpf_prog_release(fp); return ERR_PTR(err); } } /* Probe if we can JIT compile the filter and if so, do * the compilation of the filter. */ bpf_jit_compile(fp); /* JIT compiler couldn't process this filter, so do the eBPF translation * for the optimized interpreter. */ if (!fp->jited) fp = bpf_migrate_filter(fp); return fp; } /** * bpf_prog_create - create an unattached filter * @pfp: the unattached filter that is created * @fprog: the filter program * * Create a filter independent of any socket. We first run some * sanity checks on it to make sure it does not explode on us later. * If an error occurs or there is insufficient memory for the filter * a negative errno code is returned. On success the return is zero. 
*/ int bpf_prog_create(struct bpf_prog **pfp, struct sock_fprog_kern *fprog) { unsigned int fsize = bpf_classic_proglen(fprog); struct bpf_prog *fp; /* Make sure new filter is there and in the right amounts. */ if (!bpf_check_basics_ok(fprog->filter, fprog->len)) return -EINVAL; fp = bpf_prog_alloc(bpf_prog_size(fprog->len), 0); if (!fp) return -ENOMEM; memcpy(fp->insns, fprog->filter, fsize); fp->len = fprog->len; /* Since unattached filters are not copied back to user * space through sk_get_filter(), we do not need to hold * a copy here, and can spare us the work. */ fp->orig_prog = NULL; /* bpf_prepare_filter() already takes care of freeing * memory in case something goes wrong. */ fp = bpf_prepare_filter(fp, NULL); if (IS_ERR(fp)) return PTR_ERR(fp); *pfp = fp; return 0; } EXPORT_SYMBOL_GPL(bpf_prog_create); /** * bpf_prog_create_from_user - create an unattached filter from user buffer * @pfp: the unattached filter that is created * @fprog: the filter program * @trans: post-classic verifier transformation handler * @save_orig: save classic BPF program * * This function effectively does the same as bpf_prog_create(), only * that it builds up its insns buffer from user space provided buffer. * It also allows for passing a bpf_aux_classic_check_t handler. */ int bpf_prog_create_from_user(struct bpf_prog **pfp, struct sock_fprog *fprog, bpf_aux_classic_check_t trans, bool save_orig) { unsigned int fsize = bpf_classic_proglen(fprog); struct bpf_prog *fp; int err; /* Make sure new filter is there and in the right amounts. 
*/
	if (!bpf_check_basics_ok(fprog->filter, fprog->len))
		return -EINVAL;

	fp = bpf_prog_alloc(bpf_prog_size(fprog->len), 0);
	if (!fp)
		return -ENOMEM;

	if (copy_from_user(fp->insns, fprog->filter, fsize)) {
		__bpf_prog_free(fp);
		return -EFAULT;
	}

	fp->len = fprog->len;
	fp->orig_prog = NULL;

	if (save_orig) {
		err = bpf_prog_store_orig_filter(fp, fprog);
		if (err) {
			/* Any failure here is reported as -ENOMEM. */
			__bpf_prog_free(fp);
			return -ENOMEM;
		}
	}

	/* bpf_prepare_filter() already takes care of freeing
	 * memory in case something goes wrong.
	 */
	fp = bpf_prepare_filter(fp, trans);
	if (IS_ERR(fp))
		return PTR_ERR(fp);

	*pfp = fp;
	return 0;
}
EXPORT_SYMBOL_GPL(bpf_prog_create_from_user);

/* Public counterpart of the create functions: releases @fp through
 * __bpf_prog_release().
 */
void bpf_prog_destroy(struct bpf_prog *fp)
{
	__bpf_prog_release(fp);
}
EXPORT_SYMBOL_GPL(bpf_prog_destroy);

/*
 * __sk_attach_prog - wrap @prog in a new sk_filter and install it on @sk
 *
 * Allocates the sk_filter, charges it to the socket, publishes it via
 * rcu_assign_pointer() and uncharges the filter it replaced, if any.
 * The sk->sk_filter read is annotated with lockdep_sock_is_held(sk).
 *
 * Returns 0 on success or -ENOMEM when the allocation or the charge
 * fails. @prog itself is not released on failure; callers in this file
 * do that.
 */
static int __sk_attach_prog(struct bpf_prog *prog, struct sock *sk)
{
	struct sk_filter *fp, *old_fp;

	fp = kmalloc(sizeof(*fp), GFP_KERNEL);
	if (!fp)
		return -ENOMEM;

	fp->prog = prog;

	if (!__sk_filter_charge(sk, fp)) {
		kfree(fp);
		return -ENOMEM;
	}
	refcount_set(&fp->refcnt, 1);

	old_fp = rcu_dereference_protected(sk->sk_filter,
					   lockdep_sock_is_held(sk));
	rcu_assign_pointer(sk->sk_filter, fp);

	/* Only drop the previous filter after the new one is published. */
	if (old_fp)
		sk_filter_uncharge(sk, old_fp);

	return 0;
}

/*
 * __get_filter - build a prepared program from a user supplied classic
 * filter for attaching to @sk
 *
 * Fails with -EPERM when SOCK_FILTER_LOCKED is set on @sk, -EINVAL when
 * the basic checks reject the filter, -ENOMEM on allocation failure (or
 * when storing the original filter fails) and -EFAULT when the
 * instructions cannot be copied from user space. Otherwise the result
 * of bpf_prepare_filter() is returned.
 */
static
struct bpf_prog *__get_filter(struct sock_fprog *fprog, struct sock *sk)
{
	unsigned int fsize = bpf_classic_proglen(fprog);
	struct bpf_prog *prog;
	int err;

	if (sock_flag(sk, SOCK_FILTER_LOCKED))
		return ERR_PTR(-EPERM);

	/* Make sure new filter is there and in the right amounts. */
	if (!bpf_check_basics_ok(fprog->filter, fprog->len))
		return ERR_PTR(-EINVAL);

	prog = bpf_prog_alloc(bpf_prog_size(fprog->len), 0);
	if (!prog)
		return ERR_PTR(-ENOMEM);

	if (copy_from_user(prog->insns, fprog->filter, fsize)) {
		__bpf_prog_free(prog);
		return ERR_PTR(-EFAULT);
	}

	prog->len = fprog->len;

	err = bpf_prog_store_orig_filter(prog, fprog);
	if (err) {
		__bpf_prog_free(prog);
		return ERR_PTR(-ENOMEM);
	}

	/* bpf_prepare_filter() already takes care of freeing
	 * memory in case something goes wrong.
*/
	return bpf_prepare_filter(prog, NULL);
}

/**
 *	sk_attach_filter - attach a socket filter
 *	@fprog: the filter program
 *	@sk: the socket to use
 *
 * Attach the user's filter code. We first run some sanity checks on
 * it to make sure it does not explode on us later. If an error
 * occurs or there is insufficient memory for the filter a negative
 * errno code is returned. On success the return is zero.
 */
int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk)
{
	struct bpf_prog *prog = __get_filter(fprog, sk);
	int err;

	if (IS_ERR(prog))
		return PTR_ERR(prog);

	err = __sk_attach_prog(prog, sk);
	if (err < 0) {
		/* Attaching failed, so the freshly built program is dropped. */
		__bpf_prog_release(prog);
		return err;
	}

	return 0;
}
EXPORT_SYMBOL_GPL(sk_attach_filter);

/*
 * sk_reuseport_attach_filter - attach a classic filter as reuseport program
 *
 * Builds the program with __get_filter() and hands it to
 * reuseport_attach_prog(), unless bpf_prog_size() of the program exceeds
 * the namespace's sysctl_optmem_max, in which case -ENOMEM is returned.
 * On any error the program is released again.
 */
int sk_reuseport_attach_filter(struct sock_fprog *fprog, struct sock *sk)
{
	struct bpf_prog *prog = __get_filter(fprog, sk);
	int err, optmem_max;

	if (IS_ERR(prog))
		return PTR_ERR(prog);

	optmem_max = READ_ONCE(sock_net(sk)->core.sysctl_optmem_max);
	if (bpf_prog_size(prog->len) > optmem_max)
		err = -ENOMEM;
	else
		err = reuseport_attach_prog(sk, prog);

	if (err)
		__bpf_prog_release(prog);

	return err;
}

/* Look up the socket-filter program behind fd @ufd; refused with -EPERM
 * when SOCK_FILTER_LOCKED is set on @sk.
 */
static struct bpf_prog *__get_bpf(u32 ufd, struct sock *sk)
{
	if (sock_flag(sk, SOCK_FILTER_LOCKED))
		return ERR_PTR(-EPERM);

	return bpf_prog_get_type(ufd, BPF_PROG_TYPE_SOCKET_FILTER);
}

/*
 * sk_attach_bpf - attach the program referenced by fd @ufd to @sk
 *
 * Unlike sk_attach_filter(), a failed attach drops the program with
 * bpf_prog_put() rather than __bpf_prog_release().
 */
int sk_attach_bpf(u32 ufd, struct sock *sk)
{
	struct bpf_prog *prog = __get_bpf(ufd, sk);
	int err;

	if (IS_ERR(prog))
		return PTR_ERR(prog);

	err = __sk_attach_prog(prog, sk);
	if (err < 0) {
		bpf_prog_put(prog);
		return err;
	}

	return 0;
}

/*
 * sk_reuseport_attach_bpf - attach the program behind fd @ufd as the
 * reuseport program of @sk
 *
 * The fd is first looked up as BPF_PROG_TYPE_SOCKET_FILTER; only when
 * that lookup yields -EINVAL is BPF_PROG_TYPE_SK_REUSEPORT tried.
 */
int sk_reuseport_attach_bpf(u32 ufd, struct sock *sk)
{
	struct bpf_prog *prog;
	int err, optmem_max;

	if (sock_flag(sk, SOCK_FILTER_LOCKED))
		return -EPERM;

	prog = bpf_prog_get_type(ufd, BPF_PROG_TYPE_SOCKET_FILTER);
	if (PTR_ERR(prog) == -EINVAL)
		prog = bpf_prog_get_type(ufd, BPF_PROG_TYPE_SK_REUSEPORT);
	if (IS_ERR(prog))
		return PTR_ERR(prog);

	if (prog->type == BPF_PROG_TYPE_SK_REUSEPORT) {
		/* Like other non BPF_PROG_TYPE_SOCKET_FILTER
		 * bpf prog (e.g. sockmap).
It depends on the
		 * limitation imposed by bpf_prog_load().
		 * Hence, sysctl_optmem_max is not checked.
		 */
		/* Only stream/datagram sockets speaking TCP or UDP over
		 * IPv4 or IPv6 are accepted for this program type.
		 */
		if ((sk->sk_type != SOCK_STREAM &&
		     sk->sk_type != SOCK_DGRAM) ||
		    (sk->sk_protocol != IPPROTO_UDP &&
		     sk->sk_protocol != IPPROTO_TCP) ||
		    (sk->sk_family != AF_INET &&
		     sk->sk_family != AF_INET6)) {
			err = -ENOTSUPP;
			goto err_prog_put;
		}
	} else {
		/* BPF_PROG_TYPE_SOCKET_FILTER */
		optmem_max = READ_ONCE(sock_net(sk)->core.sysctl_optmem_max);
		if (bpf_prog_size(prog->len) > optmem_max) {
			err = -ENOMEM;
			goto err_prog_put;
		}
	}

	err = reuseport_attach_prog(sk, prog);
err_prog_put:
	/* Reached on success too; the reference is only dropped on error. */
	if (err)
		bpf_prog_put(prog);

	return err;
}

/* Free a reuseport program: SK_REUSEPORT typed programs are put with
 * bpf_prog_put(), everything else goes through bpf_prog_destroy().
 * A NULL @prog is ignored.
 */
void sk_reuseport_prog_free(struct bpf_prog *prog)
{
	if (!prog)
		return;

	if (prog->type == BPF_PROG_TYPE_SK_REUSEPORT)
		bpf_prog_put(prog);
	else
		bpf_prog_destroy(prog);
}

/* Thin wrapper around skb_ensure_writable(). With CONFIG_DEBUG_NET,
 * lengths above INT_MAX are rejected with -EINVAL before the call.
 */
static inline int __bpf_try_make_writable(struct sk_buff *skb,
					  unsigned int write_len)
{
#ifdef CONFIG_DEBUG_NET
	/* Avoid a splat in pskb_may_pull_reason() */
	if (write_len > INT_MAX)
		return -EINVAL;
#endif
	return skb_ensure_writable(skb, write_len);
}

/* Same as __bpf_try_make_writable(), but always refreshes the cached
 * data pointers afterwards, whether or not the call succeeded.
 */
static inline int bpf_try_make_writable(struct sk_buff *skb,
					unsigned int write_len)
{
	int err = __bpf_try_make_writable(skb, write_len);

	bpf_compute_data_pointers(skb);
	return err;
}

/* Make the linear part (skb_headlen() bytes) of @skb writable. */
static int bpf_try_make_head_writable(struct sk_buff *skb)
{
	return bpf_try_make_writable(skb, skb_headlen(skb));
}

/* At tc ingress only: run skb_postpush_rcsum() over the mac header. */
static inline void bpf_push_mac_rcsum(struct sk_buff *skb)
{
	if (skb_at_tc_ingress(skb))
		skb_postpush_rcsum(skb, skb_mac_header(skb), skb->mac_len);
}

/* At tc ingress only: run skb_postpull_rcsum() over the mac header. */
static inline void bpf_pull_mac_rcsum(struct sk_buff *skb)
{
	if (skb_at_tc_ingress(skb))
		skb_postpull_rcsum(skb, skb_mac_header(skb), skb->mac_len);
}

/*
 * bpf_skb_store_bytes - write @len bytes from @from into @skb at @offset
 *
 * Only BPF_F_RECOMPUTE_CSUM and BPF_F_INVALIDATE_HASH are accepted in
 * @flags (-EINVAL otherwise). Offsets above INT_MAX, or a failure to
 * make offset + len bytes writable, yield -EFAULT.
 */
BPF_CALL_5(bpf_skb_store_bytes, struct sk_buff *, skb, u32, offset,
	   const void *, from, u32, len, u64, flags)
{
	void *ptr;

	if (unlikely(flags & ~(BPF_F_RECOMPUTE_CSUM | BPF_F_INVALIDATE_HASH)))
		return -EINVAL;
	if (unlikely(offset > INT_MAX))
		return -EFAULT;
	if (unlikely(bpf_try_make_writable(skb, offset + len)))
		return -EFAULT;

	ptr = skb->data +
offset;
	/* With BPF_F_RECOMPUTE_CSUM the copy is bracketed: the postpull
	 * helper runs while the old bytes are still in place, the postpush
	 * helper once the new bytes have been written.
	 */
	if (flags & BPF_F_RECOMPUTE_CSUM)
		__skb_postpull_rcsum(skb, ptr, len, offset);

	memcpy(ptr, from, len);

	if (flags & BPF_F_RECOMPUTE_CSUM)
		__skb_postpush_rcsum(skb, ptr, len, offset);
	if (flags & BPF_F_INVALIDATE_HASH)
		skb_clear_hash(skb);

	return 0;
}

static const struct bpf_func_proto bpf_skb_store_bytes_proto = {
	.func		= bpf_skb_store_bytes,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_ANYTHING,
	.arg3_type	= ARG_PTR_TO_MEM | MEM_RDONLY,
	.arg4_type	= ARG_CONST_SIZE,
	.arg5_type	= ARG_ANYTHING,
};

/* Plain C entry point forwarding to the body of bpf_skb_store_bytes. */
int __bpf_skb_store_bytes(struct sk_buff *skb, u32 offset, const void *from,
			  u32 len, u64 flags)
{
	return ____bpf_skb_store_bytes(skb, offset, from, len, flags);
}

/*
 * bpf_skb_load_bytes - copy @len bytes at @offset of @skb into @to
 *
 * Returns 0 on success. When @offset exceeds INT_MAX or
 * skb_header_pointer() returns NULL, @to is zero-filled for @len bytes
 * and -EFAULT is returned.
 */
BPF_CALL_4(bpf_skb_load_bytes, const struct sk_buff *, skb, u32, offset,
	   void *, to, u32, len)
{
	void *ptr;

	if (unlikely(offset > INT_MAX))
		goto err_clear;

	ptr = skb_header_pointer(skb, offset, len, to);
	if (unlikely(!ptr))
		goto err_clear;
	/* The returned pointer may be something other than @to; copy the
	 * bytes over in that case.
	 */
	if (ptr != to)
		memcpy(to, ptr, len);

	return 0;
err_clear:
	memset(to, 0, len);
	return -EFAULT;
}

static const struct bpf_func_proto bpf_skb_load_bytes_proto = {
	.func		= bpf_skb_load_bytes,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_ANYTHING,
	.arg3_type	= ARG_PTR_TO_UNINIT_MEM,
	.arg4_type	= ARG_CONST_SIZE,
};

/* Plain C entry point forwarding to the body of bpf_skb_load_bytes. */
int __bpf_skb_load_bytes(const struct sk_buff *skb, u32 offset, void *to, u32 len)
{
	return ____bpf_skb_load_bytes(skb, offset, to, len);
}

/*
 * bpf_flow_dissector_load_bytes - load bytes from the skb behind a flow
 * dissector context
 *
 * Same contract as bpf_skb_load_bytes(), except that the offset limit is
 * 0xffff and a context without an skb is treated as a failure. On
 * failure @to is zero-filled and -EFAULT is returned.
 */
BPF_CALL_4(bpf_flow_dissector_load_bytes,
	   const struct bpf_flow_dissector *, ctx, u32, offset,
	   void *, to, u32, len)
{
	void *ptr;

	if (unlikely(offset > 0xffff))
		goto err_clear;

	if (unlikely(!ctx->skb))
		goto err_clear;

	ptr = skb_header_pointer(ctx->skb, offset, len, to);
	if (unlikely(!ptr))
		goto err_clear;
	if (ptr != to)
		memcpy(to, ptr, len);

	return 0;
err_clear:
	memset(to, 0, len);
	return -EFAULT;
}

static const struct bpf_func_proto bpf_flow_dissector_load_bytes_proto = {
	.func		= bpf_flow_dissector_load_bytes,
	.gpl_only	= false,
	.ret_type
= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_ANYTHING,
	.arg3_type	= ARG_PTR_TO_UNINIT_MEM,
	.arg4_type	= ARG_CONST_SIZE,
};

/*
 * bpf_skb_load_bytes_relative - copy @len bytes located @offset bytes
 * after a chosen header of @skb into @to
 *
 * @start_header selects the base: BPF_HDR_START_MAC (fails when the mac
 * header was never set) or BPF_HDR_START_NET. Any other value, an
 * @offset above 0xffff, or a range reaching past skb_tail_pointer()
 * zero-fills @to and returns -EFAULT. Returns 0 on success.
 */
BPF_CALL_5(bpf_skb_load_bytes_relative, const struct sk_buff *, skb,
	   u32, offset, void *, to, u32, len, u32, start_header)
{
	u8 *end = skb_tail_pointer(skb);
	u8 *start, *ptr;

	if (unlikely(offset > 0xffff))
		goto err_clear;

	switch (start_header) {
	case BPF_HDR_START_MAC:
		if (unlikely(!skb_mac_header_was_set(skb)))
			goto err_clear;
		start = skb_mac_header(skb);
		break;
	case BPF_HDR_START_NET:
		start = skb_network_header(skb);
		break;
	default:
		goto err_clear;
	}

	ptr = start + offset;

	/* The whole range must end at or before the tail pointer. */
	if (likely(ptr + len <= end)) {
		memcpy(to, ptr, len);
		return 0;
	}

err_clear:
	memset(to, 0, len);
	return -EFAULT;
}

static const struct bpf_func_proto bpf_skb_load_bytes_relative_proto = {
	.func		= bpf_skb_load_bytes_relative,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_ANYTHING,
	.arg3_type	= ARG_PTR_TO_UNINIT_MEM,
	.arg4_type	= ARG_CONST_SIZE,
	.arg5_type	= ARG_ANYTHING,
};

/* Make @len bytes of @skb writable; a @len of 0 selects skb_headlen(). */
BPF_CALL_2(bpf_skb_pull_data, struct sk_buff *, skb, u32, len)
{
	/* Idea is the following: should the needed direct read/write
	 * test fail during runtime, we can pull in more data and redo
	 * again, since implicitly, we invalidate previous checks here.
	 *
	 * Or, since we know how much we need to make read/writeable,
	 * this can be done once at the program beginning for direct
	 * access case. By this we overcome limitations of only current
	 * headroom being accessible.
	 */
	return bpf_try_make_writable(skb, len ? : skb_headlen(skb));
}

static const struct bpf_func_proto bpf_skb_pull_data_proto = {
	.func		= bpf_skb_pull_data,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_ANYTHING,
};

/* Return @sk when sk_fullsock() accepts it, NULL otherwise. */
BPF_CALL_1(bpf_sk_fullsock, struct sock *, sk)
{
	return sk_fullsock(sk) ?
(unsigned long)sk : (unsigned long)NULL; } static const struct bpf_func_proto bpf_sk_fullsock_proto = { .func = bpf_sk_fullsock, .gpl_only = false, .ret_type = RET_PTR_TO_SOCKET_OR_NULL, .arg1_type = ARG_PTR_TO_SOCK_COMMON, }; static inline int sk_skb_try_make_writable(struct sk_buff *skb, unsigned int write_len) { return __bpf_try_make_writable(skb, write_len); } BPF_CALL_2(sk_skb_pull_data, struct sk_buff *, skb, u32, len) { /* Idea is the following: should the needed direct read/write * test fail during runtime, we can pull in more data and redo * again, since implicitly, we invalidate previous checks here. * * Or, since we know how much we need to make read/writeable, * this can be done once at the program beginning for direct * access case. By this we overcome limitations of only current * headroom being accessible. */ return sk_skb_try_make_writable(skb, len ? : skb_headlen(skb)); } static const struct bpf_func_proto sk_skb_pull_data_proto = { .func = sk_skb_pull_data, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, }; BPF_CALL_5(bpf_l3_csum_replace, struct sk_buff *, skb, u32, offset, u64, from, u64, to, u64, flags) { __sum16 *ptr; if (unlikely(flags & ~(BPF_F_HDR_FIELD_MASK))) return -EINVAL; if (unlikely(offset > 0xffff || offset & 1)) return -EFAULT; if (unlikely(bpf_try_make_writable(skb, offset + sizeof(*ptr)))) return -EFAULT; ptr = (__sum16 *)(skb->data + offset); switch (flags & BPF_F_HDR_FIELD_MASK) { case 0: if (unlikely(from != 0)) return -EINVAL; csum_replace_by_diff(ptr, to); break; case 2: csum_replace2(ptr, from, to); break; case 4: csum_replace4(ptr, from, to); break; default: return -EINVAL; } return 0; } static const struct bpf_func_proto bpf_l3_csum_replace_proto = { .func = bpf_l3_csum_replace, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, }; 
BPF_CALL_5(bpf_l4_csum_replace, struct sk_buff *, skb, u32, offset, u64, from, u64, to, u64, flags) { bool is_pseudo = flags & BPF_F_PSEUDO_HDR; bool is_mmzero = flags & BPF_F_MARK_MANGLED_0; bool do_mforce = flags & BPF_F_MARK_ENFORCE; __sum16 *ptr; if (unlikely(flags & ~(BPF_F_MARK_MANGLED_0 | BPF_F_MARK_ENFORCE | BPF_F_PSEUDO_HDR | BPF_F_HDR_FIELD_MASK))) return -EINVAL; if (unlikely(offset > 0xffff || offset & 1)) return -EFAULT; if (unlikely(bpf_try_make_writable(skb, offset + sizeof(*ptr)))) return -EFAULT; ptr = (__sum16 *)(skb->data + offset); if (is_mmzero && !do_mforce && !*ptr) return 0; switch (flags & BPF_F_HDR_FIELD_MASK) { case 0: if (unlikely(from != 0)) return -EINVAL; inet_proto_csum_replace_by_diff(ptr, skb, to, is_pseudo); break; case 2: inet_proto_csum_replace2(ptr, skb, from, to, is_pseudo); break; case 4: inet_proto_csum_replace4(ptr, skb, from, to, is_pseudo); break; default: return -EINVAL; } if (is_mmzero && !*ptr) *ptr = CSUM_MANGLED_0; return 0; } static const struct bpf_func_proto bpf_l4_csum_replace_proto = { .func = bpf_l4_csum_replace, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, }; BPF_CALL_5(bpf_csum_diff, __be32 *, from, u32, from_size, __be32 *, to, u32, to_size, __wsum, seed) { /* This is quite flexible, some examples: * * from_size == 0, to_size > 0, seed := csum --> pushing data * from_size > 0, to_size == 0, seed := csum --> pulling data * from_size > 0, to_size > 0, seed := 0 --> diffing data * * Even for diffing, from_size and to_size don't need to be equal. 
*/

	__wsum ret = seed;

	/* Three cases, keyed on which of the two buffers is non-empty;
	 * with both sizes 0 the seed is returned folded to 16 bits.
	 */
	if (from_size && to_size)
		ret = csum_sub(csum_partial(to, to_size, ret),
			       csum_partial(from, from_size, 0));
	else if (to_size)
		ret = csum_partial(to, to_size, ret);

	else if (from_size)
		ret = ~csum_partial(from, from_size, ~ret);

	return csum_from32to16((__force unsigned int)ret);
}

static const struct bpf_func_proto bpf_csum_diff_proto = {
	.func		= bpf_csum_diff,
	.gpl_only	= false,
	.pkt_access	= true,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_MEM | PTR_MAYBE_NULL | MEM_RDONLY,
	.arg2_type	= ARG_CONST_SIZE_OR_ZERO,
	.arg3_type	= ARG_PTR_TO_MEM | PTR_MAYBE_NULL | MEM_RDONLY,
	.arg4_type	= ARG_CONST_SIZE_OR_ZERO,
	.arg5_type	= ARG_ANYTHING,
};

/*
 * bpf_csum_update - add @csum into skb->csum
 *
 * Only acts when skb->ip_summed is CHECKSUM_COMPLETE, in which case the
 * updated skb->csum is returned; otherwise returns -ENOTSUPP.
 */
BPF_CALL_2(bpf_csum_update, struct sk_buff *, skb, __wsum, csum)
{
	/* The interface is to be used in combination with bpf_csum_diff()
	 * for direct packet writes. csum rotation for alignment as well
	 * as emulating csum_sub() can be done from the eBPF program.
	 */
	if (skb->ip_summed == CHECKSUM_COMPLETE)
		return (skb->csum = csum_add(skb->csum, csum));

	return -ENOTSUPP;
}

static const struct bpf_func_proto bpf_csum_update_proto = {
	.func		= bpf_csum_update,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_ANYTHING,
};

/*
 * bpf_csum_level - change or query the checksum level of @skb
 *
 * @level is one of BPF_CSUM_LEVEL_INC, _DEC, _RESET or _QUERY; anything
 * else returns -EINVAL.
 */
BPF_CALL_2(bpf_csum_level, struct sk_buff *, skb, u64, level)
{
	/* The interface is to be used in combination with bpf_skb_adjust_room()
	 * for encap/decap of packet headers when BPF_F_ADJ_ROOM_NO_CSUM_RESET
	 * is passed as flags, for example.
	 */
	switch (level) {
	case BPF_CSUM_LEVEL_INC:
		__skb_incr_checksum_unnecessary(skb);
		break;
	case BPF_CSUM_LEVEL_DEC:
		__skb_decr_checksum_unnecessary(skb);
		break;
	case BPF_CSUM_LEVEL_RESET:
		__skb_reset_checksum_unnecessary(skb);
		break;
	case BPF_CSUM_LEVEL_QUERY:
		return skb->ip_summed == CHECKSUM_UNNECESSARY ?
skb->csum_level : -EACCES;
	default:
		return -EINVAL;
	}

	return 0;
}

static const struct bpf_func_proto bpf_csum_level_proto = {
	.func		= bpf_csum_level,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_ANYTHING,
};

/* Receive-side redirect for devices that carry a mac header. */
static inline int __bpf_rx_skb(struct net_device *dev, struct sk_buff *skb)
{
	return dev_forward_skb_nomtu(dev, skb);
}

/* Receive-side redirect without a mac header: after
 * ____dev_forward_skb() succeeds the skb is assigned to @dev and
 * passed to netif_rx().
 */
static inline int __bpf_rx_skb_no_mac(struct net_device *dev,
				      struct sk_buff *skb)
{
	int ret = ____dev_forward_skb(dev, skb, false);

	if (likely(!ret)) {
		skb->dev = dev;
		ret = netif_rx(skb);
	}

	return ret;
}

/*
 * __bpf_tx_skb - queue @skb for transmission on @dev
 *
 * Frees the skb and returns -ENETDOWN when dev_xmit_recursion() reports
 * that the recursion limit was hit. Otherwise the dev_queue_xmit() call
 * is bracketed by dev_xmit_recursion_inc()/_dec() and its result is
 * returned.
 */
static inline int __bpf_tx_skb(struct net_device *dev, struct sk_buff *skb)
{
	int ret;

	if (dev_xmit_recursion()) {
		net_crit_ratelimited("bpf: recursion limit reached on datapath, buggy bpf program?\n");
		kfree_skb(skb);
		return -ENETDOWN;
	}

	skb->dev = dev;
	skb_set_redirected_noclear(skb, skb_at_tc_ingress(skb));
	skb_clear_tstamp(skb);

	dev_xmit_recursion_inc();
	ret = dev_queue_xmit(skb);
	dev_xmit_recursion_dec();

	return ret;
}

/*
 * __bpf_redirect_no_mac - redirect to a device that takes no mac header
 *
 * Strips everything in front of the network header and then receives
 * (BPF_F_INGRESS) or transmits the skb on @dev. An skb with no payload
 * beyond the network offset is freed and -ERANGE returned.
 */
static int __bpf_redirect_no_mac(struct sk_buff *skb, struct net_device *dev,
				 u32 flags)
{
	unsigned int mlen = skb_network_offset(skb);

	if (unlikely(skb->len <= mlen)) {
		kfree_skb(skb);
		return -ERANGE;
	}

	if (mlen) {
		__skb_pull(skb, mlen);

		/* At ingress, the mac header has already been pulled once.
		 * At egress, skb_postpull_rcsum has to be done in case that
		 * the skb is originated from ingress (i.e. a forwarded skb)
		 * to ensure that rcsum starts at net header.
		 */
		if (!skb_at_tc_ingress(skb))
			skb_postpull_rcsum(skb, skb_mac_header(skb), mlen);
	}
	skb_pop_mac_header(skb);
	skb_reset_mac_len(skb);
	return flags & BPF_F_INGRESS ?
	       __bpf_rx_skb_no_mac(dev, skb) : __bpf_tx_skb(dev, skb);
}

/*
 * __bpf_redirect_common - redirect to a device that expects a mac header
 *
 * The skb is freed and -ERANGE returned when it is empty or its mac
 * header does not precede the network header.
 */
static int __bpf_redirect_common(struct sk_buff *skb, struct net_device *dev,
				 u32 flags)
{
	/* Verify that a link layer header is carried */
	if (unlikely(skb->mac_header >= skb->network_header || skb->len == 0)) {
		kfree_skb(skb);
		return -ERANGE;
	}

	bpf_push_mac_rcsum(skb);
	return flags & BPF_F_INGRESS ?
__bpf_rx_skb(dev, skb) : __bpf_tx_skb(dev, skb);
}

/* Pick the redirect flavour based on dev_is_mac_header_xmit(). */
static int __bpf_redirect(struct sk_buff *skb, struct net_device *dev,
			  u32 flags)
{
	if (dev_is_mac_header_xmit(dev))
		return __bpf_redirect_common(skb, dev, flags);
	else
		return __bpf_redirect_no_mac(skb, dev, flags);
}

#if IS_ENABLED(CONFIG_IPV6)
/*
 * bpf_out_neigh_v6 - resolve the IPv6 neighbour and transmit @skb on @dev
 * @nh: optional next hop; when NULL the next hop is derived from the
 *      skb's dst and the packet's destination address
 *
 * Returns the neigh_output() result, -ENOMEM when the headroom cannot
 * be expanded, or -ENETDOWN (skb freed) when the recursion limit is hit
 * or the neighbour lookup fails.
 */
static int bpf_out_neigh_v6(struct net *net, struct sk_buff *skb,
			    struct net_device *dev, struct bpf_nh_params *nh)
{
	u32 hh_len = LL_RESERVED_SPACE(dev);
	const struct in6_addr *nexthop;
	struct dst_entry *dst = NULL;
	struct neighbour *neigh;

	if (dev_xmit_recursion()) {
		net_crit_ratelimited("bpf: recursion limit reached on datapath, buggy bpf program?\n");
		goto out_drop;
	}

	skb->dev = dev;
	skb_clear_tstamp(skb);

	if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) {
		skb = skb_expand_head(skb, hh_len);
		/* No kfree_skb() on this path: the skb pointer is NULL. */
		if (!skb)
			return -ENOMEM;
	}

	rcu_read_lock();
	if (!nh) {
		dst = skb_dst(skb);
		nexthop = rt6_nexthop(dst_rt6_info(dst),
				      &ipv6_hdr(skb)->daddr);
	} else {
		nexthop = &nh->ipv6_nh;
	}
	neigh = ip_neigh_gw6(dev, nexthop);
	if (likely(!IS_ERR(neigh))) {
		int ret;

		sock_confirm_neigh(skb, neigh);
		local_bh_disable();
		dev_xmit_recursion_inc();
		ret = neigh_output(neigh, skb, false);
		dev_xmit_recursion_dec();
		local_bh_enable();
		rcu_read_unlock();
		return ret;
	}
	rcu_read_unlock();
	/* dst is only set when no explicit next hop was given. */
	if (dst)
		IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
out_drop:
	kfree_skb(skb);
	return -ENETDOWN;
}

/*
 * __bpf_redirect_neigh_v6 - IPv6 leg of the neighbour redirect
 *
 * Without @nh a route is looked up from the packet's own header fields
 * and set as the skb's dst. Returns NET_XMIT_SUCCESS or NET_XMIT_DROP.
 */
static int __bpf_redirect_neigh_v6(struct sk_buff *skb, struct net_device *dev,
				   struct bpf_nh_params *nh)
{
	const struct ipv6hdr *ip6h = ipv6_hdr(skb);
	struct net *net = dev_net(dev);
	int err, ret = NET_XMIT_DROP;

	if (!nh) {
		struct dst_entry *dst;
		struct flowi6 fl6 = {
			.flowi6_flags = FLOWI_FLAG_ANYSRC,
			.flowi6_mark  = skb->mark,
			.flowlabel    = ip6_flowinfo(ip6h),
			.flowi6_oif   = dev->ifindex,
			.flowi6_proto = ip6h->nexthdr,
			.daddr	      = ip6h->daddr,
			.saddr	      = ip6h->saddr,
		};

		dst = ipv6_stub->ipv6_dst_lookup_flow(net, NULL, &fl6, NULL);
		if (IS_ERR(dst))
			goto out_drop;

		skb_dst_set(skb, dst);
	} else if
(nh->nh_family != AF_INET6) {
		/* An explicit next hop for IPv6 traffic must be IPv6. */
		goto out_drop;
	}

	err = bpf_out_neigh_v6(net, skb, dev, nh);
	if (unlikely(net_xmit_eval(err)))
		DEV_STATS_INC(dev, tx_errors);
	else
		ret = NET_XMIT_SUCCESS;
	/* No kfree_skb() on this path: the skb was handed to
	 * bpf_out_neigh_v6() above.
	 */
	goto out_xmit;
out_drop:
	DEV_STATS_INC(dev, tx_errors);
	kfree_skb(skb);
out_xmit:
	return ret;
}
#else
/* IPv6 support compiled out: drop the packet. */
static int __bpf_redirect_neigh_v6(struct sk_buff *skb, struct net_device *dev,
				   struct bpf_nh_params *nh)
{
	kfree_skb(skb);
	return NET_XMIT_DROP;
}
#endif /* CONFIG_IPV6 */

#if IS_ENABLED(CONFIG_INET)
/*
 * bpf_out_neigh_v4 - resolve the neighbour for an IPv4 packet and
 * transmit @skb on @dev
 * @nh: optional next hop; may be of family AF_INET or AF_INET6. When
 *      NULL the neighbour is derived from the skb's route.
 *
 * Returns the neigh_output() result, -ENOMEM when the headroom cannot
 * be expanded, or -ENETDOWN (skb freed) when the recursion limit is
 * hit, @nh has an unsupported family, or the neighbour lookup fails.
 */
static int bpf_out_neigh_v4(struct net *net, struct sk_buff *skb,
			    struct net_device *dev, struct bpf_nh_params *nh)
{
	u32 hh_len = LL_RESERVED_SPACE(dev);
	struct neighbour *neigh;
	bool is_v6gw = false;

	if (dev_xmit_recursion()) {
		net_crit_ratelimited("bpf: recursion limit reached on datapath, buggy bpf program?\n");
		goto out_drop;
	}

	skb->dev = dev;
	skb_clear_tstamp(skb);

	if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) {
		skb = skb_expand_head(skb, hh_len);
		if (!skb)
			return -ENOMEM;
	}

	rcu_read_lock();
	if (!nh) {
		struct rtable *rt = skb_rtable(skb);

		neigh = ip_neigh_for_gw(rt, skb, &is_v6gw);
	} else if (nh->nh_family == AF_INET6) {
		neigh = ip_neigh_gw6(dev, &nh->ipv6_nh);
		is_v6gw = true;
	} else if (nh->nh_family == AF_INET) {
		neigh = ip_neigh_gw4(dev, nh->ipv4_nh);
	} else {
		rcu_read_unlock();
		goto out_drop;
	}

	if (likely(!IS_ERR(neigh))) {
		int ret;

		sock_confirm_neigh(skb, neigh);
		local_bh_disable();
		dev_xmit_recursion_inc();
		ret = neigh_output(neigh, skb, is_v6gw);
		dev_xmit_recursion_dec();
		local_bh_enable();
		rcu_read_unlock();
		return ret;
	}
	rcu_read_unlock();
out_drop:
	kfree_skb(skb);
	return -ENETDOWN;
}

/*
 * __bpf_redirect_neigh_v4 - IPv4 leg of the neighbour redirect
 *
 * Without @nh a route is looked up from the packet's own header fields.
 * Returns NET_XMIT_SUCCESS or NET_XMIT_DROP.
 */
static int __bpf_redirect_neigh_v4(struct sk_buff *skb, struct net_device *dev,
				   struct bpf_nh_params *nh)
{
	const struct iphdr *ip4h = ip_hdr(skb);
	struct net *net = dev_net(dev);
	int err, ret = NET_XMIT_DROP;

	if (!nh) {
		struct flowi4 fl4 = {
			.flowi4_flags = FLOWI_FLAG_ANYSRC,
			.flowi4_mark  = skb->mark,
			.flowi4_tos   = inet_dscp_to_dsfield(ip4h_dscp(ip4h)),
			.flowi4_oif   =
dev->ifindex,
			.flowi4_proto = ip4h->protocol,
			.daddr	      = ip4h->daddr,
			.saddr	      = ip4h->saddr,
		};
		struct rtable *rt;

		rt = ip_route_output_flow(net, &fl4, NULL);
		if (IS_ERR(rt))
			goto out_drop;
		/* Only unicast and local routes are usable here; the route
		 * reference is dropped again for anything else.
		 */
		if (rt->rt_type != RTN_UNICAST && rt->rt_type != RTN_LOCAL) {
			ip_rt_put(rt);
			goto out_drop;
		}

		skb_dst_set(skb, &rt->dst);
	}

	err = bpf_out_neigh_v4(net, skb, dev, nh);
	if (unlikely(net_xmit_eval(err)))
		DEV_STATS_INC(dev, tx_errors);
	else
		ret = NET_XMIT_SUCCESS;
	/* No kfree_skb() on this path: the skb was handed to
	 * bpf_out_neigh_v4() above.
	 */
	goto out_xmit;
out_drop:
	DEV_STATS_INC(dev, tx_errors);
	kfree_skb(skb);
out_xmit:
	return ret;
}
#else
/* IPv4 support compiled out: drop the packet. */
static int __bpf_redirect_neigh_v4(struct sk_buff *skb, struct net_device *dev,
				   struct bpf_nh_params *nh)
{
	kfree_skb(skb);
	return NET_XMIT_DROP;
}
#endif /* CONFIG_INET */

/*
 * __bpf_redirect_neigh - redirect @skb to @dev with neighbour resolution
 *
 * Requires a mac header in front of the network header and a
 * non-multicast destination address. The ethernet header is pulled and
 * the packet dispatched on skb->protocol to the IPv4 or IPv6 variant.
 * Everything else frees the skb and returns -ENOTSUPP.
 */
static int __bpf_redirect_neigh(struct sk_buff *skb, struct net_device *dev,
				struct bpf_nh_params *nh)
{
	struct ethhdr *ethh = eth_hdr(skb);

	if (unlikely(skb->mac_header >= skb->network_header))
		goto out;
	bpf_push_mac_rcsum(skb);
	if (is_multicast_ether_addr(ethh->h_dest))
		goto out;

	skb_pull(skb, sizeof(*ethh));
	skb_unset_mac_header(skb);
	skb_reset_network_header(skb);

	if (skb->protocol == htons(ETH_P_IP))
		return __bpf_redirect_neigh_v4(skb, dev, nh);
	else if (skb->protocol == htons(ETH_P_IPV6))
		return __bpf_redirect_neigh_v6(skb, dev, nh);
out:
	kfree_skb(skb);
	return -ENOTSUPP;
}

/* Internal, non-exposed redirect flags.
*/
enum {
	BPF_F_NEIGH	= (1ULL << 16),
	BPF_F_PEER	= (1ULL << 17),
	BPF_F_NEXTHOP	= (1ULL << 18),
#define BPF_F_REDIRECT_INTERNAL	(BPF_F_NEIGH | BPF_F_PEER | BPF_F_NEXTHOP)
};

/*
 * bpf_clone_redirect - clone @skb and redirect the clone to @ifindex
 *
 * Only BPF_F_INGRESS may be set in @flags. Returns -EINVAL for other
 * flags or an unknown interface, -ENOMEM when cloning fails or the
 * original cannot be made writable again, otherwise the result of
 * __bpf_redirect() on the clone.
 */
BPF_CALL_3(bpf_clone_redirect, struct sk_buff *, skb, u32, ifindex, u64, flags)
{
	struct net_device *dev;
	struct sk_buff *clone;
	int ret;

	/* The internal flags must never overlap the user visible ones. */
	BUILD_BUG_ON(BPF_F_REDIRECT_INTERNAL & BPF_F_REDIRECT_FLAGS);

	if (unlikely(flags & (~(BPF_F_INGRESS) | BPF_F_REDIRECT_INTERNAL)))
		return -EINVAL;

	dev = dev_get_by_index_rcu(dev_net(skb->dev), ifindex);
	if (unlikely(!dev))
		return -EINVAL;

	clone = skb_clone(skb, GFP_ATOMIC);
	if (unlikely(!clone))
		return -ENOMEM;

	/* For direct write, we need to keep the invariant that the skbs
	 * we're dealing with need to be uncloned. Should uncloning fail
	 * here, we need to free the just generated clone to unclone once
	 * again.
	 */
	ret = bpf_try_make_head_writable(skb);
	if (unlikely(ret)) {
		kfree_skb(clone);
		return -ENOMEM;
	}

	return __bpf_redirect(clone, dev, flags);
}

static const struct bpf_func_proto bpf_clone_redirect_proto = {
	.func           = bpf_clone_redirect,
	.gpl_only       = false,
	.ret_type       = RET_INTEGER,
	.arg1_type      = ARG_PTR_TO_CTX,
	.arg2_type      = ARG_ANYTHING,
	.arg3_type      = ARG_ANYTHING,
};

/* Return the peer of @dev via its ndo_get_peer_dev op, or NULL when the
 * device does not implement that op.
 */
static struct net_device *skb_get_peer_dev(struct net_device *dev)
{
	const struct net_device_ops *ops = dev->netdev_ops;

	if (likely(ops->ndo_get_peer_dev))
		return INDIRECT_CALL_1(ops->ndo_get_peer_dev,
				       netkit_peer_dev, dev);
	return NULL;
}

/*
 * skb_do_redirect - carry out the redirect recorded in the redirect info
 *
 * Reads target index and flags from the redirect info and clears both
 * before acting. For BPF_F_PEER the skb is re-homed onto the peer
 * device and -EAGAIN is returned; this is only allowed at tc ingress,
 * with the peer up and in a different network namespace.
 */
int skb_do_redirect(struct sk_buff *skb)
{
	struct bpf_redirect_info *ri = bpf_net_ctx_get_ri();
	struct net *net = dev_net(skb->dev);
	struct net_device *dev;
	u32 flags = ri->flags;

	dev = dev_get_by_index_rcu(net, ri->tgt_index);
	ri->tgt_index = 0;
	ri->flags = 0;
	if (unlikely(!dev))
		goto out_drop;
	if (flags & BPF_F_PEER) {
		if (unlikely(!skb_at_tc_ingress(skb)))
			goto out_drop;
		dev = skb_get_peer_dev(dev);
		if (unlikely(!dev ||
			     !(dev->flags & IFF_UP) ||
			     net_eq(net, dev_net(dev))))
			goto out_drop;
		skb->dev = dev;
		dev_sw_netstats_rx_add(dev, skb->len);
		return -EAGAIN;
	}
return flags & BPF_F_NEIGH ? __bpf_redirect_neigh(skb, dev, flags & BPF_F_NEXTHOP ? &ri->nh : NULL) : __bpf_redirect(skb, dev, flags); out_drop: kfree_skb(skb); return -EINVAL; } BPF_CALL_2(bpf_redirect, u32, ifindex, u64, flags) { struct bpf_redirect_info *ri = bpf_net_ctx_get_ri(); if (unlikely(flags & (~(BPF_F_INGRESS) | BPF_F_REDIRECT_INTERNAL))) return TC_ACT_SHOT; ri->flags = flags; ri->tgt_index = ifindex; return TC_ACT_REDIRECT; } static const struct bpf_func_proto bpf_redirect_proto = { .func = bpf_redirect, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_ANYTHING, .arg2_type = ARG_ANYTHING, }; BPF_CALL_2(bpf_redirect_peer, u32, ifindex, u64, flags) { struct bpf_redirect_info *ri = bpf_net_ctx_get_ri(); if (unlikely(flags)) return TC_ACT_SHOT; ri->flags = BPF_F_PEER; ri->tgt_index = ifindex; return TC_ACT_REDIRECT; } static const struct bpf_func_proto bpf_redirect_peer_proto = { .func = bpf_redirect_peer, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_ANYTHING, .arg2_type = ARG_ANYTHING, }; BPF_CALL_4(bpf_redirect_neigh, u32, ifindex, struct bpf_redir_neigh *, params, int, plen, u64, flags) { struct bpf_redirect_info *ri = bpf_net_ctx_get_ri(); if (unlikely((plen && plen < sizeof(*params)) || flags)) return TC_ACT_SHOT; ri->flags = BPF_F_NEIGH | (plen ? 
BPF_F_NEXTHOP : 0);
	ri->tgt_index = ifindex;

	/* params is copied wholesale into ri->nh below, so both layouts
	 * must have the same size.
	 */
	BUILD_BUG_ON(sizeof(struct bpf_redir_neigh) != sizeof(struct bpf_nh_params));
	if (plen)
		memcpy(&ri->nh, params, sizeof(ri->nh));

	return TC_ACT_REDIRECT;
}

static const struct bpf_func_proto bpf_redirect_neigh_proto = {
	.func		= bpf_redirect_neigh,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_ANYTHING,
	.arg2_type	= ARG_PTR_TO_MEM | PTR_MAYBE_NULL | MEM_RDONLY,
	.arg3_type	= ARG_CONST_SIZE_OR_ZERO,
	.arg4_type	= ARG_ANYTHING,
};

/* Store @bytes in msg->apply_bytes; always returns 0. */
BPF_CALL_2(bpf_msg_apply_bytes, struct sk_msg *, msg, u32, bytes)
{
	msg->apply_bytes = bytes;
	return 0;
}

static const struct bpf_func_proto bpf_msg_apply_bytes_proto = {
	.func           = bpf_msg_apply_bytes,
	.gpl_only       = false,
	.ret_type       = RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type      = ARG_ANYTHING,
};

/* Store @bytes in msg->cork_bytes; always returns 0. */
BPF_CALL_2(bpf_msg_cork_bytes, struct sk_msg *, msg, u32, bytes)
{
	msg->cork_bytes = bytes;
	return 0;
}

/*
 * sk_msg_reset_curr - recompute sg.curr / sg.copybreak of @msg
 *
 * An empty message (sg.size == 0) points curr at sg.start with a
 * copybreak of 0. Otherwise curr becomes the element just before
 * sg.end and copybreak that element's length.
 */
static void sk_msg_reset_curr(struct sk_msg *msg)
{
	if (!msg->sg.size) {
		msg->sg.curr = msg->sg.start;
		msg->sg.copybreak = 0;
	} else {
		u32 i = msg->sg.end;

		sk_msg_iter_var_prev(i);
		msg->sg.curr = i;
		msg->sg.copybreak = msg->sg.data[i].length;
	}
}

static const struct bpf_func_proto bpf_msg_cork_bytes_proto = {
	.func           = bpf_msg_cork_bytes,
	.gpl_only       = false,
	.ret_type       = RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type      = ARG_ANYTHING,
};

/*
 * bpf_msg_pull_data - make message bytes [@start, @end) available
 * through msg->data / msg->data_end
 *
 * @flags must be 0 and @end must be greater than @start, otherwise
 * -EINVAL is returned.
 */
BPF_CALL_4(bpf_msg_pull_data, struct sk_msg *, msg, u32, start,
	   u32, end, u64, flags)
{
	u32 len = 0, offset = 0, copy = 0, poffset = 0, bytes = end - start;
	u32 first_sge, last_sge, i, shift, bytes_sg_total;
	struct scatterlist *sge;
	u8 *raw, *to, *from;
	struct page *page;

	if (unlikely(flags || end <= start))
		return -EINVAL;

	/* First find the starting scatterlist element */
	i = msg->sg.start;
	do {
		/* offset: message offset at which element i begins;
		 * len: length of element i.
		 */
		offset += len;
		len = sk_msg_elem(msg, i)->length;
		if (start < offset + len)
			break;
		sk_msg_iter_var_next(i);
	} while (i != msg->sg.end);

	/* The walk ran off the end: start lies beyond the message. */
	if (unlikely(start >= offset + len))
		return -EINVAL;

	first_sge = i;
	/* The start may point into the sg element so we need to
also
	 * account for the headroom.
	 */
	bytes_sg_total = start - offset + bytes;
	/* Fast path: the range sits inside this one element and the
	 * element's bit in sg.copy is clear, so nothing has to be copied.
	 */
	if (!test_bit(i, msg->sg.copy) && bytes_sg_total <= len)
		goto out;

	/* At this point we need to linearize multiple scatterlist
	 * elements or a single shared page. Either way we need to
	 * copy into a linear buffer exclusively owned by BPF. Then
	 * place the buffer in the scatterlist and fixup the original
	 * entries by removing the entries now in the linear buffer
	 * and shifting the remaining entries. For now we do not try
	 * to copy partial entries to avoid complexity of running out
	 * of sg_entry slots. The downside is reading a single byte
	 * will copy the entire sg entry.
	 */
	do {
		copy += sk_msg_elem(msg, i)->length;
		sk_msg_iter_var_next(i);
		if (bytes_sg_total <= copy)
			break;
	} while (i != msg->sg.end);
	/* last_sge is one past the final element that gets copied. */
	last_sge = i;

	/* The message ended before the requested range was covered. */
	if (unlikely(bytes_sg_total > copy))
		return -EINVAL;

	page = alloc_pages(__GFP_NOWARN | GFP_ATOMIC | __GFP_COMP,
			   get_order(copy));
	if (unlikely(!page))
		return -ENOMEM;

	raw = page_address(page);
	i = first_sge;
	/* Copy every covered element into the new buffer back to back,
	 * emptying the element and dropping its page reference.
	 */
	do {
		sge = sk_msg_elem(msg, i);
		from = sg_virt(sge);
		len = sge->length;
		to = raw + poffset;

		memcpy(to, from, len);
		poffset += len;
		sge->length = 0;
		put_page(sg_page(sge));

		sk_msg_iter_var_next(i);
	} while (i != last_sge);

	sg_set_page(&msg->sg.data[first_sge], page, copy, 0);

	/* To repair sg ring we need to shift entries. If we only
	 * had a single entry though we can just replace it and
	 * be done. Otherwise walk the ring and shift the entries.
	 */
	WARN_ON_ONCE(last_sge == first_sge);
	/* shift: number of now-empty slots after first_sge, taking ring
	 * wrap-around into account.
	 */
	shift = last_sge > first_sge ?
last_sge - first_sge - 1 :
		NR_MSG_FRAG_IDS - first_sge + last_sge - 1;
	if (!shift)
		goto out;

	i = first_sge;
	sk_msg_iter_var_next(i);
	/* Close the gap: move each entry 'shift' slots towards first_sge
	 * and clear the slot it came from, until sg.end is reached.
	 */
	do {
		u32 move_from;

		if (i + shift >= NR_MSG_FRAG_IDS)
			move_from = i + shift - NR_MSG_FRAG_IDS;
		else
			move_from = i + shift;
		if (move_from == msg->sg.end)
			break;

		msg->sg.data[i] = msg->sg.data[move_from];
		msg->sg.data[move_from].length = 0;
		msg->sg.data[move_from].page_link = 0;
		msg->sg.data[move_from].offset = 0;
		sk_msg_iter_var_next(i);
	} while (1);

	/* Pull sg.end back by 'shift'; an unsigned underflow is detected
	 * by the result being larger and corrected by one ring length.
	 */
	msg->sg.end = msg->sg.end - shift > msg->sg.end ?
		      msg->sg.end - shift + NR_MSG_FRAG_IDS :
		      msg->sg.end - shift;
out:
	sk_msg_reset_curr(msg);
	msg->data = sg_virt(&msg->sg.data[first_sge]) + start - offset;
	msg->data_end = msg->data + bytes;
	return 0;
}

static const struct bpf_func_proto bpf_msg_pull_data_proto = {
	.func		= bpf_msg_pull_data,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_ANYTHING,
	.arg3_type	= ARG_ANYTHING,
	.arg4_type	= ARG_ANYTHING,
};

/*
 * bpf_msg_push_data - insert @len bytes into @msg at offset @start
 *
 * @flags must be 0 (-EINVAL otherwise); a @len of 0 returns 0 without
 * doing anything. A @start beyond the end of the message gives -EINVAL.
 */
BPF_CALL_4(bpf_msg_push_data, struct sk_msg *, msg, u32, start,
	   u32, len, u64, flags)
{
	struct scatterlist sge, nsge, nnsge, rsge = {0}, *psge;
	u32 new, i = 0, l = 0, space, copy = 0, offset = 0;
	u8 *raw, *to, *from;
	struct page *page;

	if (unlikely(flags))
		return -EINVAL;

	if (unlikely(len == 0))
		return 0;

	/* First find the starting scatterlist element */
	i = msg->sg.start;
	do {
		/* offset: message offset at which element i begins;
		 * l: length of element i.
		 */
		offset += l;
		l = sk_msg_elem(msg, i)->length;

		if (start < offset + l)
			break;
		sk_msg_iter_var_next(i);
	} while (i != msg->sg.end);

	/* start == offset + l is still accepted here. */
	if (start > offset + l)
		return -EINVAL;

	space = MAX_MSG_FRAGS - sk_msg_elem_used(msg);

	/* If no space available will fallback to copy, we need at
	 * least one scatterlist elem available to push data into
	 * when start aligns to the beginning of an element or two
	 * when it falls inside an element. We handle the start equals
	 * offset case because its the common case for inserting a
	 * header.
*/ if (!space || (space == 1 && start != offset)) copy = msg->sg.data[i].length; page = alloc_pages(__GFP_NOWARN | GFP_ATOMIC | __GFP_COMP, get_order(copy + len)); if (unlikely(!page)) return -ENOMEM; if (copy) { int front, back; raw = page_address(page); if (i == msg->sg.end) sk_msg_iter_var_prev(i); psge = sk_msg_elem(msg, i); front = start - offset; back = psge->length - front; from = sg_virt(psge); if (front) memcpy(raw, from, front); if (back) { from += front; to = raw + front + len; memcpy(to, from, back); } put_page(sg_page(psge)); new = i; goto place_new; } if (start - offset) { if (i == msg->sg.end) sk_msg_iter_var_prev(i); psge = sk_msg_elem(msg, i); rsge = sk_msg_elem_cpy(msg, i); psge->length = start - offset; rsge.length -= psge->length; rsge.offset += start; sk_msg_iter_var_next(i); sg_unmark_end(psge); sg_unmark_end(&rsge); } /* Slot(s) to place newly allocated data */ sk_msg_iter_next(msg, end); new = i; sk_msg_iter_var_next(i); if (i == msg->sg.end) { if (!rsge.length) goto place_new; sk_msg_iter_next(msg, end); goto place_new; } /* Shift one or two slots as needed */ sge = sk_msg_elem_cpy(msg, new); sg_unmark_end(&sge); nsge = sk_msg_elem_cpy(msg, i); if (rsge.length) { sk_msg_iter_var_next(i); nnsge = sk_msg_elem_cpy(msg, i); sk_msg_iter_next(msg, end); } while (i != msg->sg.end) { msg->sg.data[i] = sge; sge = nsge; sk_msg_iter_var_next(i); if (rsge.length) { nsge = nnsge; nnsge = sk_msg_elem_cpy(msg, i); } else { nsge = sk_msg_elem_cpy(msg, i); } } place_new: /* Place newly allocated data buffer */ sk_mem_charge(msg->sk, len); msg->sg.size += len; __clear_bit(new, msg->sg.copy); sg_set_page(&msg->sg.data[new], page, len + copy, 0); if (rsge.length) { get_page(sg_page(&rsge)); sk_msg_iter_var_next(new); msg->sg.data[new] = rsge; } sk_msg_reset_curr(msg); sk_msg_compute_data_pointers(msg); return 0; } static const struct bpf_func_proto bpf_msg_push_data_proto = { .func = bpf_msg_push_data, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = 
ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_ANYTHING, }; static void sk_msg_shift_left(struct sk_msg *msg, int i) { struct scatterlist *sge = sk_msg_elem(msg, i); int prev; put_page(sg_page(sge)); do { prev = i; sk_msg_iter_var_next(i); msg->sg.data[prev] = msg->sg.data[i]; } while (i != msg->sg.end); sk_msg_iter_prev(msg, end); } static void sk_msg_shift_right(struct sk_msg *msg, int i) { struct scatterlist tmp, sge; sk_msg_iter_next(msg, end); sge = sk_msg_elem_cpy(msg, i); sk_msg_iter_var_next(i); tmp = sk_msg_elem_cpy(msg, i); while (i != msg->sg.end) { msg->sg.data[i] = sge; sk_msg_iter_var_next(i); sge = tmp; tmp = sk_msg_elem_cpy(msg, i); } } BPF_CALL_4(bpf_msg_pop_data, struct sk_msg *, msg, u32, start, u32, len, u64, flags) { u32 i = 0, l = 0, space, offset = 0; u64 last = start + len; int pop; if (unlikely(flags)) return -EINVAL; if (unlikely(len == 0)) return 0; /* First find the starting scatterlist element */ i = msg->sg.start; do { offset += l; l = sk_msg_elem(msg, i)->length; if (start < offset + l) break; sk_msg_iter_var_next(i); } while (i != msg->sg.end); /* Bounds checks: start and pop must be inside message */ if (start >= offset + l || last > msg->sg.size) return -EINVAL; space = MAX_MSG_FRAGS - sk_msg_elem_used(msg); pop = len; /* --------------| offset * -| start |-------- len -------| * * |----- a ----|-------- pop -------|----- b ----| * |______________________________________________| length * * * a: region at front of scatter element to save * b: region at back of scatter element to save when length > A + pop * pop: region to pop from element, same as input 'pop' here will be * decremented below per iteration. * * Two top-level cases to handle when start != offset, first B is non * zero and second B is zero corresponding to when a pop includes more * than one element. * * Then if B is non-zero AND there is no space allocate space and * compact A, B regions into page. 
If there is space shift ring to * the right free'ing the next element in ring to place B, leaving * A untouched except to reduce length. */ if (start != offset) { struct scatterlist *nsge, *sge = sk_msg_elem(msg, i); int a = start - offset; int b = sge->length - pop - a; sk_msg_iter_var_next(i); if (b > 0) { if (space) { sge->length = a; sk_msg_shift_right(msg, i); nsge = sk_msg_elem(msg, i); get_page(sg_page(sge)); sg_set_page(nsge, sg_page(sge), b, sge->offset + pop + a); } else { struct page *page, *orig; u8 *to, *from; page = alloc_pages(__GFP_NOWARN | __GFP_COMP | GFP_ATOMIC, get_order(a + b)); if (unlikely(!page)) return -ENOMEM; orig = sg_page(sge); from = sg_virt(sge); to = page_address(page); memcpy(to, from, a); memcpy(to + a, from + a + pop, b); sg_set_page(sge, page, a + b, 0); put_page(orig); } pop = 0; } else { pop -= (sge->length - a); sge->length = a; } } /* From above the current layout _must_ be as follows, * * -| offset * -| start * * |---- pop ---|---------------- b ------------| * |____________________________________________| length * * Offset and start of the current msg elem are equal because in the * previous case we handled offset != start and either consumed the * entire element and advanced to the next element OR pop == 0. * * Two cases to handle here are first pop is less than the length * leaving some remainder b above. Simply adjust the element's layout * in this case. Or pop >= length of the element so that b = 0. In this * case advance to next element decrementing pop. 
*/ while (pop) { struct scatterlist *sge = sk_msg_elem(msg, i); if (pop < sge->length) { sge->length -= pop; sge->offset += pop; pop = 0; } else { pop -= sge->length; sk_msg_shift_left(msg, i); } } sk_mem_uncharge(msg->sk, len - pop); msg->sg.size -= (len - pop); sk_msg_reset_curr(msg); sk_msg_compute_data_pointers(msg); return 0; } static const struct bpf_func_proto bpf_msg_pop_data_proto = { .func = bpf_msg_pop_data, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_ANYTHING, }; #ifdef CONFIG_CGROUP_NET_CLASSID BPF_CALL_0(bpf_get_cgroup_classid_curr) { return __task_get_classid(current); } const struct bpf_func_proto bpf_get_cgroup_classid_curr_proto = { .func = bpf_get_cgroup_classid_curr, .gpl_only = false, .ret_type = RET_INTEGER, }; BPF_CALL_1(bpf_skb_cgroup_classid, const struct sk_buff *, skb) { struct sock *sk = skb_to_full_sk(skb); if (!sk || !sk_fullsock(sk)) return 0; return sock_cgroup_classid(&sk->sk_cgrp_data); } static const struct bpf_func_proto bpf_skb_cgroup_classid_proto = { .func = bpf_skb_cgroup_classid, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, }; #endif BPF_CALL_1(bpf_get_cgroup_classid, const struct sk_buff *, skb) { return task_get_classid(skb); } static const struct bpf_func_proto bpf_get_cgroup_classid_proto = { .func = bpf_get_cgroup_classid, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, }; BPF_CALL_1(bpf_get_route_realm, const struct sk_buff *, skb) { return dst_tclassid(skb); } static const struct bpf_func_proto bpf_get_route_realm_proto = { .func = bpf_get_route_realm, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, }; BPF_CALL_1(bpf_get_hash_recalc, struct sk_buff *, skb) { /* If skb_clear_hash() was called due to mangling, we can * trigger SW recalculation here. 
Later access to hash
	 * can then use the inline skb->hash via context directly
	 * instead of calling this helper again.
	 */
	return skb_get_hash(skb);
}

static const struct bpf_func_proto bpf_get_hash_recalc_proto = {
	.func		= bpf_get_hash_recalc,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
};

/* Helper: drop the skb's stored hash via skb_clear_hash(); always returns 0. */
BPF_CALL_1(bpf_set_hash_invalid, struct sk_buff *, skb)
{
	/* After all direct packet write, this can be used once for
	 * triggering a lazy recalc on next skb_get_hash() invocation.
	 */
	skb_clear_hash(skb);
	return 0;
}

static const struct bpf_func_proto bpf_set_hash_invalid_proto = {
	.func		= bpf_set_hash_invalid,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
};

/* Helper: store the caller-supplied @hash in the skb; always returns 0. */
BPF_CALL_2(bpf_set_hash, struct sk_buff *, skb, u32, hash)
{
	/* Set user specified hash as L4(+), so that it gets returned
	 * on skb_get_hash() call unless BPF prog later on triggers a
	 * skb_clear_hash().
	 */
	__skb_set_sw_hash(skb, hash, true);
	return 0;
}

static const struct bpf_func_proto bpf_set_hash_proto = {
	.func		= bpf_set_hash,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_ANYTHING,
};

/* Helper: push a VLAN tag (@vlan_proto, @vlan_tci) onto @skb and return the
 * result of skb_vlan_push().
 *
 * A @vlan_proto other than 802.1Q or 802.1AD is not rejected; it is
 * silently replaced by ETH_P_8021Q. The data pointers are recomputed
 * whether or not the push succeeded.
 */
BPF_CALL_3(bpf_skb_vlan_push, struct sk_buff *, skb, __be16, vlan_proto,
	   u16, vlan_tci)
{
	int ret;

	if (unlikely(vlan_proto != htons(ETH_P_8021Q) &&
		     vlan_proto != htons(ETH_P_8021AD)))
		vlan_proto = htons(ETH_P_8021Q);

	bpf_push_mac_rcsum(skb);
	ret = skb_vlan_push(skb, vlan_proto, vlan_tci);
	bpf_pull_mac_rcsum(skb);
	skb_reset_mac_len(skb);

	bpf_compute_data_pointers(skb);
	return ret;
}

static const struct bpf_func_proto bpf_skb_vlan_push_proto = {
	.func           = bpf_skb_vlan_push,
	.gpl_only       = false,
	.ret_type       = RET_INTEGER,
	.arg1_type      = ARG_PTR_TO_CTX,
	.arg2_type      = ARG_ANYTHING,
	.arg3_type      = ARG_ANYTHING,
};

/* Helper: pop the VLAN tag from @skb and return the result of
 * skb_vlan_pop(). The data pointers are recomputed in either case.
 */
BPF_CALL_1(bpf_skb_vlan_pop, struct sk_buff *, skb)
{
	int ret;

	bpf_push_mac_rcsum(skb);
	ret = skb_vlan_pop(skb);
	bpf_pull_mac_rcsum(skb);

	bpf_compute_data_pointers(skb);
	return ret;
}

static const struct bpf_func_proto bpf_skb_vlan_pop_proto = {
	.func           = bpf_skb_vlan_pop,
	.gpl_only       = false,
	.ret_type       = RET_INTEGER,
	.arg1_type      = ARG_PTR_TO_CTX,
};

/* Insert @len zeroed bytes at offset @off from the start of the packet
 * data: the head is extended by @len, the first @off bytes are moved to
 * the new front and the gap behind them is cleared. Always returns 0.
 */
static int bpf_skb_generic_push(struct sk_buff *skb, u32 off, u32 len)
{
	/* Caller already did skb_cow() with len as headroom,
	 * so no need to do it here.
	 */
	skb_push(skb, len);
	memmove(skb->data, skb->data + len, off);
	memset(skb->data + off, 0, len);

	/* No skb_postpush_rcsum(skb, skb->data + off, len)
	 * needed here as it does not change the skb->csum
	 * result for checksum complete when summing over
	 * zeroed blocks.
	 */
	return 0;
}

/* Remove @len bytes located at offset @off from the start of the packet
 * data: the head is pulled by @len and the first @off bytes are moved
 * behind the new start. Returns -ENOMEM when pskb_may_pull() fails for
 * @off + @len bytes, 0 otherwise.
 */
static int bpf_skb_generic_pop(struct sk_buff *skb, u32 off, u32 len)
{
	void *old_data;

	/* skb_ensure_writable() is not needed here, as we're
	 * already working on an uncloned skb.
	 */
	if (unlikely(!pskb_may_pull(skb, off + len)))
		return -ENOMEM;

	old_data = skb->data;
	__skb_pull(skb, len);
	/* Account for the removed bytes in the checksum */
	skb_postpull_rcsum(skb, old_data + off, len);
	memmove(skb->data, old_data, off);

	return 0;
}

/* bpf_skb_generic_push() plus header bookkeeping: on success the mac and
 * network header offsets move down by @len. If the transport header
 * coincided with the network header before the push, it is kept equal.
 */
static int bpf_skb_net_hdr_push(struct sk_buff *skb, u32 off, u32 len)
{
	bool trans_same = skb->transport_header == skb->network_header;
	int ret;

	/* There's no need for __skb_push()/__skb_pull() pair to
	 * get to the start of the mac header as we're guaranteed
	 * to always start from here under eBPF.
	 */
	ret = bpf_skb_generic_push(skb, off, len);
	if (likely(!ret)) {
		skb->mac_header -= len;
		skb->network_header -= len;
		if (trans_same)
			skb->transport_header = skb->network_header;
	}

	return ret;
}

/* bpf_skb_generic_pop() plus header bookkeeping: on success the mac and
 * network header offsets move up by @len. If the transport header
 * coincided with the network header before the pop, it is kept equal.
 */
static int bpf_skb_net_hdr_pop(struct sk_buff *skb, u32 off, u32 len)
{
	bool trans_same = skb->transport_header == skb->network_header;
	int ret;

	/* Same here, __skb_push()/__skb_pull() pair not needed.
*/
	ret = bpf_skb_generic_pop(skb, off, len);
	if (likely(!ret)) {
		skb->mac_header += len;
		skb->network_header += len;
		if (trans_same)
			skb->transport_header = skb->network_header;
	}

	return ret;
}

/* Turn an IPv4 skb into an IPv6-sized one: grow the network header room by
 * sizeof(struct ipv6hdr) - sizeof(struct iphdr) (the inserted bytes are
 * zeroed by the push), switch a TCPv4 GSO type to TCPv6, set skb->protocol
 * to ETH_P_IPV6 and clear the hash. Returns 0 or the negative error from
 * skb_cow() / bpf_skb_net_hdr_push().
 */
static int bpf_skb_proto_4_to_6(struct sk_buff *skb)
{
	const u32 len_diff = sizeof(struct ipv6hdr) - sizeof(struct iphdr);
	u32 off = skb_mac_header_len(skb);
	int ret;

	ret = skb_cow(skb, len_diff);
	if (unlikely(ret < 0))
		return ret;

	ret = bpf_skb_net_hdr_push(skb, off, len_diff);
	if (unlikely(ret < 0))
		return ret;

	if (skb_is_gso(skb)) {
		struct skb_shared_info *shinfo = skb_shinfo(skb);

		/* SKB_GSO_TCPV4 needs to be changed into SKB_GSO_TCPV6. */
		if (shinfo->gso_type & SKB_GSO_TCPV4) {
			shinfo->gso_type &= ~SKB_GSO_TCPV4;
			shinfo->gso_type |=  SKB_GSO_TCPV6;
		}
	}

	skb->protocol = htons(ETH_P_IPV6);
	skb_clear_hash(skb);

	return 0;
}

/* Counterpart of bpf_skb_proto_4_to_6(): shrink the network header room by
 * the IPv6/IPv4 header size difference, switch a TCPv6 GSO type to TCPv4,
 * set skb->protocol to ETH_P_IP and clear the hash. Returns 0 or the
 * negative error from skb_unclone() / bpf_skb_net_hdr_pop().
 */
static int bpf_skb_proto_6_to_4(struct sk_buff *skb)
{
	const u32 len_diff = sizeof(struct ipv6hdr) - sizeof(struct iphdr);
	u32 off = skb_mac_header_len(skb);
	int ret;

	ret = skb_unclone(skb, GFP_ATOMIC);
	if (unlikely(ret < 0))
		return ret;

	ret = bpf_skb_net_hdr_pop(skb, off, len_diff);
	if (unlikely(ret < 0))
		return ret;

	if (skb_is_gso(skb)) {
		struct skb_shared_info *shinfo = skb_shinfo(skb);

		/* SKB_GSO_TCPV6 needs to be changed into SKB_GSO_TCPV4. */
		if (shinfo->gso_type & SKB_GSO_TCPV6) {
			shinfo->gso_type &= ~SKB_GSO_TCPV6;
			shinfo->gso_type |=  SKB_GSO_TCPV4;
		}
	}

	skb->protocol = htons(ETH_P_IP);
	skb_clear_hash(skb);

	return 0;
}

/* Dispatch on (current skb->protocol, @to_proto). Only IPv4 -> IPv6 and
 * IPv6 -> IPv4 are handled; every other pair returns -ENOTSUPP.
 */
static int bpf_skb_proto_xlat(struct sk_buff *skb, __be16 to_proto)
{
	__be16 from_proto = skb->protocol;

	if (from_proto == htons(ETH_P_IP) &&
	      to_proto == htons(ETH_P_IPV6))
		return bpf_skb_proto_4_to_6(skb);

	if (from_proto == htons(ETH_P_IPV6) &&
	      to_proto == htons(ETH_P_IP))
		return bpf_skb_proto_6_to_4(skb);

	return -ENOTSUPP;
}

/* Helper: change the L3 protocol of @skb to @proto. @flags must be zero
 * (-EINVAL otherwise). Returns the result of bpf_skb_proto_xlat(); the
 * data pointers are recomputed even when the translation failed.
 */
BPF_CALL_3(bpf_skb_change_proto, struct sk_buff *, skb, __be16, proto,
	   u64, flags)
{
	int ret;

	if (unlikely(flags))
		return -EINVAL;

	/* General idea is that this helper does the basic groundwork
	 * needed for changing the protocol, and eBPF program fills the
	 * rest through bpf_skb_store_bytes(), bpf_lX_csum_replace()
	 * and other helpers, rather than passing a raw buffer here.
	 *
	 * The rationale is to keep this minimal and without a need to
	 * deal with raw packet data. F.e. even if we would pass buffers
	 * here, the program still needs to call the bpf_lX_csum_replace()
	 * helpers anyway. Plus, this way we keep also separation of
	 * concerns, since f.e. bpf_skb_store_bytes() should only take
	 * care of stores.
	 *
	 * Currently, additional options and extension header space are
	 * not supported, but flags register is reserved so we can adapt
	 * that. For offloads, we mark packet as dodgy, so that headers
	 * need to be verified first.
	 */
	ret = bpf_skb_proto_xlat(skb, proto);
	bpf_compute_data_pointers(skb);
	return ret;
}

static const struct bpf_func_proto bpf_skb_change_proto_proto = {
	.func		= bpf_skb_change_proto,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_ANYTHING,
	.arg3_type	= ARG_ANYTHING,
};

/* Helper: set skb->pkt_type to @pkt_type. Both the current and the
 * requested type must pass skb_pkt_type_ok(), else -EINVAL.
 */
BPF_CALL_2(bpf_skb_change_type, struct sk_buff *, skb, u32, pkt_type)
{
	/* We only allow a restricted subset to be changed for now.
*/
	if (unlikely(!skb_pkt_type_ok(skb->pkt_type) ||
		     !skb_pkt_type_ok(pkt_type)))
		return -EINVAL;

	skb->pkt_type = pkt_type;
	return 0;
}

static const struct bpf_func_proto bpf_skb_change_type_proto = {
	.func		= bpf_skb_change_type,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_ANYTHING,
};

/* Size of the base L3 header for skb->protocol: struct iphdr for IPv4,
 * struct ipv6hdr for IPv6, and ~0U for any other protocol.
 */
static u32 bpf_skb_net_base_len(const struct sk_buff *skb)
{
	switch (skb->protocol) {
	case htons(ETH_P_IP):
		return sizeof(struct iphdr);
	case htons(ETH_P_IPV6):
		return sizeof(struct ipv6hdr);
	default:
		return ~0U;
	}
}

/* Either outer-L3 encapsulation flag */
#define BPF_F_ADJ_ROOM_ENCAP_L3_MASK	(BPF_F_ADJ_ROOM_ENCAP_L3_IPV4 | \
					 BPF_F_ADJ_ROOM_ENCAP_L3_IPV6)

/* Either L3 decapsulation flag */
#define BPF_F_ADJ_ROOM_DECAP_L3_MASK	(BPF_F_ADJ_ROOM_DECAP_L3_IPV4 | \
					 BPF_F_ADJ_ROOM_DECAP_L3_IPV6)

/* All flag bits bpf_skb_adjust_room() accepts (it additionally allows
 * BPF_F_ADJ_ROOM_NO_CSUM_RESET).
 */
#define BPF_F_ADJ_ROOM_MASK		(BPF_F_ADJ_ROOM_FIXED_GSO | \
					 BPF_F_ADJ_ROOM_ENCAP_L3_MASK | \
					 BPF_F_ADJ_ROOM_ENCAP_L4_GRE | \
					 BPF_F_ADJ_ROOM_ENCAP_L4_UDP | \
					 BPF_F_ADJ_ROOM_ENCAP_L2_ETH | \
					 BPF_F_ADJ_ROOM_ENCAP_L2( \
					  BPF_ADJ_ROOM_ENCAP_L2_MASK) | \
					 BPF_F_ADJ_ROOM_DECAP_L3_MASK)

/* Grow the header room of @skb by @len_diff bytes at offset @off and, when
 * an ENCAP_L3 flag is set, record the previous headers as inner headers
 * and mark the skb as encapsulated.
 *
 * Returns 0 on success, otherwise:
 *  -ENOTSUPP  non-TCP GSO skb that is not UDP_L4 with FIXED_GSO, or encap
 *             requested on a non-IP skb
 *  -EINVAL    contradictory encap flags, inner L2 length below ETH_HLEN
 *             with ENCAP_L2_ETH, or inner L2 length above @len_diff
 *  -EALREADY  skb is already marked encapsulated
 *  or the error from skb_cow_head() / bpf_skb_net_hdr_push() /
 *  skb_linearize().
 */
static int bpf_skb_net_grow(struct sk_buff *skb, u32 off, u32 len_diff,
			    u64 flags)
{
	u8 inner_mac_len = flags >> BPF_ADJ_ROOM_ENCAP_L2_SHIFT;
	bool encap = flags & BPF_F_ADJ_ROOM_ENCAP_L3_MASK;
	u16 mac_len = 0, inner_net = 0, inner_trans = 0;
	unsigned int gso_type = SKB_GSO_DODGY;
	int ret;

	if (skb_is_gso(skb) && !skb_is_gso_tcp(skb)) {
		/* udp gso_size delineates datagrams, only allow if fixed */
		if (!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4) ||
		    !(flags & BPF_F_ADJ_ROOM_FIXED_GSO))
			return -ENOTSUPP;
	}

	ret = skb_cow_head(skb, len_diff);
	if (unlikely(ret < 0))
		return ret;

	if (encap) {
		if (skb->protocol != htons(ETH_P_IP) &&
		    skb->protocol != htons(ETH_P_IPV6))
			return -ENOTSUPP;

		if (flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV4 &&
		    flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV6)
			return -EINVAL;

		if (flags & BPF_F_ADJ_ROOM_ENCAP_L4_GRE &&
		    flags & BPF_F_ADJ_ROOM_ENCAP_L4_UDP)
			return -EINVAL;

		if (flags & BPF_F_ADJ_ROOM_ENCAP_L2_ETH &&
		    inner_mac_len < ETH_HLEN)
			return -EINVAL;

		if (skb->encapsulation)
			return -EALREADY;

		/* Remember the pre-push header offsets; they become the
		 * inner header offsets once the push has succeeded.
		 */
		mac_len = skb->network_header - skb->mac_header;
		inner_net = skb->network_header;
		if (inner_mac_len > len_diff)
			return -EINVAL;
		inner_trans = skb->transport_header;
	}

	ret = bpf_skb_net_hdr_push(skb, off, len_diff);
	if (unlikely(ret < 0))
		return ret;

	if (encap) {
		skb->inner_mac_header = inner_net - inner_mac_len;
		skb->inner_network_header = inner_net;
		skb->inner_transport_header = inner_trans;

		if (flags & BPF_F_ADJ_ROOM_ENCAP_L2_ETH)
			skb_set_inner_protocol(skb, htons(ETH_P_TEB));
		else
			skb_set_inner_protocol(skb, skb->protocol);

		skb->encapsulation = 1;
		skb_set_network_header(skb, mac_len);

		/* Only the first matching tunnel type is added to gso_type */
		if (flags & BPF_F_ADJ_ROOM_ENCAP_L4_UDP)
			gso_type |= SKB_GSO_UDP_TUNNEL;
		else if (flags & BPF_F_ADJ_ROOM_ENCAP_L4_GRE)
			gso_type |= SKB_GSO_GRE;
		else if (flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV6)
			gso_type |= SKB_GSO_IPXIP6;
		else if (flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV4)
			gso_type |= SKB_GSO_IPXIP4;

		if (flags & BPF_F_ADJ_ROOM_ENCAP_L4_GRE ||
		    flags & BPF_F_ADJ_ROOM_ENCAP_L4_UDP) {
			int nh_len = flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV6 ?
					sizeof(struct ipv6hdr) :
					sizeof(struct iphdr);

			skb_set_transport_header(skb, mac_len + nh_len);
		}

		/* Match skb->protocol to new outer l3 protocol */
		if (skb->protocol == htons(ETH_P_IP) &&
		    flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV6)
			skb->protocol = htons(ETH_P_IPV6);
		else if (skb->protocol == htons(ETH_P_IPV6) &&
			 flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV4)
			skb->protocol = htons(ETH_P_IP);
	}

	if (skb_is_gso(skb)) {
		struct skb_shared_info *shinfo = skb_shinfo(skb);

		/* Header must be checked, and gso_segs recomputed. */
		shinfo->gso_type |= gso_type;
		shinfo->gso_segs = 0;

		/* Due to header growth, MSS needs to be downgraded.
		 * There is a BUG_ON() when segmenting the frag_list with
		 * head_frag true, so linearize the skb after downgrading
		 * the MSS.
		 */
		if (!(flags & BPF_F_ADJ_ROOM_FIXED_GSO)) {
			skb_decrease_gso_size(shinfo, len_diff);
			if (shinfo->frag_list)
				return skb_linearize(skb);
		}
	}

	return 0;
}

/* Shrink the header room of @skb by @len_diff bytes at offset @off.
 *
 * Only FIXED_GSO, the DECAP_L3 flags and NO_CSUM_RESET are accepted in
 * @flags (-EINVAL otherwise). Non-TCP GSO skbs are refused with -ENOTSUPP
 * unless they are UDP_L4 and FIXED_GSO is set. Other errors come from
 * skb_unclone() / bpf_skb_net_hdr_pop().
 */
static int bpf_skb_net_shrink(struct sk_buff *skb, u32 off, u32 len_diff,
			      u64 flags)
{
	int ret;

	if (unlikely(flags & ~(BPF_F_ADJ_ROOM_FIXED_GSO |
			       BPF_F_ADJ_ROOM_DECAP_L3_MASK |
			       BPF_F_ADJ_ROOM_NO_CSUM_RESET)))
		return -EINVAL;

	if (skb_is_gso(skb) && !skb_is_gso_tcp(skb)) {
		/* udp gso_size delineates datagrams, only allow if fixed */
		if (!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4) ||
		    !(flags & BPF_F_ADJ_ROOM_FIXED_GSO))
			return -ENOTSUPP;
	}

	ret = skb_unclone(skb, GFP_ATOMIC);
	if (unlikely(ret < 0))
		return ret;

	ret = bpf_skb_net_hdr_pop(skb, off, len_diff);
	if (unlikely(ret < 0))
		return ret;

	/* Match skb->protocol to new outer l3 protocol */
	if (skb->protocol == htons(ETH_P_IP) &&
	    flags & BPF_F_ADJ_ROOM_DECAP_L3_IPV6)
		skb->protocol = htons(ETH_P_IPV6);
	else if (skb->protocol == htons(ETH_P_IPV6) &&
		 flags & BPF_F_ADJ_ROOM_DECAP_L3_IPV4)
		skb->protocol = htons(ETH_P_IP);

	if (skb_is_gso(skb)) {
		struct skb_shared_info *shinfo = skb_shinfo(skb);

		/* Due to header shrink, MSS can be upgraded. */
		if (!(flags & BPF_F_ADJ_ROOM_FIXED_GSO))
			skb_increase_gso_size(shinfo, len_diff);

		/* Header must be checked, and gso_segs recomputed.
*/
		shinfo->gso_type |= SKB_GSO_DODGY;
		shinfo->gso_segs = 0;
	}

	return 0;
}

/* Upper bound on the skb length the length-changing helpers will create */
#define BPF_SKB_MAX_LEN SKB_MAX_ALLOC

/* sk_skb variant of adjust_room: grow (@len_diff > 0, new bytes zeroed) or
 * shrink (@len_diff < 0) @skb at its head.
 *
 * @flags and @mode must both be zero (-EINVAL). |@len_diff| above 0xfff
 * returns -EFAULT; a shrink that pskb_may_pull() cannot satisfy returns
 * -ENOMEM. When tls_sw_has_ctx_rx() is true for the skb's socket, the
 * strparser message length is adjusted by @len_diff as well.
 */
BPF_CALL_4(sk_skb_adjust_room, struct sk_buff *, skb, s32, len_diff,
	   u32, mode, u64, flags)
{
	u32 len_diff_abs = abs(len_diff);
	bool shrink = len_diff < 0;
	int ret = 0;

	if (unlikely(flags || mode))
		return -EINVAL;
	if (unlikely(len_diff_abs > 0xfffU))
		return -EFAULT;

	if (!shrink) {
		ret = skb_cow(skb, len_diff);
		if (unlikely(ret < 0))
			return ret;
		__skb_push(skb, len_diff_abs);
		memset(skb->data, 0, len_diff_abs);
	} else {
		if (unlikely(!pskb_may_pull(skb, len_diff_abs)))
			return -ENOMEM;
		__skb_pull(skb, len_diff_abs);
	}
	if (tls_sw_has_ctx_rx(skb->sk)) {
		struct strp_msg *rxm = strp_msg(skb);

		rxm->full_len += len_diff;
	}
	return ret;
}

static const struct bpf_func_proto sk_skb_adjust_room_proto = {
	.func		= sk_skb_adjust_room,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_ANYTHING,
	.arg3_type	= ARG_ANYTHING,
	.arg4_type	= ARG_ANYTHING,
};

/* Helper: grow or shrink the header room of an IPv4/IPv6 @skb by @len_diff
 * bytes, either right after the mac header (BPF_ADJ_ROOM_MAC) or after the
 * base network header (BPF_ADJ_ROOM_NET).
 *
 * Returns 0 on success, otherwise:
 *  -EINVAL    unknown flag bits, a DECAP_L3 flag on a grow, or both
 *             DECAP_L3 flags at once
 *  -EFAULT    |@len_diff| above 0xfff
 *  -ENOTSUPP  non-IP skb, unknown @mode, a shrink that would leave less
 *             than the minimum L3 header, or a grow of a non-GSO skb
 *             beyond BPF_SKB_MAX_LEN
 *  or the error from bpf_skb_net_shrink() / bpf_skb_net_grow().
 */
BPF_CALL_4(bpf_skb_adjust_room, struct sk_buff *, skb, s32, len_diff,
	   u32, mode, u64, flags)
{
	u32 len_cur, len_diff_abs = abs(len_diff);
	u32 len_min = bpf_skb_net_base_len(skb);
	u32 len_max = BPF_SKB_MAX_LEN;
	__be16 proto = skb->protocol;
	bool shrink = len_diff < 0;
	u32 off;
	int ret;

	if (unlikely(flags & ~(BPF_F_ADJ_ROOM_MASK |
			       BPF_F_ADJ_ROOM_NO_CSUM_RESET)))
		return -EINVAL;
	if (unlikely(len_diff_abs > 0xfffU))
		return -EFAULT;
	if (unlikely(proto != htons(ETH_P_IP) &&
		     proto != htons(ETH_P_IPV6)))
		return -ENOTSUPP;

	off = skb_mac_header_len(skb);
	switch (mode) {
	case BPF_ADJ_ROOM_NET:
		off += bpf_skb_net_base_len(skb);
		break;
	case BPF_ADJ_ROOM_MAC:
		break;
	default:
		return -ENOTSUPP;
	}

	if (flags & BPF_F_ADJ_ROOM_DECAP_L3_MASK) {
		if (!shrink)
			return -EINVAL;

		/* After decap, the minimum is the header of the new L3 type */
		switch (flags & BPF_F_ADJ_ROOM_DECAP_L3_MASK) {
		case BPF_F_ADJ_ROOM_DECAP_L3_IPV4:
			len_min = sizeof(struct iphdr);
			break;
		case BPF_F_ADJ_ROOM_DECAP_L3_IPV6:
			len_min = sizeof(struct ipv6hdr);
			break;
		default:
			return -EINVAL;
		}
	}

	len_cur = skb->len - skb_network_offset(skb);
	if ((shrink && (len_diff_abs >= len_cur ||
			len_cur - len_diff_abs < len_min)) ||
	    (!shrink && (skb->len + len_diff_abs > len_max &&
			 !skb_is_gso(skb))))
		return -ENOTSUPP;

	ret = shrink ? bpf_skb_net_shrink(skb, off, len_diff_abs, flags) :
		       bpf_skb_net_grow(skb, off, len_diff_abs, flags);
	if (!ret && !(flags & BPF_F_ADJ_ROOM_NO_CSUM_RESET))
		__skb_reset_checksum_unnecessary(skb);

	bpf_compute_data_pointers(skb);
	return ret;
}

static const struct bpf_func_proto bpf_skb_adjust_room_proto = {
	.func		= bpf_skb_adjust_room,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_ANYTHING,
	.arg3_type	= ARG_ANYTHING,
	.arg4_type	= ARG_ANYTHING,
};

/* Smallest length @skb may be trimmed to. Each positive candidate below
 * overwrites the previous one, in this order: network offset, transport
 * offset (if the transport header was set), and for CHECKSUM_PARTIAL the
 * end of the checksum field. Non-positive offsets are ignored, so the
 * result is 0 when none applies.
 */
static u32 __bpf_skb_min_len(const struct sk_buff *skb)
{
	int offset = skb_network_offset(skb);
	u32 min_len = 0;

	if (offset > 0)
		min_len = offset;
	if (skb_transport_header_was_set(skb)) {
		offset = skb_transport_offset(skb);
		if (offset > 0)
			min_len = offset;
	}
	if (skb->ip_summed == CHECKSUM_PARTIAL) {
		offset = skb_checksum_start_offset(skb) +
			 skb->csum_offset + sizeof(__sum16);
		if (offset > 0)
			min_len = offset;
	}
	return min_len;
}

/* Grow @skb to @new_len via __skb_grow_rcsum() and zero the added tail. */
static int bpf_skb_grow_rcsum(struct sk_buff *skb, unsigned int new_len)
{
	unsigned int old_len = skb->len;
	int ret;

	ret = __skb_grow_rcsum(skb, new_len);
	if (!ret)
		memset(skb->data + old_len, 0, new_len - old_len);
	return ret;
}

/* Trim @skb to @new_len; plain wrapper around __skb_trim_rcsum(). */
static int bpf_skb_trim_rcsum(struct sk_buff *skb, unsigned int new_len)
{
	return __skb_trim_rcsum(skb, new_len);
}

/* Common implementation of the change_tail helpers: resize @skb to
 * @new_len. Returns -EINVAL for non-zero @flags or a @new_len outside
 * [__bpf_skb_min_len(), BPF_SKB_MAX_LEN], and -ENOTSUPP for encapsulated
 * skbs. GSO state is reset after a successful resize.
 */
static inline int __bpf_skb_change_tail(struct sk_buff *skb, u32 new_len,
					u64 flags)
{
	u32 max_len = BPF_SKB_MAX_LEN;
	u32 min_len = __bpf_skb_min_len(skb);
	int ret;

	if (unlikely(flags || new_len > max_len || new_len < min_len))
		return -EINVAL;
	if (skb->encapsulation)
		return -ENOTSUPP;

	/* The basic idea of this helper is that it's performing the
	 * needed work to either grow or trim an skb, and eBPF program
	 * rewrites the rest via helpers like bpf_skb_store_bytes(),
	 * bpf_lX_csum_replace() and others
rather than passing a raw
	 * buffer here. This one is a slow path helper and intended
	 * for replies with control messages.
	 *
	 * Like in bpf_skb_change_proto(), we want to keep this rather
	 * minimal and without protocol specifics so that we are able
	 * to separate concerns as in bpf_skb_store_bytes() should only
	 * be the one responsible for writing buffers.
	 *
	 * It's really expected to be a slow path operation here for
	 * control message replies, so we're implicitly linearizing,
	 * uncloning and drop offloads from the skb by this.
	 */
	ret = __bpf_try_make_writable(skb, skb->len);
	if (!ret) {
		if (new_len > skb->len)
			ret = bpf_skb_grow_rcsum(skb, new_len);
		else if (new_len < skb->len)
			ret = bpf_skb_trim_rcsum(skb, new_len);
		if (!ret && skb_is_gso(skb))
			skb_gso_reset(skb);
	}
	return ret;
}

/* Helper: __bpf_skb_change_tail() followed by a recompute of the data
 * pointers (done whether or not the resize succeeded).
 */
BPF_CALL_3(bpf_skb_change_tail, struct sk_buff *, skb, u32, new_len,
	   u64, flags)
{
	int ret = __bpf_skb_change_tail(skb, new_len, flags);

	bpf_compute_data_pointers(skb);
	return ret;
}

static const struct bpf_func_proto bpf_skb_change_tail_proto = {
	.func		= bpf_skb_change_tail,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_ANYTHING,
	.arg3_type	= ARG_ANYTHING,
};

/* sk_skb variant: __bpf_skb_change_tail() without the data pointer
 * recompute.
 */
BPF_CALL_3(sk_skb_change_tail, struct sk_buff *, skb, u32, new_len,
	   u64, flags)
{
	return __bpf_skb_change_tail(skb, new_len, flags);
}

static const struct bpf_func_proto sk_skb_change_tail_proto = {
	.func		= sk_skb_change_tail,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_ANYTHING,
	.arg3_type	= ARG_ANYTHING,
};

/* Common implementation of the change_head helpers: prepend @head_room
 * zeroed bytes to @skb and reset the mac header to the new start.
 *
 * Returns -EINVAL for non-zero @flags, when skb->len + @head_room wraps
 * around, or when a non-GSO skb would exceed BPF_SKB_MAX_LEN; otherwise
 * the result of skb_cow().
 */
static inline int __bpf_skb_change_head(struct sk_buff *skb, u32 head_room,
					u64 flags)
{
	u32 max_len = BPF_SKB_MAX_LEN;
	u32 new_len = skb->len + head_room;
	int ret;

	/* new_len < skb->len catches u32 wrap-around of the sum above */
	if (unlikely(flags || (!skb_is_gso(skb) && new_len > max_len) ||
		     new_len < skb->len))
		return -EINVAL;

	ret = skb_cow(skb, head_room);
	if (likely(!ret)) {
		/* Idea for this helper is that we currently only
		 * allow to expand on mac header. This means that
		 * skb->protocol network header, etc, stay as is.
		 * Compared to bpf_skb_change_tail(), we're more
		 * flexible due to not needing to linearize or
		 * reset GSO. Intention for this helper is to be
		 * used by an L3 skb that needs to push mac header
		 * for redirection into L2 device.
		 */
		__skb_push(skb, head_room);
		memset(skb->data, 0, head_room);
		skb_reset_mac_header(skb);
		skb_reset_mac_len(skb);
	}

	return ret;
}

/* Helper: __bpf_skb_change_head() followed by a recompute of the data
 * pointers (done whether or not the change succeeded).
 */
BPF_CALL_3(bpf_skb_change_head, struct sk_buff *, skb, u32, head_room,
	   u64, flags)
{
	int ret = __bpf_skb_change_head(skb, head_room, flags);

	bpf_compute_data_pointers(skb);
	return ret;
}

static const struct bpf_func_proto bpf_skb_change_head_proto = {
	.func		= bpf_skb_change_head,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_ANYTHING,
	.arg3_type	= ARG_ANYTHING,
};

/* sk_skb variant: __bpf_skb_change_head() without the data pointer
 * recompute.
 */
BPF_CALL_3(sk_skb_change_head, struct sk_buff *, skb, u32, head_room,
	   u64, flags)
{
	return __bpf_skb_change_head(skb, head_room, flags);
}

static const struct bpf_func_proto sk_skb_change_head_proto = {
	.func		= sk_skb_change_head,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_ANYTHING,
	.arg3_type	= ARG_ANYTHING,
};

/* Helper: thin wrapper around xdp_get_buff_len(). */
BPF_CALL_1(bpf_xdp_get_buff_len, struct xdp_buff*, xdp)
{
	return xdp_get_buff_len(xdp);
}

static const struct bpf_func_proto bpf_xdp_get_buff_len_proto = {
	.func		= bpf_xdp_get_buff_len,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
};

BTF_ID_LIST_SINGLE(bpf_xdp_get_buff_len_bpf_ids, struct, xdp_buff)

/* Same function, but taking the xdp_buff as a BTF-typed pointer instead
 * of the program context.
 */
const struct bpf_func_proto bpf_xdp_get_buff_len_trace_proto = {
	.func		= bpf_xdp_get_buff_len,
	.gpl_only	= false,
	.arg1_type	= ARG_PTR_TO_BTF_ID,
	.arg1_btf_id	= &bpf_xdp_get_buff_len_bpf_ids[0],
};

/* Length of the metadata area in front of xdp->data, or 0 when the buffer
 * does not support metadata.
 */
static unsigned long xdp_get_metalen(const struct xdp_buff *xdp)
{
	return xdp_data_meta_unsupported(xdp) ? 0 :
	       xdp->data - xdp->data_meta;
}

/* Helper: move xdp->data by @offset bytes (negative grows the packet at
 * the front). Any metadata is moved along so it stays directly in front
 * of the data.
 *
 * Returns -EINVAL when the new start would run into the xdp_frame plus
 * metadata area at the head of the buffer, or would leave fewer than
 * ETH_HLEN bytes of data; 0 otherwise.
 */
BPF_CALL_2(bpf_xdp_adjust_head, struct xdp_buff *, xdp, int, offset)
{
	void *xdp_frame_end = xdp->data_hard_start + sizeof(struct xdp_frame);
	unsigned long metalen = xdp_get_metalen(xdp);
	void *data_start = xdp_frame_end + metalen;
	void *data = xdp->data + offset;

	if (unlikely(data < data_start ||
		     data > xdp->data_end - ETH_HLEN))
		return -EINVAL;

	if (metalen)
		memmove(xdp->data_meta + offset,
			xdp->data_meta, metalen);
	xdp->data_meta += offset;
	xdp->data = data;

	return 0;
}

static const struct bpf_func_proto bpf_xdp_adjust_head_proto = {
	.func		= bpf_xdp_adjust_head,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_ANYTHING,
};

/* Copy @len bytes between @buf and the packet, starting at packet offset
 * @off. With @flush set the direction is @buf -> packet, otherwise
 * packet -> @buf. The copy walks the linear area first and then the
 * fragments, so it may span several of them.
 */
void bpf_xdp_copy_buf(struct xdp_buff *xdp, unsigned long off,
		      void *buf, unsigned long len, bool flush)
{
	unsigned long ptr_len, ptr_off = 0;
	skb_frag_t *next_frag, *end_frag;
	struct skb_shared_info *sinfo;
	void *src, *dst;
	u8 *ptr_buf;

	/* Fast path: the whole range lies in the linear area */
	if (likely(xdp->data_end - xdp->data >= off + len)) {
		src = flush ? buf : xdp->data + off;
		dst = flush ? xdp->data + off : buf;
		memcpy(dst, src, len);
		return;
	}

	sinfo = xdp_get_shared_info_from_buff(xdp);
	end_frag = &sinfo->frags[sinfo->nr_frags];
	next_frag = &sinfo->frags[0];

	ptr_len = xdp->data_end - xdp->data;
	ptr_buf = xdp->data;

	while (true) {
		if (off < ptr_off + ptr_len) {
			unsigned long copy_off = off - ptr_off;
			unsigned long copy_len = min(len, ptr_len - copy_off);

			src = flush ? buf : ptr_buf + copy_off;
			dst = flush ?
ptr_buf + copy_off : buf; memcpy(dst, src, copy_len); off += copy_len; len -= copy_len; buf += copy_len; } if (!len || next_frag == end_frag) break; ptr_off += ptr_len; ptr_buf = skb_frag_address(next_frag); ptr_len = skb_frag_size(next_frag); next_frag++; } } void *bpf_xdp_pointer(struct xdp_buff *xdp, u32 offset, u32 len) { u32 size = xdp->data_end - xdp->data; struct skb_shared_info *sinfo; void *addr = xdp->data; int i; if (unlikely(offset > 0xffff || len > 0xffff)) return ERR_PTR(-EFAULT); if (unlikely(offset + len > xdp_get_buff_len(xdp))) return ERR_PTR(-EINVAL); if (likely(offset < size)) /* linear area */ goto out; sinfo = xdp_get_shared_info_from_buff(xdp); offset -= size; for (i = 0; i < sinfo->nr_frags; i++) { /* paged area */ u32 frag_size = skb_frag_size(&sinfo->frags[i]); if (offset < frag_size) { addr = skb_frag_address(&sinfo->frags[i]); size = frag_size; break; } offset -= frag_size; } out: return offset + len <= size ? addr + offset : NULL; } BPF_CALL_4(bpf_xdp_load_bytes, struct xdp_buff *, xdp, u32, offset, void *, buf, u32, len) { void *ptr; ptr = bpf_xdp_pointer(xdp, offset, len); if (IS_ERR(ptr)) return PTR_ERR(ptr); if (!ptr) bpf_xdp_copy_buf(xdp, offset, buf, len, false); else memcpy(buf, ptr, len); return 0; } static const struct bpf_func_proto bpf_xdp_load_bytes_proto = { .func = bpf_xdp_load_bytes, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_PTR_TO_UNINIT_MEM, .arg4_type = ARG_CONST_SIZE, }; int __bpf_xdp_load_bytes(struct xdp_buff *xdp, u32 offset, void *buf, u32 len) { return ____bpf_xdp_load_bytes(xdp, offset, buf, len); } BPF_CALL_4(bpf_xdp_store_bytes, struct xdp_buff *, xdp, u32, offset, void *, buf, u32, len) { void *ptr; ptr = bpf_xdp_pointer(xdp, offset, len); if (IS_ERR(ptr)) return PTR_ERR(ptr); if (!ptr) bpf_xdp_copy_buf(xdp, offset, buf, len, true); else memcpy(ptr, buf, len); return 0; } static const struct bpf_func_proto bpf_xdp_store_bytes_proto = { 
.func = bpf_xdp_store_bytes, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_PTR_TO_UNINIT_MEM, .arg4_type = ARG_CONST_SIZE, }; int __bpf_xdp_store_bytes(struct xdp_buff *xdp, u32 offset, void *buf, u32 len) { return ____bpf_xdp_store_bytes(xdp, offset, buf, len); } static int bpf_xdp_frags_increase_tail(struct xdp_buff *xdp, int offset) { struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(xdp); skb_frag_t *frag = &sinfo->frags[sinfo->nr_frags - 1]; struct xdp_rxq_info *rxq = xdp->rxq; unsigned int tailroom; if (!rxq->frag_size || rxq->frag_size > xdp->frame_sz) return -EOPNOTSUPP; tailroom = rxq->frag_size - skb_frag_size(frag) - skb_frag_off(frag); if (unlikely(offset > tailroom)) return -EINVAL; memset(skb_frag_address(frag) + skb_frag_size(frag), 0, offset); skb_frag_size_add(frag, offset); sinfo->xdp_frags_size += offset; if (rxq->mem.type == MEM_TYPE_XSK_BUFF_POOL) xsk_buff_get_tail(xdp)->data_end += offset; return 0; } static void bpf_xdp_shrink_data_zc(struct xdp_buff *xdp, int shrink, enum xdp_mem_type mem_type, bool release) { struct xdp_buff *zc_frag = xsk_buff_get_tail(xdp); if (release) { xsk_buff_del_tail(zc_frag); __xdp_return(0, mem_type, false, zc_frag); } else { zc_frag->data_end -= shrink; } } static bool bpf_xdp_shrink_data(struct xdp_buff *xdp, skb_frag_t *frag, int shrink) { enum xdp_mem_type mem_type = xdp->rxq->mem.type; bool release = skb_frag_size(frag) == shrink; if (mem_type == MEM_TYPE_XSK_BUFF_POOL) { bpf_xdp_shrink_data_zc(xdp, shrink, mem_type, release); goto out; } if (release) __xdp_return(skb_frag_netmem(frag), mem_type, false, NULL); out: return release; } static int bpf_xdp_frags_shrink_tail(struct xdp_buff *xdp, int offset) { struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(xdp); int i, n_frags_free = 0, len_free = 0; if (unlikely(offset > (int)xdp_get_buff_len(xdp) - ETH_HLEN)) return -EINVAL; for (i = sinfo->nr_frags - 1; i >= 0 && 
offset > 0; i--) {
		skb_frag_t *frag = &sinfo->frags[i];
		int shrink = min_t(int, offset, skb_frag_size(frag));

		len_free += shrink;
		offset -= shrink;
		if (bpf_xdp_shrink_data(xdp, frag, shrink)) {
			n_frags_free++;
		} else {
			/* Fragment survives: just shorten it and stop */
			skb_frag_size_sub(frag, shrink);
			break;
		}
	}
	sinfo->nr_frags -= n_frags_free;
	sinfo->xdp_frags_size -= len_free;

	if (unlikely(!sinfo->nr_frags)) {
		/* All fragments are gone; whatever is left of offset is
		 * taken from the linear area.
		 */
		xdp_buff_clear_frags_flag(xdp);
		xdp->data_end -= offset;
	}

	return 0;
}

/* Helper: move the end of the packet by @offset bytes (negative shrinks,
 * positive grows; grown bytes are zeroed).
 *
 * Buffers with fragments are delegated to the frags shrink/increase
 * functions. For linear buffers -EINVAL is returned when the new end
 * would pass xdp_data_hard_end() or leave fewer than ETH_HLEN bytes.
 */
BPF_CALL_2(bpf_xdp_adjust_tail, struct xdp_buff *, xdp, int, offset)
{
	void *data_hard_end = xdp_data_hard_end(xdp); /* use xdp->frame_sz */
	void *data_end = xdp->data_end + offset;

	if (unlikely(xdp_buff_has_frags(xdp))) { /* non-linear xdp buff */
		if (offset < 0)
			return bpf_xdp_frags_shrink_tail(xdp, -offset);

		return bpf_xdp_frags_increase_tail(xdp, offset);
	}

	/* Notice that xdp_data_hard_end have reserved some tailroom */
	if (unlikely(data_end > data_hard_end))
		return -EINVAL;

	if (unlikely(data_end < xdp->data + ETH_HLEN))
		return -EINVAL;

	/* Clear memory area on grow, can contain uninit kernel memory */
	if (offset > 0)
		memset(xdp->data_end, 0, offset);

	xdp->data_end = data_end;

	return 0;
}

static const struct bpf_func_proto bpf_xdp_adjust_tail_proto = {
	.func		= bpf_xdp_adjust_tail,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_ANYTHING,
};

/* Helper: move xdp->data_meta by @offset bytes.
 *
 * Returns -ENOTSUPP when the buffer does not support metadata, -EINVAL
 * when the new pointer would fall before the end of the xdp_frame area or
 * beyond xdp->data, and -EACCES when xdp_metalen_invalid() rejects the
 * resulting metadata length.
 */
BPF_CALL_2(bpf_xdp_adjust_meta, struct xdp_buff *, xdp, int, offset)
{
	void *xdp_frame_end = xdp->data_hard_start + sizeof(struct xdp_frame);
	void *meta = xdp->data_meta + offset;
	unsigned long metalen = xdp->data - meta;

	if (xdp_data_meta_unsupported(xdp))
		return -ENOTSUPP;
	if (unlikely(meta < xdp_frame_end ||
		     meta > xdp->data))
		return -EINVAL;
	if (unlikely(xdp_metalen_invalid(metalen)))
		return -EACCES;

	xdp->data_meta = meta;

	return 0;
}

static const struct bpf_func_proto bpf_xdp_adjust_meta_proto = {
	.func		= bpf_xdp_adjust_meta,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_ANYTHING,
};

/**
 * DOC: xdp redirect
 *
* XDP_REDIRECT works by a three-step process, implemented in the functions
 * below:
 *
 * 1. The bpf_redirect() and bpf_redirect_map() helpers will look up the target
 *    of the redirect and store it (along with some other metadata) in a per-CPU
 *    struct bpf_redirect_info.
 *
 * 2. When the program returns the XDP_REDIRECT return code, the driver will
 *    call xdp_do_redirect() which will use the information in struct
 *    bpf_redirect_info to actually enqueue the frame into a map type-specific
 *    bulk queue structure.
 *
 * 3. Before exiting its NAPI poll loop, the driver will call
 *    xdp_do_flush(), which will flush all the different bulk queues,
 *    thus completing the redirect. Note that xdp_do_flush() must be
 *    called before napi_complete_done() in the driver, as the
 *    XDP_REDIRECT logic relies on being inside a single NAPI instance
 *    through to the xdp_do_flush() call for RCU protection of all
 *    in-kernel data structures.
 */
/*
 * Pointers to the map entries will be kept around for this whole sequence of
 * steps, protected by RCU. However, there is no top-level rcu_read_lock() in
 * the core code; instead, the RCU protection relies on everything happening
 * inside a single NAPI poll sequence, which means it's between a pair of calls
 * to local_bh_disable()/local_bh_enable().
 *
 * The map entries are marked as __rcu and the map code makes sure to
 * dereference those pointers with rcu_dereference_check() in a way that works
 * for both sections that hold an rcu_read_lock() and sections that are
 * called from NAPI without a separate rcu_read_lock(). The code below does not
 * use RCU annotations, but relies on those in the map code.
*/
/*
 * xdp_do_flush - flush the redirect bulk queues.
 *
 * Obtains the dev, cpumap and xsk flush lists through
 * bpf_net_ctx_get_all_used_flush_lists() and flushes every list that was
 * returned non-NULL.
 */
void xdp_do_flush(void)
{
	struct list_head *lh_map, *lh_dev, *lh_xsk;

	bpf_net_ctx_get_all_used_flush_lists(&lh_map, &lh_dev, &lh_xsk);
	if (lh_dev)
		__dev_flush(lh_dev);
	if (lh_map)
		__cpu_map_flush(lh_map);
	if (lh_xsk)
		__xsk_map_flush(lh_xsk);
}
EXPORT_SYMBOL_GPL(xdp_do_flush);

#if defined(CONFIG_DEBUG_NET) && defined(CONFIG_BPF_SYSCALL)
/*
 * xdp_do_check_flushed - debug check for a forgotten xdp_do_flush().
 *
 * Any flush list that is still returned non-NULL is flushed here, and a
 * one-time warning naming @napi->poll is emitted if at least one such list
 * was found.
 */
void xdp_do_check_flushed(struct napi_struct *napi)
{
	struct list_head *lh_map, *lh_dev, *lh_xsk;
	bool missed = false;

	bpf_net_ctx_get_all_used_flush_lists(&lh_map, &lh_dev, &lh_xsk);
	if (lh_dev) {
		__dev_flush(lh_dev);
		missed = true;
	}
	if (lh_map) {
		__cpu_map_flush(lh_map);
		missed = true;
	}
	if (lh_xsk) {
		__xsk_map_flush(lh_xsk);
		missed = true;
	}

	WARN_ONCE(missed, "Missing xdp_do_flush() invocation after NAPI by %ps\n",
		  napi->poll);
}
#endif

DEFINE_STATIC_KEY_FALSE(bpf_master_redirect_enabled_key);
EXPORT_SYMBOL_GPL(bpf_master_redirect_enabled_key);

/*
 * xdp_master_redirect - pick the transmit slave for a frame received on a
 * device that has a master.
 *
 * Asks the master's ndo_xdp_get_xmit_slave() for the slave to use. If a
 * slave is returned and it is not the receiving device, its ifindex is
 * stored in the redirect info using the ifindex-redirect encoding
 * (map_type UNSPEC, map_id INT_MAX) and XDP_REDIRECT is returned;
 * otherwise XDP_TX is returned.
 *
 * NOTE(review): @master and its ndo_xdp_get_xmit_slave hook are dereferenced
 * without a NULL check; presumably callers guarantee both - confirm.
 */
u32 xdp_master_redirect(struct xdp_buff *xdp)
{
	struct bpf_redirect_info *ri = bpf_net_ctx_get_ri();
	struct net_device *master, *slave;

	master = netdev_master_upper_dev_get_rcu(xdp->rxq->dev);
	slave = master->netdev_ops->ndo_xdp_get_xmit_slave(master, xdp);
	if (slave && slave != xdp->rxq->dev) {
		/* The target device is different from the receiving device, so
		 * redirect it to the new device.
		 * Using XDP_REDIRECT gets the correct behaviour from XDP enabled
		 * drivers to unmap the packet from their rx ring.
*/
		ri->tgt_index = slave->ifindex;
		ri->map_id = INT_MAX;
		ri->map_type = BPF_MAP_TYPE_UNSPEC;
		return XDP_REDIRECT;
	}
	return XDP_TX;
}
EXPORT_SYMBOL_GPL(xdp_master_redirect);

/*
 * Redirect an xdp_buff through __xsk_map_redirect().
 *
 * The target, map type and map id are snapshotted from @ri, then @ri's
 * map_id/map_type are reset before the redirect is attempted. The matching
 * success or error tracepoint is emitted. Returns 0 or the error from
 * __xsk_map_redirect().
 */
static inline int __xdp_do_redirect_xsk(struct bpf_redirect_info *ri,
					const struct net_device *dev,
					struct xdp_buff *xdp,
					const struct bpf_prog *xdp_prog)
{
	enum bpf_map_type map_type = ri->map_type;
	void *fwd = ri->tgt_value;
	u32 map_id = ri->map_id;
	int err;

	ri->map_id = 0; /* Valid map id idr range: [1,INT_MAX[ */
	ri->map_type = BPF_MAP_TYPE_UNSPEC;

	err = __xsk_map_redirect(fwd, xdp);
	if (unlikely(err))
		goto err;

	_trace_xdp_redirect_map(dev, xdp_prog, fwd, map_type, map_id, ri->tgt_index);
	return 0;
err:
	_trace_xdp_redirect_map_err(dev, xdp_prog, fwd, map_type, map_id, ri->tgt_index, err);
	return err;
}

/*
 * Enqueue an xdp_frame according to the redirect state recorded in @ri.
 *
 * The redirect state is snapshotted and @ri is reset first. A NULL @xdpf
 * yields -EOVERFLOW. Devmap targets are enqueued either to a single entry or,
 * with BPF_F_BROADCAST, to the whole map (-ENOENT if ri->map has been
 * cleared); cpumap targets go through cpu_map_enqueue(); map type UNSPEC with
 * map_id == INT_MAX is an ifindex redirect (-EINVAL if the device lookup
 * fails). Anything else yields -EBADRQC.
 */
static __always_inline int __xdp_do_redirect_frame(struct bpf_redirect_info *ri,
						   struct net_device *dev,
						   struct xdp_frame *xdpf,
						   const struct bpf_prog *xdp_prog)
{
	enum bpf_map_type map_type = ri->map_type;
	void *fwd = ri->tgt_value;
	u32 map_id = ri->map_id;
	u32 flags = ri->flags;
	struct bpf_map *map;
	int err;

	ri->map_id = 0; /* Valid map id idr range: [1,INT_MAX[ */
	ri->flags = 0;
	ri->map_type = BPF_MAP_TYPE_UNSPEC;

	if (unlikely(!xdpf)) {
		err = -EOVERFLOW;
		goto err;
	}

	switch (map_type) {
	case BPF_MAP_TYPE_DEVMAP:
		fallthrough;
	case BPF_MAP_TYPE_DEVMAP_HASH:
		if (unlikely(flags & BPF_F_BROADCAST)) {
			map = READ_ONCE(ri->map);

			/* The map pointer is cleared when the map is being torn
			 * down by dev_map_free()
			 */
			if (unlikely(!map)) {
				err = -ENOENT;
				break;
			}

			WRITE_ONCE(ri->map, NULL);
			err = dev_map_enqueue_multi(xdpf, dev, map,
						    flags & BPF_F_EXCLUDE_INGRESS);
		} else {
			err = dev_map_enqueue(fwd, xdpf, dev);
		}
		break;
	case BPF_MAP_TYPE_CPUMAP:
		err = cpu_map_enqueue(fwd, xdpf, dev);
		break;
	case BPF_MAP_TYPE_UNSPEC:
		/* ifindex-based redirect: resolve the target device by index */
		if (map_id == INT_MAX) {
			fwd = dev_get_by_index_rcu(dev_net(dev), ri->tgt_index);
			if (unlikely(!fwd)) {
				err = -EINVAL;
				break;
			}
			err = dev_xdp_enqueue(fwd, xdpf, dev);
			break;
		}
		fallthrough;
default:
		err = -EBADRQC;
	}

	if (unlikely(err))
		goto err;

	_trace_xdp_redirect_map(dev, xdp_prog, fwd, map_type, map_id, ri->tgt_index);
	return 0;
err:
	_trace_xdp_redirect_map_err(dev, xdp_prog, fwd, map_type, map_id, ri->tgt_index, err);
	return err;
}

/*
 * xdp_do_redirect - carry out the redirect recorded in the redirect info.
 *
 * XSKMAP targets take the xdp_buff directly; every other map type first
 * converts @xdp with xdp_convert_buff_to_frame() and uses the frame path.
 */
int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp,
		    const struct bpf_prog *xdp_prog)
{
	struct bpf_redirect_info *ri = bpf_net_ctx_get_ri();
	enum bpf_map_type map_type = ri->map_type;

	if (map_type == BPF_MAP_TYPE_XSKMAP)
		return __xdp_do_redirect_xsk(ri, dev, xdp, xdp_prog);

	return __xdp_do_redirect_frame(ri, dev, xdp_convert_buff_to_frame(xdp),
				       xdp_prog);
}
EXPORT_SYMBOL_GPL(xdp_do_redirect);

/*
 * xdp_do_redirect_frame - like xdp_do_redirect(), but the caller supplies
 * the already-built @xdpf for the non-XSKMAP path.
 */
int xdp_do_redirect_frame(struct net_device *dev, struct xdp_buff *xdp,
			  struct xdp_frame *xdpf,
			  const struct bpf_prog *xdp_prog)
{
	struct bpf_redirect_info *ri = bpf_net_ctx_get_ri();
	enum bpf_map_type map_type = ri->map_type;

	if (map_type == BPF_MAP_TYPE_XSKMAP)
		return __xdp_do_redirect_xsk(ri, dev, xdp, xdp_prog);

	return __xdp_do_redirect_frame(ri, dev, xdpf, xdp_prog);
}
EXPORT_SYMBOL_GPL(xdp_do_redirect_frame);

/*
 * Map-based redirect for the generic (skb) XDP path.
 *
 * Dispatches on @map_type to the devmap, xskmap or cpumap generic redirect
 * routine; unsupported map types fail with -EBADRQC. On the XSKMAP path the
 * skb is consumed after xsk_generic_rcv() succeeds. The matching success or
 * error tracepoint is emitted before returning.
 */
static int xdp_do_generic_redirect_map(struct net_device *dev,
				       struct sk_buff *skb,
				       struct xdp_buff *xdp,
				       const struct bpf_prog *xdp_prog,
				       void *fwd, enum bpf_map_type map_type,
				       u32 map_id, u32 flags)
{
	struct bpf_redirect_info *ri = bpf_net_ctx_get_ri();
	struct bpf_map *map;
	int err;

	switch (map_type) {
	case BPF_MAP_TYPE_DEVMAP:
		fallthrough;
	case BPF_MAP_TYPE_DEVMAP_HASH:
		if (unlikely(flags & BPF_F_BROADCAST)) {
			map = READ_ONCE(ri->map);

			/* The map pointer is cleared when the map is being torn
			 * down by dev_map_free()
			 */
			if (unlikely(!map)) {
				err = -ENOENT;
				break;
			}

			WRITE_ONCE(ri->map, NULL);
			err = dev_map_redirect_multi(dev, skb, xdp_prog, map,
						     flags & BPF_F_EXCLUDE_INGRESS);
		} else {
			err = dev_map_generic_redirect(fwd, skb, xdp_prog);
		}
		if (unlikely(err))
			goto err;
		break;
	case BPF_MAP_TYPE_XSKMAP:
		err = xsk_generic_rcv(fwd, xdp);
		if (err)
			goto err;
		consume_skb(skb);
		break;
	case BPF_MAP_TYPE_CPUMAP:
		err =
cpu_map_generic_redirect(fwd, skb);
		if (unlikely(err))
			goto err;
		break;
	default:
		err = -EBADRQC;
		goto err;
	}

	_trace_xdp_redirect_map(dev, xdp_prog, fwd, map_type, map_id, ri->tgt_index);
	return 0;
err:
	_trace_xdp_redirect_map_err(dev, xdp_prog, fwd, map_type, map_id, ri->tgt_index, err);
	return err;
}

/*
 * xdp_do_generic_redirect - redirect entry point for the generic (skb) path.
 *
 * Snapshots and resets the redirect info. An ifindex redirect (map type
 * UNSPEC with map_id == INT_MAX) looks the device up by index, validates it
 * with xdp_ok_fwd_dev(), retargets the skb and transmits it through
 * generic_xdp_tx(). Every other case is delegated to
 * xdp_do_generic_redirect_map().
 */
int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb,
			    struct xdp_buff *xdp,
			    const struct bpf_prog *xdp_prog)
{
	struct bpf_redirect_info *ri = bpf_net_ctx_get_ri();
	enum bpf_map_type map_type = ri->map_type;
	void *fwd = ri->tgt_value;
	u32 map_id = ri->map_id;
	u32 flags = ri->flags;
	int err;

	ri->map_id = 0; /* Valid map id idr range: [1,INT_MAX[ */
	ri->flags = 0;
	ri->map_type = BPF_MAP_TYPE_UNSPEC;

	if (map_type == BPF_MAP_TYPE_UNSPEC && map_id == INT_MAX) {
		fwd = dev_get_by_index_rcu(dev_net(dev), ri->tgt_index);
		if (unlikely(!fwd)) {
			err = -EINVAL;
			goto err;
		}

		err = xdp_ok_fwd_dev(fwd, skb->len);
		if (unlikely(err))
			goto err;

		skb->dev = fwd;
		_trace_xdp_redirect(dev, xdp_prog, ri->tgt_index);
		generic_xdp_tx(skb, xdp_prog);
		return 0;
	}

	return xdp_do_generic_redirect_map(dev, skb, xdp, xdp_prog, fwd, map_type,
					   map_id, flags);
err:
	_trace_xdp_redirect_err(dev, xdp_prog, ri->tgt_index, err);
	return err;
}

/*
 * bpf_redirect() helper for XDP: record an ifindex-based redirect.
 *
 * Any non-zero @flags yields XDP_ABORTED. Otherwise the target ifindex is
 * stored in the redirect info and XDP_REDIRECT is returned.
 */
BPF_CALL_2(bpf_xdp_redirect, u32, ifindex, u64, flags)
{
	struct bpf_redirect_info *ri = bpf_net_ctx_get_ri();

	if (unlikely(flags))
		return XDP_ABORTED;

	/* NB! Map type UNSPEC and map_id == INT_MAX (never generated
	 * by map_idr) is used for ifindex based XDP redirect.
*/
	ri->tgt_index = ifindex;
	ri->map_id = INT_MAX;
	ri->map_type = BPF_MAP_TYPE_UNSPEC;

	return XDP_REDIRECT;
}

/* Helper prototype: two unconstrained arguments (ifindex, flags). */
static const struct bpf_func_proto bpf_xdp_redirect_proto = {
	.func           = bpf_xdp_redirect,
	.gpl_only       = false,
	.ret_type       = RET_INTEGER,
	.arg1_type      = ARG_ANYTHING,
	.arg2_type      = ARG_ANYTHING,
};

/* bpf_redirect_map() helper for XDP: defer to the map's own map_redirect op. */
BPF_CALL_3(bpf_xdp_redirect_map, struct bpf_map *, map, u64, key,
	   u64, flags)
{
	return map->ops->map_redirect(map, key, flags);
}

/* Helper prototype: constant map pointer, key and flags. */
static const struct bpf_func_proto bpf_xdp_redirect_map_proto = {
	.func           = bpf_xdp_redirect_map,
	.gpl_only       = false,
	.ret_type       = RET_INTEGER,
	.arg1_type      = ARG_CONST_MAP_PTR,
	.arg2_type      = ARG_ANYTHING,
	.arg3_type      = ARG_ANYTHING,
};

/*
 * Copy callback handed to bpf_event_output() for skb contexts.
 *
 * Copies @len bytes at offset @off of @skb into @dst_buff. Returns 0 on
 * success, or @len when skb_header_pointer() fails. The memcpy is skipped
 * when skb_header_pointer() already returned @dst_buff.
 */
static unsigned long bpf_skb_copy(void *dst_buff, const void *skb,
				  unsigned long off, unsigned long len)
{
	void *ptr = skb_header_pointer(skb, off, len, dst_buff);

	if (unlikely(!ptr))
		return len;
	if (ptr != dst_buff)
		memcpy(dst_buff, ptr, len);

	return 0;
}

/*
 * bpf_perf_event_output() helper for skb contexts.
 *
 * The number of skb bytes to append is taken from the BPF_F_CTXLEN_MASK bits
 * of @flags (shifted down by 32). Returns -EINVAL for flag bits outside
 * BPF_F_CTXLEN_MASK | BPF_F_INDEX_MASK and -EFAULT when @skb is NULL or the
 * requested size exceeds skb->len; otherwise the result of
 * bpf_event_output().
 */
BPF_CALL_5(bpf_skb_event_output, struct sk_buff *, skb, struct bpf_map *, map,
	   u64, flags, void *, meta, u64, meta_size)
{
	u64 skb_size = (flags & BPF_F_CTXLEN_MASK) >> 32;

	if (unlikely(flags & ~(BPF_F_CTXLEN_MASK | BPF_F_INDEX_MASK)))
		return -EINVAL;
	if (unlikely(!skb || skb_size > skb->len))
		return -EFAULT;

	return bpf_event_output(map, flags, meta, meta_size, skb, skb_size,
				bpf_skb_copy);
}

/* Prototype for the context-pointer flavour of the skb output helper. */
static const struct bpf_func_proto bpf_skb_event_output_proto = {
	.func		= bpf_skb_event_output,
	.gpl_only	= true,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_CONST_MAP_PTR,
	.arg3_type	= ARG_ANYTHING,
	.arg4_type	= ARG_PTR_TO_MEM | MEM_RDONLY,
	.arg5_type	= ARG_CONST_SIZE_OR_ZERO,
};

BTF_ID_LIST_SINGLE(bpf_skb_output_btf_ids, struct, sk_buff)

/* Same helper, but arg1 is a BTF-typed struct sk_buff pointer. */
const struct bpf_func_proto bpf_skb_output_proto = {
	.func		= bpf_skb_event_output,
	.gpl_only	= true,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_BTF_ID,
	.arg1_btf_id	= &bpf_skb_output_btf_ids[0],
	.arg2_type	= ARG_CONST_MAP_PTR,
	.arg3_type	= ARG_ANYTHING,
	.arg4_type	= ARG_PTR_TO_MEM | MEM_RDONLY,
	.arg5_type	=
ARG_CONST_SIZE_OR_ZERO,
};

/* Map the BPF_F_TUNINFO_IPV6 flag to the address family it selects. */
static unsigned short bpf_tunnel_key_af(u64 flags)
{
	return flags & BPF_F_TUNINFO_IPV6 ? AF_INET6 : AF_INET;
}

/*
 * bpf_skb_get_tunnel_key() helper: copy the skb's tunnel metadata into @to.
 *
 * Returns -EINVAL when the skb has no tunnel info, when unknown flag bits
 * are set, or when @size is not one of the accepted struct layouts, and
 * -EPROTO when the tunnel's address family does not match the family
 * selected by @flags. For the accepted shorter (older) layouts the key is
 * built in a full-sized scratch buffer and only @size bytes are copied out.
 * On every error path @size bytes of the caller's buffer are zeroed.
 */
BPF_CALL_4(bpf_skb_get_tunnel_key, struct sk_buff *, skb, struct bpf_tunnel_key *, to,
	   u32, size, u64, flags)
{
	const struct ip_tunnel_info *info = skb_tunnel_info(skb);
	u8 compat[sizeof(struct bpf_tunnel_key)];
	void *to_orig = to;
	int err;

	if (unlikely(!info || (flags & ~(BPF_F_TUNINFO_IPV6 |
					 BPF_F_TUNINFO_FLAGS)))) {
		err = -EINVAL;
		goto err_clear;
	}
	if (ip_tunnel_info_af(info) != bpf_tunnel_key_af(flags)) {
		err = -EPROTO;
		goto err_clear;
	}
	if (unlikely(size != sizeof(struct bpf_tunnel_key))) {
		err = -EINVAL;
		switch (size) {
		case offsetof(struct bpf_tunnel_key, local_ipv6[0]):
		case offsetof(struct bpf_tunnel_key, tunnel_label):
		case offsetof(struct bpf_tunnel_key, tunnel_ext):
			goto set_compat;
		case offsetof(struct bpf_tunnel_key, remote_ipv6[1]):
			/* Fixup deprecated structure layouts here, so we have
			 * a common path later on.
			 */
			if (ip_tunnel_info_af(info) != AF_INET)
				goto err_clear;
set_compat:
			/* Fill the scratch copy; truncated copy-out happens below. */
			to = (struct bpf_tunnel_key *)compat;
			break;
		default:
			goto err_clear;
		}
	}

	to->tunnel_id = be64_to_cpu(info->key.tun_id);
	to->tunnel_tos = info->key.tos;
	to->tunnel_ttl = info->key.ttl;
	if (flags & BPF_F_TUNINFO_FLAGS)
		to->tunnel_flags = ip_tunnel_flags_to_be16(info->key.tun_flags);
	else
		to->tunnel_ext = 0;

	if (flags & BPF_F_TUNINFO_IPV6) {
		memcpy(to->remote_ipv6, &info->key.u.ipv6.src,
		       sizeof(to->remote_ipv6));
		memcpy(to->local_ipv6, &info->key.u.ipv6.dst,
		       sizeof(to->local_ipv6));
		to->tunnel_label = be32_to_cpu(info->key.label);
	} else {
		/* IPv4: zero the three remaining words of each address slot */
		to->remote_ipv4 = be32_to_cpu(info->key.u.ipv4.src);
		memset(&to->remote_ipv6[1], 0, sizeof(__u32) * 3);
		to->local_ipv4 = be32_to_cpu(info->key.u.ipv4.dst);
		memset(&to->local_ipv6[1], 0, sizeof(__u32) * 3);
		to->tunnel_label = 0;
	}

	if (unlikely(size != sizeof(struct bpf_tunnel_key)))
		memcpy(to_orig, to, size);

	return 0;
err_clear:
	memset(to_orig, 0, size);
	return err;
}

static const struct bpf_func_proto
bpf_skb_get_tunnel_key_proto = {
	.func		= bpf_skb_get_tunnel_key,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_PTR_TO_UNINIT_MEM,
	.arg3_type	= ARG_CONST_SIZE,
	.arg4_type	= ARG_ANYTHING,
};

/*
 * bpf_skb_get_tunnel_opt() helper: copy the skb's tunnel options into @to.
 *
 * Returns the option length on success. Returns -ENOENT when the skb has no
 * tunnel info or no options are present, and -ENOMEM when @size is smaller
 * than the stored options. Bytes of @to beyond the options are zeroed; on
 * error the whole buffer is zeroed.
 */
BPF_CALL_3(bpf_skb_get_tunnel_opt, struct sk_buff *, skb, u8 *, to, u32, size)
{
	const struct ip_tunnel_info *info = skb_tunnel_info(skb);
	int err;

	if (unlikely(!info ||
		     !ip_tunnel_is_options_present(info->key.tun_flags))) {
		err = -ENOENT;
		goto err_clear;
	}
	if (unlikely(size < info->options_len)) {
		err = -ENOMEM;
		goto err_clear;
	}

	ip_tunnel_info_opts_get(to, info);
	if (size > info->options_len)
		memset(to + info->options_len, 0, size - info->options_len);

	return info->options_len;
err_clear:
	memset(to, 0, size);
	return err;
}

/* Helper prototype: context, uninitialised output buffer and its size. */
static const struct bpf_func_proto bpf_skb_get_tunnel_opt_proto = {
	.func		= bpf_skb_get_tunnel_opt,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_PTR_TO_UNINIT_MEM,
	.arg3_type	= ARG_CONST_SIZE,
};

/*
 * Per-CPU metadata dst used by the set_tunnel_{key,opt} helpers; allocated
 * lazily in bpf_get_skb_set_tunnel_proto().
 */
static struct metadata_dst __percpu *md_dst;

/*
 * bpf_skb_set_tunnel_key() helper: attach this CPU's metadata dst to the skb
 * and fill its tunnel info from @from.
 *
 * Returns -EINVAL for unknown flag bits, for an unsupported @size, when
 * tunnel_label is set without BPF_F_TUNINFO_IPV6, or when tunnel_ext is
 * non-zero. Accepted shorter layouts are copied into a zero-padded
 * full-sized scratch key first. Returns 0 on success.
 */
BPF_CALL_4(bpf_skb_set_tunnel_key, struct sk_buff *, skb,
	   const struct bpf_tunnel_key *, from, u32, size, u64, flags)
{
	struct metadata_dst *md = this_cpu_ptr(md_dst);
	u8 compat[sizeof(struct bpf_tunnel_key)];
	struct ip_tunnel_info *info;

	if (unlikely(flags & ~(BPF_F_TUNINFO_IPV6 | BPF_F_ZERO_CSUM_TX |
			       BPF_F_DONT_FRAGMENT | BPF_F_SEQ_NUMBER |
			       BPF_F_NO_TUNNEL_KEY)))
		return -EINVAL;
	if (unlikely(size != sizeof(struct bpf_tunnel_key))) {
		switch (size) {
		case offsetof(struct bpf_tunnel_key, local_ipv6[0]):
		case offsetof(struct bpf_tunnel_key, tunnel_label):
		case offsetof(struct bpf_tunnel_key, tunnel_ext):
		case offsetof(struct bpf_tunnel_key, remote_ipv6[1]):
			/* Fixup deprecated structure layouts here, so we have
			 * a common path later on.
*/
			memcpy(compat, from, size);
			memset(compat + size, 0, sizeof(compat) - size);
			from = (const struct bpf_tunnel_key *) compat;
			break;
		default:
			return -EINVAL;
		}
	}
	/* A label is only accepted for IPv6; tunnel_ext must always be zero. */
	if (unlikely((!(flags & BPF_F_TUNINFO_IPV6) && from->tunnel_label) ||
		     from->tunnel_ext))
		return -EINVAL;

	/* Replace the skb's dst with a held reference to the per-CPU md dst. */
	skb_dst_drop(skb);
	dst_hold((struct dst_entry *) md);
	skb_dst_set(skb, (struct dst_entry *) md);

	info = &md->u.tun_info;
	memset(info, 0, sizeof(*info));
	info->mode = IP_TUNNEL_INFO_TX;

	/* Translate the helper flags into tunnel flag bits. */
	__set_bit(IP_TUNNEL_NOCACHE_BIT, info->key.tun_flags);
	__assign_bit(IP_TUNNEL_DONT_FRAGMENT_BIT, info->key.tun_flags,
		     flags & BPF_F_DONT_FRAGMENT);
	__assign_bit(IP_TUNNEL_CSUM_BIT, info->key.tun_flags,
		     !(flags & BPF_F_ZERO_CSUM_TX));
	__assign_bit(IP_TUNNEL_SEQ_BIT, info->key.tun_flags,
		     flags & BPF_F_SEQ_NUMBER);
	__assign_bit(IP_TUNNEL_KEY_BIT, info->key.tun_flags,
		     !(flags & BPF_F_NO_TUNNEL_KEY));

	info->key.tun_id = cpu_to_be64(from->tunnel_id);
	info->key.tos = from->tunnel_tos;
	info->key.ttl = from->tunnel_ttl;

	if (flags & BPF_F_TUNINFO_IPV6) {
		info->mode |= IP_TUNNEL_INFO_IPV6;
		memcpy(&info->key.u.ipv6.dst, from->remote_ipv6,
		       sizeof(from->remote_ipv6));
		memcpy(&info->key.u.ipv6.src, from->local_ipv6,
		       sizeof(from->local_ipv6));
		info->key.label = cpu_to_be32(from->tunnel_label) &
				  IPV6_FLOWLABEL_MASK;
	} else {
		info->key.u.ipv4.dst = cpu_to_be32(from->remote_ipv4);
		info->key.u.ipv4.src = cpu_to_be32(from->local_ipv4);
		info->key.flow_flags = FLOWI_FLAG_ANYSRC;
	}

	return 0;
}

/* Helper prototype: context, read-only key buffer, its size, and flags. */
static const struct bpf_func_proto bpf_skb_set_tunnel_key_proto = {
	.func		= bpf_skb_set_tunnel_key,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_PTR_TO_MEM | MEM_RDONLY,
	.arg3_type	= ARG_CONST_SIZE,
	.arg4_type	= ARG_ANYTHING,
};

/*
 * bpf_skb_set_tunnel_opt() helper: store @size bytes of tunnel options.
 *
 * Returns -EINVAL unless the skb's tunnel info is this CPU's md_dst tunnel
 * info and @size is a multiple of sizeof(u32); returns -ENOMEM when @size
 * exceeds IP_TUNNEL_OPTS_MAX. Returns 0 on success.
 */
BPF_CALL_3(bpf_skb_set_tunnel_opt, struct sk_buff *, skb,
	   const u8 *, from, u32, size)
{
	struct ip_tunnel_info *info = skb_tunnel_info(skb);
	const struct metadata_dst *md = this_cpu_ptr(md_dst);
	IP_TUNNEL_DECLARE_FLAGS(present) = { };

	if (unlikely(info != &md->u.tun_info || (size &
(sizeof(u32) - 1))))
		return -EINVAL;
	if (unlikely(size > IP_TUNNEL_OPTS_MAX))
		return -ENOMEM;

	ip_tunnel_set_options_present(present);
	ip_tunnel_info_opts_set(info, from, size, present);

	return 0;
}

/* Helper prototype: context, read-only option buffer and its size. */
static const struct bpf_func_proto bpf_skb_set_tunnel_opt_proto = {
	.func		= bpf_skb_set_tunnel_opt,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_PTR_TO_MEM | MEM_RDONLY,
	.arg3_type	= ARG_CONST_SIZE,
};

/*
 * Return the prototype for one of the set_tunnel helpers, allocating the
 * shared per-CPU md_dst on first use.
 *
 * The allocation is published with cmpxchg(); if md_dst was already set by
 * the time of the exchange, the freshly allocated copy is freed again.
 * Returns NULL when the allocation fails or @which is not one of the two
 * set_tunnel helpers.
 */
static const struct bpf_func_proto *
bpf_get_skb_set_tunnel_proto(enum bpf_func_id which)
{
	if (!md_dst) {
		struct metadata_dst __percpu *tmp;

		tmp = metadata_dst_alloc_percpu(IP_TUNNEL_OPTS_MAX,
						METADATA_IP_TUNNEL,
						GFP_KERNEL);
		if (!tmp)
			return NULL;
		if (cmpxchg(&md_dst, NULL, tmp))
			metadata_dst_free_percpu(tmp);
	}

	switch (which) {
	case BPF_FUNC_skb_set_tunnel_key:
		return &bpf_skb_set_tunnel_key_proto;
	case BPF_FUNC_skb_set_tunnel_opt:
		return &bpf_skb_set_tunnel_opt_proto;
	default:
		return NULL;
	}
}

/*
 * bpf_skb_under_cgroup() helper: test the skb's socket against the cgroup
 * stored at @idx of the array map.
 *
 * Returns -ENOENT when the skb has no full socket, -E2BIG when @idx is out
 * of range, -EAGAIN when the slot is empty, otherwise the result of
 * sk_under_cgroup_hierarchy().
 */
BPF_CALL_3(bpf_skb_under_cgroup, struct sk_buff *, skb, struct bpf_map *, map,
	   u32, idx)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	struct cgroup *cgrp;
	struct sock *sk;

	sk = skb_to_full_sk(skb);
	if (!sk || !sk_fullsock(sk))
		return -ENOENT;
	if (unlikely(idx >= array->map.max_entries))
		return -E2BIG;

	cgrp = READ_ONCE(array->ptrs[idx]);
	if (unlikely(!cgrp))
		return -EAGAIN;

	return sk_under_cgroup_hierarchy(sk, cgrp);
}

/* Helper prototype: context, constant map pointer and an index. */
static const struct bpf_func_proto bpf_skb_under_cgroup_proto = {
	.func		= bpf_skb_under_cgroup,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_CONST_MAP_PTR,
	.arg3_type	= ARG_ANYTHING,
};

#ifdef CONFIG_SOCK_CGROUP_DATA
/*
 * Return the id of the cgroup @sk belongs to, or 0 when @sk does not
 * resolve to a full socket.
 */
static inline u64 __bpf_sk_cgroup_id(struct sock *sk)
{
	struct cgroup *cgrp;

	sk = sk_to_full_sk(sk);
	if (!sk || !sk_fullsock(sk))
		return 0;

	cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
	return cgroup_id(cgrp);
}

/* skb flavour: cgroup id of the socket attached to the skb. */
BPF_CALL_1(bpf_skb_cgroup_id, const struct sk_buff *, skb)
{
	return __bpf_sk_cgroup_id(skb->sk);
}

static const struct bpf_func_proto
bpf_skb_cgroup_id_proto = {
	.func           = bpf_skb_cgroup_id,
	.gpl_only       = false,
	.ret_type       = RET_INTEGER,
	.arg1_type      = ARG_PTR_TO_CTX,
};

/*
 * Return the id of the ancestor at @ancestor_level of @sk's cgroup.
 * Returns 0 when @sk does not resolve to a full socket or when
 * cgroup_ancestor() finds no ancestor at that level.
 */
static inline u64 __bpf_sk_ancestor_cgroup_id(struct sock *sk,
					      int ancestor_level)
{
	struct cgroup *ancestor;
	struct cgroup *cgrp;

	sk = sk_to_full_sk(sk);
	if (!sk || !sk_fullsock(sk))
		return 0;

	cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
	ancestor = cgroup_ancestor(cgrp, ancestor_level);
	if (!ancestor)
		return 0;

	return cgroup_id(ancestor);
}

/* skb flavour: ancestor cgroup id of the socket attached to the skb. */
BPF_CALL_2(bpf_skb_ancestor_cgroup_id, const struct sk_buff *, skb, int,
	   ancestor_level)
{
	return __bpf_sk_ancestor_cgroup_id(skb->sk, ancestor_level);
}

static const struct bpf_func_proto bpf_skb_ancestor_cgroup_id_proto = {
	.func           = bpf_skb_ancestor_cgroup_id,
	.gpl_only       = false,
	.ret_type       = RET_INTEGER,
	.arg1_type      = ARG_PTR_TO_CTX,
	.arg2_type      = ARG_ANYTHING,
};

/* Socket flavour of the cgroup id lookup. */
BPF_CALL_1(bpf_sk_cgroup_id, struct sock *, sk)
{
	return __bpf_sk_cgroup_id(sk);
}

static const struct bpf_func_proto bpf_sk_cgroup_id_proto = {
	.func           = bpf_sk_cgroup_id,
	.gpl_only       = false,
	.ret_type       = RET_INTEGER,
	.arg1_type      = ARG_PTR_TO_BTF_ID_SOCK_COMMON,
};

/* Socket flavour of the ancestor cgroup id lookup. */
BPF_CALL_2(bpf_sk_ancestor_cgroup_id, struct sock *, sk, int, ancestor_level)
{
	return __bpf_sk_ancestor_cgroup_id(sk, ancestor_level);
}

static const struct bpf_func_proto bpf_sk_ancestor_cgroup_id_proto = {
	.func           = bpf_sk_ancestor_cgroup_id,
	.gpl_only       = false,
	.ret_type       = RET_INTEGER,
	.arg1_type      = ARG_PTR_TO_BTF_ID_SOCK_COMMON,
	.arg2_type      = ARG_ANYTHING,
};
#endif

/*
 * Copy callback handed to bpf_event_output() for XDP contexts: copies @len
 * bytes at offset @off of the xdp_buff into @dst via bpf_xdp_copy_buf().
 * Always returns 0.
 */
static unsigned long bpf_xdp_copy(void *dst, const void *ctx,
				  unsigned long off, unsigned long len)
{
	struct xdp_buff *xdp = (struct xdp_buff *)ctx;

	bpf_xdp_copy_buf(xdp, off, dst, len, false);
	return 0;
}

/*
 * bpf_perf_event_output() helper for XDP contexts.
 *
 * The number of packet bytes to append is taken from the BPF_F_CTXLEN_MASK
 * bits of @flags (shifted down by 32). Returns -EINVAL for flag bits outside
 * BPF_F_CTXLEN_MASK | BPF_F_INDEX_MASK and -EFAULT when @xdp is NULL or the
 * requested size exceeds xdp_get_buff_len(); otherwise the result of
 * bpf_event_output().
 */
BPF_CALL_5(bpf_xdp_event_output, struct xdp_buff *, xdp, struct bpf_map *, map,
	   u64, flags, void *, meta, u64, meta_size)
{
	u64 xdp_size = (flags & BPF_F_CTXLEN_MASK) >> 32;

	if (unlikely(flags & ~(BPF_F_CTXLEN_MASK | BPF_F_INDEX_MASK)))
		return -EINVAL;

	if (unlikely(!xdp || xdp_size > xdp_get_buff_len(xdp)))
		return -EFAULT;

return bpf_event_output(map, flags, meta, meta_size, xdp,
				xdp_size, bpf_xdp_copy);
}

/* Prototype for the context-pointer flavour of the XDP output helper. */
static const struct bpf_func_proto bpf_xdp_event_output_proto = {
	.func		= bpf_xdp_event_output,
	.gpl_only	= true,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_CONST_MAP_PTR,
	.arg3_type	= ARG_ANYTHING,
	.arg4_type	= ARG_PTR_TO_MEM | MEM_RDONLY,
	.arg5_type	= ARG_CONST_SIZE_OR_ZERO,
};

BTF_ID_LIST_SINGLE(bpf_xdp_output_btf_ids, struct, xdp_buff)

/* Same helper, but arg1 is a BTF-typed struct xdp_buff pointer. */
const struct bpf_func_proto bpf_xdp_output_proto = {
	.func		= bpf_xdp_event_output,
	.gpl_only	= true,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_BTF_ID,
	.arg1_btf_id	= &bpf_xdp_output_btf_ids[0],
	.arg2_type	= ARG_CONST_MAP_PTR,
	.arg3_type	= ARG_ANYTHING,
	.arg4_type	= ARG_PTR_TO_MEM | MEM_RDONLY,
	.arg5_type	= ARG_CONST_SIZE_OR_ZERO,
};

/* Socket cookie of the skb's socket, or 0 when the skb has no socket. */
BPF_CALL_1(bpf_get_socket_cookie, struct sk_buff *, skb)
{
	return skb->sk ? __sock_gen_cookie(skb->sk) : 0;
}

static const struct bpf_func_proto bpf_get_socket_cookie_proto = {
	.func           = bpf_get_socket_cookie,
	.gpl_only       = false,
	.ret_type       = RET_INTEGER,
	.arg1_type      = ARG_PTR_TO_CTX,
};

/* Socket cookie for a sock_addr context (ctx->sk is used unchecked). */
BPF_CALL_1(bpf_get_socket_cookie_sock_addr, struct bpf_sock_addr_kern *, ctx)
{
	return __sock_gen_cookie(ctx->sk);
}

static const struct bpf_func_proto bpf_get_socket_cookie_sock_addr_proto = {
	.func		= bpf_get_socket_cookie_sock_addr,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
};

/* Socket cookie where the program context is the socket itself. */
BPF_CALL_1(bpf_get_socket_cookie_sock, struct sock *, ctx)
{
	return __sock_gen_cookie(ctx);
}

static const struct bpf_func_proto bpf_get_socket_cookie_sock_proto = {
	.func		= bpf_get_socket_cookie_sock,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
};

/* Socket cookie for a possibly-NULL socket pointer; NULL yields 0. */
BPF_CALL_1(bpf_get_socket_ptr_cookie, struct sock *, sk)
{
	return sk ?
sock_gen_cookie(sk) : 0; } const struct bpf_func_proto bpf_get_socket_ptr_cookie_proto = { .func = bpf_get_socket_ptr_cookie, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON | PTR_MAYBE_NULL, }; BPF_CALL_1(bpf_get_socket_cookie_sock_ops, struct bpf_sock_ops_kern *, ctx) { return __sock_gen_cookie(ctx->sk); } static const struct bpf_func_proto bpf_get_socket_cookie_sock_ops_proto = { .func = bpf_get_socket_cookie_sock_ops, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, }; static u64 __bpf_get_netns_cookie(struct sock *sk) { const struct net *net = sk ? sock_net(sk) : &init_net; return net->net_cookie; } BPF_CALL_1(bpf_get_netns_cookie, struct sk_buff *, skb) { return __bpf_get_netns_cookie(skb && skb->sk ? skb->sk : NULL); } static const struct bpf_func_proto bpf_get_netns_cookie_proto = { .func = bpf_get_netns_cookie, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX_OR_NULL, }; BPF_CALL_1(bpf_get_netns_cookie_sock, struct sock *, ctx) { return __bpf_get_netns_cookie(ctx); } static const struct bpf_func_proto bpf_get_netns_cookie_sock_proto = { .func = bpf_get_netns_cookie_sock, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX_OR_NULL, }; BPF_CALL_1(bpf_get_netns_cookie_sock_addr, struct bpf_sock_addr_kern *, ctx) { return __bpf_get_netns_cookie(ctx ? ctx->sk : NULL); } static const struct bpf_func_proto bpf_get_netns_cookie_sock_addr_proto = { .func = bpf_get_netns_cookie_sock_addr, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX_OR_NULL, }; BPF_CALL_1(bpf_get_netns_cookie_sock_ops, struct bpf_sock_ops_kern *, ctx) { return __bpf_get_netns_cookie(ctx ? 
ctx->sk : NULL);
}

static const struct bpf_func_proto bpf_get_netns_cookie_sock_ops_proto = {
	.func		= bpf_get_netns_cookie_sock_ops,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX_OR_NULL,
};

/* netns cookie for sk_msg contexts; a NULL ctx falls back to init_net. */
BPF_CALL_1(bpf_get_netns_cookie_sk_msg, struct sk_msg *, ctx)
{
	return __bpf_get_netns_cookie(ctx ? ctx->sk : NULL);
}

static const struct bpf_func_proto bpf_get_netns_cookie_sk_msg_proto = {
	.func		= bpf_get_netns_cookie_sk_msg,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX_OR_NULL,
};

/*
 * bpf_get_socket_uid() helper: uid of the skb's socket, translated with
 * from_kuid_munged() into the user namespace of the socket's netns.
 * Returns overflowuid when the skb does not resolve to a full socket.
 */
BPF_CALL_1(bpf_get_socket_uid, struct sk_buff *, skb)
{
	struct sock *sk = sk_to_full_sk(skb->sk);
	kuid_t kuid;

	if (!sk || !sk_fullsock(sk))
		return overflowuid;
	kuid = sock_net_uid(sock_net(sk), sk);
	return from_kuid_munged(sock_net(sk)->user_ns, kuid);
}

static const struct bpf_func_proto bpf_get_socket_uid_proto = {
	.func           = bpf_get_socket_uid,
	.gpl_only       = false,
	.ret_type       = RET_INTEGER,
	.arg1_type      = ARG_PTR_TO_CTX,
};

/*
 * SOL_SOCKET get/setsockopt for BPF callers.
 *
 * Only the options listed in the switch are accepted; all but
 * SO_BINDTODEVICE require *@optlen == sizeof(int). SO_BINDTODEVICE is
 * set-only: reading it returns -EINVAL. Everything else is forwarded to
 * sk_getsockopt()/sk_setsockopt() with kernel sockptrs.
 */
static int sol_socket_sockopt(struct sock *sk, int optname,
			      char *optval, int *optlen,
			      bool getopt)
{
	switch (optname) {
	case SO_REUSEADDR:
	case SO_SNDBUF:
	case SO_RCVBUF:
	case SO_KEEPALIVE:
	case SO_PRIORITY:
	case SO_REUSEPORT:
	case SO_RCVLOWAT:
	case SO_MARK:
	case SO_MAX_PACING_RATE:
	case SO_BINDTOIFINDEX:
	case SO_TXREHASH:
		if (*optlen != sizeof(int))
			return -EINVAL;
		break;
	case SO_BINDTODEVICE:
		break;
	default:
		return -EINVAL;
	}

	if (getopt) {
		if (optname == SO_BINDTODEVICE)
			return -EINVAL;
		return sk_getsockopt(sk, SOL_SOCKET, optname,
				     KERNEL_SOCKPTR(optval),
				     KERNEL_SOCKPTR(optlen));
	}

	return sk_setsockopt(sk, SOL_SOCKET, optname,
			     KERNEL_SOCKPTR(optval), *optlen);
}

/*
 * Set one of the BPF-specific TCP options (TCP_BPF_*).
 *
 * @optlen must be sizeof(int); the value is read as an int from @optval.
 * Returns -EINVAL for unsupported options or out-of-range values, 0 on
 * success.
 */
static int bpf_sol_tcp_setsockopt(struct sock *sk, int optname,
				  char *optval, int optlen)
{
	struct tcp_sock *tp = tcp_sk(sk);
	unsigned long timeout;
	int val;

	if (optlen != sizeof(int))
		return -EINVAL;

	val = *(int *)optval;

	/* Only some options are supported */
	switch (optname) {
	case TCP_BPF_IW:
		if (val <= 0 || tp->data_segs_out > tp->syn_data)
			return -EINVAL;
tcp_snd_cwnd_set(tp, val);
		break;
	case TCP_BPF_SNDCWND_CLAMP:
		if (val <= 0)
			return -EINVAL;
		/* The clamp value is also written to snd_ssthresh. */
		tp->snd_cwnd_clamp = val;
		tp->snd_ssthresh = val;
		break;
	case TCP_BPF_DELACK_MAX:
		/* val is in microseconds; the bounds are checked in jiffies */
		timeout = usecs_to_jiffies(val);
		if (timeout > TCP_DELACK_MAX ||
		    timeout < TCP_TIMEOUT_MIN)
			return -EINVAL;
		inet_csk(sk)->icsk_delack_max = timeout;
		break;
	case TCP_BPF_RTO_MIN:
		/* val is in microseconds; the bounds are checked in jiffies */
		timeout = usecs_to_jiffies(val);
		if (timeout > TCP_RTO_MIN ||
		    timeout < TCP_TIMEOUT_MIN)
			return -EINVAL;
		inet_csk(sk)->icsk_rto_min = timeout;
		break;
	case TCP_BPF_SOCK_OPS_CB_FLAGS:
		if (val & ~(BPF_SOCK_OPS_ALL_CB_FLAGS))
			return -EINVAL;
		tp->bpf_sock_ops_cb_flags = val;
		break;
	default:
		return -EINVAL;
	}

	return 0;
}

/*
 * TCP_CONGESTION get/set for BPF callers.
 *
 * *@optlen must be at least 2. On get, the last byte of the buffer is
 * reserved for a terminating NUL before calling do_tcp_getsockopt(); -EINVAL
 * is returned when the socket has no congestion ops. On set, "cdg" is
 * refused with -ENOTSUPP, and a set issued while another one is already in
 * progress on the same socket is refused with -EBUSY.
 */
static int sol_tcp_sockopt_congestion(struct sock *sk, char *optval,
				      int *optlen, bool getopt)
{
	struct tcp_sock *tp;
	int ret;

	if (*optlen < 2)
		return -EINVAL;

	if (getopt) {
		if (!inet_csk(sk)->icsk_ca_ops)
			return -EINVAL;
		/* BPF expects NULL-terminated tcp-cc string */
		optval[--(*optlen)] = '\0';
		return do_tcp_getsockopt(sk, SOL_TCP, TCP_CONGESTION,
					 KERNEL_SOCKPTR(optval),
					 KERNEL_SOCKPTR(optlen));
	}

	/* "cdg" is the only cc that alloc a ptr
	 * in inet_csk_ca area.  The bpf-tcp-cc may
	 * overwrite this ptr after switching to cdg.
	 */
	if (*optlen >= sizeof("cdg") - 1 && !strncmp("cdg", optval, *optlen))
		return -ENOTSUPP;

	/* It stops this looping
	 *
	 * .init => bpf_setsockopt(tcp_cc) => .init =>
	 * bpf_setsockopt(tcp_cc)" => .init => ....
	 *
	 * The second bpf_setsockopt(tcp_cc) is not allowed
	 * in order to break the loop when both .init
	 * are the same bpf prog.
	 *
	 * This applies even the second bpf_setsockopt(tcp_cc)
	 * does not cause a loop.  This limits only the first
	 * '.init' can call bpf_setsockopt(TCP_CONGESTION) to
	 * pick a fallback cc (eg. peer does not support ECN)
	 * and the second '.init' cannot fallback to
	 * another.
*/
	tp = tcp_sk(sk);
	if (tp->bpf_chg_cc_inprogress)
		return -EBUSY;

	/* Flag is held only for the duration of the setsockopt call. */
	tp->bpf_chg_cc_inprogress = 1;
	ret = do_tcp_setsockopt(sk, SOL_TCP, TCP_CONGESTION,
				KERNEL_SOCKPTR(optval), *optlen);
	tp->bpf_chg_cc_inprogress = 0;
	return ret;
}

/*
 * SOL_TCP get/setsockopt for BPF callers.
 *
 * Returns -EINVAL unless the socket's protocol is IPPROTO_TCP. The
 * int-sized options listed first are length-checked and forwarded to
 * do_tcp_{get,set}sockopt(). TCP_CONGESTION has its own handler.
 * TCP_SAVED_SYN needs *@optlen >= 1 and, on get, is served directly from
 * tp->saved_syn. TCP_BPF_SOCK_OPS_CB_FLAGS can be both read and written;
 * any other option is treated as a set-only BPF-specific option.
 */
static int sol_tcp_sockopt(struct sock *sk, int optname,
			   char *optval, int *optlen,
			   bool getopt)
{
	if (sk->sk_protocol != IPPROTO_TCP)
		return -EINVAL;

	switch (optname) {
	case TCP_NODELAY:
	case TCP_MAXSEG:
	case TCP_KEEPIDLE:
	case TCP_KEEPINTVL:
	case TCP_KEEPCNT:
	case TCP_SYNCNT:
	case TCP_WINDOW_CLAMP:
	case TCP_THIN_LINEAR_TIMEOUTS:
	case TCP_USER_TIMEOUT:
	case TCP_NOTSENT_LOWAT:
	case TCP_SAVE_SYN:
		if (*optlen != sizeof(int))
			return -EINVAL;
		break;
	case TCP_CONGESTION:
		return sol_tcp_sockopt_congestion(sk, optval, optlen, getopt);
	case TCP_SAVED_SYN:
		if (*optlen < 1)
			return -EINVAL;
		break;
	case TCP_BPF_SOCK_OPS_CB_FLAGS:
		if (*optlen != sizeof(int))
			return -EINVAL;
		if (getopt) {
			struct tcp_sock *tp = tcp_sk(sk);
			int cb_flags = tp->bpf_sock_ops_cb_flags;

			memcpy(optval, &cb_flags, *optlen);
			return 0;
		}
		return bpf_sol_tcp_setsockopt(sk, optname, optval, *optlen);
	default:
		if (getopt)
			return -EINVAL;
		return bpf_sol_tcp_setsockopt(sk, optname, optval, *optlen);
	}

	if (getopt) {
		if (optname == TCP_SAVED_SYN) {
			struct tcp_sock *tp = tcp_sk(sk);

			/* Fail if nothing was saved or more is requested than saved. */
			if (!tp->saved_syn ||
			    *optlen > tcp_saved_syn_len(tp->saved_syn))
				return -EINVAL;
			memcpy(optval, tp->saved_syn->data, *optlen);
			/* It cannot free tp->saved_syn here because it
			 * does not know if the user space still needs it.
*/
			return 0;
		}

		return do_tcp_getsockopt(sk, SOL_TCP, optname,
					 KERNEL_SOCKPTR(optval),
					 KERNEL_SOCKPTR(optlen));
	}

	return do_tcp_setsockopt(sk, SOL_TCP, optname,
				 KERNEL_SOCKPTR(optval), *optlen);
}

/*
 * SOL_IP get/setsockopt for BPF callers: AF_INET sockets only, and only
 * IP_TOS with an int-sized value. Anything else returns -EINVAL.
 */
static int sol_ip_sockopt(struct sock *sk, int optname,
			  char *optval, int *optlen,
			  bool getopt)
{
	if (sk->sk_family != AF_INET)
		return -EINVAL;

	switch (optname) {
	case IP_TOS:
		if (*optlen != sizeof(int))
			return -EINVAL;
		break;
	default:
		return -EINVAL;
	}

	if (getopt)
		return do_ip_getsockopt(sk, SOL_IP, optname,
					KERNEL_SOCKPTR(optval),
					KERNEL_SOCKPTR(optlen));

	return do_ip_setsockopt(sk, SOL_IP, optname,
				KERNEL_SOCKPTR(optval), *optlen);
}

/*
 * SOL_IPV6 get/setsockopt for BPF callers: AF_INET6 sockets only, and only
 * IPV6_TCLASS / IPV6_AUTOFLOWLABEL with an int-sized value. The call is made
 * through ipv6_bpf_stub. Anything else returns -EINVAL.
 */
static int sol_ipv6_sockopt(struct sock *sk, int optname,
			    char *optval, int *optlen,
			    bool getopt)
{
	if (sk->sk_family != AF_INET6)
		return -EINVAL;

	switch (optname) {
	case IPV6_TCLASS:
	case IPV6_AUTOFLOWLABEL:
		if (*optlen != sizeof(int))
			return -EINVAL;
		break;
	default:
		return -EINVAL;
	}

	if (getopt)
		return ipv6_bpf_stub->ipv6_getsockopt(sk, SOL_IPV6, optname,
						      KERNEL_SOCKPTR(optval),
						      KERNEL_SOCKPTR(optlen));

	return ipv6_bpf_stub->ipv6_setsockopt(sk, SOL_IPV6, optname,
					      KERNEL_SOCKPTR(optval), *optlen);
}

/*
 * Dispatch a BPF setsockopt request by @level. Returns -EINVAL when @sk is
 * not a full socket or the level is unsupported (or compiled out).
 */
static int __bpf_setsockopt(struct sock *sk, int level, int optname,
			    char *optval, int optlen)
{
	if (!sk_fullsock(sk))
		return -EINVAL;

	if (level == SOL_SOCKET)
		return sol_socket_sockopt(sk, optname, optval, &optlen, false);
	else if (IS_ENABLED(CONFIG_INET) && level == SOL_IP)
		return sol_ip_sockopt(sk, optname, optval, &optlen, false);
	else if (IS_ENABLED(CONFIG_IPV6) && level == SOL_IPV6)
		return sol_ipv6_sockopt(sk, optname, optval, &optlen, false);
	else if (IS_ENABLED(CONFIG_INET) && level == SOL_TCP)
		return sol_tcp_sockopt(sk, optname, optval, &optlen, false);

	return -EINVAL;
}

/*
 * Same as __bpf_setsockopt(), but first calls sock_owned_by_me() on full
 * sockets.
 */
static int _bpf_setsockopt(struct sock *sk, int level, int optname,
			   char *optval, int optlen)
{
	if (sk_fullsock(sk))
		sock_owned_by_me(sk);
	return __bpf_setsockopt(sk, level, optname, optval, optlen);
}

static int __bpf_getsockopt(struct sock *sk, int level, int
optname,
			    char *optval, int optlen)
{
	int err, saved_optlen = optlen;

	if (!sk_fullsock(sk)) {
		err = -EINVAL;
		goto done;
	}

	/* Dispatch by level; unsupported or compiled-out levels fail. */
	if (level == SOL_SOCKET)
		err = sol_socket_sockopt(sk, optname, optval, &optlen, true);
	else if (IS_ENABLED(CONFIG_INET) && level == SOL_TCP)
		err = sol_tcp_sockopt(sk, optname, optval, &optlen, true);
	else if (IS_ENABLED(CONFIG_INET) && level == SOL_IP)
		err = sol_ip_sockopt(sk, optname, optval, &optlen, true);
	else if (IS_ENABLED(CONFIG_IPV6) && level == SOL_IPV6)
		err = sol_ipv6_sockopt(sk, optname, optval, &optlen, true);
	else
		err = -EINVAL;

done:
	/*
	 * Zero whatever part of the caller's buffer was not filled in: the
	 * whole buffer on error, otherwise the tail beyond the returned length.
	 */
	if (err)
		optlen = 0;
	if (optlen < saved_optlen)
		memset(optval + optlen, 0, saved_optlen - optlen);
	return err;
}

/*
 * Same as __bpf_getsockopt(), but first calls sock_owned_by_me() on full
 * sockets.
 */
static int _bpf_getsockopt(struct sock *sk, int level, int optname,
			   char *optval, int optlen)
{
	if (sk_fullsock(sk))
		sock_owned_by_me(sk);
	return __bpf_getsockopt(sk, level, optname, optval, optlen);
}

/* bpf_setsockopt() helper taking a BTF socket pointer. */
BPF_CALL_5(bpf_sk_setsockopt, struct sock *, sk, int, level,
	   int, optname, char *, optval, int, optlen)
{
	return _bpf_setsockopt(sk, level, optname, optval, optlen);
}

const struct bpf_func_proto bpf_sk_setsockopt_proto = {
	.func		= bpf_sk_setsockopt,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_BTF_ID_SOCK_COMMON,
	.arg2_type	= ARG_ANYTHING,
	.arg3_type	= ARG_ANYTHING,
	.arg4_type	= ARG_PTR_TO_MEM | MEM_RDONLY,
	.arg5_type	= ARG_CONST_SIZE,
};

/* bpf_getsockopt() helper taking a BTF socket pointer. */
BPF_CALL_5(bpf_sk_getsockopt, struct sock *, sk, int, level,
	   int, optname, char *, optval, int, optlen)
{
	return _bpf_getsockopt(sk, level, optname, optval, optlen);
}

const struct bpf_func_proto bpf_sk_getsockopt_proto = {
	.func		= bpf_sk_getsockopt,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_BTF_ID_SOCK_COMMON,
	.arg2_type	= ARG_ANYTHING,
	.arg3_type	= ARG_ANYTHING,
	.arg4_type	= ARG_PTR_TO_UNINIT_MEM,
	.arg5_type	= ARG_CONST_SIZE,
};

/* Variant that goes straight to __bpf_setsockopt(), skipping sock_owned_by_me(). */
BPF_CALL_5(bpf_unlocked_sk_setsockopt, struct sock *, sk, int, level,
	   int, optname, char *, optval, int, optlen)
{
	return __bpf_setsockopt(sk, level, optname, optval, optlen);
}

const
struct bpf_func_proto bpf_unlocked_sk_setsockopt_proto = { .func = bpf_unlocked_sk_setsockopt, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg5_type = ARG_CONST_SIZE, }; BPF_CALL_5(bpf_unlocked_sk_getsockopt, struct sock *, sk, int, level, int, optname, char *, optval, int, optlen) { return __bpf_getsockopt(sk, level, optname, optval, optlen); } const struct bpf_func_proto bpf_unlocked_sk_getsockopt_proto = { .func = bpf_unlocked_sk_getsockopt, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_PTR_TO_UNINIT_MEM, .arg5_type = ARG_CONST_SIZE, }; BPF_CALL_5(bpf_sock_addr_setsockopt, struct bpf_sock_addr_kern *, ctx, int, level, int, optname, char *, optval, int, optlen) { return _bpf_setsockopt(ctx->sk, level, optname, optval, optlen); } static const struct bpf_func_proto bpf_sock_addr_setsockopt_proto = { .func = bpf_sock_addr_setsockopt, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg5_type = ARG_CONST_SIZE, }; BPF_CALL_5(bpf_sock_addr_getsockopt, struct bpf_sock_addr_kern *, ctx, int, level, int, optname, char *, optval, int, optlen) { return _bpf_getsockopt(ctx->sk, level, optname, optval, optlen); } static const struct bpf_func_proto bpf_sock_addr_getsockopt_proto = { .func = bpf_sock_addr_getsockopt, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_PTR_TO_UNINIT_MEM, .arg5_type = ARG_CONST_SIZE, }; BPF_CALL_5(bpf_sock_ops_setsockopt, struct bpf_sock_ops_kern *, bpf_sock, int, level, int, optname, char *, optval, int, optlen) { return _bpf_setsockopt(bpf_sock->sk, level, optname, optval, optlen); } 
/* Descriptor for bpf_sock_ops_setsockopt(): context, two scalars,
 * read-only buffer and its size; integer return; not GPL-only.
 */
static const struct bpf_func_proto bpf_sock_ops_setsockopt_proto = {
	.func		= bpf_sock_ops_setsockopt,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_ANYTHING,
	.arg3_type	= ARG_ANYTHING,
	.arg4_type	= ARG_PTR_TO_MEM | MEM_RDONLY,
	.arg5_type	= ARG_CONST_SIZE,
};

/* Locate the SYN headers selected by @optname (TCP_BPF_SYN: TCP header
 * only, TCP_BPF_SYN_IP: network + TCP, anything else is treated as
 * TCP_BPF_SYN_MAC: MAC + network + TCP).
 *
 * The headers are taken from bpf_sock->syn_skb when it is set, otherwise
 * from the saved_syn attached to the socket.
 *
 * On success *@start points at the first requested header and the summed
 * header length is returned.  Returns -ENOENT (leaving *@start untouched)
 * when there is no saved SYN, or when the MAC header was requested but
 * not saved.
 */
static int bpf_sock_ops_get_syn(struct bpf_sock_ops_kern *bpf_sock,
				int optname, const u8 **start)
{
	struct sk_buff *syn_skb = bpf_sock->syn_skb;
	const u8 *hdr_start;
	int ret;

	if (syn_skb) {
		/* sk is a request_sock here */

		if (optname == TCP_BPF_SYN) {
			hdr_start = syn_skb->data;
			ret = tcp_hdrlen(syn_skb);
		} else if (optname == TCP_BPF_SYN_IP) {
			hdr_start = skb_network_header(syn_skb);
			ret = skb_network_header_len(syn_skb) +
				tcp_hdrlen(syn_skb);
		} else {
			/* optname == TCP_BPF_SYN_MAC */
			hdr_start = skb_mac_header(syn_skb);
			ret = skb_mac_header_len(syn_skb) +
				skb_network_header_len(syn_skb) +
				tcp_hdrlen(syn_skb);
		}
	} else {
		struct sock *sk = bpf_sock->sk;
		struct saved_syn *saved_syn;

		if (sk->sk_state == TCP_NEW_SYN_RECV)
			/* synack retransmit. bpf_sock->syn_skb will
			 * not be available.  It has to resort to
			 * saved_syn (if it is saved).
			 */
			saved_syn = inet_reqsk(sk)->saved_syn;
		else
			saved_syn = tcp_sk(sk)->saved_syn;

		if (!saved_syn)
			return -ENOENT;

		/* saved_syn->data is indexed as MAC hdr, then network hdr,
		 * then TCP hdr, using the three stored lengths.
		 */
		if (optname == TCP_BPF_SYN) {
			hdr_start = saved_syn->data +
				saved_syn->mac_hdrlen +
				saved_syn->network_hdrlen;
			ret = saved_syn->tcp_hdrlen;
		} else if (optname == TCP_BPF_SYN_IP) {
			hdr_start = saved_syn->data +
				saved_syn->mac_hdrlen;
			ret = saved_syn->network_hdrlen +
				saved_syn->tcp_hdrlen;
		} else {
			/* optname == TCP_BPF_SYN_MAC */

			/* TCP_SAVE_SYN may not have saved the mac hdr */
			if (!saved_syn->mac_hdrlen)
				return -ENOENT;

			hdr_start = saved_syn->data;
			ret = saved_syn->mac_hdrlen +
				saved_syn->network_hdrlen +
				saved_syn->tcp_hdrlen;
		}
	}

	*start = hdr_start;
	return ret;
}

/* BPF helper: getsockopt for sock_ops programs.
 *
 * SOL_TCP options in the range [TCP_BPF_SYN, TCP_BPF_SYN_MAC] are served
 * here (when CONFIG_INET is enabled): the SYN headers are copied into
 * @optval and the rest of the buffer is zeroed.  Returns the header
 * length, -ENOSPC when @optlen is too small (a truncated copy is still
 * made), or the error from bpf_sock_ops_get_syn().
 *
 * Every other request is forwarded to _bpf_getsockopt().
 */
BPF_CALL_5(bpf_sock_ops_getsockopt, struct bpf_sock_ops_kern *, bpf_sock,
	   int, level, int, optname, char *, optval, int, optlen)
{
	if (IS_ENABLED(CONFIG_INET) && level == SOL_TCP &&
	    optname >= TCP_BPF_SYN && optname <= TCP_BPF_SYN_MAC) {
		int ret, copy_len = 0;
		const u8 *start;

		ret = bpf_sock_ops_get_syn(bpf_sock, optname, &start);
		if (ret > 0) {
			copy_len = ret;
			if (optlen < copy_len) {
				/* Buffer too small: copy what fits. */
				copy_len = optlen;
				ret = -ENOSPC;
			}

			memcpy(optval, start, copy_len);
		}

		/* Zero out unused buffer at the end */
		memset(optval + copy_len, 0, optlen - copy_len);

		return ret;
	}

	return _bpf_getsockopt(bpf_sock->sk, level, optname, optval, optlen);
}

/* Descriptor for bpf_sock_ops_getsockopt(); the output buffer is declared
 * as uninitialized memory.
 */
static const struct bpf_func_proto bpf_sock_ops_getsockopt_proto = {
	.func		= bpf_sock_ops_getsockopt,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_ANYTHING,
	.arg3_type	= ARG_ANYTHING,
	.arg4_type	= ARG_PTR_TO_UNINIT_MEM,
	.arg5_type	= ARG_CONST_SIZE,
};

/* BPF helper: store the bits of @argval covered by
 * BPF_SOCK_OPS_ALL_CB_FLAGS in tcp_sk(sk)->bpf_sock_ops_cb_flags.
 *
 * Returns the bits of @argval that fall outside that mask (0 when every
 * requested bit was accepted), or -EINVAL when CONFIG_INET is disabled
 * or the socket is not a full socket.
 */
BPF_CALL_2(bpf_sock_ops_cb_flags_set, struct bpf_sock_ops_kern *, bpf_sock,
	   int, argval)
{
	struct sock *sk = bpf_sock->sk;
	int val = argval & BPF_SOCK_OPS_ALL_CB_FLAGS;

	if (!IS_ENABLED(CONFIG_INET) || !sk_fullsock(sk))
		return -EINVAL;

	tcp_sk(sk)->bpf_sock_ops_cb_flags = val;

	return argval & (~BPF_SOCK_OPS_ALL_CB_FLAGS);
}

static const struct bpf_func_proto
bpf_sock_ops_cb_flags_set_proto = {
	.func		= bpf_sock_ops_cb_flags_set,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_ANYTHING,
};

/* Exported pointer to the IPv6 function table used below (inet6_bind,
 * udp6_lib_lookup, ipv6_dev_get_saddr).  It is only declared here; where
 * it gets assigned is not visible in this part of the file.
 */
const struct ipv6_bpf_stub *ipv6_bpf_stub __read_mostly;
EXPORT_SYMBOL_GPL(ipv6_bpf_stub);

/* BPF helper: bind the socket of a sock_addr context to @addr.
 *
 * Returns -EINVAL when @addr_len is too short for the address family,
 * -EAFNOSUPPORT for families other than AF_INET / AF_INET6 (or when the
 * corresponding config option is disabled), otherwise whatever
 * __inet_bind() / inet6_bind() returns.
 */
BPF_CALL_3(bpf_bind, struct bpf_sock_addr_kern *, ctx, struct sockaddr *, addr,
	   int, addr_len)
{
#ifdef CONFIG_INET
	struct sock *sk = ctx->sk;
	u32 flags = BIND_FROM_BPF;
	int err;

	err = -EINVAL;
	/* sa_family must be readable before it is inspected. */
	if (addr_len < offsetofend(struct sockaddr, sa_family))
		return err;
	if (addr->sa_family == AF_INET) {
		if (addr_len < sizeof(struct sockaddr_in))
			return err;
		/* A zero port adds BIND_FORCE_ADDRESS_NO_PORT. */
		if (((struct sockaddr_in *)addr)->sin_port == htons(0))
			flags |= BIND_FORCE_ADDRESS_NO_PORT;
		return __inet_bind(sk, addr, addr_len, flags);
#if IS_ENABLED(CONFIG_IPV6)
	} else if (addr->sa_family == AF_INET6) {
		if (addr_len < SIN6_LEN_RFC2133)
			return err;
		if (((struct sockaddr_in6 *)addr)->sin6_port == htons(0))
			flags |= BIND_FORCE_ADDRESS_NO_PORT;
		/* ipv6_bpf_stub cannot be NULL, since it's called from
		 * bpf_cgroup_inet6_connect hook and ipv6 is already loaded
		 */
		return ipv6_bpf_stub->inet6_bind(sk, addr, addr_len, flags);
#endif /* CONFIG_IPV6 */
	}
#endif /* CONFIG_INET */

	return -EAFNOSUPPORT;
}

/* Descriptor for bpf_bind(): context, read-only address buffer, size. */
static const struct bpf_func_proto bpf_bind_proto = {
	.func		= bpf_bind,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_PTR_TO_MEM | MEM_RDONLY,
	.arg3_type	= ARG_CONST_SIZE,
};

#ifdef CONFIG_XFRM

#if (IS_BUILTIN(CONFIG_XFRM_INTERFACE) && IS_ENABLED(CONFIG_DEBUG_INFO_BTF)) || \
    (IS_MODULE(CONFIG_XFRM_INTERFACE) && IS_ENABLED(CONFIG_DEBUG_INFO_BTF_MODULES))

/* Exported per-CPU metadata_dst pointer; not used in this part of the file. */
struct metadata_dst __percpu *xfrm_bpf_md_dst;
EXPORT_SYMBOL_GPL(xfrm_bpf_md_dst);

#endif

/* BPF helper: copy reqid, spi, family and remote address of the
 * @index-th xfrm state in the skb's sec_path into @to.
 *
 * Returns 0 on success.  If the skb has no sec_path, @index is out of
 * range, @flags is non-zero or @size is not sizeof(struct bpf_xfrm_state),
 * @size bytes of @to are zeroed and -EINVAL is returned.
 */
BPF_CALL_5(bpf_skb_get_xfrm_state, struct sk_buff *, skb, u32, index,
	   struct bpf_xfrm_state *, to, u32, size, u64, flags)
{
	const struct sec_path *sp = skb_sec_path(skb);
	const struct xfrm_state *x;

	if (!sp || unlikely(index >= sp->len || flags))
		goto err_clear;

	x = sp->xvec[index];

	if (unlikely(size != sizeof(struct bpf_xfrm_state)))
		goto err_clear;

	to->reqid = x->props.reqid;
	to->spi = x->id.spi;
	to->family = x->props.family;
	to->ext = 0;

	if (to->family == AF_INET6) {
		memcpy(to->remote_ipv6, x->props.saddr.a6,
		       sizeof(to->remote_ipv6));
	} else {
		to->remote_ipv4 = x->props.saddr.a4;
		/* Clear words 1..3 of remote_ipv6; presumably remote_ipv4
		 * overlays word 0 -- confirm against struct bpf_xfrm_state.
		 */
		memset(&to->remote_ipv6[1], 0, sizeof(__u32) * 3);
	}

	return 0;
err_clear:
	memset(to, 0, size);
	return -EINVAL;
}

/* Descriptor for bpf_skb_get_xfrm_state(); the output struct is declared
 * as uninitialized memory, hence the zeroing on the error path above.
 */
static const struct bpf_func_proto bpf_skb_get_xfrm_state_proto = {
	.func		= bpf_skb_get_xfrm_state,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_ANYTHING,
	.arg3_type	= ARG_PTR_TO_UNINIT_MEM,
	.arg4_type	= ARG_CONST_SIZE,
	.arg5_type	= ARG_ANYTHING,
};
#endif /* CONFIG_XFRM */

#if IS_ENABLED(CONFIG_INET) || IS_ENABLED(CONFIG_IPV6)
/* Common tail of the IPv4/IPv6 FIB lookups: clear the VLAN fields of
 * @params and, when @mtu is non-zero, report it in mtu_result.
 * Always returns 0.
 */
static int bpf_fib_set_fwd_params(struct bpf_fib_lookup *params, u32 mtu)
{
	params->h_vlan_TCI = 0;
	params->h_vlan_proto = 0;
	if (mtu)
		params->mtu_result = mtu; /* union with tot_len */

	return 0;
}
#endif

#if IS_ENABLED(CONFIG_INET)
/* IPv4 FIB lookup backing bpf_xdp_fib_lookup() / bpf_skb_fib_lookup().
 *
 * Builds a flowi4 from @params, looks it up (in a single table when
 * BPF_FIB_LOOKUP_DIRECT is set, otherwise via fib_lookup()) and, on a
 * unicast hit, writes the result back into @params: egress ifindex,
 * route metric, next-hop address, optionally the preferred source
 * address, and -- unless BPF_FIB_LOOKUP_SKIP_NEIGH is set -- the
 * source/destination MAC addresses.
 *
 * Returns -ENODEV when params->ifindex does not resolve, a
 * BPF_FIB_LKUP_RET_* code for the failure cases handled below, and the
 * result of bpf_fib_set_fwd_params() (0) on success.
 */
static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
			       u32 flags, bool check_mtu)
{
	struct fib_nh_common *nhc;
	struct in_device *in_dev;
	struct neighbour *neigh;
	struct net_device *dev;
	struct fib_result res;
	struct flowi4 fl4;
	u32 mtu = 0;
	int err;

	dev = dev_get_by_index_rcu(net, params->ifindex);
	if (unlikely(!dev))
		return -ENODEV;

	/* verify forwarding is enabled on this interface */
	in_dev = __in_dev_get_rcu(dev);
	if (unlikely(!in_dev || !IN_DEV_FORWARD(in_dev)))
		return BPF_FIB_LKUP_RET_FWD_DISABLED;

	/* params->ifindex is the output interface for OUTPUT lookups and
	 * the input interface otherwise.
	 */
	if (flags & BPF_FIB_LOOKUP_OUTPUT) {
		fl4.flowi4_iif = 1;
		fl4.flowi4_oif = params->ifindex;
	} else {
		fl4.flowi4_iif = params->ifindex;
		fl4.flowi4_oif = 0;
	}
	fl4.flowi4_tos = params->tos & INET_DSCP_MASK;
	fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
	fl4.flowi4_flags = 0;

	fl4.flowi4_proto = params->l4_protocol;
	fl4.daddr = params->ipv4_dst;
	fl4.saddr = params->ipv4_src;
	fl4.fl4_sport = params->sport;
	fl4.fl4_dport = params->dport;
	fl4.flowi4_multipath_hash = 0;

	if (flags & BPF_FIB_LOOKUP_DIRECT) {
		/* Table: the device's l3mdev table if it has one, else main;
		 * BPF_FIB_LOOKUP_TBID overrides both with params->tbid.
		 */
		u32 tbid = l3mdev_fib_table_rcu(dev) ? : RT_TABLE_MAIN;
		struct fib_table *tb;

		if (flags & BPF_FIB_LOOKUP_TBID) {
			tbid = params->tbid;
			/* zero out for vlan output */
			params->tbid = 0;
		}

		tb = fib_get_table(net, tbid);
		if (unlikely(!tb))
			return BPF_FIB_LKUP_RET_NOT_FWDED;

		err = fib_table_lookup(tb, &fl4, &res, FIB_LOOKUP_NOREF);
	} else {
		if (flags & BPF_FIB_LOOKUP_MARK)
			fl4.flowi4_mark = params->mark;
		else
			fl4.flowi4_mark = 0;
		fl4.flowi4_secid = 0;
		fl4.flowi4_tun_key.tun_id = 0;
		fl4.flowi4_uid = sock_net_uid(net, NULL);

		err = fib_lookup(net, &fl4, &res, FIB_LOOKUP_NOREF);
	}

	if (err) {
		/* map fib lookup errors to RTN_ type */
		if (err == -EINVAL)
			return BPF_FIB_LKUP_RET_BLACKHOLE;
		if (err == -EHOSTUNREACH)
			return BPF_FIB_LKUP_RET_UNREACHABLE;
		if (err == -EACCES)
			return BPF_FIB_LKUP_RET_PROHIBIT;

		return BPF_FIB_LKUP_RET_NOT_FWDED;
	}

	if (res.type != RTN_UNICAST)
		return BPF_FIB_LKUP_RET_NOT_FWDED;

	if (fib_info_num_path(res.fi) > 1)
		fib_select_path(net, &res, &fl4, NULL);

	if (check_mtu) {
		mtu = ip_mtu_from_fib_result(&res, params->ipv4_dst);
		if (params->tot_len > mtu) {
			params->mtu_result = mtu; /* union with tot_len */
			return BPF_FIB_LKUP_RET_FRAG_NEEDED;
		}
	}

	nhc = res.nhc;

	/* do not handle lwt encaps right now */
	if (nhc->nhc_lwtstate)
		return BPF_FIB_LKUP_RET_UNSUPP_LWT;

	dev = nhc->nhc_dev;

	params->rt_metric = res.fi->fib_priority;
	params->ifindex = dev->ifindex;

	if (flags & BPF_FIB_LOOKUP_SRC)
		params->ipv4_src = fib_result_prefsrc(net, &res);

	/* xdp and cls_bpf programs are run in RCU-bh so
	 * rcu_read_lock_bh is not needed here
	 */
	if (likely(nhc->nhc_gw_family != AF_INET6)) {
		/* IPv4 gateway (if any) replaces the destination. */
		if (nhc->nhc_gw_family)
			params->ipv4_dst = nhc->nhc_gw.ipv4;
	} else {
		/* IPv6 gateway for an IPv4 route: switch @params to
		 * AF_INET6 and report the gateway in ipv6_dst.
		 */
		struct in6_addr *dst = (struct in6_addr *)params->ipv6_dst;

		params->family = AF_INET6;
		*dst = nhc->nhc_gw.ipv6;
	}

	if (flags & BPF_FIB_LOOKUP_SKIP_NEIGH)
		goto set_fwd_params;

	if (likely(nhc->nhc_gw_family != AF_INET6))
		neigh = __ipv4_neigh_lookup_noref(dev,
						  (__force u32)params->ipv4_dst);
	else
		neigh = __ipv6_neigh_lookup_noref_stub(dev, params->ipv6_dst);

	if (!neigh || !(READ_ONCE(neigh->nud_state) & NUD_VALID))
		return BPF_FIB_LKUP_RET_NO_NEIGH;
	memcpy(params->dmac, neigh->ha, ETH_ALEN);
	memcpy(params->smac, dev->dev_addr, ETH_ALEN);

set_fwd_params:
	return bpf_fib_set_fwd_params(params, mtu);
}
#endif

#if IS_ENABLED(CONFIG_IPV6)
/* IPv6 counterpart of bpf_ipv4_fib_lookup().
 *
 * Same contract: @params is the lookup key on entry and carries the
 * forwarding result on success.  Returns -ENODEV when params->ifindex
 * does not resolve, a BPF_FIB_LKUP_RET_* code for the failure cases
 * handled below, and the result of bpf_fib_set_fwd_params() (0) on
 * success.
 */
static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
			       u32 flags, bool check_mtu)
{
	struct in6_addr *src = (struct in6_addr *) params->ipv6_src;
	struct in6_addr *dst = (struct in6_addr *) params->ipv6_dst;
	struct fib6_result res = {};
	struct neighbour *neigh;
	struct net_device *dev;
	struct inet6_dev *idev;
	struct flowi6 fl6;
	int strict = 0;
	int oif, err;
	u32 mtu = 0;

	/* link local addresses are never forwarded */
	if (rt6_need_strict(dst) || rt6_need_strict(src))
		return BPF_FIB_LKUP_RET_NOT_FWDED;

	dev = dev_get_by_index_rcu(net, params->ifindex);
	if (unlikely(!dev))
		return -ENODEV;

	/* Forwarding must be enabled on the interface. */
	idev = __in6_dev_get_safely(dev);
	if (unlikely(!idev || !READ_ONCE(idev->cnf.forwarding)))
		return BPF_FIB_LKUP_RET_FWD_DISABLED;

	if (flags & BPF_FIB_LOOKUP_OUTPUT) {
		fl6.flowi6_iif = 1;
		oif = fl6.flowi6_oif = params->ifindex;
	} else {
		oif = fl6.flowi6_iif = params->ifindex;
		fl6.flowi6_oif = 0;
		strict = RT6_LOOKUP_F_HAS_SADDR;
	}
	fl6.flowlabel = params->flowinfo;
	fl6.flowi6_scope = 0;
	fl6.flowi6_flags = 0;
	fl6.mp_hash = 0;

	fl6.flowi6_proto = params->l4_protocol;
	fl6.daddr = *dst;
	fl6.saddr = *src;
	fl6.fl6_sport = params->sport;
	fl6.fl6_dport = params->dport;

	if (flags & BPF_FIB_LOOKUP_DIRECT) {
		/* Same table selection as the IPv4 variant. */
		u32 tbid = l3mdev_fib_table_rcu(dev) ? : RT_TABLE_MAIN;
		struct fib6_table *tb;

		if (flags & BPF_FIB_LOOKUP_TBID) {
			tbid = params->tbid;
			/* zero out for vlan output */
			params->tbid = 0;
		}

		tb = ipv6_stub->fib6_get_table(net, tbid);
		if (unlikely(!tb))
			return BPF_FIB_LKUP_RET_NOT_FWDED;

		err = ipv6_stub->fib6_table_lookup(net, tb, oif, &fl6, &res,
						   strict);
	} else {
		if (flags & BPF_FIB_LOOKUP_MARK)
			fl6.flowi6_mark = params->mark;
		else
			fl6.flowi6_mark = 0;
		fl6.flowi6_secid = 0;
		fl6.flowi6_tun_key.tun_id = 0;
		fl6.flowi6_uid = sock_net_uid(net, NULL);

		err = ipv6_stub->fib6_lookup(net, oif, &fl6, &res, strict);
	}

	/* No usable route entry: error, NULL/ERR pointer or the null entry. */
	if (unlikely(err || IS_ERR_OR_NULL(res.f6i) ||
		     res.f6i == net->ipv6.fib6_null_entry))
		return BPF_FIB_LKUP_RET_NOT_FWDED;

	switch (res.fib6_type) {
	/* only unicast is forwarded */
	case RTN_UNICAST:
		break;
	case RTN_BLACKHOLE:
		return BPF_FIB_LKUP_RET_BLACKHOLE;
	case RTN_UNREACHABLE:
		return BPF_FIB_LKUP_RET_UNREACHABLE;
	case RTN_PROHIBIT:
		return BPF_FIB_LKUP_RET_PROHIBIT;
	default:
		return BPF_FIB_LKUP_RET_NOT_FWDED;
	}

	ipv6_stub->fib6_select_path(net, &res, &fl6, fl6.flowi6_oif,
				    fl6.flowi6_oif != 0, NULL, strict);

	if (check_mtu) {
		mtu = ipv6_stub->ip6_mtu_from_fib6(&res, dst, src);
		if (params->tot_len > mtu) {
			params->mtu_result = mtu; /* union with tot_len */
			return BPF_FIB_LKUP_RET_FRAG_NEEDED;
		}
	}

	/* Lightweight-tunnel next hops are not supported. */
	if (res.nh->fib_nh_lws)
		return BPF_FIB_LKUP_RET_UNSUPP_LWT;

	/* A gateway, if present, replaces the destination in @params. */
	if (res.nh->fib_nh_gw_family)
		*dst = res.nh->fib_nh_gw6;

	dev = res.nh->fib_nh_dev;
	params->rt_metric = res.f6i->fib6_metric;
	params->ifindex = dev->ifindex;

	if (flags & BPF_FIB_LOOKUP_SRC) {
		/* Prefer the route's prefsrc; otherwise ask for a source
		 * address on the egress device.
		 */
		if (res.f6i->fib6_prefsrc.plen) {
			*src = res.f6i->fib6_prefsrc.addr;
		} else {
			err = ipv6_bpf_stub->ipv6_dev_get_saddr(net, dev,
								&fl6.daddr, 0,
								src);
			if (err)
				return BPF_FIB_LKUP_RET_NO_SRC_ADDR;
		}
	}

	if (flags & BPF_FIB_LOOKUP_SKIP_NEIGH)
		goto set_fwd_params;

	/* xdp and cls_bpf programs are run in RCU-bh so rcu_read_lock_bh is
	 * not needed here.
	 */
	neigh = __ipv6_neigh_lookup_noref_stub(dev, dst);
	if (!neigh || !(READ_ONCE(neigh->nud_state) & NUD_VALID))
		return BPF_FIB_LKUP_RET_NO_NEIGH;
	memcpy(params->dmac, neigh->ha, ETH_ALEN);
	memcpy(params->smac, dev->dev_addr, ETH_ALEN);

set_fwd_params:
	return bpf_fib_set_fwd_params(params, mtu);
}
#endif

/* Every flag bit accepted by the two fib_lookup helpers below. */
#define BPF_FIB_LOOKUP_MASK (BPF_FIB_LOOKUP_DIRECT | BPF_FIB_LOOKUP_OUTPUT | \
			     BPF_FIB_LOOKUP_SKIP_NEIGH | BPF_FIB_LOOKUP_TBID | \
			     BPF_FIB_LOOKUP_SRC | BPF_FIB_LOOKUP_MARK)

/* BPF helper (XDP): FIB lookup in the netns of the receiving device.
 * The MTU check is always requested.  Returns -EINVAL for a short
 * @plen or unknown @flags, -EAFNOSUPPORT for an unsupported family,
 * otherwise the per-family lookup result.
 */
BPF_CALL_4(bpf_xdp_fib_lookup, struct xdp_buff *, ctx,
	   struct bpf_fib_lookup *, params, int, plen, u32, flags)
{
	if (plen < sizeof(*params))
		return -EINVAL;

	if (flags & ~BPF_FIB_LOOKUP_MASK)
		return -EINVAL;

	switch (params->family) {
#if IS_ENABLED(CONFIG_INET)
	case AF_INET:
		return bpf_ipv4_fib_lookup(dev_net(ctx->rxq->dev), params,
					   flags, true);
#endif
#if IS_ENABLED(CONFIG_IPV6)
	case AF_INET6:
		return bpf_ipv6_fib_lookup(dev_net(ctx->rxq->dev), params,
					   flags, true);
#endif
	}
	return -EAFNOSUPPORT;
}

/* Descriptor for bpf_xdp_fib_lookup(); GPL-only, @params is a writable
 * memory argument.
 */
static const struct bpf_func_proto bpf_xdp_fib_lookup_proto = {
	.func		= bpf_xdp_fib_lookup,
	.gpl_only	= true,
	.ret_type	= RET_INTEGER,
	.arg1_type      = ARG_PTR_TO_CTX,
	.arg2_type      = ARG_PTR_TO_MEM,
	.arg3_type      = ARG_CONST_SIZE,
	.arg4_type	= ARG_ANYTHING,
};

/* BPF helper (skb): FIB lookup in the netns of skb->dev.
 *
 * The route MTU is checked against params->tot_len only when tot_len is
 * non-zero; otherwise, after a successful lookup, the skb itself is
 * checked against the egress device and the device MTU is reported in
 * params->mtu_result.
 */
BPF_CALL_4(bpf_skb_fib_lookup, struct sk_buff *, skb,
	   struct bpf_fib_lookup *, params, int, plen, u32, flags)
{
	struct net *net = dev_net(skb->dev);
	int rc = -EAFNOSUPPORT;
	bool check_mtu = false;

	if (plen < sizeof(*params))
		return -EINVAL;

	if (flags & ~BPF_FIB_LOOKUP_MASK)
		return -EINVAL;

	if (params->tot_len)
		check_mtu = true;

	switch (params->family) {
#if IS_ENABLED(CONFIG_INET)
	case AF_INET:
		rc = bpf_ipv4_fib_lookup(net, params, flags, check_mtu);
		break;
#endif
#if IS_ENABLED(CONFIG_IPV6)
	case AF_INET6:
		rc = bpf_ipv6_fib_lookup(net, params, flags, check_mtu);
		break;
#endif
	}

	if (rc == BPF_FIB_LKUP_RET_SUCCESS && !check_mtu) {
		struct net_device *dev;

		/* When tot_len isn't provided by user, check skb
		 * against MTU of FIB lookup resulting net_device
		 */
		/* NOTE(review): dev is dereferenced below without a NULL
		 * check; this relies on the ifindex just written by the
		 * lookup still resolving -- confirm.
		 */
		dev = dev_get_by_index_rcu(net, params->ifindex);
		if (!is_skb_forwardable(dev, skb))
			rc = BPF_FIB_LKUP_RET_FRAG_NEEDED;

		params->mtu_result = dev->mtu; /* union with tot_len */
	}

	return rc;
}

/* Descriptor for bpf_skb_fib_lookup(); GPL-only. */
static const struct bpf_func_proto bpf_skb_fib_lookup_proto = {
	.func		= bpf_skb_fib_lookup,
	.gpl_only	= true,
	.ret_type	= RET_INTEGER,
	.arg1_type      = ARG_PTR_TO_CTX,
	.arg2_type      = ARG_PTR_TO_MEM,
	.arg3_type      = ARG_CONST_SIZE,
	.arg4_type	= ARG_ANYTHING,
};

/* Resolve @ifindex in the netns of @dev_curr; ifindex 0 means "use
 * @dev_curr itself".  May return NULL when the index does not resolve.
 */
static struct net_device *__dev_via_ifindex(struct net_device *dev_curr,
					    u32 ifindex)
{
	struct net *netns = dev_net(dev_curr);

	/* Non-redirect use-cases can use ifindex=0 and save ifindex lookup */
	if (ifindex == 0)
		return dev_curr;

	return dev_get_by_index_rcu(netns, ifindex);
}

/* BPF helper (skb): check a packet length against a device MTU.
 *
 * The device is skb->dev, or the one named by @ifindex when non-zero.
 * The length checked is *@mtu_len plus the device's hard_header_len when
 * *@mtu_len is non-zero, else skb->len; @len_diff is added in both cases.
 * On every path that reaches the device lookup successfully, *@mtu_len
 * is overwritten with the device MTU.
 *
 * Returns -EINVAL for unknown @flags or for BPF_MTU_CHK_SEGS combined
 * with a non-zero @len_diff / *@mtu_len, -ENODEV when the device is not
 * found, otherwise a BPF_MTU_CHK_RET_* code.
 */
BPF_CALL_5(bpf_skb_check_mtu, struct sk_buff *, skb,
	   u32, ifindex, u32 *, mtu_len, s32, len_diff, u64, flags)
{
	int ret = BPF_MTU_CHK_RET_FRAG_NEEDED;
	struct net_device *dev = skb->dev;
	int mtu, dev_len, skb_len;

	if (unlikely(flags & ~(BPF_MTU_CHK_SEGS)))
		return -EINVAL;

	if (unlikely(flags & BPF_MTU_CHK_SEGS && (len_diff || *mtu_len)))
		return -EINVAL;

	dev = __dev_via_ifindex(dev, ifindex);
	if (unlikely(!dev))
		return -ENODEV;

	mtu = READ_ONCE(dev->mtu);
	dev_len = mtu + dev->hard_header_len;

	/* If set use *mtu_len as input, L3 as iph->tot_len (like fib_lookup) */
	skb_len = *mtu_len ? *mtu_len + dev->hard_header_len : skb->len;

	skb_len += len_diff; /* minus result pass check */
	if (skb_len <= dev_len) {
		ret = BPF_MTU_CHK_RET_SUCCESS;
		goto out;
	}
	/* At this point, skb->len exceeds the MTU, but as it includes the
	 * length of all segments, each segment can still be below the MTU.
	 * The SKB can possibly get re-segmented in the transmit path (see
	 * validate_xmit_skb).  Thus, the user must choose if segs are to be
	 * MTU checked.
	 */
	if (skb_is_gso(skb)) {
		ret = BPF_MTU_CHK_RET_SUCCESS;
		if (flags & BPF_MTU_CHK_SEGS &&
		    !skb_gso_validate_network_len(skb, mtu))
			ret = BPF_MTU_CHK_RET_SEGS_TOOBIG;
	}
out:
	*mtu_len = mtu;
	return ret;
}

/* BPF helper (XDP): same length-vs-MTU check for an xdp_buff.  The
 * default length is data_end - data; any non-zero @flags is rejected.
 */
BPF_CALL_5(bpf_xdp_check_mtu, struct xdp_buff *, xdp,
	   u32, ifindex, u32 *, mtu_len, s32, len_diff, u64, flags)
{
	struct net_device *dev = xdp->rxq->dev;
	int xdp_len = xdp->data_end - xdp->data;
	int ret = BPF_MTU_CHK_RET_SUCCESS;
	int mtu, dev_len;

	/* XDP variant doesn't support multi-buffer segment check (yet) */
	if (unlikely(flags))
		return -EINVAL;

	dev = __dev_via_ifindex(dev, ifindex);
	if (unlikely(!dev))
		return -ENODEV;

	mtu = READ_ONCE(dev->mtu);
	dev_len = mtu + dev->hard_header_len;

	/* Use *mtu_len as input, L3 as iph->tot_len (like fib_lookup) */
	if (*mtu_len)
		xdp_len = *mtu_len + dev->hard_header_len;

	xdp_len += len_diff; /* minus result pass check */
	if (xdp_len > dev_len)
		ret = BPF_MTU_CHK_RET_FRAG_NEEDED;

	*mtu_len = mtu;
	return ret;
}

/* Descriptor for bpf_skb_check_mtu(); GPL-only, arg3 is a writable,
 * aligned u32.
 */
static const struct bpf_func_proto bpf_skb_check_mtu_proto = {
	.func		= bpf_skb_check_mtu,
	.gpl_only	= true,
	.ret_type	= RET_INTEGER,
	.arg1_type      = ARG_PTR_TO_CTX,
	.arg2_type      = ARG_ANYTHING,
	.arg3_type      = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_WRITE | MEM_ALIGNED,
	.arg3_size	= sizeof(u32),
	.arg4_type      = ARG_ANYTHING,
	.arg5_type      = ARG_ANYTHING,
};

/* Descriptor for bpf_xdp_check_mtu(); same argument layout. */
static const struct bpf_func_proto bpf_xdp_check_mtu_proto = {
	.func		= bpf_xdp_check_mtu,
	.gpl_only	= true,
	.ret_type	= RET_INTEGER,
	.arg1_type      = ARG_PTR_TO_CTX,
	.arg2_type      = ARG_ANYTHING,
	.arg3_type      = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_WRITE | MEM_ALIGNED,
	.arg3_size	= sizeof(u32),
	.arg4_type      = ARG_ANYTHING,
	.arg5_type      = ARG_ANYTHING,
};

#if IS_ENABLED(CONFIG_IPV6_SEG6_BPF)
/* Validate the segment routing header in @hdr and push it onto @skb,
 * either inline (IPv6 packets only) or as a full encapsulation, then
 * redo the next-hop lookup.
 *
 * Returns -EINVAL for an invalid SRH or unknown @type, -EBADMSG for
 * inline mode on a non-IPv6 skb, the seg6_do_srh_*() error if the push
 * fails, otherwise the result of seg6_lookup_nexthop().
 */
static int bpf_push_seg6_encap(struct sk_buff *skb, u32 type, void *hdr, u32 len)
{
	int err;
	struct ipv6_sr_hdr *srh = (struct ipv6_sr_hdr *)hdr;

	if (!seg6_validate_srh(srh, len, false))
		return -EINVAL;

	switch (type) {
	case BPF_LWT_ENCAP_SEG6_INLINE:
		if (skb->protocol != htons(ETH_P_IPV6))
			return -EBADMSG;

		err = seg6_do_srh_inline(skb, srh);
		break;
	case BPF_LWT_ENCAP_SEG6:
		skb_reset_inner_headers(skb);
		skb->encapsulation = 1;
		err = seg6_do_srh_encap(skb, srh, IPPROTO_IPV6);
		break;
	default:
		return -EINVAL;
	}

	/* Recomputed before the error check, i.e. also when the push failed. */
	bpf_compute_data_pointers(skb);
	if (err)
		return err;

	skb_set_transport_header(skb, sizeof(struct ipv6hdr));

	return seg6_lookup_nexthop(skb, NULL, 0);
}
#endif /* CONFIG_IPV6_SEG6_BPF */

#if IS_ENABLED(CONFIG_LWTUNNEL_BPF)
/* Thin wrapper around bpf_lwt_push_ip_encap(). */
static int bpf_push_ip_encap(struct sk_buff *skb, void *hdr, u32 len,
			     bool ingress)
{
	return bpf_lwt_push_ip_encap(skb, hdr, len, ingress);
}
#endif

/* BPF helper (LWT in): push an encapsulation header of the given @type.
 * Supports the two seg6 types and IP encap, each only when its config
 * option is enabled; anything else returns -EINVAL.
 */
BPF_CALL_4(bpf_lwt_in_push_encap, struct sk_buff *, skb, u32, type, void *, hdr,
	   u32, len)
{
	switch (type) {
#if IS_ENABLED(CONFIG_IPV6_SEG6_BPF)
	case BPF_LWT_ENCAP_SEG6:
	case BPF_LWT_ENCAP_SEG6_INLINE:
		return bpf_push_seg6_encap(skb, type, hdr, len);
#endif
#if IS_ENABLED(CONFIG_LWTUNNEL_BPF)
	case BPF_LWT_ENCAP_IP:
		return bpf_push_ip_encap(skb, hdr, len, true /* ingress */);
#endif
	default:
		return -EINVAL;
	}
}

/* BPF helper (LWT xmit): only IP encap is accepted here. */
BPF_CALL_4(bpf_lwt_xmit_push_encap, struct sk_buff *, skb, u32, type,
	   void *, hdr, u32, len)
{
	switch (type) {
#if IS_ENABLED(CONFIG_LWTUNNEL_BPF)
	case BPF_LWT_ENCAP_IP:
		return bpf_push_ip_encap(skb, hdr, len, false /* egress */);
#endif
	default:
		return -EINVAL;
	}
}

/* Descriptor for bpf_lwt_in_push_encap(). */
static const struct bpf_func_proto bpf_lwt_in_push_encap_proto = {
	.func		= bpf_lwt_in_push_encap,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_ANYTHING,
	.arg3_type	= ARG_PTR_TO_MEM | MEM_RDONLY,
	.arg4_type	= ARG_CONST_SIZE
};

/* Descriptor for bpf_lwt_xmit_push_encap(). */
static const struct bpf_func_proto bpf_lwt_xmit_push_encap_proto = {
	.func		= bpf_lwt_xmit_push_encap,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_ANYTHING,
	.arg3_type	= ARG_PTR_TO_MEM | MEM_RDONLY,
	.arg4_type	= ARG_CONST_SIZE
};

#if IS_ENABLED(CONFIG_IPV6_SEG6_BPF)
/* BPF helper: write @len bytes from @from into the packet at @offset,
 * restricted to parts of the segment routing header.
 */
BPF_CALL_4(bpf_lwt_seg6_store_bytes, struct sk_buff *, skb, u32, offset,
	   const void *, from, u32, len)
{
	struct seg6_bpf_srh_state *srh_state =
this_cpu_ptr(&seg6_bpf_srh_states);
	struct ipv6_sr_hdr *srh = srh_state->srh;
	void *srh_tlvs, *srh_end, *ptr;
	int srhoff = 0;

	lockdep_assert_held(&srh_state->bh_lock);

	if (srh == NULL)
		return -EINVAL;

	/* NOTE(review): unlike bpf_lwt_seg6_adjust_srh(), srh_tlvs is
	 * computed here without adding sizeof(*srh) -- confirm this is
	 * intended.
	 */
	srh_tlvs = (void *)((char *)srh + ((srh->first_segment + 1) << 4));
	srh_end = (void *)((char *)srh + sizeof(*srh) + srh_state->hdrlen);

	ptr = skb->data + offset;
	/* A write inside [srh_tlvs, srh_end] is allowed but marks the cached
	 * SRH state as not valid.  Any other write must lie within
	 * [&srh->flags, &srh->segments).
	 */
	if (ptr >= srh_tlvs && ptr + len <= srh_end)
		srh_state->valid = false;
	else if (ptr < (void *)&srh->flags ||
		 ptr + len > (void *)&srh->segments)
		return -EFAULT;

	if (unlikely(bpf_try_make_writable(skb, offset + len)))
		return -EFAULT;
	/* Re-locate the SRH after bpf_try_make_writable() and refresh the
	 * cached pointer (presumably skb->data may have moved -- confirm).
	 */
	if (ipv6_find_hdr(skb, &srhoff, IPPROTO_ROUTING, NULL, NULL) < 0)
		return -EINVAL;
	srh_state->srh = (struct ipv6_sr_hdr *)(skb->data + srhoff);

	memcpy(skb->data + offset, from, len);
	return 0;
}

/* Descriptor for bpf_lwt_seg6_store_bytes(). */
static const struct bpf_func_proto bpf_lwt_seg6_store_bytes_proto = {
	.func		= bpf_lwt_seg6_store_bytes,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_ANYTHING,
	.arg3_type	= ARG_PTR_TO_MEM | MEM_RDONLY,
	.arg4_type	= ARG_CONST_SIZE
};

/* Refresh this CPU's cached SRH state from @skb: srh is set to NULL when
 * the packet has no routing header; otherwise srh and hdrlen
 * (srh->hdrlen << 3) are updated and the state is marked valid.
 */
static void bpf_update_srh_state(struct sk_buff *skb)
{
	struct seg6_bpf_srh_state *srh_state =
		this_cpu_ptr(&seg6_bpf_srh_states);
	int srhoff = 0;

	if (ipv6_find_hdr(skb, &srhoff, IPPROTO_ROUTING, NULL, NULL) < 0) {
		srh_state->srh = NULL;
	} else {
		srh_state->srh = (struct ipv6_sr_hdr *)(skb->data + srhoff);
		srh_state->hdrlen = srh_state->srh->hdrlen << 3;
		srh_state->valid = true;
	}
}

/* BPF helper: apply a seg6local @action to @skb.
 *
 * @param / @param_len carry the action argument: an in6_addr for END_X,
 * an int for END_T and END_DT6, and an SRH for END_B6 / END_B6_ENCAP.
 *
 * Returns -EBADMSG when the current SRH is not valid (or, for END_DT6,
 * when the inner IPv6 header cannot be found or pulled), -EINVAL for a
 * wrong @param_len or unknown @action, otherwise the result of the
 * next-hop lookup or encap push.
 */
BPF_CALL_4(bpf_lwt_seg6_action, struct sk_buff *, skb,
	   u32, action, void *, param, u32, param_len)
{
	struct seg6_bpf_srh_state *srh_state =
		this_cpu_ptr(&seg6_bpf_srh_states);
	int hdroff = 0;
	int err;

	lockdep_assert_held(&srh_state->bh_lock);

	switch (action) {
	case SEG6_LOCAL_ACTION_END_X:
		if (!seg6_bpf_has_valid_srh(skb))
			return -EBADMSG;
		if (param_len != sizeof(struct in6_addr))
			return -EINVAL;
		return seg6_lookup_nexthop(skb, (struct in6_addr *)param, 0);
	case SEG6_LOCAL_ACTION_END_T:
		if (!seg6_bpf_has_valid_srh(skb))
			return -EBADMSG;
		if (param_len != sizeof(int))
			return -EINVAL;
		return seg6_lookup_nexthop(skb, NULL, *(int *)param);
	case SEG6_LOCAL_ACTION_END_DT6:
		if (!seg6_bpf_has_valid_srh(skb))
			return -EBADMSG;
		if (param_len != sizeof(int))
			return -EINVAL;

		/* Strip everything in front of the inner IPv6 header and
		 * make it the new network header.
		 */
		if (ipv6_find_hdr(skb, &hdroff, IPPROTO_IPV6, NULL, NULL) < 0)
			return -EBADMSG;
		if (!pskb_pull(skb, hdroff))
			return -EBADMSG;

		skb_postpull_rcsum(skb, skb_network_header(skb), hdroff);
		skb_reset_network_header(skb);
		skb_reset_transport_header(skb);
		skb->encapsulation = 0;

		bpf_compute_data_pointers(skb);
		bpf_update_srh_state(skb);
		return seg6_lookup_nexthop(skb, NULL, *(int *)param);
	case SEG6_LOCAL_ACTION_END_B6:
		/* A missing SRH is tolerated; a present one must be valid. */
		if (srh_state->srh && !seg6_bpf_has_valid_srh(skb))
			return -EBADMSG;
		err = bpf_push_seg6_encap(skb, BPF_LWT_ENCAP_SEG6_INLINE,
					  param, param_len);
		if (!err)
			bpf_update_srh_state(skb);

		return err;
	case SEG6_LOCAL_ACTION_END_B6_ENCAP:
		if (srh_state->srh && !seg6_bpf_has_valid_srh(skb))
			return -EBADMSG;
		err = bpf_push_seg6_encap(skb, BPF_LWT_ENCAP_SEG6,
					  param, param_len);
		if (!err)
			bpf_update_srh_state(skb);

		return err;
	default:
		return -EINVAL;
	}
}

/* Descriptor for bpf_lwt_seg6_action(). */
static const struct bpf_func_proto bpf_lwt_seg6_action_proto = {
	.func		= bpf_lwt_seg6_action,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_ANYTHING,
	.arg3_type	= ARG_PTR_TO_MEM | MEM_RDONLY,
	.arg4_type	= ARG_CONST_SIZE
};

/* BPF helper: grow (@len > 0) or shrink (@len <= 0) the packet by @len
 * bytes at @offset, which must fall inside the TLV area of the SRH.
 *
 * On success the IPv6 payload_len is rewritten, the cached SRH pointer
 * and hdrlen are updated and the SRH state is marked as not valid.
 * Returns -EINVAL when there is no cached SRH or it cannot be found
 * again afterwards, -EFAULT when the range lies outside the TLV area,
 * or the error from the skb resize helpers.
 */
BPF_CALL_3(bpf_lwt_seg6_adjust_srh, struct sk_buff *, skb, u32, offset,
	   s32, len)
{
	struct seg6_bpf_srh_state *srh_state =
		this_cpu_ptr(&seg6_bpf_srh_states);
	struct ipv6_sr_hdr *srh = srh_state->srh;
	void *srh_end, *srh_tlvs, *ptr;
	struct ipv6hdr *hdr;
	int srhoff = 0;
	int ret;

	lockdep_assert_held(&srh_state->bh_lock);

	if (unlikely(srh == NULL))
		return -EINVAL;

	srh_tlvs = (void *)((unsigned char *)srh + sizeof(*srh) +
			((srh->first_segment + 1) << 4));
	srh_end = (void *)((unsigned char *)srh + sizeof(*srh) +
			srh_state->hdrlen);
	ptr = skb->data + offset;

	if (unlikely(ptr < srh_tlvs || ptr > srh_end))
		return -EFAULT;
	/* When shrinking, the removed bytes must end before srh_end. */
	if (unlikely(len < 0 && (void *)((char *)ptr - len) > srh_end))
		return -EFAULT;

	if (len > 0) {
		ret = skb_cow_head(skb, len);
		if (unlikely(ret < 0))
			return ret;

		ret = bpf_skb_net_hdr_push(skb, offset, len);
	} else {
		ret = bpf_skb_net_hdr_pop(skb, offset, -1 * len);
	}

	/* Recomputed before the error check, i.e. also on failure. */
	bpf_compute_data_pointers(skb);
	if (unlikely(ret < 0))
		return ret;

	hdr = (struct ipv6hdr *)skb->data;
	hdr->payload_len = htons(skb->len - sizeof(struct ipv6hdr));

	if (ipv6_find_hdr(skb, &srhoff, IPPROTO_ROUTING, NULL, NULL) < 0)
		return -EINVAL;
	srh_state->srh = (struct ipv6_sr_hdr *)(skb->data + srhoff);
	srh_state->hdrlen += len;
	srh_state->valid = false;
	return 0;
}

/* Descriptor for bpf_lwt_seg6_adjust_srh(). */
static const struct bpf_func_proto bpf_lwt_seg6_adjust_srh_proto = {
	.func		= bpf_lwt_seg6_adjust_srh,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_ANYTHING,
	.arg3_type	= ARG_ANYTHING,
};
#endif /* CONFIG_IPV6_SEG6_BPF */

#ifdef CONFIG_INET
/* Look up a TCP socket (@proto == IPPROTO_TCP) or, for any other @proto,
 * a UDP socket matching @tuple in @net.  @family selects which half of
 * @tuple is read; anything other than AF_INET takes the IPv6 branch when
 * CONFIG_IPV6 is enabled.
 *
 * Returns the socket or NULL.  A socket that is neither refcounted by
 * the lookup nor flagged SOCK_RCU_FREE triggers a one-time warning and
 * is dropped (NULL is returned).
 */
static struct sock *sk_lookup(struct net *net, struct bpf_sock_tuple *tuple,
			      int dif, int sdif, u8 family, u8 proto)
{
	struct inet_hashinfo *hinfo = net->ipv4.tcp_death_row.hashinfo;
	bool refcounted = false;
	struct sock *sk = NULL;

	if (family == AF_INET) {
		__be32 src4 = tuple->ipv4.saddr;
		__be32 dst4 = tuple->ipv4.daddr;

		if (proto == IPPROTO_TCP)
			sk = __inet_lookup(net, hinfo, NULL, 0,
					   src4, tuple->ipv4.sport,
					   dst4, tuple->ipv4.dport,
					   dif, sdif, &refcounted);
		else
			sk = __udp4_lib_lookup(net, src4, tuple->ipv4.sport,
					       dst4, tuple->ipv4.dport,
					       dif, sdif, net->ipv4.udp_table, NULL);
#if IS_ENABLED(CONFIG_IPV6)
	} else {
		struct in6_addr *src6 = (struct in6_addr *)&tuple->ipv6.saddr;
		struct in6_addr *dst6 = (struct in6_addr *)&tuple->ipv6.daddr;

		/* Only the TCP/IPv6 lookup is handed the destination port
		 * through ntohs(); the other three lookups receive it as
		 * stored in @tuple.
		 */
		if (proto == IPPROTO_TCP)
			sk = __inet6_lookup(net, hinfo, NULL, 0,
					    src6, tuple->ipv6.sport,
					    dst6, ntohs(tuple->ipv6.dport),
					    dif, sdif, &refcounted);
		else if (likely(ipv6_bpf_stub))
			sk = ipv6_bpf_stub->udp6_lib_lookup(net,
							    src6, tuple->ipv6.sport,
							    dst6, tuple->ipv6.dport,
							    dif, sdif,
							    net->ipv4.udp_table, NULL);
#endif
	}

	if (unlikely(sk && !refcounted && !sock_flag(sk, SOCK_RCU_FREE))) {
		WARN_ONCE(1, "Found non-RCU, unreferenced socket!");
		sk = NULL;
	}
	return sk;
}

/* __bpf_skc_lookup() performs the core lookup for different types of sockets,
 * taking a reference on the socket if it doesn't have the flag SOCK_RCU_FREE.
 *
 * The address family is derived from @len (size of the ipv4 or ipv6 part
 * of @tuple); any other @len returns NULL.  Non-zero @flags, or a
 * @netns_id that is neither negative as s32 nor <= S32_MAX, also returns
 * NULL.  A negative @netns_id means "look in @caller_net"; otherwise the
 * netns is resolved by id relative to @caller_net.  A negative @sdif is
 * replaced by inet_sdif()/inet6_sdif() of @skb.
 */
static struct sock *
__bpf_skc_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len,
		 struct net *caller_net, u32 ifindex, u8 proto, u64 netns_id,
		 u64 flags, int sdif)
{
	struct sock *sk = NULL;
	struct net *net;
	u8 family;

	if (len == sizeof(tuple->ipv4))
		family = AF_INET;
	else if (len == sizeof(tuple->ipv6))
		family = AF_INET6;
	else
		return NULL;

	if (unlikely(flags || !((s32)netns_id < 0 || netns_id <= S32_MAX)))
		goto out;

	if (sdif < 0) {
		if (family == AF_INET)
			sdif = inet_sdif(skb);
		else
			sdif = inet6_sdif(skb);
	}

	if ((s32)netns_id < 0) {
		net = caller_net;
		sk = sk_lookup(net, tuple, ifindex, sdif, family, proto);
	} else {
		net = get_net_ns_by_id(caller_net, netns_id);
		if (unlikely(!net))
			goto out;
		sk = sk_lookup(net, tuple, ifindex, sdif, family, proto);
		/* Drop the netns reference taken by get_net_ns_by_id(). */
		put_net(net);
	}

out:
	return sk;
}

/* Like __bpf_skc_lookup(), but the result is converted with
 * sk_to_full_sk().  When that yields a different socket, the original
 * one is released and the replacement is returned only if it is NULL or
 * flagged SOCK_RCU_FREE.
 */
static struct sock *
__bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len,
		struct net *caller_net, u32 ifindex, u8 proto, u64 netns_id,
		u64 flags, int sdif)
{
	struct sock *sk = __bpf_skc_lookup(skb, tuple, len, caller_net,
					   ifindex, proto, netns_id, flags,
					   sdif);

	if (sk) {
		struct sock *sk2 = sk_to_full_sk(sk);

		/* sk_to_full_sk() may return (sk)->rsk_listener, so make sure the original sk
		 * sock refcnt is decremented to prevent a request_sock leak.
		 */
		if (sk2 != sk) {
			sock_gen_put(sk);
			/* Ensure there is no need to bump sk2 refcnt */
			if (unlikely(sk2 && !sock_flag(sk2, SOCK_RCU_FREE))) {
				WARN_ONCE(1, "Found non-RCU, unreferenced socket!");
				return NULL;
			}
			sk = sk2;
		}
	}

	return sk;
}

/* skb front end for __bpf_skc_lookup(): netns and ifindex come from
 * skb->dev when set, else from skb->sk with ifindex 0; sdif is passed as
 * -1 so that it is derived from the skb.
 */
static struct sock *
bpf_skc_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len,
	       u8 proto, u64 netns_id, u64 flags)
{
	struct net *caller_net;
	int ifindex;

	if (skb->dev) {
		caller_net = dev_net(skb->dev);
		ifindex = skb->dev->ifindex;
	} else {
		caller_net = sock_net(skb->sk);
		ifindex = 0;
	}

	return __bpf_skc_lookup(skb, tuple, len, caller_net, ifindex, proto,
				netns_id, flags, -1);
}

/* skb front end that applies the same sk_to_full_sk() handling as
 * __bpf_sk_lookup() to the result of bpf_skc_lookup().
 */
static struct sock *
bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len,
	      u8 proto, u64 netns_id, u64 flags)
{
	struct sock *sk = bpf_skc_lookup(skb, tuple, len, proto, netns_id,
					 flags);

	if (sk) {
		struct sock *sk2 = sk_to_full_sk(sk);

		/* sk_to_full_sk() may return (sk)->rsk_listener, so make sure the original sk
		 * sock refcnt is decremented to prevent a request_sock leak.
		 */
		if (sk2 != sk) {
			sock_gen_put(sk);
			/* Ensure there is no need to bump sk2 refcnt */
			if (unlikely(sk2 && !sock_flag(sk2, SOCK_RCU_FREE))) {
				WARN_ONCE(1, "Found non-RCU, unreferenced socket!");
				return NULL;
			}
			sk = sk2;
		}
	}

	return sk;
}

/* BPF helper: TCP lookup returning the raw lookup result (no
 * sk_to_full_sk() conversion), as a pointer-or-NULL value.
 */
BPF_CALL_5(bpf_skc_lookup_tcp, struct sk_buff *, skb,
	   struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags)
{
	return (unsigned long)bpf_skc_lookup(skb, tuple, len, IPPROTO_TCP,
					     netns_id, flags);
}

/* Descriptor for bpf_skc_lookup_tcp(): returns sock_common-or-NULL. */
static const struct bpf_func_proto bpf_skc_lookup_tcp_proto = {
	.func		= bpf_skc_lookup_tcp,
	.gpl_only	= false,
	.pkt_access	= true,
	.ret_type	= RET_PTR_TO_SOCK_COMMON_OR_NULL,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_PTR_TO_MEM | MEM_RDONLY,
	.arg3_type	= ARG_CONST_SIZE_OR_ZERO,
	.arg4_type	= ARG_ANYTHING,
	.arg5_type	= ARG_ANYTHING,
};

/* BPF helper: TCP lookup through bpf_sk_lookup(). */
BPF_CALL_5(bpf_sk_lookup_tcp, struct sk_buff *, skb,
	   struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags)
{
	return (unsigned long)bpf_sk_lookup(skb, tuple, len, IPPROTO_TCP,
					    netns_id, flags);
}

/* Descriptor for bpf_sk_lookup_tcp(): returns socket-or-NULL. */
static const struct bpf_func_proto bpf_sk_lookup_tcp_proto = {
	.func		= bpf_sk_lookup_tcp,
	.gpl_only	= false,
	.pkt_access	= true,
	.ret_type	= RET_PTR_TO_SOCKET_OR_NULL,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_PTR_TO_MEM | MEM_RDONLY,
	.arg3_type	= ARG_CONST_SIZE_OR_ZERO,
	.arg4_type	= ARG_ANYTHING,
	.arg5_type	= ARG_ANYTHING,
};

/* BPF helper: UDP lookup through bpf_sk_lookup(). */
BPF_CALL_5(bpf_sk_lookup_udp, struct sk_buff *, skb,
	   struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags)
{
	return (unsigned long)bpf_sk_lookup(skb, tuple, len, IPPROTO_UDP,
					    netns_id, flags);
}

/* Descriptor for bpf_sk_lookup_udp(): returns socket-or-NULL. */
static const struct bpf_func_proto bpf_sk_lookup_udp_proto = {
	.func		= bpf_sk_lookup_udp,
	.gpl_only	= false,
	.pkt_access	= true,
	.ret_type	= RET_PTR_TO_SOCKET_OR_NULL,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_PTR_TO_MEM | MEM_RDONLY,
	.arg3_type	= ARG_CONST_SIZE_OR_ZERO,
	.arg4_type	= ARG_ANYTHING,
	.arg5_type	= ARG_ANYTHING,
};

/* BPF helper (tc): TCP lookup via __bpf_skc_lookup(), with netns,
 * ifindex and sdif all taken from skb->dev.
 */
BPF_CALL_5(bpf_tc_skc_lookup_tcp, struct sk_buff *, skb,
	   struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64,
flags)
{
	struct net_device *dev = skb->dev;
	int ifindex = dev->ifindex, sdif = dev_sdif(dev);
	struct net *caller_net = dev_net(dev);

	return (unsigned long)__bpf_skc_lookup(skb, tuple, len, caller_net,
					       ifindex, IPPROTO_TCP, netns_id,
					       flags, sdif);
}

static const struct bpf_func_proto bpf_tc_skc_lookup_tcp_proto = {
	.func		= bpf_tc_skc_lookup_tcp,
	.gpl_only	= false,
	.pkt_access	= true,
	.ret_type	= RET_PTR_TO_SOCK_COMMON_OR_NULL,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_PTR_TO_MEM | MEM_RDONLY,
	.arg3_type	= ARG_CONST_SIZE_OR_ZERO,
	.arg4_type	= ARG_ANYTHING,
	.arg5_type	= ARG_ANYTHING,
};

/* tc flavour of the TCP socket lookup. Unlike the plain skb helper it
 * calls __bpf_sk_lookup() directly, passing the netns, ifindex and sdif
 * derived from skb->dev.
 */
BPF_CALL_5(bpf_tc_sk_lookup_tcp, struct sk_buff *, skb,
	   struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags)
{
	struct net_device *dev = skb->dev;
	int ifindex = dev->ifindex, sdif = dev_sdif(dev);
	struct net *caller_net = dev_net(dev);

	return (unsigned long)__bpf_sk_lookup(skb, tuple, len, caller_net,
					      ifindex, IPPROTO_TCP, netns_id,
					      flags, sdif);
}

static const struct bpf_func_proto bpf_tc_sk_lookup_tcp_proto = {
	.func		= bpf_tc_sk_lookup_tcp,
	.gpl_only	= false,
	.pkt_access	= true,
	.ret_type	= RET_PTR_TO_SOCKET_OR_NULL,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_PTR_TO_MEM | MEM_RDONLY,
	.arg3_type	= ARG_CONST_SIZE_OR_ZERO,
	.arg4_type	= ARG_ANYTHING,
	.arg5_type	= ARG_ANYTHING,
};

/* tc flavour of the UDP socket lookup; identical to the TCP one apart
 * from the protocol argument.
 */
BPF_CALL_5(bpf_tc_sk_lookup_udp, struct sk_buff *, skb,
	   struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags)
{
	struct net_device *dev = skb->dev;
	int ifindex = dev->ifindex, sdif = dev_sdif(dev);
	struct net *caller_net = dev_net(dev);

	return (unsigned long)__bpf_sk_lookup(skb, tuple, len, caller_net,
					      ifindex, IPPROTO_UDP, netns_id,
					      flags, sdif);
}

static const struct bpf_func_proto bpf_tc_sk_lookup_udp_proto = {
	.func		= bpf_tc_sk_lookup_udp,
	.gpl_only	= false,
	.pkt_access	= true,
	.ret_type	= RET_PTR_TO_SOCKET_OR_NULL,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_PTR_TO_MEM | MEM_RDONLY,
	.arg3_type	= ARG_CONST_SIZE_OR_ZERO,
	.arg4_type	= ARG_ANYTHING,
	.arg5_type	=
ARG_ANYTHING,
};

/* Helper: release a socket obtained from one of the lookup helpers.
 * A reference is dropped with sock_gen_put() only when sk is non-NULL
 * and sk_is_refcounted() says one is held. Always returns 0.
 */
BPF_CALL_1(bpf_sk_release, struct sock *, sk)
{
	if (sk && sk_is_refcounted(sk))
		sock_gen_put(sk);
	return 0;
}

static const struct bpf_func_proto bpf_sk_release_proto = {
	.func		= bpf_sk_release,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_BTF_ID_SOCK_COMMON | OBJ_RELEASE,
};

/* XDP flavour of the UDP socket lookup. There is no skb, so NULL is
 * passed to __bpf_sk_lookup(); netns, ifindex and sdif come from the
 * receive queue's device (ctx->rxq->dev). Note that netns_id is declared
 * u32 here, while the skb-based helpers declare it u64.
 */
BPF_CALL_5(bpf_xdp_sk_lookup_udp, struct xdp_buff *, ctx,
	   struct bpf_sock_tuple *, tuple, u32, len, u32, netns_id, u64, flags)
{
	struct net_device *dev = ctx->rxq->dev;
	int ifindex = dev->ifindex, sdif = dev_sdif(dev);
	struct net *caller_net = dev_net(dev);

	return (unsigned long)__bpf_sk_lookup(NULL, tuple, len, caller_net,
					      ifindex, IPPROTO_UDP, netns_id,
					      flags, sdif);
}

static const struct bpf_func_proto bpf_xdp_sk_lookup_udp_proto = {
	.func		= bpf_xdp_sk_lookup_udp,
	.gpl_only	= false,
	.pkt_access	= true,
	.ret_type	= RET_PTR_TO_SOCKET_OR_NULL,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_PTR_TO_MEM | MEM_RDONLY,
	.arg3_type	= ARG_CONST_SIZE_OR_ZERO,
	.arg4_type	= ARG_ANYTHING,
	.arg5_type	= ARG_ANYTHING,
};

/* XDP flavour of the skc TCP lookup; same device-derived arguments as
 * above, delegated to __bpf_skc_lookup().
 */
BPF_CALL_5(bpf_xdp_skc_lookup_tcp, struct xdp_buff *, ctx,
	   struct bpf_sock_tuple *, tuple, u32, len, u32, netns_id, u64, flags)
{
	struct net_device *dev = ctx->rxq->dev;
	int ifindex = dev->ifindex, sdif = dev_sdif(dev);
	struct net *caller_net = dev_net(dev);

	return (unsigned long)__bpf_skc_lookup(NULL, tuple, len, caller_net,
					       ifindex, IPPROTO_TCP, netns_id,
					       flags, sdif);
}

static const struct bpf_func_proto bpf_xdp_skc_lookup_tcp_proto = {
	.func		= bpf_xdp_skc_lookup_tcp,
	.gpl_only	= false,
	.pkt_access	= true,
	.ret_type	= RET_PTR_TO_SOCK_COMMON_OR_NULL,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_PTR_TO_MEM | MEM_RDONLY,
	.arg3_type	= ARG_CONST_SIZE_OR_ZERO,
	.arg4_type	= ARG_ANYTHING,
	.arg5_type	= ARG_ANYTHING,
};

/* XDP flavour of the TCP socket lookup. */
BPF_CALL_5(bpf_xdp_sk_lookup_tcp, struct xdp_buff *, ctx,
	   struct bpf_sock_tuple *, tuple, u32, len, u32, netns_id, u64, flags)
{
	struct net_device *dev = ctx->rxq->dev;
	int ifindex = dev->ifindex, sdif = dev_sdif(dev);
	struct net *caller_net =
dev_net(dev);

	return (unsigned long)__bpf_sk_lookup(NULL, tuple, len, caller_net,
					      ifindex, IPPROTO_TCP, netns_id,
					      flags, sdif);
}

static const struct bpf_func_proto bpf_xdp_sk_lookup_tcp_proto = {
	.func		= bpf_xdp_sk_lookup_tcp,
	.gpl_only	= false,
	.pkt_access	= true,
	.ret_type	= RET_PTR_TO_SOCKET_OR_NULL,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_PTR_TO_MEM | MEM_RDONLY,
	.arg3_type	= ARG_CONST_SIZE_OR_ZERO,
	.arg4_type	= ARG_ANYTHING,
	.arg5_type	= ARG_ANYTHING,
};

/* sock_addr flavour of the skc TCP lookup. There is neither an skb nor
 * a device: the caller netns is that of ctx->sk, the ifindex is 0 and
 * -1 is passed as the last __bpf_skc_lookup() argument.
 */
BPF_CALL_5(bpf_sock_addr_skc_lookup_tcp, struct bpf_sock_addr_kern *, ctx,
	   struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags)
{
	return (unsigned long)__bpf_skc_lookup(NULL, tuple, len,
					       sock_net(ctx->sk), 0,
					       IPPROTO_TCP, netns_id, flags,
					       -1);
}

/* Unlike the skb/XDP protos, the sock_addr protos do not set .pkt_access. */
static const struct bpf_func_proto bpf_sock_addr_skc_lookup_tcp_proto = {
	.func		= bpf_sock_addr_skc_lookup_tcp,
	.gpl_only	= false,
	.ret_type	= RET_PTR_TO_SOCK_COMMON_OR_NULL,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_PTR_TO_MEM | MEM_RDONLY,
	.arg3_type	= ARG_CONST_SIZE_OR_ZERO,
	.arg4_type	= ARG_ANYTHING,
	.arg5_type	= ARG_ANYTHING,
};

/* sock_addr flavour of the TCP socket lookup (via __bpf_sk_lookup()). */
BPF_CALL_5(bpf_sock_addr_sk_lookup_tcp, struct bpf_sock_addr_kern *, ctx,
	   struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags)
{
	return (unsigned long)__bpf_sk_lookup(NULL, tuple, len,
					      sock_net(ctx->sk), 0, IPPROTO_TCP,
					      netns_id, flags, -1);
}

static const struct bpf_func_proto bpf_sock_addr_sk_lookup_tcp_proto = {
	.func		= bpf_sock_addr_sk_lookup_tcp,
	.gpl_only	= false,
	.ret_type	= RET_PTR_TO_SOCKET_OR_NULL,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_PTR_TO_MEM | MEM_RDONLY,
	.arg3_type	= ARG_CONST_SIZE_OR_ZERO,
	.arg4_type	= ARG_ANYTHING,
	.arg5_type	= ARG_ANYTHING,
};

/* sock_addr flavour of the UDP socket lookup. */
BPF_CALL_5(bpf_sock_addr_sk_lookup_udp, struct bpf_sock_addr_kern *, ctx,
	   struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags)
{
	return (unsigned long)__bpf_sk_lookup(NULL, tuple, len,
					      sock_net(ctx->sk), 0, IPPROTO_UDP,
					      netns_id, flags, -1);
}

static const struct bpf_func_proto bpf_sock_addr_sk_lookup_udp_proto = {
.func		= bpf_sock_addr_sk_lookup_udp,
	.gpl_only	= false,
	.ret_type	= RET_PTR_TO_SOCKET_OR_NULL,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_PTR_TO_MEM | MEM_RDONLY,
	.arg3_type	= ARG_CONST_SIZE_OR_ZERO,
	.arg4_type	= ARG_ANYTHING,
	.arg5_type	= ARG_ANYTHING,
};

/* Decide whether an access of @size bytes at offset @off into
 * struct bpf_tcp_sock is allowed.
 *
 * The offset must lie in [0, offsetofend(icsk_retransmits)) and be a
 * multiple of @size. bytes_received and bytes_acked must be accessed as
 * __u64; every other offset must be accessed as __u32.
 * @type and @info are not consulted.
 */
bool bpf_tcp_sock_is_valid_access(int off, int size, enum bpf_access_type type,
				  struct bpf_insn_access_aux *info)
{
	if (off < 0 || off >= offsetofend(struct bpf_tcp_sock,
					  icsk_retransmits))
		return false;

	if (off % size != 0)
		return false;

	switch (off) {
	case offsetof(struct bpf_tcp_sock, bytes_received):
	case offsetof(struct bpf_tcp_sock, bytes_acked):
		return size == sizeof(__u64);
	default:
		return size == sizeof(__u32);
	}
}

/* Rewrite a load from struct bpf_tcp_sock (offset si->off) into a load
 * from the backing kernel structure, emitting the instruction(s) into
 * @insn_buf.
 *
 * Returns the number of instructions written, computed as
 * insn - insn_buf; the switch has no default, so an unmatched offset
 * writes nothing. @type, @prog and @target_size are not used by the
 * part of the body visible here.
 */
u32 bpf_tcp_sock_convert_ctx_access(enum bpf_access_type type,
				    const struct bpf_insn *si,
				    struct bpf_insn *insn_buf,
				    struct bpf_prog *prog, u32 *target_size)
{
	struct bpf_insn *insn = insn_buf;

/* Load FIELD from struct tcp_sock; the build-time check rejects a kernel
 * field that is wider than its struct bpf_tcp_sock counterpart.
 */
#define BPF_TCP_SOCK_GET_COMMON(FIELD)					\
	do {								\
		BUILD_BUG_ON(sizeof_field(struct tcp_sock, FIELD) >	\
			     sizeof_field(struct bpf_tcp_sock, FIELD));	\
		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct tcp_sock, FIELD),\
				      si->dst_reg, si->src_reg,		\
				      offsetof(struct tcp_sock, FIELD)); \
	} while (0)

/* Same as above, for a field that lives in struct inet_connection_sock. */
#define BPF_INET_SOCK_GET_COMMON(FIELD)					\
	do {								\
		BUILD_BUG_ON(sizeof_field(struct inet_connection_sock,	\
					  FIELD) >			\
			     sizeof_field(struct bpf_tcp_sock, FIELD));	\
		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(			\
					struct inet_connection_sock,	\
					FIELD),				\
				      si->dst_reg, si->src_reg,		\
				      offsetof(				\
					struct inet_connection_sock,	\
					FIELD));			\
	} while (0)

	BTF_TYPE_EMIT(struct bpf_tcp_sock);

	switch (si->off) {
	case offsetof(struct bpf_tcp_sock, rtt_min):
		/* rtt_min is a struct minmax in tcp_sock; only the 32-bit
		 * 'v' member of the minmax_sample at its start is loaded.
		 */
		BUILD_BUG_ON(sizeof_field(struct tcp_sock, rtt_min) !=
			     sizeof(struct minmax));
		BUILD_BUG_ON(sizeof(struct minmax) <
			     sizeof(struct minmax_sample));

		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
				      offsetof(struct tcp_sock, rtt_min) +
				      offsetof(struct minmax_sample, v));
		break;
	case offsetof(struct bpf_tcp_sock, snd_cwnd):
		BPF_TCP_SOCK_GET_COMMON(snd_cwnd);
		break;
	case
offsetof(struct bpf_tcp_sock, srtt_us):
		/* Each case below maps one struct bpf_tcp_sock field to a
		 * single load of the same-named struct tcp_sock field.
		 */
		BPF_TCP_SOCK_GET_COMMON(srtt_us);
		break;
	case offsetof(struct bpf_tcp_sock, snd_ssthresh):
		BPF_TCP_SOCK_GET_COMMON(snd_ssthresh);
		break;
	case offsetof(struct bpf_tcp_sock, rcv_nxt):
		BPF_TCP_SOCK_GET_COMMON(rcv_nxt);
		break;
	case offsetof(struct bpf_tcp_sock, snd_nxt):
		BPF_TCP_SOCK_GET_COMMON(snd_nxt);
		break;
	case offsetof(struct bpf_tcp_sock, snd_una):
		BPF_TCP_SOCK_GET_COMMON(snd_una);
		break;
	case offsetof(struct bpf_tcp_sock, mss_cache):
		BPF_TCP_SOCK_GET_COMMON(mss_cache);
		break;
	case offsetof(struct bpf_tcp_sock, ecn_flags):
		BPF_TCP_SOCK_GET_COMMON(ecn_flags);
		break;
	case offsetof(struct bpf_tcp_sock, rate_delivered):
		BPF_TCP_SOCK_GET_COMMON(rate_delivered);
		break;
	case offsetof(struct bpf_tcp_sock, rate_interval_us):
		BPF_TCP_SOCK_GET_COMMON(rate_interval_us);
		break;
	case offsetof(struct bpf_tcp_sock, packets_out):
		BPF_TCP_SOCK_GET_COMMON(packets_out);
		break;
	case offsetof(struct bpf_tcp_sock, retrans_out):
		BPF_TCP_SOCK_GET_COMMON(retrans_out);
		break;
	case offsetof(struct bpf_tcp_sock, total_retrans):
		BPF_TCP_SOCK_GET_COMMON(total_retrans);
		break;
	case offsetof(struct bpf_tcp_sock, segs_in):
		BPF_TCP_SOCK_GET_COMMON(segs_in);
		break;
	case offsetof(struct bpf_tcp_sock, data_segs_in):
		BPF_TCP_SOCK_GET_COMMON(data_segs_in);
		break;
	case offsetof(struct bpf_tcp_sock, segs_out):
		BPF_TCP_SOCK_GET_COMMON(segs_out);
		break;
	case offsetof(struct bpf_tcp_sock, data_segs_out):
		BPF_TCP_SOCK_GET_COMMON(data_segs_out);
		break;
	case offsetof(struct bpf_tcp_sock, lost_out):
		BPF_TCP_SOCK_GET_COMMON(lost_out);
		break;
	case offsetof(struct bpf_tcp_sock, sacked_out):
		BPF_TCP_SOCK_GET_COMMON(sacked_out);
		break;
	/* bytes_received and bytes_acked are the two offsets that
	 * bpf_tcp_sock_is_valid_access() requires to be read as __u64.
	 */
	case offsetof(struct bpf_tcp_sock, bytes_received):
		BPF_TCP_SOCK_GET_COMMON(bytes_received);
		break;
	case offsetof(struct bpf_tcp_sock, bytes_acked):
		BPF_TCP_SOCK_GET_COMMON(bytes_acked);
		break;
	case offsetof(struct bpf_tcp_sock, dsack_dups):
		BPF_TCP_SOCK_GET_COMMON(dsack_dups);
		break;
	case offsetof(struct bpf_tcp_sock, delivered):
BPF_TCP_SOCK_GET_COMMON(delivered);
		break;
	case offsetof(struct bpf_tcp_sock, delivered_ce):
		BPF_TCP_SOCK_GET_COMMON(delivered_ce);
		break;
	case offsetof(struct bpf_tcp_sock, icsk_retransmits):
		/* The only field read from struct inet_connection_sock. */
		BPF_INET_SOCK_GET_COMMON(icsk_retransmits);
		break;
	}

	return insn - insn_buf;
}

/* Helper: return @sk itself when it is a full socket whose protocol is
 * TCP, NULL otherwise. No reference is taken.
 */
BPF_CALL_1(bpf_tcp_sock, struct sock *, sk)
{
	if (sk_fullsock(sk) && sk->sk_protocol == IPPROTO_TCP)
		return (unsigned long)sk;

	return (unsigned long)NULL;
}

const struct bpf_func_proto bpf_tcp_sock_proto = {
	.func		= bpf_tcp_sock,
	.gpl_only	= false,
	.ret_type	= RET_PTR_TO_TCP_SOCK_OR_NULL,
	.arg1_type	= ARG_PTR_TO_SOCK_COMMON,
};

/* Helper: convert @sk with sk_to_full_sk() and return the result only
 * if it is in TCP_LISTEN state and flagged SOCK_RCU_FREE; otherwise
 * return NULL.
 */
BPF_CALL_1(bpf_get_listener_sock, struct sock *, sk)
{
	sk = sk_to_full_sk(sk);

	if (sk && sk->sk_state == TCP_LISTEN && sock_flag(sk, SOCK_RCU_FREE))
		return (unsigned long)sk;

	return (unsigned long)NULL;
}

static const struct bpf_func_proto bpf_get_listener_sock_proto = {
	.func		= bpf_get_listener_sock,
	.gpl_only	= false,
	.ret_type	= RET_PTR_TO_SOCKET_OR_NULL,
	.arg1_type	= ARG_PTR_TO_SOCK_COMMON,
};

/* Helper: set the ECN CE mark on an IPv4 or IPv6 packet.
 *
 * Returns 0 without touching the packet when the protocol is neither
 * IPv4 nor IPv6, when the linear area is shorter than the IP header, or
 * when the skb is cloned and the header is not writable. Otherwise the
 * result of INET_ECN_set_ce() is returned.
 */
BPF_CALL_1(bpf_skb_ecn_set_ce, struct sk_buff *, skb)
{
	unsigned int iphdr_len;

	switch (skb_protocol(skb, true)) {
	case cpu_to_be16(ETH_P_IP):
		iphdr_len = sizeof(struct iphdr);
		break;
	case cpu_to_be16(ETH_P_IPV6):
		iphdr_len = sizeof(struct ipv6hdr);
		break;
	default:
		return 0;
	}

	if (skb_headlen(skb) < iphdr_len)
		return 0;

	if (skb_cloned(skb) && !skb_clone_writable(skb, iphdr_len))
		return 0;

	return INET_ECN_set_ce(skb);
}

/* Decide whether an access of @size bytes at offset @off into
 * struct bpf_xdp_sock is allowed: the offset must lie in
 * [0, offsetofend(queue_id)), be a multiple of @size, and @size must be
 * sizeof(__u32). @type and @info are not consulted.
 */
bool bpf_xdp_sock_is_valid_access(int off, int size, enum bpf_access_type type,
				  struct bpf_insn_access_aux *info)
{
	if (off < 0 || off >= offsetofend(struct bpf_xdp_sock, queue_id))
		return false;

	if (off % size != 0)
		return false;

	switch (off) {
	default:
		return size == sizeof(__u32);
	}
}

/* Rewrite a load from struct bpf_xdp_sock into a load from
 * struct xdp_sock; returns the number of instructions emitted.
 */
u32 bpf_xdp_sock_convert_ctx_access(enum bpf_access_type type,
				    const struct bpf_insn *si,
				    struct bpf_insn *insn_buf,
				    struct bpf_prog *prog, u32 *target_size)
{
	struct bpf_insn *insn = insn_buf;

#define BPF_XDP_SOCK_GET(FIELD)						\
	do {								\
		BUILD_BUG_ON(sizeof_field(struct xdp_sock, FIELD)
>	\
			     sizeof_field(struct bpf_xdp_sock, FIELD));	\
		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_sock, FIELD),\
				      si->dst_reg, si->src_reg,		\
				      offsetof(struct xdp_sock, FIELD)); \
	} while (0)

	/* queue_id is the only field handled; any other offset emits
	 * nothing and 0 is returned.
	 */
	switch (si->off) {
	case offsetof(struct bpf_xdp_sock, queue_id):
		BPF_XDP_SOCK_GET(queue_id);
		break;
	}

	return insn - insn_buf;
}

static const struct bpf_func_proto bpf_skb_ecn_set_ce_proto = {
	.func           = bpf_skb_ecn_set_ce,
	.gpl_only       = false,
	.ret_type       = RET_INTEGER,
	.arg1_type      = ARG_PTR_TO_CTX,
};

/* Helper: check whether the given IP/TCP headers carry a valid SYN
 * cookie ACK for listening socket @sk.
 *
 * Returns 0 when the cookie check succeeds. Errors visible here:
 * -EINVAL for a NULL sk, a too-short th_len or iph_len, a socket that is
 * not a listening TCP socket, or syncookies disabled by sysctl;
 * -ENOENT when the TCP flags are not those of a plain ACK or when
 * tcp_synq_no_recent_overflow() reports no recent overflow.
 */
BPF_CALL_5(bpf_tcp_check_syncookie, struct sock *, sk, void *, iph, u32, iph_len,
	   struct tcphdr *, th, u32, th_len)
{
#ifdef CONFIG_SYN_COOKIES
	int ret;

	if (unlikely(!sk || th_len < sizeof(*th)))
		return -EINVAL;

	/* sk_listener() allows TCP_NEW_SYN_RECV, which makes no sense here. */
	if (sk->sk_protocol != IPPROTO_TCP || sk->sk_state != TCP_LISTEN)
		return -EINVAL;

	if (!READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_syncookies))
		return -EINVAL;

	if (!th->ack || th->rst || th->syn)
		return -ENOENT;

	if (unlikely(iph_len < sizeof(struct iphdr)))
		return -EINVAL;

	if (tcp_synq_no_recent_overflow(sk))
		return -ENOENT;

	/* Both struct iphdr and struct ipv6hdr have the version field at the
	 * same offset so we can cast to the shorter header (struct iphdr).
*/
	switch (((struct iphdr *)iph)->version) {
	case 4:
		if (sk->sk_family == AF_INET6 && ipv6_only_sock(sk))
			return -EINVAL;

		ret = __cookie_v4_check((struct iphdr *)iph, th);
		break;

#if IS_BUILTIN(CONFIG_IPV6)
	case 6:
		if (unlikely(iph_len < sizeof(struct ipv6hdr)))
			return -EINVAL;

		if (sk->sk_family != AF_INET6)
			return -EINVAL;

		ret = __cookie_v6_check((struct ipv6hdr *)iph, th);
		break;
#endif /* CONFIG_IPV6 */

	default:
		return -EPROTONOSUPPORT;
	}

	/* A positive return from the cookie check means "valid". */
	if (ret > 0)
		return 0;

	return -ENOENT;
#else
	/* NOTE(review): this path returns -ENOTSUPP while the !SYN_COOKIES
	 * path of bpf_tcp_gen_syncookie() returns -EOPNOTSUPP.
	 */
	return -ENOTSUPP;
#endif
}

static const struct bpf_func_proto bpf_tcp_check_syncookie_proto = {
	.func		= bpf_tcp_check_syncookie,
	.gpl_only	= true,
	.pkt_access	= true,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_BTF_ID_SOCK_COMMON,
	.arg2_type	= ARG_PTR_TO_MEM | MEM_RDONLY,
	.arg3_type	= ARG_CONST_SIZE,
	.arg4_type	= ARG_PTR_TO_MEM | MEM_RDONLY,
	.arg5_type	= ARG_CONST_SIZE,
};

/* Helper: generate a SYN cookie for the SYN described by @iph/@th on
 * listening socket @sk.
 *
 * Errors visible here: -EINVAL for a NULL sk, a th_len that is too short
 * or does not equal th->doff * 4, a socket that is not a listening TCP
 * socket, TCP flags other than a plain SYN, or a too-short iph_len;
 * -ENOENT when syncookies are disabled by sysctl.
 */
BPF_CALL_5(bpf_tcp_gen_syncookie, struct sock *, sk, void *, iph, u32, iph_len,
	   struct tcphdr *, th, u32, th_len)
{
#ifdef CONFIG_SYN_COOKIES
	u32 cookie;
	u16 mss;

	if (unlikely(!sk || th_len < sizeof(*th) || th_len != th->doff * 4))
		return -EINVAL;

	if (sk->sk_protocol != IPPROTO_TCP || sk->sk_state != TCP_LISTEN)
		return -EINVAL;

	if (!READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_syncookies))
		return -ENOENT;

	if (!th->syn || th->ack || th->fin || th->rst)
		return -EINVAL;

	if (unlikely(iph_len < sizeof(struct iphdr)))
		return -EINVAL;

	/* Both struct iphdr and struct ipv6hdr have the version field at the
	 * same offset so we can cast to the shorter header (struct iphdr).
*/
	switch (((struct iphdr *)iph)->version) {
	case 4:
		if (sk->sk_family == AF_INET6 && ipv6_only_sock(sk))
			return -EINVAL;

		mss = tcp_v4_get_syncookie(sk, iph, th, &cookie);
		break;

#if IS_BUILTIN(CONFIG_IPV6)
	case 6:
		if (unlikely(iph_len < sizeof(struct ipv6hdr)))
			return -EINVAL;

		if (sk->sk_family != AF_INET6)
			return -EINVAL;

		mss = tcp_v6_get_syncookie(sk, iph, th, &cookie);
		break;
#endif /* CONFIG_IPV6 */

	default:
		return -EPROTONOSUPPORT;
	}
	if (mss == 0)
		return -ENOENT;

	/* Result packs the cookie in the low 32 bits and the MSS above it. */
	return cookie | ((u64)mss << 32);
#else
	return -EOPNOTSUPP;
#endif /* CONFIG_SYN_COOKIES */
}

static const struct bpf_func_proto bpf_tcp_gen_syncookie_proto = {
	.func		= bpf_tcp_gen_syncookie,
	.gpl_only	= true, /* __cookie_v*_init_sequence() is GPL */
	.pkt_access	= true,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_BTF_ID_SOCK_COMMON,
	.arg2_type	= ARG_PTR_TO_MEM | MEM_RDONLY,
	.arg3_type	= ARG_CONST_SIZE,
	.arg4_type	= ARG_PTR_TO_MEM | MEM_RDONLY,
	.arg5_type	= ARG_CONST_SIZE,
};

/* Helper: attach @sk to @skb.
 *
 * Fails with -EINVAL for a NULL sk or non-zero flags, -EOPNOTSUPP when
 * the skb is not at tc ingress or the socket is unhashed, -ENETUNREACH
 * when the skb's device and the socket are in different netns, and
 * -ENOENT when a refcounted socket's refcount is already zero.
 * On success the skb is orphaned, skb->sk is set to @sk and sock_pfree
 * is installed as the destructor; 0 is returned.
 */
BPF_CALL_3(bpf_sk_assign, struct sk_buff *, skb, struct sock *, sk, u64, flags)
{
	if (!sk || flags != 0)
		return -EINVAL;
	if (!skb_at_tc_ingress(skb))
		return -EOPNOTSUPP;
	if (unlikely(dev_net(skb->dev) != sock_net(sk)))
		return -ENETUNREACH;
	if (sk_unhashed(sk))
		return -EOPNOTSUPP;
	if (sk_is_refcounted(sk) &&
	    unlikely(!refcount_inc_not_zero(&sk->sk_refcnt)))
		return -ENOENT;

	skb_orphan(skb);
	skb->sk = sk;
	skb->destructor = sock_pfree;

	return 0;
}

static const struct bpf_func_proto bpf_sk_assign_proto = {
	.func		= bpf_sk_assign,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	.arg1_type      = ARG_PTR_TO_CTX,
	.arg2_type      = ARG_PTR_TO_BTF_ID_SOCK_COMMON,
	.arg3_type	= ARG_ANYTHING,
};

/* Scan the TCP option bytes in [op, opend) for an option of kind
 * @search_kind, optionally also matching @magic_len bytes of @magic
 * against the start of the option payload.
 *
 * Returns a pointer to the matching option, ERR_PTR(-ENOMSG) when none
 * is found (with *eol set to true if the scan stopped at TCPOPT_EOL),
 * or ERR_PTR(-EFAULT) when an option's length byte is inconsistent.
 */
static const u8 *bpf_search_tcp_opt(const u8 *op, const u8 *opend,
				    u8 search_kind, const u8 *magic,
				    u8 magic_len, bool *eol)
{
	u8 kind, kind_len;

	*eol = false;

	while (op < opend) {
		kind = op[0];

		if (kind == TCPOPT_EOL) {
			*eol = true;
			return ERR_PTR(-ENOMSG);
		} else if (kind == TCPOPT_NOP) {
			op++;
			continue;
		}

		if (opend - op < 2 || opend - op < op[1] || op[1] <
2)
			/* Something is wrong in the received header.
			 * Follow the TCP stack's tcp_parse_options()
			 * and just bail here.
			 */
			return ERR_PTR(-EFAULT);

		kind_len = op[1];
		if (search_kind == kind) {
			if (!magic_len)
				return op;

			/* kind_len is at least 2 here, so kind_len - 2 is
			 * the payload length available for the magic.
			 */
			if (magic_len > kind_len - 2)
				return ERR_PTR(-ENOMSG);

			if (!memcmp(&op[2], magic, magic_len))
				return op;
		}

		op += kind_len;
	}

	return ERR_PTR(-ENOMSG);
}

/* Helper: find a TCP header option and copy it into @search_res.
 *
 * On entry @search_res[0] is the option kind to look for and
 * @search_res[1] a length: it must be 0 for ordinary kinds, and 4 or 6
 * for the experimental kinds (TCPOPT_EXP and 253), in which case the
 * bytes after it are the magic to match. BPF_LOAD_HDR_OPT_TCP_SYN is
 * the only flag accepted.
 */
BPF_CALL_4(bpf_sock_ops_load_hdr_opt, struct bpf_sock_ops_kern *, bpf_sock,
	   void *, search_res, u32, len, u64, flags)
{
	bool eol, load_syn = flags & BPF_LOAD_HDR_OPT_TCP_SYN;
	const u8 *op, *opend, *magic, *search = search_res;
	u8 search_kind, search_len, copy_len, magic_len;
	int ret;

	/* 2 byte is the minimal option len except TCPOPT_NOP and
	 * TCPOPT_EOL which are useless for the bpf prog to learn
	 * and this helper disallow loading them also.
	 */
	if (len < 2 || flags & ~BPF_LOAD_HDR_OPT_TCP_SYN)
		return -EINVAL;

	search_kind = search[0];
	search_len = search[1];

	if (search_len > len || search_kind == TCPOPT_NOP ||
	    search_kind == TCPOPT_EOL)
		return -EINVAL;

	if (search_kind == TCPOPT_EXP || search_kind == 253) {
		/* 16 or 32 bit magic.
+2 for kind and kind length */
		if (search_len != 4 && search_len != 6)
			return -EINVAL;
		magic = &search[2];
		magic_len = search_len - 2;
	} else {
		if (search_len)
			return -EINVAL;
		magic = NULL;
		magic_len = 0;
	}

	if (load_syn) {
		/* Search the saved SYN: ret is its length and op its start;
		 * the options begin after the fixed TCP header.
		 */
		ret = bpf_sock_ops_get_syn(bpf_sock, TCP_BPF_SYN, &op);
		if (ret < 0)
			return ret;

		opend = op + ret;
		op += sizeof(struct tcphdr);
	} else {
		if (!bpf_sock->skb ||
		    bpf_sock->op == BPF_SOCK_OPS_HDR_OPT_LEN_CB)
			/* This bpf_sock->op cannot call this helper */
			return -EPERM;

		opend = bpf_sock->skb_data_end;
		op = bpf_sock->skb->data + sizeof(struct tcphdr);
	}

	op = bpf_search_tcp_opt(op, opend, search_kind, magic, magic_len,
				&eol);
	if (IS_ERR(op))
		return PTR_ERR(op);

	/* Return the option's own length; if the buffer is smaller, copy
	 * what fits and report -ENOSPC instead.
	 */
	copy_len = op[1];
	ret = copy_len;
	if (copy_len > len) {
		ret = -ENOSPC;
		copy_len = len;
	}

	memcpy(search_res, op, copy_len);
	return ret;
}

static const struct bpf_func_proto bpf_sock_ops_load_hdr_opt_proto = {
	.func		= bpf_sock_ops_load_hdr_opt,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_PTR_TO_MEM | MEM_WRITE,
	.arg3_type	= ARG_CONST_SIZE,
	.arg4_type	= ARG_ANYTHING,
};

/* Helper: append the TCP option at @from to the header being written.
 *
 * Only allowed when bpf_sock->op is BPF_SOCK_OPS_WRITE_HDR_OPT_CB
 * (-EPERM otherwise). @from[0] is the option kind and @from[1] its
 * length, which must not exceed @len (-EINVAL) nor the remaining option
 * space (-ENOSPC). NOP and EOL kinds and any non-zero flags are rejected.
 */
BPF_CALL_4(bpf_sock_ops_store_hdr_opt, struct bpf_sock_ops_kern *, bpf_sock,
	   const void *, from, u32, len, u64, flags)
{
	u8 new_kind, new_kind_len, magic_len = 0, *opend;
	const u8 *op, *new_op, *magic = NULL;
	struct sk_buff *skb;
	bool eol;

	if (bpf_sock->op != BPF_SOCK_OPS_WRITE_HDR_OPT_CB)
		return -EPERM;

	if (len < 2 || flags)
		return -EINVAL;

	new_op = from;
	new_kind = new_op[0];
	new_kind_len = new_op[1];

	if (new_kind_len > len || new_kind == TCPOPT_NOP ||
	    new_kind == TCPOPT_EOL)
		return -EINVAL;

	if (new_kind_len > bpf_sock->remaining_opt_len)
		return -ENOSPC;

	/* 253 is another experimental kind */
	if (new_kind == TCPOPT_EXP || new_kind == 253)  {
		if (new_kind_len < 4)
			return -EINVAL;
		/* Match for the 2 byte magic also.
		 * RFC 6994: the magic could be 2 or 4 bytes.
* Hence, matching by 2 byte only is on the
		 * conservative side but it is the right
		 * thing to do for the 'search-for-duplication'
		 * purpose.
		 */
		magic = &new_op[2];
		magic_len = 2;
	}

	/* Check for duplication */
	skb = bpf_sock->skb;
	op = skb->data + sizeof(struct tcphdr);
	opend = bpf_sock->skb_data_end;

	op = bpf_search_tcp_opt(op, opend, new_kind, magic, magic_len,
				&eol);
	/* A successful search means the option is already present. */
	if (!IS_ERR(op))
		return -EEXIST;

	if (PTR_ERR(op) != -ENOMSG)
		return PTR_ERR(op);

	if (eol)
		/* The option has been ended.  Treat it as no more
		 * header option can be written.
		 */
		return -ENOSPC;

	/* No duplication found.  Store the header option. */
	memcpy(opend, from, new_kind_len);

	bpf_sock->remaining_opt_len -= new_kind_len;
	bpf_sock->skb_data_end += new_kind_len;

	return 0;
}

static const struct bpf_func_proto bpf_sock_ops_store_hdr_opt_proto = {
	.func		= bpf_sock_ops_store_hdr_opt,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_PTR_TO_MEM | MEM_RDONLY,
	.arg3_type	= ARG_CONST_SIZE,
	.arg4_type	= ARG_ANYTHING,
};

/* Helper: reserve @len bytes of TCP option space.
 *
 * Only allowed when bpf_sock->op is BPF_SOCK_OPS_HDR_OPT_LEN_CB (-EPERM
 * otherwise). Returns -EINVAL for non-zero flags or len < 2, -ENOSPC
 * when @len exceeds remaining_opt_len; on success remaining_opt_len is
 * reduced by @len and 0 is returned.
 */
BPF_CALL_3(bpf_sock_ops_reserve_hdr_opt, struct bpf_sock_ops_kern *, bpf_sock,
	   u32, len, u64, flags)
{
	if (bpf_sock->op != BPF_SOCK_OPS_HDR_OPT_LEN_CB)
		return -EPERM;

	if (flags || len < 2)
		return -EINVAL;

	if (len > bpf_sock->remaining_opt_len)
		return -ENOSPC;

	bpf_sock->remaining_opt_len -= len;

	return 0;
}

static const struct bpf_func_proto bpf_sock_ops_reserve_hdr_opt_proto = {
	.func		= bpf_sock_ops_reserve_hdr_opt,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_ANYTHING,
	.arg3_type	= ARG_ANYTHING,
};

/* Helper: set skb->tstamp and skb->tstamp_type.
 *
 * Returns -EOPNOTSUPP unless skb->protocol is IPv4 or IPv6, -EINVAL for
 * an unknown @tstamp_type or for a zero @tstamp with the MONOTONIC or
 * TAI clock types; a zero @tstamp is accepted for REALTIME.
 */
BPF_CALL_3(bpf_skb_set_tstamp, struct sk_buff *, skb,
	   u64, tstamp, u32, tstamp_type)
{
	/* skb_clear_delivery_time() is done for inet protocol */
	if (skb->protocol != htons(ETH_P_IP) &&
	    skb->protocol != htons(ETH_P_IPV6))
		return -EOPNOTSUPP;

	switch (tstamp_type) {
	case BPF_SKB_CLOCK_REALTIME:
		skb->tstamp = tstamp;
		skb->tstamp_type = SKB_CLOCK_REALTIME;
		break;
	case BPF_SKB_CLOCK_MONOTONIC:
		if (!tstamp)
			return -EINVAL;
skb->tstamp = tstamp;
		skb->tstamp_type = SKB_CLOCK_MONOTONIC;
		break;
	case BPF_SKB_CLOCK_TAI:
		if (!tstamp)
			return -EINVAL;
		skb->tstamp = tstamp;
		skb->tstamp_type = SKB_CLOCK_TAI;
		break;
	default:
		return -EINVAL;
	}

	return 0;
}

static const struct bpf_func_proto bpf_skb_set_tstamp_proto = {
	.func           = bpf_skb_set_tstamp,
	.gpl_only       = false,
	.ret_type       = RET_INTEGER,
	.arg1_type      = ARG_PTR_TO_CTX,
	.arg2_type      = ARG_ANYTHING,
	.arg3_type      = ARG_ANYTHING,
};

#ifdef CONFIG_SYN_COOKIES
/* Helper: generate an IPv4 SYN cookie from raw headers, without a
 * socket argument.
 *
 * Returns -EINVAL when th_len is shorter than a TCP header or differs
 * from th->doff * 4. Otherwise returns the cookie in the low 32 bits
 * and the MSS in the bits above; the MSS is taken from the MSS option,
 * falling back to TCP_MSS_DEFAULT when tcp_parse_mss_option() yields 0.
 */
BPF_CALL_3(bpf_tcp_raw_gen_syncookie_ipv4, struct iphdr *, iph,
	   struct tcphdr *, th, u32, th_len)
{
	u32 cookie;
	u16 mss;

	if (unlikely(th_len < sizeof(*th) || th_len != th->doff * 4))
		return -EINVAL;

	mss = tcp_parse_mss_option(th, 0) ?: TCP_MSS_DEFAULT;
	cookie = __cookie_v4_init_sequence(iph, th, &mss);

	return cookie | ((u64)mss << 32);
}

static const struct bpf_func_proto bpf_tcp_raw_gen_syncookie_ipv4_proto = {
	.func		= bpf_tcp_raw_gen_syncookie_ipv4,
	.gpl_only	= true, /* __cookie_v4_init_sequence() is GPL */
	.pkt_access	= true,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_FIXED_SIZE_MEM,
	.arg1_size	= sizeof(struct iphdr),
	.arg2_type	= ARG_PTR_TO_MEM,
	.arg3_type	= ARG_CONST_SIZE_OR_ZERO,
};

/* Helper: IPv6 counterpart of bpf_tcp_raw_gen_syncookie_ipv4().
 *
 * The fallback MSS is IPV6_MIN_MTU minus the TCP and IPv6 header sizes.
 * Returns -EPROTONOSUPPORT when IPv6 is not built in.
 */
BPF_CALL_3(bpf_tcp_raw_gen_syncookie_ipv6, struct ipv6hdr *, iph,
	   struct tcphdr *, th, u32, th_len)
{
#if IS_BUILTIN(CONFIG_IPV6)
	const u16 mss_clamp = IPV6_MIN_MTU - sizeof(struct tcphdr) -
		sizeof(struct ipv6hdr);
	u32 cookie;
	u16 mss;

	if (unlikely(th_len < sizeof(*th) || th_len != th->doff * 4))
		return -EINVAL;

	mss = tcp_parse_mss_option(th, 0) ?: mss_clamp;
	cookie = __cookie_v6_init_sequence(iph, th, &mss);

	return cookie | ((u64)mss << 32);
#else
	return -EPROTONOSUPPORT;
#endif
}

static const struct bpf_func_proto bpf_tcp_raw_gen_syncookie_ipv6_proto = {
	.func		= bpf_tcp_raw_gen_syncookie_ipv6,
	.gpl_only	= true, /* __cookie_v6_init_sequence() is GPL */
	.pkt_access	= true,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_FIXED_SIZE_MEM,
	.arg1_size	= sizeof(struct ipv6hdr),
	.arg2_type	= ARG_PTR_TO_MEM,
	.arg3_type
= ARG_CONST_SIZE_OR_ZERO,
};

/* Helper: validate an IPv4 SYN cookie from raw headers.
 * Returns 0 when __cookie_v4_check() returns a positive value, -EACCES
 * otherwise.
 */
BPF_CALL_2(bpf_tcp_raw_check_syncookie_ipv4, struct iphdr *, iph,
	   struct tcphdr *, th)
{
	if (__cookie_v4_check(iph, th) > 0)
		return 0;

	return -EACCES;
}

static const struct bpf_func_proto bpf_tcp_raw_check_syncookie_ipv4_proto = {
	.func		= bpf_tcp_raw_check_syncookie_ipv4,
	.gpl_only	= true, /* __cookie_v4_check is GPL */
	.pkt_access	= true,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_FIXED_SIZE_MEM,
	.arg1_size	= sizeof(struct iphdr),
	.arg2_type	= ARG_PTR_TO_FIXED_SIZE_MEM,
	.arg2_size	= sizeof(struct tcphdr),
};

/* Helper: IPv6 counterpart of bpf_tcp_raw_check_syncookie_ipv4().
 * Returns -EPROTONOSUPPORT when IPv6 is not built in.
 */
BPF_CALL_2(bpf_tcp_raw_check_syncookie_ipv6, struct ipv6hdr *, iph,
	   struct tcphdr *, th)
{
#if IS_BUILTIN(CONFIG_IPV6)
	if (__cookie_v6_check(iph, th) > 0)
		return 0;

	return -EACCES;
#else
	return -EPROTONOSUPPORT;
#endif
}

static const struct bpf_func_proto bpf_tcp_raw_check_syncookie_ipv6_proto = {
	.func		= bpf_tcp_raw_check_syncookie_ipv6,
	.gpl_only	= true, /* __cookie_v6_check is GPL */
	.pkt_access	= true,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_FIXED_SIZE_MEM,
	.arg1_size	= sizeof(struct ipv6hdr),
	.arg2_type	= ARG_PTR_TO_FIXED_SIZE_MEM,
	.arg2_size	= sizeof(struct tcphdr),
};
#endif /* CONFIG_SYN_COOKIES */

#endif /* CONFIG_INET */

/* Report whether helper @func_id is one of the helpers listed below as
 * changing packet data; returns false for every id not listed.
 */
bool bpf_helper_changes_pkt_data(enum bpf_func_id func_id)
{
	switch (func_id) {
	case BPF_FUNC_clone_redirect:
	case BPF_FUNC_l3_csum_replace:
	case BPF_FUNC_l4_csum_replace:
	case BPF_FUNC_lwt_push_encap:
	case BPF_FUNC_lwt_seg6_action:
	case BPF_FUNC_lwt_seg6_adjust_srh:
	case BPF_FUNC_lwt_seg6_store_bytes:
	case BPF_FUNC_msg_pop_data:
	case BPF_FUNC_msg_pull_data:
	case BPF_FUNC_msg_push_data:
	case BPF_FUNC_skb_adjust_room:
	case BPF_FUNC_skb_change_head:
	case BPF_FUNC_skb_change_proto:
	case BPF_FUNC_skb_change_tail:
	case BPF_FUNC_skb_pull_data:
	case BPF_FUNC_skb_store_bytes:
	case BPF_FUNC_skb_vlan_pop:
	case BPF_FUNC_skb_vlan_push:
	case BPF_FUNC_store_hdr_opt:
	case BPF_FUNC_xdp_adjust_head:
	case BPF_FUNC_xdp_adjust_meta:
	case BPF_FUNC_xdp_adjust_tail:
	/* tail-called program could call any of the
above */
	case BPF_FUNC_tail_call:
		return true;
	default:
		return false;
	}
}

/* Declared __weak here; the definitions are not part of this chunk. */
const struct bpf_func_proto bpf_event_output_data_proto __weak;
const struct bpf_func_proto bpf_sk_storage_get_cg_sock_proto __weak;

/* Resolve @func_id to a helper proto.
 *
 * cgroup_common_func_proto() and cgroup_current_func_proto() are tried
 * first; a non-NULL answer from either wins. The switch then handles the
 * ids listed, and anything else falls through to bpf_base_func_proto().
 */
static const struct bpf_func_proto *
sock_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
	const struct bpf_func_proto *func_proto;

	func_proto = cgroup_common_func_proto(func_id, prog);
	if (func_proto)
		return func_proto;

	func_proto = cgroup_current_func_proto(func_id, prog);
	if (func_proto)
		return func_proto;

	switch (func_id) {
	case BPF_FUNC_get_socket_cookie:
		return &bpf_get_socket_cookie_sock_proto;
	case BPF_FUNC_get_netns_cookie:
		return &bpf_get_netns_cookie_sock_proto;
	case BPF_FUNC_perf_event_output:
		return &bpf_event_output_data_proto;
	case BPF_FUNC_sk_storage_get:
		return &bpf_sk_storage_get_cg_sock_proto;
	case BPF_FUNC_ktime_get_coarse_ns:
		return &bpf_ktime_get_coarse_ns_proto;
	default:
		return bpf_base_func_proto(func_id, prog);
	}
}

/* Resolve @func_id to a helper proto, using the sock_addr flavours of
 * the socket helpers.
 *
 * The two cgroup resolvers are tried first. Some ids additionally depend
 * on prog->expected_attach_type and resolve to NULL for attach types not
 * listed. Unhandled ids go to bpf_sk_base_func_proto().
 */
static const struct bpf_func_proto *
sock_addr_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
	const struct bpf_func_proto *func_proto;

	func_proto = cgroup_common_func_proto(func_id, prog);
	if (func_proto)
		return func_proto;

	func_proto = cgroup_current_func_proto(func_id, prog);
	if (func_proto)
		return func_proto;

	switch (func_id) {
	case BPF_FUNC_bind:
		/* bind() is only offered to the connect attach types. */
		switch (prog->expected_attach_type) {
		case BPF_CGROUP_INET4_CONNECT:
		case BPF_CGROUP_INET6_CONNECT:
			return &bpf_bind_proto;
		default:
			return NULL;
		}
	case BPF_FUNC_get_socket_cookie:
		return &bpf_get_socket_cookie_sock_addr_proto;
	case BPF_FUNC_get_netns_cookie:
		return &bpf_get_netns_cookie_sock_addr_proto;
	case BPF_FUNC_perf_event_output:
		return &bpf_event_output_data_proto;
#ifdef CONFIG_INET
	case BPF_FUNC_sk_lookup_tcp:
		return &bpf_sock_addr_sk_lookup_tcp_proto;
	case BPF_FUNC_sk_lookup_udp:
		return &bpf_sock_addr_sk_lookup_udp_proto;
	case BPF_FUNC_sk_release:
		return &bpf_sk_release_proto;
	case BPF_FUNC_skc_lookup_tcp:
		return
&bpf_sock_addr_skc_lookup_tcp_proto;
#endif /* CONFIG_INET */
	case BPF_FUNC_sk_storage_get:
		return &bpf_sk_storage_get_proto;
	case BPF_FUNC_sk_storage_delete:
		return &bpf_sk_storage_delete_proto;
	case BPF_FUNC_setsockopt:
		/* setsockopt is offered only to the attach types listed;
		 * any other attach type gets NULL.
		 */
		switch (prog->expected_attach_type) {
		case BPF_CGROUP_INET4_BIND:
		case BPF_CGROUP_INET6_BIND:
		case BPF_CGROUP_INET4_CONNECT:
		case BPF_CGROUP_INET6_CONNECT:
		case BPF_CGROUP_UNIX_CONNECT:
		case BPF_CGROUP_UDP4_RECVMSG:
		case BPF_CGROUP_UDP6_RECVMSG:
		case BPF_CGROUP_UNIX_RECVMSG:
		case BPF_CGROUP_UDP4_SENDMSG:
		case BPF_CGROUP_UDP6_SENDMSG:
		case BPF_CGROUP_UNIX_SENDMSG:
		case BPF_CGROUP_INET4_GETPEERNAME:
		case BPF_CGROUP_INET6_GETPEERNAME:
		case BPF_CGROUP_UNIX_GETPEERNAME:
		case BPF_CGROUP_INET4_GETSOCKNAME:
		case BPF_CGROUP_INET6_GETSOCKNAME:
		case BPF_CGROUP_UNIX_GETSOCKNAME:
			return &bpf_sock_addr_setsockopt_proto;
		default:
			return NULL;
		}
	case BPF_FUNC_getsockopt:
		/* Same attach-type list as for setsockopt above. */
		switch (prog->expected_attach_type) {
		case BPF_CGROUP_INET4_BIND:
		case BPF_CGROUP_INET6_BIND:
		case BPF_CGROUP_INET4_CONNECT:
		case BPF_CGROUP_INET6_CONNECT:
		case BPF_CGROUP_UNIX_CONNECT:
		case BPF_CGROUP_UDP4_RECVMSG:
		case BPF_CGROUP_UDP6_RECVMSG:
		case BPF_CGROUP_UNIX_RECVMSG:
		case BPF_CGROUP_UDP4_SENDMSG:
		case BPF_CGROUP_UDP6_SENDMSG:
		case BPF_CGROUP_UNIX_SENDMSG:
		case BPF_CGROUP_INET4_GETPEERNAME:
		case BPF_CGROUP_INET6_GETPEERNAME:
		case BPF_CGROUP_UNIX_GETPEERNAME:
		case BPF_CGROUP_INET4_GETSOCKNAME:
		case BPF_CGROUP_INET6_GETSOCKNAME:
		case BPF_CGROUP_UNIX_GETSOCKNAME:
			return &bpf_sock_addr_getsockopt_proto;
		default:
			return NULL;
		}
	default:
		return bpf_sk_base_func_proto(func_id, prog);
	}
}

/* Resolve @func_id to a helper proto for the ids listed below; every
 * other id is delegated to bpf_sk_base_func_proto(). This function is
 * also the fallback of cg_skb_func_proto().
 */
static const struct bpf_func_proto *
sk_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
	switch (func_id) {
	case BPF_FUNC_skb_load_bytes:
		return &bpf_skb_load_bytes_proto;
	case BPF_FUNC_skb_load_bytes_relative:
		return &bpf_skb_load_bytes_relative_proto;
	case BPF_FUNC_get_socket_cookie:
		return &bpf_get_socket_cookie_proto;
	case BPF_FUNC_get_socket_uid:
		return &bpf_get_socket_uid_proto;
case BPF_FUNC_perf_event_output:
		return &bpf_skb_event_output_proto;
	default:
		return bpf_sk_base_func_proto(func_id, prog);
	}
}

/* Declared __weak here; the definitions are not part of this chunk. */
const struct bpf_func_proto bpf_sk_storage_get_proto __weak;
const struct bpf_func_proto bpf_sk_storage_delete_proto __weak;

/* Resolve @func_id to a helper proto.
 *
 * cgroup_common_func_proto() is consulted first. The switch then adds
 * socket-storage, cgroup-id (CONFIG_SOCK_CGROUP_DATA) and socket-lookup
 * (CONFIG_INET) helpers; everything else falls back to
 * sk_filter_func_proto().
 */
static const struct bpf_func_proto *
cg_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
	const struct bpf_func_proto *func_proto;

	func_proto = cgroup_common_func_proto(func_id, prog);
	if (func_proto)
		return func_proto;

	switch (func_id) {
	case BPF_FUNC_sk_fullsock:
		return &bpf_sk_fullsock_proto;
	case BPF_FUNC_sk_storage_get:
		return &bpf_sk_storage_get_proto;
	case BPF_FUNC_sk_storage_delete:
		return &bpf_sk_storage_delete_proto;
	case BPF_FUNC_perf_event_output:
		return &bpf_skb_event_output_proto;
#ifdef CONFIG_SOCK_CGROUP_DATA
	case BPF_FUNC_skb_cgroup_id:
		return &bpf_skb_cgroup_id_proto;
	case BPF_FUNC_skb_ancestor_cgroup_id:
		return &bpf_skb_ancestor_cgroup_id_proto;
	case BPF_FUNC_sk_cgroup_id:
		return &bpf_sk_cgroup_id_proto;
	case BPF_FUNC_sk_ancestor_cgroup_id:
		return &bpf_sk_ancestor_cgroup_id_proto;
#endif
#ifdef CONFIG_INET
	/* The plain skb lookup helpers are used here, not the tc ones. */
	case BPF_FUNC_sk_lookup_tcp:
		return &bpf_sk_lookup_tcp_proto;
	case BPF_FUNC_sk_lookup_udp:
		return &bpf_sk_lookup_udp_proto;
	case BPF_FUNC_sk_release:
		return &bpf_sk_release_proto;
	case BPF_FUNC_skc_lookup_tcp:
		return &bpf_skc_lookup_tcp_proto;
	case BPF_FUNC_tcp_sock:
		return &bpf_tcp_sock_proto;
	case BPF_FUNC_get_listener_sock:
		return &bpf_get_listener_sock_proto;
	case BPF_FUNC_skb_ecn_set_ce:
		return &bpf_skb_ecn_set_ce_proto;
#endif
	default:
		return sk_filter_func_proto(func_id, prog);
	}
}

/* Resolve @func_id to a helper proto, using the tc flavours of the
 * socket lookup helpers; ids not listed in the switch are delegated to
 * bpf_sk_base_func_proto().
 */
static const struct bpf_func_proto *
tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
	switch (func_id) {
	case BPF_FUNC_skb_store_bytes:
		return &bpf_skb_store_bytes_proto;
	case BPF_FUNC_skb_load_bytes:
		return &bpf_skb_load_bytes_proto;
	case BPF_FUNC_skb_load_bytes_relative:
		return &bpf_skb_load_bytes_relative_proto;
	case BPF_FUNC_skb_pull_data:
		return
&bpf_skb_pull_data_proto;
	case BPF_FUNC_csum_diff:
		return &bpf_csum_diff_proto;
	case BPF_FUNC_csum_update:
		return &bpf_csum_update_proto;
	case BPF_FUNC_csum_level:
		return &bpf_csum_level_proto;
	case BPF_FUNC_l3_csum_replace:
		return &bpf_l3_csum_replace_proto;
	case BPF_FUNC_l4_csum_replace:
		return &bpf_l4_csum_replace_proto;
	case BPF_FUNC_clone_redirect:
		return &bpf_clone_redirect_proto;
	case BPF_FUNC_get_cgroup_classid:
		return &bpf_get_cgroup_classid_proto;
	case BPF_FUNC_skb_vlan_push:
		return &bpf_skb_vlan_push_proto;
	case BPF_FUNC_skb_vlan_pop:
		return &bpf_skb_vlan_pop_proto;
	case BPF_FUNC_skb_change_proto:
		return &bpf_skb_change_proto_proto;
	case BPF_FUNC_skb_change_type:
		return &bpf_skb_change_type_proto;
	case BPF_FUNC_skb_adjust_room:
		return &bpf_skb_adjust_room_proto;
	case BPF_FUNC_skb_change_tail:
		return &bpf_skb_change_tail_proto;
	case BPF_FUNC_skb_change_head:
		return &bpf_skb_change_head_proto;
	case BPF_FUNC_skb_get_tunnel_key:
		return &bpf_skb_get_tunnel_key_proto;
	/* The two set_tunnel helpers are resolved at call time through
	 * bpf_get_skb_set_tunnel_proto() instead of a fixed proto address.
	 */
	case BPF_FUNC_skb_set_tunnel_key:
		return bpf_get_skb_set_tunnel_proto(func_id);
	case BPF_FUNC_skb_get_tunnel_opt:
		return &bpf_skb_get_tunnel_opt_proto;
	case BPF_FUNC_skb_set_tunnel_opt:
		return bpf_get_skb_set_tunnel_proto(func_id);
	case BPF_FUNC_redirect:
		return &bpf_redirect_proto;
	case BPF_FUNC_redirect_neigh:
		return &bpf_redirect_neigh_proto;
	case BPF_FUNC_redirect_peer:
		return &bpf_redirect_peer_proto;
	case BPF_FUNC_get_route_realm:
		return &bpf_get_route_realm_proto;
	case BPF_FUNC_get_hash_recalc:
		return &bpf_get_hash_recalc_proto;
	case BPF_FUNC_set_hash_invalid:
		return &bpf_set_hash_invalid_proto;
	case BPF_FUNC_set_hash:
		return &bpf_set_hash_proto;
	case BPF_FUNC_perf_event_output:
		return &bpf_skb_event_output_proto;
	case BPF_FUNC_get_smp_processor_id:
		return &bpf_get_smp_processor_id_proto;
	case BPF_FUNC_skb_under_cgroup:
		return &bpf_skb_under_cgroup_proto;
	case BPF_FUNC_get_socket_cookie:
		return &bpf_get_socket_cookie_proto;
	case BPF_FUNC_get_netns_cookie:
		return
&bpf_get_netns_cookie_proto; case BPF_FUNC_get_socket_uid: return &bpf_get_socket_uid_proto; case BPF_FUNC_fib_lookup: return &bpf_skb_fib_lookup_proto; case BPF_FUNC_check_mtu: return &bpf_skb_check_mtu_proto; case BPF_FUNC_sk_fullsock: return &bpf_sk_fullsock_proto; case BPF_FUNC_sk_storage_get: return &bpf_sk_storage_get_proto; case BPF_FUNC_sk_storage_delete: return &bpf_sk_storage_delete_proto; #ifdef CONFIG_XFRM case BPF_FUNC_skb_get_xfrm_state: return &bpf_skb_get_xfrm_state_proto; #endif #ifdef CONFIG_CGROUP_NET_CLASSID case BPF_FUNC_skb_cgroup_classid: return &bpf_skb_cgroup_classid_proto; #endif #ifdef CONFIG_SOCK_CGROUP_DATA case BPF_FUNC_skb_cgroup_id: return &bpf_skb_cgroup_id_proto; case BPF_FUNC_skb_ancestor_cgroup_id: return &bpf_skb_ancestor_cgroup_id_proto; #endif #ifdef CONFIG_INET case BPF_FUNC_sk_lookup_tcp: return &bpf_tc_sk_lookup_tcp_proto; case BPF_FUNC_sk_lookup_udp: return &bpf_tc_sk_lookup_udp_proto; case BPF_FUNC_sk_release: return &bpf_sk_release_proto; case BPF_FUNC_tcp_sock: return &bpf_tcp_sock_proto; case BPF_FUNC_get_listener_sock: return &bpf_get_listener_sock_proto; case BPF_FUNC_skc_lookup_tcp: return &bpf_tc_skc_lookup_tcp_proto; case BPF_FUNC_tcp_check_syncookie: return &bpf_tcp_check_syncookie_proto; case BPF_FUNC_skb_ecn_set_ce: return &bpf_skb_ecn_set_ce_proto; case BPF_FUNC_tcp_gen_syncookie: return &bpf_tcp_gen_syncookie_proto; case BPF_FUNC_sk_assign: return &bpf_sk_assign_proto; case BPF_FUNC_skb_set_tstamp: return &bpf_skb_set_tstamp_proto; #ifdef CONFIG_SYN_COOKIES case BPF_FUNC_tcp_raw_gen_syncookie_ipv4: return &bpf_tcp_raw_gen_syncookie_ipv4_proto; case BPF_FUNC_tcp_raw_gen_syncookie_ipv6: return &bpf_tcp_raw_gen_syncookie_ipv6_proto; case BPF_FUNC_tcp_raw_check_syncookie_ipv4: return &bpf_tcp_raw_check_syncookie_ipv4_proto; case BPF_FUNC_tcp_raw_check_syncookie_ipv6: return &bpf_tcp_raw_check_syncookie_ipv6_proto; #endif #endif default: return bpf_sk_base_func_proto(func_id, prog); } } static const struct 
bpf_func_proto * xdp_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_perf_event_output: return &bpf_xdp_event_output_proto; case BPF_FUNC_get_smp_processor_id: return &bpf_get_smp_processor_id_proto; case BPF_FUNC_csum_diff: return &bpf_csum_diff_proto; case BPF_FUNC_xdp_adjust_head: return &bpf_xdp_adjust_head_proto; case BPF_FUNC_xdp_adjust_meta: return &bpf_xdp_adjust_meta_proto; case BPF_FUNC_redirect: return &bpf_xdp_redirect_proto; case BPF_FUNC_redirect_map: return &bpf_xdp_redirect_map_proto; case BPF_FUNC_xdp_adjust_tail: return &bpf_xdp_adjust_tail_proto; case BPF_FUNC_xdp_get_buff_len: return &bpf_xdp_get_buff_len_proto; case BPF_FUNC_xdp_load_bytes: return &bpf_xdp_load_bytes_proto; case BPF_FUNC_xdp_store_bytes: return &bpf_xdp_store_bytes_proto; case BPF_FUNC_fib_lookup: return &bpf_xdp_fib_lookup_proto; case BPF_FUNC_check_mtu: return &bpf_xdp_check_mtu_proto; #ifdef CONFIG_INET case BPF_FUNC_sk_lookup_udp: return &bpf_xdp_sk_lookup_udp_proto; case BPF_FUNC_sk_lookup_tcp: return &bpf_xdp_sk_lookup_tcp_proto; case BPF_FUNC_sk_release: return &bpf_sk_release_proto; case BPF_FUNC_skc_lookup_tcp: return &bpf_xdp_skc_lookup_tcp_proto; case BPF_FUNC_tcp_check_syncookie: return &bpf_tcp_check_syncookie_proto; case BPF_FUNC_tcp_gen_syncookie: return &bpf_tcp_gen_syncookie_proto; #ifdef CONFIG_SYN_COOKIES case BPF_FUNC_tcp_raw_gen_syncookie_ipv4: return &bpf_tcp_raw_gen_syncookie_ipv4_proto; case BPF_FUNC_tcp_raw_gen_syncookie_ipv6: return &bpf_tcp_raw_gen_syncookie_ipv6_proto; case BPF_FUNC_tcp_raw_check_syncookie_ipv4: return &bpf_tcp_raw_check_syncookie_ipv4_proto; case BPF_FUNC_tcp_raw_check_syncookie_ipv6: return &bpf_tcp_raw_check_syncookie_ipv6_proto; #endif #endif default: return bpf_sk_base_func_proto(func_id, prog); } #if IS_MODULE(CONFIG_NF_CONNTRACK) && IS_ENABLED(CONFIG_DEBUG_INFO_BTF_MODULES) /* The nf_conn___init type is used in the NF_CONNTRACK kfuncs. 
The * kfuncs are defined in two different modules, and we want to be able * to use them interchangeably with the same BTF type ID. Because modules * can't de-duplicate BTF IDs between each other, we need the type to be * referenced in the vmlinux BTF or the verifier will get confused about * the different types. So we add this dummy type reference which will * be included in vmlinux BTF, allowing both modules to refer to the * same type ID. */ BTF_TYPE_EMIT(struct nf_conn___init); #endif } const struct bpf_func_proto bpf_sock_map_update_proto __weak; const struct bpf_func_proto bpf_sock_hash_update_proto __weak; static const struct bpf_func_proto * sock_ops_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { const struct bpf_func_proto *func_proto; func_proto = cgroup_common_func_proto(func_id, prog); if (func_proto) return func_proto; switch (func_id) { case BPF_FUNC_setsockopt: return &bpf_sock_ops_setsockopt_proto; case BPF_FUNC_getsockopt: return &bpf_sock_ops_getsockopt_proto; case BPF_FUNC_sock_ops_cb_flags_set: return &bpf_sock_ops_cb_flags_set_proto; case BPF_FUNC_sock_map_update: return &bpf_sock_map_update_proto; case BPF_FUNC_sock_hash_update: return &bpf_sock_hash_update_proto; case BPF_FUNC_get_socket_cookie: return &bpf_get_socket_cookie_sock_ops_proto; case BPF_FUNC_perf_event_output: return &bpf_event_output_data_proto; case BPF_FUNC_sk_storage_get: return &bpf_sk_storage_get_proto; case BPF_FUNC_sk_storage_delete: return &bpf_sk_storage_delete_proto; case BPF_FUNC_get_netns_cookie: return &bpf_get_netns_cookie_sock_ops_proto; #ifdef CONFIG_INET case BPF_FUNC_load_hdr_opt: return &bpf_sock_ops_load_hdr_opt_proto; case BPF_FUNC_store_hdr_opt: return &bpf_sock_ops_store_hdr_opt_proto; case BPF_FUNC_reserve_hdr_opt: return &bpf_sock_ops_reserve_hdr_opt_proto; case BPF_FUNC_tcp_sock: return &bpf_tcp_sock_proto; #endif /* CONFIG_INET */ default: return bpf_sk_base_func_proto(func_id, prog); } } const struct bpf_func_proto 
bpf_msg_redirect_map_proto __weak; const struct bpf_func_proto bpf_msg_redirect_hash_proto __weak; static const struct bpf_func_proto * sk_msg_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_msg_redirect_map: return &bpf_msg_redirect_map_proto; case BPF_FUNC_msg_redirect_hash: return &bpf_msg_redirect_hash_proto; case BPF_FUNC_msg_apply_bytes: return &bpf_msg_apply_bytes_proto; case BPF_FUNC_msg_cork_bytes: return &bpf_msg_cork_bytes_proto; case BPF_FUNC_msg_pull_data: return &bpf_msg_pull_data_proto; case BPF_FUNC_msg_push_data: return &bpf_msg_push_data_proto; case BPF_FUNC_msg_pop_data: return &bpf_msg_pop_data_proto; case BPF_FUNC_perf_event_output: return &bpf_event_output_data_proto; case BPF_FUNC_get_current_uid_gid: return &bpf_get_current_uid_gid_proto; case BPF_FUNC_sk_storage_get: return &bpf_sk_storage_get_proto; case BPF_FUNC_sk_storage_delete: return &bpf_sk_storage_delete_proto; case BPF_FUNC_get_netns_cookie: return &bpf_get_netns_cookie_sk_msg_proto; #ifdef CONFIG_CGROUP_NET_CLASSID case BPF_FUNC_get_cgroup_classid: return &bpf_get_cgroup_classid_curr_proto; #endif default: return bpf_sk_base_func_proto(func_id, prog); } } const struct bpf_func_proto bpf_sk_redirect_map_proto __weak; const struct bpf_func_proto bpf_sk_redirect_hash_proto __weak; static const struct bpf_func_proto * sk_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_skb_store_bytes: return &bpf_skb_store_bytes_proto; case BPF_FUNC_skb_load_bytes: return &bpf_skb_load_bytes_proto; case BPF_FUNC_skb_pull_data: return &sk_skb_pull_data_proto; case BPF_FUNC_skb_change_tail: return &sk_skb_change_tail_proto; case BPF_FUNC_skb_change_head: return &sk_skb_change_head_proto; case BPF_FUNC_skb_adjust_room: return &sk_skb_adjust_room_proto; case BPF_FUNC_get_socket_cookie: return &bpf_get_socket_cookie_proto; case BPF_FUNC_get_socket_uid: return &bpf_get_socket_uid_proto; case 
BPF_FUNC_sk_redirect_map: return &bpf_sk_redirect_map_proto; case BPF_FUNC_sk_redirect_hash: return &bpf_sk_redirect_hash_proto; case BPF_FUNC_perf_event_output: return &bpf_skb_event_output_proto; #ifdef CONFIG_INET case BPF_FUNC_sk_lookup_tcp: return &bpf_sk_lookup_tcp_proto; case BPF_FUNC_sk_lookup_udp: return &bpf_sk_lookup_udp_proto; case BPF_FUNC_sk_release: return &bpf_sk_release_proto; case BPF_FUNC_skc_lookup_tcp: return &bpf_skc_lookup_tcp_proto; #endif default: return bpf_sk_base_func_proto(func_id, prog); } } static const struct bpf_func_proto * flow_dissector_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_skb_load_bytes: return &bpf_flow_dissector_load_bytes_proto; default: return bpf_sk_base_func_proto(func_id, prog); } } static const struct bpf_func_proto * lwt_out_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_skb_load_bytes: return &bpf_skb_load_bytes_proto; case BPF_FUNC_skb_pull_data: return &bpf_skb_pull_data_proto; case BPF_FUNC_csum_diff: return &bpf_csum_diff_proto; case BPF_FUNC_get_cgroup_classid: return &bpf_get_cgroup_classid_proto; case BPF_FUNC_get_route_realm: return &bpf_get_route_realm_proto; case BPF_FUNC_get_hash_recalc: return &bpf_get_hash_recalc_proto; case BPF_FUNC_perf_event_output: return &bpf_skb_event_output_proto; case BPF_FUNC_get_smp_processor_id: return &bpf_get_smp_processor_id_proto; case BPF_FUNC_skb_under_cgroup: return &bpf_skb_under_cgroup_proto; default: return bpf_sk_base_func_proto(func_id, prog); } } static const struct bpf_func_proto * lwt_in_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_lwt_push_encap: return &bpf_lwt_in_push_encap_proto; default: return lwt_out_func_proto(func_id, prog); } } static const struct bpf_func_proto * lwt_xmit_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case 
BPF_FUNC_skb_get_tunnel_key: return &bpf_skb_get_tunnel_key_proto; case BPF_FUNC_skb_set_tunnel_key: return bpf_get_skb_set_tunnel_proto(func_id); case BPF_FUNC_skb_get_tunnel_opt: return &bpf_skb_get_tunnel_opt_proto; case BPF_FUNC_skb_set_tunnel_opt: return bpf_get_skb_set_tunnel_proto(func_id); case BPF_FUNC_redirect: return &bpf_redirect_proto; case BPF_FUNC_clone_redirect: return &bpf_clone_redirect_proto; case BPF_FUNC_skb_change_tail: return &bpf_skb_change_tail_proto; case BPF_FUNC_skb_change_head: return &bpf_skb_change_head_proto; case BPF_FUNC_skb_store_bytes: return &bpf_skb_store_bytes_proto; case BPF_FUNC_csum_update: return &bpf_csum_update_proto; case BPF_FUNC_csum_level: return &bpf_csum_level_proto; case BPF_FUNC_l3_csum_replace: return &bpf_l3_csum_replace_proto; case BPF_FUNC_l4_csum_replace: return &bpf_l4_csum_replace_proto; case BPF_FUNC_set_hash_invalid: return &bpf_set_hash_invalid_proto; case BPF_FUNC_lwt_push_encap: return &bpf_lwt_xmit_push_encap_proto; default: return lwt_out_func_proto(func_id, prog); } } static const struct bpf_func_proto * lwt_seg6local_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { #if IS_ENABLED(CONFIG_IPV6_SEG6_BPF) case BPF_FUNC_lwt_seg6_store_bytes: return &bpf_lwt_seg6_store_bytes_proto; case BPF_FUNC_lwt_seg6_action: return &bpf_lwt_seg6_action_proto; case BPF_FUNC_lwt_seg6_adjust_srh: return &bpf_lwt_seg6_adjust_srh_proto; #endif default: return lwt_out_func_proto(func_id, prog); } } static bool bpf_skb_is_valid_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { const int size_default = sizeof(__u32); if (off < 0 || off >= sizeof(struct __sk_buff)) return false; /* The verifier guarantees that size > 0. 
*/ if (off % size != 0) return false; switch (off) { case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]): if (off + size > offsetofend(struct __sk_buff, cb[4])) return false; break; case bpf_ctx_range(struct __sk_buff, data): case bpf_ctx_range(struct __sk_buff, data_meta): case bpf_ctx_range(struct __sk_buff, data_end): if (info->is_ldsx || size != size_default) return false; break; case bpf_ctx_range_till(struct __sk_buff, remote_ip6[0], remote_ip6[3]): case bpf_ctx_range_till(struct __sk_buff, local_ip6[0], local_ip6[3]): case bpf_ctx_range_till(struct __sk_buff, remote_ip4, remote_ip4): case bpf_ctx_range_till(struct __sk_buff, local_ip4, local_ip4): if (size != size_default) return false; break; case bpf_ctx_range_ptr(struct __sk_buff, flow_keys): return false; case bpf_ctx_range(struct __sk_buff, hwtstamp): if (type == BPF_WRITE || size != sizeof(__u64)) return false; break; case bpf_ctx_range(struct __sk_buff, tstamp): if (size != sizeof(__u64)) return false; break; case offsetof(struct __sk_buff, sk): if (type == BPF_WRITE || size != sizeof(__u64)) return false; info->reg_type = PTR_TO_SOCK_COMMON_OR_NULL; break; case offsetof(struct __sk_buff, tstamp_type): return false; case offsetofend(struct __sk_buff, tstamp_type) ... offsetof(struct __sk_buff, hwtstamp) - 1: /* Explicitly prohibit access to padding in __sk_buff. */ return false; default: /* Only narrow read access allowed for now. 
*/ if (type == BPF_WRITE) { if (size != size_default) return false; } else { bpf_ctx_record_field_size(info, size_default); if (!bpf_ctx_narrow_access_ok(off, size, size_default)) return false; } } return true; } static bool sk_filter_is_valid_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { switch (off) { case bpf_ctx_range(struct __sk_buff, tc_classid): case bpf_ctx_range(struct __sk_buff, data): case bpf_ctx_range(struct __sk_buff, data_meta): case bpf_ctx_range(struct __sk_buff, data_end): case bpf_ctx_range_till(struct __sk_buff, family, local_port): case bpf_ctx_range(struct __sk_buff, tstamp): case bpf_ctx_range(struct __sk_buff, wire_len): case bpf_ctx_range(struct __sk_buff, hwtstamp): return false; } if (type == BPF_WRITE) { switch (off) { case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]): break; default: return false; } } return bpf_skb_is_valid_access(off, size, type, prog, info); } static bool cg_skb_is_valid_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { switch (off) { case bpf_ctx_range(struct __sk_buff, tc_classid): case bpf_ctx_range(struct __sk_buff, data_meta): case bpf_ctx_range(struct __sk_buff, wire_len): return false; case bpf_ctx_range(struct __sk_buff, data): case bpf_ctx_range(struct __sk_buff, data_end): if (!bpf_token_capable(prog->aux->token, CAP_BPF)) return false; break; } if (type == BPF_WRITE) { switch (off) { case bpf_ctx_range(struct __sk_buff, mark): case bpf_ctx_range(struct __sk_buff, priority): case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]): break; case bpf_ctx_range(struct __sk_buff, tstamp): if (!bpf_token_capable(prog->aux->token, CAP_BPF)) return false; break; default: return false; } } switch (off) { case bpf_ctx_range(struct __sk_buff, data): info->reg_type = PTR_TO_PACKET; break; case bpf_ctx_range(struct __sk_buff, data_end): info->reg_type = PTR_TO_PACKET_END; 
break; } return bpf_skb_is_valid_access(off, size, type, prog, info); } static bool lwt_is_valid_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { switch (off) { case bpf_ctx_range(struct __sk_buff, tc_classid): case bpf_ctx_range_till(struct __sk_buff, family, local_port): case bpf_ctx_range(struct __sk_buff, data_meta): case bpf_ctx_range(struct __sk_buff, tstamp): case bpf_ctx_range(struct __sk_buff, wire_len): case bpf_ctx_range(struct __sk_buff, hwtstamp): return false; } if (type == BPF_WRITE) { switch (off) { case bpf_ctx_range(struct __sk_buff, mark): case bpf_ctx_range(struct __sk_buff, priority): case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]): break; default: return false; } } switch (off) { case bpf_ctx_range(struct __sk_buff, data): info->reg_type = PTR_TO_PACKET; break; case bpf_ctx_range(struct __sk_buff, data_end): info->reg_type = PTR_TO_PACKET_END; break; } return bpf_skb_is_valid_access(off, size, type, prog, info); } /* Attach type specific accesses */ static bool __sock_filter_check_attach_type(int off, enum bpf_access_type access_type, enum bpf_attach_type attach_type) { switch (off) { case offsetof(struct bpf_sock, bound_dev_if): case offsetof(struct bpf_sock, mark): case offsetof(struct bpf_sock, priority): switch (attach_type) { case BPF_CGROUP_INET_SOCK_CREATE: case BPF_CGROUP_INET_SOCK_RELEASE: goto full_access; default: return false; } case bpf_ctx_range(struct bpf_sock, src_ip4): switch (attach_type) { case BPF_CGROUP_INET4_POST_BIND: goto read_only; default: return false; } case bpf_ctx_range_till(struct bpf_sock, src_ip6[0], src_ip6[3]): switch (attach_type) { case BPF_CGROUP_INET6_POST_BIND: goto read_only; default: return false; } case bpf_ctx_range(struct bpf_sock, src_port): switch (attach_type) { case BPF_CGROUP_INET4_POST_BIND: case BPF_CGROUP_INET6_POST_BIND: goto read_only; default: return false; } } read_only: return access_type == BPF_READ; 
full_access: return true; } bool bpf_sock_common_is_valid_access(int off, int size, enum bpf_access_type type, struct bpf_insn_access_aux *info) { switch (off) { case bpf_ctx_range_till(struct bpf_sock, type, priority): return false; default: return bpf_sock_is_valid_access(off, size, type, info); } } bool bpf_sock_is_valid_access(int off, int size, enum bpf_access_type type, struct bpf_insn_access_aux *info) { const int size_default = sizeof(__u32); int field_size; if (off < 0 || off >= sizeof(struct bpf_sock)) return false; if (off % size != 0) return false; switch (off) { case offsetof(struct bpf_sock, state): case offsetof(struct bpf_sock, family): case offsetof(struct bpf_sock, type): case offsetof(struct bpf_sock, protocol): case offsetof(struct bpf_sock, src_port): case offsetof(struct bpf_sock, rx_queue_mapping): case bpf_ctx_range(struct bpf_sock, src_ip4): case bpf_ctx_range_till(struct bpf_sock, src_ip6[0], src_ip6[3]): case bpf_ctx_range(struct bpf_sock, dst_ip4): case bpf_ctx_range_till(struct bpf_sock, dst_ip6[0], dst_ip6[3]): bpf_ctx_record_field_size(info, size_default); return bpf_ctx_narrow_access_ok(off, size, size_default); case bpf_ctx_range(struct bpf_sock, dst_port): field_size = size == size_default ? size_default : sizeof_field(struct bpf_sock, dst_port); bpf_ctx_record_field_size(info, field_size); return bpf_ctx_narrow_access_ok(off, size, field_size); case offsetofend(struct bpf_sock, dst_port) ... 
offsetof(struct bpf_sock, dst_ip4) - 1: return false; } return size == size_default; } static bool sock_filter_is_valid_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { if (!bpf_sock_is_valid_access(off, size, type, info)) return false; return __sock_filter_check_attach_type(off, type, prog->expected_attach_type); } static int bpf_noop_prologue(struct bpf_insn *insn_buf, bool direct_write, const struct bpf_prog *prog) { /* Neither direct read nor direct write requires any preliminary * action. */ return 0; } static int bpf_unclone_prologue(struct bpf_insn *insn_buf, bool direct_write, const struct bpf_prog *prog, int drop_verdict) { struct bpf_insn *insn = insn_buf; if (!direct_write) return 0; /* if (!skb->cloned) * goto start; * * (Fast-path, otherwise approximation that we might be * a clone, do the rest in helper.) */ *insn++ = BPF_LDX_MEM(BPF_B, BPF_REG_6, BPF_REG_1, CLONED_OFFSET); *insn++ = BPF_ALU32_IMM(BPF_AND, BPF_REG_6, CLONED_MASK); *insn++ = BPF_JMP_IMM(BPF_JEQ, BPF_REG_6, 0, 7); /* ret = bpf_skb_pull_data(skb, 0); */ *insn++ = BPF_MOV64_REG(BPF_REG_6, BPF_REG_1); *insn++ = BPF_ALU64_REG(BPF_XOR, BPF_REG_2, BPF_REG_2); *insn++ = BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_skb_pull_data); /* if (!ret) * goto restore; * return TC_ACT_SHOT; */ *insn++ = BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2); *insn++ = BPF_ALU32_IMM(BPF_MOV, BPF_REG_0, drop_verdict); *insn++ = BPF_EXIT_INSN(); /* restore: */ *insn++ = BPF_MOV64_REG(BPF_REG_1, BPF_REG_6); /* start: */ *insn++ = prog->insnsi[0]; return insn - insn_buf; } static int bpf_gen_ld_abs(const struct bpf_insn *orig, struct bpf_insn *insn_buf) { bool indirect = BPF_MODE(orig->code) == BPF_IND; struct bpf_insn *insn = insn_buf; if (!indirect) { *insn++ = BPF_MOV64_IMM(BPF_REG_2, orig->imm); } else { *insn++ = BPF_MOV64_REG(BPF_REG_2, orig->src_reg); if (orig->imm) *insn++ = BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, orig->imm); } /* We're guaranteed 
here that CTX is in R6. */ *insn++ = BPF_MOV64_REG(BPF_REG_1, BPF_REG_CTX); switch (BPF_SIZE(orig->code)) { case BPF_B: *insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_8_no_cache); break; case BPF_H: *insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_16_no_cache); break; case BPF_W: *insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_32_no_cache); break; } *insn++ = BPF_JMP_IMM(BPF_JSGE, BPF_REG_0, 0, 2); *insn++ = BPF_ALU32_REG(BPF_XOR, BPF_REG_0, BPF_REG_0); *insn++ = BPF_EXIT_INSN(); return insn - insn_buf; } static int tc_cls_act_prologue(struct bpf_insn *insn_buf, bool direct_write, const struct bpf_prog *prog) { return bpf_unclone_prologue(insn_buf, direct_write, prog, TC_ACT_SHOT); } static bool tc_cls_act_is_valid_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { if (type == BPF_WRITE) { switch (off) { case bpf_ctx_range(struct __sk_buff, mark): case bpf_ctx_range(struct __sk_buff, tc_index): case bpf_ctx_range(struct __sk_buff, priority): case bpf_ctx_range(struct __sk_buff, tc_classid): case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]): case bpf_ctx_range(struct __sk_buff, tstamp): case bpf_ctx_range(struct __sk_buff, queue_mapping): break; default: return false; } } switch (off) { case bpf_ctx_range(struct __sk_buff, data): info->reg_type = PTR_TO_PACKET; break; case bpf_ctx_range(struct __sk_buff, data_meta): info->reg_type = PTR_TO_PACKET_META; break; case bpf_ctx_range(struct __sk_buff, data_end): info->reg_type = PTR_TO_PACKET_END; break; case bpf_ctx_range_till(struct __sk_buff, family, local_port): return false; case offsetof(struct __sk_buff, tstamp_type): /* The convert_ctx_access() on reading and writing * __sk_buff->tstamp depends on whether the bpf prog * has used __sk_buff->tstamp_type or not. * Thus, we need to set prog->tstamp_type_access * earlier during is_valid_access() here. 
*/ ((struct bpf_prog *)prog)->tstamp_type_access = 1; return size == sizeof(__u8); } return bpf_skb_is_valid_access(off, size, type, prog, info); } DEFINE_MUTEX(nf_conn_btf_access_lock); EXPORT_SYMBOL_GPL(nf_conn_btf_access_lock); int (*nfct_btf_struct_access)(struct bpf_verifier_log *log, const struct bpf_reg_state *reg, int off, int size); EXPORT_SYMBOL_GPL(nfct_btf_struct_access); static int tc_cls_act_btf_struct_access(struct bpf_verifier_log *log, const struct bpf_reg_state *reg, int off, int size) { int ret = -EACCES; mutex_lock(&nf_conn_btf_access_lock); if (nfct_btf_struct_access) ret = nfct_btf_struct_access(log, reg, off, size); mutex_unlock(&nf_conn_btf_access_lock); return ret; } static bool __is_valid_xdp_access(int off, int size) { if (off < 0 || off >= sizeof(struct xdp_md)) return false; if (off % size != 0) return false; if (size != sizeof(__u32)) return false; return true; } static bool xdp_is_valid_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { if (prog->expected_attach_type != BPF_XDP_DEVMAP) { switch (off) { case offsetof(struct xdp_md, egress_ifindex): return false; } } if (type == BPF_WRITE) { if (bpf_prog_is_offloaded(prog->aux)) { switch (off) { case offsetof(struct xdp_md, rx_queue_index): return __is_valid_xdp_access(off, size); } } return false; } else { switch (off) { case offsetof(struct xdp_md, data_meta): case offsetof(struct xdp_md, data): case offsetof(struct xdp_md, data_end): if (info->is_ldsx) return false; } } switch (off) { case offsetof(struct xdp_md, data): info->reg_type = PTR_TO_PACKET; break; case offsetof(struct xdp_md, data_meta): info->reg_type = PTR_TO_PACKET_META; break; case offsetof(struct xdp_md, data_end): info->reg_type = PTR_TO_PACKET_END; break; } return __is_valid_xdp_access(off, size); } void bpf_warn_invalid_xdp_action(const struct net_device *dev, const struct bpf_prog *prog, u32 act) { const u32 act_max = XDP_REDIRECT; pr_warn_once("%s 
XDP return value %u on prog %s (id %d) dev %s, expect packet loss!\n", act > act_max ? "Illegal" : "Driver unsupported", act, prog->aux->name, prog->aux->id, dev ? dev->name : "N/A"); } EXPORT_SYMBOL_GPL(bpf_warn_invalid_xdp_action); static int xdp_btf_struct_access(struct bpf_verifier_log *log, const struct bpf_reg_state *reg, int off, int size) { int ret = -EACCES; mutex_lock(&nf_conn_btf_access_lock); if (nfct_btf_struct_access) ret = nfct_btf_struct_access(log, reg, off, size); mutex_unlock(&nf_conn_btf_access_lock); return ret; } static bool sock_addr_is_valid_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { const int size_default = sizeof(__u32); if (off < 0 || off >= sizeof(struct bpf_sock_addr)) return false; if (off % size != 0) return false; /* Disallow access to fields not belonging to the attach type's address * family. */ switch (off) { case bpf_ctx_range(struct bpf_sock_addr, user_ip4): switch (prog->expected_attach_type) { case BPF_CGROUP_INET4_BIND: case BPF_CGROUP_INET4_CONNECT: case BPF_CGROUP_INET4_GETPEERNAME: case BPF_CGROUP_INET4_GETSOCKNAME: case BPF_CGROUP_UDP4_SENDMSG: case BPF_CGROUP_UDP4_RECVMSG: break; default: return false; } break; case bpf_ctx_range_till(struct bpf_sock_addr, user_ip6[0], user_ip6[3]): switch (prog->expected_attach_type) { case BPF_CGROUP_INET6_BIND: case BPF_CGROUP_INET6_CONNECT: case BPF_CGROUP_INET6_GETPEERNAME: case BPF_CGROUP_INET6_GETSOCKNAME: case BPF_CGROUP_UDP6_SENDMSG: case BPF_CGROUP_UDP6_RECVMSG: break; default: return false; } break; case bpf_ctx_range(struct bpf_sock_addr, msg_src_ip4): switch (prog->expected_attach_type) { case BPF_CGROUP_UDP4_SENDMSG: break; default: return false; } break; case bpf_ctx_range_till(struct bpf_sock_addr, msg_src_ip6[0], msg_src_ip6[3]): switch (prog->expected_attach_type) { case BPF_CGROUP_UDP6_SENDMSG: break; default: return false; } break; } switch (off) { case bpf_ctx_range(struct bpf_sock_addr, 
user_ip4): case bpf_ctx_range_till(struct bpf_sock_addr, user_ip6[0], user_ip6[3]): case bpf_ctx_range(struct bpf_sock_addr, msg_src_ip4): case bpf_ctx_range_till(struct bpf_sock_addr, msg_src_ip6[0], msg_src_ip6[3]): case bpf_ctx_range(struct bpf_sock_addr, user_port): if (type == BPF_READ) { bpf_ctx_record_field_size(info, size_default); if (bpf_ctx_wide_access_ok(off, size, struct bpf_sock_addr, user_ip6)) return true; if (bpf_ctx_wide_access_ok(off, size, struct bpf_sock_addr, msg_src_ip6)) return true; if (!bpf_ctx_narrow_access_ok(off, size, size_default)) return false; } else { if (bpf_ctx_wide_access_ok(off, size, struct bpf_sock_addr, user_ip6)) return true; if (bpf_ctx_wide_access_ok(off, size, struct bpf_sock_addr, msg_src_ip6)) return true; if (size != size_default) return false; } break; case offsetof(struct bpf_sock_addr, sk): if (type != BPF_READ) return false; if (size != sizeof(__u64)) return false; info->reg_type = PTR_TO_SOCKET; break; default: if (type == BPF_READ) { if (size != size_default) return false; } else { return false; } } return true; } static bool sock_ops_is_valid_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { const int size_default = sizeof(__u32); if (off < 0 || off >= sizeof(struct bpf_sock_ops)) return false; /* The verifier guarantees that size > 0. 
*/ if (off % size != 0) return false; if (type == BPF_WRITE) { switch (off) { case offsetof(struct bpf_sock_ops, reply): case offsetof(struct bpf_sock_ops, sk_txhash): if (size != size_default) return false; break; default: return false; } } else { switch (off) { case bpf_ctx_range_till(struct bpf_sock_ops, bytes_received, bytes_acked): if (size != sizeof(__u64)) return false; break; case offsetof(struct bpf_sock_ops, sk): if (size != sizeof(__u64)) return false; info->reg_type = PTR_TO_SOCKET_OR_NULL; break; case offsetof(struct bpf_sock_ops, skb_data): if (size != sizeof(__u64)) return false; info->reg_type = PTR_TO_PACKET; break; case offsetof(struct bpf_sock_ops, skb_data_end): if (size != sizeof(__u64)) return false; info->reg_type = PTR_TO_PACKET_END; break; case offsetof(struct bpf_sock_ops, skb_tcp_flags): bpf_ctx_record_field_size(info, size_default); return bpf_ctx_narrow_access_ok(off, size, size_default); case offsetof(struct bpf_sock_ops, skb_hwtstamp): if (size != sizeof(__u64)) return false; break; default: if (size != size_default) return false; break; } } return true; } static int sk_skb_prologue(struct bpf_insn *insn_buf, bool direct_write, const struct bpf_prog *prog) { return bpf_unclone_prologue(insn_buf, direct_write, prog, SK_DROP); } static bool sk_skb_is_valid_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { switch (off) { case bpf_ctx_range(struct __sk_buff, tc_classid): case bpf_ctx_range(struct __sk_buff, data_meta): case bpf_ctx_range(struct __sk_buff, tstamp): case bpf_ctx_range(struct __sk_buff, wire_len): case bpf_ctx_range(struct __sk_buff, hwtstamp): return false; } if (type == BPF_WRITE) { switch (off) { case bpf_ctx_range(struct __sk_buff, tc_index): case bpf_ctx_range(struct __sk_buff, priority): break; default: return false; } } switch (off) { case bpf_ctx_range(struct __sk_buff, mark): return false; case bpf_ctx_range(struct __sk_buff, data): info->reg_type = 
PTR_TO_PACKET; break; case bpf_ctx_range(struct __sk_buff, data_end): info->reg_type = PTR_TO_PACKET_END; break; } return bpf_skb_is_valid_access(off, size, type, prog, info); } static bool sk_msg_is_valid_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { if (type == BPF_WRITE) return false; if (off % size != 0) return false; switch (off) { case offsetof(struct sk_msg_md, data): info->reg_type = PTR_TO_PACKET; if (size != sizeof(__u64)) return false; break; case offsetof(struct sk_msg_md, data_end): info->reg_type = PTR_TO_PACKET_END; if (size != sizeof(__u64)) return false; break; case offsetof(struct sk_msg_md, sk): if (size != sizeof(__u64)) return false; info->reg_type = PTR_TO_SOCKET; break; case bpf_ctx_range(struct sk_msg_md, family): case bpf_ctx_range(struct sk_msg_md, remote_ip4): case bpf_ctx_range(struct sk_msg_md, local_ip4): case bpf_ctx_range_till(struct sk_msg_md, remote_ip6[0], remote_ip6[3]): case bpf_ctx_range_till(struct sk_msg_md, local_ip6[0], local_ip6[3]): case bpf_ctx_range(struct sk_msg_md, remote_port): case bpf_ctx_range(struct sk_msg_md, local_port): case bpf_ctx_range(struct sk_msg_md, size): if (size != sizeof(__u32)) return false; break; default: return false; } return true; } static bool flow_dissector_is_valid_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { const int size_default = sizeof(__u32); if (off < 0 || off >= sizeof(struct __sk_buff)) return false; if (type == BPF_WRITE) return false; switch (off) { case bpf_ctx_range(struct __sk_buff, data): if (info->is_ldsx || size != size_default) return false; info->reg_type = PTR_TO_PACKET; return true; case bpf_ctx_range(struct __sk_buff, data_end): if (info->is_ldsx || size != size_default) return false; info->reg_type = PTR_TO_PACKET_END; return true; case bpf_ctx_range_ptr(struct __sk_buff, flow_keys): if (size != sizeof(__u64)) return false; 
info->reg_type = PTR_TO_FLOW_KEYS; return true; default: return false; } } static u32 flow_dissector_convert_ctx_access(enum bpf_access_type type, const struct bpf_insn *si, struct bpf_insn *insn_buf, struct bpf_prog *prog, u32 *target_size) { struct bpf_insn *insn = insn_buf; switch (si->off) { case offsetof(struct __sk_buff, data): *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_flow_dissector, data), si->dst_reg, si->src_reg, offsetof(struct bpf_flow_dissector, data)); break; case offsetof(struct __sk_buff, data_end): *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_flow_dissector, data_end), si->dst_reg, si->src_reg, offsetof(struct bpf_flow_dissector, data_end)); break; case offsetof(struct __sk_buff, flow_keys): *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_flow_dissector, flow_keys), si->dst_reg, si->src_reg, offsetof(struct bpf_flow_dissector, flow_keys)); break; } return insn - insn_buf; } static struct bpf_insn *bpf_convert_tstamp_type_read(const struct bpf_insn *si, struct bpf_insn *insn) { __u8 value_reg = si->dst_reg; __u8 skb_reg = si->src_reg; BUILD_BUG_ON(__SKB_CLOCK_MAX != (int)BPF_SKB_CLOCK_TAI); BUILD_BUG_ON(SKB_CLOCK_REALTIME != (int)BPF_SKB_CLOCK_REALTIME); BUILD_BUG_ON(SKB_CLOCK_MONOTONIC != (int)BPF_SKB_CLOCK_MONOTONIC); BUILD_BUG_ON(SKB_CLOCK_TAI != (int)BPF_SKB_CLOCK_TAI); *insn++ = BPF_LDX_MEM(BPF_B, value_reg, skb_reg, SKB_BF_MONO_TC_OFFSET); *insn++ = BPF_ALU32_IMM(BPF_AND, value_reg, SKB_TSTAMP_TYPE_MASK); #ifdef __BIG_ENDIAN_BITFIELD *insn++ = BPF_ALU32_IMM(BPF_RSH, value_reg, SKB_TSTAMP_TYPE_RSHIFT); #else BUILD_BUG_ON(!(SKB_TSTAMP_TYPE_MASK & 0x1)); #endif return insn; } static struct bpf_insn *bpf_convert_shinfo_access(__u8 dst_reg, __u8 skb_reg, struct bpf_insn *insn) { /* si->dst_reg = skb_shinfo(SKB); */ #ifdef NET_SKBUFF_DATA_USES_OFFSET *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, end), BPF_REG_AX, skb_reg, offsetof(struct sk_buff, end)); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, head), dst_reg, 
skb_reg, offsetof(struct sk_buff, head));
	*insn++ = BPF_ALU64_REG(BPF_ADD, dst_reg, BPF_REG_AX);
#else
	*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, end),
			      dst_reg, skb_reg,
			      offsetof(struct sk_buff, end));
#endif

	return insn;
}

/* Emit the instructions for reading __sk_buff->tstamp into si->dst_reg.
 *
 * Without CONFIG_NET_XGRESS, or when prog->tstamp_type_access is set, this
 * is a single 64-bit load of skb->tstamp.  Otherwise a guard is emitted
 * first which yields 0 instead of the stored value when both the
 * TC_AT_INGRESS_MASK and SKB_TSTAMP_TYPE_MASK tests on the flag byte hit.
 *
 * Returns the next free slot in the instruction buffer.
 */
static struct bpf_insn *bpf_convert_tstamp_read(const struct bpf_prog *prog,
						const struct bpf_insn *si,
						struct bpf_insn *insn)
{
	__u8 value_reg = si->dst_reg;
	__u8 skb_reg = si->src_reg;

#ifdef CONFIG_NET_XGRESS
	/* If the tstamp_type is read,
	 * the bpf prog is aware the tstamp could have delivery time.
	 * Thus, read skb->tstamp as is if tstamp_type_access is true.
	 */
	if (!prog->tstamp_type_access) {
		/* AX is needed because src_reg and dst_reg could be the same */
		__u8 tmp_reg = BPF_REG_AX;

		*insn++ = BPF_LDX_MEM(BPF_B, tmp_reg, skb_reg, SKB_BF_MONO_TC_OFFSET);
		/* check if the ingress mask bit is set */
		*insn++ = BPF_JMP32_IMM(BPF_JSET, tmp_reg, TC_AT_INGRESS_MASK, 1);
		/* not at ingress: jump over the 4 guard insns to the load */
		*insn++ = BPF_JMP_A(4);
		*insn++ = BPF_JMP32_IMM(BPF_JSET, tmp_reg, SKB_TSTAMP_TYPE_MASK, 1);
		/* no tstamp_type bit set: jump over the next 2 insns to the load */
		*insn++ = BPF_JMP_A(2);
		/* skb->tc_at_ingress && skb->tstamp_type,
		 * read 0 as the (rcv) timestamp.
		 */
		*insn++ = BPF_MOV64_IMM(value_reg, 0);
		/* skip the load below so the 0 is kept */
		*insn++ = BPF_JMP_A(1);
	}
#endif

	*insn++ = BPF_LDX_MEM(BPF_DW, value_reg, skb_reg,
			      offsetof(struct sk_buff, tstamp));
	return insn;
}

/* Emit the instructions for writing __sk_buff->tstamp from si->src_reg (or
 * si->imm, depending on the class of the original store).
 *
 * Without CONFIG_NET_XGRESS, or when prog->tstamp_type_access is set, only
 * the 64-bit store is emitted.  Otherwise, if the TC_AT_INGRESS_MASK test on
 * the flag byte hits, the SKB_TSTAMP_TYPE_MASK bits are cleared in that byte
 * before the store.
 *
 * Returns the next free slot in the instruction buffer.
 */
static struct bpf_insn *bpf_convert_tstamp_write(const struct bpf_prog *prog,
						 const struct bpf_insn *si,
						 struct bpf_insn *insn)
{
	__u8 value_reg = si->src_reg;
	__u8 skb_reg = si->dst_reg;

#ifdef CONFIG_NET_XGRESS
	/* If the tstamp_type is read,
	 * the bpf prog is aware the tstamp could have delivery time.
	 * Thus, write skb->tstamp as is if tstamp_type_access is true.
	 * Otherwise, writing at ingress will have to clear the
	 * skb->tstamp_type bit also.
	 */
	if (!prog->tstamp_type_access) {
		__u8 tmp_reg = BPF_REG_AX;

		*insn++ = BPF_LDX_MEM(BPF_B, tmp_reg, skb_reg, SKB_BF_MONO_TC_OFFSET);
		/* Writing __sk_buff->tstamp as ingress, goto <clear> */
		*insn++ = BPF_JMP32_IMM(BPF_JSET, tmp_reg, TC_AT_INGRESS_MASK, 1);
		/* goto <store> */
		*insn++ = BPF_JMP_A(2);
		/* <clear>: skb->tstamp_type */
		*insn++ = BPF_ALU32_IMM(BPF_AND, tmp_reg, ~SKB_TSTAMP_TYPE_MASK);
		*insn++ = BPF_STX_MEM(BPF_B, skb_reg, tmp_reg, SKB_BF_MONO_TC_OFFSET);
	}
#endif

	/* <store>: skb->tstamp = tstamp */
	*insn++ = BPF_RAW_INSN(BPF_CLASS(si->code) | BPF_DW | BPF_MEM,
			       skb_reg, value_reg, offsetof(struct sk_buff, tstamp), si->imm);
	return insn;
}

/* Build a store instruction of width @size to offset @off that keeps the
 * instruction class, both registers and the immediate of the original
 * instruction @si.
 */
#define BPF_EMIT_STORE(size, si, off)					\
	BPF_RAW_INSN(BPF_CLASS((si)->code) | (size) | BPF_MEM,		\
		     (si)->dst_reg, (si)->src_reg, (off), (si)->imm)

/* Rewrite one program access to struct __sk_buff, selected by si->off, into
 * instructions that operate on struct sk_buff and the structures reachable
 * from it.
 *
 * @type:        BPF_WRITE selects the store variant where a field has one
 * @si:          the original instruction (registers, offset, code, imm)
 * @insn_buf:    output buffer for the replacement instructions
 * @prog:        program being rewritten; cb_access is set on cb[] accesses
 * @target_size: set directly for some fields and otherwise handed to
 *               bpf_target_off() (NOTE(review): presumably that helper
 *               stores the kernel field size there - confirm at its
 *               definition)
 *
 * Returns the number of instructions written; an offset without a case
 * below produces none.
 */
static u32 bpf_convert_ctx_access(enum bpf_access_type type,
				  const struct bpf_insn *si,
				  struct bpf_insn *insn_buf,
				  struct bpf_prog *prog, u32 *target_size)
{
	struct bpf_insn *insn = insn_buf;
	int off;

	switch (si->off) {
	case offsetof(struct __sk_buff, len):
		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
				      bpf_target_off(struct sk_buff, len, 4,
						     target_size));
		break;

	case offsetof(struct __sk_buff, protocol):
		*insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg,
				      bpf_target_off(struct sk_buff, protocol, 2,
						     target_size));
		break;

	case offsetof(struct __sk_buff, vlan_proto):
		*insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg,
				      bpf_target_off(struct sk_buff, vlan_proto, 2,
						     target_size));
		break;

	case offsetof(struct __sk_buff, priority):
		if (type == BPF_WRITE)
			*insn++ = BPF_EMIT_STORE(BPF_W, si,
						 bpf_target_off(struct sk_buff, priority, 4,
								target_size));
		else
			*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
					      bpf_target_off(struct sk_buff, priority, 4,
							     target_size));
		break;

	case offsetof(struct __sk_buff, ingress_ifindex):
		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
				      bpf_target_off(struct sk_buff, skb_iif, 4,
						     target_size));
		break;

	case offsetof(struct __sk_buff, ifindex):
		/* Load skb->dev; when it is NULL the dereference is skipped
		 * and dst_reg keeps the 0.
		 */
		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, dev),
				      si->dst_reg, si->src_reg,
				      offsetof(struct sk_buff, dev));
		*insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1);
		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
				      bpf_target_off(struct net_device, ifindex, 4,
						     target_size));
		break;

	case offsetof(struct __sk_buff, hash):
		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
				      bpf_target_off(struct sk_buff, hash, 4,
						     target_size));
		break;

	case offsetof(struct __sk_buff, mark):
		if (type == BPF_WRITE)
			*insn++ = BPF_EMIT_STORE(BPF_W, si,
						 bpf_target_off(struct sk_buff, mark, 4,
								target_size));
		else
			*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
					      bpf_target_off(struct sk_buff, mark, 4,
							     target_size));
		break;

	case offsetof(struct __sk_buff, pkt_type):
		/* Load the byte at PKT_TYPE_OFFSET and mask it; with
		 * __BIG_ENDIAN_BITFIELD the result is shifted down as well.
		 */
		*target_size = 1;
		*insn++ = BPF_LDX_MEM(BPF_B, si->dst_reg, si->src_reg,
				      PKT_TYPE_OFFSET);
		*insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, PKT_TYPE_MAX);
#ifdef __BIG_ENDIAN_BITFIELD
		*insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, 5);
#endif
		break;

	case offsetof(struct __sk_buff, queue_mapping):
		if (type == BPF_WRITE) {
			/* This local deliberately shadows the function-level
			 * 'off' for the duration of the block.
			 */
			u32 off = bpf_target_off(struct sk_buff, queue_mapping, 2, target_size);

			/* Immediate store of a value >= NO_QUEUE_MAPPING:
			 * replace the store with a jump that does nothing.
			 */
			if (BPF_CLASS(si->code) == BPF_ST && si->imm >= NO_QUEUE_MAPPING) {
				*insn++ = BPF_JMP_A(0); /* noop */
				break;
			}

			/* Register store: a runtime check skips the store
			 * when src_reg >= NO_QUEUE_MAPPING.
			 */
			if (BPF_CLASS(si->code) == BPF_STX)
				*insn++ = BPF_JMP_IMM(BPF_JGE, si->src_reg, NO_QUEUE_MAPPING, 1);
			*insn++ = BPF_EMIT_STORE(BPF_H, si, off);
		} else {
			*insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg,
					      bpf_target_off(struct sk_buff,
							     queue_mapping,
							     2, target_size));
		}
		break;

	case offsetof(struct __sk_buff, vlan_present):
		/* Load vlan_all and normalise: 0 stays 0, anything else
		 * becomes 1.
		 */
		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
				      bpf_target_off(struct sk_buff,
						     vlan_all, 4, target_size));
		*insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1);
		*insn++ = BPF_ALU32_IMM(BPF_MOV, si->dst_reg, 1);
		break;

	case offsetof(struct __sk_buff, vlan_tci):
		*insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg,
				      bpf_target_off(struct sk_buff, vlan_tci, 2,
						     target_size));
		break;

	case offsetof(struct __sk_buff, cb[0]) ...
	     offsetofend(struct __sk_buff, cb[4]) - 1:
		/* cb[0..4] is redirected into qdisc_skb_cb.data inside
		 * skb->cb, keeping the access width of the original
		 * instruction.  The build-time checks require that data area
		 * to hold at least 20 bytes and to sit at a __u64-aligned
		 * offset within struct sk_buff.
		 */
		BUILD_BUG_ON(sizeof_field(struct qdisc_skb_cb, data) < 20);
		BUILD_BUG_ON((offsetof(struct sk_buff, cb) +
			      offsetof(struct qdisc_skb_cb, data)) %
			     sizeof(__u64));

		prog->cb_access = 1;
		off  = si->off;
		off -= offsetof(struct __sk_buff, cb[0]);
		off += offsetof(struct sk_buff, cb);
		off += offsetof(struct qdisc_skb_cb, data);
		if (type == BPF_WRITE)
			*insn++ = BPF_EMIT_STORE(BPF_SIZE(si->code), si, off);
		else
			*insn++ = BPF_LDX_MEM(BPF_SIZE(si->code), si->dst_reg,
					      si->src_reg, off);
		break;

	case offsetof(struct __sk_buff, tc_classid):
		BUILD_BUG_ON(sizeof_field(struct qdisc_skb_cb, tc_classid) != 2);

		off  = si->off;
		off -= offsetof(struct __sk_buff, tc_classid);
		off += offsetof(struct sk_buff, cb);
		off += offsetof(struct qdisc_skb_cb, tc_classid);
		*target_size = 2;
		if (type == BPF_WRITE)
			*insn++ = BPF_EMIT_STORE(BPF_H, si, off);
		else
			*insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg,
					      si->src_reg, off);
		break;

	case offsetof(struct __sk_buff, data):
		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, data),
				      si->dst_reg, si->src_reg,
				      offsetof(struct sk_buff, data));
		break;

	case offsetof(struct __sk_buff, data_meta):
		/* data_meta is read from struct bpf_skb_data_end overlaid on
		 * skb->cb, as a pointer-sized load.
		 */
		off  = si->off;
		off -= offsetof(struct __sk_buff, data_meta);
		off += offsetof(struct sk_buff, cb);
		off += offsetof(struct bpf_skb_data_end, data_meta);
		*insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg,
				      si->src_reg, off);
		break;

	case offsetof(struct __sk_buff, data_end):
		/* Same scheme as data_meta, for the data_end member. */
		off  = si->off;
		off -= offsetof(struct __sk_buff, data_end);
		off += offsetof(struct sk_buff, cb);
		off += offsetof(struct bpf_skb_data_end, data_end);
		*insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg,
				      si->src_reg, off);
		break;

	case offsetof(struct __sk_buff, tc_index):
#ifdef CONFIG_NET_SCHED
		if (type == BPF_WRITE)
			*insn++ = BPF_EMIT_STORE(BPF_H, si,
						 bpf_target_off(struct sk_buff, tc_index, 2,
								target_size));
		else
			*insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg,
					      bpf_target_off(struct sk_buff, tc_index, 2,
							     target_size));
#else
		/* No tc_index without CONFIG_NET_SCHED: a write becomes a
		 * register self-move and a read yields 0.
		 */
		*target_size = 2;
		if (type == BPF_WRITE)
			*insn++ = BPF_MOV64_REG(si->dst_reg, si->dst_reg);
		else
			*insn++ = BPF_MOV64_IMM(si->dst_reg, 0);
#endif
		break;

	case offsetof(struct __sk_buff, napi_id):
#if defined(CONFIG_NET_RX_BUSY_POLL)
		/* Values below MIN_NAPI_ID are replaced by 0. */
		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
				      bpf_target_off(struct sk_buff, napi_id, 4,
						     target_size));
		*insn++ = BPF_JMP_IMM(BPF_JGE, si->dst_reg, MIN_NAPI_ID, 1);
		*insn++ = BPF_MOV64_IMM(si->dst_reg, 0);
#else
		*target_size = 4;
		*insn++ = BPF_MOV64_IMM(si->dst_reg, 0);
#endif
		break;
	case offsetof(struct __sk_buff, family):
		/* The socket-derived fields below all load skb->sk into
		 * dst_reg first and then read the sock_common member through
		 * it.
		 */
		BUILD_BUG_ON(sizeof_field(struct sock_common, skc_family) != 2);

		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk),
				      si->dst_reg, si->src_reg,
				      offsetof(struct sk_buff, sk));
		*insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg,
				      bpf_target_off(struct sock_common,
						     skc_family,
						     2, target_size));
		break;
	case offsetof(struct __sk_buff, remote_ip4):
		BUILD_BUG_ON(sizeof_field(struct sock_common, skc_daddr) != 4);

		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk),
				      si->dst_reg, si->src_reg,
				      offsetof(struct sk_buff, sk));
		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
				      bpf_target_off(struct sock_common,
						     skc_daddr,
						     4, target_size));
		break;
	case offsetof(struct __sk_buff, local_ip4):
		BUILD_BUG_ON(sizeof_field(struct sock_common,
					  skc_rcv_saddr) != 4);

		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk),
				      si->dst_reg, si->src_reg,
				      offsetof(struct sk_buff, sk));
		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
				      bpf_target_off(struct sock_common,
						     skc_rcv_saddr,
						     4, target_size));
		break;
	case offsetof(struct __sk_buff, remote_ip6[0]) ...
	     offsetof(struct __sk_buff, remote_ip6[3]):
#if IS_ENABLED(CONFIG_IPV6)
		BUILD_BUG_ON(sizeof_field(struct sock_common,
					  skc_v6_daddr.s6_addr32[0]) != 4);

		/* 'off' is the byte offset within remote_ip6[] and is
		 * applied unchanged to skc_v6_daddr.s6_addr32[].
		 */
		off = si->off;
		off -= offsetof(struct __sk_buff, remote_ip6[0]);

		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk),
				      si->dst_reg, si->src_reg,
				      offsetof(struct sk_buff, sk));
		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
				      offsetof(struct sock_common,
					       skc_v6_daddr.s6_addr32[0]) +
				      off);
#else
		*insn++ = BPF_MOV32_IMM(si->dst_reg, 0);
#endif
		break;
	case offsetof(struct __sk_buff, local_ip6[0]) ...
	     offsetof(struct __sk_buff, local_ip6[3]):
#if IS_ENABLED(CONFIG_IPV6)
		BUILD_BUG_ON(sizeof_field(struct sock_common,
					  skc_v6_rcv_saddr.s6_addr32[0]) != 4);

		off = si->off;
		off -= offsetof(struct __sk_buff, local_ip6[0]);

		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk),
				      si->dst_reg, si->src_reg,
				      offsetof(struct sk_buff, sk));
		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
				      offsetof(struct sock_common,
					       skc_v6_rcv_saddr.s6_addr32[0]) +
				      off);
#else
		*insn++ = BPF_MOV32_IMM(si->dst_reg, 0);
#endif
		break;

	case offsetof(struct __sk_buff, remote_port):
		BUILD_BUG_ON(sizeof_field(struct sock_common, skc_dport) != 2);

		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk),
				      si->dst_reg, si->src_reg,
				      offsetof(struct sk_buff, sk));
		*insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg,
				      bpf_target_off(struct sock_common,
						     skc_dport,
						     2, target_size));
#ifndef __BIG_ENDIAN_BITFIELD
		/* Move the 16-bit value into the upper half of the 32-bit
		 * result.
		 */
		*insn++ = BPF_ALU32_IMM(BPF_LSH, si->dst_reg, 16);
#endif
		break;

	case offsetof(struct __sk_buff, local_port):
		BUILD_BUG_ON(sizeof_field(struct sock_common, skc_num) != 2);

		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk),
				      si->dst_reg, si->src_reg,
				      offsetof(struct sk_buff, sk));
		*insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg,
				      bpf_target_off(struct sock_common,
						     skc_num, 2, target_size));
		break;

	case offsetof(struct __sk_buff, tstamp):
		BUILD_BUG_ON(sizeof_field(struct sk_buff, tstamp) != 8);

		if (type == BPF_WRITE)
			insn = bpf_convert_tstamp_write(prog, si, insn);
		else
			insn = bpf_convert_tstamp_read(prog, si, insn);
		break;

	case offsetof(struct __sk_buff, tstamp_type):
		insn = bpf_convert_tstamp_type_read(si, insn);
		break;

	case offsetof(struct __sk_buff, gso_segs):
		/* dst_reg first receives the pointer produced by
		 * bpf_convert_shinfo_access(), then the field read via it.
		 */
		insn = bpf_convert_shinfo_access(si->dst_reg, si->src_reg, insn);
		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct skb_shared_info, gso_segs),
				      si->dst_reg, si->dst_reg,
				      bpf_target_off(struct skb_shared_info,
						     gso_segs, 2,
						     target_size));
		break;
	case offsetof(struct __sk_buff, gso_size):
		insn = bpf_convert_shinfo_access(si->dst_reg, si->src_reg, insn);
		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct skb_shared_info, gso_size),
				      si->dst_reg, si->dst_reg,
				      bpf_target_off(struct skb_shared_info,
						     gso_size, 2,
						     target_size));
		break;
	case offsetof(struct __sk_buff, wire_len):
		/* wire_len is read from qdisc_skb_cb.pkt_len inside skb->cb. */
		BUILD_BUG_ON(sizeof_field(struct qdisc_skb_cb, pkt_len) != 4);

		off = si->off;
		off -= offsetof(struct __sk_buff, wire_len);
		off += offsetof(struct sk_buff, cb);
		off += offsetof(struct qdisc_skb_cb, pkt_len);
		*target_size = 4;
		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, off);
		break;

	case offsetof(struct __sk_buff, sk):
		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk),
				      si->dst_reg, si->src_reg,
				      offsetof(struct sk_buff, sk));
		break;
	case offsetof(struct __sk_buff, hwtstamp):
		/* hwtstamp must be the first, 8-byte member of
		 * skb_shared_hwtstamps so that loading 8 bytes at the
		 * hwtstamps offset reads exactly that value.
		 */
		BUILD_BUG_ON(sizeof_field(struct skb_shared_hwtstamps, hwtstamp) != 8);
		BUILD_BUG_ON(offsetof(struct skb_shared_hwtstamps, hwtstamp) != 0);

		insn = bpf_convert_shinfo_access(si->dst_reg, si->src_reg, insn);
		*insn++ = BPF_LDX_MEM(BPF_DW,
				      si->dst_reg, si->dst_reg,
				      bpf_target_off(struct skb_shared_info,
						     hwtstamps, 8,
						     target_size));
		break;
	}

	return insn - insn_buf;
}

/* Rewrite one program access to struct bpf_sock, selected by si->off, into
 * an access to the matching member of struct sock / struct sock_common.
 * bound_dev_if, mark and priority have a store variant; the other fields
 * are load-only here.
 *
 * Returns the number of instructions written to @insn_buf.
 */
u32 bpf_sock_convert_ctx_access(enum bpf_access_type type,
				const struct bpf_insn *si,
				struct bpf_insn *insn_buf,
				struct bpf_prog *prog, u32 *target_size)
{
	struct bpf_insn *insn = insn_buf;
	int off;

	switch (si->off) {
	case offsetof(struct bpf_sock, bound_dev_if):
		BUILD_BUG_ON(sizeof_field(struct sock, sk_bound_dev_if) != 4);

if (type == BPF_WRITE)
			*insn++ = BPF_EMIT_STORE(BPF_W, si,
						 offsetof(struct sock, sk_bound_dev_if));
		else
			*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
				      offsetof(struct sock, sk_bound_dev_if));
		break;

	case offsetof(struct bpf_sock, mark):
		BUILD_BUG_ON(sizeof_field(struct sock, sk_mark) != 4);

		if (type == BPF_WRITE)
			*insn++ = BPF_EMIT_STORE(BPF_W, si,
						 offsetof(struct sock, sk_mark));
		else
			*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
				      offsetof(struct sock, sk_mark));
		break;

	case offsetof(struct bpf_sock, priority):
		BUILD_BUG_ON(sizeof_field(struct sock, sk_priority) != 4);

		if (type == BPF_WRITE)
			*insn++ = BPF_EMIT_STORE(BPF_W, si,
						 offsetof(struct sock, sk_priority));
		else
			*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
				      offsetof(struct sock, sk_priority));
		break;

	/* The remaining fields are loads only.  Fields loaded with
	 * BPF_FIELD_SIZEOF() use the kernel member's own width; the address
	 * fields use the width of the original instruction instead.
	 */
	case offsetof(struct bpf_sock, family):
		*insn++ = BPF_LDX_MEM(
			BPF_FIELD_SIZEOF(struct sock_common, skc_family),
			si->dst_reg, si->src_reg,
			bpf_target_off(struct sock_common,
				       skc_family,
				       sizeof_field(struct sock_common,
						    skc_family),
				       target_size));
		break;

	case offsetof(struct bpf_sock, type):
		*insn++ = BPF_LDX_MEM(
			BPF_FIELD_SIZEOF(struct sock, sk_type),
			si->dst_reg, si->src_reg,
			bpf_target_off(struct sock, sk_type,
				       sizeof_field(struct sock, sk_type),
				       target_size));
		break;

	case offsetof(struct bpf_sock, protocol):
		*insn++ = BPF_LDX_MEM(
			BPF_FIELD_SIZEOF(struct sock, sk_protocol),
			si->dst_reg, si->src_reg,
			bpf_target_off(struct sock, sk_protocol,
				       sizeof_field(struct sock, sk_protocol),
				       target_size));
		break;

	case offsetof(struct bpf_sock, src_ip4):
		*insn++ = BPF_LDX_MEM(
			BPF_SIZE(si->code), si->dst_reg, si->src_reg,
			bpf_target_off(struct sock_common, skc_rcv_saddr,
				       sizeof_field(struct sock_common,
						    skc_rcv_saddr),
				       target_size));
		break;

	case offsetof(struct bpf_sock, dst_ip4):
		*insn++ = BPF_LDX_MEM(
			BPF_SIZE(si->code), si->dst_reg, si->src_reg,
			bpf_target_off(struct sock_common, skc_daddr,
				       sizeof_field(struct sock_common,
						    skc_daddr),
				       target_size));
		break;

	case bpf_ctx_range_till(struct bpf_sock, src_ip6[0], src_ip6[3]):
#if IS_ENABLED(CONFIG_IPV6)
		/* 'off' is the byte offset within src_ip6[] and is added to
		 * the offset of skc_v6_rcv_saddr.s6_addr32[0].
		 */
		off = si->off;
		off -= offsetof(struct bpf_sock, src_ip6[0]);
		*insn++ = BPF_LDX_MEM(
			BPF_SIZE(si->code), si->dst_reg, si->src_reg,
			bpf_target_off(
				struct sock_common,
				skc_v6_rcv_saddr.s6_addr32[0],
				sizeof_field(struct sock_common,
					     skc_v6_rcv_saddr.s6_addr32[0]),
				target_size) + off);
#else
		/* 'off' is otherwise unused in this configuration. */
		(void)off;
		*insn++ = BPF_MOV32_IMM(si->dst_reg, 0);
#endif
		break;

	case bpf_ctx_range_till(struct bpf_sock, dst_ip6[0], dst_ip6[3]):
#if IS_ENABLED(CONFIG_IPV6)
		off = si->off;
		off -= offsetof(struct bpf_sock, dst_ip6[0]);
		*insn++ = BPF_LDX_MEM(
			BPF_SIZE(si->code), si->dst_reg, si->src_reg,
			bpf_target_off(struct sock_common,
				       skc_v6_daddr.s6_addr32[0],
				       sizeof_field(struct sock_common,
						    skc_v6_daddr.s6_addr32[0]),
				       target_size) + off);
#else
		*insn++ = BPF_MOV32_IMM(si->dst_reg, 0);
		*target_size = 4;
#endif
		break;

	case offsetof(struct bpf_sock, src_port):
		*insn++ = BPF_LDX_MEM(
			BPF_FIELD_SIZEOF(struct sock_common, skc_num),
			si->dst_reg, si->src_reg,
			bpf_target_off(struct sock_common, skc_num,
				       sizeof_field(struct sock_common,
						    skc_num),
				       target_size));
		break;

	case offsetof(struct bpf_sock, dst_port):
		*insn++ = BPF_LDX_MEM(
			BPF_FIELD_SIZEOF(struct sock_common, skc_dport),
			si->dst_reg, si->src_reg,
			bpf_target_off(struct sock_common, skc_dport,
				       sizeof_field(struct sock_common,
						    skc_dport),
				       target_size));
		break;

	case offsetof(struct bpf_sock, state):
		*insn++ = BPF_LDX_MEM(
			BPF_FIELD_SIZEOF(struct sock_common, skc_state),
			si->dst_reg, si->src_reg,
			bpf_target_off(struct sock_common, skc_state,
				       sizeof_field(struct sock_common,
						    skc_state),
				       target_size));
		break;
	case offsetof(struct bpf_sock, rx_queue_mapping):
#ifdef CONFIG_SOCK_RX_QUEUE_MAPPING
		/* A stored NO_QUEUE_MAPPING is reported to the program as -1;
		 * any other value is passed through unchanged.
		 */
		*insn++ = BPF_LDX_MEM(
			BPF_FIELD_SIZEOF(struct sock, sk_rx_queue_mapping),
			si->dst_reg, si->src_reg,
			bpf_target_off(struct sock, sk_rx_queue_mapping,
				       sizeof_field(struct sock,
						    sk_rx_queue_mapping),
				       target_size));
		*insn++ = BPF_JMP_IMM(BPF_JNE, si->dst_reg, NO_QUEUE_MAPPING,
				      1);
		*insn++ = BPF_MOV64_IMM(si->dst_reg, -1);
#else
		*insn++ = BPF_MOV64_IMM(si->dst_reg, -1);
		*target_size = 2;
#endif
		break;
	}

	return insn - insn_buf;
}

/* Context-access rewriter that special-cases __sk_buff->ifindex and hands
 * every other offset to bpf_convert_ctx_access().
 *
 * For ifindex it loads skb->dev and then dev->ifindex; the loaded dev
 * pointer is dereferenced without any NULL test being emitted.
 *
 * Returns the number of instructions written to @insn_buf.
 */
static u32 tc_cls_act_convert_ctx_access(enum bpf_access_type type,
					 const struct bpf_insn *si,
					 struct bpf_insn *insn_buf,
					 struct bpf_prog *prog, u32 *target_size)
{
	struct bpf_insn *insn = insn_buf;

	switch (si->off) {
	case offsetof(struct __sk_buff, ifindex):
		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, dev),
				      si->dst_reg, si->src_reg,
				      offsetof(struct sk_buff, dev));
		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
				      bpf_target_off(struct net_device, ifindex, 4,
						     target_size));
		break;
	default:
		return bpf_convert_ctx_access(type, si, insn_buf, prog,
					      target_size);
	}

	return insn - insn_buf;
}

/* Rewrite one program load from struct xdp_md, selected by si->off, into
 * loads from struct xdp_buff and the rxq/txq info structures it points to.
 *
 * data, data_meta and data_end are single loads of the equally named
 * xdp_buff member.  ingress_ifindex, rx_queue_index and egress_ifindex
 * follow the rxq/txq pointer (and for the ifindex fields also the dev
 * pointer) with one load per hop, reusing si->dst_reg.
 *
 * Only loads are emitted; @type, @prog and @target_size are not used.
 * Returns the number of instructions written to @insn_buf.
 */
static u32 xdp_convert_ctx_access(enum bpf_access_type type,
				  const struct bpf_insn *si,
				  struct bpf_insn *insn_buf,
				  struct bpf_prog *prog, u32 *target_size)
{
	struct bpf_insn *insn = insn_buf;

	switch (si->off) {
	case offsetof(struct xdp_md, data):
		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, data),
				      si->dst_reg, si->src_reg,
				      offsetof(struct xdp_buff, data));
		break;
	case offsetof(struct xdp_md, data_meta):
		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, data_meta),
				      si->dst_reg, si->src_reg,
				      offsetof(struct xdp_buff, data_meta));
		break;
	case offsetof(struct xdp_md, data_end):
		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, data_end),
				      si->dst_reg, si->src_reg,
				      offsetof(struct xdp_buff, data_end));
		break;
	case offsetof(struct xdp_md, ingress_ifindex):
		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, rxq),
				      si->dst_reg, si->src_reg,
				      offsetof(struct xdp_buff, rxq));
		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_rxq_info, dev),
				      si->dst_reg, si->dst_reg,
				      offsetof(struct xdp_rxq_info, dev));
		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
				      offsetof(struct net_device, ifindex));
		break;
	case offsetof(struct xdp_md, rx_queue_index):
		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, rxq),
si->dst_reg, si->src_reg,
				      offsetof(struct xdp_buff, rxq));
		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
				      offsetof(struct xdp_rxq_info,
					       queue_index));
		break;
	case offsetof(struct xdp_md, egress_ifindex):
		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, txq),
				      si->dst_reg, si->src_reg,
				      offsetof(struct xdp_buff, txq));
		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_txq_info, dev),
				      si->dst_reg, si->dst_reg,
				      offsetof(struct xdp_txq_info, dev));
		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
				      offsetof(struct net_device, ifindex));
		break;
	}

	return insn - insn_buf;
}

/* SOCK_ADDR_LOAD_NESTED_FIELD() loads Nested Field S.F.NF where S is type of
 * context Structure, F is Field in context structure that contains a pointer
 * to Nested Structure of type NS that has the field NF.
 *
 * SIZE encodes the load size (BPF_B, BPF_H, etc). It's up to caller to make
 * sure that SIZE is not greater than actual size of S.F.NF.
 *
 * If offset OFF is provided, the load happens from that offset relative to
 * offset of NF.
 *
 * The macro expands in place and relies on the locals 'insn', 'si' and
 * 'target_size' of the function that uses it.
 */
#define SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF(S, NS, F, NF, SIZE, OFF)	       \
	do {								       \
		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(S, F), si->dst_reg,     \
				      si->src_reg, offsetof(S, F));	       \
		*insn++ = BPF_LDX_MEM(					       \
			SIZE, si->dst_reg, si->dst_reg,			       \
			bpf_target_off(NS, NF, sizeof_field(NS, NF),	       \
				       target_size)			       \
				+ OFF);					       \
	} while (0)

/* Same as above with the load size taken from NS.NF itself and no extra
 * offset.
 */
#define SOCK_ADDR_LOAD_NESTED_FIELD(S, NS, F, NF)			       \
	SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF(S, NS, F, NF,		       \
					     BPF_FIELD_SIZEOF(NS, NF), 0)

/* SOCK_ADDR_STORE_NESTED_FIELD_OFF() has semantic similar to
 * SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF() but for store operation.
 *
 * In addition it uses Temporary Field TF (member of struct S) as the 3rd
 * "register" since two registers available in convert_ctx_access are not
 * enough: we can override neither SRC, since it contains value to store, nor
 * DST since it contains pointer to context that may be used by later
 * instructions. But we need a temporary place to save pointer to nested
 * structure whose field we want to store to.
 *
 * The scratch register starts at BPF_REG_9 and is decremented, at most
 * twice, until it differs from both si->src_reg and si->dst_reg.  Its old
 * value is saved to S.TF first and restored from there after the store.
 */
#define SOCK_ADDR_STORE_NESTED_FIELD_OFF(S, NS, F, NF, SIZE, OFF, TF)	       \
	do {								       \
		int tmp_reg = BPF_REG_9;				       \
		if (si->src_reg == tmp_reg || si->dst_reg == tmp_reg)	       \
			--tmp_reg;					       \
		if (si->src_reg == tmp_reg || si->dst_reg == tmp_reg)	       \
			--tmp_reg;					       \
		*insn++ = BPF_STX_MEM(BPF_DW, si->dst_reg, tmp_reg,	       \
				      offsetof(S, TF));			       \
		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(S, F), tmp_reg,	       \
				      si->dst_reg, offsetof(S, F));	       \
		*insn++ = BPF_RAW_INSN(SIZE | BPF_MEM | BPF_CLASS(si->code),     \
				       tmp_reg, si->src_reg,		       \
			bpf_target_off(NS, NF, sizeof_field(NS, NF),	       \
				       target_size)			       \
				+ OFF,					       \
			si->imm);					       \
		*insn++ = BPF_LDX_MEM(BPF_DW, tmp_reg, si->dst_reg,	       \
				      offsetof(S, TF));			       \
	} while (0)

/* Pick the store or the load expansion depending on the 'type' variable of
 * the function that uses the macro.
 */
#define SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF(S, NS, F, NF, SIZE, OFF, \
						      TF)		       \
	do {								       \
		if (type == BPF_WRITE) {				       \
			SOCK_ADDR_STORE_NESTED_FIELD_OFF(S, NS, F, NF, SIZE,   \
							 OFF, TF);	       \
		} else {						       \
			SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF(		       \
				S, NS, F, NF, SIZE, OFF);		       \
		}							       \
	} while (0)

/* Rewrite one program access to struct bpf_sock_addr, selected by si->off,
 * into accesses through the pointers held in struct bpf_sock_addr_kern
 * (uaddr, sk, t_ctx).
 *
 * user_ip4, user_ip6[], user_port, msg_src_ip4 and msg_src_ip6[] support
 * both loads and stores; user_family, family, type, protocol and sk are
 * emitted as loads only.  In the calls below the last macro argument,
 * tmp_reg, names the bpf_sock_addr_kern member used as save slot for the
 * scratch register.
 *
 * Returns the number of instructions written to @insn_buf.
 */
static u32 sock_addr_convert_ctx_access(enum bpf_access_type type,
					const struct bpf_insn *si,
					struct bpf_insn *insn_buf,
					struct bpf_prog *prog, u32 *target_size)
{
	int off, port_size = sizeof_field(struct sockaddr_in6, sin6_port);
	struct bpf_insn *insn = insn_buf;

	switch (si->off) {
	case offsetof(struct bpf_sock_addr, user_family):
		SOCK_ADDR_LOAD_NESTED_FIELD(struct bpf_sock_addr_kern,
					    struct sockaddr, uaddr, sa_family);
		break;

	case offsetof(struct bpf_sock_addr, user_ip4):
		SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF(
			struct bpf_sock_addr_kern, struct sockaddr_in, uaddr,
			sin_addr, BPF_SIZE(si->code), 0, tmp_reg);
		break;

	case bpf_ctx_range_till(struct bpf_sock_addr, user_ip6[0], user_ip6[3]):
		/* 'off' is the byte offset within user_ip6[], applied
		 * relative to sin6_addr.s6_addr32[0].
		 */
		off = si->off;
		off -= offsetof(struct bpf_sock_addr, user_ip6[0]);
		SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF(
			struct bpf_sock_addr_kern, struct sockaddr_in6, uaddr,
			sin6_addr.s6_addr32[0], BPF_SIZE(si->code), off,
			tmp_reg);
		break;

	case offsetof(struct bpf_sock_addr, user_port):
		/* To get port we need to know sa_family first and then treat
		 * sockaddr as either sockaddr_in or sockaddr_in6.
		 * Though we can simplify since port field has same offset and
		 * size in both structures.
		 * Here we check this invariant and use just one of the
		 * structures if it's true.
		 */
		BUILD_BUG_ON(offsetof(struct sockaddr_in, sin_port) !=
			     offsetof(struct sockaddr_in6, sin6_port));
		BUILD_BUG_ON(sizeof_field(struct sockaddr_in, sin_port) !=
			     sizeof_field(struct sockaddr_in6, sin6_port));
		/* Account for sin6_port being smaller than user_port. */
		port_size = min(port_size, BPF_LDST_BYTES(si));
		SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF(
			struct bpf_sock_addr_kern, struct sockaddr_in6, uaddr,
			sin6_port, bytes_to_bpf_size(port_size), 0, tmp_reg);
		break;

	case offsetof(struct bpf_sock_addr, family):
		SOCK_ADDR_LOAD_NESTED_FIELD(struct bpf_sock_addr_kern,
					    struct sock, sk, sk_family);
		break;

	case offsetof(struct bpf_sock_addr, type):
		SOCK_ADDR_LOAD_NESTED_FIELD(struct bpf_sock_addr_kern,
					    struct sock, sk, sk_type);
		break;

	case offsetof(struct bpf_sock_addr, protocol):
		SOCK_ADDR_LOAD_NESTED_FIELD(struct bpf_sock_addr_kern,
					    struct sock, sk, sk_protocol);
		break;

	case offsetof(struct bpf_sock_addr, msg_src_ip4):
		/* Treat t_ctx as struct in_addr for msg_src_ip4. */
		SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF(
			struct bpf_sock_addr_kern, struct in_addr, t_ctx,
			s_addr, BPF_SIZE(si->code), 0, tmp_reg);
		break;

	case bpf_ctx_range_till(struct bpf_sock_addr, msg_src_ip6[0],
				msg_src_ip6[3]):
		off = si->off;
		off -= offsetof(struct bpf_sock_addr, msg_src_ip6[0]);
		/* Treat t_ctx as struct in6_addr for msg_src_ip6. */
		SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF(
			struct bpf_sock_addr_kern, struct in6_addr, t_ctx,
			s6_addr32[0], BPF_SIZE(si->code), off, tmp_reg);
		break;
	case offsetof(struct bpf_sock_addr, sk):
		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sock_addr_kern, sk),
				      si->dst_reg, si->src_reg,
				      offsetof(struct bpf_sock_addr_kern, sk));
		break;
	}

	return insn - insn_buf;
}

static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
				       const struct bpf_insn *si,
				       struct bpf_insn *insn_buf,
				       struct bpf_prog *prog,
				       u32 *target_size)
{
	struct bpf_insn *insn = insn_buf;
	int off;

/* Helper macro for adding read access to tcp_sock or sock fields. */
#define SOCK_OPS_GET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ)			      \
	do {								      \
		int fullsock_reg = si->dst_reg, reg = BPF_REG_9, jmp = 2;     \
		BUILD_BUG_ON(sizeof_field(OBJ, OBJ_FIELD) >		      \
			     sizeof_field(struct bpf_sock_ops, BPF_FIELD));   \
		if (si->dst_reg == reg || si->src_reg == reg)		      \
			reg--;						      \
		if (si->dst_reg == reg || si->src_reg == reg)		      \
			reg--;						      \
		if (si->dst_reg == si->src_reg) {			      \
			*insn++ = BPF_STX_MEM(BPF_DW, si->src_reg, reg,	      \
					  offsetof(struct bpf_sock_ops_kern,  \
					  temp));			      \
			fullsock_reg = reg;				      \
			jmp += 2;					      \
		}							      \
		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(			      \
						struct bpf_sock_ops_kern,     \
						is_fullsock),		      \
				      fullsock_reg, si->src_reg,	      \
				      offsetof(struct bpf_sock_ops_kern,      \
					       is_fullsock));		      \
		*insn++ = BPF_JMP_IMM(BPF_JEQ, fullsock_reg, 0, jmp);	      \
		if (si->dst_reg == si->src_reg)				      \
			*insn++ = BPF_LDX_MEM(BPF_DW, reg, si->src_reg,	      \
				      offsetof(struct bpf_sock_ops_kern,      \
				      temp));				      \
		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(			      \
						struct bpf_sock_ops_kern, sk),\
				      si->dst_reg, si->src_reg,		      \
				      offsetof(struct bpf_sock_ops_kern, sk));\
		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(OBJ,		      \
						       OBJ_FIELD),	      \
				      si->dst_reg, si->dst_reg,		      \
				      offsetof(OBJ, OBJ_FIELD));	      \
		if (si->dst_reg == si->src_reg)	{			      \
			*insn++ = BPF_JMP_A(1);				      \
			*insn++ = BPF_LDX_MEM(BPF_DW, reg, si->src_reg,	      \
				      offsetof(struct bpf_sock_ops_kern,      \
				      temp));				      \
		}							      \
	} while (0)

#define SOCK_OPS_GET_SK()							      \
	do {								      \
		int fullsock_reg =
si->dst_reg, reg = BPF_REG_9, jmp = 1; \ if (si->dst_reg == reg || si->src_reg == reg) \ reg--; \ if (si->dst_reg == reg || si->src_reg == reg) \ reg--; \ if (si->dst_reg == si->src_reg) { \ *insn++ = BPF_STX_MEM(BPF_DW, si->src_reg, reg, \ offsetof(struct bpf_sock_ops_kern, \ temp)); \ fullsock_reg = reg; \ jmp += 2; \ } \ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \ struct bpf_sock_ops_kern, \ is_fullsock), \ fullsock_reg, si->src_reg, \ offsetof(struct bpf_sock_ops_kern, \ is_fullsock)); \ *insn++ = BPF_JMP_IMM(BPF_JEQ, fullsock_reg, 0, jmp); \ if (si->dst_reg == si->src_reg) \ *insn++ = BPF_LDX_MEM(BPF_DW, reg, si->src_reg, \ offsetof(struct bpf_sock_ops_kern, \ temp)); \ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \ struct bpf_sock_ops_kern, sk),\ si->dst_reg, si->src_reg, \ offsetof(struct bpf_sock_ops_kern, sk));\ if (si->dst_reg == si->src_reg) { \ *insn++ = BPF_JMP_A(1); \ *insn++ = BPF_LDX_MEM(BPF_DW, reg, si->src_reg, \ offsetof(struct bpf_sock_ops_kern, \ temp)); \ } \ } while (0) #define SOCK_OPS_GET_TCP_SOCK_FIELD(FIELD) \ SOCK_OPS_GET_FIELD(FIELD, FIELD, struct tcp_sock) /* Helper macro for adding write access to tcp_sock or sock fields. * The macro is called with two registers, dst_reg which contains a pointer * to ctx (context) and src_reg which contains the value that should be * stored. However, we need an additional register since we cannot overwrite * dst_reg because it may be used later in the program. * Instead we "borrow" one of the other register. We first save its value * into a new (temp) field in bpf_sock_ops_kern, use it, and then restore * it at the end of the macro. 
*/ #define SOCK_OPS_SET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ) \ do { \ int reg = BPF_REG_9; \ BUILD_BUG_ON(sizeof_field(OBJ, OBJ_FIELD) > \ sizeof_field(struct bpf_sock_ops, BPF_FIELD)); \ if (si->dst_reg == reg || si->src_reg == reg) \ reg--; \ if (si->dst_reg == reg || si->src_reg == reg) \ reg--; \ *insn++ = BPF_STX_MEM(BPF_DW, si->dst_reg, reg, \ offsetof(struct bpf_sock_ops_kern, \ temp)); \ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \ struct bpf_sock_ops_kern, \ is_fullsock), \ reg, si->dst_reg, \ offsetof(struct bpf_sock_ops_kern, \ is_fullsock)); \ *insn++ = BPF_JMP_IMM(BPF_JEQ, reg, 0, 2); \ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \ struct bpf_sock_ops_kern, sk),\ reg, si->dst_reg, \ offsetof(struct bpf_sock_ops_kern, sk));\ *insn++ = BPF_RAW_INSN(BPF_FIELD_SIZEOF(OBJ, OBJ_FIELD) | \ BPF_MEM | BPF_CLASS(si->code), \ reg, si->src_reg, \ offsetof(OBJ, OBJ_FIELD), \ si->imm); \ *insn++ = BPF_LDX_MEM(BPF_DW, reg, si->dst_reg, \ offsetof(struct bpf_sock_ops_kern, \ temp)); \ } while (0) #define SOCK_OPS_GET_OR_SET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ, TYPE) \ do { \ if (TYPE == BPF_WRITE) \ SOCK_OPS_SET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ); \ else \ SOCK_OPS_GET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ); \ } while (0) switch (si->off) { case offsetof(struct bpf_sock_ops, op): *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sock_ops_kern, op), si->dst_reg, si->src_reg, offsetof(struct bpf_sock_ops_kern, op)); break; case offsetof(struct bpf_sock_ops, replylong[0]) ... 
offsetof(struct bpf_sock_ops, replylong[3]): BUILD_BUG_ON(sizeof_field(struct bpf_sock_ops, reply) != sizeof_field(struct bpf_sock_ops_kern, reply)); BUILD_BUG_ON(sizeof_field(struct bpf_sock_ops, replylong) != sizeof_field(struct bpf_sock_ops_kern, replylong)); off = si->off; off -= offsetof(struct bpf_sock_ops, replylong[0]); off += offsetof(struct bpf_sock_ops_kern, replylong[0]); if (type == BPF_WRITE) *insn++ = BPF_EMIT_STORE(BPF_W, si, off); else *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, off); break; case offsetof(struct bpf_sock_ops, family): BUILD_BUG_ON(sizeof_field(struct sock_common, skc_family) != 2); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( struct bpf_sock_ops_kern, sk), si->dst_reg, si->src_reg, offsetof(struct bpf_sock_ops_kern, sk)); *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg, offsetof(struct sock_common, skc_family)); break; case offsetof(struct bpf_sock_ops, remote_ip4): BUILD_BUG_ON(sizeof_field(struct sock_common, skc_daddr) != 4); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( struct bpf_sock_ops_kern, sk), si->dst_reg, si->src_reg, offsetof(struct bpf_sock_ops_kern, sk)); *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, offsetof(struct sock_common, skc_daddr)); break; case offsetof(struct bpf_sock_ops, local_ip4): BUILD_BUG_ON(sizeof_field(struct sock_common, skc_rcv_saddr) != 4); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( struct bpf_sock_ops_kern, sk), si->dst_reg, si->src_reg, offsetof(struct bpf_sock_ops_kern, sk)); *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, offsetof(struct sock_common, skc_rcv_saddr)); break; case offsetof(struct bpf_sock_ops, remote_ip6[0]) ... 
offsetof(struct bpf_sock_ops, remote_ip6[3]): #if IS_ENABLED(CONFIG_IPV6) BUILD_BUG_ON(sizeof_field(struct sock_common, skc_v6_daddr.s6_addr32[0]) != 4); off = si->off; off -= offsetof(struct bpf_sock_ops, remote_ip6[0]); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( struct bpf_sock_ops_kern, sk), si->dst_reg, si->src_reg, offsetof(struct bpf_sock_ops_kern, sk)); *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, offsetof(struct sock_common, skc_v6_daddr.s6_addr32[0]) + off); #else *insn++ = BPF_MOV32_IMM(si->dst_reg, 0); #endif break; case offsetof(struct bpf_sock_ops, local_ip6[0]) ... offsetof(struct bpf_sock_ops, local_ip6[3]): #if IS_ENABLED(CONFIG_IPV6) BUILD_BUG_ON(sizeof_field(struct sock_common, skc_v6_rcv_saddr.s6_addr32[0]) != 4); off = si->off; off -= offsetof(struct bpf_sock_ops, local_ip6[0]); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( struct bpf_sock_ops_kern, sk), si->dst_reg, si->src_reg, offsetof(struct bpf_sock_ops_kern, sk)); *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, offsetof(struct sock_common, skc_v6_rcv_saddr.s6_addr32[0]) + off); #else *insn++ = BPF_MOV32_IMM(si->dst_reg, 0); #endif break; case offsetof(struct bpf_sock_ops, remote_port): BUILD_BUG_ON(sizeof_field(struct sock_common, skc_dport) != 2); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( struct bpf_sock_ops_kern, sk), si->dst_reg, si->src_reg, offsetof(struct bpf_sock_ops_kern, sk)); *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg, offsetof(struct sock_common, skc_dport)); #ifndef __BIG_ENDIAN_BITFIELD *insn++ = BPF_ALU32_IMM(BPF_LSH, si->dst_reg, 16); #endif break; case offsetof(struct bpf_sock_ops, local_port): BUILD_BUG_ON(sizeof_field(struct sock_common, skc_num) != 2); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( struct bpf_sock_ops_kern, sk), si->dst_reg, si->src_reg, offsetof(struct bpf_sock_ops_kern, sk)); *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg, offsetof(struct sock_common, skc_num)); break; case offsetof(struct bpf_sock_ops, is_fullsock): *insn++ = 
BPF_LDX_MEM(BPF_FIELD_SIZEOF( struct bpf_sock_ops_kern, is_fullsock), si->dst_reg, si->src_reg, offsetof(struct bpf_sock_ops_kern, is_fullsock)); break; case offsetof(struct bpf_sock_ops, state): BUILD_BUG_ON(sizeof_field(struct sock_common, skc_state) != 1); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( struct bpf_sock_ops_kern, sk), si->dst_reg, si->src_reg, offsetof(struct bpf_sock_ops_kern, sk)); *insn++ = BPF_LDX_MEM(BPF_B, si->dst_reg, si->dst_reg, offsetof(struct sock_common, skc_state)); break; case offsetof(struct bpf_sock_ops, rtt_min): BUILD_BUG_ON(sizeof_field(struct tcp_sock, rtt_min) != sizeof(struct minmax)); BUILD_BUG_ON(sizeof(struct minmax) < sizeof(struct minmax_sample)); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( struct bpf_sock_ops_kern, sk), si->dst_reg, si->src_reg, offsetof(struct bpf_sock_ops_kern, sk)); *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, offsetof(struct tcp_sock, rtt_min) + sizeof_field(struct minmax_sample, t)); break; case offsetof(struct bpf_sock_ops, bpf_sock_ops_cb_flags): SOCK_OPS_GET_FIELD(bpf_sock_ops_cb_flags, bpf_sock_ops_cb_flags, struct tcp_sock); break; case offsetof(struct bpf_sock_ops, sk_txhash): SOCK_OPS_GET_OR_SET_FIELD(sk_txhash, sk_txhash, struct sock, type); break; case offsetof(struct bpf_sock_ops, snd_cwnd): SOCK_OPS_GET_TCP_SOCK_FIELD(snd_cwnd); break; case offsetof(struct bpf_sock_ops, srtt_us): SOCK_OPS_GET_TCP_SOCK_FIELD(srtt_us); break; case offsetof(struct bpf_sock_ops, snd_ssthresh): SOCK_OPS_GET_TCP_SOCK_FIELD(snd_ssthresh); break; case offsetof(struct bpf_sock_ops, rcv_nxt): SOCK_OPS_GET_TCP_SOCK_FIELD(rcv_nxt); break; case offsetof(struct bpf_sock_ops, snd_nxt): SOCK_OPS_GET_TCP_SOCK_FIELD(snd_nxt); break; case offsetof(struct bpf_sock_ops, snd_una): SOCK_OPS_GET_TCP_SOCK_FIELD(snd_una); break; case offsetof(struct bpf_sock_ops, mss_cache): SOCK_OPS_GET_TCP_SOCK_FIELD(mss_cache); break; case offsetof(struct bpf_sock_ops, ecn_flags): SOCK_OPS_GET_TCP_SOCK_FIELD(ecn_flags); break; case 
offsetof(struct bpf_sock_ops, rate_delivered): SOCK_OPS_GET_TCP_SOCK_FIELD(rate_delivered); break; case offsetof(struct bpf_sock_ops, rate_interval_us): SOCK_OPS_GET_TCP_SOCK_FIELD(rate_interval_us); break; case offsetof(struct bpf_sock_ops, packets_out): SOCK_OPS_GET_TCP_SOCK_FIELD(packets_out); break; case offsetof(struct bpf_sock_ops, retrans_out): SOCK_OPS_GET_TCP_SOCK_FIELD(retrans_out); break; case offsetof(struct bpf_sock_ops, total_retrans): SOCK_OPS_GET_TCP_SOCK_FIELD(total_retrans); break; case offsetof(struct bpf_sock_ops, segs_in): SOCK_OPS_GET_TCP_SOCK_FIELD(segs_in); break; case offsetof(struct bpf_sock_ops, data_segs_in): SOCK_OPS_GET_TCP_SOCK_FIELD(data_segs_in); break; case offsetof(struct bpf_sock_ops, segs_out): SOCK_OPS_GET_TCP_SOCK_FIELD(segs_out); break; case offsetof(struct bpf_sock_ops, data_segs_out): SOCK_OPS_GET_TCP_SOCK_FIELD(data_segs_out); break; case offsetof(struct bpf_sock_ops, lost_out): SOCK_OPS_GET_TCP_SOCK_FIELD(lost_out); break; case offsetof(struct bpf_sock_ops, sacked_out): SOCK_OPS_GET_TCP_SOCK_FIELD(sacked_out); break; case offsetof(struct bpf_sock_ops, bytes_received): SOCK_OPS_GET_TCP_SOCK_FIELD(bytes_received); break; case offsetof(struct bpf_sock_ops, bytes_acked): SOCK_OPS_GET_TCP_SOCK_FIELD(bytes_acked); break; case offsetof(struct bpf_sock_ops, sk): SOCK_OPS_GET_SK(); break; case offsetof(struct bpf_sock_ops, skb_data_end): *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sock_ops_kern, skb_data_end), si->dst_reg, si->src_reg, offsetof(struct bpf_sock_ops_kern, skb_data_end)); break; case offsetof(struct bpf_sock_ops, skb_data): *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sock_ops_kern, skb), si->dst_reg, si->src_reg, offsetof(struct bpf_sock_ops_kern, skb)); *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, data), si->dst_reg, si->dst_reg, offsetof(struct sk_buff, data)); break; case offsetof(struct bpf_sock_ops, skb_len): *insn++ = 
BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sock_ops_kern, skb),
				      si->dst_reg, si->src_reg,
				      offsetof(struct bpf_sock_ops_kern, skb));
		*insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1);
		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, len),
				      si->dst_reg, si->dst_reg,
				      offsetof(struct sk_buff, len));
		break;
	case offsetof(struct bpf_sock_ops, skb_tcp_flags):
		off = offsetof(struct sk_buff, cb);
		off += offsetof(struct tcp_skb_cb, tcp_flags);
		*target_size = sizeof_field(struct tcp_skb_cb, tcp_flags);
		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sock_ops_kern, skb),
				      si->dst_reg, si->src_reg,
				      offsetof(struct bpf_sock_ops_kern, skb));
		*insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1);
		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct tcp_skb_cb, tcp_flags),
				      si->dst_reg, si->dst_reg, off);
		break;
	case offsetof(struct bpf_sock_ops, skb_hwtstamp): {
		struct bpf_insn *jmp_on_null_skb;

		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sock_ops_kern, skb),
				      si->dst_reg, si->src_reg,
				      offsetof(struct bpf_sock_ops_kern, skb));
		/* Reserve one insn to test skb == NULL */
		jmp_on_null_skb = insn++;
		insn = bpf_convert_shinfo_access(si->dst_reg, si->dst_reg, insn);
		*insn++ = BPF_LDX_MEM(BPF_DW, si->dst_reg, si->dst_reg,
				      bpf_target_off(struct skb_shared_info,
						     hwtstamps, 8,
						     target_size));
		*jmp_on_null_skb = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0,
					       insn - jmp_on_null_skb - 1);
		break;
	}
	}
	return insn - insn_buf;
}

/* data_end = skb->data + skb_headlen()
 *
 * Emit the instructions that compute the end of the linear skb data as
 * skb->data + skb->len - skb->data_len and leave the result in si->dst_reg.
 * The skb pointer is taken from si->src_reg and BPF_REG_AX is used as a
 * temporary.
 *
 * When si->src_reg and si->dst_reg are the same register, one more working
 * register is needed: it is picked starting from BPF_REG_9, its value is
 * stored into the temp_reg slot of struct sk_skb_cb (inside skb->cb) and
 * loaded back from there once the result has been moved to si->dst_reg.
 *
 * @si:   the context-access instruction being rewritten
 * @insn: where to write the first emitted instruction
 *
 * Return: pointer just past the last emitted instruction.
 */
static struct bpf_insn *bpf_convert_data_end_access(const struct bpf_insn *si,
						    struct bpf_insn *insn)
{
	int reg;
	/* Offset, relative to the skb, of the spill slot for the extra register */
	int temp_reg_off = offsetof(struct sk_buff, cb) +
			   offsetof(struct sk_skb_cb, temp_reg);

	if (si->src_reg == si->dst_reg) {
		/* We need an extra register, choose and save a register. */
		reg = BPF_REG_9;
		if (si->src_reg == reg || si->dst_reg == reg)
			reg--;
		if (si->src_reg == reg || si->dst_reg == reg)
			reg--;
		*insn++ = BPF_STX_MEM(BPF_DW, si->src_reg, reg, temp_reg_off);
	} else {
		/* dst_reg is free to be clobbered, so compute directly into it */
		reg = si->dst_reg;
	}

	/* reg = skb->data */
	*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, data),
			      reg, si->src_reg,
			      offsetof(struct sk_buff, data));
	/* AX = skb->len */
	*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, len),
			      BPF_REG_AX, si->src_reg,
			      offsetof(struct sk_buff, len));
	/* reg = skb->data + skb->len */
	*insn++ = BPF_ALU64_REG(BPF_ADD, reg, BPF_REG_AX);
	/* AX = skb->data_len */
	*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, data_len),
			      BPF_REG_AX, si->src_reg,
			      offsetof(struct sk_buff, data_len));

	/* reg = skb->data + skb->len - skb->data_len */
	*insn++ = BPF_ALU64_REG(BPF_SUB, reg, BPF_REG_AX);

	if (si->src_reg == si->dst_reg) {
		/* Restore the saved register */
		/* Keep the skb pointer in AX: dst_reg (== src_reg) is overwritten next */
		*insn++ = BPF_MOV64_REG(BPF_REG_AX, si->src_reg);
		*insn++ = BPF_MOV64_REG(si->dst_reg, reg);
		*insn++ = BPF_LDX_MEM(BPF_DW, reg, BPF_REG_AX, temp_reg_off);
	}

	return insn;
}

/* Rewrite one context access of an sk_skb program.
 *
 * data_end is computed by bpf_convert_data_end_access(), accesses to the
 * __sk_buff cb[] range are redirected to the data area of struct sk_skb_cb,
 * and every other offset is handed to bpf_convert_ctx_access().
 *
 * Return: number of instructions written to @insn_buf (or whatever
 * bpf_convert_ctx_access() returns for the delegated offsets).
 */
static u32 sk_skb_convert_ctx_access(enum bpf_access_type type,
				     const struct bpf_insn *si,
				     struct bpf_insn *insn_buf,
				     struct bpf_prog *prog, u32 *target_size)
{
	struct bpf_insn *insn = insn_buf;
	int off;

	switch (si->off) {
	case offsetof(struct __sk_buff, data_end):
		insn = bpf_convert_data_end_access(si, insn);
		break;
	case offsetof(struct __sk_buff, cb[0]) ...
offsetofend(struct __sk_buff, cb[4]) - 1:
		/* Build-time checks: sk_skb_cb.data holds at least 20 bytes and
		 * starts at a __u64-aligned offset inside struct sk_buff.
		 */
		BUILD_BUG_ON(sizeof_field(struct sk_skb_cb, data) < 20);
		BUILD_BUG_ON((offsetof(struct sk_buff, cb) +
			      offsetof(struct sk_skb_cb, data)) %
			     sizeof(__u64));

		prog->cb_access = 1;
		/* Translate the __sk_buff cb[] offset into the matching offset
		 * of sk_skb_cb.data within struct sk_buff.
		 */
		off  = si->off;
		off -= offsetof(struct __sk_buff, cb[0]);
		off += offsetof(struct sk_buff, cb);
		off += offsetof(struct sk_skb_cb, data);
		if (type == BPF_WRITE)
			*insn++ = BPF_EMIT_STORE(BPF_SIZE(si->code), si, off);
		else
			*insn++ = BPF_LDX_MEM(BPF_SIZE(si->code), si->dst_reg,
					      si->src_reg, off);
		break;


	default:
		return bpf_convert_ctx_access(type, si, insn_buf, prog,
					      target_size);
	}

	return insn - insn_buf;
}

/* Rewrite one context access of an sk_msg program (struct sk_msg_md) into
 * loads from the kernel-side struct sk_msg, or from the socket reached
 * through its sk pointer.
 *
 * Only loads are emitted here; @type, @prog and @target_size are not used.
 * Offsets that match no case emit nothing.
 *
 * Return: number of instructions written to @insn_buf.
 */
static u32 sk_msg_convert_ctx_access(enum bpf_access_type type,
				     const struct bpf_insn *si,
				     struct bpf_insn *insn_buf,
				     struct bpf_prog *prog, u32 *target_size)
{
	struct bpf_insn *insn = insn_buf;
#if IS_ENABLED(CONFIG_IPV6)
	int off;
#endif

	/* convert ctx uses the fact sg element is first in struct */
	BUILD_BUG_ON(offsetof(struct sk_msg, sg) != 0);

	switch (si->off) {
	case offsetof(struct sk_msg_md, data):
		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_msg, data),
				      si->dst_reg, si->src_reg,
				      offsetof(struct sk_msg, data));
		break;
	case offsetof(struct sk_msg_md, data_end):
		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_msg, data_end),
				      si->dst_reg, si->src_reg,
				      offsetof(struct sk_msg, data_end));
		break;
	case offsetof(struct sk_msg_md, family):
		BUILD_BUG_ON(sizeof_field(struct sock_common, skc_family) != 2);

		/* dst = msg->sk, then dst = the 16-bit skc_family of that socket */
		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
					      struct sk_msg, sk),
				      si->dst_reg, si->src_reg,
				      offsetof(struct sk_msg, sk));
		*insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg,
				      offsetof(struct sock_common, skc_family));
		break;

	case offsetof(struct sk_msg_md, remote_ip4):
		BUILD_BUG_ON(sizeof_field(struct sock_common, skc_daddr) != 4);

		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
						struct sk_msg, sk),
				      si->dst_reg, si->src_reg,
				      offsetof(struct sk_msg, sk));
		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
				      offsetof(struct sock_common, skc_daddr));
		break;

	case offsetof(struct sk_msg_md, local_ip4):
		BUILD_BUG_ON(sizeof_field(struct sock_common,
					  skc_rcv_saddr) != 4);

		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
					      struct sk_msg, sk),
				      si->dst_reg, si->src_reg,
				      offsetof(struct sk_msg, sk));
		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
				      offsetof(struct sock_common,
					       skc_rcv_saddr));
		break;

	case offsetof(struct sk_msg_md, remote_ip6[0]) ...
	     offsetof(struct sk_msg_md, remote_ip6[3]):
#if IS_ENABLED(CONFIG_IPV6)
		BUILD_BUG_ON(sizeof_field(struct sock_common,
					  skc_v6_daddr.s6_addr32[0]) != 4);

		/* Byte offset of the accessed word within remote_ip6[] */
		off = si->off;
		off -= offsetof(struct sk_msg_md, remote_ip6[0]);
		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
						struct sk_msg, sk),
				      si->dst_reg, si->src_reg,
				      offsetof(struct sk_msg, sk));
		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
				      offsetof(struct sock_common,
					       skc_v6_daddr.s6_addr32[0]) +
				      off);
#else
		/* Without IPv6 support the field reads as 0 */
		*insn++ = BPF_MOV32_IMM(si->dst_reg, 0);
#endif
		break;

	case offsetof(struct sk_msg_md, local_ip6[0]) ...
	     offsetof(struct sk_msg_md, local_ip6[3]):
#if IS_ENABLED(CONFIG_IPV6)
		BUILD_BUG_ON(sizeof_field(struct sock_common,
					  skc_v6_rcv_saddr.s6_addr32[0]) != 4);

		/* Byte offset of the accessed word within local_ip6[] */
		off = si->off;
		off -= offsetof(struct sk_msg_md, local_ip6[0]);
		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
						struct sk_msg, sk),
				      si->dst_reg, si->src_reg,
				      offsetof(struct sk_msg, sk));
		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
				      offsetof(struct sock_common,
					       skc_v6_rcv_saddr.s6_addr32[0]) +
				      off);
#else
		/* Without IPv6 support the field reads as 0 */
		*insn++ = BPF_MOV32_IMM(si->dst_reg, 0);
#endif
		break;

	case offsetof(struct sk_msg_md, remote_port):
		BUILD_BUG_ON(sizeof_field(struct sock_common, skc_dport) != 2);

		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
						struct sk_msg, sk),
				      si->dst_reg, si->src_reg,
				      offsetof(struct sk_msg, sk));
		*insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg,
				      offsetof(struct sock_common, skc_dport));
		/* Unless __BIG_ENDIAN_BITFIELD is defined, move the 16-bit
		 * value into the upper half of the 32-bit result.
		 */
#ifndef __BIG_ENDIAN_BITFIELD
		*insn++ = BPF_ALU32_IMM(BPF_LSH, si->dst_reg, 16);
#endif
		break;

	case offsetof(struct sk_msg_md, local_port):
		BUILD_BUG_ON(sizeof_field(struct sock_common, skc_num) != 2);

		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
						struct sk_msg, sk),
				      si->dst_reg, si->src_reg,
				      offsetof(struct sk_msg, sk));
		*insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg,
				      offsetof(struct sock_common, skc_num));
		break;

	case offsetof(struct sk_msg_md, size):
		/* The sk_msg_sg offset is applied to the sk_msg pointer; this
		 * relies on sg being the first member (checked above).
		 */
		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_msg_sg, size),
				      si->dst_reg, si->src_reg,
				      offsetof(struct sk_msg_sg, size));
		break;

	case offsetof(struct sk_msg_md, sk):
		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_msg, sk),
				      si->dst_reg, si->src_reg,
				      offsetof(struct sk_msg, sk));
		break;
	}

	return insn - insn_buf;
}

/* Verifier and program ops tables: each pair below binds one family of
 * program (named by the table prefix) to its helper lookup, context access
 * validation / rewriting callbacks and, where set, its test_run hook.
 */
const struct bpf_verifier_ops sk_filter_verifier_ops = {
	.get_func_proto		= sk_filter_func_proto,
	.is_valid_access	= sk_filter_is_valid_access,
	.convert_ctx_access	= bpf_convert_ctx_access,
	.gen_ld_abs		= bpf_gen_ld_abs,
};

const struct bpf_prog_ops sk_filter_prog_ops = {
	.test_run		= bpf_prog_test_run_skb,
};

const struct bpf_verifier_ops tc_cls_act_verifier_ops = {
	.get_func_proto		= tc_cls_act_func_proto,
	.is_valid_access	= tc_cls_act_is_valid_access,
	.convert_ctx_access	= tc_cls_act_convert_ctx_access,
	.gen_prologue		= tc_cls_act_prologue,
	.gen_ld_abs		= bpf_gen_ld_abs,
	.btf_struct_access	= tc_cls_act_btf_struct_access,
};

const struct bpf_prog_ops tc_cls_act_prog_ops = {
	.test_run		= bpf_prog_test_run_skb,
};

const struct bpf_verifier_ops xdp_verifier_ops = {
	.get_func_proto		= xdp_func_proto,
	.is_valid_access	= xdp_is_valid_access,
	.convert_ctx_access	= xdp_convert_ctx_access,
	.gen_prologue		= bpf_noop_prologue,
	.btf_struct_access	= xdp_btf_struct_access,
};

const struct bpf_prog_ops xdp_prog_ops = {
	.test_run		= bpf_prog_test_run_xdp,
};

const struct bpf_verifier_ops cg_skb_verifier_ops = {
	.get_func_proto		= cg_skb_func_proto,
	.is_valid_access	= cg_skb_is_valid_access,
	.convert_ctx_access	= bpf_convert_ctx_access,
};

const struct bpf_prog_ops cg_skb_prog_ops = {
	.test_run		= bpf_prog_test_run_skb,
};

/* The lwt_* tables share lwt_is_valid_access and bpf_convert_ctx_access */
const struct bpf_verifier_ops lwt_in_verifier_ops = {
	.get_func_proto		= lwt_in_func_proto,
	.is_valid_access	=
lwt_is_valid_access,
	.convert_ctx_access	= bpf_convert_ctx_access,
};

const struct bpf_prog_ops lwt_in_prog_ops = {
	.test_run		= bpf_prog_test_run_skb,
};

const struct bpf_verifier_ops lwt_out_verifier_ops = {
	.get_func_proto		= lwt_out_func_proto,
	.is_valid_access	= lwt_is_valid_access,
	.convert_ctx_access	= bpf_convert_ctx_access,
};

const struct bpf_prog_ops lwt_out_prog_ops = {
	.test_run		= bpf_prog_test_run_skb,
};

/* lwt_xmit is the only lwt table with a prologue; it reuses
 * tc_cls_act_prologue.
 */
const struct bpf_verifier_ops lwt_xmit_verifier_ops = {
	.get_func_proto		= lwt_xmit_func_proto,
	.is_valid_access	= lwt_is_valid_access,
	.convert_ctx_access	= bpf_convert_ctx_access,
	.gen_prologue		= tc_cls_act_prologue,
};

const struct bpf_prog_ops lwt_xmit_prog_ops = {
	.test_run		= bpf_prog_test_run_skb,
};

const struct bpf_verifier_ops lwt_seg6local_verifier_ops = {
	.get_func_proto		= lwt_seg6local_func_proto,
	.is_valid_access	= lwt_is_valid_access,
	.convert_ctx_access	= bpf_convert_ctx_access,
};

/* Empty: no test_run hook is provided for this program type */
const struct bpf_prog_ops lwt_seg6local_prog_ops = {
};

const struct bpf_verifier_ops cg_sock_verifier_ops = {
	.get_func_proto		= sock_filter_func_proto,
	.is_valid_access	= sock_filter_is_valid_access,
	.convert_ctx_access	= bpf_sock_convert_ctx_access,
};

/* Empty: no test_run hook is provided for this program type */
const struct bpf_prog_ops cg_sock_prog_ops = {
};

const struct bpf_verifier_ops cg_sock_addr_verifier_ops = {
	.get_func_proto		= sock_addr_func_proto,
	.is_valid_access	= sock_addr_is_valid_access,
	.convert_ctx_access	= sock_addr_convert_ctx_access,
};

/* Empty: no test_run hook is provided for this program type */
const struct bpf_prog_ops cg_sock_addr_prog_ops = {
};

const struct bpf_verifier_ops sock_ops_verifier_ops = {
	.get_func_proto		= sock_ops_func_proto,
	.is_valid_access	= sock_ops_is_valid_access,
	.convert_ctx_access	= sock_ops_convert_ctx_access,
};

/* Empty: no test_run hook is provided for this program type */
const struct bpf_prog_ops sock_ops_prog_ops = {
};

const struct bpf_verifier_ops sk_skb_verifier_ops = {
	.get_func_proto		= sk_skb_func_proto,
	.is_valid_access	= sk_skb_is_valid_access,
	.convert_ctx_access	= sk_skb_convert_ctx_access,
	.gen_prologue		= sk_skb_prologue,
};

/* Empty: no test_run hook is provided for this program type */
const struct bpf_prog_ops sk_skb_prog_ops = {
};
/* Verifier callbacks for sk_msg programs; bpf_noop_prologue is installed as
 * the gen_prologue hook.
 */
const struct bpf_verifier_ops sk_msg_verifier_ops = {
	.get_func_proto		= sk_msg_func_proto,
	.is_valid_access	= sk_msg_is_valid_access,
	.convert_ctx_access	= sk_msg_convert_ctx_access,
	.gen_prologue		= bpf_noop_prologue,
};

/* Empty: no test_run hook is provided for this program type */
const struct bpf_prog_ops sk_msg_prog_ops = {
};

const struct bpf_verifier_ops flow_dissector_verifier_ops = {
	.get_func_proto		= flow_dissector_func_proto,
	.is_valid_access	= flow_dissector_is_valid_access,
	.convert_ctx_access	= flow_dissector_convert_ctx_access,
};

const struct bpf_prog_ops flow_dissector_prog_ops = {
	.test_run		= bpf_prog_test_run_flow_dissector,
};

/* Detach the filter attached to @sk, if any.
 *
 * The filter pointer is read with rcu_dereference_protected() under
 * lockdep_sock_is_held(), i.e. the caller is expected to hold the socket
 * lock.
 *
 * Return: 0 when a filter was detached and uncharged, -ENOENT when no filter
 * is attached, -EPERM when the socket has SOCK_FILTER_LOCKED set.
 */
int sk_detach_filter(struct sock *sk)
{
	int ret = -ENOENT;
	struct sk_filter *filter;

	if (sock_flag(sk, SOCK_FILTER_LOCKED))
		return -EPERM;

	filter = rcu_dereference_protected(sk->sk_filter,
					   lockdep_sock_is_held(sk));
	if (filter) {
		/* Unpublish the filter before dropping the socket's charge on it */
		RCU_INIT_POINTER(sk->sk_filter, NULL);
		sk_filter_uncharge(sk, filter);
		ret = 0;
	}

	return ret;
}
EXPORT_SYMBOL_GPL(sk_detach_filter);

/* Copy the originally attached classic filter of @sk to @optval.
 *
 * @sk:     socket to query; it is locked for the duration of the call
 * @optval: destination for the filter blocks
 * @len:    capacity of @optval in filter blocks; 0 only queries the count
 *
 * Return: the number of filter blocks on success or when @len is 0, 0 when
 * no filter is attached, -EACCES when the attached program has no original
 * program, -EINVAL when @len is smaller than the filter length, -EFAULT when
 * the copy to @optval fails.
 */
int sk_get_filter(struct sock *sk, sockptr_t optval, unsigned int len)
{
	struct sock_fprog_kern *fprog;
	struct sk_filter *filter;
	int ret = 0;

	sockopt_lock_sock(sk);
	filter = rcu_dereference_protected(sk->sk_filter,
					   lockdep_sock_is_held(sk));
	if (!filter)
		goto out;

	/* We're copying the filter that has been originally attached,
	 * so no conversion/decode needed anymore. eBPF programs that
	 * have no original program cannot be dumped through this.
	 */
	ret = -EACCES;
	fprog = filter->prog->orig_prog;
	if (!fprog)
		goto out;

	ret = fprog->len;
	if (!len)
		/* User space only enquires number of filter blocks. */
		goto out;

	ret = -EINVAL;
	if (len < fprog->len)
		goto out;

	ret = -EFAULT;
	if (copy_to_sockptr(optval, fprog->filter, bpf_classic_proglen(fprog)))
		goto out;

	/* Instead of bytes, the API requests to return the number
	 * of filter blocks.
	 */
	ret = fprog->len;
out:
	sockopt_release_sock(sk);
	return ret;
}

#ifdef CONFIG_INET
/* Fill in the program context handed to a reuseport BPF program.
 *
 * @skb is dereferenced unconditionally to compute data_end, so it must not
 * be NULL. selected_sk starts out as NULL.
 */
static void bpf_init_reuseport_kern(struct sk_reuseport_kern *reuse_kern,
				    struct sock_reuseport *reuse,
				    struct sock *sk, struct sk_buff *skb,
				    struct sock *migrating_sk,
				    u32 hash)
{
	reuse_kern->skb = skb;
	reuse_kern->sk = sk;
	reuse_kern->selected_sk = NULL;
	reuse_kern->migrating_sk = migrating_sk;
	/* End of the linear part of the skb */
	reuse_kern->data_end = skb->data + skb_headlen(skb);
	reuse_kern->hash = hash;
	reuse_kern->reuseport_id = reuse->reuseport_id;
	reuse_kern->bind_inany = reuse->bind_inany;
}

/* Run a reuseport BPF program for @skb.
 *
 * Return: the socket recorded in the context's selected_sk when the program
 * returns SK_PASS (NULL if the program selected none), otherwise
 * ERR_PTR(-ECONNREFUSED).
 */
struct sock *bpf_run_sk_reuseport(struct sock_reuseport *reuse, struct sock *sk,
				  struct bpf_prog *prog, struct sk_buff *skb,
				  struct sock *migrating_sk,
				  u32 hash)
{
	struct sk_reuseport_kern reuse_kern;
	enum sk_action action;

	bpf_init_reuseport_kern(&reuse_kern, reuse, sk, skb, migrating_sk, hash);
	action = bpf_prog_run(prog, &reuse_kern);

	if (action == SK_PASS)
		return reuse_kern.selected_sk;
	else
		return ERR_PTR(-ECONNREFUSED);
}

/* Helper: look up a socket in @map by @key and record it in the context as
 * the selected socket. @flags is not examined.
 *
 * Return: 0 on success; -ENOENT when the lookup finds nothing; -ENOENT or
 * -EINVAL when the socket has no reuseport group (depending on the map
 * type); -EPROTOTYPE, -EAFNOSUPPORT or -EBADFD when the socket belongs to a
 * different reuseport group than the context.
 */
BPF_CALL_4(sk_select_reuseport,
	   struct sk_reuseport_kern *, reuse_kern,
	   struct bpf_map *, map, void *, key, u32, flags)
{
	bool is_sockarray = map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY;
	struct sock_reuseport *reuse;
	struct sock *selected_sk;
	int err;

	selected_sk = map->ops->map_lookup_elem(map, key);
	if (!selected_sk)
		return -ENOENT;

	reuse = rcu_dereference(selected_sk->sk_reuseport_cb);
	if (!reuse) {
		/* reuseport_array has only sk with non NULL sk_reuseport_cb.
		 * The only (!reuse) case here is - the sk has already been
		 * unhashed (e.g. by close()), so treat it as -ENOENT.
		 *
		 * Other maps (e.g. sock_map) do not provide this guarantee and
		 * the sk may never be in the reuseport group to begin with.
		 */
		err = is_sockarray ?
-ENOENT : -EINVAL;
		goto error;
	}

	/* The looked-up socket must be in the same reuseport group as the
	 * context; otherwise pick the most specific error available.
	 */
	if (unlikely(reuse->reuseport_id != reuse_kern->reuseport_id)) {
		struct sock *sk = reuse_kern->sk;

		if (sk->sk_protocol != selected_sk->sk_protocol) {
			err = -EPROTOTYPE;
		} else if (sk->sk_family != selected_sk->sk_family) {
			err = -EAFNOSUPPORT;
		} else {
			/* Catch all. Likely bound to a different sockaddr. */
			err = -EBADFD;
		}
		goto error;
	}

	reuse_kern->selected_sk = selected_sk;

	return 0;
error:
	/* Lookup in sock_map can return TCP ESTABLISHED sockets. */
	/* Drop the reference only for sockets that are reference counted */
	if (sk_is_refcounted(selected_sk))
		sock_put(selected_sk);

	return err;
}

/* Argument description for sk_select_reuseport(): ctx, map, map key, flags */
static const struct bpf_func_proto sk_select_reuseport_proto = {
	.func           = sk_select_reuseport,
	.gpl_only       = false,
	.ret_type       = RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type      = ARG_CONST_MAP_PTR,
	.arg3_type      = ARG_PTR_TO_MAP_KEY,
	.arg4_type	= ARG_ANYTHING,
};

/* Helper: forwards to ____bpf_skb_load_bytes() on the skb stored in the
 * reuseport context, passing @offset, @to and @len through unchanged.
 */
BPF_CALL_4(sk_reuseport_load_bytes,
	   const struct sk_reuseport_kern *, reuse_kern, u32, offset,
	   void *, to, u32, len)
{
	return ____bpf_skb_load_bytes(reuse_kern->skb, offset, to, len);
}

/* Argument description for sk_reuseport_load_bytes(): ctx, offset,
 * destination buffer (may be uninitialized) and its size.
 */
static const struct bpf_func_proto sk_reuseport_load_bytes_proto = {
	.func		= sk_reuseport_load_bytes,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_ANYTHING,
	.arg3_type	= ARG_PTR_TO_UNINIT_MEM,
	.arg4_type	= ARG_CONST_SIZE,
};

/* Helper: forwards to ____bpf_skb_load_bytes_relative() on the skb stored in
 * the reuseport context, passing the remaining arguments through unchanged.
 */
BPF_CALL_5(sk_reuseport_load_bytes_relative,
	   const struct sk_reuseport_kern *, reuse_kern, u32, offset,
	   void *, to, u32, len, u32, start_header)
{
	return ____bpf_skb_load_bytes_relative(reuse_kern->skb, offset, to,
					       len, start_header);
}

/* Argument description for sk_reuseport_load_bytes_relative(): as for
 * sk_reuseport_load_bytes() plus the start_header selector.
 */
static const struct bpf_func_proto sk_reuseport_load_bytes_relative_proto = {
	.func		= sk_reuseport_load_bytes_relative,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_ANYTHING,
	.arg3_type	= ARG_PTR_TO_UNINIT_MEM,
	.arg4_type	= ARG_CONST_SIZE,
	.arg5_type	= ARG_ANYTHING,
};

/* Map a helper id to its prototype for reuseport programs; ids not listed
 * here are resolved by bpf_base_func_proto().
 */
static const struct bpf_func_proto *
sk_reuseport_func_proto(enum bpf_func_id func_id,
			const struct bpf_prog *prog)
{
	switch (func_id) {
	case
BPF_FUNC_sk_select_reuseport:
		return &sk_select_reuseport_proto;
	case BPF_FUNC_skb_load_bytes:
		return &sk_reuseport_load_bytes_proto;
	case BPF_FUNC_skb_load_bytes_relative:
		return &sk_reuseport_load_bytes_relative_proto;
	case BPF_FUNC_get_socket_cookie:
		return &bpf_get_socket_ptr_cookie_proto;
	case BPF_FUNC_ktime_get_coarse_ns:
		return &bpf_ktime_get_coarse_ns_proto;
	default:
		return bpf_base_func_proto(func_id, prog);
	}
}

/* Decide whether a reuseport program may access struct sk_reuseport_md at
 * @off with width @size.
 *
 * Only reads that lie inside the structure and whose offset is a multiple
 * of @size are considered. data, data_end, sk and migrating_sk must be read
 * as 8 bytes and set the resulting register type in @info; hash must be read
 * as 4 bytes; eth_protocol, ip_protocol, bind_inany and len may also be read
 * with narrower loads, subject to bpf_ctx_narrow_access_ok().
 *
 * Return: true when the access is allowed, false otherwise.
 */
static bool
sk_reuseport_is_valid_access(int off, int size,
			     enum bpf_access_type type,
			     const struct bpf_prog *prog,
			     struct bpf_insn_access_aux *info)
{
	const u32 size_default = sizeof(__u32);

	if (off < 0 || off >= sizeof(struct sk_reuseport_md) ||
	    off % size || type != BPF_READ)
		return false;

	switch (off) {
	case offsetof(struct sk_reuseport_md, data):
		info->reg_type = PTR_TO_PACKET;
		return size == sizeof(__u64);

	case offsetof(struct sk_reuseport_md, data_end):
		info->reg_type = PTR_TO_PACKET_END;
		return size == sizeof(__u64);

	case offsetof(struct sk_reuseport_md, hash):
		return size == size_default;

	case offsetof(struct sk_reuseport_md, sk):
		info->reg_type = PTR_TO_SOCKET;
		return size == sizeof(__u64);

	case offsetof(struct sk_reuseport_md, migrating_sk):
		info->reg_type = PTR_TO_SOCK_COMMON_OR_NULL;
		return size == sizeof(__u64);

	/* Fields that allow narrowing */
	case bpf_ctx_range(struct sk_reuseport_md, eth_protocol):
		/* Loads narrower than skb->protocol itself are rejected */
		if (size < sizeof_field(struct sk_buff, protocol))
			return false;
		fallthrough;
	case bpf_ctx_range(struct sk_reuseport_md, ip_protocol):
	case bpf_ctx_range(struct sk_reuseport_md, bind_inany):
	case bpf_ctx_range(struct sk_reuseport_md, len):
		bpf_ctx_record_field_size(info, size_default);
		return bpf_ctx_narrow_access_ok(off, size, size_default);

	default:
		return false;
	}
}

/* Emit one load of field F of struct sk_reuseport_kern into si->dst_reg.
 * Expects `insn`, `si` and `target_size` to be in scope at the use site.
 */
#define SK_REUSEPORT_LOAD_FIELD(F) ({					\
	*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_reuseport_kern, F), \
			      si->dst_reg, si->src_reg,			\
			      bpf_target_off(struct sk_reuseport_kern, F, \
					     sizeof_field(struct sk_reuseport_kern, F), \
					     target_size));		\
	})

/* Load a field of the skb referenced by the reuseport context */
#define SK_REUSEPORT_LOAD_SKB_FIELD(SKB_FIELD)				\
	SOCK_ADDR_LOAD_NESTED_FIELD(struct sk_reuseport_kern,		\
				    struct sk_buff,			\
				    skb,				\
				    SKB_FIELD)

/* Load a field of the socket referenced by the reuseport context */
#define SK_REUSEPORT_LOAD_SK_FIELD(SK_FIELD)				\
	SOCK_ADDR_LOAD_NESTED_FIELD(struct sk_reuseport_kern,		\
				    struct sock,			\
				    sk,					\
				    SK_FIELD)

/* Rewrite one context access of a reuseport program (struct sk_reuseport_md)
 * into loads from struct sk_reuseport_kern, its skb or its socket.
 *
 * data, len and eth_protocol come from the skb, ip_protocol from the socket,
 * and the remaining fields from the context itself. Offsets that match no
 * case emit nothing.
 *
 * Return: number of instructions written to @insn_buf.
 */
static u32 sk_reuseport_convert_ctx_access(enum bpf_access_type type,
					   const struct bpf_insn *si,
					   struct bpf_insn *insn_buf,
					   struct bpf_prog *prog,
					   u32 *target_size)
{
	struct bpf_insn *insn = insn_buf;

	switch (si->off) {
	case offsetof(struct sk_reuseport_md, data):
		SK_REUSEPORT_LOAD_SKB_FIELD(data);
		break;

	case offsetof(struct sk_reuseport_md, len):
		SK_REUSEPORT_LOAD_SKB_FIELD(len);
		break;

	case offsetof(struct sk_reuseport_md, eth_protocol):
		SK_REUSEPORT_LOAD_SKB_FIELD(protocol);
		break;

	case offsetof(struct sk_reuseport_md, ip_protocol):
		SK_REUSEPORT_LOAD_SK_FIELD(sk_protocol);
		break;

	case offsetof(struct sk_reuseport_md, data_end):
		SK_REUSEPORT_LOAD_FIELD(data_end);
		break;

	case offsetof(struct sk_reuseport_md, hash):
		SK_REUSEPORT_LOAD_FIELD(hash);
		break;

	case offsetof(struct sk_reuseport_md, bind_inany):
		SK_REUSEPORT_LOAD_FIELD(bind_inany);
		break;

	case offsetof(struct sk_reuseport_md, sk):
		SK_REUSEPORT_LOAD_FIELD(sk);
		break;

	case offsetof(struct sk_reuseport_md, migrating_sk):
		SK_REUSEPORT_LOAD_FIELD(migrating_sk);
		break;
	}

	return insn - insn_buf;
}

const struct bpf_verifier_ops sk_reuseport_verifier_ops = {
	.get_func_proto		= sk_reuseport_func_proto,
	.is_valid_access	= sk_reuseport_is_valid_access,
	.convert_ctx_access	= sk_reuseport_convert_ctx_access,
};

/* Empty: no test_run hook is provided for this program type */
const struct bpf_prog_ops sk_reuseport_prog_ops = {
};

/* Static key, initially false, exported for use outside this file */
DEFINE_STATIC_KEY_FALSE(bpf_sk_lookup_enabled);
EXPORT_SYMBOL(bpf_sk_lookup_enabled);

/* Helper: record @sk as the result of a socket lookup in @ctx.
 *
 * @sk may be NULL; every socket check below is skipped in that case and
 * NULL is stored as the selected socket.
 *
 * Return: 0 on success; -EINVAL for unknown bits in @flags;
 * -ESOCKTNOSUPPORT for a refcounted socket, a TCP socket not in TCP_LISTEN
 * or a UDP socket not in TCP_CLOSE; -EPROTOTYPE or -EAFNOSUPPORT when the
 * socket does not match the protocol or family in @ctx; -EEXIST when a
 * socket is already selected and BPF_SK_LOOKUP_F_REPLACE is not set.
 */
BPF_CALL_3(bpf_sk_lookup_assign, struct bpf_sk_lookup_kern *, ctx,
	   struct sock *, sk, u64, flags)
{
	if (unlikely(flags & ~(BPF_SK_LOOKUP_F_REPLACE |
			       BPF_SK_LOOKUP_F_NO_REUSEPORT)))
		return -EINVAL;
	if (unlikely(sk && sk_is_refcounted(sk)))
		return -ESOCKTNOSUPPORT; /* reject non-RCU freed sockets */
	if (unlikely(sk
&& sk_is_tcp(sk) && sk->sk_state != TCP_LISTEN))
		return -ESOCKTNOSUPPORT; /* only accept TCP socket in LISTEN */
	if (unlikely(sk && sk_is_udp(sk) && sk->sk_state != TCP_CLOSE))
		return -ESOCKTNOSUPPORT; /* only accept UDP socket in CLOSE */

	/* Check if socket is suitable for packet L3/L4 protocol */
	if (sk && sk->sk_protocol != ctx->protocol)
		return -EPROTOTYPE;
	/* A family mismatch is fatal only for AF_INET sockets and for sockets
	 * that ipv6_only_sock() reports as IPv6-only.
	 */
	if (sk && sk->sk_family != ctx->family &&
	    (sk->sk_family == AF_INET || ipv6_only_sock(sk)))
		return -EAFNOSUPPORT;

	if (ctx->selected_sk && !(flags & BPF_SK_LOOKUP_F_REPLACE))
		return -EEXIST;

	/* Select socket as lookup result */
	ctx->selected_sk = sk;
	ctx->no_reuseport = flags & BPF_SK_LOOKUP_F_NO_REUSEPORT;
	return 0;
}

/* Argument description for bpf_sk_lookup_assign(): ctx, socket (may be
 * NULL), flags.
 */
static const struct bpf_func_proto bpf_sk_lookup_assign_proto = {
	.func		= bpf_sk_lookup_assign,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_PTR_TO_SOCKET_OR_NULL,
	.arg3_type	= ARG_ANYTHING,
};

/* Map a helper id to its prototype for sk_lookup programs; ids not listed
 * here are resolved by bpf_sk_base_func_proto().
 */
static const struct bpf_func_proto *
sk_lookup_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
	switch (func_id) {
	case BPF_FUNC_perf_event_output:
		return &bpf_event_output_data_proto;
	case BPF_FUNC_sk_assign:
		return &bpf_sk_lookup_assign_proto;
	case BPF_FUNC_sk_release:
		return &bpf_sk_release_proto;
	default:
		return bpf_sk_base_func_proto(func_id, prog);
	}
}

/* Decide whether an sk_lookup program may access struct bpf_sk_lookup at
 * @off with width @size.
 *
 * Only reads that lie inside the structure and whose offset is a multiple
 * of @size are considered. sk must be read as 8 bytes; the remaining fields
 * allow narrow loads as checked by bpf_ctx_narrow_access_ok().
 *
 * Return: true when the access is allowed, false otherwise.
 */
static bool sk_lookup_is_valid_access(int off, int size,
				      enum bpf_access_type type,
				      const struct bpf_prog *prog,
				      struct bpf_insn_access_aux *info)
{
	if (off < 0 || off >= sizeof(struct bpf_sk_lookup))
		return false;
	if (off % size != 0)
		return false;
	if (type != BPF_READ)
		return false;

	switch (off) {
	case offsetof(struct bpf_sk_lookup, sk):
		info->reg_type = PTR_TO_SOCKET_OR_NULL;
		return size == sizeof(__u64);

	case bpf_ctx_range(struct bpf_sk_lookup, family):
	case bpf_ctx_range(struct bpf_sk_lookup, protocol):
	case bpf_ctx_range(struct bpf_sk_lookup, remote_ip4):
	case bpf_ctx_range(struct bpf_sk_lookup, local_ip4):
	case bpf_ctx_range_till(struct bpf_sk_lookup, remote_ip6[0],
				remote_ip6[3]):
	case bpf_ctx_range_till(struct bpf_sk_lookup, local_ip6[0], local_ip6[3]):
	case bpf_ctx_range(struct bpf_sk_lookup, local_port):
	case bpf_ctx_range(struct bpf_sk_lookup, ingress_ifindex):
		bpf_ctx_record_field_size(info, sizeof(__u32));
		return bpf_ctx_narrow_access_ok(off, size, sizeof(__u32));

	case bpf_ctx_range(struct bpf_sk_lookup, remote_port):
		/* Allow 4-byte access to 2-byte field for backward compatibility */
		if (size == sizeof(__u32))
			return true;
		bpf_ctx_record_field_size(info, sizeof(__be16));
		return bpf_ctx_narrow_access_ok(off, size, sizeof(__be16));

	case offsetofend(struct bpf_sk_lookup, remote_port) ...
	     offsetof(struct bpf_sk_lookup, local_ip4) - 1:
		/* Allow access to zero padding for backward compatibility */
		bpf_ctx_record_field_size(info, sizeof(__u16));
		return bpf_ctx_narrow_access_ok(off, size, sizeof(__u16));

	default:
		return false;
	}
}

/* Rewrite one context access of an sk_lookup program (struct bpf_sk_lookup)
 * into loads from struct bpf_sk_lookup_kern.
 *
 * Offsets that match no case emit nothing.
 *
 * Return: number of instructions written to @insn_buf.
 */
static u32 sk_lookup_convert_ctx_access(enum bpf_access_type type,
					const struct bpf_insn *si,
					struct bpf_insn *insn_buf,
					struct bpf_prog *prog,
					u32 *target_size)
{
	struct bpf_insn *insn = insn_buf;

	switch (si->off) {
	case offsetof(struct bpf_sk_lookup, sk):
		*insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg, si->src_reg,
				      offsetof(struct bpf_sk_lookup_kern, selected_sk));
		break;

	case offsetof(struct bpf_sk_lookup, family):
		*insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg,
				      bpf_target_off(struct bpf_sk_lookup_kern,
						     family, 2, target_size));
		break;

	case offsetof(struct bpf_sk_lookup, protocol):
		*insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg,
				      bpf_target_off(struct bpf_sk_lookup_kern,
						     protocol, 2, target_size));
		break;

	/* remote_* fields read the kernel-side source address/port and
	 * local_* fields the destination address/port.
	 */
	case offsetof(struct bpf_sk_lookup, remote_ip4):
		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
				      bpf_target_off(struct bpf_sk_lookup_kern,
						     v4.saddr, 4, target_size));
		break;

	case offsetof(struct bpf_sk_lookup, local_ip4):
		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
				      bpf_target_off(struct bpf_sk_lookup_kern,
						     v4.daddr, 4, target_size));
		break;

	case bpf_ctx_range_till(struct bpf_sk_lookup,
				remote_ip6[0], remote_ip6[3]): {
#if IS_ENABLED(CONFIG_IPV6)
		int off = si->off;

		off -= offsetof(struct bpf_sk_lookup, remote_ip6[0]);
		off += bpf_target_off(struct in6_addr, s6_addr32[0], 4, target_size);
		/* v6.saddr is loaded as a pointer; when it is NULL the word
		 * load is skipped and dst_reg stays 0.
		 */
		*insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg, si->src_reg,
				      offsetof(struct bpf_sk_lookup_kern, v6.saddr));
		*insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1);
		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, off);
#else
		/* Without IPv6 support the field reads as 0 */
		*insn++ = BPF_MOV32_IMM(si->dst_reg, 0);
#endif
		break;
	}
	case bpf_ctx_range_till(struct bpf_sk_lookup,
				local_ip6[0], local_ip6[3]): {
#if IS_ENABLED(CONFIG_IPV6)
		int off = si->off;

		off -= offsetof(struct bpf_sk_lookup, local_ip6[0]);
		off += bpf_target_off(struct in6_addr, s6_addr32[0], 4, target_size);
		/* v6.daddr is loaded as a pointer; when it is NULL the word
		 * load is skipped and dst_reg stays 0.
		 */
		*insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg, si->src_reg,
				      offsetof(struct bpf_sk_lookup_kern, v6.daddr));
		*insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1);
		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, off);
#else
		/* Without IPv6 support the field reads as 0 */
		*insn++ = BPF_MOV32_IMM(si->dst_reg, 0);
#endif
		break;
	}
	case offsetof(struct bpf_sk_lookup, remote_port):
		*insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg,
				      bpf_target_off(struct bpf_sk_lookup_kern,
						     sport, 2, target_size));
		break;

	case offsetofend(struct bpf_sk_lookup, remote_port):
		/* The two bytes right after remote_port always read as 0 */
		*target_size = 2;
		*insn++ = BPF_MOV32_IMM(si->dst_reg, 0);
		break;

	case offsetof(struct bpf_sk_lookup, local_port):
		*insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg,
				      bpf_target_off(struct bpf_sk_lookup_kern,
						     dport, 2, target_size));
		break;

	case offsetof(struct bpf_sk_lookup, ingress_ifindex):
		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
				      bpf_target_off(struct bpf_sk_lookup_kern,
						     ingress_ifindex, 4, target_size));
		break;
	}

	return insn - insn_buf;
}

const struct bpf_prog_ops sk_lookup_prog_ops = {
	.test_run = bpf_prog_test_run_sk_lookup,
};

const struct bpf_verifier_ops sk_lookup_verifier_ops = {
	.get_func_proto		= sk_lookup_func_proto,
	.is_valid_access	=
sk_lookup_is_valid_access,
	.convert_ctx_access	= sk_lookup_convert_ctx_access,
};

#endif /* CONFIG_INET */

DEFINE_BPF_DISPATCHER(xdp)

/* Replace @prev_prog with @prog in the xdp dispatcher defined above */
void bpf_prog_change_xdp(struct bpf_prog *prev_prog, struct bpf_prog *prog)
{
	bpf_dispatcher_change_prog(BPF_DISPATCHER_PTR(xdp), prev_prog, prog);
}

/* Global BTF id list with one BTF_ID(struct, type) entry per socket type
 * listed in BTF_SOCK_TYPE_xxx.
 */
BTF_ID_LIST_GLOBAL(btf_sock_ids, MAX_BTF_SOCK_TYPE)
#define BTF_SOCK_TYPE(name, type) BTF_ID(struct, type)
BTF_SOCK_TYPE_xxx
#undef BTF_SOCK_TYPE

/* Helper: return @sk when it is a full IPv6 TCP socket, NULL otherwise */
BPF_CALL_1(bpf_skc_to_tcp6_sock, struct sock *, sk)
{
	/* tcp6_sock type is not generated in dwarf and hence btf,
	 * trigger an explicit type generation here.
	 */
	BTF_TYPE_EMIT(struct tcp6_sock);
	if (sk && sk_fullsock(sk) && sk->sk_protocol == IPPROTO_TCP &&
	    sk->sk_family == AF_INET6)
		return (unsigned long)sk;

	return (unsigned long)NULL;
}

const struct bpf_func_proto bpf_skc_to_tcp6_sock_proto = {
	.func			= bpf_skc_to_tcp6_sock,
	.gpl_only		= false,
	.ret_type		= RET_PTR_TO_BTF_ID_OR_NULL,
	.arg1_type		= ARG_PTR_TO_BTF_ID_SOCK_COMMON,
	.ret_btf_id		= &btf_sock_ids[BTF_SOCK_TYPE_TCP6],
};

/* Helper: return @sk when it is a full TCP socket, NULL otherwise */
BPF_CALL_1(bpf_skc_to_tcp_sock, struct sock *, sk)
{
	if (sk && sk_fullsock(sk) && sk->sk_protocol == IPPROTO_TCP)
		return (unsigned long)sk;

	return (unsigned long)NULL;
}

const struct bpf_func_proto bpf_skc_to_tcp_sock_proto = {
	.func			= bpf_skc_to_tcp_sock,
	.gpl_only		= false,
	.ret_type		= RET_PTR_TO_BTF_ID_OR_NULL,
	.arg1_type		= ARG_PTR_TO_BTF_ID_SOCK_COMMON,
	.ret_btf_id		= &btf_sock_ids[BTF_SOCK_TYPE_TCP],
};

/* Helper: return @sk when it uses tcp_prot (or tcpv6_prot, only checked
 * when IPv6 is built in) and is in TCP_TIME_WAIT state, NULL otherwise.
 */
BPF_CALL_1(bpf_skc_to_tcp_timewait_sock, struct sock *, sk)
{
	/* BTF types for tcp_timewait_sock and inet_timewait_sock are not
	 * generated if CONFIG_INET=n. Trigger an explicit generation here.
	 */
	BTF_TYPE_EMIT(struct inet_timewait_sock);
	BTF_TYPE_EMIT(struct tcp_timewait_sock);

#ifdef CONFIG_INET
	if (sk && sk->sk_prot == &tcp_prot && sk->sk_state == TCP_TIME_WAIT)
		return (unsigned long)sk;
#endif

#if IS_BUILTIN(CONFIG_IPV6)
	if (sk && sk->sk_prot == &tcpv6_prot && sk->sk_state == TCP_TIME_WAIT)
		return (unsigned long)sk;
#endif

	return (unsigned long)NULL;
}

const struct bpf_func_proto bpf_skc_to_tcp_timewait_sock_proto = {
	.func			= bpf_skc_to_tcp_timewait_sock,
	.gpl_only		= false,
	.ret_type		= RET_PTR_TO_BTF_ID_OR_NULL,
	.arg1_type		= ARG_PTR_TO_BTF_ID_SOCK_COMMON,
	.ret_btf_id		= &btf_sock_ids[BTF_SOCK_TYPE_TCP_TW],
};

/* Helper: return @sk when it uses tcp_prot (or tcpv6_prot, only checked
 * when IPv6 is built in) and is in TCP_NEW_SYN_RECV state, NULL otherwise.
 */
BPF_CALL_1(bpf_skc_to_tcp_request_sock, struct sock *, sk)
{
#ifdef CONFIG_INET
	if (sk && sk->sk_prot == &tcp_prot && sk->sk_state == TCP_NEW_SYN_RECV)
		return (unsigned long)sk;
#endif

#if IS_BUILTIN(CONFIG_IPV6)
	if (sk && sk->sk_prot == &tcpv6_prot && sk->sk_state == TCP_NEW_SYN_RECV)
		return (unsigned long)sk;
#endif

	return (unsigned long)NULL;
}

const struct bpf_func_proto bpf_skc_to_tcp_request_sock_proto = {
	.func			= bpf_skc_to_tcp_request_sock,
	.gpl_only		= false,
	.ret_type		= RET_PTR_TO_BTF_ID_OR_NULL,
	.arg1_type		= ARG_PTR_TO_BTF_ID_SOCK_COMMON,
	.ret_btf_id		= &btf_sock_ids[BTF_SOCK_TYPE_TCP_REQ],
};

/* Helper: return @sk when it is a full IPv6 UDP datagram socket, NULL
 * otherwise.
 */
BPF_CALL_1(bpf_skc_to_udp6_sock, struct sock *, sk)
{
	/* udp6_sock type is not generated in dwarf and hence btf,
	 * trigger an explicit type generation here.
	 */
	BTF_TYPE_EMIT(struct udp6_sock);
	if (sk && sk_fullsock(sk) && sk->sk_protocol == IPPROTO_UDP &&
	    sk->sk_type == SOCK_DGRAM && sk->sk_family == AF_INET6)
		return (unsigned long)sk;

	return (unsigned long)NULL;
}

const struct bpf_func_proto bpf_skc_to_udp6_sock_proto = {
	.func			= bpf_skc_to_udp6_sock,
	.gpl_only		= false,
	.ret_type		= RET_PTR_TO_BTF_ID_OR_NULL,
	.arg1_type		= ARG_PTR_TO_BTF_ID_SOCK_COMMON,
	.ret_btf_id		= &btf_sock_ids[BTF_SOCK_TYPE_UDP6],
};

/* Helper: return @sk when it is a full AF_UNIX socket, NULL otherwise */
BPF_CALL_1(bpf_skc_to_unix_sock, struct sock *, sk)
{
	/* unix_sock type is not generated in dwarf and hence btf,
	 * trigger an explicit type generation here.
	 */
	BTF_TYPE_EMIT(struct unix_sock);
	if (sk && sk_fullsock(sk) && sk->sk_family == AF_UNIX)
		return (unsigned long)sk;

	return (unsigned long)NULL;
}

const struct bpf_func_proto bpf_skc_to_unix_sock_proto = {
	.func			= bpf_skc_to_unix_sock,
	.gpl_only		= false,
	.ret_type		= RET_PTR_TO_BTF_ID_OR_NULL,
	.arg1_type		= ARG_PTR_TO_BTF_ID_SOCK_COMMON,
	.ret_btf_id		= &btf_sock_ids[BTF_SOCK_TYPE_UNIX],
};

/* Helper: return whatever bpf_mptcp_sock_from_subflow() yields for @sk.
 * Unlike the helpers above, the argument type is ARG_PTR_TO_SOCK_COMMON.
 */
BPF_CALL_1(bpf_skc_to_mptcp_sock, struct sock *, sk)
{
	BTF_TYPE_EMIT(struct mptcp_sock);
	return (unsigned long)bpf_mptcp_sock_from_subflow(sk);
}

const struct bpf_func_proto bpf_skc_to_mptcp_sock_proto = {
	.func		= bpf_skc_to_mptcp_sock,
	.gpl_only	= false,
	.ret_type	= RET_PTR_TO_BTF_ID_OR_NULL,
	.arg1_type	= ARG_PTR_TO_SOCK_COMMON,
	.ret_btf_id	= &btf_sock_ids[BTF_SOCK_TYPE_MPTCP],
};

/* Helper: return the result of sock_from_file() for @file */
BPF_CALL_1(bpf_sock_from_file, struct file *, file)
{
	return (unsigned long)sock_from_file(file);
}

/* BTF ids used by the proto below: [0] struct socket (return value),
 * [1] struct file (argument).
 */
BTF_ID_LIST(bpf_sock_from_file_btf_ids)
BTF_ID(struct, socket)
BTF_ID(struct, file)

const struct bpf_func_proto bpf_sock_from_file_proto = {
	.func		= bpf_sock_from_file,
	.gpl_only	= false,
	.ret_type	= RET_PTR_TO_BTF_ID_OR_NULL,
	.ret_btf_id	= &bpf_sock_from_file_btf_ids[0],
	.arg1_type	= ARG_PTR_TO_BTF_ID,
	.arg1_btf_id	= &bpf_sock_from_file_btf_ids[1],
};

static const struct bpf_func_proto *
bpf_sk_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
	const struct bpf_func_proto *func;

	switch
(func_id) { case BPF_FUNC_skc_to_tcp6_sock: func = &bpf_skc_to_tcp6_sock_proto; break; case BPF_FUNC_skc_to_tcp_sock: func = &bpf_skc_to_tcp_sock_proto; break; case BPF_FUNC_skc_to_tcp_timewait_sock: func = &bpf_skc_to_tcp_timewait_sock_proto; break; case BPF_FUNC_skc_to_tcp_request_sock: func = &bpf_skc_to_tcp_request_sock_proto; break; case BPF_FUNC_skc_to_udp6_sock: func = &bpf_skc_to_udp6_sock_proto; break; case BPF_FUNC_skc_to_unix_sock: func = &bpf_skc_to_unix_sock_proto; break; case BPF_FUNC_skc_to_mptcp_sock: func = &bpf_skc_to_mptcp_sock_proto; break; case BPF_FUNC_ktime_get_coarse_ns: return &bpf_ktime_get_coarse_ns_proto; default: return bpf_base_func_proto(func_id, prog); } if (!bpf_token_capable(prog->aux->token, CAP_PERFMON)) return NULL; return func; } __bpf_kfunc_start_defs(); __bpf_kfunc int bpf_dynptr_from_skb(struct __sk_buff *s, u64 flags, struct bpf_dynptr *ptr__uninit) { struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)ptr__uninit; struct sk_buff *skb = (struct sk_buff *)s; if (flags) { bpf_dynptr_set_null(ptr); return -EINVAL; } bpf_dynptr_init(ptr, skb, BPF_DYNPTR_TYPE_SKB, 0, skb->len); return 0; } __bpf_kfunc int bpf_dynptr_from_xdp(struct xdp_md *x, u64 flags, struct bpf_dynptr *ptr__uninit) { struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)ptr__uninit; struct xdp_buff *xdp = (struct xdp_buff *)x; if (flags) { bpf_dynptr_set_null(ptr); return -EINVAL; } bpf_dynptr_init(ptr, xdp, BPF_DYNPTR_TYPE_XDP, 0, xdp_get_buff_len(xdp)); return 0; } __bpf_kfunc int bpf_sock_addr_set_sun_path(struct bpf_sock_addr_kern *sa_kern, const u8 *sun_path, u32 sun_path__sz) { struct sockaddr_un *un; if (sa_kern->sk->sk_family != AF_UNIX) return -EINVAL; /* We do not allow changing the address to unnamed or larger than the * maximum allowed address size for a unix sockaddr. 
*/ if (sun_path__sz == 0 || sun_path__sz > UNIX_PATH_MAX) return -EINVAL; un = (struct sockaddr_un *)sa_kern->uaddr; memcpy(un->sun_path, sun_path, sun_path__sz); sa_kern->uaddrlen = offsetof(struct sockaddr_un, sun_path) + sun_path__sz; return 0; } __bpf_kfunc int bpf_sk_assign_tcp_reqsk(struct __sk_buff *s, struct sock *sk, struct bpf_tcp_req_attrs *attrs, int attrs__sz) { #if IS_ENABLED(CONFIG_SYN_COOKIES) struct sk_buff *skb = (struct sk_buff *)s; const struct request_sock_ops *ops; struct inet_request_sock *ireq; struct tcp_request_sock *treq; struct request_sock *req; struct net *net; __u16 min_mss; u32 tsoff = 0; if (attrs__sz != sizeof(*attrs) || attrs->reserved[0] || attrs->reserved[1] || attrs->reserved[2]) return -EINVAL; if (!skb_at_tc_ingress(skb)) return -EINVAL; net = dev_net(skb->dev); if (net != sock_net(sk)) return -ENETUNREACH; switch (skb->protocol) { case htons(ETH_P_IP): ops = &tcp_request_sock_ops; min_mss = 536; break; #if IS_BUILTIN(CONFIG_IPV6) case htons(ETH_P_IPV6): ops = &tcp6_request_sock_ops; min_mss = IPV6_MIN_MTU - 60; break; #endif default: return -EINVAL; } if (sk->sk_type != SOCK_STREAM || sk->sk_state != TCP_LISTEN || sk_is_mptcp(sk)) return -EINVAL; if (attrs->mss < min_mss) return -EINVAL; if (attrs->wscale_ok) { if (!READ_ONCE(net->ipv4.sysctl_tcp_window_scaling)) return -EINVAL; if (attrs->snd_wscale > TCP_MAX_WSCALE || attrs->rcv_wscale > TCP_MAX_WSCALE) return -EINVAL; } if (attrs->sack_ok && !READ_ONCE(net->ipv4.sysctl_tcp_sack)) return -EINVAL; if (attrs->tstamp_ok) { if (!READ_ONCE(net->ipv4.sysctl_tcp_timestamps)) return -EINVAL; tsoff = attrs->rcv_tsecr - tcp_ns_to_ts(attrs->usec_ts_ok, tcp_clock_ns()); } req = inet_reqsk_alloc(ops, sk, false); if (!req) return -ENOMEM; ireq = inet_rsk(req); treq = tcp_rsk(req); req->rsk_listener = sk; req->syncookie = 1; req->mss = attrs->mss; req->ts_recent = attrs->rcv_tsval; ireq->snd_wscale = attrs->snd_wscale; ireq->rcv_wscale = attrs->rcv_wscale; ireq->tstamp_ok = 
!!attrs->tstamp_ok; ireq->sack_ok = !!attrs->sack_ok; ireq->wscale_ok = !!attrs->wscale_ok; ireq->ecn_ok = !!attrs->ecn_ok; treq->req_usec_ts = !!attrs->usec_ts_ok; treq->ts_off = tsoff; skb_orphan(skb); skb->sk = req_to_sk(req); skb->destructor = sock_pfree; return 0; #else return -EOPNOTSUPP; #endif } __bpf_kfunc_end_defs(); int bpf_dynptr_from_skb_rdonly(struct __sk_buff *skb, u64 flags, struct bpf_dynptr *ptr__uninit) { struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)ptr__uninit; int err; err = bpf_dynptr_from_skb(skb, flags, ptr__uninit); if (err) return err; bpf_dynptr_set_rdonly(ptr); return 0; } BTF_KFUNCS_START(bpf_kfunc_check_set_skb) BTF_ID_FLAGS(func, bpf_dynptr_from_skb, KF_TRUSTED_ARGS) BTF_KFUNCS_END(bpf_kfunc_check_set_skb) BTF_KFUNCS_START(bpf_kfunc_check_set_xdp) BTF_ID_FLAGS(func, bpf_dynptr_from_xdp) BTF_KFUNCS_END(bpf_kfunc_check_set_xdp) BTF_KFUNCS_START(bpf_kfunc_check_set_sock_addr) BTF_ID_FLAGS(func, bpf_sock_addr_set_sun_path) BTF_KFUNCS_END(bpf_kfunc_check_set_sock_addr) BTF_KFUNCS_START(bpf_kfunc_check_set_tcp_reqsk) BTF_ID_FLAGS(func, bpf_sk_assign_tcp_reqsk, KF_TRUSTED_ARGS) BTF_KFUNCS_END(bpf_kfunc_check_set_tcp_reqsk) static const struct btf_kfunc_id_set bpf_kfunc_set_skb = { .owner = THIS_MODULE, .set = &bpf_kfunc_check_set_skb, }; static const struct btf_kfunc_id_set bpf_kfunc_set_xdp = { .owner = THIS_MODULE, .set = &bpf_kfunc_check_set_xdp, }; static const struct btf_kfunc_id_set bpf_kfunc_set_sock_addr = { .owner = THIS_MODULE, .set = &bpf_kfunc_check_set_sock_addr, }; static const struct btf_kfunc_id_set bpf_kfunc_set_tcp_reqsk = { .owner = THIS_MODULE, .set = &bpf_kfunc_check_set_tcp_reqsk, }; static int __init bpf_kfunc_init(void) { int ret; ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &bpf_kfunc_set_skb); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_ACT, &bpf_kfunc_set_skb); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SK_SKB, &bpf_kfunc_set_skb); ret = ret ?: 
register_btf_kfunc_id_set(BPF_PROG_TYPE_SOCKET_FILTER, &bpf_kfunc_set_skb); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_CGROUP_SKB, &bpf_kfunc_set_skb); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_LWT_OUT, &bpf_kfunc_set_skb); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_LWT_IN, &bpf_kfunc_set_skb); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_LWT_XMIT, &bpf_kfunc_set_skb); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_LWT_SEG6LOCAL, &bpf_kfunc_set_skb); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_NETFILTER, &bpf_kfunc_set_skb); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &bpf_kfunc_set_skb); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_XDP, &bpf_kfunc_set_xdp); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_CGROUP_SOCK_ADDR, &bpf_kfunc_set_sock_addr); return ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &bpf_kfunc_set_tcp_reqsk); } late_initcall(bpf_kfunc_init); __bpf_kfunc_start_defs(); /* bpf_sock_destroy: Destroy the given socket with ECONNABORTED error code. * * The function expects a non-NULL pointer to a socket, and invokes the * protocol specific socket destroy handlers. * * The helper can only be called from BPF contexts that have acquired the socket * locks. * * Parameters: * @sock: Pointer to socket to be destroyed * * Return: * On error, may return EPROTONOSUPPORT, EINVAL. * EPROTONOSUPPORT if protocol specific destroy handler is not supported. * 0 otherwise */ __bpf_kfunc int bpf_sock_destroy(struct sock_common *sock) { struct sock *sk = (struct sock *)sock; /* The locking semantics that allow for synchronous execution of the * destroy handlers are only supported for TCP and UDP. * Supporting protocols will need to acquire sock lock in the BPF context * prior to invoking this kfunc. 
*/ if (!sk->sk_prot->diag_destroy || (sk->sk_protocol != IPPROTO_TCP && sk->sk_protocol != IPPROTO_UDP)) return -EOPNOTSUPP; return sk->sk_prot->diag_destroy(sk, ECONNABORTED); } __bpf_kfunc_end_defs(); BTF_KFUNCS_START(bpf_sk_iter_kfunc_ids) BTF_ID_FLAGS(func, bpf_sock_destroy, KF_TRUSTED_ARGS) BTF_KFUNCS_END(bpf_sk_iter_kfunc_ids) static int tracing_iter_filter(const struct bpf_prog *prog, u32 kfunc_id) { if (btf_id_set8_contains(&bpf_sk_iter_kfunc_ids, kfunc_id) && prog->expected_attach_type != BPF_TRACE_ITER) return -EACCES; return 0; } static const struct btf_kfunc_id_set bpf_sk_iter_kfunc_set = { .owner = THIS_MODULE, .set = &bpf_sk_iter_kfunc_ids, .filter = tracing_iter_filter, }; static int init_subsystem(void) { return register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &bpf_sk_iter_kfunc_set); } late_initcall(init_subsystem);
5 5 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 // SPDX-License-Identifier: GPL-2.0 /* * Copyright 2021 Google LLC * * sysfs support for blk-crypto. This file contains the code which exports the * crypto capabilities of devices via /sys/block/$disk/queue/crypto/. */ #include <linux/blk-crypto-profile.h> #include "blk-crypto-internal.h" struct blk_crypto_kobj { struct kobject kobj; struct blk_crypto_profile *profile; }; struct blk_crypto_attr { struct attribute attr; ssize_t (*show)(struct blk_crypto_profile *profile, struct blk_crypto_attr *attr, char *page); }; static struct blk_crypto_profile *kobj_to_crypto_profile(struct kobject *kobj) { return container_of(kobj, struct blk_crypto_kobj, kobj)->profile; } static struct blk_crypto_attr *attr_to_crypto_attr(struct attribute *attr) { return container_of(attr, struct blk_crypto_attr, attr); } static ssize_t max_dun_bits_show(struct blk_crypto_profile *profile, struct blk_crypto_attr *attr, char *page) { return sysfs_emit(page, "%u\n", 8 * profile->max_dun_bytes_supported); } static ssize_t num_keyslots_show(struct blk_crypto_profile *profile, struct blk_crypto_attr *attr, char *page) { return sysfs_emit(page, "%u\n", profile->num_slots); } #define BLK_CRYPTO_RO_ATTR(_name) \ static struct blk_crypto_attr _name##_attr = __ATTR_RO(_name) BLK_CRYPTO_RO_ATTR(max_dun_bits); BLK_CRYPTO_RO_ATTR(num_keyslots); static struct attribute *blk_crypto_attrs[] = { &max_dun_bits_attr.attr, 
&num_keyslots_attr.attr, NULL, }; static const struct attribute_group blk_crypto_attr_group = { .attrs = blk_crypto_attrs, }; /* * The encryption mode attributes. To avoid hard-coding the list of encryption * modes, these are initialized at boot time by blk_crypto_sysfs_init(). */ static struct blk_crypto_attr __blk_crypto_mode_attrs[BLK_ENCRYPTION_MODE_MAX]; static struct attribute *blk_crypto_mode_attrs[BLK_ENCRYPTION_MODE_MAX + 1]; static umode_t blk_crypto_mode_is_visible(struct kobject *kobj, struct attribute *attr, int n) { struct blk_crypto_profile *profile = kobj_to_crypto_profile(kobj); struct blk_crypto_attr *a = attr_to_crypto_attr(attr); int mode_num = a - __blk_crypto_mode_attrs; if (profile->modes_supported[mode_num]) return 0444; return 0; } static ssize_t blk_crypto_mode_show(struct blk_crypto_profile *profile, struct blk_crypto_attr *attr, char *page) { int mode_num = attr - __blk_crypto_mode_attrs; return sysfs_emit(page, "0x%x\n", profile->modes_supported[mode_num]); } static const struct attribute_group blk_crypto_modes_attr_group = { .name = "modes", .attrs = blk_crypto_mode_attrs, .is_visible = blk_crypto_mode_is_visible, }; static const struct attribute_group *blk_crypto_attr_groups[] = { &blk_crypto_attr_group, &blk_crypto_modes_attr_group, NULL, }; static ssize_t blk_crypto_attr_show(struct kobject *kobj, struct attribute *attr, char *page) { struct blk_crypto_profile *profile = kobj_to_crypto_profile(kobj); struct blk_crypto_attr *a = attr_to_crypto_attr(attr); return a->show(profile, a, page); } static const struct sysfs_ops blk_crypto_attr_ops = { .show = blk_crypto_attr_show, }; static void blk_crypto_release(struct kobject *kobj) { kfree(container_of(kobj, struct blk_crypto_kobj, kobj)); } static const struct kobj_type blk_crypto_ktype = { .default_groups = blk_crypto_attr_groups, .sysfs_ops = &blk_crypto_attr_ops, .release = blk_crypto_release, }; /* * If the request_queue has a blk_crypto_profile, create the "crypto" * subdirectory 
/*
 * Fill in the per-mode sysfs attributes at boot.
 *
 * Mode 0 is BLK_ENCRYPTION_MODE_INVALID (enforced at build time), so
 * modes 1..BLK_ENCRYPTION_MODE_MAX-1 are packed into
 * blk_crypto_mode_attrs[] starting at index 0.
 */
static int __init blk_crypto_sysfs_init(void)
{
	struct attribute **slot = blk_crypto_mode_attrs;
	int mode;

	BUILD_BUG_ON(BLK_ENCRYPTION_MODE_INVALID != 0);

	for (mode = 1; mode < BLK_ENCRYPTION_MODE_MAX; mode++) {
		struct blk_crypto_attr *mode_attr = &__blk_crypto_mode_attrs[mode];

		mode_attr->attr.name = blk_crypto_modes[mode].name;
		mode_attr->attr.mode = 0444;
		mode_attr->show = blk_crypto_mode_show;

		*slot++ = &mode_attr->attr;
	}

	return 0;
}
1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 
527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 // SPDX-License-Identifier: GPL-2.0-or-later /* * * Copyright Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk) * Copyright Alan Cox GW4PTS (alan@lxorguk.ukuu.org.uk) * 
Copyright Tomi Manninen OH2BNS (oh2bns@sral.fi) */ #include <linux/errno.h> #include <linux/types.h> #include <linux/socket.h> #include <linux/in.h> #include <linux/kernel.h> #include <linux/timer.h> #include <linux/string.h> #include <linux/sockios.h> #include <linux/net.h> #include <linux/slab.h> #include <net/ax25.h> #include <linux/inet.h> #include <linux/netdevice.h> #include <net/arp.h> #include <linux/if_arp.h> #include <linux/skbuff.h> #include <net/sock.h> #include <linux/uaccess.h> #include <linux/fcntl.h> #include <linux/termios.h> /* For TIOCINQ/OUTQ */ #include <linux/mm.h> #include <linux/interrupt.h> #include <linux/notifier.h> #include <linux/init.h> #include <linux/spinlock.h> #include <net/netrom.h> #include <linux/seq_file.h> #include <linux/export.h> static unsigned int nr_neigh_no = 1; static HLIST_HEAD(nr_node_list); static DEFINE_SPINLOCK(nr_node_list_lock); static HLIST_HEAD(nr_neigh_list); static DEFINE_SPINLOCK(nr_neigh_list_lock); static struct nr_node *nr_node_get(ax25_address *callsign) { struct nr_node *found = NULL; struct nr_node *nr_node; spin_lock_bh(&nr_node_list_lock); nr_node_for_each(nr_node, &nr_node_list) if (ax25cmp(callsign, &nr_node->callsign) == 0) { nr_node_hold(nr_node); found = nr_node; break; } spin_unlock_bh(&nr_node_list_lock); return found; } static struct nr_neigh *nr_neigh_get_dev(ax25_address *callsign, struct net_device *dev) { struct nr_neigh *found = NULL; struct nr_neigh *nr_neigh; spin_lock_bh(&nr_neigh_list_lock); nr_neigh_for_each(nr_neigh, &nr_neigh_list) if (ax25cmp(callsign, &nr_neigh->callsign) == 0 && nr_neigh->dev == dev) { nr_neigh_hold(nr_neigh); found = nr_neigh; break; } spin_unlock_bh(&nr_neigh_list_lock); return found; } static void nr_remove_neigh(struct nr_neigh *); /* re-sort the routes in quality order. 
*/ static void re_sort_routes(struct nr_node *nr_node, int x, int y) { if (nr_node->routes[y].quality > nr_node->routes[x].quality) { if (nr_node->which == x) nr_node->which = y; else if (nr_node->which == y) nr_node->which = x; swap(nr_node->routes[x], nr_node->routes[y]); } } /* * Add a new route to a node, and in the process add the node and the * neighbour if it is new. */ static int __must_check nr_add_node(ax25_address *nr, const char *mnemonic, ax25_address *ax25, ax25_digi *ax25_digi, struct net_device *dev, int quality, int obs_count) { struct nr_node *nr_node; struct nr_neigh *nr_neigh; int i, found; struct net_device *odev; if ((odev=nr_dev_get(nr)) != NULL) { /* Can't add routes to ourself */ dev_put(odev); return -EINVAL; } nr_node = nr_node_get(nr); nr_neigh = nr_neigh_get_dev(ax25, dev); /* * The L2 link to a neighbour has failed in the past * and now a frame comes from this neighbour. We assume * it was a temporary trouble with the link and reset the * routes now (and not wait for a node broadcast). 
*/ if (nr_neigh != NULL && nr_neigh->failed != 0 && quality == 0) { struct nr_node *nr_nodet; spin_lock_bh(&nr_node_list_lock); nr_node_for_each(nr_nodet, &nr_node_list) { nr_node_lock(nr_nodet); for (i = 0; i < nr_nodet->count; i++) if (nr_nodet->routes[i].neighbour == nr_neigh) if (i < nr_nodet->which) nr_nodet->which = i; nr_node_unlock(nr_nodet); } spin_unlock_bh(&nr_node_list_lock); } if (nr_neigh != NULL) nr_neigh->failed = 0; if (quality == 0 && nr_neigh != NULL && nr_node != NULL) { nr_neigh_put(nr_neigh); nr_node_put(nr_node); return 0; } if (nr_neigh == NULL) { if ((nr_neigh = kmalloc(sizeof(*nr_neigh), GFP_ATOMIC)) == NULL) { if (nr_node) nr_node_put(nr_node); return -ENOMEM; } nr_neigh->callsign = *ax25; nr_neigh->digipeat = NULL; nr_neigh->ax25 = NULL; nr_neigh->dev = dev; nr_neigh->quality = READ_ONCE(sysctl_netrom_default_path_quality); nr_neigh->locked = 0; nr_neigh->count = 0; nr_neigh->number = nr_neigh_no++; nr_neigh->failed = 0; refcount_set(&nr_neigh->refcount, 1); if (ax25_digi != NULL && ax25_digi->ndigi > 0) { nr_neigh->digipeat = kmemdup(ax25_digi, sizeof(*ax25_digi), GFP_KERNEL); if (nr_neigh->digipeat == NULL) { kfree(nr_neigh); if (nr_node) nr_node_put(nr_node); return -ENOMEM; } } spin_lock_bh(&nr_neigh_list_lock); hlist_add_head(&nr_neigh->neigh_node, &nr_neigh_list); nr_neigh_hold(nr_neigh); spin_unlock_bh(&nr_neigh_list_lock); } if (quality != 0 && ax25cmp(nr, ax25) == 0 && !nr_neigh->locked) nr_neigh->quality = quality; if (nr_node == NULL) { if ((nr_node = kmalloc(sizeof(*nr_node), GFP_ATOMIC)) == NULL) { if (nr_neigh) nr_neigh_put(nr_neigh); return -ENOMEM; } nr_node->callsign = *nr; strscpy(nr_node->mnemonic, mnemonic); nr_node->which = 0; nr_node->count = 1; refcount_set(&nr_node->refcount, 1); spin_lock_init(&nr_node->node_lock); nr_node->routes[0].quality = quality; nr_node->routes[0].obs_count = obs_count; nr_node->routes[0].neighbour = nr_neigh; nr_neigh_hold(nr_neigh); nr_neigh->count++; spin_lock_bh(&nr_node_list_lock); 
/*
 * Unlink @nr_node from the node list and drop one reference on it.
 *
 * The caller must already hold nr_node_list_lock; this is asserted via
 * lockdep. The node is unlinked before the reference is dropped.
 */
static void nr_remove_node_locked(struct nr_node *nr_node)
{
	lockdep_assert_held(&nr_node_list_lock);

	hlist_del_init(&nr_node->node_node);
	nr_node_put(nr_node);
}
nr_neigh_put(nr_neigh); } #define nr_remove_neigh_locked(__neigh) \ __nr_remove_neigh(__neigh) static void nr_remove_neigh(struct nr_neigh *nr_neigh) { spin_lock_bh(&nr_neigh_list_lock); __nr_remove_neigh(nr_neigh); spin_unlock_bh(&nr_neigh_list_lock); } /* * "Delete" a node. Strictly speaking remove a route to a node. The node * is only deleted if no routes are left to it. */ static int nr_del_node(ax25_address *callsign, ax25_address *neighbour, struct net_device *dev) { struct nr_node *nr_node; struct nr_neigh *nr_neigh; int i; nr_node = nr_node_get(callsign); if (nr_node == NULL) return -EINVAL; nr_neigh = nr_neigh_get_dev(neighbour, dev); if (nr_neigh == NULL) { nr_node_put(nr_node); return -EINVAL; } spin_lock_bh(&nr_node_list_lock); nr_node_lock(nr_node); for (i = 0; i < nr_node->count; i++) { if (nr_node->routes[i].neighbour == nr_neigh) { nr_neigh->count--; nr_neigh_put(nr_neigh); if (nr_neigh->count == 0 && !nr_neigh->locked) nr_remove_neigh(nr_neigh); nr_neigh_put(nr_neigh); nr_node->count--; if (nr_node->count == 0) { nr_remove_node_locked(nr_node); } else { switch (i) { case 0: nr_node->routes[0] = nr_node->routes[1]; fallthrough; case 1: nr_node->routes[1] = nr_node->routes[2]; fallthrough; case 2: break; } nr_node_put(nr_node); } nr_node_unlock(nr_node); spin_unlock_bh(&nr_node_list_lock); return 0; } } nr_neigh_put(nr_neigh); nr_node_unlock(nr_node); spin_unlock_bh(&nr_node_list_lock); nr_node_put(nr_node); return -EINVAL; } /* * Lock a neighbour with a quality. 
/*
 * Unlock a neighbour and record @quality on it. The neighbour is taken
 * off the neighbour list only when no route uses it (count == 0).
 *
 * Returns 0 on success, -EINVAL when no neighbour matches @callsign/@dev.
 */
static int nr_del_neigh(ax25_address *callsign, struct net_device *dev, unsigned int quality)
{
	struct nr_neigh *neigh = nr_neigh_get_dev(callsign, dev);

	if (!neigh)
		return -EINVAL;

	neigh->quality = quality;
	neigh->locked = 0;

	if (!neigh->count)
		nr_remove_neigh(neigh);

	/* Drop the reference taken by the lookup. */
	nr_neigh_put(neigh);

	return 0;
}
static int nr_dec_obs(void)
{
	struct nr_neigh *nr_neigh;
	struct nr_node  *s;
	struct hlist_node *nodet;
	int i;

	/*
	 * Walk every node with the list lock held; each node is additionally
	 * locked while its routes are examined. The _safe iterator is used
	 * because nodes may be unlinked during the walk.
	 */
	spin_lock_bh(&nr_node_list_lock);
	nr_node_for_each_safe(s, nodet, &nr_node_list) {
		nr_node_lock(s);
		for (i = 0; i < s->count; i++) {
			switch (s->routes[i].obs_count) {
			case 0:		/* A locked entry */
				break;

			case 1:		/* From 1 -> 0 */
				nr_neigh = s->routes[i].neighbour;

				/* The route goes away: release its hold on the neighbour. */
				nr_neigh->count--;
				nr_neigh_put(nr_neigh);

				/*
				 * NOTE(review): nr_neigh is read after the put above;
				 * presumably the neighbour list still holds a
				 * reference at this point - confirm.
				 */
				if (nr_neigh->count == 0 && !nr_neigh->locked)
					nr_remove_neigh(nr_neigh);

				s->count--;

				/* Shift the remaining routes down over slot i. */
				switch (i) {
				case 0:
					s->routes[0] = s->routes[1];
					fallthrough;
				case 1:
					s->routes[1] = s->routes[2];
					break;
				case 2:
					break;
				}
				/*
				 * NOTE(review): the loop still advances i after the
				 * shift, so the route moved into slot i is not
				 * examined in this pass - confirm this is intended.
				 */
				break;

			default:
				s->routes[i].obs_count--;
				break;

			}
		}

		/*
		 * NOTE(review): nr_remove_node_locked() drops a node reference
		 * before nr_node_unlock(s) runs - confirm another reference
		 * keeps the node alive across the unlock.
		 */
		if (s->count <= 0)
			nr_remove_node_locked(s);
		nr_node_unlock(s);
	}
	spin_unlock_bh(&nr_node_list_lock);

	/* Always reports success. */
	return 0;
}
*/ static struct net_device *nr_ax25_dev_get(char *devname) { struct net_device *dev; if ((dev = dev_get_by_name(&init_net, devname)) == NULL) return NULL; if ((dev->flags & IFF_UP) && dev->type == ARPHRD_AX25) return dev; dev_put(dev); return NULL; } /* * Find the first active NET/ROM device, usually "nr0". */ struct net_device *nr_dev_first(void) { struct net_device *dev, *first = NULL; rcu_read_lock(); for_each_netdev_rcu(&init_net, dev) { if ((dev->flags & IFF_UP) && dev->type == ARPHRD_NETROM) if (first == NULL || strncmp(dev->name, first->name, 3) < 0) first = dev; } dev_hold(first); rcu_read_unlock(); return first; } /* * Find the NET/ROM device for the given callsign. */ struct net_device *nr_dev_get(ax25_address *addr) { struct net_device *dev; rcu_read_lock(); for_each_netdev_rcu(&init_net, dev) { if ((dev->flags & IFF_UP) && dev->type == ARPHRD_NETROM && ax25cmp(addr, (const ax25_address *)dev->dev_addr) == 0) { dev_hold(dev); goto out; } } dev = NULL; out: rcu_read_unlock(); return dev; } static ax25_digi *nr_call_to_digi(ax25_digi *digi, int ndigis, ax25_address *digipeaters) { int i; if (ndigis == 0) return NULL; for (i = 0; i < ndigis; i++) { digi->calls[i] = digipeaters[i]; digi->repeated[i] = 0; } digi->ndigi = ndigis; digi->lastrepeat = -1; return digi; } /* * Handle the ioctls that control the routing functions. 
*/
int nr_rt_ioctl(unsigned int cmd, void __user *arg)
{
	struct nr_route_struct req;
	struct net_device *dev;
	ax25_digi digi;
	int ret;

	switch (cmd) {
	case SIOCADDRT:
		if (copy_from_user(&req, arg, sizeof(req)))
			return -EFAULT;

		/* Bound the digipeater count before it is used as a loop limit. */
		if (req.ndigis > AX25_MAX_DIGIS)
			return -EINVAL;

		dev = nr_ax25_dev_get(req.device);
		if (!dev)
			return -EINVAL;

		if (req.type == NETROM_NODE) {
			/* The mnemonic must contain a NUL within its 7 bytes. */
			if (strnlen(req.mnemonic, 7) == 7)
				ret = -EINVAL;
			else
				ret = nr_add_node(&req.callsign,
						  req.mnemonic,
						  &req.neighbour,
						  nr_call_to_digi(&digi, req.ndigis,
								  req.digipeaters),
						  dev, req.quality,
						  req.obs_count);
		} else if (req.type == NETROM_NEIGH) {
			ret = nr_add_neigh(&req.callsign,
					   nr_call_to_digi(&digi, req.ndigis,
							   req.digipeaters),
					   dev, req.quality);
		} else {
			ret = -EINVAL;
		}

		dev_put(dev);
		return ret;

	case SIOCDELRT:
		if (copy_from_user(&req, arg, sizeof(req)))
			return -EFAULT;

		dev = nr_ax25_dev_get(req.device);
		if (!dev)
			return -EINVAL;

		if (req.type == NETROM_NODE)
			ret = nr_del_node(&req.callsign, &req.neighbour, dev);
		else if (req.type == NETROM_NEIGH)
			ret = nr_del_neigh(&req.callsign, dev, req.quality);
		else
			ret = -EINVAL;

		dev_put(dev);
		return ret;

	case SIOCNRDECOBS:
		return nr_dec_obs();

	default:
		return -EINVAL;
	}

	return 0;
}

/*
 *	A level 2 link has timed out, therefore it appears to be a poor link,
 *	then don't use that neighbour until it is reset.
*/
void nr_link_failed(ax25_cb *ax25, int reason)
{
	struct nr_neigh *s, *nr_neigh = NULL;
	struct nr_node *nr_node = NULL;

	/* Find the neighbour bound to this AX.25 control block and pin it. */
	spin_lock_bh(&nr_neigh_list_lock);
	nr_neigh_for_each(s, &nr_neigh_list) {
		if (s->ax25 == ax25) {
			nr_neigh_hold(s);
			nr_neigh = s;
			break;
		}
	}
	spin_unlock_bh(&nr_neigh_list_lock);

	if (nr_neigh == NULL)
		return;

	/* Detach the link from the neighbour and release the ax25_cb. */
	nr_neigh->ax25 = NULL;
	ax25_cb_put(ax25);

	/* Below the failure threshold nothing else changes. */
	if (++nr_neigh->failed < READ_ONCE(sysctl_netrom_link_fails_count)) {
		nr_neigh_put(nr_neigh);
		return;
	}

	/* Advance every node whose current route uses this neighbour. */
	spin_lock_bh(&nr_node_list_lock);
	nr_node_for_each(nr_node, &nr_node_list) {
		nr_node_lock(nr_node);
		if (nr_node->which < nr_node->count &&
		    nr_node->routes[nr_node->which].neighbour == nr_neigh)
			nr_node->which++;
		nr_node_unlock(nr_node);
	}
	spin_unlock_bh(&nr_node_list_lock);
	nr_neigh_put(nr_neigh);
}

/*
 *	Route a frame to an appropriate AX.25 connection. A NULL ax25_cb
 *	indicates an internally generated frame.
 */
int nr_route_frame(struct sk_buff *skb, ax25_cb *ax25)
{
	ax25_address *nr_src, *nr_dest;
	struct nr_neigh *nr_neigh;
	struct nr_node *nr_node;
	struct net_device *dev;
	unsigned char *dptr;
	ax25_cb *ax25s;
	int ret;
	struct sk_buff *skbn;

	/*
	 * Reject malformed packets early. Check that it contains at least 2
	 * addresses and 1 byte more for Time-To-Live
	 */
	if (skb->len < 2 * sizeof(ax25_address) + 1)
		return 0;

	/* Source address at offset 0, destination at offset 7. */
	nr_src = (ax25_address *)(skb->data + 0);
	nr_dest = (ax25_address *)(skb->data + 7);

	/* For frames received over AX.25, record the sender as a node first. */
	if (ax25 != NULL) {
		ret = nr_add_node(nr_src, "", &ax25->dest_addr, ax25->digipeat,
				  ax25->ax25_dev->dev, 0,
				  READ_ONCE(sysctl_netrom_obsolescence_count_initialiser));
		if (ret)
			return ret;
	}

	if ((dev = nr_dev_get(nr_dest)) != NULL) {	/* Its for me */
		if (ax25 == NULL)			/* Its from me */
			ret = nr_loopback_queue(skb);
		else
			ret = nr_rx_frame(skb, dev);
		dev_put(dev);
		return ret;
	}

	/* Received frames are only forwarded when routing control is enabled. */
	if (!READ_ONCE(sysctl_netrom_routing_control) && ax25 != NULL)
		return 0;

	/* Its Time-To-Live has expired */
	if (skb->data[14] == 1) {
		return 0;
	}

	nr_node = nr_node_get(nr_dest);
	if (nr_node == NULL)
		return 0;
	nr_node_lock(nr_node);

	/* No usable route left for this node. */
	if (nr_node->which >= nr_node->count) {
		nr_node_unlock(nr_node);
		nr_node_put(nr_node);
		return 0;
	}

	nr_neigh = nr_node->routes[nr_node->which].neighbour;

	if ((dev = nr_dev_first()) == NULL) {
		nr_node_unlock(nr_node);
		nr_node_put(nr_node);
		return 0;
	}

	/* We are going to change the netrom headers so we should get our
	   own skb, we also did not know until now how much header space
	   we had to reserve... - RXQ */
	if ((skbn=skb_copy_expand(skb, dev->hard_header_len, 0, GFP_ATOMIC)) == NULL) {
		nr_node_unlock(nr_node);
		nr_node_put(nr_node);
		dev_put(dev);
		return 0;
	}
	/* From here on the copy replaces the original skb. */
	kfree_skb(skb);
	skb=skbn;
	skb->data[14]--;

	dptr = skb_push(skb, 1);
	*dptr = AX25_P_NETROM;

	/* Swap in the control block returned by the send; release the old one. */
	ax25s = nr_neigh->ax25;
	nr_neigh->ax25 = ax25_send_frame(skb, 256,
					 (const ax25_address *)dev->dev_addr,
					 &nr_neigh->callsign,
					 nr_neigh->digipeat, nr_neigh->dev);
	if (ax25s)
		ax25_cb_put(ax25s);

	dev_put(dev);
	/* Returns 1 when ax25_send_frame() gave back a control block, else 0. */
	ret = (nr_neigh->ax25 != NULL);
	nr_node_unlock(nr_node);
	nr_node_put(nr_node);

	return ret;
}

#ifdef CONFIG_PROC_FS

/* seq_file start: takes nr_node_list_lock, released in nr_node_stop(). */
static void *nr_node_start(struct seq_file *seq, loff_t *pos)
	__acquires(&nr_node_list_lock)
{
	spin_lock_bh(&nr_node_list_lock);
	return seq_hlist_start_head(&nr_node_list, *pos);
}

/* seq_file next: step to the following entry of nr_node_list. */
static void *nr_node_next(struct seq_file *seq, void *v, loff_t *pos)
{
	return seq_hlist_next(v, &nr_node_list, pos);
}

/* seq_file stop: drops the lock taken in nr_node_start(). */
static void nr_node_stop(struct seq_file *seq, void *v)
	__releases(&nr_node_list_lock)
{
	spin_unlock_bh(&nr_node_list_lock);
}

/* seq_file show: print the header line, or one node with its routes. */
static int nr_node_show(struct seq_file *seq, void *v)
{
	char buf[11];
	int i;

	if (v == SEQ_START_TOKEN)
		seq_puts(seq,
			 "callsign mnemonic w n qual obs neigh qual obs neigh qual obs neigh\n");
	else {
		struct nr_node *nr_node = hlist_entry(v, struct nr_node,
						      node_node);

		nr_node_lock(nr_node);
		/* An empty mnemonic is shown as "*"; "which" is printed 1-based. */
		seq_printf(seq, "%-9s %-7s %d %d",
			ax2asc(buf, &nr_node->callsign),
			(nr_node->mnemonic[0] == '\0') ? "*" : nr_node->mnemonic,
			nr_node->which + 1,
			nr_node->count);

		for (i = 0; i < nr_node->count; i++) {
			seq_printf(seq, " %3d %d %05d",
				nr_node->routes[i].quality,
				nr_node->routes[i].obs_count,
				nr_node->routes[i].neighbour->number);
		}
		nr_node_unlock(nr_node);

		seq_puts(seq, "\n");
	}
	return 0;
}

const struct seq_operations nr_node_seqops = {
	.start = nr_node_start,
	.next = nr_node_next,
	.stop = nr_node_stop,
	.show = nr_node_show,
};

/* seq_file start: takes nr_neigh_list_lock, released in nr_neigh_stop(). */
static void *nr_neigh_start(struct seq_file *seq, loff_t *pos)
	__acquires(&nr_neigh_list_lock)
{
	spin_lock_bh(&nr_neigh_list_lock);
	return seq_hlist_start_head(&nr_neigh_list, *pos);
}

/* seq_file next: step to the following entry of nr_neigh_list. */
static void *nr_neigh_next(struct seq_file *seq, void *v, loff_t *pos)
{
	return seq_hlist_next(v, &nr_neigh_list, pos);
}

/* seq_file stop: drops the lock taken in nr_neigh_start(). */
static void nr_neigh_stop(struct seq_file *seq, void *v)
	__releases(&nr_neigh_list_lock)
{
	spin_unlock_bh(&nr_neigh_list_lock);
}

/* seq_file show: print the header line, or one neighbour and its digipeaters. */
static int nr_neigh_show(struct seq_file *seq, void *v)
{
	char buf[11];
	int i;

	if (v == SEQ_START_TOKEN)
		seq_puts(seq, "addr callsign dev qual lock count failed digipeaters\n");
	else {
		struct nr_neigh *nr_neigh;

		nr_neigh = hlist_entry(v, struct nr_neigh, neigh_node);
		/* A neighbour without a device is printed with "???" as its name. */
		seq_printf(seq, "%05d %-9s %-4s %3d %d %3d %3d",
			nr_neigh->number,
			ax2asc(buf, &nr_neigh->callsign),
			nr_neigh->dev ? nr_neigh->dev->name : "???",
			nr_neigh->quality,
			nr_neigh->locked,
			nr_neigh->count,
			nr_neigh->failed);

		if (nr_neigh->digipeat != NULL) {
			for (i = 0; i < nr_neigh->digipeat->ndigi; i++)
				seq_printf(seq, " %s",
					   ax2asc(buf, &nr_neigh->digipeat->calls[i]));
		}

		seq_puts(seq, "\n");
	}
	return 0;
}

const struct seq_operations nr_neigh_seqops = {
	.start = nr_neigh_start,
	.next = nr_neigh_next,
	.stop = nr_neigh_stop,
	.show = nr_neigh_show,
};
#endif

/*
 *	Free all memory associated with the nodes and routes lists.
*/
void nr_rt_free(void)
{
	struct nr_neigh *neigh = NULL;
	struct nr_node *node = NULL;
	struct hlist_node *tmp;

	/* Neighbour list lock first, then node list lock. */
	spin_lock_bh(&nr_neigh_list_lock);
	spin_lock_bh(&nr_node_list_lock);

	/* Pass 1: unlink every node. */
	nr_node_for_each_safe(node, tmp, &nr_node_list) {
		nr_node_lock(node);
		nr_remove_node_locked(node);
		nr_node_unlock(node);
	}

	/* Pass 2: drop one reference per counted route, then unlink. */
	nr_neigh_for_each_safe(neigh, tmp, &nr_neigh_list) {
		while (neigh->count != 0) {
			neigh->count--;
			nr_neigh_put(neigh);
		}
		nr_remove_neigh_locked(neigh);
	}

	spin_unlock_bh(&nr_node_list_lock);
	spin_unlock_bh(&nr_neigh_list_lock);
}
1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 
527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 
1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 // SPDX-License-Identifier: GPL-2.0-or-later /* * Handle firewalling * Linux ethernet bridge * * Authors: * Lennert Buytenhek <buytenh@gnu.org> * Bart De Schuymer <bdschuym@pandora.be> * * Lennert dedicates this file to Kerstin Wurdinger. 
*/

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/ip.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <linux/if_arp.h>
#include <linux/if_ether.h>
#include <linux/if_vlan.h>
#include <linux/if_pppox.h>
#include <linux/ppp_defs.h>
#include <linux/netfilter_bridge.h>
#include <uapi/linux/netfilter_bridge.h>
#include <linux/netfilter_ipv4.h>
#include <linux/netfilter_ipv6.h>
#include <linux/netfilter_arp.h>
#include <linux/in_route.h>
#include <linux/rculist.h>
#include <linux/inetdevice.h>

#include <net/ip.h>
#include <net/ipv6.h>
#include <net/addrconf.h>
#include <net/dst_metadata.h>
#include <net/route.h>
#include <net/netfilter/br_netfilter.h>
#include <net/netns/generic.h>
#include <net/inet_dscp.h>

#include <linux/uaccess.h>
#include "br_private.h"
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif

#if IS_ENABLED(CONFIG_NF_CONNTRACK)
#include <net/netfilter/nf_conntrack_core.h>
#endif

/* Key passed to net_generic() to obtain the per-namespace struct brnf_net. */
static unsigned int brnf_net_id __read_mostly;

/* Per-network-namespace br_netfilter settings. */
struct brnf_net {
	bool enabled;

#ifdef CONFIG_SYSCTL
	struct ctl_table_header *ctl_hdr;
#endif

	/* default value is 1 */
	int call_iptables;
	int call_ip6tables;
	int call_arptables;

	/* default value is 0 */
	int filter_vlan_tagged;
	int filter_pppoe_tagged;
	int pass_vlan_indev;
};

/* True for skbs without a VLAN tag whose protocol is IPv4, IPv6 or ARP. */
#define IS_IP(skb) \
	(!skb_vlan_tag_present(skb) && skb->protocol == htons(ETH_P_IP))

#define IS_IPV6(skb) \
	(!skb_vlan_tag_present(skb) && skb->protocol == htons(ETH_P_IPV6))

#define IS_ARP(skb) \
	(!skb_vlan_tag_present(skb) && skb->protocol == htons(ETH_P_ARP))

/*
 * Protocol carried inside a VLAN frame: skb->protocol when a tag is present
 * per skb_vlan_tag_present(), the encapsulated protocol from the VLAN
 * Ethernet header when skb->protocol is ETH_P_8021Q, and 0 otherwise.
 */
static inline __be16 vlan_proto(const struct sk_buff *skb)
{
	if (skb_vlan_tag_present(skb))
		return skb->protocol;
	else if (skb->protocol == htons(ETH_P_8021Q))
		return vlan_eth_hdr(skb)->h_vlan_encapsulated_proto;
	else
		return 0;
}

/* True when the skb carries IPv4 inside a VLAN and filter_vlan_tagged is set. */
static inline bool is_vlan_ip(const struct sk_buff *skb, const struct net *net)
{
	struct brnf_net *brnet = net_generic(net, brnf_net_id);

	return vlan_proto(skb) == htons(ETH_P_IP) && brnet->filter_vlan_tagged;
}
/* True when the skb carries IPv6 inside a VLAN and filter_vlan_tagged is set. */
static inline bool is_vlan_ipv6(const struct sk_buff *skb,
				const struct net *net)
{
	struct brnf_net *brnet = net_generic(net, brnf_net_id);

	return vlan_proto(skb) == htons(ETH_P_IPV6) &&
	       brnet->filter_vlan_tagged;
}

/* True when the skb carries ARP inside a VLAN and filter_vlan_tagged is set. */
static inline bool is_vlan_arp(const struct sk_buff *skb, const struct net *net)
{
	struct brnf_net *brnet = net_generic(net, brnf_net_id);

	return vlan_proto(skb) == htons(ETH_P_ARP) && brnet->filter_vlan_tagged;
}

/*
 * Read the 16-bit value located ETH_HLEN + sizeof(struct pppoe_hdr) bytes
 * past the MAC header; callers compare it against PPP_IP / PPP_IPV6.
 */
static inline __be16 pppoe_proto(const struct sk_buff *skb)
{
	return *((__be16 *)(skb_mac_header(skb) + ETH_HLEN +
			    sizeof(struct pppoe_hdr)));
}

/* True for a PPPoE session frame carrying IPv4 when filter_pppoe_tagged is set. */
static inline bool is_pppoe_ip(const struct sk_buff *skb, const struct net *net)
{
	struct brnf_net *brnet = net_generic(net, brnf_net_id);

	return skb->protocol == htons(ETH_P_PPP_SES) &&
	       pppoe_proto(skb) == htons(PPP_IP) && brnet->filter_pppoe_tagged;
}

/* True for a PPPoE session frame carrying IPv6 when filter_pppoe_tagged is set. */
static inline bool is_pppoe_ipv6(const struct sk_buff *skb,
				 const struct net *net)
{
	struct brnf_net *brnet = net_generic(net, brnf_net_id);

	return skb->protocol == htons(ETH_P_PPP_SES) &&
	       pppoe_proto(skb) == htons(PPP_IPV6) &&
	       brnet->filter_pppoe_tagged;
}

/* largest possible L2 header, see br_nf_dev_queue_xmit() */
#define NF_BRIDGE_MAX_MAC_HEADER_LENGTH (PPPOE_SES_HLEN + ETH_HLEN)

/* Per-CPU record of a saved L2 header and VLAN tag (see br_nf_push_frag_xmit()). */
struct brnf_frag_data {
	local_lock_t bh_lock;
	char mac[NF_BRIDGE_MAX_MAC_HEADER_LENGTH];
	u8 encap_size;
	u8 size;
	u16 vlan_tci;
	__be16 vlan_proto;
};

static DEFINE_PER_CPU(struct brnf_frag_data, brnf_frag_data_storage) = {
	.bh_lock = INIT_LOCAL_LOCK(bh_lock),
};

/* Remove the SKB_EXT_BRIDGE_NF extension from the skb. */
static void nf_bridge_info_free(struct sk_buff *skb)
{
	skb_ext_del(skb, SKB_EXT_BRIDGE_NF);
}

/* Return the bridge device that dev is a port of, or NULL if it is no port. */
static inline struct net_device *bridge_parent(const struct net_device *dev)
{
	struct net_bridge_port *port;

	port = br_port_get_rcu(dev);
	return port ? port->br->dev : NULL;
}

/* Thin wrapper around skb_ext_add() for SKB_EXT_BRIDGE_NF. */
static inline struct nf_bridge_info *nf_bridge_unshare(struct sk_buff *skb)
{
	return skb_ext_add(skb, SKB_EXT_BRIDGE_NF);
}

/* Length of the encapsulation header implied by skb->protocol, or 0. */
unsigned int nf_bridge_encap_header_len(const struct sk_buff *skb)
{
	switch (skb->protocol) {
	case __cpu_to_be16(ETH_P_8021Q):
		return VLAN_HLEN;
	case __cpu_to_be16(ETH_P_PPP_SES):
		return PPPOE_SES_HLEN;
	default:
		return 0;
	}
}

/* Pull the encapsulation header and advance network_header by its length. */
static inline void nf_bridge_pull_encap_header(struct sk_buff *skb)
{
	unsigned int len = nf_bridge_encap_header_len(skb);

	skb_pull(skb, len);
	skb->network_header += len;
}

/* Same as nf_bridge_pull_encap_header() but pulls via skb_pull_rcsum(). */
static inline void nf_bridge_pull_encap_header_rcsum(struct sk_buff *skb)
{
	unsigned int len = nf_bridge_encap_header_len(skb);

	skb_pull_rcsum(skb, len);
	skb->network_header += len;
}

/* When handing a packet over to the IP layer
 * check whether we have a skb that is in the
 * expected format
 *
 * Returns 0 when the header passes all checks, -1 otherwise.
 */
static int br_validate_ipv4(struct net *net, struct sk_buff *skb)
{
	const struct iphdr *iph;
	u32 len;

	if (!pskb_may_pull(skb, sizeof(struct iphdr)))
		goto inhdr_error;

	iph = ip_hdr(skb);

	/* Basic sanity checks */
	if (iph->ihl < 5 || iph->version != 4)
		goto inhdr_error;

	if (!pskb_may_pull(skb, iph->ihl*4))
		goto inhdr_error;

	/* Re-read the header pointer after the second pskb_may_pull(). */
	iph = ip_hdr(skb);
	if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl)))
		goto csum_error;

	len = skb_ip_totlen(skb);
	if (skb->len < len) {
		__IP_INC_STATS(net, IPSTATS_MIB_INTRUNCATEDPKTS);
		goto drop;
	} else if (len < (iph->ihl*4))
		goto inhdr_error;

	/* Trim the skb to the total length given by the IP header. */
	if (pskb_trim_rcsum(skb, len)) {
		__IP_INC_STATS(net, IPSTATS_MIB_INDISCARDS);
		goto drop;
	}

	memset(IPCB(skb), 0, sizeof(struct inet_skb_parm));
	/* We should really parse IP options here but until
	 * somebody who actually uses IP options complains to
	 * us we'll just silently ignore the options because
	 * we're lazy!
	 */
	return 0;

	/* csum_error falls through, so both counters are bumped in that case. */
csum_error:
	__IP_INC_STATS(net, IPSTATS_MIB_CSUMERRORS);
inhdr_error:
	__IP_INC_STATS(net, IPSTATS_MIB_INHDRERRORS);
drop:
	return -1;
}

/* Set skb->protocol back to the value recorded in nf_bridge->orig_proto. */
void nf_bridge_update_protocol(struct sk_buff *skb)
{
	const struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb);

	switch (nf_bridge->orig_proto) {
	case BRNF_PROTO_8021Q:
		skb->protocol = htons(ETH_P_8021Q);
		break;
	case BRNF_PROTO_PPPOE:
		skb->protocol = htons(ETH_P_PPP_SES);
		break;
	case BRNF_PROTO_UNCHANGED:
		break;
	}
}

/* Obtain the correct destination MAC address, while preserving the original
 * source MAC address. If we already know this address, we just copy it. If we
 * don't, we use the neighbour framework to find out. In both cases, we make
 * sure that br_handle_frame_finish() is called afterwards.
 */
int br_nf_pre_routing_finish_bridge(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	struct neighbour *neigh;
	struct dst_entry *dst;

	skb->dev = bridge_parent(skb->dev);
	if (!skb->dev)
		goto free_skb;
	dst = skb_dst(skb);
	neigh = dst_neigh_lookup_skb(dst, skb);
	if (neigh) {
		struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb);
		int ret;

		/* A connected neighbour with a cached header takes the fast path. */
		if ((READ_ONCE(neigh->nud_state) & NUD_CONNECTED) &&
		    READ_ONCE(neigh->hh.hh_len)) {
			struct net_device *br_indev;

			br_indev = nf_bridge_get_physindev(skb, net);
			if (!br_indev) {
				neigh_release(neigh);
				goto free_skb;
			}

			neigh_hh_bridge(&neigh->hh, skb);
			skb->dev = br_indev;

			ret = br_handle_frame_finish(net, sk, skb);
		} else {
			/* the neighbour function below overwrites the complete
			 * MAC header, so we save the Ethernet source address and
			 * protocol number.
			 */
			skb_copy_from_linear_data_offset(skb,
							 -(ETH_HLEN-ETH_ALEN),
							 nf_bridge->neigh_header,
							 ETH_HLEN-ETH_ALEN);
			/* tell br_dev_xmit to continue with forwarding */
			nf_bridge->bridged_dnat = 1;
			/* FIXME Need to refragment */
			ret = READ_ONCE(neigh->output)(neigh, skb);
		}
		neigh_release(neigh);
		return ret;
	}
free_skb:
	kfree_skb(skb);
	return 0;
}

/* True when the IPv4 destination differs from the saved nf_bridge->ipv4_daddr. */
static inline bool
br_nf_ipv4_daddr_was_changed(const struct sk_buff *skb,
			     const struct nf_bridge_info *nf_bridge)
{
	return ip_hdr(skb)->daddr != nf_bridge->ipv4_daddr;
}

/* This requires some explaining. If DNAT has taken place,
 * we will need to fix up the destination Ethernet address.
 * This is also true when SNAT takes place (for the reply direction).
 *
 * There are two cases to consider:
 * 1. The packet was DNAT'ed to a device in the same bridge
 *    port group as it was received on. We can still bridge
 *    the packet.
 * 2. The packet was DNAT'ed to a different device, either
 *    a non-bridged device or another bridge port group.
 *    The packet will need to be routed.
 *
 * The correct way of distinguishing between these two cases is to
 * call ip_route_input() and to look at skb->dst->dev, which is
 * changed to the destination device if ip_route_input() succeeds.
 *
 * Let's first consider the case that ip_route_input() succeeds:
 *
 * If the output device equals the logical bridge device the packet
 * came in on, we can consider this bridging. The corresponding MAC
 * address will be obtained in br_nf_pre_routing_finish_bridge.
 * Otherwise, the packet is considered to be routed and we just
 * change the destination MAC address so that the packet will
 * later be passed up to the IP stack to be routed. For a redirected
 * packet, ip_route_input() will give back the localhost as output device,
 * which differs from the bridge device.
 *
 * Let's now consider the case that ip_route_input() fails:
 *
 * This can be because the destination address is martian, in which case
 * the packet will be dropped.
* If IP forwarding is disabled, ip_route_input() will fail, while
 * ip_route_output_key() can return success. The source
 * address for ip_route_output_key() is set to zero, so ip_route_output_key()
 * thinks we're handling a locally generated packet and won't care
 * if IP forwarding is enabled. If the output device equals the logical bridge
 * device, we proceed as if ip_route_input() succeeded. If it differs from the
 * logical bridge port or if ip_route_output_key() fails we drop the packet.
 */
static int br_nf_pre_routing_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb);
	struct net_device *dev = skb->dev, *br_indev;
	const struct iphdr *iph = ip_hdr(skb);
	enum skb_drop_reason reason;
	struct rtable *rt;

	br_indev = nf_bridge_get_physindev(skb, net);
	if (!br_indev) {
		kfree_skb(skb);
		return 0;
	}

	nf_bridge->frag_max_size = IPCB(skb)->frag_max_size;

	/* Undo the pkt_type change recorded in pkt_otherhost. */
	if (nf_bridge->pkt_otherhost) {
		skb->pkt_type = PACKET_OTHERHOST;
		nf_bridge->pkt_otherhost = false;
	}
	nf_bridge->in_prerouting = 0;
	if (br_nf_ipv4_daddr_was_changed(skb, nf_bridge)) {
		reason = ip_route_input(skb, iph->daddr, iph->saddr,
					ip4h_dscp(iph), dev);
		if (reason) {
			kfree_skb_reason(skb, reason);
			return 0;
		} else {
			/* Route points back at the bridge: keep bridging. */
			if (skb_dst(skb)->dev == dev) {
				skb->dev = br_indev;
				nf_bridge_update_protocol(skb);
				nf_bridge_push_encap_header(skb);
				br_nf_hook_thresh(NF_BR_PRE_ROUTING,
						  net, sk, skb, skb->dev,
						  NULL,
						  br_nf_pre_routing_finish_bridge);
				return 0;
			}
			/* Otherwise address the frame to the bridge device itself. */
			ether_addr_copy(eth_hdr(skb)->h_dest, dev->dev_addr);
			skb->pkt_type = PACKET_HOST;
		}
	} else {
		/* Destination unchanged: attach the bridge's own rtable, no ref taken. */
		rt = bridge_parent_rtable(br_indev);
		if (!rt) {
			kfree_skb(skb);
			return 0;
		}
		skb_dst_drop(skb);
		skb_dst_set_noref(skb, &rt->dst);
	}

	skb->dev = br_indev;
	nf_bridge_update_protocol(skb);
	nf_bridge_push_encap_header(skb);
	br_nf_hook_thresh(NF_BR_PRE_ROUTING, net, sk, skb, skb->dev, NULL,
			  br_handle_frame_finish);
	return 0;
}

/*
 * Pick the device to present as the logical one: the bridge that dev is a
 * port of or, when pass_vlan_indev is set and the skb has a VLAN tag, the
 * matching VLAN device on that bridge if one is found.
 */
static struct net_device *brnf_get_logical_dev(struct sk_buff *skb,
					       const struct net_device *dev,
					       const struct net *net)
{
	struct net_device *vlan, *br;
	struct brnf_net *brnet = net_generic(net, brnf_net_id);

	br = bridge_parent(dev);

	if (brnet->pass_vlan_indev == 0 || !skb_vlan_tag_present(skb))
		return br;

	vlan = __vlan_find_dev_deep_rcu(br, skb->vlan_proto,
				    skb_vlan_tag_get(skb) & VLAN_VID_MASK);

	return vlan ? vlan : br;
}

/* Some common code for IPv4/IPv6 */
struct net_device *setup_pre_routing(struct sk_buff *skb, const struct net *net)
{
	struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb);

	/* Remember the OTHERHOST state; br_nf_pre_routing_finish() restores it. */
	if (skb->pkt_type == PACKET_OTHERHOST) {
		skb->pkt_type = PACKET_HOST;
		nf_bridge->pkt_otherhost = true;
	}

	nf_bridge->in_prerouting = 1;
	nf_bridge->physinif = skb->dev->ifindex;
	skb->dev = brnf_get_logical_dev(skb, skb->dev, net);

	/* Record the encapsulation so nf_bridge_update_protocol() can restore it. */
	if (skb->protocol == htons(ETH_P_8021Q))
		nf_bridge->orig_proto = BRNF_PROTO_8021Q;
	else if (skb->protocol == htons(ETH_P_PPP_SES))
		nf_bridge->orig_proto = BRNF_PROTO_PPPOE;

	/* Must drop socket now because of tproxy. */
	skb_orphan(skb);
	return skb->dev;
}

/* Direct IPv6 traffic to br_nf_pre_routing_ipv6.
 * Replicate the checks that IPv4 does on packet reception.
 * Set skb->dev to the bridge device (i.e. parent of the
 * receiving device) to make netfilter happy, the REDIRECT
 * target in particular. Save the original destination IP
 * address to be able to detect DNAT afterwards.
*/
static unsigned int br_nf_pre_routing(void *priv,
				      struct sk_buff *skb,
				      const struct nf_hook_state *state)
{
	struct nf_bridge_info *nf_bridge;
	struct net_bridge_port *p;
	struct net_bridge *br;
	__u32 len = nf_bridge_encap_header_len(skb);
	struct brnf_net *brnet;

	/* The encapsulation header must be pullable before it is inspected. */
	if (unlikely(!pskb_may_pull(skb, len)))
		return NF_DROP_REASON(skb, SKB_DROP_REASON_PKT_TOO_SMALL, 0);

	p = br_port_get_rcu(state->in);
	if (p == NULL)
		return NF_DROP_REASON(skb, SKB_DROP_REASON_DEV_READY, 0);
	br = p->br;

	brnet = net_generic(state->net, brnf_net_id);
	if (IS_IPV6(skb) || is_vlan_ipv6(skb, state->net) ||
	    is_pppoe_ipv6(skb, state->net)) {
		/* Both the per-netns and the per-bridge switch are off: accept. */
		if (!brnet->call_ip6tables &&
		    !br_opt_get(br, BROPT_NF_CALL_IP6TABLES))
			return NF_ACCEPT;
		if (!ipv6_mod_enabled()) {
			/* NOTE(review): message has no trailing "\n" - confirm intent. */
			pr_warn_once("Module ipv6 is disabled, so call_ip6tables is not supported.");
			return NF_DROP_REASON(skb, SKB_DROP_REASON_IPV6DISABLED, 0);
		}

		nf_bridge_pull_encap_header_rcsum(skb);
		return br_nf_pre_routing_ipv6(priv, skb, state);
	}

	if (!brnet->call_iptables && !br_opt_get(br, BROPT_NF_CALL_IPTABLES))
		return NF_ACCEPT;

	/* Everything that is not IPv4 (plain, VLAN or PPPoE) is accepted as is. */
	if (!IS_IP(skb) && !is_vlan_ip(skb, state->net) &&
	    !is_pppoe_ip(skb, state->net))
		return NF_ACCEPT;

	nf_bridge_pull_encap_header_rcsum(skb);

	if (br_validate_ipv4(state->net, skb))
		return NF_DROP_REASON(skb, SKB_DROP_REASON_IP_INHDR, 0);

	if (!nf_bridge_alloc(skb))
		return NF_DROP_REASON(skb, SKB_DROP_REASON_NOMEM, 0);
	if (!setup_pre_routing(skb, state->net))
		return NF_DROP_REASON(skb, SKB_DROP_REASON_DEV_READY, 0);

	/* Save the destination so a later change (DNAT) can be detected. */
	nf_bridge = nf_bridge_info_get(skb);
	nf_bridge->ipv4_daddr = ip_hdr(skb)->daddr;

	skb->protocol = htons(ETH_P_IP);
	skb->transport_header = skb->network_header + ip_hdr(skb)->ihl * 4;

	/* Run the IPv4 PRE_ROUTING hooks; the skb is handed over to them. */
	NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING, state->net, state->sk, skb,
		skb->dev, NULL,
		br_nf_pre_routing_finish);

	return NF_STOLEN;
}

#if IS_ENABLED(CONFIG_NF_CONNTRACK)
/* conntracks' nf_confirm logic cannot handle cloned skbs referencing
 * the same nf_conn entry, which will happen for multicast (broadcast)
 * Frames on bridges.
*
 * Example:
 *      macvlan0
 *      br0
 *  ethX  ethY
 *
 * ethX (or Y) receives multicast or broadcast packet containing
 * an IP packet, not yet in conntrack table.
 *
 * 1. skb passes through bridge and fake-ip (br_netfilter)Prerouting.
 *    -> skb->_nfct now references a unconfirmed entry
 * 2. skb is broad/mcast packet. bridge now passes clones out on each bridge
 *    interface.
 * 3. skb gets passed up the stack.
 * 4. In macvlan case, macvlan driver retains clone(s) of the mcast skb
 *    and schedules a work queue to send them out on the lower devices.
 *
 *    The clone skb->_nfct is not a copy, it is the same entry as the
 *    original skb. The macvlan rx handler then returns RX_HANDLER_PASS.
 * 5. Normal conntrack hooks (in NF_INET_LOCAL_IN) confirm the orig skb.
 *
 * The Macvlan broadcast worker and normal confirm path will race.
 *
 * This race will not happen if step 2 already confirmed a clone. In that
 * case later steps perform skb_clone() with skb->_nfct already confirmed (in
 * hash table). This works fine.
 *
 * But such confirmation won't happen when eb/ip/nftables rules dropped the
 * packets before they reached the nf_confirm step in postrouting.
 *
 * Work around this problem by explicit confirmation of the entry at
 * LOCAL_IN time, before upper layer has a chance to clone the unconfirmed
 * entry.
 *
 */
static unsigned int br_nf_local_in(void *priv,
				   struct sk_buff *skb,
				   const struct nf_hook_state *state)
{
	bool promisc = BR_INPUT_SKB_CB(skb)->promisc;
	struct nf_conntrack *nfct = skb_nfct(skb);
	const struct nf_ct_hook *ct_hook;
	struct nf_conn *ct;
	int ret;

	/* Promiscuous-flagged skbs lose their conntrack reference and pass. */
	if (promisc) {
		nf_reset_ct(skb);
		return NF_ACCEPT;
	}

	if (!nfct || skb->pkt_type == PACKET_HOST)
		return NF_ACCEPT;

	ct = container_of(nfct, struct nf_conn, ct_general);
	if (likely(nf_ct_is_confirmed(ct)))
		return NF_ACCEPT;

	/* An unconfirmed entry is expected to have exactly one user here. */
	if (WARN_ON_ONCE(refcount_read(&nfct->use) != 1)) {
		nf_reset_ct(skb);
		return NF_ACCEPT;
	}

	WARN_ON_ONCE(skb_shared(skb));

	/* We can't call nf_confirm here, it would create a dependency
	 * on nf_conntrack module.
	 */
	ct_hook = rcu_dereference(nf_ct_hook);
	if (!ct_hook) {
		skb->_nfct = 0ul;
		nf_conntrack_put(nfct);
		return NF_ACCEPT;
	}

	/* confirm() runs with the encap header pulled; re-push unless stolen. */
	nf_bridge_pull_encap_header(skb);
	ret = ct_hook->confirm(skb);
	switch (ret & NF_VERDICT_MASK) {
	case NF_STOLEN:
		return NF_STOLEN;
	default:
		nf_bridge_push_encap_header(skb);
		break;
	}

	ct = container_of(nfct, struct nf_conn, ct_general);
	WARN_ON_ONCE(!nf_ct_is_confirmed(ct));

	return ret;
}
#endif

/* PF_BRIDGE/FORWARD *************************************************/
/*
 * okfn for the FORWARD hooks started by br_nf_forward_ip() and
 * br_nf_forward_arp(): restore skb state, then continue via NF_BR_FORWARD.
 */
static int br_nf_forward_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb);
	struct net_device *in;

	if (!IS_ARP(skb) && !is_vlan_arp(skb, net)) {

		if (skb->protocol == htons(ETH_P_IP))
			nf_bridge->frag_max_size = IPCB(skb)->frag_max_size;

		if (skb->protocol == htons(ETH_P_IPV6))
			nf_bridge->frag_max_size = IP6CB(skb)->frag_max_size;

		in = nf_bridge_get_physindev(skb, net);
		if (!in) {
			kfree_skb(skb);
			return 0;
		}
		if (nf_bridge->pkt_otherhost) {
			skb->pkt_type = PACKET_OTHERHOST;
			nf_bridge->pkt_otherhost = false;
		}
		nf_bridge_update_protocol(skb);
	} else {
		/* ARP: the input device was stashed in skb->cb by br_nf_forward_arp(). */
		in = *((struct net_device **)(skb->cb));
	}
	nf_bridge_push_encap_header(skb);

	br_nf_hook_thresh(NF_BR_FORWARD, net, sk, skb, in, skb->dev,
			  br_forward_finish);
	return 0;
}


/* Run the NF_INET_FORWARD hooks of family pf (IPv4 or IPv6) on a bridged skb. */
static unsigned int br_nf_forward_ip(struct sk_buff *skb,
				     const struct nf_hook_state *state,
				     u8 pf)
{
	struct nf_bridge_info *nf_bridge;
	struct net_device *parent;

	nf_bridge = nf_bridge_info_get(skb);
	if (!nf_bridge)
		return NF_ACCEPT;

	/* Need exclusive nf_bridge_info since we might have multiple
	 * different physoutdevs.
	 */
	if (!nf_bridge_unshare(skb))
		return NF_DROP_REASON(skb, SKB_DROP_REASON_NOMEM, 0);

	/* Fetch the pointer again after nf_bridge_unshare(). */
	nf_bridge = nf_bridge_info_get(skb);
	if (!nf_bridge)
		return NF_DROP_REASON(skb, SKB_DROP_REASON_NOMEM, 0);

	parent = bridge_parent(state->out);
	if (!parent)
		return NF_DROP_REASON(skb, SKB_DROP_REASON_DEV_READY, 0);

	nf_bridge_pull_encap_header(skb);

	if (skb->pkt_type == PACKET_OTHERHOST) {
		skb->pkt_type = PACKET_HOST;
		nf_bridge->pkt_otherhost = true;
	}

	if (pf == NFPROTO_IPV4) {
		if (br_validate_ipv4(state->net, skb))
			return NF_DROP_REASON(skb, SKB_DROP_REASON_IP_INHDR, 0);
		IPCB(skb)->frag_max_size = nf_bridge->frag_max_size;
		skb->protocol = htons(ETH_P_IP);
	} else if (pf == NFPROTO_IPV6) {
		if (br_validate_ipv6(state->net, skb))
			return NF_DROP_REASON(skb, SKB_DROP_REASON_IP_INHDR, 0);
		IP6CB(skb)->frag_max_size = nf_bridge->frag_max_size;
		skb->protocol = htons(ETH_P_IPV6);
	} else {
		WARN_ON_ONCE(1);
		return NF_DROP;
	}

	nf_bridge->physoutdev = skb->dev;

	/* Hooks see the logical input device and the bridge as output device. */
	NF_HOOK(pf, NF_INET_FORWARD, state->net, NULL, skb,
		brnf_get_logical_dev(skb, state->in, state->net),
		parent,	br_nf_forward_finish);

	return NF_STOLEN;
}

/* Run the NF_ARP_FORWARD hooks on a bridged ARP frame when arptables is enabled. */
static unsigned int br_nf_forward_arp(struct sk_buff *skb,
				      const struct nf_hook_state *state)
{
	struct net_bridge_port *p;
	struct net_bridge *br;
	struct net_device **d = (struct net_device **)(skb->cb);
	struct brnf_net *brnet;

	p = br_port_get_rcu(state->out);
	if (p == NULL)
		return NF_ACCEPT;
	br = p->br;

	brnet = net_generic(state->net, brnf_net_id);
	if (!brnet->call_arptables && !br_opt_get(br, BROPT_NF_CALL_ARPTABLES))
		return NF_ACCEPT;

	if (is_vlan_arp(skb, state->net))
		nf_bridge_pull_encap_header(skb);

	if (unlikely(!pskb_may_pull(skb, sizeof(struct arphdr))))
		return NF_DROP_REASON(skb, SKB_DROP_REASON_PKT_TOO_SMALL, 0);

	/* Only ARP with 4-byte protocol addresses is passed to the hooks. */
	if (arp_hdr(skb)->ar_pln != 4) {
		if (is_vlan_arp(skb, state->net))
			nf_bridge_push_encap_header(skb);
		return NF_ACCEPT;
	}
	/* Stash the input device in skb->cb for br_nf_forward_finish(). */
	*d = state->in;
	NF_HOOK(NFPROTO_ARP, NF_ARP_FORWARD, state->net, state->sk, skb,
		state->in, state->out, br_nf_forward_finish);

	return NF_STOLEN;
}

/*
This is the 'purely bridged' case. For IP, we pass the packet to * netfilter with indev and outdev set to the bridge device, * but we are still able to filter on the 'real' indev/outdev * because of the physdev module. For ARP, indev and outdev are the * bridge ports. */ static unsigned int br_nf_forward(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { if (IS_IP(skb) || is_vlan_ip(skb, state->net) || is_pppoe_ip(skb, state->net)) return br_nf_forward_ip(skb, state, NFPROTO_IPV4); if (IS_IPV6(skb) || is_vlan_ipv6(skb, state->net) || is_pppoe_ipv6(skb, state->net)) return br_nf_forward_ip(skb, state, NFPROTO_IPV6); if (IS_ARP(skb) || is_vlan_arp(skb, state->net)) return br_nf_forward_arp(skb, state); return NF_ACCEPT; } static int br_nf_push_frag_xmit(struct net *net, struct sock *sk, struct sk_buff *skb) { struct brnf_frag_data *data; int err; data = this_cpu_ptr(&brnf_frag_data_storage); err = skb_cow_head(skb, data->size); if (err) { kfree_skb(skb); return 0; } if (data->vlan_proto) __vlan_hwaccel_put_tag(skb, data->vlan_proto, data->vlan_tci); skb_copy_to_linear_data_offset(skb, -data->size, data->mac, data->size); __skb_push(skb, data->encap_size); nf_bridge_info_free(skb); return br_dev_queue_push_xmit(net, sk, skb); } static int br_nf_ip_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, int (*output)(struct net *, struct sock *, struct sk_buff *)) { unsigned int mtu = ip_skb_dst_mtu(sk, skb); struct iphdr *iph = ip_hdr(skb); if (unlikely(((iph->frag_off & htons(IP_DF)) && !skb->ignore_df) || (IPCB(skb)->frag_max_size && IPCB(skb)->frag_max_size > mtu))) { IP_INC_STATS(net, IPSTATS_MIB_FRAGFAILS); kfree_skb(skb); return -EMSGSIZE; } return ip_do_fragment(net, sk, skb, output); } static unsigned int nf_bridge_mtu_reduction(const struct sk_buff *skb) { const struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb); if (nf_bridge->orig_proto == BRNF_PROTO_PPPOE) return PPPOE_SES_HLEN; return 0; } static int 
br_nf_dev_queue_xmit(struct net *net, struct sock *sk, struct sk_buff *skb) { struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb); unsigned int mtu, mtu_reserved; int ret; mtu_reserved = nf_bridge_mtu_reduction(skb); mtu = skb->dev->mtu; if (nf_bridge->pkt_otherhost) { skb->pkt_type = PACKET_OTHERHOST; nf_bridge->pkt_otherhost = false; } if (nf_bridge->frag_max_size && nf_bridge->frag_max_size < mtu) mtu = nf_bridge->frag_max_size; nf_bridge_update_protocol(skb); nf_bridge_push_encap_header(skb); if (skb_is_gso(skb) || skb->len + mtu_reserved <= mtu) { nf_bridge_info_free(skb); return br_dev_queue_push_xmit(net, sk, skb); } /* Fragmentation on metadata/template dst is not supported */ if (unlikely(!skb_valid_dst(skb))) goto drop; /* This is wrong! We should preserve the original fragment * boundaries by preserving frag_list rather than refragmenting. */ if (IS_ENABLED(CONFIG_NF_DEFRAG_IPV4) && skb->protocol == htons(ETH_P_IP)) { struct brnf_frag_data *data; if (br_validate_ipv4(net, skb)) goto drop; IPCB(skb)->frag_max_size = nf_bridge->frag_max_size; local_lock_nested_bh(&brnf_frag_data_storage.bh_lock); data = this_cpu_ptr(&brnf_frag_data_storage); if (skb_vlan_tag_present(skb)) { data->vlan_tci = skb->vlan_tci; data->vlan_proto = skb->vlan_proto; } else { data->vlan_proto = 0; } data->encap_size = nf_bridge_encap_header_len(skb); data->size = ETH_HLEN + data->encap_size; skb_copy_from_linear_data_offset(skb, -data->size, data->mac, data->size); ret = br_nf_ip_fragment(net, sk, skb, br_nf_push_frag_xmit); local_unlock_nested_bh(&brnf_frag_data_storage.bh_lock); return ret; } if (IS_ENABLED(CONFIG_NF_DEFRAG_IPV6) && skb->protocol == htons(ETH_P_IPV6)) { const struct nf_ipv6_ops *v6ops = nf_get_ipv6_ops(); struct brnf_frag_data *data; if (br_validate_ipv6(net, skb)) goto drop; IP6CB(skb)->frag_max_size = nf_bridge->frag_max_size; local_lock_nested_bh(&brnf_frag_data_storage.bh_lock); data = this_cpu_ptr(&brnf_frag_data_storage); data->encap_size = 
nf_bridge_encap_header_len(skb); data->size = ETH_HLEN + data->encap_size; skb_copy_from_linear_data_offset(skb, -data->size, data->mac, data->size); if (v6ops) { ret = v6ops->fragment(net, sk, skb, br_nf_push_frag_xmit); local_unlock_nested_bh(&brnf_frag_data_storage.bh_lock); return ret; } local_unlock_nested_bh(&brnf_frag_data_storage.bh_lock); kfree_skb(skb); return -EMSGSIZE; } nf_bridge_info_free(skb); return br_dev_queue_push_xmit(net, sk, skb); drop: kfree_skb(skb); return 0; } /* PF_BRIDGE/POST_ROUTING ********************************************/ static unsigned int br_nf_post_routing(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb); struct net_device *realoutdev = bridge_parent(skb->dev); u_int8_t pf; /* if nf_bridge is set, but ->physoutdev is NULL, this packet came in * on a bridge, but was delivered locally and is now being routed: * * POST_ROUTING was already invoked from the ip stack. */ if (!nf_bridge || !nf_bridge->physoutdev) return NF_ACCEPT; if (!realoutdev) return NF_DROP_REASON(skb, SKB_DROP_REASON_DEV_READY, 0); if (IS_IP(skb) || is_vlan_ip(skb, state->net) || is_pppoe_ip(skb, state->net)) pf = NFPROTO_IPV4; else if (IS_IPV6(skb) || is_vlan_ipv6(skb, state->net) || is_pppoe_ipv6(skb, state->net)) pf = NFPROTO_IPV6; else return NF_ACCEPT; if (skb->pkt_type == PACKET_OTHERHOST) { skb->pkt_type = PACKET_HOST; nf_bridge->pkt_otherhost = true; } nf_bridge_pull_encap_header(skb); if (pf == NFPROTO_IPV4) skb->protocol = htons(ETH_P_IP); else skb->protocol = htons(ETH_P_IPV6); NF_HOOK(pf, NF_INET_POST_ROUTING, state->net, state->sk, skb, NULL, realoutdev, br_nf_dev_queue_xmit); return NF_STOLEN; } /* IP/SABOTAGE *****************************************************/ /* Don't hand locally destined packets to PF_INET(6)/PRE_ROUTING * for the second time. 
*/
static unsigned int ip_sabotage_in(void *priv,
				   struct sk_buff *skb,
				   const struct nf_hook_state *state)
{
	struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb);

	if (nf_bridge) {
		if (nf_bridge->sabotage_in_done)
			return NF_ACCEPT;

		/* Not in br_netfilter's own prerouting pass and not on an L3
		 * master/slave device: skip the remaining hooks by calling
		 * okfn directly, and remember that this was done.
		 */
		if (!nf_bridge->in_prerouting &&
		    !netif_is_l3_master(skb->dev) &&
		    !netif_is_l3_slave(skb->dev)) {
			nf_bridge->sabotage_in_done = 1;
			state->okfn(state->net, state->sk, skb);
			return NF_STOLEN;
		}
	}

	return NF_ACCEPT;
}

/* This is called when br_netfilter has called into iptables/netfilter,
 * and DNAT has taken place on a bridge-forwarded packet.
 *
 * neigh->output has created a new MAC header, with local br0 MAC
 * as saddr.
 *
 * This restores the original MAC saddr of the bridged packet
 * before invoking bridge forward logic to transmit the packet.
 */
static void br_nf_pre_routing_finish_bridge_slow(struct sk_buff *skb)
{
	struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb);
	struct net_device *br_indev;

	br_indev = nf_bridge_get_physindev(skb, dev_net(skb->dev));
	if (!br_indev) {
		kfree_skb(skb);
		return;
	}

	skb_pull(skb, ETH_HLEN);
	nf_bridge->bridged_dnat = 0;

	/* neigh_header must hold exactly the ETH_HLEN - ETH_ALEN bytes that
	 * are copied back in front of skb->data below.
	 */
	BUILD_BUG_ON(sizeof(nf_bridge->neigh_header) != (ETH_HLEN - ETH_ALEN));

	skb_copy_to_linear_data_offset(skb, -(ETH_HLEN - ETH_ALEN),
				       nf_bridge->neigh_header,
				       ETH_HLEN - ETH_ALEN);
	skb->dev = br_indev;

	nf_bridge->physoutdev = NULL;
	br_handle_frame_finish(dev_net(skb->dev), NULL, skb);
}

/* br_dev_xmit_hook implementation: returns 1 when the skb was a
 * bridged-DNAT packet and has been consumed by the slow path above,
 * 0 otherwise.
 */
static int br_nf_dev_xmit(struct sk_buff *skb)
{
	const struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb);

	if (nf_bridge && nf_bridge->bridged_dnat) {
		br_nf_pre_routing_finish_bridge_slow(skb);
		return 1;
	}
	return 0;
}

/* Callbacks published through nf_br_ops in br_netfilter_init(). */
static const struct nf_br_ops br_ops = {
	.br_dev_xmit_hook =	br_nf_dev_xmit,
};

/* For br_nf_post_routing, we need (prio = NF_BR_PRI_LAST), because
 * br_dev_queue_push_xmit is called afterwards */
static const struct nf_hook_ops br_nf_ops[] = {
	{
		.hook = br_nf_pre_routing,
		.pf = NFPROTO_BRIDGE,
		.hooknum = NF_BR_PRE_ROUTING,
		.priority = NF_BR_PRI_BRNF,
	},
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
	{
		.hook = br_nf_local_in,
		.pf = NFPROTO_BRIDGE,
		.hooknum = NF_BR_LOCAL_IN,
		.priority = NF_BR_PRI_LAST,
	},
#endif
	{
		.hook = br_nf_forward,
		.pf = NFPROTO_BRIDGE,
		.hooknum = NF_BR_FORWARD,
		.priority = NF_BR_PRI_BRNF,
	},
	{
		.hook = br_nf_post_routing,
		.pf = NFPROTO_BRIDGE,
		.hooknum = NF_BR_POST_ROUTING,
		.priority = NF_BR_PRI_LAST,
	},
	{
		.hook = ip_sabotage_in,
		.pf = NFPROTO_IPV4,
		.hooknum = NF_INET_PRE_ROUTING,
		.priority = NF_IP_PRI_FIRST,
	},
	{
		.hook = ip_sabotage_in,
		.pf = NFPROTO_IPV6,
		.hooknum = NF_INET_PRE_ROUTING,
		.priority = NF_IP6_PRI_FIRST,
	},
};

/* Netdevice notifier: on NETDEV_REGISTER of a bridge master, registers
 * br_nf_ops for that device's netns unless already enabled there.
 *
 * Returns NOTIFY_DONE for unrelated events/devices, NOTIFY_BAD if hook
 * registration fails, NOTIFY_OK otherwise.
 */
static int brnf_device_event(struct notifier_block *unused, unsigned long event,
			     void *ptr)
{
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
	struct brnf_net *brnet;
	struct net *net;
	int ret;

	if (event != NETDEV_REGISTER || !netif_is_bridge_master(dev))
		return NOTIFY_DONE;

	ASSERT_RTNL();

	net = dev_net(dev);
	brnet = net_generic(net, brnf_net_id);
	if (brnet->enabled)
		return NOTIFY_OK;

	ret = nf_register_net_hooks(net, br_nf_ops, ARRAY_SIZE(br_nf_ops));
	if (ret)
		return NOTIFY_BAD;

	brnet->enabled = true;
	return NOTIFY_OK;
}

static struct notifier_block brnf_notifier __read_mostly = {
	.notifier_call = brnf_device_event,
};

/* recursively invokes nf_hook_slow (again), skipping already-called
 * hooks (< NF_BR_PRI_BRNF).
 *
 * Called with rcu read lock held.
 */
int br_nf_hook_thresh(unsigned int hook, struct net *net,
		      struct sock *sk, struct sk_buff *skb,
		      struct net_device *indev,
		      struct net_device *outdev,
		      int (*okfn)(struct net *, struct sock *,
				  struct sk_buff *))
{
	const struct nf_hook_entries *e;
	struct nf_hook_state state;
	struct nf_hook_ops **ops;
	unsigned int i;
	int ret;

	e = rcu_dereference(net->nf.hooks_bridge[hook]);
	/* No hooks registered for this hook number: go straight to okfn. */
	if (!e)
		return okfn(net, sk, skb);

	ops = nf_hook_entries_get_hook_ops(e);
	/* Find the index of the first hook entry that still has to run. */
	for (i = 0; i < e->num_hook_entries; i++) {
		/* These hooks have already been called */
		if (ops[i]->priority < NF_BR_PRI_BRNF)
			continue;

		/* These hooks have not been called yet, run them. */
		if (ops[i]->priority > NF_BR_PRI_BRNF)
			break;

		/* take a closer look at NF_BR_PRI_BRNF. */
		if (ops[i]->hook == br_nf_pre_routing) {
			/* This hook diverted the skb to this function,
			 * hooks after this have not been run yet.
			 */
			i++;
			break;
		}
	}

	nf_hook_state_init(&state, hook, NFPROTO_BRIDGE, indev, outdev,
			   sk, net, okfn);

	ret = nf_hook_slow(skb, &state, e, i);
	/* A return of 1 from nf_hook_slow() means okfn still has to be run. */
	if (ret == 1)
		ret = okfn(net, sk, skb);

	return ret;
}

#ifdef CONFIG_SYSCTL
/* proc handler shared by every entry in brnf_table: after a write, any
 * non-zero value stored in ctl->data is normalised to 1.
 */
static
int brnf_sysctl_call_tables(const struct ctl_table *ctl, int write,
			    void *buffer, size_t *lenp, loff_t *ppos)
{
	int ret;

	ret = proc_dointvec(ctl, write, buffer, lenp, ppos);

	if (write && *(int *)(ctl->data))
		*(int *)(ctl->data) = 1;
	return ret;
}

/* Template sysctl table; the .data pointers are filled in per netns by
 * br_netfilter_sysctl_init_net(), which relies on the entry order below.
 */
static struct ctl_table brnf_table[] = {
	{
		.procname	= "bridge-nf-call-arptables",
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= brnf_sysctl_call_tables,
	},
	{
		.procname	= "bridge-nf-call-iptables",
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= brnf_sysctl_call_tables,
	},
	{
		.procname	= "bridge-nf-call-ip6tables",
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= brnf_sysctl_call_tables,
	},
	{
		.procname	= "bridge-nf-filter-vlan-tagged",
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= brnf_sysctl_call_tables,
	},
	{
		.procname	= "bridge-nf-filter-pppoe-tagged",
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= brnf_sysctl_call_tables,
	},
	{
		.procname	= "bridge-nf-pass-vlan-input-dev",
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= brnf_sysctl_call_tables,
	},
};

/* Default sysctl values: the three call_* switches on, the rest off. */
static inline void br_netfilter_sysctl_default(struct brnf_net *brnf)
{
	brnf->call_iptables = 1;
	brnf->call_ip6tables = 1;
	brnf->call_arptables = 1;
	brnf->filter_vlan_tagged = 0;
	brnf->filter_pppoe_tagged = 0;
	brnf->pass_vlan_indev = 0;
}

/* Register the "net/bridge" sysctl table for @net. init_net uses brnf_table
 * directly; every other netns gets its own kmemdup()ed copy.
 *
 * Returns 0 on success or -ENOMEM.
 */
static int br_netfilter_sysctl_init_net(struct net *net)
{
	struct ctl_table *table = brnf_table;
	struct brnf_net *brnet;

	if (!net_eq(net, &init_net)) {
		table = kmemdup(table, sizeof(brnf_table), GFP_KERNEL);
		if (!table)
			return -ENOMEM;
	}

	brnet = net_generic(net, brnf_net_id);
	/* Indices must match the entry order of brnf_table. */
	table[0].data = &brnet->call_arptables;
	table[1].data = &brnet->call_iptables;
	table[2].data = &brnet->call_ip6tables;
	table[3].data = &brnet->filter_vlan_tagged;
	table[4].data = &brnet->filter_pppoe_tagged;
	table[5].data = &brnet->pass_vlan_indev;

	br_netfilter_sysctl_default(brnet);

	brnet->ctl_hdr = register_net_sysctl_sz(net, "net/bridge", table,
						ARRAY_SIZE(brnf_table));
	if (!brnet->ctl_hdr) {
		if (!net_eq(net, &init_net))
			kfree(table);

		return -ENOMEM;
	}

	return 0;
}

/* Unregister the sysctl table of @net and free the per-netns copy. */
static void br_netfilter_sysctl_exit_net(struct net *net,
					 struct brnf_net *brnet)
{
	/* Fetch the table pointer before the header is unregistered. */
	const struct ctl_table *table = brnet->ctl_hdr->ctl_table_arg;

	unregister_net_sysctl_table(brnet->ctl_hdr);
	if (!net_eq(net, &init_net))
		kfree(table);
}

/* pernet .init: only sets up the sysctl table. */
static int __net_init brnf_init_net(struct net *net)
{
	return br_netfilter_sysctl_init_net(net);
}
#endif

/* pernet .exit: unregister the netfilter hooks if they were enabled for
 * this netns, then tear down its sysctl table.
 */
static void __net_exit brnf_exit_net(struct net *net)
{
	struct brnf_net *brnet;

	brnet = net_generic(net, brnf_net_id);
	if (brnet->enabled) {
		nf_unregister_net_hooks(net, br_nf_ops, ARRAY_SIZE(br_nf_ops));
		brnet->enabled = false;
	}

#ifdef CONFIG_SYSCTL
	br_netfilter_sysctl_exit_net(net, brnet);
#endif
}

static struct pernet_operations brnf_net_ops __read_mostly = {
#ifdef CONFIG_SYSCTL
	.init = brnf_init_net,
#endif
	.exit = brnf_exit_net,
	.id   = &brnf_net_id,
	.size = sizeof(struct brnf_net),
};

/* Module init: register the pernet ops and the netdevice notifier (rolling
 * the former back if the latter fails), then publish br_ops via nf_br_ops.
 */
static int __init br_netfilter_init(void)
{
	int ret;

	ret = register_pernet_subsys(&brnf_net_ops);
	if (ret < 0)
		return ret;

	ret = register_netdevice_notifier(&brnf_notifier);
	if (ret < 0) {
		unregister_pernet_subsys(&brnf_net_ops);
		return ret;
	}

	RCU_INIT_POINTER(nf_br_ops, &br_ops);
	printk(KERN_NOTICE "Bridge firewalling registered\n");
	return 0;
}

/* Module exit: undo br_netfilter_init() in reverse order. */
static void __exit br_netfilter_fini(void)
{
	RCU_INIT_POINTER(nf_br_ops, NULL);
	unregister_netdevice_notifier(&brnf_notifier);
	unregister_pernet_subsys(&brnf_net_ops);
}

module_init(br_netfilter_init);
module_exit(br_netfilter_fini);

MODULE_LICENSE("GPL");
MODULE_AUTHOR("Lennert Buytenhek <buytenh@gnu.org>");
/* Module metadata: additional author and one-line description. */
MODULE_AUTHOR("Bart De Schuymer <bdschuym@pandora.be>");
MODULE_DESCRIPTION("Linux ethernet netfilter firewall bridge");
15 15 38 38 38 38 3 3 3 38 38 9 9 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 
519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 
1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 // SPDX-License-Identifier: GPL-2.0 /* * dax: direct host memory access * Copyright (C) 2020 Red Hat, Inc. 
*/ #include "fuse_i.h" #include <linux/delay.h> #include <linux/dax.h> #include <linux/uio.h> #include <linux/pagemap.h> #include <linux/pfn_t.h> #include <linux/iomap.h> #include <linux/interval_tree.h> /* * Default memory range size. A power of 2 so it agrees with common FUSE_INIT * map_alignment values 4KB and 64KB. */ #define FUSE_DAX_SHIFT 21 #define FUSE_DAX_SZ (1 << FUSE_DAX_SHIFT) #define FUSE_DAX_PAGES (FUSE_DAX_SZ / PAGE_SIZE) /* Number of ranges reclaimer will try to free in one invocation */ #define FUSE_DAX_RECLAIM_CHUNK (10) /* * Dax memory reclaim threshold in percetage of total ranges. When free * number of free ranges drops below this threshold, reclaim can trigger * Default is 20% */ #define FUSE_DAX_RECLAIM_THRESHOLD (20) /** Translation information for file offsets to DAX window offsets */ struct fuse_dax_mapping { /* Pointer to inode where this memory range is mapped */ struct inode *inode; /* Will connect in fcd->free_ranges to keep track of free memory */ struct list_head list; /* For interval tree in file/inode */ struct interval_tree_node itn; /* Will connect in fc->busy_ranges to keep track busy memory */ struct list_head busy_list; /** Position in DAX window */ u64 window_offset; /** Length of mapping, in bytes */ loff_t length; /* Is this mapping read-only or read-write */ bool writable; /* reference count when the mapping is used by dax iomap. 
*/ refcount_t refcnt; }; /* Per-inode dax map */ struct fuse_inode_dax { /* Semaphore to protect modifications to the dmap tree */ struct rw_semaphore sem; /* Sorted rb tree of struct fuse_dax_mapping elements */ struct rb_root_cached tree; unsigned long nr; }; struct fuse_conn_dax { /* DAX device */ struct dax_device *dev; /* Lock protecting accessess to members of this structure */ spinlock_t lock; /* List of memory ranges which are busy */ unsigned long nr_busy_ranges; struct list_head busy_ranges; /* Worker to free up memory ranges */ struct delayed_work free_work; /* Wait queue for a dax range to become free */ wait_queue_head_t range_waitq; /* DAX Window Free Ranges */ long nr_free_ranges; struct list_head free_ranges; unsigned long nr_ranges; }; static inline struct fuse_dax_mapping * node_to_dmap(struct interval_tree_node *node) { if (!node) return NULL; return container_of(node, struct fuse_dax_mapping, itn); } static struct fuse_dax_mapping * alloc_dax_mapping_reclaim(struct fuse_conn_dax *fcd, struct inode *inode); static void __kick_dmap_free_worker(struct fuse_conn_dax *fcd, unsigned long delay_ms) { unsigned long free_threshold; /* If number of free ranges are below threshold, start reclaim */ free_threshold = max_t(unsigned long, fcd->nr_ranges * FUSE_DAX_RECLAIM_THRESHOLD / 100, 1); if (fcd->nr_free_ranges < free_threshold) queue_delayed_work(system_long_wq, &fcd->free_work, msecs_to_jiffies(delay_ms)); } static void kick_dmap_free_worker(struct fuse_conn_dax *fcd, unsigned long delay_ms) { spin_lock(&fcd->lock); __kick_dmap_free_worker(fcd, delay_ms); spin_unlock(&fcd->lock); } static struct fuse_dax_mapping *alloc_dax_mapping(struct fuse_conn_dax *fcd) { struct fuse_dax_mapping *dmap; spin_lock(&fcd->lock); dmap = list_first_entry_or_null(&fcd->free_ranges, struct fuse_dax_mapping, list); if (dmap) { list_del_init(&dmap->list); WARN_ON(fcd->nr_free_ranges <= 0); fcd->nr_free_ranges--; } __kick_dmap_free_worker(fcd, 0); spin_unlock(&fcd->lock); 
return dmap; } /* This assumes fcd->lock is held */ static void __dmap_remove_busy_list(struct fuse_conn_dax *fcd, struct fuse_dax_mapping *dmap) { list_del_init(&dmap->busy_list); WARN_ON(fcd->nr_busy_ranges == 0); fcd->nr_busy_ranges--; } static void dmap_remove_busy_list(struct fuse_conn_dax *fcd, struct fuse_dax_mapping *dmap) { spin_lock(&fcd->lock); __dmap_remove_busy_list(fcd, dmap); spin_unlock(&fcd->lock); } /* This assumes fcd->lock is held */ static void __dmap_add_to_free_pool(struct fuse_conn_dax *fcd, struct fuse_dax_mapping *dmap) { list_add_tail(&dmap->list, &fcd->free_ranges); fcd->nr_free_ranges++; wake_up(&fcd->range_waitq); } static void dmap_add_to_free_pool(struct fuse_conn_dax *fcd, struct fuse_dax_mapping *dmap) { /* Return fuse_dax_mapping to free list */ spin_lock(&fcd->lock); __dmap_add_to_free_pool(fcd, dmap); spin_unlock(&fcd->lock); } static int fuse_setup_one_mapping(struct inode *inode, unsigned long start_idx, struct fuse_dax_mapping *dmap, bool writable, bool upgrade) { struct fuse_mount *fm = get_fuse_mount(inode); struct fuse_conn_dax *fcd = fm->fc->dax; struct fuse_inode *fi = get_fuse_inode(inode); struct fuse_setupmapping_in inarg; loff_t offset = start_idx << FUSE_DAX_SHIFT; FUSE_ARGS(args); ssize_t err; WARN_ON(fcd->nr_free_ranges < 0); /* Ask fuse daemon to setup mapping */ memset(&inarg, 0, sizeof(inarg)); inarg.foffset = offset; inarg.fh = -1; inarg.moffset = dmap->window_offset; inarg.len = FUSE_DAX_SZ; inarg.flags |= FUSE_SETUPMAPPING_FLAG_READ; if (writable) inarg.flags |= FUSE_SETUPMAPPING_FLAG_WRITE; args.opcode = FUSE_SETUPMAPPING; args.nodeid = fi->nodeid; args.in_numargs = 1; args.in_args[0].size = sizeof(inarg); args.in_args[0].value = &inarg; err = fuse_simple_request(fm, &args); if (err < 0) return err; dmap->writable = writable; if (!upgrade) { /* * We don't take a reference on inode. inode is valid right now * and when inode is going away, cleanup logic should first * cleanup dmap entries. 
*/ dmap->inode = inode; dmap->itn.start = dmap->itn.last = start_idx; /* Protected by fi->dax->sem */ interval_tree_insert(&dmap->itn, &fi->dax->tree); fi->dax->nr++; spin_lock(&fcd->lock); list_add_tail(&dmap->busy_list, &fcd->busy_ranges); fcd->nr_busy_ranges++; spin_unlock(&fcd->lock); } return 0; } static int fuse_send_removemapping(struct inode *inode, struct fuse_removemapping_in *inargp, struct fuse_removemapping_one *remove_one) { struct fuse_inode *fi = get_fuse_inode(inode); struct fuse_mount *fm = get_fuse_mount(inode); FUSE_ARGS(args); args.opcode = FUSE_REMOVEMAPPING; args.nodeid = fi->nodeid; args.in_numargs = 3; fuse_set_zero_arg0(&args); args.in_args[1].size = sizeof(*inargp); args.in_args[1].value = inargp; args.in_args[2].size = inargp->count * sizeof(*remove_one); args.in_args[2].value = remove_one; return fuse_simple_request(fm, &args); } static int dmap_removemapping_list(struct inode *inode, unsigned int num, struct list_head *to_remove) { struct fuse_removemapping_one *remove_one, *ptr; struct fuse_removemapping_in inarg; struct fuse_dax_mapping *dmap; int ret, i = 0, nr_alloc; nr_alloc = min_t(unsigned int, num, FUSE_REMOVEMAPPING_MAX_ENTRY); remove_one = kmalloc_array(nr_alloc, sizeof(*remove_one), GFP_NOFS); if (!remove_one) return -ENOMEM; ptr = remove_one; list_for_each_entry(dmap, to_remove, list) { ptr->moffset = dmap->window_offset; ptr->len = dmap->length; ptr++; i++; num--; if (i >= nr_alloc || num == 0) { memset(&inarg, 0, sizeof(inarg)); inarg.count = i; ret = fuse_send_removemapping(inode, &inarg, remove_one); if (ret) goto out; ptr = remove_one; i = 0; } } out: kfree(remove_one); return ret; } /* * Cleanup dmap entry and add back to free list. This should be called with * fcd->lock held. 
*/
static void dmap_reinit_add_to_free_pool(struct fuse_conn_dax *fcd,
					    struct fuse_dax_mapping *dmap)
{
	pr_debug("fuse: freeing memory range start_idx=0x%lx end_idx=0x%lx window_offset=0x%llx length=0x%llx\n",
		 dmap->itn.start, dmap->itn.last, dmap->window_offset,
		 dmap->length);
	__dmap_remove_busy_list(fcd, dmap);
	/* Detach the range from its inode before it goes back to the pool. */
	dmap->inode = NULL;
	dmap->itn.start = dmap->itn.last = 0;
	__dmap_add_to_free_pool(fcd, dmap);
}

/*
 * Free inode dmap entries whose range falls inside [start, end].
 * Does not take any locks. At this point of time it should only be
 * called from evict_inode() path where we know all dmap entries can be
 * reclaimed.
 */
static void inode_reclaim_dmap_range(struct fuse_conn_dax *fcd,
				     struct inode *inode,
				     loff_t start, loff_t end)
{
	struct fuse_inode *fi = get_fuse_inode(inode);
	struct fuse_dax_mapping *dmap, *n;
	int err, num = 0;
	LIST_HEAD(to_remove);
	unsigned long start_idx = start >> FUSE_DAX_SHIFT;
	unsigned long end_idx = end >> FUSE_DAX_SHIFT;
	struct interval_tree_node *node;

	/* Pull every overlapping mapping out of the inode's interval tree
	 * and collect it on the local to_remove list.
	 */
	while (1) {
		node = interval_tree_iter_first(&fi->dax->tree, start_idx,
						end_idx);
		if (!node)
			break;
		dmap = node_to_dmap(node);
		/* inode is going away. There should not be any users of dmap */
		WARN_ON(refcount_read(&dmap->refcnt) > 1);
		interval_tree_remove(&dmap->itn, &fi->dax->tree);
		num++;
		list_add(&dmap->list, &to_remove);
	}

	/* Nothing to remove */
	if (list_empty(&to_remove))
		return;

	WARN_ON(fi->dax->nr < num);
	fi->dax->nr -= num;
	err = dmap_removemapping_list(inode, num, &to_remove);
	/* -ENOTCONN is not reported; any other failure is only logged. */
	if (err && err != -ENOTCONN) {
		pr_warn("Failed to removemappings. start=0x%llx end=0x%llx\n",
			start, end);
	}
	/* Return the ranges to the free pool regardless of err. */
	spin_lock(&fcd->lock);
	list_for_each_entry_safe(dmap, n, &to_remove, list) {
		list_del_init(&dmap->list);
		dmap_reinit_add_to_free_pool(fcd, dmap);
	}
	spin_unlock(&fcd->lock);
}

/* Send a single-entry FUSE_REMOVEMAPPING request for @dmap. Returns the
 * result of fuse_send_removemapping().
 */
static int dmap_removemapping_one(struct inode *inode,
				  struct fuse_dax_mapping *dmap)
{
	struct fuse_removemapping_one forget_one;
	struct fuse_removemapping_in inarg;

	memset(&inarg, 0, sizeof(inarg));
	inarg.count = 1;
	memset(&forget_one, 0, sizeof(forget_one));
	forget_one.moffset = dmap->window_offset;
	forget_one.len = dmap->length;

	return fuse_send_removemapping(inode, &inarg, &forget_one);
}

/*
 * It is called from evict_inode() and by that time inode is going away. So
 * this function does not take any locks like fi->dax->sem for traversing
 * that fuse inode interval tree. If that lock is taken then lock validator
 * complains of deadlock situation w.r.t fs_reclaim lock.
 */
void fuse_dax_inode_cleanup(struct inode *inode)
{
	struct fuse_conn *fc = get_fuse_conn(inode);
	struct fuse_inode *fi = get_fuse_inode(inode);

	/*
	 * fuse_evict_inode() has already called truncate_inode_pages_final()
	 * before we arrive here. So we should not have to worry about any
	 * pages/exception entries still associated with inode.
	 */
	inode_reclaim_dmap_range(fc->dax, inode, 0, -1);
	WARN_ON(fi->dax->nr);
}

/* Describe @length bytes as an unmapped hole in @iomap. */
static void fuse_fill_iomap_hole(struct iomap *iomap, loff_t length)
{
	iomap->addr = IOMAP_NULL_ADDR;
	iomap->length = length;
	iomap->type = IOMAP_HOLE;
}

/* Fill @iomap for file position @pos from @dmap.
 *
 * The mapped length is clamped to the end of the dmap and to i_size. If
 * anything is left, the iomap is marked IOMAP_MAPPED, a reference is taken
 * on @dmap and it is stored in iomap->private; otherwise the range is
 * reported as a hole.
 */
static void fuse_fill_iomap(struct inode *inode, loff_t pos, loff_t length,
			    struct iomap *iomap, struct fuse_dax_mapping *dmap,
			    unsigned int flags)
{
	loff_t offset, len;
	loff_t i_size = i_size_read(inode);

	/* Byte offset of pos within this dmap. */
	offset = pos - (dmap->itn.start << FUSE_DAX_SHIFT);
	len = min(length, dmap->length - offset);

	/* If length is beyond end of file, truncate further */
	if (pos + len > i_size)
		len = i_size - pos;

	if (len > 0) {
		iomap->addr = dmap->window_offset + offset;
		iomap->length = len;
		/* For faults the length is rounded up to a page multiple. */
		if (flags & IOMAP_FAULT)
			iomap->length = ALIGN(len, PAGE_SIZE);
		iomap->type = IOMAP_MAPPED;
		/*
		 * increase refcnt so that reclaim code knows this dmap is in
		 * use. This assumes fi->dax->sem mutex is held either
		 * shared/exclusive.
		 */
		refcount_inc(&dmap->refcnt);

		/* iomap->private should be NULL */
		WARN_ON_ONCE(iomap->private);
		iomap->private = dmap;
	} else {
		/* Mapping beyond end of file is hole */
		fuse_fill_iomap_hole(iomap, length);
	}
}

/* Allocate a free DAX range, set up a mapping for the range containing @pos
 * and fill @iomap from it.
 *
 * Returns 0 on success, -EAGAIN in the fault path when no free range is
 * available, the error from alloc_dax_mapping_reclaim(), -EIO if no range
 * was obtained, or the error from fuse_setup_one_mapping().
 */
static int fuse_setup_new_dax_mapping(struct inode *inode, loff_t pos,
				      loff_t length, unsigned int flags,
				      struct iomap *iomap)
{
	struct fuse_inode *fi = get_fuse_inode(inode);
	struct fuse_conn *fc = get_fuse_conn(inode);
	struct fuse_conn_dax *fcd = fc->dax;
	struct fuse_dax_mapping *dmap, *alloc_dmap = NULL;
	int ret;
	bool writable = flags & IOMAP_WRITE;
	unsigned long start_idx = pos >> FUSE_DAX_SHIFT;
	struct interval_tree_node *node;

	/*
	 * Can't do inline reclaim in fault path. We call
	 * dax_layout_busy_page() before we free a range. And
	 * fuse_wait_dax_page() drops mapping->invalidate_lock and requires it.
	 * In fault path we enter with mapping->invalidate_lock held and can't
	 * drop it. Also in fault path we hold mapping->invalidate_lock shared
	 * and not exclusive, so that creates further issues with
	 * fuse_wait_dax_page().  Hence return -EAGAIN and fuse_dax_fault()
	 * will wait for a memory range to become free and retry.
	 */
	if (flags & IOMAP_FAULT) {
		alloc_dmap = alloc_dax_mapping(fcd);
		if (!alloc_dmap)
			return -EAGAIN;
	} else {
		alloc_dmap = alloc_dax_mapping_reclaim(fcd, inode);
		if (IS_ERR(alloc_dmap))
			return PTR_ERR(alloc_dmap);
	}

	/* If we are here, we should have memory allocated */
	if (WARN_ON(!alloc_dmap))
		return -EIO;

	/*
	 * Take write lock so that only one caller can try to setup mapping
	 * and other waits.
	 */
	down_write(&fi->dax->sem);
	/*
	 * We dropped lock. Check again if somebody else setup
	 * mapping already.
	 */
	node = interval_tree_iter_first(&fi->dax->tree, start_idx, start_idx);
	if (node) {
		/* Use the existing mapping and give the unused range back. */
		dmap = node_to_dmap(node);
		fuse_fill_iomap(inode, pos, length, iomap, dmap, flags);
		dmap_add_to_free_pool(fcd, alloc_dmap);
		up_write(&fi->dax->sem);
		return 0;
	}

	/* Setup one mapping */
	ret = fuse_setup_one_mapping(inode, pos >> FUSE_DAX_SHIFT, alloc_dmap,
				     writable, false);
	if (ret < 0) {
		dmap_add_to_free_pool(fcd, alloc_dmap);
		up_write(&fi->dax->sem);
		return ret;
	}
	fuse_fill_iomap(inode, pos, length, iomap, alloc_dmap, flags);
	up_write(&fi->dax->sem);
	return 0;
}

/* Upgrade the existing mapping covering @pos to read-write and fill @iomap.
 *
 * Drops one reference on the dmap (see the comment in the body about the
 * extra reference held on entry). Returns 0 on success, -EIO if no mapping
 * is found, or the error from fuse_setup_one_mapping().
 */
static int fuse_upgrade_dax_mapping(struct inode *inode, loff_t pos,
				    loff_t length, unsigned int flags,
				    struct iomap *iomap)
{
	struct fuse_inode *fi = get_fuse_inode(inode);
	struct fuse_dax_mapping *dmap;
	int ret;
	unsigned long idx = pos >> FUSE_DAX_SHIFT;
	struct interval_tree_node *node;

	/*
	 * Take exclusive lock so that only one caller can try to setup
	 * mapping and others wait.
	 */
	down_write(&fi->dax->sem);
	node = interval_tree_iter_first(&fi->dax->tree, idx, idx);

	/* We are holding either inode lock or invalidate_lock, and that should
	 * ensure that dmap can't be truncated. We are holding a reference
	 * on dmap and that should make sure it can't be reclaimed. So dmap
	 * should still be there in tree despite the fact we dropped and
	 * re-acquired the fi->dax->sem lock.
	 */
	ret = -EIO;
	if (WARN_ON(!node))
		goto out_err;

	dmap = node_to_dmap(node);

	/* We took an extra reference on dmap to make sure it's not reclaimed.
	 * Now we hold fi->dax->sem lock and that reference is not needed
	 * anymore. Drop it.
	 */
	if (refcount_dec_and_test(&dmap->refcnt)) {
		/* refcount should not hit 0. This object only goes
		 * away when fuse connection goes away
		 */
		WARN_ON_ONCE(1);
	}

	/* Maybe another thread already upgraded mapping while we were not
	 * holding lock.
	 */
	if (dmap->writable) {
		ret = 0;
		goto out_fill_iomap;
	}

	ret = fuse_setup_one_mapping(inode, pos >> FUSE_DAX_SHIFT, dmap, true,
				     true);
	if (ret < 0)
		goto out_err;
out_fill_iomap:
	fuse_fill_iomap(inode, pos, length, iomap, dmap, flags);
out_err:
	up_write(&fi->dax->sem);
	return ret;
}

/* This is just for DAX and the mapping is ephemeral, do not use it for other
 * purposes since there is no block device with a permanent mapping.
 */
static int fuse_iomap_begin(struct inode *inode, loff_t pos, loff_t length,
			    unsigned int flags, struct iomap *iomap,
			    struct iomap *srcmap)
{
	struct fuse_inode *fi = get_fuse_inode(inode);
	struct fuse_conn *fc = get_fuse_conn(inode);
	struct fuse_dax_mapping *dmap;
	bool writable = flags & IOMAP_WRITE;
	unsigned long start_idx = pos >> FUSE_DAX_SHIFT;
	struct interval_tree_node *node;

	/* We don't support FIEMAP */
	if (WARN_ON(flags & IOMAP_REPORT))
		return -EIO;

	iomap->offset = pos;
	iomap->flags = 0;
	iomap->bdev = NULL;
	iomap->dax_dev = fc->dax->dev;

	/*
	 * Both read/write and mmap path can race here. So we need something
	 * to make sure if we are setting up mapping, then other path waits
	 *
	 * For now, use a semaphore for this. It probably needs to be
	 * optimized later.
	 */
	down_read(&fi->dax->sem);
	node = interval_tree_iter_first(&fi->dax->tree, start_idx, start_idx);
	if (node) {
		dmap = node_to_dmap(node);
		if (writable && !dmap->writable) {
			/* Upgrade read-only mapping to read-write.
*/ ret = -EIO; if (WARN_ON(!node)) goto out_err; dmap = node_to_dmap(node); /* We took an extra reference on dmap to make sure its not reclaimd. * Now we hold fi->dax->sem lock and that reference is not needed * anymore. Drop it. */ if (refcount_dec_and_test(&dmap->refcnt)) { /* refcount should not hit 0. This object only goes * away when fuse connection goes away */ WARN_ON_ONCE(1); } /* Maybe another thread already upgraded mapping while we were not * holding lock. */ if (dmap->writable) { ret = 0; goto out_fill_iomap; } ret = fuse_setup_one_mapping(inode, pos >> FUSE_DAX_SHIFT, dmap, true, true); if (ret < 0) goto out_err; out_fill_iomap: fuse_fill_iomap(inode, pos, length, iomap, dmap, flags); out_err: up_write(&fi->dax->sem); return ret; } /* This is just for DAX and the mapping is ephemeral, do not use it for other * purposes since there is no block device with a permanent mapping. */ static int fuse_iomap_begin(struct inode *inode, loff_t pos, loff_t length, unsigned int flags, struct iomap *iomap, struct iomap *srcmap) { struct fuse_inode *fi = get_fuse_inode(inode); struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_dax_mapping *dmap; bool writable = flags & IOMAP_WRITE; unsigned long start_idx = pos >> FUSE_DAX_SHIFT; struct interval_tree_node *node; /* We don't support FIEMAP */ if (WARN_ON(flags & IOMAP_REPORT)) return -EIO; iomap->offset = pos; iomap->flags = 0; iomap->bdev = NULL; iomap->dax_dev = fc->dax->dev; /* * Both read/write and mmap path can race here. So we need something * to make sure if we are setting up mapping, then other path waits * * For now, use a semaphore for this. It probably needs to be * optimized later. */ down_read(&fi->dax->sem); node = interval_tree_iter_first(&fi->dax->tree, start_idx, start_idx); if (node) { dmap = node_to_dmap(node); if (writable && !dmap->writable) { /* Upgrade read-only mapping to read-write. 
This will * require exclusive fi->dax->sem lock as we don't want * two threads to be trying to this simultaneously * for same dmap. So drop shared lock and acquire * exclusive lock. * * Before dropping fi->dax->sem lock, take reference * on dmap so that its not freed by range reclaim. */ refcount_inc(&dmap->refcnt); up_read(&fi->dax->sem); pr_debug("%s: Upgrading mapping at offset 0x%llx length 0x%llx\n", __func__, pos, length); return fuse_upgrade_dax_mapping(inode, pos, length, flags, iomap); } else { fuse_fill_iomap(inode, pos, length, iomap, dmap, flags); up_read(&fi->dax->sem); return 0; } } else { up_read(&fi->dax->sem); pr_debug("%s: no mapping at offset 0x%llx length 0x%llx\n", __func__, pos, length); if (pos >= i_size_read(inode)) goto iomap_hole; return fuse_setup_new_dax_mapping(inode, pos, length, flags, iomap); } /* * If read beyond end of file happens, fs code seems to return * it as hole */ iomap_hole: fuse_fill_iomap_hole(iomap, length); pr_debug("%s returning hole mapping. pos=0x%llx length_asked=0x%llx length_returned=0x%llx\n", __func__, pos, length, iomap->length); return 0; } static int fuse_iomap_end(struct inode *inode, loff_t pos, loff_t length, ssize_t written, unsigned int flags, struct iomap *iomap) { struct fuse_dax_mapping *dmap = iomap->private; if (dmap) { if (refcount_dec_and_test(&dmap->refcnt)) { /* refcount should not hit 0. This object only goes * away when fuse connection goes away */ WARN_ON_ONCE(1); } } /* DAX writes beyond end-of-file aren't handled using iomap, so the * file size is unchanged and there is nothing to do here. 
*/ return 0; } static const struct iomap_ops fuse_iomap_ops = { .iomap_begin = fuse_iomap_begin, .iomap_end = fuse_iomap_end, }; static void fuse_wait_dax_page(struct inode *inode) { filemap_invalidate_unlock(inode->i_mapping); schedule(); filemap_invalidate_lock(inode->i_mapping); } /* Should be called with mapping->invalidate_lock held exclusively */ static int __fuse_dax_break_layouts(struct inode *inode, bool *retry, loff_t start, loff_t end) { struct page *page; page = dax_layout_busy_page_range(inode->i_mapping, start, end); if (!page) return 0; *retry = true; return ___wait_var_event(&page->_refcount, atomic_read(&page->_refcount) == 1, TASK_INTERRUPTIBLE, 0, 0, fuse_wait_dax_page(inode)); } /* dmap_end == 0 leads to unmapping of whole file */ int fuse_dax_break_layouts(struct inode *inode, u64 dmap_start, u64 dmap_end) { bool retry; int ret; do { retry = false; ret = __fuse_dax_break_layouts(inode, &retry, dmap_start, dmap_end); } while (ret == 0 && retry); return ret; } ssize_t fuse_dax_read_iter(struct kiocb *iocb, struct iov_iter *to) { struct inode *inode = file_inode(iocb->ki_filp); ssize_t ret; if (iocb->ki_flags & IOCB_NOWAIT) { if (!inode_trylock_shared(inode)) return -EAGAIN; } else { inode_lock_shared(inode); } ret = dax_iomap_rw(iocb, to, &fuse_iomap_ops); inode_unlock_shared(inode); /* TODO file_accessed(iocb->f_filp) */ return ret; } static bool file_extending_write(struct kiocb *iocb, struct iov_iter *from) { struct inode *inode = file_inode(iocb->ki_filp); return (iov_iter_rw(from) == WRITE && ((iocb->ki_pos) >= i_size_read(inode) || (iocb->ki_pos + iov_iter_count(from) > i_size_read(inode)))); } static ssize_t fuse_dax_direct_write(struct kiocb *iocb, struct iov_iter *from) { struct inode *inode = file_inode(iocb->ki_filp); struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(iocb); ssize_t ret; ret = fuse_direct_io(&io, from, &iocb->ki_pos, FUSE_DIO_WRITE); fuse_write_update_attr(inode, iocb->ki_pos, ret); return ret; } ssize_t 
fuse_dax_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct inode *inode = file_inode(iocb->ki_filp); ssize_t ret; if (iocb->ki_flags & IOCB_NOWAIT) { if (!inode_trylock(inode)) return -EAGAIN; } else { inode_lock(inode); } ret = generic_write_checks(iocb, from); if (ret <= 0) goto out; ret = file_remove_privs(iocb->ki_filp); if (ret) goto out; /* TODO file_update_time() but we don't want metadata I/O */ /* Do not use dax for file extending writes as write and on * disk i_size increase are not atomic otherwise. */ if (file_extending_write(iocb, from)) ret = fuse_dax_direct_write(iocb, from); else ret = dax_iomap_rw(iocb, from, &fuse_iomap_ops); out: inode_unlock(inode); if (ret > 0) ret = generic_write_sync(iocb, ret); return ret; } static vm_fault_t __fuse_dax_fault(struct vm_fault *vmf, unsigned int order, bool write) { vm_fault_t ret; struct inode *inode = file_inode(vmf->vma->vm_file); struct super_block *sb = inode->i_sb; pfn_t pfn; int error = 0; struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_conn_dax *fcd = fc->dax; bool retry = false; if (write) sb_start_pagefault(sb); retry: if (retry && !(fcd->nr_free_ranges > 0)) wait_event(fcd->range_waitq, (fcd->nr_free_ranges > 0)); /* * We need to serialize against not only truncate but also against * fuse dax memory range reclaim. While a range is being reclaimed, * we do not want any read/write/mmap to make progress and try * to populate page cache or access memory we are trying to free. 
*/ filemap_invalidate_lock_shared(inode->i_mapping); ret = dax_iomap_fault(vmf, order, &pfn, &error, &fuse_iomap_ops); if ((ret & VM_FAULT_ERROR) && error == -EAGAIN) { error = 0; retry = true; filemap_invalidate_unlock_shared(inode->i_mapping); goto retry; } if (ret & VM_FAULT_NEEDDSYNC) ret = dax_finish_sync_fault(vmf, order, pfn); filemap_invalidate_unlock_shared(inode->i_mapping); if (write) sb_end_pagefault(sb); return ret; } static vm_fault_t fuse_dax_fault(struct vm_fault *vmf) { return __fuse_dax_fault(vmf, 0, vmf->flags & FAULT_FLAG_WRITE); } static vm_fault_t fuse_dax_huge_fault(struct vm_fault *vmf, unsigned int order) { return __fuse_dax_fault(vmf, order, vmf->flags & FAULT_FLAG_WRITE); } static vm_fault_t fuse_dax_page_mkwrite(struct vm_fault *vmf) { return __fuse_dax_fault(vmf, 0, true); } static vm_fault_t fuse_dax_pfn_mkwrite(struct vm_fault *vmf) { return __fuse_dax_fault(vmf, 0, true); } static const struct vm_operations_struct fuse_dax_vm_ops = { .fault = fuse_dax_fault, .huge_fault = fuse_dax_huge_fault, .page_mkwrite = fuse_dax_page_mkwrite, .pfn_mkwrite = fuse_dax_pfn_mkwrite, }; int fuse_dax_mmap(struct file *file, struct vm_area_struct *vma) { file_accessed(file); vma->vm_ops = &fuse_dax_vm_ops; vm_flags_set(vma, VM_MIXEDMAP | VM_HUGEPAGE); return 0; } static int dmap_writeback_invalidate(struct inode *inode, struct fuse_dax_mapping *dmap) { int ret; loff_t start_pos = dmap->itn.start << FUSE_DAX_SHIFT; loff_t end_pos = (start_pos + FUSE_DAX_SZ - 1); ret = filemap_fdatawrite_range(inode->i_mapping, start_pos, end_pos); if (ret) { pr_debug("fuse: filemap_fdatawrite_range() failed. 
err=%d start_pos=0x%llx, end_pos=0x%llx\n", ret, start_pos, end_pos); return ret; } ret = invalidate_inode_pages2_range(inode->i_mapping, start_pos >> PAGE_SHIFT, end_pos >> PAGE_SHIFT); if (ret) pr_debug("fuse: invalidate_inode_pages2_range() failed err=%d\n", ret); return ret; } static int reclaim_one_dmap_locked(struct inode *inode, struct fuse_dax_mapping *dmap) { int ret; struct fuse_inode *fi = get_fuse_inode(inode); /* * igrab() was done to make sure inode won't go under us, and this * further avoids the race with evict(). */ ret = dmap_writeback_invalidate(inode, dmap); if (ret) return ret; /* Remove dax mapping from inode interval tree now */ interval_tree_remove(&dmap->itn, &fi->dax->tree); fi->dax->nr--; /* It is possible that umount/shutdown has killed the fuse connection * and worker thread is trying to reclaim memory in parallel. Don't * warn in that case. */ ret = dmap_removemapping_one(inode, dmap); if (ret && ret != -ENOTCONN) { pr_warn("Failed to remove mapping. offset=0x%llx len=0x%llx ret=%d\n", dmap->window_offset, dmap->length, ret); } return 0; } /* Find first mapped dmap for an inode and return file offset. Caller needs * to hold fi->dax->sem lock either shared or exclusive. */ static struct fuse_dax_mapping *inode_lookup_first_dmap(struct inode *inode) { struct fuse_inode *fi = get_fuse_inode(inode); struct fuse_dax_mapping *dmap; struct interval_tree_node *node; for (node = interval_tree_iter_first(&fi->dax->tree, 0, -1); node; node = interval_tree_iter_next(node, 0, -1)) { dmap = node_to_dmap(node); /* still in use. */ if (refcount_read(&dmap->refcnt) > 1) continue; return dmap; } return NULL; } /* * Find first mapping in the tree and free it and return it. Do not add * it back to free pool. 
*/ static struct fuse_dax_mapping * inode_inline_reclaim_one_dmap(struct fuse_conn_dax *fcd, struct inode *inode, bool *retry) { struct fuse_inode *fi = get_fuse_inode(inode); struct fuse_dax_mapping *dmap; u64 dmap_start, dmap_end; unsigned long start_idx; int ret; struct interval_tree_node *node; filemap_invalidate_lock(inode->i_mapping); /* Lookup a dmap and corresponding file offset to reclaim. */ down_read(&fi->dax->sem); dmap = inode_lookup_first_dmap(inode); if (dmap) { start_idx = dmap->itn.start; dmap_start = start_idx << FUSE_DAX_SHIFT; dmap_end = dmap_start + FUSE_DAX_SZ - 1; } up_read(&fi->dax->sem); if (!dmap) goto out_mmap_sem; /* * Make sure there are no references to inode pages using * get_user_pages() */ ret = fuse_dax_break_layouts(inode, dmap_start, dmap_end); if (ret) { pr_debug("fuse: fuse_dax_break_layouts() failed. err=%d\n", ret); dmap = ERR_PTR(ret); goto out_mmap_sem; } down_write(&fi->dax->sem); node = interval_tree_iter_first(&fi->dax->tree, start_idx, start_idx); /* Range already got reclaimed by somebody else */ if (!node) { if (retry) *retry = true; goto out_write_dmap_sem; } dmap = node_to_dmap(node); /* still in use. */ if (refcount_read(&dmap->refcnt) > 1) { dmap = NULL; if (retry) *retry = true; goto out_write_dmap_sem; } ret = reclaim_one_dmap_locked(inode, dmap); if (ret < 0) { dmap = ERR_PTR(ret); goto out_write_dmap_sem; } /* Clean up dmap. Do not add back to free list */ dmap_remove_busy_list(fcd, dmap); dmap->inode = NULL; dmap->itn.start = dmap->itn.last = 0; pr_debug("fuse: %s: inline reclaimed memory range. 
inode=%p, window_offset=0x%llx, length=0x%llx\n", __func__, inode, dmap->window_offset, dmap->length); out_write_dmap_sem: up_write(&fi->dax->sem); out_mmap_sem: filemap_invalidate_unlock(inode->i_mapping); return dmap; } static struct fuse_dax_mapping * alloc_dax_mapping_reclaim(struct fuse_conn_dax *fcd, struct inode *inode) { struct fuse_dax_mapping *dmap; struct fuse_inode *fi = get_fuse_inode(inode); while (1) { bool retry = false; dmap = alloc_dax_mapping(fcd); if (dmap) return dmap; dmap = inode_inline_reclaim_one_dmap(fcd, inode, &retry); /* * Either we got a mapping or it is an error, return in both * the cases. */ if (dmap) return dmap; /* If we could not reclaim a mapping because it * had a reference or some other temporary failure, * Try again. We want to give up inline reclaim only * if there is no range assigned to this node. Otherwise * if a deadlock is possible if we sleep with * mapping->invalidate_lock held and worker to free memory * can't make progress due to unavailability of * mapping->invalidate_lock. So sleep only if fi->dax->nr=0 */ if (retry) continue; /* * There are no mappings which can be reclaimed. Wait for one. * We are not holding fi->dax->sem. So it is possible * that range gets added now. But as we are not holding * mapping->invalidate_lock, worker should still be able to * free up a range and wake us up. */ if (!fi->dax->nr && !(fcd->nr_free_ranges > 0)) { if (wait_event_killable_exclusive(fcd->range_waitq, (fcd->nr_free_ranges > 0))) { return ERR_PTR(-EINTR); } } } } static int lookup_and_reclaim_dmap_locked(struct fuse_conn_dax *fcd, struct inode *inode, unsigned long start_idx) { int ret; struct fuse_inode *fi = get_fuse_inode(inode); struct fuse_dax_mapping *dmap; struct interval_tree_node *node; /* Find fuse dax mapping at file offset inode. 
*/ node = interval_tree_iter_first(&fi->dax->tree, start_idx, start_idx); /* Range already got cleaned up by somebody else */ if (!node) return 0; dmap = node_to_dmap(node); /* still in use. */ if (refcount_read(&dmap->refcnt) > 1) return 0; ret = reclaim_one_dmap_locked(inode, dmap); if (ret < 0) return ret; /* Cleanup dmap entry and add back to free list */ spin_lock(&fcd->lock); dmap_reinit_add_to_free_pool(fcd, dmap); spin_unlock(&fcd->lock); return ret; } /* * Free a range of memory. * Locking: * 1. Take mapping->invalidate_lock to block dax faults. * 2. Take fi->dax->sem to protect interval tree and also to make sure * read/write can not reuse a dmap which we might be freeing. */ static int lookup_and_reclaim_dmap(struct fuse_conn_dax *fcd, struct inode *inode, unsigned long start_idx, unsigned long end_idx) { int ret; struct fuse_inode *fi = get_fuse_inode(inode); loff_t dmap_start = start_idx << FUSE_DAX_SHIFT; loff_t dmap_end = (dmap_start + FUSE_DAX_SZ) - 1; filemap_invalidate_lock(inode->i_mapping); ret = fuse_dax_break_layouts(inode, dmap_start, dmap_end); if (ret) { pr_debug("virtio_fs: fuse_dax_break_layouts() failed. err=%d\n", ret); goto out_mmap_sem; } down_write(&fi->dax->sem); ret = lookup_and_reclaim_dmap_locked(fcd, inode, start_idx); up_write(&fi->dax->sem); out_mmap_sem: filemap_invalidate_unlock(inode->i_mapping); return ret; } static int try_to_free_dmap_chunks(struct fuse_conn_dax *fcd, unsigned long nr_to_free) { struct fuse_dax_mapping *dmap, *pos, *temp; int ret, nr_freed = 0; unsigned long start_idx = 0, end_idx = 0; struct inode *inode = NULL; /* Pick first busy range and free it for now*/ while (1) { if (nr_freed >= nr_to_free) break; dmap = NULL; spin_lock(&fcd->lock); if (!fcd->nr_busy_ranges) { spin_unlock(&fcd->lock); return 0; } list_for_each_entry_safe(pos, temp, &fcd->busy_ranges, busy_list) { /* skip this range if it's in use. 
*/ if (refcount_read(&pos->refcnt) > 1) continue; inode = igrab(pos->inode); /* * This inode is going away. That will free * up all the ranges anyway, continue to * next range. */ if (!inode) continue; /* * Take this element off list and add it tail. If * this element can't be freed, it will help with * selecting new element in next iteration of loop. */ dmap = pos; list_move_tail(&dmap->busy_list, &fcd->busy_ranges); start_idx = end_idx = dmap->itn.start; break; } spin_unlock(&fcd->lock); if (!dmap) return 0; ret = lookup_and_reclaim_dmap(fcd, inode, start_idx, end_idx); iput(inode); if (ret) return ret; nr_freed++; } return 0; } static void fuse_dax_free_mem_worker(struct work_struct *work) { int ret; struct fuse_conn_dax *fcd = container_of(work, struct fuse_conn_dax, free_work.work); ret = try_to_free_dmap_chunks(fcd, FUSE_DAX_RECLAIM_CHUNK); if (ret) { pr_debug("fuse: try_to_free_dmap_chunks() failed with err=%d\n", ret); } /* If number of free ranges are still below threshold, requeue */ kick_dmap_free_worker(fcd, 1); } static void fuse_free_dax_mem_ranges(struct list_head *mem_list) { struct fuse_dax_mapping *range, *temp; /* Free All allocated elements */ list_for_each_entry_safe(range, temp, mem_list, list) { list_del(&range->list); if (!list_empty(&range->busy_list)) list_del(&range->busy_list); kfree(range); } } void fuse_dax_conn_free(struct fuse_conn *fc) { if (fc->dax) { fuse_free_dax_mem_ranges(&fc->dax->free_ranges); kfree(fc->dax); fc->dax = NULL; } } static int fuse_dax_mem_range_init(struct fuse_conn_dax *fcd) { long nr_pages, nr_ranges; struct fuse_dax_mapping *range; int ret, id; size_t dax_size = -1; unsigned long i; init_waitqueue_head(&fcd->range_waitq); INIT_LIST_HEAD(&fcd->free_ranges); INIT_LIST_HEAD(&fcd->busy_ranges); INIT_DELAYED_WORK(&fcd->free_work, fuse_dax_free_mem_worker); id = dax_read_lock(); nr_pages = dax_direct_access(fcd->dev, 0, PHYS_PFN(dax_size), DAX_ACCESS, NULL, NULL); dax_read_unlock(id); if (nr_pages < 0) { 
pr_debug("dax_direct_access() returned %ld\n", nr_pages); return nr_pages; } nr_ranges = nr_pages/FUSE_DAX_PAGES; pr_debug("%s: dax mapped %ld pages. nr_ranges=%ld\n", __func__, nr_pages, nr_ranges); for (i = 0; i < nr_ranges; i++) { range = kzalloc(sizeof(struct fuse_dax_mapping), GFP_KERNEL); ret = -ENOMEM; if (!range) goto out_err; /* TODO: This offset only works if virtio-fs driver is not * having some memory hidden at the beginning. This needs * better handling */ range->window_offset = i * FUSE_DAX_SZ; range->length = FUSE_DAX_SZ; INIT_LIST_HEAD(&range->busy_list); refcount_set(&range->refcnt, 1); list_add_tail(&range->list, &fcd->free_ranges); } fcd->nr_free_ranges = nr_ranges; fcd->nr_ranges = nr_ranges; return 0; out_err: /* Free All allocated elements */ fuse_free_dax_mem_ranges(&fcd->free_ranges); return ret; } int fuse_dax_conn_alloc(struct fuse_conn *fc, enum fuse_dax_mode dax_mode, struct dax_device *dax_dev) { struct fuse_conn_dax *fcd; int err; fc->dax_mode = dax_mode; if (!dax_dev) return 0; fcd = kzalloc(sizeof(*fcd), GFP_KERNEL); if (!fcd) return -ENOMEM; spin_lock_init(&fcd->lock); fcd->dev = dax_dev; err = fuse_dax_mem_range_init(fcd); if (err) { kfree(fcd); return err; } fc->dax = fcd; return 0; } bool fuse_dax_inode_alloc(struct super_block *sb, struct fuse_inode *fi) { struct fuse_conn *fc = get_fuse_conn_super(sb); fi->dax = NULL; if (fc->dax) { fi->dax = kzalloc(sizeof(*fi->dax), GFP_KERNEL_ACCOUNT); if (!fi->dax) return false; init_rwsem(&fi->dax->sem); fi->dax->tree = RB_ROOT_CACHED; } return true; } static const struct address_space_operations fuse_dax_file_aops = { .direct_IO = noop_direct_IO, .dirty_folio = noop_dirty_folio, }; static bool fuse_should_enable_dax(struct inode *inode, unsigned int flags) { struct fuse_conn *fc = get_fuse_conn(inode); enum fuse_dax_mode dax_mode = fc->dax_mode; if (dax_mode == FUSE_DAX_NEVER) return false; /* * fc->dax may be NULL in 'inode' mode when filesystem device doesn't * support DAX, in which 
case it will silently fallback to 'never' mode. */ if (!fc->dax) return false; if (dax_mode == FUSE_DAX_ALWAYS) return true; /* dax_mode is FUSE_DAX_INODE* */ return fc->inode_dax && (flags & FUSE_ATTR_DAX); } void fuse_dax_inode_init(struct inode *inode, unsigned int flags) { if (!fuse_should_enable_dax(inode, flags)) return; inode->i_flags |= S_DAX; inode->i_data.a_ops = &fuse_dax_file_aops; } void fuse_dax_dontcache(struct inode *inode, unsigned int flags) { struct fuse_conn *fc = get_fuse_conn(inode); if (fuse_is_inode_dax_mode(fc->dax_mode) && ((bool) IS_DAX(inode) != (bool) (flags & FUSE_ATTR_DAX))) d_mark_dontcache(inode); } bool fuse_dax_check_alignment(struct fuse_conn *fc, unsigned int map_alignment) { if (fc->dax && (map_alignment > FUSE_DAX_SHIFT)) { pr_warn("FUSE: map_alignment %u incompatible with dax mem range size %u\n", map_alignment, FUSE_DAX_SZ); return false; } return true; } void fuse_dax_cancel_work(struct fuse_conn *fc) { struct fuse_conn_dax *fcd = fc->dax; if (fcd) cancel_delayed_work_sync(&fcd->free_work); } EXPORT_SYMBOL_GPL(fuse_dax_cancel_work);
1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 // SPDX-License-Identifier: GPL-2.0 /* * Media device request objects * * Copyright 2018 Cisco 
Systems, Inc. and/or its affiliates. All rights reserved. * Copyright (C) 2018 Intel Corporation * Copyright (C) 2018 Google, Inc. * * Author: Hans Verkuil <hansverk@cisco.com> * Author: Sakari Ailus <sakari.ailus@linux.intel.com> */ #include <linux/anon_inodes.h> #include <linux/file.h> #include <linux/refcount.h> #include <media/media-device.h> #include <media/media-request.h> static const char * const request_state[] = { [MEDIA_REQUEST_STATE_IDLE] = "idle", [MEDIA_REQUEST_STATE_VALIDATING] = "validating", [MEDIA_REQUEST_STATE_QUEUED] = "queued", [MEDIA_REQUEST_STATE_COMPLETE] = "complete", [MEDIA_REQUEST_STATE_CLEANING] = "cleaning", [MEDIA_REQUEST_STATE_UPDATING] = "updating", }; static const char * media_request_state_str(enum media_request_state state) { BUILD_BUG_ON(ARRAY_SIZE(request_state) != NR_OF_MEDIA_REQUEST_STATE); if (WARN_ON(state >= ARRAY_SIZE(request_state))) return "invalid"; return request_state[state]; } static void media_request_clean(struct media_request *req) { struct media_request_object *obj, *obj_safe; /* Just a sanity check. No other code path is allowed to change this. 
*/ WARN_ON(req->state != MEDIA_REQUEST_STATE_CLEANING); WARN_ON(req->updating_count); WARN_ON(req->access_count); list_for_each_entry_safe(obj, obj_safe, &req->objects, list) { media_request_object_unbind(obj); media_request_object_put(obj); } req->updating_count = 0; req->access_count = 0; WARN_ON(req->num_incomplete_objects); req->num_incomplete_objects = 0; wake_up_interruptible_all(&req->poll_wait); } static void media_request_release(struct kref *kref) { struct media_request *req = container_of(kref, struct media_request, kref); struct media_device *mdev = req->mdev; dev_dbg(mdev->dev, "request: release %s\n", req->debug_str); /* No other users, no need for a spinlock */ req->state = MEDIA_REQUEST_STATE_CLEANING; media_request_clean(req); if (mdev->ops->req_free) mdev->ops->req_free(req); else kfree(req); } void media_request_put(struct media_request *req) { kref_put(&req->kref, media_request_release); } EXPORT_SYMBOL_GPL(media_request_put); static int media_request_close(struct inode *inode, struct file *filp) { struct media_request *req = filp->private_data; media_request_put(req); return 0; } static __poll_t media_request_poll(struct file *filp, struct poll_table_struct *wait) { struct media_request *req = filp->private_data; unsigned long flags; __poll_t ret = 0; if (!(poll_requested_events(wait) & EPOLLPRI)) return 0; poll_wait(filp, &req->poll_wait, wait); spin_lock_irqsave(&req->lock, flags); if (req->state == MEDIA_REQUEST_STATE_COMPLETE) { ret = EPOLLPRI; goto unlock; } if (req->state != MEDIA_REQUEST_STATE_QUEUED) { ret = EPOLLERR; goto unlock; } unlock: spin_unlock_irqrestore(&req->lock, flags); return ret; } static long media_request_ioctl_queue(struct media_request *req) { struct media_device *mdev = req->mdev; enum media_request_state state; unsigned long flags; int ret; dev_dbg(mdev->dev, "request: queue %s\n", req->debug_str); /* * Ensure the request that is validated will be the one that gets queued * next by serialising the queueing process. 
This mutex is also used * to serialize with canceling a vb2 queue and with setting values such * as controls in a request. */ mutex_lock(&mdev->req_queue_mutex); media_request_get(req); spin_lock_irqsave(&req->lock, flags); if (req->state == MEDIA_REQUEST_STATE_IDLE) req->state = MEDIA_REQUEST_STATE_VALIDATING; state = req->state; spin_unlock_irqrestore(&req->lock, flags); if (state != MEDIA_REQUEST_STATE_VALIDATING) { dev_dbg(mdev->dev, "request: unable to queue %s, request in state %s\n", req->debug_str, media_request_state_str(state)); media_request_put(req); mutex_unlock(&mdev->req_queue_mutex); return -EBUSY; } ret = mdev->ops->req_validate(req); /* * If the req_validate was successful, then we mark the state as QUEUED * and call req_queue. The reason we set the state first is that this * allows req_queue to unbind or complete the queued objects in case * they are immediately 'consumed'. State changes from QUEUED to another * state can only happen if either the driver changes the state or if * the user cancels the vb2 queue. The driver can only change the state * after each object is queued through the req_queue op (and note that * that op cannot fail), so setting the state to QUEUED up front is * safe. * * The other reason for changing the state is if the vb2 queue is * canceled, and that uses the req_queue_mutex which is still locked * while req_queue is called, so that's safe as well. */ spin_lock_irqsave(&req->lock, flags); req->state = ret ? 
MEDIA_REQUEST_STATE_IDLE : MEDIA_REQUEST_STATE_QUEUED;
	spin_unlock_irqrestore(&req->lock, flags);

	if (!ret)
		mdev->ops->req_queue(req);

	mutex_unlock(&mdev->req_queue_mutex);

	if (ret) {
		dev_dbg(mdev->dev, "request: can't queue %s (%d)\n",
			req->debug_str, ret);
		media_request_put(req);
	}

	return ret;
}

/*
 * media_request_ioctl_reinit - handler for MEDIA_REQUEST_IOC_REINIT
 * @req: the request to re-initialise
 *
 * Only a request that is in the IDLE or COMPLETE state and whose
 * access_count is zero may be re-initialised. The request is parked in the
 * CLEANING state while media_request_clean() runs with req->lock dropped,
 * and is moved back to the IDLE state afterwards.
 *
 * Return: 0 on success, -EBUSY if the request is in any other state or is
 * currently being accessed.
 */
static long media_request_ioctl_reinit(struct media_request *req)
{
	struct media_device *mdev = req->mdev;
	unsigned long flags;

	spin_lock_irqsave(&req->lock, flags);
	if (req->state != MEDIA_REQUEST_STATE_IDLE &&
	    req->state != MEDIA_REQUEST_STATE_COMPLETE) {
		dev_dbg(mdev->dev,
			"request: %s not in idle or complete state, cannot reinit\n",
			req->debug_str);
		spin_unlock_irqrestore(&req->lock, flags);
		return -EBUSY;
	}
	if (req->access_count) {
		dev_dbg(mdev->dev,
			"request: %s is being accessed, cannot reinit\n",
			req->debug_str);
		spin_unlock_irqrestore(&req->lock, flags);
		return -EBUSY;
	}
	/* Publish the intermediate state before dropping the lock. */
	req->state = MEDIA_REQUEST_STATE_CLEANING;
	spin_unlock_irqrestore(&req->lock, flags);

	media_request_clean(req);

	spin_lock_irqsave(&req->lock, flags);
	req->state = MEDIA_REQUEST_STATE_IDLE;
	spin_unlock_irqrestore(&req->lock, flags);

	return 0;
}

/*
 * media_request_ioctl - ioctl entry point of a request file
 * @filp: request file; its private_data holds the struct media_request
 * @cmd: ioctl command
 * @arg: ioctl argument (not used by either supported command)
 *
 * Return: the result of the QUEUE or REINIT handler, or -ENOIOCTLCMD for
 * any other command.
 */
static long media_request_ioctl(struct file *filp, unsigned int cmd,
				unsigned long arg)
{
	struct media_request *req = filp->private_data;

	switch (cmd) {
	case MEDIA_REQUEST_IOC_QUEUE:
		return media_request_ioctl_queue(req);
	case MEDIA_REQUEST_IOC_REINIT:
		return media_request_ioctl_reinit(req);
	default:
		return -ENOIOCTLCMD;
	}
}

/*
 * File operations of a request file. media_request_get_by_fd() compares a
 * file's f_op against this table to recognise request files. The same ioctl
 * handler serves native and compat callers.
 */
static const struct file_operations request_fops = {
	.owner = THIS_MODULE,
	.poll = media_request_poll,
	.unlocked_ioctl = media_request_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl = media_request_ioctl,
#endif /* CONFIG_COMPAT */
	.release = media_request_close,
};

/*
 * media_request_get_by_fd - look up a request by file descriptor
 * @mdev: media device the request must belong to
 * @request_fd: file descriptor supplied by the caller
 *
 * On success a reference to the request is taken with media_request_get();
 * the caller owns that reference.
 *
 * Return: the request on success; ERR_PTR(-EBADR) if @mdev is NULL or does
 * not provide both the req_validate and req_queue ops; ERR_PTR(-EINVAL) if
 * @request_fd is not an open file, is not a request file, or refers to a
 * request of a different media device.
 */
struct media_request *
media_request_get_by_fd(struct media_device *mdev, int request_fd)
{
	struct media_request *req;

	if (!mdev || !mdev->ops ||
	    !mdev->ops->req_validate || !mdev->ops->req_queue)
		return ERR_PTR(-EBADR);

	CLASS(fd, f)(request_fd);
	if (fd_empty(f))
		goto err;

	/* Only files created with request_fops carry a media_request. */
	if (fd_file(f)->f_op != &request_fops)
		goto err;
	req = fd_file(f)->private_data;
	if (req->mdev != mdev)
		goto err;

	/*
	 * Note: as long as someone has an open filehandle of the request,
	 * the request can never be released. The file reference held through
	 * CLASS(fd, f) above ensures that even if userspace closes the
	 * request filehandle, the release() fop won't be called, so the
	 * media_request_get() always succeeds and there is no race condition
	 * where the request was released before media_request_get() is
	 * called.
	 */
	media_request_get(req);
	return req;

err:
	dev_dbg(mdev->dev, "cannot find request_fd %d\n", request_fd);
	return ERR_PTR(-EINVAL);
}
EXPORT_SYMBOL_GPL(media_request_get_by_fd);

/*
 * media_request_alloc - allocate a request and a file descriptor for it
 * @mdev: media device the request belongs to
 * @alloc_fd: receives the new file descriptor on success
 *
 * The request is obtained from the driver's req_alloc op when one is
 * provided, otherwise from kzalloc(). It starts in the IDLE state with no
 * objects. The descriptor is created with O_CLOEXEC and is only installed
 * once the request is fully initialised.
 *
 * Return: 0 on success; -ENOMEM if the req_alloc/req_free ops are not
 * provided as a pair or the allocation fails; otherwise the error returned
 * by get_unused_fd_flags() or anon_inode_getfile().
 */
int media_request_alloc(struct media_device *mdev, int *alloc_fd)
{
	struct media_request *req;
	struct file *filp;
	int fd;
	int ret;

	/* Either both are NULL or both are non-NULL */
	if (WARN_ON(!mdev->ops->req_alloc ^ !mdev->ops->req_free))
		return -ENOMEM;

	if (mdev->ops->req_alloc)
		req = mdev->ops->req_alloc(mdev);
	else
		req = kzalloc(sizeof(*req), GFP_KERNEL);
	if (!req)
		return -ENOMEM;

	fd = get_unused_fd_flags(O_CLOEXEC);
	if (fd < 0) {
		ret = fd;
		goto err_free_req;
	}

	filp = anon_inode_getfile("request", &request_fops, NULL, O_CLOEXEC);
	if (IS_ERR(filp)) {
		ret = PTR_ERR(filp);
		goto err_put_fd;
	}

	filp->private_data = req;
	req->mdev = mdev;
	req->state = MEDIA_REQUEST_STATE_IDLE;
	req->num_incomplete_objects = 0;
	kref_init(&req->kref);
	INIT_LIST_HEAD(&req->objects);
	spin_lock_init(&req->lock);
	init_waitqueue_head(&req->poll_wait);
	req->updating_count = 0;
	req->access_count = 0;

	*alloc_fd = fd;

	/* debug_str is "<per-device request counter>:<fd>". */
	snprintf(req->debug_str, sizeof(req->debug_str), "%u:%d",
		 atomic_inc_return(&mdev->request_id), fd);
	dev_dbg(mdev->dev, "request: allocated %s\n", req->debug_str);

	/* Last step: from here on the descriptor is visible in the fd table. */
	fd_install(fd, filp);

	return 0;

err_put_fd:
	put_unused_fd(fd);

err_free_req:
	/* Free with the counterpart of whichever allocator was used. */
	if (mdev->ops->req_free)
		mdev->ops->req_free(req);
	else
		kfree(req);

	return ret;
}

/*
 * kref release callback of a request object. An object that is still bound
 * to a request at this point triggers a warning and is unbound first; the
 * object's own release op is then called.
 */
static void media_request_object_release(struct kref *kref)
{
	struct media_request_object *obj =
		container_of(kref, struct media_request_object, kref);
	struct media_request *req = obj->req;

	if (WARN_ON(req))
		media_request_object_unbind(obj);
	obj->ops->release(obj);
}

/*
 * media_request_object_find - find an object in a request
 * @req: request to search
 * @ops: object ops to match
 * @priv: private pointer to match
 *
 * Walks @req->objects under @req->lock and stops at the first object whose
 * ops and priv both match. A reference is taken on the object that is
 * returned.
 *
 * Return: the matching object, or NULL if there is none or if @ops or @priv
 * is NULL (which also triggers a warning).
 */
struct media_request_object *
media_request_object_find(struct media_request *req,
			  const struct media_request_object_ops *ops,
			  void *priv)
{
	struct media_request_object *obj;
	struct media_request_object *found = NULL;
	unsigned long flags;

	if (WARN_ON(!ops || !priv))
		return NULL;

	spin_lock_irqsave(&req->lock, flags);
	list_for_each_entry(obj, &req->objects, list) {
		if (obj->ops == ops && obj->priv == priv) {
			media_request_object_get(obj);
			found = obj;
			break;
		}
	}
	spin_unlock_irqrestore(&req->lock, flags);
	return found;
}
EXPORT_SYMBOL_GPL(media_request_object_find);

/*
 * media_request_object_put - drop a reference to a request object
 * @obj: the object
 *
 * media_request_object_release() runs when the last reference is dropped.
 */
void media_request_object_put(struct media_request_object *obj)
{
	kref_put(&obj->kref, media_request_object_release);
}
EXPORT_SYMBOL_GPL(media_request_object_put);

/*
 * media_request_object_init - initialise a request object
 * @obj: the object
 *
 * Leaves the object unbound (no ops, request or priv), not completed, with
 * an empty list node and a freshly initialised reference count.
 */
void media_request_object_init(struct media_request_object *obj)
{
	obj->ops = NULL;
	obj->req = NULL;
	obj->priv = NULL;
	obj->completed = false;
	INIT_LIST_HEAD(&obj->list);
	kref_init(&obj->kref);
}
EXPORT_SYMBOL_GPL(media_request_object_init);

/*
 * media_request_object_bind - bind an object to a request
 * @req: request to bind to
 * @ops: object ops; the release op is mandatory
 * @priv: private pointer stored in the object
 * @is_buffer: true to add the object at the tail of the request's object
 *	list, false to add it at the head
 * @obj: the object to bind
 *
 * Binding is only accepted while the request is in the UPDATING or QUEUED
 * state. Each bound object counts as one incomplete object of the request.
 *
 * Return: 0 on success, -EBADR if @ops has no release op, -EBUSY if the
 * request is in any other state. Both failures also trigger a warning.
 */
int media_request_object_bind(struct media_request *req,
			      const struct media_request_object_ops *ops,
			      void *priv, bool is_buffer,
			      struct media_request_object *obj)
{
	unsigned long flags;
	int ret = -EBUSY;

	if (WARN_ON(!ops->release))
		return -EBADR;

	spin_lock_irqsave(&req->lock, flags);

	if (WARN_ON(req->state != MEDIA_REQUEST_STATE_UPDATING &&
		    req->state != MEDIA_REQUEST_STATE_QUEUED))
		goto unlock;

	obj->req = req;
	obj->ops = ops;
	obj->priv = priv;

	/* Buffer objects go to the end of the list, all others to the front. */
	if (is_buffer)
		list_add_tail(&obj->list, &req->objects);
	else
		list_add(&obj->list, &req->objects);
	req->num_incomplete_objects++;
	ret = 0;

unlock:
	spin_unlock_irqrestore(&req->lock, flags);
	return ret;
}
EXPORT_SYMBOL_GPL(media_request_object_bind);

/*
 * media_request_object_unbind - unbind an object from its request
 * @obj: the object
 *
 * Removes @obj from the request's object list and clears obj->req. The
 * incomplete-object accounting depends on the request state:
 *
 * - COMPLETE: nothing is adjusted;
 * - VALIDATING: warns and adjusts nothing;
 * - CLEANING: the count is decremented only if the object had not been
 *   completed;
 * - otherwise the count is decremented (warning if it is already zero), and
 *   a QUEUED request whose count reaches zero becomes COMPLETE and its poll
 *   waiters are woken.
 *
 * The object's optional unbind op is called after req->lock is released.
 * If this call completed the request, a request reference is dropped as
 * well. Calling this on an unbound object only triggers a warning.
 */
void media_request_object_unbind(struct media_request_object *obj)
{
	struct media_request *req = obj->req;
	unsigned long flags;
	bool completed = false;

	if (WARN_ON(!req))
		return;

	spin_lock_irqsave(&req->lock, flags);
	list_del(&obj->list);
	obj->req = NULL;

	if (req->state == MEDIA_REQUEST_STATE_COMPLETE)
		goto unlock;

	if (WARN_ON(req->state == MEDIA_REQUEST_STATE_VALIDATING))
		goto unlock;

	if (req->state == MEDIA_REQUEST_STATE_CLEANING) {
		if (!obj->completed)
			req->num_incomplete_objects--;
		goto unlock;
	}

	if (WARN_ON(!req->num_incomplete_objects))
		goto unlock;

	req->num_incomplete_objects--;
	if (req->state == MEDIA_REQUEST_STATE_QUEUED &&
	    !req->num_incomplete_objects) {
		req->state = MEDIA_REQUEST_STATE_COMPLETE;
		completed = true;
		wake_up_interruptible_all(&req->poll_wait);
	}

unlock:
	spin_unlock_irqrestore(&req->lock, flags);
	if (obj->ops->unbind)
		obj->ops->unbind(obj);
	/* Dropped only after the lock is released. */
	if (completed)
		media_request_put(req);
}
EXPORT_SYMBOL_GPL(media_request_object_unbind);

/*
 * media_request_object_complete - mark an object as completed
 * @obj: the object
 *
 * Does nothing if the object is already marked completed. Otherwise the
 * object is marked and, provided the request is QUEUED and still has
 * incomplete objects (anything else triggers a warning), the request's
 * incomplete-object count is decremented. When the count reaches zero the
 * request becomes COMPLETE, its poll waiters are woken and, after req->lock
 * is released, a request reference is dropped.
 */
void media_request_object_complete(struct media_request_object *obj)
{
	struct media_request *req = obj->req;
	unsigned long flags;
	bool completed = false;

	spin_lock_irqsave(&req->lock, flags);
	if (obj->completed)
		goto unlock;
	obj->completed = true;
	if (WARN_ON(!req->num_incomplete_objects) ||
	    WARN_ON(req->state != MEDIA_REQUEST_STATE_QUEUED))
		goto unlock;

	if (!--req->num_incomplete_objects) {
		req->state = MEDIA_REQUEST_STATE_COMPLETE;
		wake_up_interruptible_all(&req->poll_wait);
		completed = true;
	}
unlock:
	spin_unlock_irqrestore(&req->lock, flags);
	if (completed)
		media_request_put(req);
}
EXPORT_SYMBOL_GPL(media_request_object_complete);
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 /* SPDX-License-Identifier: GPL-2.0 * * Network memory * * Author: Mina Almasry <almasrymina@google.com> */ #ifndef _NET_NETMEM_H #define _NET_NETMEM_H #include <linux/mm.h> #include <net/net_debug.h> /* net_iov */ DECLARE_STATIC_KEY_FALSE(page_pool_mem_providers); /* We overload the LSB of the struct page pointer to indicate whether it's * a page or net_iov. */ #define NET_IOV 0x01UL struct net_iov { unsigned long __unused_padding; unsigned long pp_magic; struct page_pool *pp; struct dmabuf_genpool_chunk_owner *owner; unsigned long dma_addr; atomic_long_t pp_ref_count; }; /* These fields in struct page are used by the page_pool and net stack: * * struct { * unsigned long pp_magic; * struct page_pool *pp; * unsigned long _pp_mapping_pad; * unsigned long dma_addr; * atomic_long_t pp_ref_count; * }; * * We mirror the page_pool fields here so the page_pool can access these fields * without worrying whether the underlying fields belong to a page or net_iov. * * The non-net stack fields of struct page are private to the mm stack and must * never be mirrored to net_iov. 
*/ #define NET_IOV_ASSERT_OFFSET(pg, iov) \ static_assert(offsetof(struct page, pg) == \ offsetof(struct net_iov, iov)) NET_IOV_ASSERT_OFFSET(pp_magic, pp_magic); NET_IOV_ASSERT_OFFSET(pp, pp); NET_IOV_ASSERT_OFFSET(dma_addr, dma_addr); NET_IOV_ASSERT_OFFSET(pp_ref_count, pp_ref_count); #undef NET_IOV_ASSERT_OFFSET /* netmem */ /** * typedef netmem_ref - a nonexistent type marking a reference to generic * network memory. * * A netmem_ref currently is always a reference to a struct page. This * abstraction is introduced so support for new memory types can be added. * * Use the supplied helpers to obtain the underlying memory pointer and fields. */ typedef unsigned long __bitwise netmem_ref; static inline bool netmem_is_net_iov(const netmem_ref netmem) { return (__force unsigned long)netmem & NET_IOV; } /** * __netmem_to_page - unsafely get pointer to the &page backing @netmem * @netmem: netmem reference to convert * * Unsafe version of netmem_to_page(). When @netmem is always page-backed, * e.g. when it's a header buffer, performs faster and generates smaller * object code (no check for the LSB, no WARN). When @netmem points to IOV, * provokes undefined behaviour. * * Return: pointer to the &page (garbage if @netmem is not page-backed). */ static inline struct page *__netmem_to_page(netmem_ref netmem) { return (__force struct page *)netmem; } /* This conversion fails (returns NULL) if the netmem_ref is not struct page * backed. 
*/ static inline struct page *netmem_to_page(netmem_ref netmem) { if (WARN_ON_ONCE(netmem_is_net_iov(netmem))) return NULL; return __netmem_to_page(netmem); } static inline struct net_iov *netmem_to_net_iov(netmem_ref netmem) { if (netmem_is_net_iov(netmem)) return (struct net_iov *)((__force unsigned long)netmem & ~NET_IOV); DEBUG_NET_WARN_ON_ONCE(true); return NULL; } static inline netmem_ref net_iov_to_netmem(struct net_iov *niov) { return (__force netmem_ref)((unsigned long)niov | NET_IOV); } static inline netmem_ref page_to_netmem(struct page *page) { return (__force netmem_ref)page; } /** * virt_to_netmem - convert virtual memory pointer to a netmem reference * @data: host memory pointer to convert * * Return: netmem reference to the &page backing this virtual address. */ static inline netmem_ref virt_to_netmem(const void *data) { return page_to_netmem(virt_to_page(data)); } static inline int netmem_ref_count(netmem_ref netmem) { /* The non-pp refcount of net_iov is always 1. On net_iov, we only * support pp refcounting which uses the pp_ref_count field. */ if (netmem_is_net_iov(netmem)) return 1; return page_ref_count(netmem_to_page(netmem)); } static inline unsigned long netmem_pfn_trace(netmem_ref netmem) { if (netmem_is_net_iov(netmem)) return 0; return page_to_pfn(netmem_to_page(netmem)); } static inline struct net_iov *__netmem_clear_lsb(netmem_ref netmem) { return (struct net_iov *)((__force unsigned long)netmem & ~NET_IOV); } /** * __netmem_get_pp - unsafely get pointer to the &page_pool backing @netmem * @netmem: netmem reference to get the pointer from * * Unsafe version of netmem_get_pp(). When @netmem is always page-backed, * e.g. when it's a header buffer, performs faster and generates smaller * object code (avoids clearing the LSB). When @netmem points to IOV, * provokes invalid memory access. * * Return: pointer to the &page_pool (garbage if @netmem is not page-backed). 
*/ static inline struct page_pool *__netmem_get_pp(netmem_ref netmem) { return __netmem_to_page(netmem)->pp; } static inline struct page_pool *netmem_get_pp(netmem_ref netmem) { return __netmem_clear_lsb(netmem)->pp; } static inline atomic_long_t *netmem_get_pp_ref_count_ref(netmem_ref netmem) { return &__netmem_clear_lsb(netmem)->pp_ref_count; } static inline bool netmem_is_pref_nid(netmem_ref netmem, int pref_nid) { /* NUMA node preference only makes sense if we're allocating * system memory. Memory providers (which give us net_iovs) * choose for us. */ if (netmem_is_net_iov(netmem)) return true; return page_to_nid(netmem_to_page(netmem)) == pref_nid; } static inline netmem_ref netmem_compound_head(netmem_ref netmem) { /* niov are never compounded */ if (netmem_is_net_iov(netmem)) return netmem; return page_to_netmem(compound_head(netmem_to_page(netmem))); } /** * __netmem_address - unsafely get pointer to the memory backing @netmem * @netmem: netmem reference to get the pointer for * * Unsafe version of netmem_address(). When @netmem is always page-backed, * e.g. when it's a header buffer, performs faster and generates smaller * object code (no check for the LSB). When @netmem points to IOV, provokes * undefined behaviour. * * Return: pointer to the memory (garbage if @netmem is not page-backed). */ static inline void *__netmem_address(netmem_ref netmem) { return page_address(__netmem_to_page(netmem)); } static inline void *netmem_address(netmem_ref netmem) { if (netmem_is_net_iov(netmem)) return NULL; return __netmem_address(netmem); } /** * netmem_is_pfmemalloc - check if @netmem was allocated under memory pressure * @netmem: netmem reference to check * * Return: true if @netmem is page-backed and the page was allocated under * memory pressure, false otherwise. 
*/ static inline bool netmem_is_pfmemalloc(netmem_ref netmem) { if (netmem_is_net_iov(netmem)) return false; return page_is_pfmemalloc(netmem_to_page(netmem)); } static inline unsigned long netmem_get_dma_addr(netmem_ref netmem) { return __netmem_clear_lsb(netmem)->dma_addr; } #endif /* _NET_NETMEM_H */
2029 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM sock

#if !defined(_TRACE_SOCK_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_SOCK_H

#include <net/sock.h>
#include <net/ipv6.h>
#include <linux/tracepoint.h>
#include <linux/ipv6.h>
#include <linux/tcp.h>
#include <trace/events/net_probe_common.h>

/*
 * Name lists. Every entry is wrapped in EM() except the last one, which is
 * wrapped in EMe(), so that each list can be expanded in two different ways
 * below by redefining the two macros.
 */
#define family_names			\
		EM(AF_INET)				\
		EMe(AF_INET6)

/* The protocol traced by inet_sock_set_state */
#define inet_protocol_names		\
		EM(IPPROTO_TCP)			\
		EM(IPPROTO_DCCP)		\
		EM(IPPROTO_SCTP)		\
		EMe(IPPROTO_MPTCP)

#define tcp_state_names			\
		EM(TCP_ESTABLISHED)		\
		EM(TCP_SYN_SENT)		\
		EM(TCP_SYN_RECV)		\
		EM(TCP_FIN_WAIT1)		\
		EM(TCP_FIN_WAIT2)		\
		EM(TCP_TIME_WAIT)		\
		EM(TCP_CLOSE)			\
		EM(TCP_CLOSE_WAIT)		\
		EM(TCP_LAST_ACK)		\
		EM(TCP_LISTEN)			\
		EM(TCP_CLOSING)			\
		EMe(TCP_NEW_SYN_RECV)

#define skmem_kind_names		\
		EM(SK_MEM_SEND)			\
		EMe(SK_MEM_RECV)

/* enums need to be exported to user space */
/* First expansion: every list entry becomes a TRACE_DEFINE_ENUM() statement. */
#undef EM
#undef EMe
#define EM(a)       TRACE_DEFINE_ENUM(a);
#define EMe(a)      TRACE_DEFINE_ENUM(a);

family_names
inet_protocol_names
tcp_state_names
skmem_kind_names

/*
 * Second expansion: every list entry becomes a { value, "NAME" } pair, the
 * form the show_*() helpers below hand to __print_symbolic(). EMe() omits
 * the trailing comma after the last pair.
 */
#undef EM
#undef EMe
#define EM(a)       { a, #a },
#define EMe(a)      { a, #a }

/* Helpers used in TP_printk() to print a recorded value by its name. */
#define show_family_name(val)			\
	__print_symbolic(val, family_names)

#define show_inet_protocol_name(val)    \
	__print_symbolic(val, inet_protocol_names)

#define show_tcp_state_name(val)        \
	__print_symbolic(val, tcp_state_names)

#define show_skmem_kind_names(val)	\
	__print_symbolic(val, skmem_kind_names)

/*
 * sock_rcvqueue_full: records the socket's sk_rmem_alloc and sk_rcvbuf
 * together with the truesize of the skb passed in.
 * NOTE(review): the name suggests it fires when an skb does not fit in the
 * receive buffer; the call sites are not part of this header -- confirm.
 */
TRACE_EVENT(sock_rcvqueue_full,

	TP_PROTO(struct sock *sk, struct sk_buff *skb),

	TP_ARGS(sk, skb),

	TP_STRUCT__entry(
		__field(int, rmem_alloc)
		__field(unsigned int, truesize)
		__field(int, sk_rcvbuf)
	),

	TP_fast_assign(
		__entry->rmem_alloc = atomic_read(&sk->sk_rmem_alloc);
		__entry->truesize   = skb->truesize;
		__entry->sk_rcvbuf  = READ_ONCE(sk->sk_rcvbuf);
	),

	TP_printk("rmem_alloc=%d truesize=%u sk_rcvbuf=%d",
		__entry->rmem_alloc, __entry->truesize, __entry->sk_rcvbuf)
);

/*
 * sock_exceed_buf_limit: records the protocol name (copied into a 32-byte
 * array), the three sysctl_mem values of the protocol, the caller-supplied
 * @allocated amount, the values returned by sk_get_rmem0()/sk_get_wmem0(),
 * the socket's current rmem/wmem allocation and queued write memory, and
 * @kind, which is printed by name from skmem_kind_names.
 */
TRACE_EVENT(sock_exceed_buf_limit,

	TP_PROTO(struct sock *sk, struct proto *prot, long allocated, int kind),

	TP_ARGS(sk, prot, allocated, kind),

	TP_STRUCT__entry(
		__array(char, name, 32)
		__array(long, sysctl_mem, 3)
		__field(long, allocated)
		__field(int, sysctl_rmem)
		__field(int, rmem_alloc)
		__field(int, sysctl_wmem)
		__field(int, wmem_alloc)
		__field(int, wmem_queued)
		__field(int, kind)
	),

	TP_fast_assign(
		strscpy(__entry->name, prot->name, 32);
		__entry->sysctl_mem[0] = READ_ONCE(prot->sysctl_mem[0]);
		__entry->sysctl_mem[1] = READ_ONCE(prot->sysctl_mem[1]);
		__entry->sysctl_mem[2] = READ_ONCE(prot->sysctl_mem[2]);
		__entry->allocated = allocated;
		__entry->sysctl_rmem = sk_get_rmem0(sk, prot);
		__entry->rmem_alloc = atomic_read(&sk->sk_rmem_alloc);
		__entry->sysctl_wmem = sk_get_wmem0(sk, prot);
		__entry->wmem_alloc = refcount_read(&sk->sk_wmem_alloc);
		__entry->wmem_queued = READ_ONCE(sk->sk_wmem_queued);
		__entry->kind = kind;
	),

	TP_printk("proto:%s sysctl_mem=%ld,%ld,%ld allocated=%ld sysctl_rmem=%d rmem_alloc=%d sysctl_wmem=%d wmem_alloc=%d wmem_queued=%d kind=%s",
		__entry->name,
		__entry->sysctl_mem[0],
		__entry->sysctl_mem[1],
		__entry->sysctl_mem[2],
		__entry->allocated,
		__entry->sysctl_rmem,
		__entry->rmem_alloc,
		__entry->sysctl_wmem,
		__entry->wmem_alloc,
		__entry->wmem_queued,
		show_skmem_kind_names(__entry->kind)
	)
);

/*
 * inet_sock_set_state: records a socket state change. Stored are the socket
 * address, the old and new state, the ports converted to host byte order,
 * the address family and protocol, and the IPv4 and IPv6 source/destination
 * addresses. Family, protocol and both states are printed by name.
 */
TRACE_EVENT(inet_sock_set_state,

	TP_PROTO(const struct sock *sk, const int oldstate, const int newstate),

	TP_ARGS(sk, oldstate, newstate),

	TP_STRUCT__entry(
		__field(const void *, skaddr)
		__field(int, oldstate)
		__field(int, newstate)
		__field(__u16, sport)
		__field(__u16, dport)
		__field(__u16, family)
		__field(__u16, protocol)
		__array(__u8, saddr, 4)
		__array(__u8, daddr, 4)
		__array(__u8, saddr_v6, 16)
		__array(__u8, daddr_v6, 16)
	),

	TP_fast_assign(
		const struct inet_sock *inet = inet_sk(sk);
		__be32 *p32;

		__entry->skaddr = sk;
		__entry->oldstate = oldstate;
		__entry->newstate = newstate;

		__entry->family = sk->sk_family;
		__entry->protocol = sk->sk_protocol;
		__entry->sport = ntohs(inet->inet_sport);
		__entry->dport = ntohs(inet->inet_dport);

		/* Store each IPv4 address into its 4-byte array in one write. */
		p32 = (__be32 *) __entry->saddr;
		*p32 = inet->inet_saddr;

		p32 = (__be32 *) __entry->daddr;
		*p32 =  inet->inet_daddr;

		/*
		 * TP_STORE_ADDRS() is not defined in this header -- presumably
		 * it comes from net_probe_common.h and fills the *_v6 arrays;
		 * confirm there.
		 */
		TP_STORE_ADDRS(__entry, inet->inet_saddr, inet->inet_daddr,
			       sk->sk_v6_rcv_saddr, sk->sk_v6_daddr);
	),

	TP_printk("family=%s protocol=%s sport=%hu dport=%hu saddr=%pI4 daddr=%pI4 saddrv6=%pI6c daddrv6=%pI6c oldstate=%s newstate=%s",
			show_family_name(__entry->family),
			show_inet_protocol_name(__entry->protocol),
			__entry->sport, __entry->dport,
			__entry->saddr, __entry->daddr,
			__entry->saddr_v6, __entry->daddr_v6,
			show_tcp_state_name(__entry->oldstate),
			show_tcp_state_name(__entry->newstate))
);

/*
 * inet_sk_error_report: records the socket's sk_err together with the same
 * family/protocol/port/address information as inet_sock_set_state.
 */
TRACE_EVENT(inet_sk_error_report,

	TP_PROTO(const struct sock *sk),

	TP_ARGS(sk),

	TP_STRUCT__entry(
		__field(int, error)
		__field(__u16, sport)
		__field(__u16, dport)
		__field(__u16, family)
		__field(__u16, protocol)
		__array(__u8, saddr, 4)
		__array(__u8, daddr, 4)
		__array(__u8, saddr_v6, 16)
		__array(__u8, daddr_v6, 16)
	),

	TP_fast_assign(
		const struct inet_sock *inet = inet_sk(sk);
		__be32 *p32;

		__entry->error = sk->sk_err;
		__entry->family = sk->sk_family;
		__entry->protocol = sk->sk_protocol;
		__entry->sport = ntohs(inet->inet_sport);
		__entry->dport = ntohs(inet->inet_dport);

		/* Store each IPv4 address into its 4-byte array in one write. */
		p32 = (__be32 *) __entry->saddr;
		*p32 = inet->inet_saddr;

		p32 = (__be32 *) __entry->daddr;
		*p32 =  inet->inet_daddr;

		TP_STORE_ADDRS(__entry, inet->inet_saddr, inet->inet_daddr,
			       sk->sk_v6_rcv_saddr, sk->sk_v6_daddr);
	),

	TP_printk("family=%s protocol=%s sport=%hu dport=%hu saddr=%pI4 daddr=%pI4 saddrv6=%pI6c daddrv6=%pI6c error=%d",
		  show_family_name(__entry->family),
		  show_inet_protocol_name(__entry->protocol),
		  __entry->sport, __entry->dport,
		  __entry->saddr, __entry->daddr,
		  __entry->saddr_v6, __entry->daddr_v6,
		  __entry->error)
);

/*
 * sk_data_ready: records the socket address, family, protocol and _RET_IP_.
 * Family and protocol are printed as plain numbers here; the recorded
 * address is printed as a symbol through %ps.
 */
TRACE_EVENT(sk_data_ready,

	TP_PROTO(const struct sock *sk),

	TP_ARGS(sk),

	TP_STRUCT__entry(
		__field(const void *, skaddr)
		__field(__u16, family)
		__field(__u16, protocol)
		__field(unsigned long, ip)
	),

	TP_fast_assign(
		__entry->skaddr = sk;
		__entry->family = sk->sk_family;
		__entry->protocol = sk->sk_protocol;
		__entry->ip = _RET_IP_;
	),

	TP_printk("family=%u protocol=%u func=%ps",
		  __entry->family, __entry->protocol, (void *)__entry->ip)
);

/*
 * sock send/recv msg length
 *
 * Event class shared by sock_send_length and sock_recv_length. @ret is
 * recorded as is and split up only when printing: "length" shows @ret when
 * it is positive and MSG_PEEK is not set in @flags (0 otherwise), "error"
 * shows @ret when it is negative (0 otherwise).
 */
DECLARE_EVENT_CLASS(sock_msg_length,

	TP_PROTO(struct sock *sk, int ret, int flags),

	TP_ARGS(sk, ret, flags),

	TP_STRUCT__entry(
		__field(void *, sk)
		__field(__u16, family)
		__field(__u16, protocol)
		__field(int, ret)
		__field(int, flags)
	),

	TP_fast_assign(
		__entry->sk = sk;
		__entry->family = sk->sk_family;
		__entry->protocol = sk->sk_protocol;
		__entry->ret = ret;
		__entry->flags = flags;
	),

	TP_printk("sk address = %p, family = %s protocol = %s, length = %d, error = %d, flags = 0x%x",
		  __entry->sk, show_family_name(__entry->family),
		  show_inet_protocol_name(__entry->protocol),
		  !(__entry->flags & MSG_PEEK) ?
		  (__entry->ret > 0 ? __entry->ret : 0) : 0,
		  __entry->ret < 0 ? __entry->ret : 0,
		  __entry->flags)
);

/* Instance of the sock_msg_length class for the send direction. */
DEFINE_EVENT(sock_msg_length, sock_send_length,
	TP_PROTO(struct sock *sk, int ret, int flags),

	TP_ARGS(sk, ret, flags)
);

/* Instance of the sock_msg_length class for the receive direction. */
DEFINE_EVENT(sock_msg_length, sock_recv_length,
	TP_PROTO(struct sock *sk, int ret, int flags),

	TP_ARGS(sk, ret, flags)
);
#endif /* _TRACE_SOCK_H */

/* This part must be outside protection */
#include <trace/define_trace.h>
17 17 17 17 17 3 14 13 2 16 1 16 16 17 1 1 16 1 2 14 1 1 1 1 1 3 2 1 2 14 16 17 17 17 15 4 4 1 3 2 2 2 2 1 2 3 2 2 1 4 2 4 4 4 4 2 2 15 2 15 126 91 11 49 1 1 1 1 1 1 1 1 1 1 1 2 2 2 1 1 1 1 1 1 2 2 2 1 1 16 16 1 16 7 14 15 2 2 1 1 2 14 16 15 23 23 23 23 5 1 4 4 1 1 2 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 
460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 
960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 
1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 
1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 
2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338
// SPDX-License-Identifier: GPL-2.0
/*
 * fs/f2fs/gc.c
 *
 * Copyright (c) 2012 Samsung Electronics Co., Ltd.
 *             http://www.samsung.com/
 */
#include <linux/fs.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/f2fs_fs.h>
#include <linux/kthread.h>
#include <linux/delay.h>
#include <linux/freezer.h>
#include <linux/sched/signal.h>
#include <linux/random.h>
#include <linux/sched/mm.h>

#include "f2fs.h"
#include "node.h"
#include "segment.h"
#include "gc.h"
#include "iostat.h"
#include <trace/events/f2fs.h>

static struct kmem_cache *victim_entry_slab;

/* Forward declaration; the definition is not part of this section. */
static unsigned int count_bits(const unsigned long *addr,
				unsigned int offset, unsigned int len);

/*
 * gc_thread_func - main loop of the garbage collection kthread
 * @data: the struct f2fs_sb_info of the filesystem
 *
 * Each iteration sleeps for up to wait_ms milliseconds (or until the thread
 * is asked to stop, a waiter shows up on fggc_wq, or gc_wake is set), then
 * decides whether to run one f2fs_gc() pass and how long to sleep next.
 * wait_ms starts at min_sleep_time and is adjusted along the way.
 *
 * Return: always 0, once kthread_should_stop() becomes true.
 */
static int gc_thread_func(void *data)
{
	struct f2fs_sb_info *sbi = data;
	struct f2fs_gc_kthread *gc_th = sbi->gc_thread;
	wait_queue_head_t *wq = &sbi->gc_thread->gc_wait_queue_head;
	wait_queue_head_t *fggc_wq = &sbi->gc_thread->fggc_wq;
	unsigned int wait_ms;
	struct f2fs_gc_control gc_control = {
		.victim_segno = NULL_SEGNO,
		.should_migrate_blocks = false,
		.err_gc_skipped = false };

	wait_ms = gc_th->min_sleep_time;

	set_freezable();
	do {
		bool sync_mode, foreground = false;

		wait_event_freezable_timeout(*wq,
				kthread_should_stop() ||
				waitqueue_active(fggc_wq) ||
				gc_th->gc_wake,
				msecs_to_jiffies(wait_ms));

		/*
		 * With the GC_MERGE option set, a waiter on fggc_wq turns
		 * this pass into a foreground one.
		 */
		if (test_opt(sbi, GC_MERGE) && waitqueue_active(fggc_wq))
			foreground = true;

		/* give it a try one time */
		if (gc_th->gc_wake)
			gc_th->gc_wake = false;

		if (f2fs_readonly(sbi->sb)) {
			stat_other_skip_bggc_count(sbi);
			continue;
		}
		if (kthread_should_stop())
			break;

		/* Filesystem is frozen for writes: back off and retry later. */
		if (sbi->sb->s_writers.frozen >= SB_FREEZE_WRITE) {
			increase_sleep_time(gc_th, &wait_ms);
			stat_other_skip_bggc_count(sbi);
			continue;
		}

		if (time_to_inject(sbi, FAULT_CHECKPOINT))
			f2fs_stop_checkpoint(sbi, false,
					STOP_CP_REASON_FAULT_INJECT);

		/* Paired with sb_end_write() at the bottom of the loop. */
		if (!sb_start_write_trylock(sbi->sb)) {
			stat_other_skip_bggc_count(sbi);
			continue;
		}

		gc_control.one_time = false;

		/*
		 * [GC triggering condition]
		 * 0. GC is not conducted currently.
		 * 1. There are enough dirty segments.
		 * 2. IO subsystem is idle by checking the # of writeback pages.
		 * 3. IO subsystem is idle by checking the # of requests in
		 *    bdev's request list.
		 *
		 * Note) We have to avoid triggering GCs frequently.
		 * Because it is possible that some segments can be
		 * invalidated soon after by user update or deletion.
		 * So, I'd like to wait some time to collect dirty segments.
		 */
		/* Urgent modes skip the idle/boost checks and block on gc_lock. */
		if (sbi->gc_mode == GC_URGENT_HIGH ||
				sbi->gc_mode == GC_URGENT_MID) {
			wait_ms = gc_th->urgent_sleep_time;
			f2fs_down_write(&sbi->gc_lock);
			goto do_gc;
		}

		/*
		 * A foreground pass blocks on gc_lock; a background pass only
		 * tries the lock and skips this round when it is contended.
		 */
		if (foreground) {
			f2fs_down_write(&sbi->gc_lock);
			goto do_gc;
		} else if (!f2fs_down_write_trylock(&sbi->gc_lock)) {
			stat_other_skip_bggc_count(sbi);
			goto next;
		}

		if (!is_idle(sbi, GC_TIME)) {
			increase_sleep_time(gc_th, &wait_ms);
			f2fs_up_write(&sbi->gc_lock);
			stat_io_skip_bggc_count(sbi);
			goto next;
		}

		if (f2fs_sb_has_blkzoned(sbi)) {
			if (has_enough_free_blocks(sbi,
				gc_th->no_zoned_gc_percent)) {
				wait_ms = gc_th->no_gc_sleep_time;
				f2fs_up_write(&sbi->gc_lock);
				goto next;
			}
			/* Leave the "no GC" interval once free blocks run low. */
			if (wait_ms == gc_th->no_gc_sleep_time)
				wait_ms = gc_th->max_sleep_time;
		}

		if (need_to_boost_gc(sbi)) {
			decrease_sleep_time(gc_th, &wait_ms);
			if (f2fs_sb_has_blkzoned(sbi))
				gc_control.one_time = true;
		} else {
			increase_sleep_time(gc_th, &wait_ms);
		}
do_gc:
		/*
		 * gc_lock is held on every path that reaches this label and is
		 * not released in this function afterwards.
		 * NOTE(review): presumably f2fs_gc() drops it -- confirm.
		 */
		stat_inc_gc_call_count(sbi, foreground ?
					FOREGROUND : BACKGROUND);

		sync_mode = (F2FS_OPTION(sbi).bggc_mode == BGGC_MODE_SYNC) ||
			gc_control.one_time;

		/* foreground GC has been triggered via f2fs_balance_fs() */
		if (foreground)
			sync_mode = false;

		gc_control.init_gc_type = sync_mode ? FG_GC : BG_GC;
		gc_control.no_bg_gc = foreground;
		gc_control.nr_free_secs = foreground ? 1 : 0;

		/* if return value is not zero, no victim was selected */
		if (f2fs_gc(sbi, &gc_control)) {
			/* don't bother wait_ms by foreground gc */
			if (!foreground)
				wait_ms = gc_th->no_gc_sleep_time;
		} else {
			/* reset wait_ms to default sleep time */
			if (wait_ms == gc_th->no_gc_sleep_time)
				wait_ms = gc_th->min_sleep_time;
		}

		/* Wake everyone waiting on fggc_wq after a foreground pass. */
		if (foreground)
			wake_up_all(&gc_th->fggc_wq);

		trace_f2fs_background_gc(sbi->sb, wait_ms,
				prefree_segments(sbi), free_segments(sbi));

		/* balancing f2fs's metadata periodically */
		f2fs_balance_fs_bg(sbi, true);
next:
		/*
		 * In a non-normal gc_mode, count down gc_remaining_trials (when
		 * it is non-zero) and fall back to GC_NORMAL once it hits zero.
		 */
		if (sbi->gc_mode != GC_NORMAL) {
			spin_lock(&sbi->gc_remaining_trials_lock);
			if (sbi->gc_remaining_trials) {
				sbi->gc_remaining_trials--;
				if (!sbi->gc_remaining_trials)
					sbi->gc_mode = GC_NORMAL;
			}
			spin_unlock(&sbi->gc_remaining_trials_lock);
		}
		sb_end_write(sbi->sb);

	} while (!kthread_should_stop());
	return 0;
}

int f2fs_start_gc_thread(struct f2fs_sb_info *sbi)
{
	struct f2fs_gc_kthread *gc_th;
	dev_t dev = sbi->sb->s_bdev->bd_dev;

	gc_th = f2fs_kmalloc(sbi, sizeof(struct f2fs_gc_kthread), GFP_KERNEL);
	if (!gc_th)
		return -ENOMEM;

	gc_th->urgent_sleep_time = DEF_GC_THREAD_URGENT_SLEEP_TIME;
	gc_th->valid_thresh_ratio = DEF_GC_THREAD_VALID_THRESH_RATIO;

	if (f2fs_sb_has_blkzoned(sbi)) {
		gc_th->min_sleep_time = DEF_GC_THREAD_MIN_SLEEP_TIME_ZONED;
		gc_th->max_sleep_time = DEF_GC_THREAD_MAX_SLEEP_TIME_ZONED;
		gc_th->no_gc_sleep_time = DEF_GC_THREAD_NOGC_SLEEP_TIME_ZONED;
		gc_th->no_zoned_gc_percent = LIMIT_NO_ZONED_GC;
		gc_th->boost_zoned_gc_percent = LIMIT_BOOST_ZONED_GC;
	} else {
		gc_th->min_sleep_time = DEF_GC_THREAD_MIN_SLEEP_TIME;
		gc_th->max_sleep_time = DEF_GC_THREAD_MAX_SLEEP_TIME;
		gc_th->no_gc_sleep_time = DEF_GC_THREAD_NOGC_SLEEP_TIME;
		gc_th->no_zoned_gc_percent = 0;
		gc_th->boost_zoned_gc_percent = 0;
	}

	gc_th->gc_wake = false;

	sbi->gc_thread = gc_th;
	init_waitqueue_head(&sbi->gc_thread->gc_wait_queue_head);
	init_waitqueue_head(&sbi->gc_thread->fggc_wq);
	sbi->gc_thread->f2fs_gc_task = kthread_run(gc_thread_func, sbi,
			"f2fs_gc-%u:%u", MAJOR(dev),
MINOR(dev)); if (IS_ERR(gc_th->f2fs_gc_task)) { int err = PTR_ERR(gc_th->f2fs_gc_task); kfree(gc_th); sbi->gc_thread = NULL; return err; } return 0; } void f2fs_stop_gc_thread(struct f2fs_sb_info *sbi) { struct f2fs_gc_kthread *gc_th = sbi->gc_thread; if (!gc_th) return; kthread_stop(gc_th->f2fs_gc_task); wake_up_all(&gc_th->fggc_wq); kfree(gc_th); sbi->gc_thread = NULL; } static int select_gc_type(struct f2fs_sb_info *sbi, int gc_type) { int gc_mode; if (gc_type == BG_GC) { if (sbi->am.atgc_enabled) gc_mode = GC_AT; else gc_mode = GC_CB; } else { gc_mode = GC_GREEDY; } switch (sbi->gc_mode) { case GC_IDLE_CB: case GC_URGENT_LOW: case GC_URGENT_MID: gc_mode = GC_CB; break; case GC_IDLE_GREEDY: case GC_URGENT_HIGH: gc_mode = GC_GREEDY; break; case GC_IDLE_AT: gc_mode = GC_AT; break; } return gc_mode; } static void select_policy(struct f2fs_sb_info *sbi, int gc_type, int type, struct victim_sel_policy *p) { struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); if (p->alloc_mode == SSR) { p->gc_mode = GC_GREEDY; p->dirty_bitmap = dirty_i->dirty_segmap[type]; p->max_search = dirty_i->nr_dirty[type]; p->ofs_unit = 1; } else if (p->alloc_mode == AT_SSR) { p->gc_mode = GC_GREEDY; p->dirty_bitmap = dirty_i->dirty_segmap[type]; p->max_search = dirty_i->nr_dirty[type]; p->ofs_unit = 1; } else { p->gc_mode = select_gc_type(sbi, gc_type); p->ofs_unit = SEGS_PER_SEC(sbi); if (__is_large_section(sbi)) { p->dirty_bitmap = dirty_i->dirty_secmap; p->max_search = count_bits(p->dirty_bitmap, 0, MAIN_SECS(sbi)); } else { p->dirty_bitmap = dirty_i->dirty_segmap[DIRTY]; p->max_search = dirty_i->nr_dirty[DIRTY]; } } /* * adjust candidates range, should select all dirty segments for * foreground GC and urgent GC cases. */ if (gc_type != FG_GC && (sbi->gc_mode != GC_URGENT_HIGH) && (p->gc_mode != GC_AT && p->alloc_mode != AT_SSR) && p->max_search > sbi->max_victim_search) p->max_search = sbi->max_victim_search; /* let's select beginning hot/small space first. 
*/ if (f2fs_need_rand_seg(sbi)) p->offset = get_random_u32_below(MAIN_SECS(sbi) * SEGS_PER_SEC(sbi)); else if (type == CURSEG_HOT_DATA || IS_NODESEG(type)) p->offset = 0; else p->offset = SIT_I(sbi)->last_victim[p->gc_mode]; } static unsigned int get_max_cost(struct f2fs_sb_info *sbi, struct victim_sel_policy *p) { /* SSR allocates in a segment unit */ if (p->alloc_mode == SSR) return BLKS_PER_SEG(sbi); else if (p->alloc_mode == AT_SSR) return UINT_MAX; /* LFS */ if (p->gc_mode == GC_GREEDY) return SEGS_TO_BLKS(sbi, 2 * p->ofs_unit); else if (p->gc_mode == GC_CB) return UINT_MAX; else if (p->gc_mode == GC_AT) return UINT_MAX; else /* No other gc_mode */ return 0; } static unsigned int check_bg_victims(struct f2fs_sb_info *sbi) { struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); unsigned int secno; /* * If the gc_type is FG_GC, we can select victim segments * selected by background GC before. * Those segments guarantee they have small valid blocks. */ for_each_set_bit(secno, dirty_i->victim_secmap, MAIN_SECS(sbi)) { if (sec_usage_check(sbi, secno)) continue; clear_bit(secno, dirty_i->victim_secmap); return GET_SEG_FROM_SEC(sbi, secno); } return NULL_SEGNO; } static unsigned int get_cb_cost(struct f2fs_sb_info *sbi, unsigned int segno) { struct sit_info *sit_i = SIT_I(sbi); unsigned long long mtime = 0; unsigned int vblocks; unsigned char age = 0; unsigned char u; unsigned int usable_segs_per_sec = f2fs_usable_segs_in_sec(sbi); mtime = f2fs_get_section_mtime(sbi, segno); f2fs_bug_on(sbi, mtime == INVALID_MTIME); vblocks = get_valid_blocks(sbi, segno, true); vblocks = div_u64(vblocks, usable_segs_per_sec); u = BLKS_TO_SEGS(sbi, vblocks * 100); /* Handle if the system time has changed by the user */ if (mtime < sit_i->min_mtime) sit_i->min_mtime = mtime; if (mtime > sit_i->max_mtime) sit_i->max_mtime = mtime; if (sit_i->max_mtime != sit_i->min_mtime) age = 100 - div64_u64(100 * (mtime - sit_i->min_mtime), sit_i->max_mtime - sit_i->min_mtime); return UINT_MAX - ((100 * 
(100 - u) * age) / (100 + u)); } static inline unsigned int get_gc_cost(struct f2fs_sb_info *sbi, unsigned int segno, struct victim_sel_policy *p) { if (p->alloc_mode == SSR) return get_seg_entry(sbi, segno)->ckpt_valid_blocks; if (p->one_time_gc && (get_valid_blocks(sbi, segno, true) >= CAP_BLKS_PER_SEC(sbi) * sbi->gc_thread->valid_thresh_ratio / 100)) return UINT_MAX; /* alloc_mode == LFS */ if (p->gc_mode == GC_GREEDY) return get_valid_blocks(sbi, segno, true); else if (p->gc_mode == GC_CB) return get_cb_cost(sbi, segno); f2fs_bug_on(sbi, 1); return 0; } static unsigned int count_bits(const unsigned long *addr, unsigned int offset, unsigned int len) { unsigned int end = offset + len, sum = 0; while (offset < end) { if (test_bit(offset++, addr)) ++sum; } return sum; } static bool f2fs_check_victim_tree(struct f2fs_sb_info *sbi, struct rb_root_cached *root) { #ifdef CONFIG_F2FS_CHECK_FS struct rb_node *cur = rb_first_cached(root), *next; struct victim_entry *cur_ve, *next_ve; while (cur) { next = rb_next(cur); if (!next) return true; cur_ve = rb_entry(cur, struct victim_entry, rb_node); next_ve = rb_entry(next, struct victim_entry, rb_node); if (cur_ve->mtime > next_ve->mtime) { f2fs_info(sbi, "broken victim_rbtree, " "cur_mtime(%llu) next_mtime(%llu)", cur_ve->mtime, next_ve->mtime); return false; } cur = next; } #endif return true; } static struct victim_entry *__lookup_victim_entry(struct f2fs_sb_info *sbi, unsigned long long mtime) { struct atgc_management *am = &sbi->am; struct rb_node *node = am->root.rb_root.rb_node; struct victim_entry *ve = NULL; while (node) { ve = rb_entry(node, struct victim_entry, rb_node); if (mtime < ve->mtime) node = node->rb_left; else node = node->rb_right; } return ve; } static struct victim_entry *__create_victim_entry(struct f2fs_sb_info *sbi, unsigned long long mtime, unsigned int segno) { struct atgc_management *am = &sbi->am; struct victim_entry *ve; ve = f2fs_kmem_cache_alloc(victim_entry_slab, GFP_NOFS, true, NULL); 
ve->mtime = mtime; ve->segno = segno; list_add_tail(&ve->list, &am->victim_list); am->victim_count++; return ve; } static void __insert_victim_entry(struct f2fs_sb_info *sbi, unsigned long long mtime, unsigned int segno) { struct atgc_management *am = &sbi->am; struct rb_root_cached *root = &am->root; struct rb_node **p = &root->rb_root.rb_node; struct rb_node *parent = NULL; struct victim_entry *ve; bool left_most = true; /* look up rb tree to find parent node */ while (*p) { parent = *p; ve = rb_entry(parent, struct victim_entry, rb_node); if (mtime < ve->mtime) { p = &(*p)->rb_left; } else { p = &(*p)->rb_right; left_most = false; } } ve = __create_victim_entry(sbi, mtime, segno); rb_link_node(&ve->rb_node, parent, p); rb_insert_color_cached(&ve->rb_node, root, left_most); } static void add_victim_entry(struct f2fs_sb_info *sbi, struct victim_sel_policy *p, unsigned int segno) { struct sit_info *sit_i = SIT_I(sbi); unsigned long long mtime = 0; if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED))) { if (p->gc_mode == GC_AT && get_valid_blocks(sbi, segno, true) == 0) return; } mtime = f2fs_get_section_mtime(sbi, segno); f2fs_bug_on(sbi, mtime == INVALID_MTIME); /* Handle if the system time has changed by the user */ if (mtime < sit_i->min_mtime) sit_i->min_mtime = mtime; if (mtime > sit_i->max_mtime) sit_i->max_mtime = mtime; if (mtime < sit_i->dirty_min_mtime) sit_i->dirty_min_mtime = mtime; if (mtime > sit_i->dirty_max_mtime) sit_i->dirty_max_mtime = mtime; /* don't choose young section as candidate */ if (sit_i->dirty_max_mtime - mtime < p->age_threshold) return; __insert_victim_entry(sbi, mtime, segno); } static void atgc_lookup_victim(struct f2fs_sb_info *sbi, struct victim_sel_policy *p) { struct sit_info *sit_i = SIT_I(sbi); struct atgc_management *am = &sbi->am; struct rb_root_cached *root = &am->root; struct rb_node *node; struct victim_entry *ve; unsigned long long total_time; unsigned long long age, u, accu; unsigned long long max_mtime = 
sit_i->dirty_max_mtime; unsigned long long min_mtime = sit_i->dirty_min_mtime; unsigned int sec_blocks = CAP_BLKS_PER_SEC(sbi); unsigned int vblocks; unsigned int dirty_threshold = max(am->max_candidate_count, am->candidate_ratio * am->victim_count / 100); unsigned int age_weight = am->age_weight; unsigned int cost; unsigned int iter = 0; if (max_mtime < min_mtime) return; max_mtime += 1; total_time = max_mtime - min_mtime; accu = div64_u64(ULLONG_MAX, total_time); accu = min_t(unsigned long long, div_u64(accu, 100), DEFAULT_ACCURACY_CLASS); node = rb_first_cached(root); next: ve = rb_entry_safe(node, struct victim_entry, rb_node); if (!ve) return; if (ve->mtime >= max_mtime || ve->mtime < min_mtime) goto skip; /* age = 10000 * x% * 60 */ age = div64_u64(accu * (max_mtime - ve->mtime), total_time) * age_weight; vblocks = get_valid_blocks(sbi, ve->segno, true); f2fs_bug_on(sbi, !vblocks || vblocks == sec_blocks); /* u = 10000 * x% * 40 */ u = div64_u64(accu * (sec_blocks - vblocks), sec_blocks) * (100 - age_weight); f2fs_bug_on(sbi, age + u >= UINT_MAX); cost = UINT_MAX - (age + u); iter++; if (cost < p->min_cost || (cost == p->min_cost && age > p->oldest_age)) { p->min_cost = cost; p->oldest_age = age; p->min_segno = ve->segno; } skip: if (iter < dirty_threshold) { node = rb_next(node); goto next; } } /* * select candidates around source section in range of * [target - dirty_threshold, target + dirty_threshold] */ static void atssr_lookup_victim(struct f2fs_sb_info *sbi, struct victim_sel_policy *p) { struct sit_info *sit_i = SIT_I(sbi); struct atgc_management *am = &sbi->am; struct victim_entry *ve; unsigned long long age; unsigned long long max_mtime = sit_i->dirty_max_mtime; unsigned long long min_mtime = sit_i->dirty_min_mtime; unsigned int vblocks; unsigned int dirty_threshold = max(am->max_candidate_count, am->candidate_ratio * am->victim_count / 100); unsigned int cost, iter; int stage = 0; if (max_mtime < min_mtime) return; max_mtime += 1; next_stage: iter 
= 0; ve = __lookup_victim_entry(sbi, p->age); next_node: if (!ve) { if (stage++ == 0) goto next_stage; return; } if (ve->mtime >= max_mtime || ve->mtime < min_mtime) goto skip_node; age = max_mtime - ve->mtime; vblocks = get_seg_entry(sbi, ve->segno)->ckpt_valid_blocks; f2fs_bug_on(sbi, !vblocks); /* rare case */ if (vblocks == BLKS_PER_SEG(sbi)) goto skip_node; iter++; age = max_mtime - abs(p->age - age); cost = UINT_MAX - vblocks; if (cost < p->min_cost || (cost == p->min_cost && age > p->oldest_age)) { p->min_cost = cost; p->oldest_age = age; p->min_segno = ve->segno; } skip_node: if (iter < dirty_threshold) { ve = rb_entry(stage == 0 ? rb_prev(&ve->rb_node) : rb_next(&ve->rb_node), struct victim_entry, rb_node); goto next_node; } if (stage++ == 0) goto next_stage; } static void lookup_victim_by_age(struct f2fs_sb_info *sbi, struct victim_sel_policy *p) { f2fs_bug_on(sbi, !f2fs_check_victim_tree(sbi, &sbi->am.root)); if (p->gc_mode == GC_AT) atgc_lookup_victim(sbi, p); else if (p->alloc_mode == AT_SSR) atssr_lookup_victim(sbi, p); else f2fs_bug_on(sbi, 1); } static void release_victim_entry(struct f2fs_sb_info *sbi) { struct atgc_management *am = &sbi->am; struct victim_entry *ve, *tmp; list_for_each_entry_safe(ve, tmp, &am->victim_list, list) { list_del(&ve->list); kmem_cache_free(victim_entry_slab, ve); am->victim_count--; } am->root = RB_ROOT_CACHED; f2fs_bug_on(sbi, am->victim_count); f2fs_bug_on(sbi, !list_empty(&am->victim_list)); } static bool f2fs_pin_section(struct f2fs_sb_info *sbi, unsigned int segno) { struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); unsigned int secno = GET_SEC_FROM_SEG(sbi, segno); if (!dirty_i->enable_pin_section) return false; if (!test_and_set_bit(secno, dirty_i->pinned_secmap)) dirty_i->pinned_secmap_cnt++; return true; } static bool f2fs_pinned_section_exists(struct dirty_seglist_info *dirty_i) { return dirty_i->pinned_secmap_cnt; } static bool f2fs_section_is_pinned(struct dirty_seglist_info *dirty_i, unsigned int secno) { 
return dirty_i->enable_pin_section && f2fs_pinned_section_exists(dirty_i) && test_bit(secno, dirty_i->pinned_secmap); } static void f2fs_unpin_all_sections(struct f2fs_sb_info *sbi, bool enable) { unsigned int bitmap_size = f2fs_bitmap_size(MAIN_SECS(sbi)); if (f2fs_pinned_section_exists(DIRTY_I(sbi))) { memset(DIRTY_I(sbi)->pinned_secmap, 0, bitmap_size); DIRTY_I(sbi)->pinned_secmap_cnt = 0; } DIRTY_I(sbi)->enable_pin_section = enable; } static int f2fs_gc_pinned_control(struct inode *inode, int gc_type, unsigned int segno) { if (!f2fs_is_pinned_file(inode)) return 0; if (gc_type != FG_GC) return -EBUSY; if (!f2fs_pin_section(F2FS_I_SB(inode), segno)) f2fs_pin_file_control(inode, true); return -EAGAIN; } /* * This function is called from two paths. * One is garbage collection and the other is SSR segment selection. * When it is called during GC, it just gets a victim segment * and it does not remove it from dirty seglist. * When it is called from SSR segment selection, it finds a segment * which has minimum valid blocks and removes it from dirty seglist. 
*/
/*
 * @result is both an input and an output. When *@result is not NULL_SEGNO on
 * entry, that segment is taken as the victim after two checks: it must still
 * have valid blocks (else -ENODATA) and its section must pass
 * sec_usage_check() (else -EBUSY). Otherwise the dirty bitmap chosen by
 * select_policy() is scanned and, on success, *@result receives the chosen
 * segment (rounded down to a multiple of p.ofs_unit on the scan path).
 *
 * Returns 0 when a victim was selected, -ENODATA when none was found, or
 * -EBUSY as described. dirty_i->seglist_lock is held for the whole call.
 */
int f2fs_get_victim(struct f2fs_sb_info *sbi, unsigned int *result,
			int gc_type, int type, char alloc_mode,
			unsigned long long age, bool one_time)
{
	struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
	struct sit_info *sm = SIT_I(sbi);
	struct victim_sel_policy p;
	unsigned int secno, last_victim;
	unsigned int last_segment;
	unsigned int nsearched;
	bool is_atgc;
	int ret = 0;

	mutex_lock(&dirty_i->seglist_lock);
	/* upper bound of the scan, in segments; may shrink on wrap-around */
	last_segment = MAIN_SECS(sbi) * SEGS_PER_SEC(sbi);

	p.alloc_mode = alloc_mode;
	p.age = age;
	p.age_threshold = sbi->am.age_threshold;
	p.one_time_gc = one_time;

	/*
	 * Re-entered once with p.age_threshold cleared when the age-based
	 * lookup below produced no victim.
	 */
retry:
	select_policy(sbi, gc_type, type, &p);
	p.min_segno = NULL_SEGNO;
	p.oldest_age = 0;
	p.min_cost = get_max_cost(sbi, &p);

	is_atgc = (p.gc_mode == GC_AT || p.alloc_mode == AT_SSR);
	nsearched = 0;

	/* start from the maximum so the scan can only lower it */
	if (is_atgc)
		SIT_I(sbi)->dirty_min_mtime = ULLONG_MAX;

	/* caller supplied a specific victim: validate it and skip the scan */
	if (*result != NULL_SEGNO) {
		if (!get_valid_blocks(sbi, *result, false)) {
			ret = -ENODATA;
			goto out;
		}

		if (sec_usage_check(sbi, GET_SEC_FROM_SEG(sbi, *result))) {
			ret = -EBUSY;
			goto out;
		}
		if (gc_type == FG_GC)
			clear_bit(GET_SEC_FROM_SEG(sbi, *result), dirty_i->victim_secmap);
		p.min_segno = *result;
		goto got_result;
	}

	/* from here on, failing to find anything means -ENODATA */
	ret = -ENODATA;
	if (p.max_search == 0)
		goto out;

	/*
	 * Large-section LFS: consume a queued next_victim_seg first. The
	 * BG_GC slot is checked for any gc_type, the FG_GC slot only for FG_GC.
	 */
	if (__is_large_section(sbi) && p.alloc_mode == LFS) {
		if (sbi->next_victim_seg[BG_GC] != NULL_SEGNO) {
			p.min_segno = sbi->next_victim_seg[BG_GC];
			*result = p.min_segno;
			sbi->next_victim_seg[BG_GC] = NULL_SEGNO;
			goto got_result;
		}
		if (gc_type == FG_GC &&
				sbi->next_victim_seg[FG_GC] != NULL_SEGNO) {
			p.min_segno = sbi->next_victim_seg[FG_GC];
			*result = p.min_segno;
			sbi->next_victim_seg[FG_GC] = NULL_SEGNO;
			goto got_result;
		}
	}

	/* remember where the previous search stopped before it is updated */
	last_victim = sm->last_victim[p.gc_mode];
	if (p.alloc_mode == LFS && gc_type == FG_GC) {
		p.min_segno = check_bg_victims(sbi);
		if (p.min_segno != NULL_SEGNO)
			goto got_it;
	}

	while (1) {
		unsigned long cost, *dirty_bitmap;
		unsigned int unit_no, segno;

		/* the bitmap is indexed in units of p.ofs_unit segments */
		dirty_bitmap = p.dirty_bitmap;
		unit_no = find_next_bit(dirty_bitmap,
				last_segment / p.ofs_unit,
				p.offset / p.ofs_unit);
		segno = unit_no * p.ofs_unit;
		if (segno >= last_segment) {
			/*
			 * End of range. If a previous stop position is
			 * recorded, wrap around once: rescan from 0 up to that
			 * position and clear it so the wrap cannot repeat.
			 */
			if (sm->last_victim[p.gc_mode]) {
				last_segment =
					sm->last_victim[p.gc_mode];
				sm->last_victim[p.gc_mode] = 0;
				p.offset = 0;
				continue;
			}
			break;
		}

		p.offset = segno + p.ofs_unit;
		nsearched++;

#ifdef CONFIG_F2FS_CHECK_FS
		/*
		 * skip selecting the invalid segno (that is failed due to block
		 * validity check failure during GC) to avoid endless GC loop in
		 * such cases.
		 */
		if (test_bit(segno, sm->invalid_segmap))
			goto next;
#endif

		secno = GET_SEC_FROM_SEG(sbi, segno);

		if (sec_usage_check(sbi, secno))
			goto next;

		/* Don't touch checkpointed data */
		if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED))) {
			if (p.alloc_mode == LFS) {
				/*
				 * LFS is set to find source section during GC.
				 * The victim should have no checkpointed data.
				 */
				if (get_ckpt_valid_blocks(sbi, segno, true))
					goto next;
			} else {
				/*
				 * SSR | AT_SSR are set to find target segment
				 * for writes which can be full by checkpointed
				 * and newly written blocks.
				 */
				if (!f2fs_segment_has_free_slot(sbi, segno))
					goto next;
			}
		}

		/* BG_GC skips sections already marked in victim_secmap */
		if (gc_type == BG_GC && test_bit(secno, dirty_i->victim_secmap))
			goto next;

		/* FG_GC skips pinned sections */
		if (gc_type == FG_GC && f2fs_section_is_pinned(dirty_i, secno))
			goto next;

		/* age-based modes only collect candidates here; choice is later */
		if (is_atgc) {
			add_victim_entry(sbi, &p, segno);
			goto next;
		}

		cost = get_gc_cost(sbi, segno, &p);

		/* strictly lower cost wins, so the first of equal candidates stays */
		if (p.min_cost > cost) {
			p.min_segno = segno;
			p.min_cost = cost;
		}
next:
		/*
		 * Search budget used up: record the position stored in
		 * sm->last_victim[], which select_policy() reads back as the
		 * starting offset, wrapped to the total number of segments.
		 */
		if (nsearched >= p.max_search) {
			if (!sm->last_victim[p.gc_mode] && segno <= last_victim)
				sm->last_victim[p.gc_mode] =
					last_victim + p.ofs_unit;
			else
				sm->last_victim[p.gc_mode] = segno + p.ofs_unit;
			sm->last_victim[p.gc_mode] %=
				(MAIN_SECS(sbi) * SEGS_PER_SEC(sbi));
			break;
		}
	}

	/* get victim for GC_AT/AT_SSR */
	if (is_atgc) {
		lookup_victim_by_age(sbi, &p);
		release_victim_entry(sbi);
	}

	/* nothing old enough: drop the age threshold and search again */
	if (is_atgc && p.min_segno == NULL_SEGNO &&
			sm->elapsed_time < p.age_threshold) {
		p.age_threshold = 0;
		goto retry;
	}

	/* got_it and got_result are also reached by goto from the paths above */
	if (p.min_segno != NULL_SEGNO) {
got_it:
		*result = (p.min_segno / p.ofs_unit) * p.ofs_unit;
got_result:
		if (p.alloc_mode == LFS) {
			secno = GET_SEC_FROM_SEG(sbi, p.min_segno);
			/* FG_GC records the section; other types mark victim_secmap */
			if (gc_type == FG_GC)
				sbi->cur_victim_sec = secno;
			else
				set_bit(secno, dirty_i->victim_secmap);
		}
		ret = 0;

	}
out:
	if (p.min_segno != NULL_SEGNO)
		trace_f2fs_get_victim(sbi->sb, type, gc_type, &p,
				sbi->cur_victim_sec,
				prefree_segments(sbi), free_segments(sbi));

	mutex_unlock(&dirty_i->seglist_lock);

	return ret;
}

/*
 * Look up @ino in the radix tree of @gc_list.
 * Returns the inode stored in the matching entry, or NULL when there is none.
 */
static struct inode *find_gc_inode(struct gc_inode_list *gc_list, nid_t ino)
{
	struct inode_entry *ie;

	ie = radix_tree_lookup(&gc_list->iroot, ino);
	if (ie)
		return ie->inode;
	return NULL;
}

/*
 * Record @inode in @gc_list (radix tree keyed by i_ino, plus the ilist).
 * If the same inode is already recorded, iput() is called on @inode and
 * nothing is added.
 */
static void add_gc_inode(struct gc_inode_list *gc_list, struct inode *inode)
{
	struct inode_entry *new_ie;

	if (inode == find_gc_inode(gc_list, inode->i_ino)) {
		iput(inode);
		return;
	}
	/*
	 * NOTE(review): the result is used without a NULL check; presumably the
	 * `true` argument requests a non-failing allocation - confirm against
	 * f2fs_kmem_cache_alloc().
	 */
	new_ie = f2fs_kmem_cache_alloc(f2fs_inode_entry_slab,
					GFP_NOFS, true, NULL);
	new_ie->inode = inode;

	f2fs_radix_tree_insert(&gc_list->iroot, inode->i_ino, new_ie);
	list_add_tail(&new_ie->list, &gc_list->ilist);
}

/*
 * Empty @gc_list: for every entry, remove it from the radix tree, iput() its
 * inode, unlink it from the list and free it back to f2fs_inode_entry_slab.
 */
static void put_gc_inode(struct gc_inode_list *gc_list)
{
	struct inode_entry *ie, *next_ie;

	list_for_each_entry_safe(ie, next_ie, &gc_list->ilist, list) {
		radix_tree_delete(&gc_list->iroot, ie->inode->i_ino);
		iput(ie->inode);
		list_del(&ie->list);
		kmem_cache_free(f2fs_inode_entry_slab, ie);
	}
}

/*
 * Test bit @offset of segment @segno's cur_valid_map while holding
 * sit_i->sentry_lock for reading. Returns the result of f2fs_test_bit();
 * callers in this file treat 0 as "block is no longer valid".
 */
static int check_valid_map(struct f2fs_sb_info *sbi,
				unsigned int segno, int offset)
{
	struct sit_info *sit_i = SIT_I(sbi);
	struct seg_entry *sentry;
	int ret;

	down_read(&sit_i->sentry_lock);
	sentry = get_seg_entry(sbi, segno);
	ret = f2fs_test_bit(offset, sentry->cur_valid_map);
	up_read(&sit_i->sentry_lock);
	return ret;
}

/*
 * This function compares node address got in summary with that in NAT.
 * On validity, copy that node with cold status, otherwise (invalid node)
 * ignore that.
*/
static int gc_node_segment(struct f2fs_sb_info *sbi,
		struct f2fs_summary *sum, unsigned int segno, int gc_type)
{
	const bool fggc = (gc_type == FG_GC);
	unsigned int nr_blks = f2fs_usable_blks_in_seg(sbi, segno);
	block_t seg_base = START_BLOCK(sbi, segno);
	int submitted = 0;
	int phase;

	/*
	 * Three passes over the summary entries of the segment:
	 *   phase 0 - read ahead the NAT block of each still-valid node,
	 *   phase 1 - read ahead the node page itself,
	 *   phase 2 - re-check the node and move it.
	 * The return value counts the FG_GC moves that reported no error.
	 */
	for (phase = 0; phase < 3; phase++) {
		struct f2fs_summary *entry = sum;
		int off;

		/* raised only around the moving pass of FG_GC; dropped at the end */
		if (fggc && phase == 2)
			atomic_inc(&sbi->wb_sync_req[NODE]);

		for (off = 0; off < nr_blks; off++, entry++) {
			nid_t nid = le32_to_cpu(entry->nid);
			struct page *node_page;
			struct node_info ni;

			/* stop BG_GC if there is not enough free sections. */
			if (gc_type == BG_GC && has_not_enough_free_secs(sbi, 0, 0))
				return submitted;

			if (!check_valid_map(sbi, segno, off))
				continue;

			/* the first two passes only issue readahead */
			if (phase < 2) {
				if (phase == 0)
					f2fs_ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nid), 1,
								META_NAT, true);
				else
					f2fs_ra_node_page(sbi, nid);
				continue;
			}

			node_page = f2fs_get_node_page(sbi, nid);
			if (IS_ERR(node_page))
				continue;

			/*
			 * Drop the page when the block became invalid while the
			 * node page was fetched, when its node info cannot be
			 * read, or when the NAT no longer points at this block.
			 */
			if (!check_valid_map(sbi, segno, off) ||
			    f2fs_get_node_info(sbi, nid, &ni, false) ||
			    ni.blk_addr != seg_base + off) {
				f2fs_put_page(node_page, 1);
				continue;
			}

			if (!f2fs_move_node_page(node_page, gc_type) && fggc)
				submitted++;
			stat_inc_node_blk_count(sbi, 1, gc_type);
		}
	}

	if (fggc)
		atomic_dec(&sbi->wb_sync_req[NODE]);
	return submitted;
}

/*
 * Calculate start block index indicating the given node offset.
 * Be careful, caller should give this node offset only indicating direct node
 * blocks. If any node offsets, which point the other types of node blocks such
 * as indirect or double indirect node blocks, are given, it must be a caller's
 * bug.
*/ block_t f2fs_start_bidx_of_node(unsigned int node_ofs, struct inode *inode) { unsigned int indirect_blks = 2 * NIDS_PER_BLOCK + 4; unsigned int bidx; if (node_ofs == 0) return 0; if (node_ofs <= 2) { bidx = node_ofs - 1; } else if (node_ofs <= indirect_blks) { int dec = (node_ofs - 4) / (NIDS_PER_BLOCK + 1); bidx = node_ofs - 2 - dec; } else { int dec = (node_ofs - indirect_blks - 3) / (NIDS_PER_BLOCK + 1); bidx = node_ofs - 5 - dec; } return bidx * ADDRS_PER_BLOCK(inode) + ADDRS_PER_INODE(inode); } static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, struct node_info *dni, block_t blkaddr, unsigned int *nofs) { struct page *node_page; nid_t nid; unsigned int ofs_in_node, max_addrs, base; block_t source_blkaddr; nid = le32_to_cpu(sum->nid); ofs_in_node = le16_to_cpu(sum->ofs_in_node); node_page = f2fs_get_node_page(sbi, nid); if (IS_ERR(node_page)) return false; if (f2fs_get_node_info(sbi, nid, dni, false)) { f2fs_put_page(node_page, 1); return false; } if (sum->version != dni->version) { f2fs_warn(sbi, "%s: valid data with mismatched node version.", __func__); set_sbi_flag(sbi, SBI_NEED_FSCK); } if (f2fs_check_nid_range(sbi, dni->ino)) { f2fs_put_page(node_page, 1); return false; } if (IS_INODE(node_page)) { base = offset_in_addr(F2FS_INODE(node_page)); max_addrs = DEF_ADDRS_PER_INODE; } else { base = 0; max_addrs = DEF_ADDRS_PER_BLOCK; } if (base + ofs_in_node >= max_addrs) { f2fs_err(sbi, "Inconsistent blkaddr offset: base:%u, ofs_in_node:%u, max:%u, ino:%u, nid:%u", base, ofs_in_node, max_addrs, dni->ino, dni->nid); f2fs_put_page(node_page, 1); return false; } *nofs = ofs_of_node(node_page); source_blkaddr = data_blkaddr(NULL, node_page, ofs_in_node); f2fs_put_page(node_page, 1); if (source_blkaddr != blkaddr) { #ifdef CONFIG_F2FS_CHECK_FS unsigned int segno = GET_SEGNO(sbi, blkaddr); unsigned long offset = GET_BLKOFF_FROM_SEG0(sbi, blkaddr); if (unlikely(check_valid_map(sbi, segno, offset))) { if (!test_and_set_bit(segno, 
SIT_I(sbi)->invalid_segmap)) { f2fs_err(sbi, "mismatched blkaddr %u (source_blkaddr %u) in seg %u", blkaddr, source_blkaddr, segno); set_sbi_flag(sbi, SBI_NEED_FSCK); } } #endif return false; } return true; } static int ra_data_block(struct inode *inode, pgoff_t index) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct address_space *mapping = f2fs_is_cow_file(inode) ? F2FS_I(inode)->atomic_inode->i_mapping : inode->i_mapping; struct dnode_of_data dn; struct page *page; struct f2fs_io_info fio = { .sbi = sbi, .ino = inode->i_ino, .type = DATA, .temp = COLD, .op = REQ_OP_READ, .op_flags = 0, .encrypted_page = NULL, .in_list = 0, }; int err; page = f2fs_grab_cache_page(mapping, index, true); if (!page) return -ENOMEM; if (f2fs_lookup_read_extent_cache_block(inode, index, &dn.data_blkaddr)) { if (unlikely(!f2fs_is_valid_blkaddr(sbi, dn.data_blkaddr, DATA_GENERIC_ENHANCE_READ))) { err = -EFSCORRUPTED; goto put_page; } goto got_it; } set_new_dnode(&dn, inode, NULL, NULL, 0); err = f2fs_get_dnode_of_data(&dn, index, LOOKUP_NODE); if (err) goto put_page; f2fs_put_dnode(&dn); if (!__is_valid_data_blkaddr(dn.data_blkaddr)) { err = -ENOENT; goto put_page; } if (unlikely(!f2fs_is_valid_blkaddr(sbi, dn.data_blkaddr, DATA_GENERIC_ENHANCE))) { err = -EFSCORRUPTED; goto put_page; } got_it: /* read page */ fio.page = page; fio.new_blkaddr = fio.old_blkaddr = dn.data_blkaddr; /* * don't cache encrypted data into meta inode until previous dirty * data were writebacked to avoid racing between GC and flush. 
*/ f2fs_wait_on_page_writeback(page, DATA, true, true); f2fs_wait_on_block_writeback(inode, dn.data_blkaddr); fio.encrypted_page = f2fs_pagecache_get_page(META_MAPPING(sbi), dn.data_blkaddr, FGP_LOCK | FGP_CREAT, GFP_NOFS); if (!fio.encrypted_page) { err = -ENOMEM; goto put_page; } err = f2fs_submit_page_bio(&fio); if (err) goto put_encrypted_page; f2fs_put_page(fio.encrypted_page, 0); f2fs_put_page(page, 1); f2fs_update_iostat(sbi, inode, FS_DATA_READ_IO, F2FS_BLKSIZE); f2fs_update_iostat(sbi, NULL, FS_GDATA_READ_IO, F2FS_BLKSIZE); return 0; put_encrypted_page: f2fs_put_page(fio.encrypted_page, 1); put_page: f2fs_put_page(page, 1); return err; } /* * Move data block via META_MAPPING while keeping locked data page. * This can be used to move blocks, aka LBAs, directly on disk. */ static int move_data_block(struct inode *inode, block_t bidx, int gc_type, unsigned int segno, int off) { struct address_space *mapping = f2fs_is_cow_file(inode) ? F2FS_I(inode)->atomic_inode->i_mapping : inode->i_mapping; struct f2fs_io_info fio = { .sbi = F2FS_I_SB(inode), .ino = inode->i_ino, .type = DATA, .temp = COLD, .op = REQ_OP_READ, .op_flags = 0, .encrypted_page = NULL, .in_list = 0, }; struct dnode_of_data dn; struct f2fs_summary sum; struct node_info ni; struct page *page, *mpage; block_t newaddr; int err = 0; bool lfs_mode = f2fs_lfs_mode(fio.sbi); int type = fio.sbi->am.atgc_enabled && (gc_type == BG_GC) && (fio.sbi->gc_mode != GC_URGENT_HIGH) ? 
CURSEG_ALL_DATA_ATGC : CURSEG_COLD_DATA; /* do not read out */ page = f2fs_grab_cache_page(mapping, bidx, false); if (!page) return -ENOMEM; if (!check_valid_map(F2FS_I_SB(inode), segno, off)) { err = -ENOENT; goto out; } err = f2fs_gc_pinned_control(inode, gc_type, segno); if (err) goto out; set_new_dnode(&dn, inode, NULL, NULL, 0); err = f2fs_get_dnode_of_data(&dn, bidx, LOOKUP_NODE); if (err) goto out; if (unlikely(dn.data_blkaddr == NULL_ADDR)) { ClearPageUptodate(page); err = -ENOENT; goto put_out; } /* * don't cache encrypted data into meta inode until previous dirty * data were writebacked to avoid racing between GC and flush. */ f2fs_wait_on_page_writeback(page, DATA, true, true); f2fs_wait_on_block_writeback(inode, dn.data_blkaddr); err = f2fs_get_node_info(fio.sbi, dn.nid, &ni, false); if (err) goto put_out; /* read page */ fio.page = page; fio.new_blkaddr = fio.old_blkaddr = dn.data_blkaddr; if (lfs_mode) f2fs_down_write(&fio.sbi->io_order_lock); mpage = f2fs_grab_cache_page(META_MAPPING(fio.sbi), fio.old_blkaddr, false); if (!mpage) { err = -ENOMEM; goto up_out; } fio.encrypted_page = mpage; /* read source block in mpage */ if (!PageUptodate(mpage)) { err = f2fs_submit_page_bio(&fio); if (err) { f2fs_put_page(mpage, 1); goto up_out; } f2fs_update_iostat(fio.sbi, inode, FS_DATA_READ_IO, F2FS_BLKSIZE); f2fs_update_iostat(fio.sbi, NULL, FS_GDATA_READ_IO, F2FS_BLKSIZE); lock_page(mpage); if (unlikely(mpage->mapping != META_MAPPING(fio.sbi) || !PageUptodate(mpage))) { err = -EIO; f2fs_put_page(mpage, 1); goto up_out; } } set_summary(&sum, dn.nid, dn.ofs_in_node, ni.version); /* allocate block address */ err = f2fs_allocate_data_block(fio.sbi, NULL, fio.old_blkaddr, &newaddr, &sum, type, NULL); if (err) { f2fs_put_page(mpage, 1); /* filesystem should shutdown, no need to recovery block */ goto up_out; } fio.encrypted_page = f2fs_pagecache_get_page(META_MAPPING(fio.sbi), newaddr, FGP_LOCK | FGP_CREAT, GFP_NOFS); if (!fio.encrypted_page) { err = -ENOMEM; 
f2fs_put_page(mpage, 1); goto recover_block; } /* write target block */ f2fs_wait_on_page_writeback(fio.encrypted_page, DATA, true, true); memcpy(page_address(fio.encrypted_page), page_address(mpage), PAGE_SIZE); f2fs_put_page(mpage, 1); f2fs_invalidate_internal_cache(fio.sbi, fio.old_blkaddr, 1); set_page_dirty(fio.encrypted_page); if (clear_page_dirty_for_io(fio.encrypted_page)) dec_page_count(fio.sbi, F2FS_DIRTY_META); set_page_writeback(fio.encrypted_page); fio.op = REQ_OP_WRITE; fio.op_flags = REQ_SYNC; fio.new_blkaddr = newaddr; f2fs_submit_page_write(&fio); f2fs_update_iostat(fio.sbi, NULL, FS_GC_DATA_IO, F2FS_BLKSIZE); f2fs_update_data_blkaddr(&dn, newaddr); set_inode_flag(inode, FI_APPEND_WRITE); f2fs_put_page(fio.encrypted_page, 1); recover_block: if (err) f2fs_do_replace_block(fio.sbi, &sum, newaddr, fio.old_blkaddr, true, true, true); up_out: if (lfs_mode) f2fs_up_write(&fio.sbi->io_order_lock); put_out: f2fs_put_dnode(&dn); out: f2fs_put_page(page, 1); return err; } static int move_data_page(struct inode *inode, block_t bidx, int gc_type, unsigned int segno, int off) { struct page *page; int err = 0; page = f2fs_get_lock_data_page(inode, bidx, true); if (IS_ERR(page)) return PTR_ERR(page); if (!check_valid_map(F2FS_I_SB(inode), segno, off)) { err = -ENOENT; goto out; } err = f2fs_gc_pinned_control(inode, gc_type, segno); if (err) goto out; if (gc_type == BG_GC) { if (folio_test_writeback(page_folio(page))) { err = -EAGAIN; goto out; } set_page_dirty(page); set_page_private_gcing(page); } else { struct f2fs_io_info fio = { .sbi = F2FS_I_SB(inode), .ino = inode->i_ino, .type = DATA, .temp = COLD, .op = REQ_OP_WRITE, .op_flags = REQ_SYNC, .old_blkaddr = NULL_ADDR, .page = page, .encrypted_page = NULL, .need_lock = LOCK_REQ, .io_type = FS_GC_DATA_IO, }; bool is_dirty = PageDirty(page); retry: f2fs_wait_on_page_writeback(page, DATA, true, true); set_page_dirty(page); if (clear_page_dirty_for_io(page)) { inode_dec_dirty_pages(inode); 
f2fs_remove_dirty_inode(inode); } set_page_private_gcing(page); err = f2fs_do_write_data_page(&fio); if (err) { clear_page_private_gcing(page); if (err == -ENOMEM) { memalloc_retry_wait(GFP_NOFS); goto retry; } if (is_dirty) set_page_dirty(page); } } out: f2fs_put_page(page, 1); return err; } /* * This function tries to get parent node of victim data block, and identifies * data block validity. If the block is valid, copy that with cold status and * modify parent node. * If the parent node is not valid or the data block address is different, * the victim data block is ignored. */ static int gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, struct gc_inode_list *gc_list, unsigned int segno, int gc_type, bool force_migrate) { struct super_block *sb = sbi->sb; struct f2fs_summary *entry; block_t start_addr; int off; int phase = 0; int submitted = 0; unsigned int usable_blks_in_seg = f2fs_usable_blks_in_seg(sbi, segno); start_addr = START_BLOCK(sbi, segno); next_step: entry = sum; for (off = 0; off < usable_blks_in_seg; off++, entry++) { struct page *data_page; struct inode *inode; struct node_info dni; /* dnode info for the data */ unsigned int ofs_in_node, nofs; block_t start_bidx; nid_t nid = le32_to_cpu(entry->nid); /* * stop BG_GC if there is not enough free sections. * Or, stop GC if the segment becomes fully valid caused by * race condition along with SSR block allocation. 
*/ if ((gc_type == BG_GC && has_not_enough_free_secs(sbi, 0, 0)) || (!force_migrate && get_valid_blocks(sbi, segno, true) == CAP_BLKS_PER_SEC(sbi))) return submitted; if (check_valid_map(sbi, segno, off) == 0) continue; if (phase == 0) { f2fs_ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nid), 1, META_NAT, true); continue; } if (phase == 1) { f2fs_ra_node_page(sbi, nid); continue; } /* Get an inode by ino with checking validity */ if (!is_alive(sbi, entry, &dni, start_addr + off, &nofs)) continue; if (phase == 2) { f2fs_ra_node_page(sbi, dni.ino); continue; } ofs_in_node = le16_to_cpu(entry->ofs_in_node); if (phase == 3) { int err; inode = f2fs_iget(sb, dni.ino); if (IS_ERR(inode)) continue; if (is_bad_inode(inode) || special_file(inode->i_mode)) { iput(inode); continue; } if (f2fs_has_inline_data(inode)) { iput(inode); set_sbi_flag(sbi, SBI_NEED_FSCK); f2fs_err_ratelimited(sbi, "inode %lx has both inline_data flag and " "data block, nid=%u, ofs_in_node=%u", inode->i_ino, dni.nid, ofs_in_node); continue; } err = f2fs_gc_pinned_control(inode, gc_type, segno); if (err == -EAGAIN) { iput(inode); return submitted; } if (!f2fs_down_write_trylock( &F2FS_I(inode)->i_gc_rwsem[WRITE])) { iput(inode); sbi->skipped_gc_rwsem++; continue; } start_bidx = f2fs_start_bidx_of_node(nofs, inode) + ofs_in_node; if (f2fs_meta_inode_gc_required(inode)) { int err = ra_data_block(inode, start_bidx); f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); if (err) { iput(inode); continue; } add_gc_inode(gc_list, inode); continue; } data_page = f2fs_get_read_data_page(inode, start_bidx, REQ_RAHEAD, true, NULL); f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); if (IS_ERR(data_page)) { iput(inode); continue; } f2fs_put_page(data_page, 0); add_gc_inode(gc_list, inode); continue; } /* phase 4 */ inode = find_gc_inode(gc_list, dni.ino); if (inode) { struct f2fs_inode_info *fi = F2FS_I(inode); bool locked = false; int err; if (S_ISREG(inode->i_mode)) { if (!f2fs_down_write_trylock(&fi->i_gc_rwsem[WRITE])) { 
sbi->skipped_gc_rwsem++; continue; } if (!f2fs_down_write_trylock( &fi->i_gc_rwsem[READ])) { sbi->skipped_gc_rwsem++; f2fs_up_write(&fi->i_gc_rwsem[WRITE]); continue; } locked = true; /* wait for all inflight aio data */ inode_dio_wait(inode); } start_bidx = f2fs_start_bidx_of_node(nofs, inode) + ofs_in_node; if (f2fs_meta_inode_gc_required(inode)) err = move_data_block(inode, start_bidx, gc_type, segno, off); else err = move_data_page(inode, start_bidx, gc_type, segno, off); if (!err && (gc_type == FG_GC || f2fs_meta_inode_gc_required(inode))) submitted++; if (locked) { f2fs_up_write(&fi->i_gc_rwsem[READ]); f2fs_up_write(&fi->i_gc_rwsem[WRITE]); } stat_inc_data_blk_count(sbi, 1, gc_type); } } if (++phase < 5) goto next_step; return submitted; } static int __get_victim(struct f2fs_sb_info *sbi, unsigned int *victim, int gc_type, bool one_time) { struct sit_info *sit_i = SIT_I(sbi); int ret; down_write(&sit_i->sentry_lock); ret = f2fs_get_victim(sbi, victim, gc_type, NO_CHECK_TYPE, LFS, 0, one_time); up_write(&sit_i->sentry_lock); return ret; } static int do_garbage_collect(struct f2fs_sb_info *sbi, unsigned int start_segno, struct gc_inode_list *gc_list, int gc_type, bool force_migrate, bool one_time) { struct page *sum_page; struct f2fs_summary_block *sum; struct blk_plug plug; unsigned int segno = start_segno; unsigned int end_segno = start_segno + SEGS_PER_SEC(sbi); unsigned int sec_end_segno; int seg_freed = 0, migrated = 0; unsigned char type = IS_DATASEG(get_seg_entry(sbi, segno)->type) ? SUM_TYPE_DATA : SUM_TYPE_NODE; unsigned char data_type = (type == SUM_TYPE_DATA) ? 
DATA : NODE; int submitted = 0; if (__is_large_section(sbi)) { sec_end_segno = rounddown(end_segno, SEGS_PER_SEC(sbi)); /* * zone-capacity can be less than zone-size in zoned devices, * resulting in less than expected usable segments in the zone, * calculate the end segno in the zone which can be garbage * collected */ if (f2fs_sb_has_blkzoned(sbi)) sec_end_segno -= SEGS_PER_SEC(sbi) - f2fs_usable_segs_in_sec(sbi); if (gc_type == BG_GC || one_time) { unsigned int window_granularity = sbi->migration_window_granularity; if (f2fs_sb_has_blkzoned(sbi) && !has_enough_free_blocks(sbi, sbi->gc_thread->boost_zoned_gc_percent)) window_granularity *= BOOST_GC_MULTIPLE; end_segno = start_segno + window_granularity; } if (end_segno > sec_end_segno) end_segno = sec_end_segno; } sanity_check_seg_type(sbi, get_seg_entry(sbi, segno)->type); /* readahead multi ssa blocks those have contiguous address */ if (__is_large_section(sbi)) f2fs_ra_meta_pages(sbi, GET_SUM_BLOCK(sbi, segno), end_segno - segno, META_SSA, true); /* reference all summary page */ while (segno < end_segno) { sum_page = f2fs_get_sum_page(sbi, segno++); if (IS_ERR(sum_page)) { int err = PTR_ERR(sum_page); end_segno = segno - 1; for (segno = start_segno; segno < end_segno; segno++) { sum_page = find_get_page(META_MAPPING(sbi), GET_SUM_BLOCK(sbi, segno)); f2fs_put_page(sum_page, 0); f2fs_put_page(sum_page, 0); } return err; } unlock_page(sum_page); } blk_start_plug(&plug); for (segno = start_segno; segno < end_segno; segno++) { /* find segment summary of victim */ sum_page = find_get_page(META_MAPPING(sbi), GET_SUM_BLOCK(sbi, segno)); f2fs_put_page(sum_page, 0); if (get_valid_blocks(sbi, segno, false) == 0) goto freed; if (gc_type == BG_GC && __is_large_section(sbi) && migrated >= sbi->migration_granularity) goto skip; if (!PageUptodate(sum_page) || unlikely(f2fs_cp_error(sbi))) goto skip; sum = page_address(sum_page); if (type != GET_SUM_TYPE((&sum->footer))) { f2fs_err(sbi, "Inconsistent segment (%u) type [%d, %d] 
in SSA and SIT", segno, type, GET_SUM_TYPE((&sum->footer))); f2fs_stop_checkpoint(sbi, false, STOP_CP_REASON_CORRUPTED_SUMMARY); goto skip; } /* * this is to avoid deadlock: * - lock_page(sum_page) - f2fs_replace_block * - check_valid_map() - down_write(sentry_lock) * - down_read(sentry_lock) - change_curseg() * - lock_page(sum_page) */ if (type == SUM_TYPE_NODE) submitted += gc_node_segment(sbi, sum->entries, segno, gc_type); else submitted += gc_data_segment(sbi, sum->entries, gc_list, segno, gc_type, force_migrate); stat_inc_gc_seg_count(sbi, data_type, gc_type); sbi->gc_reclaimed_segs[sbi->gc_mode]++; migrated++; freed: if (gc_type == FG_GC && get_valid_blocks(sbi, segno, false) == 0) seg_freed++; if (__is_large_section(sbi)) sbi->next_victim_seg[gc_type] = (segno + 1 < sec_end_segno) ? segno + 1 : NULL_SEGNO; skip: f2fs_put_page(sum_page, 0); } if (submitted) f2fs_submit_merged_write(sbi, data_type); blk_finish_plug(&plug); if (migrated) stat_inc_gc_sec_count(sbi, data_type, gc_type); return seg_freed; } int f2fs_gc(struct f2fs_sb_info *sbi, struct f2fs_gc_control *gc_control) { int gc_type = gc_control->init_gc_type; unsigned int segno = gc_control->victim_segno; int sec_freed = 0, seg_freed = 0, total_freed = 0, total_sec_freed = 0; int ret = 0; struct cp_control cpc; struct gc_inode_list gc_list = { .ilist = LIST_HEAD_INIT(gc_list.ilist), .iroot = RADIX_TREE_INIT(gc_list.iroot, GFP_NOFS), }; unsigned int skipped_round = 0, round = 0; unsigned int upper_secs; trace_f2fs_gc_begin(sbi->sb, gc_type, gc_control->no_bg_gc, gc_control->nr_free_secs, get_pages(sbi, F2FS_DIRTY_NODES), get_pages(sbi, F2FS_DIRTY_DENTS), get_pages(sbi, F2FS_DIRTY_IMETA), free_sections(sbi), free_segments(sbi), reserved_segments(sbi), prefree_segments(sbi)); cpc.reason = __get_cp_reason(sbi); gc_more: sbi->skipped_gc_rwsem = 0; if (unlikely(!(sbi->sb->s_flags & SB_ACTIVE))) { ret = -EINVAL; goto stop; } if (unlikely(f2fs_cp_error(sbi))) { ret = -EIO; goto stop; } /* Let's run FG_GC, if 
we don't have enough space. */ if (has_not_enough_free_secs(sbi, 0, 0)) { gc_type = FG_GC; /* * For example, if there are many prefree_segments below given * threshold, we can make them free by checkpoint. Then, we * secure free segments which doesn't need fggc any more. */ if (prefree_segments(sbi)) { stat_inc_cp_call_count(sbi, TOTAL_CALL); ret = f2fs_write_checkpoint(sbi, &cpc); if (ret) goto stop; /* Reset due to checkpoint */ sec_freed = 0; } } /* f2fs_balance_fs doesn't need to do BG_GC in critical path. */ if (gc_type == BG_GC && gc_control->no_bg_gc) { ret = -EINVAL; goto stop; } retry: ret = __get_victim(sbi, &segno, gc_type, gc_control->one_time); if (ret) { /* allow to search victim from sections has pinned data */ if (ret == -ENODATA && gc_type == FG_GC && f2fs_pinned_section_exists(DIRTY_I(sbi))) { f2fs_unpin_all_sections(sbi, false); goto retry; } goto stop; } seg_freed = do_garbage_collect(sbi, segno, &gc_list, gc_type, gc_control->should_migrate_blocks, gc_control->one_time); if (seg_freed < 0) goto stop; total_freed += seg_freed; if (seg_freed == f2fs_usable_segs_in_sec(sbi)) { sec_freed++; total_sec_freed++; } if (gc_control->one_time) goto stop; if (gc_type == FG_GC) { sbi->cur_victim_sec = NULL_SEGNO; if (has_enough_free_secs(sbi, sec_freed, 0)) { if (!gc_control->no_bg_gc && total_sec_freed < gc_control->nr_free_secs) goto go_gc_more; goto stop; } if (sbi->skipped_gc_rwsem) skipped_round++; round++; if (skipped_round > MAX_SKIP_GC_COUNT && skipped_round * 2 >= round) { stat_inc_cp_call_count(sbi, TOTAL_CALL); ret = f2fs_write_checkpoint(sbi, &cpc); goto stop; } } else if (has_enough_free_secs(sbi, 0, 0)) { goto stop; } __get_secs_required(sbi, NULL, &upper_secs, NULL); /* * Write checkpoint to reclaim prefree segments. * We need more three extra sections for writer's data/node/dentry. 
*/ if (free_sections(sbi) <= upper_secs + NR_GC_CHECKPOINT_SECS && prefree_segments(sbi)) { stat_inc_cp_call_count(sbi, TOTAL_CALL); ret = f2fs_write_checkpoint(sbi, &cpc); if (ret) goto stop; /* Reset due to checkpoint */ sec_freed = 0; } go_gc_more: segno = NULL_SEGNO; goto gc_more; stop: SIT_I(sbi)->last_victim[ALLOC_NEXT] = 0; SIT_I(sbi)->last_victim[FLUSH_DEVICE] = gc_control->victim_segno; if (gc_type == FG_GC) f2fs_unpin_all_sections(sbi, true); trace_f2fs_gc_end(sbi->sb, ret, total_freed, total_sec_freed, get_pages(sbi, F2FS_DIRTY_NODES), get_pages(sbi, F2FS_DIRTY_DENTS), get_pages(sbi, F2FS_DIRTY_IMETA), free_sections(sbi), free_segments(sbi), reserved_segments(sbi), prefree_segments(sbi)); f2fs_up_write(&sbi->gc_lock); put_gc_inode(&gc_list); if (gc_control->err_gc_skipped && !ret) ret = total_sec_freed ? 0 : -EAGAIN; return ret; } int __init f2fs_create_garbage_collection_cache(void) { victim_entry_slab = f2fs_kmem_cache_create("f2fs_victim_entry", sizeof(struct victim_entry)); return victim_entry_slab ? 
0 : -ENOMEM; } void f2fs_destroy_garbage_collection_cache(void) { kmem_cache_destroy(victim_entry_slab); } static void init_atgc_management(struct f2fs_sb_info *sbi) { struct atgc_management *am = &sbi->am; if (test_opt(sbi, ATGC) && SIT_I(sbi)->elapsed_time >= DEF_GC_THREAD_AGE_THRESHOLD) am->atgc_enabled = true; am->root = RB_ROOT_CACHED; INIT_LIST_HEAD(&am->victim_list); am->victim_count = 0; am->candidate_ratio = DEF_GC_THREAD_CANDIDATE_RATIO; am->max_candidate_count = DEF_GC_THREAD_MAX_CANDIDATE_COUNT; am->age_weight = DEF_GC_THREAD_AGE_WEIGHT; am->age_threshold = DEF_GC_THREAD_AGE_THRESHOLD; } void f2fs_build_gc_manager(struct f2fs_sb_info *sbi) { sbi->gc_pin_file_threshold = DEF_GC_FAILED_PINNED_FILES; /* give warm/cold data area from slower device */ if (f2fs_is_multi_device(sbi) && !__is_large_section(sbi)) SIT_I(sbi)->last_victim[ALLOC_NEXT] = GET_SEGNO(sbi, FDEV(0).end_blk) + 1; init_atgc_management(sbi); } int f2fs_gc_range(struct f2fs_sb_info *sbi, unsigned int start_seg, unsigned int end_seg, bool dry_run, unsigned int dry_run_sections) { unsigned int segno; unsigned int gc_secs = dry_run_sections; if (unlikely(f2fs_cp_error(sbi))) return -EIO; for (segno = start_seg; segno <= end_seg; segno += SEGS_PER_SEC(sbi)) { struct gc_inode_list gc_list = { .ilist = LIST_HEAD_INIT(gc_list.ilist), .iroot = RADIX_TREE_INIT(gc_list.iroot, GFP_NOFS), }; do_garbage_collect(sbi, segno, &gc_list, FG_GC, true, false); put_gc_inode(&gc_list); if (!dry_run && get_valid_blocks(sbi, segno, true)) return -EAGAIN; if (dry_run && dry_run_sections && !get_valid_blocks(sbi, segno, true) && --gc_secs == 0) break; if (fatal_signal_pending(current)) return -ERESTARTSYS; } return 0; } static int free_segment_range(struct f2fs_sb_info *sbi, unsigned int secs, bool dry_run) { unsigned int next_inuse, start, end; struct cp_control cpc = { CP_RESIZE, 0, 0, 0 }; int gc_mode, gc_type; int err = 0; int type; /* Force block allocation for GC */ MAIN_SECS(sbi) -= secs; start = 
MAIN_SECS(sbi) * SEGS_PER_SEC(sbi); end = MAIN_SEGS(sbi) - 1; mutex_lock(&DIRTY_I(sbi)->seglist_lock); for (gc_mode = 0; gc_mode < MAX_GC_POLICY; gc_mode++) if (SIT_I(sbi)->last_victim[gc_mode] >= start) SIT_I(sbi)->last_victim[gc_mode] = 0; for (gc_type = BG_GC; gc_type <= FG_GC; gc_type++) if (sbi->next_victim_seg[gc_type] >= start) sbi->next_victim_seg[gc_type] = NULL_SEGNO; mutex_unlock(&DIRTY_I(sbi)->seglist_lock); /* Move out cursegs from the target range */ for (type = CURSEG_HOT_DATA; type < NR_CURSEG_PERSIST_TYPE; type++) { err = f2fs_allocate_segment_for_resize(sbi, type, start, end); if (err) goto out; } /* do GC to move out valid blocks in the range */ err = f2fs_gc_range(sbi, start, end, dry_run, 0); if (err || dry_run) goto out; stat_inc_cp_call_count(sbi, TOTAL_CALL); err = f2fs_write_checkpoint(sbi, &cpc); if (err) goto out; next_inuse = find_next_inuse(FREE_I(sbi), end + 1, start); if (next_inuse <= end) { f2fs_err(sbi, "segno %u should be free but still inuse!", next_inuse); f2fs_bug_on(sbi, 1); } out: MAIN_SECS(sbi) += secs; return err; } static void update_sb_metadata(struct f2fs_sb_info *sbi, int secs) { struct f2fs_super_block *raw_sb = F2FS_RAW_SUPER(sbi); int section_count; int segment_count; int segment_count_main; long long block_count; int segs = secs * SEGS_PER_SEC(sbi); f2fs_down_write(&sbi->sb_lock); section_count = le32_to_cpu(raw_sb->section_count); segment_count = le32_to_cpu(raw_sb->segment_count); segment_count_main = le32_to_cpu(raw_sb->segment_count_main); block_count = le64_to_cpu(raw_sb->block_count); raw_sb->section_count = cpu_to_le32(section_count + secs); raw_sb->segment_count = cpu_to_le32(segment_count + segs); raw_sb->segment_count_main = cpu_to_le32(segment_count_main + segs); raw_sb->block_count = cpu_to_le64(block_count + (long long)SEGS_TO_BLKS(sbi, segs)); if (f2fs_is_multi_device(sbi)) { int last_dev = sbi->s_ndevs - 1; int dev_segs = le32_to_cpu(raw_sb->devs[last_dev].total_segments); 
raw_sb->devs[last_dev].total_segments = cpu_to_le32(dev_segs + segs); } f2fs_up_write(&sbi->sb_lock); } static void update_fs_metadata(struct f2fs_sb_info *sbi, int secs) { int segs = secs * SEGS_PER_SEC(sbi); long long blks = SEGS_TO_BLKS(sbi, segs); long long user_block_count = le64_to_cpu(F2FS_CKPT(sbi)->user_block_count); SM_I(sbi)->segment_count = (int)SM_I(sbi)->segment_count + segs; MAIN_SEGS(sbi) = (int)MAIN_SEGS(sbi) + segs; MAIN_SECS(sbi) += secs; FREE_I(sbi)->free_sections = (int)FREE_I(sbi)->free_sections + secs; FREE_I(sbi)->free_segments = (int)FREE_I(sbi)->free_segments + segs; F2FS_CKPT(sbi)->user_block_count = cpu_to_le64(user_block_count + blks); if (f2fs_is_multi_device(sbi)) { int last_dev = sbi->s_ndevs - 1; FDEV(last_dev).total_segments = (int)FDEV(last_dev).total_segments + segs; FDEV(last_dev).end_blk = (long long)FDEV(last_dev).end_blk + blks; #ifdef CONFIG_BLK_DEV_ZONED FDEV(last_dev).nr_blkz = FDEV(last_dev).nr_blkz + div_u64(blks, sbi->blocks_per_blkz); #endif } } int f2fs_resize_fs(struct file *filp, __u64 block_count) { struct f2fs_sb_info *sbi = F2FS_I_SB(file_inode(filp)); __u64 old_block_count, shrunk_blocks; struct cp_control cpc = { CP_RESIZE, 0, 0, 0 }; unsigned int secs; int err = 0; __u32 rem; old_block_count = le64_to_cpu(F2FS_RAW_SUPER(sbi)->block_count); if (block_count > old_block_count) return -EINVAL; if (f2fs_is_multi_device(sbi)) { int last_dev = sbi->s_ndevs - 1; __u64 last_segs = FDEV(last_dev).total_segments; if (block_count + SEGS_TO_BLKS(sbi, last_segs) <= old_block_count) return -EINVAL; } /* new fs size should align to section size */ div_u64_rem(block_count, BLKS_PER_SEC(sbi), &rem); if (rem) return -EINVAL; if (block_count == old_block_count) return 0; if (is_sbi_flag_set(sbi, SBI_NEED_FSCK)) { f2fs_err(sbi, "Should run fsck to repair first."); return -EFSCORRUPTED; } if (test_opt(sbi, DISABLE_CHECKPOINT)) { f2fs_err(sbi, "Checkpoint should be enabled."); return -EINVAL; } err = mnt_want_write_file(filp); if 
(err) return err; shrunk_blocks = old_block_count - block_count; secs = div_u64(shrunk_blocks, BLKS_PER_SEC(sbi)); /* stop other GC */ if (!f2fs_down_write_trylock(&sbi->gc_lock)) { err = -EAGAIN; goto out_drop_write; } /* stop CP to protect MAIN_SEC in free_segment_range */ f2fs_lock_op(sbi); spin_lock(&sbi->stat_lock); if (shrunk_blocks + valid_user_blocks(sbi) + sbi->current_reserved_blocks + sbi->unusable_block_count + F2FS_OPTION(sbi).root_reserved_blocks > sbi->user_block_count) err = -ENOSPC; spin_unlock(&sbi->stat_lock); if (err) goto out_unlock; err = free_segment_range(sbi, secs, true); out_unlock: f2fs_unlock_op(sbi); f2fs_up_write(&sbi->gc_lock); out_drop_write: mnt_drop_write_file(filp); if (err) return err; err = freeze_super(sbi->sb, FREEZE_HOLDER_USERSPACE); if (err) return err; if (f2fs_readonly(sbi->sb)) { err = thaw_super(sbi->sb, FREEZE_HOLDER_USERSPACE); if (err) return err; return -EROFS; } f2fs_down_write(&sbi->gc_lock); f2fs_down_write(&sbi->cp_global_sem); spin_lock(&sbi->stat_lock); if (shrunk_blocks + valid_user_blocks(sbi) + sbi->current_reserved_blocks + sbi->unusable_block_count + F2FS_OPTION(sbi).root_reserved_blocks > sbi->user_block_count) err = -ENOSPC; else sbi->user_block_count -= shrunk_blocks; spin_unlock(&sbi->stat_lock); if (err) goto out_err; set_sbi_flag(sbi, SBI_IS_RESIZEFS); err = free_segment_range(sbi, secs, false); if (err) goto recover_out; update_sb_metadata(sbi, -secs); err = f2fs_commit_super(sbi, false); if (err) { update_sb_metadata(sbi, secs); goto recover_out; } update_fs_metadata(sbi, -secs); clear_sbi_flag(sbi, SBI_IS_RESIZEFS); set_sbi_flag(sbi, SBI_IS_DIRTY); stat_inc_cp_call_count(sbi, TOTAL_CALL); err = f2fs_write_checkpoint(sbi, &cpc); if (err) { update_fs_metadata(sbi, secs); update_sb_metadata(sbi, secs); f2fs_commit_super(sbi, false); } recover_out: clear_sbi_flag(sbi, SBI_IS_RESIZEFS); if (err) { set_sbi_flag(sbi, SBI_NEED_FSCK); f2fs_err(sbi, "resize_fs failed, should run fsck to repair!"); 
spin_lock(&sbi->stat_lock); sbi->user_block_count += shrunk_blocks; spin_unlock(&sbi->stat_lock); } out_err: f2fs_up_write(&sbi->cp_global_sem); f2fs_up_write(&sbi->gc_lock); thaw_super(sbi->sb, FREEZE_HOLDER_USERSPACE); return err; }
436 101 424 118 437 1367 603 1322 1321 7 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 // SPDX-License-Identifier: GPL-2.0-only /* * x86-optimized CRC32 functions * * Copyright (C) 2008 Intel Corporation * Copyright 2012 Xyratex Technology Limited * Copyright 2024 Google LLC */ #include <asm/cpufeatures.h> #include <asm/simd.h> #include <crypto/internal/simd.h> #include <linux/crc32.h> #include <linux/linkage.h> #include <linux/module.h> /* minimum size of buffer for crc32_pclmul_le_16 */ #define CRC32_PCLMUL_MIN_LEN 64 static DEFINE_STATIC_KEY_FALSE(have_crc32); static DEFINE_STATIC_KEY_FALSE(have_pclmulqdq); u32 crc32_pclmul_le_16(u32 crc, const u8 *buffer, size_t len); u32 crc32_le_arch(u32 crc, const u8 *p, size_t len) { if (len >= CRC32_PCLMUL_MIN_LEN + 15 && static_branch_likely(&have_pclmulqdq) && crypto_simd_usable()) { size_t n = -(uintptr_t)p & 15; /* align p to 16-byte boundary */ if (n) { crc = crc32_le_base(crc, p, n); p += n; len -= n; } n = round_down(len, 16); kernel_fpu_begin(); crc = crc32_pclmul_le_16(crc, p, n); kernel_fpu_end(); p += n; len -= n; } if (len) crc = crc32_le_base(crc, p, len); return crc; } EXPORT_SYMBOL(crc32_le_arch); #ifdef CONFIG_X86_64 #define CRC32_INST "crc32q %1, %q0" #else #define CRC32_INST "crc32l %1, %0" #endif /* * Use carryless multiply version of crc32c when buffer size is >= 512 to * account for FPU state save/restore overhead. 
*/ #define CRC32C_PCLMUL_BREAKEVEN 512 asmlinkage u32 crc32c_x86_3way(u32 crc, const u8 *buffer, size_t len); u32 crc32c_le_arch(u32 crc, const u8 *p, size_t len) { size_t num_longs; if (!static_branch_likely(&have_crc32)) return crc32c_le_base(crc, p, len); if (IS_ENABLED(CONFIG_X86_64) && len >= CRC32C_PCLMUL_BREAKEVEN && static_branch_likely(&have_pclmulqdq) && crypto_simd_usable()) { kernel_fpu_begin(); crc = crc32c_x86_3way(crc, p, len); kernel_fpu_end(); return crc; } for (num_longs = len / sizeof(unsigned long); num_longs != 0; num_longs--, p += sizeof(unsigned long)) asm(CRC32_INST : "+r" (crc) : "rm" (*(unsigned long *)p)); for (len %= sizeof(unsigned long); len; len--, p++) asm("crc32b %1, %0" : "+r" (crc) : "rm" (*p)); return crc; } EXPORT_SYMBOL(crc32c_le_arch); u32 crc32_be_arch(u32 crc, const u8 *p, size_t len) { return crc32_be_base(crc, p, len); } EXPORT_SYMBOL(crc32_be_arch); static int __init crc32_x86_init(void) { if (boot_cpu_has(X86_FEATURE_XMM4_2)) static_branch_enable(&have_crc32); if (boot_cpu_has(X86_FEATURE_PCLMULQDQ)) static_branch_enable(&have_pclmulqdq); return 0; } arch_initcall(crc32_x86_init); static void __exit crc32_x86_exit(void) { } module_exit(crc32_x86_exit); u32 crc32_optimizations(void) { u32 optimizations = 0; if (static_key_enabled(&have_crc32)) optimizations |= CRC32C_OPTIMIZATION; if (static_key_enabled(&have_pclmulqdq)) optimizations |= CRC32_LE_OPTIMIZATION; return optimizations; } EXPORT_SYMBOL(crc32_optimizations); MODULE_DESCRIPTION("x86-optimized CRC32 functions"); MODULE_LICENSE("GPL");
2 2 3 2 2 5 1 1 1 2 1 2 2 1 4 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 2 1 1 4 3 1 1 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 
509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 // SPDX-License-Identifier: GPL-2.0 /* * Tty port functions */ #include <linux/types.h> #include <linux/errno.h> #include <linux/tty.h> #include <linux/tty_driver.h> #include <linux/tty_flip.h> #include <linux/serial.h> #include <linux/timer.h> #include <linux/string.h> #include <linux/slab.h> #include <linux/sched/signal.h> #include <linux/wait.h> #include <linux/bitops.h> #include <linux/delay.h> #include <linux/module.h> #include <linux/serdev.h> #include "tty.h" static size_t tty_port_default_receive_buf(struct tty_port *port, const u8 *p, const u8 *f, size_t count) { struct tty_struct *tty; struct tty_ldisc *ld; tty = READ_ONCE(port->itty); if (!tty) return 0; ld = tty_ldisc_ref(tty); if (!ld) return 0; count = tty_ldisc_receive_buf(ld, p, f, count); tty_ldisc_deref(ld); return count; } static void 
tty_port_default_lookahead_buf(struct tty_port *port, const u8 *p, const u8 *f, size_t count) { struct tty_struct *tty; struct tty_ldisc *ld; tty = READ_ONCE(port->itty); if (!tty) return; ld = tty_ldisc_ref(tty); if (!ld) return; if (ld->ops->lookahead_buf) ld->ops->lookahead_buf(ld->tty, p, f, count); tty_ldisc_deref(ld); } static void tty_port_default_wakeup(struct tty_port *port) { struct tty_struct *tty = tty_port_tty_get(port); if (tty) { tty_wakeup(tty); tty_kref_put(tty); } } const struct tty_port_client_operations tty_port_default_client_ops = { .receive_buf = tty_port_default_receive_buf, .lookahead_buf = tty_port_default_lookahead_buf, .write_wakeup = tty_port_default_wakeup, }; EXPORT_SYMBOL_GPL(tty_port_default_client_ops); /** * tty_port_init - initialize tty_port * @port: tty_port to initialize * * Initializes the state of struct tty_port. When a port was initialized using * this function, one has to destroy the port by tty_port_destroy(). Either * indirectly by using &tty_port refcounting (tty_port_put()) or directly if * refcounting is not used. */ void tty_port_init(struct tty_port *port) { memset(port, 0, sizeof(*port)); tty_buffer_init(port); init_waitqueue_head(&port->open_wait); init_waitqueue_head(&port->delta_msr_wait); mutex_init(&port->mutex); mutex_init(&port->buf_mutex); spin_lock_init(&port->lock); port->close_delay = (50 * HZ) / 100; port->closing_wait = (3000 * HZ) / 100; port->client_ops = &tty_port_default_client_ops; kref_init(&port->kref); } EXPORT_SYMBOL(tty_port_init); /** * tty_port_link_device - link tty and tty_port * @port: tty_port of the device * @driver: tty_driver for this device * @index: index of the tty * * Provide the tty layer with a link from a tty (specified by @index) to a * tty_port (@port). Use this only if neither tty_port_register_device() nor * tty_port_install() is used in the driver. If used, this has to be called * before tty_register_driver(). 
*/ void tty_port_link_device(struct tty_port *port, struct tty_driver *driver, unsigned index) { if (WARN_ON(index >= driver->num)) return; driver->ports[index] = port; } EXPORT_SYMBOL_GPL(tty_port_link_device); /** * tty_port_register_device - register tty device * @port: tty_port of the device * @driver: tty_driver for this device * @index: index of the tty * @device: parent if exists, otherwise NULL * * It is the same as tty_register_device() except the provided @port is linked * to a concrete tty specified by @index. Use this or tty_port_install() (or * both). Call tty_port_link_device() as a last resort. */ struct device *tty_port_register_device(struct tty_port *port, struct tty_driver *driver, unsigned index, struct device *device) { return tty_port_register_device_attr(port, driver, index, device, NULL, NULL); } EXPORT_SYMBOL_GPL(tty_port_register_device); /** * tty_port_register_device_attr - register tty device * @port: tty_port of the device * @driver: tty_driver for this device * @index: index of the tty * @device: parent if exists, otherwise NULL * @drvdata: Driver data to be set to device. * @attr_grp: Attribute group to be set on device. * * It is the same as tty_register_device_attr() except the provided @port is * linked to a concrete tty specified by @index. Use this or tty_port_install() * (or both). Call tty_port_link_device() as a last resort. 
*/ struct device *tty_port_register_device_attr(struct tty_port *port, struct tty_driver *driver, unsigned index, struct device *device, void *drvdata, const struct attribute_group **attr_grp) { tty_port_link_device(port, driver, index); return tty_register_device_attr(driver, index, device, drvdata, attr_grp); } EXPORT_SYMBOL_GPL(tty_port_register_device_attr); /** * tty_port_register_device_attr_serdev - register tty or serdev device * @port: tty_port of the device * @driver: tty_driver for this device * @index: index of the tty * @host: serial port hardware device * @parent: parent if exists, otherwise NULL * @drvdata: driver data for the device * @attr_grp: attribute group for the device * * Register a serdev or tty device depending on if the parent device has any * defined serdev clients or not. */ struct device *tty_port_register_device_attr_serdev(struct tty_port *port, struct tty_driver *driver, unsigned index, struct device *host, struct device *parent, void *drvdata, const struct attribute_group **attr_grp) { struct device *dev; tty_port_link_device(port, driver, index); dev = serdev_tty_port_register(port, host, parent, driver, index); if (PTR_ERR(dev) != -ENODEV) { /* Skip creating cdev if we registered a serdev device */ return dev; } return tty_register_device_attr(driver, index, parent, drvdata, attr_grp); } EXPORT_SYMBOL_GPL(tty_port_register_device_attr_serdev); /** * tty_port_register_device_serdev - register tty or serdev device * @port: tty_port of the device * @driver: tty_driver for this device * @index: index of the tty * @host: serial port hardware controller device * @parent: parent if exists, otherwise NULL * * Register a serdev or tty device depending on if the parent device has any * defined serdev clients or not. 
*/ struct device *tty_port_register_device_serdev(struct tty_port *port, struct tty_driver *driver, unsigned index, struct device *host, struct device *parent) { return tty_port_register_device_attr_serdev(port, driver, index, host, parent, NULL, NULL); } EXPORT_SYMBOL_GPL(tty_port_register_device_serdev); /** * tty_port_unregister_device - deregister a tty or serdev device * @port: tty_port of the device * @driver: tty_driver for this device * @index: index of the tty * * If a tty or serdev device is registered with a call to * tty_port_register_device_serdev() then this function must be called when * the device is gone. */ void tty_port_unregister_device(struct tty_port *port, struct tty_driver *driver, unsigned index) { int ret; ret = serdev_tty_port_unregister(port); if (ret == 0) return; tty_unregister_device(driver, index); } EXPORT_SYMBOL_GPL(tty_port_unregister_device); int tty_port_alloc_xmit_buf(struct tty_port *port) { /* We may sleep in get_zeroed_page() */ mutex_lock(&port->buf_mutex); if (port->xmit_buf == NULL) { port->xmit_buf = (u8 *)get_zeroed_page(GFP_KERNEL); if (port->xmit_buf) kfifo_init(&port->xmit_fifo, port->xmit_buf, PAGE_SIZE); } mutex_unlock(&port->buf_mutex); if (port->xmit_buf == NULL) return -ENOMEM; return 0; } EXPORT_SYMBOL(tty_port_alloc_xmit_buf); void tty_port_free_xmit_buf(struct tty_port *port) { mutex_lock(&port->buf_mutex); free_page((unsigned long)port->xmit_buf); port->xmit_buf = NULL; INIT_KFIFO(port->xmit_fifo); mutex_unlock(&port->buf_mutex); } EXPORT_SYMBOL(tty_port_free_xmit_buf); /** * tty_port_destroy - destroy inited port * @port: tty port to be destroyed * * When a port was initialized using tty_port_init(), one has to destroy the * port by this function. Either indirectly by using &tty_port refcounting * (tty_port_put()) or directly if refcounting is not used. 
*/ void tty_port_destroy(struct tty_port *port) { tty_buffer_cancel_work(port); tty_buffer_free_all(port); } EXPORT_SYMBOL(tty_port_destroy); static void tty_port_destructor(struct kref *kref) { struct tty_port *port = container_of(kref, struct tty_port, kref); /* check if last port ref was dropped before tty release */ if (WARN_ON(port->itty)) return; free_page((unsigned long)port->xmit_buf); tty_port_destroy(port); if (port->ops && port->ops->destruct) port->ops->destruct(port); else kfree(port); } /** * tty_port_put - drop a reference to tty_port * @port: port to drop a reference of (can be NULL) * * The final put will destroy and free up the @port using * @port->ops->destruct() hook, or using kfree() if not provided. */ void tty_port_put(struct tty_port *port) { if (port) kref_put(&port->kref, tty_port_destructor); } EXPORT_SYMBOL(tty_port_put); /** * tty_port_tty_get - get a tty reference * @port: tty port * * Return a refcount protected tty instance or %NULL if the port is not * associated with a tty (eg due to close or hangup). */ struct tty_struct *tty_port_tty_get(struct tty_port *port) { unsigned long flags; struct tty_struct *tty; spin_lock_irqsave(&port->lock, flags); tty = tty_kref_get(port->tty); spin_unlock_irqrestore(&port->lock, flags); return tty; } EXPORT_SYMBOL(tty_port_tty_get); /** * tty_port_tty_set - set the tty of a port * @port: tty port * @tty: the tty * * Associate the port and tty pair. Manages any internal refcounts. Pass %NULL * to deassociate a port. */ void tty_port_tty_set(struct tty_port *port, struct tty_struct *tty) { unsigned long flags; spin_lock_irqsave(&port->lock, flags); tty_kref_put(port->tty); port->tty = tty_kref_get(tty); spin_unlock_irqrestore(&port->lock, flags); } EXPORT_SYMBOL(tty_port_tty_set); /** * tty_port_shutdown - internal helper to shutdown the device * @port: tty port to be shut down * @tty: the associated tty * * It is used by tty_port_hangup() and tty_port_close(). 
Its task is to * shutdown the device if it was initialized (note consoles remain * functioning). It lowers DTR/RTS (if @tty has HUPCL set) and invokes * @port->ops->shutdown(). */ static void tty_port_shutdown(struct tty_port *port, struct tty_struct *tty) { mutex_lock(&port->mutex); if (port->console) goto out; if (tty_port_initialized(port)) { tty_port_set_initialized(port, false); /* * Drop DTR/RTS if HUPCL is set. This causes any attached * modem to hang up the line. */ if (tty && C_HUPCL(tty)) tty_port_lower_dtr_rts(port); if (port->ops->shutdown) port->ops->shutdown(port); } out: mutex_unlock(&port->mutex); } /** * tty_port_hangup - hangup helper * @port: tty port * * Perform port level tty hangup flag and count changes. Drop the tty * reference. * * Caller holds tty lock. */ void tty_port_hangup(struct tty_port *port) { struct tty_struct *tty; unsigned long flags; spin_lock_irqsave(&port->lock, flags); port->count = 0; tty = port->tty; if (tty) set_bit(TTY_IO_ERROR, &tty->flags); port->tty = NULL; spin_unlock_irqrestore(&port->lock, flags); tty_port_set_active(port, false); tty_port_shutdown(port, tty); tty_kref_put(tty); wake_up_interruptible(&port->open_wait); wake_up_interruptible(&port->delta_msr_wait); } EXPORT_SYMBOL(tty_port_hangup); /** * tty_port_tty_hangup - helper to hang up a tty * @port: tty port * @check_clocal: hang only ttys with %CLOCAL unset? */ void tty_port_tty_hangup(struct tty_port *port, bool check_clocal) { struct tty_struct *tty = tty_port_tty_get(port); if (tty && (!check_clocal || !C_CLOCAL(tty))) tty_hangup(tty); tty_kref_put(tty); } EXPORT_SYMBOL_GPL(tty_port_tty_hangup); /** * tty_port_tty_wakeup - helper to wake up a tty * @port: tty port */ void tty_port_tty_wakeup(struct tty_port *port) { port->client_ops->write_wakeup(port); } EXPORT_SYMBOL_GPL(tty_port_tty_wakeup); /** * tty_port_carrier_raised - carrier raised check * @port: tty port * * Wrapper for the carrier detect logic. 
For the moment this is used * to hide some internal details. This will eventually become entirely * internal to the tty port. */ bool tty_port_carrier_raised(struct tty_port *port) { if (port->ops->carrier_raised == NULL) return true; return port->ops->carrier_raised(port); } EXPORT_SYMBOL(tty_port_carrier_raised); /** * tty_port_raise_dtr_rts - Raise DTR/RTS * @port: tty port * * Wrapper for the DTR/RTS raise logic. For the moment this is used to hide * some internal details. This will eventually become entirely internal to the * tty port. */ void tty_port_raise_dtr_rts(struct tty_port *port) { if (port->ops->dtr_rts) port->ops->dtr_rts(port, true); } EXPORT_SYMBOL(tty_port_raise_dtr_rts); /** * tty_port_lower_dtr_rts - Lower DTR/RTS * @port: tty port * * Wrapper for the DTR/RTS raise logic. For the moment this is used to hide * some internal details. This will eventually become entirely internal to the * tty port. */ void tty_port_lower_dtr_rts(struct tty_port *port) { if (port->ops->dtr_rts) port->ops->dtr_rts(port, false); } EXPORT_SYMBOL(tty_port_lower_dtr_rts); /** * tty_port_block_til_ready - Waiting logic for tty open * @port: the tty port being opened * @tty: the tty device being bound * @filp: the file pointer of the opener or %NULL * * Implement the core POSIX/SuS tty behaviour when opening a tty device. * Handles: * * - hangup (both before and during) * - non blocking open * - rts/dtr/dcd * - signals * - port flags and counts * * The passed @port must implement the @port->ops->carrier_raised method if it * can do carrier detect and the @port->ops->dtr_rts method if it supports * software management of these lines. Note that the dtr/rts raise is done each * iteration as a hangup may have previously dropped them while we wait. * * Caller holds tty lock. * * Note: May drop and reacquire tty lock when blocking, so @tty and @port may * have changed state (eg., may have been hung up). 
*/
int tty_port_block_til_ready(struct tty_port *port,
				struct tty_struct *tty, struct file *filp)
{
	int do_clocal = 0, retval;
	unsigned long flags;
	DEFINE_WAIT(wait);

	/* if non-blocking mode is set we can pass directly to open unless
	 * the port has just hung up or is in another error state.
	 */
	/* A tty in an error state is marked active and the open succeeds at once. */
	if (tty_io_error(tty)) {
		tty_port_set_active(port, true);
		return 0;
	}
	/* No file pointer or O_NONBLOCK: never sleep waiting for the carrier. */
	if (filp == NULL || (filp->f_flags & O_NONBLOCK)) {
		/* Indicate we are open */
		if (C_BAUD(tty))
			tty_port_raise_dtr_rts(port);
		tty_port_set_active(port, true);
		return 0;
	}

	/* With CLOCAL set the carrier is not probed in the loop below. */
	if (C_CLOCAL(tty))
		do_clocal = 1;

	/* Block waiting until we can proceed. We may need to wait for the
	 * carrier, but we must also wait for any close that is in progress
	 * before the next open may complete.
	 */

	retval = 0;

	/* The port lock protects the port counts */
	/* While waiting, this opener is counted in blocked_open, not in count. */
	spin_lock_irqsave(&port->lock, flags);
	port->count--;
	port->blocked_open++;
	spin_unlock_irqrestore(&port->lock, flags);

	while (1) {
		/* Indicate we are open */
		if (C_BAUD(tty) && tty_port_initialized(port))
			tty_port_raise_dtr_rts(port);

		/* Queue on open_wait before testing the exit conditions below. */
		prepare_to_wait(&port->open_wait, &wait, TASK_INTERRUPTIBLE);
		/* Check for a hangup or uninitialised port.
		 * Return accordingly.
		 */
		if (tty_hung_up_p(filp) || !tty_port_initialized(port)) {
			/* ASYNC_HUP_NOTIFY selects -EAGAIN over -ERESTARTSYS. */
			if (port->flags & ASYNC_HUP_NOTIFY)
				retval = -EAGAIN;
			else
				retval = -ERESTARTSYS;
			break;
		}
		/*
		 * Probe the carrier. For devices with no carrier detect
		 * tty_port_carrier_raised will always return true.
		 * Never ask drivers if CLOCAL is set, this causes troubles
		 * on some hardware.
		 */
		if (do_clocal || tty_port_carrier_raised(port))
			break;
		if (signal_pending(current)) {
			retval = -ERESTARTSYS;
			break;
		}
		/* Sleep with the tty lock dropped; retake it before re-checking. */
		tty_unlock(tty);
		schedule();
		tty_lock(tty);
	}
	finish_wait(&port->open_wait, &wait);

	/* Update counts. A parallel hangup will have set count to zero and
	 * we must not mess that up further.
	 */
	spin_lock_irqsave(&port->lock, flags);
	if (!tty_hung_up_p(filp))
		port->count++;
	port->blocked_open--;
	spin_unlock_irqrestore(&port->lock, flags);
	/* Only a successful wait marks the port active. */
	if (retval == 0)
		tty_port_set_active(port, true);
	return retval;
}
EXPORT_SYMBOL(tty_port_block_til_ready);

/*
 * Sleep (interruptibly) for a drain period derived from the tty's baud rate
 * and @port->drain_delay.
 */
static void tty_port_drain_delay(struct tty_port *port, struct tty_struct *tty)
{
	unsigned int bps = tty_get_baud_rate(tty);
	long timeout;

	if (bps > 1200) {
		/* Scale by drain_delay and baud rate, but never below HZ / 10. */
		timeout = (HZ * 10 * port->drain_delay) / bps;
		timeout = max_t(long, timeout, HZ / 10);
	} else {
		/* At 1200 bps or below (including 0) use a fixed 2 * HZ. */
		timeout = 2 * HZ;
	}
	schedule_timeout_interruptible(timeout);
}

/**
 * tty_port_close_start - helper for tty->ops->close, part 1/2
 * @port: tty_port of the device
 * @tty: tty being closed
 * @filp: passed file pointer
 *
 * Decrements and checks open count. Flushes the port if this is the last
 * close. That means, dropping the data from the output buffer on the device and
 * waiting for sending logic to finish. The rest of close handling is performed
 * in tty_port_close_end().
 *
 * Locking: Caller holds tty lock.
*
 * Return: 1 if this is the last close, otherwise 0
 */
int tty_port_close_start(struct tty_port *port,
				struct tty_struct *tty, struct file *filp)
{
	unsigned long irqflags;
	int last_close;

	/* A hung-up file takes no part in the close accounting. */
	if (tty_hung_up_p(filp))
		return 0;

	spin_lock_irqsave(&port->lock, irqflags);

	/* The tty is on its final reference, so force the port count to match. */
	if (tty->count == 1 && port->count != 1) {
		tty_warn(tty, "%s: tty->count = 1 port count = %d\n", __func__,
			 port->count);
		port->count = 1;
	}

	port->count--;
	if (port->count < 0) {
		tty_warn(tty, "%s: bad port count (%d)\n", __func__,
			 port->count);
		port->count = 0;
	}

	last_close = (port->count == 0);
	spin_unlock_irqrestore(&port->lock, irqflags);

	/* Other openers remain: nothing more to do. */
	if (!last_close)
		return 0;

	tty->closing = 1;

	if (tty_port_initialized(port)) {
		/* Don't block on a stalled port, just pull the chain */
		if (tty->flow.tco_stopped)
			tty_driver_flush_buffer(tty);

		if (port->closing_wait != ASYNC_CLOSING_WAIT_NONE)
			tty_wait_until_sent(tty, port->closing_wait);

		if (port->drain_delay)
			tty_port_drain_delay(port, tty);
	}

	/* Flush the ldisc buffering */
	tty_ldisc_flush(tty);

	/* Report to caller this is the last port reference */
	return 1;
}
EXPORT_SYMBOL(tty_port_close_start);

/**
 * tty_port_close_end - helper for tty->ops->close, part 2/2
 * @port: tty_port of the device
 * @tty: tty being closed
 *
 * This is a continuation of the first part: tty_port_close_start(). This
 * should be called after turning off the device. It flushes the data from the
 * line discipline and delays the close by @port->close_delay.
 *
 * Locking: Caller holds tty lock.
*/ void tty_port_close_end(struct tty_port *port, struct tty_struct *tty) { unsigned long flags; tty_ldisc_flush(tty); tty->closing = 0; spin_lock_irqsave(&port->lock, flags); if (port->blocked_open) { spin_unlock_irqrestore(&port->lock, flags); if (port->close_delay) msleep_interruptible(jiffies_to_msecs(port->close_delay)); spin_lock_irqsave(&port->lock, flags); wake_up_interruptible(&port->open_wait); } spin_unlock_irqrestore(&port->lock, flags); tty_port_set_active(port, false); } EXPORT_SYMBOL(tty_port_close_end); /** * tty_port_close - generic tty->ops->close handler * @port: tty_port of the device * @tty: tty being closed * @filp: passed file pointer * * It is a generic helper to be used in driver's @tty->ops->close. It wraps a * sequence of tty_port_close_start(), tty_port_shutdown(), and * tty_port_close_end(). The latter two are called only if this is the last * close. See the respective functions for the details. * * Locking: Caller holds tty lock */ void tty_port_close(struct tty_port *port, struct tty_struct *tty, struct file *filp) { if (tty_port_close_start(port, tty, filp) == 0) return; tty_port_shutdown(port, tty); if (!port->console) set_bit(TTY_IO_ERROR, &tty->flags); tty_port_close_end(port, tty); tty_port_tty_set(port, NULL); } EXPORT_SYMBOL(tty_port_close); /** * tty_port_install - generic tty->ops->install handler * @port: tty_port of the device * @driver: tty_driver for this device * @tty: tty to be installed * * It is the same as tty_standard_install() except the provided @port is linked * to a concrete tty specified by @tty. Use this or tty_port_register_device() * (or both). Call tty_port_link_device() as a last resort. 
*/ int tty_port_install(struct tty_port *port, struct tty_driver *driver, struct tty_struct *tty) { tty->port = port; return tty_standard_install(driver, tty); } EXPORT_SYMBOL_GPL(tty_port_install); /** * tty_port_open - generic tty->ops->open handler * @port: tty_port of the device * @tty: tty to be opened * @filp: passed file pointer * * It is a generic helper to be used in driver's @tty->ops->open. It activates * the devices using @port->ops->activate if not active already. And waits for * the device to be ready using tty_port_block_til_ready() (e.g. raises * DTR/CTS and waits for carrier). * * Note that @port->ops->shutdown is not called when @port->ops->activate * returns an error (on the contrary, @tty->ops->close is). * * Locking: Caller holds tty lock. * * Note: may drop and reacquire tty lock (in tty_port_block_til_ready()) so * @tty and @port may have changed state (eg., may be hung up now). */ int tty_port_open(struct tty_port *port, struct tty_struct *tty, struct file *filp) { spin_lock_irq(&port->lock); ++port->count; spin_unlock_irq(&port->lock); tty_port_tty_set(port, tty); /* * Do the device-specific open only if the hardware isn't * already initialized. Serialize open and shutdown using the * port mutex. */ mutex_lock(&port->mutex); if (!tty_port_initialized(port)) { clear_bit(TTY_IO_ERROR, &tty->flags); if (port->ops->activate) { int retval = port->ops->activate(port, tty); if (retval) { mutex_unlock(&port->mutex); return retval; } } tty_port_set_initialized(port, true); } mutex_unlock(&port->mutex); return tty_port_block_til_ready(port, tty, filp); } EXPORT_SYMBOL(tty_port_open);
3 3 3 3 3 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 
523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 
1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 
1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 
1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 
2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 
2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 
3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 
3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 // SPDX-License-Identifier: GPL-2.0-or-later /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. 
* * ROUTE - implementation of the IP router. * * Authors: Ross Biro * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> * Alan Cox, <gw4pts@gw4pts.ampr.org> * Linus Torvalds, <Linus.Torvalds@helsinki.fi> * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> * * Fixes: * Alan Cox : Verify area fixes. * Alan Cox : cli() protects routing changes * Rui Oliveira : ICMP routing table updates * (rco@di.uminho.pt) Routing table insertion and update * Linus Torvalds : Rewrote bits to be sensible * Alan Cox : Added BSD route gw semantics * Alan Cox : Super /proc >4K * Alan Cox : MTU in route table * Alan Cox : MSS actually. Also added the window * clamper. * Sam Lantinga : Fixed route matching in rt_del() * Alan Cox : Routing cache support. * Alan Cox : Removed compatibility cruft. * Alan Cox : RTF_REJECT support. * Alan Cox : TCP irtt support. * Jonathan Naylor : Added Metric support. * Miquel van Smoorenburg : BSD API fixes. * Miquel van Smoorenburg : Metrics. * Alan Cox : Use __u32 properly * Alan Cox : Aligned routing errors more closely with BSD * our system is still very different. * Alan Cox : Faster /proc handling * Alexey Kuznetsov : Massive rework to support tree based routing, * routing caches and better behaviour. * * Olaf Erb : irtt wasn't being copied right. * Bjorn Ekwall : Kerneld route support. * Alan Cox : Multicast fixed (I hope) * Pavel Krauz : Limited broadcast fixed * Mike McLagan : Routing by source * Alexey Kuznetsov : End of old history. Split to fib.c and * route.c and rewritten from scratch. * Andi Kleen : Load-limit warning messages. * Vitaly E. Lavrov : Transparent proxy revived after year coma. * Vitaly E. Lavrov : Race condition in ip_route_input_slow. * Tobias Ringstrom : Uninitialized res.type in ip_route_output_slow. * Vladimir V. Ivanov : IP rule info (flowid) is really useful. * Marc Boucher : routing by fwmark * Robert Olsson : Added rt_cache statistics * Arnaldo C. 
Melo : Convert proc stuff to seq_file * Eric Dumazet : hashed spinlocks and rt_check_expire() fixes. * Ilia Sotnikov : Ignore TOS on PMTUD and Redirect * Ilia Sotnikov : Removed TOS from hash calculations */ #define pr_fmt(fmt) "IPv4: " fmt #include <linux/module.h> #include <linux/bitops.h> #include <linux/kernel.h> #include <linux/mm.h> #include <linux/memblock.h> #include <linux/socket.h> #include <linux/errno.h> #include <linux/in.h> #include <linux/inet.h> #include <linux/netdevice.h> #include <linux/proc_fs.h> #include <linux/init.h> #include <linux/skbuff.h> #include <linux/inetdevice.h> #include <linux/igmp.h> #include <linux/pkt_sched.h> #include <linux/mroute.h> #include <linux/netfilter_ipv4.h> #include <linux/random.h> #include <linux/rcupdate.h> #include <linux/slab.h> #include <linux/jhash.h> #include <net/dst.h> #include <net/dst_metadata.h> #include <net/inet_dscp.h> #include <net/net_namespace.h> #include <net/ip.h> #include <net/route.h> #include <net/inetpeer.h> #include <net/sock.h> #include <net/ip_fib.h> #include <net/nexthop.h> #include <net/tcp.h> #include <net/icmp.h> #include <net/xfrm.h> #include <net/lwtunnel.h> #include <net/netevent.h> #include <net/rtnetlink.h> #ifdef CONFIG_SYSCTL #include <linux/sysctl.h> #endif #include <net/secure_seq.h> #include <net/ip_tunnels.h> #include "fib_lookup.h" #define RT_GC_TIMEOUT (300*HZ) #define DEFAULT_MIN_PMTU (512 + 20 + 20) #define DEFAULT_MTU_EXPIRES (10 * 60 * HZ) #define DEFAULT_MIN_ADVMSS 256 static int ip_rt_max_size; static int ip_rt_redirect_number __read_mostly = 9; static int ip_rt_redirect_load __read_mostly = HZ / 50; static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1)); static int ip_rt_error_cost __read_mostly = HZ; static int ip_rt_error_burst __read_mostly = 5 * HZ; static int ip_rt_gc_timeout __read_mostly = RT_GC_TIMEOUT; /* * Interface to generic destination cache. 
*/

/* Forward declarations of the callbacks installed in ipv4_dst_ops below. */
INDIRECT_CALLABLE_SCOPE
struct dst_entry	*ipv4_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int	 ipv4_default_advmss(const struct dst_entry *dst);
INDIRECT_CALLABLE_SCOPE
unsigned int		ipv4_mtu(const struct dst_entry *dst);
static void		ipv4_negative_advice(struct sock *sk,
					     struct dst_entry *dst);
static void		 ipv4_link_failure(struct sk_buff *skb);
static void		 ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
					   struct sk_buff *skb, u32 mtu,
					   bool confirm_neigh);
static void		 ip_do_redirect(struct dst_entry *dst, struct sock *sk,
					struct sk_buff *skb);
static void		ipv4_dst_destroy(struct dst_entry *dst);

/*
 * cow_metrics callback for IPv4 dsts.
 *
 * Unconditionally triggers WARN_ON() and returns NULL, i.e. this path is
 * treated as one that should never be taken.
 */
static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
{
	WARN_ON(1);
	return NULL;
}

static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
					   struct sk_buff *skb,
					   const void *daddr);
static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr);

/* Callback table binding the generic destination cache to the IPv4 handlers. */
static struct dst_ops ipv4_dst_ops = {
	.family =		AF_INET,
	.check =		ipv4_dst_check,
	.default_advmss =	ipv4_default_advmss,
	.mtu =			ipv4_mtu,
	.cow_metrics =		ipv4_cow_metrics,
	.destroy =		ipv4_dst_destroy,
	.negative_advice =	ipv4_negative_advice,
	.link_failure =		ipv4_link_failure,
	.update_pmtu =		ip_rt_update_pmtu,
	.redirect =		ip_do_redirect,
	.local_out =		__ip_local_out,
	.neigh_lookup =		ipv4_neigh_lookup,
	.confirm_neigh =	ipv4_confirm_neigh,
};

/* ECN_OR_COST(class) expands to TC_PRIO_<class>. */
#define ECN_OR_COST(class)	TC_PRIO_##class

/*
 * 16-entry table of TC_PRIO_* values.  Because of the ECN_OR_COST()
 * expansion above, every odd entry equals the even entry before it.
 * NOTE(review): presumably indexed by a 4-bit value derived from the IP TOS
 * field; the indexing code is not in this part of the file.
 */
const __u8 ip_tos2prio[16] = {
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(BESTEFFORT),
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(BESTEFFORT),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK)
};
EXPORT_SYMBOL(ip_tos2prio);

/* Per-CPU routing statistics; RT_CACHE_STAT_INC() bumps one field on this CPU. */
static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
#define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)

#ifdef CONFIG_PROC_FS
static void
*rt_cache_seq_start(struct seq_file *seq, loff_t *pos) { if (*pos) return NULL; return SEQ_START_TOKEN; } static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos) { ++*pos; return NULL; } static void rt_cache_seq_stop(struct seq_file *seq, void *v) { } static int rt_cache_seq_show(struct seq_file *seq, void *v) { if (v == SEQ_START_TOKEN) seq_printf(seq, "%-127s\n", "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t" "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t" "HHUptod\tSpecDst"); return 0; } static const struct seq_operations rt_cache_seq_ops = { .start = rt_cache_seq_start, .next = rt_cache_seq_next, .stop = rt_cache_seq_stop, .show = rt_cache_seq_show, }; static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos) { int cpu; if (*pos == 0) return SEQ_START_TOKEN; for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) { if (!cpu_possible(cpu)) continue; *pos = cpu+1; return &per_cpu(rt_cache_stat, cpu); } return NULL; } static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos) { int cpu; for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) { if (!cpu_possible(cpu)) continue; *pos = cpu+1; return &per_cpu(rt_cache_stat, cpu); } (*pos)++; return NULL; } static void rt_cpu_seq_stop(struct seq_file *seq, void *v) { } static int rt_cpu_seq_show(struct seq_file *seq, void *v) { struct rt_cache_stat *st = v; if (v == SEQ_START_TOKEN) { seq_puts(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n"); return 0; } seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x " "%08x %08x %08x %08x %08x %08x " "%08x %08x %08x %08x\n", dst_entries_get_slow(&ipv4_dst_ops), 0, /* st->in_hit */ st->in_slow_tot, st->in_slow_mc, st->in_no_route, st->in_brd, st->in_martian_dst, st->in_martian_src, 0, /* st->out_hit */ st->out_slow_tot, st->out_slow_mc, 0, /* st->gc_total */ 0, /* st->gc_ignored */ 0, /* 
st->gc_goal_miss */
		   0, /* st->gc_dst_overflow */
		   0, /* st->in_hlist_search */
		   0  /* st->out_hlist_search */
		);
	return 0;
}

/* seq_file iterator for the per-CPU statistics file. */
static const struct seq_operations rt_cpu_seq_ops = {
	.start  = rt_cpu_seq_start,
	.next   = rt_cpu_seq_next,
	.stop   = rt_cpu_seq_stop,
	.show   = rt_cpu_seq_show,
};

#ifdef CONFIG_IP_ROUTE_CLASSID
/*
 * Show handler for "rt_acct".
 *
 * Sums the 256 per-CPU ip_rt_acct slots over all possible CPUs into a
 * temporary zeroed array and writes that array to the seq_file as raw
 * binary data.  Returns 0, or -ENOMEM if the temporary array cannot be
 * allocated.
 */
static int rt_acct_proc_show(struct seq_file *m, void *v)
{
	struct ip_rt_acct *dst, *src;
	unsigned int i, j;

	dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
	if (!dst)
		return -ENOMEM;

	for_each_possible_cpu(i) {
		src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
		for (j = 0; j < 256; j++) {
			dst[j].o_bytes   += src[j].o_bytes;
			dst[j].o_packets += src[j].o_packets;
			dst[j].i_bytes   += src[j].i_bytes;
			dst[j].i_packets += src[j].i_packets;
		}
	}

	seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
	kfree(dst);
	return 0;
}
#endif

/*
 * Per-netns init: create "rt_cache" under net->proc_net, "rt_cache" under
 * net->proc_net_stat and, with CONFIG_IP_ROUTE_CLASSID, "rt_acct" under
 * net->proc_net.  On any failure the entries created so far are removed in
 * reverse order and -ENOMEM is returned.
 */
static int __net_init ip_rt_do_proc_init(struct net *net)
{
	struct proc_dir_entry *pde;

	pde = proc_create_seq("rt_cache", 0444, net->proc_net,
			      &rt_cache_seq_ops);
	if (!pde)
		goto err1;

	pde = proc_create_seq("rt_cache", 0444, net->proc_net_stat,
			      &rt_cpu_seq_ops);
	if (!pde)
		goto err2;

#ifdef CONFIG_IP_ROUTE_CLASSID
	pde = proc_create_single("rt_acct", 0, net->proc_net,
				 rt_acct_proc_show);
	if (!pde)
		goto err3;
#endif
	return 0;

#ifdef CONFIG_IP_ROUTE_CLASSID
err3:
	remove_proc_entry("rt_cache", net->proc_net_stat);
#endif
err2:
	remove_proc_entry("rt_cache", net->proc_net);
err1:
	return -ENOMEM;
}

/* Per-netns exit: remove every entry ip_rt_do_proc_init() may have created. */
static void __net_exit ip_rt_do_proc_exit(struct net *net)
{
	remove_proc_entry("rt_cache", net->proc_net_stat);
	remove_proc_entry("rt_cache", net->proc_net);
#ifdef CONFIG_IP_ROUTE_CLASSID
	remove_proc_entry("rt_acct", net->proc_net);
#endif
}

static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
	.init = ip_rt_do_proc_init,
	.exit = ip_rt_do_proc_exit,
};

/* Register the per-netns proc init/exit hooks. */
static int __init ip_rt_proc_init(void)
{
	return register_pernet_subsys(&ip_rt_proc_ops);
}

#else
/* Without CONFIG_PROC_FS there is nothing to register. */
static inline int ip_rt_proc_init(void)
{
	return 0;
}
#endif /* CONFIG_PROC_FS */

static
inline bool rt_is_expired(const struct rtable *rth)
{
	/* A route is expired once its genid differs from its netns' IPv4 genid. */
	return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
}

/*
 * Bump the IPv4 route generation id of @net; rt_is_expired() above then
 * reports routes carrying the old id as expired.
 */
void rt_cache_flush(struct net *net)
{
	rt_genid_bump_ipv4(net);
}

/*
 * neigh_lookup callback.
 *
 * Picks the neighbour key from the route: the IPv4 or IPv6 gateway when
 * rt_gw_family says so, otherwise the destination address of @skb if one is
 * given, else the address @daddr points to.
 *
 * Returns the neighbour with a reference taken, NULL if its refcount had
 * already dropped to zero, or the lookup's error pointer unchanged.
 */
static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
					   struct sk_buff *skb,
					   const void *daddr)
{
	const struct rtable *rt = container_of(dst, struct rtable, dst);
	struct net_device *dev = dst->dev;
	struct neighbour *n;

	rcu_read_lock();

	if (likely(rt->rt_gw_family == AF_INET)) {
		n = ip_neigh_gw4(dev, rt->rt_gw4);
	} else if (rt->rt_gw_family == AF_INET6) {
		n = ip_neigh_gw6(dev, &rt->rt_gw6);
	} else {
		__be32 pkey;

		pkey = skb ? ip_hdr(skb)->daddr : *((__be32 *) daddr);
		n = ip_neigh_gw4(dev, pkey);
	}

	/* The reference must be taken before leaving the RCU read section. */
	if (!IS_ERR(n) && !refcount_inc_not_zero(&n->refcnt))
		n = NULL;

	rcu_read_unlock();

	return n;
}

/*
 * confirm_neigh callback.
 *
 * Confirms the neighbour for the route's gateway (IPv4 or IPv6).  With no
 * gateway family set it confirms @daddr instead, unless @daddr is NULL or
 * the route is multicast, broadcast or local, in which case nothing is done.
 */
static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr)
{
	const struct rtable *rt = container_of(dst, struct rtable, dst);
	struct net_device *dev = dst->dev;
	const __be32 *pkey = daddr;

	if (rt->rt_gw_family == AF_INET) {
		pkey = (const __be32 *)&rt->rt_gw4;
	} else if (rt->rt_gw_family == AF_INET6) {
		return __ipv6_confirm_neigh_stub(dev, &rt->rt_gw6);
	} else if (!daddr ||
		 (rt->rt_flags &
		  (RTCF_MULTICAST | RTCF_BROADCAST | RTCF_LOCAL))) {
		return;
	}
	__ipv4_confirm_neigh(dev, *(__force u32 *)pkey);
}

/* Hash tables of size 2048..262144 depending on RAM size.
 * Each bucket uses 8 bytes: one atomic_t identifier in ip_idents plus one
 * u32 timestamp in ip_tstamps.
 */
static u32 ip_idents_mask __read_mostly;
static atomic_t *ip_idents __read_mostly;
static u32 *ip_tstamps __read_mostly;

/* In order to protect privacy, we add a perturbation to identifiers
 * if one generator is seldom used. This makes hard for an attacker
 * to infer how many packets were sent between two points in time.
*/
/*
 * Reserve @segs identifiers from the generator bucket selected by @hash and
 * return the first one.
 *
 * If the bucket's timestamp differs from the current jiffies value and this
 * caller wins the cmpxchg() that refreshes it, a random amount bounded by
 * the elapsed time (now - old) is added on top of @segs.
 */
static u32 ip_idents_reserve(u32 hash, int segs)
{
	u32 bucket, old, now = (u32)jiffies;
	atomic_t *p_id;
	u32 *p_tstamp;
	u32 delta = 0;

	bucket = hash & ip_idents_mask;
	p_tstamp = ip_tstamps + bucket;
	p_id = ip_idents + bucket;
	old = READ_ONCE(*p_tstamp);

	/* Only the caller that updates the timestamp applies the perturbation. */
	if (old != now && cmpxchg(p_tstamp, old, now) == old)
		delta = get_random_u32_below(now - old);

	/* If UBSAN reports an error here, please make sure your compiler
	 * supports -fno-strict-overflow before reporting it: this was a bug
	 * in UBSAN, and it has been fixed in GCC-8.
	 */
	return atomic_add_return(segs + delta, p_id) - segs;
}

/*
 * Fill in iph->id for a packet made of @segs segments.
 *
 * The generator bucket is chosen by a siphash over destination address,
 * source address and protocol, keyed with the per-netns ip_id_key.
 */
void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
{
	u32 hash, id;

	/* Note the following code is not safe, but this is okay: the key is
	 * filled in without any locking when it is still all-zero.
	 */
	if (unlikely(siphash_key_is_zero(&net->ipv4.ip_id_key)))
		get_random_bytes(&net->ipv4.ip_id_key,
				 sizeof(net->ipv4.ip_id_key));

	hash = siphash_3u32((__force u32)iph->daddr,
			    (__force u32)iph->saddr,
			    iph->protocol,
			    &net->ipv4.ip_id_key);
	id = ip_idents_reserve(hash, segs);
	iph->id = htons(id);
}
EXPORT_SYMBOL(__ip_select_ident);

/*
 * Initialise @fl4 as an output flow key from the addresses in @iph.
 *
 * When @sk is non-NULL, the socket's bound device, mark, TOS, scope and
 * protocol replace the @oif, @mark, @tos and @prot arguments (a socket with
 * HDRINCL set is keyed as IPPROTO_RAW).
 */
static void __build_flow_key(const struct net *net, struct flowi4 *fl4,
			     const struct sock *sk, const struct iphdr *iph,
			     int oif, __u8 tos, u8 prot, u32 mark,
			     int flow_flags)
{
	__u8 scope = RT_SCOPE_UNIVERSE;

	if (sk) {
		oif = sk->sk_bound_dev_if;
		mark = READ_ONCE(sk->sk_mark);
		tos = ip_sock_rt_tos(sk);
		scope = ip_sock_rt_scope(sk);
		prot = inet_test_bit(HDRINCL, sk) ?
IPPROTO_RAW : sk->sk_protocol;
	}

	/* Only the bits selected by INET_DSCP_MASK of tos go into the key. */
	flowi4_init_output(fl4, oif, mark, tos & INET_DSCP_MASK, scope,
			   prot, flow_flags, iph->daddr, iph->saddr, 0, 0,
			   sock_net_uid(net, sk));
}

/*
 * Build a flow key from a packet: device ifindex, protocol, mark and TOS
 * come from @skb and its IP header, then __build_flow_key() applies the
 * socket overrides if @sk is set.
 */
static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
			       const struct sock *sk)
{
	const struct net *net = dev_net(skb->dev);
	const struct iphdr *iph = ip_hdr(skb);
	int oif = skb->dev->ifindex;
	u8 prot = iph->protocol;
	u32 mark = skb->mark;
	__u8 tos = iph->tos;

	__build_flow_key(net, fl4, sk, iph, oif, tos, prot, mark, 0);
}

/*
 * Build a flow key from socket state only.
 *
 * The destination is inet_daddr, unless the socket's IP options have
 * opt.srr set, in which case opt.faddr is used.  The options pointer is
 * read under rcu_read_lock().
 */
static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
{
	const struct inet_sock *inet = inet_sk(sk);
	const struct ip_options_rcu *inet_opt;
	__be32 daddr = inet->inet_daddr;

	rcu_read_lock();
	inet_opt = rcu_dereference(inet->inet_opt);
	if (inet_opt && inet_opt->opt.srr)
		daddr = inet_opt->opt.faddr;
	flowi4_init_output(fl4, sk->sk_bound_dev_if, READ_ONCE(sk->sk_mark),
			   ip_sock_rt_tos(sk),
			   ip_sock_rt_scope(sk),
			   inet_test_bit(HDRINCL, sk) ?
				IPPROTO_RAW : sk->sk_protocol,
			   inet_sk_flowi_flags(sk),
			   daddr, inet->inet_saddr, 0, 0, sk->sk_uid);
	rcu_read_unlock();
}

/* Build the flow key from @skb when one is supplied, otherwise from @sk. */
static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
				 const struct sk_buff *skb)
{
	if (skb)
		build_skb_flow_key(fl4, skb, sk);
	else
		build_sk_flow_key(fl4, sk);
}

/* Serialises updates to the nexthop exception (fnhe) chains in this file. */
static DEFINE_SPINLOCK(fnhe_lock);

/*
 * Detach and release the input and output routes cached on @fnhe: each
 * pointer is cleared, then the route gets dst_dev_put() and dst_release().
 */
static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
{
	struct rtable *rt;

	rt = rcu_dereference(fnhe->fnhe_rth_input);
	if (rt) {
		RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
		dst_dev_put(&rt->dst);
		dst_release(&rt->dst);
	}
	rt = rcu_dereference(fnhe->fnhe_rth_output);
	if (rt) {
		RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
		dst_dev_put(&rt->dst);
		dst_release(&rt->dst);
	}
}

/*
 * Remove the exception with the earliest fnhe_stamp from the bucket @hash.
 * The chain is walked with rcu_dereference_protected(), which asserts that
 * fnhe_lock is held.
 */
static void fnhe_remove_oldest(struct fnhe_hash_bucket *hash)
{
	struct fib_nh_exception __rcu **fnhe_p, **oldest_p;
	struct fib_nh_exception *fnhe, *oldest = NULL;

	for (fnhe_p = &hash->chain; ; fnhe_p = &fnhe->fnhe_next) {
		fnhe = rcu_dereference_protected(*fnhe_p,
						 lockdep_is_held(&fnhe_lock));
		if (!fnhe)
break;
		/* Remember the entry with the earliest stamp and the link to it. */
		if (!oldest ||
		    time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp)) {
			oldest = fnhe;
			oldest_p = fnhe_p;
		}
	}
	/* NOTE(review): oldest is dereferenced unconditionally, so the chain
	 * is assumed to be non-empty here - confirm against callers.
	 */
	fnhe_flush_routes(oldest);
	*oldest_p = oldest->fnhe_next;
	kfree_rcu(oldest, rcu);
}

/*
 * Hash @daddr into an exception bucket index: siphash with a key that is
 * initialised once with random bytes, reduced by hash_64() to
 * FNHE_HASH_SHIFT bits.
 */
static u32 fnhe_hashfun(__be32 daddr)
{
	static siphash_aligned_key_t fnhe_hash_key;
	u64 hval;

	net_get_random_once(&fnhe_hash_key, sizeof(fnhe_hash_key));
	hval = siphash_1u32((__force u32)daddr, &fnhe_hash_key);
	return hash_64(hval, FNHE_HASH_SHIFT);
}

/*
 * Copy PMTU, MTU-lock and expiry from @fnhe into @rt.  If the exception
 * carries a gateway, also mark the route as redirected and point it at that
 * IPv4 gateway.
 */
static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
{
	rt->rt_pmtu = fnhe->fnhe_pmtu;
	rt->rt_mtu_locked = fnhe->fnhe_mtu_locked;
	rt->dst.expires = fnhe->fnhe_expires;

	if (fnhe->fnhe_gw) {
		rt->rt_flags |= RTCF_REDIRECTED;
		rt->rt_uses_gateway = 1;
		rt->rt_gw_family = AF_INET;
		rt->rt_gw4 = fnhe->fnhe_gw;
	}
}

/*
 * Record a redirect gateway and/or PMTU for @daddr on nexthop @nhc.
 *
 * @gw:      new gateway, 0 to leave an existing one untouched
 * @pmtu:    new path MTU, 0 to leave an existing one untouched
 * @lock:    value stored in fnhe_mtu_locked together with @pmtu
 * @expires: expiry time in jiffies; stored as at least 1 so it is never 0
 *
 * Runs under fnhe_lock with bottom halves disabled.  The bucket array is
 * allocated on first use; allocation failures (GFP_ATOMIC) make the update
 * silently do nothing.
 */
static void update_or_create_fnhe(struct fib_nh_common *nhc, __be32 daddr,
				  __be32 gw, u32 pmtu, bool lock,
				  unsigned long expires)
{
	struct fnhe_hash_bucket *hash;
	struct fib_nh_exception *fnhe;
	struct rtable *rt;
	u32 genid, hval;
	unsigned int i;
	int depth;

	genid = fnhe_genid(dev_net(nhc->nhc_dev));
	hval = fnhe_hashfun(daddr);

	spin_lock_bh(&fnhe_lock);

	hash = rcu_dereference(nhc->nhc_exceptions);
	if (!hash) {
		hash = kcalloc(FNHE_HASH_SIZE, sizeof(*hash), GFP_ATOMIC);
		if (!hash)
			goto out_unlock;
		rcu_assign_pointer(nhc->nhc_exceptions, hash);
	}

	hash += hval;

	/* Look for an existing exception, counting the entries walked past. */
	depth = 0;
	for (fnhe = rcu_dereference(hash->chain); fnhe;
	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
		if (fnhe->fnhe_daddr == daddr)
			break;
		depth++;
	}

	if (fnhe) {
		if (fnhe->fnhe_genid != genid)
			fnhe->fnhe_genid = genid;
		if (gw)
			fnhe->fnhe_gw = gw;
		if (pmtu) {
			fnhe->fnhe_pmtu = pmtu;
			fnhe->fnhe_mtu_locked = lock;
		}
		fnhe->fnhe_expires = max(1UL, expires);
		/* Update all cached dsts too */
		rt = rcu_dereference(fnhe->fnhe_rth_input);
		if (rt)
			fill_route_from_fnhe(rt, fnhe);
		rt = rcu_dereference(fnhe->fnhe_rth_output);
		if (rt)
			fill_route_from_fnhe(rt, fnhe);
	} else {
		/* Randomize max depth to avoid some side channels attacks.
*/
		int max_depth = FNHE_RECLAIM_DEPTH +
				get_random_u32_below(FNHE_RECLAIM_DEPTH);

		/* Evict the oldest entries until the chain is short enough. */
		while (depth > max_depth) {
			fnhe_remove_oldest(hash);
			depth--;
		}

		fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
		if (!fnhe)
			goto out_unlock;

		/* Fully initialise the entry before publishing it at the head. */
		fnhe->fnhe_next = hash->chain;

		fnhe->fnhe_genid = genid;
		fnhe->fnhe_daddr = daddr;
		fnhe->fnhe_gw = gw;
		fnhe->fnhe_pmtu = pmtu;
		fnhe->fnhe_mtu_locked = lock;
		fnhe->fnhe_expires = max(1UL, expires);

		rcu_assign_pointer(hash->chain, fnhe);

		/* Exception created; mark the cached routes for the nexthop
		 * stale, so anyone caching it rechecks if this exception
		 * applies to them.
		 */
		rt = rcu_dereference(nhc->nhc_rth_input);
		if (rt)
			rt->dst.obsolete = DST_OBSOLETE_KILL;

		for_each_possible_cpu(i) {
			struct rtable __rcu **prt;

			prt = per_cpu_ptr(nhc->nhc_pcpu_rth_output, i);
			rt = rcu_dereference(*prt);
			if (rt)
				rt->dst.obsolete = DST_OBSOLETE_KILL;
		}
	}

	/* Both the update and the create path refresh the stamp. */
	fnhe->fnhe_stamp = jiffies;

out_unlock:
	spin_unlock_bh(&fnhe_lock);
}

/*
 * Process an ICMP redirect carried in @skb for route @rt.
 *
 * The new gateway is taken from the ICMP header and the old gateway from
 * the source address of the IP header.  The redirect is ignored unless its
 * code is one of the four ICMP_REDIR_* values, the route currently uses the
 * old gateway, and the new gateway passes the per-device acceptance checks
 * below.  When @kill_route is true an accepted redirect also marks @rt
 * obsolete.
 */
static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
			     bool kill_route)
{
	__be32 new_gw = icmp_hdr(skb)->un.gateway;
	__be32 old_gw = ip_hdr(skb)->saddr;
	struct net_device *dev = skb->dev;
	struct in_device *in_dev;
	struct fib_result res;
	struct neighbour *n;
	struct net *net;

	switch (icmp_hdr(skb)->code & 7) {
	case ICMP_REDIR_NET:
	case ICMP_REDIR_NETTOS:
	case ICMP_REDIR_HOST:
	case ICMP_REDIR_HOSTTOS:
		break;

	default:
		return;
	}

	/* Only the gateway the route currently uses may redirect it. */
	if (rt->rt_gw_family != AF_INET || rt->rt_gw4 != old_gw)
		return;

	in_dev = __in_dev_get_rcu(dev);
	if (!in_dev)
		return;

	net = dev_net(dev);
	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
	    ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
	    ipv4_is_zeronet(new_gw))
		goto reject_redirect;

	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
			goto reject_redirect;
		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
			goto reject_redirect;
	} else {
		if (inet_addr_type(net, new_gw) != RTN_UNICAST)
			goto reject_redirect;
	}

	n =
__ipv4_neigh_lookup(rt->dst.dev, (__force u32)new_gw);
	if (!n)
		n = neigh_create(&arp_tbl, &new_gw, rt->dst.dev);
	if (!IS_ERR(n)) {
		if (!(READ_ONCE(n->nud_state) & NUD_VALID)) {
			/* Neighbour not valid yet: trigger resolution only;
			 * no exception is recorded on this pass.
			 */
			neigh_event_send(n, NULL);
		} else {
			if (fib_lookup(net, fl4, &res, 0) == 0) {
				struct fib_nh_common *nhc;

				fib_select_path(net, &res, fl4, skb);
				nhc = FIB_RES_NHC(res);
				/* Record the new gateway as a nexthop exception. */
				update_or_create_fnhe(nhc, fl4->daddr, new_gw,
						0, false,
						jiffies + ip_rt_gc_timeout);
			}
			if (kill_route)
				rt->dst.obsolete = DST_OBSOLETE_KILL;
			call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
		}
		neigh_release(n);
	}
	return;

reject_redirect:
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev)) {
		const struct iphdr *iph = (const struct iphdr *) skb->data;
		__be32 daddr = iph->daddr;
		__be32 saddr = iph->saddr;

		net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
				     " Advised path = %pI4 -> %pI4\n",
				     &old_gw, dev->name, &new_gw,
				     &saddr, &daddr);
	}
#endif
	/* Empty statement: the label needs one when the block above is compiled out. */
	;
}

/*
 * redirect callback: build the flow key from the IP header found at
 * skb->data (with socket overrides if @sk is set) and process the redirect,
 * marking the route obsolete if it is accepted.
 */
static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
{
	struct rtable *rt;
	struct flowi4 fl4;
	const struct iphdr *iph = (const struct iphdr *) skb->data;
	struct net *net = dev_net(skb->dev);
	int oif = skb->dev->ifindex;
	u8 prot = iph->protocol;
	u32 mark = skb->mark;
	__u8 tos = iph->tos;

	rt = dst_rtable(dst);

	__build_flow_key(net, &fl4, sk, iph, oif, tos, prot, mark, 0);
	__ip_do_redirect(rt, skb, &fl4, true);
}

/*
 * negative_advice callback: drop the socket's cached dst when the route is
 * obsolete (obsolete > 0), was created by a redirect, or has an expiry set.
 */
static void ipv4_negative_advice(struct sock *sk,
				 struct dst_entry *dst)
{
	struct rtable *rt = dst_rtable(dst);

	if ((dst->obsolete > 0) ||
	    (rt->rt_flags & RTCF_REDIRECTED) ||
	    rt->dst.expires)
		sk_dst_reset(sk);
}

/*
 * Algorithm:
 *	1. The first ip_rt_redirect_number redirects are sent
 *	   with exponential backoff, then we stop sending them at all,
 *	   assuming that the host ignores our redirects.
 *	2. If we did not see packets requiring redirects
 *	   during ip_rt_redirect_silence, we assume that the host
 *	   forgot redirected route and start to send redirects again.
*
 * This algorithm is much cheaper and more intelligent than dumb load limiting
 * in icmp.c.
 *
 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 * and "frag. need" (breaks PMTU discovery) in icmp.c.
 */

void ip_rt_send_redirect(struct sk_buff *skb)
{
	struct rtable *rt = skb_rtable(skb);
	struct in_device *in_dev;
	struct inet_peer *peer;
	struct net *net;
	int log_martians;
	int vif;

	rcu_read_lock();
	in_dev = __in_dev_get_rcu(rt->dst.dev);
	/* Nothing to do if the device has no IPv4 config or must not send redirects. */
	if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
		rcu_read_unlock();
		return;
	}
	log_martians = IN_DEV_LOG_MARTIANS(in_dev);
	vif = l3mdev_master_ifindex_rcu(rt->dst.dev);

	net = dev_net(rt->dst.dev);
	peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif);
	if (!peer) {
		/* No peer entry to rate-limit against: send unconditionally. */
		rcu_read_unlock();
		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
			  rt_nexthop(rt, ip_hdr(skb)->daddr));
		return;
	}

	/* No redirected packets during ip_rt_redirect_silence;
	 * reset the algorithm.
	 */
	if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence)) {
		peer->rate_tokens = 0;
		peer->n_redirects = 0;
	}

	/* Too many ignored redirects; do not send anything and
	 * set peer->rate_last to the last seen redirected packet.
	 */
	if (peer->n_redirects >= ip_rt_redirect_number) {
		peer->rate_last = jiffies;
		goto out_unlock;
	}

	/* Check for load limit; set rate_last to the latest sent
	 * redirect.
*/
	/* The wait before the next redirect doubles with every one sent:
	 * ip_rt_redirect_load << n_redirects.
	 */
	if (peer->n_redirects == 0 ||
	    time_after(jiffies,
		       (peer->rate_last +
			(ip_rt_redirect_load << peer->n_redirects)))) {
		__be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);

		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
		peer->rate_last = jiffies;
		++peer->n_redirects;
		/* Log once, when the limit of redirects has just been reached. */
		if (IS_ENABLED(CONFIG_IP_ROUTE_VERBOSE) && log_martians &&
		    peer->n_redirects == ip_rt_redirect_number)
			net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
					     &ip_hdr(skb)->saddr, inet_iif(skb),
					     &ip_hdr(skb)->daddr, &gw);
	}
out_unlock:
	rcu_read_unlock();
}

/*
 * Handle a packet whose route carries an error (rt->dst.error).
 *
 * Depending on the error, an ICMP destination-unreachable message may be
 * sent back, rate-limited per source address through the inet_peer token
 * bucket (ip_rt_error_cost per message, capped at ip_rt_error_burst).
 * The skb is always freed and 0 is returned.
 */
static int ip_error(struct sk_buff *skb)
{
	struct rtable *rt = skb_rtable(skb);
	struct net_device *dev = skb->dev;
	struct in_device *in_dev;
	struct inet_peer *peer;
	unsigned long now;
	struct net *net;
	SKB_DR(reason);
	bool send;
	int code;

	/* For an L3 master device, use the device recorded in IPCB(skb)->iif. */
	if (netif_is_l3_master(skb->dev)) {
		dev = __dev_get_by_index(dev_net(skb->dev), IPCB(skb)->iif);
		if (!dev)
			goto out;
	}

	in_dev = __in_dev_get_rcu(dev);

	/* IP on this device is disabled. */
	if (!in_dev)
		goto out;

	net = dev_net(rt->dst.dev);
	if (!IN_DEV_FORWARD(in_dev)) {
		/* Forwarding is off: only account the drop, never send ICMP. */
		switch (rt->dst.error) {
		case EHOSTUNREACH:
			SKB_DR_SET(reason, IP_INADDRERRORS);
			__IP_INC_STATS(net, IPSTATS_MIB_INADDRERRORS);
			break;

		case ENETUNREACH:
			SKB_DR_SET(reason, IP_INNOROUTES);
			__IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
			break;
		}
		goto out;
	}

	/* Map the route error to an ICMP code; anything else is dropped silently. */
	switch (rt->dst.error) {
	case EINVAL:
	default:
		goto out;
	case EHOSTUNREACH:
		code = ICMP_HOST_UNREACH;
		break;
	case ENETUNREACH:
		code = ICMP_NET_UNREACH;
		SKB_DR_SET(reason, IP_INNOROUTES);
		__IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
		break;
	case EACCES:
		code = ICMP_PKT_FILTERED;
		break;
	}

	rcu_read_lock();
	peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr,
			       l3mdev_master_ifindex_rcu(skb->dev));
	send = true;
	if (peer) {
		/* Refill tokens by the elapsed jiffies, capped at the burst size. */
		now = jiffies;
		peer->rate_tokens += now - peer->rate_last;
		if (peer->rate_tokens > ip_rt_error_burst)
			peer->rate_tokens = ip_rt_error_burst;
		peer->rate_last = now;
		if (peer->rate_tokens >= ip_rt_error_cost)
			peer->rate_tokens -= ip_rt_error_cost;
		else
			send = false;
}
	rcu_read_unlock();

	if (send)
		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);

out:	kfree_skb_reason(skb, reason);
	return 0;
}

/*
 * Record a new path MTU for the flow @fl4 reached through @rt.
 *
 * Nothing is done when the route's MTU is locked or when @mtu is larger
 * than the current MTU.  A value below the netns' ip_rt_min_pmtu is
 * replaced by min(current MTU, ip_rt_min_pmtu) and stored as locked.  The
 * update is also skipped when the same PMTU is already set, is not being
 * locked, and less than half of ip_rt_mtu_expires has elapsed.
 *
 * The PMTU is stored as a nexthop exception; with multipath routes it is
 * stored on every path of the fib_info.
 */
static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
{
	struct dst_entry *dst = &rt->dst;
	struct net *net = dev_net(dst->dev);
	struct fib_result res;
	bool lock = false;
	u32 old_mtu;

	if (ip_mtu_locked(dst))
		return;

	old_mtu = ipv4_mtu(dst);
	if (old_mtu < mtu)
		return;

	if (mtu < net->ipv4.ip_rt_min_pmtu) {
		lock = true;
		mtu = min(old_mtu, net->ipv4.ip_rt_min_pmtu);
	}

	if (rt->rt_pmtu == mtu && !lock &&
	    time_before(jiffies, dst->expires - net->ipv4.ip_rt_mtu_expires / 2))
		return;

	rcu_read_lock();
	if (fib_lookup(net, fl4, &res, 0) == 0) {
		struct fib_nh_common *nhc;

		fib_select_path(net, &res, fl4, NULL);
#ifdef CONFIG_IP_ROUTE_MULTIPATH
		if (fib_info_num_path(res.fi) > 1) {
			int nhsel;

			for (nhsel = 0; nhsel < fib_info_num_path(res.fi); nhsel++) {
				nhc = fib_info_nhc(res.fi, nhsel);
				update_or_create_fnhe(nhc, fl4->daddr, 0, mtu, lock,
						      jiffies + net->ipv4.ip_rt_mtu_expires);
			}
			rcu_read_unlock();
			return;
		}
#endif /* CONFIG_IP_ROUTE_MULTIPATH */
		nhc = FIB_RES_NHC(res);
		update_or_create_fnhe(nhc, fl4->daddr, 0, mtu, lock,
				      jiffies + net->ipv4.ip_rt_mtu_expires);
	}
	rcu_read_unlock();
}

/*
 * update_pmtu callback: build the flow key from @skb (or from @sk when
 * there is no skb) and record the new PMTU.  @confirm_neigh is not used
 * by this implementation.
 */
static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
			      struct sk_buff *skb, u32 mtu,
			      bool confirm_neigh)
{
	struct rtable *rt = dst_rtable(dst);
	struct flowi4 fl4;

	ip_rt_build_flow_key(&fl4, sk, skb);

	/* Don't make lookup fail for bridged encapsulations */
	if (skb && netif_is_any_bridge_port(skb->dev))
		fl4.flowi4_oif = 0;

	__ip_rt_update_pmtu(rt, &fl4, mtu);
}

/*
 * Record a new PMTU for the flow described by the IP header at skb->data.
 * The output route is looked up in @net with @oif, @protocol and a mark
 * derived through IP4_REPLY_MARK(); lookup failures are ignored.
 */
void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
		      int oif, u8 protocol)
{
	const struct iphdr *iph = (const struct iphdr *)skb->data;
	struct flowi4 fl4;
	struct rtable *rt;
	u32 mark = IP4_REPLY_MARK(net, skb->mark);

	__build_flow_key(net, &fl4, NULL, iph, oif, iph->tos, protocol, mark,
			 0);
	rt = __ip_route_output_key(net, &fl4);
	if (!IS_ERR(rt)) {
		__ip_rt_update_pmtu(rt,
&fl4, mtu);
		ip_rt_put(rt);
	}
}
EXPORT_SYMBOL_GPL(ipv4_update_pmtu);

/*
 * Socket flavour of ipv4_update_pmtu() that does not touch the socket's
 * cached dst: the key comes from @sk and the IP header at skb->data, and
 * the mark falls back to IP4_REPLY_MARK() when the socket has none.
 */
static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
{
	const struct iphdr *iph = (const struct iphdr *)skb->data;
	struct flowi4 fl4;
	struct rtable *rt;

	__build_flow_key(sock_net(sk), &fl4, sk, iph, 0, 0, 0, 0, 0);

	if (!fl4.flowi4_mark)
		fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark);

	rt = __ip_route_output_key(sock_net(sk), &fl4);
	if (!IS_ERR(rt)) {
		__ip_rt_update_pmtu(rt, &fl4, mtu);
		ip_rt_put(rt);
	}
}

/*
 * Record a new PMTU for @sk and refresh the socket's cached route.
 *
 * Runs with the socket bh-locked.  Does nothing if the socket does not
 * accept PMTU updates.  If the socket is owned by user context or has no
 * cached dst, only the exception is updated and the cached dst is left
 * alone.  Otherwise the PMTU is applied through the cached (or freshly
 * looked-up) route, and if that route no longer validates a new one is
 * looked up and installed on the socket.
 */
void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
{
	const struct iphdr *iph = (const struct iphdr *)skb->data;
	struct flowi4 fl4;
	struct rtable *rt;
	struct dst_entry *odst = NULL;
	bool new = false;
	struct net *net = sock_net(sk);

	bh_lock_sock(sk);

	if (!ip_sk_accept_pmtu(sk))
		goto out;

	odst = sk_dst_get(sk);

	if (sock_owned_by_user(sk) || !odst) {
		__ipv4_sk_update_pmtu(skb, sk, mtu);
		goto out;
	}

	__build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);

	rt = dst_rtable(odst);
	/* Cached dst is stale: look up a fresh route to apply the PMTU to. */
	if (odst->obsolete && !odst->ops->check(odst, 0)) {
		rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
		if (IS_ERR(rt))
			goto out;

		new = true;
	}

	__ip_rt_update_pmtu(dst_rtable(xfrm_dst_path(&rt->dst)), &fl4, mtu);

	/* The update may have invalidated the route; if so, look it up again. */
	if (!dst_check(&rt->dst, 0)) {
		if (new)
			dst_release(&rt->dst);

		rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
		if (IS_ERR(rt))
			goto out;

		new = true;
	}

	if (new)
		sk_dst_set(sk, &rt->dst);

out:
	bh_unlock_sock(sk);
	/* Drops the reference taken by sk_dst_get(); odst may still be NULL here. */
	dst_release(odst);
}
EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);

/*
 * Process an ICMP redirect for the flow described by the IP header at
 * skb->data, looked up in @net with @oif and @protocol.  The route used for
 * the lookup is not marked obsolete (kill_route is false).
 */
void ipv4_redirect(struct sk_buff *skb, struct net *net,
		   int oif, u8 protocol)
{
	const struct iphdr *iph = (const struct iphdr *)skb->data;
	struct flowi4 fl4;
	struct rtable *rt;

	__build_flow_key(net, &fl4, NULL, iph, oif, iph->tos, protocol, 0, 0);
	rt = __ip_route_output_key(net, &fl4);
	if (!IS_ERR(rt)) {
		__ip_do_redirect(rt, skb, &fl4, false);
		ip_rt_put(rt);
	}
}
EXPORT_SYMBOL_GPL(ipv4_redirect);

void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
{
	const struct iphdr *iph = (const struct
iphdr *)skb->data;
	struct flowi4 fl4;
	struct rtable *rt;
	struct net *net = sock_net(sk);

	/* Socket flavour of ipv4_redirect(): the flow key comes from @sk. */
	__build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
	rt = __ip_route_output_key(net, &fl4);
	if (!IS_ERR(rt)) {
		__ip_do_redirect(rt, skb, &fl4, false);
		ip_rt_put(rt);
	}
}
EXPORT_SYMBOL_GPL(ipv4_sk_redirect);

/*
 * check callback: return @dst if it is still usable, NULL otherwise.
 * @cookie is not used by this implementation.
 */
INDIRECT_CALLABLE_SCOPE struct dst_entry *ipv4_dst_check(struct dst_entry *dst,
							 u32 cookie)
{
	struct rtable *rt = dst_rtable(dst);

	/* All IPV4 dsts are created with ->obsolete set to the value
	 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
	 * into this function always.
	 *
	 * When a PMTU/redirect information update invalidates a route,
	 * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
	 * DST_OBSOLETE_DEAD.
	 */
	if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
		return NULL;
	return dst;
}
EXPORT_INDIRECT_CALLABLE(ipv4_dst_check);

/*
 * Send an ICMP host-unreachable for @skb, using IP options that are
 * re-parsed from the packet itself.  Returns silently if the header cannot
 * be pulled, is not a sane IPv4 header, or its options fail to compile.
 */
static void ipv4_send_dest_unreach(struct sk_buff *skb)
{
	struct net_device *dev;
	struct ip_options opt;
	int res;

	/* Recompile ip options since IPCB may not be valid anymore.
	 * Also check we have a reasonable ipv4 header.
	 */
	if (!pskb_network_may_pull(skb, sizeof(struct iphdr)) ||
	    ip_hdr(skb)->version != 4 || ip_hdr(skb)->ihl < 5)
		return;

	memset(&opt, 0, sizeof(opt));
	/* ihl > 5 means the header carries options after the fixed part. */
	if (ip_hdr(skb)->ihl > 5) {
		if (!pskb_network_may_pull(skb, ip_hdr(skb)->ihl * 4))
			return;
		opt.optlen = ip_hdr(skb)->ihl * 4 - sizeof(struct iphdr);

		rcu_read_lock();
		dev = skb->dev ? skb->dev : skb_rtable(skb)->dst.dev;
		res = __ip_options_compile(dev_net(dev), &opt, skb, NULL);
		rcu_read_unlock();

		if (res)
			return;
	}
	__icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0, &opt);
}

/*
 * link_failure callback: report host-unreachable to the sender and, if the
 * skb has a route attached, call dst_set_expires() on it with a timeout of 0.
 */
static void ipv4_link_failure(struct sk_buff *skb)
{
	struct rtable *rt;

	ipv4_send_dest_unreach(skb);

	rt = skb_rtable(skb);
	if (rt)
		dst_set_expires(&rt->dst, 0);
}

static int ip_rt_bug(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	pr_debug("%s: %pI4 -> %pI4, %s\n",
		 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
		 skb->dev ?
skb->dev->name : "?");
	/* Reaching this handler is treated as a bug: drop the packet and warn. */
	kfree_skb(skb);
	WARN_ON(1);
	return 0;
}

/*
 * We do not cache source address of outgoing interface,
 * because it is used only by IP RR, TS and SRR options,
 * so it is out of the fast path.
 *
 * BTW remember: "addr" is allowed to be not aligned
 * in IP options!
 */

void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
{
	__be32 src;

	if (rt_is_output_route(rt))
		src = ip_hdr(skb)->saddr;
	else {
		/* Input route: ask the FIB for the preferred source, falling
		 * back to an address selected on the output device.
		 */
		struct fib_result res;
		struct iphdr *iph = ip_hdr(skb);
		struct flowi4 fl4 = {
			.daddr = iph->daddr,
			.saddr = iph->saddr,
			.flowi4_tos = inet_dscp_to_dsfield(ip4h_dscp(iph)),
			.flowi4_oif = rt->dst.dev->ifindex,
			.flowi4_iif = skb->dev->ifindex,
			.flowi4_mark = skb->mark,
		};

		rcu_read_lock();
		if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0)
			src = fib_result_prefsrc(dev_net(rt->dst.dev), &res);
		else
			src = inet_select_addr(rt->dst.dev,
					       rt_nexthop(rt, iph->daddr),
					       RT_SCOPE_UNIVERSE);
		rcu_read_unlock();
	}
	/* memcpy() because addr may be unaligned. */
	memcpy(addr, &src, 4);
}

#ifdef CONFIG_IP_ROUTE_CLASSID
/*
 * Merge @tag into the route's tclassid: each 16-bit half of the class id is
 * filled from @tag only if that half is currently zero.
 */
static void set_class_tag(struct rtable *rt, u32 tag)
{
	if (!(rt->dst.tclassid & 0xFFFF))
		rt->dst.tclassid |= tag & 0xFFFF;
	if (!(rt->dst.tclassid & 0xFFFF0000))
		rt->dst.tclassid |= tag & 0xFFFF0000;
}
#endif

/*
 * default_advmss callback: the route MTU minus the TCP and IP header sizes,
 * clamped to at least the netns' ip_rt_min_advmss and at most
 * IPV4_MAX_PMTU minus those headers.
 */
static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
{
	struct net *net = dev_net(dst->dev);
	unsigned int header_size = sizeof(struct tcphdr) + sizeof(struct iphdr);
	unsigned int advmss = max_t(unsigned int, ipv4_mtu(dst) - header_size,
				    net->ipv4.ip_rt_min_advmss);

	return min(advmss, IPV4_MAX_PMTU - header_size);
}

/* mtu callback: MTU of @dst, computed with the "forwarding" flag false. */
INDIRECT_CALLABLE_SCOPE unsigned int ipv4_mtu(const struct dst_entry *dst)
{
	return ip_dst_mtu_maybe_forward(dst, false);
}
EXPORT_INDIRECT_CALLABLE(ipv4_mtu);

/*
 * Remove the nexthop exception for @daddr from @nhc, if there is one.
 * Takes fnhe_lock itself.
 */
static void ip_del_fnhe(struct fib_nh_common *nhc, __be32 daddr)
{
	struct fnhe_hash_bucket *hash;
	struct fib_nh_exception *fnhe, __rcu **fnhe_p;
	u32 hval = fnhe_hashfun(daddr);

	spin_lock_bh(&fnhe_lock);

	hash = rcu_dereference_protected(nhc->nhc_exceptions,
					 lockdep_is_held(&fnhe_lock));
	hash += hval;
fnhe_p = &hash->chain; fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock)); while (fnhe) { if (fnhe->fnhe_daddr == daddr) { rcu_assign_pointer(*fnhe_p, rcu_dereference_protected( fnhe->fnhe_next, lockdep_is_held(&fnhe_lock))); /* set fnhe_daddr to 0 to ensure it won't bind with * new dsts in rt_bind_exception(). */ fnhe->fnhe_daddr = 0; fnhe_flush_routes(fnhe); kfree_rcu(fnhe, rcu); break; } fnhe_p = &fnhe->fnhe_next; fnhe = rcu_dereference_protected(fnhe->fnhe_next, lockdep_is_held(&fnhe_lock)); } spin_unlock_bh(&fnhe_lock); } static struct fib_nh_exception *find_exception(struct fib_nh_common *nhc, __be32 daddr) { struct fnhe_hash_bucket *hash = rcu_dereference(nhc->nhc_exceptions); struct fib_nh_exception *fnhe; u32 hval; if (!hash) return NULL; hval = fnhe_hashfun(daddr); for (fnhe = rcu_dereference(hash[hval].chain); fnhe; fnhe = rcu_dereference(fnhe->fnhe_next)) { if (fnhe->fnhe_daddr == daddr) { if (fnhe->fnhe_expires && time_after(jiffies, fnhe->fnhe_expires)) { ip_del_fnhe(nhc, daddr); break; } return fnhe; } } return NULL; } /* MTU selection: * 1. mtu on route is locked - use it * 2. mtu from nexthop exception * 3. 
mtu from egress device */

/*
 * Compute the MTU to use for @daddr given FIB lookup result @res.
 *
 * fi->fib_mtu is taken when the ip_fwd_use_pmtu sysctl is set or the MTU
 * metric is locked; otherwise a non-expired nexthop exception's PMTU;
 * otherwise the device MTU capped at IP_MAX_MTU. The lwtunnel headroom
 * for the nexthop is subtracted from the result.
 */
u32 ip_mtu_from_fib_result(struct fib_result *res, __be32 daddr)
{
	struct fib_nh_common *nhc = res->nhc;
	struct net_device *dev = nhc->nhc_dev;
	struct fib_info *fi = res->fi;
	u32 mtu = 0;

	if (READ_ONCE(dev_net(dev)->ipv4.sysctl_ip_fwd_use_pmtu) ||
	    fi->fib_metrics->metrics[RTAX_LOCK - 1] & (1 << RTAX_MTU))
		mtu = fi->fib_mtu;

	if (likely(!mtu)) {
		struct fib_nh_exception *fnhe;

		fnhe = find_exception(nhc, daddr);
		if (fnhe && !time_after_eq(jiffies, fnhe->fnhe_expires))
			mtu = fnhe->fnhe_pmtu;
	}

	if (likely(!mtu))
		mtu = min(READ_ONCE(dev->mtu), IP_MAX_MTU);

	return mtu - lwtunnel_headroom(nhc->nhc_lwtstate, mtu);
}

/*
 * Bind route @rt to nexthop exception @fnhe, under fnhe_lock.
 *
 * Nothing happens unless @fnhe still describes @daddr. If the
 * exception's generation id is stale, its gateway/PMTU/expiry state is
 * reset and its cached routes are flushed first. The route is then
 * filled from the exception and, when @do_cache is set, stored in the
 * exception's input or output slot, replacing any previous route.
 *
 * Returns true only if @rt was stored in the exception.
 */
static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
			      __be32 daddr, const bool do_cache)
{
	bool ret = false;

	spin_lock_bh(&fnhe_lock);

	if (daddr == fnhe->fnhe_daddr) {
		struct rtable __rcu **porig;
		struct rtable *orig;
		int genid = fnhe_genid(dev_net(rt->dst.dev));

		if (rt_is_input_route(rt))
			porig = &fnhe->fnhe_rth_input;
		else
			porig = &fnhe->fnhe_rth_output;
		orig = rcu_dereference(*porig);

		if (fnhe->fnhe_genid != genid) {
			/* Stale generation: forget everything learned. */
			fnhe->fnhe_genid = genid;
			fnhe->fnhe_gw = 0;
			fnhe->fnhe_pmtu = 0;
			fnhe->fnhe_expires = 0;
			fnhe->fnhe_mtu_locked = false;
			fnhe_flush_routes(fnhe);
			orig = NULL;
		}
		fill_route_from_fnhe(rt, fnhe);
		if (!rt->rt_gw4) {
			rt->rt_gw4 = daddr;
			rt->rt_gw_family = AF_INET;
		}

		if (do_cache) {
			/* The slot holds its own reference on rt. */
			dst_hold(&rt->dst);
			rcu_assign_pointer(*porig, rt);
			if (orig) {
				dst_dev_put(&orig->dst);
				dst_release(&orig->dst);
			}
			ret = true;
		}

		fnhe->fnhe_stamp = jiffies;
	}
	spin_unlock_bh(&fnhe_lock);

	return ret;
}

/*
 * Try to install @rt as the cached route of nexthop @nhc: the shared
 * input slot for input routes, this CPU's output slot otherwise.
 *
 * The slot is swapped with cmpxchg(). On success the displaced route,
 * if any, is moved to the uncached list and the slot's reference on it
 * is dropped. If the slot changed underneath us the extra reference on
 * @rt is dropped again and false is returned.
 */
static bool rt_cache_route(struct fib_nh_common *nhc, struct rtable *rt)
{
	struct rtable *orig, *prev, **p;
	bool ret = true;

	if (rt_is_input_route(rt)) {
		p = (struct rtable **)&nhc->nhc_rth_input;
	} else {
		p = (struct rtable **)raw_cpu_ptr(nhc->nhc_pcpu_rth_output);
	}
	orig = *p;

	/* hold dst before doing cmpxchg() to avoid race condition
	 * on this dst
	 */
	dst_hold(&rt->dst);
	prev = cmpxchg(p, orig, rt);
	if (prev == orig) {
		if (orig) {
			rt_add_uncached_list(orig);
			dst_release(&orig->dst);
		}
	} else {
		dst_release(&rt->dst);
		ret = false;
	}

	return ret;
}

/* Per-CPU list of routes that are not held in a nexthop cache slot. */
struct uncached_list {
	spinlock_t lock;
	struct list_head head;
};

static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list);

/*
 * Append @rt to the current CPU's uncached list and remember which list
 * it is on so that rt_del_uncached_list() can find the right lock.
 */
void rt_add_uncached_list(struct rtable *rt)
{
	struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list);

	rt->dst.rt_uncached_list = ul;

	spin_lock_bh(&ul->lock);
	list_add_tail(&rt->dst.rt_uncached, &ul->head);
	spin_unlock_bh(&ul->lock);
}

/* Unlink @rt from the uncached list it was added to, if it is on one. */
void rt_del_uncached_list(struct rtable *rt)
{
	if (!list_empty(&rt->dst.rt_uncached)) {
		struct uncached_list *ul = rt->dst.rt_uncached_list;

		spin_lock_bh(&ul->lock);
		list_del_init(&rt->dst.rt_uncached);
		spin_unlock_bh(&ul->lock);
	}
}

/* dst destructor: release the metrics and leave the uncached list. */
static void ipv4_dst_destroy(struct dst_entry *dst)
{
	ip_dst_metrics_put(dst);
	rt_del_uncached_list(dst_rtable(dst));
}

/*
 * Detach every uncached route from @dev.
 *
 * Each possible CPU's uncached list is walked; routes whose dst.dev is
 * @dev are re-pointed at blackhole_netdev (moving the tracked device
 * reference along) and removed from the list.
 */
void rt_flush_dev(struct net_device *dev)
{
	struct rtable *rt, *safe;
	int cpu;

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);

		/* Unlocked peek; skip CPUs whose list looks empty. */
		if (list_empty(&ul->head))
			continue;

		spin_lock_bh(&ul->lock);
		list_for_each_entry_safe(rt, safe, &ul->head, dst.rt_uncached) {
			if (rt->dst.dev != dev)
				continue;
			rt->dst.dev = blackhole_netdev;
			netdev_ref_replace(dev, blackhole_netdev,
					   &rt->dst.dev_tracker, GFP_ATOMIC);
			list_del_init(&rt->dst.rt_uncached);
		}
		spin_unlock_bh(&ul->lock);
	}
}

/*
 * A cached route is usable when it exists, is still marked
 * DST_OBSOLETE_FORCE_CHK and rt_is_expired() does not report it expired.
 */
static bool rt_cache_valid(const struct rtable *rt)
{
	return rt &&
		rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
		!rt_is_expired(rt);
}

/*
 * Fill in the nexthop-derived parts of a freshly allocated route.
 *
 * With a fib_info (@fi) this copies the gateway, metrics, class id and
 * lwtunnel state from the selected nexthop and then tries to cache the
 * route, either in the nexthop exception @fnhe or in the nexthop itself
 * (only when @do_cache). Routes that end up not cached, and all routes
 * without @fi, are put on the uncached list. @type is not used in the
 * body.
 */
static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
			   const struct fib_result *res,
			   struct fib_nh_exception *fnhe,
			   struct fib_info *fi, u16 type, u32 itag,
			   const bool do_cache)
{
	bool cached = false;

	if (fi) {
		struct fib_nh_common *nhc = FIB_RES_NHC(*res);

		if (nhc->nhc_gw_family && nhc->nhc_scope == RT_SCOPE_LINK) {
			rt->rt_uses_gateway = 1;
			rt->rt_gw_family = nhc->nhc_gw_family;
			/* only INET and INET6 are supported */
			if (likely(nhc->nhc_gw_family == AF_INET))
				rt->rt_gw4 = nhc->nhc_gw.ipv4;
			else
				rt->rt_gw6 = nhc->nhc_gw.ipv6;
		}

		ip_dst_init_metrics(&rt->dst, fi->fib_metrics);

#ifdef CONFIG_IP_ROUTE_CLASSID
		if (nhc->nhc_family == AF_INET) {
			struct fib_nh *nh;

			nh = container_of(nhc, struct fib_nh, nh_common);
			rt->dst.tclassid = nh->nh_tclassid;
		}
#endif
		rt->dst.lwtstate = lwtstate_get(nhc->nhc_lwtstate);
		if (unlikely(fnhe))
			cached = rt_bind_exception(rt, fnhe, daddr, do_cache);
		else if (do_cache)
			cached = rt_cache_route(nhc, rt);
		if (unlikely(!cached)) {
			/* Routes we intend to cache in nexthop exception or
			 * FIB nexthop have the DST_NOCACHE bit clear.
			 * However, if we are unsuccessful at storing this
			 * route into the cache we really need to set it.
			 */
			if (!rt->rt_gw4) {
				rt->rt_gw_family = AF_INET;
				rt->rt_gw4 = daddr;
			}
			rt_add_uncached_list(rt);
		}
	} else
		rt_add_uncached_list(rt);

#ifdef CONFIG_IP_ROUTE_CLASSID
#ifdef CONFIG_IP_MULTIPLE_TABLES
	set_class_tag(rt, res->tclassid);
#endif
	set_class_tag(rt, itag);
#endif
}

/*
 * Allocate an IPv4 route on @dev and initialise its rtable fields.
 *
 * @flags and @type are stored as rt_flags/rt_type; @noxfrm requests
 * DST_NOXFRM. dst.output is set to ip_output and, for RTCF_LOCAL
 * routes, dst.input to ip_local_deliver. Returns NULL if dst_alloc()
 * fails.
 */
struct rtable *rt_dst_alloc(struct net_device *dev,
			    unsigned int flags, u16 type,
			    bool noxfrm)
{
	struct rtable *rt;

	rt = dst_alloc(&ipv4_dst_ops, dev, DST_OBSOLETE_FORCE_CHK,
		       (noxfrm ? DST_NOXFRM : 0));

	if (rt) {
		rt->rt_genid = rt_genid_ipv4(dev_net(dev));
		rt->rt_flags = flags;
		rt->rt_type = type;
		rt->rt_is_input = 0;
		rt->rt_iif = 0;
		rt->rt_pmtu = 0;
		rt->rt_mtu_locked = 0;
		rt->rt_uses_gateway = 0;
		rt->rt_gw_family = 0;
		rt->rt_gw4 = 0;

		rt->dst.output = ip_output;
		if (flags & RTCF_LOCAL)
			rt->dst.input = ip_local_deliver;
	}

	return rt;
}
EXPORT_SYMBOL(rt_dst_alloc);

/*
 * Allocate a new route on @dev that copies the fields listed below from
 * @rt. The generation id is taken fresh from @dev's namespace and the
 * lwtunnel state gets an extra reference. rt_uses_gateway is not among
 * the copied fields. Returns NULL if dst_alloc() fails.
 */
struct rtable *rt_dst_clone(struct net_device *dev, struct rtable *rt)
{
	struct rtable *new_rt;

	new_rt = dst_alloc(&ipv4_dst_ops, dev, DST_OBSOLETE_FORCE_CHK,
			   rt->dst.flags);

	if (new_rt) {
		new_rt->rt_genid = rt_genid_ipv4(dev_net(dev));
		new_rt->rt_flags = rt->rt_flags;
		new_rt->rt_type = rt->rt_type;
		new_rt->rt_is_input = rt->rt_is_input;
		new_rt->rt_iif = rt->rt_iif;
		new_rt->rt_pmtu = rt->rt_pmtu;
		new_rt->rt_mtu_locked = rt->rt_mtu_locked;
		new_rt->rt_gw_family = rt->rt_gw_family;
		if (rt->rt_gw_family == AF_INET)
			new_rt->rt_gw4 = rt->rt_gw4;
		else if (rt->rt_gw_family == AF_INET6)
			new_rt->rt_gw6 = rt->rt_gw6;

		new_rt->dst.input = rt->dst.input;
		new_rt->dst.output = rt->dst.output;
		new_rt->dst.error = rt->dst.error;
		new_rt->dst.lastuse = jiffies;
		new_rt->dst.lwtstate = lwtstate_get(rt->dst.lwtstate);
	}
	return new_rt;
}
EXPORT_SYMBOL(rt_dst_clone);

/*
 * Validate the source address of a multicast packet.
 *
 * Returns SKB_NOT_DROPPED_YET when the source is acceptable, otherwise
 * the drop reason. A zero-net source is accepted only for a
 * local-multicast destination or an IGMP packet; any other source is
 * handed to fib_validate_source_reason(), which may set *@itag.
 *
 * called in rcu_read_lock() section
 */
enum skb_drop_reason
ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr,
		      dscp_t dscp, struct net_device *dev,
		      struct in_device *in_dev, u32 *itag)
{
	enum skb_drop_reason reason;

	/* Primary sanity checks. */
	if (!in_dev)
		return SKB_DROP_REASON_NOT_SPECIFIED;

	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
		return SKB_DROP_REASON_IP_INVALID_SOURCE;

	if (skb->protocol != htons(ETH_P_IP))
		return SKB_DROP_REASON_INVALID_PROTO;

	if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev))
		return SKB_DROP_REASON_IP_LOCALNET;

	if (ipv4_is_zeronet(saddr)) {
		if (!ipv4_is_local_multicast(daddr) &&
		    ip_hdr(skb)->protocol != IPPROTO_IGMP)
			return SKB_DROP_REASON_IP_INVALID_SOURCE;
	} else {
		reason = fib_validate_source_reason(skb, saddr, 0, dscp, 0,
						    dev, in_dev, itag);
		if (reason)
			return reason;
	}
	return SKB_NOT_DROPPED_YET;
}

/*
 * Build and attach an input route for a multicast packet.
 *
 * @our is non-zero when the group is joined locally, which adds
 * RTCF_LOCAL. The route is allocated on the namespace's loopback
 * device, marked as an input route and given ip_rt_bug as output
 * handler. Returns SKB_NOT_DROPPED_YET on success or a drop reason.
 *
 * called in rcu_read_lock() section
 */
static enum skb_drop_reason
ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
		  dscp_t dscp, struct net_device *dev, int our)
{
	struct in_device *in_dev = __in_dev_get_rcu(dev);
	unsigned int flags = RTCF_MULTICAST;
	enum skb_drop_reason reason;
	struct rtable *rth;
	u32 itag = 0;

	reason = ip_mc_validate_source(skb, daddr, saddr, dscp, dev, in_dev,
				       &itag);
	if (reason)
		return reason;

	if (our)
		flags |= RTCF_LOCAL;

	if (IN_DEV_ORCONF(in_dev, NOPOLICY))
		IPCB(skb)->flags |= IPSKB_NOPOLICY;

	rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST,
			   false);
	if (!rth)
		return SKB_DROP_REASON_NOMEM;

#ifdef CONFIG_IP_ROUTE_CLASSID
	rth->dst.tclassid = itag;
#endif
	rth->dst.output = ip_rt_bug;
	rth->rt_is_input= 1;

#ifdef CONFIG_IP_MROUTE
	if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
		rth->dst.input = ip_mr_input;
#endif
	RT_CACHE_STAT_INC(in_slow_mc);

	skb_dst_drop(skb);
	skb_dst_set(skb, &rth->dst);
	return SKB_NOT_DROPPED_YET;
}

/*
 * Account a packet with a martian source address and, when
 * CONFIG_IP_ROUTE_VERBOSE is enabled and martian logging is on for the
 * device, log it (rate limited) together with a hex dump of the
 * link-layer header if one is available.
 */
static void ip_handle_martian_source(struct net_device *dev,
				     struct in_device *in_dev,
				     struct sk_buff *skb,
				     __be32 daddr,
				     __be32 saddr)
{
	RT_CACHE_STAT_INC(in_martian_src);
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
		/*
		 * RFC1812 recommendation: if the source is martian,
		 * the only hint is the MAC header.
		 */
		pr_warn("martian source %pI4 from %pI4, on dev %s\n",
			&daddr, &saddr, dev->name);
		if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
			print_hex_dump(KERN_WARNING, "ll header: ",
				       DUMP_PREFIX_OFFSET, 16, 1,
				       skb_mac_header(skb),
				       dev->hard_header_len, false);
		}
	}
#endif
}

/*
 * Create (or reuse from cache) the forwarding route for an input packet
 * whose FIB lookup result is @res, and attach it to @skb.
 *
 * Returns SKB_NOT_DROPPED_YET on success, otherwise a drop reason; a
 * negative return from fib_validate_source() is negated and used as the
 * reason.
 *
 * called in rcu_read_lock() section
 */
static enum skb_drop_reason
__mkroute_input(struct sk_buff *skb, const struct fib_result *res,
		struct in_device *in_dev, __be32 daddr,
		__be32 saddr, dscp_t dscp)
{
	enum skb_drop_reason reason = SKB_DROP_REASON_NOT_SPECIFIED;
	struct fib_nh_common *nhc = FIB_RES_NHC(*res);
	struct net_device *dev = nhc->nhc_dev;
	struct fib_nh_exception *fnhe;
	struct rtable *rth;
	int err;
	struct in_device *out_dev;
	bool do_cache;
	u32 itag = 0;

	/* get a working reference to the output device */
	out_dev = __in_dev_get_rcu(dev);
	if (!out_dev) {
		net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
		return reason;
	}

	err = fib_validate_source(skb, saddr, daddr, dscp, FIB_RES_OIF(*res),
				  in_dev->dev, in_dev, &itag);
	if (err < 0) {
		reason = -err;
		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
					 saddr);

		goto cleanup;
	}

	/* Only routes backed by a fib_info and without a tag are cached. */
	do_cache = res->fi && !itag;
	if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
	    skb->protocol == htons(ETH_P_IP)) {
		__be32 gw;

		gw = nhc->nhc_gw_family == AF_INET ? nhc->nhc_gw.ipv4 : 0;
		if (IN_DEV_SHARED_MEDIA(out_dev) ||
		    inet_addr_onlink(out_dev, saddr, gw))
			IPCB(skb)->flags |= IPSKB_DOREDIRECT;
	}

	if (skb->protocol != htons(ETH_P_IP)) {
		/* Not IP (i.e. ARP). Do not create a route if it is
		 * invalid for proxy arp. DNAT routes are always valid.
		 *
		 * The proxy arp feature has been extended to allow ARP
		 * replies back to the same interface, to support
		 * Private VLAN switch technologies. See arp.c.
		 */
		if (out_dev == in_dev &&
		    IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
			reason = SKB_DROP_REASON_ARP_PVLAN_DISABLE;
			goto cleanup;
		}
	}

	if (IN_DEV_ORCONF(in_dev, NOPOLICY))
		IPCB(skb)->flags |= IPSKB_NOPOLICY;

	fnhe = find_exception(nhc, daddr);
	if (do_cache) {
		if (fnhe)
			rth = rcu_dereference(fnhe->fnhe_rth_input);
		else
			rth = rcu_dereference(nhc->nhc_rth_input);
		if (rt_cache_valid(rth)) {
			/* Cache hit: attach without taking a reference. */
			skb_dst_set_noref(skb, &rth->dst);
			goto out;
		}
	}

	rth = rt_dst_alloc(out_dev->dev, 0, res->type,
			   IN_DEV_ORCONF(out_dev, NOXFRM));
	if (!rth) {
		reason = SKB_DROP_REASON_NOMEM;
		goto cleanup;
	}

	rth->rt_is_input = 1;
	RT_CACHE_STAT_INC(in_slow_tot);

	rth->dst.input = ip_forward;

	rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag,
		       do_cache);
	lwtunnel_set_redirect(&rth->dst);
	skb_dst_set(skb, &rth->dst);
out:
	reason = SKB_NOT_DROPPED_YET;
cleanup:
	return reason;
}

#ifdef CONFIG_IP_ROUTE_MULTIPATH
/* To make ICMP packets follow the right flow, the multipath hash is
 * calculated from the inner IP addresses.
 *
 * The inner header is used only for a non-fragment-offset ICMP error
 * whose ICMP header and embedded IP header can both be read; in every
 * other case the outer addresses are used.
 */
static void ip_multipath_l3_keys(const struct sk_buff *skb,
				 struct flow_keys *hash_keys)
{
	const struct iphdr *outer_iph = ip_hdr(skb);
	const struct iphdr *key_iph = outer_iph;
	const struct iphdr *inner_iph;
	const struct icmphdr *icmph;
	struct iphdr _inner_iph;
	struct icmphdr _icmph;

	if (likely(outer_iph->protocol != IPPROTO_ICMP))
		goto out;

	if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0))
		goto out;

	icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph),
				   &_icmph);
	if (!icmph)
		goto out;

	if (!icmp_is_err(icmph->type))
		goto out;

	inner_iph = skb_header_pointer(skb,
				       outer_iph->ihl * 4 + sizeof(_icmph),
				       sizeof(_inner_iph), &_inner_iph);
	if (!inner_iph)
		goto out;

	key_iph = inner_iph;
out:
	hash_keys->addrs.v4addrs.src = key_iph->saddr;
	hash_keys->addrs.v4addrs.dst = key_iph->daddr;
}

/*
 * Custom-policy hash over the outer headers of @skb.
 *
 * Only the fields selected in the fib_multipath_hash_fields sysctl are
 * copied into the hash keys. Returns 0 without touching *@p_has_inner
 * when no outer field is selected; otherwise *@p_has_inner reports
 * whether the dissector saw an encapsulation.
 */
static u32 fib_multipath_custom_hash_outer(const struct net *net,
					   const struct sk_buff *skb,
					   bool *p_has_inner)
{
	u32 hash_fields = READ_ONCE(net->ipv4.sysctl_fib_multipath_hash_fields);
	struct flow_keys keys, hash_keys;

	if (!(hash_fields & FIB_MULTIPATH_HASH_FIELD_OUTER_MASK))
		return 0;

	memset(&hash_keys, 0, sizeof(hash_keys));
	skb_flow_dissect_flow_keys(skb, &keys, FLOW_DISSECTOR_F_STOP_AT_ENCAP);

	hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
	if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_IP)
		hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
	if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_IP)
		hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst;
	if (hash_fields & FIB_MULTIPATH_HASH_FIELD_IP_PROTO)
		hash_keys.basic.ip_proto = keys.basic.ip_proto;
	if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_PORT)
		hash_keys.ports.src = keys.ports.src;
	if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_PORT)
		hash_keys.ports.dst = keys.ports.dst;

	*p_has_inner = !!(keys.control.flags & FLOW_DIS_ENCAPSULATION);
	return fib_multipath_hash_from_keys(net, &hash_keys);
}

/*
 * Custom-policy hash over the inner (encapsulated) headers of @skb.
 * Returns 0 when there is no inner flow or no inner field is selected.
 */
static u32 fib_multipath_custom_hash_inner(const struct net *net,
					   const struct sk_buff *skb,
					   bool has_inner)
{
	u32 hash_fields = READ_ONCE(net->ipv4.sysctl_fib_multipath_hash_fields);
	struct flow_keys keys, hash_keys;

	/* We assume the packet carries an encapsulation, but if none was
	 * encountered during dissection of the outer flow, then there is no
	 * point in calling the flow dissector again.
	 */
	if (!has_inner)
		return 0;

	if (!(hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_MASK))
		return 0;

	memset(&hash_keys, 0, sizeof(hash_keys));
	skb_flow_dissect_flow_keys(skb, &keys, 0);

	if (!(keys.control.flags & FLOW_DIS_ENCAPSULATION))
		return 0;

	if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) {
		hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
		if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_SRC_IP)
			hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
		if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_DST_IP)
			hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst;
	} else if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS) {
		hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
		if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_SRC_IP)
			hash_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src;
		if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_DST_IP)
			hash_keys.addrs.v6addrs.dst = keys.addrs.v6addrs.dst;
		if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_FLOWLABEL)
			hash_keys.tags.flow_label = keys.tags.flow_label;
	}

	if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_IP_PROTO)
		hash_keys.basic.ip_proto = keys.basic.ip_proto;
	if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_SRC_PORT)
		hash_keys.ports.src = keys.ports.src;
	if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_DST_PORT)
		hash_keys.ports.dst = keys.ports.dst;

	return fib_multipath_hash_from_keys(net, &hash_keys);
}

/* Custom-policy hash for a packet: combines the outer and inner hashes. */
static u32 fib_multipath_custom_hash_skb(const struct net *net,
					 const struct sk_buff *skb)
{
	u32 mhash, mhash_inner;
	/* Stays true if the outer pass returns early without setting it. */
	bool has_inner = true;

	mhash = fib_multipath_custom_hash_outer(net, skb, &has_inner);
	mhash_inner = fib_multipath_custom_hash_inner(net, skb, has_inner);

	return jhash_2words(mhash, mhash_inner, 0);
}

/*
 * Custom-policy hash for a flow description (no packet available):
 * only the selected outer fields of @fl4 are hashed.
 */
static u32 fib_multipath_custom_hash_fl4(const struct net *net,
					 const struct flowi4 *fl4)
{
	u32 hash_fields = READ_ONCE(net->ipv4.sysctl_fib_multipath_hash_fields);
	struct flow_keys hash_keys;

	if (!(hash_fields & FIB_MULTIPATH_HASH_FIELD_OUTER_MASK))
		return 0;

	memset(&hash_keys, 0, sizeof(hash_keys));
	hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
	if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_IP)
		hash_keys.addrs.v4addrs.src = fl4->saddr;
	if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_IP)
		hash_keys.addrs.v4addrs.dst = fl4->daddr;
	if (hash_fields & FIB_MULTIPATH_HASH_FIELD_IP_PROTO)
		hash_keys.basic.ip_proto = fl4->flowi4_proto;
	if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_PORT)
		hash_keys.ports.src = fl4->fl4_sport;
	if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_PORT)
		hash_keys.ports.dst = fl4->fl4_dport;

	return fib_multipath_hash_from_keys(net, &hash_keys);
}

/*
 * Compute the multipath hash according to the
 * fib_multipath_hash_policy sysctl:
 *   0 - addresses only (inner addresses for ICMP errors)
 *   1 - addresses, ports and protocol
 *   2 - addresses of the flow found by full dissection, with a
 *       fallback to policy 0
 *   3 - user-selected fields
 * Any other policy value leaves the hash at 0. A non-zero
 * fl4->flowi4_multipath_hash is mixed in, and the result is shifted
 * right by one bit.
 *
 * if skb is set it will be used and fl4 can be NULL
 */
int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4,
		       const struct sk_buff *skb, struct flow_keys *flkeys)
{
	u32 multipath_hash = fl4 ? fl4->flowi4_multipath_hash : 0;
	struct flow_keys hash_keys;
	u32 mhash = 0;

	switch (READ_ONCE(net->ipv4.sysctl_fib_multipath_hash_policy)) {
	case 0:
		memset(&hash_keys, 0, sizeof(hash_keys));
		hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
		if (skb) {
			ip_multipath_l3_keys(skb, &hash_keys);
		} else {
			hash_keys.addrs.v4addrs.src = fl4->saddr;
			hash_keys.addrs.v4addrs.dst = fl4->daddr;
		}
		mhash = fib_multipath_hash_from_keys(net, &hash_keys);
		break;
	case 1:
		/* skb is currently provided only when forwarding */
		if (skb) {
			unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
			struct flow_keys keys;

			/* short-circuit if we already have L4 hash present */
			if (skb->l4_hash)
				return skb_get_hash_raw(skb) >> 1;

			memset(&hash_keys, 0, sizeof(hash_keys));

			/* Dissect only if the caller did not pass keys. */
			if (!flkeys) {
				skb_flow_dissect_flow_keys(skb, &keys, flag);
				flkeys = &keys;
			}

			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
			hash_keys.addrs.v4addrs.src = flkeys->addrs.v4addrs.src;
			hash_keys.addrs.v4addrs.dst = flkeys->addrs.v4addrs.dst;
			hash_keys.ports.src = flkeys->ports.src;
			hash_keys.ports.dst = flkeys->ports.dst;
			hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
		} else {
			memset(&hash_keys, 0, sizeof(hash_keys));
			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
			hash_keys.addrs.v4addrs.src = fl4->saddr;
			hash_keys.addrs.v4addrs.dst = fl4->daddr;
			hash_keys.ports.src = fl4->fl4_sport;
			hash_keys.ports.dst = fl4->fl4_dport;
			hash_keys.basic.ip_proto = fl4->flowi4_proto;
		}
		mhash = fib_multipath_hash_from_keys(net, &hash_keys);
		break;
	case 2:
		memset(&hash_keys, 0, sizeof(hash_keys));
		/* skb is currently provided only when forwarding */
		if (skb) {
			struct flow_keys keys;

			skb_flow_dissect_flow_keys(skb, &keys, 0);
			/* Inner can be v4 or v6 */
			if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) {
				hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
				hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
				hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst;
			} else if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS) {
				hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
				hash_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src;
				hash_keys.addrs.v6addrs.dst = keys.addrs.v6addrs.dst;
				hash_keys.tags.flow_label = keys.tags.flow_label;
				hash_keys.basic.ip_proto = keys.basic.ip_proto;
			} else {
				/* Same as case 0 */
				hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
				ip_multipath_l3_keys(skb, &hash_keys);
			}
		} else {
			/* Same as case 0 */
			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
			hash_keys.addrs.v4addrs.src = fl4->saddr;
			hash_keys.addrs.v4addrs.dst = fl4->daddr;
		}
		mhash = fib_multipath_hash_from_keys(net, &hash_keys);
		break;
	case 3:
		if (skb)
			mhash = fib_multipath_custom_hash_skb(net, skb);
		else
			mhash = fib_multipath_custom_hash_fl4(net, fl4);
		break;
	}

	if (multipath_hash)
		mhash = jhash_2words(mhash, multipath_hash, 0);

	return mhash >> 1;
}
#endif /* CONFIG_IP_ROUTE_MULTIPATH */

/*
 * Select a nexthop when the FIB result has more than one path (only
 * with CONFIG_IP_ROUTE_MULTIPATH), then build the input route via
 * __mkroute_input(). @hkeys may carry already dissected flow keys.
 */
static enum skb_drop_reason
ip_mkroute_input(struct sk_buff *skb, struct fib_result *res,
		 struct in_device *in_dev, __be32 daddr,
		 __be32 saddr, dscp_t dscp, struct flow_keys *hkeys)
{
#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (res->fi && fib_info_num_path(res->fi) > 1) {
		int h = fib_multipath_hash(res->fi->fib_net, NULL, skb, hkeys);

		fib_select_multipath(res, h);
		IPCB(skb)->flags |= IPSKB_MULTIPATH;
	}
#endif

	/* create a routing cache entry */
	return __mkroute_input(skb, res, in_dev, daddr, saddr, dscp);
}

/* Implements all the saddr-related checks as ip_route_input_slow(),
 * assuming daddr is valid and the destination is not a local broadcast one.
 * Uses the provided hint instead of performing a route lookup.
 *
 * The source is validated against the FIB only when the hint's route
 * is RTN_LOCAL. On success the hint's dst is copied onto @skb.
 */
enum skb_drop_reason
ip_route_use_hint(struct sk_buff *skb, __be32 daddr, __be32 saddr,
		  dscp_t dscp, struct net_device *dev,
		  const struct sk_buff *hint)
{
	enum skb_drop_reason reason = SKB_DROP_REASON_NOT_SPECIFIED;
	struct in_device *in_dev = __in_dev_get_rcu(dev);
	struct rtable *rt = skb_rtable(hint);
	struct net *net = dev_net(dev);
	u32 tag = 0;

	if (!in_dev)
		return reason;

	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr)) {
		reason = SKB_DROP_REASON_IP_INVALID_SOURCE;
		goto martian_source;
	}

	if (ipv4_is_zeronet(saddr)) {
		reason = SKB_DROP_REASON_IP_INVALID_SOURCE;
		goto martian_source;
	}

	if (ipv4_is_loopback(saddr) && !IN_DEV_NET_ROUTE_LOCALNET(in_dev, net)) {
		reason = SKB_DROP_REASON_IP_LOCALNET;
		goto martian_source;
	}

	if (rt->rt_type != RTN_LOCAL)
		goto skip_validate_source;

	reason = fib_validate_source_reason(skb, saddr, daddr, dscp, 0, dev,
					    in_dev, &tag);
	if (reason)
		goto martian_source;

skip_validate_source:
	skb_dst_copy(skb, hint);
	return SKB_NOT_DROPPED_YET;

martian_source:
	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
	return reason;
}

/* get device for dst_alloc with local routes
 *
 * Uses the L3 master device of the result's nexthop device when the
 * result has a fib_info and such a master exists; otherwise the
 * namespace's loopback device.
 */
static struct net_device *ip_rt_get_dev(struct net *net,
					const struct fib_result *res)
{
	struct fib_nh_common *nhc = res->fi ? res->nhc : NULL;
	struct net_device *dev = NULL;

	if (nhc)
		dev = l3mdev_master_dev_rcu(nhc->nhc_dev);

	return dev ? : net->loopback_dev;
}

/*
 * NOTE.
We drop all the packets that has local source
 * addresses, because every properly looped back packet
 * must have correct destination already attached by output routine.
 * Changes in the enforced policies must be applied also to
 * ip_route_use_hint().
 *
 * Such approach solves two big problems:
 * 1. Not simplex devices are handled properly.
 * 2. IP spoofing attempts are filtered with 100% of guarantee.
 * called with rcu_read_lock()
 */

/*
 * Slow-path input route resolution for a non-multicast destination.
 *
 * Rejects martian source/destination addresses, performs the FIB
 * lookup into @res and then either hands forwarding cases to
 * ip_mkroute_input() or builds (or reuses from the nexthop cache) a
 * local/broadcast/unreachable input route and attaches it to @skb.
 *
 * Returns SKB_NOT_DROPPED_YET on success, otherwise a drop reason.
 * Note that the "no route" case also succeeds: an RTN_UNREACHABLE route
 * with ip_error as input handler is attached.
 */
static enum skb_drop_reason
ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
		    dscp_t dscp, struct net_device *dev,
		    struct fib_result *res)
{
	enum skb_drop_reason reason = SKB_DROP_REASON_NOT_SPECIFIED;
	struct in_device *in_dev = __in_dev_get_rcu(dev);
	struct flow_keys *flkeys = NULL, _flkeys;
	struct net *net = dev_net(dev);
	struct ip_tunnel_info *tun_info;
	int err = -EINVAL;
	unsigned int flags = 0;
	u32 itag = 0;
	struct rtable *rth;
	struct flowi4 fl4;
	bool do_cache = true;

	/* IP on this device is disabled. */

	if (!in_dev)
		goto out;

	/* Check for the most weird martians, which can be not detected
	 * by fib_lookup.
	 */

	/* fl4 is filled field by field; start with the tunnel id. */
	tun_info = skb_tunnel_info(skb);
	if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
		fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id;
	else
		fl4.flowi4_tun_key.tun_id = 0;
	skb_dst_drop(skb);

	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr)) {
		reason = SKB_DROP_REASON_IP_INVALID_SOURCE;
		goto martian_source;
	}

	res->fi = NULL;
	res->table = NULL;
	if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
		goto brd_input;

	/* Accept zero addresses only to limited broadcast;
	 * I even do not know to fix it or not. Waiting for complains :-)
	 */
	if (ipv4_is_zeronet(saddr)) {
		reason = SKB_DROP_REASON_IP_INVALID_SOURCE;
		goto martian_source;
	}

	if (ipv4_is_zeronet(daddr)) {
		reason = SKB_DROP_REASON_IP_INVALID_DEST;
		goto martian_destination;
	}

	/* Following code try to avoid calling IN_DEV_NET_ROUTE_LOCALNET(),
	 * and call it once if daddr or/and saddr are loopback addresses
	 */
	if (ipv4_is_loopback(daddr)) {
		if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net)) {
			reason = SKB_DROP_REASON_IP_LOCALNET;
			goto martian_destination;
		}
	} else if (ipv4_is_loopback(saddr)) {
		if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net)) {
			reason = SKB_DROP_REASON_IP_LOCALNET;
			goto martian_source;
		}
	}

	/*
	 * Now we are ready to route packet.
	 */
	fl4.flowi4_l3mdev = 0;
	fl4.flowi4_oif = 0;
	fl4.flowi4_iif = dev->ifindex;
	fl4.flowi4_mark = skb->mark;
	fl4.flowi4_tos = inet_dscp_to_dsfield(dscp);
	fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
	fl4.flowi4_flags = 0;
	fl4.daddr = daddr;
	fl4.saddr = saddr;
	fl4.flowi4_uid = sock_net_uid(net, NULL);
	fl4.flowi4_multipath_hash = 0;

	if (fib4_rules_early_flow_dissect(net, skb, &fl4, &_flkeys)) {
		flkeys = &_flkeys;
	} else {
		/* No early dissection: clear the L4 fields ourselves. */
		fl4.flowi4_proto = 0;
		fl4.fl4_sport = 0;
		fl4.fl4_dport = 0;
	}

	err = fib_lookup(net, &fl4, res, 0);
	if (err != 0) {
		if (!IN_DEV_FORWARD(in_dev))
			err = -EHOSTUNREACH;
		goto no_route;
	}

	if (res->type == RTN_BROADCAST) {
		if (IN_DEV_BFORWARD(in_dev))
			goto make_route;
		/* not do cache if bc_forwarding is enabled */
		if (IPV4_DEVCONF_ALL_RO(net, BC_FORWARDING))
			do_cache = false;
		goto brd_input;
	}

	err = -EINVAL;
	if (res->type == RTN_LOCAL) {
		reason = fib_validate_source_reason(skb, saddr, daddr, dscp,
						    0, dev, in_dev, &itag);
		if (reason)
			goto martian_source;
		goto local_input;
	}

	if (!IN_DEV_FORWARD(in_dev)) {
		err = -EHOSTUNREACH;
		goto no_route;
	}
	if (res->type != RTN_UNICAST) {
		reason = SKB_DROP_REASON_IP_INVALID_DEST;
		goto martian_destination;
	}

make_route:
	reason = ip_mkroute_input(skb, res, in_dev, daddr, saddr, dscp,
				  flkeys);

out:
	return reason;

brd_input:
	if (skb->protocol != htons(ETH_P_IP)) {
		reason = SKB_DROP_REASON_INVALID_PROTO;
		goto out;
	}

	if (!ipv4_is_zeronet(saddr)) {
		reason = fib_validate_source_reason(skb, saddr, 0, dscp, 0,
						    dev, in_dev, &itag);
		if (reason)
			goto martian_source;
	}
	flags |= RTCF_BROADCAST;
	res->type = RTN_BROADCAST;
	RT_CACHE_STAT_INC(in_brd);

local_input:
	if (IN_DEV_ORCONF(in_dev, NOPOLICY))
		IPCB(skb)->flags |= IPSKB_NOPOLICY;

	/* Cache only results backed by a fib_info and without a tag. */
	do_cache &= res->fi && !itag;
	if (do_cache) {
		struct fib_nh_common *nhc = FIB_RES_NHC(*res);

		rth = rcu_dereference(nhc->nhc_rth_input);
		if (rt_cache_valid(rth)) {
			skb_dst_set_noref(skb, &rth->dst);
			reason = SKB_NOT_DROPPED_YET;
			goto out;
		}
	}

	rth = rt_dst_alloc(ip_rt_get_dev(net, res),
			   flags | RTCF_LOCAL, res->type, false);
	if (!rth)
		goto e_nobufs;

	rth->dst.output= ip_rt_bug;
#ifdef CONFIG_IP_ROUTE_CLASSID
	rth->dst.tclassid = itag;
#endif
	rth->rt_is_input = 1;

	RT_CACHE_STAT_INC(in_slow_tot);
	if (res->type == RTN_UNREACHABLE) {
		/* Reached via no_route: deliver an error instead. */
		rth->dst.input= ip_error;
		rth->dst.error= -err;
		rth->rt_flags &= ~RTCF_LOCAL;
	}

	if (do_cache) {
		struct fib_nh_common *nhc = FIB_RES_NHC(*res);

		rth->dst.lwtstate = lwtstate_get(nhc->nhc_lwtstate);
		if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
			WARN_ON(rth->dst.input == lwtunnel_input);
			rth->dst.lwtstate->orig_input = rth->dst.input;
			rth->dst.input = lwtunnel_input;
		}

		if (unlikely(!rt_cache_route(nhc, rth)))
			rt_add_uncached_list(rth);
	}
	skb_dst_set(skb, &rth->dst);
	reason = SKB_NOT_DROPPED_YET;
	goto out;

no_route:
	RT_CACHE_STAT_INC(in_no_route);
	res->type = RTN_UNREACHABLE;
	res->fi = NULL;
	res->table = NULL;
	goto local_input;

	/*
	 * Do not cache martian addresses: they should be logged (RFC1812)
	 */
martian_destination:
	RT_CACHE_STAT_INC(in_martian_dst);
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev))
		net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
				     &daddr, &saddr, dev->name);
#endif
	goto out;

e_nobufs:
	reason = SKB_DROP_REASON_NOMEM;
	goto out;

martian_source:
	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
	goto out;
}

/*
called with rcu_read_lock held */

/*
 * Route an input packet: multicast destinations are handled here,
 * everything else goes to ip_route_input_slow().
 *
 * For multicast the packet gets a route only if the group is joined on
 * the device (or on skb->dev when @dev is an L3 slave) or, with
 * CONFIG_IP_MROUTE, if multicast forwarding applies; otherwise
 * SKB_DROP_REASON_NOT_SPECIFIED is returned.
 */
static enum skb_drop_reason
ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr,
		   dscp_t dscp, struct net_device *dev,
		   struct fib_result *res)
{
	/* Multicast recognition logic is moved from route cache to here.
	 * The problem was that too many Ethernet cards have broken/missing
	 * hardware multicast filters :-( As result the host on multicasting
	 * network acquires a lot of useless route cache entries, sort of
	 * SDR messages from all the world. Now we try to get rid of them.
	 * Really, provided software IP multicast filter is organized
	 * reasonably (at least, hashed), it does not result in a slowdown
	 * comparing with route cache reject entries.
	 * Note, that multicast routers are not affected, because
	 * route cache entry is created eventually.
	 */
	if (ipv4_is_multicast(daddr)) {
		enum skb_drop_reason reason = SKB_DROP_REASON_NOT_SPECIFIED;
		struct in_device *in_dev = __in_dev_get_rcu(dev);
		int our = 0;

		if (!in_dev)
			return reason;

		our = ip_check_mc_rcu(in_dev, daddr, saddr,
				      ip_hdr(skb)->protocol);

		/* check l3 master if no match yet */
		if (!our && netif_is_l3_slave(dev)) {
			struct in_device *l3_in_dev;

			l3_in_dev = __in_dev_get_rcu(skb->dev);
			if (l3_in_dev)
				our = ip_check_mc_rcu(l3_in_dev, daddr, saddr,
						      ip_hdr(skb)->protocol);
		}

		if (our
#ifdef CONFIG_IP_MROUTE
			||
		    (!ipv4_is_local_multicast(daddr) &&
		     IN_DEV_MFORWARD(in_dev))
#endif
		   ) {
			reason = ip_route_input_mc(skb, daddr, saddr, dscp,
						   dev, our);
		}
		return reason;
	}

	return ip_route_input_slow(skb, daddr, saddr, dscp, dev, res);
}

/*
 * Entry point for input routing: runs ip_route_input_rcu() inside an
 * RCU read-side section with a local fib_result. The result may attach
 * a dst to @skb without taking a reference (see the
 * skb_dst_set_noref() callers in this file).
 */
enum skb_drop_reason ip_route_input_noref(struct sk_buff *skb, __be32 daddr,
					  __be32 saddr, dscp_t dscp,
					  struct net_device *dev)
{
	enum skb_drop_reason reason;
	struct fib_result res;

	rcu_read_lock();
	reason = ip_route_input_rcu(skb, daddr, saddr, dscp, dev, &res);
	rcu_read_unlock();

	return reason;
}
EXPORT_SYMBOL(ip_route_input_noref);

/*
 * Create (or reuse from cache) the output route for flow @fl4 leaving
 * through @dev_out, given FIB result @res.
 *
 * Returns the route with a reference held, or an ERR_PTR():
 * -EINVAL for a device without IPv4 configuration, a loopback source
 * on a non-loopback device, or a zero-net destination; -ENOBUFS when
 * allocation fails.
 *
 * called with rcu_read_lock()
 */
static struct rtable *__mkroute_output(const struct fib_result *res,
				       const struct flowi4 *fl4, int orig_oif,
				       struct net_device *dev_out,
				       unsigned int flags)
{
	struct fib_info *fi = res->fi;
	struct fib_nh_exception *fnhe;
	struct in_device *in_dev;
	u16 type = res->type;
	struct rtable *rth;
	bool do_cache;

	in_dev = __in_dev_get_rcu(dev_out);
	if (!in_dev)
		return ERR_PTR(-EINVAL);

	if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
		if (ipv4_is_loopback(fl4->saddr) &&
		    !(dev_out->flags & IFF_LOOPBACK) &&
		    !netif_is_l3_master(dev_out))
			return ERR_PTR(-EINVAL);

	/* The destination address can override the FIB result type. */
	if (ipv4_is_lbcast(fl4->daddr))
		type = RTN_BROADCAST;
	else if (ipv4_is_multicast(fl4->daddr))
		type = RTN_MULTICAST;
	else if (ipv4_is_zeronet(fl4->daddr))
		return ERR_PTR(-EINVAL);

	if (dev_out->flags & IFF_LOOPBACK)
		flags |= RTCF_LOCAL;

	do_cache = true;
	if (type == RTN_BROADCAST) {
		flags |= RTCF_BROADCAST | RTCF_LOCAL;
		fi = NULL;
	} else if (type == RTN_MULTICAST) {
		flags |= RTCF_MULTICAST | RTCF_LOCAL;
		if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
				     fl4->flowi4_proto))
			flags &= ~RTCF_LOCAL;
		else
			do_cache = false;
		/* If multicast route do not exist use
		 * default one, but do not gateway in this case.
		 * Yes, it is hack.
		 */
		if (fi && res->prefixlen < 4)
			fi = NULL;
	} else if ((type == RTN_LOCAL) && (orig_oif != 0) &&
		   (orig_oif != dev_out->ifindex)) {
		/* For local routes that require a particular output interface
		 * we do not want to cache the result. Caching the result
		 * causes incorrect behaviour when there are multiple source
		 * addresses on the interface, the end result being that if the
		 * intended recipient is waiting on that interface for the
		 * packet he won't receive it because it will be delivered on
		 * the loopback interface and the IP_PKTINFO ipi_ifindex will
		 * be set to the loopback interface as well.
		 */
		do_cache = false;
	}

	fnhe = NULL;
	do_cache &= fi != NULL;
	if (fi) {
		struct fib_nh_common *nhc = FIB_RES_NHC(*res);
		struct rtable __rcu **prth;

		fnhe = find_exception(nhc, fl4->daddr);
		if (!do_cache)
			goto add;
		if (fnhe) {
			prth = &fnhe->fnhe_rth_output;
		} else {
			if (unlikely(fl4->flowi4_flags &
				     FLOWI_FLAG_KNOWN_NH &&
				     !(nhc->nhc_gw_family &&
				       nhc->nhc_scope == RT_SCOPE_LINK))) {
				do_cache = false;
				goto add;
			}
			prth = raw_cpu_ptr(nhc->nhc_pcpu_rth_output);
		}
		rth = rcu_dereference(*prth);
		/* Cache hit only if a reference can still be taken. */
		if (rt_cache_valid(rth) && dst_hold_safe(&rth->dst))
			return rth;
	}

add:
	rth = rt_dst_alloc(dev_out, flags, type,
			   IN_DEV_ORCONF(in_dev, NOXFRM));
	if (!rth)
		return ERR_PTR(-ENOBUFS);

	rth->rt_iif = orig_oif;

	RT_CACHE_STAT_INC(out_slow_tot);

	if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
		if (flags & RTCF_LOCAL &&
		    !(dev_out->flags & IFF_LOOPBACK)) {
			rth->dst.output = ip_mc_output;
			RT_CACHE_STAT_INC(out_slow_mc);
		}
#ifdef CONFIG_IP_MROUTE
		if (type == RTN_MULTICAST) {
			if (IN_DEV_MFORWARD(in_dev) &&
			    !ipv4_is_local_multicast(fl4->daddr)) {
				rth->dst.input = ip_mr_input;
				rth->dst.output = ip_mc_output;
			}
		}
#endif
	}

	rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0, do_cache);
	lwtunnel_set_redirect(&rth->dst);

	return rth;
}

/*
 * Major route resolver routine.
*/
/*
 * ip_route_output_key_hash - resolve an output route for a flow.
 * @net: network namespace handed to the lookup
 * @fl4: flow key; flowi4_iif is overwritten with LOOPBACK_IFINDEX and
 *       flowi4_tos is masked with INET_DSCP_MASK before the lookup
 * @skb: passed through unchanged to ip_route_output_key_hash_rcu()
 *
 * Runs ip_route_output_key_hash_rcu() inside an rcu_read_lock() /
 * rcu_read_unlock() pair with a local fib_result and returns its result
 * unchanged (the RCU variant returns either a route or an ERR_PTR()).
 */
struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
					const struct sk_buff *skb)
{
	struct fib_result res = {
		.type		= RTN_UNSPEC,
		.fi		= NULL,
		.table		= NULL,
		.tclassid	= 0,
	};
	struct rtable *rth;

	fl4->flowi4_iif = LOOPBACK_IFINDEX;
	fl4->flowi4_tos &= INET_DSCP_MASK;

	rcu_read_lock();
	rth = ip_route_output_key_hash_rcu(net, fl4, &res, skb);
	rcu_read_unlock();

	return rth;
}
EXPORT_SYMBOL_GPL(ip_route_output_key_hash);

/*
 * ip_route_output_key_hash_rcu - output route lookup body.
 * @net: network namespace
 * @fl4: flow key; saddr, daddr and flowi4_oif may be filled in or rewritten
 * @res: fib_result filled by fib_lookup(); fi/table are cleared on lookup
 *       failure and type is set on the shortcut paths below
 * @skb: forwarded to fib_select_path()
 *
 * The callers visible in this file invoke it between rcu_read_lock() and
 * rcu_read_unlock().
 *
 * Returns the result of __mkroute_output() or one of these ERR_PTR() values:
 *   -EINVAL      source address is multicast, limited broadcast or zeronet
 *   -ENETUNREACH source address is not found by __ip_dev_find(), or the
 *                requested output device is down / has no in_device
 *   -ENODEV      flowi4_oif does not name a device in @net
 *   err          whatever fib_lookup() returned, when no on-link fallback
 *                applies
 */
struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4,
					    struct fib_result *res,
					    const struct sk_buff *skb)
{
	struct net_device *dev_out = NULL;
	int orig_oif = fl4->flowi4_oif;
	unsigned int flags = 0;
	struct rtable *rth;
	int err;

	if (fl4->saddr) {
		if (ipv4_is_multicast(fl4->saddr) ||
		    ipv4_is_lbcast(fl4->saddr) ||
		    ipv4_is_zeronet(fl4->saddr)) {
			rth = ERR_PTR(-EINVAL);
			goto out;
		}

		/* Default error for the "goto out" exits in this branch. */
		rth = ERR_PTR(-ENETUNREACH);

		/* I removed check for oif == dev_out->oif here.
		 * It was wrong for two reasons:
		 * 1. ip_dev_find(net, saddr) can return wrong iface, if saddr
		 *    is assigned to multiple interfaces.
		 * 2. Moreover, we are allowed to send packets with saddr
		 *    of another iface. --ANK
		 */

		if (fl4->flowi4_oif == 0 &&
		    (ipv4_is_multicast(fl4->daddr) ||
		     ipv4_is_lbcast(fl4->daddr))) {
			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
			dev_out = __ip_dev_find(net, fl4->saddr, false);
			if (!dev_out)
				goto out;

			/* Special hack: user can direct multicasts
			 * and limited broadcast via necessary interface
			 * without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
			 * This hack is not just for fun, it allows
			 * vic,vat and friends to work.
			 * They bind socket to loopback, set ttl to zero
			 * and expect that it will work.
			 * From the viewpoint of routing cache they are broken,
			 * because we are not allowed to build multicast path
			 * with loopback source addr (look, routing cache
			 * cannot know, that ttl is zero, so that packet
			 * will not leave this host and route is valid).
			 * Luckily, this hack is good workaround.
			 */

			fl4->flowi4_oif = dev_out->ifindex;
			goto make_route;
		}

		if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
			if (!__ip_dev_find(net, fl4->saddr, false))
				goto out;
		}
	}

	if (fl4->flowi4_oif) {
		dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
		rth = ERR_PTR(-ENODEV);
		if (!dev_out)
			goto out;

		/* RACE: Check return value of inet_select_addr instead. */
		if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
			rth = ERR_PTR(-ENETUNREACH);
			goto out;
		}
		/* Link-local multicast, limited broadcast and IGMP skip the
		 * FIB lookup; only a link-scope source address is selected.
		 */
		if (ipv4_is_local_multicast(fl4->daddr) ||
		    ipv4_is_lbcast(fl4->daddr) ||
		    fl4->flowi4_proto == IPPROTO_IGMP) {
			if (!fl4->saddr)
				fl4->saddr = inet_select_addr(dev_out, 0,
							      RT_SCOPE_LINK);
			goto make_route;
		}
		if (!fl4->saddr) {
			if (ipv4_is_multicast(fl4->daddr))
				fl4->saddr = inet_select_addr(dev_out, 0,
							      fl4->flowi4_scope);
			else if (!fl4->daddr)
				fl4->saddr = inet_select_addr(dev_out, 0,
							      RT_SCOPE_HOST);
		}
	}

	/* No destination: route to the source address (or to
	 * INADDR_LOOPBACK when that is unset too) over the loopback device.
	 */
	if (!fl4->daddr) {
		fl4->daddr = fl4->saddr;
		if (!fl4->daddr)
			fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
		dev_out = net->loopback_dev;
		fl4->flowi4_oif = LOOPBACK_IFINDEX;
		res->type = RTN_LOCAL;
		flags |= RTCF_LOCAL;
		goto make_route;
	}

	err = fib_lookup(net, fl4, res, 0);
	if (err) {
		res->fi = NULL;
		res->table = NULL;
		if (fl4->flowi4_oif &&
		    (ipv4_is_multicast(fl4->daddr) || !fl4->flowi4_l3mdev)) {
			/* Apparently, routing tables are wrong. Assume,
			 * that the destination is on link.
			 *
			 * WHY? DW.
			 * Because we are allowed to send to iface
			 * even if it has NO routes and NO assigned
			 * addresses. When oif is specified, routing
			 * tables are looked up with only one purpose:
			 * to catch if destination is gatewayed, rather than
			 * direct. Moreover, if MSG_DONTROUTE is set,
			 * we send packet, ignoring both routing tables
			 * and ifaddr state. --ANK
			 *
			 *
			 * We could make it even if oif is unknown,
			 * likely IPv6, but we do not.
			 */

			if (fl4->saddr == 0)
				fl4->saddr = inet_select_addr(dev_out, 0,
							      RT_SCOPE_LINK);
			res->type = RTN_UNICAST;
			goto make_route;
		}
		rth = ERR_PTR(err);
		goto out;
	}

	if (res->type == RTN_LOCAL) {
		if (!fl4->saddr) {
			if (res->fi->fib_prefsrc)
				fl4->saddr = res->fi->fib_prefsrc;
			else
				fl4->saddr = fl4->daddr;
		}

		/* L3 master device is the loopback for that domain */
		dev_out = l3mdev_master_dev_rcu(FIB_RES_DEV(*res)) ? :
			net->loopback_dev;

		/* make sure orig_oif points to fib result device even
		 * though packet rx/tx happens over loopback or l3mdev
		 */
		orig_oif = FIB_RES_OIF(*res);

		fl4->flowi4_oif = dev_out->ifindex;
		flags |= RTCF_LOCAL;
		goto make_route;
	}

	fib_select_path(net, res, fl4, skb);

	dev_out = FIB_RES_DEV(*res);

make_route:
	rth = __mkroute_output(res, fl4, orig_oif, dev_out, flags);

out:
	return rth;
}

/*
 * dst_ops used for the routes built by ipv4_blackhole_route(). Its
 * kmem_cachep is pointed at the ipv4_dst_ops cache in ip_rt_init().
 */
static struct dst_ops ipv4_dst_blackhole_ops = {
	.family			= AF_INET,
	.default_advmss		= ipv4_default_advmss,
	.neigh_lookup		= ipv4_neigh_lookup,
	.check			= dst_blackhole_check,
	.cow_metrics		= dst_blackhole_cow_metrics,
	.update_pmtu		= dst_blackhole_update_pmtu,
	.redirect		= dst_blackhole_redirect,
	.mtu			= dst_blackhole_mtu,
};

/*
 * ipv4_blackhole_route - build a discarding copy of @dst_orig.
 * @net: namespace whose loopback device and genid are used for the new route
 * @dst_orig: route to copy from; dst_release() is called on it on every path
 *
 * The new entry is allocated from ipv4_dst_blackhole_ops with
 * DST_OBSOLETE_DEAD, uses dst_discard / dst_discard_out as input/output
 * handlers and copies the rt_* fields listed below from the original.
 *
 * Returns the new dst_entry, or ERR_PTR(-ENOMEM) when dst_alloc() fails.
 */
struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
{
	struct rtable *ort = dst_rtable(dst_orig);
	struct rtable *rt;

	rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, DST_OBSOLETE_DEAD, 0);
	if (rt) {
		struct dst_entry *new = &rt->dst;

		new->__use = 1;
		new->input = dst_discard;
		new->output = dst_discard_out;

		new->dev = net->loopback_dev;
		netdev_hold(new->dev, &new->dev_tracker, GFP_ATOMIC);

		rt->rt_is_input = ort->rt_is_input;
		rt->rt_iif = ort->rt_iif;
		rt->rt_pmtu = ort->rt_pmtu;
		rt->rt_mtu_locked = ort->rt_mtu_locked;

		rt->rt_genid = rt_genid_ipv4(net);
		rt->rt_flags = ort->rt_flags;
		rt->rt_type = ort->rt_type;
		rt->rt_uses_gateway = ort->rt_uses_gateway;
		rt->rt_gw_family = ort->rt_gw_family;
		if (rt->rt_gw_family == AF_INET)
			rt->rt_gw4 = ort->rt_gw4;
		else if (rt->rt_gw_family == AF_INET6)
			rt->rt_gw6 = ort->rt_gw6;
	}

	dst_release(dst_orig);

	return rt ? &rt->dst : ERR_PTR(-ENOMEM);
}

/*
 * ip_route_output_flow - output route lookup followed by an xfrm lookup.
 * @net: network namespace
 * @flp4: flow key; flowi4_oif is overwritten with the ifindex of the
 *        resolved route's device when flowi4_proto is non-zero
 * @sk: socket handed to xfrm_lookup_route()
 *
 * Returns the error pointer from __ip_route_output_key() unchanged. When
 * flowi4_proto is zero the plain route is returned; otherwise the result of
 * xfrm_lookup_route() is returned, converted with dst_rtable().
 */
struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
				    const struct sock *sk)
{
	struct rtable *rt = __ip_route_output_key(net, flp4);

	if (IS_ERR(rt))
		return rt;

	if (flp4->flowi4_proto) {
		flp4->flowi4_oif = rt->dst.dev->ifindex;
		rt = dst_rtable(xfrm_lookup_route(net, &rt->dst,
						  flowi4_to_flowi(flp4),
						  sk, 0));
	}

	return rt;
}
EXPORT_SYMBOL_GPL(ip_route_output_flow);

/*
 * rt_fill_info - append an RTM_NEWROUTE message describing @rt to @skb.
 * @dst, @src: addresses reported as RTA_DST / RTA_SRC (RTA_SRC only if set)
 * @table_id: reported as RTA_TABLE; rtm_table gets RT_TABLE_COMPAT when the
 *            id is 256 or larger
 * @fl4: optional flow; when non-NULL it supplies RTA_PREFSRC, RTA_MARK,
 *       RTA_UID and, for input routes, RTA_IIF or the ipmr route data
 * @portid, @seq, @flags: passed to nlmsg_put()
 *
 * Returns 0 on success and -EMSGSIZE when the header or any attribute cannot
 * be added (the partial message is cancelled on the attribute failure path).
 * It also returns 0 directly when ipmr_get_route() returns 0.
 *
 * called with rcu_read_lock held
 */
static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
			struct rtable *rt, u32 table_id, dscp_t dscp,
			struct flowi4 *fl4, struct sk_buff *skb, u32 portid,
			u32 seq, unsigned int flags)
{
	struct rtmsg *r;
	struct nlmsghdr *nlh;
	unsigned long expires = 0;
	u32 error;
	u32 metrics[RTAX_MAX];

	nlh = nlmsg_put(skb, portid, seq, RTM_NEWROUTE, sizeof(*r), flags);
	if (!nlh)
		return -EMSGSIZE;

	r = nlmsg_data(nlh);
	r->rtm_family	 = AF_INET;
	r->rtm_dst_len	= 32;
	r->rtm_src_len	= 0;
	r->rtm_tos	= inet_dscp_to_dsfield(dscp);
	r->rtm_table	= table_id < 256 ? table_id : RT_TABLE_COMPAT;
	if (nla_put_u32(skb, RTA_TABLE, table_id))
		goto nla_put_failure;
	r->rtm_type	= rt->rt_type;
	r->rtm_scope	= RT_SCOPE_UNIVERSE;
	r->rtm_protocol = RTPROT_UNSPEC;
	r->rtm_flags	= (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
	if (rt->rt_flags & RTCF_NOTIFY)
		r->rtm_flags |= RTM_F_NOTIFY;
	if (IPCB(skb)->flags & IPSKB_DOREDIRECT)
		r->rtm_flags |= RTCF_DOREDIRECT;

	if (nla_put_in_addr(skb, RTA_DST, dst))
		goto nla_put_failure;
	if (src) {
		r->rtm_src_len = 32;
		if (nla_put_in_addr(skb, RTA_SRC, src))
			goto nla_put_failure;
	}
	if (rt->dst.dev &&
	    nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
		goto nla_put_failure;
	if (rt->dst.lwtstate &&
	    lwtunnel_fill_encap(skb, rt->dst.lwtstate, RTA_ENCAP, RTA_ENCAP_TYPE) < 0)
		goto nla_put_failure;
#ifdef CONFIG_IP_ROUTE_CLASSID
	if (rt->dst.tclassid &&
	    nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
		goto nla_put_failure;
#endif
	if (fl4 && !rt_is_input_route(rt) &&
	    fl4->saddr != src) {
		if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr))
			goto nla_put_failure;
	}
	if (rt->rt_uses_gateway) {
		if (rt->rt_gw_family == AF_INET &&
		    nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gw4)) {
			goto nla_put_failure;
		} else if (rt->rt_gw_family == AF_INET6) {
			int alen = sizeof(struct in6_addr);
			struct nlattr *nla;
			struct rtvia *via;

			/* alen + 2 bytes are reserved for the RTA_VIA payload:
			 * the address is copied to rtvia_addr after the
			 * rtvia_family member.
			 */
			nla = nla_reserve(skb, RTA_VIA, alen + 2);
			if (!nla)
				goto nla_put_failure;

			via = nla_data(nla);
			via->rtvia_family = AF_INET6;
			memcpy(via->rtvia_addr, &rt->rt_gw6, alen);
		}
	}

	/* Turn the absolute expiry into a remaining time, clamped at 0. */
	expires = rt->dst.expires;
	if (expires) {
		unsigned long now = jiffies;

		if (time_before(now, expires))
			expires -= now;
		else
			expires = 0;
	}

	/* Work on a copy of the metrics so the per-route PMTU values can be
	 * patched in; they are only applied while expires is non-zero.
	 */
	memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
	if (rt->rt_pmtu && expires)
		metrics[RTAX_MTU - 1] = rt->rt_pmtu;
	if (rt->rt_mtu_locked && expires)
		metrics[RTAX_LOCK - 1] |= BIT(RTAX_MTU);
	if (rtnetlink_put_metrics(skb, metrics) < 0)
		goto nla_put_failure;

	if (fl4) {
		if (fl4->flowi4_mark &&
		    nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
			goto nla_put_failure;

		if (!uid_eq(fl4->flowi4_uid, INVALID_UID) &&
		    nla_put_u32(skb, RTA_UID,
				from_kuid_munged(current_user_ns(),
						 fl4->flowi4_uid)))
			goto nla_put_failure;

		if (rt_is_input_route(rt)) {
#ifdef CONFIG_IP_MROUTE
			if (ipv4_is_multicast(dst) &&
			    !ipv4_is_local_multicast(dst) &&
			    IPV4_DEVCONF_ALL_RO(net, MC_FORWARDING)) {
				int err = ipmr_get_route(net, skb,
							 fl4->saddr, fl4->daddr,
							 r, portid);

				if (err <= 0) {
					if (err == 0)
						return 0;
					goto nla_put_failure;
				}
			} else
#endif
				if (nla_put_u32(skb, RTA_IIF, fl4->flowi4_iif))
					goto nla_put_failure;
		}
	}

	error = rt->dst.error;

	if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
		goto nla_put_failure;

	nlmsg_end(skb, nlh);
	return 0;

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}

/*
 * fnhe_dump_bucket - dump the nexthop exceptions of one hash table.
 * @bucket: array of FNHE_HASH_SIZE chains, walked with rcu_dereference()
 * @genid: exceptions whose fnhe_genid differs are skipped
 * @fa_index: running entry counter, incremented for every exception visited
 * @fa_start: entries with *fa_index below this value are skipped
 *
 * Expired exceptions and exceptions with neither an input nor an output
 * cached route are skipped as well. For the rest, rt_fill_info() is called
 * with a zero source, zero dscp and no flow. Returns 0, or the first
 * non-zero rt_fill_info() result (in which case *fa_index is not advanced
 * for the failing entry).
 */
static int fnhe_dump_bucket(struct net *net, struct sk_buff *skb,
			    struct netlink_callback *cb, u32 table_id,
			    struct fnhe_hash_bucket *bucket, int genid,
			    int *fa_index, int fa_start, unsigned int flags)
{
	int i;

	for (i = 0; i < FNHE_HASH_SIZE; i++) {
		struct fib_nh_exception *fnhe;

		for (fnhe = rcu_dereference(bucket[i].chain); fnhe;
		     fnhe = rcu_dereference(fnhe->fnhe_next)) {
			struct rtable *rt;
			int err;

			if (*fa_index < fa_start)
				goto next;

			if (fnhe->fnhe_genid != genid)
				goto next;

			if (fnhe->fnhe_expires &&
			    time_after(jiffies, fnhe->fnhe_expires))
				goto next;

			/* Prefer the input route, fall back to the output one. */
			rt = rcu_dereference(fnhe->fnhe_rth_input);
			if (!rt)
				rt = rcu_dereference(fnhe->fnhe_rth_output);
			if (!rt)
				goto next;

			err = rt_fill_info(net, fnhe->fnhe_daddr, 0, rt,
					   table_id, 0, NULL, skb,
					   NETLINK_CB(cb->skb).portid,
					   cb->nlh->nlmsg_seq, flags);
			if (err)
				return err;
next:
			(*fa_index)++;
		}
	}

	return 0;
}

/*
 * fib_dump_info_fnhe - dump the nexthop exceptions of every path of @fi.
 *
 * Paths flagged RTNH_F_DEAD are skipped. For each remaining path the
 * exception table is read and dumped under rcu_read_lock() using the
 * namespace's current fnhe genid. Returns 0, or the first error reported by
 * fnhe_dump_bucket().
 */
int fib_dump_info_fnhe(struct sk_buff *skb, struct netlink_callback *cb,
		       u32 table_id, struct fib_info *fi,
		       int *fa_index, int fa_start, unsigned int flags)
{
	struct net *net = sock_net(cb->skb->sk);
	int nhsel, genid = fnhe_genid(net);

	for (nhsel = 0; nhsel < fib_info_num_path(fi); nhsel++) {
		struct fib_nh_common *nhc = fib_info_nhc(fi, nhsel);
		struct fnhe_hash_bucket *bucket;
		int err;

		if (nhc->nhc_flags & RTNH_F_DEAD)
			continue;

		rcu_read_lock();
		bucket = rcu_dereference(nhc->nhc_exceptions);
		err = 0;
		if (bucket)
			err = fnhe_dump_bucket(net, skb, cb, table_id, bucket,
					       genid, fa_index, fa_start,
					       flags);
		rcu_read_unlock();
		if (err)
			return err;
	}

	return 0;
}

/*
 * inet_rtm_getroute_build_skb - build the dummy packet used for a route query.
 *
 * Allocates an NLMSG_GOODSIZE skb and writes a minimal IPv4 header
 * (version 4, ihl 5, frag_off 0, the given protocol and addresses) followed,
 * for UDP, TCP and ICMP, by a zeroed transport header carrying the given
 * ports (UDP/TCP) or an ICMP_ECHO type. Other protocols get no transport
 * header. The IP header comes from skb_put(), not skb_put_zero(), so only
 * the fields assigned below are initialised here.
 *
 * Returns the skb, or NULL when alloc_skb() fails.
 */
static struct sk_buff *inet_rtm_getroute_build_skb(__be32 src, __be32 dst,
						   u8 ip_proto, __be16 sport,
						   __be16 dport)
{
	struct sk_buff *skb;
	struct iphdr *iph;

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb)
		return NULL;

	/* Reserve room for dummy headers, this skb can pass
	 * through good chunk of routing engine.
	 */
	skb_reset_mac_header(skb);
	skb_reset_network_header(skb);
	skb->protocol = htons(ETH_P_IP);
	iph = skb_put(skb, sizeof(struct iphdr));
	iph->protocol = ip_proto;
	iph->saddr = src;
	iph->daddr = dst;
	iph->version = 0x4;
	iph->frag_off = 0;
	iph->ihl = 0x5;
	skb_set_transport_header(skb, skb->len);

	switch (iph->protocol) {
	case IPPROTO_UDP: {
		struct udphdr *udph;

		udph = skb_put_zero(skb, sizeof(struct udphdr));
		udph->source = sport;
		udph->dest = dport;
		udph->len = htons(sizeof(struct udphdr));
		udph->check = 0;
		break;
	}
	case IPPROTO_TCP: {
		struct tcphdr *tcph;

		tcph = skb_put_zero(skb, sizeof(struct tcphdr));
		tcph->source	= sport;
		tcph->dest	= dport;
		tcph->doff	= sizeof(struct tcphdr) / 4;
		tcph->rst = 1;
		tcph->check = ~tcp_v4_check(sizeof(struct tcphdr),
					    src, dst, 0);
		break;
	}
	case IPPROTO_ICMP: {
		struct icmphdr *icmph;

		icmph = skb_put_zero(skb, sizeof(struct icmphdr));
		icmph->type = ICMP_ECHO;
		icmph->code = 0;
	}
	}

	return skb;
}

/*
 * inet_rtm_valid_getroute_req - validate and parse an RTM_GETROUTE request.
 * @tb: attribute table filled by the parser (RTA_MAX + 1 slots are indexed)
 *
 * A message shorter than the rtmsg header is rejected. When
 * netlink_strict_get_check() is false the request is parsed with
 * nlmsg_parse_deprecated() and no further checks are made. Otherwise the
 * header must have src/dst lengths of 0 or 32, zero table/protocol/scope/
 * type and only the NOTIFY, LOOKUP_TABLE and FIB_MATCH flags; RTA_SRC /
 * RTA_DST require a non-zero matching length; and only the attributes
 * listed in the switch below are accepted.
 *
 * Returns 0 on success, -EINVAL (with an extack message) on a rejected
 * request, or the parser's error code.
 */
static int inet_rtm_valid_getroute_req(struct sk_buff *skb,
				       const struct nlmsghdr *nlh,
				       struct nlattr **tb,
				       struct netlink_ext_ack *extack)
{
	struct rtmsg *rtm;
	int i, err;

	if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) {
		NL_SET_ERR_MSG(extack,
			       "ipv4: Invalid header for route get request");
		return -EINVAL;
	}

	if (!netlink_strict_get_check(skb))
		return nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX,
					      rtm_ipv4_policy, extack);

	rtm = nlmsg_data(nlh);
	if ((rtm->rtm_src_len && rtm->rtm_src_len != 32) ||
	    (rtm->rtm_dst_len && rtm->rtm_dst_len != 32) ||
	    rtm->rtm_table || rtm->rtm_protocol ||
	    rtm->rtm_scope || rtm->rtm_type) {
		NL_SET_ERR_MSG(extack, "ipv4: Invalid values in header for route get request");
		return -EINVAL;
	}

	if (rtm->rtm_flags & ~(RTM_F_NOTIFY |
			       RTM_F_LOOKUP_TABLE |
			       RTM_F_FIB_MATCH)) {
		NL_SET_ERR_MSG(extack, "ipv4: Unsupported rtm_flags for route get request");
		return -EINVAL;
	}

	err = nlmsg_parse_deprecated_strict(nlh, sizeof(*rtm), tb, RTA_MAX,
					    rtm_ipv4_policy, extack);
	if (err)
		return err;

	if ((tb[RTA_SRC] && !rtm->rtm_src_len) ||
	    (tb[RTA_DST] && !rtm->rtm_dst_len)) {
		NL_SET_ERR_MSG(extack, "ipv4: rtm_src_len and rtm_dst_len must be 32 for IPv4");
		return -EINVAL;
	}

	for (i = 0; i <= RTA_MAX; i++) {
		if (!tb[i])
			continue;

		switch (i) {
		case RTA_IIF:
		case RTA_OIF:
		case RTA_SRC:
		case RTA_DST:
		case RTA_IP_PROTO:
		case RTA_SPORT:
		case RTA_DPORT:
		case RTA_MARK:
		case RTA_UID:
			break;
		default:
			NL_SET_ERR_MSG(extack, "ipv4: Unsupported attribute in route get request");
			return -EINVAL;
		}
	}

	return 0;
}

/*
 * inet_rtm_getroute - RTM_GETROUTE handler (registered in
 * ip_rt_rtnl_msg_handlers with RTNL_FLAG_DOIT_UNLOCKED).
 *
 * Parses the request, builds a dummy packet with
 * inet_rtm_getroute_build_skb(), and resolves it under rcu_read_lock():
 * through ip_route_input_rcu() when RTA_IIF is given, otherwise through
 * ip_route_output_key_hash_rcu(). The same skb is then trimmed and reused
 * for the reply, which is filled by fib_dump_info() when RTM_F_FIB_MATCH is
 * set and by rt_fill_info() otherwise, and sent with rtnl_unicast().
 *
 * Returns the rtnl_unicast() result, or a negative error; on the errout_rcu
 * path the skb is freed here.
 */
static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
			     struct netlink_ext_ack *extack)
{
	struct net *net = sock_net(in_skb->sk);
	struct nlattr *tb[RTA_MAX+1];
	u32 table_id = RT_TABLE_MAIN;
	__be16 sport = 0, dport = 0;
	struct fib_result res = {};
	u8 ip_proto = IPPROTO_UDP;
	struct rtable *rt = NULL;
	struct sk_buff *skb;
	struct rtmsg *rtm;
	struct flowi4 fl4 = {};
	__be32 dst = 0;
	__be32 src = 0;
	dscp_t dscp;
	kuid_t uid;
	u32 iif;
	int err;
	int mark;

	err = inet_rtm_valid_getroute_req(in_skb, nlh, tb, extack);
	if (err < 0)
		return err;

	rtm = nlmsg_data(nlh);
	src = nla_get_in_addr_default(tb[RTA_SRC], 0);
	dst = nla_get_in_addr_default(tb[RTA_DST], 0);
	iif = nla_get_u32_default(tb[RTA_IIF], 0);
	mark = nla_get_u32_default(tb[RTA_MARK], 0);
	dscp = inet_dsfield_to_dscp(rtm->rtm_tos);
	/* Without RTA_UID: INVALID_UID for input lookups, caller's uid else. */
	if (tb[RTA_UID])
		uid = make_kuid(current_user_ns(), nla_get_u32(tb[RTA_UID]));
	else
		uid = (iif ? INVALID_UID : current_uid());

	if (tb[RTA_IP_PROTO]) {
		err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
						  &ip_proto, AF_INET, extack);
		if (err)
			return err;
	}

	if (tb[RTA_SPORT])
		sport = nla_get_be16(tb[RTA_SPORT]);

	if (tb[RTA_DPORT])
		dport = nla_get_be16(tb[RTA_DPORT]);

	skb = inet_rtm_getroute_build_skb(src, dst, ip_proto, sport, dport);
	if (!skb)
		return -ENOBUFS;

	fl4.daddr = dst;
	fl4.saddr = src;
	fl4.flowi4_tos = inet_dscp_to_dsfield(dscp);
	fl4.flowi4_oif = nla_get_u32_default(tb[RTA_OIF], 0);
	fl4.flowi4_mark = mark;
	fl4.flowi4_uid = uid;
	if (sport)
		fl4.fl4_sport = sport;
	if (dport)
		fl4.fl4_dport = dport;
	fl4.flowi4_proto = ip_proto;

	rcu_read_lock();

	if (iif) {
		struct net_device *dev;

		dev = dev_get_by_index_rcu(net, iif);
		if (!dev) {
			err = -ENODEV;
			goto errout_rcu;
		}

		fl4.flowi4_iif = iif; /* for rt_fill_info */
		skb->dev	= dev;
		skb->mark	= mark;
		/* Any non-zero result of the input lookup is reported as
		 * -EINVAL.
		 */
		err = ip_route_input_rcu(skb, dst, src, dscp, dev,
					 &res) ? -EINVAL : 0;

		rt = skb_rtable(skb);
		if (err == 0 && rt->dst.error)
			err = -rt->dst.error;
	} else {
		fl4.flowi4_iif = LOOPBACK_IFINDEX;
		skb->dev = net->loopback_dev;
		rt = ip_route_output_key_hash_rcu(net, &fl4, &res, skb);
		err = 0;
		if (IS_ERR(rt))
			err = PTR_ERR(rt);
		else
			skb_dst_set(skb, &rt->dst);
	}

	if (err)
		goto errout_rcu;

	if (rtm->rtm_flags & RTM_F_NOTIFY)
		rt->rt_flags |= RTCF_NOTIFY;

	if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE)
		table_id = res.table ? res.table->tb_id : 0;

	/* reset skb for netlink reply msg */
	skb_trim(skb, 0);
	skb_reset_network_header(skb);
	skb_reset_transport_header(skb);
	skb_reset_mac_header(skb);

	if (rtm->rtm_flags & RTM_F_FIB_MATCH) {
		struct fib_rt_info fri;

		if (!res.fi) {
			err = fib_props[res.type].error;
			if (!err)
				err = -EHOSTUNREACH;
			goto errout_rcu;
		}
		fri.fi = res.fi;
		fri.tb_id = table_id;
		fri.dst = res.prefix;
		fri.dst_len = res.prefixlen;
		fri.dscp = res.dscp;
		fri.type = rt->rt_type;
		fri.offload = 0;
		fri.trap = 0;
		fri.offload_failed = 0;
		/* Copy the offload/trap flags from the first alias that
		 * matches the result on prefix length, table, dscp, fib_info
		 * and type.
		 */
		if (res.fa_head) {
			struct fib_alias *fa;

			hlist_for_each_entry_rcu(fa, res.fa_head, fa_list) {
				u8 slen = 32 - fri.dst_len;

				if (fa->fa_slen == slen &&
				    fa->tb_id == fri.tb_id &&
				    fa->fa_dscp == fri.dscp &&
				    fa->fa_info == res.fi &&
				    fa->fa_type == fri.type) {
					fri.offload = READ_ONCE(fa->offload);
					fri.trap = READ_ONCE(fa->trap);
					fri.offload_failed =
						READ_ONCE(fa->offload_failed);
					break;
				}
			}
		}
		err = fib_dump_info(skb, NETLINK_CB(in_skb).portid,
				    nlh->nlmsg_seq, RTM_NEWROUTE, &fri, 0);
	} else {
		err = rt_fill_info(net, dst, src, rt, table_id, res.dscp, &fl4,
				   skb, NETLINK_CB(in_skb).portid,
				   nlh->nlmsg_seq, 0);
	}
	if (err < 0)
		goto errout_rcu;

	rcu_read_unlock();

	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);

errout_free:
	return err;
errout_rcu:
	rcu_read_unlock();
	kfree_skb(skb);
	goto errout_free;
}

/* Flush the route cache of the namespace that owns in_dev's device. */
void ip_rt_multicast_event(struct in_device *in_dev)
{
	rt_cache_flush(dev_net(in_dev->dev));
}

#ifdef CONFIG_SYSCTL
/* Backing storage for sysctl entries in ipv4_route_table below
 * (ip_min_valid_pmtu is the extra1 bound of the per-netns "min_pmtu").
 */
static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
static int ip_rt_gc_min_interval __read_mostly	= HZ / 2;
static int ip_rt_gc_elasticity __read_mostly	= 8;
static int ip_min_valid_pmtu __read_mostly	= IPV4_MIN_MTU;

/*
 * Handler for the write-only "flush" sysctl. The namespace is taken from
 * the table entry's extra1 (set in sysctl_route_net_init()). A write flushes
 * the route cache and bumps the fnhe genid; the written value itself is not
 * looked at. Reads return -EINVAL.
 */
static int ipv4_sysctl_rtcache_flush(const struct ctl_table *__ctl, int write,
		void *buffer, size_t *lenp, loff_t *ppos)
{
	struct net *net = (struct net *)__ctl->extra1;

	if (write) {
		rt_cache_flush(net);
		fnhe_genid_bump(net);
		return 0;
	}

	return -EINVAL;
}

/*
 * Global (not per-namespace) route sysctls, registered for init_net under
 * "net/ipv4/route" by ip_static_sysctl_init(). Note that "gc_min_interval"
 * and "gc_min_interval_ms" expose the same variable through different
 * handlers.
 */
static struct ctl_table ipv4_route_table[] = {
	{
		.procname	= "gc_thresh",
		.data		= &ipv4_dst_ops.gc_thresh,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "max_size",
		.data		= &ip_rt_max_size,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		/*  Deprecated. Use gc_min_interval_ms */

		.procname	= "gc_min_interval",
		.data		= &ip_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_min_interval_ms",
		.data		= &ip_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_ms_jiffies,
	},
	{
		.procname	= "gc_timeout",
		.data		= &ip_rt_gc_timeout,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_interval",
		.data		= &ip_rt_gc_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "redirect_load",
		.data		= &ip_rt_redirect_load,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "redirect_number",
		.data		= &ip_rt_redirect_number,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "redirect_silence",
		.data		= &ip_rt_redirect_silence,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "error_cost",
		.data		= &ip_rt_error_cost,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "error_burst",
		.data		= &ip_rt_error_burst,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "gc_elasticity",
		.data		= &ip_rt_gc_elasticity,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
};

/* Shared so sysctl_route_net_init() can recognise the flush entry by
 * pointer comparison.
 */
static const char ipv4_route_flush_procname[] = "flush";

/*
 * Per-namespace route sysctls. Entry 0 must stay the "flush" entry:
 * sysctl_route_net_init() stores the namespace in its extra1 and relocates
 * the .data pointers of entries 1 and up from init_net to the target
 * namespace.
 */
static struct ctl_table ipv4_route_netns_table[] = {
	{
		.procname	= ipv4_route_flush_procname,
		.maxlen		= sizeof(int),
		.mode		= 0200,
		.proc_handler	= ipv4_sysctl_rtcache_flush,
	},
	{
		.procname       = "min_pmtu",
		.data           = &init_net.ipv4.ip_rt_min_pmtu,
		.maxlen         = sizeof(int),
		.mode           = 0644,
		.proc_handler   = proc_dointvec_minmax,
		.extra1         = &ip_min_valid_pmtu,
	},
	{
		.procname       = "mtu_expires",
		.data           = &init_net.ipv4.ip_rt_mtu_expires,
		.maxlen         = sizeof(int),
		.mode           = 0644,
		.proc_handler   = proc_dointvec_jiffies,
	},
	{
		.procname   = "min_adv_mss",
		.data       = &init_net.ipv4.ip_rt_min_advmss,
		.maxlen     = sizeof(int),
		.mode       = 0644,
		.proc_handler   = proc_dointvec,
	},
};

/*
 * Register the per-namespace route sysctls for @net.
 *
 * init_net uses ipv4_route_netns_table directly; every other namespace gets
 * a kmemdup() copy whose .data pointers (entries 1 and up) are shifted by
 * the byte distance between @net and init_net. Returns 0 on success and
 * -ENOMEM when the copy or the registration fails (the copy is freed in the
 * latter case).
 */
static __net_init int sysctl_route_net_init(struct net *net)
{
	struct ctl_table *tbl;
	size_t table_size = ARRAY_SIZE(ipv4_route_netns_table);

	tbl = ipv4_route_netns_table;
	if (!net_eq(net, &init_net)) {
		int i;

		tbl = kmemdup(tbl, sizeof(ipv4_route_netns_table), GFP_KERNEL);
		if (!tbl)
			goto err_dup;

		/* Don't export non-whitelisted sysctls to unprivileged users */
		if (net->user_ns != &init_user_ns) {
			if (tbl[0].procname != ipv4_route_flush_procname)
				table_size = 0;
		}

		/* Update the variables to point into the current struct net
		 * except for the first element flush
		 */
		for (i = 1; i < table_size; i++)
			tbl[i].data += (void *)net - (void *)&init_net;
	}
	tbl[0].extra1 = net;

	net->ipv4.route_hdr = register_net_sysctl_sz(net, "net/ipv4/route",
						     tbl, table_size);
	if (!net->ipv4.route_hdr)
		goto err_reg;
	return 0;

err_reg:
	if (tbl != ipv4_route_netns_table)
		kfree(tbl);
err_dup:
	return -ENOMEM;
}

/*
 * Unregister the per-namespace route sysctls and free the table copy made
 * by sysctl_route_net_init(). The BUG_ON() asserts that the static table
 * (the one used for init_net) is never the one being freed.
 */
static __net_exit void sysctl_route_net_exit(struct net *net)
{
	const struct ctl_table *tbl;

	tbl = net->ipv4.route_hdr->ctl_table_arg;
	unregister_net_sysctl_table(net->ipv4.route_hdr);
	BUG_ON(tbl == ipv4_route_netns_table);
	kfree(tbl);
}

static __net_initdata struct pernet_operations sysctl_route_ops = {
	.init = sysctl_route_net_init,
	.exit = sysctl_route_net_exit,
};
#endif

/* Per-namespace init: load the defaults of the three namespaced route
 * sysctl values. Always returns 0.
 */
static __net_init int netns_ip_rt_init(struct net *net)
{
	/* Set default value for namespaceified sysctls */
	net->ipv4.ip_rt_min_pmtu = DEFAULT_MIN_PMTU;
	net->ipv4.ip_rt_mtu_expires = DEFAULT_MTU_EXPIRES;
	net->ipv4.ip_rt_min_advmss = DEFAULT_MIN_ADVMSS;
	return 0;
}

static struct pernet_operations __net_initdata ip_rt_ops = {
	.init = netns_ip_rt_init,
};

/* Per-namespace init of the generation counters: rt_genid and fnhe_genid
 * start at 0, dev_addr_genid starts at a random value. Always returns 0.
 */
static __net_init int rt_genid_init(struct net *net)
{
	atomic_set(&net->ipv4.rt_genid, 0);
	atomic_set(&net->fnhe_genid, 0);
	atomic_set(&net->ipv4.dev_addr_genid, get_random_u32());
	return 0;
}

static __net_initdata struct pernet_operations rt_genid_ops = {
	.init = rt_genid_init,
};

/* Allocate and initialise the namespace's inet_peer_base.
 * Returns 0, or -ENOMEM when kmalloc() fails.
 */
static int __net_init ipv4_inetpeer_init(struct net *net)
{
	struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);

	if (!bp)
		return -ENOMEM;
	inet_peer_base_init(bp);
	net->ipv4.peers = bp;
	return 0;
}

/* Detach the namespace's inet_peer_base (the pointer is cleared first),
 * invalidate its tree and free it.
 */
static void __net_exit ipv4_inetpeer_exit(struct net *net)
{
	struct inet_peer_base *bp = net->ipv4.peers;

	net->ipv4.peers = NULL;
	inetpeer_invalidate_tree(bp);
	kfree(bp);
}

static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
	.init	=	ipv4_inetpeer_init,
	.exit	=	ipv4_inetpeer_exit,
};

#ifdef CONFIG_IP_ROUTE_CLASSID
/* Per-CPU accounting area; allocated in ip_rt_init() with room for 256
 * struct ip_rt_acct entries.
 */
struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
#endif /* CONFIG_IP_ROUTE_CLASSID */

/* rtnetlink handlers registered by ip_rt_init(). */
static const struct rtnl_msg_handler ip_rt_rtnl_msg_handlers[] __initconst = {
	{.protocol = PF_INET, .msgtype = RTM_GETROUTE,
	 .doit = inet_rtm_getroute, .flags = RTNL_FLAG_DOIT_UNLOCKED},
};

/*
 * ip_rt_init - boot-time initialisation of the IPv4 routing layer.
 *
 * In order: allocates one hash area shared by ip_idents (randomised) and
 * ip_tstamps (placed right after the idents), initialises the per-CPU
 * uncached route lists, allocates ip_rt_acct when CONFIG_IP_ROUTE_CLASSID
 * is set, creates the rtable slab cache shared by ipv4_dst_ops and
 * ipv4_dst_blackhole_ops, initialises both dst entry counters, sets
 * gc_thresh to ~0 and ip_rt_max_size to INT_MAX, then initialises devinet,
 * the FIB, the proc files, xfrm (if configured), the rtnetlink handler and
 * the pernet subsystems defined in this file.
 *
 * Allocation failures of ip_rt_acct and the dst counters panic; a proc
 * init failure is only logged. The return values of
 * register_pernet_subsys() are not checked. Always returns 0.
 */
int __init ip_rt_init(void)
{
	void *idents_hash;
	int cpu;

	/* For modern hosts, this will use 2 MB of memory */
	idents_hash = alloc_large_system_hash("IP idents",
					      sizeof(*ip_idents) + sizeof(*ip_tstamps),
					      0,
					      16, /* one bucket per 64 KB */
					      HASH_ZERO,
					      NULL,
					      &ip_idents_mask,
					      2048,
					      256*1024);

	ip_idents = idents_hash;

	get_random_bytes(ip_idents, (ip_idents_mask + 1) * sizeof(*ip_idents));

	/* The timestamps live in the same allocation, after all idents. */
	ip_tstamps = idents_hash + (ip_idents_mask + 1) * sizeof(*ip_idents);

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);

		INIT_LIST_HEAD(&ul->head);
		spin_lock_init(&ul->lock);
	}
#ifdef CONFIG_IP_ROUTE_CLASSID
	ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
	if (!ip_rt_acct)
		panic("IP: failed to allocate ip_rt_acct\n");
#endif

	ipv4_dst_ops.kmem_cachep = KMEM_CACHE(rtable,
					      SLAB_HWCACHE_ALIGN | SLAB_PANIC);

	ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;

	if (dst_entries_init(&ipv4_dst_ops) < 0)
		panic("IP: failed to allocate ipv4_dst_ops counter\n");

	if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
		panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");

	ipv4_dst_ops.gc_thresh = ~0;
	ip_rt_max_size = INT_MAX;

	devinet_init();
	ip_fib_init();

	if (ip_rt_proc_init())
		pr_err("Unable to create route proc files\n");
#ifdef CONFIG_XFRM
	xfrm_init();
	xfrm4_init();
#endif
	rtnl_register_many(ip_rt_rtnl_msg_handlers);

#ifdef CONFIG_SYSCTL
	register_pernet_subsys(&sysctl_route_ops);
#endif
	register_pernet_subsys(&ip_rt_ops);
	register_pernet_subsys(&rt_genid_ops);
	register_pernet_subsys(&ipv4_inetpeer_ops);
	return 0;
}

#ifdef CONFIG_SYSCTL
/*
 * We really need to sanitize the damn ipv4 init order, then all
 * this nonsense will go away.
 *
 * Registers the global ipv4_route_table for init_net under
 * "net/ipv4/route".
 */
void __init ip_static_sysctl_init(void)
{
	register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
}
#endif
1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 
526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 
1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 
1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 
1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 
2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 
2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 
3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 
3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 
3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 
4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 
4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 
5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 5321 5322 5323 5324 5325 5326 5327 5328 5329 5330 5331 5332 5333 5334 5335 5336 5337 5338 5339 5340 5341 5342 5343 5344 5345 5346 5347 5348 5349 5350 5351 5352 5353 5354 5355 5356 5357 5358 5359 5360 5361 5362 5363 5364 5365 5366 5367 5368 5369 5370 5371 5372 5373 5374 5375 5376 5377 5378 5379 5380 5381 5382 5383 5384 5385 5386 5387 5388 5389 5390 5391 5392 5393 5394 5395 5396 5397 5398 5399 5400 5401 5402 5403 5404 5405 5406 5407 5408 5409 5410 5411 5412 5413 5414 5415 5416 5417 5418 5419 
5420 5421 5422 5423 5424 5425 5426 5427 5428 5429 5430 5431 5432 5433 5434 5435 5436 5437 5438 5439 5440 5441 5442 5443 5444 5445 5446 5447 5448 5449 5450 5451 5452 5453 5454 5455 5456 5457 5458 5459 5460 5461 5462 5463 5464 5465 5466 5467 5468 5469 5470 5471 5472 5473 5474 5475 5476 5477 5478 5479 5480 5481 5482 5483 5484 5485 5486 5487 5488 5489 5490 5491 5492 5493 5494 5495 5496 5497 5498 5499 5500 5501 5502 5503 5504 5505 5506 5507 5508 5509 5510 5511 5512 5513 5514 5515 5516 5517 5518 5519 5520 5521 5522 5523 5524 5525 5526 5527 5528 5529 5530 5531 5532 5533 5534 5535 5536 5537 5538 5539 5540 5541 5542 5543 5544 5545 5546 5547 5548 5549 5550 5551 5552 5553 5554 5555 5556 5557 5558 5559 5560 5561 5562 5563 5564 5565 5566 5567 5568 5569 5570 5571 5572 5573 5574 5575 5576 5577 5578 5579 5580 5581 5582 5583 5584 5585 5586 5587 5588 5589 5590 5591 5592 5593 5594 5595 5596 5597 5598 5599 5600 5601 5602 5603 5604 5605 5606 5607 5608 5609 5610 5611 5612 5613 5614 5615 5616 5617 5618 5619 5620 5621 5622 5623 5624 5625 5626 5627 5628 5629 5630 5631 5632 5633 5634 5635 5636 5637 5638 5639 5640 5641 5642 5643 5644 5645 5646 5647 5648 5649 5650 5651 5652 5653 5654 5655 5656 5657 5658 5659 5660 5661 5662 5663 5664 5665 5666 5667 5668 5669 5670 5671 5672 5673 5674 5675 5676 5677 5678 5679 5680 5681 5682 5683 5684 5685 5686 5687 5688 5689 5690 5691 5692 5693 5694 5695 5696 5697 5698 5699 5700 5701 5702 5703 5704 5705 5706 5707 5708 5709 5710 5711 5712 5713 5714 5715 5716 5717 5718 5719 5720 5721 5722 5723 5724 5725 5726 5727 5728 5729 5730 5731 5732 5733 5734 5735 5736 5737 5738 5739 5740 5741 5742 5743 5744 5745 5746 5747 5748 5749 5750 5751 5752 5753 5754 5755 5756 5757 5758 5759 5760 5761 5762 5763 5764 5765 5766 5767 5768 5769 5770 5771 5772 5773 5774 5775 5776 5777 5778 5779 5780 5781 5782 5783 5784 5785 5786 5787 5788 5789 5790 5791 5792 5793 5794 5795 5796 5797 5798 5799 5800 5801 5802 5803 5804 5805 5806 5807 5808 5809 5810 5811 5812 5813 5814 5815 5816 5817 5818 5819 
5820 5821 5822 5823 5824 5825 5826 5827 5828 5829 5830 5831 5832 5833 5834 5835 5836 5837 5838 5839 5840 5841 5842 5843 5844 5845 5846 5847 5848 5849 5850 5851 5852 5853 5854 5855 5856 5857 5858 5859 5860 5861 5862 5863 5864 5865 5866 5867 5868 5869 5870 5871 5872 5873 5874 5875 5876 5877 5878 5879 5880 5881 5882 5883 5884 5885 5886 5887 5888 5889 5890 5891 5892 5893 5894 5895 5896 5897 5898 5899 5900 5901 5902 5903 5904 5905 5906 5907 5908 5909 5910 5911 5912 5913 5914 5915 5916 5917 5918 5919 5920 5921 5922 5923 5924 5925 5926 5927 5928 5929 5930 5931 5932 5933 5934 5935 5936 5937 5938 5939 5940 // SPDX-License-Identifier: GPL-2.0 #include <linux/ceph/ceph_debug.h> #include <linux/module.h> #include <linux/err.h> #include <linux/highmem.h> #include <linux/mm.h> #include <linux/pagemap.h> #include <linux/slab.h> #include <linux/uaccess.h> #ifdef CONFIG_BLOCK #include <linux/bio.h> #endif #include <linux/ceph/ceph_features.h> #include <linux/ceph/libceph.h> #include <linux/ceph/osd_client.h> #include <linux/ceph/messenger.h> #include <linux/ceph/decode.h> #include <linux/ceph/auth.h> #include <linux/ceph/pagelist.h> #include <linux/ceph/striper.h> #define OSD_OPREPLY_FRONT_LEN 512 static struct kmem_cache *ceph_osd_request_cache; static const struct ceph_connection_operations osd_con_ops; /* * Implement client access to distributed object storage cluster. * * All data objects are stored within a cluster/cloud of OSDs, or * "object storage devices." (Note that Ceph OSDs have _nothing_ to * do with the T10 OSD extensions to SCSI.) Ceph OSDs are simply * remote daemons serving up and coordinating consistent and safe * access to storage. * * Cluster membership and the mapping of data objects onto storage devices * are described by the osd map. * * We keep track of pending OSD requests (read, write), resubmit * requests to different OSDs when the cluster topology/data layout * change, or retry the affected requests when the communications * channel with an OSD is reset. 
*/

/* Forward declarations of file-local helpers. */
static void link_request(struct ceph_osd *osd, struct ceph_osd_request *req);
static void unlink_request(struct ceph_osd *osd, struct ceph_osd_request *req);
static void link_linger(struct ceph_osd *osd,
			struct ceph_osd_linger_request *lreq);
static void unlink_linger(struct ceph_osd *osd,
			  struct ceph_osd_linger_request *lreq);
static void clear_backoffs(struct ceph_osd *osd);

/*
 * Lock-assertion helpers.  With "#if 1" they emit a WARN_ON() when the
 * expected lock is not held; the "#else" branch replaces them with empty
 * stubs.
 */
#if 1
/*
 * Try to take @sem for reading without blocking.  If that fails, report
 * the semaphore as write-locked (return true).  If it succeeds, drop the
 * read lock again right away and return false.
 */
static inline bool rwsem_is_wrlocked(struct rw_semaphore *sem)
{
	bool wrlocked = true;

	if (unlikely(down_read_trylock(sem))) {
		wrlocked = false;
		up_read(sem);
	}

	return wrlocked;
}

/* Warn unless rwsem_is_locked() reports @osdc->lock as held. */
static inline void verify_osdc_locked(struct ceph_osd_client *osdc)
{
	WARN_ON(!rwsem_is_locked(&osdc->lock));
}

/* Warn unless rwsem_is_wrlocked() reports @osdc->lock as write-locked. */
static inline void verify_osdc_wrlocked(struct ceph_osd_client *osdc)
{
	WARN_ON(!rwsem_is_wrlocked(&osdc->lock));
}

/*
 * Accept either of two states: @osd->lock is locked while the owning
 * client's osdc->lock is locked, or osdc->lock is write-locked.  Warn if
 * neither holds.
 */
static inline void verify_osd_locked(struct ceph_osd *osd)
{
	struct ceph_osd_client *osdc = osd->o_osdc;

	WARN_ON(!(mutex_is_locked(&osd->lock) &&
		  rwsem_is_locked(&osdc->lock)) &&
		!rwsem_is_wrlocked(&osdc->lock));
}

/* Warn unless @lreq->lock is locked. */
static inline void verify_lreq_locked(struct ceph_osd_linger_request *lreq)
{
	WARN_ON(!mutex_is_locked(&lreq->lock));
}
#else
static inline void verify_osdc_locked(struct ceph_osd_client *osdc) { }
static inline void verify_osdc_wrlocked(struct ceph_osd_client *osdc) { }
static inline void verify_osd_locked(struct ceph_osd *osd) { }
static inline void verify_lreq_locked(struct ceph_osd_linger_request *lreq) { }
#endif

/*
 * Calculate the mapping of a file extent onto an object, shortening the
 * extent as necessary if it crosses an object boundary.
 *
 * @layout: file layout passed through to ceph_calc_file_object_mapping()
 * @off:    file offset of the extent
 * @plen:   in: requested extent length; out: reduced to *@objlen when the
 *          mapped object extent is shorter than the request
 * @objnum: out: filled in by ceph_calc_file_object_mapping()
 * @objoff: out: filled in by ceph_calc_file_object_mapping()
 * @objlen: out: length of the mapped object extent
 *
 * This function only computes the mapping; it does not touch any request.
 * Always returns 0.
 */
static int calc_layout(struct ceph_file_layout *layout, u64 off, u64 *plen,
		       u64 *objnum, u64 *objoff, u64 *objlen)
{
	u64 orig_len = *plen;
	u32 xlen;	/* the mapping helper reports the length as u32 */

	/* object extent? */
	ceph_calc_file_object_mapping(layout, off, orig_len, objnum,
				      objoff, &xlen);
	*objlen = xlen;
	if (*objlen < orig_len) {
		/* the mapped extent is shorter than asked for: trim the request */
		*plen = *objlen;
		dout(" skipping last %llu, final file extent %llu~%llu\n",
		     orig_len - *plen, off, *plen);
	}

	dout("calc_layout objnum=%llx %llu~%llu\n", *objnum, *objoff, *objlen);
	return 0;
}

/* Zero @osd_data and mark it as carrying no data. */
static void ceph_osd_data_init(struct ceph_osd_data *osd_data)
{
	memset(osd_data, 0, sizeof (*osd_data));
	osd_data->type = CEPH_OSD_DATA_TYPE_NONE;
}

/*
 * Describe a page-vector buffer in @osd_data: records @pages, @length,
 * @alignment and the two ownership flags, and sets the type to PAGES.
 *
 * Consumes @pages if @own_pages is true.
 */
static void ceph_osd_data_pages_init(struct ceph_osd_data *osd_data,
			struct page **pages, u64 length, u32 alignment,
			bool pages_from_pool, bool own_pages)
{
	osd_data->type = CEPH_OSD_DATA_TYPE_PAGES;
	osd_data->pages = pages;
	osd_data->length = length;
	osd_data->alignment = alignment;
	osd_data->pages_from_pool = pages_from_pool;
	osd_data->own_pages = own_pages;
}

/*
 * Describe a pagelist buffer in @osd_data.
 *
 * Consumes a ref on @pagelist.
 */
static void ceph_osd_data_pagelist_init(struct ceph_osd_data *osd_data,
			struct ceph_pagelist *pagelist)
{
	osd_data->type = CEPH_OSD_DATA_TYPE_PAGELIST;
	osd_data->pagelist = pagelist;
}

#ifdef CONFIG_BLOCK
/*
 * Describe a bio buffer in @osd_data.  The iterator *@bio_pos is copied
 * by value; @bio_length is stored as the data length.
 */
static void ceph_osd_data_bio_init(struct ceph_osd_data *osd_data,
				   struct ceph_bio_iter *bio_pos,
				   u32 bio_length)
{
	osd_data->type = CEPH_OSD_DATA_TYPE_BIO;
	osd_data->bio_pos = *bio_pos;
	osd_data->bio_length = bio_length;
}
#endif /* CONFIG_BLOCK */

/*
 * Describe a bio_vec buffer in @osd_data.  The iterator *@bvec_pos is
 * copied by value and @num_bvecs is stored alongside it.
 */
static void ceph_osd_data_bvecs_init(struct ceph_osd_data *osd_data,
				     struct ceph_bvec_iter *bvec_pos,
				     u32 num_bvecs)
{
	osd_data->type = CEPH_OSD_DATA_TYPE_BVECS;
	osd_data->bvec_pos = *bvec_pos;
	osd_data->num_bvecs = num_bvecs;
}

/* Describe an iov_iter buffer in @osd_data; *@iter is copied by value. */
static void ceph_osd_iter_init(struct ceph_osd_data *osd_data,
			       struct iov_iter *iter)
{
	osd_data->type = CEPH_OSD_DATA_TYPE_ITER;
	osd_data->iter = *iter;
}

/*
 * Return a pointer to the raw_data_in member of op number @which of
 * @osd_req.  BUG()s if @which is not below r_num_ops.
 */
static struct ceph_osd_data *
osd_req_op_raw_data_in(struct ceph_osd_request *osd_req, unsigned int which)
{
	BUG_ON(which >= osd_req->r_num_ops);

	return &osd_req->r_ops[which].raw_data_in;
}

/*
 * Return the extent.osd_data descriptor of op number @which.
 *
 * NOTE(review): osd_req_op_data() is not defined in this part of the
 * file; here and in the wrappers below it is assumed to yield a pointer
 * to r_ops[which].<type>.<field> -- confirm against its definition.
 */
struct ceph_osd_data *
osd_req_op_extent_osd_data(struct ceph_osd_request *osd_req,
			   unsigned int which)
{
	return osd_req_op_data(osd_req, which, extent, osd_data);
}
EXPORT_SYMBOL(osd_req_op_extent_osd_data);

/* Attach a page vector as the raw_data_in buffer of op number @which. */
void osd_req_op_raw_data_in_pages(struct ceph_osd_request *osd_req,
			unsigned int which, struct page **pages,
			u64 length, u32 alignment,
			bool pages_from_pool, bool own_pages)
{
	struct ceph_osd_data *osd_data;

	osd_data = osd_req_op_raw_data_in(osd_req, which);
	ceph_osd_data_pages_init(osd_data, pages, length, alignment,
				 pages_from_pool, own_pages);
}
EXPORT_SYMBOL(osd_req_op_raw_data_in_pages);

/* Attach a page vector as the extent data buffer of op number @which. */
void osd_req_op_extent_osd_data_pages(struct ceph_osd_request *osd_req,
			unsigned int which, struct page **pages,
			u64 length, u32 alignment,
			bool pages_from_pool, bool own_pages)
{
	struct ceph_osd_data *osd_data;

	osd_data = osd_req_op_data(osd_req, which, extent, osd_data);
	ceph_osd_data_pages_init(osd_data, pages, length, alignment,
				 pages_from_pool, own_pages);
}
EXPORT_SYMBOL(osd_req_op_extent_osd_data_pages);

/* Attach @pagelist as the extent data buffer of op number @which. */
void osd_req_op_extent_osd_data_pagelist(struct ceph_osd_request *osd_req,
			unsigned int which, struct ceph_pagelist *pagelist)
{
	struct ceph_osd_data *osd_data;

	osd_data = osd_req_op_data(osd_req, which, extent, osd_data);
	ceph_osd_data_pagelist_init(osd_data, pagelist);
}
EXPORT_SYMBOL(osd_req_op_extent_osd_data_pagelist);

#ifdef CONFIG_BLOCK
/* Attach a bio iterator as the extent data buffer of op number @which. */
void osd_req_op_extent_osd_data_bio(struct ceph_osd_request *osd_req,
				    unsigned int which,
				    struct ceph_bio_iter *bio_pos,
				    u32 bio_length)
{
	struct ceph_osd_data *osd_data;

	osd_data = osd_req_op_data(osd_req, which, extent, osd_data);
	ceph_osd_data_bio_init(osd_data, bio_pos, bio_length);
}
EXPORT_SYMBOL(osd_req_op_extent_osd_data_bio);
#endif /* CONFIG_BLOCK */

/*
 * Attach a bio_vec array as the extent data buffer of op number @which.
 * A local iterator is built over @bvecs with bi_size set to @bytes and
 * then copied into the descriptor together with @num_bvecs.
 */
void osd_req_op_extent_osd_data_bvecs(struct ceph_osd_request *osd_req,
				      unsigned int which,
				      struct bio_vec *bvecs, u32 num_bvecs,
				      u32 bytes)
{
	struct ceph_osd_data *osd_data;
	struct ceph_bvec_iter it = {
		.bvecs = bvecs,
		.iter = { .bi_size = bytes },
	};

	osd_data = osd_req_op_data(osd_req, which, extent, osd_data);
	ceph_osd_data_bvecs_init(osd_data, &it, num_bvecs);
}
EXPORT_SYMBOL(osd_req_op_extent_osd_data_bvecs);

/*
 * Attach an existing bio_vec iterator position as the extent data buffer
 * of op number @which.  The descriptor's num_bvecs is set to 0.
 */
void osd_req_op_extent_osd_data_bvec_pos(struct ceph_osd_request *osd_req,
					 unsigned int which,
					 struct ceph_bvec_iter *bvec_pos)
{
	struct ceph_osd_data *osd_data;

	osd_data = osd_req_op_data(osd_req, which, extent, osd_data);
	ceph_osd_data_bvecs_init(osd_data, bvec_pos, 0);
}
EXPORT_SYMBOL(osd_req_op_extent_osd_data_bvec_pos);

/**
 * osd_req_op_extent_osd_iter - Set up an operation with an iterator buffer
 * @osd_req: The request to set up
 * @which: Index of the operation in which to set the iter
 * @iter: The buffer iterator
 */
void osd_req_op_extent_osd_iter(struct ceph_osd_request *osd_req,
				unsigned int which, struct iov_iter *iter)
{
	struct ceph_osd_data *osd_data;

	osd_data = osd_req_op_data(osd_req, which, extent, osd_data);
	ceph_osd_iter_init(osd_data, iter);
}
EXPORT_SYMBOL(osd_req_op_extent_osd_iter);

/* Attach @pagelist as the cls.request_info buffer of op number @which. */
static void osd_req_op_cls_request_info_pagelist(
			struct ceph_osd_request *osd_req,
			unsigned int which, struct ceph_pagelist *pagelist)
{
	struct ceph_osd_data *osd_data;

	osd_data = osd_req_op_data(osd_req, which, cls, request_info);
	ceph_osd_data_pagelist_init(osd_data, pagelist);
}

/*
 * Attach @pagelist as the cls.request_data buffer of op number @which and
 * add its length to both the op's cls.indata_len and its indata_len.
 */
void osd_req_op_cls_request_data_pagelist(
			struct ceph_osd_request *osd_req,
			unsigned int which, struct ceph_pagelist *pagelist)
{
	struct ceph_osd_data *osd_data;

	osd_data = osd_req_op_data(osd_req, which, cls, request_data);
	ceph_osd_data_pagelist_init(osd_data, pagelist);
	osd_req->r_ops[which].cls.indata_len += pagelist->length;
	osd_req->r_ops[which].indata_len += pagelist->length;
}
EXPORT_SYMBOL(osd_req_op_cls_request_data_pagelist);

/*
 * Attach a page vector as the cls.request_data buffer of op number @which
 * and add @length to both the op's cls.indata_len and its indata_len.
 */
void osd_req_op_cls_request_data_pages(struct ceph_osd_request *osd_req,
			unsigned int which, struct page **pages, u64 length,
			u32 alignment, bool pages_from_pool, bool own_pages)
{
	struct ceph_osd_data *osd_data;

	osd_data = osd_req_op_data(osd_req, which, cls, request_data);
	ceph_osd_data_pages_init(osd_data, pages, length, alignment,
				 pages_from_pool, own_pages);
	osd_req->r_ops[which].cls.indata_len += length;
	osd_req->r_ops[which].indata_len += length;
}
EXPORT_SYMBOL(osd_req_op_cls_request_data_pages);

/*
 * Attach a bio_vec array as the cls.request_data buffer of op number
 * @which and add @bytes to both the op's cls.indata_len and its
 * indata_len.
 */
void osd_req_op_cls_request_data_bvecs(struct ceph_osd_request *osd_req,
				       unsigned int which,
				       struct bio_vec *bvecs, u32 num_bvecs,
				       u32 bytes)
{
	struct ceph_osd_data *osd_data;
	struct ceph_bvec_iter it = {
		.bvecs = bvecs,
		.iter = { .bi_size = bytes },
	};

	osd_data = osd_req_op_data(osd_req, which, cls, request_data);
	ceph_osd_data_bvecs_init(osd_data, &it, num_bvecs);
	osd_req->r_ops[which].cls.indata_len += bytes;
	osd_req->r_ops[which].indata_len += bytes;
}
EXPORT_SYMBOL(osd_req_op_cls_request_data_bvecs);

/*
 * Attach a page vector as the cls.response_data buffer of op number
 * @which.  Unlike the request_data variants, no indata_len is adjusted.
 */
void osd_req_op_cls_response_data_pages(struct ceph_osd_request *osd_req,
			unsigned int which, struct page **pages, u64 length,
			u32 alignment, bool pages_from_pool, bool own_pages)
{
	struct ceph_osd_data *osd_data;

	osd_data = osd_req_op_data(osd_req, which, cls, response_data);
	ceph_osd_data_pages_init(osd_data, pages, length, alignment,
				 pages_from_pool, own_pages);
}
EXPORT_SYMBOL(osd_req_op_cls_response_data_pages);

/*
 * Return the data length described by @osd_data, read from whichever
 * field matches its type.  A NONE descriptor has length 0.  An
 * unrecognized type triggers a WARN and also yields 0.
 */
static u64 ceph_osd_data_length(struct ceph_osd_data *osd_data)
{
	switch (osd_data->type) {
	case CEPH_OSD_DATA_TYPE_NONE:
		return 0;
	case CEPH_OSD_DATA_TYPE_PAGES:
		return osd_data->length;
	case CEPH_OSD_DATA_TYPE_PAGELIST:
		return (u64)osd_data->pagelist->length;
#ifdef CONFIG_BLOCK
	case CEPH_OSD_DATA_TYPE_BIO:
		return (u64)osd_data->bio_length;
#endif /* CONFIG_BLOCK */
	case CEPH_OSD_DATA_TYPE_BVECS:
		return osd_data->bvec_pos.iter.bi_size;
	case CEPH_OSD_DATA_TYPE_ITER:
		return iov_iter_count(&osd_data->iter);
	default:
		WARN(true, "unrecognized data type %d\n", (int)osd_data->type);
		return 0;
	}
}

/*
 * Release what @osd_data holds and reset it to the NONE state.
 *
 * A page vector is released only when own_pages is set; the page count
 * comes from calc_pages_for() on the stored alignment and length.  A
 * pagelist is always released.  For every other type nothing is
 * released here and the descriptor is only reset.
 */
static void ceph_osd_data_release(struct ceph_osd_data *osd_data)
{
	if (osd_data->type == CEPH_OSD_DATA_TYPE_PAGES && osd_data->own_pages) {
		int num_pages;

		num_pages = calc_pages_for((u64)osd_data->alignment,
						(u64)osd_data->length);
		ceph_release_page_vector(osd_data->pages, num_pages);
	} else if (osd_data->type == CEPH_OSD_DATA_TYPE_PAGELIST) {
		ceph_pagelist_release(osd_data->pagelist);
	}
	ceph_osd_data_init(osd_data);
}

/*
 * Release every data descriptor that belongs to op number @which of
 * @osd_req, chosen by the op's opcode.  For the extent opcodes the
 * sparse_ext array is freed as well.  Opcodes not listed are left
 * untouched.  BUG()s if @which is not below r_num_ops.
 */
static void osd_req_op_data_release(struct ceph_osd_request *osd_req,
			unsigned int which)
{
	struct ceph_osd_req_op *op;

	BUG_ON(which >= osd_req->r_num_ops);
	op = &osd_req->r_ops[which];

	switch (op->op) {
	case CEPH_OSD_OP_READ:
	case CEPH_OSD_OP_SPARSE_READ:
	case CEPH_OSD_OP_WRITE:
	case CEPH_OSD_OP_WRITEFULL:
		kfree(op->extent.sparse_ext);
		ceph_osd_data_release(&op->extent.osd_data);
		break;
	case CEPH_OSD_OP_CALL:
		ceph_osd_data_release(&op->cls.request_info);
		ceph_osd_data_release(&op->cls.request_data);
		ceph_osd_data_release(&op->cls.response_data);
		break;
	case CEPH_OSD_OP_SETXATTR:
	case CEPH_OSD_OP_CMPXATTR:
		ceph_osd_data_release(&op->xattr.osd_data);
		break;
	case CEPH_OSD_OP_STAT:
		ceph_osd_data_release(&op->raw_data_in);
		break;
	case CEPH_OSD_OP_NOTIFY_ACK:
		ceph_osd_data_release(&op->notify_ack.request_data);
		break;
	case CEPH_OSD_OP_NOTIFY:
		ceph_osd_data_release(&op->notify.request_data);
		ceph_osd_data_release(&op->notify.response_data);
		break;
	case CEPH_OSD_OP_LIST_WATCHERS:
		ceph_osd_data_release(&op->list_watchers.response_data);
		break;
	case CEPH_OSD_OP_COPY_FROM2:
		ceph_osd_data_release(&op->copy_from.osd_data);
		break;
	default:
		break;
	}
}

/*
 * Assumes @t is zero-initialized.
*/
static void target_init(struct ceph_osd_request_target *t)
{
	ceph_oid_init(&t->base_oid);
	ceph_oloc_init(&t->base_oloc);
	ceph_oid_init(&t->target_oid);
	ceph_oloc_init(&t->target_oloc);

	ceph_osds_init(&t->acting);
	ceph_osds_init(&t->up);
	/* -1 marks size/min_size as not yet taken from a pool */
	t->size = -1;
	t->min_size = -1;

	t->osd = CEPH_HOMELESS_OSD;
}

/*
 * Copy every field of @src into @dest.  oid, oloc and osd sets go through
 * their copy helpers; the remaining fields are assigned directly.
 */
static void target_copy(struct ceph_osd_request_target *dest,
			const struct ceph_osd_request_target *src)
{
	ceph_oid_copy(&dest->base_oid, &src->base_oid);
	ceph_oloc_copy(&dest->base_oloc, &src->base_oloc);
	ceph_oid_copy(&dest->target_oid, &src->target_oid);
	ceph_oloc_copy(&dest->target_oloc, &src->target_oloc);

	dest->pgid = src->pgid; /* struct */
	dest->spgid = src->spgid; /* struct */
	dest->pg_num = src->pg_num;
	dest->pg_num_mask = src->pg_num_mask;
	ceph_osds_copy(&dest->acting, &src->acting);
	ceph_osds_copy(&dest->up, &src->up);
	dest->size = src->size;
	dest->min_size = src->min_size;
	dest->sort_bitwise = src->sort_bitwise;
	dest->recovery_deletes = src->recovery_deletes;

	dest->flags = src->flags;
	dest->used_replica = src->used_replica;
	dest->paused = src->paused;

	dest->epoch = src->epoch;
	dest->last_force_resend = src->last_force_resend;

	dest->osd = src->osd;
}

/*
 * Destroy the base and target oid/oloc of @t.  No other field is touched.
 */
static void target_destroy(struct ceph_osd_request_target *t)
{
	ceph_oid_destroy(&t->base_oid);
	ceph_oloc_destroy(&t->base_oloc);
	ceph_oid_destroy(&t->target_oid);
	ceph_oloc_destroy(&t->target_oloc);
}

/*
 * requests
 */

/*
 * Warn if @req is still on either rbtree, still on a private list or
 * still linked to an OSD at the time it is being released.
 */
static void request_release_checks(struct ceph_osd_request *req)
{
	WARN_ON(!RB_EMPTY_NODE(&req->r_node));
	WARN_ON(!RB_EMPTY_NODE(&req->r_mc_node));
	WARN_ON(!list_empty(&req->r_private_item));
	WARN_ON(req->r_osd);
}

/*
 * kref release callback: drop the request and reply messages, release
 * the data of every op, destroy the target, drop the snap context and
 * free @req the same way ceph_osdc_alloc_request() allocated it.
 */
static void ceph_osdc_release_request(struct kref *kref)
{
	struct ceph_osd_request *req = container_of(kref,
					    struct ceph_osd_request, r_kref);
	unsigned int which;

	dout("%s %p (r_request %p r_reply %p)\n", __func__, req,
	     req->r_request, req->r_reply);
	request_release_checks(req);

	if (req->r_request)
		ceph_msg_put(req->r_request);
	if (req->r_reply)
		ceph_msg_put(req->r_reply);

	for (which = 0; which < req->r_num_ops; which++)
		osd_req_op_data_release(req, which);

	target_destroy(&req->r_t);
	ceph_put_snap_context(req->r_snapc);

	/* mempool, slab cache or kmalloc -- mirrors the allocation choice */
	if (req->r_mempool)
		mempool_free(req, req->r_osdc->req_mempool);
	else if (req->r_num_ops <= CEPH_OSD_SLAB_OPS)
		kmem_cache_free(ceph_osd_request_cache, req);
	else
		kfree(req);
}

/*
 * Take a reference on @req.
 */
void ceph_osdc_get_request(struct ceph_osd_request *req)
{
	dout("%s %p (was %d)\n", __func__, req,
	     kref_read(&req->r_kref));
	kref_get(&req->r_kref);
}
EXPORT_SYMBOL(ceph_osdc_get_request);

/*
 * Drop a reference on @req; the request is released through
 * ceph_osdc_release_request() when the last one goes away.  A NULL @req
 * is ignored.
 */
void ceph_osdc_put_request(struct ceph_osd_request *req)
{
	if (req) {
		dout("%s %p (was %d)\n", __func__, req,
		     kref_read(&req->r_kref));
		kref_put(&req->r_kref, ceph_osdc_release_request);
	}
}
EXPORT_SYMBOL(ceph_osdc_put_request);

/*
 * Zero the request header (not the trailing ops) and initialize its
 * kref, completion, rbtree nodes, private list item and target.
 */
static void request_init(struct ceph_osd_request *req)
{
	/* req only, each op is zeroed in osd_req_op_init() */
	memset(req, 0, sizeof(*req));

	kref_init(&req->r_kref);
	init_completion(&req->r_completion);
	RB_CLEAR_NODE(&req->r_node);
	RB_CLEAR_NODE(&req->r_mc_node);
	INIT_LIST_HEAD(&req->r_private_item);

	target_init(&req->r_t);
}

/*
 * Allocate and initialize a request with room for @num_ops ops.
 *
 * The memory comes from the client's mempool when @use_mempool is set
 * (at most CEPH_OSD_SLAB_OPS ops), from the request slab cache when
 * @num_ops fits in CEPH_OSD_SLAB_OPS, and from kmalloc otherwise (at most
 * CEPH_OSD_MAX_OPS ops).  A reference on @snapc is taken for the request.
 *
 * Returns the new request, or NULL if the allocation failed.
 */
struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
					       struct ceph_snap_context *snapc,
					       unsigned int num_ops,
					       bool use_mempool,
					       gfp_t gfp_flags)
{
	struct ceph_osd_request *req;

	if (use_mempool) {
		BUG_ON(num_ops > CEPH_OSD_SLAB_OPS);
		req = mempool_alloc(osdc->req_mempool, gfp_flags);
	} else if (num_ops <= CEPH_OSD_SLAB_OPS) {
		req = kmem_cache_alloc(ceph_osd_request_cache, gfp_flags);
	} else {
		BUG_ON(num_ops > CEPH_OSD_MAX_OPS);
		req = kmalloc(struct_size(req, r_ops, num_ops), gfp_flags);
	}
	if (unlikely(!req))
		return NULL;

	request_init(req);
	req->r_osdc = osdc;
	req->r_mempool = use_mempool;
	req->r_num_ops = num_ops;
	req->r_snapid = CEPH_NOSNAP;
	req->r_snapc = ceph_get_snap_context(snapc);

	dout("%s req %p\n", __func__, req);
	return req;
}
EXPORT_SYMBOL(ceph_osdc_alloc_request);

/*
 * Number of bytes reserved for encoding @oloc: fixed-size fields plus
 * the length of the pool namespace string, if there is one.
 */
static int ceph_oloc_encoding_size(const struct ceph_object_locator *oloc)
{
	return 8 + 4 + 4 + 4 + (oloc->pool_ns ? oloc->pool_ns->len : 0);
}

/*
 * Allocate the request and reply messages for @req, sized from the
 * request's oid, oloc, op count and snap context.  Messages come from
 * the client's message pools when the request itself is mempool-backed,
 * and from ceph_msg_new2() otherwise.
 *
 * Returns 0 on success or -ENOMEM.  If only the reply allocation fails,
 * the request message stays attached to @req->r_request.
 */
static int __ceph_osdc_alloc_messages(struct ceph_osd_request *req, gfp_t gfp,
				      int num_request_data_items,
				      int num_reply_data_items)
{
	struct ceph_osd_client *osdc = req->r_osdc;
	struct ceph_msg *msg;
	int msg_size;

	WARN_ON(req->r_request || req->r_reply);
	WARN_ON(ceph_oid_empty(&req->r_base_oid));
	WARN_ON(ceph_oloc_empty(&req->r_base_oloc));

	/* create request message */
	msg_size = CEPH_ENCODING_START_BLK_LEN +
			CEPH_PGID_ENCODING_LEN + 1; /* spgid */
	msg_size += 4 + 4 + 4; /* hash, osdmap_epoch, flags */
	msg_size += CEPH_ENCODING_START_BLK_LEN +
			sizeof(struct ceph_osd_reqid); /* reqid */
	msg_size += sizeof(struct ceph_blkin_trace_info); /* trace */
	msg_size += 4 + sizeof(struct ceph_timespec); /* client_inc, mtime */
	msg_size += CEPH_ENCODING_START_BLK_LEN +
			ceph_oloc_encoding_size(&req->r_base_oloc); /* oloc */
	msg_size += 4 + req->r_base_oid.name_len; /* oid */
	msg_size += 2 + req->r_num_ops * sizeof(struct ceph_osd_op);
	msg_size += 8; /* snapid */
	msg_size += 8; /* snap_seq */
	msg_size += 4 + 8 * (req->r_snapc ? req->r_snapc->num_snaps : 0);
	msg_size += 4 + 8; /* retry_attempt, features */

	if (req->r_mempool)
		msg = ceph_msgpool_get(&osdc->msgpool_op, msg_size,
				       num_request_data_items);
	else
		msg = ceph_msg_new2(CEPH_MSG_OSD_OP, msg_size,
				    num_request_data_items, gfp, true);
	if (!msg)
		return -ENOMEM;

	/* the front of the request message starts out zeroed */
	memset(msg->front.iov_base, 0, msg->front.iov_len);
	req->r_request = msg;

	/* create reply message */
	msg_size = OSD_OPREPLY_FRONT_LEN;
	msg_size += req->r_base_oid.name_len;
	msg_size += req->r_num_ops * sizeof(struct ceph_osd_op);

	if (req->r_mempool)
		msg = ceph_msgpool_get(&osdc->msgpool_op_reply, msg_size,
				       num_reply_data_items);
	else
		msg = ceph_msg_new2(CEPH_MSG_OSD_OPREPLY, msg_size,
				    num_reply_data_items, gfp, true);
	if (!msg)
		return -ENOMEM;

	req->r_reply = msg;

	return 0;
}

/*
 * Return true if @opcode is one of the opcodes enumerated by
 * __CEPH_FORALL_OSD_OPS.
 */
static bool osd_req_opcode_valid(u16 opcode)
{
	switch (opcode) {
#define GENERATE_CASE(op, opcode, str)	case CEPH_OSD_OP_##op: return true;
__CEPH_FORALL_OSD_OPS(GENERATE_CASE)
#undef GENERATE_CASE
	default:
		return false;
	}
}

/*
 * Count how many request and reply data items the ops of @req need,
 * based on their opcodes alone.  Opcodes not listed contribute nothing;
 * an opcode that is not valid at all triggers a warning.
 */
static void get_num_data_items(struct ceph_osd_request *req,
			       int *num_request_data_items,
			       int *num_reply_data_items)
{
	struct ceph_osd_req_op *op;

	*num_request_data_items = 0;
	*num_reply_data_items = 0;

	for (op = req->r_ops; op != &req->r_ops[req->r_num_ops]; op++) {
		switch (op->op) {
		/* request */
		case CEPH_OSD_OP_WRITE:
		case CEPH_OSD_OP_WRITEFULL:
		case CEPH_OSD_OP_SETXATTR:
		case CEPH_OSD_OP_CMPXATTR:
		case CEPH_OSD_OP_NOTIFY_ACK:
		case CEPH_OSD_OP_COPY_FROM2:
			*num_request_data_items += 1;
			break;

		/* reply */
		case CEPH_OSD_OP_STAT:
		case CEPH_OSD_OP_READ:
		case CEPH_OSD_OP_SPARSE_READ:
		case CEPH_OSD_OP_LIST_WATCHERS:
			*num_reply_data_items += 1;
			break;

		/* both */
		case CEPH_OSD_OP_NOTIFY:
			*num_request_data_items += 1;
			*num_reply_data_items += 1;
			break;
		case CEPH_OSD_OP_CALL:
			/* two request items and one reply item per call op */
			*num_request_data_items += 2;
			*num_reply_data_items += 1;
			break;

		default:
			WARN_ON(!osd_req_opcode_valid(op->op));
			break;
		}
	}
}

/*
 * oid, oloc and OSD op opcode(s) must be filled in before this function
 * is called.
*/
int ceph_osdc_alloc_messages(struct ceph_osd_request *req, gfp_t gfp)
{
	int num_request_data_items, num_reply_data_items;

	/* data item counts are derived from the opcodes already set */
	get_num_data_items(req, &num_request_data_items, &num_reply_data_items);
	return __ceph_osdc_alloc_messages(req, gfp, num_request_data_items,
					  num_reply_data_items);
}
EXPORT_SYMBOL(ceph_osdc_alloc_messages);

/*
 * This is an osd op init function for opcodes that have no data or
 * other information associated with them.  It also serves as a
 * common init routine for all the other init functions, below.
 *
 * The op at index @which is zeroed before @opcode and @flags are stored.
 * An out-of-range @which or an invalid @opcode is a BUG.
 */
struct ceph_osd_req_op *
osd_req_op_init(struct ceph_osd_request *osd_req, unsigned int which,
		 u16 opcode, u32 flags)
{
	struct ceph_osd_req_op *op;

	BUG_ON(which >= osd_req->r_num_ops);
	BUG_ON(!osd_req_opcode_valid(opcode));

	op = &osd_req->r_ops[which];
	memset(op, 0, sizeof (*op));
	op->op = opcode;
	op->flags = flags;

	return op;
}
EXPORT_SYMBOL(osd_req_op_init);

/*
 * Initialize op @which as an extent op (READ, WRITE, WRITEFULL, ZERO,
 * TRUNCATE or SPARSE_READ).  Only WRITE and WRITEFULL account @length as
 * input data; for the other opcodes indata_len is 0.
 */
void osd_req_op_extent_init(struct ceph_osd_request *osd_req,
				unsigned int which, u16 opcode,
				u64 offset, u64 length,
				u64 truncate_size, u32 truncate_seq)
{
	struct ceph_osd_req_op *op = osd_req_op_init(osd_req, which,
						     opcode, 0);
	size_t payload_len = 0;

	BUG_ON(opcode != CEPH_OSD_OP_READ && opcode != CEPH_OSD_OP_WRITE &&
	       opcode != CEPH_OSD_OP_WRITEFULL && opcode != CEPH_OSD_OP_ZERO &&
	       opcode != CEPH_OSD_OP_TRUNCATE && opcode != CEPH_OSD_OP_SPARSE_READ);

	op->extent.offset = offset;
	op->extent.length = length;
	op->extent.truncate_size = truncate_size;
	op->extent.truncate_seq = truncate_seq;
	if (opcode == CEPH_OSD_OP_WRITE || opcode == CEPH_OSD_OP_WRITEFULL)
		payload_len += length;

	op->indata_len = payload_len;
}
EXPORT_SYMBOL(osd_req_op_extent_init);

/*
 * Shrink the extent of op @which to @length.  Growing the extent is a
 * BUG.  For WRITE and WRITEFULL the input data length shrinks by the
 * same amount.
 */
void osd_req_op_extent_update(struct ceph_osd_request *osd_req,
				unsigned int which, u64 length)
{
	struct ceph_osd_req_op *op;
	u64 previous;

	BUG_ON(which >= osd_req->r_num_ops);
	op = &osd_req->r_ops[which];
	previous = op->extent.length;

	if (length == previous)
		return;		/* Nothing to do */
	BUG_ON(length > previous);

	op->extent.length = length;
	if (op->op == CEPH_OSD_OP_WRITE || op->op == CEPH_OSD_OP_WRITEFULL)
		op->indata_len -= previous - length;
}
EXPORT_SYMBOL(osd_req_op_extent_update);

/*
 * Initialize op @which + 1 as a copy of extent op @which, then advance
 * its offset by @offset_inc and shorten its length (and, for WRITE and
 * WRITEFULL, its input data length) by the same amount.  The op at
 * @which itself is not modified.
 */
void osd_req_op_extent_dup_last(struct ceph_osd_request *osd_req,
				unsigned int which, u64 offset_inc)
{
	struct ceph_osd_req_op *op, *prev_op;

	BUG_ON(which + 1 >= osd_req->r_num_ops);

	prev_op = &osd_req->r_ops[which];
	op = osd_req_op_init(osd_req, which + 1, prev_op->op, prev_op->flags);
	/* dup previous one */
	op->indata_len = prev_op->indata_len;
	op->outdata_len = prev_op->outdata_len;
	op->extent = prev_op->extent;
	/* adjust offset */
	op->extent.offset += offset_inc;
	op->extent.length -= offset_inc;

	if (op->op == CEPH_OSD_OP_WRITE || op->op == CEPH_OSD_OP_WRITEFULL)
		op->indata_len -= offset_inc;
}
EXPORT_SYMBOL(osd_req_op_extent_dup_last);

/*
 * Initialize op @which as a CEPH_OSD_OP_CALL op invoking @method of
 * @class.  Both names are appended (without terminator) to a freshly
 * allocated pagelist that becomes the op's request info, and each must
 * be no longer than U8_MAX bytes or it is a BUG.  The name pointers are
 * stored in the op as given, not copied.
 *
 * Returns 0 on success, -ENOMEM if the pagelist could not be allocated,
 * or the error from ceph_pagelist_append(); the pagelist is released on
 * failure.
 */
int osd_req_op_cls_init(struct ceph_osd_request *osd_req, unsigned int which,
			const char *class, const char *method)
{
	struct ceph_osd_req_op *op;
	struct ceph_pagelist *pagelist;
	size_t payload_len = 0;
	size_t size;
	int ret;

	op = osd_req_op_init(osd_req, which, CEPH_OSD_OP_CALL, 0);

	pagelist = ceph_pagelist_alloc(GFP_NOFS);
	if (!pagelist)
		return -ENOMEM;

	op->cls.class_name = class;
	size = strlen(class);
	BUG_ON(size > (size_t) U8_MAX);
	op->cls.class_len = size;
	ret = ceph_pagelist_append(pagelist, class, size);
	if (ret)
		goto err_pagelist_free;
	payload_len += size;

	op->cls.method_name = method;
	size = strlen(method);
	BUG_ON(size > (size_t) U8_MAX);
	op->cls.method_len = size;
	ret = ceph_pagelist_append(pagelist, method, size);
	if (ret)
		goto err_pagelist_free;
	payload_len += size;

	osd_req_op_cls_request_info_pagelist(osd_req, which, pagelist);
	op->indata_len = payload_len;
	return 0;

err_pagelist_free:
	ceph_pagelist_release(pagelist);
	return ret;
}
EXPORT_SYMBOL(osd_req_op_cls_init);

/*
 * Initialize op @which as a SETXATTR or CMPXATTR op.  @name (without
 * terminator) followed by @size bytes of @value are appended to a
 * freshly allocated pagelist that becomes the op's data.
 *
 * Returns 0 on success, -ENOMEM if the pagelist could not be allocated,
 * or the error from ceph_pagelist_append(); the pagelist is released on
 * failure.
 */
int osd_req_op_xattr_init(struct ceph_osd_request *osd_req, unsigned int which,
			  u16 opcode, const char *name, const void *value,
			  size_t size, u8 cmp_op, u8 cmp_mode)
{
	struct ceph_osd_req_op *op = osd_req_op_init(osd_req, which,
						     opcode, 0);
	struct ceph_pagelist *pagelist;
	size_t payload_len;
	int ret;

	BUG_ON(opcode != CEPH_OSD_OP_SETXATTR && opcode != CEPH_OSD_OP_CMPXATTR);

	pagelist = ceph_pagelist_alloc(GFP_NOFS);
	if (!pagelist)
		return -ENOMEM;

	payload_len = strlen(name);
	op->xattr.name_len = payload_len;
	ret = ceph_pagelist_append(pagelist, name, payload_len);
	if (ret)
		goto err_pagelist_free;

	op->xattr.value_len = size;
	ret = ceph_pagelist_append(pagelist, value, size);
	if (ret)
		goto err_pagelist_free;
	payload_len += size;

	op->xattr.cmp_op = cmp_op;
	op->xattr.cmp_mode = cmp_mode;

	ceph_osd_data_pagelist_init(&op->xattr.osd_data, pagelist);
	op->indata_len = payload_len;
	return 0;

err_pagelist_free:
	ceph_pagelist_release(pagelist);
	return ret;
}
EXPORT_SYMBOL(osd_req_op_xattr_init);

/*
 * @watch_opcode: CEPH_OSD_WATCH_OP_*
 */
static void osd_req_op_watch_init(struct ceph_osd_request *req, int which,
				  u8 watch_opcode, u64 cookie, u32 gen)
{
	struct ceph_osd_req_op *op;

	op = osd_req_op_init(req, which, CEPH_OSD_OP_WATCH, 0);
	op->watch.cookie = cookie;
	op->watch.op = watch_opcode;
	op->watch.gen = gen;
}

/*
 * prot_ver, timeout and notify payload (may be empty) should already be
 * encoded in @request_pl
 */
static void osd_req_op_notify_init(struct ceph_osd_request *req, int which,
				   u64 cookie, struct ceph_pagelist *request_pl)
{
	struct ceph_osd_req_op *op;

	op = osd_req_op_init(req, which, CEPH_OSD_OP_NOTIFY, 0);
	op->notify.cookie = cookie;

	ceph_osd_data_pagelist_init(&op->notify.request_data, request_pl);
	op->indata_len = request_pl->length;
}

/*
 * @flags: CEPH_OSD_OP_ALLOC_HINT_FLAG_*
 */
void osd_req_op_alloc_hint_init(struct ceph_osd_request *osd_req,
				unsigned int which,
				u64 expected_object_size,
				u64 expected_write_size,
				u32 flags)
{
	struct ceph_osd_req_op *op;

	op = osd_req_op_init(osd_req, which, CEPH_OSD_OP_SETALLOCHINT, 0);
	op->alloc_hint.expected_object_size = expected_object_size;
	op->alloc_hint.expected_write_size = expected_write_size;
	op->alloc_hint.flags = flags;

	/*
	 * CEPH_OSD_OP_SETALLOCHINT op is advisory and therefore deemed
	 * not worth a feature bit.  Set FAILOK per-op flag to make
	 * sure older osds don't trip over an unsupported opcode.
	 */
	op->flags |= CEPH_OSD_OP_FLAG_FAILOK;
}
EXPORT_SYMBOL(osd_req_op_alloc_hint_init);

/*
 * Add the data described by @osd_data to @msg, using the message helper
 * that matches the data type.  An empty page vector is skipped, an empty
 * pagelist is a BUG, and any type not handled here must be NONE.
 */
static void ceph_osdc_msg_data_add(struct ceph_msg *msg,
				struct ceph_osd_data *osd_data)
{
	u64 length = ceph_osd_data_length(osd_data);

	if (osd_data->type == CEPH_OSD_DATA_TYPE_PAGES) {
		BUG_ON(length > (u64) SIZE_MAX);
		if (length)
			ceph_msg_data_add_pages(msg, osd_data->pages,
					length, osd_data->alignment, false);
	} else if (osd_data->type == CEPH_OSD_DATA_TYPE_PAGELIST) {
		BUG_ON(!length);
		ceph_msg_data_add_pagelist(msg, osd_data->pagelist);
#ifdef CONFIG_BLOCK
	} else if (osd_data->type == CEPH_OSD_DATA_TYPE_BIO) {
		ceph_msg_data_add_bio(msg, &osd_data->bio_pos, length);
#endif
	} else if (osd_data->type == CEPH_OSD_DATA_TYPE_BVECS) {
		ceph_msg_data_add_bvecs(msg, &osd_data->bvec_pos);
	} else if (osd_data->type == CEPH_OSD_DATA_TYPE_ITER) {
		ceph_msg_data_add_iter(msg, &osd_data->iter);
	} else {
		BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_NONE);
	}
}

/*
 * Encode the in-memory op @src into its little-endian wire form @dst.
 *
 * Returns the op's input data length.  For an opcode that is not handled
 * here, an error is logged, a warning is raised, 0 is returned and the
 * common fields of @dst (op, flags, payload_len) are left unwritten.
 */
static u32 osd_req_encode_op(struct ceph_osd_op *dst,
			     const struct ceph_osd_req_op *src)
{
	switch (src->op) {
	case CEPH_OSD_OP_STAT:
		break;
	case CEPH_OSD_OP_READ:
	case CEPH_OSD_OP_SPARSE_READ:
	case CEPH_OSD_OP_WRITE:
	case CEPH_OSD_OP_WRITEFULL:
	case CEPH_OSD_OP_ZERO:
	case CEPH_OSD_OP_TRUNCATE:
		dst->extent.offset = cpu_to_le64(src->extent.offset);
		dst->extent.length = cpu_to_le64(src->extent.length);
		dst->extent.truncate_size =
			cpu_to_le64(src->extent.truncate_size);
		dst->extent.truncate_seq =
			cpu_to_le32(src->extent.truncate_seq);
		break;
	case CEPH_OSD_OP_CALL:
		dst->cls.class_len = src->cls.class_len;
		dst->cls.method_len = src->cls.method_len;
		dst->cls.indata_len = cpu_to_le32(src->cls.indata_len);
		break;
	case CEPH_OSD_OP_WATCH:
		dst->watch.cookie = cpu_to_le64(src->watch.cookie);
		/* ver is always encoded as 0 */
		dst->watch.ver = cpu_to_le64(0);
		dst->watch.op = src->watch.op;
		dst->watch.gen = cpu_to_le32(src->watch.gen);
		break;
	case CEPH_OSD_OP_NOTIFY_ACK:
		break;
	case CEPH_OSD_OP_NOTIFY:
		dst->notify.cookie = cpu_to_le64(src->notify.cookie);
		break;
	case CEPH_OSD_OP_LIST_WATCHERS:
		break;
	case CEPH_OSD_OP_SETALLOCHINT:
		dst->alloc_hint.expected_object_size =
		    cpu_to_le64(src->alloc_hint.expected_object_size);
		dst->alloc_hint.expected_write_size =
		    cpu_to_le64(src->alloc_hint.expected_write_size);
		dst->alloc_hint.flags = cpu_to_le32(src->alloc_hint.flags);
		break;
	case CEPH_OSD_OP_SETXATTR:
	case CEPH_OSD_OP_CMPXATTR:
		dst->xattr.name_len = cpu_to_le32(src->xattr.name_len);
		dst->xattr.value_len = cpu_to_le32(src->xattr.value_len);
		dst->xattr.cmp_op = src->xattr.cmp_op;
		dst->xattr.cmp_mode = src->xattr.cmp_mode;
		break;
	case CEPH_OSD_OP_CREATE:
	case CEPH_OSD_OP_DELETE:
		break;
	case CEPH_OSD_OP_COPY_FROM2:
		dst->copy_from.snapid = cpu_to_le64(src->copy_from.snapid);
		dst->copy_from.src_version =
			cpu_to_le64(src->copy_from.src_version);
		dst->copy_from.flags = src->copy_from.flags;
		dst->copy_from.src_fadvise_flags =
			cpu_to_le32(src->copy_from.src_fadvise_flags);
		break;
	case CEPH_OSD_OP_ASSERT_VER:
		dst->assert_ver.unused = cpu_to_le64(0);
		dst->assert_ver.ver = cpu_to_le64(src->assert_ver.ver);
		break;
	default:
		pr_err("unsupported osd opcode %s\n",
			ceph_osd_op_name(src->op));
		WARN_ON(1);

		return 0;
	}

	/* fields common to every supported opcode */
	dst->op = cpu_to_le16(src->op);
	dst->flags = cpu_to_le32(src->flags);
	dst->payload_len = cpu_to_le32(src->indata_len);

	return src->indata_len;
}

/*
 * build new request AND message, calculate layout, and adjust file
 * extent as needed.
 *
 * if the file was recently truncated, we include information about its
 * old and new size so that the object can be updated appropriately.  (we
 * avoid synchronously deleting truncated objects because it's slow.)
*/ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, struct ceph_file_layout *layout, struct ceph_vino vino, u64 off, u64 *plen, unsigned int which, int num_ops, int opcode, int flags, struct ceph_snap_context *snapc, u32 truncate_seq, u64 truncate_size, bool use_mempool) { struct ceph_osd_request *req; u64 objnum = 0; u64 objoff = 0; u64 objlen = 0; int r; BUG_ON(opcode != CEPH_OSD_OP_READ && opcode != CEPH_OSD_OP_WRITE && opcode != CEPH_OSD_OP_ZERO && opcode != CEPH_OSD_OP_TRUNCATE && opcode != CEPH_OSD_OP_CREATE && opcode != CEPH_OSD_OP_DELETE && opcode != CEPH_OSD_OP_SPARSE_READ); req = ceph_osdc_alloc_request(osdc, snapc, num_ops, use_mempool, GFP_NOFS); if (!req) { r = -ENOMEM; goto fail; } /* calculate max write size */ r = calc_layout(layout, off, plen, &objnum, &objoff, &objlen); if (r) goto fail; if (opcode == CEPH_OSD_OP_CREATE || opcode == CEPH_OSD_OP_DELETE) { osd_req_op_init(req, which, opcode, 0); } else { u32 object_size = layout->object_size; u32 object_base = off - objoff; if (!(truncate_seq == 1 && truncate_size == -1ULL)) { if (truncate_size <= object_base) { truncate_size = 0; } else { truncate_size -= object_base; if (truncate_size > object_size) truncate_size = object_size; } } osd_req_op_extent_init(req, which, opcode, objoff, objlen, truncate_size, truncate_seq); } req->r_base_oloc.pool = layout->pool_id; req->r_base_oloc.pool_ns = ceph_try_get_string(layout->pool_ns); ceph_oid_printf(&req->r_base_oid, "%llx.%08llx", vino.ino, objnum); req->r_flags = flags | osdc->client->options->read_from_replica; req->r_snapid = vino.snap; if (flags & CEPH_OSD_FLAG_WRITE) req->r_data_offset = off; if (num_ops > 1) { int num_req_ops, num_rep_ops; /* * If this is a multi-op write request, assume that we'll need * request ops. If it's a multi-op read then assume we'll need * reply ops. Anything else and call it -EINVAL. 
*/ if (flags & CEPH_OSD_FLAG_WRITE) { num_req_ops = num_ops; num_rep_ops = 0; } else if (flags & CEPH_OSD_FLAG_READ) { num_req_ops = 0; num_rep_ops = num_ops; } else { r = -EINVAL; goto fail; } r = __ceph_osdc_alloc_messages(req, GFP_NOFS, num_req_ops, num_rep_ops); } else { r = ceph_osdc_alloc_messages(req, GFP_NOFS); } if (r) goto fail; return req; fail: ceph_osdc_put_request(req); return ERR_PTR(r); } EXPORT_SYMBOL(ceph_osdc_new_request); int __ceph_alloc_sparse_ext_map(struct ceph_osd_req_op *op, int cnt) { WARN_ON(op->op != CEPH_OSD_OP_SPARSE_READ); op->extent.sparse_ext_cnt = cnt; op->extent.sparse_ext = kmalloc_array(cnt, sizeof(*op->extent.sparse_ext), GFP_NOFS); if (!op->extent.sparse_ext) return -ENOMEM; return 0; } EXPORT_SYMBOL(__ceph_alloc_sparse_ext_map); /* * We keep osd requests in an rbtree, sorted by ->r_tid. */ DEFINE_RB_FUNCS(request, struct ceph_osd_request, r_tid, r_node) DEFINE_RB_FUNCS(request_mc, struct ceph_osd_request, r_tid, r_mc_node) /* * Call @fn on each OSD request as long as @fn returns 0. */ static void for_each_request(struct ceph_osd_client *osdc, int (*fn)(struct ceph_osd_request *req, void *arg), void *arg) { struct rb_node *n, *p; for (n = rb_first(&osdc->osds); n; n = rb_next(n)) { struct ceph_osd *osd = rb_entry(n, struct ceph_osd, o_node); for (p = rb_first(&osd->o_requests); p; ) { struct ceph_osd_request *req = rb_entry(p, struct ceph_osd_request, r_node); p = rb_next(p); if (fn(req, arg)) return; } } for (p = rb_first(&osdc->homeless_osd.o_requests); p; ) { struct ceph_osd_request *req = rb_entry(p, struct ceph_osd_request, r_node); p = rb_next(p); if (fn(req, arg)) return; } } static bool osd_homeless(struct ceph_osd *osd) { return osd->o_osd == CEPH_HOMELESS_OSD; } static bool osd_registered(struct ceph_osd *osd) { verify_osdc_locked(osd->o_osdc); return !RB_EMPTY_NODE(&osd->o_node); } /* * Assumes @osd is zero-initialized. 
*/ static void osd_init(struct ceph_osd *osd) { refcount_set(&osd->o_ref, 1); RB_CLEAR_NODE(&osd->o_node); spin_lock_init(&osd->o_requests_lock); osd->o_requests = RB_ROOT; osd->o_linger_requests = RB_ROOT; osd->o_backoff_mappings = RB_ROOT; osd->o_backoffs_by_id = RB_ROOT; INIT_LIST_HEAD(&osd->o_osd_lru); INIT_LIST_HEAD(&osd->o_keepalive_item); osd->o_incarnation = 1; mutex_init(&osd->lock); } static void ceph_init_sparse_read(struct ceph_sparse_read *sr) { kfree(sr->sr_extent); memset(sr, '\0', sizeof(*sr)); sr->sr_state = CEPH_SPARSE_READ_HDR; } static void osd_cleanup(struct ceph_osd *osd) { WARN_ON(!RB_EMPTY_NODE(&osd->o_node)); WARN_ON(!RB_EMPTY_ROOT(&osd->o_requests)); WARN_ON(!RB_EMPTY_ROOT(&osd->o_linger_requests)); WARN_ON(!RB_EMPTY_ROOT(&osd->o_backoff_mappings)); WARN_ON(!RB_EMPTY_ROOT(&osd->o_backoffs_by_id)); WARN_ON(!list_empty(&osd->o_osd_lru)); WARN_ON(!list_empty(&osd->o_keepalive_item)); ceph_init_sparse_read(&osd->o_sparse_read); if (osd->o_auth.authorizer) { WARN_ON(osd_homeless(osd)); ceph_auth_destroy_authorizer(osd->o_auth.authorizer); } } /* * Track open sessions with osds. 
*/ static struct ceph_osd *create_osd(struct ceph_osd_client *osdc, int onum) { struct ceph_osd *osd; WARN_ON(onum == CEPH_HOMELESS_OSD); osd = kzalloc(sizeof(*osd), GFP_NOIO | __GFP_NOFAIL); osd_init(osd); osd->o_osdc = osdc; osd->o_osd = onum; osd->o_sparse_op_idx = -1; ceph_init_sparse_read(&osd->o_sparse_read); ceph_con_init(&osd->o_con, osd, &osd_con_ops, &osdc->client->msgr); return osd; } static struct ceph_osd *get_osd(struct ceph_osd *osd) { if (refcount_inc_not_zero(&osd->o_ref)) { dout("get_osd %p %d -> %d\n", osd, refcount_read(&osd->o_ref)-1, refcount_read(&osd->o_ref)); return osd; } else { dout("get_osd %p FAIL\n", osd); return NULL; } } static void put_osd(struct ceph_osd *osd) { dout("put_osd %p %d -> %d\n", osd, refcount_read(&osd->o_ref), refcount_read(&osd->o_ref) - 1); if (refcount_dec_and_test(&osd->o_ref)) { osd_cleanup(osd); kfree(osd); } } DEFINE_RB_FUNCS(osd, struct ceph_osd, o_osd, o_node) static void __move_osd_to_lru(struct ceph_osd *osd) { struct ceph_osd_client *osdc = osd->o_osdc; dout("%s osd %p osd%d\n", __func__, osd, osd->o_osd); BUG_ON(!list_empty(&osd->o_osd_lru)); spin_lock(&osdc->osd_lru_lock); list_add_tail(&osd->o_osd_lru, &osdc->osd_lru); spin_unlock(&osdc->osd_lru_lock); osd->lru_ttl = jiffies + osdc->client->options->osd_idle_ttl; } static void maybe_move_osd_to_lru(struct ceph_osd *osd) { if (RB_EMPTY_ROOT(&osd->o_requests) && RB_EMPTY_ROOT(&osd->o_linger_requests)) __move_osd_to_lru(osd); } static void __remove_osd_from_lru(struct ceph_osd *osd) { struct ceph_osd_client *osdc = osd->o_osdc; dout("%s osd %p osd%d\n", __func__, osd, osd->o_osd); spin_lock(&osdc->osd_lru_lock); if (!list_empty(&osd->o_osd_lru)) list_del_init(&osd->o_osd_lru); spin_unlock(&osdc->osd_lru_lock); } /* * Close the connection and assign any leftover requests to the * homeless session. 
*/
static void close_osd(struct ceph_osd *osd)
{
	struct ceph_osd_client *osdc = osd->o_osdc;
	struct rb_node *n;

	verify_osdc_wrlocked(osdc);
	dout("%s osd %p osd%d\n", __func__, osd, osd->o_osd);

	ceph_con_close(&osd->o_con);

	/* the next node is fetched first because unlinking erases the node */
	for (n = rb_first(&osd->o_requests); n; ) {
		struct ceph_osd_request *req =
		    rb_entry(n, struct ceph_osd_request, r_node);

		n = rb_next(n); /* unlink_request() */

		dout(" reassigning req %p tid %llu\n", req, req->r_tid);
		unlink_request(osd, req);
		link_request(&osdc->homeless_osd, req);
	}
	for (n = rb_first(&osd->o_linger_requests); n; ) {
		struct ceph_osd_linger_request *lreq =
		    rb_entry(n, struct ceph_osd_linger_request, node);

		n = rb_next(n); /* unlink_linger() */

		dout(" reassigning lreq %p linger_id %llu\n", lreq,
		     lreq->linger_id);
		unlink_linger(osd, lreq);
		link_linger(&osdc->homeless_osd, lreq);
	}
	clear_backoffs(osd);

	__remove_osd_from_lru(osd);
	erase_osd(&osdc->osds, osd);
	put_osd(osd);
}

/*
 * reset osd connect
 *
 * Returns -ENODEV after closing an OSD that has no requests at all,
 * -EAGAIN if the address is unchanged and the connection was never
 * opened (the request stamps are refreshed and nothing else is done),
 * or 0 after the connection has been closed and reopened.
 */
static int reopen_osd(struct ceph_osd *osd)
{
	struct ceph_entity_addr *peer_addr;

	dout("%s osd %p osd%d\n", __func__, osd, osd->o_osd);

	if (RB_EMPTY_ROOT(&osd->o_requests) &&
	    RB_EMPTY_ROOT(&osd->o_linger_requests)) {
		close_osd(osd);
		return -ENODEV;
	}

	peer_addr = &osd->o_osdc->osdmap->osd_addr[osd->o_osd];
	if (!memcmp(peer_addr, &osd->o_con.peer_addr, sizeof (*peer_addr)) &&
			!ceph_con_opened(&osd->o_con)) {
		struct rb_node *n;

		dout("osd addr hasn't changed and connection never opened, "
		     "letting msgr retry\n");
		/* touch each r_stamp for handle_timeout()'s benefit */
		for (n = rb_first(&osd->o_requests); n; n = rb_next(n)) {
			struct ceph_osd_request *req =
			    rb_entry(n, struct ceph_osd_request, r_node);
			req->r_stamp = jiffies;
		}

		return -EAGAIN;
	}

	ceph_con_close(&osd->o_con);
	ceph_con_open(&osd->o_con, CEPH_ENTITY_TYPE_OSD, osd->o_osd, peer_addr);
	osd->o_incarnation++;

	return 0;
}

/*
 * Look up the session for OSD @o, creating and opening it if it does not
 * exist yet.  CEPH_HOMELESS_OSD always maps to the client's homeless
 * session.  Creation is only done when @wrlocked is set; otherwise a
 * missing session yields ERR_PTR(-EAGAIN).
 */
static struct ceph_osd *lookup_create_osd(struct ceph_osd_client *osdc, int o,
					  bool wrlocked)
{
	struct ceph_osd *osd;

	if (wrlocked)
		verify_osdc_wrlocked(osdc);
	else
		verify_osdc_locked(osdc);

	if (o != CEPH_HOMELESS_OSD)
		osd = lookup_osd(&osdc->osds, o);
	else
		osd = &osdc->homeless_osd;
	if (!osd) {
		if (!wrlocked)
			return ERR_PTR(-EAGAIN);

		osd = create_osd(osdc, o);
		insert_osd(&osdc->osds, osd);
		ceph_con_open(&osd->o_con, CEPH_ENTITY_TYPE_OSD, osd->o_osd,
			      &osdc->osdmap->osd_addr[osd->o_osd]);
	}

	dout("%s osdc %p osd%d -> osd %p\n", __func__, osdc, o, osd);
	return osd;
}

/*
 * Create request <-> OSD session relation.
 *
 * @req has to be assigned a tid, @osd may be homeless.
 */
static void link_request(struct ceph_osd *osd, struct ceph_osd_request *req)
{
	verify_osd_locked(osd);
	WARN_ON(!req->r_tid || req->r_osd);
	dout("%s osd %p osd%d req %p tid %llu\n", __func__, osd, osd->o_osd,
	     req, req->r_tid);

	/* a real OSD leaves the LRU; the homeless one is only counted */
	if (!osd_homeless(osd))
		__remove_osd_from_lru(osd);
	else
		atomic_inc(&osd->o_osdc->num_homeless);

	get_osd(osd);
	spin_lock(&osd->o_requests_lock);
	insert_request(&osd->o_requests, req);
	spin_unlock(&osd->o_requests_lock);
	req->r_osd = osd;
}

/*
 * Undo link_request(): detach @req from @osd, drop the session
 * reference taken for it and either consider the OSD for the LRU or
 * decrement the homeless counter.
 *
 * NOTE(review): @osd is still dereferenced after put_osd(); presumably
 * another reference keeps it alive at this point -- confirm.
 */
static void unlink_request(struct ceph_osd *osd, struct ceph_osd_request *req)
{
	verify_osd_locked(osd);
	WARN_ON(req->r_osd != osd);
	dout("%s osd %p osd%d req %p tid %llu\n", __func__, osd, osd->o_osd,
	     req, req->r_tid);

	req->r_osd = NULL;
	spin_lock(&osd->o_requests_lock);
	erase_request(&osd->o_requests, req);
	spin_unlock(&osd->o_requests_lock);
	put_osd(osd);

	if (!osd_homeless(osd))
		maybe_move_osd_to_lru(osd);
	else
		atomic_dec(&osd->o_osdc->num_homeless);
}

/*
 * Return true if pool @pi has CEPH_POOL_FLAG_FULL set.
 */
static bool __pool_full(struct ceph_pg_pool_info *pi)
{
	return pi->flags & CEPH_POOL_FLAG_FULL;
}

/*
 * Return true if any pool in the current osdmap is flagged full.
 */
static bool have_pool_full(struct ceph_osd_client *osdc)
{
	struct rb_node *n;

	for (n = rb_first(&osdc->osdmap->pg_pools); n; n = rb_next(n)) {
		struct ceph_pg_pool_info *pi =
		    rb_entry(n, struct ceph_pg_pool_info, node);

		if (__pool_full(pi))
			return true;
	}

	return false;
}

/*
 * Return true if pool @pool_id exists in the current osdmap and is
 * flagged full.  A pool that cannot be found is reported as not full.
 */
static bool pool_full(struct ceph_osd_client *osdc, s64 pool_id)
{
	struct ceph_pg_pool_info *pi;

	pi = ceph_pg_pool_by_id(osdc->osdmap, pool_id);
	if (!pi)
		return false;

	return __pool_full(pi);
}

/*
 * Returns whether a request should be blocked from being sent
 * based on the current osdmap and osd_client settings.
 *
 * Reads are blocked by the PAUSERD map flag; writes by PAUSEWR, by the
 * FULL map flag or by a full target pool; everything is blocked while
 * the map epoch is behind the client's epoch barrier.
 */
static bool target_should_be_paused(struct ceph_osd_client *osdc,
				    const struct ceph_osd_request_target *t,
				    struct ceph_pg_pool_info *pi)
{
	bool pauserd = ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSERD);
	bool pausewr = ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSEWR) ||
		       ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL) ||
		       __pool_full(pi);

	WARN_ON(pi->id != t->target_oloc.pool);
	return ((t->flags & CEPH_OSD_FLAG_READ) && pauserd) ||
	       ((t->flags & CEPH_OSD_FLAG_WRITE) && pausewr) ||
	       (osdc->osdmap->epoch < osdc->epoch_barrier);
}

/*
 * Pick a uniformly random position in @acting and return it (an index
 * into acting->osds, not an OSD id).
 */
static int pick_random_replica(const struct ceph_osds *acting)
{
	int i = get_random_u32_below(acting->size);

	dout("%s picked osd%d, primary osd%d\n", __func__,
	     acting->osds[i], acting->primary);
	return i;
}

/*
 * Picks the closest replica based on client's location given by
 * crush_location option.  Prefers the primary if the locality is
 * the same.
 *
 * Returns an index into acting->osds.  A negative locality only wins
 * for position 0; among non-negative localities the smallest wins, and
 * ties keep the earlier position.
 */
static int pick_closest_replica(struct ceph_osd_client *osdc,
				const struct ceph_osds *acting)
{
	struct ceph_options *opt = osdc->client->options;
	int best_i, best_locality;
	int i = 0, locality;

	do {
		locality = ceph_get_crush_locality(osdc->osdmap,
						   acting->osds[i],
						   &opt->crush_locs);
		/*
		 * best_i and best_locality are first assigned on i == 0;
		 * the i == 0 test short-circuits before they are read.
		 */
		if (i == 0 ||
		    (locality >= 0 && best_locality < 0) ||
		    (locality >= 0 && best_locality >= 0 &&
		     locality < best_locality)) {
			best_i = i;
			best_locality = locality;
		}
	} while (++i < acting->size);

	dout("%s picked osd%d with locality %d, primary osd%d\n", __func__,
	     acting->osds[best_i], best_locality, acting->primary);
	return best_i;
}

/* Outcome of calc_target(). */
enum calc_target_result {
	CALC_TARGET_NO_ACTION = 0,
	CALC_TARGET_NEED_RESEND,
	CALC_TARGET_POOL_DNE,
};

/*
 * Recompute the mapping of target @t against the current osdmap.
 *
 * Returns CALC_TARGET_POOL_DNE if the base pool or, after tiering, the
 * target pool cannot be found (the target is then made homeless),
 * CALC_TARGET_NEED_RESEND if the target got unpaused, its mapping
 * changed, a resend is forced or the PG split, and
 * CALC_TARGET_NO_ACTION otherwise.
 */
static enum calc_target_result calc_target(struct ceph_osd_client *osdc,
					   struct ceph_osd_request_target *t,
					   bool any_change)
{
	struct ceph_pg_pool_info *pi;
	struct ceph_pg pgid, last_pgid;
	struct ceph_osds up, acting;
	bool is_read = t->flags & CEPH_OSD_FLAG_READ;
	bool is_write = t->flags & CEPH_OSD_FLAG_WRITE;
	bool force_resend = false;
	bool unpaused = false;
	bool legacy_change = false;
	bool split = false;
	bool sort_bitwise = ceph_osdmap_flag(osdc, CEPH_OSDMAP_SORTBITWISE);
	bool recovery_deletes = ceph_osdmap_flag(osdc,
						 CEPH_OSDMAP_RECOVERY_DELETES);
	enum calc_target_result ct_res;

	t->epoch = osdc->osdmap->epoch;
	pi = ceph_pg_pool_by_id(osdc->osdmap, t->base_oloc.pool);
	if (!pi) {
		t->osd = CEPH_HOMELESS_OSD;
		ct_res = CALC_TARGET_POOL_DNE;
		goto out;
	}

	/* the pool asks for a forced resend in exactly this map epoch */
	if (osdc->osdmap->epoch == pi->last_force_request_resend) {
		if (t->last_force_resend < pi->last_force_request_resend) {
			t->last_force_resend = pi->last_force_request_resend;
			force_resend = true;
		} else if (t->last_force_resend == 0) {
			force_resend = true;
		}
	}

	/* apply tiering */
	ceph_oid_copy(&t->target_oid, &t->base_oid);
	ceph_oloc_copy(&t->target_oloc, &t->base_oloc);
	if ((t->flags & CEPH_OSD_FLAG_IGNORE_OVERLAY) == 0) {
		/* when both apply, the write tier is assigned last */
		if (is_read && pi->read_tier >= 0)
			t->target_oloc.pool = pi->read_tier;
		if (is_write && pi->write_tier >= 0)
			t->target_oloc.pool = pi->write_tier;

		pi = ceph_pg_pool_by_id(osdc->osdmap, t->target_oloc.pool);
		if (!pi) {
			t->osd = CEPH_HOMELESS_OSD;
			ct_res = CALC_TARGET_POOL_DNE;
			goto out;
		}
	}

	__ceph_object_locator_to_pg(pi, &t->target_oid, &t->target_oloc, &pgid);
	/* the PG this object mapped to under the previously recorded pg_num */
	last_pgid.pool = pgid.pool;
	last_pgid.seed = ceph_stable_mod(pgid.seed, t->pg_num, t->pg_num_mask);

	ceph_pg_to_up_acting_osds(osdc->osdmap, pi, &pgid, &up, &acting);
	if (any_change &&
	    ceph_is_new_interval(&t->acting,
				 &acting,
				 &t->up,
				 &up,
				 t->size,
				 pi->size,
				 t->min_size,
				 pi->min_size,
				 t->pg_num,
				 pi->pg_num,
				 t->sort_bitwise,
				 sort_bitwise,
				 t->recovery_deletes,
				 recovery_deletes,
				 &last_pgid))
		force_resend = true;

	if (t->paused && !target_should_be_paused(osdc, t, pi)) {
		t->paused = false;
		unpaused = true;
	}
	legacy_change = ceph_pg_compare(&t->pgid, &pgid) ||
			ceph_osds_changed(&t->acting, &acting,
					  t->used_replica || any_change);
	if (t->pg_num)
		split = ceph_pg_is_split(&last_pgid, t->pg_num, pi->pg_num);

	if (legacy_change || force_resend || split) {
		/* record the new mapping in the target */
		t->pgid = pgid; /* struct */
		ceph_pg_to_primary_shard(osdc->osdmap, pi, &pgid, &t->spgid);
		ceph_osds_copy(&t->acting, &acting);
		ceph_osds_copy(&t->up, &up);
		t->size = pi->size;
		t->min_size = pi->min_size;
		t->pg_num = pi->pg_num;
		t->pg_num_mask = pi->pg_num_mask;
		t->sort_bitwise = sort_bitwise;
		t->recovery_deletes = recovery_deletes;

		/*
		 * A replica is only considered for non-write requests with
		 * a balance/localize flag, on a replicated pool with more
		 * than one acting OSD; otherwise the primary is used.
		 */
		if ((t->flags & (CEPH_OSD_FLAG_BALANCE_READS |
				 CEPH_OSD_FLAG_LOCALIZE_READS)) &&
		    !is_write && pi->type == CEPH_POOL_TYPE_REP &&
		    acting.size > 1) {
			int pos;

			WARN_ON(!is_read || acting.osds[0] != acting.primary);
			if (t->flags & CEPH_OSD_FLAG_BALANCE_READS) {
				pos = pick_random_replica(&acting);
			} else {
				pos = pick_closest_replica(osdc, &acting);
			}
			t->osd = acting.osds[pos];
			t->used_replica = pos > 0;
		} else {
			t->osd = acting.primary;
			t->used_replica = false;
		}
	}

	if (unpaused || legacy_change || force_resend || split)
		ct_res = CALC_TARGET_NEED_RESEND;
	else
		ct_res = CALC_TARGET_NO_ACTION;

out:
	dout("%s t %p -> %d%d%d%d ct_res %d osd%d\n", __func__, t, unpaused,
	     legacy_change, force_resend, split, ct_res, t->osd);
	return ct_res;
}

/*
 * Allocate a spg mapping with a cleared rbtree node and an empty backoff
 * tree.  The spgid is left for the caller to fill in.  Returns NULL on
 * allocation failure.
 */
static struct ceph_spg_mapping *alloc_spg_mapping(void)
{
	struct ceph_spg_mapping *spg;

	spg = kmalloc(sizeof(*spg), GFP_NOIO);
	if (!spg)
		return NULL;

	RB_CLEAR_NODE(&spg->node);
	spg->backoffs = RB_ROOT;
	return spg;
}

/*
 * Free @spg, warning if it is still linked or still has backoffs.
 */
static void free_spg_mapping(struct ceph_spg_mapping *spg)
{
	WARN_ON(!RB_EMPTY_NODE(&spg->node));
	WARN_ON(!RB_EMPTY_ROOT(&spg->backoffs));

	kfree(spg);
}

/*
 * rbtree of ceph_spg_mapping for handling map<spg_t, ...>, similar to
 * ceph_pg_mapping.  Used to track OSD backoffs -- a backoff [range] is
 * defined only within a specific spgid; it does not pass anything to
 * children on split, or to another primary.
 */
DEFINE_RB_FUNCS2(spg_mapping, struct ceph_spg_mapping, spgid, ceph_spg_compare,
		 RB_BYPTR, const struct ceph_spg *, node)

/*
 * Sort key derived from the hash: the bit-reversed hash, or a value one
 * past the 32-bit range when @hoid is the maximum object.
 */
static u64 hoid_get_bitwise_key(const struct ceph_hobject_id *hoid)
{
	return hoid->is_max ? 0x100000000ull : hoid->hash_reverse_bits;
}

/*
 * Return the key used for ordering: the explicit key if @hoid has one,
 * its oid otherwise.
 */
static void hoid_get_effective_key(const struct ceph_hobject_id *hoid,
				   void **pkey, size_t *pkey_len)
{
	if (hoid->key_len) {
		*pkey = hoid->key;
		*pkey_len = hoid->key_len;
	} else {
		*pkey = hoid->oid;
		*pkey_len = hoid->oid_len;
	}
}

/*
 * Compare two length-delimited names bytewise.  If one is a prefix of
 * the other, the shorter one sorts first.
 */
static int compare_names(const void *name1, size_t name1_len,
			 const void *name2, size_t name2_len)
{
	int ret;

	ret = memcmp(name1, name2, min(name1_len, name2_len));
	if (!ret) {
		if (name1_len < name2_len)
			ret = -1;
		else if (name1_len > name2_len)
			ret = 1;
	}

	return ret;
}

/*
 * Three-way comparison of two hobject ids.  Fields are compared in this
 * order: is_max, pool, bitwise key, namespace, effective key, oid,
 * snapid.
 */
static int hoid_compare(const struct ceph_hobject_id *lhs,
			const struct ceph_hobject_id *rhs)
{
	void *effective_key1, *effective_key2;
	size_t effective_key1_len, effective_key2_len;
	int ret;

	if (lhs->is_max < rhs->is_max)
		return -1;
	if (lhs->is_max > rhs->is_max)
		return 1;

	if (lhs->pool < rhs->pool)
		return -1;
	if (lhs->pool > rhs->pool)
		return 1;

	if (hoid_get_bitwise_key(lhs) < hoid_get_bitwise_key(rhs))
		return -1;
	if (hoid_get_bitwise_key(lhs) > hoid_get_bitwise_key(rhs))
		return 1;

	ret = compare_names(lhs->nspace, lhs->nspace_len,
			    rhs->nspace, rhs->nspace_len);
	if (ret)
		return ret;

	hoid_get_effective_key(lhs, &effective_key1, &effective_key1_len);
	hoid_get_effective_key(rhs, &effective_key2, &effective_key2_len);
	ret = compare_names(effective_key1, effective_key1_len,
			    effective_key2, effective_key2_len);
	if (ret)
		return ret;

	ret = compare_names(lhs->oid, lhs->oid_len, rhs->oid, rhs->oid_len);
	if (ret)
		return ret;

	if (lhs->snapid < rhs->snapid)
		return -1;
	if (lhs->snapid > rhs->snapid)
		return 1;

	return 0;
}

/*
 * For decoding ->begin and ->end of MOSDBackoff only -- no MIN/MAX
 * compat stuff here.
 *
 * Assumes @hoid is zero-initialized.
*/ static int decode_hoid(void **p, void *end, struct ceph_hobject_id *hoid) { u8 struct_v; u32 struct_len; int ret; ret = ceph_start_decoding(p, end, 4, "hobject_t", &struct_v, &struct_len); if (ret) return ret; if (struct_v < 4) { pr_err("got struct_v %d < 4 of hobject_t\n", struct_v); goto e_inval; } hoid->key = ceph_extract_encoded_string(p, end, &hoid->key_len, GFP_NOIO); if (IS_ERR(hoid->key)) { ret = PTR_ERR(hoid->key); hoid->key = NULL; return ret; } hoid->oid = ceph_extract_encoded_string(p, end, &hoid->oid_len, GFP_NOIO); if (IS_ERR(hoid->oid)) { ret = PTR_ERR(hoid->oid); hoid->oid = NULL; return ret; } ceph_decode_64_safe(p, end, hoid->snapid, e_inval); ceph_decode_32_safe(p, end, hoid->hash, e_inval); ceph_decode_8_safe(p, end, hoid->is_max, e_inval); hoid->nspace = ceph_extract_encoded_string(p, end, &hoid->nspace_len, GFP_NOIO); if (IS_ERR(hoid->nspace)) { ret = PTR_ERR(hoid->nspace); hoid->nspace = NULL; return ret; } ceph_decode_64_safe(p, end, hoid->pool, e_inval); ceph_hoid_build_hash_cache(hoid); return 0; e_inval: return -EINVAL; } static int hoid_encoding_size(const struct ceph_hobject_id *hoid) { return 8 + 4 + 1 + 8 + /* snapid, hash, is_max, pool */ 4 + hoid->key_len + 4 + hoid->oid_len + 4 + hoid->nspace_len; } static void encode_hoid(void **p, void *end, const struct ceph_hobject_id *hoid) { ceph_start_encoding(p, 4, 3, hoid_encoding_size(hoid)); ceph_encode_string(p, end, hoid->key, hoid->key_len); ceph_encode_string(p, end, hoid->oid, hoid->oid_len); ceph_encode_64(p, hoid->snapid); ceph_encode_32(p, hoid->hash); ceph_encode_8(p, hoid->is_max); ceph_encode_string(p, end, hoid->nspace, hoid->nspace_len); ceph_encode_64(p, hoid->pool); } static void free_hoid(struct ceph_hobject_id *hoid) { if (hoid) { kfree(hoid->key); kfree(hoid->oid); kfree(hoid->nspace); kfree(hoid); } } static struct ceph_osd_backoff *alloc_backoff(void) { struct ceph_osd_backoff *backoff; backoff = kzalloc(sizeof(*backoff), GFP_NOIO); if (!backoff) return NULL; 
RB_CLEAR_NODE(&backoff->spg_node); RB_CLEAR_NODE(&backoff->id_node); return backoff; } static void free_backoff(struct ceph_osd_backoff *backoff) { WARN_ON(!RB_EMPTY_NODE(&backoff->spg_node)); WARN_ON(!RB_EMPTY_NODE(&backoff->id_node)); free_hoid(backoff->begin); free_hoid(backoff->end); kfree(backoff); } /* * Within a specific spgid, backoffs are managed by ->begin hoid. */ DEFINE_RB_INSDEL_FUNCS2(backoff, struct ceph_osd_backoff, begin, hoid_compare, RB_BYVAL, spg_node); static struct ceph_osd_backoff *lookup_containing_backoff(struct rb_root *root, const struct ceph_hobject_id *hoid) { struct rb_node *n = root->rb_node; while (n) { struct ceph_osd_backoff *cur = rb_entry(n, struct ceph_osd_backoff, spg_node); int cmp; cmp = hoid_compare(hoid, cur->begin); if (cmp < 0) { n = n->rb_left; } else if (cmp > 0) { if (hoid_compare(hoid, cur->end) < 0) return cur; n = n->rb_right; } else { return cur; } } return NULL; } /* * Each backoff has a unique id within its OSD session. */ DEFINE_RB_FUNCS(backoff_by_id, struct ceph_osd_backoff, id, id_node) static void clear_backoffs(struct ceph_osd *osd) { while (!RB_EMPTY_ROOT(&osd->o_backoff_mappings)) { struct ceph_spg_mapping *spg = rb_entry(rb_first(&osd->o_backoff_mappings), struct ceph_spg_mapping, node); while (!RB_EMPTY_ROOT(&spg->backoffs)) { struct ceph_osd_backoff *backoff = rb_entry(rb_first(&spg->backoffs), struct ceph_osd_backoff, spg_node); erase_backoff(&spg->backoffs, backoff); erase_backoff_by_id(&osd->o_backoffs_by_id, backoff); free_backoff(backoff); } erase_spg_mapping(&osd->o_backoff_mappings, spg); free_spg_mapping(spg); } } /* * Set up a temporary, non-owning view into @t. 
*/ static void hoid_fill_from_target(struct ceph_hobject_id *hoid, const struct ceph_osd_request_target *t) { hoid->key = NULL; hoid->key_len = 0; hoid->oid = t->target_oid.name; hoid->oid_len = t->target_oid.name_len; hoid->snapid = CEPH_NOSNAP; hoid->hash = t->pgid.seed; hoid->is_max = false; if (t->target_oloc.pool_ns) { hoid->nspace = t->target_oloc.pool_ns->str; hoid->nspace_len = t->target_oloc.pool_ns->len; } else { hoid->nspace = NULL; hoid->nspace_len = 0; } hoid->pool = t->target_oloc.pool; ceph_hoid_build_hash_cache(hoid); } static bool should_plug_request(struct ceph_osd_request *req) { struct ceph_osd *osd = req->r_osd; struct ceph_spg_mapping *spg; struct ceph_osd_backoff *backoff; struct ceph_hobject_id hoid; spg = lookup_spg_mapping(&osd->o_backoff_mappings, &req->r_t.spgid); if (!spg) return false; hoid_fill_from_target(&hoid, &req->r_t); backoff = lookup_containing_backoff(&spg->backoffs, &hoid); if (!backoff) return false; dout("%s req %p tid %llu backoff osd%d spgid %llu.%xs%d id %llu\n", __func__, req, req->r_tid, osd->o_osd, backoff->spgid.pgid.pool, backoff->spgid.pgid.seed, backoff->spgid.shard, backoff->id); return true; } /* * Keep get_num_data_items() in sync with this function. 
*/
/*
 * Attach the data descriptors of every op in @req to the request
 * and/or reply message with ceph_osdc_msg_data_add().  Returns early,
 * doing nothing, if either message already carries data items.
 */
static void setup_request_data(struct ceph_osd_request *req)
{
	struct ceph_msg *request_msg = req->r_request;
	struct ceph_msg *reply_msg = req->r_reply;
	struct ceph_osd_req_op *op;

	/* data items were already added -- don't add them twice */
	if (req->r_request->num_data_items || req->r_reply->num_data_items)
		return;

	WARN_ON(request_msg->data_length || reply_msg->data_length);
	for (op = req->r_ops; op != &req->r_ops[req->r_num_ops]; op++) {
		/* ops not listed below contribute no data items */
		switch (op->op) {
		/* request */
		case CEPH_OSD_OP_WRITE:
		case CEPH_OSD_OP_WRITEFULL:
			WARN_ON(op->indata_len != op->extent.length);
			ceph_osdc_msg_data_add(request_msg,
					       &op->extent.osd_data);
			break;
		case CEPH_OSD_OP_SETXATTR:
		case CEPH_OSD_OP_CMPXATTR:
			WARN_ON(op->indata_len != op->xattr.name_len +
						  op->xattr.value_len);
			ceph_osdc_msg_data_add(request_msg,
					       &op->xattr.osd_data);
			break;
		case CEPH_OSD_OP_NOTIFY_ACK:
			ceph_osdc_msg_data_add(request_msg,
					       &op->notify_ack.request_data);
			break;
		case CEPH_OSD_OP_COPY_FROM2:
			ceph_osdc_msg_data_add(request_msg,
					       &op->copy_from.osd_data);
			break;

		/* reply */
		case CEPH_OSD_OP_STAT:
			ceph_osdc_msg_data_add(reply_msg,
					       &op->raw_data_in);
			break;
		case CEPH_OSD_OP_READ:
		case CEPH_OSD_OP_SPARSE_READ:
			ceph_osdc_msg_data_add(reply_msg,
					       &op->extent.osd_data);
			break;
		case CEPH_OSD_OP_LIST_WATCHERS:
			ceph_osdc_msg_data_add(reply_msg,
					       &op->list_watchers.response_data);
			break;

		/* both */
		case CEPH_OSD_OP_CALL:
			WARN_ON(op->indata_len != op->cls.class_len +
						  op->cls.method_len +
						  op->cls.indata_len);
			ceph_osdc_msg_data_add(request_msg,
					       &op->cls.request_info);
			/* optional, can be NONE */
			ceph_osdc_msg_data_add(request_msg,
					       &op->cls.request_data);
			/* optional, can be NONE */
			ceph_osdc_msg_data_add(reply_msg,
					       &op->cls.response_data);
			break;
		case CEPH_OSD_OP_NOTIFY:
			ceph_osdc_msg_data_add(request_msg,
					       &op->notify.request_data);
			ceph_osdc_msg_data_add(reply_msg,
					       &op->notify.response_data);
			break;
		}
	}
}

/*
 * Encode @pgid at *p: a leading byte of 1 (presumably the encoding
 * version -- TODO confirm), the 64-bit pool, the 32-bit seed and a
 * 32-bit -1 in the "preferred" slot.
 */
static void encode_pgid(void **p, const struct ceph_pg *pgid)
{
	ceph_encode_8(p, 1);
	ceph_encode_64(p, pgid->pool);
	ceph_encode_32(p, pgid->seed);
	ceph_encode_32(p, -1); /* preferred */
}

/*
 * Encode @spgid at *p: an encoding-start header followed by the pgid
 * and the one-byte shard.
 */
static void encode_spgid(void **p, const struct ceph_spg *spgid)
{
	ceph_start_encoding(p, 1, 1, CEPH_PGID_ENCODING_LEN + 1);
	encode_pgid(p, &spgid->pgid);
	ceph_encode_8(p, spgid->shard);
}

/*
 * Encode @oloc at *p: an encoding-start header, the pool, a -1
 * "preferred" value, a zero key length and the pool namespace (a zero
 * length if there is none).
 */
static void encode_oloc(void **p, void *end,
			const struct ceph_object_locator *oloc)
{
	ceph_start_encoding(p, 5, 4, ceph_oloc_encoding_size(oloc));
	ceph_encode_64(p, oloc->pool);
	ceph_encode_32(p, -1); /* preferred */
	ceph_encode_32(p, 0);  /* key len */
	if (oloc->pool_ns)
		ceph_encode_string(p, end, oloc->pool_ns->str,
				   oloc->pool_ns->len);
	else
		ceph_encode_32(p, 0);
}

/*
 * Encode the front of @msg for @req in the MOSDOp v8 layout, except
 * for the trailing features field: 8 bytes are left free for it (see
 * the BUG_ON below).  Also attaches the ops' data items and fills in
 * the version, front_len, data_len and data_off header fields.
 * encode_request_finish() completes the message.
 */
static void encode_request_partial(struct ceph_osd_request *req,
				   struct ceph_msg *msg)
{
	void *p = msg->front.iov_base;
	void *const end = p + msg->front_alloc_len;
	u32 data_len = 0;
	int i;

	if (req->r_flags & CEPH_OSD_FLAG_WRITE) {
		/* snapshots aren't writeable */
		WARN_ON(req->r_snapid != CEPH_NOSNAP);
	} else {
		/* a non-write is expected to carry no mtime, offset or snapc */
		WARN_ON(req->r_mtime.tv_sec || req->r_mtime.tv_nsec ||
			req->r_data_offset || req->r_snapc);
	}

	setup_request_data(req);

	encode_spgid(&p, &req->r_t.spgid); /* actual spg */
	ceph_encode_32(&p, req->r_t.pgid.seed); /* raw hash */
	ceph_encode_32(&p, req->r_osdc->osdmap->epoch);
	ceph_encode_32(&p, req->r_flags);

	/* reqid */
	ceph_start_encoding(&p, 2, 2, sizeof(struct ceph_osd_reqid));
	memset(p, 0, sizeof(struct ceph_osd_reqid));
	p += sizeof(struct ceph_osd_reqid);

	/* trace */
	memset(p, 0, sizeof(struct ceph_blkin_trace_info));
	p += sizeof(struct ceph_blkin_trace_info);

	ceph_encode_32(&p, 0); /* client_inc, always 0 */
	/* takes @p by value, so the cursor is advanced by hand */
	ceph_encode_timespec64(p, &req->r_mtime);
	p += sizeof(struct ceph_timespec);

	encode_oloc(&p, end, &req->r_t.target_oloc);
	ceph_encode_string(&p, end, req->r_t.target_oid.name,
			   req->r_t.target_oid.name_len);

	/* ops, can imply data */
	ceph_encode_16(&p, req->r_num_ops);
	for (i = 0; i < req->r_num_ops; i++) {
		/* the return value is summed into the header's data_len */
		data_len += osd_req_encode_op(p, &req->r_ops[i]);
		p += sizeof(struct ceph_osd_op);
	}

	ceph_encode_64(&p, req->r_snapid); /* snapid */
	if (req->r_snapc) {
		ceph_encode_64(&p, req->r_snapc->seq);
		ceph_encode_32(&p, req->r_snapc->num_snaps);
		for (i = 0; i < req->r_snapc->num_snaps; i++)
			ceph_encode_64(&p, req->r_snapc->snaps[i]);
	} else {
		ceph_encode_64(&p, 0); /* snap_seq */
		ceph_encode_32(&p, 0); /* snaps len */
	}

	ceph_encode_32(&p, req->r_attempts); /* retry_attempt */
	BUG_ON(p > end - 8); /* space for features */

	msg->hdr.version = cpu_to_le16(8); /* MOSDOp v8 */
	/* front_len is finalized in encode_request_finish() */
	msg->front.iov_len = p - msg->front.iov_base;
	msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
	msg->hdr.data_len = cpu_to_le32(data_len);
	/*
	 * The header "data_off" is a hint to the receiver allowing it
	 * to align received data into its buffers such that there's no
	 * need to re-copy it before writing it to disk (direct I/O).
	 */
	msg->hdr.data_off = cpu_to_le16(req->r_data_offset);

	dout("%s req %p msg %p oid %s oid_len %d\n", __func__, req, msg,
	     req->r_t.target_oid.name, req->r_t.target_oid.name_len);
}

/*
 * Finish a front that was prepared by encode_request_partial().  If
 * the peer has RESEND_ON_SPLIT, the peer features are appended to the
 * v8 encoding.  Otherwise the front is re-encoded in place as v4.
 * Either way front.iov_len and hdr.front_len are set to the final
 * length.
 */
static void encode_request_finish(struct ceph_msg *msg)
{
	void *p = msg->front.iov_base;
	void *const partial_end = p + msg->front.iov_len;
	void *const end = p + msg->front_alloc_len;

	if (CEPH_HAVE_FEATURE(msg->con->peer_features, RESEND_ON_SPLIT)) {
		/* luminous OSD -- encode features and be done */
		p = partial_end;
		ceph_encode_64(&p, msg->con->peer_features);
	} else {
		/* mirrors the start of the v8 front, field for field */
		struct {
			char spgid[CEPH_ENCODING_START_BLK_LEN +
				   CEPH_PGID_ENCODING_LEN + 1];
			__le32 hash;
			__le32 epoch;
			__le32 flags;
			char reqid[CEPH_ENCODING_START_BLK_LEN +
				   sizeof(struct ceph_osd_reqid)];
			char trace[sizeof(struct ceph_blkin_trace_info)];
			__le32 client_inc;
			struct ceph_timespec mtime;
		} __packed head;
		struct ceph_pg pgid;
		void *oloc, *oid, *tail;
		int oloc_len, oid_len, tail_len;
		int len;

		/*
		 * Pre-luminous OSD -- reencode v8 into v4 using @head
		 * as a temporary buffer.  Encode the raw PG; the rest
		 * is just a matter of moving oloc, oid and tail blobs
		 * around.
		 */
		memcpy(&head, p, sizeof(head));
		p += sizeof(head);

		/* locate the oloc, oid and tail blobs in the v8 encoding */
		oloc = p;
		p += CEPH_ENCODING_START_BLK_LEN;
		pgid.pool = ceph_decode_64(&p);
		p += 4 + 4; /* preferred, key len */
		len = ceph_decode_32(&p);
		p += len;   /* nspace */
		oloc_len = p - oloc;

		oid = p;
		len = ceph_decode_32(&p);
		p += len;
		oid_len = p - oid;

		tail = p;
		tail_len = partial_end - p;

		/* rewrite from the start of the front in v4 field order */
		p = msg->front.iov_base;
		ceph_encode_copy(&p, &head.client_inc, sizeof(head.client_inc));
		ceph_encode_copy(&p, &head.epoch, sizeof(head.epoch));
		ceph_encode_copy(&p, &head.flags, sizeof(head.flags));
		ceph_encode_copy(&p, &head.mtime, sizeof(head.mtime));

		/* reassert_version */
		memset(p, 0, sizeof(struct ceph_eversion));
		p += sizeof(struct ceph_eversion);

		/* each BUG_ON checks the write cursor is still before the blob being moved */
		BUG_ON(p >= oloc);
		memmove(p, oloc, oloc_len);
		p += oloc_len;

		pgid.seed = le32_to_cpu(head.hash);
		encode_pgid(&p, &pgid); /* raw pg */

		BUG_ON(p >= oid);
		memmove(p, oid, oid_len);
		p += oid_len;

		/* tail -- ops, snapid, snapc, retry_attempt */
		BUG_ON(p >= tail);
		memmove(p, tail, tail_len);
		p += tail_len;

		msg->hdr.version = cpu_to_le16(4); /* MOSDOp v4 */
	}

	BUG_ON(p > end);
	msg->front.iov_len = p - msg->front.iov_base;
	msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);

	dout("%s msg %p tid %llu %u+%u+%u v%d\n", __func__, msg,
	     le64_to_cpu(msg->hdr.tid), le32_to_cpu(msg->hdr.front_len),
	     le32_to_cpu(msg->hdr.middle_len), le32_to_cpu(msg->hdr.data_len),
	     le16_to_cpu(msg->hdr.version));
}

/*
 * @req has to be assigned a tid and registered.
 *
 * Encodes @req and queues its request message on the OSD connection,
 * unless should_plug_request() reports a backoff, in which case
 * nothing is sent.
 */
static void send_request(struct ceph_osd_request *req)
{
	struct ceph_osd *osd = req->r_osd;

	verify_osd_locked(osd);
	WARN_ON(osd->o_osd != req->r_t.osd);

	/* backoff? */
	if (should_plug_request(req))
		return;

	/*
	 * We may have a previously queued request message hanging
	 * around.  Cancel it to avoid corrupting the msgr.
	 */
	if (req->r_sent)
		ceph_msg_revoke(req->r_request);

	req->r_flags |= CEPH_OSD_FLAG_KNOWN_REDIR;
	/* r_attempts is non-zero once the request has been sent before */
	if (req->r_attempts)
		req->r_flags |= CEPH_OSD_FLAG_RETRY;
	else
		WARN_ON(req->r_flags & CEPH_OSD_FLAG_RETRY);

	encode_request_partial(req, req->r_request);

	dout("%s req %p tid %llu to pgid %llu.%x spgid %llu.%xs%d osd%d e%u flags 0x%x attempt %d\n",
	     __func__, req, req->r_tid, req->r_t.pgid.pool, req->r_t.pgid.seed,
	     req->r_t.spgid.pgid.pool, req->r_t.spgid.pgid.seed,
	     req->r_t.spgid.shard, osd->o_osd, req->r_t.epoch, req->r_flags,
	     req->r_attempts);

	req->r_t.paused = false;
	req->r_stamp = jiffies;
	req->r_attempts++;

	req->r_sent = osd->o_incarnation;
	req->r_request->hdr.tid = cpu_to_le64(req->r_tid);
	/* an extra message reference is taken for the send */
	ceph_con_send(&osd->o_con, ceph_msg_get(req->r_request));
}

/*
 * Ask the monitor client for the osdmap following the current epoch.
 * The request is flagged continuous while the map has FULL, PAUSERD or
 * PAUSEWR set, and one-time otherwise.
 */
static void maybe_request_map(struct ceph_osd_client *osdc)
{
	bool continuous = false;

	verify_osdc_locked(osdc);
	WARN_ON(!osdc->osdmap->epoch);

	if (ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL) ||
	    ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSERD) ||
	    ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSEWR)) {
		dout("%s osdc %p continuous\n", __func__, osdc);
		continuous = true;
	} else {
		dout("%s osdc %p onetime\n", __func__, osdc);
	}

	if (ceph_monc_want_map(&osdc->client->monc, CEPH_SUB_OSDMAP,
			       osdc->osdmap->epoch + 1, continuous))
		ceph_monc_renew_subs(&osdc->client->monc);
}

static void complete_request(struct ceph_osd_request *req, int err);
static void send_map_check(struct ceph_osd_request *req);

/*
 * Compute the target of @req, assign it a tid, link it to its OSD
 * session and then send it, complete it with an error, or leave it
 * linked but unsent.
 *
 * @wrlocked says whether osdc->lock is held for write.  The promote
 * path releases it with up_read(), so when @wrlocked is false the lock
 * is presumably held for read on entry -- confirm against callers.  If
 * the lock was promoted here it is downgraded before returning.
 */
static void __submit_request(struct ceph_osd_request *req, bool wrlocked)
{
	struct ceph_osd_client *osdc = req->r_osdc;
	struct ceph_osd *osd;
	enum calc_target_result ct_res;
	int err = 0;
	bool need_send = false;
	bool promoted = false;

	WARN_ON(req->r_tid);
	dout("%s req %p wrlocked %d\n", __func__, req, wrlocked);

again:
	ct_res = calc_target(osdc, &req->r_t, false);
	/* the POOL_DNE case ends in send_map_check(), which verifies the write lock */
	if (ct_res == CALC_TARGET_POOL_DNE && !wrlocked)
		goto promote;

	osd = lookup_create_osd(osdc, req->r_t.osd, wrlocked);
	if (IS_ERR(osd)) {
		/* -EAGAIN without the write lock is the only expected failure */
		WARN_ON(PTR_ERR(osd) != -EAGAIN || wrlocked);
		goto promote;
	}

	/* decide between failing, pausing and sending the request */
	if (osdc->abort_err) {
		dout("req %p abort_err %d\n", req, osdc->abort_err);
		err = osdc->abort_err;
	} else if (osdc->osdmap->epoch < osdc->epoch_barrier) {
		dout("req %p epoch %u barrier %u\n", req, osdc->osdmap->epoch,
		     osdc->epoch_barrier);
		req->r_t.paused = true;
		maybe_request_map(osdc);
	} else if ((req->r_flags & CEPH_OSD_FLAG_WRITE) &&
		   ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSEWR)) {
		dout("req %p pausewr\n", req);
		req->r_t.paused = true;
		maybe_request_map(osdc);
	} else if ((req->r_flags & CEPH_OSD_FLAG_READ) &&
		   ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSERD)) {
		dout("req %p pauserd\n", req);
		req->r_t.paused = true;
		maybe_request_map(osdc);
	} else if ((req->r_flags & CEPH_OSD_FLAG_WRITE) &&
		   !(req->r_flags & (CEPH_OSD_FLAG_FULL_TRY |
				     CEPH_OSD_FLAG_FULL_FORCE)) &&
		   (ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL) ||
		    pool_full(osdc, req->r_t.base_oloc.pool))) {
		dout("req %p full/pool_full\n", req);
		if (ceph_test_opt(osdc->client, ABORT_ON_FULL)) {
			err = -ENOSPC;
		} else {
			if (ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL))
				pr_warn_ratelimited("cluster is full (osdmap FULL)\n");
			else
				pr_warn_ratelimited("pool %lld is full or reached quota\n",
						    req->r_t.base_oloc.pool);
			req->r_t.paused = true;
			maybe_request_map(osdc);
		}
	} else if (!osd_homeless(osd)) {
		need_send = true;
	} else {
		/* homeless OSD: nothing to send to, ask for a newer map */
		maybe_request_map(osdc);
	}

	mutex_lock(&osd->lock);
	/*
	 * Assign the tid atomically with send_request() to protect
	 * multiple writes to the same object from racing with each
	 * other, resulting in out of order ops on the OSDs.
	 */
	req->r_tid = atomic64_inc_return(&osdc->last_tid);
	link_request(osd, req);
	if (need_send)
		send_request(req);
	else if (err)
		complete_request(req, err);
	mutex_unlock(&osd->lock);

	if (!err && ct_res == CALC_TARGET_POOL_DNE)
		send_map_check(req);

	if (promoted)
		downgrade_write(&osdc->lock);
	return;

promote:
	/* swap the read lock for the write lock and start over */
	up_read(&osdc->lock);
	down_write(&osdc->lock);
	wrlocked = true;
	promoted = true;
	goto again;
}

/*
 * Account for @req: set the ONDISK flag, bump the client's request
 * count and record the start timestamps.
 */
static void account_request(struct ceph_osd_request *req)
{
	WARN_ON(req->r_flags & (CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK));
	WARN_ON(!(req->r_flags & (CEPH_OSD_FLAG_READ | CEPH_OSD_FLAG_WRITE)));

	req->r_flags |= CEPH_OSD_FLAG_ONDISK;
	atomic_inc(&req->r_osdc->num_requests);

	req->r_start_stamp = jiffies;
	req->r_start_latency = ktime_get();
}

/*
 * Take a reference on @req, account for it and submit it.
 */
static void submit_request(struct ceph_osd_request *req, bool wrlocked)
{
	ceph_osdc_get_request(req);
	account_request(req);
	__submit_request(req, wrlocked);
}

/*
 * Tear down the in-flight state of @req: record the end latency,
 * unlink it from its OSD session (if it has one), drop the request
 * count and revoke both of its messages.
 */
static void finish_request(struct ceph_osd_request *req)
{
	struct ceph_osd_client *osdc = req->r_osdc;

	/* a map check must not still be registered for this tid */
	WARN_ON(lookup_request_mc(&osdc->map_checks, req->r_tid));
	dout("%s req %p tid %llu\n", __func__, req, req->r_tid);

	req->r_end_latency = ktime_get();

	if (req->r_osd) {
		ceph_init_sparse_read(&req->r_osd->o_sparse_read);
		unlink_request(req->r_osd, req);
	}
	atomic_dec(&osdc->num_requests);

	/*
	 * If an OSD has failed or returned and a request has been sent
	 * twice, it's possible to get a reply and end up here while the
	 * request message is queued for delivery.  We will ignore the
	 * reply, so not a big deal, but better to try and catch it.
	 */
	ceph_msg_revoke(req->r_request);
	ceph_msg_revoke_incoming(req->r_reply);
}

/*
 * Run the request's callback (if set), wake all waiters on
 * r_completion and drop a reference on @req.
 */
static void __complete_request(struct ceph_osd_request *req)
{
	dout("%s req %p tid %llu cb %ps result %d\n", __func__, req,
	     req->r_tid, req->r_callback, req->r_result);

	if (req->r_callback)
		req->r_callback(req);
	complete_all(&req->r_completion);
	ceph_osdc_put_request(req);
}

/*
 * Work function wrapper: recover the request from its embedded work
 * item and complete it.
 */
static void complete_request_workfn(struct work_struct *work)
{
	struct ceph_osd_request *req =
	    container_of(work, struct ceph_osd_request, r_complete_work);

	__complete_request(req);
}

/*
 * This is open-coded in handle_reply().
 *
 * Store @err as the result, finish the request and queue its
 * completion on the client's completion workqueue.
 */
static void complete_request(struct ceph_osd_request *req, int err)
{
	dout("%s req %p tid %llu err %d\n", __func__, req, req->r_tid, err);

	req->r_result = err;
	finish_request(req);

	INIT_WORK(&req->r_complete_work, complete_request_workfn);
	queue_work(req->r_osdc->completion_wq, &req->r_complete_work);
}

/*
 * If @req is registered in osdc->map_checks, erase it from there and
 * drop a reference on it.  A no-op otherwise.
 */
static void cancel_map_check(struct ceph_osd_request *req)
{
	struct ceph_osd_client *osdc = req->r_osdc;
	struct ceph_osd_request *lookup_req;

	verify_osdc_wrlocked(osdc);

	lookup_req = lookup_request_mc(&osdc->map_checks, req->r_tid);
	if (!lookup_req)
		return;

	WARN_ON(lookup_req != req);
	erase_request_mc(&osdc->map_checks, req);
	ceph_osdc_put_request(req);
}

/*
 * Cancel @req: drop any pending map check, finish the request, wake
 * waiters and put a reference.  Unlike complete_request(), r_result is
 * not set and r_callback is not invoked.
 */
static void cancel_request(struct ceph_osd_request *req)
{
	dout("%s req %p tid %llu\n", __func__, req, req->r_tid);

	cancel_map_check(req);
	finish_request(req);
	complete_all(&req->r_completion);
	ceph_osdc_put_request(req);
}

/*
 * Abort @req: drop any pending map check and complete it with @err.
 */
static void abort_request(struct ceph_osd_request *req, int err)
{
	dout("%s req %p tid %llu err %d\n", __func__, req, req->r_tid, err);

	cancel_map_check(req);
	complete_request(req, err);
}

/*
 * Per-request callback that aborts @req; @arg points to the int error
 * code to abort with.
 */
static int abort_fn(struct ceph_osd_request *req, void *arg)
{
	int err = *(int *)arg;

	abort_request(req, err);
	return 0; /* continue iteration */
}

/*
 * Abort all in-flight requests with @err and arrange for all future
 * requests to be failed immediately.
*/ void ceph_osdc_abort_requests(struct ceph_osd_client *osdc, int err) { dout("%s osdc %p err %d\n", __func__, osdc, err); down_write(&osdc->lock); for_each_request(osdc, abort_fn, &err); osdc->abort_err = err; up_write(&osdc->lock); } EXPORT_SYMBOL(ceph_osdc_abort_requests); void ceph_osdc_clear_abort_err(struct ceph_osd_client *osdc) { down_write(&osdc->lock); osdc->abort_err = 0; up_write(&osdc->lock); } EXPORT_SYMBOL(ceph_osdc_clear_abort_err); static void update_epoch_barrier(struct ceph_osd_client *osdc, u32 eb) { if (likely(eb > osdc->epoch_barrier)) { dout("updating epoch_barrier from %u to %u\n", osdc->epoch_barrier, eb); osdc->epoch_barrier = eb; /* Request map if we're not to the barrier yet */ if (eb > osdc->osdmap->epoch) maybe_request_map(osdc); } } void ceph_osdc_update_epoch_barrier(struct ceph_osd_client *osdc, u32 eb) { down_read(&osdc->lock); if (unlikely(eb > osdc->epoch_barrier)) { up_read(&osdc->lock); down_write(&osdc->lock); update_epoch_barrier(osdc, eb); up_write(&osdc->lock); } else { up_read(&osdc->lock); } } EXPORT_SYMBOL(ceph_osdc_update_epoch_barrier); /* * We can end up releasing caps as a result of abort_request(). * In that case, we probably want to ensure that the cap release message * has an updated epoch barrier in it, so set the epoch barrier prior to * aborting the first request. */ static int abort_on_full_fn(struct ceph_osd_request *req, void *arg) { struct ceph_osd_client *osdc = req->r_osdc; bool *victims = arg; if ((req->r_flags & CEPH_OSD_FLAG_WRITE) && (ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL) || pool_full(osdc, req->r_t.base_oloc.pool))) { if (!*victims) { update_epoch_barrier(osdc, osdc->osdmap->epoch); *victims = true; } abort_request(req, -ENOSPC); } return 0; /* continue iteration */ } /* * Drop all pending requests that are stalled waiting on a full condition to * clear, and complete them with ENOSPC as the return code. Set the * osdc->epoch_barrier to the latest map epoch that we've seen if any were * cancelled. 
*/ static void ceph_osdc_abort_on_full(struct ceph_osd_client *osdc) { bool victims = false; if (ceph_test_opt(osdc->client, ABORT_ON_FULL) && (ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL) || have_pool_full(osdc))) for_each_request(osdc, abort_on_full_fn, &victims); } static void check_pool_dne(struct ceph_osd_request *req) { struct ceph_osd_client *osdc = req->r_osdc; struct ceph_osdmap *map = osdc->osdmap; verify_osdc_wrlocked(osdc); WARN_ON(!map->epoch); if (req->r_attempts) { /* * We sent a request earlier, which means that * previously the pool existed, and now it does not * (i.e., it was deleted). */ req->r_map_dne_bound = map->epoch; dout("%s req %p tid %llu pool disappeared\n", __func__, req, req->r_tid); } else { dout("%s req %p tid %llu map_dne_bound %u have %u\n", __func__, req, req->r_tid, req->r_map_dne_bound, map->epoch); } if (req->r_map_dne_bound) { if (map->epoch >= req->r_map_dne_bound) { /* we had a new enough map */ pr_info_ratelimited("tid %llu pool does not exist\n", req->r_tid); complete_request(req, -ENOENT); } } else { send_map_check(req); } } static void map_check_cb(struct ceph_mon_generic_request *greq) { struct ceph_osd_client *osdc = &greq->monc->client->osdc; struct ceph_osd_request *req; u64 tid = greq->private_data; WARN_ON(greq->result || !greq->u.newest); down_write(&osdc->lock); req = lookup_request_mc(&osdc->map_checks, tid); if (!req) { dout("%s tid %llu dne\n", __func__, tid); goto out_unlock; } dout("%s req %p tid %llu map_dne_bound %u newest %llu\n", __func__, req, req->r_tid, req->r_map_dne_bound, greq->u.newest); if (!req->r_map_dne_bound) req->r_map_dne_bound = greq->u.newest; erase_request_mc(&osdc->map_checks, req); check_pool_dne(req); ceph_osdc_put_request(req); out_unlock: up_write(&osdc->lock); } static void send_map_check(struct ceph_osd_request *req) { struct ceph_osd_client *osdc = req->r_osdc; struct ceph_osd_request *lookup_req; int ret; verify_osdc_wrlocked(osdc); lookup_req = lookup_request_mc(&osdc->map_checks, 
req->r_tid); if (lookup_req) { WARN_ON(lookup_req != req); return; } ceph_osdc_get_request(req); insert_request_mc(&osdc->map_checks, req); ret = ceph_monc_get_version_async(&osdc->client->monc, "osdmap", map_check_cb, req->r_tid); WARN_ON(ret); } /* * lingering requests, watch/notify v2 infrastructure */ static void linger_release(struct kref *kref) { struct ceph_osd_linger_request *lreq = container_of(kref, struct ceph_osd_linger_request, kref); dout("%s lreq %p reg_req %p ping_req %p\n", __func__, lreq, lreq->reg_req, lreq->ping_req); WARN_ON(!RB_EMPTY_NODE(&lreq->node)); WARN_ON(!RB_EMPTY_NODE(&lreq->osdc_node)); WARN_ON(!RB_EMPTY_NODE(&lreq->mc_node)); WARN_ON(!list_empty(&lreq->scan_item)); WARN_ON(!list_empty(&lreq->pending_lworks)); WARN_ON(lreq->osd); if (lreq->request_pl) ceph_pagelist_release(lreq->request_pl); if (lreq->notify_id_pages) ceph_release_page_vector(lreq->notify_id_pages, 1); ceph_osdc_put_request(lreq->reg_req); ceph_osdc_put_request(lreq->ping_req); target_destroy(&lreq->t); kfree(lreq); } static void linger_put(struct ceph_osd_linger_request *lreq) { if (lreq) kref_put(&lreq->kref, linger_release); } static struct ceph_osd_linger_request * linger_get(struct ceph_osd_linger_request *lreq) { kref_get(&lreq->kref); return lreq; } static struct ceph_osd_linger_request * linger_alloc(struct ceph_osd_client *osdc) { struct ceph_osd_linger_request *lreq; lreq = kzalloc(sizeof(*lreq), GFP_NOIO); if (!lreq) return NULL; kref_init(&lreq->kref); mutex_init(&lreq->lock); RB_CLEAR_NODE(&lreq->node); RB_CLEAR_NODE(&lreq->osdc_node); RB_CLEAR_NODE(&lreq->mc_node); INIT_LIST_HEAD(&lreq->scan_item); INIT_LIST_HEAD(&lreq->pending_lworks); init_completion(&lreq->reg_commit_wait); init_completion(&lreq->notify_finish_wait); lreq->osdc = osdc; target_init(&lreq->t); dout("%s lreq %p\n", __func__, lreq); return lreq; } DEFINE_RB_INSDEL_FUNCS(linger, struct ceph_osd_linger_request, linger_id, node) DEFINE_RB_FUNCS(linger_osdc, struct ceph_osd_linger_request, 
linger_id, osdc_node) DEFINE_RB_FUNCS(linger_mc, struct ceph_osd_linger_request, linger_id, mc_node) /* * Create linger request <-> OSD session relation. * * @lreq has to be registered, @osd may be homeless. */ static void link_linger(struct ceph_osd *osd, struct ceph_osd_linger_request *lreq) { verify_osd_locked(osd); WARN_ON(!lreq->linger_id || lreq->osd); dout("%s osd %p osd%d lreq %p linger_id %llu\n", __func__, osd, osd->o_osd, lreq, lreq->linger_id); if (!osd_homeless(osd)) __remove_osd_from_lru(osd); else atomic_inc(&osd->o_osdc->num_homeless); get_osd(osd); insert_linger(&osd->o_linger_requests, lreq); lreq->osd = osd; } static void unlink_linger(struct ceph_osd *osd, struct ceph_osd_linger_request *lreq) { verify_osd_locked(osd); WARN_ON(lreq->osd != osd); dout("%s osd %p osd%d lreq %p linger_id %llu\n", __func__, osd, osd->o_osd, lreq, lreq->linger_id); lreq->osd = NULL; erase_linger(&osd->o_linger_requests, lreq); put_osd(osd); if (!osd_homeless(osd)) maybe_move_osd_to_lru(osd); else atomic_dec(&osd->o_osdc->num_homeless); } static bool __linger_registered(struct ceph_osd_linger_request *lreq) { verify_osdc_locked(lreq->osdc); return !RB_EMPTY_NODE(&lreq->osdc_node); } static bool linger_registered(struct ceph_osd_linger_request *lreq) { struct ceph_osd_client *osdc = lreq->osdc; bool registered; down_read(&osdc->lock); registered = __linger_registered(lreq); up_read(&osdc->lock); return registered; } static void linger_register(struct ceph_osd_linger_request *lreq) { struct ceph_osd_client *osdc = lreq->osdc; verify_osdc_wrlocked(osdc); WARN_ON(lreq->linger_id); linger_get(lreq); lreq->linger_id = ++osdc->last_linger_id; insert_linger_osdc(&osdc->linger_requests, lreq); } static void linger_unregister(struct ceph_osd_linger_request *lreq) { struct ceph_osd_client *osdc = lreq->osdc; verify_osdc_wrlocked(osdc); erase_linger_osdc(&osdc->linger_requests, lreq); linger_put(lreq); } static void cancel_linger_request(struct ceph_osd_request *req) { struct 
ceph_osd_linger_request *lreq = req->r_priv; WARN_ON(!req->r_linger); cancel_request(req); linger_put(lreq); } struct linger_work { struct work_struct work; struct ceph_osd_linger_request *lreq; struct list_head pending_item; unsigned long queued_stamp; union { struct { u64 notify_id; u64 notifier_id; void *payload; /* points into @msg front */ size_t payload_len; struct ceph_msg *msg; /* for ceph_msg_put() */ } notify; struct { int err; } error; }; }; static struct linger_work *lwork_alloc(struct ceph_osd_linger_request *lreq, work_func_t workfn) { struct linger_work *lwork; lwork = kzalloc(sizeof(*lwork), GFP_NOIO); if (!lwork) return NULL; INIT_WORK(&lwork->work, workfn); INIT_LIST_HEAD(&lwork->pending_item); lwork->lreq = linger_get(lreq); return lwork; } static void lwork_free(struct linger_work *lwork) { struct ceph_osd_linger_request *lreq = lwork->lreq; mutex_lock(&lreq->lock); list_del(&lwork->pending_item); mutex_unlock(&lreq->lock); linger_put(lreq); kfree(lwork); } static void lwork_queue(struct linger_work *lwork) { struct ceph_osd_linger_request *lreq = lwork->lreq; struct ceph_osd_client *osdc = lreq->osdc; verify_lreq_locked(lreq); WARN_ON(!list_empty(&lwork->pending_item)); lwork->queued_stamp = jiffies; list_add_tail(&lwork->pending_item, &lreq->pending_lworks); queue_work(osdc->notify_wq, &lwork->work); } static void do_watch_notify(struct work_struct *w) { struct linger_work *lwork = container_of(w, struct linger_work, work); struct ceph_osd_linger_request *lreq = lwork->lreq; if (!linger_registered(lreq)) { dout("%s lreq %p not registered\n", __func__, lreq); goto out; } WARN_ON(!lreq->is_watch); dout("%s lreq %p notify_id %llu notifier_id %llu payload_len %zu\n", __func__, lreq, lwork->notify.notify_id, lwork->notify.notifier_id, lwork->notify.payload_len); lreq->wcb(lreq->data, lwork->notify.notify_id, lreq->linger_id, lwork->notify.notifier_id, lwork->notify.payload, lwork->notify.payload_len); out: ceph_msg_put(lwork->notify.msg); 
lwork_free(lwork); } static void do_watch_error(struct work_struct *w) { struct linger_work *lwork = container_of(w, struct linger_work, work); struct ceph_osd_linger_request *lreq = lwork->lreq; if (!linger_registered(lreq)) { dout("%s lreq %p not registered\n", __func__, lreq); goto out; } dout("%s lreq %p err %d\n", __func__, lreq, lwork->error.err); lreq->errcb(lreq->data, lreq->linger_id, lwork->error.err); out: lwork_free(lwork); } static void queue_watch_error(struct ceph_osd_linger_request *lreq) { struct linger_work *lwork; lwork = lwork_alloc(lreq, do_watch_error); if (!lwork) { pr_err("failed to allocate error-lwork\n"); return; } lwork->error.err = lreq->last_error; lwork_queue(lwork); } static void linger_reg_commit_complete(struct ceph_osd_linger_request *lreq, int result) { if (!completion_done(&lreq->reg_commit_wait)) { lreq->reg_commit_error = (result <= 0 ? result : 0); complete_all(&lreq->reg_commit_wait); } } static void linger_commit_cb(struct ceph_osd_request *req) { struct ceph_osd_linger_request *lreq = req->r_priv; mutex_lock(&lreq->lock); if (req != lreq->reg_req) { dout("%s lreq %p linger_id %llu unknown req (%p != %p)\n", __func__, lreq, lreq->linger_id, req, lreq->reg_req); goto out; } dout("%s lreq %p linger_id %llu result %d\n", __func__, lreq, lreq->linger_id, req->r_result); linger_reg_commit_complete(lreq, req->r_result); lreq->committed = true; if (!lreq->is_watch) { struct ceph_osd_data *osd_data = osd_req_op_data(req, 0, notify, response_data); void *p = page_address(osd_data->pages[0]); WARN_ON(req->r_ops[0].op != CEPH_OSD_OP_NOTIFY || osd_data->type != CEPH_OSD_DATA_TYPE_PAGES); /* make note of the notify_id */ if (req->r_ops[0].outdata_len >= sizeof(u64)) { lreq->notify_id = ceph_decode_64(&p); dout("lreq %p notify_id %llu\n", lreq, lreq->notify_id); } else { dout("lreq %p no notify_id\n", lreq); } } out: mutex_unlock(&lreq->lock); linger_put(lreq); } static int normalize_watch_error(int err) { /* * Translate ENOENT -> 
ENOTCONN so that a delete->disconnection
	 * notification and a failure to reconnect because we raced with
	 * the delete appear the same to the user.
	 */
	if (err == -ENOENT)
		err = -ENOTCONN;

	return err;
}

/*
 * Request callback installed by send_linger() for a watch reconnect.
 * A negative result is recorded in lreq->last_error (normalized) and
 * reported through queue_watch_error(), but only if no error has been
 * recorded yet.  Replies for a stale request are ignored.
 */
static void linger_reconnect_cb(struct ceph_osd_request *req)
{
	struct ceph_osd_linger_request *lreq = req->r_priv;

	mutex_lock(&lreq->lock);
	if (req != lreq->reg_req) {
		dout("%s lreq %p linger_id %llu unknown req (%p != %p)\n",
		     __func__, lreq, lreq->linger_id, req, lreq->reg_req);
		goto out;
	}

	dout("%s lreq %p linger_id %llu result %d last_error %d\n", __func__,
	     lreq, lreq->linger_id, req->r_result, lreq->last_error);
	if (req->r_result < 0) {
		if (!lreq->last_error) {
			lreq->last_error = normalize_watch_error(req->r_result);
			queue_watch_error(lreq);
		}
	}

out:
	mutex_unlock(&lreq->lock);
	linger_put(lreq);
}

/*
 * (Re)send the registration request for @lreq.
 *
 * Any previous reg_req is cancelled (if linked to an OSD) and put, then
 * a fresh single-op request is built: a RECONNECT for an already
 * committed watch, otherwise a WATCH or a NOTIFY.  Allocation failures
 * are fatal (BUG_ON).
 */
static void send_linger(struct ceph_osd_linger_request *lreq)
{
	struct ceph_osd_client *osdc = lreq->osdc;
	struct ceph_osd_request *req;
	int ret;

	verify_osdc_wrlocked(osdc);
	mutex_lock(&lreq->lock);
	dout("%s lreq %p linger_id %llu\n", __func__, lreq, lreq->linger_id);

	if (lreq->reg_req) {
		if (lreq->reg_req->r_osd)
			cancel_linger_request(lreq->reg_req);
		ceph_osdc_put_request(lreq->reg_req);
	}

	req = ceph_osdc_alloc_request(osdc, NULL, 1, true, GFP_NOIO);
	BUG_ON(!req);

	target_copy(&req->r_t, &lreq->t);
	req->r_mtime = lreq->mtime;

	if (lreq->is_watch && lreq->committed) {
		/* each reconnect bumps the registration generation */
		osd_req_op_watch_init(req, 0, CEPH_OSD_WATCH_OP_RECONNECT,
				      lreq->linger_id, ++lreq->register_gen);
		dout("lreq %p reconnect register_gen %u\n", lreq,
		     req->r_ops[0].watch.gen);
		req->r_callback = linger_reconnect_cb;
	} else {
		if (lreq->is_watch) {
			osd_req_op_watch_init(req, 0, CEPH_OSD_WATCH_OP_WATCH,
					      lreq->linger_id, 0);
		} else {
			lreq->notify_id = 0;

			/* the request op takes its own ref on the pagelist */
			refcount_inc(&lreq->request_pl->refcnt);
			osd_req_op_notify_init(req, 0, lreq->linger_id,
					       lreq->request_pl);
			ceph_osd_data_pages_init(
			    osd_req_op_data(req, 0, notify, response_data),
			    lreq->notify_id_pages, PAGE_SIZE, 0, false, false);
		}
		dout("lreq %p register\n",
lreq);
		req->r_callback = linger_commit_cb;
	}

	ret = ceph_osdc_alloc_messages(req, GFP_NOIO);
	BUG_ON(ret);

	/* the callback drops this reference */
	req->r_priv = linger_get(lreq);
	req->r_linger = true;
	lreq->reg_req = req;
	mutex_unlock(&lreq->lock);

	submit_request(req, true);
}

/*
 * Request callback for a watch ping.  A pong is only honoured when its
 * generation matches lreq->register_gen: success advances
 * watch_valid_thru to the time the ping was sent, and the first failure
 * is recorded in last_error and reported through queue_watch_error().
 */
static void linger_ping_cb(struct ceph_osd_request *req)
{
	struct ceph_osd_linger_request *lreq = req->r_priv;

	mutex_lock(&lreq->lock);
	if (req != lreq->ping_req) {
		dout("%s lreq %p linger_id %llu unknown req (%p != %p)\n",
		     __func__, lreq, lreq->linger_id, req, lreq->ping_req);
		goto out;
	}

	dout("%s lreq %p linger_id %llu result %d ping_sent %lu last_error %d\n",
	     __func__, lreq, lreq->linger_id, req->r_result, lreq->ping_sent,
	     lreq->last_error);
	if (lreq->register_gen == req->r_ops[0].watch.gen) {
		if (!req->r_result) {
			lreq->watch_valid_thru = lreq->ping_sent;
		} else if (!lreq->last_error) {
			lreq->last_error = normalize_watch_error(req->r_result);
			queue_watch_error(lreq);
		}
	} else {
		dout("lreq %p register_gen %u ignoring old pong %u\n", lreq,
		     lreq->register_gen, req->r_ops[0].watch.gen);
	}

out:
	mutex_unlock(&lreq->lock);
	linger_put(lreq);
}

/*
 * Build and send a CEPH_OSD_WATCH_OP_PING for @lreq, replacing any
 * previous ping request.  Nothing is sent while the osdmap has
 * CEPH_OSDMAP_PAUSERD set.  Allocation failures are fatal (BUG_ON).
 */
static void send_linger_ping(struct ceph_osd_linger_request *lreq)
{
	struct ceph_osd_client *osdc = lreq->osdc;
	struct ceph_osd_request *req;
	int ret;

	if (ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSERD)) {
		dout("%s PAUSERD\n", __func__);
		return;
	}

	lreq->ping_sent = jiffies;
	dout("%s lreq %p linger_id %llu ping_sent %lu register_gen %u\n",
	     __func__, lreq, lreq->linger_id, lreq->ping_sent,
	     lreq->register_gen);

	if (lreq->ping_req) {
		if (lreq->ping_req->r_osd)
			cancel_linger_request(lreq->ping_req);
		ceph_osdc_put_request(lreq->ping_req);
	}

	req = ceph_osdc_alloc_request(osdc, NULL, 1, true, GFP_NOIO);
	BUG_ON(!req);

	target_copy(&req->r_t, &lreq->t);
	osd_req_op_watch_init(req, 0, CEPH_OSD_WATCH_OP_PING, lreq->linger_id,
			      lreq->register_gen);
	req->r_callback = linger_ping_cb;

	ret = ceph_osdc_alloc_messages(req, GFP_NOIO);
	BUG_ON(ret);

	/* linger_ping_cb() drops this reference */
	req->r_priv = linger_get(lreq);
	req->r_linger = true;
	lreq->ping_req = req;

ceph_osdc_get_request(req); account_request(req); req->r_tid = atomic64_inc_return(&osdc->last_tid); link_request(lreq->osd, req); send_request(req); } static void linger_submit(struct ceph_osd_linger_request *lreq) { struct ceph_osd_client *osdc = lreq->osdc; struct ceph_osd *osd; down_write(&osdc->lock); linger_register(lreq); calc_target(osdc, &lreq->t, false); osd = lookup_create_osd(osdc, lreq->t.osd, true); link_linger(osd, lreq); send_linger(lreq); up_write(&osdc->lock); } static void cancel_linger_map_check(struct ceph_osd_linger_request *lreq) { struct ceph_osd_client *osdc = lreq->osdc; struct ceph_osd_linger_request *lookup_lreq; verify_osdc_wrlocked(osdc); lookup_lreq = lookup_linger_mc(&osdc->linger_map_checks, lreq->linger_id); if (!lookup_lreq) return; WARN_ON(lookup_lreq != lreq); erase_linger_mc(&osdc->linger_map_checks, lreq); linger_put(lreq); } /* * @lreq has to be both registered and linked. */ static void __linger_cancel(struct ceph_osd_linger_request *lreq) { if (lreq->ping_req && lreq->ping_req->r_osd) cancel_linger_request(lreq->ping_req); if (lreq->reg_req && lreq->reg_req->r_osd) cancel_linger_request(lreq->reg_req); cancel_linger_map_check(lreq); unlink_linger(lreq->osd, lreq); linger_unregister(lreq); } static void linger_cancel(struct ceph_osd_linger_request *lreq) { struct ceph_osd_client *osdc = lreq->osdc; down_write(&osdc->lock); if (__linger_registered(lreq)) __linger_cancel(lreq); up_write(&osdc->lock); } static void send_linger_map_check(struct ceph_osd_linger_request *lreq); static void check_linger_pool_dne(struct ceph_osd_linger_request *lreq) { struct ceph_osd_client *osdc = lreq->osdc; struct ceph_osdmap *map = osdc->osdmap; verify_osdc_wrlocked(osdc); WARN_ON(!map->epoch); if (lreq->register_gen) { lreq->map_dne_bound = map->epoch; dout("%s lreq %p linger_id %llu pool disappeared\n", __func__, lreq, lreq->linger_id); } else { dout("%s lreq %p linger_id %llu map_dne_bound %u have %u\n", __func__, lreq, lreq->linger_id, 
lreq->map_dne_bound, map->epoch); } if (lreq->map_dne_bound) { if (map->epoch >= lreq->map_dne_bound) { /* we had a new enough map */ pr_info("linger_id %llu pool does not exist\n", lreq->linger_id); linger_reg_commit_complete(lreq, -ENOENT); __linger_cancel(lreq); } } else { send_linger_map_check(lreq); } } static void linger_map_check_cb(struct ceph_mon_generic_request *greq) { struct ceph_osd_client *osdc = &greq->monc->client->osdc; struct ceph_osd_linger_request *lreq; u64 linger_id = greq->private_data; WARN_ON(greq->result || !greq->u.newest); down_write(&osdc->lock); lreq = lookup_linger_mc(&osdc->linger_map_checks, linger_id); if (!lreq) { dout("%s linger_id %llu dne\n", __func__, linger_id); goto out_unlock; } dout("%s lreq %p linger_id %llu map_dne_bound %u newest %llu\n", __func__, lreq, lreq->linger_id, lreq->map_dne_bound, greq->u.newest); if (!lreq->map_dne_bound) lreq->map_dne_bound = greq->u.newest; erase_linger_mc(&osdc->linger_map_checks, lreq); check_linger_pool_dne(lreq); linger_put(lreq); out_unlock: up_write(&osdc->lock); } static void send_linger_map_check(struct ceph_osd_linger_request *lreq) { struct ceph_osd_client *osdc = lreq->osdc; struct ceph_osd_linger_request *lookup_lreq; int ret; verify_osdc_wrlocked(osdc); lookup_lreq = lookup_linger_mc(&osdc->linger_map_checks, lreq->linger_id); if (lookup_lreq) { WARN_ON(lookup_lreq != lreq); return; } linger_get(lreq); insert_linger_mc(&osdc->linger_map_checks, lreq); ret = ceph_monc_get_version_async(&osdc->client->monc, "osdmap", linger_map_check_cb, lreq->linger_id); WARN_ON(ret); } static int linger_reg_commit_wait(struct ceph_osd_linger_request *lreq) { int ret; dout("%s lreq %p linger_id %llu\n", __func__, lreq, lreq->linger_id); ret = wait_for_completion_killable(&lreq->reg_commit_wait); return ret ?: lreq->reg_commit_error; } static int linger_notify_finish_wait(struct ceph_osd_linger_request *lreq, unsigned long timeout) { long left; dout("%s lreq %p linger_id %llu\n", __func__, lreq, 
lreq->linger_id);
	left = wait_for_completion_killable_timeout(&lreq->notify_finish_wait,
						ceph_timeout_jiffies(timeout));
	/*
	 * 0 means the wait timed out, a negative value is passed through
	 * as the error, and a positive value means the notify completed.
	 */
	if (left <= 0)
		left = left ?: -ETIMEDOUT;
	else
		left = lreq->notify_finish_error; /* completed */

	return left;
}

/*
 * Timeout callback, called every N seconds.  When 1 or more OSD
 * requests have been active for more than N seconds, we send a keepalive
 * (tag + timestamp) to its OSD to ensure any communications channel
 * reset is detected.
 */
static void handle_timeout(struct work_struct *work)
{
	struct ceph_osd_client *osdc =
		container_of(work, struct ceph_osd_client, timeout_work.work);
	struct ceph_options *opts = osdc->client->options;
	/* requests stamped before these points are laggy / expired */
	unsigned long cutoff = jiffies - opts->osd_keepalive_timeout;
	unsigned long expiry_cutoff = jiffies - opts->osd_request_timeout;
	LIST_HEAD(slow_osds);
	struct rb_node *n, *p;

	dout("%s osdc %p\n", __func__, osdc);
	down_write(&osdc->lock);

	/*
	 * ping osds that are a bit slow.  this ensures that if there
	 * is a break in the TCP connection we will notice, and reopen
	 * a connection with that osd (from the fault callback).
*/
	for (n = rb_first(&osdc->osds); n; n = rb_next(n)) {
		struct ceph_osd *osd = rb_entry(n, struct ceph_osd, o_node);
		bool found = false;

		/* advance @p before the body: abort_request() may drop @req */
		for (p = rb_first(&osd->o_requests); p; ) {
			struct ceph_osd_request *req =
			    rb_entry(p, struct ceph_osd_request, r_node);

			p = rb_next(p); /* abort_request() */

			if (time_before(req->r_stamp, cutoff)) {
				dout(" req %p tid %llu on osd%d is laggy\n",
				     req, req->r_tid, osd->o_osd);
				found = true;
			}
			/* request expiry is only enforced when configured */
			if (opts->osd_request_timeout &&
			    time_before(req->r_start_stamp, expiry_cutoff)) {
				pr_err_ratelimited("tid %llu on osd%d timeout\n",
				       req->r_tid, osd->o_osd);
				abort_request(req, -ETIMEDOUT);
			}
		}
		/* any linger request marks the OSD for a keepalive */
		for (p = rb_first(&osd->o_linger_requests); p; p = rb_next(p)) {
			struct ceph_osd_linger_request *lreq =
			    rb_entry(p, struct ceph_osd_linger_request, node);

			dout(" lreq %p linger_id %llu is served by osd%d\n",
			     lreq, lreq->linger_id, osd->o_osd);
			found = true;

			mutex_lock(&lreq->lock);
			/* only ping established, error-free watches */
			if (lreq->is_watch && lreq->committed && !lreq->last_error)
				send_linger_ping(lreq);
			mutex_unlock(&lreq->lock);
		}

		if (found)
			list_move_tail(&osd->o_keepalive_item, &slow_osds);
	}

	/* requests not assigned to any OSD can expire as well */
	if (opts->osd_request_timeout) {
		for (p = rb_first(&osdc->homeless_osd.o_requests); p; ) {
			struct ceph_osd_request *req =
			    rb_entry(p, struct ceph_osd_request, r_node);

			p = rb_next(p); /* abort_request() */

			if (time_before(req->r_start_stamp, expiry_cutoff)) {
				pr_err_ratelimited("tid %llu on osd%d timeout\n",
				       req->r_tid, osdc->homeless_osd.o_osd);
				abort_request(req, -ETIMEDOUT);
			}
		}
	}

	if (atomic_read(&osdc->num_homeless) || !list_empty(&slow_osds))
		maybe_request_map(osdc);

	/* drain the list, sending one keepalive per slow OSD */
	while (!list_empty(&slow_osds)) {
		struct ceph_osd *osd = list_first_entry(&slow_osds,
							struct ceph_osd,
							o_keepalive_item);
		list_del_init(&osd->o_keepalive_item);
		ceph_con_keepalive(&osd->o_con);
	}

	up_write(&osdc->lock);
	/* re-arm: this work item reschedules itself */
	schedule_delayed_work(&osdc->timeout_work,
			      osdc->client->options->osd_keepalive_timeout);
}

/*
 * Delayed work that walks osdc->osd_lru and closes every OSD whose
 * lru_ttl has passed, then re-arms itself.
 */
static void handle_osds_timeout(struct work_struct *work)
{
	struct ceph_osd_client *osdc =
		container_of(work, struct
ceph_osd_client, osds_timeout_work.work); unsigned long delay = osdc->client->options->osd_idle_ttl / 4; struct ceph_osd *osd, *nosd; dout("%s osdc %p\n", __func__, osdc); down_write(&osdc->lock); list_for_each_entry_safe(osd, nosd, &osdc->osd_lru, o_osd_lru) { if (time_before(jiffies, osd->lru_ttl)) break; WARN_ON(!RB_EMPTY_ROOT(&osd->o_requests)); WARN_ON(!RB_EMPTY_ROOT(&osd->o_linger_requests)); close_osd(osd); } up_write(&osdc->lock); schedule_delayed_work(&osdc->osds_timeout_work, round_jiffies_relative(delay)); } static int ceph_oloc_decode(void **p, void *end, struct ceph_object_locator *oloc) { u8 struct_v, struct_cv; u32 len; void *struct_end; int ret = 0; ceph_decode_need(p, end, 1 + 1 + 4, e_inval); struct_v = ceph_decode_8(p); struct_cv = ceph_decode_8(p); if (struct_v < 3) { pr_warn("got v %d < 3 cv %d of ceph_object_locator\n", struct_v, struct_cv); goto e_inval; } if (struct_cv > 6) { pr_warn("got v %d cv %d > 6 of ceph_object_locator\n", struct_v, struct_cv); goto e_inval; } len = ceph_decode_32(p); ceph_decode_need(p, end, len, e_inval); struct_end = *p + len; oloc->pool = ceph_decode_64(p); *p += 4; /* skip preferred */ len = ceph_decode_32(p); if (len > 0) { pr_warn("ceph_object_locator::key is set\n"); goto e_inval; } if (struct_v >= 5) { bool changed = false; len = ceph_decode_32(p); if (len > 0) { ceph_decode_need(p, end, len, e_inval); if (!oloc->pool_ns || ceph_compare_string(oloc->pool_ns, *p, len)) changed = true; *p += len; } else { if (oloc->pool_ns) changed = true; } if (changed) { /* redirect changes namespace */ pr_warn("ceph_object_locator::nspace is changed\n"); goto e_inval; } } if (struct_v >= 6) { s64 hash = ceph_decode_64(p); if (hash != -1) { pr_warn("ceph_object_locator::hash is set\n"); goto e_inval; } } /* skip the rest */ *p = struct_end; out: return ret; e_inval: ret = -EINVAL; goto out; } static int ceph_redirect_decode(void **p, void *end, struct ceph_request_redirect *redir) { u8 struct_v, struct_cv; u32 len; void 
*struct_end; int ret; ceph_decode_need(p, end, 1 + 1 + 4, e_inval); struct_v = ceph_decode_8(p); struct_cv = ceph_decode_8(p); if (struct_cv > 1) { pr_warn("got v %d cv %d > 1 of ceph_request_redirect\n", struct_v, struct_cv); goto e_inval; } len = ceph_decode_32(p); ceph_decode_need(p, end, len, e_inval); struct_end = *p + len; ret = ceph_oloc_decode(p, end, &redir->oloc); if (ret) goto out; len = ceph_decode_32(p); if (len > 0) { pr_warn("ceph_request_redirect::object_name is set\n"); goto e_inval; } /* skip the rest */ *p = struct_end; out: return ret; e_inval: ret = -EINVAL; goto out; } struct MOSDOpReply { struct ceph_pg pgid; u64 flags; int result; u32 epoch; int num_ops; u32 outdata_len[CEPH_OSD_MAX_OPS]; s32 rval[CEPH_OSD_MAX_OPS]; int retry_attempt; struct ceph_eversion replay_version; u64 user_version; struct ceph_request_redirect redirect; }; static int decode_MOSDOpReply(const struct ceph_msg *msg, struct MOSDOpReply *m) { void *p = msg->front.iov_base; void *const end = p + msg->front.iov_len; u16 version = le16_to_cpu(msg->hdr.version); struct ceph_eversion bad_replay_version; u8 decode_redir; u32 len; int ret; int i; ceph_decode_32_safe(&p, end, len, e_inval); ceph_decode_need(&p, end, len, e_inval); p += len; /* skip oid */ ret = ceph_decode_pgid(&p, end, &m->pgid); if (ret) return ret; ceph_decode_64_safe(&p, end, m->flags, e_inval); ceph_decode_32_safe(&p, end, m->result, e_inval); ceph_decode_need(&p, end, sizeof(bad_replay_version), e_inval); memcpy(&bad_replay_version, p, sizeof(bad_replay_version)); p += sizeof(bad_replay_version); ceph_decode_32_safe(&p, end, m->epoch, e_inval); ceph_decode_32_safe(&p, end, m->num_ops, e_inval); if (m->num_ops > ARRAY_SIZE(m->outdata_len)) goto e_inval; ceph_decode_need(&p, end, m->num_ops * sizeof(struct ceph_osd_op), e_inval); for (i = 0; i < m->num_ops; i++) { struct ceph_osd_op *op = p; m->outdata_len[i] = le32_to_cpu(op->payload_len); p += sizeof(*op); } ceph_decode_32_safe(&p, end, m->retry_attempt, 
e_inval);
	for (i = 0; i < m->num_ops; i++)
		ceph_decode_32_safe(&p, end, m->rval[i], e_inval);

	if (version >= 5) {
		ceph_decode_need(&p, end, sizeof(m->replay_version), e_inval);
		memcpy(&m->replay_version, p, sizeof(m->replay_version));
		p += sizeof(m->replay_version);
		ceph_decode_64_safe(&p, end, m->user_version, e_inval);
	} else {
		/* older replies: fall back to the version decoded earlier */
		m->replay_version = bad_replay_version; /* struct */
		m->user_version = le64_to_cpu(m->replay_version.version);
	}

	/*
	 * v6 always carries a redirect, v7+ carries a flag saying whether
	 * one follows, older versions carry none.
	 */
	if (version >= 6) {
		if (version >= 7)
			ceph_decode_8_safe(&p, end, decode_redir, e_inval);
		else
			decode_redir = 1;
	} else {
		decode_redir = 0;
	}

	if (decode_redir) {
		ret = ceph_redirect_decode(&p, end, &m->redirect);
		if (ret)
			return ret;
	} else {
		ceph_oloc_init(&m->redirect.oloc);
	}

	return 0;

e_inval:
	return -EINVAL;
}

/*
 * Handle MOSDOpReply.  Set ->r_result and call the callback if it is
 * specified.
 */
static void handle_reply(struct ceph_osd *osd, struct ceph_msg *msg)
{
	struct ceph_osd_client *osdc = osd->o_osdc;
	struct ceph_osd_request *req;
	struct MOSDOpReply m;
	u64 tid = le64_to_cpu(msg->hdr.tid);
	u32 data_len = 0;
	int ret;
	int i;

	dout("%s msg %p tid %llu\n", __func__, msg, tid);

	down_read(&osdc->lock);
	if (!osd_registered(osd)) {
		dout("%s osd%d unknown\n", __func__, osd->o_osd);
		goto out_unlock_osdc;
	}
	WARN_ON(osd->o_osd != le64_to_cpu(msg->hdr.src.num));

	mutex_lock(&osd->lock);
	req = lookup_request(&osd->o_requests, tid);
	if (!req) {
		dout("%s osd%d tid %llu unknown\n", __func__, osd->o_osd, tid);
		goto out_unlock_session;
	}

	/*
	 * Lend the request's namespace to the decoder for its "namespace
	 * changed" comparison, then clear the borrowed pointer again.
	 */
	m.redirect.oloc.pool_ns = req->r_t.target_oloc.pool_ns;
	ret = decode_MOSDOpReply(msg, &m);
	m.redirect.oloc.pool_ns = NULL;
	if (ret) {
		pr_err("failed to decode MOSDOpReply for tid %llu: %d\n",
		       req->r_tid, ret);
		ceph_msg_dump(msg);
		goto fail_request;
	}
	dout("%s req %p tid %llu flags 0x%llx pgid %llu.%x epoch %u attempt %d v %u'%llu uv %llu\n",
	     __func__, req, req->r_tid, m.flags, m.pgid.pool, m.pgid.seed,
	     m.epoch, m.retry_attempt, le32_to_cpu(m.replay_version.epoch),
	     le64_to_cpu(m.replay_version.version),
m.user_version);

	/* drop replies that answer an earlier attempt of this request */
	if (m.retry_attempt >= 0) {
		if (m.retry_attempt != req->r_attempts - 1) {
			dout("req %p tid %llu retry_attempt %d != %d, ignoring\n",
			     req, req->r_tid, m.retry_attempt,
			     req->r_attempts - 1);
			goto out_unlock_session;
		}
	} else {
		WARN_ON(1); /* MOSDOpReply v4 is assumed */
	}

	/* redirect: retarget to the new pool and resubmit with a new tid */
	if (!ceph_oloc_empty(&m.redirect.oloc)) {
		dout("req %p tid %llu redirect pool %lld\n", req, req->r_tid,
		     m.redirect.oloc.pool);
		unlink_request(osd, req);
		mutex_unlock(&osd->lock);

		/*
		 * Not ceph_oloc_copy() - changing pool_ns is not
		 * supported.
		 */
		req->r_t.target_oloc.pool = m.redirect.oloc.pool;
		req->r_flags |= CEPH_OSD_FLAG_REDIRECTED |
				CEPH_OSD_FLAG_IGNORE_OVERLAY |
				CEPH_OSD_FLAG_IGNORE_CACHE;
		req->r_tid = 0;
		__submit_request(req, false);
		goto out_unlock_osdc;
	}

	if (m.result == -EAGAIN) {
		dout("req %p tid %llu EAGAIN\n", req, req->r_tid);
		unlink_request(osd, req);
		mutex_unlock(&osd->lock);

		/*
		 * The object is missing on the replica or not (yet)
		 * readable.  Clear pgid to force a resend to the primary
		 * via legacy_change.
		 */
		req->r_t.pgid.pool = 0;
		req->r_t.pgid.seed = 0;
		WARN_ON(!req->r_t.used_replica);
		req->r_flags &= ~(CEPH_OSD_FLAG_BALANCE_READS |
				  CEPH_OSD_FLAG_LOCALIZE_READS);
		req->r_tid = 0;
		__submit_request(req, false);
		goto out_unlock_osdc;
	}

	if (m.num_ops != req->r_num_ops) {
		pr_err("num_ops %d != %d for tid %llu\n", m.num_ops,
		       req->r_num_ops, req->r_tid);
		goto fail_request;
	}
	/* copy per-op results and total up the expected data length */
	for (i = 0; i < req->r_num_ops; i++) {
		dout(" req %p tid %llu op %d rval %d len %u\n", req,
		     req->r_tid, i, m.rval[i], m.outdata_len[i]);
		req->r_ops[i].rval = m.rval[i];
		req->r_ops[i].outdata_len = m.outdata_len[i];
		data_len += m.outdata_len[i];
	}
	if (data_len != le32_to_cpu(msg->hdr.data_len)) {
		pr_err("sum of lens %u != %u for tid %llu\n", data_len,
		       le32_to_cpu(msg->hdr.data_len), req->r_tid);
		goto fail_request;
	}
	dout("%s req %p tid %llu result %d data_len %u\n", __func__,
	     req, req->r_tid, m.result, data_len);

	/*
	 * Since we only ever request ONDISK, we should only ever get
	 * one (type of) reply back.
*/ WARN_ON(!(m.flags & CEPH_OSD_FLAG_ONDISK)); req->r_version = m.user_version; req->r_result = m.result ?: data_len; finish_request(req); mutex_unlock(&osd->lock); up_read(&osdc->lock); __complete_request(req); return; fail_request: complete_request(req, -EIO); out_unlock_session: mutex_unlock(&osd->lock); out_unlock_osdc: up_read(&osdc->lock); } static void set_pool_was_full(struct ceph_osd_client *osdc) { struct rb_node *n; for (n = rb_first(&osdc->osdmap->pg_pools); n; n = rb_next(n)) { struct ceph_pg_pool_info *pi = rb_entry(n, struct ceph_pg_pool_info, node); pi->was_full = __pool_full(pi); } } static bool pool_cleared_full(struct ceph_osd_client *osdc, s64 pool_id) { struct ceph_pg_pool_info *pi; pi = ceph_pg_pool_by_id(osdc->osdmap, pool_id); if (!pi) return false; return pi->was_full && !__pool_full(pi); } static enum calc_target_result recalc_linger_target(struct ceph_osd_linger_request *lreq) { struct ceph_osd_client *osdc = lreq->osdc; enum calc_target_result ct_res; ct_res = calc_target(osdc, &lreq->t, true); if (ct_res == CALC_TARGET_NEED_RESEND) { struct ceph_osd *osd; osd = lookup_create_osd(osdc, lreq->t.osd, true); if (osd != lreq->osd) { unlink_linger(lreq->osd, lreq); link_linger(osd, lreq); } } return ct_res; } /* * Requeue requests whose mapping to an OSD has changed. 
*/
static void scan_requests(struct ceph_osd *osd,
			  bool force_resend,
			  bool cleared_full,
			  bool check_pool_cleared_full,
			  struct rb_root *need_resend,
			  struct list_head *need_resend_linger)
{
	struct ceph_osd_client *osdc = osd->o_osdc;
	struct rb_node *n;
	bool force_resend_writes;

	/* first pass: linger requests served by this OSD */
	for (n = rb_first(&osd->o_linger_requests); n; ) {
		struct ceph_osd_linger_request *lreq =
		    rb_entry(n, struct ceph_osd_linger_request, node);
		enum calc_target_result ct_res;

		/* advance first: the call below may relink @lreq */
		n = rb_next(n); /* recalc_linger_target() */

		dout("%s lreq %p linger_id %llu\n", __func__, lreq,
		     lreq->linger_id);
		ct_res = recalc_linger_target(lreq);
		switch (ct_res) {
		case CALC_TARGET_NO_ACTION:
			/* unchanged target: resend only if forced */
			force_resend_writes = cleared_full ||
			    (check_pool_cleared_full &&
			     pool_cleared_full(osdc, lreq->t.base_oloc.pool));
			if (!force_resend && !force_resend_writes)
				break;

			fallthrough;
		case CALC_TARGET_NEED_RESEND:
			cancel_linger_map_check(lreq);
			/*
			 * scan_requests() for the previous epoch(s)
			 * may have already added it to the list, since
			 * it's not unlinked here.
*/
			if (list_empty(&lreq->scan_item))
				list_add_tail(&lreq->scan_item, need_resend_linger);
			break;
		case CALC_TARGET_POOL_DNE:
			list_del_init(&lreq->scan_item);
			check_linger_pool_dne(lreq);
			break;
		}
	}

	/* second pass: regular requests on this OSD */
	for (n = rb_first(&osd->o_requests); n; ) {
		struct ceph_osd_request *req =
		    rb_entry(n, struct ceph_osd_request, r_node);
		enum calc_target_result ct_res;

		n = rb_next(n); /* unlink_request(), check_pool_dne() */

		dout("%s req %p tid %llu\n", __func__, req, req->r_tid);
		ct_res = calc_target(osdc, &req->r_t, false);
		switch (ct_res) {
		case CALC_TARGET_NO_ACTION:
			/*
			 * Unchanged target: resend if forced, or if this is
			 * a write and writes are being force-resent.
			 */
			force_resend_writes = cleared_full ||
			    (check_pool_cleared_full &&
			     pool_cleared_full(osdc, req->r_t.base_oloc.pool));
			if (!force_resend &&
			    (!(req->r_flags & CEPH_OSD_FLAG_WRITE) ||
			     !force_resend_writes))
				break;

			fallthrough;
		case CALC_TARGET_NEED_RESEND:
			/* move the request from the OSD to @need_resend */
			cancel_map_check(req);
			unlink_request(osd, req);
			insert_request(need_resend, req);
			break;
		case CALC_TARGET_POOL_DNE:
			check_pool_dne(req);
			break;
		}
	}
}

/*
 * Decode one incremental or full osdmap from [@p, @end), install it as
 * osdc->osdmap and collect the requests that need to be resent.
 *
 * Returns 0 on success or the error encoded in the decoder's result.
 */
static int handle_one_map(struct ceph_osd_client *osdc,
			  void *p, void *end, bool incremental,
			  struct rb_root *need_resend,
			  struct list_head *need_resend_linger)
{
	struct ceph_osdmap *newmap;
	struct rb_node *n;
	bool skipped_map = false;
	bool was_full;

	/* capture the "full" state before the map is replaced */
	was_full = ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL);
	set_pool_was_full(osdc);

	if (incremental)
		newmap = osdmap_apply_incremental(&p, end,
						  ceph_msgr2(osdc->client),
						  osdc->osdmap);
	else
		newmap = ceph_osdmap_decode(&p, end, ceph_msgr2(osdc->client));
	if (IS_ERR(newmap))
		return PTR_ERR(newmap);

	if (newmap != osdc->osdmap) {
		/*
		 * Preserve ->was_full before destroying the old map.
		 * For pools that weren't in the old map, ->was_full
		 * should be false.
*/
		for (n = rb_first(&newmap->pg_pools); n; n = rb_next(n)) {
			struct ceph_pg_pool_info *pi =
			    rb_entry(n, struct ceph_pg_pool_info, node);
			struct ceph_pg_pool_info *old_pi;

			old_pi = ceph_pg_pool_by_id(osdc->osdmap, pi->id);
			if (old_pi)
				pi->was_full = old_pi->was_full;
			else
				WARN_ON(pi->was_full);
		}

		/* a gap of more than one epoch means maps were skipped */
		if (osdc->osdmap->epoch &&
		    osdc->osdmap->epoch + 1 < newmap->epoch) {
			WARN_ON(incremental);
			skipped_map = true;
		}

		ceph_osdmap_destroy(osdc->osdmap);
		osdc->osdmap = newmap;
	}

	/* true only if the map was full before and is not full now */
	was_full &= !ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL);
	scan_requests(&osdc->homeless_osd, skipped_map, was_full, true,
		      need_resend, need_resend_linger);

	for (n = rb_first(&osdc->osds); n; ) {
		struct ceph_osd *osd = rb_entry(n, struct ceph_osd, o_node);

		n = rb_next(n); /* close_osd() */

		scan_requests(osd, skipped_map, was_full, true, need_resend,
			      need_resend_linger);
		/* close sessions to OSDs that are down or changed address */
		if (!ceph_osd_is_up(osdc->osdmap, osd->o_osd) ||
		    memcmp(&osd->o_con.peer_addr,
			   ceph_osd_addr(osdc->osdmap, osd->o_osd),
			   sizeof(struct ceph_entity_addr)))
			close_osd(osd);
	}

	return 0;
}

/*
 * Resend everything collected in @need_resend and
 * @need_resend_linger, relinking each regular request to the OSD its
 * target now maps to.
 */
static void kick_requests(struct ceph_osd_client *osdc,
			  struct rb_root *need_resend,
			  struct list_head *need_resend_linger)
{
	struct ceph_osd_linger_request *lreq, *nlreq;
	enum calc_target_result ct_res;
	struct rb_node *n;

	/* make sure need_resend targets reflect latest map */
	for (n = rb_first(need_resend); n; ) {
		struct ceph_osd_request *req =
		    rb_entry(n, struct ceph_osd_request, r_node);

		n = rb_next(n);

		if (req->r_t.epoch < osdc->osdmap->epoch) {
			ct_res = calc_target(osdc, &req->r_t, false);
			if (ct_res == CALC_TARGET_POOL_DNE) {
				erase_request(need_resend, req);
				check_pool_dne(req);
			}
		}
	}

	for (n = rb_first(need_resend); n; ) {
		struct ceph_osd_request *req =
		    rb_entry(n, struct ceph_osd_request, r_node);
		struct ceph_osd *osd;

		n = rb_next(n);
		erase_request(need_resend, req); /* before link_request() */

		osd = lookup_create_osd(osdc, req->r_t.osd, true);
		link_request(osd, req);
		/*
		 * Plain requests are sent unless homeless or paused;
		 * requests flagged r_linger are cancelled instead.
		 */
		if (!req->r_linger) {
			if (!osd_homeless(osd) && !req->r_t.paused)
				send_request(req);
		}
else {
			cancel_linger_request(req);
		}
	}

	/* linger requests are re-registered unless they are homeless */
	list_for_each_entry_safe(lreq, nlreq, need_resend_linger, scan_item) {
		if (!osd_homeless(lreq->osd))
			send_linger(lreq);

		list_del_init(&lreq->scan_item);
	}
}

/*
 * Process updated osd map.
 *
 * The message contains any number of incremental and full maps, normally
 * indicating some sort of topology change in the cluster.  Kick requests
 * off to different OSDs as needed.
 */
void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg)
{
	void *p = msg->front.iov_base;
	void *const end = p + msg->front.iov_len;
	u32 nr_maps, maplen;
	u32 epoch;
	struct ceph_fsid fsid;
	struct rb_root need_resend = RB_ROOT;
	LIST_HEAD(need_resend_linger);
	bool handled_incremental = false;
	bool was_pauserd, was_pausewr;
	bool pauserd, pausewr;
	int err;

	dout("%s have %u\n", __func__, osdc->osdmap->epoch);
	down_write(&osdc->lock);

	/* verify fsid */
	ceph_decode_need(&p, end, sizeof(fsid), bad);
	ceph_decode_copy(&p, &fsid, sizeof(fsid));
	if (ceph_check_fsid(osdc->client, &fsid) < 0)
		goto bad;

	/* remember the pause state that was in effect before the update */
	was_pauserd = ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSERD);
	was_pausewr = ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSEWR) ||
		      ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL) ||
		      have_pool_full(osdc);

	/* incremental maps */
	ceph_decode_32_safe(&p, end, nr_maps, bad);
	dout(" %d inc maps\n", nr_maps);
	while (nr_maps > 0) {
		ceph_decode_need(&p, end, 2*sizeof(u32), bad);
		epoch = ceph_decode_32(&p);
		maplen = ceph_decode_32(&p);
		ceph_decode_need(&p, end, maplen, bad);
		/* only the incremental that directly follows our epoch applies */
		if (osdc->osdmap->epoch &&
		    osdc->osdmap->epoch + 1 == epoch) {
			dout("applying incremental map %u len %d\n",
			     epoch, maplen);
			err = handle_one_map(osdc, p, p + maplen, true,
					     &need_resend, &need_resend_linger);
			if (err)
				goto bad;
			handled_incremental = true;
		} else {
			dout("ignoring incremental map %u len %d\n",
			     epoch, maplen);
		}
		p += maplen;
		nr_maps--;
	}
	/* full maps are not looked at once an incremental was applied */
	if (handled_incremental)
		goto done;

	/* full maps */
	ceph_decode_32_safe(&p, end, nr_maps, bad);
	dout(" %d full maps\n", nr_maps);
	while (nr_maps) {
		ceph_decode_need(&p,
end, 2*sizeof(u32), bad);
		epoch = ceph_decode_32(&p);
		maplen = ceph_decode_32(&p);
		ceph_decode_need(&p, end, maplen, bad);
		/* only the last full map is considered, and only if newer */
		if (nr_maps > 1) {
			dout("skipping non-latest full map %u len %d\n",
			     epoch, maplen);
		} else if (osdc->osdmap->epoch >= epoch) {
			dout("skipping full map %u len %d, "
			     "older than our %u\n", epoch, maplen,
			     osdc->osdmap->epoch);
		} else {
			dout("taking full map %u len %d\n", epoch, maplen);
			err = handle_one_map(osdc, p, p + maplen, false,
					     &need_resend, &need_resend_linger);
			if (err)
				goto bad;
		}
		p += maplen;
		nr_maps--;
	}

done:
	/*
	 * subscribe to subsequent osdmap updates if full to ensure
	 * we find out when we are no longer full and stop returning
	 * ENOSPC.
	 */
	pauserd = ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSERD);
	pausewr = ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSEWR) ||
		  ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL) ||
		  have_pool_full(osdc);
	if (was_pauserd || was_pausewr || pauserd || pausewr ||
	    osdc->osdmap->epoch < osdc->epoch_barrier)
		maybe_request_map(osdc);

	kick_requests(osdc, &need_resend, &need_resend_linger);

	ceph_osdc_abort_on_full(osdc);
	ceph_monc_got_map(&osdc->client->monc, CEPH_SUB_OSDMAP,
			  osdc->osdmap->epoch);
	up_write(&osdc->lock);
	wake_up_all(&osdc->client->auth_wq);
	return;

bad:
	/* reached for a truncated message, an fsid mismatch or a map error */
	pr_err("osdc handle_map corrupt msg\n");
	ceph_msg_dump(msg);
	up_write(&osdc->lock);
}

/*
 * Resubmit requests pending on the given osd.
 */
static void kick_osd_requests(struct ceph_osd *osd)
{
	struct rb_node *n;

	clear_backoffs(osd);

	for (n = rb_first(&osd->o_requests); n; ) {
		struct ceph_osd_request *req =
		    rb_entry(n, struct ceph_osd_request, r_node);

		n = rb_next(n); /* cancel_linger_request() */

		/* r_linger requests are cancelled here, not resent */
		if (!req->r_linger) {
			if (!req->r_t.paused)
				send_request(req);
		} else {
			cancel_linger_request(req);
		}
	}
	/* every linger request on this OSD is re-registered */
	for (n = rb_first(&osd->o_linger_requests); n; n = rb_next(n)) {
		struct ceph_osd_linger_request *lreq =
		    rb_entry(n, struct ceph_osd_linger_request, node);

		send_linger(lreq);
	}
}

/*
 * If the osd connection drops, we need to resubmit all requests.
*/
static void osd_fault(struct ceph_connection *con)
{
	struct ceph_osd *osd = con->private;
	struct ceph_osd_client *osdc = osd->o_osdc;

	dout("%s osd %p osd%d\n", __func__, osd, osd->o_osd);

	down_write(&osdc->lock);
	if (!osd_registered(osd)) {
		dout("%s osd%d unknown\n", __func__, osd->o_osd);
		goto out_unlock;
	}

	/* requests are kicked only when reopen_osd() returns 0 */
	if (!reopen_osd(osd))
		kick_osd_requests(osd);
	maybe_request_map(osdc);

out_unlock:
	up_write(&osdc->lock);
}

/* Decoded form of an MOSDBackoff message front. */
struct MOSDBackoff {
	struct ceph_spg spgid;
	u32 map_epoch;
	u8 op;
	u64 id;
	struct ceph_hobject_id *begin;	/* allocated by the decoder */
	struct ceph_hobject_id *end;	/* allocated by the decoder */
};

/*
 * Decode the front of @msg into @m.
 *
 * On success @m->begin and @m->end are newly allocated and owned by the
 * caller.  On failure everything allocated here has been freed again.
 *
 * Returns 0, -EINVAL for a truncated front, -ENOMEM, or the error of a
 * nested decoder.
 */
static int decode_MOSDBackoff(const struct ceph_msg *msg, struct MOSDBackoff *m)
{
	void *p = msg->front.iov_base;
	void *const end = p + msg->front.iov_len;
	u8 struct_v;
	u32 struct_len;
	int ret;

	ret = ceph_start_decoding(&p, end, 1, "spg_t", &struct_v, &struct_len);
	if (ret)
		return ret;

	ret = ceph_decode_pgid(&p, end, &m->spgid.pgid);
	if (ret)
		return ret;

	ceph_decode_8_safe(&p, end, m->spgid.shard, e_inval);
	ceph_decode_32_safe(&p, end, m->map_epoch, e_inval);
	ceph_decode_8_safe(&p, end, m->op, e_inval);
	ceph_decode_64_safe(&p, end, m->id, e_inval);

	m->begin = kzalloc(sizeof(*m->begin), GFP_NOIO);
	if (!m->begin)
		return -ENOMEM;

	ret = decode_hoid(&p, end, m->begin);
	if (ret) {
		free_hoid(m->begin);
		return ret;
	}

	m->end = kzalloc(sizeof(*m->end), GFP_NOIO);
	if (!m->end) {
		free_hoid(m->begin);
		return -ENOMEM;
	}

	ret = decode_hoid(&p, end, m->end);
	if (ret) {
		free_hoid(m->begin);
		free_hoid(m->end);
		return ret;
	}

	return 0;

e_inval:
	/* only reachable before anything has been allocated */
	return -EINVAL;
}

/*
 * Build the CEPH_OSD_BACKOFF_OP_ACK_BLOCK message for @backoff, stamped
 * with @map_epoch.  Returns NULL if the message cannot be allocated.
 */
static struct ceph_msg *create_backoff_message(
				const struct ceph_osd_backoff *backoff,
				u32 map_epoch)
{
	struct ceph_msg *msg;
	void *p, *end;
	int msg_size;

	/* the front is sized exactly; see the BUG_ON() after encoding */
	msg_size = CEPH_ENCODING_START_BLK_LEN +
			CEPH_PGID_ENCODING_LEN + 1; /* spgid */
	msg_size += 4 + 1 + 8; /* map_epoch, op, id */
	msg_size += CEPH_ENCODING_START_BLK_LEN +
			hoid_encoding_size(backoff->begin);
	msg_size += CEPH_ENCODING_START_BLK_LEN +
			hoid_encoding_size(backoff->end);

	msg = ceph_msg_new(CEPH_MSG_OSD_BACKOFF,
msg_size, GFP_NOIO, true);
	if (!msg)
		return NULL;

	p = msg->front.iov_base;
	end = p + msg->front_alloc_len;

	encode_spgid(&p, &backoff->spgid);
	ceph_encode_32(&p, map_epoch);
	ceph_encode_8(&p, CEPH_OSD_BACKOFF_OP_ACK_BLOCK);
	ceph_encode_64(&p, backoff->id);
	encode_hoid(&p, end, backoff->begin);
	encode_hoid(&p, end, backoff->end);
	/* the size computed up front must match what was encoded */
	BUG_ON(p != end);

	msg->front.iov_len = p - msg->front.iov_base;
	msg->hdr.version = cpu_to_le16(1); /* MOSDBackoff v1 */
	msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);

	return msg;
}

/*
 * Install the backoff described by @m on @osd and acknowledge it.
 *
 * The spg mapping is created on demand.  On success the new backoff
 * takes over @m->begin and @m->end, and both are set to NULL in @m.
 * Allocation failures are logged and abandon the operation.
 */
static void handle_backoff_block(struct ceph_osd *osd, struct MOSDBackoff *m)
{
	struct ceph_spg_mapping *spg;
	struct ceph_osd_backoff *backoff;
	struct ceph_msg *msg;

	dout("%s osd%d spgid %llu.%xs%d id %llu\n", __func__, osd->o_osd,
	     m->spgid.pgid.pool, m->spgid.pgid.seed, m->spgid.shard, m->id);

	spg = lookup_spg_mapping(&osd->o_backoff_mappings, &m->spgid);
	if (!spg) {
		spg = alloc_spg_mapping();
		if (!spg) {
			pr_err("%s failed to allocate spg\n", __func__);
			return;
		}
		spg->spgid = m->spgid; /* struct */
		insert_spg_mapping(&osd->o_backoff_mappings, spg);
	}

	backoff = alloc_backoff();
	if (!backoff) {
		pr_err("%s failed to allocate backoff\n", __func__);
		return;
	}
	backoff->spgid = m->spgid; /* struct */
	backoff->id = m->id;
	backoff->begin = m->begin;
	m->begin = NULL; /* backoff now owns this */
	backoff->end = m->end;
	m->end = NULL;   /* ditto */

	/* index the backoff both by range (per spg) and by id (per osd) */
	insert_backoff(&spg->backoffs, backoff);
	insert_backoff_by_id(&osd->o_backoffs_by_id, backoff);

	/*
	 * Ack with original backoff's epoch so that the OSD can
	 * discard this if there was a PG split.
*/ msg = create_backoff_message(backoff, m->map_epoch); if (!msg) { pr_err("%s failed to allocate msg\n", __func__); return; } ceph_con_send(&osd->o_con, msg); } static bool target_contained_by(const struct ceph_osd_request_target *t, const struct ceph_hobject_id *begin, const struct ceph_hobject_id *end) { struct ceph_hobject_id hoid; int cmp; hoid_fill_from_target(&hoid, t); cmp = hoid_compare(&hoid, begin); return !cmp || (cmp > 0 && hoid_compare(&hoid, end) < 0); } static void handle_backoff_unblock(struct ceph_osd *osd, const struct MOSDBackoff *m) { struct ceph_spg_mapping *spg; struct ceph_osd_backoff *backoff; struct rb_node *n; dout("%s osd%d spgid %llu.%xs%d id %llu\n", __func__, osd->o_osd, m->spgid.pgid.pool, m->spgid.pgid.seed, m->spgid.shard, m->id); backoff = lookup_backoff_by_id(&osd->o_backoffs_by_id, m->id); if (!backoff) { pr_err("%s osd%d spgid %llu.%xs%d id %llu backoff dne\n", __func__, osd->o_osd, m->spgid.pgid.pool, m->spgid.pgid.seed, m->spgid.shard, m->id); return; } if (hoid_compare(backoff->begin, m->begin) && hoid_compare(backoff->end, m->end)) { pr_err("%s osd%d spgid %llu.%xs%d id %llu bad range?\n", __func__, osd->o_osd, m->spgid.pgid.pool, m->spgid.pgid.seed, m->spgid.shard, m->id); /* unblock it anyway... */ } spg = lookup_spg_mapping(&osd->o_backoff_mappings, &backoff->spgid); BUG_ON(!spg); erase_backoff(&spg->backoffs, backoff); erase_backoff_by_id(&osd->o_backoffs_by_id, backoff); free_backoff(backoff); if (RB_EMPTY_ROOT(&spg->backoffs)) { erase_spg_mapping(&osd->o_backoff_mappings, spg); free_spg_mapping(spg); } for (n = rb_first(&osd->o_requests); n; n = rb_next(n)) { struct ceph_osd_request *req = rb_entry(n, struct ceph_osd_request, r_node); if (!ceph_spg_compare(&req->r_t.spgid, &m->spgid)) { /* * Match against @m, not @backoff -- the PG may * have split on the OSD. */ if (target_contained_by(&req->r_t, m->begin, m->end)) { /* * If no other installed backoff applies, * resend. 
*/ send_request(req); } } } } static void handle_backoff(struct ceph_osd *osd, struct ceph_msg *msg) { struct ceph_osd_client *osdc = osd->o_osdc; struct MOSDBackoff m; int ret; down_read(&osdc->lock); if (!osd_registered(osd)) { dout("%s osd%d unknown\n", __func__, osd->o_osd); up_read(&osdc->lock); return; } WARN_ON(osd->o_osd != le64_to_cpu(msg->hdr.src.num)); mutex_lock(&osd->lock); ret = decode_MOSDBackoff(msg, &m); if (ret) { pr_err("failed to decode MOSDBackoff: %d\n", ret); ceph_msg_dump(msg); goto out_unlock; } switch (m.op) { case CEPH_OSD_BACKOFF_OP_BLOCK: handle_backoff_block(osd, &m); break; case CEPH_OSD_BACKOFF_OP_UNBLOCK: handle_backoff_unblock(osd, &m); break; default: pr_err("%s osd%d unknown op %d\n", __func__, osd->o_osd, m.op); } free_hoid(m.begin); free_hoid(m.end); out_unlock: mutex_unlock(&osd->lock); up_read(&osdc->lock); } /* * Process osd watch notifications */ static void handle_watch_notify(struct ceph_osd_client *osdc, struct ceph_msg *msg) { void *p = msg->front.iov_base; void *const end = p + msg->front.iov_len; struct ceph_osd_linger_request *lreq; struct linger_work *lwork; u8 proto_ver, opcode; u64 cookie, notify_id; u64 notifier_id = 0; s32 return_code = 0; void *payload = NULL; u32 payload_len = 0; ceph_decode_8_safe(&p, end, proto_ver, bad); ceph_decode_8_safe(&p, end, opcode, bad); ceph_decode_64_safe(&p, end, cookie, bad); p += 8; /* skip ver */ ceph_decode_64_safe(&p, end, notify_id, bad); if (proto_ver >= 1) { ceph_decode_32_safe(&p, end, payload_len, bad); ceph_decode_need(&p, end, payload_len, bad); payload = p; p += payload_len; } if (le16_to_cpu(msg->hdr.version) >= 2) ceph_decode_32_safe(&p, end, return_code, bad); if (le16_to_cpu(msg->hdr.version) >= 3) ceph_decode_64_safe(&p, end, notifier_id, bad); down_read(&osdc->lock); lreq = lookup_linger_osdc(&osdc->linger_requests, cookie); if (!lreq) { dout("%s opcode %d cookie %llu dne\n", __func__, opcode, cookie); goto out_unlock_osdc; } mutex_lock(&lreq->lock); dout("%s 
opcode %d cookie %llu lreq %p is_watch %d\n", __func__, opcode, cookie, lreq, lreq->is_watch); if (opcode == CEPH_WATCH_EVENT_DISCONNECT) { if (!lreq->last_error) { lreq->last_error = -ENOTCONN; queue_watch_error(lreq); } } else if (!lreq->is_watch) { /* CEPH_WATCH_EVENT_NOTIFY_COMPLETE */ if (lreq->notify_id && lreq->notify_id != notify_id) { dout("lreq %p notify_id %llu != %llu, ignoring\n", lreq, lreq->notify_id, notify_id); } else if (!completion_done(&lreq->notify_finish_wait)) { struct ceph_msg_data *data = msg->num_data_items ? &msg->data[0] : NULL; if (data) { if (lreq->preply_pages) { WARN_ON(data->type != CEPH_MSG_DATA_PAGES); *lreq->preply_pages = data->pages; *lreq->preply_len = data->length; data->own_pages = false; } } lreq->notify_finish_error = return_code; complete_all(&lreq->notify_finish_wait); } } else { /* CEPH_WATCH_EVENT_NOTIFY */ lwork = lwork_alloc(lreq, do_watch_notify); if (!lwork) { pr_err("failed to allocate notify-lwork\n"); goto out_unlock_lreq; } lwork->notify.notify_id = notify_id; lwork->notify.notifier_id = notifier_id; lwork->notify.payload = payload; lwork->notify.payload_len = payload_len; lwork->notify.msg = ceph_msg_get(msg); lwork_queue(lwork); } out_unlock_lreq: mutex_unlock(&lreq->lock); out_unlock_osdc: up_read(&osdc->lock); return; bad: pr_err("osdc handle_watch_notify corrupt msg\n"); } /* * Register request, send initial attempt. */ void ceph_osdc_start_request(struct ceph_osd_client *osdc, struct ceph_osd_request *req) { down_read(&osdc->lock); submit_request(req, false); up_read(&osdc->lock); } EXPORT_SYMBOL(ceph_osdc_start_request); /* * Unregister request. If @req was registered, it isn't completed: * r_result isn't set and __complete_request() isn't invoked. * * If @req wasn't registered, this call may have raced with * handle_reply(), in which case r_result would already be set and * __complete_request() would be getting invoked, possibly even * concurrently with this call. 
*/ void ceph_osdc_cancel_request(struct ceph_osd_request *req) { struct ceph_osd_client *osdc = req->r_osdc; down_write(&osdc->lock); if (req->r_osd) cancel_request(req); up_write(&osdc->lock); } EXPORT_SYMBOL(ceph_osdc_cancel_request); /* * @timeout: in jiffies, 0 means "wait forever" */ static int wait_request_timeout(struct ceph_osd_request *req, unsigned long timeout) { long left; dout("%s req %p tid %llu\n", __func__, req, req->r_tid); left = wait_for_completion_killable_timeout(&req->r_completion, ceph_timeout_jiffies(timeout)); if (left <= 0) { left = left ?: -ETIMEDOUT; ceph_osdc_cancel_request(req); } else { left = req->r_result; /* completed */ } return left; } /* * wait for a request to complete */ int ceph_osdc_wait_request(struct ceph_osd_client *osdc, struct ceph_osd_request *req) { return wait_request_timeout(req, 0); } EXPORT_SYMBOL(ceph_osdc_wait_request); /* * sync - wait for all in-flight requests to flush. avoid starvation. */ void ceph_osdc_sync(struct ceph_osd_client *osdc) { struct rb_node *n, *p; u64 last_tid = atomic64_read(&osdc->last_tid); again: down_read(&osdc->lock); for (n = rb_first(&osdc->osds); n; n = rb_next(n)) { struct ceph_osd *osd = rb_entry(n, struct ceph_osd, o_node); mutex_lock(&osd->lock); for (p = rb_first(&osd->o_requests); p; p = rb_next(p)) { struct ceph_osd_request *req = rb_entry(p, struct ceph_osd_request, r_node); if (req->r_tid > last_tid) break; if (!(req->r_flags & CEPH_OSD_FLAG_WRITE)) continue; ceph_osdc_get_request(req); mutex_unlock(&osd->lock); up_read(&osdc->lock); dout("%s waiting on req %p tid %llu last_tid %llu\n", __func__, req, req->r_tid, last_tid); wait_for_completion(&req->r_completion); ceph_osdc_put_request(req); goto again; } mutex_unlock(&osd->lock); } up_read(&osdc->lock); dout("%s done last_tid %llu\n", __func__, last_tid); } EXPORT_SYMBOL(ceph_osdc_sync); /* * Returns a handle, caller owns a ref. 
*/
/*
 * Set up a watch on @oid/@oloc: allocates a linger request, marks it as
 * a watch with callbacks @wcb/@errcb and cookie @data, submits it and
 * waits for the registration to commit.
 *
 * On failure the linger request is cancelled (if it was submitted) and
 * put, and an ERR_PTR() is returned.
 */
struct ceph_osd_linger_request *
ceph_osdc_watch(struct ceph_osd_client *osdc,
		struct ceph_object_id *oid,
		struct ceph_object_locator *oloc,
		rados_watchcb2_t wcb,
		rados_watcherrcb_t errcb,
		void *data)
{
	struct ceph_osd_linger_request *lreq;
	int ret;

	lreq = linger_alloc(osdc);
	if (!lreq)
		return ERR_PTR(-ENOMEM);

	lreq->is_watch = true;
	lreq->wcb = wcb;
	lreq->errcb = errcb;
	lreq->data = data;
	lreq->watch_valid_thru = jiffies;

	ceph_oid_copy(&lreq->t.base_oid, oid);
	ceph_oloc_copy(&lreq->t.base_oloc, oloc);
	lreq->t.flags = CEPH_OSD_FLAG_WRITE;
	ktime_get_real_ts64(&lreq->mtime);

	linger_submit(lreq);
	ret = linger_reg_commit_wait(lreq);
	if (ret) {
		linger_cancel(lreq);
		goto err_put_lreq;
	}

	return lreq;

err_put_lreq:
	linger_put(lreq);
	return ERR_PTR(ret);
}
EXPORT_SYMBOL(ceph_osdc_watch);

/*
 * Releases a ref.
 *
 * Times out after mount_timeout to preserve rbd unmap behaviour
 * introduced in 2894e1d76974 ("rbd: timeout watch teardown on unmap
 * with mount_timeout").
 *
 * NOTE(review): on the two early -ENOMEM paths @lreq is neither
 * cancelled nor put, so the ref is released only once the UNWATCH
 * request has actually been started -- confirm callers expect that.
 */
int ceph_osdc_unwatch(struct ceph_osd_client *osdc,
		      struct ceph_osd_linger_request *lreq)
{
	struct ceph_options *opts = osdc->client->options;
	struct ceph_osd_request *req;
	int ret;

	req = ceph_osdc_alloc_request(osdc, NULL, 1, false, GFP_NOIO);
	if (!req)
		return -ENOMEM;

	ceph_oid_copy(&req->r_base_oid, &lreq->t.base_oid);
	ceph_oloc_copy(&req->r_base_oloc, &lreq->t.base_oloc);
	req->r_flags = CEPH_OSD_FLAG_WRITE;
	ktime_get_real_ts64(&req->r_mtime);
	osd_req_op_watch_init(req, 0, CEPH_OSD_WATCH_OP_UNWATCH,
			      lreq->linger_id, 0);

	ret = ceph_osdc_alloc_messages(req, GFP_NOIO);
	if (ret)
		goto out_put_req;

	/* start the UNWATCH first, then tear down the linger request */
	ceph_osdc_start_request(osdc, req);
	linger_cancel(lreq);
	linger_put(lreq);
	ret = wait_request_timeout(req, opts->mount_timeout);

out_put_req:
	ceph_osdc_put_request(req);
	return ret;
}
EXPORT_SYMBOL(ceph_osdc_unwatch);

/*
 * Initialize op @which of @req as CEPH_OSD_OP_NOTIFY_ACK.  The request
 * data is a pagelist holding @notify_id, @cookie and a length-prefixed
 * copy of @payload (length 0 when @payload is NULL).
 *
 * Returns 0, or -ENOMEM if the pagelist could not be allocated or
 * filled.
 */
static int osd_req_op_notify_ack_init(struct ceph_osd_request *req, int which,
				      u64 notify_id, u64 cookie, void *payload,
				      u32 payload_len)
{
	struct ceph_osd_req_op *op;
	struct ceph_pagelist *pl;
	int ret;

	op = osd_req_op_init(req, which, CEPH_OSD_OP_NOTIFY_ACK, 0);

	pl = ceph_pagelist_alloc(GFP_NOIO);
	if (!pl)
		return -ENOMEM;

	/* errors are OR-ed together; any non-zero result becomes -ENOMEM */
	ret = ceph_pagelist_encode_64(pl, notify_id);
	ret |= ceph_pagelist_encode_64(pl, cookie);
	if (payload) {
		ret |= ceph_pagelist_encode_32(pl, payload_len);
		ret |= ceph_pagelist_append(pl, payload, payload_len);
	} else {
		ret |= ceph_pagelist_encode_32(pl, 0);
	}
	if (ret) {
		ceph_pagelist_release(pl);
		return -ENOMEM;
	}

	ceph_osd_data_pagelist_init(&op->notify_ack.request_data, pl);
	op->indata_len = pl->length;
	return 0;
}

/*
 * Send a notify ack for @notify_id/@cookie on @oid/@oloc with an
 * optional @payload and wait for it to complete.
 *
 * Returns the result of ceph_osdc_wait_request(), or a negative error
 * if the request could not be set up.
 */
int ceph_osdc_notify_ack(struct ceph_osd_client *osdc,
			 struct ceph_object_id *oid,
			 struct ceph_object_locator *oloc,
			 u64 notify_id,
			 u64 cookie,
			 void *payload,
			 u32 payload_len)
{
	struct ceph_osd_request *req;
	int ret;

	req = ceph_osdc_alloc_request(osdc, NULL, 1, false, GFP_NOIO);
	if (!req)
		return -ENOMEM;

	ceph_oid_copy(&req->r_base_oid, oid);
	ceph_oloc_copy(&req->r_base_oloc, oloc);
	req->r_flags = CEPH_OSD_FLAG_READ;

	ret = osd_req_op_notify_ack_init(req, 0, notify_id, cookie, payload,
					 payload_len);
	if (ret)
		goto out_put_req;

	ret = ceph_osdc_alloc_messages(req, GFP_NOIO);
	if (ret)
		goto out_put_req;

	ceph_osdc_start_request(osdc, req);
	ret = ceph_osdc_wait_request(osdc, req);

out_put_req:
	ceph_osdc_put_request(req);
	return ret;
}
EXPORT_SYMBOL(ceph_osdc_notify_ack);

/*
 * @timeout: in seconds
 *
 * @preply_{pages,len} are initialized both on success and error.
* The caller is responsible for: * * ceph_release_page_vector(reply_pages, calc_pages_for(0, reply_len)) */ int ceph_osdc_notify(struct ceph_osd_client *osdc, struct ceph_object_id *oid, struct ceph_object_locator *oloc, void *payload, u32 payload_len, u32 timeout, struct page ***preply_pages, size_t *preply_len) { struct ceph_osd_linger_request *lreq; int ret; WARN_ON(!timeout); if (preply_pages) { *preply_pages = NULL; *preply_len = 0; } lreq = linger_alloc(osdc); if (!lreq) return -ENOMEM; lreq->request_pl = ceph_pagelist_alloc(GFP_NOIO); if (!lreq->request_pl) { ret = -ENOMEM; goto out_put_lreq; } ret = ceph_pagelist_encode_32(lreq->request_pl, 1); /* prot_ver */ ret |= ceph_pagelist_encode_32(lreq->request_pl, timeout); ret |= ceph_pagelist_encode_32(lreq->request_pl, payload_len); ret |= ceph_pagelist_append(lreq->request_pl, payload, payload_len); if (ret) { ret = -ENOMEM; goto out_put_lreq; } /* for notify_id */ lreq->notify_id_pages = ceph_alloc_page_vector(1, GFP_NOIO); if (IS_ERR(lreq->notify_id_pages)) { ret = PTR_ERR(lreq->notify_id_pages); lreq->notify_id_pages = NULL; goto out_put_lreq; } lreq->preply_pages = preply_pages; lreq->preply_len = preply_len; ceph_oid_copy(&lreq->t.base_oid, oid); ceph_oloc_copy(&lreq->t.base_oloc, oloc); lreq->t.flags = CEPH_OSD_FLAG_READ; linger_submit(lreq); ret = linger_reg_commit_wait(lreq); if (!ret) ret = linger_notify_finish_wait(lreq, msecs_to_jiffies(2 * timeout * MSEC_PER_SEC)); else dout("lreq %p failed to initiate notify %d\n", lreq, ret); linger_cancel(lreq); out_put_lreq: linger_put(lreq); return ret; } EXPORT_SYMBOL(ceph_osdc_notify); static int decode_watcher(void **p, void *end, struct ceph_watch_item *item) { u8 struct_v; u32 struct_len; int ret; ret = ceph_start_decoding(p, end, 2, "watch_item_t", &struct_v, &struct_len); if (ret) goto bad; ret = -EINVAL; ceph_decode_copy_safe(p, end, &item->name, sizeof(item->name), bad); ceph_decode_64_safe(p, end, item->cookie, bad); ceph_decode_skip_32(p, end, bad); 
/* skip timeout seconds */ if (struct_v >= 2) { ret = ceph_decode_entity_addr(p, end, &item->addr); if (ret) goto bad; } else { ret = 0; } dout("%s %s%llu cookie %llu addr %s\n", __func__, ENTITY_NAME(item->name), item->cookie, ceph_pr_addr(&item->addr)); bad: return ret; } static int decode_watchers(void **p, void *end, struct ceph_watch_item **watchers, u32 *num_watchers) { u8 struct_v; u32 struct_len; int i; int ret; ret = ceph_start_decoding(p, end, 1, "obj_list_watch_response_t", &struct_v, &struct_len); if (ret) return ret; *num_watchers = ceph_decode_32(p); *watchers = kcalloc(*num_watchers, sizeof(**watchers), GFP_NOIO); if (!*watchers) return -ENOMEM; for (i = 0; i < *num_watchers; i++) { ret = decode_watcher(p, end, *watchers + i); if (ret) { kfree(*watchers); return ret; } } return 0; } /* * On success, the caller is responsible for: * * kfree(watchers); */ int ceph_osdc_list_watchers(struct ceph_osd_client *osdc, struct ceph_object_id *oid, struct ceph_object_locator *oloc, struct ceph_watch_item **watchers, u32 *num_watchers) { struct ceph_osd_request *req; struct page **pages; int ret; req = ceph_osdc_alloc_request(osdc, NULL, 1, false, GFP_NOIO); if (!req) return -ENOMEM; ceph_oid_copy(&req->r_base_oid, oid); ceph_oloc_copy(&req->r_base_oloc, oloc); req->r_flags = CEPH_OSD_FLAG_READ; pages = ceph_alloc_page_vector(1, GFP_NOIO); if (IS_ERR(pages)) { ret = PTR_ERR(pages); goto out_put_req; } osd_req_op_init(req, 0, CEPH_OSD_OP_LIST_WATCHERS, 0); ceph_osd_data_pages_init(osd_req_op_data(req, 0, list_watchers, response_data), pages, PAGE_SIZE, 0, false, true); ret = ceph_osdc_alloc_messages(req, GFP_NOIO); if (ret) goto out_put_req; ceph_osdc_start_request(osdc, req); ret = ceph_osdc_wait_request(osdc, req); if (ret >= 0) { void *p = page_address(pages[0]); void *const end = p + req->r_ops[0].outdata_len; ret = decode_watchers(&p, end, watchers, num_watchers); } out_put_req: ceph_osdc_put_request(req); return ret; } 
EXPORT_SYMBOL(ceph_osdc_list_watchers);

/*
 * Call all pending notify callbacks - for use after a watch is
 * unregistered, to make sure no more callbacks for it will be invoked
 */
void ceph_osdc_flush_notifies(struct ceph_osd_client *osdc)
{
	dout("%s osdc %p\n", __func__, osdc);
	flush_workqueue(osdc->notify_wq);
}
EXPORT_SYMBOL(ceph_osdc_flush_notifies);

/*
 * Exported wrapper: calls maybe_request_map() with osdc->lock held for
 * read.
 */
void ceph_osdc_maybe_request_map(struct ceph_osd_client *osdc)
{
	down_read(&osdc->lock);
	maybe_request_map(osdc);
	up_read(&osdc->lock);
}
EXPORT_SYMBOL(ceph_osdc_maybe_request_map);

/*
 * Execute an OSD class method on an object.
 *
 * @flags: CEPH_OSD_FLAG_*
 * @resp_len: in/out param for reply length
 *
 * @req_len may not exceed PAGE_SIZE (-E2BIG otherwise).  On a
 * non-negative wait result the return value is the op's rval and, if
 * @resp_pages was given, *@resp_len is set to the op's outdata_len.
 */
int ceph_osdc_call(struct ceph_osd_client *osdc,
		   struct ceph_object_id *oid,
		   struct ceph_object_locator *oloc,
		   const char *class, const char *method,
		   unsigned int flags,
		   struct page *req_page, size_t req_len,
		   struct page **resp_pages, size_t *resp_len)
{
	struct ceph_osd_request *req;
	int ret;

	if (req_len > PAGE_SIZE)
		return -E2BIG;

	req = ceph_osdc_alloc_request(osdc, NULL, 1, false, GFP_NOIO);
	if (!req)
		return -ENOMEM;

	ceph_oid_copy(&req->r_base_oid, oid);
	ceph_oloc_copy(&req->r_base_oloc, oloc);
	req->r_flags = flags;

	ret = osd_req_op_cls_init(req, 0, class, method);
	if (ret)
		goto out_put_req;

	if (req_page)
		osd_req_op_cls_request_data_pages(req, 0, &req_page, req_len,
						  0, false, false);
	if (resp_pages)
		osd_req_op_cls_response_data_pages(req, 0, resp_pages,
						   *resp_len, 0, false, false);

	ret = ceph_osdc_alloc_messages(req, GFP_NOIO);
	if (ret)
		goto out_put_req;

	ceph_osdc_start_request(osdc, req);
	ret = ceph_osdc_wait_request(osdc, req);
	if (ret >= 0) {
		ret = req->r_ops[0].rval;
		if (resp_pages)
			*resp_len = req->r_ops[0].outdata_len;
	}

out_put_req:
	ceph_osdc_put_request(req);
	return ret;
}
EXPORT_SYMBOL(ceph_osdc_call);

/*
 * reset all osd connections
 *
 * The next node is fetched before reopen_osd() is called on the
 * current one.
 * NOTE(review): presumably because reopen_osd() may remove the OSD
 * from the tree -- confirm.
 */
void ceph_osdc_reopen_osds(struct ceph_osd_client *osdc)
{
	struct rb_node *n;

	down_write(&osdc->lock);
	for (n = rb_first(&osdc->osds); n; ) {
		struct ceph_osd *osd = rb_entry(n, struct ceph_osd, o_node);

		n = rb_next(n);
		if (!reopen_osd(osd))
			kick_osd_requests(osd);
	}
	up_write(&osdc->lock);
}

/*
 * init, shutdown
 */
/*
 * Initialize @osdc for @client: trees, lists, the homeless OSD, the
 * osdmap, the request mempool, both message pools and both workqueues,
 * then arm the two periodic delayed works.
 *
 * Returns 0 on success or a negative error; on error everything set up
 * so far is torn down in reverse order.
 */
int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client)
{
	int err;

	dout("init\n");
	osdc->client = client;
	init_rwsem(&osdc->lock);
	osdc->osds = RB_ROOT;
	INIT_LIST_HEAD(&osdc->osd_lru);
	spin_lock_init(&osdc->osd_lru_lock);
	osd_init(&osdc->homeless_osd);
	osdc->homeless_osd.o_osdc = osdc;
	osdc->homeless_osd.o_osd = CEPH_HOMELESS_OSD;
	osdc->last_linger_id = CEPH_LINGER_ID_START;
	osdc->linger_requests = RB_ROOT;
	osdc->map_checks = RB_ROOT;
	osdc->linger_map_checks = RB_ROOT;
	INIT_DELAYED_WORK(&osdc->timeout_work, handle_timeout);
	INIT_DELAYED_WORK(&osdc->osds_timeout_work, handle_osds_timeout);

	err = -ENOMEM;
	osdc->osdmap = ceph_osdmap_alloc();
	if (!osdc->osdmap)
		goto out;

	osdc->req_mempool = mempool_create_slab_pool(10,
						     ceph_osd_request_cache);
	if (!osdc->req_mempool)
		goto out_map;

	err = ceph_msgpool_init(&osdc->msgpool_op, CEPH_MSG_OSD_OP,
				PAGE_SIZE, CEPH_OSD_SLAB_OPS, 10, "osd_op");
	if (err < 0)
		goto out_mempool;
	err = ceph_msgpool_init(&osdc->msgpool_op_reply, CEPH_MSG_OSD_OPREPLY,
				PAGE_SIZE, CEPH_OSD_SLAB_OPS, 10,
				"osd_op_reply");
	if (err < 0)
		goto out_msgpool;

	/* err was overwritten by the msgpool calls; reset it */
	err = -ENOMEM;
	osdc->notify_wq = create_singlethread_workqueue("ceph-watch-notify");
	if (!osdc->notify_wq)
		goto out_msgpool_reply;

	osdc->completion_wq = create_singlethread_workqueue("ceph-completion");
	if (!osdc->completion_wq)
		goto out_notify_wq;

	schedule_delayed_work(&osdc->timeout_work,
			      osdc->client->options->osd_keepalive_timeout);
	schedule_delayed_work(&osdc->osds_timeout_work,
	    round_jiffies_relative(osdc->client->options->osd_idle_ttl));

	return 0;

out_notify_wq:
	destroy_workqueue(osdc->notify_wq);
out_msgpool_reply:
	ceph_msgpool_destroy(&osdc->msgpool_op_reply);
out_msgpool:
	ceph_msgpool_destroy(&osdc->msgpool_op);
out_mempool:
	mempool_destroy(osdc->req_mempool);
out_map:
	ceph_osdmap_destroy(osdc->osdmap);
out:
	return err;
}

/*
 * Tear down @osdc: destroy the workqueues, cancel the delayed works,
 * close every OSD, clean up the homeless OSD, warn about anything still
 * registered, and free the osdmap and pools.
 */
void ceph_osdc_stop(struct ceph_osd_client *osdc)
{
	destroy_workqueue(osdc->completion_wq);
	destroy_workqueue(osdc->notify_wq);
	cancel_delayed_work_sync(&osdc->timeout_work);
	cancel_delayed_work_sync(&osdc->osds_timeout_work);

	down_write(&osdc->lock);
	/* NOTE(review): loop ends only if close_osd() unlinks osd -- confirm */
	while (!RB_EMPTY_ROOT(&osdc->osds)) {
		struct ceph_osd *osd = rb_entry(rb_first(&osdc->osds),
						struct ceph_osd, o_node);
		close_osd(osd);
	}
	up_write(&osdc->lock);
	WARN_ON(refcount_read(&osdc->homeless_osd.o_ref) != 1);
	osd_cleanup(&osdc->homeless_osd);

	/* nothing should still be registered at this point */
	WARN_ON(!list_empty(&osdc->osd_lru));
	WARN_ON(!RB_EMPTY_ROOT(&osdc->linger_requests));
	WARN_ON(!RB_EMPTY_ROOT(&osdc->map_checks));
	WARN_ON(!RB_EMPTY_ROOT(&osdc->linger_map_checks));
	WARN_ON(atomic_read(&osdc->num_requests));
	WARN_ON(atomic_read(&osdc->num_homeless));

	ceph_osdmap_destroy(osdc->osdmap);
	mempool_destroy(osdc->req_mempool);
	ceph_msgpool_destroy(&osdc->msgpool_op);
	ceph_msgpool_destroy(&osdc->msgpool_op_reply);
}

/*
 * Initialize op 0 of @req as CEPH_OSD_OP_COPY_FROM2.  The source object
 * name, source locator, @truncate_seq and @truncate_size are encoded
 * into a freshly allocated page that becomes the op's input data and
 * is owned by the request.
 *
 * Returns 0, or the error from ceph_alloc_page_vector().
 */
int osd_req_op_copy_from_init(struct ceph_osd_request *req,
			      u64 src_snapid, u64 src_version,
			      struct ceph_object_id *src_oid,
			      struct ceph_object_locator *src_oloc,
			      u32 src_fadvise_flags,
			      u32 dst_fadvise_flags,
			      u32 truncate_seq, u64 truncate_size,
			      u8 copy_from_flags)
{
	struct ceph_osd_req_op *op;
	struct page **pages;
	void *p, *end;

	pages = ceph_alloc_page_vector(1, GFP_KERNEL);
	if (IS_ERR(pages))
		return PTR_ERR(pages);

	op = osd_req_op_init(req, 0, CEPH_OSD_OP_COPY_FROM2,
			     dst_fadvise_flags);
	op->copy_from.snapid = src_snapid;
	op->copy_from.src_version = src_version;
	op->copy_from.flags = copy_from_flags;
	op->copy_from.src_fadvise_flags = src_fadvise_flags;

	p = page_address(pages[0]);
	end = p + PAGE_SIZE;
	ceph_encode_string(&p, end, src_oid->name, src_oid->name_len);
	encode_oloc(&p, end, src_oloc);
	ceph_encode_32(&p, truncate_seq);
	ceph_encode_64(&p, truncate_size);
	/* number of bytes actually encoded into the page */
	op->indata_len = PAGE_SIZE - (end - p);

	ceph_osd_data_pages_init(&op->copy_from.osd_data, pages,
				 op->indata_len, 0, false, true);
	return 0;
}
EXPORT_SYMBOL(osd_req_op_copy_from_init);

/*
 * Create the slab cache for requests.  Each object has room for a
 * struct ceph_osd_request followed by CEPH_OSD_SLAB_OPS ops.
 */
int __init ceph_osdc_setup(void)
{
	size_t size = sizeof(struct ceph_osd_request) +
	    CEPH_OSD_SLAB_OPS * sizeof(struct ceph_osd_req_op);

	BUG_ON(ceph_osd_request_cache);
	ceph_osd_request_cache = kmem_cache_create("ceph_osd_request", size,
						   0, 0, NULL);

	return ceph_osd_request_cache ? 0 : -ENOMEM;
}

/*
 * Destroy the request slab cache created by ceph_osdc_setup().
 */
void ceph_osdc_cleanup(void)
{
	BUG_ON(!ceph_osd_request_cache);
	kmem_cache_destroy(ceph_osd_request_cache);
	ceph_osd_request_cache = NULL;
}

/*
 * handle incoming message
 *
 * Dispatches on the header type and then drops a reference on @msg,
 * whatever the type was.
 */
static void osd_dispatch(struct ceph_connection *con, struct ceph_msg *msg)
{
	struct ceph_osd *osd = con->private;
	struct ceph_osd_client *osdc = osd->o_osdc;
	int type = le16_to_cpu(msg->hdr.type);

	switch (type) {
	case CEPH_MSG_OSD_MAP:
		ceph_osdc_handle_map(osdc, msg);
		break;
	case CEPH_MSG_OSD_OPREPLY:
		handle_reply(osd, msg);
		break;
	case CEPH_MSG_OSD_BACKOFF:
		handle_backoff(osd, msg);
		break;
	case CEPH_MSG_WATCH_NOTIFY:
		handle_watch_notify(osdc, msg);
		break;

	default:
		pr_err("received unknown message type %d %s\n", type,
		       ceph_msg_type_name(type));
	}

	ceph_msg_put(msg);
}

/* How much sparse data was requested? */
/*
 * Returns the sum of extent.length over all CEPH_OSD_OP_SPARSE_READ ops
 * of @req, or 0 if @req lacks CEPH_OSD_FLAG_READ.
 */
static u64 sparse_data_requested(struct ceph_osd_request *req)
{
	u64 len = 0;

	if (req->r_flags & CEPH_OSD_FLAG_READ) {
		int i;

		for (i = 0; i < req->r_num_ops; ++i) {
			struct ceph_osd_req_op *op = &req->r_ops[i];

			if (op->op == CEPH_OSD_OP_SPARSE_READ)
				len += op->extent.length;
		}
	}
	return len;
}

/*
 * Lookup and return message for incoming reply.  Don't try to do
 * anything about a larger than preallocated data portion of the
 * message at the moment - for now, just skip the message.
*/
/*
 * Returns a referenced message to receive the reply into, or NULL.
 * *@skip is set to 1 when the message should be skipped: the OSD is
 * not registered, the tid is unknown, or the data portion is larger
 * than preallocated and no sparse read was requested.  A NULL return
 * with *@skip left untouched means allocating a larger front failed.
 */
static struct ceph_msg *get_reply(struct ceph_connection *con,
				  struct ceph_msg_header *hdr,
				  int *skip)
{
	struct ceph_osd *osd = con->private;
	struct ceph_osd_client *osdc = osd->o_osdc;
	struct ceph_msg *m = NULL;
	struct ceph_osd_request *req;
	int front_len = le32_to_cpu(hdr->front_len);
	int data_len = le32_to_cpu(hdr->data_len);
	u64 tid = le64_to_cpu(hdr->tid);
	u64 srlen;

	down_read(&osdc->lock);
	if (!osd_registered(osd)) {
		dout("%s osd%d unknown, skipping\n", __func__, osd->o_osd);
		*skip = 1;
		goto out_unlock_osdc;
	}
	WARN_ON(osd->o_osd != le64_to_cpu(hdr->src.num));

	mutex_lock(&osd->lock);
	req = lookup_request(&osd->o_requests, tid);
	if (!req) {
		dout("%s osd%d tid %llu unknown, skipping\n", __func__,
		     osd->o_osd, tid);
		*skip = 1;
		goto out_unlock_session;
	}

	ceph_msg_revoke_incoming(req->r_reply);

	/* front too small: swap in a freshly allocated reply message */
	if (front_len > req->r_reply->front_alloc_len) {
		pr_warn("%s osd%d tid %llu front %d > preallocated %d\n",
			__func__, osd->o_osd, req->r_tid, front_len,
			req->r_reply->front_alloc_len);
		m = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, front_len, GFP_NOFS,
				 false);
		if (!m)
			goto out_unlock_session;
		ceph_msg_put(req->r_reply);
		req->r_reply = m;
	}

	/* the data length check is bypassed when a sparse read was asked for */
	srlen = sparse_data_requested(req);
	if (!srlen && data_len > req->r_reply->data_length) {
		pr_warn("%s osd%d tid %llu data %d > preallocated %zu, skipping\n",
			__func__, osd->o_osd, req->r_tid, data_len,
			req->r_reply->data_length);
		m = NULL;
		*skip = 1;
		goto out_unlock_session;
	}

	m = ceph_msg_get(req->r_reply);
	m->sparse_read_total = srlen;

	dout("get_reply tid %lld %p\n", tid, m);

out_unlock_session:
	mutex_unlock(&osd->lock);
out_unlock_osdc:
	up_read(&osdc->lock);
	return m;
}

/*
 * Allocate a message of the type and front length given in @hdr and,
 * if the header announces data, attach a page vector large enough for
 * data_len bytes (owned by the message).
 *
 * Returns the message, or NULL on allocation failure.
 */
static struct ceph_msg *alloc_msg_with_page_vector(struct ceph_msg_header *hdr)
{
	struct ceph_msg *m;
	int type = le16_to_cpu(hdr->type);
	u32 front_len = le32_to_cpu(hdr->front_len);
	u32 data_len = le32_to_cpu(hdr->data_len);

	m = ceph_msg_new2(type, front_len, 1, GFP_NOIO, false);
	if (!m)
		return NULL;

	if (data_len) {
		struct page **pages;

		pages = ceph_alloc_page_vector(calc_pages_for(0, data_len),
					       GFP_NOIO);
		if (IS_ERR(pages)) {
			ceph_msg_put(m);
			return NULL;
		}

		ceph_msg_data_add_pages(m, pages, data_len, 0, true);
	}

	return m;
}

/*
 * Pick a message to receive into, by header type: map, backoff and
 * watch-notify messages get a fresh allocation, op replies go through
 * get_reply().  Unknown types set *@skip to 1 and return NULL.
 */
static struct ceph_msg *osd_alloc_msg(struct ceph_connection *con,
				      struct ceph_msg_header *hdr,
				      int *skip)
{
	struct ceph_osd *osd = con->private;
	int type = le16_to_cpu(hdr->type);

	*skip = 0;
	switch (type) {
	case CEPH_MSG_OSD_MAP:
	case CEPH_MSG_OSD_BACKOFF:
	case CEPH_MSG_WATCH_NOTIFY:
		return alloc_msg_with_page_vector(hdr);
	case CEPH_MSG_OSD_OPREPLY:
		return get_reply(con, hdr, skip);
	default:
		pr_warn("%s osd%d unknown msg type %d, skipping\n", __func__,
			osd->o_osd, type);
		*skip = 1;
		return NULL;
	}
}

/*
 * Wrappers to refcount containing ceph_osd struct
 */
/* Returns @con if get_osd() on its OSD succeeded, NULL otherwise. */
static struct ceph_connection *osd_get_con(struct ceph_connection *con)
{
	struct ceph_osd *osd = con->private;
	if (get_osd(osd))
		return con;
	return NULL;
}

/* Calls put_osd() on the OSD that contains @con. */
static void osd_put_con(struct ceph_connection *con)
{
	struct ceph_osd *osd = con->private;
	put_osd(osd);
}

/*
 * authentication
 */

/*
 * Note: returned pointer is the address of a structure that's
 * managed separately.  Caller must *not* attempt to free it.
*/ static struct ceph_auth_handshake * osd_get_authorizer(struct ceph_connection *con, int *proto, int force_new) { struct ceph_osd *o = con->private; struct ceph_osd_client *osdc = o->o_osdc; struct ceph_auth_client *ac = osdc->client->monc.auth; struct ceph_auth_handshake *auth = &o->o_auth; int ret; ret = __ceph_auth_get_authorizer(ac, auth, CEPH_ENTITY_TYPE_OSD, force_new, proto, NULL, NULL); if (ret) return ERR_PTR(ret); return auth; } static int osd_add_authorizer_challenge(struct ceph_connection *con, void *challenge_buf, int challenge_buf_len) { struct ceph_osd *o = con->private; struct ceph_osd_client *osdc = o->o_osdc; struct ceph_auth_client *ac = osdc->client->monc.auth; return ceph_auth_add_authorizer_challenge(ac, o->o_auth.authorizer, challenge_buf, challenge_buf_len); } static int osd_verify_authorizer_reply(struct ceph_connection *con) { struct ceph_osd *o = con->private; struct ceph_osd_client *osdc = o->o_osdc; struct ceph_auth_client *ac = osdc->client->monc.auth; struct ceph_auth_handshake *auth = &o->o_auth; return ceph_auth_verify_authorizer_reply(ac, auth->authorizer, auth->authorizer_reply_buf, auth->authorizer_reply_buf_len, NULL, NULL, NULL, NULL); } static int osd_invalidate_authorizer(struct ceph_connection *con) { struct ceph_osd *o = con->private; struct ceph_osd_client *osdc = o->o_osdc; struct ceph_auth_client *ac = osdc->client->monc.auth; ceph_auth_invalidate_authorizer(ac, CEPH_ENTITY_TYPE_OSD); return ceph_monc_validate_auth(&osdc->client->monc); } static int osd_get_auth_request(struct ceph_connection *con, void *buf, int *buf_len, void **authorizer, int *authorizer_len) { struct ceph_osd *o = con->private; struct ceph_auth_client *ac = o->o_osdc->client->monc.auth; struct ceph_auth_handshake *auth = &o->o_auth; int ret; ret = ceph_auth_get_authorizer(ac, auth, CEPH_ENTITY_TYPE_OSD, buf, buf_len); if (ret) return ret; *authorizer = auth->authorizer_buf; *authorizer_len = auth->authorizer_buf_len; return 0; } static int 
osd_handle_auth_reply_more(struct ceph_connection *con, void *reply, int reply_len, void *buf, int *buf_len, void **authorizer, int *authorizer_len) { struct ceph_osd *o = con->private; struct ceph_auth_client *ac = o->o_osdc->client->monc.auth; struct ceph_auth_handshake *auth = &o->o_auth; int ret; ret = ceph_auth_handle_svc_reply_more(ac, auth, reply, reply_len, buf, buf_len); if (ret) return ret; *authorizer = auth->authorizer_buf; *authorizer_len = auth->authorizer_buf_len; return 0; } static int osd_handle_auth_done(struct ceph_connection *con, u64 global_id, void *reply, int reply_len, u8 *session_key, int *session_key_len, u8 *con_secret, int *con_secret_len) { struct ceph_osd *o = con->private; struct ceph_auth_client *ac = o->o_osdc->client->monc.auth; struct ceph_auth_handshake *auth = &o->o_auth; return ceph_auth_handle_svc_reply_done(ac, auth, reply, reply_len, session_key, session_key_len, con_secret, con_secret_len); } static int osd_handle_auth_bad_method(struct ceph_connection *con, int used_proto, int result, const int *allowed_protos, int proto_cnt, const int *allowed_modes, int mode_cnt) { struct ceph_osd *o = con->private; struct ceph_mon_client *monc = &o->o_osdc->client->monc; int ret; if (ceph_auth_handle_bad_authorizer(monc->auth, CEPH_ENTITY_TYPE_OSD, used_proto, result, allowed_protos, proto_cnt, allowed_modes, mode_cnt)) { ret = ceph_monc_validate_auth(monc); if (ret) return ret; } return -EACCES; } static void osd_reencode_message(struct ceph_msg *msg) { int type = le16_to_cpu(msg->hdr.type); if (type == CEPH_MSG_OSD_OP) encode_request_finish(msg); } static int osd_sign_message(struct ceph_msg *msg) { struct ceph_osd *o = msg->con->private; struct ceph_auth_handshake *auth = &o->o_auth; return ceph_auth_sign_message(auth, msg); } static int osd_check_message_signature(struct ceph_msg *msg) { struct ceph_osd *o = msg->con->private; struct ceph_auth_handshake *auth = &o->o_auth; return ceph_auth_check_message_signature(auth, msg); } 
static void advance_cursor(struct ceph_msg_data_cursor *cursor, size_t len, bool zero) { while (len) { struct page *page; size_t poff, plen; page = ceph_msg_data_next(cursor, &poff, &plen); if (plen > len) plen = len; if (zero) zero_user_segment(page, poff, poff + plen); len -= plen; ceph_msg_data_advance(cursor, plen); } } static int prep_next_sparse_read(struct ceph_connection *con, struct ceph_msg_data_cursor *cursor) { struct ceph_osd *o = con->private; struct ceph_sparse_read *sr = &o->o_sparse_read; struct ceph_osd_request *req; struct ceph_osd_req_op *op; spin_lock(&o->o_requests_lock); req = lookup_request(&o->o_requests, le64_to_cpu(con->in_msg->hdr.tid)); if (!req) { spin_unlock(&o->o_requests_lock); return -EBADR; } if (o->o_sparse_op_idx < 0) { dout("%s: [%d] starting new sparse read req\n", __func__, o->o_osd); } else { u64 end; op = &req->r_ops[o->o_sparse_op_idx]; WARN_ON_ONCE(op->extent.sparse_ext); /* hand back buffer we took earlier */ op->extent.sparse_ext = sr->sr_extent; sr->sr_extent = NULL; op->extent.sparse_ext_cnt = sr->sr_count; sr->sr_ext_len = 0; dout("%s: [%d] completed extent array len %d cursor->resid %zd\n", __func__, o->o_osd, op->extent.sparse_ext_cnt, cursor->resid); /* Advance to end of data for this operation */ end = ceph_sparse_ext_map_end(op); if (end < sr->sr_req_len) advance_cursor(cursor, sr->sr_req_len - end, false); } ceph_init_sparse_read(sr); /* find next op in this request (if any) */ while (++o->o_sparse_op_idx < req->r_num_ops) { op = &req->r_ops[o->o_sparse_op_idx]; if (op->op == CEPH_OSD_OP_SPARSE_READ) goto found; } /* reset for next sparse read request */ spin_unlock(&o->o_requests_lock); o->o_sparse_op_idx = -1; return 0; found: sr->sr_req_off = op->extent.offset; sr->sr_req_len = op->extent.length; sr->sr_pos = sr->sr_req_off; dout("%s: [%d] new sparse read op at idx %d 0x%llx~0x%llx\n", __func__, o->o_osd, o->o_sparse_op_idx, sr->sr_req_off, sr->sr_req_len); /* hand off request's sparse extent map buffer */ 
sr->sr_ext_len = op->extent.sparse_ext_cnt; op->extent.sparse_ext_cnt = 0; sr->sr_extent = op->extent.sparse_ext; op->extent.sparse_ext = NULL; spin_unlock(&o->o_requests_lock); return 1; } #ifdef __BIG_ENDIAN static inline void convert_extent_map(struct ceph_sparse_read *sr) { int i; for (i = 0; i < sr->sr_count; i++) { struct ceph_sparse_extent *ext = &sr->sr_extent[i]; ext->off = le64_to_cpu((__force __le64)ext->off); ext->len = le64_to_cpu((__force __le64)ext->len); } } #else static inline void convert_extent_map(struct ceph_sparse_read *sr) { } #endif static int osd_sparse_read(struct ceph_connection *con, struct ceph_msg_data_cursor *cursor, char **pbuf) { struct ceph_osd *o = con->private; struct ceph_sparse_read *sr = &o->o_sparse_read; u32 count = sr->sr_count; u64 eoff, elen, len = 0; int i, ret; switch (sr->sr_state) { case CEPH_SPARSE_READ_HDR: next_op: ret = prep_next_sparse_read(con, cursor); if (ret <= 0) return ret; /* number of extents */ ret = sizeof(sr->sr_count); *pbuf = (char *)&sr->sr_count; sr->sr_state = CEPH_SPARSE_READ_EXTENTS; break; case CEPH_SPARSE_READ_EXTENTS: /* Convert sr_count to host-endian */ count = le32_to_cpu((__force __le32)sr->sr_count); sr->sr_count = count; dout("[%d] got %u extents\n", o->o_osd, count); if (count > 0) { if (!sr->sr_extent || count > sr->sr_ext_len) { /* no extent array provided, or too short */ kfree(sr->sr_extent); sr->sr_extent = kmalloc_array(count, sizeof(*sr->sr_extent), GFP_NOIO); if (!sr->sr_extent) { pr_err("%s: failed to allocate %u extents\n", __func__, count); return -ENOMEM; } sr->sr_ext_len = count; } ret = count * sizeof(*sr->sr_extent); *pbuf = (char *)sr->sr_extent; sr->sr_state = CEPH_SPARSE_READ_DATA_LEN; break; } /* No extents? 
Read data len */ fallthrough; case CEPH_SPARSE_READ_DATA_LEN: convert_extent_map(sr); ret = sizeof(sr->sr_datalen); *pbuf = (char *)&sr->sr_datalen; sr->sr_state = CEPH_SPARSE_READ_DATA_PRE; break; case CEPH_SPARSE_READ_DATA_PRE: /* Convert sr_datalen to host-endian */ sr->sr_datalen = le32_to_cpu((__force __le32)sr->sr_datalen); for (i = 0; i < count; i++) len += sr->sr_extent[i].len; if (sr->sr_datalen != len) { pr_warn_ratelimited("data len %u != extent len %llu\n", sr->sr_datalen, len); return -EREMOTEIO; } sr->sr_state = CEPH_SPARSE_READ_DATA; fallthrough; case CEPH_SPARSE_READ_DATA: if (sr->sr_index >= count) { sr->sr_state = CEPH_SPARSE_READ_HDR; goto next_op; } eoff = sr->sr_extent[sr->sr_index].off; elen = sr->sr_extent[sr->sr_index].len; dout("[%d] ext %d off 0x%llx len 0x%llx\n", o->o_osd, sr->sr_index, eoff, elen); if (elen > INT_MAX) { dout("Sparse read extent length too long (0x%llx)\n", elen); return -EREMOTEIO; } /* zero out anything from sr_pos to start of extent */ if (sr->sr_pos < eoff) advance_cursor(cursor, eoff - sr->sr_pos, true); /* Set position to end of extent */ sr->sr_pos = eoff + elen; /* send back the new length and nullify the ptr */ cursor->sr_resid = elen; ret = elen; *pbuf = NULL; /* Bump the array index */ ++sr->sr_index; break; } return ret; } static const struct ceph_connection_operations osd_con_ops = { .get = osd_get_con, .put = osd_put_con, .sparse_read = osd_sparse_read, .alloc_msg = osd_alloc_msg, .dispatch = osd_dispatch, .fault = osd_fault, .reencode_message = osd_reencode_message, .get_authorizer = osd_get_authorizer, .add_authorizer_challenge = osd_add_authorizer_challenge, .verify_authorizer_reply = osd_verify_authorizer_reply, .invalidate_authorizer = osd_invalidate_authorizer, .sign_message = osd_sign_message, .check_message_signature = osd_check_message_signature, .get_auth_request = osd_get_auth_request, .handle_auth_reply_more = osd_handle_auth_reply_more, .handle_auth_done = osd_handle_auth_done, 
.handle_auth_bad_method = osd_handle_auth_bad_method, };
109 4 40 229 21 1 86 249 249 146 107 107 107 8 107 107 58 2 122 11 164 166 8 14 1 18 163 129 308 6 1 440 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 
501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 
1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 
1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 
1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM bcachefs

#if !defined(_TRACE_BCACHEFS_H) || defined(TRACE_HEADER_MULTI_READ)

#include <linux/tracepoint.h>

/*
 * Declares the three trace-entry fields used to record a position:
 * <name>_inode (u64), <name>_offset (u64) and <name>_snapshot (u32).
 */
#define TRACE_BPOS_entries(name) \
	__field(u64, name##_inode ) \
	__field(u64, name##_offset ) \
	__field(u32, name##_snapshot )

/*
 * Copies src.inode / src.offset / src.snapshot into the <dst>_* entry
 * fields declared by TRACE_BPOS_entries(dst).  No trailing semicolon:
 * the caller supplies it.
 */
#define TRACE_BPOS_assign(dst, src) \
	__entry->dst##_inode = (src).inode; \
	__entry->dst##_offset = (src).offset; \
	__entry->dst##_snapshot = (src).snapshot

/* Class: a single position, printed as inode:offset:snapshot. */
DECLARE_EVENT_CLASS(bpos,
	TP_PROTO(const struct bpos *p),
	TP_ARGS(p),

	TP_STRUCT__entry(
		TRACE_BPOS_entries(p)
	),

	TP_fast_assign(
		TRACE_BPOS_assign(p, *p);
	),

	TP_printk("%llu:%llu:%u", __entry->p_inode, __entry->p_offset, __entry->p_snapshot)
);

/*
 * Class: filesystem device plus a caller-supplied string; the string is
 * printed on its own line after "major,minor".
 */
DECLARE_EVENT_CLASS(fs_str,
	TP_PROTO(struct bch_fs *c, const char *str),
	TP_ARGS(c, str),

	TP_STRUCT__entry(
		__field(dev_t, dev )
		__string(str, str )
	),

	TP_fast_assign(
		__entry->dev = c->dev;
		__assign_str(str);
	),

	TP_printk("%d,%d\n%s", MAJOR(__entry->dev), MINOR(__entry->dev), __get_str(str))
);

/*
 * Class: device, transaction function name (copied into a 32-byte array
 * with strscpy), caller address and a caller-supplied string.
 */
DECLARE_EVENT_CLASS(trans_str,
	TP_PROTO(struct btree_trans *trans, unsigned long caller_ip, const char *str),
	TP_ARGS(trans, caller_ip, str),

	TP_STRUCT__entry(
		__field(dev_t, dev )
		__array(char, trans_fn, 32 )
		__field(unsigned long, caller_ip )
		__string(str, str )
	),

	TP_fast_assign(
		__entry->dev = trans->c->dev;
		strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
		__entry->caller_ip = caller_ip;
		__assign_str(str);
	),

	TP_printk("%d,%d %s %pS %s",
		  MAJOR(__entry->dev), MINOR(__entry->dev),
		  __entry->trans_fn, (void *) __entry->caller_ip, __get_str(str))
);

/* Class: same as trans_str but without the caller address. */
DECLARE_EVENT_CLASS(trans_str_nocaller,
	TP_PROTO(struct btree_trans *trans, const char *str),
	TP_ARGS(trans, str),

	TP_STRUCT__entry(
		__field(dev_t, dev )
		__array(char, trans_fn, 32 )
		__string(str, str )
	),

	TP_fast_assign(
		__entry->dev = trans->c->dev;
		strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
		__assign_str(str);
	),

	TP_printk("%d,%d %s %s",
		  MAJOR(__entry->dev), MINOR(__entry->dev),
		  __entry->trans_fn, __get_str(str))
);

/*
 * Class: a btree node identified by device, level, btree id and the
 * position taken from b->key.k.p; no transaction is involved.
 */
DECLARE_EVENT_CLASS(btree_node_nofs,
	TP_PROTO(struct bch_fs *c, struct btree *b),
	TP_ARGS(c, b),

	TP_STRUCT__entry(
		__field(dev_t, dev )
		__field(u8, level )
		__field(u8, btree_id )
		TRACE_BPOS_entries(pos)
	),

	TP_fast_assign(
		__entry->dev = c->dev;
		__entry->level = b->c.level;
		__entry->btree_id = b->c.btree_id;
		TRACE_BPOS_assign(pos, b->key.k.p);
	),

	TP_printk("%d,%d %u %s %llu:%llu:%u",
		  MAJOR(__entry->dev), MINOR(__entry->dev),
		  __entry->level,
		  bch2_btree_id_str(__entry->btree_id),
		  __entry->pos_inode, __entry->pos_offset, __entry->pos_snapshot)
);

/*
 * Class: like btree_node_nofs, but the device comes from trans->c and the
 * transaction function name is recorded as well.
 */
DECLARE_EVENT_CLASS(btree_node,
	TP_PROTO(struct btree_trans *trans, struct btree *b),
	TP_ARGS(trans, b),

	TP_STRUCT__entry(
		__field(dev_t, dev )
		__array(char, trans_fn, 32 )
		__field(u8, level )
		__field(u8, btree_id )
		TRACE_BPOS_entries(pos)
	),

	TP_fast_assign(
		__entry->dev = trans->c->dev;
		strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
		__entry->level = b->c.level;
		__entry->btree_id = b->c.btree_id;
		TRACE_BPOS_assign(pos, b->key.k.p);
	),

	TP_printk("%d,%d %s %u %s %llu:%llu:%u",
		  MAJOR(__entry->dev), MINOR(__entry->dev), __entry->trans_fn,
		  __entry->level,
		  bch2_btree_id_str(__entry->btree_id),
		  __entry->pos_inode, __entry->pos_offset, __entry->pos_snapshot)
);

/* Class: records only the filesystem's device number. */
DECLARE_EVENT_CLASS(bch_fs,
	TP_PROTO(struct bch_fs *c),
	TP_ARGS(c),

	TP_STRUCT__entry(
		__field(dev_t, dev )
	),

	TP_fast_assign(
		__entry->dev = c->dev;
	),

	TP_printk("%d,%d", MAJOR(__entry->dev), MINOR(__entry->dev))
);

/* Class: device number plus the transaction function name. */
DECLARE_EVENT_CLASS(btree_trans,
	TP_PROTO(struct btree_trans *trans),
	TP_ARGS(trans),

	TP_STRUCT__entry(
		__field(dev_t, dev )
		__array(char, trans_fn, 32 )
	),

	TP_fast_assign(
		__entry->dev = trans->c->dev;
		strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
	),

	TP_printk("%d,%d %s", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->trans_fn)
);

/*
 * Class: a bio.  dev is 0 when the bio has no bi_bdev; nr_sector is
 * bi_size shifted right by 9; rwbs is filled by blk_fill_rwbs().
 */
DECLARE_EVENT_CLASS(bio,
	TP_PROTO(struct bio *bio),
	TP_ARGS(bio),

	TP_STRUCT__entry(
		__field(dev_t, dev )
		__field(sector_t, sector )
		__field(unsigned int, nr_sector )
		__array(char, rwbs, 6 )
	),

	TP_fast_assign(
		__entry->dev = bio->bi_bdev ? bio_dev(bio) : 0;
		__entry->sector = bio->bi_iter.bi_sector;
		__entry->nr_sector = bio->bi_iter.bi_size >> 9;
		blk_fill_rwbs(__entry->rwbs, bio->bi_opf);
	),

	TP_printk("%d,%d %s %llu + %u",
		  MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs,
		  (unsigned long long)__entry->sector, __entry->nr_sector)
);

/* disk_accounting.c */

/* Records c->accounting.k.nr together with the caller-supplied string. */
TRACE_EVENT(accounting_mem_insert,
	TP_PROTO(struct bch_fs *c, const char *acc),
	TP_ARGS(c, acc),

	TP_STRUCT__entry(
		__field(dev_t, dev )
		__field(unsigned, new_nr )
		__string(acc, acc )
	),

	TP_fast_assign(
		__entry->dev = c->dev;
		__entry->new_nr = c->accounting.k.nr;
		__assign_str(acc);
	),

	TP_printk("%d,%d entries %u added %s",
		  MAJOR(__entry->dev), MINOR(__entry->dev),
		  __entry->new_nr,
		  __get_str(acc))
);

/* fs.c: */

/* Records the superblock's device and the wait flag. */
TRACE_EVENT(bch2_sync_fs,
	TP_PROTO(struct super_block *sb, int wait),

	TP_ARGS(sb, wait),

	TP_STRUCT__entry(
		__field( dev_t, dev )
		__field( int, wait )

	),

	TP_fast_assign(
		__entry->dev = sb->s_dev;
		__entry->wait = wait;
	),

	TP_printk("dev %d,%d wait %d",
		  MAJOR(__entry->dev), MINOR(__entry->dev),
		  __entry->wait)
);

/* fs-io.c: */

/*
 * Records device, inode number and parent inode number of the file's
 * dentry, plus the datasync flag.
 */
TRACE_EVENT(bch2_fsync,
	TP_PROTO(struct file *file, int datasync),

	TP_ARGS(file, datasync),

	TP_STRUCT__entry(
		__field( dev_t, dev )
		__field( ino_t, ino )
		__field( ino_t, parent )
		__field( int, datasync )
	),

	TP_fast_assign(
		struct dentry *dentry = file->f_path.dentry;

		__entry->dev = dentry->d_sb->s_dev;
		__entry->ino = d_inode(dentry)->i_ino;
		__entry->parent = d_inode(dentry->d_parent)->i_ino;
		__entry->datasync = datasync;
	),

	TP_printk("dev %d,%d ino %lu parent %lu datasync %d ",
		  MAJOR(__entry->dev), MINOR(__entry->dev),
		  (unsigned long) __entry->ino,
		  (unsigned long) __entry->parent, __entry->datasync)
);

/* super-io.c: */

/* Records the device and the address passed in @ip (printed with %pS). */
TRACE_EVENT(write_super,
	TP_PROTO(struct bch_fs *c, unsigned long ip),
	TP_ARGS(c, ip),

	TP_STRUCT__entry(
		__field(dev_t, dev )
		__field(unsigned long, ip )
	),

	TP_fast_assign(
		__entry->dev = c->dev;
		__entry->ip = ip;
	),

	TP_printk("%d,%d for %pS",
		  MAJOR(__entry->dev), MINOR(__entry->dev),
		  (void *) __entry->ip)
);

/* io.c: */

DEFINE_EVENT(bio, read_promote,
	TP_PROTO(struct bio *bio),
	TP_ARGS(bio)
);

/* Records the error as text: bch2_err_str(ret) copied into 32 bytes. */
TRACE_EVENT(read_nopromote,
	TP_PROTO(struct bch_fs *c, int ret),
	TP_ARGS(c, ret),

	TP_STRUCT__entry(
		__field(dev_t, dev )
		__array(char, ret, 32 )
	),

	TP_fast_assign(
		__entry->dev = c->dev;
		strscpy(__entry->ret, bch2_err_str(ret), sizeof(__entry->ret));
	),

	TP_printk("%d,%d ret %s",
		  MAJOR(__entry->dev), MINOR(__entry->dev),
		  __entry->ret)
);

DEFINE_EVENT(bio, read_bounce,
	TP_PROTO(struct bio *bio),
	TP_ARGS(bio)
);

DEFINE_EVENT(bio, read_split,
	TP_PROTO(struct bio *bio),
	TP_ARGS(bio)
);

DEFINE_EVENT(bio, read_retry,
	TP_PROTO(struct bio *bio),
	TP_ARGS(bio)
);

DEFINE_EVENT(bio, read_reuse_race,
	TP_PROTO(struct bio *bio),
	TP_ARGS(bio)
);

/* Journal */

DEFINE_EVENT(bch_fs, journal_full,
	TP_PROTO(struct bch_fs *c),
	TP_ARGS(c)
);

DEFINE_EVENT(fs_str, journal_entry_full,
	TP_PROTO(struct bch_fs *c, const char *str),
	TP_ARGS(c, str)
);

DEFINE_EVENT(fs_str, journal_entry_close,
	TP_PROTO(struct bch_fs *c, const char *str),
	TP_ARGS(c, str)
);

DEFINE_EVENT(bio, journal_write,
	TP_PROTO(struct bio *bio),
	TP_ARGS(bio)
);
TRACE_EVENT(journal_reclaim_start, TP_PROTO(struct bch_fs *c, bool direct, bool kicked, u64 min_nr, u64 min_key_cache, u64 btree_cache_dirty, u64 btree_cache_total, u64 btree_key_cache_dirty, u64 btree_key_cache_total), TP_ARGS(c, direct, kicked, min_nr, min_key_cache, btree_cache_dirty, btree_cache_total, btree_key_cache_dirty, btree_key_cache_total), TP_STRUCT__entry( __field(dev_t, dev ) __field(bool, direct ) __field(bool, kicked ) __field(u64, min_nr ) __field(u64, min_key_cache ) __field(u64, btree_cache_dirty ) __field(u64, btree_cache_total ) __field(u64, btree_key_cache_dirty ) __field(u64, btree_key_cache_total ) ), TP_fast_assign( __entry->dev = c->dev; __entry->direct = direct; __entry->kicked = kicked; __entry->min_nr = min_nr; __entry->min_key_cache = min_key_cache; __entry->btree_cache_dirty = btree_cache_dirty; __entry->btree_cache_total = btree_cache_total; __entry->btree_key_cache_dirty = btree_key_cache_dirty; __entry->btree_key_cache_total = btree_key_cache_total; ), TP_printk("%d,%d direct %u kicked %u min %llu key cache %llu btree cache %llu/%llu key cache %llu/%llu", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->direct, __entry->kicked, __entry->min_nr, __entry->min_key_cache, __entry->btree_cache_dirty, __entry->btree_cache_total, __entry->btree_key_cache_dirty, __entry->btree_key_cache_total) ); TRACE_EVENT(journal_reclaim_finish, TP_PROTO(struct bch_fs *c, u64 nr_flushed), TP_ARGS(c, nr_flushed), TP_STRUCT__entry( __field(dev_t, dev ) __field(u64, nr_flushed ) ), TP_fast_assign( __entry->dev = c->dev; __entry->nr_flushed = nr_flushed; ), TP_printk("%d,%d flushed %llu", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->nr_flushed) ); /* bset.c: */ DEFINE_EVENT(bpos, bkey_pack_pos_fail, TP_PROTO(const struct bpos *p), TP_ARGS(p) ); /* Btree cache: */ TRACE_EVENT(btree_cache_scan, TP_PROTO(long nr_to_scan, long can_free, long ret), TP_ARGS(nr_to_scan, can_free, ret), TP_STRUCT__entry( __field(long, nr_to_scan ) __field(long, can_free ) 
__field(long, ret ) ), TP_fast_assign( __entry->nr_to_scan = nr_to_scan; __entry->can_free = can_free; __entry->ret = ret; ), TP_printk("scanned for %li nodes, can free %li, ret %li", __entry->nr_to_scan, __entry->can_free, __entry->ret) ); DEFINE_EVENT(btree_node_nofs, btree_cache_reap, TP_PROTO(struct bch_fs *c, struct btree *b), TP_ARGS(c, b) ); DEFINE_EVENT(btree_trans, btree_cache_cannibalize_lock_fail, TP_PROTO(struct btree_trans *trans), TP_ARGS(trans) ); DEFINE_EVENT(btree_trans, btree_cache_cannibalize_lock, TP_PROTO(struct btree_trans *trans), TP_ARGS(trans) ); DEFINE_EVENT(btree_trans, btree_cache_cannibalize, TP_PROTO(struct btree_trans *trans), TP_ARGS(trans) ); DEFINE_EVENT(btree_trans, btree_cache_cannibalize_unlock, TP_PROTO(struct btree_trans *trans), TP_ARGS(trans) ); /* Btree */ DEFINE_EVENT(btree_node, btree_node_read, TP_PROTO(struct btree_trans *trans, struct btree *b), TP_ARGS(trans, b) ); TRACE_EVENT(btree_node_write, TP_PROTO(struct btree *b, unsigned bytes, unsigned sectors), TP_ARGS(b, bytes, sectors), TP_STRUCT__entry( __field(enum btree_node_type, type) __field(unsigned, bytes ) __field(unsigned, sectors ) ), TP_fast_assign( __entry->type = btree_node_type(b); __entry->bytes = bytes; __entry->sectors = sectors; ), TP_printk("bkey type %u bytes %u sectors %u", __entry->type , __entry->bytes, __entry->sectors) ); DEFINE_EVENT(btree_node, btree_node_alloc, TP_PROTO(struct btree_trans *trans, struct btree *b), TP_ARGS(trans, b) ); DEFINE_EVENT(btree_node, btree_node_free, TP_PROTO(struct btree_trans *trans, struct btree *b), TP_ARGS(trans, b) ); TRACE_EVENT(btree_reserve_get_fail, TP_PROTO(const char *trans_fn, unsigned long caller_ip, size_t required, int ret), TP_ARGS(trans_fn, caller_ip, required, ret), TP_STRUCT__entry( __array(char, trans_fn, 32 ) __field(unsigned long, caller_ip ) __field(size_t, required ) __array(char, ret, 32 ) ), TP_fast_assign( strscpy(__entry->trans_fn, trans_fn, sizeof(__entry->trans_fn)); __entry->caller_ip = 
caller_ip; __entry->required = required; strscpy(__entry->ret, bch2_err_str(ret), sizeof(__entry->ret)); ), TP_printk("%s %pS required %zu ret %s", __entry->trans_fn, (void *) __entry->caller_ip, __entry->required, __entry->ret) ); DEFINE_EVENT(btree_node, btree_node_compact, TP_PROTO(struct btree_trans *trans, struct btree *b), TP_ARGS(trans, b) ); DEFINE_EVENT(btree_node, btree_node_merge, TP_PROTO(struct btree_trans *trans, struct btree *b), TP_ARGS(trans, b) ); DEFINE_EVENT(btree_node, btree_node_split, TP_PROTO(struct btree_trans *trans, struct btree *b), TP_ARGS(trans, b) ); DEFINE_EVENT(btree_node, btree_node_rewrite, TP_PROTO(struct btree_trans *trans, struct btree *b), TP_ARGS(trans, b) ); DEFINE_EVENT(btree_node, btree_node_set_root, TP_PROTO(struct btree_trans *trans, struct btree *b), TP_ARGS(trans, b) ); TRACE_EVENT(btree_path_relock_fail, TP_PROTO(struct btree_trans *trans, unsigned long caller_ip, struct btree_path *path, unsigned level), TP_ARGS(trans, caller_ip, path, level), TP_STRUCT__entry( __array(char, trans_fn, 32 ) __field(unsigned long, caller_ip ) __field(u8, btree_id ) __field(u8, level ) __field(u8, path_idx) TRACE_BPOS_entries(pos) __array(char, node, 24 ) __field(u8, self_read_count ) __field(u8, self_intent_count) __field(u8, read_count ) __field(u8, intent_count ) __field(u32, iter_lock_seq ) __field(u32, node_lock_seq ) ), TP_fast_assign( struct btree *b = btree_path_node(path, level); struct six_lock_count c; strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); __entry->caller_ip = caller_ip; __entry->btree_id = path->btree_id; __entry->level = level; __entry->path_idx = path - trans->paths; TRACE_BPOS_assign(pos, path->pos); c = bch2_btree_node_lock_counts(trans, NULL, &path->l[level].b->c, level); __entry->self_read_count = c.n[SIX_LOCK_read]; __entry->self_intent_count = c.n[SIX_LOCK_intent]; if (IS_ERR(b)) { strscpy(__entry->node, bch2_err_str(PTR_ERR(b)), sizeof(__entry->node)); } else { c = 
six_lock_counts(&path->l[level].b->c.lock); __entry->read_count = c.n[SIX_LOCK_read]; __entry->intent_count = c.n[SIX_LOCK_intent]; scnprintf(__entry->node, sizeof(__entry->node), "%px", &b->c); } __entry->iter_lock_seq = path->l[level].lock_seq; __entry->node_lock_seq = is_btree_node(path, level) ? six_lock_seq(&path->l[level].b->c.lock) : 0; ), TP_printk("%s %pS\nidx %2u btree %s pos %llu:%llu:%u level %u node %s held %u:%u lock count %u:%u iter seq %u lock seq %u", __entry->trans_fn, (void *) __entry->caller_ip, __entry->path_idx, bch2_btree_id_str(__entry->btree_id), __entry->pos_inode, __entry->pos_offset, __entry->pos_snapshot, __entry->level, __entry->node, __entry->self_read_count, __entry->self_intent_count, __entry->read_count, __entry->intent_count, __entry->iter_lock_seq, __entry->node_lock_seq) ); TRACE_EVENT(btree_path_upgrade_fail, TP_PROTO(struct btree_trans *trans, unsigned long caller_ip, struct btree_path *path, unsigned level), TP_ARGS(trans, caller_ip, path, level), TP_STRUCT__entry( __array(char, trans_fn, 32 ) __field(unsigned long, caller_ip ) __field(u8, btree_id ) __field(u8, level ) __field(u8, path_idx) TRACE_BPOS_entries(pos) __field(u8, locked ) __field(u8, self_read_count ) __field(u8, self_intent_count) __field(u8, read_count ) __field(u8, intent_count ) __field(u32, iter_lock_seq ) __field(u32, node_lock_seq ) ), TP_fast_assign( struct six_lock_count c; strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); __entry->caller_ip = caller_ip; __entry->btree_id = path->btree_id; __entry->level = level; __entry->path_idx = path - trans->paths; TRACE_BPOS_assign(pos, path->pos); __entry->locked = btree_node_locked(path, level); c = bch2_btree_node_lock_counts(trans, NULL, &path->l[level].b->c, level), __entry->self_read_count = c.n[SIX_LOCK_read]; __entry->self_intent_count = c.n[SIX_LOCK_intent]; c = six_lock_counts(&path->l[level].b->c.lock); __entry->read_count = c.n[SIX_LOCK_read]; __entry->intent_count = 
c.n[SIX_LOCK_intent]; __entry->iter_lock_seq = path->l[level].lock_seq; __entry->node_lock_seq = is_btree_node(path, level) ? six_lock_seq(&path->l[level].b->c.lock) : 0; ), TP_printk("%s %pS\nidx %2u btree %s pos %llu:%llu:%u level %u locked %u held %u:%u lock count %u:%u iter seq %u lock seq %u", __entry->trans_fn, (void *) __entry->caller_ip, __entry->path_idx, bch2_btree_id_str(__entry->btree_id), __entry->pos_inode, __entry->pos_offset, __entry->pos_snapshot, __entry->level, __entry->locked, __entry->self_read_count, __entry->self_intent_count, __entry->read_count, __entry->intent_count, __entry->iter_lock_seq, __entry->node_lock_seq) ); /* Garbage collection */ DEFINE_EVENT(bch_fs, gc_gens_start, TP_PROTO(struct bch_fs *c), TP_ARGS(c) ); DEFINE_EVENT(bch_fs, gc_gens_end, TP_PROTO(struct bch_fs *c), TP_ARGS(c) ); /* Allocator */ DEFINE_EVENT(fs_str, bucket_alloc, TP_PROTO(struct bch_fs *c, const char *str), TP_ARGS(c, str) ); DEFINE_EVENT(fs_str, bucket_alloc_fail, TP_PROTO(struct bch_fs *c, const char *str), TP_ARGS(c, str) ); TRACE_EVENT(discard_buckets, TP_PROTO(struct bch_fs *c, u64 seen, u64 open, u64 need_journal_commit, u64 discarded, const char *err), TP_ARGS(c, seen, open, need_journal_commit, discarded, err), TP_STRUCT__entry( __field(dev_t, dev ) __field(u64, seen ) __field(u64, open ) __field(u64, need_journal_commit ) __field(u64, discarded ) __array(char, err, 16 ) ), TP_fast_assign( __entry->dev = c->dev; __entry->seen = seen; __entry->open = open; __entry->need_journal_commit = need_journal_commit; __entry->discarded = discarded; strscpy(__entry->err, err, sizeof(__entry->err)); ), TP_printk("%d%d seen %llu open %llu need_journal_commit %llu discarded %llu err %s", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->seen, __entry->open, __entry->need_journal_commit, __entry->discarded, __entry->err) ); TRACE_EVENT(bucket_invalidate, TP_PROTO(struct bch_fs *c, unsigned dev, u64 bucket, u32 sectors), TP_ARGS(c, dev, bucket, sectors), 
TP_STRUCT__entry( __field(dev_t, dev ) __field(u32, dev_idx ) __field(u32, sectors ) __field(u64, bucket ) ), TP_fast_assign( __entry->dev = c->dev; __entry->dev_idx = dev; __entry->sectors = sectors; __entry->bucket = bucket; ), TP_printk("%d:%d invalidated %u:%llu cached sectors %u", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->dev_idx, __entry->bucket, __entry->sectors) ); /* Moving IO */ TRACE_EVENT(bucket_evacuate, TP_PROTO(struct bch_fs *c, struct bpos *bucket), TP_ARGS(c, bucket), TP_STRUCT__entry( __field(dev_t, dev ) __field(u32, dev_idx ) __field(u64, bucket ) ), TP_fast_assign( __entry->dev = c->dev; __entry->dev_idx = bucket->inode; __entry->bucket = bucket->offset; ), TP_printk("%d:%d %u:%llu", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->dev_idx, __entry->bucket) ); DEFINE_EVENT(fs_str, move_extent, TP_PROTO(struct bch_fs *c, const char *str), TP_ARGS(c, str) ); DEFINE_EVENT(fs_str, move_extent_read, TP_PROTO(struct bch_fs *c, const char *str), TP_ARGS(c, str) ); DEFINE_EVENT(fs_str, move_extent_write, TP_PROTO(struct bch_fs *c, const char *str), TP_ARGS(c, str) ); DEFINE_EVENT(fs_str, move_extent_finish, TP_PROTO(struct bch_fs *c, const char *str), TP_ARGS(c, str) ); DEFINE_EVENT(fs_str, move_extent_fail, TP_PROTO(struct bch_fs *c, const char *str), TP_ARGS(c, str) ); DEFINE_EVENT(fs_str, move_extent_start_fail, TP_PROTO(struct bch_fs *c, const char *str), TP_ARGS(c, str) ); TRACE_EVENT(move_data, TP_PROTO(struct bch_fs *c, struct bch_move_stats *stats), TP_ARGS(c, stats), TP_STRUCT__entry( __field(dev_t, dev ) __field(u64, keys_moved ) __field(u64, keys_raced ) __field(u64, sectors_seen ) __field(u64, sectors_moved ) __field(u64, sectors_raced ) ), TP_fast_assign( __entry->dev = c->dev; __entry->keys_moved = atomic64_read(&stats->keys_moved); __entry->keys_raced = atomic64_read(&stats->keys_raced); __entry->sectors_seen = atomic64_read(&stats->sectors_seen); __entry->sectors_moved = atomic64_read(&stats->sectors_moved); 
__entry->sectors_raced = atomic64_read(&stats->sectors_raced); ), TP_printk("%d,%d keys moved %llu raced %llu" "sectors seen %llu moved %llu raced %llu", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->keys_moved, __entry->keys_raced, __entry->sectors_seen, __entry->sectors_moved, __entry->sectors_raced) ); TRACE_EVENT(evacuate_bucket, TP_PROTO(struct bch_fs *c, struct bpos *bucket, unsigned sectors, unsigned bucket_size, int ret), TP_ARGS(c, bucket, sectors, bucket_size, ret), TP_STRUCT__entry( __field(dev_t, dev ) __field(u64, member ) __field(u64, bucket ) __field(u32, sectors ) __field(u32, bucket_size ) __field(int, ret ) ), TP_fast_assign( __entry->dev = c->dev; __entry->member = bucket->inode; __entry->bucket = bucket->offset; __entry->sectors = sectors; __entry->bucket_size = bucket_size; __entry->ret = ret; ), TP_printk("%d,%d %llu:%llu sectors %u/%u ret %i", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->member, __entry->bucket, __entry->sectors, __entry->bucket_size, __entry->ret) ); TRACE_EVENT(copygc, TP_PROTO(struct bch_fs *c, u64 buckets, u64 sectors_seen, u64 sectors_moved), TP_ARGS(c, buckets, sectors_seen, sectors_moved), TP_STRUCT__entry( __field(dev_t, dev ) __field(u64, buckets ) __field(u64, sectors_seen ) __field(u64, sectors_moved ) ), TP_fast_assign( __entry->dev = c->dev; __entry->buckets = buckets; __entry->sectors_seen = sectors_seen; __entry->sectors_moved = sectors_moved; ), TP_printk("%d,%d buckets %llu sectors seen %llu moved %llu", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->buckets, __entry->sectors_seen, __entry->sectors_moved) ); TRACE_EVENT(copygc_wait, TP_PROTO(struct bch_fs *c, u64 wait_amount, u64 until), TP_ARGS(c, wait_amount, until), TP_STRUCT__entry( __field(dev_t, dev ) __field(u64, wait_amount ) __field(u64, until ) ), TP_fast_assign( __entry->dev = c->dev; __entry->wait_amount = wait_amount; __entry->until = until; ), TP_printk("%d,%u waiting for %llu sectors until %llu", MAJOR(__entry->dev), 
		  MINOR(__entry->dev),
		  __entry->wait_amount, __entry->until)
);

/* btree transactions: */

/*
 * transaction_event - event class recording the transaction's function name
 * (trans->fn, copied into a 32 byte array and therefore truncated if longer)
 * and the caller's instruction pointer, printed symbolically with %pS.
 */
DECLARE_EVENT_CLASS(transaction_event,
	TP_PROTO(struct btree_trans *trans,
		 unsigned long caller_ip),
	TP_ARGS(trans, caller_ip),

	TP_STRUCT__entry(
		__array(char,			trans_fn, 32	)
		__field(unsigned long,		caller_ip	)
	),

	TP_fast_assign(
		strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
		__entry->caller_ip		= caller_ip;
	),

	TP_printk("%s %pS", __entry->trans_fn, (void *) __entry->caller_ip)
);

DEFINE_EVENT(transaction_event,	transaction_commit,
	TP_PROTO(struct btree_trans *trans,
		 unsigned long caller_ip),
	TP_ARGS(trans, caller_ip)
);

DEFINE_EVENT(transaction_event,	trans_restart_injected,
	TP_PROTO(struct btree_trans *trans,
		 unsigned long caller_ip),
	TP_ARGS(trans, caller_ip)
);

/*
 * trans_restart_split_race - in addition to the transaction name and caller,
 * records the node's level, its written count, btree_blocks() for the
 * filesystem and the value of bch2_btree_keys_u64s_remaining() for the node.
 */
TRACE_EVENT(trans_restart_split_race,
	TP_PROTO(struct btree_trans *trans,
		 unsigned long caller_ip,
		 struct btree *b),
	TP_ARGS(trans, caller_ip, b),

	TP_STRUCT__entry(
		__array(char,			trans_fn, 32	)
		__field(unsigned long,		caller_ip	)
		__field(u8,			level		)
		__field(u16,			written		)
		__field(u16,			blocks		)
		__field(u16,			u64s_remaining	)
	),

	TP_fast_assign(
		strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
		__entry->caller_ip		= caller_ip;
		__entry->level			= b->c.level;
		__entry->written		= b->written;
		__entry->blocks			= btree_blocks(trans->c);
		__entry->u64s_remaining		= bch2_btree_keys_u64s_remaining(b);
	),

	TP_printk("%s %pS l=%u written %u/%u u64s remaining %u",
		  __entry->trans_fn, (void *) __entry->caller_ip,
		  __entry->level,
		  __entry->written, __entry->blocks,
		  __entry->u64s_remaining)
);

/*
 * trans_blocked_journal_reclaim - records key cache counters read from
 * trans->c->btree_key_cache and the must_wait value (assigned further down).
 */
TRACE_EVENT(trans_blocked_journal_reclaim,
	TP_PROTO(struct btree_trans *trans,
		 unsigned long caller_ip),
	TP_ARGS(trans, caller_ip),

	TP_STRUCT__entry(
		__array(char,			trans_fn, 32	)
		__field(unsigned long,		caller_ip	)

		__field(unsigned long,		key_cache_nr_keys	)
		__field(unsigned long,		key_cache_nr_dirty	)
		__field(long,			must_wait		)
	),

	TP_fast_assign(
		strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
		__entry->caller_ip		= caller_ip;
		/* Counters are sampled with atomic_long_read() when the event fires */
		__entry->key_cache_nr_keys	= atomic_long_read(&trans->c->btree_key_cache.nr_keys);
		__entry->key_cache_nr_dirty	= atomic_long_read(&trans->c->btree_key_cache.nr_dirty);
		__entry->must_wait		= __bch2_btree_key_cache_must_wait(trans->c);
	),

	TP_printk("%s %pS key cache keys %lu dirty %lu must_wait %li",
		  __entry->trans_fn, (void *) __entry->caller_ip,
		  __entry->key_cache_nr_keys,
		  __entry->key_cache_nr_dirty,
		  __entry->must_wait)
);

/*
 * trans_restart_journal_preres_get - records the transaction name, caller
 * and the @flags value, which is printed in hexadecimal.
 */
TRACE_EVENT(trans_restart_journal_preres_get,
	TP_PROTO(struct btree_trans *trans,
		 unsigned long caller_ip,
		 unsigned flags),
	TP_ARGS(trans, caller_ip, flags),

	TP_STRUCT__entry(
		__array(char,			trans_fn, 32	)
		__field(unsigned long,		caller_ip	)
		__field(unsigned,		flags		)
	),

	TP_fast_assign(
		strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
		__entry->caller_ip		= caller_ip;
		__entry->flags			= flags;
	),

	TP_printk("%s %pS %x", __entry->trans_fn,
		  (void *) __entry->caller_ip,
		  __entry->flags)
);

DEFINE_EVENT(transaction_event,	trans_restart_fault_inject,
	TP_PROTO(struct btree_trans *trans,
		 unsigned long caller_ip),
	TP_ARGS(trans, caller_ip)
);

DEFINE_EVENT(transaction_event,	trans_traverse_all,
	TP_PROTO(struct btree_trans *trans,
		 unsigned long caller_ip),
	TP_ARGS(trans, caller_ip)
);

DEFINE_EVENT(transaction_event,	trans_restart_key_cache_raced,
	TP_PROTO(struct btree_trans *trans,
		 unsigned long caller_ip),
	TP_ARGS(trans, caller_ip)
);

/* Instance of the trans_str class; takes a string argument named @paths */
DEFINE_EVENT(trans_str, trans_restart_too_many_iters,
	TP_PROTO(struct btree_trans *trans,
		 unsigned long caller_ip,
		 const char *paths),
	TP_ARGS(trans, caller_ip, paths)
);

/*
 * transaction_restart_iter - event class recording the transaction name,
 * caller, and the btree id and position of a btree_path.
 */
DECLARE_EVENT_CLASS(transaction_restart_iter,
	TP_PROTO(struct btree_trans *trans,
		 unsigned long caller_ip,
		 struct btree_path *path),
	TP_ARGS(trans, caller_ip, path),

	TP_STRUCT__entry(
		__array(char,			trans_fn, 32	)
		__field(unsigned long,		caller_ip	)
		__field(u8,			btree_id	)
		TRACE_BPOS_entries(pos)
	),

	TP_fast_assign(
		strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
		__entry->caller_ip		= caller_ip;
		__entry->btree_id		= path->btree_id;
		/* No trailing semicolon here in the original; presumably supplied by the macro — confirm */
		TRACE_BPOS_assign(pos, path->pos)
	),

	TP_printk("%s %pS btree %s pos %llu:%llu:%u",
		  __entry->trans_fn,
		  (void *) __entry->caller_ip,
		  bch2_btree_id_str(__entry->btree_id),
		  __entry->pos_inode,
		  __entry->pos_offset,
		  __entry->pos_snapshot)
);

DEFINE_EVENT(transaction_restart_iter,	trans_restart_btree_node_reused,
	TP_PROTO(struct btree_trans *trans,
		 unsigned long caller_ip,
		 struct btree_path *path),
	TP_ARGS(trans, caller_ip, path)
);

DEFINE_EVENT(transaction_restart_iter,	trans_restart_btree_node_split,
	TP_PROTO(struct btree_trans *trans,
		 unsigned long caller_ip,
		 struct btree_path *path),
	TP_ARGS(trans, caller_ip, path)
);

/*
 * trans_restart_upgrade - records the old and new locks_want values, the
 * level taken from @f->l, the path's lock sequence number at that level and
 * the node's lock sequence number (0 when @f->b is NULL or an error pointer).
 */
TRACE_EVENT(trans_restart_upgrade,
	TP_PROTO(struct btree_trans *trans,
		 unsigned long caller_ip,
		 struct btree_path *path,
		 unsigned old_locks_want,
		 unsigned new_locks_want,
		 struct get_locks_fail *f),
	TP_ARGS(trans, caller_ip, path, old_locks_want, new_locks_want, f),

	TP_STRUCT__entry(
		__array(char,			trans_fn, 32	)
		__field(unsigned long,		caller_ip	)
		__field(u8,			btree_id	)
		__field(u8,			old_locks_want	)
		__field(u8,			new_locks_want	)
		__field(u8,			level		)
		__field(u32,			path_seq	)
		__field(u32,			node_seq	)
		TRACE_BPOS_entries(pos)
	),

	TP_fast_assign(
		strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
		__entry->caller_ip		= caller_ip;
		__entry->btree_id		= path->btree_id;
		__entry->old_locks_want		= old_locks_want;
		__entry->new_locks_want		= new_locks_want;
		__entry->level			= f->l;
		__entry->path_seq		= path->l[f->l].lock_seq;
		/* Guard against dereferencing a NULL or ERR_PTR node */
		__entry->node_seq		= IS_ERR_OR_NULL(f->b) ?
0 : f->b->c.lock.seq;
		TRACE_BPOS_assign(pos, path->pos)
	),

	/* Prints the position first, then "locks_want old -> new", level and both sequence numbers */
	TP_printk("%s %pS btree %s pos %llu:%llu:%u locks_want %u -> %u level %u path seq %u node seq %u",
		  __entry->trans_fn,
		  (void *) __entry->caller_ip,
		  bch2_btree_id_str(__entry->btree_id),
		  __entry->pos_inode,
		  __entry->pos_offset,
		  __entry->pos_snapshot,
		  __entry->old_locks_want,
		  __entry->new_locks_want,
		  __entry->level,
		  __entry->path_seq,
		  __entry->node_seq)
);

/* Instance of the trans_str class; takes a string argument */
DEFINE_EVENT(trans_str,	trans_restart_relock,
	TP_PROTO(struct btree_trans *trans, unsigned long caller_ip, const char *str),
	TP_ARGS(trans, caller_ip, str)
);

/*
 * The events below reuse either the transaction_restart_iter class (those
 * taking a btree_path) or the transaction_event class (name and caller only).
 */
DEFINE_EVENT(transaction_restart_iter,	trans_restart_relock_next_node,
	TP_PROTO(struct btree_trans *trans,
		 unsigned long caller_ip,
		 struct btree_path *path),
	TP_ARGS(trans, caller_ip, path)
);

DEFINE_EVENT(transaction_restart_iter,	trans_restart_relock_parent_for_fill,
	TP_PROTO(struct btree_trans *trans,
		 unsigned long caller_ip,
		 struct btree_path *path),
	TP_ARGS(trans, caller_ip, path)
);

DEFINE_EVENT(transaction_restart_iter,	trans_restart_relock_after_fill,
	TP_PROTO(struct btree_trans *trans,
		 unsigned long caller_ip,
		 struct btree_path *path),
	TP_ARGS(trans, caller_ip, path)
);

DEFINE_EVENT(transaction_event,	trans_restart_key_cache_upgrade,
	TP_PROTO(struct btree_trans *trans,
		 unsigned long caller_ip),
	TP_ARGS(trans, caller_ip)
);

DEFINE_EVENT(transaction_restart_iter,	trans_restart_relock_key_cache_fill,
	TP_PROTO(struct btree_trans *trans,
		 unsigned long caller_ip,
		 struct btree_path *path),
	TP_ARGS(trans, caller_ip, path)
);

DEFINE_EVENT(transaction_restart_iter,	trans_restart_relock_path,
	TP_PROTO(struct btree_trans *trans,
		 unsigned long caller_ip,
		 struct btree_path *path),
	TP_ARGS(trans, caller_ip, path)
);

DEFINE_EVENT(transaction_restart_iter,	trans_restart_relock_path_intent,
	TP_PROTO(struct btree_trans *trans,
		 unsigned long caller_ip,
		 struct btree_path *path),
	TP_ARGS(trans, caller_ip, path)
);

DEFINE_EVENT(transaction_restart_iter,	trans_restart_traverse,
	TP_PROTO(struct btree_trans *trans,
unsigned long caller_ip,
		 struct btree_path *path),
	TP_ARGS(trans, caller_ip, path)
);

DEFINE_EVENT(transaction_restart_iter,	trans_restart_memory_allocation_failure,
	TP_PROTO(struct btree_trans *trans,
		 unsigned long caller_ip,
		 struct btree_path *path),
	TP_ARGS(trans, caller_ip, path)
);

/* Instance of the trans_str_nocaller class; takes a string named @cycle and no caller ip */
DEFINE_EVENT(trans_str_nocaller, trans_restart_would_deadlock,
	TP_PROTO(struct btree_trans *trans,
		 const char *cycle),
	TP_ARGS(trans, cycle)
);

DEFINE_EVENT(transaction_event,	trans_restart_would_deadlock_recursion_limit,
	TP_PROTO(struct btree_trans *trans,
		 unsigned long caller_ip),
	TP_ARGS(trans, caller_ip)
);

/* trans_restart_would_deadlock_write - records only the transaction's function name. */
TRACE_EVENT(trans_restart_would_deadlock_write,
	TP_PROTO(struct btree_trans *trans),
	TP_ARGS(trans),

	TP_STRUCT__entry(
		__array(char,			trans_fn, 32	)
	),

	TP_fast_assign(
		strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
	),

	TP_printk("%s", __entry->trans_fn)
);

/* trans_restart_mem_realloced - records the transaction name, caller and a byte count. */
TRACE_EVENT(trans_restart_mem_realloced,
	TP_PROTO(struct btree_trans *trans,
		 unsigned long caller_ip,
		 unsigned long bytes),
	TP_ARGS(trans, caller_ip, bytes),

	TP_STRUCT__entry(
		__array(char,			trans_fn, 32	)
		__field(unsigned long,		caller_ip	)
		__field(unsigned long,		bytes		)
	),

	TP_fast_assign(
		strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
		__entry->caller_ip	= caller_ip;
		__entry->bytes		= bytes;
	),

	TP_printk("%s %pS bytes %lu",
		  __entry->trans_fn,
		  (void *) __entry->caller_ip,
		  __entry->bytes)
);

/*
 * trans_restart_key_cache_key_realloced - records the path's btree id and
 * position together with the old and new u64s values passed by the caller.
 */
TRACE_EVENT(trans_restart_key_cache_key_realloced,
	TP_PROTO(struct btree_trans *trans,
		 unsigned long caller_ip,
		 struct btree_path *path,
		 unsigned old_u64s,
		 unsigned new_u64s),
	TP_ARGS(trans, caller_ip, path, old_u64s, new_u64s),

	TP_STRUCT__entry(
		__array(char,			trans_fn, 32	)
		__field(unsigned long,		caller_ip	)
		__field(enum btree_id,		btree_id	)
		TRACE_BPOS_entries(pos)
		__field(u32,			old_u64s	)
		__field(u32,			new_u64s	)
	),

	TP_fast_assign(
		strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
		__entry->caller_ip		= caller_ip;

		__entry->btree_id	= path->btree_id;
		TRACE_BPOS_assign(pos, path->pos);
		__entry->old_u64s	= old_u64s;
		__entry->new_u64s	= new_u64s;
	),

	TP_printk("%s %pS btree %s pos %llu:%llu:%u old_u64s %u new_u64s %u",
		  __entry->trans_fn,
		  (void *) __entry->caller_ip,
		  bch2_btree_id_str(__entry->btree_id),
		  __entry->pos_inode,
		  __entry->pos_offset,
		  __entry->pos_snapshot,
		  __entry->old_u64s,
		  __entry->new_u64s)
);

DEFINE_EVENT(transaction_event,	trans_restart_write_buffer_flush,
	TP_PROTO(struct btree_trans *trans,
		 unsigned long caller_ip),
	TP_ARGS(trans, caller_ip)
);

/*
 * path_downgrade - records the previous locks_want (passed in by the caller)
 * and the current one (read from @path->locks_want), plus the path's btree
 * id and position.
 */
TRACE_EVENT(path_downgrade,
	TP_PROTO(struct btree_trans *trans,
		 unsigned long caller_ip,
		 struct btree_path *path,
		 unsigned old_locks_want),
	TP_ARGS(trans, caller_ip, path, old_locks_want),

	TP_STRUCT__entry(
		__array(char,			trans_fn, 32	)
		__field(unsigned long,		caller_ip	)
		__field(unsigned,		old_locks_want	)
		__field(unsigned,		new_locks_want	)
		__field(unsigned,		btree		)
		TRACE_BPOS_entries(pos)
	),

	TP_fast_assign(
		strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
		__entry->caller_ip		= caller_ip;
		__entry->old_locks_want		= old_locks_want;
		__entry->new_locks_want		= path->locks_want;
		__entry->btree			= path->btree_id;
		TRACE_BPOS_assign(pos, path->pos);
	),

	TP_printk("%s %pS locks_want %u -> %u %s %llu:%llu:%u",
		  __entry->trans_fn,
		  (void *) __entry->caller_ip,
		  __entry->old_locks_want,
		  __entry->new_locks_want,
		  bch2_btree_id_str(__entry->btree),
		  __entry->pos_inode,
		  __entry->pos_offset,
		  __entry->pos_snapshot)
);

/*
 * key_cache_fill - records the transaction name and a copy of the @key
 * string, stored through the __string()/__assign_str() helpers.
 */
TRACE_EVENT(key_cache_fill,
	TP_PROTO(struct btree_trans *trans, const char *key),
	TP_ARGS(trans, key),

	TP_STRUCT__entry(
		__array(char,		trans_fn, 32	)
		__string(key,		key		)
	),

	TP_fast_assign(
		strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
		__assign_str(key);
	),

	TP_printk("%s %s", __entry->trans_fn, __get_str(key))
);

/*
 * write_buffer_flush - records four size_t counters; @trans is accepted but
 * not read by the part of the event visible here.
 */
TRACE_EVENT(write_buffer_flush,
	TP_PROTO(struct btree_trans *trans, size_t nr, size_t skipped, size_t fast, size_t size),
	TP_ARGS(trans, nr, skipped, fast, size),

	TP_STRUCT__entry(
		__field(size_t,		nr		)
		__field(size_t,		skipped		)
		__field(size_t,		fast		)
__field(size_t, size ) ), TP_fast_assign( __entry->nr = nr; __entry->skipped = skipped; __entry->fast = fast; __entry->size = size; ), TP_printk("%zu/%zu skipped %zu fast %zu", __entry->nr, __entry->size, __entry->skipped, __entry->fast) ); TRACE_EVENT(write_buffer_flush_sync, TP_PROTO(struct btree_trans *trans, unsigned long caller_ip), TP_ARGS(trans, caller_ip), TP_STRUCT__entry( __array(char, trans_fn, 32 ) __field(unsigned long, caller_ip ) ), TP_fast_assign( strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); __entry->caller_ip = caller_ip; ), TP_printk("%s %pS", __entry->trans_fn, (void *) __entry->caller_ip) ); TRACE_EVENT(write_buffer_flush_slowpath, TP_PROTO(struct btree_trans *trans, size_t slowpath, size_t total), TP_ARGS(trans, slowpath, total), TP_STRUCT__entry( __field(size_t, slowpath ) __field(size_t, total ) ), TP_fast_assign( __entry->slowpath = slowpath; __entry->total = total; ), TP_printk("%zu/%zu", __entry->slowpath, __entry->total) ); TRACE_EVENT(write_buffer_maybe_flush, TP_PROTO(struct btree_trans *trans, unsigned long caller_ip, const char *key), TP_ARGS(trans, caller_ip, key), TP_STRUCT__entry( __array(char, trans_fn, 32 ) __field(unsigned long, caller_ip ) __string(key, key ) ), TP_fast_assign( strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); __assign_str(key); ), TP_printk("%s %pS %s", __entry->trans_fn, (void *) __entry->caller_ip, __get_str(key)) ); DEFINE_EVENT(fs_str, rebalance_extent, TP_PROTO(struct bch_fs *c, const char *str), TP_ARGS(c, str) ); DEFINE_EVENT(fs_str, data_update, TP_PROTO(struct bch_fs *c, const char *str), TP_ARGS(c, str) ); TRACE_EVENT(error_downcast, TP_PROTO(int bch_err, int std_err, unsigned long ip), TP_ARGS(bch_err, std_err, ip), TP_STRUCT__entry( __array(char, bch_err, 32 ) __array(char, std_err, 32 ) __array(char, ip, 32 ) ), TP_fast_assign( strscpy(__entry->bch_err, bch2_err_str(bch_err), sizeof(__entry->bch_err)); strscpy(__entry->std_err, bch2_err_str(std_err), 
sizeof(__entry->std_err));
		/* The ip is rendered to text with "%ps" at record time rather than at print time */
		snprintf(__entry->ip, sizeof(__entry->ip), "%ps", (void *) ip);
	),

	TP_printk("%s -> %s %s", __entry->bch_err, __entry->std_err, __entry->ip)
);

/* The path tracepoints that follow are only defined when this option is enabled */
#ifdef CONFIG_BCACHEFS_PATH_TRACEPOINTS

/*
 * update_by_path - records the index of @path within trans->paths, the
 * path's btree id and position, the @overwrite flag, the index of @i within
 * trans->updates and the transaction's nr_updates.
 */
TRACE_EVENT(update_by_path,
	TP_PROTO(struct btree_trans *trans, struct btree_path *path,
		 struct btree_insert_entry *i, bool overwrite),
	TP_ARGS(trans, path, i, overwrite),

	TP_STRUCT__entry(
		__array(char,			trans_fn, 32	)
		__field(btree_path_idx_t,	path_idx	)
		__field(u8,			btree_id	)
		TRACE_BPOS_entries(pos)
		__field(u8,			overwrite	)
		__field(btree_path_idx_t,	update_idx	)
		__field(btree_path_idx_t,	nr_updates	)
	),

	TP_fast_assign(
		strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
		/* Indices are derived by pointer subtraction from the array base */
		__entry->path_idx		= path - trans->paths;
		__entry->btree_id		= path->btree_id;
		TRACE_BPOS_assign(pos, path->pos);
		__entry->overwrite		= overwrite;
		__entry->update_idx		= i - trans->updates;
		__entry->nr_updates		= trans->nr_updates;
	),

	TP_printk("%s path %3u btree %s pos %llu:%llu:%u overwrite %u update %u/%u",
		  __entry->trans_fn,
		  __entry->path_idx,
		  bch2_btree_id_str(__entry->btree_id),
		  __entry->pos_inode,
		  __entry->pos_offset,
		  __entry->pos_snapshot,
		  __entry->overwrite,
		  __entry->update_idx,
		  __entry->nr_updates)
);

/*
 * btree_path_lock - records the btree id and level of @b, the address of @b
 * formatted with "%px" into a 24 byte array, and six_lock_seq() of its lock.
 */
TRACE_EVENT(btree_path_lock,
	TP_PROTO(struct btree_trans *trans,
		 unsigned long caller_ip,
		 struct btree_bkey_cached_common *b),
	TP_ARGS(trans, caller_ip, b),

	TP_STRUCT__entry(
		__array(char,			trans_fn, 32	)
		__field(unsigned long,		caller_ip	)
		__field(u8,			btree_id	)
		__field(u8,			level		)
		__array(char,			node, 24	)
		__field(u32,			lock_seq	)
	),

	TP_fast_assign(
		strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
		__entry->caller_ip		= caller_ip;
		__entry->btree_id		= b->btree_id;
		__entry->level			= b->level;

		scnprintf(__entry->node, sizeof(__entry->node), "%px", b);
		__entry->lock_seq		= six_lock_seq(&b->lock);
	),

	TP_printk("%s %pS\nbtree %s level %u node %s lock seq %u",
		  __entry->trans_fn,
		  (void *) __entry->caller_ip,
		  bch2_btree_id_str(__entry->btree_id),
		  __entry->level,
		  __entry->node,
		  __entry->lock_seq)
);

/*
 * btree_path_ev - event class recording a path's index within trans->paths,
 * its ref count, btree id and position.
 */
DECLARE_EVENT_CLASS(btree_path_ev,
	TP_PROTO(struct btree_trans *trans, struct btree_path *path),
	TP_ARGS(trans, path),

	TP_STRUCT__entry(
		__field(u16,			idx		)
		__field(u8,			ref		)
		__field(u8,			btree_id	)
		TRACE_BPOS_entries(pos)
	),

	TP_fast_assign(
		__entry->idx			= path - trans->paths;
		__entry->ref			= path->ref;
		__entry->btree_id		= path->btree_id;
		TRACE_BPOS_assign(pos, path->pos);
	),

	TP_printk("path %3u ref %u btree %s pos %llu:%llu:%u",
		  __entry->idx, __entry->ref,
		  bch2_btree_id_str(__entry->btree_id),
		  __entry->pos_inode,
		  __entry->pos_offset,
		  __entry->pos_snapshot)
);

DEFINE_EVENT(btree_path_ev, btree_path_get_ll,
	TP_PROTO(struct btree_trans *trans, struct btree_path *path),
	TP_ARGS(trans, path)
);

DEFINE_EVENT(btree_path_ev, btree_path_put_ll,
	TP_PROTO(struct btree_trans *trans, struct btree_path *path),
	TP_ARGS(trans, path)
);

DEFINE_EVENT(btree_path_ev, btree_path_should_be_locked,
	TP_PROTO(struct btree_trans *trans, struct btree_path *path),
	TP_ARGS(trans, path)
);

/* btree_path_alloc - records a path's index, locks_want, btree id and position. */
TRACE_EVENT(btree_path_alloc,
	TP_PROTO(struct btree_trans *trans, struct btree_path *path),
	TP_ARGS(trans, path),

	TP_STRUCT__entry(
		__field(btree_path_idx_t,	idx		)
		__field(u8,			locks_want	)
		__field(u8,			btree_id	)
		TRACE_BPOS_entries(pos)
	),

	TP_fast_assign(
		__entry->idx			= path - trans->paths;
		__entry->locks_want		= path->locks_want;
		__entry->btree_id		= path->btree_id;
		TRACE_BPOS_assign(pos, path->pos);
	),

	TP_printk("path %3u btree %s locks_want %u pos %llu:%llu:%u",
		  __entry->idx,
		  bch2_btree_id_str(__entry->btree_id),
		  __entry->locks_want,
		  __entry->pos_inode,
		  __entry->pos_offset,
		  __entry->pos_snapshot)
);

/*
 * btree_path_get - records a path's index, ref, preserve flag, locks_want
 * and btree id, plus two positions: the path's current one (old_pos) and
 * the one pointed to by @new_pos.
 */
TRACE_EVENT(btree_path_get,
	TP_PROTO(struct btree_trans *trans, struct btree_path *path, struct bpos *new_pos),
	TP_ARGS(trans, path, new_pos),

	TP_STRUCT__entry(
		__field(btree_path_idx_t,	idx		)
		__field(u8,			ref		)
		__field(u8,			preserve	)
		__field(u8,			locks_want	)
		__field(u8,			btree_id	)
		TRACE_BPOS_entries(old_pos)
		TRACE_BPOS_entries(new_pos)
	),

	TP_fast_assign(
		__entry->idx			= path -
trans->paths; __entry->ref = path->ref; __entry->preserve = path->preserve; __entry->locks_want = path->locks_want; __entry->btree_id = path->btree_id; TRACE_BPOS_assign(old_pos, path->pos); TRACE_BPOS_assign(new_pos, *new_pos); ), TP_printk(" path %3u ref %u preserve %u btree %s locks_want %u pos %llu:%llu:%u -> %llu:%llu:%u", __entry->idx, __entry->ref, __entry->preserve, bch2_btree_id_str(__entry->btree_id), __entry->locks_want, __entry->old_pos_inode, __entry->old_pos_offset, __entry->old_pos_snapshot, __entry->new_pos_inode, __entry->new_pos_offset, __entry->new_pos_snapshot) ); DECLARE_EVENT_CLASS(btree_path_clone, TP_PROTO(struct btree_trans *trans, struct btree_path *path, struct btree_path *new), TP_ARGS(trans, path, new), TP_STRUCT__entry( __field(btree_path_idx_t, idx ) __field(u8, new_idx ) __field(u8, btree_id ) __field(u8, ref ) __field(u8, preserve ) TRACE_BPOS_entries(pos) ), TP_fast_assign( __entry->idx = path - trans->paths; __entry->new_idx = new - trans->paths; __entry->btree_id = path->btree_id; __entry->ref = path->ref; __entry->preserve = path->preserve; TRACE_BPOS_assign(pos, path->pos); ), TP_printk(" path %3u ref %u preserve %u btree %s %llu:%llu:%u -> %u", __entry->idx, __entry->ref, __entry->preserve, bch2_btree_id_str(__entry->btree_id), __entry->pos_inode, __entry->pos_offset, __entry->pos_snapshot, __entry->new_idx) ); DEFINE_EVENT(btree_path_clone, btree_path_clone, TP_PROTO(struct btree_trans *trans, struct btree_path *path, struct btree_path *new), TP_ARGS(trans, path, new) ); DEFINE_EVENT(btree_path_clone, btree_path_save_pos, TP_PROTO(struct btree_trans *trans, struct btree_path *path, struct btree_path *new), TP_ARGS(trans, path, new) ); DECLARE_EVENT_CLASS(btree_path_traverse, TP_PROTO(struct btree_trans *trans, struct btree_path *path), TP_ARGS(trans, path), TP_STRUCT__entry( __array(char, trans_fn, 32 ) __field(btree_path_idx_t, idx ) __field(u8, ref ) __field(u8, preserve ) __field(u8, should_be_locked ) __field(u8, btree_id 
) __field(u8, level ) TRACE_BPOS_entries(pos) __field(u8, locks_want ) __field(u8, nodes_locked ) __array(char, node0, 24 ) __array(char, node1, 24 ) __array(char, node2, 24 ) __array(char, node3, 24 ) ), TP_fast_assign( strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); __entry->idx = path - trans->paths; __entry->ref = path->ref; __entry->preserve = path->preserve; __entry->btree_id = path->btree_id; __entry->level = path->level; TRACE_BPOS_assign(pos, path->pos); __entry->locks_want = path->locks_want; __entry->nodes_locked = path->nodes_locked; struct btree *b = path->l[0].b; if (IS_ERR(b)) strscpy(__entry->node0, bch2_err_str(PTR_ERR(b)), sizeof(__entry->node0)); else scnprintf(__entry->node0, sizeof(__entry->node0), "%px", &b->c); b = path->l[1].b; if (IS_ERR(b)) strscpy(__entry->node1, bch2_err_str(PTR_ERR(b)), sizeof(__entry->node0)); else scnprintf(__entry->node1, sizeof(__entry->node0), "%px", &b->c); b = path->l[2].b; if (IS_ERR(b)) strscpy(__entry->node2, bch2_err_str(PTR_ERR(b)), sizeof(__entry->node0)); else scnprintf(__entry->node2, sizeof(__entry->node0), "%px", &b->c); b = path->l[3].b; if (IS_ERR(b)) strscpy(__entry->node3, bch2_err_str(PTR_ERR(b)), sizeof(__entry->node0)); else scnprintf(__entry->node3, sizeof(__entry->node0), "%px", &b->c); ), TP_printk("%s\npath %3u ref %u preserve %u btree %s %llu:%llu:%u level %u locks_want %u\n" "locks %u %u %u %u node %s %s %s %s", __entry->trans_fn, __entry->idx, __entry->ref, __entry->preserve, bch2_btree_id_str(__entry->btree_id), __entry->pos_inode, __entry->pos_offset, __entry->pos_snapshot, __entry->level, __entry->locks_want, (__entry->nodes_locked >> 6) & 3, (__entry->nodes_locked >> 4) & 3, (__entry->nodes_locked >> 2) & 3, (__entry->nodes_locked >> 0) & 3, __entry->node3, __entry->node2, __entry->node1, __entry->node0) ); DEFINE_EVENT(btree_path_traverse, btree_path_traverse_start, TP_PROTO(struct btree_trans *trans, struct btree_path *path), TP_ARGS(trans, path) ); 
DEFINE_EVENT(btree_path_traverse, btree_path_traverse_end,
	TP_PROTO(struct btree_trans *trans,
		 struct btree_path *path),
	TP_ARGS(trans, path)
);

/*
 * btree_path_set_pos - records a path's index, ref, preserve flag and btree
 * id, its current position (old_pos) and the position pointed to by
 * @new_pos, plus lock state and a string for the node at each of levels 0-3.
 *
 * locks_want is part of the entry and is therefore stored from
 * path->locks_want, so the record carries no unassigned field.
 */
TRACE_EVENT(btree_path_set_pos,
	TP_PROTO(struct btree_trans *trans,
		 struct btree_path *path,
		 struct bpos *new_pos),
	TP_ARGS(trans, path, new_pos),

	TP_STRUCT__entry(
		__field(btree_path_idx_t,	idx		)
		__field(u8,			ref		)
		__field(u8,			preserve	)
		__field(u8,			btree_id	)
		TRACE_BPOS_entries(old_pos)
		TRACE_BPOS_entries(new_pos)
		__field(u8,			locks_want	)
		__field(u8,			nodes_locked	)
		__array(char,			node0, 24	)
		__array(char,			node1, 24	)
		__array(char,			node2, 24	)
		__array(char,			node3, 24	)
	),

	TP_fast_assign(
		__entry->idx			= path - trans->paths;
		__entry->ref			= path->ref;
		__entry->preserve		= path->preserve;
		__entry->btree_id		= path->btree_id;
		TRACE_BPOS_assign(old_pos, path->pos);
		TRACE_BPOS_assign(new_pos, *new_pos);

		__entry->locks_want		= path->locks_want;
		__entry->nodes_locked		= path->nodes_locked;

		/*
		 * Error name for ERR_PTR nodes, otherwise the address of the
		 * node's embedded 'c' member; each buffer is bounded by its
		 * own size.
		 */
		struct btree *b = path->l[0].b;
		if (IS_ERR(b))
			strscpy(__entry->node0, bch2_err_str(PTR_ERR(b)), sizeof(__entry->node0));
		else
			scnprintf(__entry->node0, sizeof(__entry->node0), "%px", &b->c);
		b = path->l[1].b;
		if (IS_ERR(b))
			strscpy(__entry->node1, bch2_err_str(PTR_ERR(b)), sizeof(__entry->node1));
		else
			scnprintf(__entry->node1, sizeof(__entry->node1), "%px", &b->c);
		b = path->l[2].b;
		if (IS_ERR(b))
			strscpy(__entry->node2, bch2_err_str(PTR_ERR(b)), sizeof(__entry->node2));
		else
			scnprintf(__entry->node2, sizeof(__entry->node2), "%px", &b->c);
		b = path->l[3].b;
		if (IS_ERR(b))
			strscpy(__entry->node3, bch2_err_str(PTR_ERR(b)), sizeof(__entry->node3));
		else
			scnprintf(__entry->node3, sizeof(__entry->node3), "%px", &b->c);
	),

	TP_printk("\npath %3u ref %u preserve %u btree %s %llu:%llu:%u -> %llu:%llu:%u\n"
		  "locks %u %u %u %u node %s %s %s %s",
		  __entry->idx,
		  __entry->ref,
		  __entry->preserve,
		  bch2_btree_id_str(__entry->btree_id),
		  __entry->old_pos_inode,
		  __entry->old_pos_offset,
		  __entry->old_pos_snapshot,
		  __entry->new_pos_inode,
		  __entry->new_pos_offset,
		  __entry->new_pos_snapshot,
(__entry->nodes_locked >> 6) & 3, (__entry->nodes_locked >> 4) & 3, (__entry->nodes_locked >> 2) & 3, (__entry->nodes_locked >> 0) & 3, __entry->node3, __entry->node2, __entry->node1, __entry->node0) ); TRACE_EVENT(btree_path_free, TP_PROTO(struct btree_trans *trans, btree_path_idx_t path, struct btree_path *dup), TP_ARGS(trans, path, dup), TP_STRUCT__entry( __field(btree_path_idx_t, idx ) __field(u8, preserve ) __field(u8, should_be_locked) __field(s8, dup ) __field(u8, dup_locked ) ), TP_fast_assign( __entry->idx = path; __entry->preserve = trans->paths[path].preserve; __entry->should_be_locked = trans->paths[path].should_be_locked; __entry->dup = dup ? dup - trans->paths : -1; __entry->dup_locked = dup ? btree_node_locked(dup, dup->level) : 0; ), TP_printk(" path %3u %c %c dup %2i locked %u", __entry->idx, __entry->preserve ? 'P' : ' ', __entry->should_be_locked ? 'S' : ' ', __entry->dup, __entry->dup_locked) ); TRACE_EVENT(btree_path_free_trans_begin, TP_PROTO(btree_path_idx_t path), TP_ARGS(path), TP_STRUCT__entry( __field(btree_path_idx_t, idx ) ), TP_fast_assign( __entry->idx = path; ), TP_printk(" path %3u", __entry->idx) ); #else /* CONFIG_BCACHEFS_PATH_TRACEPOINTS */ #ifndef _TRACE_BCACHEFS_H static inline void trace_update_by_path(struct btree_trans *trans, struct btree_path *path, struct btree_insert_entry *i, bool overwrite) {} static inline void trace_btree_path_lock(struct btree_trans *trans, unsigned long caller_ip, struct btree_bkey_cached_common *b) {} static inline void trace_btree_path_get_ll(struct btree_trans *trans, struct btree_path *path) {} static inline void trace_btree_path_put_ll(struct btree_trans *trans, struct btree_path *path) {} static inline void trace_btree_path_should_be_locked(struct btree_trans *trans, struct btree_path *path) {} static inline void trace_btree_path_alloc(struct btree_trans *trans, struct btree_path *path) {} static inline void trace_btree_path_get(struct btree_trans *trans, struct btree_path *path, struct bpos 
*new_pos) {}
/* Remaining empty stubs for the path tracepoints (bodies are intentionally empty) */
static inline void trace_btree_path_clone(struct btree_trans *trans, struct btree_path *path, struct btree_path *new) {}
static inline void trace_btree_path_save_pos(struct btree_trans *trans, struct btree_path *path, struct btree_path *new) {}
static inline void trace_btree_path_traverse_start(struct btree_trans *trans, struct btree_path *path) {}
static inline void trace_btree_path_traverse_end(struct btree_trans *trans, struct btree_path *path) {}
static inline void trace_btree_path_set_pos(struct btree_trans *trans, struct btree_path *path, struct bpos *new_pos) {}
static inline void trace_btree_path_free(struct btree_trans *trans, btree_path_idx_t path, struct btree_path *dup) {}
static inline void trace_btree_path_free_trans_begin(btree_path_idx_t path) {}

#endif
#endif /* CONFIG_BCACHEFS_PATH_TRACEPOINTS */

/* The guard macro is only defined here, at the end of the guarded region */
#define _TRACE_BCACHEFS_H
#endif /* _TRACE_BCACHEFS_H */

/* This part must be outside protection */
#undef TRACE_INCLUDE_PATH
#define TRACE_INCLUDE_PATH ../../fs/bcachefs

#undef TRACE_INCLUDE_FILE
#define TRACE_INCLUDE_FILE trace

#include <trace/define_trace.h>
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 /* SPDX-License-Identifier: GPL-2.0 */ /* * This file provides wrappers with sanitizer instrumentation for non-atomic * bit operations. * * To use this functionality, an arch's bitops.h file needs to define each of * the below bit operations with an arch_ prefix (e.g. arch_set_bit(), * arch___set_bit(), etc.). */ #ifndef _ASM_GENERIC_BITOPS_INSTRUMENTED_NON_ATOMIC_H #define _ASM_GENERIC_BITOPS_INSTRUMENTED_NON_ATOMIC_H #include <linux/instrumented.h> /** * ___set_bit - Set a bit in memory * @nr: the bit to set * @addr: the address to start counting from * * Unlike set_bit(), this function is non-atomic. If it is called on the same * region of memory concurrently, the effect may be that only one operation * succeeds. */ static __always_inline void ___set_bit(unsigned long nr, volatile unsigned long *addr) { instrument_write(addr + BIT_WORD(nr), sizeof(long)); arch___set_bit(nr, addr); } /** * ___clear_bit - Clears a bit in memory * @nr: the bit to clear * @addr: the address to start counting from * * Unlike clear_bit(), this function is non-atomic. If it is called on the same * region of memory concurrently, the effect may be that only one operation * succeeds. 
*/ static __always_inline void ___clear_bit(unsigned long nr, volatile unsigned long *addr) { instrument_write(addr + BIT_WORD(nr), sizeof(long)); arch___clear_bit(nr, addr); } /** * ___change_bit - Toggle a bit in memory * @nr: the bit to change * @addr: the address to start counting from * * Unlike change_bit(), this function is non-atomic. If it is called on the same * region of memory concurrently, the effect may be that only one operation * succeeds. */ static __always_inline void ___change_bit(unsigned long nr, volatile unsigned long *addr) { instrument_write(addr + BIT_WORD(nr), sizeof(long)); arch___change_bit(nr, addr); } static __always_inline void __instrument_read_write_bitop(long nr, volatile unsigned long *addr) { if (IS_ENABLED(CONFIG_KCSAN_ASSUME_PLAIN_WRITES_ATOMIC)) { /* * We treat non-atomic read-write bitops a little more special. * Given the operations here only modify a single bit, assuming * non-atomicity of the writer is sufficient may be reasonable * for certain usage (and follows the permissible nature of the * assume-plain-writes-atomic rule): * 1. report read-modify-write races -> check read; * 2. do not report races with marked readers, but do report * races with unmarked readers -> check "atomic" write. */ kcsan_check_read(addr + BIT_WORD(nr), sizeof(long)); /* * Use generic write instrumentation, in case other sanitizers * or tools are enabled alongside KCSAN. */ instrument_write(addr + BIT_WORD(nr), sizeof(long)); } else { instrument_read_write(addr + BIT_WORD(nr), sizeof(long)); } } /** * ___test_and_set_bit - Set a bit and return its old value * @nr: Bit to set * @addr: Address to count from * * This operation is non-atomic. If two instances of this operation race, one * can appear to succeed but actually fail. 
*/ static __always_inline bool ___test_and_set_bit(unsigned long nr, volatile unsigned long *addr) { __instrument_read_write_bitop(nr, addr); return arch___test_and_set_bit(nr, addr); } /** * ___test_and_clear_bit - Clear a bit and return its old value * @nr: Bit to clear * @addr: Address to count from * * This operation is non-atomic. If two instances of this operation race, one * can appear to succeed but actually fail. */ static __always_inline bool ___test_and_clear_bit(unsigned long nr, volatile unsigned long *addr) { __instrument_read_write_bitop(nr, addr); return arch___test_and_clear_bit(nr, addr); } /** * ___test_and_change_bit - Change a bit and return its old value * @nr: Bit to change * @addr: Address to count from * * This operation is non-atomic. If two instances of this operation race, one * can appear to succeed but actually fail. */ static __always_inline bool ___test_and_change_bit(unsigned long nr, volatile unsigned long *addr) { __instrument_read_write_bitop(nr, addr); return arch___test_and_change_bit(nr, addr); } /** * _test_bit - Determine whether a bit is set * @nr: bit number to test * @addr: Address to start counting from */ static __always_inline bool _test_bit(unsigned long nr, const volatile unsigned long *addr) { instrument_atomic_read(addr + BIT_WORD(nr), sizeof(long)); return arch_test_bit(nr, addr); } /** * _test_bit_acquire - Determine, with acquire semantics, whether a bit is set * @nr: bit number to test * @addr: Address to start counting from */ static __always_inline bool _test_bit_acquire(unsigned long nr, const volatile unsigned long *addr) { instrument_atomic_read(addr + BIT_WORD(nr), sizeof(long)); return arch_test_bit_acquire(nr, addr); } #endif /* _ASM_GENERIC_BITOPS_INSTRUMENTED_NON_ATOMIC_H */
1 1 2 2 3 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 
525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 
1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 
1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 // SPDX-License-Identifier: GPL-2.0 /* * Interface for controlling IO bandwidth on a request queue * * Copyright (C) 2010 Vivek Goyal <vgoyal@redhat.com> */ #include <linux/module.h> #include <linux/slab.h> #include <linux/blkdev.h> #include <linux/bio.h> #include <linux/blktrace_api.h> #include "blk.h" #include "blk-cgroup-rwstat.h" #include "blk-stat.h" #include "blk-throttle.h" /* Max dispatch from a group in 1 
round */ #define THROTL_GRP_QUANTUM 8 /* Total max dispatch from all groups in one round */ #define THROTL_QUANTUM 32 /* Throttling is performed over a slice and after that slice is renewed */ #define DFL_THROTL_SLICE_HD (HZ / 10) #define DFL_THROTL_SLICE_SSD (HZ / 50) #define MAX_THROTL_SLICE (HZ) /* A workqueue to queue throttle related work */ static struct workqueue_struct *kthrotld_workqueue; #define rb_entry_tg(node) rb_entry((node), struct throtl_grp, rb_node) struct throtl_data { /* service tree for active throtl groups */ struct throtl_service_queue service_queue; struct request_queue *queue; /* Total Number of queued bios on READ and WRITE lists */ unsigned int nr_queued[2]; unsigned int throtl_slice; /* Work for dispatching throttled bios */ struct work_struct dispatch_work; bool track_bio_latency; }; static void throtl_pending_timer_fn(struct timer_list *t); static inline struct blkcg_gq *tg_to_blkg(struct throtl_grp *tg) { return pd_to_blkg(&tg->pd); } /** * sq_to_tg - return the throl_grp the specified service queue belongs to * @sq: the throtl_service_queue of interest * * Return the throtl_grp @sq belongs to. If @sq is the top-level one * embedded in throtl_data, %NULL is returned. */ static struct throtl_grp *sq_to_tg(struct throtl_service_queue *sq) { if (sq && sq->parent_sq) return container_of(sq, struct throtl_grp, service_queue); else return NULL; } /** * sq_to_td - return throtl_data the specified service queue belongs to * @sq: the throtl_service_queue of interest * * A service_queue can be embedded in either a throtl_grp or throtl_data. * Determine the associated throtl_data accordingly and return it. 
*/ static struct throtl_data *sq_to_td(struct throtl_service_queue *sq) { struct throtl_grp *tg = sq_to_tg(sq); if (tg) return tg->td; else return container_of(sq, struct throtl_data, service_queue); } static uint64_t tg_bps_limit(struct throtl_grp *tg, int rw) { struct blkcg_gq *blkg = tg_to_blkg(tg); if (cgroup_subsys_on_dfl(io_cgrp_subsys) && !blkg->parent) return U64_MAX; return tg->bps[rw]; } static unsigned int tg_iops_limit(struct throtl_grp *tg, int rw) { struct blkcg_gq *blkg = tg_to_blkg(tg); if (cgroup_subsys_on_dfl(io_cgrp_subsys) && !blkg->parent) return UINT_MAX; return tg->iops[rw]; } /** * throtl_log - log debug message via blktrace * @sq: the service_queue being reported * @fmt: printf format string * @args: printf args * * The messages are prefixed with "throtl BLKG_NAME" if @sq belongs to a * throtl_grp; otherwise, just "throtl". */ #define throtl_log(sq, fmt, args...) do { \ struct throtl_grp *__tg = sq_to_tg((sq)); \ struct throtl_data *__td = sq_to_td((sq)); \ \ (void)__td; \ if (likely(!blk_trace_note_message_enabled(__td->queue))) \ break; \ if ((__tg)) { \ blk_add_cgroup_trace_msg(__td->queue, \ &tg_to_blkg(__tg)->blkcg->css, "throtl " fmt, ##args);\ } else { \ blk_add_trace_msg(__td->queue, "throtl " fmt, ##args); \ } \ } while (0) static inline unsigned int throtl_bio_data_size(struct bio *bio) { /* assume it's one sector */ if (unlikely(bio_op(bio) == REQ_OP_DISCARD)) return 512; return bio->bi_iter.bi_size; } static void throtl_qnode_init(struct throtl_qnode *qn, struct throtl_grp *tg) { INIT_LIST_HEAD(&qn->node); bio_list_init(&qn->bios); qn->tg = tg; } /** * throtl_qnode_add_bio - add a bio to a throtl_qnode and activate it * @bio: bio being added * @qn: qnode to add bio to * @queued: the service_queue->queued[] list @qn belongs to * * Add @bio to @qn and put @qn on @queued if it's not already on. * @qn->tg's reference count is bumped when @qn is activated. See the * comment on top of throtl_qnode definition for details. 
*/ static void throtl_qnode_add_bio(struct bio *bio, struct throtl_qnode *qn, struct list_head *queued) { bio_list_add(&qn->bios, bio); if (list_empty(&qn->node)) { list_add_tail(&qn->node, queued); blkg_get(tg_to_blkg(qn->tg)); } } /** * throtl_peek_queued - peek the first bio on a qnode list * @queued: the qnode list to peek */ static struct bio *throtl_peek_queued(struct list_head *queued) { struct throtl_qnode *qn; struct bio *bio; if (list_empty(queued)) return NULL; qn = list_first_entry(queued, struct throtl_qnode, node); bio = bio_list_peek(&qn->bios); WARN_ON_ONCE(!bio); return bio; } /** * throtl_pop_queued - pop the first bio form a qnode list * @queued: the qnode list to pop a bio from * @tg_to_put: optional out argument for throtl_grp to put * * Pop the first bio from the qnode list @queued. After popping, the first * qnode is removed from @queued if empty or moved to the end of @queued so * that the popping order is round-robin. * * When the first qnode is removed, its associated throtl_grp should be put * too. If @tg_to_put is NULL, this function automatically puts it; * otherwise, *@tg_to_put is set to the throtl_grp to put and the caller is * responsible for putting it. 
*/ static struct bio *throtl_pop_queued(struct list_head *queued, struct throtl_grp **tg_to_put) { struct throtl_qnode *qn; struct bio *bio; if (list_empty(queued)) return NULL; qn = list_first_entry(queued, struct throtl_qnode, node); bio = bio_list_pop(&qn->bios); WARN_ON_ONCE(!bio); if (bio_list_empty(&qn->bios)) { list_del_init(&qn->node); if (tg_to_put) *tg_to_put = qn->tg; else blkg_put(tg_to_blkg(qn->tg)); } else { list_move_tail(&qn->node, queued); } return bio; } /* init a service_queue, assumes the caller zeroed it */ static void throtl_service_queue_init(struct throtl_service_queue *sq) { INIT_LIST_HEAD(&sq->queued[READ]); INIT_LIST_HEAD(&sq->queued[WRITE]); sq->pending_tree = RB_ROOT_CACHED; timer_setup(&sq->pending_timer, throtl_pending_timer_fn, 0); } static struct blkg_policy_data *throtl_pd_alloc(struct gendisk *disk, struct blkcg *blkcg, gfp_t gfp) { struct throtl_grp *tg; int rw; tg = kzalloc_node(sizeof(*tg), gfp, disk->node_id); if (!tg) return NULL; if (blkg_rwstat_init(&tg->stat_bytes, gfp)) goto err_free_tg; if (blkg_rwstat_init(&tg->stat_ios, gfp)) goto err_exit_stat_bytes; throtl_service_queue_init(&tg->service_queue); for (rw = READ; rw <= WRITE; rw++) { throtl_qnode_init(&tg->qnode_on_self[rw], tg); throtl_qnode_init(&tg->qnode_on_parent[rw], tg); } RB_CLEAR_NODE(&tg->rb_node); tg->bps[READ] = U64_MAX; tg->bps[WRITE] = U64_MAX; tg->iops[READ] = UINT_MAX; tg->iops[WRITE] = UINT_MAX; return &tg->pd; err_exit_stat_bytes: blkg_rwstat_exit(&tg->stat_bytes); err_free_tg: kfree(tg); return NULL; } static void throtl_pd_init(struct blkg_policy_data *pd) { struct throtl_grp *tg = pd_to_tg(pd); struct blkcg_gq *blkg = tg_to_blkg(tg); struct throtl_data *td = blkg->q->td; struct throtl_service_queue *sq = &tg->service_queue; /* * If on the default hierarchy, we switch to properly hierarchical * behavior where limits on a given throtl_grp are applied to the * whole subtree rather than just the group itself. e.g. 
If 16M * read_bps limit is set on a parent group, summary bps of * parent group and its subtree groups can't exceed 16M for the * device. * * If not on the default hierarchy, the broken flat hierarchy * behavior is retained where all throtl_grps are treated as if * they're all separate root groups right below throtl_data. * Limits of a group don't interact with limits of other groups * regardless of the position of the group in the hierarchy. */ sq->parent_sq = &td->service_queue; if (cgroup_subsys_on_dfl(io_cgrp_subsys) && blkg->parent) sq->parent_sq = &blkg_to_tg(blkg->parent)->service_queue; tg->td = td; } /* * Set has_rules[] if @tg or any of its parents have limits configured. * This doesn't require walking up to the top of the hierarchy as the * parent's has_rules[] is guaranteed to be correct. */ static void tg_update_has_rules(struct throtl_grp *tg) { struct throtl_grp *parent_tg = sq_to_tg(tg->service_queue.parent_sq); int rw; for (rw = READ; rw <= WRITE; rw++) { tg->has_rules_iops[rw] = (parent_tg && parent_tg->has_rules_iops[rw]) || tg_iops_limit(tg, rw) != UINT_MAX; tg->has_rules_bps[rw] = (parent_tg && parent_tg->has_rules_bps[rw]) || tg_bps_limit(tg, rw) != U64_MAX; } } static void throtl_pd_online(struct blkg_policy_data *pd) { struct throtl_grp *tg = pd_to_tg(pd); /* * We don't want new groups to escape the limits of its ancestors. * Update has_rules[] after a new group is brought online. 
*/ tg_update_has_rules(tg); } static void throtl_pd_free(struct blkg_policy_data *pd) { struct throtl_grp *tg = pd_to_tg(pd); del_timer_sync(&tg->service_queue.pending_timer); blkg_rwstat_exit(&tg->stat_bytes); blkg_rwstat_exit(&tg->stat_ios); kfree(tg); } static struct throtl_grp * throtl_rb_first(struct throtl_service_queue *parent_sq) { struct rb_node *n; n = rb_first_cached(&parent_sq->pending_tree); WARN_ON_ONCE(!n); if (!n) return NULL; return rb_entry_tg(n); } static void throtl_rb_erase(struct rb_node *n, struct throtl_service_queue *parent_sq) { rb_erase_cached(n, &parent_sq->pending_tree); RB_CLEAR_NODE(n); } static void update_min_dispatch_time(struct throtl_service_queue *parent_sq) { struct throtl_grp *tg; tg = throtl_rb_first(parent_sq); if (!tg) return; parent_sq->first_pending_disptime = tg->disptime; } static void tg_service_queue_add(struct throtl_grp *tg) { struct throtl_service_queue *parent_sq = tg->service_queue.parent_sq; struct rb_node **node = &parent_sq->pending_tree.rb_root.rb_node; struct rb_node *parent = NULL; struct throtl_grp *__tg; unsigned long key = tg->disptime; bool leftmost = true; while (*node != NULL) { parent = *node; __tg = rb_entry_tg(parent); if (time_before(key, __tg->disptime)) node = &parent->rb_left; else { node = &parent->rb_right; leftmost = false; } } rb_link_node(&tg->rb_node, parent, node); rb_insert_color_cached(&tg->rb_node, &parent_sq->pending_tree, leftmost); } static void throtl_enqueue_tg(struct throtl_grp *tg) { if (!(tg->flags & THROTL_TG_PENDING)) { tg_service_queue_add(tg); tg->flags |= THROTL_TG_PENDING; tg->service_queue.parent_sq->nr_pending++; } } static void throtl_dequeue_tg(struct throtl_grp *tg) { if (tg->flags & THROTL_TG_PENDING) { struct throtl_service_queue *parent_sq = tg->service_queue.parent_sq; throtl_rb_erase(&tg->rb_node, parent_sq); --parent_sq->nr_pending; tg->flags &= ~THROTL_TG_PENDING; } } /* Call with queue lock held */ static void throtl_schedule_pending_timer(struct 
throtl_service_queue *sq,
					  unsigned long expires)
{
	/* latest expiry that will be armed: eight throttle slices from now */
	unsigned long max_expire = jiffies + 8 * sq_to_td(sq)->throtl_slice;

	/*
	 * Since we are adjusting the throttle limit dynamically, the sleep
	 * time calculated according to previous limit might be invalid. It's
	 * possible the cgroup sleep time is very long and no other cgroups
	 * have IO running so notify the limit changes. Make sure the cgroup
	 * doesn't sleep too long to avoid the missed notification.
	 */
	if (time_after(expires, max_expire))
		expires = max_expire;
	mod_timer(&sq->pending_timer, expires);
	throtl_log(sq, "schedule timer. delay=%lu jiffies=%lu",
		   expires - jiffies, jiffies);
}

/**
 * throtl_schedule_next_dispatch - schedule the next dispatch cycle
 * @sq: the service_queue to schedule dispatch for
 * @force: force scheduling
 *
 * Arm @sq->pending_timer so that the next dispatch cycle starts on the
 * dispatch time of the first pending child.  Returns %true if either timer
 * is armed or there's no pending child left.  %false if the current
 * dispatch window is still open and the caller should continue
 * dispatching.
 *
 * If @force is %true, the dispatch timer is always scheduled and this
 * function is guaranteed to return %true.  This is to be used when the
 * caller can't dispatch itself and needs to invoke pending_timer
 * unconditionally.  Note that forced scheduling is likely to induce short
 * delay before dispatch starts even if @sq->first_pending_disptime is not
 * in the future and thus shouldn't be used in hot paths.
 */
static bool throtl_schedule_next_dispatch(struct throtl_service_queue *sq,
					  bool force)
{
	/* any pending children left? */
	if (!sq->nr_pending)
		return true;

	/* refresh sq->first_pending_disptime from the first pending child */
	update_min_dispatch_time(sq);

	/* is the next dispatch time in the future?
*/
	if (force || time_after(sq->first_pending_disptime, jiffies)) {
		throtl_schedule_pending_timer(sq, sq->first_pending_disptime);
		return true;
	}

	/* tell the caller to continue dispatching */
	return false;
}

/*
 * Reset @tg's dispatch counters and carryover for direction @rw and open a
 * slice that ends one throtl_slice from now.  The slice start is only ever
 * moved forward, to @start.
 */
static inline void throtl_start_new_slice_with_credit(struct throtl_grp *tg,
		bool rw, unsigned long start)
{
	tg->bytes_disp[rw] = 0;
	tg->io_disp[rw] = 0;
	tg->carryover_bytes[rw] = 0;
	tg->carryover_ios[rw] = 0;

	/*
	 * Previous slice has expired. We must have trimmed it after last
	 * bio dispatch. That means since start of last slice, we never used
	 * that bandwidth. Do try to make use of that bandwidth while giving
	 * credit.
	 */
	if (time_after(start, tg->slice_start[rw]))
		tg->slice_start[rw] = start;

	tg->slice_end[rw] = jiffies + tg->td->throtl_slice;
	throtl_log(&tg->service_queue,
		   "[%c] new slice with credit start=%lu end=%lu jiffies=%lu",
		   rw == READ ? 'R' : 'W', tg->slice_start[rw],
		   tg->slice_end[rw], jiffies);
}

/*
 * Reset @tg's dispatch counters for direction @rw and start a slice of one
 * throtl_slice beginning now.  The carryover is cleared only when
 * @clear_carryover is set.
 */
static inline void throtl_start_new_slice(struct throtl_grp *tg, bool rw,
					  bool clear_carryover)
{
	tg->bytes_disp[rw] = 0;
	tg->io_disp[rw] = 0;
	tg->slice_start[rw] = jiffies;
	tg->slice_end[rw] = jiffies + tg->td->throtl_slice;
	if (clear_carryover) {
		tg->carryover_bytes[rw] = 0;
		tg->carryover_ios[rw] = 0;
	}

	throtl_log(&tg->service_queue,
		   "[%c] new slice start=%lu end=%lu jiffies=%lu",
		   rw == READ ? 'R' : 'W', tg->slice_start[rw],
		   tg->slice_end[rw], jiffies);
}

/* Set the slice end to @jiffy_end rounded up to a multiple of throtl_slice */
static inline void throtl_set_slice_end(struct throtl_grp *tg, bool rw,
					unsigned long jiffy_end)
{
	tg->slice_end[rw] = roundup(jiffy_end, tg->td->throtl_slice);
}

/* Same as throtl_set_slice_end(), but also logs the resulting slice */
static inline void throtl_extend_slice(struct throtl_grp *tg, bool rw,
				       unsigned long jiffy_end)
{
	throtl_set_slice_end(tg, rw, jiffy_end);
	throtl_log(&tg->service_queue,
		   "[%c] extend slice start=%lu end=%lu jiffies=%lu",
		   rw == READ ?
'R' : 'W', tg->slice_start[rw], tg->slice_end[rw], jiffies); } /* Determine if previously allocated or extended slice is complete or not */ static bool throtl_slice_used(struct throtl_grp *tg, bool rw) { if (time_in_range(jiffies, tg->slice_start[rw], tg->slice_end[rw])) return false; return true; } static unsigned int calculate_io_allowed(u32 iops_limit, unsigned long jiffy_elapsed) { unsigned int io_allowed; u64 tmp; /* * jiffy_elapsed should not be a big value as minimum iops can be * 1 then at max jiffy elapsed should be equivalent of 1 second as we * will allow dispatch after 1 second and after that slice should * have been trimmed. */ tmp = (u64)iops_limit * jiffy_elapsed; do_div(tmp, HZ); if (tmp > UINT_MAX) io_allowed = UINT_MAX; else io_allowed = tmp; return io_allowed; } static u64 calculate_bytes_allowed(u64 bps_limit, unsigned long jiffy_elapsed) { /* * Can result be wider than 64 bits? * We check against 62, not 64, due to ilog2 truncation. */ if (ilog2(bps_limit) + ilog2(jiffy_elapsed) - ilog2(HZ) > 62) return U64_MAX; return mul_u64_u64_div_u64(bps_limit, (u64)jiffy_elapsed, (u64)HZ); } /* Trim the used slices and adjust slice start accordingly */ static inline void throtl_trim_slice(struct throtl_grp *tg, bool rw) { unsigned long time_elapsed; long long bytes_trim; int io_trim; BUG_ON(time_before(tg->slice_end[rw], tg->slice_start[rw])); /* * If bps are unlimited (-1), then time slice don't get * renewed. Don't try to trim the slice if slice is used. A new * slice will start when appropriate. */ if (throtl_slice_used(tg, rw)) return; /* * A bio has been dispatched. Also adjust slice_end. It might happen * that initially cgroup limit was very low resulting in high * slice_end, but later limit was bumped up and bio was dispatched * sooner, then we need to reduce slice_end. A high bogus slice_end * is bad because it does not allow new slice to start. 
*/

	throtl_set_slice_end(tg, rw, jiffies + tg->td->throtl_slice);

	/* only whole throttle slices that have elapsed are trimmed */
	time_elapsed = rounddown(jiffies - tg->slice_start[rw],
				 tg->td->throtl_slice);
	if (!time_elapsed)
		return;

	/* budget covered by the elapsed slices, plus any carryover */
	bytes_trim = calculate_bytes_allowed(tg_bps_limit(tg, rw),
					     time_elapsed) +
		     tg->carryover_bytes[rw];
	io_trim = calculate_io_allowed(tg_iops_limit(tg, rw), time_elapsed) +
		  tg->carryover_ios[rw];
	if (bytes_trim <= 0 && io_trim <= 0)
		return;

	/* the carryover has been folded into the trim amounts above */
	tg->carryover_bytes[rw] = 0;
	if ((long long)tg->bytes_disp[rw] >= bytes_trim)
		tg->bytes_disp[rw] -= bytes_trim;
	else
		tg->bytes_disp[rw] = 0;

	tg->carryover_ios[rw] = 0;
	if ((int)tg->io_disp[rw] >= io_trim)
		tg->io_disp[rw] -= io_trim;
	else
		tg->io_disp[rw] = 0;

	tg->slice_start[rw] += time_elapsed;

	throtl_log(&tg->service_queue,
		   "[%c] trim slice nr=%lu bytes=%lld io=%d start=%lu end=%lu jiffies=%lu",
		   rw == READ ? 'R' : 'W', time_elapsed / tg->td->throtl_slice,
		   bytes_trim, io_trim, tg->slice_start[rw], tg->slice_end[rw],
		   jiffies);
}

/*
 * Add to @tg's carryover for direction @rw the difference between what the
 * current limits allow since the slice started and what has been dispatched.
 * Unlimited directions are left untouched.
 */
static void __tg_update_carryover(struct throtl_grp *tg, bool rw)
{
	unsigned long jiffy_elapsed = jiffies - tg->slice_start[rw];
	u64 bps_limit = tg_bps_limit(tg, rw);
	u32 iops_limit = tg_iops_limit(tg, rw);

	/*
	 * If config is updated while bios are still throttled, calculate and
	 * accumulate how many bytes/ios are waited across changes. And
	 * carryover_bytes/ios will be used to calculate new wait time under new
	 * configuration.
	 */
	if (bps_limit != U64_MAX)
		tg->carryover_bytes[rw] +=
			calculate_bytes_allowed(bps_limit, jiffy_elapsed) -
			tg->bytes_disp[rw];
	if (iops_limit != UINT_MAX)
		tg->carryover_ios[rw] +=
			calculate_io_allowed(iops_limit, jiffy_elapsed) -
			tg->io_disp[rw];
}

/* Update the carryover of every direction that still has bios queued */
static void tg_update_carryover(struct throtl_grp *tg)
{
	if (tg->service_queue.nr_queued[READ])
		__tg_update_carryover(tg, READ);
	if (tg->service_queue.nr_queued[WRITE])
		__tg_update_carryover(tg, WRITE);

	/* see comments in struct throtl_grp for meaning of these fields.
*/
	/*
	 * NOTE(review): unlike the other throtl_log() calls in this file, this
	 * format string ends in a newline - confirm that it is intended.
	 */
	throtl_log(&tg->service_queue, "%s: %lld %lld %d %d\n", __func__,
		   tg->carryover_bytes[READ], tg->carryover_bytes[WRITE],
		   tg->carryover_ios[READ], tg->carryover_ios[WRITE]);
}

/*
 * Return 0 if @bio fits within @iops_limit for the current slice of @tg,
 * otherwise the approximate number of jiffies to wait before it does.
 */
static unsigned long tg_within_iops_limit(struct throtl_grp *tg, struct bio *bio,
				 u32 iops_limit)
{
	bool rw = bio_data_dir(bio);
	int io_allowed;
	unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd;

	/* no iops limit configured */
	if (iops_limit == UINT_MAX) {
		return 0;
	}

	jiffy_elapsed = jiffies - tg->slice_start[rw];

	/* Round up to the next throttle slice, wait time must be nonzero */
	jiffy_elapsed_rnd = roundup(jiffy_elapsed + 1, tg->td->throtl_slice);
	io_allowed = calculate_io_allowed(iops_limit, jiffy_elapsed_rnd) +
		     tg->carryover_ios[rw];
	/* the carryover may make io_allowed negative, hence the sign check */
	if (io_allowed > 0 && tg->io_disp[rw] + 1 <= io_allowed)
		return 0;

	/* Calc approx time to dispatch */
	jiffy_wait = jiffy_elapsed_rnd - jiffy_elapsed;

	/* make sure at least one io can be dispatched after waiting */
	jiffy_wait = max(jiffy_wait, HZ / iops_limit + 1);
	return jiffy_wait;
}

/*
 * Return 0 if @bio fits within @bps_limit for the current slice of @tg,
 * otherwise the approximate number of jiffies to wait before it does.
 */
static unsigned long tg_within_bps_limit(struct throtl_grp *tg, struct bio *bio,
				u64 bps_limit)
{
	bool rw = bio_data_dir(bio);
	long long bytes_allowed;
	u64 extra_bytes;
	unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd;
	unsigned int bio_size = throtl_bio_data_size(bio);

	/* no need to throttle if this bio's bytes have been accounted */
	if (bps_limit == U64_MAX || bio_flagged(bio, BIO_BPS_THROTTLED)) {
		return 0;
	}

	jiffy_elapsed = jiffy_elapsed_rnd = jiffies - tg->slice_start[rw];

	/* Slice has just started.
Consider one slice interval */
	if (!jiffy_elapsed)
		jiffy_elapsed_rnd = tg->td->throtl_slice;

	jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, tg->td->throtl_slice);
	bytes_allowed = calculate_bytes_allowed(bps_limit, jiffy_elapsed_rnd) +
			tg->carryover_bytes[rw];
	/* the carryover may make bytes_allowed negative, hence the sign check */
	if (bytes_allowed > 0 && tg->bytes_disp[rw] + bio_size <= bytes_allowed)
		return 0;

	/* Calc approx time to dispatch */
	extra_bytes = tg->bytes_disp[rw] + bio_size - bytes_allowed;
	jiffy_wait = div64_u64(extra_bytes * HZ, bps_limit);

	/* never report a zero wait for a bio that is over the limit */
	if (!jiffy_wait)
		jiffy_wait = 1;

	/*
	 * This wait time is without taking into consideration the rounding
	 * up we did. Add that time also.
	 */
	jiffy_wait = jiffy_wait + (jiffy_elapsed_rnd - jiffy_elapsed);
	return jiffy_wait;
}

/*
 * Returns whether one can dispatch a bio or not. Also returns approx number
 * of jiffies to wait before this bio is with-in IO rate and can be dispatched
 */
static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio,
			    unsigned long *wait)
{
	bool rw = bio_data_dir(bio);
	unsigned long bps_wait = 0, iops_wait = 0, max_wait = 0;
	u64 bps_limit = tg_bps_limit(tg, rw);
	u32 iops_limit = tg_iops_limit(tg, rw);

	/*
 	 * Currently whole state machine of group depends on first bio
	 * queued in the group bio list. So one should not be calling
	 * this function with a different bio if there are other bios
	 * queued.
	 */
	BUG_ON(tg->service_queue.nr_queued[rw] &&
	       bio != throtl_peek_queued(&tg->service_queue.queued[rw]));

	/* If tg->bps = -1, then BW is unlimited */
	if ((bps_limit == U64_MAX && iops_limit == UINT_MAX) ||
	    tg->flags & THROTL_TG_CANCELING) {
		if (wait)
			*wait = 0;
		return true;
	}

	/*
	 * If previous slice expired, start a new one otherwise renew/extend
	 * existing slice to make sure it is at least throtl_slice interval
	 * long since now. New slice is started only for empty throttle group.
	 * If there is queued bio, that means there should be an active
	 * slice and it should be extended instead.
*/
	if (throtl_slice_used(tg, rw) && !(tg->service_queue.nr_queued[rw]))
		throtl_start_new_slice(tg, rw, true);
	else {
		if (time_before(tg->slice_end[rw],
		    jiffies + tg->td->throtl_slice))
			throtl_extend_slice(tg, rw,
				jiffies + tg->td->throtl_slice);
	}

	bps_wait = tg_within_bps_limit(tg, bio, bps_limit);
	iops_wait = tg_within_iops_limit(tg, bio, iops_limit);
	/* both waits are zero: the bio is within both limits */
	if (bps_wait + iops_wait == 0) {
		if (wait)
			*wait = 0;
		return true;
	}

	/* the bio has to wait for whichever limit takes longer */
	max_wait = max(bps_wait, iops_wait);

	if (wait)
		*wait = max_wait;

	/* keep the slice open at least until the bio may be dispatched */
	if (time_before(tg->slice_end[rw], jiffies + max_wait))
		throtl_extend_slice(tg, rw, jiffies + max_wait);

	return false;
}

/*
 * Account @bio against @tg.  Bytes are charged only if the bio is not
 * flagged BIO_BPS_THROTTLED; the io counters are always incremented.
 */
static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio)
{
	bool rw = bio_data_dir(bio);
	unsigned int bio_size = throtl_bio_data_size(bio);

	/* Charge the bio to the group */
	if (!bio_flagged(bio, BIO_BPS_THROTTLED)) {
		tg->bytes_disp[rw] += bio_size;
		tg->last_bytes_disp[rw] += bio_size;
	}

	tg->io_disp[rw]++;
	tg->last_io_disp[rw]++;
}

/**
 * throtl_add_bio_tg - add a bio to the specified throtl_grp
 * @bio: bio to add
 * @qn: qnode to use
 * @tg: the target throtl_grp
 *
 * Add @bio to @tg's service_queue using @qn.  If @qn is not specified,
 * tg->qnode_on_self[] is used.
 */
static void throtl_add_bio_tg(struct bio *bio, struct throtl_qnode *qn,
			      struct throtl_grp *tg)
{
	struct throtl_service_queue *sq = &tg->service_queue;
	bool rw = bio_data_dir(bio);

	if (!qn)
		qn = &tg->qnode_on_self[rw];

	/*
	 * If @tg doesn't currently have any bios queued in the same
	 * direction, queueing @bio can change when @tg should be
	 * dispatched.  Mark that @tg was empty.  This is automatically
	 * cleared on the next tg_update_disptime().
*/
	if (!sq->nr_queued[rw])
		tg->flags |= THROTL_TG_WAS_EMPTY;

	throtl_qnode_add_bio(bio, qn, &sq->queued[rw]);

	sq->nr_queued[rw]++;
	throtl_enqueue_tg(tg);
}

/*
 * Recompute @tg's dispatch time from the first queued READ and WRITE bios
 * and re-insert @tg into its parent's pending tree at the new position.
 */
static void tg_update_disptime(struct throtl_grp *tg)
{
	struct throtl_service_queue *sq = &tg->service_queue;
	/* -1 is the largest unsigned long; it stays if no bio is queued */
	unsigned long read_wait = -1, write_wait = -1, min_wait = -1, disptime;
	struct bio *bio;

	bio = throtl_peek_queued(&sq->queued[READ]);
	if (bio)
		tg_may_dispatch(tg, bio, &read_wait);

	bio = throtl_peek_queued(&sq->queued[WRITE]);
	if (bio)
		tg_may_dispatch(tg, bio, &write_wait);

	min_wait = min(read_wait, write_wait);
	disptime = jiffies + min_wait;

	/* Update dispatch time */
	throtl_rb_erase(&tg->rb_node, tg->service_queue.parent_sq);
	tg->disptime = disptime;
	tg_service_queue_add(tg);

	/* see throtl_add_bio_tg() */
	tg->flags &= ~THROTL_TG_WAS_EMPTY;
}

/*
 * If @parent_tg's slice for @rw is used up, start a new one for it that is
 * credited from the start of @child_tg's slice.
 */
static void start_parent_slice_with_credit(struct throtl_grp *child_tg,
					struct throtl_grp *parent_tg, bool rw)
{
	if (throtl_slice_used(parent_tg, rw)) {
		throtl_start_new_slice_with_credit(parent_tg, rw,
				child_tg->slice_start[rw]);
	}

}

/* Move the first queued bio of direction @rw from @tg one level up */
static void tg_dispatch_one_bio(struct throtl_grp *tg, bool rw)
{
	struct throtl_service_queue *sq = &tg->service_queue;
	struct throtl_service_queue *parent_sq = sq->parent_sq;
	struct throtl_grp *parent_tg = sq_to_tg(parent_sq);
	struct throtl_grp *tg_to_put = NULL;
	struct bio *bio;

	/*
	 * @bio is being transferred from @tg to @parent_sq.  Popping a bio
	 * from @tg may put its reference and @parent_sq might end up
	 * getting released prematurely.  Remember the tg to put and put it
	 * after @bio is transferred to @parent_sq.
	 */
	bio = throtl_pop_queued(&sq->queued[rw], &tg_to_put);
	sq->nr_queued[rw]--;

	throtl_charge_bio(tg, bio);

	/*
	 * If our parent is another tg, we just need to transfer @bio to
	 * the parent using throtl_add_bio_tg().  If our parent is
	 * @td->service_queue, @bio is ready to be issued.  Put it on its
	 * bio_lists[] and decrease total number queued.  The caller is
	 * responsible for issuing these bios.
*/ if (parent_tg) { throtl_add_bio_tg(bio, &tg->qnode_on_parent[rw], parent_tg); start_parent_slice_with_credit(tg, parent_tg, rw); } else { bio_set_flag(bio, BIO_BPS_THROTTLED); throtl_qnode_add_bio(bio, &tg->qnode_on_parent[rw], &parent_sq->queued[rw]); BUG_ON(tg->td->nr_queued[rw] <= 0); tg->td->nr_queued[rw]--; } throtl_trim_slice(tg, rw); if (tg_to_put) blkg_put(tg_to_blkg(tg_to_put)); } static int throtl_dispatch_tg(struct throtl_grp *tg) { struct throtl_service_queue *sq = &tg->service_queue; unsigned int nr_reads = 0, nr_writes = 0; unsigned int max_nr_reads = THROTL_GRP_QUANTUM * 3 / 4; unsigned int max_nr_writes = THROTL_GRP_QUANTUM - max_nr_reads; struct bio *bio; /* Try to dispatch 75% READS and 25% WRITES */ while ((bio = throtl_peek_queued(&sq->queued[READ])) && tg_may_dispatch(tg, bio, NULL)) { tg_dispatch_one_bio(tg, READ); nr_reads++; if (nr_reads >= max_nr_reads) break; } while ((bio = throtl_peek_queued(&sq->queued[WRITE])) && tg_may_dispatch(tg, bio, NULL)) { tg_dispatch_one_bio(tg, WRITE); nr_writes++; if (nr_writes >= max_nr_writes) break; } return nr_reads + nr_writes; } static int throtl_select_dispatch(struct throtl_service_queue *parent_sq) { unsigned int nr_disp = 0; while (1) { struct throtl_grp *tg; struct throtl_service_queue *sq; if (!parent_sq->nr_pending) break; tg = throtl_rb_first(parent_sq); if (!tg) break; if (time_before(jiffies, tg->disptime)) break; nr_disp += throtl_dispatch_tg(tg); sq = &tg->service_queue; if (sq->nr_queued[READ] || sq->nr_queued[WRITE]) tg_update_disptime(tg); else throtl_dequeue_tg(tg); if (nr_disp >= THROTL_QUANTUM) break; } return nr_disp; } /** * throtl_pending_timer_fn - timer function for service_queue->pending_timer * @t: the pending_timer member of the throtl_service_queue being serviced * * This timer is armed when a child throtl_grp with active bio's become * pending and queued on the service_queue's pending_tree and expires when * the first child throtl_grp should be dispatched. 
This function
 * dispatches bio's from the children throtl_grps to the parent
 * service_queue.
 *
 * If the parent's parent is another throtl_grp, dispatching is propagated
 * by either arming its pending_timer or repeating dispatch directly.  If
 * the top-level service_tree is reached, throtl_data->dispatch_work is
 * kicked so that the ready bio's are issued.
 */
static void throtl_pending_timer_fn(struct timer_list *t)
{
	struct throtl_service_queue *sq = from_timer(sq, t, pending_timer);
	struct throtl_grp *tg = sq_to_tg(sq);
	struct throtl_data *td = sq_to_td(sq);
	struct throtl_service_queue *parent_sq;
	struct request_queue *q;
	bool dispatched;
	int ret;

	/* throtl_data may be gone, so figure out request queue by blkg */
	if (tg)
		q = tg->pd.blkg->q;
	else
		q = td->queue;

	spin_lock_irq(&q->queue_lock);

	/* nothing to do when the queue has no root blkg */
	if (!q->root_blkg)
		goto out_unlock;

again:
	/* (re)entered once per level while climbing towards the top */
	parent_sq = sq->parent_sq;
	dispatched = false;

	while (true) {
		throtl_log(sq, "dispatch nr_queued=%u read=%u write=%u",
			   sq->nr_queued[READ] + sq->nr_queued[WRITE],
			   sq->nr_queued[READ], sq->nr_queued[WRITE]);

		ret = throtl_select_dispatch(sq);
		if (ret) {
			throtl_log(sq, "bios disp=%u", ret);
			dispatched = true;
		}

		if (throtl_schedule_next_dispatch(sq, false))
			break;

		/*
		 * this dispatch window is still open, relax and repeat;
		 * queue_lock is dropped around cpu_relax() and re-taken
		 */
		spin_unlock_irq(&q->queue_lock);
		cpu_relax();
		spin_lock_irq(&q->queue_lock);
	}

	/* nothing moved up a level, so there is nothing to propagate */
	if (!dispatched)
		goto out_unlock;

	if (parent_sq) {
		/* @parent_sq is another throtl_grp, propagate dispatch */
		if (tg->flags & THROTL_TG_WAS_EMPTY) {
			tg_update_disptime(tg);
			if (!throtl_schedule_next_dispatch(parent_sq, false)) {
				/* window is already open, repeat dispatching */
				sq = parent_sq;
				tg = sq_to_tg(sq);
				goto again;
			}
		}
	} else {
		/* reached the top-level, queue issuing */
		queue_work(kthrotld_workqueue, &td->dispatch_work);
	}
out_unlock:
	spin_unlock_irq(&q->queue_lock);
}

/**
 * blk_throtl_dispatch_work_fn - work function for throtl_data->dispatch_work
 * @work: work item being executed
 *
 * This function is queued for execution when bios
reach the bio_lists[] * of throtl_data->service_queue. Those bios are ready and issued by this * function. */ static void blk_throtl_dispatch_work_fn(struct work_struct *work) { struct throtl_data *td = container_of(work, struct throtl_data, dispatch_work); struct throtl_service_queue *td_sq = &td->service_queue; struct request_queue *q = td->queue; struct bio_list bio_list_on_stack; struct bio *bio; struct blk_plug plug; int rw; bio_list_init(&bio_list_on_stack); spin_lock_irq(&q->queue_lock); for (rw = READ; rw <= WRITE; rw++) while ((bio = throtl_pop_queued(&td_sq->queued[rw], NULL))) bio_list_add(&bio_list_on_stack, bio); spin_unlock_irq(&q->queue_lock); if (!bio_list_empty(&bio_list_on_stack)) { blk_start_plug(&plug); while ((bio = bio_list_pop(&bio_list_on_stack))) submit_bio_noacct_nocheck(bio); blk_finish_plug(&plug); } } static u64 tg_prfill_conf_u64(struct seq_file *sf, struct blkg_policy_data *pd, int off) { struct throtl_grp *tg = pd_to_tg(pd); u64 v = *(u64 *)((void *)tg + off); if (v == U64_MAX) return 0; return __blkg_prfill_u64(sf, pd, v); } static u64 tg_prfill_conf_uint(struct seq_file *sf, struct blkg_policy_data *pd, int off) { struct throtl_grp *tg = pd_to_tg(pd); unsigned int v = *(unsigned int *)((void *)tg + off); if (v == UINT_MAX) return 0; return __blkg_prfill_u64(sf, pd, v); } static int tg_print_conf_u64(struct seq_file *sf, void *v) { blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), tg_prfill_conf_u64, &blkcg_policy_throtl, seq_cft(sf)->private, false); return 0; } static int tg_print_conf_uint(struct seq_file *sf, void *v) { blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), tg_prfill_conf_uint, &blkcg_policy_throtl, seq_cft(sf)->private, false); return 0; } static void tg_conf_updated(struct throtl_grp *tg, bool global) { struct throtl_service_queue *sq = &tg->service_queue; struct cgroup_subsys_state *pos_css; struct blkcg_gq *blkg; throtl_log(&tg->service_queue, "limit change rbps=%llu wbps=%llu riops=%u wiops=%u", tg_bps_limit(tg, 
READ), tg_bps_limit(tg, WRITE), tg_iops_limit(tg, READ), tg_iops_limit(tg, WRITE)); rcu_read_lock(); /* * Update has_rules[] flags for the updated tg's subtree. A tg is * considered to have rules if either the tg itself or any of its * ancestors has rules. This identifies groups without any * restrictions in the whole hierarchy and allows them to bypass * blk-throttle. */ blkg_for_each_descendant_pre(blkg, pos_css, global ? tg->td->queue->root_blkg : tg_to_blkg(tg)) { struct throtl_grp *this_tg = blkg_to_tg(blkg); tg_update_has_rules(this_tg); /* ignore root/second level */ if (!cgroup_subsys_on_dfl(io_cgrp_subsys) || !blkg->parent || !blkg->parent->parent) continue; } rcu_read_unlock(); /* * We're already holding queue_lock and know @tg is valid. Let's * apply the new config directly. * * Restart the slices for both READ and WRITES. It might happen * that a group's limit are dropped suddenly and we don't want to * account recently dispatched IO with new low rate. */ throtl_start_new_slice(tg, READ, false); throtl_start_new_slice(tg, WRITE, false); if (tg->flags & THROTL_TG_PENDING) { tg_update_disptime(tg); throtl_schedule_next_dispatch(sq->parent_sq, true); } } static int blk_throtl_init(struct gendisk *disk) { struct request_queue *q = disk->queue; struct throtl_data *td; unsigned int memflags; int ret; td = kzalloc_node(sizeof(*td), GFP_KERNEL, q->node); if (!td) return -ENOMEM; INIT_WORK(&td->dispatch_work, blk_throtl_dispatch_work_fn); throtl_service_queue_init(&td->service_queue); /* * Freeze queue before activating policy, to synchronize with IO path, * which is protected by 'q_usage_counter'. 
*/ memflags = blk_mq_freeze_queue(disk->queue); blk_mq_quiesce_queue(disk->queue); q->td = td; td->queue = q; /* activate policy */ ret = blkcg_activate_policy(disk, &blkcg_policy_throtl); if (ret) { q->td = NULL; kfree(td); goto out; } if (blk_queue_nonrot(q)) td->throtl_slice = DFL_THROTL_SLICE_SSD; else td->throtl_slice = DFL_THROTL_SLICE_HD; td->track_bio_latency = !queue_is_mq(q); if (!td->track_bio_latency) blk_stat_enable_accounting(q); out: blk_mq_unquiesce_queue(disk->queue); blk_mq_unfreeze_queue(disk->queue, memflags); return ret; } static ssize_t tg_set_conf(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off, bool is_u64) { struct blkcg *blkcg = css_to_blkcg(of_css(of)); struct blkg_conf_ctx ctx; struct throtl_grp *tg; int ret; u64 v; blkg_conf_init(&ctx, buf); ret = blkg_conf_open_bdev(&ctx); if (ret) goto out_finish; if (!blk_throtl_activated(ctx.bdev->bd_queue)) { ret = blk_throtl_init(ctx.bdev->bd_disk); if (ret) goto out_finish; } ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, &ctx); if (ret) goto out_finish; ret = -EINVAL; if (sscanf(ctx.body, "%llu", &v) != 1) goto out_finish; if (!v) v = U64_MAX; tg = blkg_to_tg(ctx.blkg); tg_update_carryover(tg); if (is_u64) *(u64 *)((void *)tg + of_cft(of)->private) = v; else *(unsigned int *)((void *)tg + of_cft(of)->private) = v; tg_conf_updated(tg, false); ret = 0; out_finish: blkg_conf_exit(&ctx); return ret ?: nbytes; } static ssize_t tg_set_conf_u64(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { return tg_set_conf(of, buf, nbytes, off, true); } static ssize_t tg_set_conf_uint(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { return tg_set_conf(of, buf, nbytes, off, false); } static int tg_print_rwstat(struct seq_file *sf, void *v) { blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_rwstat, &blkcg_policy_throtl, seq_cft(sf)->private, true); return 0; } static u64 tg_prfill_rwstat_recursive(struct seq_file *sf, struct blkg_policy_data 
*pd, int off) { struct blkg_rwstat_sample sum; blkg_rwstat_recursive_sum(pd_to_blkg(pd), &blkcg_policy_throtl, off, &sum); return __blkg_prfill_rwstat(sf, pd, &sum); } static int tg_print_rwstat_recursive(struct seq_file *sf, void *v) { blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), tg_prfill_rwstat_recursive, &blkcg_policy_throtl, seq_cft(sf)->private, true); return 0; } static struct cftype throtl_legacy_files[] = { { .name = "throttle.read_bps_device", .private = offsetof(struct throtl_grp, bps[READ]), .seq_show = tg_print_conf_u64, .write = tg_set_conf_u64, }, { .name = "throttle.write_bps_device", .private = offsetof(struct throtl_grp, bps[WRITE]), .seq_show = tg_print_conf_u64, .write = tg_set_conf_u64, }, { .name = "throttle.read_iops_device", .private = offsetof(struct throtl_grp, iops[READ]), .seq_show = tg_print_conf_uint, .write = tg_set_conf_uint, }, { .name = "throttle.write_iops_device", .private = offsetof(struct throtl_grp, iops[WRITE]), .seq_show = tg_print_conf_uint, .write = tg_set_conf_uint, }, { .name = "throttle.io_service_bytes", .private = offsetof(struct throtl_grp, stat_bytes), .seq_show = tg_print_rwstat, }, { .name = "throttle.io_service_bytes_recursive", .private = offsetof(struct throtl_grp, stat_bytes), .seq_show = tg_print_rwstat_recursive, }, { .name = "throttle.io_serviced", .private = offsetof(struct throtl_grp, stat_ios), .seq_show = tg_print_rwstat, }, { .name = "throttle.io_serviced_recursive", .private = offsetof(struct throtl_grp, stat_ios), .seq_show = tg_print_rwstat_recursive, }, { } /* terminate */ }; static u64 tg_prfill_limit(struct seq_file *sf, struct blkg_policy_data *pd, int off) { struct throtl_grp *tg = pd_to_tg(pd); const char *dname = blkg_dev_name(pd->blkg); u64 bps_dft; unsigned int iops_dft; if (!dname) return 0; bps_dft = U64_MAX; iops_dft = UINT_MAX; if (tg->bps[READ] == bps_dft && tg->bps[WRITE] == bps_dft && tg->iops[READ] == iops_dft && tg->iops[WRITE] == iops_dft) return 0; seq_printf(sf, "%s", 
dname); if (tg->bps[READ] == U64_MAX) seq_printf(sf, " rbps=max"); else seq_printf(sf, " rbps=%llu", tg->bps[READ]); if (tg->bps[WRITE] == U64_MAX) seq_printf(sf, " wbps=max"); else seq_printf(sf, " wbps=%llu", tg->bps[WRITE]); if (tg->iops[READ] == UINT_MAX) seq_printf(sf, " riops=max"); else seq_printf(sf, " riops=%u", tg->iops[READ]); if (tg->iops[WRITE] == UINT_MAX) seq_printf(sf, " wiops=max"); else seq_printf(sf, " wiops=%u", tg->iops[WRITE]); seq_printf(sf, "\n"); return 0; } static int tg_print_limit(struct seq_file *sf, void *v) { blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), tg_prfill_limit, &blkcg_policy_throtl, seq_cft(sf)->private, false); return 0; } static ssize_t tg_set_limit(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { struct blkcg *blkcg = css_to_blkcg(of_css(of)); struct blkg_conf_ctx ctx; struct throtl_grp *tg; u64 v[4]; int ret; blkg_conf_init(&ctx, buf); ret = blkg_conf_open_bdev(&ctx); if (ret) goto out_finish; if (!blk_throtl_activated(ctx.bdev->bd_queue)) { ret = blk_throtl_init(ctx.bdev->bd_disk); if (ret) goto out_finish; } ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, &ctx); if (ret) goto out_finish; tg = blkg_to_tg(ctx.blkg); tg_update_carryover(tg); v[0] = tg->bps[READ]; v[1] = tg->bps[WRITE]; v[2] = tg->iops[READ]; v[3] = tg->iops[WRITE]; while (true) { char tok[27]; /* wiops=18446744073709551616 */ char *p; u64 val = U64_MAX; int len; if (sscanf(ctx.body, "%26s%n", tok, &len) != 1) break; if (tok[0] == '\0') break; ctx.body += len; ret = -EINVAL; p = tok; strsep(&p, "="); if (!p || (sscanf(p, "%llu", &val) != 1 && strcmp(p, "max"))) goto out_finish; ret = -ERANGE; if (!val) goto out_finish; ret = -EINVAL; if (!strcmp(tok, "rbps")) v[0] = val; else if (!strcmp(tok, "wbps")) v[1] = val; else if (!strcmp(tok, "riops")) v[2] = min_t(u64, val, UINT_MAX); else if (!strcmp(tok, "wiops")) v[3] = min_t(u64, val, UINT_MAX); else goto out_finish; } tg->bps[READ] = v[0]; tg->bps[WRITE] = v[1]; tg->iops[READ] = 
v[2]; tg->iops[WRITE] = v[3]; tg_conf_updated(tg, false); ret = 0; out_finish: blkg_conf_exit(&ctx); return ret ?: nbytes; } static struct cftype throtl_files[] = { { .name = "max", .flags = CFTYPE_NOT_ON_ROOT, .seq_show = tg_print_limit, .write = tg_set_limit, }, { } /* terminate */ }; static void throtl_shutdown_wq(struct request_queue *q) { struct throtl_data *td = q->td; cancel_work_sync(&td->dispatch_work); } static void tg_flush_bios(struct throtl_grp *tg) { struct throtl_service_queue *sq = &tg->service_queue; if (tg->flags & THROTL_TG_CANCELING) return; /* * Set the flag to make sure throtl_pending_timer_fn() won't * stop until all throttled bios are dispatched. */ tg->flags |= THROTL_TG_CANCELING; /* * Do not dispatch cgroup without THROTL_TG_PENDING or cgroup * will be inserted to service queue without THROTL_TG_PENDING * set in tg_update_disptime below. Then IO dispatched from * child in tg_dispatch_one_bio will trigger double insertion * and corrupt the tree. */ if (!(tg->flags & THROTL_TG_PENDING)) return; /* * Update disptime after setting the above flag to make sure * throtl_select_dispatch() won't exit without dispatching. */ tg_update_disptime(tg); throtl_schedule_pending_timer(sq, jiffies + 1); } static void throtl_pd_offline(struct blkg_policy_data *pd) { tg_flush_bios(pd_to_tg(pd)); } struct blkcg_policy blkcg_policy_throtl = { .dfl_cftypes = throtl_files, .legacy_cftypes = throtl_legacy_files, .pd_alloc_fn = throtl_pd_alloc, .pd_init_fn = throtl_pd_init, .pd_online_fn = throtl_pd_online, .pd_offline_fn = throtl_pd_offline, .pd_free_fn = throtl_pd_free, }; void blk_throtl_cancel_bios(struct gendisk *disk) { struct request_queue *q = disk->queue; struct cgroup_subsys_state *pos_css; struct blkcg_gq *blkg; if (!blk_throtl_activated(q)) return; spin_lock_irq(&q->queue_lock); /* * queue_lock is held, rcu lock is not needed here technically. 
* However, rcu lock is still held to emphasize that following * path need RCU protection and to prevent warning from lockdep. */ rcu_read_lock(); blkg_for_each_descendant_post(blkg, pos_css, q->root_blkg) { /* * disk_release will call pd_offline_fn to cancel bios. * However, disk_release can't be called if someone get * the refcount of device and issued bios which are * inflight after del_gendisk. * Cancel bios here to ensure no bios are inflight after * del_gendisk. */ tg_flush_bios(blkg_to_tg(blkg)); } rcu_read_unlock(); spin_unlock_irq(&q->queue_lock); } static bool tg_within_limit(struct throtl_grp *tg, struct bio *bio, bool rw) { /* throtl is FIFO - if bios are already queued, should queue */ if (tg->service_queue.nr_queued[rw]) return false; return tg_may_dispatch(tg, bio, NULL); } static void tg_dispatch_in_debt(struct throtl_grp *tg, struct bio *bio, bool rw) { if (!bio_flagged(bio, BIO_BPS_THROTTLED)) tg->carryover_bytes[rw] -= throtl_bio_data_size(bio); tg->carryover_ios[rw]--; } bool __blk_throtl_bio(struct bio *bio) { struct request_queue *q = bdev_get_queue(bio->bi_bdev); struct blkcg_gq *blkg = bio->bi_blkg; struct throtl_qnode *qn = NULL; struct throtl_grp *tg = blkg_to_tg(blkg); struct throtl_service_queue *sq; bool rw = bio_data_dir(bio); bool throttled = false; struct throtl_data *td = tg->td; rcu_read_lock(); spin_lock_irq(&q->queue_lock); sq = &tg->service_queue; while (true) { if (tg_within_limit(tg, bio, rw)) { /* within limits, let's charge and dispatch directly */ throtl_charge_bio(tg, bio); /* * We need to trim slice even when bios are not being * queued otherwise it might happen that a bio is not * queued for a long time and slice keeps on extending * and trim is not called for a long time. Now if limits * are reduced suddenly we take into account all the IO * dispatched so far at new low rate and * newly queued * IO gets a really long dispatch time. * * So keep on trimming slice even if bio is not queued. 
*/ throtl_trim_slice(tg, rw); } else if (bio_issue_as_root_blkg(bio)) { /* * IOs which may cause priority inversions are * dispatched directly, even if they're over limit. * Debts are handled by carryover_bytes/ios while * calculating wait time. */ tg_dispatch_in_debt(tg, bio, rw); } else { /* if above limits, break to queue */ break; } /* * @bio passed through this layer without being throttled. * Climb up the ladder. If we're already at the top, it * can be executed directly. */ qn = &tg->qnode_on_parent[rw]; sq = sq->parent_sq; tg = sq_to_tg(sq); if (!tg) { bio_set_flag(bio, BIO_BPS_THROTTLED); goto out_unlock; } } /* out-of-limit, queue to @tg */ throtl_log(sq, "[%c] bio. bdisp=%llu sz=%u bps=%llu iodisp=%u iops=%u queued=%d/%d", rw == READ ? 'R' : 'W', tg->bytes_disp[rw], bio->bi_iter.bi_size, tg_bps_limit(tg, rw), tg->io_disp[rw], tg_iops_limit(tg, rw), sq->nr_queued[READ], sq->nr_queued[WRITE]); td->nr_queued[rw]++; throtl_add_bio_tg(bio, qn, tg); throttled = true; /* * Update @tg's dispatch time and force schedule dispatch if @tg * was empty before @bio. The forced scheduling isn't likely to * cause undue delay as @bio is likely to be dispatched directly if * its @tg's disptime is not in the future. */ if (tg->flags & THROTL_TG_WAS_EMPTY) { tg_update_disptime(tg); throtl_schedule_next_dispatch(tg->service_queue.parent_sq, true); } out_unlock: spin_unlock_irq(&q->queue_lock); rcu_read_unlock(); return throttled; } void blk_throtl_exit(struct gendisk *disk) { struct request_queue *q = disk->queue; if (!blk_throtl_activated(q)) return; del_timer_sync(&q->td->service_queue.pending_timer); throtl_shutdown_wq(q); blkcg_deactivate_policy(disk, &blkcg_policy_throtl); kfree(q->td); } static int __init throtl_init(void) { kthrotld_workqueue = alloc_workqueue("kthrotld", WQ_MEM_RECLAIM, 0); if (!kthrotld_workqueue) panic("Failed to create kthrotld\n"); return blkcg_policy_register(&blkcg_policy_throtl); } module_init(throtl_init);
10 10 10 10 10 10 6 6 6 6 10 10 6 6 6 10 10 6 10 9 1 10 9 10 10 1 10 10 10 9 9 6 6 6 6 6 5 5 5 14 14 7 9 5 9 9 4 4 7 4 7 4 2 8 8 8 2 2 2 7 7 1 1 7 5 5 5 5 5 2 5 4 5 5 1 1 4 5 4 1 1 4 4 1 2 4 5 5 5 5 5 5 5 5 5 5 5 5 3 5 5 5 5 5 5 1 1 5 1 1 1 1 5 5 9 1 1 9 9 5 9 9 9 9 9 9 1 1 1 1 6 1 1 1 6 6 6 1 1 6 6 2 1 1 1 3 1 6 6 9 4 5 9 5 6 1 1 2 2 2 2 2 2 2 2 2 2 1 2 2 5 5 5 5 5 5 4 2 5 5 5 1 1 1 6 6 1 5 5 3 5 5 5 5 5 5 5 6 5 3 6 1 6 6 6 6 6 5 3 1 6 6 5 5 5 5 5 5 5 5 5 5 5 5 1 1 1 1 7 7 1 6 6 6 6 6 6 6 6 6 1 5 5 6 7 6 1 6 6 1 5 5 1 6 6 7 10 2 8 10 10 8 2 7 10 10 9 2 8 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 
387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 
887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 
1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 
1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 
2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 
2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 
2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 
3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 
3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 
4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 
4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 // SPDX-License-Identifier: GPL-2.0-only /* * refcounttree.c * * Copyright (C) 2009 Oracle. All rights reserved. 
*/ #include <linux/sort.h> #include <cluster/masklog.h> #include "ocfs2.h" #include "inode.h" #include "alloc.h" #include "suballoc.h" #include "journal.h" #include "uptodate.h" #include "super.h" #include "buffer_head_io.h" #include "blockcheck.h" #include "refcounttree.h" #include "sysfile.h" #include "dlmglue.h" #include "extent_map.h" #include "aops.h" #include "xattr.h" #include "namei.h" #include "ocfs2_trace.h" #include "file.h" #include "symlink.h" #include <linux/bio.h> #include <linux/blkdev.h> #include <linux/slab.h> #include <linux/writeback.h> #include <linux/pagevec.h> #include <linux/swap.h> #include <linux/security.h> #include <linux/fsnotify.h> #include <linux/quotaops.h> #include <linux/namei.h> #include <linux/mount.h> #include <linux/posix_acl.h> struct ocfs2_cow_context { struct inode *inode; u32 cow_start; u32 cow_len; struct ocfs2_extent_tree data_et; struct ocfs2_refcount_tree *ref_tree; struct buffer_head *ref_root_bh; struct ocfs2_alloc_context *meta_ac; struct ocfs2_alloc_context *data_ac; struct ocfs2_cached_dealloc_ctxt dealloc; void *cow_object; struct ocfs2_post_refcount *post_refcount; int extra_credits; int (*get_clusters)(struct ocfs2_cow_context *context, u32 v_cluster, u32 *p_cluster, u32 *num_clusters, unsigned int *extent_flags); int (*cow_duplicate_clusters)(handle_t *handle, struct inode *inode, u32 cpos, u32 old_cluster, u32 new_cluster, u32 new_len); }; static inline struct ocfs2_refcount_tree * cache_info_to_refcount(struct ocfs2_caching_info *ci) { return container_of(ci, struct ocfs2_refcount_tree, rf_ci); } static int ocfs2_validate_refcount_block(struct super_block *sb, struct buffer_head *bh) { int rc; struct ocfs2_refcount_block *rb = (struct ocfs2_refcount_block *)bh->b_data; trace_ocfs2_validate_refcount_block((unsigned long long)bh->b_blocknr); BUG_ON(!buffer_uptodate(bh)); /* * If the ecc fails, we return the error but otherwise * leave the filesystem running. We know any error is * local to this block. 
*/ rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &rb->rf_check); if (rc) { mlog(ML_ERROR, "Checksum failed for refcount block %llu\n", (unsigned long long)bh->b_blocknr); return rc; } if (!OCFS2_IS_VALID_REFCOUNT_BLOCK(rb)) { rc = ocfs2_error(sb, "Refcount block #%llu has bad signature %.*s\n", (unsigned long long)bh->b_blocknr, 7, rb->rf_signature); goto out; } if (le64_to_cpu(rb->rf_blkno) != bh->b_blocknr) { rc = ocfs2_error(sb, "Refcount block #%llu has an invalid rf_blkno of %llu\n", (unsigned long long)bh->b_blocknr, (unsigned long long)le64_to_cpu(rb->rf_blkno)); goto out; } if (le32_to_cpu(rb->rf_fs_generation) != OCFS2_SB(sb)->fs_generation) { rc = ocfs2_error(sb, "Refcount block #%llu has an invalid rf_fs_generation of #%u\n", (unsigned long long)bh->b_blocknr, le32_to_cpu(rb->rf_fs_generation)); goto out; } out: return rc; } static int ocfs2_read_refcount_block(struct ocfs2_caching_info *ci, u64 rb_blkno, struct buffer_head **bh) { int rc; struct buffer_head *tmp = *bh; rc = ocfs2_read_block(ci, rb_blkno, &tmp, ocfs2_validate_refcount_block); /* If ocfs2_read_block() got us a new bh, pass it up. 
*/ if (!rc && !*bh) *bh = tmp; return rc; } static u64 ocfs2_refcount_cache_owner(struct ocfs2_caching_info *ci) { struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci); return rf->rf_blkno; } static struct super_block * ocfs2_refcount_cache_get_super(struct ocfs2_caching_info *ci) { struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci); return rf->rf_sb; } static void ocfs2_refcount_cache_lock(struct ocfs2_caching_info *ci) __acquires(&rf->rf_lock) { struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci); spin_lock(&rf->rf_lock); } static void ocfs2_refcount_cache_unlock(struct ocfs2_caching_info *ci) __releases(&rf->rf_lock) { struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci); spin_unlock(&rf->rf_lock); } static void ocfs2_refcount_cache_io_lock(struct ocfs2_caching_info *ci) { struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci); mutex_lock(&rf->rf_io_mutex); } static void ocfs2_refcount_cache_io_unlock(struct ocfs2_caching_info *ci) { struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci); mutex_unlock(&rf->rf_io_mutex); } static const struct ocfs2_caching_operations ocfs2_refcount_caching_ops = { .co_owner = ocfs2_refcount_cache_owner, .co_get_super = ocfs2_refcount_cache_get_super, .co_cache_lock = ocfs2_refcount_cache_lock, .co_cache_unlock = ocfs2_refcount_cache_unlock, .co_io_lock = ocfs2_refcount_cache_io_lock, .co_io_unlock = ocfs2_refcount_cache_io_unlock, }; static struct ocfs2_refcount_tree * ocfs2_find_refcount_tree(struct ocfs2_super *osb, u64 blkno) { struct rb_node *n = osb->osb_rf_lock_tree.rb_node; struct ocfs2_refcount_tree *tree = NULL; while (n) { tree = rb_entry(n, struct ocfs2_refcount_tree, rf_node); if (blkno < tree->rf_blkno) n = n->rb_left; else if (blkno > tree->rf_blkno) n = n->rb_right; else return tree; } return NULL; } /* osb_lock is already locked. 
*/
/*
 * Insert @new into osb->osb_rf_lock_tree, keyed by rf_blkno.
 * Finding an existing node with the same block number is treated as a
 * fatal inconsistency (logged, then BUG()).
 */
static void ocfs2_insert_refcount_tree(struct ocfs2_super *osb,
				       struct ocfs2_refcount_tree *new)
{
	u64 rf_blkno = new->rf_blkno;
	struct rb_node *parent = NULL;
	struct rb_node **p = &osb->osb_rf_lock_tree.rb_node;
	struct ocfs2_refcount_tree *tmp;

	/* Walk down to the empty link where the new node belongs. */
	while (*p) {
		parent = *p;

		tmp = rb_entry(parent, struct ocfs2_refcount_tree,
			       rf_node);

		if (rf_blkno < tmp->rf_blkno)
			p = &(*p)->rb_left;
		else if (rf_blkno > tmp->rf_blkno)
			p = &(*p)->rb_right;
		else {
			/* This should never happen! */
			mlog(ML_ERROR, "Duplicate refcount block %llu found!\n",
			     (unsigned long long)rf_blkno);
			BUG();
		}
	}

	rb_link_node(&new->rf_node, parent, p);
	rb_insert_color(&new->rf_node, &osb->osb_rf_lock_tree);
}

/*
 * Tear down and free an in-memory refcount tree: exit its metadata cache,
 * drop and free its lock resource, then kfree() the structure.
 * The tree's lock resource must have been initialised before this is called
 * (paths that fail before that point free the tree by hand instead).
 */
static void ocfs2_free_refcount_tree(struct ocfs2_refcount_tree *tree)
{
	ocfs2_metadata_cache_exit(&tree->rf_ci);
	ocfs2_simple_drop_lockres(OCFS2_SB(tree->rf_sb), &tree->rf_lockres);
	ocfs2_lock_res_free(&tree->rf_lockres);
	kfree(tree);
}

/*
 * Unlink @tree from osb->osb_rf_lock_tree and forget it as the cached
 * "last used" tree if it is the one cached.  Takes no lock itself; the
 * locked wrapper below and ocfs2_create_refcount_tree() call it with
 * osb->osb_lock held.
 */
static inline void
ocfs2_erase_refcount_tree_from_list_no_lock(struct ocfs2_super *osb,
					struct ocfs2_refcount_tree *tree)
{
	rb_erase(&tree->rf_node, &osb->osb_rf_lock_tree);
	if (osb->osb_ref_tree_lru && osb->osb_ref_tree_lru == tree)
		osb->osb_ref_tree_lru = NULL;
}

/* Same as the _no_lock variant, but takes osb->osb_lock around it. */
static void
ocfs2_erase_refcount_tree_from_list(struct ocfs2_super *osb,
					struct ocfs2_refcount_tree *tree)
{
	spin_lock(&osb->osb_lock);
	ocfs2_erase_refcount_tree_from_list_no_lock(osb, tree);
	spin_unlock(&osb->osb_lock);
}

/* kref release callback: frees the tree that embeds @kref as rf_getcnt. */
static void ocfs2_kref_remove_refcount_tree(struct kref *kref)
{
	struct ocfs2_refcount_tree *tree =
		container_of(kref, struct ocfs2_refcount_tree, rf_getcnt);

	ocfs2_free_refcount_tree(tree);
}

/* Take a reference on @tree. */
static inline void
ocfs2_refcount_tree_get(struct ocfs2_refcount_tree *tree)
{
	kref_get(&tree->rf_getcnt);
}

/* Drop a reference on @tree; the last put frees it. */
static inline void
ocfs2_refcount_tree_put(struct ocfs2_refcount_tree *tree)
{
	kref_put(&tree->rf_getcnt, ocfs2_kref_remove_refcount_tree);
}

/*
 * Initialise the caching side of a new tree: metadata cache (with the
 * refcount caching ops), I/O mutex, owning super block and cache spinlock.
 */
static inline void ocfs2_init_refcount_tree_ci(struct ocfs2_refcount_tree *new,
					       struct super_block *sb)
{
	ocfs2_metadata_cache_init(&new->rf_ci, &ocfs2_refcount_caching_ops);
	mutex_init(&new->rf_io_mutex);
	new->rf_sb = sb;
	spin_lock_init(&new->rf_lock);
}

/*
 * Initialise the locking side of a new tree: the rf_sem rwsem and the lock
 * resource, which is set up from the block number and @generation.
 */
static inline void ocfs2_init_refcount_tree_lock(struct ocfs2_super *osb,
					struct ocfs2_refcount_tree *new,
					u64 rf_blkno, u32 generation)
{
	init_rwsem(&new->rf_sem);
	ocfs2_refcount_lock_res_init(&new->rf_lockres, osb,
				     rf_blkno, generation);
}

/*
 * Allocate a zeroed refcount tree for block @rf_blkno with its kref and
 * caching info initialised.  The lock resource is NOT initialised here.
 * Returns NULL on allocation failure.
 */
static struct ocfs2_refcount_tree*
ocfs2_allocate_refcount_tree(struct ocfs2_super *osb, u64 rf_blkno)
{
	struct ocfs2_refcount_tree *new;

	new = kzalloc(sizeof(struct ocfs2_refcount_tree), GFP_NOFS);
	if (!new)
		return NULL;

	new->rf_blkno = rf_blkno;
	kref_init(&new->rf_getcnt);
	ocfs2_init_refcount_tree_ci(new, osb->sb);

	return new;
}

/*
 * Find, or create and register, the in-memory refcount tree for @rf_blkno.
 *
 * On success *ret_tree points at the tree and it is remembered in
 * osb->osb_ref_tree_lru.  No extra reference is taken for the caller here.
 * Returns 0 on success, -ENOMEM if the tree cannot be allocated, or the
 * error from reading the refcount block.
 */
static int ocfs2_get_refcount_tree(struct ocfs2_super *osb, u64 rf_blkno,
				   struct ocfs2_refcount_tree **ret_tree)
{
	int ret = 0;
	struct ocfs2_refcount_tree *tree, *new = NULL;
	struct buffer_head *ref_root_bh = NULL;
	struct ocfs2_refcount_block *ref_rb;

	/* Fast path: the last-used tree, then the rb-tree, under osb_lock. */
	spin_lock(&osb->osb_lock);
	if (osb->osb_ref_tree_lru &&
	    osb->osb_ref_tree_lru->rf_blkno == rf_blkno)
		tree = osb->osb_ref_tree_lru;
	else
		tree = ocfs2_find_refcount_tree(osb, rf_blkno);
	if (tree)
		goto out;

	/* Not cached: drop the spinlock before allocating and doing I/O. */
	spin_unlock(&osb->osb_lock);

	new = ocfs2_allocate_refcount_tree(osb, rf_blkno);
	if (!new) {
		ret = -ENOMEM;
		mlog_errno(ret);
		return ret;
	}
	/*
	 * We need the generation to create the refcount tree lock and since
	 * it isn't changed during the tree modification, we are safe here to
	 * read without protection.
	 * We also have to purge the cache after we create the lock since the
	 * refcount block may have the stale data. It can only be trusted when
	 * we hold the refcount lock.
	 */
	ret = ocfs2_read_refcount_block(&new->rf_ci, rf_blkno, &ref_root_bh);
	if (ret) {
		mlog_errno(ret);
		/* Lock resource not initialised yet, so free by hand. */
		ocfs2_metadata_cache_exit(&new->rf_ci);
		kfree(new);
		return ret;
	}

	ref_rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data;
	new->rf_generation = le32_to_cpu(ref_rb->rf_generation);
	ocfs2_init_refcount_tree_lock(osb, new, rf_blkno,
				      new->rf_generation);
	ocfs2_metadata_cache_purge(&new->rf_ci);

	/*
	 * Re-check under the lock: someone else may have inserted a tree for
	 * this block while the lock was dropped.  If so, use theirs and free
	 * ours below.
	 */
	spin_lock(&osb->osb_lock);
	tree = ocfs2_find_refcount_tree(osb, rf_blkno);
	if (tree)
		goto out;

	ocfs2_insert_refcount_tree(osb, new);

	tree = new;
	new = NULL;

out:
	/* Reached with osb_lock held on every path. */
	*ret_tree = tree;

	osb->osb_ref_tree_lru = tree;

	spin_unlock(&osb->osb_lock);

	/* Non-NULL only when we lost the insertion race. */
	if (new)
		ocfs2_free_refcount_tree(new);

	brelse(ref_root_bh);
	return ret;
}

/*
 * Read @inode's on-disk dinode and store its i_refcount_loc in *ref_blkno.
 * The inode must be a refcount inode (BUG otherwise).
 * Returns 0 on success or the error from reading the inode block.
 */
static int ocfs2_get_refcount_block(struct inode *inode, u64 *ref_blkno)
{
	int ret;
	struct buffer_head *di_bh = NULL;
	struct ocfs2_dinode *di;

	ret = ocfs2_read_inode_block(inode, &di_bh);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	BUG_ON(!ocfs2_is_refcount_inode(inode));

	di = (struct ocfs2_dinode *)di_bh->b_data;
	*ref_blkno = le64_to_cpu(di->i_refcount_loc);
	brelse(di_bh);
out:
	return ret;
}

/*
 * Take the refcount lock on @tree via ocfs2_refcount_lock() and then the
 * local rf_sem: for writing when @rw is non-zero, for reading otherwise.
 * Returns 0 on success; on failure nothing is left held.
 */
static int __ocfs2_lock_refcount_tree(struct ocfs2_super *osb,
				      struct ocfs2_refcount_tree *tree, int rw)
{
	int ret;

	ret = ocfs2_refcount_lock(tree, rw);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	if (rw)
		down_write(&tree->rf_sem);
	else
		down_read(&tree->rf_sem);

out:
	return ret;
}

/*
 * Lock the refcount tree pointed to by ref_blkno and return the tree.
 * In most cases, we lock the tree and read the refcount block,
 * so read it here if the caller really needs it.
 *
 * If the tree has been re-created by another node, it will free the
 * old one and re-create it.
*/ int ocfs2_lock_refcount_tree(struct ocfs2_super *osb, u64 ref_blkno, int rw, struct ocfs2_refcount_tree **ret_tree, struct buffer_head **ref_bh) { int ret, delete_tree = 0; struct ocfs2_refcount_tree *tree = NULL; struct buffer_head *ref_root_bh = NULL; struct ocfs2_refcount_block *rb; again: ret = ocfs2_get_refcount_tree(osb, ref_blkno, &tree); if (ret) { mlog_errno(ret); return ret; } ocfs2_refcount_tree_get(tree); ret = __ocfs2_lock_refcount_tree(osb, tree, rw); if (ret) { mlog_errno(ret); ocfs2_refcount_tree_put(tree); goto out; } ret = ocfs2_read_refcount_block(&tree->rf_ci, tree->rf_blkno, &ref_root_bh); if (ret) { mlog_errno(ret); ocfs2_unlock_refcount_tree(osb, tree, rw); goto out; } rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data; /* * If the refcount block has been freed and re-created, we may need * to recreate the refcount tree also. * * Here we just remove the tree from the rb-tree, and the last * kref holder will unlock and delete this refcount_tree. * Then we goto "again" and ocfs2_get_refcount_tree will create * the new refcount tree for us. */ if (tree->rf_generation != le32_to_cpu(rb->rf_generation)) { if (!tree->rf_removed) { ocfs2_erase_refcount_tree_from_list(osb, tree); tree->rf_removed = 1; delete_tree = 1; } ocfs2_unlock_refcount_tree(osb, tree, rw); /* * We get an extra reference when we create the refcount * tree, so another put will destroy it. 
*/ if (delete_tree) ocfs2_refcount_tree_put(tree); brelse(ref_root_bh); ref_root_bh = NULL; goto again; } *ret_tree = tree; if (ref_bh) { *ref_bh = ref_root_bh; ref_root_bh = NULL; } out: brelse(ref_root_bh); return ret; } void ocfs2_unlock_refcount_tree(struct ocfs2_super *osb, struct ocfs2_refcount_tree *tree, int rw) { if (rw) up_write(&tree->rf_sem); else up_read(&tree->rf_sem); ocfs2_refcount_unlock(tree, rw); ocfs2_refcount_tree_put(tree); } void ocfs2_purge_refcount_trees(struct ocfs2_super *osb) { struct rb_node *node; struct ocfs2_refcount_tree *tree; struct rb_root *root = &osb->osb_rf_lock_tree; while ((node = rb_last(root)) != NULL) { tree = rb_entry(node, struct ocfs2_refcount_tree, rf_node); trace_ocfs2_purge_refcount_trees( (unsigned long long) tree->rf_blkno); rb_erase(&tree->rf_node, root); ocfs2_free_refcount_tree(tree); } } /* * Create a refcount tree for an inode. * We take for granted that the inode is already locked. */ static int ocfs2_create_refcount_tree(struct inode *inode, struct buffer_head *di_bh) { int ret; handle_t *handle = NULL; struct ocfs2_alloc_context *meta_ac = NULL; struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; struct ocfs2_inode_info *oi = OCFS2_I(inode); struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); struct buffer_head *new_bh = NULL; struct ocfs2_refcount_block *rb; struct ocfs2_refcount_tree *new_tree = NULL, *tree = NULL; u16 suballoc_bit_start; u32 num_got; u64 suballoc_loc, first_blkno; BUG_ON(ocfs2_is_refcount_inode(inode)); trace_ocfs2_create_refcount_tree( (unsigned long long)oi->ip_blkno); ret = ocfs2_reserve_new_metadata_blocks(osb, 1, &meta_ac); if (ret) { mlog_errno(ret); goto out; } handle = ocfs2_start_trans(osb, OCFS2_REFCOUNT_TREE_CREATE_CREDITS); if (IS_ERR(handle)) { ret = PTR_ERR(handle); mlog_errno(ret); goto out; } ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (ret) { mlog_errno(ret); goto out_commit; } ret = 
ocfs2_claim_metadata(handle, meta_ac, 1, &suballoc_loc, &suballoc_bit_start, &num_got, &first_blkno); if (ret) { mlog_errno(ret); goto out_commit; } new_tree = ocfs2_allocate_refcount_tree(osb, first_blkno); if (!new_tree) { ret = -ENOMEM; mlog_errno(ret); goto out_commit; } new_bh = sb_getblk(inode->i_sb, first_blkno); if (!new_bh) { ret = -ENOMEM; mlog_errno(ret); goto out_commit; } ocfs2_set_new_buffer_uptodate(&new_tree->rf_ci, new_bh); ret = ocfs2_journal_access_rb(handle, &new_tree->rf_ci, new_bh, OCFS2_JOURNAL_ACCESS_CREATE); if (ret) { mlog_errno(ret); goto out_commit; } /* Initialize ocfs2_refcount_block. */ rb = (struct ocfs2_refcount_block *)new_bh->b_data; memset(rb, 0, inode->i_sb->s_blocksize); strcpy((void *)rb, OCFS2_REFCOUNT_BLOCK_SIGNATURE); rb->rf_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot); rb->rf_suballoc_loc = cpu_to_le64(suballoc_loc); rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start); rb->rf_fs_generation = cpu_to_le32(osb->fs_generation); rb->rf_blkno = cpu_to_le64(first_blkno); rb->rf_count = cpu_to_le32(1); rb->rf_records.rl_count = cpu_to_le16(ocfs2_refcount_recs_per_rb(osb->sb)); spin_lock(&osb->osb_lock); rb->rf_generation = cpu_to_le32(osb->s_next_generation++); spin_unlock(&osb->osb_lock); ocfs2_journal_dirty(handle, new_bh); spin_lock(&oi->ip_lock); oi->ip_dyn_features |= OCFS2_HAS_REFCOUNT_FL; di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features); di->i_refcount_loc = cpu_to_le64(first_blkno); spin_unlock(&oi->ip_lock); trace_ocfs2_create_refcount_tree_blkno((unsigned long long)first_blkno); ocfs2_journal_dirty(handle, di_bh); /* * We have to init the tree lock here since it will use * the generation number to create it. */ new_tree->rf_generation = le32_to_cpu(rb->rf_generation); ocfs2_init_refcount_tree_lock(osb, new_tree, first_blkno, new_tree->rf_generation); spin_lock(&osb->osb_lock); tree = ocfs2_find_refcount_tree(osb, first_blkno); /* * We've just created a new refcount tree in this block. 
If * we found a refcount tree on the ocfs2_super, it must be * one we just deleted. We free the old tree before * inserting the new tree. */ BUG_ON(tree && tree->rf_generation == new_tree->rf_generation); if (tree) ocfs2_erase_refcount_tree_from_list_no_lock(osb, tree); ocfs2_insert_refcount_tree(osb, new_tree); spin_unlock(&osb->osb_lock); new_tree = NULL; if (tree) ocfs2_refcount_tree_put(tree); out_commit: ocfs2_commit_trans(osb, handle); out: if (new_tree) { ocfs2_metadata_cache_exit(&new_tree->rf_ci); kfree(new_tree); } brelse(new_bh); if (meta_ac) ocfs2_free_alloc_context(meta_ac); return ret; } static int ocfs2_set_refcount_tree(struct inode *inode, struct buffer_head *di_bh, u64 refcount_loc) { int ret; handle_t *handle = NULL; struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; struct ocfs2_inode_info *oi = OCFS2_I(inode); struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); struct buffer_head *ref_root_bh = NULL; struct ocfs2_refcount_block *rb; struct ocfs2_refcount_tree *ref_tree; BUG_ON(ocfs2_is_refcount_inode(inode)); ret = ocfs2_lock_refcount_tree(osb, refcount_loc, 1, &ref_tree, &ref_root_bh); if (ret) { mlog_errno(ret); return ret; } handle = ocfs2_start_trans(osb, OCFS2_REFCOUNT_TREE_SET_CREDITS); if (IS_ERR(handle)) { ret = PTR_ERR(handle); mlog_errno(ret); goto out; } ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (ret) { mlog_errno(ret); goto out_commit; } ret = ocfs2_journal_access_rb(handle, &ref_tree->rf_ci, ref_root_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (ret) { mlog_errno(ret); goto out_commit; } rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data; le32_add_cpu(&rb->rf_count, 1); ocfs2_journal_dirty(handle, ref_root_bh); spin_lock(&oi->ip_lock); oi->ip_dyn_features |= OCFS2_HAS_REFCOUNT_FL; di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features); di->i_refcount_loc = cpu_to_le64(refcount_loc); spin_unlock(&oi->ip_lock); ocfs2_journal_dirty(handle, di_bh); out_commit: 
ocfs2_commit_trans(osb, handle); out: ocfs2_unlock_refcount_tree(osb, ref_tree, 1); brelse(ref_root_bh); return ret; } int ocfs2_remove_refcount_tree(struct inode *inode, struct buffer_head *di_bh) { int ret, delete_tree = 0; handle_t *handle = NULL; struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; struct ocfs2_inode_info *oi = OCFS2_I(inode); struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); struct ocfs2_refcount_block *rb; struct inode *alloc_inode = NULL; struct buffer_head *alloc_bh = NULL; struct buffer_head *blk_bh = NULL; struct ocfs2_refcount_tree *ref_tree; int credits = OCFS2_REFCOUNT_TREE_REMOVE_CREDITS; u64 blk = 0, bg_blkno = 0, ref_blkno = le64_to_cpu(di->i_refcount_loc); u16 bit = 0; if (!ocfs2_is_refcount_inode(inode)) return 0; BUG_ON(!ref_blkno); ret = ocfs2_lock_refcount_tree(osb, ref_blkno, 1, &ref_tree, &blk_bh); if (ret) { mlog_errno(ret); return ret; } rb = (struct ocfs2_refcount_block *)blk_bh->b_data; /* * If we are the last user, we need to free the block. * So lock the allocator ahead. 
*/ if (le32_to_cpu(rb->rf_count) == 1) { blk = le64_to_cpu(rb->rf_blkno); bit = le16_to_cpu(rb->rf_suballoc_bit); if (rb->rf_suballoc_loc) bg_blkno = le64_to_cpu(rb->rf_suballoc_loc); else bg_blkno = ocfs2_which_suballoc_group(blk, bit); alloc_inode = ocfs2_get_system_file_inode(osb, EXTENT_ALLOC_SYSTEM_INODE, le16_to_cpu(rb->rf_suballoc_slot)); if (!alloc_inode) { ret = -ENOMEM; mlog_errno(ret); goto out; } inode_lock(alloc_inode); ret = ocfs2_inode_lock(alloc_inode, &alloc_bh, 1); if (ret) { mlog_errno(ret); goto out_mutex; } credits += OCFS2_SUBALLOC_FREE; } handle = ocfs2_start_trans(osb, credits); if (IS_ERR(handle)) { ret = PTR_ERR(handle); mlog_errno(ret); goto out_unlock; } ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (ret) { mlog_errno(ret); goto out_commit; } ret = ocfs2_journal_access_rb(handle, &ref_tree->rf_ci, blk_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (ret) { mlog_errno(ret); goto out_commit; } spin_lock(&oi->ip_lock); oi->ip_dyn_features &= ~OCFS2_HAS_REFCOUNT_FL; di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features); di->i_refcount_loc = 0; spin_unlock(&oi->ip_lock); ocfs2_journal_dirty(handle, di_bh); le32_add_cpu(&rb->rf_count , -1); ocfs2_journal_dirty(handle, blk_bh); if (!rb->rf_count) { delete_tree = 1; ocfs2_erase_refcount_tree_from_list(osb, ref_tree); ret = ocfs2_free_suballoc_bits(handle, alloc_inode, alloc_bh, bit, bg_blkno, 1); if (ret) mlog_errno(ret); } out_commit: ocfs2_commit_trans(osb, handle); out_unlock: if (alloc_inode) { ocfs2_inode_unlock(alloc_inode, 1); brelse(alloc_bh); } out_mutex: if (alloc_inode) { inode_unlock(alloc_inode); iput(alloc_inode); } out: ocfs2_unlock_refcount_tree(osb, ref_tree, 1); if (delete_tree) ocfs2_refcount_tree_put(ref_tree); brelse(blk_bh); return ret; } static void ocfs2_find_refcount_rec_in_rl(struct ocfs2_caching_info *ci, struct buffer_head *ref_leaf_bh, u64 cpos, unsigned int len, struct ocfs2_refcount_rec *ret_rec, int *index) { int i = 0; 
struct ocfs2_refcount_block *rb = (struct ocfs2_refcount_block *)ref_leaf_bh->b_data; struct ocfs2_refcount_rec *rec = NULL; for (; i < le16_to_cpu(rb->rf_records.rl_used); i++) { rec = &rb->rf_records.rl_recs[i]; if (le64_to_cpu(rec->r_cpos) + le32_to_cpu(rec->r_clusters) <= cpos) continue; else if (le64_to_cpu(rec->r_cpos) > cpos) break; /* ok, cpos fail in this rec. Just return. */ if (ret_rec) *ret_rec = *rec; goto out; } if (ret_rec) { /* We meet with a hole here, so fake the rec. */ ret_rec->r_cpos = cpu_to_le64(cpos); ret_rec->r_refcount = 0; if (i < le16_to_cpu(rb->rf_records.rl_used) && le64_to_cpu(rec->r_cpos) < cpos + len) ret_rec->r_clusters = cpu_to_le32(le64_to_cpu(rec->r_cpos) - cpos); else ret_rec->r_clusters = cpu_to_le32(len); } out: *index = i; } /* * Try to remove refcount tree. The mechanism is: * 1) Check whether i_clusters == 0, if no, exit. * 2) check whether we have i_xattr_loc in dinode. if yes, exit. * 3) Check whether we have inline xattr stored outside, if yes, exit. * 4) Remove the tree. */ int ocfs2_try_remove_refcount_tree(struct inode *inode, struct buffer_head *di_bh) { int ret; struct ocfs2_inode_info *oi = OCFS2_I(inode); struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; down_write(&oi->ip_xattr_sem); down_write(&oi->ip_alloc_sem); if (oi->ip_clusters) goto out; if ((oi->ip_dyn_features & OCFS2_HAS_XATTR_FL) && di->i_xattr_loc) goto out; if (oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL && ocfs2_has_inline_xattr_value_outside(inode, di)) goto out; ret = ocfs2_remove_refcount_tree(inode, di_bh); if (ret) mlog_errno(ret); out: up_write(&oi->ip_alloc_sem); up_write(&oi->ip_xattr_sem); return 0; } /* * Find the end range for a leaf refcount block indicated by * el->l_recs[index].e_blkno. 
*/
/*
 * Three cases, tried in order:
 *  - a record follows @index in @el: its e_cpos is the end;
 *  - @eb is NULL or has no next leaf: the end is UINT_MAX;
 *  - otherwise the left and right paths are looked up and the end is read
 *    from the subtree root that joins them.
 *
 * Stores the result in *cpos_end and returns 0, or returns a negative errno.
 */
static int ocfs2_get_refcount_cpos_end(struct ocfs2_caching_info *ci,
				       struct buffer_head *ref_root_bh,
				       struct ocfs2_extent_block *eb,
				       struct ocfs2_extent_list *el,
				       int index, u32 *cpos_end)
{
	int ret, slot, subtree_root;
	u32 cpos;
	u64 child_blkno;
	struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
	struct ocfs2_path *left_path = NULL, *right_path = NULL;
	struct ocfs2_extent_tree et;
	struct ocfs2_extent_list *root_el;

	if (index < le16_to_cpu(el->l_next_free_rec) - 1) {
		/*
		 * We have an extent rec after index, so just use the e_cpos
		 * of the next extent rec.
		 */
		*cpos_end = le32_to_cpu(el->l_recs[index + 1].e_cpos);
		return 0;
	}

	if (!eb || !eb->h_next_leaf_blk) {
		/*
		 * We are the last extent rec, so any high cpos should
		 * be stored in this leaf refcount block.
		 */
		*cpos_end = UINT_MAX;
		return 0;
	}

	/*
	 * If the extent block isn't the last one, we have to find
	 * the subtree root between this extent block and the next
	 * leaf extent block and get the corresponding e_cpos from
	 * the subroot. Otherwise we may corrupt the b-tree.
	 */
	ocfs2_init_refcount_extent_tree(&et, ci, ref_root_bh);

	left_path = ocfs2_new_path_from_et(&et);
	if (!left_path) {
		ret = -ENOMEM;
		mlog_errno(ret);
		goto out;
	}

	cpos = le32_to_cpu(eb->h_list.l_recs[index].e_cpos);
	ret = ocfs2_find_path(ci, left_path, cpos);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	right_path = ocfs2_new_path_from_path(left_path);
	if (!right_path) {
		ret = -ENOMEM;
		mlog_errno(ret);
		goto out;
	}

	ret = ocfs2_find_cpos_for_right_leaf(sb, left_path, &cpos);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	ret = ocfs2_find_path(ci, right_path, cpos);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	subtree_root = ocfs2_find_subtree_root(&et, left_path,
					       right_path);

	/*
	 * Locate the left path's child inside the subtree root; the record
	 * right after it supplies the end cpos.
	 */
	root_el = left_path->p_node[subtree_root].el;
	child_blkno = left_path->p_node[subtree_root + 1].bh->b_blocknr;
	for (slot = 0; slot < le16_to_cpu(root_el->l_next_free_rec); slot++) {
		if (le64_to_cpu(root_el->l_recs[slot].e_blkno) == child_blkno) {
			*cpos_end = le32_to_cpu(root_el->l_recs[slot + 1].e_cpos);
			break;
		}
	}

	BUG_ON(slot == le16_to_cpu(root_el->l_next_free_rec));

out:
	ocfs2_free_path(left_path);
	ocfs2_free_path(right_path);
	return ret;
}

/*
 * Given a cpos and len, try to find the refcount record which contains cpos.
 * 1. If cpos can be found in one refcount record, return the record.
 * 2. If cpos can't be found, return a fake record which start from cpos
 *    and end at a small value between cpos+len and start of the next record.
 *    This fake record has r_refcount = 0.
*/ static int ocfs2_get_refcount_rec(struct ocfs2_caching_info *ci, struct buffer_head *ref_root_bh, u64 cpos, unsigned int len, struct ocfs2_refcount_rec *ret_rec, int *index, struct buffer_head **ret_bh) { int ret = 0, i, found; u32 low_cpos, cpos_end; struct ocfs2_extent_list *el; struct ocfs2_extent_rec *rec = NULL; struct ocfs2_extent_block *eb = NULL; struct buffer_head *eb_bh = NULL, *ref_leaf_bh = NULL; struct super_block *sb = ocfs2_metadata_cache_get_super(ci); struct ocfs2_refcount_block *rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data; if (!(le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL)) { ocfs2_find_refcount_rec_in_rl(ci, ref_root_bh, cpos, len, ret_rec, index); *ret_bh = ref_root_bh; get_bh(ref_root_bh); return 0; } el = &rb->rf_list; low_cpos = cpos & OCFS2_32BIT_POS_MASK; if (el->l_tree_depth) { ret = ocfs2_find_leaf(ci, el, low_cpos, &eb_bh); if (ret) { mlog_errno(ret); goto out; } eb = (struct ocfs2_extent_block *) eb_bh->b_data; el = &eb->h_list; if (el->l_tree_depth) { ret = ocfs2_error(sb, "refcount tree %llu has non zero tree depth in leaf btree tree block %llu\n", (unsigned long long)ocfs2_metadata_cache_owner(ci), (unsigned long long)eb_bh->b_blocknr); goto out; } } found = 0; for (i = le16_to_cpu(el->l_next_free_rec) - 1; i >= 0; i--) { rec = &el->l_recs[i]; if (le32_to_cpu(rec->e_cpos) <= low_cpos) { found = 1; break; } } if (found) { ret = ocfs2_get_refcount_cpos_end(ci, ref_root_bh, eb, el, i, &cpos_end); if (ret) { mlog_errno(ret); goto out; } if (cpos_end < low_cpos + len) len = cpos_end - low_cpos; } ret = ocfs2_read_refcount_block(ci, le64_to_cpu(rec->e_blkno), &ref_leaf_bh); if (ret) { mlog_errno(ret); goto out; } ocfs2_find_refcount_rec_in_rl(ci, ref_leaf_bh, cpos, len, ret_rec, index); *ret_bh = ref_leaf_bh; out: brelse(eb_bh); return ret; } enum ocfs2_ref_rec_contig { REF_CONTIG_NONE = 0, REF_CONTIG_LEFT, REF_CONTIG_RIGHT, REF_CONTIG_LEFTRIGHT, }; static enum ocfs2_ref_rec_contig ocfs2_refcount_rec_adjacent(struct 
ocfs2_refcount_block *rb, int index) { if ((rb->rf_records.rl_recs[index].r_refcount == rb->rf_records.rl_recs[index + 1].r_refcount) && (le64_to_cpu(rb->rf_records.rl_recs[index].r_cpos) + le32_to_cpu(rb->rf_records.rl_recs[index].r_clusters) == le64_to_cpu(rb->rf_records.rl_recs[index + 1].r_cpos))) return REF_CONTIG_RIGHT; return REF_CONTIG_NONE; } static enum ocfs2_ref_rec_contig ocfs2_refcount_rec_contig(struct ocfs2_refcount_block *rb, int index) { enum ocfs2_ref_rec_contig ret = REF_CONTIG_NONE; if (index < le16_to_cpu(rb->rf_records.rl_used) - 1) ret = ocfs2_refcount_rec_adjacent(rb, index); if (index > 0) { enum ocfs2_ref_rec_contig tmp; tmp = ocfs2_refcount_rec_adjacent(rb, index - 1); if (tmp == REF_CONTIG_RIGHT) { if (ret == REF_CONTIG_RIGHT) ret = REF_CONTIG_LEFTRIGHT; else ret = REF_CONTIG_LEFT; } } return ret; } static void ocfs2_rotate_refcount_rec_left(struct ocfs2_refcount_block *rb, int index) { BUG_ON(rb->rf_records.rl_recs[index].r_refcount != rb->rf_records.rl_recs[index+1].r_refcount); le32_add_cpu(&rb->rf_records.rl_recs[index].r_clusters, le32_to_cpu(rb->rf_records.rl_recs[index+1].r_clusters)); if (index < le16_to_cpu(rb->rf_records.rl_used) - 2) memmove(&rb->rf_records.rl_recs[index + 1], &rb->rf_records.rl_recs[index + 2], sizeof(struct ocfs2_refcount_rec) * (le16_to_cpu(rb->rf_records.rl_used) - index - 2)); memset(&rb->rf_records.rl_recs[le16_to_cpu(rb->rf_records.rl_used) - 1], 0, sizeof(struct ocfs2_refcount_rec)); le16_add_cpu(&rb->rf_records.rl_used, -1); } /* * Merge the refcount rec if we are contiguous with the adjacent recs. 
*/ static void ocfs2_refcount_rec_merge(struct ocfs2_refcount_block *rb, int index) { enum ocfs2_ref_rec_contig contig = ocfs2_refcount_rec_contig(rb, index); if (contig == REF_CONTIG_NONE) return; if (contig == REF_CONTIG_LEFT || contig == REF_CONTIG_LEFTRIGHT) { BUG_ON(index == 0); index--; } ocfs2_rotate_refcount_rec_left(rb, index); if (contig == REF_CONTIG_LEFTRIGHT) ocfs2_rotate_refcount_rec_left(rb, index); } /* * Change the refcount indexed by "index" in ref_bh. * If refcount reaches 0, remove it. */ static int ocfs2_change_refcount_rec(handle_t *handle, struct ocfs2_caching_info *ci, struct buffer_head *ref_leaf_bh, int index, int merge, int change) { int ret; struct ocfs2_refcount_block *rb = (struct ocfs2_refcount_block *)ref_leaf_bh->b_data; struct ocfs2_refcount_list *rl = &rb->rf_records; struct ocfs2_refcount_rec *rec = &rl->rl_recs[index]; ret = ocfs2_journal_access_rb(handle, ci, ref_leaf_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (ret) { mlog_errno(ret); goto out; } trace_ocfs2_change_refcount_rec( (unsigned long long)ocfs2_metadata_cache_owner(ci), index, le32_to_cpu(rec->r_refcount), change); le32_add_cpu(&rec->r_refcount, change); if (!rec->r_refcount) { if (index != le16_to_cpu(rl->rl_used) - 1) { memmove(rec, rec + 1, (le16_to_cpu(rl->rl_used) - index - 1) * sizeof(struct ocfs2_refcount_rec)); memset(&rl->rl_recs[le16_to_cpu(rl->rl_used) - 1], 0, sizeof(struct ocfs2_refcount_rec)); } le16_add_cpu(&rl->rl_used, -1); } else if (merge) ocfs2_refcount_rec_merge(rb, index); ocfs2_journal_dirty(handle, ref_leaf_bh); out: return ret; } static int ocfs2_expand_inline_ref_root(handle_t *handle, struct ocfs2_caching_info *ci, struct buffer_head *ref_root_bh, struct buffer_head **ref_leaf_bh, struct ocfs2_alloc_context *meta_ac) { int ret; u16 suballoc_bit_start; u32 num_got; u64 suballoc_loc, blkno; struct super_block *sb = ocfs2_metadata_cache_get_super(ci); struct buffer_head *new_bh = NULL; struct ocfs2_refcount_block *new_rb; struct ocfs2_refcount_block 
*root_rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data; ret = ocfs2_journal_access_rb(handle, ci, ref_root_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (ret) { mlog_errno(ret); goto out; } ret = ocfs2_claim_metadata(handle, meta_ac, 1, &suballoc_loc, &suballoc_bit_start, &num_got, &blkno); if (ret) { mlog_errno(ret); goto out; } new_bh = sb_getblk(sb, blkno); if (new_bh == NULL) { ret = -ENOMEM; mlog_errno(ret); goto out; } ocfs2_set_new_buffer_uptodate(ci, new_bh); ret = ocfs2_journal_access_rb(handle, ci, new_bh, OCFS2_JOURNAL_ACCESS_CREATE); if (ret) { mlog_errno(ret); goto out; } /* * Initialize ocfs2_refcount_block. * It should contain the same information as the old root. * so just memcpy it and change the corresponding field. */ memcpy(new_bh->b_data, ref_root_bh->b_data, sb->s_blocksize); new_rb = (struct ocfs2_refcount_block *)new_bh->b_data; new_rb->rf_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot); new_rb->rf_suballoc_loc = cpu_to_le64(suballoc_loc); new_rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start); new_rb->rf_blkno = cpu_to_le64(blkno); new_rb->rf_cpos = cpu_to_le32(0); new_rb->rf_parent = cpu_to_le64(ref_root_bh->b_blocknr); new_rb->rf_flags = cpu_to_le32(OCFS2_REFCOUNT_LEAF_FL); ocfs2_journal_dirty(handle, new_bh); /* Now change the root. 
*/ memset(&root_rb->rf_list, 0, sb->s_blocksize - offsetof(struct ocfs2_refcount_block, rf_list)); root_rb->rf_list.l_count = cpu_to_le16(ocfs2_extent_recs_per_rb(sb)); root_rb->rf_clusters = cpu_to_le32(1); root_rb->rf_list.l_next_free_rec = cpu_to_le16(1); root_rb->rf_list.l_recs[0].e_blkno = cpu_to_le64(blkno); root_rb->rf_list.l_recs[0].e_leaf_clusters = cpu_to_le16(1); root_rb->rf_flags = cpu_to_le32(OCFS2_REFCOUNT_TREE_FL); ocfs2_journal_dirty(handle, ref_root_bh); trace_ocfs2_expand_inline_ref_root((unsigned long long)blkno, le16_to_cpu(new_rb->rf_records.rl_used)); *ref_leaf_bh = new_bh; new_bh = NULL; out: brelse(new_bh); return ret; } static int ocfs2_refcount_rec_no_intersect(struct ocfs2_refcount_rec *prev, struct ocfs2_refcount_rec *next) { if (ocfs2_get_ref_rec_low_cpos(prev) + le32_to_cpu(prev->r_clusters) <= ocfs2_get_ref_rec_low_cpos(next)) return 1; return 0; } static int cmp_refcount_rec_by_low_cpos(const void *a, const void *b) { const struct ocfs2_refcount_rec *l = a, *r = b; u32 l_cpos = ocfs2_get_ref_rec_low_cpos(l); u32 r_cpos = ocfs2_get_ref_rec_low_cpos(r); if (l_cpos > r_cpos) return 1; if (l_cpos < r_cpos) return -1; return 0; } static int cmp_refcount_rec_by_cpos(const void *a, const void *b) { const struct ocfs2_refcount_rec *l = a, *r = b; u64 l_cpos = le64_to_cpu(l->r_cpos); u64 r_cpos = le64_to_cpu(r->r_cpos); if (l_cpos > r_cpos) return 1; if (l_cpos < r_cpos) return -1; return 0; } /* * The refcount cpos are ordered by their 64bit cpos, * But we will use the low 32 bit to be the e_cpos in the b-tree. * So we need to make sure that this pos isn't intersected with others. * * Note: The refcount block is already sorted by their low 32 bit cpos, * So just try the middle pos first, and we will exit when we find * the good position. 
*/ static int ocfs2_find_refcount_split_pos(struct ocfs2_refcount_list *rl, u32 *split_pos, int *split_index) { int num_used = le16_to_cpu(rl->rl_used); int delta, middle = num_used / 2; for (delta = 0; delta < middle; delta++) { /* Let's check delta earlier than middle */ if (ocfs2_refcount_rec_no_intersect( &rl->rl_recs[middle - delta - 1], &rl->rl_recs[middle - delta])) { *split_index = middle - delta; break; } /* For even counts, don't walk off the end */ if ((middle + delta + 1) == num_used) continue; /* Now try delta past middle */ if (ocfs2_refcount_rec_no_intersect( &rl->rl_recs[middle + delta], &rl->rl_recs[middle + delta + 1])) { *split_index = middle + delta + 1; break; } } if (delta >= middle) return -ENOSPC; *split_pos = ocfs2_get_ref_rec_low_cpos(&rl->rl_recs[*split_index]); return 0; } static int ocfs2_divide_leaf_refcount_block(struct buffer_head *ref_leaf_bh, struct buffer_head *new_bh, u32 *split_cpos) { int split_index = 0, num_moved, ret; u32 cpos = 0; struct ocfs2_refcount_block *rb = (struct ocfs2_refcount_block *)ref_leaf_bh->b_data; struct ocfs2_refcount_list *rl = &rb->rf_records; struct ocfs2_refcount_block *new_rb = (struct ocfs2_refcount_block *)new_bh->b_data; struct ocfs2_refcount_list *new_rl = &new_rb->rf_records; trace_ocfs2_divide_leaf_refcount_block( (unsigned long long)ref_leaf_bh->b_blocknr, le16_to_cpu(rl->rl_count), le16_to_cpu(rl->rl_used)); /* * XXX: Improvement later. * If we know all the high 32 bit cpos is the same, no need to sort. * * In order to make the whole process safe, we do: * 1. sort the entries by their low 32 bit cpos first so that we can * find the split cpos easily. * 2. call ocfs2_insert_extent to insert the new refcount block. * 3. move the refcount rec to the new block. * 4. sort the entries by their 64 bit cpos. * 5. dirty the new_rb and rb. 
*/ sort(&rl->rl_recs, le16_to_cpu(rl->rl_used), sizeof(struct ocfs2_refcount_rec), cmp_refcount_rec_by_low_cpos, NULL); ret = ocfs2_find_refcount_split_pos(rl, &cpos, &split_index); if (ret) { mlog_errno(ret); return ret; } new_rb->rf_cpos = cpu_to_le32(cpos); /* move refcount records starting from split_index to the new block. */ num_moved = le16_to_cpu(rl->rl_used) - split_index; memcpy(new_rl->rl_recs, &rl->rl_recs[split_index], num_moved * sizeof(struct ocfs2_refcount_rec)); /*ok, remove the entries we just moved over to the other block. */ memset(&rl->rl_recs[split_index], 0, num_moved * sizeof(struct ocfs2_refcount_rec)); /* change old and new rl_used accordingly. */ le16_add_cpu(&rl->rl_used, -num_moved); new_rl->rl_used = cpu_to_le16(num_moved); sort(&rl->rl_recs, le16_to_cpu(rl->rl_used), sizeof(struct ocfs2_refcount_rec), cmp_refcount_rec_by_cpos, NULL); sort(&new_rl->rl_recs, le16_to_cpu(new_rl->rl_used), sizeof(struct ocfs2_refcount_rec), cmp_refcount_rec_by_cpos, NULL); *split_cpos = cpos; return 0; } static int ocfs2_new_leaf_refcount_block(handle_t *handle, struct ocfs2_caching_info *ci, struct buffer_head *ref_root_bh, struct buffer_head *ref_leaf_bh, struct ocfs2_alloc_context *meta_ac) { int ret; u16 suballoc_bit_start; u32 num_got, new_cpos; u64 suballoc_loc, blkno; struct super_block *sb = ocfs2_metadata_cache_get_super(ci); struct ocfs2_refcount_block *root_rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data; struct buffer_head *new_bh = NULL; struct ocfs2_refcount_block *new_rb; struct ocfs2_extent_tree ref_et; BUG_ON(!(le32_to_cpu(root_rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL)); ret = ocfs2_journal_access_rb(handle, ci, ref_root_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (ret) { mlog_errno(ret); goto out; } ret = ocfs2_journal_access_rb(handle, ci, ref_leaf_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (ret) { mlog_errno(ret); goto out; } ret = ocfs2_claim_metadata(handle, meta_ac, 1, &suballoc_loc, &suballoc_bit_start, &num_got, &blkno); if (ret) { 
mlog_errno(ret); goto out; } new_bh = sb_getblk(sb, blkno); if (new_bh == NULL) { ret = -ENOMEM; mlog_errno(ret); goto out; } ocfs2_set_new_buffer_uptodate(ci, new_bh); ret = ocfs2_journal_access_rb(handle, ci, new_bh, OCFS2_JOURNAL_ACCESS_CREATE); if (ret) { mlog_errno(ret); goto out; } /* Initialize ocfs2_refcount_block. */ new_rb = (struct ocfs2_refcount_block *)new_bh->b_data; memset(new_rb, 0, sb->s_blocksize); strcpy((void *)new_rb, OCFS2_REFCOUNT_BLOCK_SIGNATURE); new_rb->rf_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot); new_rb->rf_suballoc_loc = cpu_to_le64(suballoc_loc); new_rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start); new_rb->rf_fs_generation = cpu_to_le32(OCFS2_SB(sb)->fs_generation); new_rb->rf_blkno = cpu_to_le64(blkno); new_rb->rf_parent = cpu_to_le64(ref_root_bh->b_blocknr); new_rb->rf_flags = cpu_to_le32(OCFS2_REFCOUNT_LEAF_FL); new_rb->rf_records.rl_count = cpu_to_le16(ocfs2_refcount_recs_per_rb(sb)); new_rb->rf_generation = root_rb->rf_generation; ret = ocfs2_divide_leaf_refcount_block(ref_leaf_bh, new_bh, &new_cpos); if (ret) { mlog_errno(ret); goto out; } ocfs2_journal_dirty(handle, ref_leaf_bh); ocfs2_journal_dirty(handle, new_bh); ocfs2_init_refcount_extent_tree(&ref_et, ci, ref_root_bh); trace_ocfs2_new_leaf_refcount_block( (unsigned long long)new_bh->b_blocknr, new_cpos); /* Insert the new leaf block with the specific offset cpos. */ ret = ocfs2_insert_extent(handle, &ref_et, new_cpos, new_bh->b_blocknr, 1, 0, meta_ac); if (ret) mlog_errno(ret); out: brelse(new_bh); return ret; } static int ocfs2_expand_refcount_tree(handle_t *handle, struct ocfs2_caching_info *ci, struct buffer_head *ref_root_bh, struct buffer_head *ref_leaf_bh, struct ocfs2_alloc_context *meta_ac) { int ret; struct buffer_head *expand_bh = NULL; if (ref_root_bh == ref_leaf_bh) { /* * the old root bh hasn't been expanded to a b-tree, * so expand it first. 
*/ ret = ocfs2_expand_inline_ref_root(handle, ci, ref_root_bh, &expand_bh, meta_ac); if (ret) { mlog_errno(ret); goto out; } } else { expand_bh = ref_leaf_bh; get_bh(expand_bh); } /* Now add a new refcount block into the tree.*/ ret = ocfs2_new_leaf_refcount_block(handle, ci, ref_root_bh, expand_bh, meta_ac); if (ret) mlog_errno(ret); out: brelse(expand_bh); return ret; } /* * Adjust the extent rec in b-tree representing ref_leaf_bh. * * Only called when we have inserted a new refcount rec at index 0 * which means ocfs2_extent_rec.e_cpos may need some change. */ static int ocfs2_adjust_refcount_rec(handle_t *handle, struct ocfs2_caching_info *ci, struct buffer_head *ref_root_bh, struct buffer_head *ref_leaf_bh, struct ocfs2_refcount_rec *rec) { int ret = 0, i; u32 new_cpos, old_cpos; struct ocfs2_path *path = NULL; struct ocfs2_extent_tree et; struct ocfs2_refcount_block *rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data; struct ocfs2_extent_list *el; if (!(le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL)) goto out; rb = (struct ocfs2_refcount_block *)ref_leaf_bh->b_data; old_cpos = le32_to_cpu(rb->rf_cpos); new_cpos = le64_to_cpu(rec->r_cpos) & OCFS2_32BIT_POS_MASK; if (old_cpos <= new_cpos) goto out; ocfs2_init_refcount_extent_tree(&et, ci, ref_root_bh); path = ocfs2_new_path_from_et(&et); if (!path) { ret = -ENOMEM; mlog_errno(ret); goto out; } ret = ocfs2_find_path(ci, path, old_cpos); if (ret) { mlog_errno(ret); goto out; } /* * 2 more credits, one for the leaf refcount block, one for * the extent block contains the extent rec. */ ret = ocfs2_extend_trans(handle, 2); if (ret < 0) { mlog_errno(ret); goto out; } ret = ocfs2_journal_access_rb(handle, ci, ref_leaf_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (ret < 0) { mlog_errno(ret); goto out; } ret = ocfs2_journal_access_eb(handle, ci, path_leaf_bh(path), OCFS2_JOURNAL_ACCESS_WRITE); if (ret < 0) { mlog_errno(ret); goto out; } /* change the leaf extent block first. 
*/ el = path_leaf_el(path); for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) if (le32_to_cpu(el->l_recs[i].e_cpos) == old_cpos) break; BUG_ON(i == le16_to_cpu(el->l_next_free_rec)); el->l_recs[i].e_cpos = cpu_to_le32(new_cpos); /* change the r_cpos in the leaf block. */ rb->rf_cpos = cpu_to_le32(new_cpos); ocfs2_journal_dirty(handle, path_leaf_bh(path)); ocfs2_journal_dirty(handle, ref_leaf_bh); out: ocfs2_free_path(path); return ret; } static int ocfs2_insert_refcount_rec(handle_t *handle, struct ocfs2_caching_info *ci, struct buffer_head *ref_root_bh, struct buffer_head *ref_leaf_bh, struct ocfs2_refcount_rec *rec, int index, int merge, struct ocfs2_alloc_context *meta_ac) { int ret; struct ocfs2_refcount_block *rb = (struct ocfs2_refcount_block *)ref_leaf_bh->b_data; struct ocfs2_refcount_list *rf_list = &rb->rf_records; struct buffer_head *new_bh = NULL; BUG_ON(le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL); if (rf_list->rl_used == rf_list->rl_count) { u64 cpos = le64_to_cpu(rec->r_cpos); u32 len = le32_to_cpu(rec->r_clusters); ret = ocfs2_expand_refcount_tree(handle, ci, ref_root_bh, ref_leaf_bh, meta_ac); if (ret) { mlog_errno(ret); goto out; } ret = ocfs2_get_refcount_rec(ci, ref_root_bh, cpos, len, NULL, &index, &new_bh); if (ret) { mlog_errno(ret); goto out; } ref_leaf_bh = new_bh; rb = (struct ocfs2_refcount_block *)ref_leaf_bh->b_data; rf_list = &rb->rf_records; } ret = ocfs2_journal_access_rb(handle, ci, ref_leaf_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (ret) { mlog_errno(ret); goto out; } if (index < le16_to_cpu(rf_list->rl_used)) memmove(&rf_list->rl_recs[index + 1], &rf_list->rl_recs[index], (le16_to_cpu(rf_list->rl_used) - index) * sizeof(struct ocfs2_refcount_rec)); trace_ocfs2_insert_refcount_rec( (unsigned long long)ref_leaf_bh->b_blocknr, index, (unsigned long long)le64_to_cpu(rec->r_cpos), le32_to_cpu(rec->r_clusters), le32_to_cpu(rec->r_refcount)); rf_list->rl_recs[index] = *rec; le16_add_cpu(&rf_list->rl_used, 1); if (merge) 
ocfs2_refcount_rec_merge(rb, index); ocfs2_journal_dirty(handle, ref_leaf_bh); if (index == 0) { ret = ocfs2_adjust_refcount_rec(handle, ci, ref_root_bh, ref_leaf_bh, rec); if (ret) mlog_errno(ret); } out: brelse(new_bh); return ret; } /* * Split the refcount_rec indexed by "index" in ref_leaf_bh. * This is much simple than our b-tree code. * split_rec is the new refcount rec we want to insert. * If split_rec->r_refcount > 0, we are changing the refcount(in case we * increase refcount or decrease a refcount to non-zero). * If split_rec->r_refcount == 0, we are punching a hole in current refcount * rec( in case we decrease a refcount to zero). */ static int ocfs2_split_refcount_rec(handle_t *handle, struct ocfs2_caching_info *ci, struct buffer_head *ref_root_bh, struct buffer_head *ref_leaf_bh, struct ocfs2_refcount_rec *split_rec, int index, int merge, struct ocfs2_alloc_context *meta_ac, struct ocfs2_cached_dealloc_ctxt *dealloc) { int ret, recs_need; u32 len; struct ocfs2_refcount_block *rb = (struct ocfs2_refcount_block *)ref_leaf_bh->b_data; struct ocfs2_refcount_list *rf_list = &rb->rf_records; struct ocfs2_refcount_rec *orig_rec = &rf_list->rl_recs[index]; struct ocfs2_refcount_rec *tail_rec = NULL; struct buffer_head *new_bh = NULL; BUG_ON(le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL); trace_ocfs2_split_refcount_rec(le64_to_cpu(orig_rec->r_cpos), le32_to_cpu(orig_rec->r_clusters), le32_to_cpu(orig_rec->r_refcount), le64_to_cpu(split_rec->r_cpos), le32_to_cpu(split_rec->r_clusters), le32_to_cpu(split_rec->r_refcount)); /* * If we just need to split the header or tail clusters, * no more recs are needed, just split is OK. * Otherwise we at least need one new recs. 
*/ if (!split_rec->r_refcount && (split_rec->r_cpos == orig_rec->r_cpos || le64_to_cpu(split_rec->r_cpos) + le32_to_cpu(split_rec->r_clusters) == le64_to_cpu(orig_rec->r_cpos) + le32_to_cpu(orig_rec->r_clusters))) recs_need = 0; else recs_need = 1; /* * We need one more rec if we split in the middle and the new rec have * some refcount in it. */ if (split_rec->r_refcount && (split_rec->r_cpos != orig_rec->r_cpos && le64_to_cpu(split_rec->r_cpos) + le32_to_cpu(split_rec->r_clusters) != le64_to_cpu(orig_rec->r_cpos) + le32_to_cpu(orig_rec->r_clusters))) recs_need++; /* If the leaf block don't have enough record, expand it. */ if (le16_to_cpu(rf_list->rl_used) + recs_need > le16_to_cpu(rf_list->rl_count)) { struct ocfs2_refcount_rec tmp_rec; u64 cpos = le64_to_cpu(orig_rec->r_cpos); len = le32_to_cpu(orig_rec->r_clusters); ret = ocfs2_expand_refcount_tree(handle, ci, ref_root_bh, ref_leaf_bh, meta_ac); if (ret) { mlog_errno(ret); goto out; } /* * We have to re-get it since now cpos may be moved to * another leaf block. */ ret = ocfs2_get_refcount_rec(ci, ref_root_bh, cpos, len, &tmp_rec, &index, &new_bh); if (ret) { mlog_errno(ret); goto out; } ref_leaf_bh = new_bh; rb = (struct ocfs2_refcount_block *)ref_leaf_bh->b_data; rf_list = &rb->rf_records; orig_rec = &rf_list->rl_recs[index]; } ret = ocfs2_journal_access_rb(handle, ci, ref_leaf_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (ret) { mlog_errno(ret); goto out; } /* * We have calculated out how many new records we need and store * in recs_need, so spare enough space first by moving the records * after "index" to the end. 
*/ if (index != le16_to_cpu(rf_list->rl_used) - 1) memmove(&rf_list->rl_recs[index + 1 + recs_need], &rf_list->rl_recs[index + 1], (le16_to_cpu(rf_list->rl_used) - index - 1) * sizeof(struct ocfs2_refcount_rec)); len = (le64_to_cpu(orig_rec->r_cpos) + le32_to_cpu(orig_rec->r_clusters)) - (le64_to_cpu(split_rec->r_cpos) + le32_to_cpu(split_rec->r_clusters)); /* * If we have "len", the we will split in the tail and move it * to the end of the space we have just spared. */ if (len) { tail_rec = &rf_list->rl_recs[index + recs_need]; memcpy(tail_rec, orig_rec, sizeof(struct ocfs2_refcount_rec)); le64_add_cpu(&tail_rec->r_cpos, le32_to_cpu(tail_rec->r_clusters) - len); tail_rec->r_clusters = cpu_to_le32(len); } /* * If the split pos isn't the same as the original one, we need to * split in the head. * * Note: We have the chance that split_rec.r_refcount = 0, * recs_need = 0 and len > 0, which means we just cut the head from * the orig_rec and in that case we have done some modification in * orig_rec above, so the check for r_cpos is faked. 
*/ if (split_rec->r_cpos != orig_rec->r_cpos && tail_rec != orig_rec) { len = le64_to_cpu(split_rec->r_cpos) - le64_to_cpu(orig_rec->r_cpos); orig_rec->r_clusters = cpu_to_le32(len); index++; } le16_add_cpu(&rf_list->rl_used, recs_need); if (split_rec->r_refcount) { rf_list->rl_recs[index] = *split_rec; trace_ocfs2_split_refcount_rec_insert( (unsigned long long)ref_leaf_bh->b_blocknr, index, (unsigned long long)le64_to_cpu(split_rec->r_cpos), le32_to_cpu(split_rec->r_clusters), le32_to_cpu(split_rec->r_refcount)); if (merge) ocfs2_refcount_rec_merge(rb, index); } ocfs2_journal_dirty(handle, ref_leaf_bh); out: brelse(new_bh); return ret; } static int __ocfs2_increase_refcount(handle_t *handle, struct ocfs2_caching_info *ci, struct buffer_head *ref_root_bh, u64 cpos, u32 len, int merge, struct ocfs2_alloc_context *meta_ac, struct ocfs2_cached_dealloc_ctxt *dealloc) { int ret = 0, index; struct buffer_head *ref_leaf_bh = NULL; struct ocfs2_refcount_rec rec; unsigned int set_len = 0; trace_ocfs2_increase_refcount_begin( (unsigned long long)ocfs2_metadata_cache_owner(ci), (unsigned long long)cpos, len); while (len) { ret = ocfs2_get_refcount_rec(ci, ref_root_bh, cpos, len, &rec, &index, &ref_leaf_bh); if (ret) { mlog_errno(ret); goto out; } set_len = le32_to_cpu(rec.r_clusters); /* * Here we may meet with 3 situations: * * 1. If we find an already existing record, and the length * is the same, cool, we just need to increase the r_refcount * and it is OK. * 2. If we find a hole, just insert it with r_refcount = 1. * 3. If we are in the middle of one extent record, split * it. 
*/ if (rec.r_refcount && le64_to_cpu(rec.r_cpos) == cpos && set_len <= len) { trace_ocfs2_increase_refcount_change( (unsigned long long)cpos, set_len, le32_to_cpu(rec.r_refcount)); ret = ocfs2_change_refcount_rec(handle, ci, ref_leaf_bh, index, merge, 1); if (ret) { mlog_errno(ret); goto out; } } else if (!rec.r_refcount) { rec.r_refcount = cpu_to_le32(1); trace_ocfs2_increase_refcount_insert( (unsigned long long)le64_to_cpu(rec.r_cpos), set_len); ret = ocfs2_insert_refcount_rec(handle, ci, ref_root_bh, ref_leaf_bh, &rec, index, merge, meta_ac); if (ret) { mlog_errno(ret); goto out; } } else { set_len = min((u64)(cpos + len), le64_to_cpu(rec.r_cpos) + set_len) - cpos; rec.r_cpos = cpu_to_le64(cpos); rec.r_clusters = cpu_to_le32(set_len); le32_add_cpu(&rec.r_refcount, 1); trace_ocfs2_increase_refcount_split( (unsigned long long)le64_to_cpu(rec.r_cpos), set_len, le32_to_cpu(rec.r_refcount)); ret = ocfs2_split_refcount_rec(handle, ci, ref_root_bh, ref_leaf_bh, &rec, index, merge, meta_ac, dealloc); if (ret) { mlog_errno(ret); goto out; } } cpos += set_len; len -= set_len; brelse(ref_leaf_bh); ref_leaf_bh = NULL; } out: brelse(ref_leaf_bh); return ret; } static int ocfs2_remove_refcount_extent(handle_t *handle, struct ocfs2_caching_info *ci, struct buffer_head *ref_root_bh, struct buffer_head *ref_leaf_bh, struct ocfs2_alloc_context *meta_ac, struct ocfs2_cached_dealloc_ctxt *dealloc) { int ret; struct super_block *sb = ocfs2_metadata_cache_get_super(ci); struct ocfs2_refcount_block *rb = (struct ocfs2_refcount_block *)ref_leaf_bh->b_data; struct ocfs2_extent_tree et; BUG_ON(rb->rf_records.rl_used); trace_ocfs2_remove_refcount_extent( (unsigned long long)ocfs2_metadata_cache_owner(ci), (unsigned long long)ref_leaf_bh->b_blocknr, le32_to_cpu(rb->rf_cpos)); ocfs2_init_refcount_extent_tree(&et, ci, ref_root_bh); ret = ocfs2_remove_extent(handle, &et, le32_to_cpu(rb->rf_cpos), 1, meta_ac, dealloc); if (ret) { mlog_errno(ret); goto out; } ocfs2_remove_from_cache(ci, 
ref_leaf_bh);

	/*
	 * add the freed block to the dealloc so that it will be freed
	 * when we run dealloc.
	 */
	ret = ocfs2_cache_block_dealloc(dealloc, EXTENT_ALLOC_SYSTEM_INODE,
					le16_to_cpu(rb->rf_suballoc_slot),
					le64_to_cpu(rb->rf_suballoc_loc),
					le64_to_cpu(rb->rf_blkno),
					le16_to_cpu(rb->rf_suballoc_bit));
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	ret = ocfs2_journal_access_rb(handle, ci, ref_root_bh,
				      OCFS2_JOURNAL_ACCESS_WRITE);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data;

	le32_add_cpu(&rb->rf_clusters, -1);

	/*
	 * check whether we need to restore the root refcount block if
	 * there is no leaf extent block at all.
	 */
	if (!rb->rf_list.l_next_free_rec) {
		BUG_ON(rb->rf_clusters);

		trace_ocfs2_restore_refcount_block(
		     (unsigned long long)ref_root_bh->b_blocknr);

		rb->rf_flags = 0;
		rb->rf_parent = 0;
		rb->rf_cpos = 0;
		memset(&rb->rf_records, 0, sb->s_blocksize -
		       offsetof(struct ocfs2_refcount_block, rf_records));
		rb->rf_records.rl_count =
				cpu_to_le16(ocfs2_refcount_recs_per_rb(sb));
	}

	ocfs2_journal_dirty(handle, ref_root_bh);

out:
	return ret;
}

/*
 * Public entry point for raising the refcount of @len clusters starting
 * at @cpos.  It forwards every argument unchanged to
 * __ocfs2_increase_refcount() and passes 1 as the sixth argument
 * (presumably the "merge" switch -- confirm against
 * __ocfs2_increase_refcount()).  Returns whatever that helper returns.
 */
int ocfs2_increase_refcount(handle_t *handle,
			    struct ocfs2_caching_info *ci,
			    struct buffer_head *ref_root_bh,
			    u64 cpos, u32 len,
			    struct ocfs2_alloc_context *meta_ac,
			    struct ocfs2_cached_dealloc_ctxt *dealloc)
{
	return __ocfs2_increase_refcount(handle, ci, ref_root_bh,
					 cpos, len, 1,
					 meta_ac, dealloc);
}

/*
 * Drop one reference from the clusters [cpos, cpos + len) that live in
 * the refcount record at @index of the leaf block @ref_leaf_bh.
 *
 * The range must lie completely inside that record; anything else is a
 * BUG.  When the range is the whole record, the record is adjusted in
 * place through ocfs2_change_refcount_rec().  Otherwise a copy of the
 * record, narrowed to the range and with r_refcount lowered by one, is
 * handed to ocfs2_split_refcount_rec().
 *
 * If the leaf ends up with no used records and it is not the root block,
 * it is removed with ocfs2_remove_refcount_extent().
 *
 * Returns 0 on success or the error reported by the failing helper.
 */
static int ocfs2_decrease_refcount_rec(handle_t *handle,
				struct ocfs2_caching_info *ci,
				struct buffer_head *ref_root_bh,
				struct buffer_head *ref_leaf_bh,
				int index, u64 cpos, unsigned int len,
				struct ocfs2_alloc_context *meta_ac,
				struct ocfs2_cached_dealloc_ctxt *dealloc)
{
	int ret;
	struct ocfs2_refcount_block *rb =
			(struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
	struct ocfs2_refcount_rec *rec = &rb->rf_records.rl_recs[index];

	/* The caller must hand us a range contained in this record. */
	BUG_ON(cpos < le64_to_cpu(rec->r_cpos));
	BUG_ON(cpos + len >
	       le64_to_cpu(rec->r_cpos) + le32_to_cpu(rec->r_clusters));

	trace_ocfs2_decrease_refcount_rec(
		(unsigned long long)ocfs2_metadata_cache_owner(ci),
		(unsigned long long)cpos, len);

	if (cpos == le64_to_cpu(rec->r_cpos) &&
	    len == le32_to_cpu(rec->r_clusters))
		/* Exact match: change the record's count by -1 in place. */
		ret = ocfs2_change_refcount_rec(handle, ci,
						ref_leaf_bh, index, 1, -1);
	else {
		/* Partial match: describe the sub-range and split it off. */
		struct ocfs2_refcount_rec split = *rec;
		split.r_cpos = cpu_to_le64(cpos);
		split.r_clusters = cpu_to_le32(len);

		le32_add_cpu(&split.r_refcount, -1);

		ret = ocfs2_split_refcount_rec(handle, ci,
					       ref_root_bh, ref_leaf_bh,
					       &split, index, 1,
					       meta_ac, dealloc);
	}

	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	/* Remove the leaf refcount block if it contains no refcount record. */
	if (!rb->rf_records.rl_used && ref_leaf_bh != ref_root_bh) {
		ret = ocfs2_remove_refcount_extent(handle, ci, ref_root_bh,
						   ref_leaf_bh, meta_ac,
						   dealloc);
		if (ret)
			mlog_errno(ret);
	}

out:
	return ret;
}

/*
 * Decrease the refcount of the clusters [cpos, cpos + len), one refcount
 * record at a time.
 *
 * Each iteration looks up the record covering @cpos, clamps the step to
 * the end of that record, and calls ocfs2_decrease_refcount_rec().
 * A record with refcount 0 is a BUG; when @delete is zero a record with
 * refcount greater than 1 is a BUG as well.
 *
 * When @delete is non-zero and the record's refcount was 1 before the
 * decrement (rec is a copy taken before the call), the clusters are
 * queued on @dealloc through ocfs2_cache_cluster_dealloc().
 *
 * Returns 0 on success or the first error met; the leaf buffer obtained
 * from the lookup is released on every path.
 */
static int __ocfs2_decrease_refcount(handle_t *handle,
				     struct ocfs2_caching_info *ci,
				     struct buffer_head *ref_root_bh,
				     u64 cpos, u32 len,
				     struct ocfs2_alloc_context *meta_ac,
				     struct ocfs2_cached_dealloc_ctxt *dealloc,
				     int delete)
{
	int ret = 0, index = 0;
	struct ocfs2_refcount_rec rec;
	unsigned int r_count = 0, r_len;
	struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
	struct buffer_head *ref_leaf_bh = NULL;

	trace_ocfs2_decrease_refcount(
		(unsigned long long)ocfs2_metadata_cache_owner(ci),
		(unsigned long long)cpos, len, delete);

	while (len) {
		ret = ocfs2_get_refcount_rec(ci, ref_root_bh,
					     cpos, len, &rec, &index,
					     &ref_leaf_bh);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		r_count = le32_to_cpu(rec.r_refcount);
		BUG_ON(r_count == 0);
		if (!delete)
			BUG_ON(r_count > 1);

		/* Step no further than the end of the record just found. */
		r_len = min((u64)(cpos + len), le64_to_cpu(rec.r_cpos) +
			      le32_to_cpu(rec.r_clusters)) - cpos;

		ret = ocfs2_decrease_refcount_rec(handle, ci, ref_root_bh,
						  ref_leaf_bh, index,
						  cpos, r_len,
						  meta_ac, dealloc);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		/* rec still holds the pre-decrement count. */
		if (le32_to_cpu(rec.r_refcount) == 1 && delete) {
			ret = ocfs2_cache_cluster_dealloc(dealloc,
					  ocfs2_clusters_to_blocks(sb, cpos),
							  r_len);
			if (ret) {
				mlog_errno(ret);
				goto out;
			}
		}

		cpos += r_len;
		len -= r_len;
		brelse(ref_leaf_bh);
		ref_leaf_bh = NULL;
	}

out:
	brelse(ref_leaf_bh);
	return ret;
}

/*
 * Decrease the refcount of @len clusters starting at @cpos for @inode.
 *
 * Resolves the inode's refcount block number, looks up the matching
 * refcount tree, reads its root block and delegates to
 * __ocfs2_decrease_refcount().  @inode must be a refcount inode (BUG
 * otherwise).  Returns 0 or the first error met.
 *
 * Caller must hold refcount tree lock.
 */
int ocfs2_decrease_refcount(struct inode *inode,
			    handle_t *handle, u32 cpos, u32 len,
			    struct ocfs2_alloc_context *meta_ac,
			    struct ocfs2_cached_dealloc_ctxt *dealloc,
			    int delete)
{
	int ret;
	u64 ref_blkno;
	struct buffer_head *ref_root_bh = NULL;
	struct ocfs2_refcount_tree *tree;

	BUG_ON(!ocfs2_is_refcount_inode(inode));

	ret = ocfs2_get_refcount_block(inode, &ref_blkno);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	ret = ocfs2_get_refcount_tree(OCFS2_SB(inode->i_sb), ref_blkno, &tree);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	ret = ocfs2_read_refcount_block(&tree->rf_ci, tree->rf_blkno,
					&ref_root_bh);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	ret = __ocfs2_decrease_refcount(handle, &tree->rf_ci, ref_root_bh,
					cpos, len, meta_ac, dealloc, delete);
	if (ret)
		mlog_errno(ret);
out:
	/* brelse() is reached with NULL when the root was never read. */
	brelse(ref_root_bh);
	return ret;
}

/*
 * Mark the already-existing extent at cpos as refcounted for len clusters.
 * This adds the refcount extent flag.
 *
 * If the existing extent is larger than the request, initiate a
 * split. An attempt will be made at merging with adjacent extents.
 *
 * The caller is responsible for passing down meta_ac if we'll need it.
*/ static int ocfs2_mark_extent_refcounted(struct inode *inode, struct ocfs2_extent_tree *et, handle_t *handle, u32 cpos, u32 len, u32 phys, struct ocfs2_alloc_context *meta_ac, struct ocfs2_cached_dealloc_ctxt *dealloc) { int ret; trace_ocfs2_mark_extent_refcounted(OCFS2_I(inode)->ip_blkno, cpos, len, phys); if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) { ret = ocfs2_error(inode->i_sb, "Inode %lu want to use refcount tree, but the feature bit is not set in the super block\n", inode->i_ino); goto out; } ret = ocfs2_change_extent_flag(handle, et, cpos, len, phys, meta_ac, dealloc, OCFS2_EXT_REFCOUNTED, 0); if (ret) mlog_errno(ret); out: return ret; } /* * Given some contiguous physical clusters, calculate what we need * for modifying their refcount. */ static int ocfs2_calc_refcount_meta_credits(struct super_block *sb, struct ocfs2_caching_info *ci, struct buffer_head *ref_root_bh, u64 start_cpos, u32 clusters, int *meta_add, int *credits) { int ret = 0, index, ref_blocks = 0, recs_add = 0; u64 cpos = start_cpos; struct ocfs2_refcount_block *rb; struct ocfs2_refcount_rec rec; struct buffer_head *ref_leaf_bh = NULL, *prev_bh = NULL; u32 len; while (clusters) { ret = ocfs2_get_refcount_rec(ci, ref_root_bh, cpos, clusters, &rec, &index, &ref_leaf_bh); if (ret) { mlog_errno(ret); goto out; } if (ref_leaf_bh != prev_bh) { /* * Now we encounter a new leaf block, so calculate * whether we need to extend the old leaf. 
*/ if (prev_bh) { rb = (struct ocfs2_refcount_block *) prev_bh->b_data; if (le16_to_cpu(rb->rf_records.rl_used) + recs_add > le16_to_cpu(rb->rf_records.rl_count)) ref_blocks++; } recs_add = 0; *credits += 1; brelse(prev_bh); prev_bh = ref_leaf_bh; get_bh(prev_bh); } trace_ocfs2_calc_refcount_meta_credits_iterate( recs_add, (unsigned long long)cpos, clusters, (unsigned long long)le64_to_cpu(rec.r_cpos), le32_to_cpu(rec.r_clusters), le32_to_cpu(rec.r_refcount), index); len = min((u64)cpos + clusters, le64_to_cpu(rec.r_cpos) + le32_to_cpu(rec.r_clusters)) - cpos; /* * We record all the records which will be inserted to the * same refcount block, so that we can tell exactly whether * we need a new refcount block or not. * * If we will insert a new one, this is easy and only happens * during adding refcounted flag to the extent, so we don't * have a chance of splitting. We just need one record. * * If the refcount rec already exists, that would be a little * complicated. we may have to: * 1) split at the beginning if the start pos isn't aligned. * we need 1 more record in this case. * 2) split int the end if the end pos isn't aligned. * we need 1 more record in this case. * 3) split in the middle because of file system fragmentation. * we need 2 more records in this case(we can't detect this * beforehand, so always think of the worst case). */ if (rec.r_refcount) { recs_add += 2; /* Check whether we need a split at the beginning. */ if (cpos == start_cpos && cpos != le64_to_cpu(rec.r_cpos)) recs_add++; /* Check whether we need a split in the end. 
*/ if (cpos + clusters < le64_to_cpu(rec.r_cpos) + le32_to_cpu(rec.r_clusters)) recs_add++; } else recs_add++; brelse(ref_leaf_bh); ref_leaf_bh = NULL; clusters -= len; cpos += len; } if (prev_bh) { rb = (struct ocfs2_refcount_block *)prev_bh->b_data; if (le16_to_cpu(rb->rf_records.rl_used) + recs_add > le16_to_cpu(rb->rf_records.rl_count)) ref_blocks++; *credits += 1; } if (!ref_blocks) goto out; *meta_add += ref_blocks; *credits += ref_blocks; /* * So we may need ref_blocks to insert into the tree. * That also means we need to change the b-tree and add that number * of records since we never merge them. * We need one more block for expansion since the new created leaf * block is also full and needs split. */ rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data; if (le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL) { struct ocfs2_extent_tree et; ocfs2_init_refcount_extent_tree(&et, ci, ref_root_bh); *meta_add += ocfs2_extend_meta_needed(et.et_root_el); *credits += ocfs2_calc_extend_credits(sb, et.et_root_el); } else { *credits += OCFS2_EXPAND_REFCOUNT_TREE_CREDITS; *meta_add += 1; } out: trace_ocfs2_calc_refcount_meta_credits( (unsigned long long)start_cpos, clusters, *meta_add, *credits); brelse(ref_leaf_bh); brelse(prev_bh); return ret; } /* * For refcount tree, we will decrease some contiguous clusters * refcount count, so just go through it to see how many blocks * we gonna touch and whether we need to create new blocks. * * Normally the refcount blocks store these refcount should be * contiguous also, so that we can get the number easily. * We will at most add split 2 refcount records and 2 more * refcount blocks, so just check it in a rough way. * * Caller must hold refcount tree lock. 
*/ int ocfs2_prepare_refcount_change_for_del(struct inode *inode, u64 refcount_loc, u64 phys_blkno, u32 clusters, int *credits, int *ref_blocks) { int ret; struct buffer_head *ref_root_bh = NULL; struct ocfs2_refcount_tree *tree; u64 start_cpos = ocfs2_blocks_to_clusters(inode->i_sb, phys_blkno); if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) { ret = ocfs2_error(inode->i_sb, "Inode %lu want to use refcount tree, but the feature bit is not set in the super block\n", inode->i_ino); goto out; } BUG_ON(!ocfs2_is_refcount_inode(inode)); ret = ocfs2_get_refcount_tree(OCFS2_SB(inode->i_sb), refcount_loc, &tree); if (ret) { mlog_errno(ret); goto out; } ret = ocfs2_read_refcount_block(&tree->rf_ci, refcount_loc, &ref_root_bh); if (ret) { mlog_errno(ret); goto out; } ret = ocfs2_calc_refcount_meta_credits(inode->i_sb, &tree->rf_ci, ref_root_bh, start_cpos, clusters, ref_blocks, credits); if (ret) { mlog_errno(ret); goto out; } trace_ocfs2_prepare_refcount_change_for_del(*ref_blocks, *credits); out: brelse(ref_root_bh); return ret; } #define MAX_CONTIG_BYTES 1048576 static inline unsigned int ocfs2_cow_contig_clusters(struct super_block *sb) { return ocfs2_clusters_for_bytes(sb, MAX_CONTIG_BYTES); } static inline unsigned int ocfs2_cow_contig_mask(struct super_block *sb) { return ~(ocfs2_cow_contig_clusters(sb) - 1); } /* * Given an extent that starts at 'start' and an I/O that starts at 'cpos', * find an offset (start + (n * contig_clusters)) that is closest to cpos * while still being less than or equal to it. * * The goal is to break the extent at a multiple of contig_clusters. */ static inline unsigned int ocfs2_cow_align_start(struct super_block *sb, unsigned int start, unsigned int cpos) { BUG_ON(start > cpos); return start + ((cpos - start) & ocfs2_cow_contig_mask(sb)); } /* * Given a cluster count of len, pad it out so that it is a multiple * of contig_clusters. 
*/ static inline unsigned int ocfs2_cow_align_length(struct super_block *sb, unsigned int len) { unsigned int padded = (len + (ocfs2_cow_contig_clusters(sb) - 1)) & ocfs2_cow_contig_mask(sb); /* Did we wrap? */ if (padded < len) padded = UINT_MAX; return padded; } /* * Calculate out the start and number of virtual clusters we need to CoW. * * cpos is virtual start cluster position we want to do CoW in a * file and write_len is the cluster length. * max_cpos is the place where we want to stop CoW intentionally. * * Normal we will start CoW from the beginning of extent record containing cpos. * We try to break up extents on boundaries of MAX_CONTIG_BYTES so that we * get good I/O from the resulting extent tree. */ static int ocfs2_refcount_cal_cow_clusters(struct inode *inode, struct ocfs2_extent_list *el, u32 cpos, u32 write_len, u32 max_cpos, u32 *cow_start, u32 *cow_len) { int ret = 0; int tree_height = le16_to_cpu(el->l_tree_depth), i; struct buffer_head *eb_bh = NULL; struct ocfs2_extent_block *eb = NULL; struct ocfs2_extent_rec *rec; unsigned int want_clusters, rec_end = 0; int contig_clusters = ocfs2_cow_contig_clusters(inode->i_sb); int leaf_clusters; BUG_ON(cpos + write_len > max_cpos); if (tree_height > 0) { ret = ocfs2_find_leaf(INODE_CACHE(inode), el, cpos, &eb_bh); if (ret) { mlog_errno(ret); goto out; } eb = (struct ocfs2_extent_block *) eb_bh->b_data; el = &eb->h_list; if (el->l_tree_depth) { ret = ocfs2_error(inode->i_sb, "Inode %lu has non zero tree depth in leaf block %llu\n", inode->i_ino, (unsigned long long)eb_bh->b_blocknr); goto out; } } *cow_len = 0; for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) { rec = &el->l_recs[i]; if (ocfs2_is_empty_extent(rec)) { mlog_bug_on_msg(i != 0, "Inode %lu has empty record in " "index %d\n", inode->i_ino, i); continue; } if (le32_to_cpu(rec->e_cpos) + le16_to_cpu(rec->e_leaf_clusters) <= cpos) continue; if (*cow_len == 0) { /* * We should find a refcounted record in the * first pass. 
*/ BUG_ON(!(rec->e_flags & OCFS2_EXT_REFCOUNTED)); *cow_start = le32_to_cpu(rec->e_cpos); } /* * If we encounter a hole, a non-refcounted record or * pass the max_cpos, stop the search. */ if ((!(rec->e_flags & OCFS2_EXT_REFCOUNTED)) || (*cow_len && rec_end != le32_to_cpu(rec->e_cpos)) || (max_cpos <= le32_to_cpu(rec->e_cpos))) break; leaf_clusters = le16_to_cpu(rec->e_leaf_clusters); rec_end = le32_to_cpu(rec->e_cpos) + leaf_clusters; if (rec_end > max_cpos) { rec_end = max_cpos; leaf_clusters = rec_end - le32_to_cpu(rec->e_cpos); } /* * How many clusters do we actually need from * this extent? First we see how many we actually * need to complete the write. If that's smaller * than contig_clusters, we try for contig_clusters. */ if (!*cow_len) want_clusters = write_len; else want_clusters = (cpos + write_len) - (*cow_start + *cow_len); if (want_clusters < contig_clusters) want_clusters = contig_clusters; /* * If the write does not cover the whole extent, we * need to calculate how we're going to split the extent. * We try to do it on contig_clusters boundaries. * * Any extent smaller than contig_clusters will be * CoWed in its entirety. */ if (leaf_clusters <= contig_clusters) *cow_len += leaf_clusters; else if (*cow_len || (*cow_start == cpos)) { /* * This extent needs to be CoW'd from its * beginning, so all we have to do is compute * how many clusters to grab. We align * want_clusters to the edge of contig_clusters * to get better I/O. */ want_clusters = ocfs2_cow_align_length(inode->i_sb, want_clusters); if (leaf_clusters < want_clusters) *cow_len += leaf_clusters; else *cow_len += want_clusters; } else if ((*cow_start + contig_clusters) >= (cpos + write_len)) { /* * Breaking off contig_clusters at the front * of the extent will cover our write. That's * easy. */ *cow_len = contig_clusters; } else if ((rec_end - cpos) <= contig_clusters) { /* * Breaking off contig_clusters at the tail of * this extent will cover cpos. 
*/ *cow_start = rec_end - contig_clusters; *cow_len = contig_clusters; } else if ((rec_end - cpos) <= want_clusters) { /* * While we can't fit the entire write in this * extent, we know that the write goes from cpos * to the end of the extent. Break that off. * We try to break it at some multiple of * contig_clusters from the front of the extent. * Failing that (ie, cpos is within * contig_clusters of the front), we'll CoW the * entire extent. */ *cow_start = ocfs2_cow_align_start(inode->i_sb, *cow_start, cpos); *cow_len = rec_end - *cow_start; } else { /* * Ok, the entire write lives in the middle of * this extent. Let's try to slice the extent up * nicely. Optimally, our CoW region starts at * m*contig_clusters from the beginning of the * extent and goes for n*contig_clusters, * covering the entire write. */ *cow_start = ocfs2_cow_align_start(inode->i_sb, *cow_start, cpos); want_clusters = (cpos + write_len) - *cow_start; want_clusters = ocfs2_cow_align_length(inode->i_sb, want_clusters); if (*cow_start + want_clusters <= rec_end) *cow_len = want_clusters; else *cow_len = rec_end - *cow_start; } /* Have we covered our entire write yet? */ if ((*cow_start + *cow_len) >= (cpos + write_len)) break; /* * If we reach the end of the extent block and don't get enough * clusters, continue with the next extent block if possible. */ if (i + 1 == le16_to_cpu(el->l_next_free_rec) && eb && eb->h_next_leaf_blk) { brelse(eb_bh); eb_bh = NULL; ret = ocfs2_read_extent_block(INODE_CACHE(inode), le64_to_cpu(eb->h_next_leaf_blk), &eb_bh); if (ret) { mlog_errno(ret); goto out; } eb = (struct ocfs2_extent_block *) eb_bh->b_data; el = &eb->h_list; i = -1; } } out: brelse(eb_bh); return ret; } /* * Prepare meta_ac, data_ac and calculate credits when we want to add some * num_clusters in data_tree "et" and change the refcount for the old * clusters(starting form p_cluster) in the refcount tree. * * Note: * 1. 
since we may split the old tree, so we at most will need num_clusters + 2 * more new leaf records. * 2. In some case, we may not need to reserve new clusters(e.g, reflink), so * just give data_ac = NULL. */ static int ocfs2_lock_refcount_allocators(struct super_block *sb, u32 p_cluster, u32 num_clusters, struct ocfs2_extent_tree *et, struct ocfs2_caching_info *ref_ci, struct buffer_head *ref_root_bh, struct ocfs2_alloc_context **meta_ac, struct ocfs2_alloc_context **data_ac, int *credits) { int ret = 0, meta_add = 0; int num_free_extents = ocfs2_num_free_extents(et); if (num_free_extents < 0) { ret = num_free_extents; mlog_errno(ret); goto out; } if (num_free_extents < num_clusters + 2) meta_add = ocfs2_extend_meta_needed(et->et_root_el); *credits += ocfs2_calc_extend_credits(sb, et->et_root_el); ret = ocfs2_calc_refcount_meta_credits(sb, ref_ci, ref_root_bh, p_cluster, num_clusters, &meta_add, credits); if (ret) { mlog_errno(ret); goto out; } trace_ocfs2_lock_refcount_allocators(meta_add, *credits); ret = ocfs2_reserve_new_metadata_blocks(OCFS2_SB(sb), meta_add, meta_ac); if (ret) { mlog_errno(ret); goto out; } if (data_ac) { ret = ocfs2_reserve_clusters(OCFS2_SB(sb), num_clusters, data_ac); if (ret) mlog_errno(ret); } out: if (ret) { if (*meta_ac) { ocfs2_free_alloc_context(*meta_ac); *meta_ac = NULL; } } return ret; } static int ocfs2_clear_cow_buffer(handle_t *handle, struct buffer_head *bh) { BUG_ON(buffer_dirty(bh)); clear_buffer_mapped(bh); return 0; } int ocfs2_duplicate_clusters_by_page(handle_t *handle, struct inode *inode, u32 cpos, u32 old_cluster, u32 new_cluster, u32 new_len) { int ret = 0, partial; struct super_block *sb = inode->i_sb; u64 new_block = ocfs2_clusters_to_blocks(sb, new_cluster); pgoff_t page_index; unsigned int from, to; loff_t offset, end, map_end; struct address_space *mapping = inode->i_mapping; trace_ocfs2_duplicate_clusters_by_page(cpos, old_cluster, new_cluster, new_len); offset = ((loff_t)cpos) << 
OCFS2_SB(sb)->s_clustersize_bits; end = offset + (new_len << OCFS2_SB(sb)->s_clustersize_bits); /* * We only duplicate pages until we reach the page contains i_size - 1. * So trim 'end' to i_size. */ if (end > i_size_read(inode)) end = i_size_read(inode); while (offset < end) { struct folio *folio; page_index = offset >> PAGE_SHIFT; map_end = ((loff_t)page_index + 1) << PAGE_SHIFT; if (map_end > end) map_end = end; /* from, to is the offset within the page. */ from = offset & (PAGE_SIZE - 1); to = PAGE_SIZE; if (map_end & (PAGE_SIZE - 1)) to = map_end & (PAGE_SIZE - 1); retry: folio = __filemap_get_folio(mapping, page_index, FGP_LOCK | FGP_ACCESSED | FGP_CREAT, GFP_NOFS); if (IS_ERR(folio)) { ret = PTR_ERR(folio); mlog_errno(ret); break; } /* * In case PAGE_SIZE <= CLUSTER_SIZE, we do not expect a dirty * page, so write it back. */ if (PAGE_SIZE <= OCFS2_SB(sb)->s_clustersize) { if (folio_test_dirty(folio)) { folio_unlock(folio); folio_put(folio); ret = filemap_write_and_wait_range(mapping, offset, map_end - 1); goto retry; } } if (!folio_test_uptodate(folio)) { ret = block_read_full_folio(folio, ocfs2_get_block); if (ret) { mlog_errno(ret); goto unlock; } folio_lock(folio); } if (folio_buffers(folio)) { ret = walk_page_buffers(handle, folio_buffers(folio), from, to, &partial, ocfs2_clear_cow_buffer); if (ret) { mlog_errno(ret); goto unlock; } } ocfs2_map_and_dirty_folio(inode, handle, from, to, folio, 0, &new_block); folio_mark_accessed(folio); unlock: folio_unlock(folio); folio_put(folio); offset = map_end; if (ret) break; } return ret; } int ocfs2_duplicate_clusters_by_jbd(handle_t *handle, struct inode *inode, u32 cpos, u32 old_cluster, u32 new_cluster, u32 new_len) { int ret = 0; struct super_block *sb = inode->i_sb; struct ocfs2_caching_info *ci = INODE_CACHE(inode); int i, blocks = ocfs2_clusters_to_blocks(sb, new_len); u64 old_block = ocfs2_clusters_to_blocks(sb, old_cluster); u64 new_block = ocfs2_clusters_to_blocks(sb, new_cluster); struct ocfs2_super 
*osb = OCFS2_SB(sb); struct buffer_head *old_bh = NULL; struct buffer_head *new_bh = NULL; trace_ocfs2_duplicate_clusters_by_page(cpos, old_cluster, new_cluster, new_len); for (i = 0; i < blocks; i++, old_block++, new_block++) { new_bh = sb_getblk(osb->sb, new_block); if (new_bh == NULL) { ret = -ENOMEM; mlog_errno(ret); break; } ocfs2_set_new_buffer_uptodate(ci, new_bh); ret = ocfs2_read_block(ci, old_block, &old_bh, NULL); if (ret) { mlog_errno(ret); break; } ret = ocfs2_journal_access(handle, ci, new_bh, OCFS2_JOURNAL_ACCESS_CREATE); if (ret) { mlog_errno(ret); break; } memcpy(new_bh->b_data, old_bh->b_data, sb->s_blocksize); ocfs2_journal_dirty(handle, new_bh); brelse(new_bh); brelse(old_bh); new_bh = NULL; old_bh = NULL; } brelse(new_bh); brelse(old_bh); return ret; } static int ocfs2_clear_ext_refcount(handle_t *handle, struct ocfs2_extent_tree *et, u32 cpos, u32 p_cluster, u32 len, unsigned int ext_flags, struct ocfs2_alloc_context *meta_ac, struct ocfs2_cached_dealloc_ctxt *dealloc) { int ret, index; struct ocfs2_extent_rec replace_rec; struct ocfs2_path *path = NULL; struct ocfs2_extent_list *el; struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci); u64 ino = ocfs2_metadata_cache_owner(et->et_ci); trace_ocfs2_clear_ext_refcount((unsigned long long)ino, cpos, len, p_cluster, ext_flags); memset(&replace_rec, 0, sizeof(replace_rec)); replace_rec.e_cpos = cpu_to_le32(cpos); replace_rec.e_leaf_clusters = cpu_to_le16(len); replace_rec.e_blkno = cpu_to_le64(ocfs2_clusters_to_blocks(sb, p_cluster)); replace_rec.e_flags = ext_flags; replace_rec.e_flags &= ~OCFS2_EXT_REFCOUNTED; path = ocfs2_new_path_from_et(et); if (!path) { ret = -ENOMEM; mlog_errno(ret); goto out; } ret = ocfs2_find_path(et->et_ci, path, cpos); if (ret) { mlog_errno(ret); goto out; } el = path_leaf_el(path); index = ocfs2_search_extent_list(el, cpos); if (index == -1) { ret = ocfs2_error(sb, "Inode %llu has an extent at cpos %u which can no longer be found\n", (unsigned long 
long)ino, cpos); goto out; } ret = ocfs2_split_extent(handle, et, path, index, &replace_rec, meta_ac, dealloc); if (ret) mlog_errno(ret); out: ocfs2_free_path(path); return ret; } static int ocfs2_replace_clusters(handle_t *handle, struct ocfs2_cow_context *context, u32 cpos, u32 old, u32 new, u32 len, unsigned int ext_flags) { int ret; struct ocfs2_caching_info *ci = context->data_et.et_ci; u64 ino = ocfs2_metadata_cache_owner(ci); trace_ocfs2_replace_clusters((unsigned long long)ino, cpos, old, new, len, ext_flags); /*If the old clusters is unwritten, no need to duplicate. */ if (!(ext_flags & OCFS2_EXT_UNWRITTEN)) { ret = context->cow_duplicate_clusters(handle, context->inode, cpos, old, new, len); if (ret) { mlog_errno(ret); goto out; } } ret = ocfs2_clear_ext_refcount(handle, &context->data_et, cpos, new, len, ext_flags, context->meta_ac, &context->dealloc); if (ret) mlog_errno(ret); out: return ret; } int ocfs2_cow_sync_writeback(struct super_block *sb, struct inode *inode, u32 cpos, u32 num_clusters) { int ret; loff_t start, end; if (ocfs2_should_order_data(inode)) return 0; start = ((loff_t)cpos) << OCFS2_SB(sb)->s_clustersize_bits; end = start + (num_clusters << OCFS2_SB(sb)->s_clustersize_bits) - 1; ret = filemap_write_and_wait_range(inode->i_mapping, start, end); if (ret < 0) mlog_errno(ret); return ret; } static int ocfs2_di_get_clusters(struct ocfs2_cow_context *context, u32 v_cluster, u32 *p_cluster, u32 *num_clusters, unsigned int *extent_flags) { return ocfs2_get_clusters(context->inode, v_cluster, p_cluster, num_clusters, extent_flags); } static int ocfs2_make_clusters_writable(struct super_block *sb, struct ocfs2_cow_context *context, u32 cpos, u32 p_cluster, u32 num_clusters, unsigned int e_flags) { int ret, delete, index, credits = 0; u32 new_bit, new_len, orig_num_clusters; unsigned int set_len; struct ocfs2_super *osb = OCFS2_SB(sb); handle_t *handle; struct buffer_head *ref_leaf_bh = NULL; struct ocfs2_caching_info *ref_ci = 
&context->ref_tree->rf_ci; struct ocfs2_refcount_rec rec; trace_ocfs2_make_clusters_writable(cpos, p_cluster, num_clusters, e_flags); ret = ocfs2_lock_refcount_allocators(sb, p_cluster, num_clusters, &context->data_et, ref_ci, context->ref_root_bh, &context->meta_ac, &context->data_ac, &credits); if (ret) { mlog_errno(ret); return ret; } if (context->post_refcount) credits += context->post_refcount->credits; credits += context->extra_credits; handle = ocfs2_start_trans(osb, credits); if (IS_ERR(handle)) { ret = PTR_ERR(handle); mlog_errno(ret); goto out; } orig_num_clusters = num_clusters; while (num_clusters) { ret = ocfs2_get_refcount_rec(ref_ci, context->ref_root_bh, p_cluster, num_clusters, &rec, &index, &ref_leaf_bh); if (ret) { mlog_errno(ret); goto out_commit; } BUG_ON(!rec.r_refcount); set_len = min((u64)p_cluster + num_clusters, le64_to_cpu(rec.r_cpos) + le32_to_cpu(rec.r_clusters)) - p_cluster; /* * There are many different situation here. * 1. If refcount == 1, remove the flag and don't COW. * 2. If refcount > 1, allocate clusters. * Here we may not allocate r_len once at a time, so continue * until we reach num_clusters. 
*/ if (le32_to_cpu(rec.r_refcount) == 1) { delete = 0; ret = ocfs2_clear_ext_refcount(handle, &context->data_et, cpos, p_cluster, set_len, e_flags, context->meta_ac, &context->dealloc); if (ret) { mlog_errno(ret); goto out_commit; } } else { delete = 1; ret = __ocfs2_claim_clusters(handle, context->data_ac, 1, set_len, &new_bit, &new_len); if (ret) { mlog_errno(ret); goto out_commit; } ret = ocfs2_replace_clusters(handle, context, cpos, p_cluster, new_bit, new_len, e_flags); if (ret) { mlog_errno(ret); goto out_commit; } set_len = new_len; } ret = __ocfs2_decrease_refcount(handle, ref_ci, context->ref_root_bh, p_cluster, set_len, context->meta_ac, &context->dealloc, delete); if (ret) { mlog_errno(ret); goto out_commit; } cpos += set_len; p_cluster += set_len; num_clusters -= set_len; brelse(ref_leaf_bh); ref_leaf_bh = NULL; } /* handle any post_cow action. */ if (context->post_refcount && context->post_refcount->func) { ret = context->post_refcount->func(context->inode, handle, context->post_refcount->para); if (ret) { mlog_errno(ret); goto out_commit; } } /* * Here we should write the new page out first if we are * in write-back mode. 
*/ if (context->get_clusters == ocfs2_di_get_clusters) { ret = ocfs2_cow_sync_writeback(sb, context->inode, cpos, orig_num_clusters); if (ret) mlog_errno(ret); } out_commit: ocfs2_commit_trans(osb, handle); out: if (context->data_ac) { ocfs2_free_alloc_context(context->data_ac); context->data_ac = NULL; } if (context->meta_ac) { ocfs2_free_alloc_context(context->meta_ac); context->meta_ac = NULL; } brelse(ref_leaf_bh); return ret; } static int ocfs2_replace_cow(struct ocfs2_cow_context *context) { int ret = 0; struct inode *inode = context->inode; u32 cow_start = context->cow_start, cow_len = context->cow_len; u32 p_cluster, num_clusters; unsigned int ext_flags; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); if (!ocfs2_refcount_tree(osb)) { return ocfs2_error(inode->i_sb, "Inode %lu want to use refcount tree, but the feature bit is not set in the super block\n", inode->i_ino); } ocfs2_init_dealloc_ctxt(&context->dealloc); while (cow_len) { ret = context->get_clusters(context, cow_start, &p_cluster, &num_clusters, &ext_flags); if (ret) { mlog_errno(ret); break; } BUG_ON(!(ext_flags & OCFS2_EXT_REFCOUNTED)); if (cow_len < num_clusters) num_clusters = cow_len; ret = ocfs2_make_clusters_writable(inode->i_sb, context, cow_start, p_cluster, num_clusters, ext_flags); if (ret) { mlog_errno(ret); break; } cow_len -= num_clusters; cow_start += num_clusters; } if (ocfs2_dealloc_has_cluster(&context->dealloc)) { ocfs2_schedule_truncate_log_flush(osb, 1); ocfs2_run_deallocs(osb, &context->dealloc); } return ret; } /* * Starting at cpos, try to CoW write_len clusters. Don't CoW * past max_cpos. This will stop when it runs into a hole or an * unrefcounted extent. 
*/
/*
 * Works out the region to CoW with ocfs2_refcount_cal_cow_clusters(),
 * takes the refcount tree lock, fills in a CoW context for inode data
 * (page-cache duplication, ocfs2_di_get_clusters) and runs
 * ocfs2_replace_cow().  The cached extent map is truncated from the CoW
 * start afterwards whether or not the replace succeeded.
 * Returns 0, -ENOMEM, or the error of the failing helper.
 */
static int ocfs2_refcount_cow_hunk(struct inode *inode,
				   struct buffer_head *di_bh,
				   u32 cpos, u32 write_len, u32 max_cpos)
{
	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
	struct ocfs2_refcount_tree *ref_tree;
	struct buffer_head *root_bh = NULL;
	struct ocfs2_cow_context *ctx;
	u32 first = 0, count = 0;
	int status;

	BUG_ON(!ocfs2_is_refcount_inode(inode));

	status = ocfs2_refcount_cal_cow_clusters(inode, &di->id2.i_list,
						 cpos, write_len, max_cpos,
						 &first, &count);
	if (status) {
		/* Nothing allocated yet. */
		mlog_errno(status);
		return status;
	}

	trace_ocfs2_refcount_cow_hunk(OCFS2_I(inode)->ip_blkno,
				      cpos, write_len, max_cpos,
				      first, count);

	BUG_ON(count == 0);

	ctx = kzalloc(sizeof(struct ocfs2_cow_context), GFP_NOFS);
	if (!ctx) {
		status = -ENOMEM;
		mlog_errno(status);
		return status;
	}

	status = ocfs2_lock_refcount_tree(osb,
					  le64_to_cpu(di->i_refcount_loc),
					  1, &ref_tree, &root_bh);
	if (status) {
		mlog_errno(status);
		goto free_ctx;
	}

	ctx->inode = inode;
	ctx->cow_start = first;
	ctx->cow_len = count;
	ctx->ref_tree = ref_tree;
	ctx->ref_root_bh = root_bh;
	ctx->cow_duplicate_clusters = ocfs2_duplicate_clusters_by_page;
	ctx->get_clusters = ocfs2_di_get_clusters;

	ocfs2_init_dinode_extent_tree(&ctx->data_et,
				      INODE_CACHE(inode), di_bh);

	status = ocfs2_replace_cow(ctx);
	if (status)
		mlog_errno(status);

	/*
	 * truncate the extent map here since no matter whether we meet with
	 * any error during the action, we shouldn't trust cached extent map
	 * any more.
	 */
	ocfs2_extent_map_trunc(inode, first);

	ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
	brelse(root_bh);

free_ctx:
	kfree(ctx);
	return status;
}

/*
 * CoW any and all clusters between cpos and cpos+write_len.
 * Don't CoW past max_cpos.  If this returns successfully, all
 * clusters between cpos and cpos+write_len are safe to modify.
*/
/*
 * Steps through the range one extent at a time; only extents flagged
 * OCFS2_EXT_REFCOUNTED are handed to ocfs2_refcount_cow_hunk().
 * Returns 0 or the first error met.
 */
int ocfs2_refcount_cow(struct inode *inode,
		       struct buffer_head *di_bh,
		       u32 cpos, u32 write_len, u32 max_cpos)
{
	unsigned int flags;
	u32 phys, span;
	int status;

	for (; write_len; write_len -= span, cpos += span) {
		status = ocfs2_get_clusters(inode, cpos, &phys,
					    &span, &flags);
		if (status) {
			mlog_errno(status);
			return status;
		}

		/* Never look past the end of the requested range. */
		if (span > write_len)
			span = write_len;

		if (!(flags & OCFS2_EXT_REFCOUNTED))
			continue;

		status = ocfs2_refcount_cow_hunk(inode, di_bh, cpos,
						 span, max_cpos);
		if (status) {
			mlog_errno(status);
			return status;
		}
	}

	return 0;
}

/*
 * get_clusters callback for xattr values: the context's cow_object is
 * the xattr value root whose extent list is searched.
 */
static int ocfs2_xattr_value_get_clusters(struct ocfs2_cow_context *context,
					  u32 v_cluster, u32 *p_cluster,
					  u32 *num_clusters,
					  unsigned int *extent_flags)
{
	struct ocfs2_xattr_value_root *value_root = context->cow_object;

	return ocfs2_xattr_get_clusters(context->inode, v_cluster, p_cluster,
					num_clusters, &value_root->xr_list,
					extent_flags);
}

/*
 * Given a xattr value root, calculate the most meta/credits we need for
 * refcount tree change if we truncate it to 0.
*/
/*
 * Walks every extent of @xv and, within each extent, every refcount
 * record covering it.  One credit is added per record visited; a leaf
 * that could overflow in the worst case adds one block to *meta_add.
 * When at least one new block may be needed, the credits for growing
 * the refcount root are added as well.  *meta_add and *credits are only
 * ever incremented.  Returns 0 or the first lookup error.
 */
int ocfs2_refcounted_xattr_delete_need(struct inode *inode,
				       struct ocfs2_caching_info *ref_ci,
				       struct buffer_head *ref_root_bh,
				       struct ocfs2_xattr_value_root *xv,
				       int *meta_add, int *credits)
{
	int ret = 0, index, ref_blocks = 0;
	u32 p_cluster, num_clusters;
	u32 cpos = 0, clusters = le32_to_cpu(xv->xr_clusters);
	struct ocfs2_refcount_block *rb;
	struct ocfs2_refcount_rec rec;
	struct buffer_head *ref_leaf_bh = NULL;
	unsigned long long rec_end;
	u32 len;

	while (cpos < clusters) {
		ret = ocfs2_xattr_get_clusters(inode, cpos, &p_cluster,
					       &num_clusters, &xv->xr_list,
					       NULL);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		cpos += num_clusters;

		while (num_clusters) {
			ret = ocfs2_get_refcount_rec(ref_ci, ref_root_bh,
						     p_cluster, num_clusters,
						     &rec, &index,
						     &ref_leaf_bh);
			if (ret) {
				mlog_errno(ret);
				goto out;
			}

			BUG_ON(!rec.r_refcount);

			rb = (struct ocfs2_refcount_block *)ref_leaf_bh->b_data;

			/*
			 * We really don't know whether the other clusters is in
			 * this refcount block or not, so just take the worst
			 * case that all the clusters are in this block and each
			 * one will split a refcount rec, so totally we need
			 * clusters * 2 new refcount rec.
			 */
			if (le16_to_cpu(rb->rf_records.rl_used) + clusters * 2 >
			    le16_to_cpu(rb->rf_records.rl_count))
				ref_blocks++;

			*credits += 1;
			brelse(ref_leaf_bh);
			ref_leaf_bh = NULL;

			/*
			 * Advance by the clusters this record actually
			 * covers from p_cluster on.  The record may start
			 * before p_cluster, so r_clusters alone is not the
			 * right step.
			 */
			rec_end = le64_to_cpu(rec.r_cpos) +
				  le32_to_cpu(rec.r_clusters);

			/* Record reaches the end of the extent: done. */
			if (rec_end >= (unsigned long long)p_cluster +
				       num_clusters)
				break;

			/*
			 * A record that does not extend past p_cluster
			 * would make no progress; stop rather than spin.
			 */
			if (rec_end <= p_cluster)
				break;

			len = rec_end - p_cluster;
			num_clusters -= len;
			p_cluster += len;
		}
	}

	*meta_add += ref_blocks;
	if (!ref_blocks)
		goto out;

	/*
	 * Same rule as ocfs2_calc_refcount_meta_credits(): a root that is
	 * already a tree (OCFS2_REFCOUNT_TREE_FL) is extended as an extent
	 * tree; a root that still holds records inline must be expanded
	 * first.
	 */
	rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data;
	if (le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL) {
		struct ocfs2_extent_tree et;

		ocfs2_init_refcount_extent_tree(&et, ref_ci, ref_root_bh);
		*credits += ocfs2_calc_extend_credits(inode->i_sb,
						      et.et_root_el);
	} else {
		*credits += OCFS2_EXPAND_REFCOUNT_TREE_CREDITS;
	}

out:
	brelse(ref_leaf_bh);
	return ret;
}

/*
 * Do CoW for xattr.
*/
int ocfs2_refcount_cow_xattr(struct inode *inode,
			     struct ocfs2_dinode *di,
			     struct ocfs2_xattr_value_buf *vb,
			     struct ocfs2_refcount_tree *ref_tree,
			     struct buffer_head *ref_root_bh,
			     u32 cpos, u32 write_len,
			     struct ocfs2_post_refcount *post)
{
	/* Note: @di is not referenced anywhere in this function body. */
	int ret;
	struct ocfs2_xattr_value_root *xv = vb->vb_xv;
	struct ocfs2_cow_context *context = NULL;
	u32 cow_start, cow_len;

	BUG_ON(!ocfs2_is_refcount_inode(inode));

	/* Fills cow_start/cow_len, which seed the CoW context below. */
	ret = ocfs2_refcount_cal_cow_clusters(inode, &xv->xr_list,
					      cpos, write_len, UINT_MAX,
					      &cow_start, &cow_len);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	BUG_ON(cow_len == 0);

	context = kzalloc(sizeof(struct ocfs2_cow_context), GFP_NOFS);
	if (!context) {
		ret = -ENOMEM;
		mlog_errno(ret);
		goto out;
	}

	context->inode = inode;
	context->cow_start = cow_start;
	context->cow_len = cow_len;
	context->ref_tree = ref_tree;
	context->ref_root_bh = ref_root_bh;
	context->cow_object = xv;

	context->cow_duplicate_clusters = ocfs2_duplicate_clusters_by_jbd;
	/* We need the extra credits for duplicate_clusters by jbd. */
	context->extra_credits =
		ocfs2_clusters_to_blocks(inode->i_sb, 1) * cow_len;
	context->get_clusters = ocfs2_xattr_value_get_clusters;
	context->post_refcount = post;

	ocfs2_init_xattr_value_extent_tree(&context->data_et,
					   INODE_CACHE(inode), vb);

	ret = ocfs2_replace_cow(context);
	if (ret)
		mlog_errno(ret);

out:
	/* context is NULL on the early-exit paths; kfree(NULL) is a no-op. */
	kfree(context);
	return ret;
}

/*
 * Insert a new extent into refcount tree and mark a extent rec
 * as refcounted in the dinode tree.
 *
 * If @post is non-NULL its credits are added to the transaction, and if
 * post->func is set it is called inside the same transaction after the
 * refcount has been increased.
 */
int ocfs2_add_refcount_flag(struct inode *inode,
			    struct ocfs2_extent_tree *data_et,
			    struct ocfs2_caching_info *ref_ci,
			    struct buffer_head *ref_root_bh,
			    u32 cpos, u32 p_cluster, u32 num_clusters,
			    struct ocfs2_cached_dealloc_ctxt *dealloc,
			    struct ocfs2_post_refcount *post)
{
	int ret;
	handle_t *handle;
	int credits = 1, ref_blocks = 0;
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
	struct ocfs2_alloc_context *meta_ac = NULL;

	/* We need to be able to handle at least an extent tree split. */
	ref_blocks = ocfs2_extend_meta_needed(data_et->et_root_el);

	/* Adds the refcount-tree requirements on top of ref_blocks/credits. */
	ret = ocfs2_calc_refcount_meta_credits(inode->i_sb,
					       ref_ci, ref_root_bh,
					       p_cluster, num_clusters,
					       &ref_blocks, &credits);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	trace_ocfs2_add_refcount_flag(ref_blocks, credits);

	if (ref_blocks) {
		ret = ocfs2_reserve_new_metadata_blocks(osb,
							ref_blocks, &meta_ac);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}
	}

	if (post)
		credits += post->credits;

	handle = ocfs2_start_trans(osb, credits);
	if (IS_ERR(handle)) {
		ret = PTR_ERR(handle);
		mlog_errno(ret);
		goto out;
	}

	ret = ocfs2_mark_extent_refcounted(inode, data_et, handle,
					   cpos, num_clusters, p_cluster,
					   meta_ac, dealloc);
	if (ret) {
		mlog_errno(ret);
		goto out_commit;
	}

	ret = __ocfs2_increase_refcount(handle, ref_ci, ref_root_bh,
					p_cluster, num_clusters, 0,
					meta_ac, dealloc);
	if (ret) {
		mlog_errno(ret);
		goto out_commit;
	}

	if (post && post->func) {
		ret = post->func(inode, handle, post->para);
		if (ret)
			mlog_errno(ret);
	}

out_commit:
	ocfs2_commit_trans(osb, handle);
out:
	if (meta_ac)
		ocfs2_free_alloc_context(meta_ac);
	return ret;
}

/*
 * Set the inode's ctime to the current time and copy it into the on-disk
 * dinode held in @di_bh, inside its own journal transaction.
 */
static int ocfs2_change_ctime(struct inode *inode,
			      struct buffer_head *di_bh)
{
	int ret;
	handle_t *handle;
	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;

	handle = ocfs2_start_trans(OCFS2_SB(inode->i_sb),
				   OCFS2_INODE_UPDATE_CREDITS);
	if (IS_ERR(handle)) {
		ret = PTR_ERR(handle);
		mlog_errno(ret);
		goto out;
	}

	ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
				      OCFS2_JOURNAL_ACCESS_WRITE);
	if (ret) {
		mlog_errno(ret);
		goto out_commit;
	}

	inode_set_ctime_current(inode);
	di->i_ctime = cpu_to_le64(inode_get_ctime_sec(inode));
	di->i_ctime_nsec = cpu_to_le32(inode_get_ctime_nsec(inode));

	ocfs2_journal_dirty(handle, di_bh);

out_commit:
	ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
out:
	return ret;
}

/*
 * Make sure @inode has a refcount tree (creating one if it is not yet a
 * refcount inode) and flag every allocated, not-yet-refcounted data extent
 * as refcounted.  Inline-data inodes skip the extent walk.  Xattrs are
 * attached as well when the inode has any, and ctime is bumped if any data
 * extent was changed.
 */
static int ocfs2_attach_refcount_tree(struct inode *inode,
				      struct buffer_head *di_bh)
{
	int ret, data_changed = 0;
	struct buffer_head *ref_root_bh = NULL;
	struct ocfs2_inode_info *oi = OCFS2_I(inode);
	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
	struct ocfs2_refcount_tree *ref_tree;
	unsigned int ext_flags;
	loff_t size;
	u32 cpos, num_clusters, clusters, p_cluster;
	struct ocfs2_cached_dealloc_ctxt dealloc;
	struct ocfs2_extent_tree di_et;

	ocfs2_init_dealloc_ctxt(&dealloc);

	if (!ocfs2_is_refcount_inode(inode)) {
		ret = ocfs2_create_refcount_tree(inode, di_bh);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}
	}

	BUG_ON(!di->i_refcount_loc);
	ret = ocfs2_lock_refcount_tree(osb,
				       le64_to_cpu(di->i_refcount_loc), 1,
				       &ref_tree, &ref_root_bh);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	/* Inline data has no extents to flag; go straight to the xattrs. */
	if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL)
		goto attach_xattr;

	ocfs2_init_dinode_extent_tree(&di_et, INODE_CACHE(inode), di_bh);

	size = i_size_read(inode);
	clusters = ocfs2_clusters_for_bytes(inode->i_sb, size);

	cpos = 0;
	while (cpos < clusters) {
		ret = ocfs2_get_clusters(inode, cpos, &p_cluster,
					 &num_clusters, &ext_flags);
		if (ret) {
			mlog_errno(ret);
			goto unlock;
		}
		/* p_cluster == 0 is skipped, as are already-flagged extents. */
		if (p_cluster && !(ext_flags & OCFS2_EXT_REFCOUNTED)) {
			ret = ocfs2_add_refcount_flag(inode, &di_et,
						      &ref_tree->rf_ci,
						      ref_root_bh, cpos,
						      p_cluster, num_clusters,
						      &dealloc, NULL);
			if (ret) {
				mlog_errno(ret);
				goto unlock;
			}

			data_changed = 1;
		}
		cpos += num_clusters;
	}

attach_xattr:
	if (oi->ip_dyn_features & OCFS2_HAS_XATTR_FL) {
		ret = ocfs2_xattr_attach_refcount_tree(inode, di_bh,
						       &ref_tree->rf_ci,
						       ref_root_bh,
						       &dealloc);
		if (ret) {
			mlog_errno(ret);
			goto unlock;
		}
	}

	if (data_changed) {
		ret = ocfs2_change_ctime(inode, di_bh);
		if (ret)
			mlog_errno(ret);
	}

unlock:
	ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
	brelse(ref_root_bh);

	/* Deferred deallocations are only run when everything succeeded. */
	if (!ret && ocfs2_dealloc_has_cluster(&dealloc)) {
		ocfs2_schedule_truncate_log_flush(osb, 1);
		ocfs2_run_deallocs(osb, &dealloc);
	}
out:
	/*
	 * Empty the extent map so that we may get the right extent
	 * record from the disk.
	 */
	ocfs2_extent_map_trunc(inode, 0);

	return ret;
}

/*
 * In one transaction: insert the extent (@cpos -> @p_cluster, @num_clusters
 * long, with @ext_flags) into @et, increase the refcount of those clusters
 * and charge the space to the inode's quota.
 */
static int ocfs2_add_refcounted_extent(struct inode *inode,
				   struct ocfs2_extent_tree *et,
				   struct ocfs2_caching_info *ref_ci,
				   struct buffer_head *ref_root_bh,
				   u32 cpos, u32 p_cluster, u32 num_clusters,
				   unsigned int ext_flags,
				   struct ocfs2_cached_dealloc_ctxt *dealloc)
{
	int ret;
	handle_t *handle;
	int credits = 0;
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
	struct ocfs2_alloc_context *meta_ac = NULL;

	/* Fills in meta_ac and credits for the transaction below. */
	ret = ocfs2_lock_refcount_allocators(inode->i_sb,
					     p_cluster, num_clusters,
					     et, ref_ci,
					     ref_root_bh, &meta_ac,
					     NULL, &credits);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	handle = ocfs2_start_trans(osb, credits);
	if (IS_ERR(handle)) {
		ret = PTR_ERR(handle);
		mlog_errno(ret);
		goto out;
	}

	ret = ocfs2_insert_extent(handle, et, cpos,
			ocfs2_clusters_to_blocks(inode->i_sb, p_cluster),
			num_clusters, ext_flags, meta_ac);
	if (ret) {
		mlog_errno(ret);
		goto out_commit;
	}

	ret = ocfs2_increase_refcount(handle, ref_ci, ref_root_bh,
				      p_cluster, num_clusters,
				      meta_ac, dealloc);
	if (ret) {
		mlog_errno(ret);
		goto out_commit;
	}

	ret = dquot_alloc_space_nodirty(inode,
		ocfs2_clusters_to_bytes(osb->sb, num_clusters));
	if (ret)
		mlog_errno(ret);

out_commit:
	ocfs2_commit_trans(osb, handle);
out:
	if (meta_ac)
		ocfs2_free_alloc_context(meta_ac);
	return ret;
}

/*
 * Copy the inline data area of the source dinode (@s_bh) into the target
 * dinode (@t_bh) and set OCFS2_INLINE_DATA_FL on the target, both in
 * memory and on disk.  The source must be an inline-data inode.
 */
static int ocfs2_duplicate_inline_data(struct inode *s_inode,
				       struct buffer_head *s_bh,
				       struct inode *t_inode,
				       struct buffer_head *t_bh)
{
	int ret;
	handle_t *handle;
	struct ocfs2_super *osb = OCFS2_SB(s_inode->i_sb);
	struct ocfs2_dinode *s_di = (struct ocfs2_dinode *)s_bh->b_data;
	struct ocfs2_dinode *t_di = (struct ocfs2_dinode *)t_bh->b_data;

	BUG_ON(!(OCFS2_I(s_inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL));

	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
	if (IS_ERR(handle)) {
		ret = PTR_ERR(handle);
		mlog_errno(ret);
		goto out;
	}

	ret = ocfs2_journal_access_di(handle, INODE_CACHE(t_inode), t_bh,
				      OCFS2_JOURNAL_ACCESS_WRITE);
	if (ret) {
		mlog_errno(ret);
		goto out_commit;
	}

	/* The copy length is the source's id_count. */
	t_di->id2.i_data.id_count = s_di->id2.i_data.id_count;
	memcpy(t_di->id2.i_data.id_data, s_di->id2.i_data.id_data,
	       le16_to_cpu(s_di->id2.i_data.id_count));

	spin_lock(&OCFS2_I(t_inode)->ip_lock);
	OCFS2_I(t_inode)->ip_dyn_features |= OCFS2_INLINE_DATA_FL;
	t_di->i_dyn_features = cpu_to_le16(OCFS2_I(t_inode)->ip_dyn_features);
	spin_unlock(&OCFS2_I(t_inode)->ip_lock);

	ocfs2_journal_dirty(handle, t_bh);

out_commit:
	ocfs2_commit_trans(osb, handle);
out:
	return ret;
}

/*
 * Walk the source inode's extents up to its i_size and add each one whose
 * physical cluster is non-zero to the target inode's extent tree as a
 * refcounted extent, keeping the source's extent flags.
 */
static int ocfs2_duplicate_extent_list(struct inode *s_inode,
				struct inode *t_inode,
				struct buffer_head *t_bh,
				struct ocfs2_caching_info *ref_ci,
				struct buffer_head *ref_root_bh,
				struct ocfs2_cached_dealloc_ctxt *dealloc)
{
	int ret = 0;
	u32 p_cluster, num_clusters, clusters, cpos;
	loff_t size;
	unsigned int ext_flags;
	struct ocfs2_extent_tree et;

	ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(t_inode), t_bh);

	size = i_size_read(s_inode);
	clusters = ocfs2_clusters_for_bytes(s_inode->i_sb, size);

	cpos = 0;
	while (cpos < clusters) {
		ret = ocfs2_get_clusters(s_inode, cpos, &p_cluster,
					 &num_clusters, &ext_flags);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}
		if (p_cluster) {
			ret = ocfs2_add_refcounted_extent(t_inode, &et,
							  ref_ci, ref_root_bh,
							  cpos, p_cluster,
							  num_clusters,
							  ext_flags,
							  dealloc);
			if (ret) {
				mlog_errno(ret);
				goto out;
			}
		}

		cpos += num_clusters;
	}

out:
	return ret;
}

/*
 * change the new file's attributes to the src.
 *
 * reflink creates a snapshot of a file, that means the attributes
 * must be identical except for three exceptions - nlink, ino, and ctime.
*/
static int ocfs2_complete_reflink(struct inode *s_inode,
				  struct buffer_head *s_bh,
				  struct inode *t_inode,
				  struct buffer_head *t_bh,
				  bool preserve)
{
	int ret;
	handle_t *handle;
	struct ocfs2_dinode *s_di = (struct ocfs2_dinode *)s_bh->b_data;
	struct ocfs2_dinode *di = (struct ocfs2_dinode *)t_bh->b_data;
	loff_t size = i_size_read(s_inode);

	handle = ocfs2_start_trans(OCFS2_SB(t_inode->i_sb),
				   OCFS2_INODE_UPDATE_CREDITS);
	if (IS_ERR(handle)) {
		ret = PTR_ERR(handle);
		mlog_errno(ret);
		return ret;
	}

	ret = ocfs2_journal_access_di(handle, INODE_CACHE(t_inode), t_bh,
				      OCFS2_JOURNAL_ACCESS_WRITE);
	if (ret) {
		mlog_errno(ret);
		goto out_commit;
	}

	/* In-memory ocfs2 fields are copied under the target's ip_lock. */
	spin_lock(&OCFS2_I(t_inode)->ip_lock);
	OCFS2_I(t_inode)->ip_clusters = OCFS2_I(s_inode)->ip_clusters;
	OCFS2_I(t_inode)->ip_attr = OCFS2_I(s_inode)->ip_attr;
	OCFS2_I(t_inode)->ip_dyn_features = OCFS2_I(s_inode)->ip_dyn_features;
	spin_unlock(&OCFS2_I(t_inode)->ip_lock);
	i_size_write(t_inode, size);
	t_inode->i_blocks = s_inode->i_blocks;

	/* On-disk fields that are copied regardless of @preserve. */
	di->i_xattr_inline_size = s_di->i_xattr_inline_size;
	di->i_clusters = s_di->i_clusters;
	di->i_size = s_di->i_size;
	di->i_dyn_features = s_di->i_dyn_features;
	di->i_attr = s_di->i_attr;

	/* Ownership, mode and timestamps are only touched when preserving. */
	if (preserve) {
		t_inode->i_uid = s_inode->i_uid;
		t_inode->i_gid = s_inode->i_gid;
		t_inode->i_mode = s_inode->i_mode;
		di->i_uid = s_di->i_uid;
		di->i_gid = s_di->i_gid;
		di->i_mode = s_di->i_mode;

		/*
		 * update time.
		 * we want mtime to appear identical to the source and
		 * update ctime.
		 */
		inode_set_ctime_current(t_inode);

		di->i_ctime = cpu_to_le64(inode_get_ctime_sec(t_inode));
		di->i_ctime_nsec = cpu_to_le32(inode_get_ctime_nsec(t_inode));

		inode_set_mtime_to_ts(t_inode, inode_get_mtime(s_inode));
		di->i_mtime = s_di->i_mtime;
		di->i_mtime_nsec = s_di->i_mtime_nsec;
	}

	ocfs2_journal_dirty(handle, t_bh);

out_commit:
	ocfs2_commit_trans(OCFS2_SB(t_inode->i_sb), handle);
	return ret;
}

/*
 * Point the target inode at the source's refcount tree, then duplicate the
 * source's data into it: an inline-data copy for inline sources, otherwise
 * the extent list is duplicated under the refcount tree lock.
 * Note: @preserve is not referenced in this function body.
 */
static int ocfs2_create_reflink_node(struct inode *s_inode,
				     struct buffer_head *s_bh,
				     struct inode *t_inode,
				     struct buffer_head *t_bh,
				     bool preserve)
{
	int ret;
	struct buffer_head *ref_root_bh = NULL;
	struct ocfs2_cached_dealloc_ctxt dealloc;
	struct ocfs2_super *osb = OCFS2_SB(s_inode->i_sb);
	struct ocfs2_dinode *di = (struct ocfs2_dinode *)s_bh->b_data;
	struct ocfs2_refcount_tree *ref_tree;

	ocfs2_init_dealloc_ctxt(&dealloc);

	ret = ocfs2_set_refcount_tree(t_inode, t_bh,
				      le64_to_cpu(di->i_refcount_loc));
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	/* Inline data: copy it and finish, success or not. */
	if (OCFS2_I(s_inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
		ret = ocfs2_duplicate_inline_data(s_inode, s_bh,
						  t_inode, t_bh);
		if (ret)
			mlog_errno(ret);
		goto out;
	}

	ret = ocfs2_lock_refcount_tree(osb, le64_to_cpu(di->i_refcount_loc),
				       1, &ref_tree, &ref_root_bh);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	ret = ocfs2_duplicate_extent_list(s_inode, t_inode, t_bh,
					  &ref_tree->rf_ci, ref_root_bh,
					  &dealloc);
	if (ret) {
		mlog_errno(ret);
		goto out_unlock_refcount;
	}

out_unlock_refcount:
	ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
	brelse(ref_root_bh);
out:
	/* Unlike ocfs2_attach_refcount_tree(), this runs on error too. */
	if (ocfs2_dealloc_has_cluster(&dealloc)) {
		ocfs2_schedule_truncate_log_flush(osb, 1);
		ocfs2_run_deallocs(osb, &dealloc);
	}

	return ret;
}

/*
 * Core of reflink: start writeback on the source, attach a refcount tree
 * to it, then (with the new inode locked) duplicate data, xattrs and
 * attributes into @new_inode.  System files are rejected with -EINVAL.
 * On success the function waits for the source's writeback to complete.
 */
static int __ocfs2_reflink(struct dentry *old_dentry,
			   struct buffer_head *old_bh,
			   struct inode *new_inode,
			   bool preserve)
{
	int ret;
	struct inode *inode = d_inode(old_dentry);
	struct buffer_head *new_bh = NULL;
	struct ocfs2_inode_info *oi = OCFS2_I(inode);

	if (oi->ip_flags & OCFS2_INODE_SYSTEM_FILE) {
		ret = -EINVAL;
		mlog_errno(ret);
		goto out;
	}

	ret = filemap_fdatawrite(inode->i_mapping);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	ret = ocfs2_attach_refcount_tree(inode, old_bh);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	inode_lock_nested(new_inode, I_MUTEX_CHILD);
	ret = ocfs2_inode_lock_nested(new_inode, &new_bh, 1,
				      OI_LS_REFLINK_TARGET);
	if (ret) {
		mlog_errno(ret);
		goto out_unlock;
	}

	if ((oi->ip_dyn_features & OCFS2_HAS_XATTR_FL) &&
	    (oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL)) {
		/*
		 * Adjust extent record count to reserve space for extended attribute.
		 * Inline data count had been adjusted in ocfs2_duplicate_inline_data().
		 */
		struct ocfs2_inode_info *new_oi = OCFS2_I(new_inode);

		if (!(new_oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) &&
		    !(ocfs2_inode_is_fast_symlink(new_inode))) {
			struct ocfs2_dinode *new_di = (struct ocfs2_dinode *)new_bh->b_data;
			struct ocfs2_dinode *old_di = (struct ocfs2_dinode *)old_bh->b_data;
			struct ocfs2_extent_list *el = &new_di->id2.i_list;
			int inline_size = le16_to_cpu(old_di->i_xattr_inline_size);

			/* Shrink l_count by the records the inline xattr area displaces. */
			le16_add_cpu(&el->l_count, -(inline_size /
					sizeof(struct ocfs2_extent_rec)));
		}
	}

	ret = ocfs2_create_reflink_node(inode, old_bh,
					new_inode, new_bh, preserve);
	if (ret) {
		mlog_errno(ret);
		goto inode_unlock;
	}

	if (oi->ip_dyn_features & OCFS2_HAS_XATTR_FL) {
		ret = ocfs2_reflink_xattrs(inode, old_bh,
					   new_inode, new_bh,
					   preserve);
		if (ret) {
			mlog_errno(ret);
			goto inode_unlock;
		}
	}

	ret = ocfs2_complete_reflink(inode, old_bh,
				     new_inode, new_bh, preserve);
	if (ret)
		mlog_errno(ret);

inode_unlock:
	ocfs2_inode_unlock(new_inode, 1);
	brelse(new_bh);
out_unlock:
	inode_unlock(new_inode);
out:
	if (!ret) {
		ret = filemap_fdatawait(inode->i_mapping);
		if (ret)
			mlog_errno(ret);
	}
	return ret;
}

/*
 * Reflink @old_dentry's inode to @new_dentry in @dir.  The new inode is
 * first created in the orphan directory, filled by __ocfs2_reflink() with
 * the source's rw lock, inode lock, ip_xattr_sem and ip_alloc_sem held,
 * and finally moved to its new name.  Returns -EOPNOTSUPP when the
 * filesystem does not support refcount trees.
 */
static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir,
			 struct dentry *new_dentry, bool preserve)
{
	int error, had_lock;
	struct inode *inode = d_inode(old_dentry);
	struct buffer_head *old_bh = NULL;
	struct inode *new_orphan_inode = NULL;
	struct ocfs2_lock_holder oh;

	if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb)))
		return -EOPNOTSUPP;

	error = ocfs2_create_inode_in_orphan(dir, inode->i_mode,
					     &new_orphan_inode);
	if (error) {
		mlog_errno(error);
		goto out;
	}

	error = ocfs2_rw_lock(inode, 1);
	if (error) {
		mlog_errno(error);
		goto out;
	}

	error = ocfs2_inode_lock(inode, &old_bh, 1);
	if (error) {
		mlog_errno(error);
		ocfs2_rw_unlock(inode, 1);
		goto out;
	}

	down_write(&OCFS2_I(inode)->ip_xattr_sem);
	down_write(&OCFS2_I(inode)->ip_alloc_sem);
	error = __ocfs2_reflink(old_dentry, old_bh,
				new_orphan_inode, preserve);
	up_write(&OCFS2_I(inode)->ip_alloc_sem);
	up_write(&OCFS2_I(inode)->ip_xattr_sem);

	ocfs2_inode_unlock(inode, 1);
	ocfs2_rw_unlock(inode, 1);
	brelse(old_bh);

	if (error) {
		mlog_errno(error);
		goto out;
	}

	had_lock = ocfs2_inode_lock_tracker(new_orphan_inode, NULL, 1,
					    &oh);
	if (had_lock < 0) {
		error = had_lock;
		mlog_errno(error);
		goto out;
	}

	/* If the security isn't preserved, we need to re-initialize them. */
	if (!preserve) {
		error = ocfs2_init_security_and_acl(dir, new_orphan_inode,
						    &new_dentry->d_name);
		if (error)
			mlog_errno(error);
	}
	if (!error) {
		error = ocfs2_mv_orphaned_inode_to_new(dir, new_orphan_inode,
						       new_dentry);
		if (error)
			mlog_errno(error);
	}
	ocfs2_inode_unlock_tracker(new_orphan_inode, 1, &oh, had_lock);

out:
	if (new_orphan_inode) {
		/*
		 * We need to open_unlock the inode no matter whether we
		 * succeed or not, so that other nodes can delete it later.
		 */
		ocfs2_open_unlock(new_orphan_inode);
		if (error)
			iput(new_orphan_inode);
	}

	return error;
}

/*
 * Below here are the bits used by OCFS2_IOC_REFLINK() to fake
 * sys_reflink().  This will go away when vfs_reflink() exists in
 * fs/namei.c.
 */

/* copied from may_create in VFS.
*/
static inline int ocfs2_may_create(struct inode *dir, struct dentry *child)
{
	int err;

	if (d_really_is_positive(child))
		err = -EEXIST;
	else if (IS_DEADDIR(dir))
		err = -ENOENT;
	else
		err = inode_permission(&nop_mnt_idmap, dir,
				       MAY_WRITE | MAY_EXEC);

	return err;
}

/**
 * ocfs2_vfs_reflink - Create a reference-counted link
 *
 * @old_dentry:        source dentry + inode
 * @dir:       directory to create the target
 * @new_dentry:        target dentry
 * @preserve:  if true, preserve all file attributes
 */
static int ocfs2_vfs_reflink(struct dentry *old_dentry, struct inode *dir,
			     struct dentry *new_dentry, bool preserve)
{
	struct inode *src = d_inode(old_dentry);
	int err;

	if (!src)
		return -ENOENT;

	err = ocfs2_may_create(dir, new_dentry);
	if (err)
		return err;

	/* Source and target directory must live on the same superblock. */
	if (dir->i_sb != src->i_sb)
		return -EXDEV;

	/*
	 * Append-only and immutable files cannot be reflinked, and neither
	 * can anything that is not a regular file.
	 */
	if (IS_APPEND(src) || IS_IMMUTABLE(src))
		return -EPERM;
	if (!S_ISREG(src->i_mode))
		return -EPERM;

	if (preserve) {
		/*
		 * Preserving ownership requires the rights to do so:
		 * matching fsuid / group membership, or CAP_CHOWN.
		 */
		if (!uid_eq(current_fsuid(), src->i_uid) &&
		    !capable(CAP_CHOWN))
			return -EPERM;
		if (!in_group_p(src->i_gid) && !capable(CAP_CHOWN))
			return -EPERM;
	} else {
		/*
		 * Changing any attribute means this is not a snapshot, so
		 * the caller needs read permission on the source.
		 */
		err = inode_permission(&nop_mnt_idmap, src, MAY_READ);
		if (err)
			return err;
	}

	inode_lock(src);
	err = dquot_initialize(dir);
	if (err == 0)
		err = ocfs2_reflink(old_dentry, dir, new_dentry, preserve);
	inode_unlock(src);

	if (err == 0)
		fsnotify_create(dir, new_dentry);

	return err;
}

/*
 * Most codes are copied from sys_linkat.
*/
int ocfs2_reflink_ioctl(struct inode *inode,
			const char __user *oldname,
			const char __user *newname,
			bool preserve)
{
	struct path src_path, dst_path;
	struct dentry *target;
	int err;

	if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb)))
		return -EOPNOTSUPP;

	err = user_path_at(AT_FDCWD, oldname, 0, &src_path);
	if (err) {
		mlog_errno(err);
		return err;
	}

	target = user_path_create(AT_FDCWD, newname, &dst_path, 0);
	if (IS_ERR(target)) {
		err = PTR_ERR(target);
		mlog_errno(err);
		goto put_src;
	}

	/* Both names have to resolve to the same mount. */
	if (src_path.mnt != dst_path.mnt) {
		err = -EXDEV;
		mlog_errno(err);
	} else {
		err = ocfs2_vfs_reflink(src_path.dentry,
					d_inode(dst_path.dentry),
					target, preserve);
	}

	done_path_create(&dst_path, target);
put_src:
	path_put(&src_path);

	return err;
}

/*
 * Update destination inode size, if necessary.
 *
 * i_blocks is always refreshed.  When @newlen is larger than the current
 * i_size, the size is extended, mtime/ctime are set to the current time
 * and the inode is marked dirty within a transaction.  Returns 0 or a
 * negative errno.
 */
int ocfs2_reflink_update_dest(struct inode *dest,
			      struct buffer_head *d_bh,
			      loff_t newlen)
{
	struct ocfs2_super *osb;
	handle_t *trans;
	int err;

	dest->i_blocks = ocfs2_inode_sector_count(dest);

	/* Nothing more to do unless the file grows. */
	if (i_size_read(dest) >= newlen)
		return 0;

	osb = OCFS2_SB(dest->i_sb);
	trans = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
	if (IS_ERR(trans)) {
		err = PTR_ERR(trans);
		mlog_errno(err);
		return err;
	}

	/* Re-check under ip_lock before extending i_size. */
	spin_lock(&OCFS2_I(dest)->ip_lock);
	if (i_size_read(dest) < newlen)
		i_size_write(dest, newlen);
	spin_unlock(&OCFS2_I(dest)->ip_lock);

	inode_set_mtime_to_ts(dest, inode_set_ctime_current(dest));

	err = ocfs2_mark_inode_dirty(trans, dest, d_bh);
	if (err)
		mlog_errno(err);

	ocfs2_commit_trans(osb, trans);
	return err;
}

/* Remap the range pos_in:len in s_inode to pos_out:len in t_inode.
*/ static loff_t ocfs2_reflink_remap_extent(struct inode *s_inode, struct buffer_head *s_bh, loff_t pos_in, struct inode *t_inode, struct buffer_head *t_bh, loff_t pos_out, loff_t len, struct ocfs2_cached_dealloc_ctxt *dealloc) { struct ocfs2_extent_tree s_et; struct ocfs2_extent_tree t_et; struct ocfs2_dinode *dis; struct buffer_head *ref_root_bh = NULL; struct ocfs2_refcount_tree *ref_tree; struct ocfs2_super *osb; loff_t remapped_bytes = 0; loff_t pstart, plen; u32 p_cluster, num_clusters, slast, spos, tpos, remapped_clus = 0; unsigned int ext_flags; int ret = 0; osb = OCFS2_SB(s_inode->i_sb); dis = (struct ocfs2_dinode *)s_bh->b_data; ocfs2_init_dinode_extent_tree(&s_et, INODE_CACHE(s_inode), s_bh); ocfs2_init_dinode_extent_tree(&t_et, INODE_CACHE(t_inode), t_bh); spos = ocfs2_bytes_to_clusters(s_inode->i_sb, pos_in); tpos = ocfs2_bytes_to_clusters(t_inode->i_sb, pos_out); slast = ocfs2_clusters_for_bytes(s_inode->i_sb, pos_in + len); while (spos < slast) { if (fatal_signal_pending(current)) { ret = -EINTR; goto out; } /* Look up the extent. */ ret = ocfs2_get_clusters(s_inode, spos, &p_cluster, &num_clusters, &ext_flags); if (ret) { mlog_errno(ret); goto out; } num_clusters = min_t(u32, num_clusters, slast - spos); /* Punch out the dest range. */ pstart = ocfs2_clusters_to_bytes(t_inode->i_sb, tpos); plen = ocfs2_clusters_to_bytes(t_inode->i_sb, num_clusters); ret = ocfs2_remove_inode_range(t_inode, t_bh, pstart, plen); if (ret) { mlog_errno(ret); goto out; } if (p_cluster == 0) goto next_loop; /* Lock the refcount btree... */ ret = ocfs2_lock_refcount_tree(osb, le64_to_cpu(dis->i_refcount_loc), 1, &ref_tree, &ref_root_bh); if (ret) { mlog_errno(ret); goto out; } /* Mark s_inode's extent as refcounted. */ if (!(ext_flags & OCFS2_EXT_REFCOUNTED)) { ret = ocfs2_add_refcount_flag(s_inode, &s_et, &ref_tree->rf_ci, ref_root_bh, spos, p_cluster, num_clusters, dealloc, NULL); if (ret) { mlog_errno(ret); goto out_unlock_refcount; } } /* Map in the new extent. 
*/ ext_flags |= OCFS2_EXT_REFCOUNTED; ret = ocfs2_add_refcounted_extent(t_inode, &t_et, &ref_tree->rf_ci, ref_root_bh, tpos, p_cluster, num_clusters, ext_flags, dealloc); if (ret) { mlog_errno(ret); goto out_unlock_refcount; } ocfs2_unlock_refcount_tree(osb, ref_tree, 1); brelse(ref_root_bh); next_loop: spos += num_clusters; tpos += num_clusters; remapped_clus += num_clusters; } goto out; out_unlock_refcount: ocfs2_unlock_refcount_tree(osb, ref_tree, 1); brelse(ref_root_bh); out: remapped_bytes = ocfs2_clusters_to_bytes(t_inode->i_sb, remapped_clus); remapped_bytes = min_t(loff_t, len, remapped_bytes); return remapped_bytes > 0 ? remapped_bytes : ret; } /* Set up refcount tree and remap s_inode to t_inode. */ loff_t ocfs2_reflink_remap_blocks(struct inode *s_inode, struct buffer_head *s_bh, loff_t pos_in, struct inode *t_inode, struct buffer_head *t_bh, loff_t pos_out, loff_t len) { struct ocfs2_cached_dealloc_ctxt dealloc; struct ocfs2_super *osb; struct ocfs2_dinode *dis; struct ocfs2_dinode *dit; loff_t ret; osb = OCFS2_SB(s_inode->i_sb); dis = (struct ocfs2_dinode *)s_bh->b_data; dit = (struct ocfs2_dinode *)t_bh->b_data; ocfs2_init_dealloc_ctxt(&dealloc); /* * If we're reflinking the entire file and the source is inline * data, just copy the contents. */ if (pos_in == pos_out && pos_in == 0 && len == i_size_read(s_inode) && i_size_read(t_inode) <= len && (OCFS2_I(s_inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)) { ret = ocfs2_duplicate_inline_data(s_inode, s_bh, t_inode, t_bh); if (ret) mlog_errno(ret); goto out; } /* * If both inodes belong to two different refcount groups then * forget it because we don't know how (or want) to go merging * refcount trees. */ ret = -EOPNOTSUPP; if (ocfs2_is_refcount_inode(s_inode) && ocfs2_is_refcount_inode(t_inode) && le64_to_cpu(dis->i_refcount_loc) != le64_to_cpu(dit->i_refcount_loc)) goto out; /* Neither inode has a refcount tree. Add one to s_inode. 
*/ if (!ocfs2_is_refcount_inode(s_inode) && !ocfs2_is_refcount_inode(t_inode)) { ret = ocfs2_create_refcount_tree(s_inode, s_bh); if (ret) { mlog_errno(ret); goto out; } } /* Ensure that both inodes end up with the same refcount tree. */ if (!ocfs2_is_refcount_inode(s_inode)) { ret = ocfs2_set_refcount_tree(s_inode, s_bh, le64_to_cpu(dit->i_refcount_loc)); if (ret) { mlog_errno(ret); goto out; } } if (!ocfs2_is_refcount_inode(t_inode)) { ret = ocfs2_set_refcount_tree(t_inode, t_bh, le64_to_cpu(dis->i_refcount_loc)); if (ret) { mlog_errno(ret); goto out; } } /* Turn off inline data in the dest file. */ if (OCFS2_I(t_inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { ret = ocfs2_convert_inline_data_to_extents(t_inode, t_bh); if (ret) { mlog_errno(ret); goto out; } } /* Actually remap extents now. */ ret = ocfs2_reflink_remap_extent(s_inode, s_bh, pos_in, t_inode, t_bh, pos_out, len, &dealloc); if (ret < 0) { mlog_errno(ret); goto out; } out: if (ocfs2_dealloc_has_cluster(&dealloc)) { ocfs2_schedule_truncate_log_flush(osb, 1); ocfs2_run_deallocs(osb, &dealloc); } return ret; } /* Lock an inode and grab a bh pointing to the inode. */ int ocfs2_reflink_inodes_lock(struct inode *s_inode, struct buffer_head **bh_s, struct inode *t_inode, struct buffer_head **bh_t) { struct inode *inode1 = s_inode; struct inode *inode2 = t_inode; struct ocfs2_inode_info *oi1; struct ocfs2_inode_info *oi2; struct buffer_head *bh1 = NULL; struct buffer_head *bh2 = NULL; bool same_inode = (s_inode == t_inode); bool need_swap = (inode1->i_ino > inode2->i_ino); int status; /* First grab the VFS and rw locks. 
*/ lock_two_nondirectories(s_inode, t_inode); if (need_swap) swap(inode1, inode2); status = ocfs2_rw_lock(inode1, 1); if (status) { mlog_errno(status); goto out_i1; } if (!same_inode) { status = ocfs2_rw_lock(inode2, 1); if (status) { mlog_errno(status); goto out_i2; } } /* Now go for the cluster locks */ oi1 = OCFS2_I(inode1); oi2 = OCFS2_I(inode2); trace_ocfs2_double_lock((unsigned long long)oi1->ip_blkno, (unsigned long long)oi2->ip_blkno); /* We always want to lock the one with the lower lockid first. */ if (oi1->ip_blkno > oi2->ip_blkno) mlog_errno(-ENOLCK); /* lock id1 */ status = ocfs2_inode_lock_nested(inode1, &bh1, 1, OI_LS_REFLINK_TARGET); if (status < 0) { if (status != -ENOENT) mlog_errno(status); goto out_rw2; } /* lock id2 */ if (!same_inode) { status = ocfs2_inode_lock_nested(inode2, &bh2, 1, OI_LS_REFLINK_TARGET); if (status < 0) { if (status != -ENOENT) mlog_errno(status); goto out_cl1; } } else { bh2 = bh1; } /* * If we swapped inode order above, we have to swap the buffer heads * before passing them back to the caller. */ if (need_swap) swap(bh1, bh2); *bh_s = bh1; *bh_t = bh2; trace_ocfs2_double_lock_end( (unsigned long long)oi1->ip_blkno, (unsigned long long)oi2->ip_blkno); return 0; out_cl1: ocfs2_inode_unlock(inode1, 1); brelse(bh1); out_rw2: ocfs2_rw_unlock(inode2, 1); out_i2: ocfs2_rw_unlock(inode1, 1); out_i1: unlock_two_nondirectories(s_inode, t_inode); return status; } /* Unlock both inodes and release buffers. */ void ocfs2_reflink_inodes_unlock(struct inode *s_inode, struct buffer_head *s_bh, struct inode *t_inode, struct buffer_head *t_bh) { ocfs2_inode_unlock(s_inode, 1); ocfs2_rw_unlock(s_inode, 1); brelse(s_bh); if (s_inode != t_inode) { ocfs2_inode_unlock(t_inode, 1); ocfs2_rw_unlock(t_inode, 1); brelse(t_bh); } unlock_two_nondirectories(s_inode, t_inode); }
104 15 76 112 59 165 107 165 64 86 86 165 2 165 5 112 165 21 165 106 106 165 164 165 5 163 105 5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _BCACHEFS_JOURNAL_H #define _BCACHEFS_JOURNAL_H /* * THE JOURNAL: * * The primary purpose of the journal is to log updates (insertions) to the * b-tree, to avoid 
having to do synchronous updates to the b-tree on disk. * * Without the journal, the b-tree is always internally consistent on * disk - and in fact, in the earliest incarnations bcache didn't have a journal * but did handle unclean shutdowns by doing all index updates synchronously * (with coalescing). * * Updates to interior nodes still happen synchronously and without the journal * (for simplicity) - this may change eventually but updates to interior nodes * are rare enough it's not a huge priority. * * This means the journal is relatively separate from the b-tree; it consists of * just a list of keys and journal replay consists of just redoing those * insertions in same order that they appear in the journal. * * PERSISTENCE: * * For synchronous updates (where we're waiting on the index update to hit * disk), the journal entry will be written out immediately (or as soon as * possible, if the write for the previous journal entry was still in flight). * * Synchronous updates are specified by passing a closure (@flush_cl) to * bch2_btree_insert() or bch_btree_insert_node(), which then pass that parameter * down to the journalling code. That closure will wait on the journal write to * complete (via closure_wait()). * * If the index update wasn't synchronous, the journal entry will be * written out after 10 ms have elapsed, by default (the delay_ms field * in struct journal). * * JOURNAL ENTRIES: * * A journal entry is variable size (struct jset), it's got a fixed length * header and then a variable number of struct jset_entry entries. * * Journal entries are identified by monotonically increasing 64 bit sequence * numbers - jset->seq; other places in the code refer to this sequence number. * * A jset_entry entry contains one or more bkeys (which is what gets inserted * into the b-tree). 
We need a container to indicate which b-tree the key is * for; also, the roots of the various b-trees are stored in jset_entry entries * (one for each b-tree) - this lets us add new b-tree types without changing * the on disk format. * * We also keep some things in the journal header that are logically part of the * superblock - all the things that are frequently updated. This is for future * bcache on raw flash support; the superblock (which will become another * journal) can't be moved or wear leveled, so it contains just enough * information to find the main journal, and the superblock only has to be * rewritten when we want to move/wear level the main journal. * * JOURNAL LAYOUT ON DISK: * * The journal is written to a ringbuffer of buckets (which is kept in the * superblock); the individual buckets are not necessarily contiguous on disk * which means that journal entries are not allowed to span buckets, but also * that we can resize the journal at runtime if desired (unimplemented). * * The journal buckets exist in the same pool as all the other buckets that are * managed by the allocator and garbage collection - garbage collection marks * the journal buckets as metadata buckets. * * OPEN/DIRTY JOURNAL ENTRIES: * * Open/dirty journal entries are journal entries that contain b-tree updates * that have not yet been written out to the b-tree on disk. We have to track * which journal entries are dirty, and we also have to avoid wrapping around * the journal and overwriting old but still dirty journal entries with new * journal entries. * * On disk, this is represented with the "last_seq" field of struct jset; * last_seq is the first sequence number that journal replay has to replay. * * To avoid overwriting dirty journal entries on disk, we keep a mapping (in * journal_device->seq) of for each journal bucket, the highest sequence number * any journal entry it contains. 
Then, by comparing that against last_seq we * can determine whether that journal bucket contains dirty journal entries or * not. * * To track which journal entries are dirty, we maintain a fifo of refcounts * (where each entry corresponds to a specific sequence number) - when a ref * goes to 0, that journal entry is no longer dirty. * * Journalling of index updates is done at the same time as the b-tree itself is * being modified (see btree_insert_key()); when we add the key to the journal * the pending b-tree write takes a ref on the journal entry the key was added * to. If a pending b-tree write would need to take refs on multiple dirty * journal entries, it only keeps the ref on the oldest one (since a newer * journal entry will still be replayed if an older entry was dirty). * * JOURNAL FILLING UP: * * There are two ways the journal could fill up; either we could run out of * space to write to, or we could have too many open journal entries and run out * of room in the fifo of refcounts. Since those refcounts are decremented * without any locking we can't safely resize that fifo, so we handle it the * same way. * * If the journal fills up, we start flushing dirty btree nodes until we can * allocate space for a journal write again - preferentially flushing btree * nodes that are pinning the oldest journal entries first. 
*/ #include <linux/hash.h> #include "journal_types.h" struct bch_fs; static inline void journal_wake(struct journal *j) { wake_up(&j->wait); closure_wake_up(&j->async_wait); } static inline struct journal_buf *journal_cur_buf(struct journal *j) { return j->buf + j->reservations.idx; } /* Sequence number of oldest dirty journal entry */ static inline u64 journal_last_seq(struct journal *j) { return j->pin.front; } static inline u64 journal_cur_seq(struct journal *j) { return atomic64_read(&j->seq); } static inline u64 journal_last_unwritten_seq(struct journal *j) { return j->seq_ondisk + 1; } static inline int journal_state_count(union journal_res_state s, int idx) { switch (idx) { case 0: return s.buf0_count; case 1: return s.buf1_count; case 2: return s.buf2_count; case 3: return s.buf3_count; } BUG(); } static inline void journal_state_inc(union journal_res_state *s) { s->buf0_count += s->idx == 0; s->buf1_count += s->idx == 1; s->buf2_count += s->idx == 2; s->buf3_count += s->idx == 3; } /* * Amount of space that will be taken up by some keys in the journal (i.e. 
* including the jset header) */ static inline unsigned jset_u64s(unsigned u64s) { return u64s + sizeof(struct jset_entry) / sizeof(u64); } static inline int journal_entry_overhead(struct journal *j) { return sizeof(struct jset) / sizeof(u64) + j->entry_u64s_reserved; } static inline struct jset_entry * bch2_journal_add_entry_noreservation(struct journal_buf *buf, size_t u64s) { struct jset *jset = buf->data; struct jset_entry *entry = vstruct_idx(jset, le32_to_cpu(jset->u64s)); memset(entry, 0, sizeof(*entry)); entry->u64s = cpu_to_le16(u64s); le32_add_cpu(&jset->u64s, jset_u64s(u64s)); return entry; } static inline struct jset_entry * journal_res_entry(struct journal *j, struct journal_res *res) { return vstruct_idx(j->buf[res->idx].data, res->offset); } static inline unsigned journal_entry_init(struct jset_entry *entry, unsigned type, enum btree_id id, unsigned level, unsigned u64s) { entry->u64s = cpu_to_le16(u64s); entry->btree_id = id; entry->level = level; entry->type = type; entry->pad[0] = 0; entry->pad[1] = 0; entry->pad[2] = 0; return jset_u64s(u64s); } static inline unsigned journal_entry_set(struct jset_entry *entry, unsigned type, enum btree_id id, unsigned level, const void *data, unsigned u64s) { unsigned ret = journal_entry_init(entry, type, id, level, u64s); memcpy_u64s_small(entry->_data, data, u64s); return ret; } static inline struct jset_entry * bch2_journal_add_entry(struct journal *j, struct journal_res *res, unsigned type, enum btree_id id, unsigned level, unsigned u64s) { struct jset_entry *entry = journal_res_entry(j, res); unsigned actual = journal_entry_init(entry, type, id, level, u64s); EBUG_ON(!res->ref); EBUG_ON(actual > res->u64s); res->offset += actual; res->u64s -= actual; return entry; } static inline bool journal_entry_empty(struct jset *j) { if (j->seq != j->last_seq) return false; vstruct_for_each(j, i) if (i->type == BCH_JSET_ENTRY_btree_keys && i->u64s) return false; return true; } /* * Drop reference on a buffer index and 
return true if the count has hit zero. */ static inline union journal_res_state journal_state_buf_put(struct journal *j, unsigned idx) { union journal_res_state s; s.v = atomic64_sub_return(((union journal_res_state) { .buf0_count = idx == 0, .buf1_count = idx == 1, .buf2_count = idx == 2, .buf3_count = idx == 3, }).v, &j->reservations.counter); return s; } bool bch2_journal_entry_close(struct journal *); void bch2_journal_do_writes(struct journal *); void bch2_journal_buf_put_final(struct journal *, u64); static inline void __bch2_journal_buf_put(struct journal *j, unsigned idx, u64 seq) { union journal_res_state s; s = journal_state_buf_put(j, idx); if (!journal_state_count(s, idx)) bch2_journal_buf_put_final(j, seq); } static inline void bch2_journal_buf_put(struct journal *j, unsigned idx, u64 seq) { union journal_res_state s; s = journal_state_buf_put(j, idx); if (!journal_state_count(s, idx)) { spin_lock(&j->lock); bch2_journal_buf_put_final(j, seq); spin_unlock(&j->lock); } else if (unlikely(s.cur_entry_offset == JOURNAL_ENTRY_BLOCKED_VAL)) wake_up(&j->wait); } /* * This function releases the journal write structure so other threads can * then proceed to add their keys as well. 
*/ static inline void bch2_journal_res_put(struct journal *j, struct journal_res *res) { if (!res->ref) return; lock_release(&j->res_map, _THIS_IP_); while (res->u64s) bch2_journal_add_entry(j, res, BCH_JSET_ENTRY_btree_keys, 0, 0, 0); bch2_journal_buf_put(j, res->idx, res->seq); res->ref = 0; } int bch2_journal_res_get_slowpath(struct journal *, struct journal_res *, unsigned, struct btree_trans *); /* First bits for BCH_WATERMARK: */ enum journal_res_flags { __JOURNAL_RES_GET_NONBLOCK = BCH_WATERMARK_BITS, __JOURNAL_RES_GET_CHECK, }; #define JOURNAL_RES_GET_NONBLOCK (1 << __JOURNAL_RES_GET_NONBLOCK) #define JOURNAL_RES_GET_CHECK (1 << __JOURNAL_RES_GET_CHECK) static inline int journal_res_get_fast(struct journal *j, struct journal_res *res, unsigned flags) { union journal_res_state old, new; old.v = atomic64_read(&j->reservations.counter); do { new.v = old.v; /* * Check if there is still room in the current journal * entry: */ if (new.cur_entry_offset + res->u64s > j->cur_entry_u64s) return 0; EBUG_ON(!journal_state_count(new, new.idx)); if ((flags & BCH_WATERMARK_MASK) < j->watermark) return 0; new.cur_entry_offset += res->u64s; journal_state_inc(&new); /* * If the refcount would overflow, we have to wait: * XXX - tracepoint this: */ if (!journal_state_count(new, new.idx)) return 0; if (flags & JOURNAL_RES_GET_CHECK) return 1; } while (!atomic64_try_cmpxchg(&j->reservations.counter, &old.v, new.v)); res->ref = true; res->idx = old.idx; res->offset = old.cur_entry_offset; res->seq = le64_to_cpu(j->buf[old.idx].data->seq); return 1; } static inline int bch2_journal_res_get(struct journal *j, struct journal_res *res, unsigned u64s, unsigned flags, struct btree_trans *trans) { int ret; EBUG_ON(res->ref); EBUG_ON(!test_bit(JOURNAL_running, &j->flags)); res->u64s = u64s; if (journal_res_get_fast(j, res, flags)) goto out; ret = bch2_journal_res_get_slowpath(j, res, flags, trans); if (ret) return ret; out: if (!(flags & JOURNAL_RES_GET_CHECK)) { 
lock_acquire_shared(&j->res_map, 0, (flags & JOURNAL_RES_GET_NONBLOCK) != 0, NULL, _THIS_IP_); EBUG_ON(!res->ref); } return 0; } /* journal_entry_res: */ void bch2_journal_entry_res_resize(struct journal *, struct journal_entry_res *, unsigned); int bch2_journal_flush_seq_async(struct journal *, u64, struct closure *); void bch2_journal_flush_async(struct journal *, struct closure *); int bch2_journal_flush_seq(struct journal *, u64, unsigned); int bch2_journal_flush(struct journal *); bool bch2_journal_noflush_seq(struct journal *, u64, u64); int bch2_journal_meta(struct journal *); void bch2_journal_halt(struct journal *); static inline int bch2_journal_error(struct journal *j) { return j->reservations.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL ? -BCH_ERR_journal_shutdown : 0; } struct bch_dev; static inline void bch2_journal_set_replay_done(struct journal *j) { BUG_ON(!test_bit(JOURNAL_running, &j->flags)); set_bit(JOURNAL_replay_done, &j->flags); } void bch2_journal_unblock(struct journal *); void bch2_journal_block(struct journal *); struct journal_buf *bch2_next_write_buffer_flush_journal_buf(struct journal *, u64, bool *); void __bch2_journal_debug_to_text(struct printbuf *, struct journal *); void bch2_journal_debug_to_text(struct printbuf *, struct journal *); int bch2_set_nr_journal_buckets(struct bch_fs *, struct bch_dev *, unsigned nr); int bch2_dev_journal_alloc(struct bch_dev *, bool); int bch2_fs_journal_alloc(struct bch_fs *); void bch2_dev_journal_stop(struct journal *, struct bch_dev *); void bch2_fs_journal_stop(struct journal *); int bch2_fs_journal_start(struct journal *, u64); void bch2_dev_journal_exit(struct bch_dev *); int bch2_dev_journal_init(struct bch_dev *, struct bch_sb *); void bch2_fs_journal_exit(struct journal *); int bch2_fs_journal_init(struct journal *); #endif /* _BCACHEFS_JOURNAL_H */
1 1 1 1 3 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 // SPDX-License-Identifier: GPL-2.0-only /* * kernel/ksysfs.c - sysfs attributes in /sys/kernel, which * are not related to any other subsystem * * Copyright (C) 2004 Kay Sievers <kay.sievers@vrfy.org> */ #include <asm/byteorder.h> #include <linux/kobject.h> #include <linux/string.h> #include <linux/sysfs.h> #include <linux/export.h> #include <linux/init.h> #include <linux/kexec.h> #include <linux/profile.h> #include <linux/stat.h> #include <linux/sched.h> #include <linux/capability.h> #include <linux/compiler.h> #include <linux/rcupdate.h> /* rcu_expedited and rcu_normal */ #if defined(__LITTLE_ENDIAN) #define CPU_BYTEORDER_STRING "little" #elif defined(__BIG_ENDIAN) #define CPU_BYTEORDER_STRING "big" #else #error Unknown byteorder #endif #define KERNEL_ATTR_RO(_name) \ static struct 
kobj_attribute _name##_attr = __ATTR_RO(_name) #define KERNEL_ATTR_RW(_name) \ static struct kobj_attribute _name##_attr = __ATTR_RW(_name) /* current uevent sequence number */ static ssize_t uevent_seqnum_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%llu\n", (u64)atomic64_read(&uevent_seqnum)); } KERNEL_ATTR_RO(uevent_seqnum); /* cpu byteorder */ static ssize_t cpu_byteorder_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%s\n", CPU_BYTEORDER_STRING); } KERNEL_ATTR_RO(cpu_byteorder); /* address bits */ static ssize_t address_bits_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%zu\n", sizeof(void *) * 8 /* CHAR_BIT */); } KERNEL_ATTR_RO(address_bits); #ifdef CONFIG_UEVENT_HELPER /* uevent helper program, used during early boot */ static ssize_t uevent_helper_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%s\n", uevent_helper); } static ssize_t uevent_helper_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { if (count+1 > UEVENT_HELPER_PATH_LEN) return -ENOENT; memcpy(uevent_helper, buf, count); uevent_helper[count] = '\0'; if (count && uevent_helper[count-1] == '\n') uevent_helper[count-1] = '\0'; return count; } KERNEL_ATTR_RW(uevent_helper); #endif #ifdef CONFIG_PROFILING static ssize_t profiling_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%d\n", prof_on); } static ssize_t profiling_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { int ret; static DEFINE_MUTEX(lock); /* * We need serialization, for profile_setup() initializes prof_on * value and profile_init() must not reallocate prof_buffer after * once allocated. 
*/ guard(mutex)(&lock); if (prof_on) return -EEXIST; /* * This eventually calls into get_option() which * has a ton of callers and is not const. It is * easiest to cast it away here. */ profile_setup((char *)buf); ret = profile_init(); if (ret) return ret; ret = create_proc_profile(); if (ret) return ret; return count; } KERNEL_ATTR_RW(profiling); #endif #ifdef CONFIG_KEXEC_CORE static ssize_t kexec_loaded_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%d\n", !!kexec_image); } KERNEL_ATTR_RO(kexec_loaded); #ifdef CONFIG_CRASH_DUMP static ssize_t kexec_crash_loaded_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%d\n", kexec_crash_loaded()); } KERNEL_ATTR_RO(kexec_crash_loaded); static ssize_t kexec_crash_size_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { ssize_t size = crash_get_memory_size(); if (size < 0) return size; return sysfs_emit(buf, "%zd\n", size); } static ssize_t kexec_crash_size_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { unsigned long cnt; int ret; if (kstrtoul(buf, 0, &cnt)) return -EINVAL; ret = crash_shrink_memory(cnt); return ret < 0 ? 
ret : count; } KERNEL_ATTR_RW(kexec_crash_size); #endif /* CONFIG_CRASH_DUMP*/ #endif /* CONFIG_KEXEC_CORE */ #ifdef CONFIG_VMCORE_INFO static ssize_t vmcoreinfo_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { phys_addr_t vmcore_base = paddr_vmcoreinfo_note(); return sysfs_emit(buf, "%pa %x\n", &vmcore_base, (unsigned int)VMCOREINFO_NOTE_SIZE); } KERNEL_ATTR_RO(vmcoreinfo); #ifdef CONFIG_CRASH_HOTPLUG static ssize_t crash_elfcorehdr_size_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { unsigned int sz = crash_get_elfcorehdr_size(); return sysfs_emit(buf, "%u\n", sz); } KERNEL_ATTR_RO(crash_elfcorehdr_size); #endif #endif /* CONFIG_VMCORE_INFO */ /* whether file capabilities are enabled */ static ssize_t fscaps_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%d\n", file_caps_enabled); } KERNEL_ATTR_RO(fscaps); #ifndef CONFIG_TINY_RCU int rcu_expedited; static ssize_t rcu_expedited_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%d\n", READ_ONCE(rcu_expedited)); } static ssize_t rcu_expedited_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { if (kstrtoint(buf, 0, &rcu_expedited)) return -EINVAL; return count; } KERNEL_ATTR_RW(rcu_expedited); int rcu_normal; static ssize_t rcu_normal_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%d\n", READ_ONCE(rcu_normal)); } static ssize_t rcu_normal_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { if (kstrtoint(buf, 0, &rcu_normal)) return -EINVAL; return count; } KERNEL_ATTR_RW(rcu_normal); #endif /* #ifndef CONFIG_TINY_RCU */ /* * Make /sys/kernel/notes give the raw contents of our kernel .notes section. 
*/ extern const void __start_notes; extern const void __stop_notes; #define notes_size (&__stop_notes - &__start_notes) static __ro_after_init BIN_ATTR_SIMPLE_RO(notes); struct kobject *kernel_kobj; EXPORT_SYMBOL_GPL(kernel_kobj); static struct attribute * kernel_attrs[] = { &fscaps_attr.attr, &uevent_seqnum_attr.attr, &cpu_byteorder_attr.attr, &address_bits_attr.attr, #ifdef CONFIG_UEVENT_HELPER &uevent_helper_attr.attr, #endif #ifdef CONFIG_PROFILING &profiling_attr.attr, #endif #ifdef CONFIG_KEXEC_CORE &kexec_loaded_attr.attr, #ifdef CONFIG_CRASH_DUMP &kexec_crash_loaded_attr.attr, &kexec_crash_size_attr.attr, #endif #endif #ifdef CONFIG_VMCORE_INFO &vmcoreinfo_attr.attr, #ifdef CONFIG_CRASH_HOTPLUG &crash_elfcorehdr_size_attr.attr, #endif #endif #ifndef CONFIG_TINY_RCU &rcu_expedited_attr.attr, &rcu_normal_attr.attr, #endif NULL }; static const struct attribute_group kernel_attr_group = { .attrs = kernel_attrs, }; static int __init ksysfs_init(void) { int error; kernel_kobj = kobject_create_and_add("kernel", NULL); if (!kernel_kobj) { error = -ENOMEM; goto exit; } error = sysfs_create_group(kernel_kobj, &kernel_attr_group); if (error) goto kset_exit; if (notes_size > 0) { bin_attr_notes.private = (void *)&__start_notes; bin_attr_notes.size = notes_size; error = sysfs_create_bin_file(kernel_kobj, &bin_attr_notes); if (error) goto group_exit; } return 0; group_exit: sysfs_remove_group(kernel_kobj, &kernel_attr_group); kset_exit: kobject_put(kernel_kobj); exit: return error; } core_initcall(ksysfs_init);
934 938 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 // SPDX-License-Identifier: GPL-2.0-only /* * A generic implementation of binary search for the Linux kernel * * Copyright (C) 2008-2009 Ksplice, Inc. * Author: Tim Abbott <tabbott@ksplice.com> */ #include <linux/export.h> #include <linux/bsearch.h> #include <linux/kprobes.h> /* * bsearch - binary search an array of elements * @key: pointer to item being searched for * @base: pointer to first element to search * @num: number of elements * @size: size of each element * @cmp: pointer to comparison function * * This function does a binary search on the given array. The * contents of the array should already be in ascending sorted order * under the provided comparison function. * * Note that the key need not have the same type as the elements in * the array, e.g. key could be a string and the comparison function * could compare the string with the struct's name field. However, if * the key and elements in the array are of the same type, you can use * the same comparison function for both sort() and bsearch(). */ void *bsearch(const void *key, const void *base, size_t num, size_t size, cmp_func_t cmp) { return __inline_bsearch(key, base, num, size, cmp); } EXPORT_SYMBOL(bsearch); NOKPROBE_SYMBOL(bsearch);
8058 8057 1391 1388 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 // SPDX-License-Identifier: GPL-2.0-only /* Common code for 32 and 64-bit NUMA */ #include <linux/acpi.h> #include <linux/kernel.h> #include <linux/mm.h> #include <linux/of.h> #include <linux/string.h> #include <linux/init.h> #include <linux/memblock.h> #include 
<linux/mmzone.h> #include <linux/ctype.h> #include <linux/nodemask.h> #include <linux/sched.h> #include <linux/topology.h> #include <linux/sort.h> #include <linux/numa_memblks.h> #include <asm/e820/api.h> #include <asm/proto.h> #include <asm/dma.h> #include <asm/amd_nb.h> #include "numa_internal.h" int numa_off; static __init int numa_setup(char *opt) { if (!opt) return -EINVAL; if (!strncmp(opt, "off", 3)) numa_off = 1; if (!strncmp(opt, "fake=", 5)) return numa_emu_cmdline(opt + 5); if (!strncmp(opt, "noacpi", 6)) disable_srat(); if (!strncmp(opt, "nohmat", 6)) disable_hmat(); return 0; } early_param("numa", numa_setup); /* * apicid, cpu, node mappings */ s16 __apicid_to_node[MAX_LOCAL_APIC] = { [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE }; int numa_cpu_node(int cpu) { u32 apicid = early_per_cpu(x86_cpu_to_apicid, cpu); if (apicid != BAD_APICID) return __apicid_to_node[apicid]; return NUMA_NO_NODE; } cpumask_var_t node_to_cpumask_map[MAX_NUMNODES]; EXPORT_SYMBOL(node_to_cpumask_map); /* * Map cpu index to node index */ DEFINE_EARLY_PER_CPU(int, x86_cpu_to_node_map, NUMA_NO_NODE); EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_node_map); void numa_set_node(int cpu, int node) { int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map); /* early setting, no percpu area yet */ if (cpu_to_node_map) { cpu_to_node_map[cpu] = node; return; } #ifdef CONFIG_DEBUG_PER_CPU_MAPS if (cpu >= nr_cpu_ids || !cpu_possible(cpu)) { printk(KERN_ERR "numa_set_node: invalid cpu# (%d)\n", cpu); dump_stack(); return; } #endif per_cpu(x86_cpu_to_node_map, cpu) = node; set_cpu_numa_node(cpu, node); } void numa_clear_node(int cpu) { numa_set_node(cpu, NUMA_NO_NODE); } /* * Allocate node_to_cpumask_map based on number of available nodes * Requires node_possible_map to be valid. * * Note: cpumask_of_node() is not valid until after this is done. * (Use CONFIG_DEBUG_PER_CPU_MAPS to check this.) 
*/ void __init setup_node_to_cpumask_map(void) { unsigned int node; /* setup nr_node_ids if not done yet */ if (nr_node_ids == MAX_NUMNODES) setup_nr_node_ids(); /* allocate the map */ for (node = 0; node < nr_node_ids; node++) alloc_bootmem_cpumask_var(&node_to_cpumask_map[node]); /* cpumask_of_node() will now work */ pr_debug("Node to cpumask map for %u nodes\n", nr_node_ids); } static int __init numa_register_nodes(void) { int nid; if (!memblock_validate_numa_coverage(SZ_1M)) return -EINVAL; /* Finally register nodes. */ for_each_node_mask(nid, node_possible_map) { unsigned long start_pfn, end_pfn; /* * Note, get_pfn_range_for_nid() depends on * memblock_set_node() having already happened */ get_pfn_range_for_nid(nid, &start_pfn, &end_pfn); if (start_pfn >= end_pfn) continue; alloc_node_data(nid); node_set_online(nid); } /* Dump memblock with node info and return. */ memblock_dump_all(); return 0; } /* * There are unfortunately some poorly designed mainboards around that * only connect memory to a single CPU. This breaks the 1:1 cpu->node * mapping. To avoid this fill in the mapping for all possible CPUs, * as the number of CPUs is not known yet. We round robin the existing * nodes. 
*/ static void __init numa_init_array(void) { int rr, i; rr = first_node(node_online_map); for (i = 0; i < nr_cpu_ids; i++) { if (early_cpu_to_node(i) != NUMA_NO_NODE) continue; numa_set_node(i, rr); rr = next_node_in(rr, node_online_map); } } static int __init numa_init(int (*init_func)(void)) { int i; int ret; for (i = 0; i < MAX_LOCAL_APIC; i++) set_apicid_to_node(i, NUMA_NO_NODE); ret = numa_memblks_init(init_func, /* memblock_force_top_down */ true); if (ret < 0) return ret; ret = numa_register_nodes(); if (ret < 0) return ret; for (i = 0; i < nr_cpu_ids; i++) { int nid = early_cpu_to_node(i); if (nid == NUMA_NO_NODE) continue; if (!node_online(nid)) numa_clear_node(i); } numa_init_array(); return 0; } /** * dummy_numa_init - Fallback dummy NUMA init * * Used if there's no underlying NUMA architecture, NUMA initialization * fails, or NUMA is disabled on the command line. * * Must online at least one node and add memory blocks that cover all * allowed memory. This function must not fail. */ static int __init dummy_numa_init(void) { printk(KERN_INFO "%s\n", numa_off ? "NUMA turned off" : "No NUMA configuration found"); printk(KERN_INFO "Faking a node at [mem %#018Lx-%#018Lx]\n", 0LLU, PFN_PHYS(max_pfn) - 1); node_set(0, numa_nodes_parsed); numa_add_memblk(0, 0, PFN_PHYS(max_pfn)); return 0; } /** * x86_numa_init - Initialize NUMA * * Try each configured NUMA initialization method until one succeeds. The * last fallback is dummy single node config encompassing whole memory and * never fails. */ void __init x86_numa_init(void) { if (!numa_off) { #ifdef CONFIG_ACPI_NUMA if (!numa_init(x86_acpi_numa_init)) return; #endif #ifdef CONFIG_AMD_NUMA if (!numa_init(amd_numa_init)) return; #endif if (acpi_disabled && !numa_init(of_numa_init)) return; } numa_init(dummy_numa_init); } /* * A node may exist which has one or more Generic Initiators but no CPUs and no * memory. 
* * This function must be called after init_cpu_to_node(), to ensure that any * memoryless CPU nodes have already been brought online, and before the * node_data[nid] is needed for zone list setup in build_all_zonelists(). * * When this function is called, any nodes containing either memory and/or CPUs * will already be online and there is no need to do anything extra, even if * they also contain one or more Generic Initiators. */ void __init init_gi_nodes(void) { int nid; /* * Exclude this node from * bringup_nonboot_cpus * cpu_up * __try_online_node * register_one_node * because node_subsys is not initialized yet. * TODO remove dependency on node_online */ for_each_node_state(nid, N_GENERIC_INITIATOR) if (!node_online(nid)) node_set_online(nid); } /* * Setup early cpu_to_node. * * Populate cpu_to_node[] only if x86_cpu_to_apicid[], * and apicid_to_node[] tables have valid entries for a CPU. * This means we skip cpu_to_node[] initialisation for NUMA * emulation and faking node case (when running a kernel compiled * for NUMA on a non NUMA box), which is OK as cpu_to_node[] * is already initialized in a round robin manner at numa_init_array, * prior to this call, and this initialization is good enough * for the fake NUMA cases. * * Called before the per_cpu areas are setup. */ void __init init_cpu_to_node(void) { int cpu; u32 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid); BUG_ON(cpu_to_apicid == NULL); for_each_possible_cpu(cpu) { int node = numa_cpu_node(cpu); if (node == NUMA_NO_NODE) continue; /* * Exclude this node from * bringup_nonboot_cpus * cpu_up * __try_online_node * register_one_node * because node_subsys is not initialized yet. 
* TODO remove dependency on node_online */ if (!node_online(node)) node_set_online(node); numa_set_node(cpu, node); } } #ifndef CONFIG_DEBUG_PER_CPU_MAPS # ifndef CONFIG_NUMA_EMU void numa_add_cpu(unsigned int cpu) { cpumask_set_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]); } void numa_remove_cpu(unsigned int cpu) { cpumask_clear_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]); } # endif /* !CONFIG_NUMA_EMU */ #else /* !CONFIG_DEBUG_PER_CPU_MAPS */ int __cpu_to_node(int cpu) { if (early_per_cpu_ptr(x86_cpu_to_node_map)) { printk(KERN_WARNING "cpu_to_node(%d): usage too early!\n", cpu); dump_stack(); return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu]; } return per_cpu(x86_cpu_to_node_map, cpu); } EXPORT_SYMBOL(__cpu_to_node); /* * Same function as cpu_to_node() but used if called before the * per_cpu areas are setup. */ int early_cpu_to_node(int cpu) { if (early_per_cpu_ptr(x86_cpu_to_node_map)) return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu]; if (!cpu_possible(cpu)) { printk(KERN_WARNING "early_cpu_to_node(%d): no per_cpu area!\n", cpu); dump_stack(); return NUMA_NO_NODE; } return per_cpu(x86_cpu_to_node_map, cpu); } void debug_cpumask_set_cpu(unsigned int cpu, int node, bool enable) { struct cpumask *mask; if (node == NUMA_NO_NODE) { /* early_cpu_to_node() already emits a warning and trace */ return; } mask = node_to_cpumask_map[node]; if (!cpumask_available(mask)) { pr_err("node_to_cpumask_map[%i] NULL\n", node); dump_stack(); return; } if (enable) cpumask_set_cpu(cpu, mask); else cpumask_clear_cpu(cpu, mask); printk(KERN_DEBUG "%s cpu %d node %d: mask now %*pbl\n", enable ? 
"numa_add_cpu" : "numa_remove_cpu", cpu, node, cpumask_pr_args(mask)); return; } # ifndef CONFIG_NUMA_EMU static void numa_set_cpumask(int cpu, bool enable) { debug_cpumask_set_cpu(cpu, early_cpu_to_node(cpu), enable); } void numa_add_cpu(unsigned int cpu) { numa_set_cpumask(cpu, true); } void numa_remove_cpu(unsigned int cpu) { numa_set_cpumask(cpu, false); } # endif /* !CONFIG_NUMA_EMU */ /* * Returns a pointer to the bitmask of CPUs on Node 'node'. */ const struct cpumask *cpumask_of_node(int node) { if ((unsigned)node >= nr_node_ids) { printk(KERN_WARNING "cpumask_of_node(%d): (unsigned)node >= nr_node_ids(%u)\n", node, nr_node_ids); dump_stack(); return cpu_none_mask; } if (!cpumask_available(node_to_cpumask_map[node])) { printk(KERN_WARNING "cpumask_of_node(%d): no node_to_cpumask_map!\n", node); dump_stack(); return cpu_online_mask; } return node_to_cpumask_map[node]; } EXPORT_SYMBOL(cpumask_of_node); #endif /* !CONFIG_DEBUG_PER_CPU_MAPS */ #ifdef CONFIG_NUMA_EMU void __init numa_emu_update_cpu_to_node(int *emu_nid_to_phys, unsigned int nr_emu_nids) { int i, j; /* * Transform __apicid_to_node table to use emulated nids by * reverse-mapping phys_nid. The maps should always exist but fall * back to zero just in case. */ for (i = 0; i < ARRAY_SIZE(__apicid_to_node); i++) { if (__apicid_to_node[i] == NUMA_NO_NODE) continue; for (j = 0; j < nr_emu_nids; j++) if (__apicid_to_node[i] == emu_nid_to_phys[j]) break; __apicid_to_node[i] = j < nr_emu_nids ? j : 0; } } u64 __init numa_emu_dma_end(void) { return PFN_PHYS(MAX_DMA32_PFN); } #endif /* CONFIG_NUMA_EMU */
10 10 10 10 5 5 9 10 10 10 10 9 10 10 10 10 10 10 10 10 10 8 10 2 2 1 2 2 2 2 2 2 2 10 9 10 10 9 10 10 9 10 10 9 8 9 10 10 5 5 5 5 5 5 5 5 5 5 10 10 10 5 5 5 5 5 5 2 5 5 5 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 1 2 2 2 2 2 10 10 10 9 10 1 1 1 1 1 1 1 1 2 11 11 3 9 6 4 2 2 4 3 1 2 1 10 10 10 9 10 9 10 10 5 5 5 4 10 2 1 4 4 1 4 4 3 3 3 3 3 3 1 2 2 4 3 1 4 2 2 2 2 2 2 3 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 
434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 
934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 
1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 
1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 
2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 
2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 
2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 /* * videobuf2-core.c - video buffer 2 core 
framework
 *
 * Copyright (C) 2010 Samsung Electronics
 *
 * Author: Pawel Osciak <pawel@osciak.com>
 *	   Marek Szyprowski <m.szyprowski@samsung.com>
 *
 * The vb2_thread implementation was based on code from videobuf-dvb.c:
 *	(c) 2004 Gerd Knorr <kraxel@bytesex.org> [SUSE Labs]
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation.
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/err.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/mm.h>
#include <linux/poll.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/freezer.h>
#include <linux/kthread.h>

#include <media/videobuf2-core.h>
#include <media/v4l2-mc.h>

#include <trace/events/vb2.h>

/*
 * Bit layout helpers for the per-plane offset values: PLANE_INDEX_BITS bits
 * sit directly above the page offset (PAGE_SHIFT), and the buffer index is
 * shifted above those by PLANE_INDEX_SHIFT.  The #if below rejects a build
 * in which PLANE_INDEX_BITS cannot address exactly VIDEO_MAX_PLANES planes.
 */
#define PLANE_INDEX_BITS 3
#define PLANE_INDEX_SHIFT (PAGE_SHIFT + PLANE_INDEX_BITS)
#define PLANE_INDEX_MASK (BIT_MASK(PLANE_INDEX_BITS) - 1)
#define MAX_BUFFER_INDEX BIT_MASK(30 - PLANE_INDEX_SHIFT)
#define BUFFER_INDEX_MASK (MAX_BUFFER_INDEX - 1)

#if BIT(PLANE_INDEX_BITS) != VIDEO_MAX_PLANES
#error PLANE_INDEX_BITS order must be equal to VIDEO_MAX_PLANES
#endif

/* Verbosity threshold for dprintk(); exposed as a module parameter (0644). */
static int debug;
module_param(debug, int, 0644);

/*
 * Emit a pr_info() line prefixed with the queue name and the calling
 * function when the 'debug' module parameter is at least @level.
 */
#define dprintk(q, level, fmt, arg...) \
	do { \
		if (debug >= level) \
			pr_info("[%s] %s: " fmt, (q)->name, __func__, \
				## arg); \
	} while (0)

#ifdef CONFIG_VIDEO_ADV_DEBUG

/*
 * If advanced debugging is on, then count how often each op is called
 * successfully, which can either be per-buffer or per-queue.
 *
 * This makes it easy to check that the 'init' and 'cleanup'
 * (and variations thereof) stay balanced.
 */

/* Log (at debug level 2) which memop is about to run and whether it is unset. */
#define log_memop(vb, op) \
	dprintk((vb)->vb2_queue, 2, "call_memop(%d, %s)%s\n", \
		(vb)->index, #op, \
		(vb)->vb2_queue->mem_ops->op ? "" : " (nop)")

/*
 * Call mem_ops->op(args) if the op is set, otherwise evaluate to 0.
 * The per-buffer cnt_mem_<op> counter is bumped when the result is 0.
 */
#define call_memop(vb, op, args...) \
({ \
	struct vb2_queue *_q = (vb)->vb2_queue; \
	int err; \
	\
	log_memop(vb, op); \
	err = _q->mem_ops->op ? _q->mem_ops->op(args) : 0; \
	if (!err) \
		(vb)->cnt_mem_ ## op++; \
	err; \
})

/*
 * Pointer-returning variant; note the (op, vb) argument order and that @vb
 * is passed to the op as its first argument.  Evaluates to NULL when the op
 * is unset; the counter is bumped only for a non-error, non-NULL result.
 */
#define call_ptr_memop(op, vb, args...) \
({ \
	struct vb2_queue *_q = (vb)->vb2_queue; \
	void *ptr; \
	\
	log_memop(vb, op); \
	ptr = _q->mem_ops->op ? _q->mem_ops->op(vb, args) : NULL; \
	if (!IS_ERR_OR_NULL(ptr)) \
		(vb)->cnt_mem_ ## op++; \
	ptr; \
})

/* Void variant: the counter is bumped even when the op is unset. */
#define call_void_memop(vb, op, args...) \
({ \
	struct vb2_queue *_q = (vb)->vb2_queue; \
	\
	log_memop(vb, op); \
	if (_q->mem_ops->op) \
		_q->mem_ops->op(args); \
	(vb)->cnt_mem_ ## op++; \
})

/* Log (at debug level 2) which queue op is about to run and whether it is unset. */
#define log_qop(q, op) \
	dprintk(q, 2, "call_qop(%s)%s\n", #op, \
		(q)->ops->op ? "" : " (nop)")

/*
 * Call the queue op if set, otherwise evaluate to 0; the per-queue
 * cnt_<op> counter is bumped when the result is 0.
 */
#define call_qop(q, op, args...) \
({ \
	int err; \
	\
	log_qop(q, op); \
	err = (q)->ops->op ? (q)->ops->op(args) : 0; \
	if (!err) \
		(q)->cnt_ ## op++; \
	err; \
})

/* Void variant of call_qop(): the counter is bumped unconditionally. */
#define call_void_qop(q, op, args...) \
({ \
	log_qop(q, op); \
	if ((q)->ops->op) \
		(q)->ops->op(args); \
	(q)->cnt_ ## op++; \
})

/* Like log_qop(), but reached through a buffer and tagged with its index. */
#define log_vb_qop(vb, op, args...) \
	dprintk((vb)->vb2_queue, 2, "call_vb_qop(%d, %s)%s\n", \
		(vb)->index, #op, \
		(vb)->vb2_queue->ops->op ? "" : " (nop)")

/*
 * Call the queue op of the buffer's queue if set, otherwise evaluate to 0;
 * the per-buffer cnt_<op> counter is bumped when the result is 0.
 */
#define call_vb_qop(vb, op, args...) \
({ \
	int err; \
	\
	log_vb_qop(vb, op); \
	err = (vb)->vb2_queue->ops->op ? \
		(vb)->vb2_queue->ops->op(args) : 0; \
	if (!err) \
		(vb)->cnt_ ## op++; \
	err; \
})

/* Void variant of call_vb_qop(): the counter is bumped unconditionally. */
#define call_void_vb_qop(vb, op, args...) \
({ \
	log_vb_qop(vb, op); \
	if ((vb)->vb2_queue->ops->op) \
		(vb)->vb2_queue->ops->op(args); \
	(vb)->cnt_ ## op++; \
})

#else

/*
 * Without CONFIG_VIDEO_ADV_DEBUG the same macro names only dispatch to the
 * op when it is set; nothing is logged or counted.
 */
#define call_memop(vb, op, args...) \
	((vb)->vb2_queue->mem_ops->op ? \
		(vb)->vb2_queue->mem_ops->op(args) : 0)

#define call_ptr_memop(op, vb, args...) \
	((vb)->vb2_queue->mem_ops->op ? \
		(vb)->vb2_queue->mem_ops->op(vb, args) : NULL)

#define call_void_memop(vb, op, args...) \
	do { \
		if ((vb)->vb2_queue->mem_ops->op) \
			(vb)->vb2_queue->mem_ops->op(args); \
	} while (0)

#define call_qop(q, op, args...) \
	((q)->ops->op ? (q)->ops->op(args) : 0)

#define call_void_qop(q, op, args...)
\ do { \ if ((q)->ops->op) \ (q)->ops->op(args); \ } while (0) #define call_vb_qop(vb, op, args...) \ ((vb)->vb2_queue->ops->op ? (vb)->vb2_queue->ops->op(args) : 0) #define call_void_vb_qop(vb, op, args...) \ do { \ if ((vb)->vb2_queue->ops->op) \ (vb)->vb2_queue->ops->op(args); \ } while (0) #endif #define call_bufop(q, op, args...) \ ({ \ int ret = 0; \ if (q && q->buf_ops && q->buf_ops->op) \ ret = q->buf_ops->op(args); \ ret; \ }) #define call_void_bufop(q, op, args...) \ ({ \ if (q && q->buf_ops && q->buf_ops->op) \ q->buf_ops->op(args); \ }) static void __vb2_queue_cancel(struct vb2_queue *q); static const char *vb2_state_name(enum vb2_buffer_state s) { static const char * const state_names[] = { [VB2_BUF_STATE_DEQUEUED] = "dequeued", [VB2_BUF_STATE_IN_REQUEST] = "in request", [VB2_BUF_STATE_PREPARING] = "preparing", [VB2_BUF_STATE_QUEUED] = "queued", [VB2_BUF_STATE_ACTIVE] = "active", [VB2_BUF_STATE_DONE] = "done", [VB2_BUF_STATE_ERROR] = "error", }; if ((unsigned int)(s) < ARRAY_SIZE(state_names)) return state_names[s]; return "unknown"; } /* * __vb2_buf_mem_alloc() - allocate video memory for the given buffer */ static int __vb2_buf_mem_alloc(struct vb2_buffer *vb) { struct vb2_queue *q = vb->vb2_queue; void *mem_priv; int plane; int ret = -ENOMEM; /* * Allocate memory for all planes in this buffer * NOTE: mmapped areas should be page aligned */ for (plane = 0; plane < vb->num_planes; ++plane) { /* Memops alloc requires size to be page aligned. */ unsigned long size = PAGE_ALIGN(vb->planes[plane].length); /* Did it wrap around? */ if (size < vb->planes[plane].length) goto free; mem_priv = call_ptr_memop(alloc, vb, q->alloc_devs[plane] ? 
: q->dev, size); if (IS_ERR_OR_NULL(mem_priv)) { if (mem_priv) ret = PTR_ERR(mem_priv); goto free; } /* Associate allocator private data with this plane */ vb->planes[plane].mem_priv = mem_priv; } return 0; free: /* Free already allocated memory if one of the allocations failed */ for (; plane > 0; --plane) { call_void_memop(vb, put, vb->planes[plane - 1].mem_priv); vb->planes[plane - 1].mem_priv = NULL; } return ret; } /* * __vb2_buf_mem_free() - free memory of the given buffer */ static void __vb2_buf_mem_free(struct vb2_buffer *vb) { unsigned int plane; for (plane = 0; plane < vb->num_planes; ++plane) { call_void_memop(vb, put, vb->planes[plane].mem_priv); vb->planes[plane].mem_priv = NULL; dprintk(vb->vb2_queue, 3, "freed plane %d of buffer %d\n", plane, vb->index); } } /* * __vb2_buf_userptr_put() - release userspace memory associated with * a USERPTR buffer */ static void __vb2_buf_userptr_put(struct vb2_buffer *vb) { unsigned int plane; for (plane = 0; plane < vb->num_planes; ++plane) { if (vb->planes[plane].mem_priv) call_void_memop(vb, put_userptr, vb->planes[plane].mem_priv); vb->planes[plane].mem_priv = NULL; } } /* * __vb2_plane_dmabuf_put() - release memory associated with * a DMABUF shared plane */ static void __vb2_plane_dmabuf_put(struct vb2_buffer *vb, struct vb2_plane *p) { if (!p->mem_priv) return; if (!p->dbuf_duplicated) { if (p->dbuf_mapped) call_void_memop(vb, unmap_dmabuf, p->mem_priv); call_void_memop(vb, detach_dmabuf, p->mem_priv); } dma_buf_put(p->dbuf); p->mem_priv = NULL; p->dbuf = NULL; p->dbuf_mapped = 0; p->bytesused = 0; p->length = 0; p->m.fd = 0; p->data_offset = 0; p->dbuf_duplicated = false; } /* * __vb2_buf_dmabuf_put() - release memory associated with * a DMABUF shared buffer */ static void __vb2_buf_dmabuf_put(struct vb2_buffer *vb) { int plane; /* * When multiple planes share the same DMA buffer attachment, the plane * with the lowest index owns the mem_priv. 
* Put planes in the reversed order so that we don't leave invalid * mem_priv behind. */ for (plane = vb->num_planes - 1; plane >= 0; --plane) __vb2_plane_dmabuf_put(vb, &vb->planes[plane]); } /* * __vb2_buf_mem_prepare() - call ->prepare() on buffer's private memory * to sync caches */ static void __vb2_buf_mem_prepare(struct vb2_buffer *vb) { unsigned int plane; if (vb->synced) return; vb->synced = 1; for (plane = 0; plane < vb->num_planes; ++plane) call_void_memop(vb, prepare, vb->planes[plane].mem_priv); } /* * __vb2_buf_mem_finish() - call ->finish on buffer's private memory * to sync caches */ static void __vb2_buf_mem_finish(struct vb2_buffer *vb) { unsigned int plane; if (!vb->synced) return; vb->synced = 0; for (plane = 0; plane < vb->num_planes; ++plane) call_void_memop(vb, finish, vb->planes[plane].mem_priv); } /* * __setup_offsets() - setup unique offsets ("cookies") for every plane in * the buffer. */ static void __setup_offsets(struct vb2_buffer *vb) { struct vb2_queue *q = vb->vb2_queue; unsigned int plane; unsigned long offset = 0; /* * The offset "cookie" value has the following constraints: * - a buffer can have up to 8 planes. * - v4l2 mem2mem uses bit 30 to distinguish between * OUTPUT (aka "source", bit 30 is 0) and * CAPTURE (aka "destination", bit 30 is 1) buffers. * - must be page aligned * That led to this bit mapping when PAGE_SHIFT = 12: * |30 |29 15|14 12|11 0| * |DST_QUEUE_OFF_BASE|buffer index|plane index| 0 | * where there are 15 bits to store the buffer index. * Depending on PAGE_SHIFT value we can have fewer bits * to store the buffer index. 
*/ offset = vb->index << PLANE_INDEX_SHIFT; for (plane = 0; plane < vb->num_planes; ++plane) { vb->planes[plane].m.offset = offset + (plane << PAGE_SHIFT); dprintk(q, 3, "buffer %d, plane %d offset 0x%08lx\n", vb->index, plane, offset); } } static void init_buffer_cache_hints(struct vb2_queue *q, struct vb2_buffer *vb) { /* * DMA exporter should take care of cache syncs, so we can avoid * explicit ->prepare()/->finish() syncs. For other ->memory types * we always need ->prepare() or/and ->finish() cache sync. */ if (q->memory == VB2_MEMORY_DMABUF) { vb->skip_cache_sync_on_finish = 1; vb->skip_cache_sync_on_prepare = 1; return; } /* * ->finish() cache sync can be avoided when queue direction is * TO_DEVICE. */ if (q->dma_dir == DMA_TO_DEVICE) vb->skip_cache_sync_on_finish = 1; } /** * vb2_queue_add_buffer() - add a buffer to a queue * @q: pointer to &struct vb2_queue with videobuf2 queue. * @vb: pointer to &struct vb2_buffer to be added to the queue. * @index: index where add vb2_buffer in the queue */ static void vb2_queue_add_buffer(struct vb2_queue *q, struct vb2_buffer *vb, unsigned int index) { WARN_ON(index >= q->max_num_buffers || test_bit(index, q->bufs_bitmap) || vb->vb2_queue); q->bufs[index] = vb; vb->index = index; vb->vb2_queue = q; set_bit(index, q->bufs_bitmap); } /** * vb2_queue_remove_buffer() - remove a buffer from a queue * @vb: pointer to &struct vb2_buffer to be removed from the queue. */ static void vb2_queue_remove_buffer(struct vb2_buffer *vb) { clear_bit(vb->index, vb->vb2_queue->bufs_bitmap); vb->vb2_queue->bufs[vb->index] = NULL; vb->vb2_queue = NULL; } /* * __vb2_queue_alloc() - allocate vb2 buffer structures and (for MMAP type) * video buffer memory for all buffers/planes on the queue and initializes the * queue * @first_index: index of the first created buffer, all newly allocated buffers * have indices in the range [first_index..first_index+count-1] * * Returns the number of buffers successfully allocated. 
*/
static int __vb2_queue_alloc(struct vb2_queue *q, enum vb2_memory memory,
			     unsigned int num_buffers, unsigned int num_planes,
			     const unsigned int plane_sizes[VB2_MAX_PLANES],
			     unsigned int *first_index)
{
	unsigned int buffer, plane;
	struct vb2_buffer *vb;
	unsigned long index = q->max_num_buffers;
	int ret;

	/*
	 * Ensure that the number of buffers already in the queue plus the
	 * number of buffers being added does not exceed q->max_num_buffers.
	 */
	num_buffers = min_t(unsigned int, num_buffers,
			    q->max_num_buffers - vb2_get_num_buffers(q));

	/*
	 * Search the index bitmap for a free area of num_buffers entries,
	 * shrinking the request by one each time the search fails.
	 */
	while (num_buffers) {
		index = bitmap_find_next_zero_area(q->bufs_bitmap, q->max_num_buffers,
						   0, num_buffers, 0);

		if (index < q->max_num_buffers)
			break;
		/* Try to find free space for less buffers */
		num_buffers--;
	}

	/* If there is no space left to allocate buffers return 0 to indicate the error */
	if (!num_buffers) {
		*first_index = 0;
		return 0;
	}

	*first_index = index;

	/*
	 * Any failure inside the loop stops the allocation; 'buffer' then
	 * holds the number of buffers that were fully set up.
	 */
	for (buffer = 0; buffer < num_buffers; ++buffer) {
		/* Allocate vb2 buffer structures */
		vb = kzalloc(q->buf_struct_size, GFP_KERNEL);
		if (!vb) {
			dprintk(q, 1, "memory alloc for buffer struct failed\n");
			break;
		}

		vb->state = VB2_BUF_STATE_DEQUEUED;
		vb->num_planes = num_planes;
		vb->type = q->type;
		vb->memory = memory;
		init_buffer_cache_hints(q, vb);
		for (plane = 0; plane < num_planes; ++plane) {
			vb->planes[plane].length = plane_sizes[plane];
			vb->planes[plane].min_length = plane_sizes[plane];
		}

		/* The buffer becomes visible in q->bufs[] from here on. */
		vb2_queue_add_buffer(q, vb, index++);
		call_void_bufop(q, init_buffer, vb);

		/* Allocate video buffer memory for the MMAP type */
		if (memory == VB2_MEMORY_MMAP) {
			ret = __vb2_buf_mem_alloc(vb);
			if (ret) {
				dprintk(q, 1, "failed allocating memory for buffer %d\n",
					buffer);
				/* Undo the queue insertion before freeing the struct. */
				vb2_queue_remove_buffer(vb);
				kfree(vb);
				break;
			}
			__setup_offsets(vb);
			/*
			 * Call the driver-provided buffer initialization
			 * callback, if given. An error in initialization
			 * results in queue setup failure.
			 */
			ret = call_vb_qop(vb, buf_init, vb);
			if (ret) {
				dprintk(q, 1, "buffer %d %p initialization failed\n",
					buffer, vb);
				/* Release plane memory first, then unlink and free. */
				__vb2_buf_mem_free(vb);
				vb2_queue_remove_buffer(vb);
				kfree(vb);
				break;
			}
		}
	}

	dprintk(q, 3, "allocated %d buffers, %d plane(s) each\n",
		buffer, num_planes);

	return buffer;
}

/*
 * __vb2_free_mem() - release video buffer memory for a given range of
 * buffers in a given queue
 *
 * Indices in [start, start + count) without a buffer are skipped.  The
 * release method is chosen by q->memory: MMAP buffers are freed, DMABUF
 * buffers are put, and every other memory type takes the USERPTR path.
 */
static void __vb2_free_mem(struct vb2_queue *q, unsigned int start, unsigned int count)
{
	unsigned int i;
	struct vb2_buffer *vb;

	for (i = start; i < start + count; i++) {
		vb = vb2_get_buffer(q, i);
		if (!vb)
			continue;

		/* Free MMAP buffers or release USERPTR buffers */
		if (q->memory == VB2_MEMORY_MMAP)
			__vb2_buf_mem_free(vb);
		else if (q->memory == VB2_MEMORY_DMABUF)
			__vb2_buf_dmabuf_put(vb);
		else
			__vb2_buf_userptr_put(vb);
	}
}

/*
 * __vb2_queue_free() - free @count buffers from @start index of the queue - video memory and
 * related information, if no buffers are left return the queue to an
 * uninitialized state. Might be called even if the queue has already been freed.
 *
 * Must be called with q->mmap_lock held (asserted via lockdep).
 */
static void __vb2_queue_free(struct vb2_queue *q, unsigned int start, unsigned int count)
{
	unsigned int i;

	lockdep_assert_held(&q->mmap_lock);

	/* Call driver-provided cleanup function for each buffer, if provided */
	for (i = start; i < start + count; i++) {
		struct vb2_buffer *vb = vb2_get_buffer(q, i);

		/* Only buffers whose first plane has mem_priv set are cleaned up. */
		if (vb && vb->planes[0].mem_priv)
			call_void_vb_qop(vb, buf_cleanup, vb);
	}

	/* Release video buffer memory */
	__vb2_free_mem(q, start, count);

#ifdef CONFIG_VIDEO_ADV_DEBUG
	/*
	 * Check that all the calls were balanced during the life-time of this
	 * queue. If not then dump the counters to the kernel log.
*/
	if (vb2_get_num_buffers(q)) {
		/* Each pair of queue-level counters is expected to match. */
		bool unbalanced = q->cnt_start_streaming != q->cnt_stop_streaming ||
				  q->cnt_prepare_streaming != q->cnt_unprepare_streaming ||
				  q->cnt_wait_prepare != q->cnt_wait_finish;

		if (unbalanced) {
			pr_info("unbalanced counters for queue %p:\n", q);
			if (q->cnt_start_streaming != q->cnt_stop_streaming)
				pr_info(" setup: %u start_streaming: %u stop_streaming: %u\n",
					q->cnt_queue_setup, q->cnt_start_streaming,
					q->cnt_stop_streaming);
			if (q->cnt_prepare_streaming != q->cnt_unprepare_streaming)
				pr_info(" prepare_streaming: %u unprepare_streaming: %u\n",
					q->cnt_prepare_streaming, q->cnt_unprepare_streaming);
			if (q->cnt_wait_prepare != q->cnt_wait_finish)
				pr_info(" wait_prepare: %u wait_finish: %u\n",
					q->cnt_wait_prepare, q->cnt_wait_finish);
		}
		/* Restart the queue-level statistics from zero. */
		q->cnt_queue_setup = 0;
		q->cnt_wait_prepare = 0;
		q->cnt_wait_finish = 0;
		q->cnt_prepare_streaming = 0;
		q->cnt_start_streaming = 0;
		q->cnt_stop_streaming = 0;
		q->cnt_unprepare_streaming = 0;
	}
	/* Repeat the balance check for the counters kept per buffer. */
	for (i = start; i < start + count; i++) {
		struct vb2_buffer *vb = vb2_get_buffer(q, i);
		bool unbalanced;

		if (!vb)
			continue;

		unbalanced = vb->cnt_mem_alloc != vb->cnt_mem_put ||
			     vb->cnt_mem_prepare != vb->cnt_mem_finish ||
			     vb->cnt_mem_get_userptr != vb->cnt_mem_put_userptr ||
			     vb->cnt_mem_attach_dmabuf != vb->cnt_mem_detach_dmabuf ||
			     vb->cnt_mem_map_dmabuf != vb->cnt_mem_unmap_dmabuf ||
			     vb->cnt_buf_queue != vb->cnt_buf_done ||
			     vb->cnt_buf_prepare != vb->cnt_buf_finish ||
			     vb->cnt_buf_init != vb->cnt_buf_cleanup;

		if (unbalanced) {
			pr_info("unbalanced counters for queue %p, buffer %d:\n",
				q, i);
			if (vb->cnt_buf_init != vb->cnt_buf_cleanup)
				pr_info(" buf_init: %u buf_cleanup: %u\n",
					vb->cnt_buf_init, vb->cnt_buf_cleanup);
			if (vb->cnt_buf_prepare != vb->cnt_buf_finish)
				pr_info(" buf_prepare: %u buf_finish: %u\n",
					vb->cnt_buf_prepare, vb->cnt_buf_finish);
			if (vb->cnt_buf_queue != vb->cnt_buf_done)
				pr_info(" buf_out_validate: %u buf_queue: %u buf_done: %u buf_request_complete: %u\n",
					vb->cnt_buf_out_validate, vb->cnt_buf_queue,
					vb->cnt_buf_done, vb->cnt_buf_request_complete);
			if (vb->cnt_mem_alloc != vb->cnt_mem_put)
				pr_info(" alloc: %u put: %u\n",
					vb->cnt_mem_alloc, vb->cnt_mem_put);
			if (vb->cnt_mem_prepare != vb->cnt_mem_finish)
				pr_info(" prepare: %u finish: %u\n",
					vb->cnt_mem_prepare, vb->cnt_mem_finish);
			if (vb->cnt_mem_get_userptr != vb->cnt_mem_put_userptr)
				pr_info(" get_userptr: %u put_userptr: %u\n",
					vb->cnt_mem_get_userptr, vb->cnt_mem_put_userptr);
			if (vb->cnt_mem_attach_dmabuf != vb->cnt_mem_detach_dmabuf)
				pr_info(" attach_dmabuf: %u detach_dmabuf: %u\n",
					vb->cnt_mem_attach_dmabuf, vb->cnt_mem_detach_dmabuf);
			if (vb->cnt_mem_map_dmabuf != vb->cnt_mem_unmap_dmabuf)
				pr_info(" map_dmabuf: %u unmap_dmabuf: %u\n",
					vb->cnt_mem_map_dmabuf, vb->cnt_mem_unmap_dmabuf);
			/* These two counters have no counterpart; they are printed for information. */
			pr_info(" get_dmabuf: %u num_users: %u\n",
				vb->cnt_mem_get_dmabuf,
				vb->cnt_mem_num_users);
		}
	}
#endif

	/* Free vb2 buffers */
	for (i = start; i < start + count; i++) {
		struct vb2_buffer *vb = vb2_get_buffer(q, i);

		if (!vb)
			continue;

		vb2_queue_remove_buffer(vb);
		kfree(vb);
	}

	/* With no buffers left the queue goes back to its uninitialized state. */
	if (!vb2_get_num_buffers(q)) {
		q->memory = VB2_MEMORY_UNKNOWN;
		INIT_LIST_HEAD(&q->queued_list);
	}
}

/*
 * vb2_buffer_in_use() - check the planes of @vb for memory that reports
 * more than one user through the num_users memop
 */
bool vb2_buffer_in_use(struct vb2_queue *q, struct vb2_buffer *vb)
{
	unsigned int plane;
	for (plane = 0; plane < vb->num_planes; ++plane) {
		void *mem_priv = vb->planes[plane].mem_priv;
		/*
		 * If num_users() has not been provided, call_memop
		 * will return 0, apparently nobody cares about this
		 * case anyway. If num_users() returns more than 1,
		 * we are not the only user of the plane's memory.
*/ if (mem_priv && call_memop(vb, num_users, mem_priv) > 1) return true; } return false; } EXPORT_SYMBOL(vb2_buffer_in_use); /* * __buffers_in_use() - return true if any buffers on the queue are in use and * the queue cannot be freed (by the means of REQBUFS(0)) call */ static bool __buffers_in_use(struct vb2_queue *q) { unsigned int buffer; for (buffer = 0; buffer < q->max_num_buffers; ++buffer) { struct vb2_buffer *vb = vb2_get_buffer(q, buffer); if (!vb) continue; if (vb2_buffer_in_use(q, vb)) return true; } return false; } void vb2_core_querybuf(struct vb2_queue *q, struct vb2_buffer *vb, void *pb) { call_void_bufop(q, fill_user_buffer, vb, pb); } EXPORT_SYMBOL_GPL(vb2_core_querybuf); /* * __verify_userptr_ops() - verify that all memory operations required for * USERPTR queue type have been provided */ static int __verify_userptr_ops(struct vb2_queue *q) { if (!(q->io_modes & VB2_USERPTR) || !q->mem_ops->get_userptr || !q->mem_ops->put_userptr) return -EINVAL; return 0; } /* * __verify_mmap_ops() - verify that all memory operations required for * MMAP queue type have been provided */ static int __verify_mmap_ops(struct vb2_queue *q) { if (!(q->io_modes & VB2_MMAP) || !q->mem_ops->alloc || !q->mem_ops->put || !q->mem_ops->mmap) return -EINVAL; return 0; } /* * __verify_dmabuf_ops() - verify that all memory operations required for * DMABUF queue type have been provided */ static int __verify_dmabuf_ops(struct vb2_queue *q) { if (!(q->io_modes & VB2_DMABUF) || !q->mem_ops->attach_dmabuf || !q->mem_ops->detach_dmabuf || !q->mem_ops->map_dmabuf || !q->mem_ops->unmap_dmabuf) return -EINVAL; return 0; } int vb2_verify_memory_type(struct vb2_queue *q, enum vb2_memory memory, unsigned int type) { if (memory != VB2_MEMORY_MMAP && memory != VB2_MEMORY_USERPTR && memory != VB2_MEMORY_DMABUF) { dprintk(q, 1, "unsupported memory type\n"); return -EINVAL; } if (type != q->type) { dprintk(q, 1, "requested type is incorrect\n"); return -EINVAL; } /* * Make sure all the required 
memory ops for given memory type
	 * are available.
	 */
	if (memory == VB2_MEMORY_MMAP && __verify_mmap_ops(q)) {
		dprintk(q, 1, "MMAP for current setup unsupported\n");
		return -EINVAL;
	}

	if (memory == VB2_MEMORY_USERPTR && __verify_userptr_ops(q)) {
		dprintk(q, 1, "USERPTR for current setup unsupported\n");
		return -EINVAL;
	}

	if (memory == VB2_MEMORY_DMABUF && __verify_dmabuf_ops(q)) {
		dprintk(q, 1, "DMABUF for current setup unsupported\n");
		return -EINVAL;
	}

	/*
	 * Place the busy tests at the end: -EBUSY can be ignored when
	 * create_bufs is called with count == 0, but count == 0 should still
	 * do the memory and type validation.
	 */
	if (vb2_fileio_is_active(q)) {
		dprintk(q, 1, "file io in progress\n");
		return -EBUSY;
	}
	return 0;
}
EXPORT_SYMBOL(vb2_verify_memory_type);

/*
 * set_queue_coherency() - record the requested coherency model on the queue
 *
 * q->non_coherent_mem is always cleared first and only takes the requested
 * value when vb2_queue_allows_cache_hints() accepts the queue.
 */
static void set_queue_coherency(struct vb2_queue *q, bool non_coherent_mem)
{
	q->non_coherent_mem = 0;

	if (!vb2_queue_allows_cache_hints(q))
		return;
	q->non_coherent_mem = non_coherent_mem;
}

/*
 * verify_coherency_flags() - return true when the requested coherency model
 * equals the one already stored in the queue, false (with a debug message)
 * otherwise
 */
static bool verify_coherency_flags(struct vb2_queue *q, bool non_coherent_mem)
{
	if (non_coherent_mem != q->non_coherent_mem) {
		dprintk(q, 1, "memory coherency model mismatch\n");
		return false;
	}
	return true;
}

/*
 * vb2_core_allocated_buffers_storage() - make sure q->bufs and
 * q->bufs_bitmap exist, allocating whichever is still NULL with room for
 * q->max_num_buffers entries
 *
 * Returns 0 on success or -ENOMEM. If the bitmap cannot be allocated,
 * q->bufs is released and reset to NULL as well.
 */
static int vb2_core_allocated_buffers_storage(struct vb2_queue *q)
{
	if (!q->bufs)
		q->bufs = kcalloc(q->max_num_buffers, sizeof(*q->bufs), GFP_KERNEL);
	if (!q->bufs)
		return -ENOMEM;

	if (!q->bufs_bitmap)
		q->bufs_bitmap = bitmap_zalloc(q->max_num_buffers, GFP_KERNEL);
	if (!q->bufs_bitmap) {
		kfree(q->bufs);
		q->bufs = NULL;
		return -ENOMEM;
	}

	return 0;
}

/*
 * vb2_core_free_buffers_storage() - release q->bufs and q->bufs_bitmap and
 * reset both pointers to NULL
 */
static void vb2_core_free_buffers_storage(struct vb2_queue *q)
{
	kfree(q->bufs);
	q->bufs = NULL;
	bitmap_free(q->bufs_bitmap);
	q->bufs_bitmap = NULL;
}

/*
 * vb2_core_reqbufs() - (re)initialize the buffers of a queue
 * @q:		videobuf2 queue
 * @memory:	memory model the buffers will use
 * @flags:	only V4L2_MEMORY_FLAG_NON_COHERENT is looked at here
 * @count:	in: requested number of buffers (0 frees everything);
 *		out: number of buffers actually allocated
 *
 * Returns 0 on success or a negative error code: -EBUSY while streaming or
 * while another fd waits in dqbuf, -ENOMEM/-EINVAL or whatever queue_setup
 * returned otherwise.
 */
int vb2_core_reqbufs(struct vb2_queue *q, enum vb2_memory memory,
		     unsigned int flags, unsigned int *count)
{
	unsigned int num_buffers, allocated_buffers, num_planes = 0;
	unsigned int q_num_bufs = vb2_get_num_buffers(q);
	unsigned plane_sizes[VB2_MAX_PLANES] = { };
	bool non_coherent_mem = flags & V4L2_MEMORY_FLAG_NON_COHERENT;
	unsigned int i, first_index;
	int ret = 0;

	if (q->streaming) {
		dprintk(q, 1, "streaming active\n");
		return -EBUSY;
	}

	if (q->waiting_in_dqbuf && *count) {
		dprintk(q, 1, "another dup()ped fd is waiting for a buffer\n");
		return -EBUSY;
	}

	if (*count == 0 || q_num_bufs != 0 ||
	    (q->memory != VB2_MEMORY_UNKNOWN && q->memory != memory) ||
	    !verify_coherency_flags(q, non_coherent_mem)) {
		/*
		 * We already have buffers allocated, so first check if they
		 * are not in use and can be freed.
		 */
		mutex_lock(&q->mmap_lock);
		if (debug && q->memory == VB2_MEMORY_MMAP &&
		    __buffers_in_use(q))
			dprintk(q, 1, "memory in use, orphaning buffers\n");

		/*
		 * Call queue_cancel to clean up any buffers in the
		 * QUEUED state which is possible if buffers were prepared or
		 * queued without ever calling STREAMON.
		 */
		__vb2_queue_cancel(q);
		__vb2_queue_free(q, 0, q->max_num_buffers);
		mutex_unlock(&q->mmap_lock);

		q->is_busy = 0;
		/*
		 * In case of REQBUFS(0) return immediately without calling
		 * driver's queue_setup() callback and allocating resources.
		 */
		if (*count == 0)
			return 0;
	}

	/*
	 * Make sure the requested values and current defaults are sane.
	 */
	num_buffers = max_t(unsigned int, *count, q->min_reqbufs_allocation);
	num_buffers = min_t(unsigned int, num_buffers, q->max_num_buffers);
	memset(q->alloc_devs, 0, sizeof(q->alloc_devs));
	/*
	 * Set this now to ensure that drivers see the correct q->memory value
	 * in the queue_setup op.
	 */
	mutex_lock(&q->mmap_lock);
	ret = vb2_core_allocated_buffers_storage(q);
	q->memory = memory;
	mutex_unlock(&q->mmap_lock);
	if (ret)
		return ret;
	set_queue_coherency(q, non_coherent_mem);

	/*
	 * Ask the driver how many buffers and planes per buffer it requires.
	 * Driver also sets the size and allocator context for each plane.
	 */
	ret = call_qop(q, queue_setup, q, &num_buffers, &num_planes,
		       plane_sizes, q->alloc_devs);
	if (ret)
		goto error;

	/* Check that driver has set sane values */
	if (WARN_ON(!num_planes)) {
		ret = -EINVAL;
		goto error;
	}

	for (i = 0; i < num_planes; i++)
		if (WARN_ON(!plane_sizes[i])) {
			ret = -EINVAL;
			goto error;
		}

	/* Finally, allocate buffers and video memory */
	allocated_buffers =
		__vb2_queue_alloc(q, memory, num_buffers, num_planes, plane_sizes, &first_index);
	if (allocated_buffers == 0) {
		/* There shouldn't be any buffers allocated, so first_index == 0 */
		WARN_ON(first_index);
		dprintk(q, 1, "memory allocation failed\n");
		ret = -ENOMEM;
		goto error;
	}

	/*
	 * There is no point in continuing if we can't allocate the minimum
	 * number of buffers needed by this vb2_queue.
	 */
	if (allocated_buffers < q->min_reqbufs_allocation)
		ret = -ENOMEM;

	/*
	 * Check if driver can handle the allocated number of buffers.
	 */
	if (!ret && allocated_buffers < num_buffers) {
		num_buffers = allocated_buffers;
		/*
		 * num_planes is set by the previous queue_setup(), but since it
		 * signals to queue_setup() whether it is called from create_bufs()
		 * vs reqbufs() we zero it here to signal that queue_setup() is
		 * called for the reqbufs() case.
		 */
		num_planes = 0;

		ret = call_qop(q, queue_setup, q, &num_buffers,
			       &num_planes, plane_sizes, q->alloc_devs);

		if (!ret && allocated_buffers < num_buffers)
			ret = -ENOMEM;

		/*
		 * Either the driver has accepted a smaller number of buffers,
		 * or .queue_setup() returned an error
		 */
	}

	mutex_lock(&q->mmap_lock);

	if (ret < 0) {
		/*
		 * Note: __vb2_queue_free() will subtract 'allocated_buffers'
		 * from already queued buffers and it will reset q->memory to
		 * VB2_MEMORY_UNKNOWN.
		 */
		__vb2_queue_free(q, first_index, allocated_buffers);
		mutex_unlock(&q->mmap_lock);
		return ret;
	}
	mutex_unlock(&q->mmap_lock);

	/*
	 * Return the number of successfully allocated buffers
	 * to the userspace.
*/
	*count = allocated_buffers;
	q->waiting_for_buffers = !q->is_output;
	q->is_busy = 1;

	return 0;

error:
	/* Nothing was allocated: forget the memory model and the storage. */
	mutex_lock(&q->mmap_lock);
	q->memory = VB2_MEMORY_UNKNOWN;
	mutex_unlock(&q->mmap_lock);
	vb2_core_free_buffers_storage(q);
	return ret;
}
EXPORT_SYMBOL_GPL(vb2_core_reqbufs);

/*
 * vb2_core_create_bufs() - add buffers to a queue
 * @q:			videobuf2 queue
 * @memory:		memory model; must match q->memory when buffers
 *			already exist
 * @flags:		only V4L2_MEMORY_FLAG_NON_COHERENT is looked at here
 * @count:		in: requested number of buffers; out: number allocated
 * @requested_planes:	number of planes, forwarded to queue_setup when
 *			@requested_sizes is also given
 * @requested_sizes:	plane sizes; VB2_MAX_PLANES entries are copied
 * @first_index:	filled in by __vb2_queue_alloc()
 *
 * Returns 0 on success, -ENOBUFS when the queue is already full, or another
 * negative error code.
 */
int vb2_core_create_bufs(struct vb2_queue *q, enum vb2_memory memory,
			 unsigned int flags, unsigned int *count,
			 unsigned int requested_planes,
			 const unsigned int requested_sizes[],
			 unsigned int *first_index)
{
	unsigned int num_planes = 0, num_buffers, allocated_buffers;
	unsigned plane_sizes[VB2_MAX_PLANES] = { };
	bool non_coherent_mem = flags & V4L2_MEMORY_FLAG_NON_COHERENT;
	unsigned int q_num_bufs = vb2_get_num_buffers(q);
	bool no_previous_buffers = !q_num_bufs;
	int ret = 0;

	if (q_num_bufs == q->max_num_buffers) {
		dprintk(q, 1, "maximum number of buffers already allocated\n");
		return -ENOBUFS;
	}

	if (no_previous_buffers) {
		if (q->waiting_in_dqbuf && *count) {
			dprintk(q, 1, "another dup()ped fd is waiting for a buffer\n");
			return -EBUSY;
		}
		memset(q->alloc_devs, 0, sizeof(q->alloc_devs));
		/*
		 * Set this now to ensure that drivers see the correct q->memory
		 * value in the queue_setup op.
		 */
		mutex_lock(&q->mmap_lock);
		ret = vb2_core_allocated_buffers_storage(q);
		q->memory = memory;
		mutex_unlock(&q->mmap_lock);
		if (ret)
			return ret;
		q->waiting_for_buffers = !q->is_output;
		set_queue_coherency(q, non_coherent_mem);
	} else {
		if (q->memory != memory) {
			dprintk(q, 1, "memory model mismatch\n");
			return -EINVAL;
		}
		if (!verify_coherency_flags(q, non_coherent_mem))
			return -EINVAL;
	}

	/* Never ask for more than the free slots left in the queue. */
	num_buffers = min(*count, q->max_num_buffers - q_num_bufs);

	if (requested_planes && requested_sizes) {
		num_planes = requested_planes;
		memcpy(plane_sizes, requested_sizes, sizeof(plane_sizes));
	}

	/*
	 * Ask the driver, whether the requested number of buffers, planes per
	 * buffer and their sizes are acceptable
	 */
	ret = call_qop(q, queue_setup, q, &num_buffers,
		       &num_planes, plane_sizes, q->alloc_devs);
	if (ret)
		goto error;

	/* Finally, allocate buffers and video memory */
	allocated_buffers = __vb2_queue_alloc(q, memory, num_buffers,
				num_planes, plane_sizes, first_index);
	if (allocated_buffers == 0) {
		dprintk(q, 1, "memory allocation failed\n");
		ret = -ENOMEM;
		goto error;
	}

	/*
	 * Check if driver can handle the so far allocated number of buffers.
	 */
	if (allocated_buffers < num_buffers) {
		num_buffers = allocated_buffers;

		/*
		 * num_buffers contains the total number of buffers, that the
		 * queue driver has set up
		 */
		ret = call_qop(q, queue_setup, q, &num_buffers,
			       &num_planes, plane_sizes, q->alloc_devs);

		if (!ret && allocated_buffers < num_buffers)
			ret = -ENOMEM;

		/*
		 * Either the driver has accepted a smaller number of buffers,
		 * or .queue_setup() returned an error
		 */
	}

	mutex_lock(&q->mmap_lock);

	if (ret < 0) {
		/*
		 * Note: __vb2_queue_free() will subtract 'allocated_buffers'
		 * from already queued buffers and it will reset q->memory to
		 * VB2_MEMORY_UNKNOWN.
		 */
		__vb2_queue_free(q, *first_index, allocated_buffers);
		mutex_unlock(&q->mmap_lock);
		/*
		 * NOTE(review): whatever queue_setup() returned is replaced by
		 * -ENOMEM here, whereas vb2_core_reqbufs() returns ret on its
		 * equivalent path. Confirm this is intentional before changing.
		 */
		return -ENOMEM;
	}
	mutex_unlock(&q->mmap_lock);

	/*
	 * Return the number of successfully allocated buffers
	 * to the userspace.
	 */
	*count = allocated_buffers;
	q->is_busy = 1;

	return 0;

error:
	/* q->memory was only set by this call if the queue was empty. */
	if (no_previous_buffers) {
		mutex_lock(&q->mmap_lock);
		q->memory = VB2_MEMORY_UNKNOWN;
		mutex_unlock(&q->mmap_lock);
	}
	return ret;
}
EXPORT_SYMBOL_GPL(vb2_core_create_bufs);

/*
 * vb2_plane_vaddr() - result of the vaddr memop for plane @plane_no of @vb,
 * or NULL when the plane index is out of range or the plane has no mem_priv
 */
void *vb2_plane_vaddr(struct vb2_buffer *vb, unsigned int plane_no)
{
	if (plane_no >= vb->num_planes || !vb->planes[plane_no].mem_priv)
		return NULL;

	return call_ptr_memop(vaddr, vb, vb->planes[plane_no].mem_priv);

}
EXPORT_SYMBOL_GPL(vb2_plane_vaddr);

/*
 * vb2_plane_cookie() - result of the cookie memop for plane @plane_no of
 * @vb, or NULL when the plane index is out of range or the plane has no
 * mem_priv
 */
void *vb2_plane_cookie(struct vb2_buffer *vb, unsigned int plane_no)
{
	if (plane_no >= vb->num_planes || !vb->planes[plane_no].mem_priv)
		return NULL;

	return call_ptr_memop(cookie, vb, vb->planes[plane_no].mem_priv);
}
EXPORT_SYMBOL_GPL(vb2_plane_cookie);

void vb2_buffer_done(struct vb2_buffer *vb, enum vb2_buffer_state state)
{
	struct vb2_queue *q = vb->vb2_queue;
	unsigned long flags;

	if (WARN_ON(vb->state != VB2_BUF_STATE_ACTIVE))
		return;

	if (WARN_ON(state != VB2_BUF_STATE_DONE &&
		    state != VB2_BUF_STATE_ERROR &&
		    state != VB2_BUF_STATE_QUEUED))
		state = VB2_BUF_STATE_ERROR;

#ifdef CONFIG_VIDEO_ADV_DEBUG
	/*
	 * Although this is not a callback, it still does have to balance
	 * with the buf_queue op. So update this counter manually.
*/
	vb->cnt_buf_done++;
#endif
	dprintk(q, 4, "done processing on buffer %d, state: %s\n",
		vb->index, vb2_state_name(state));

	/* A buffer handed back as QUEUED skips the memory finish step. */
	if (state != VB2_BUF_STATE_QUEUED)
		__vb2_buf_mem_finish(vb);

	spin_lock_irqsave(&q->done_lock, flags);
	if (state == VB2_BUF_STATE_QUEUED) {
		vb->state = VB2_BUF_STATE_QUEUED;
	} else {
		/* Add the buffer to the done buffers list */
		list_add_tail(&vb->done_entry, &q->done_list);
		vb->state = state;
	}
	atomic_dec(&q->owned_by_drv_count);

	if (state != VB2_BUF_STATE_QUEUED && vb->req_obj.req) {
		media_request_object_unbind(&vb->req_obj);
		media_request_object_put(&vb->req_obj);
	}

	spin_unlock_irqrestore(&q->done_lock, flags);

	trace_vb2_buf_done(q, vb);

	switch (state) {
	case VB2_BUF_STATE_QUEUED:
		return;
	default:
		/* Inform any processes that may be waiting for buffers */
		wake_up(&q->done_wq);
		break;
	}
}
EXPORT_SYMBOL_GPL(vb2_buffer_done);

/*
 * vb2_discard_done() - mark every buffer currently on the done list as
 * VB2_BUF_STATE_ERROR, under done_lock; the buffers stay on the list
 */
void vb2_discard_done(struct vb2_queue *q)
{
	struct vb2_buffer *vb;
	unsigned long flags;

	spin_lock_irqsave(&q->done_lock, flags);
	list_for_each_entry(vb, &q->done_list, done_entry)
		vb->state = VB2_BUF_STATE_ERROR;
	spin_unlock_irqrestore(&q->done_lock, flags);
}
EXPORT_SYMBOL_GPL(vb2_discard_done);

/*
 * __prepare_mmap() - prepare an MMAP buffer
 *
 * Calls the fill_vb2_buffer buffer op and, only if that succeeds, the
 * driver's buf_prepare op. Returns the first non-zero result.
 */
static int __prepare_mmap(struct vb2_buffer *vb)
{
	int ret = 0;

	ret = call_bufop(vb->vb2_queue, fill_vb2_buffer,
			 vb, vb->planes);
	return ret ? ret : call_vb_qop(vb, buf_prepare, vb);
}

/*
 * __prepare_userptr() - prepare a USERPTR buffer
 *
 * Returns 0 on success or a negative error code; on failure all planes
 * acquired so far are released again.
 */
static int __prepare_userptr(struct vb2_buffer *vb)
{
	struct vb2_plane planes[VB2_MAX_PLANES];
	struct vb2_queue *q = vb->vb2_queue;
	void *mem_priv;
	unsigned int plane;
	int ret = 0;
	/* No memory attached to plane 0 yet: buf_init has to run below. */
	bool reacquired = vb->planes[0].mem_priv == NULL;

	memset(planes, 0, sizeof(planes[0]) * vb->num_planes);
	/* Copy relevant information provided by the userspace */
	ret = call_bufop(vb->vb2_queue, fill_vb2_buffer,
			 vb, planes);
	if (ret)
		return ret;

	for (plane = 0; plane < vb->num_planes; ++plane) {
		/* Skip the plane if already verified */
		if (vb->planes[plane].m.userptr &&
			vb->planes[plane].m.userptr == planes[plane].m.userptr
			&& vb->planes[plane].length == planes[plane].length)
			continue;

		dprintk(q, 3, "userspace address for plane %d changed, reacquiring memory\n",
			plane);

		/* Check if the provided plane buffer is large enough */
		if (planes[plane].length < vb->planes[plane].min_length) {
			dprintk(q, 1, "provided buffer size %u is less than setup size %u for plane %d\n",
						planes[plane].length,
						vb->planes[plane].min_length,
						plane);
			ret = -EINVAL;
			goto err;
		}

		/* Release previously acquired memory if present */
		if (vb->planes[plane].mem_priv) {
			if (!reacquired) {
				reacquired = true;
				vb->copied_timestamp = 0;
				call_void_vb_qop(vb, buf_cleanup, vb);
			}
			call_void_memop(vb, put_userptr, vb->planes[plane].mem_priv);
		}

		vb->planes[plane].mem_priv = NULL;
		vb->planes[plane].bytesused = 0;
		vb->planes[plane].length = 0;
		vb->planes[plane].m.userptr = 0;
		vb->planes[plane].data_offset = 0;

		/* Acquire each plane's memory */
		mem_priv = call_ptr_memop(get_userptr,
					  vb,
					  q->alloc_devs[plane] ? : q->dev,
					  planes[plane].m.userptr,
					  planes[plane].length);
		if (IS_ERR(mem_priv)) {
			dprintk(q, 1, "failed acquiring userspace memory for plane %d\n",
				plane);
			ret = PTR_ERR(mem_priv);
			goto err;
		}
		vb->planes[plane].mem_priv = mem_priv;
	}

	/*
	 * Now that everything is in order, copy relevant information
	 * provided by userspace.
*/
	for (plane = 0; plane < vb->num_planes; ++plane) {
		vb->planes[plane].bytesused = planes[plane].bytesused;
		vb->planes[plane].length = planes[plane].length;
		vb->planes[plane].m.userptr = planes[plane].m.userptr;
		vb->planes[plane].data_offset = planes[plane].data_offset;
	}

	if (reacquired) {
		/*
		 * One or more planes changed, so we must call buf_init to do
		 * the driver-specific initialization on the newly acquired
		 * buffer, if provided.
		 */
		ret = call_vb_qop(vb, buf_init, vb);
		if (ret) {
			dprintk(q, 1, "buffer initialization failed\n");
			goto err;
		}
	}

	ret = call_vb_qop(vb, buf_prepare, vb);
	if (ret) {
		dprintk(q, 1, "buffer preparation failed\n");
		call_void_vb_qop(vb, buf_cleanup, vb);
		goto err;
	}

	return 0;
err:
	/* In case of errors, release planes that were already acquired */
	for (plane = 0; plane < vb->num_planes; ++plane) {
		if (vb->planes[plane].mem_priv)
			call_void_memop(vb, put_userptr,
				vb->planes[plane].mem_priv);
		vb->planes[plane].mem_priv = NULL;
		vb->planes[plane].m.userptr = 0;
		vb->planes[plane].length = 0;
	}

	return ret;
}

/*
 * __prepare_dmabuf() - prepare a DMABUF buffer
 *
 * Returns 0 on success or a negative error code.
 */
static int __prepare_dmabuf(struct vb2_buffer *vb)
{
	struct vb2_plane planes[VB2_MAX_PLANES];
	struct vb2_queue *q = vb->vb2_queue;
	void *mem_priv;
	unsigned int plane, i;
	int ret = 0;
	/* No memory attached to plane 0 yet: everything must be acquired. */
	bool reacquired = vb->planes[0].mem_priv == NULL;

	memset(planes, 0, sizeof(planes[0]) * vb->num_planes);
	/* Copy relevant information provided by the userspace */
	ret = call_bufop(vb->vb2_queue, fill_vb2_buffer,
			 vb, planes);
	if (ret)
		return ret;

	/* First pass: take a reference on every dmabuf and validate it. */
	for (plane = 0; plane < vb->num_planes; ++plane) {
		struct dma_buf *dbuf = dma_buf_get(planes[plane].m.fd);

		planes[plane].dbuf = dbuf;

		if (IS_ERR_OR_NULL(dbuf)) {
			dprintk(q, 1, "invalid dmabuf fd for plane %d\n",
				plane);
			ret = -EINVAL;
			goto err_put_planes;
		}

		/* use DMABUF size if length is not provided */
		if (planes[plane].length == 0)
			planes[plane].length = dbuf->size;

		if (planes[plane].length < vb->planes[plane].min_length) {
			dprintk(q, 1, "invalid dmabuf length %u for plane %d, minimum length %u\n",
				planes[plane].length, plane,
				vb->planes[plane].min_length);
			ret = -EINVAL;
			goto err_put_planes;
		}

		/* Skip the plane if already verified */
		if (dbuf == vb->planes[plane].dbuf &&
		    vb->planes[plane].length == planes[plane].length)
			continue;

		dprintk(q, 3, "buffer for plane %d changed\n", plane);

		reacquired = true;
	}

	if (reacquired) {
		if (vb->planes[0].mem_priv) {
			vb->copied_timestamp = 0;
			call_void_vb_qop(vb, buf_cleanup, vb);
			__vb2_buf_dmabuf_put(vb);
		}

		for (plane = 0; plane < vb->num_planes; ++plane) {
			/*
			 * This is an optimization to reduce dma_buf attachment/mapping.
			 * When the same dma_buf is used for multiple planes, there is no need
			 * to create duplicated attachments.
			 */
			for (i = 0; i < plane; ++i) {
				if (planes[plane].dbuf == vb->planes[i].dbuf &&
				    q->alloc_devs[plane] == q->alloc_devs[i]) {
					vb->planes[plane].dbuf_duplicated = true;
					vb->planes[plane].dbuf = vb->planes[i].dbuf;
					vb->planes[plane].mem_priv = vb->planes[i].mem_priv;
					break;
				}
			}

			if (vb->planes[plane].dbuf_duplicated)
				continue;

			/* Acquire each plane's memory */
			mem_priv = call_ptr_memop(attach_dmabuf,
						  vb,
						  q->alloc_devs[plane] ? : q->dev,
						  planes[plane].dbuf,
						  planes[plane].length);
			if (IS_ERR(mem_priv)) {
				dprintk(q, 1, "failed to attach dmabuf\n");
				ret = PTR_ERR(mem_priv);
				goto err_put_vb2_buf;
			}

			vb->planes[plane].dbuf = planes[plane].dbuf;
			vb->planes[plane].mem_priv = mem_priv;

			/*
			 * This pins the buffer(s) with dma_buf_map_attachment(). It's done
			 * here instead just before the DMA, while queueing the buffer(s) so
			 * userspace knows sooner rather than later if the dma-buf map fails.
			 */
			ret = call_memop(vb, map_dmabuf, vb->planes[plane].mem_priv);
			if (ret) {
				dprintk(q, 1, "failed to map dmabuf for plane %d\n",
					plane);
				goto err_put_vb2_buf;
			}
			vb->planes[plane].dbuf_mapped = 1;
		}
	} else {
		/* Nothing changed: drop the references taken in the first pass. */
		for (plane = 0; plane < vb->num_planes; ++plane)
			dma_buf_put(planes[plane].dbuf);
	}

	/*
	 * Now that everything is in order, copy relevant information
	 * provided by userspace.
	 */
	for (plane = 0; plane < vb->num_planes; ++plane) {
		vb->planes[plane].bytesused = planes[plane].bytesused;
		vb->planes[plane].length = planes[plane].length;
		vb->planes[plane].m.fd = planes[plane].m.fd;
		vb->planes[plane].data_offset = planes[plane].data_offset;
	}

	if (reacquired) {
		/*
		 * Call driver-specific initialization on the newly acquired buffer,
		 * if provided.
		 */
		ret = call_vb_qop(vb, buf_init, vb);
		if (ret) {
			dprintk(q, 1, "buffer initialization failed\n");
			goto err_put_vb2_buf;
		}
	}

	ret = call_vb_qop(vb, buf_prepare, vb);
	if (ret) {
		dprintk(q, 1, "buffer preparation failed\n");
		call_void_vb_qop(vb, buf_cleanup, vb);
		goto err_put_vb2_buf;
	}

	return 0;

err_put_planes:
	/* Drop the references obtained with dma_buf_get() above. */
	for (plane = 0; plane < vb->num_planes; ++plane) {
		if (!IS_ERR_OR_NULL(planes[plane].dbuf))
			dma_buf_put(planes[plane].dbuf);
	}
err_put_vb2_buf:
	/* In case of errors, release planes that were already acquired */
	__vb2_buf_dmabuf_put(vb);

	return ret;
}

/*
 * __enqueue_in_driver() - enqueue a vb2_buffer in driver for processing
 *
 * Marks the buffer ACTIVE and bumps owned_by_drv_count before the driver's
 * buf_queue op is called.
 */
static void __enqueue_in_driver(struct vb2_buffer *vb)
{
	struct vb2_queue *q = vb->vb2_queue;

	vb->state = VB2_BUF_STATE_ACTIVE;
	atomic_inc(&q->owned_by_drv_count);

	trace_vb2_buf_queue(q, vb);

	call_void_vb_qop(vb, buf_queue, vb);
}

/*
 * __buf_prepare() - run the memory-model specific prepare step for @vb
 *
 * Returns 0 right away if the buffer is already prepared, -EIO if the queue
 * is in the error state, or the result of the prepare step. The buffer is
 * in state PREPARING while the step runs and gets its original state back
 * afterwards, on success as well as on failure.
 */
static int __buf_prepare(struct vb2_buffer *vb)
{
	struct vb2_queue *q = vb->vb2_queue;
	enum vb2_buffer_state orig_state = vb->state;
	int ret;

	if (q->error) {
		dprintk(q, 1, "fatal error occurred on queue\n");
		return -EIO;
	}

	if (vb->prepared)
		return 0;
	WARN_ON(vb->synced);

	if (q->is_output) {
		ret = call_vb_qop(vb, buf_out_validate, vb);
		if (ret) {
			dprintk(q, 1, "buffer validation failed\n");
			return ret;
		}
	}

	vb->state = VB2_BUF_STATE_PREPARING;

	switch (q->memory) {
	case VB2_MEMORY_MMAP:
		ret = __prepare_mmap(vb);
		break;
	case VB2_MEMORY_USERPTR:
		ret = __prepare_userptr(vb);
		break;
	case VB2_MEMORY_DMABUF:
		ret = __prepare_dmabuf(vb);
		break;
	default:
		WARN(1, "Invalid queue type\n");
		ret = -EINVAL;
		break;
	}

	if (ret) {
		dprintk(q, 1, "buffer preparation failed: %d\n", ret);
		vb->state = orig_state;
		return ret;
	}

	__vb2_buf_mem_prepare(vb);
	vb->prepared = 1;
	vb->state = orig_state;

	return 0;
}

/*
 * vb2_req_prepare() - request object 'prepare' op: prepare the buffer under
 * the queue lock; only valid for buffers in state IN_REQUEST
 */
static int vb2_req_prepare(struct media_request_object *obj)
{
	struct vb2_buffer *vb = container_of(obj, struct vb2_buffer, req_obj);
	int ret;

	if (WARN_ON(vb->state != VB2_BUF_STATE_IN_REQUEST))
		return -EINVAL;

	mutex_lock(vb->vb2_queue->lock);
	ret = __buf_prepare(vb);
	mutex_unlock(vb->vb2_queue->lock);
	return ret;
}

static void __vb2_dqbuf(struct vb2_buffer *vb);

/*
 * vb2_req_unprepare() - request object 'unprepare' op: run __vb2_dqbuf() on
 * the buffer and then put it back into state IN_REQUEST, under the queue
 * lock
 */
static void vb2_req_unprepare(struct media_request_object *obj)
{
	struct vb2_buffer *vb = container_of(obj, struct vb2_buffer, req_obj);

	mutex_lock(vb->vb2_queue->lock);
	__vb2_dqbuf(vb);
	vb->state = VB2_BUF_STATE_IN_REQUEST;
	mutex_unlock(vb->vb2_queue->lock);
	WARN_ON(!vb->req_obj.req);
}

/*
 * vb2_req_queue() - request object 'queue' op: queue the buffer through
 * vb2_core_qbuf() under the queue lock
 */
static void vb2_req_queue(struct media_request_object *obj)
{
	struct vb2_buffer *vb = container_of(obj, struct vb2_buffer, req_obj);
	int err;

	mutex_lock(vb->vb2_queue->lock);
	/*
	 * There is no method to propagate an error from vb2_core_qbuf(),
	 * so if this returns a non-0 value, then WARN.
	 *
	 * The only exception is -EIO which is returned if q->error is
	 * set. We just ignore that, and expect this will be caught the
	 * next time vb2_req_prepare() is called.
*/ err = vb2_core_qbuf(vb->vb2_queue, vb, NULL, NULL); WARN_ON_ONCE(err && err != -EIO); mutex_unlock(vb->vb2_queue->lock); } static void vb2_req_unbind(struct media_request_object *obj) { struct vb2_buffer *vb = container_of(obj, struct vb2_buffer, req_obj); if (vb->state == VB2_BUF_STATE_IN_REQUEST) call_void_bufop(vb->vb2_queue, init_buffer, vb); } static void vb2_req_release(struct media_request_object *obj) { struct vb2_buffer *vb = container_of(obj, struct vb2_buffer, req_obj); if (vb->state == VB2_BUF_STATE_IN_REQUEST) { vb->state = VB2_BUF_STATE_DEQUEUED; if (vb->request) media_request_put(vb->request); vb->request = NULL; } } static const struct media_request_object_ops vb2_core_req_ops = { .prepare = vb2_req_prepare, .unprepare = vb2_req_unprepare, .queue = vb2_req_queue, .unbind = vb2_req_unbind, .release = vb2_req_release, }; bool vb2_request_object_is_buffer(struct media_request_object *obj) { return obj->ops == &vb2_core_req_ops; } EXPORT_SYMBOL_GPL(vb2_request_object_is_buffer); unsigned int vb2_request_buffer_cnt(struct media_request *req) { struct media_request_object *obj; unsigned long flags; unsigned int buffer_cnt = 0; spin_lock_irqsave(&req->lock, flags); list_for_each_entry(obj, &req->objects, list) if (vb2_request_object_is_buffer(obj)) buffer_cnt++; spin_unlock_irqrestore(&req->lock, flags); return buffer_cnt; } EXPORT_SYMBOL_GPL(vb2_request_buffer_cnt); int vb2_core_prepare_buf(struct vb2_queue *q, struct vb2_buffer *vb, void *pb) { int ret; if (vb->state != VB2_BUF_STATE_DEQUEUED) { dprintk(q, 1, "invalid buffer state %s\n", vb2_state_name(vb->state)); return -EINVAL; } if (vb->prepared) { dprintk(q, 1, "buffer already prepared\n"); return -EINVAL; } ret = __buf_prepare(vb); if (ret) return ret; /* Fill buffer information for the userspace */ call_void_bufop(q, fill_user_buffer, vb, pb); dprintk(q, 2, "prepare of buffer %d succeeded\n", vb->index); return 0; } EXPORT_SYMBOL_GPL(vb2_core_prepare_buf); int vb2_core_remove_bufs(struct 
vb2_queue *q, unsigned int start, unsigned int count) { unsigned int i, ret = 0; unsigned int q_num_bufs = vb2_get_num_buffers(q); if (count == 0) return 0; if (count > q_num_bufs) return -EINVAL; if (start > q->max_num_buffers - count) return -EINVAL; mutex_lock(&q->mmap_lock); /* Check that all buffers in the range exist */ for (i = start; i < start + count; i++) { struct vb2_buffer *vb = vb2_get_buffer(q, i); if (!vb) { ret = -EINVAL; goto unlock; } if (vb->state != VB2_BUF_STATE_DEQUEUED) { ret = -EBUSY; goto unlock; } } __vb2_queue_free(q, start, count); dprintk(q, 2, "%u buffers removed\n", count); unlock: mutex_unlock(&q->mmap_lock); return ret; } EXPORT_SYMBOL_GPL(vb2_core_remove_bufs); /* * vb2_start_streaming() - Attempt to start streaming. * @q: videobuf2 queue * * Attempt to start streaming. When this function is called there must be * at least q->min_queued_buffers queued up (i.e. the minimum * number of buffers required for the DMA engine to function). If the * @start_streaming op fails it is supposed to return all the driver-owned * buffers back to vb2 in state QUEUED. Check if that happened and if * not warn and reclaim them forcefully. */ static int vb2_start_streaming(struct vb2_queue *q) { struct vb2_buffer *vb; int ret; /* * If any buffers were queued before streamon, * we can now pass them to driver for processing. */ list_for_each_entry(vb, &q->queued_list, queued_entry) __enqueue_in_driver(vb); /* Tell the driver to start streaming */ q->start_streaming_called = 1; ret = call_qop(q, start_streaming, q, atomic_read(&q->owned_by_drv_count)); if (!ret) return 0; q->start_streaming_called = 0; dprintk(q, 1, "driver refused to start streaming\n"); /* * If you see this warning, then the driver isn't cleaning up properly * after a failed start_streaming(). See the start_streaming() * documentation in videobuf2-core.h for more information how buffers * should be returned to vb2 in start_streaming(). 
*/ if (WARN_ON(atomic_read(&q->owned_by_drv_count))) { unsigned i; /* * Forcefully reclaim buffers if the driver did not * correctly return them to vb2. */ for (i = 0; i < q->max_num_buffers; ++i) { vb = vb2_get_buffer(q, i); if (!vb) continue; if (vb->state == VB2_BUF_STATE_ACTIVE) vb2_buffer_done(vb, VB2_BUF_STATE_QUEUED); } /* Must be zero now */ WARN_ON(atomic_read(&q->owned_by_drv_count)); } /* * If done_list is not empty, then start_streaming() didn't call * vb2_buffer_done(vb, VB2_BUF_STATE_QUEUED) but STATE_ERROR or * STATE_DONE. */ WARN_ON(!list_empty(&q->done_list)); return ret; } int vb2_core_qbuf(struct vb2_queue *q, struct vb2_buffer *vb, void *pb, struct media_request *req) { enum vb2_buffer_state orig_state; int ret; if (q->error) { dprintk(q, 1, "fatal error occurred on queue\n"); return -EIO; } if (!req && vb->state != VB2_BUF_STATE_IN_REQUEST && q->requires_requests) { dprintk(q, 1, "qbuf requires a request\n"); return -EBADR; } if ((req && q->uses_qbuf) || (!req && vb->state != VB2_BUF_STATE_IN_REQUEST && q->uses_requests)) { dprintk(q, 1, "queue in wrong mode (qbuf vs requests)\n"); return -EBUSY; } if (req) { int ret; q->uses_requests = 1; if (vb->state != VB2_BUF_STATE_DEQUEUED) { dprintk(q, 1, "buffer %d not in dequeued state\n", vb->index); return -EINVAL; } if (q->is_output && !vb->prepared) { ret = call_vb_qop(vb, buf_out_validate, vb); if (ret) { dprintk(q, 1, "buffer validation failed\n"); return ret; } } media_request_object_init(&vb->req_obj); /* Make sure the request is in a safe state for updating. */ ret = media_request_lock_for_update(req); if (ret) return ret; ret = media_request_object_bind(req, &vb2_core_req_ops, q, true, &vb->req_obj); media_request_unlock_for_update(req); if (ret) return ret; vb->state = VB2_BUF_STATE_IN_REQUEST; /* * Increment the refcount and store the request. * The request refcount is decremented again when the * buffer is dequeued. 
This is to prevent vb2_buffer_done() * from freeing the request from interrupt context, which can * happen if the application closed the request fd after * queueing the request. */ media_request_get(req); vb->request = req; /* Fill buffer information for the userspace */ if (pb) { call_void_bufop(q, copy_timestamp, vb, pb); call_void_bufop(q, fill_user_buffer, vb, pb); } dprintk(q, 2, "qbuf of buffer %d succeeded\n", vb->index); return 0; } if (vb->state != VB2_BUF_STATE_IN_REQUEST) q->uses_qbuf = 1; switch (vb->state) { case VB2_BUF_STATE_DEQUEUED: case VB2_BUF_STATE_IN_REQUEST: if (!vb->prepared) { ret = __buf_prepare(vb); if (ret) return ret; } break; case VB2_BUF_STATE_PREPARING: dprintk(q, 1, "buffer still being prepared\n"); return -EINVAL; default: dprintk(q, 1, "invalid buffer state %s\n", vb2_state_name(vb->state)); return -EINVAL; } /* * Add to the queued buffers list, a buffer will stay on it until * dequeued in dqbuf. */ orig_state = vb->state; list_add_tail(&vb->queued_entry, &q->queued_list); q->queued_count++; q->waiting_for_buffers = false; vb->state = VB2_BUF_STATE_QUEUED; if (pb) call_void_bufop(q, copy_timestamp, vb, pb); trace_vb2_qbuf(q, vb); /* * If already streaming, give the buffer to driver for processing. * If not, the buffer will be given to driver on next streamon. */ if (q->start_streaming_called) __enqueue_in_driver(vb); /* Fill buffer information for the userspace */ if (pb) call_void_bufop(q, fill_user_buffer, vb, pb); /* * If streamon has been called, and we haven't yet called * start_streaming() since not enough buffers were queued, and * we now have reached the minimum number of queued buffers, * then we can finally call start_streaming(). */ if (q->streaming && !q->start_streaming_called && q->queued_count >= q->min_queued_buffers) { ret = vb2_start_streaming(q); if (ret) { /* * Since vb2_core_qbuf will return with an error, * we should return it to state DEQUEUED since * the error indicates that the buffer wasn't queued. 
*/
			list_del(&vb->queued_entry);
			q->queued_count--;

			vb->state = orig_state;
			return ret;
		}
	}

	dprintk(q, 2, "qbuf of buffer %d succeeded\n", vb->index);
	return 0;
}
EXPORT_SYMBOL_GPL(vb2_core_qbuf);

/*
 * __vb2_wait_for_done_vb() - wait for a buffer to become available
 * for dequeuing
 *
 * Will sleep if required for nonblocking == false.
 *
 * Returns 0 once done_list is non-empty. Otherwise returns -EBUSY when
 * another waiter is already in dqbuf, -EINVAL when not streaming, -EIO when
 * the queue error flag is set, -EPIPE when the last buffer was already
 * dequeued, -EAGAIN when nonblocking and nothing is ready, or the non-zero
 * result of wait_event_interruptible().
 */
static int __vb2_wait_for_done_vb(struct vb2_queue *q, int nonblocking)
{
	/*
	 * All operations on vb_done_list are performed under done_lock
	 * spinlock protection. However, buffers may be removed from
	 * it and returned to userspace only while holding both driver's
	 * lock and the done_lock spinlock. Thus we can be sure that as
	 * long as we hold the driver's lock, the list will remain not
	 * empty if list_empty() check succeeds.
	 */

	for (;;) {
		int ret;

		if (q->waiting_in_dqbuf) {
			dprintk(q, 1, "another dup()ped fd is waiting for a buffer\n");
			return -EBUSY;
		}

		if (!q->streaming) {
			dprintk(q, 1, "streaming off, will not wait for buffers\n");
			return -EINVAL;
		}

		if (q->error) {
			dprintk(q, 1, "Queue in error state, will not wait for buffers\n");
			return -EIO;
		}

		if (q->last_buffer_dequeued) {
			dprintk(q, 3, "last buffer dequeued already, will not wait for buffers\n");
			return -EPIPE;
		}

		if (!list_empty(&q->done_list)) {
			/*
			 * Found a buffer that we were waiting for.
			 */
			break;
		}

		if (nonblocking) {
			dprintk(q, 3, "nonblocking and no buffers to dequeue, will not wait\n");
			return -EAGAIN;
		}

		/* Flag checked at the top of the loop to reject a second waiter. */
		q->waiting_in_dqbuf = 1;
		/*
		 * We are streaming and blocking, wait for another buffer to
		 * become ready or for streamoff. Driver's lock is released to
		 * allow streamoff or qbuf to be called while waiting.
		 */
		if (q->ops->wait_prepare)
			call_void_qop(q, wait_prepare, q);
		else if (q->lock)
			mutex_unlock(q->lock);

		/*
		 * All locks have been released, it is safe to sleep now.
		 */
		dprintk(q, 3, "will sleep waiting for buffers\n");
		ret = wait_event_interruptible(q->done_wq,
				!list_empty(&q->done_list) || !q->streaming ||
				q->error);

		if (q->ops->wait_finish)
			call_void_qop(q, wait_finish, q);
		else if (q->lock)
			mutex_lock(q->lock);

		q->waiting_in_dqbuf = 0;
		/*
		 * We need to reevaluate both conditions again after reacquiring
		 * the locks or return an error if one occurred.
		 */
		if (ret) {
			dprintk(q, 1, "sleep was interrupted\n");
			return ret;
		}
	}
	return 0;
}

/*
 * __vb2_get_done_vb() - get a buffer ready for dequeuing
 *
 * Will sleep if required for nonblocking == false.
 *
 * On success (return 0) *vb points to the first entry of done_list and that
 * entry has been removed from the list. If the verify_planes_array buffer op
 * fails, its error is returned and the buffer stays on done_list.
 */
static int __vb2_get_done_vb(struct vb2_queue *q, struct vb2_buffer **vb,
			     void *pb, int nonblocking)
{
	unsigned long flags;
	int ret = 0;

	/*
	 * Wait for at least one buffer to become available on the done_list.
	 */
	ret = __vb2_wait_for_done_vb(q, nonblocking);
	if (ret)
		return ret;

	/*
	 * Driver's lock has been held since we last verified that done_list
	 * is not empty, so no need for another list_empty(done_list) check.
	 */
	spin_lock_irqsave(&q->done_lock, flags);
	*vb = list_first_entry(&q->done_list, struct vb2_buffer, done_entry);
	/*
	 * Only remove the buffer from done_list if all planes can be
	 * handled. Some cases such as V4L2 file I/O and DVB have pb
	 * == NULL; skip the check then as there's nothing to verify.
	 */
	if (pb)
		ret = call_bufop(q, verify_planes_array, *vb, pb);
	if (!ret)
		list_del(&(*vb)->done_entry);
	spin_unlock_irqrestore(&q->done_lock, flags);

	return ret;
}

/*
 * vb2_wait_for_all_buffers() - wait until the driver owns no buffers
 *
 * Returns -EINVAL if the queue is not streaming. Otherwise, if
 * start_streaming has been called, sleeps (uninterruptibly, via wait_event())
 * until owned_by_drv_count reads zero, then returns 0.
 */
int vb2_wait_for_all_buffers(struct vb2_queue *q)
{
	if (!q->streaming) {
		dprintk(q, 1, "streaming off, will not wait for buffers\n");
		return -EINVAL;
	}

	if (q->start_streaming_called)
		wait_event(q->done_wq, !atomic_read(&q->owned_by_drv_count));
	return 0;
}
EXPORT_SYMBOL_GPL(vb2_wait_for_all_buffers);

/*
 * __vb2_dqbuf() - bring back the buffer to the DEQUEUED state
 */
static void __vb2_dqbuf(struct vb2_buffer *vb)
{
	struct vb2_queue *q = vb->vb2_queue;

	/* nothing to do if the buffer is already dequeued */
	if (vb->state == VB2_BUF_STATE_DEQUEUED)
		return;

	vb->state = VB2_BUF_STATE_DEQUEUED;

	call_void_bufop(q, init_buffer, vb);
}

/*
 * vb2_core_dqbuf() - hand the next finished buffer back to the caller
 * @q:		videobuf2 queue
 * @pindex:	if non-NULL, receives the index of the dequeued buffer
 * @pb:		opaque caller buffer descriptor passed to the buffer ops;
 *		may be NULL, in which case fill_user_buffer is skipped
 * @nonblocking: passed through to __vb2_get_done_vb()
 *
 * Returns 0 on success, a negative error from __vb2_get_done_vb(), or
 * -EINVAL if the buffer taken from done_list is in neither the DONE nor
 * the ERROR state.
 */
int vb2_core_dqbuf(struct vb2_queue *q, unsigned int *pindex, void *pb,
		   bool nonblocking)
{
	struct vb2_buffer *vb = NULL;
	int ret;

	ret = __vb2_get_done_vb(q, &vb, pb, nonblocking);
	if (ret < 0)
		return ret;

	switch (vb->state) {
	case VB2_BUF_STATE_DONE:
		dprintk(q, 3, "returning done buffer\n");
		break;
	case VB2_BUF_STATE_ERROR:
		dprintk(q, 3, "returning done buffer with errors\n");
		break;
	default:
		dprintk(q, 1, "invalid buffer state %s\n",
			vb2_state_name(vb->state));
		return -EINVAL;
	}

	/* buf_finish must run before fill_user_buffer below. */
	call_void_vb_qop(vb, buf_finish, vb);
	vb->prepared = 0;

	if (pindex)
		*pindex = vb->index;

	/* Fill buffer information for the userspace */
	if (pb)
		call_void_bufop(q, fill_user_buffer, vb, pb);

	/* Remove from vb2 queue */
	list_del(&vb->queued_entry);
	q->queued_count--;

	trace_vb2_dqbuf(q, vb);

	/* go back to dequeued state */
	__vb2_dqbuf(vb);

	/* A request object still bound at this point is unexpected: warn and drop it. */
	if (WARN_ON(vb->req_obj.req)) {
		media_request_object_unbind(&vb->req_obj);
		media_request_object_put(&vb->req_obj);
	}
	if (vb->request)
		media_request_put(vb->request);
	vb->request = NULL;

	dprintk(q, 2, "dqbuf of buffer %d, state: %s\n",
		vb->index, vb2_state_name(vb->state));

	return 0;

}
EXPORT_SYMBOL_GPL(vb2_core_dqbuf);

/*
 * __vb2_queue_cancel() - cancel and stop (pause)
streaming * * Removes all queued buffers from driver's queue and all buffers queued by * userspace from vb2's queue. Returns to state after reqbufs. */ static void __vb2_queue_cancel(struct vb2_queue *q) { unsigned int i; /* * Tell driver to stop all transactions and release all queued * buffers. */ if (q->start_streaming_called) call_void_qop(q, stop_streaming, q); if (q->streaming) call_void_qop(q, unprepare_streaming, q); /* * If you see this warning, then the driver isn't cleaning up properly * in stop_streaming(). See the stop_streaming() documentation in * videobuf2-core.h for more information how buffers should be returned * to vb2 in stop_streaming(). */ if (WARN_ON(atomic_read(&q->owned_by_drv_count))) { for (i = 0; i < q->max_num_buffers; i++) { struct vb2_buffer *vb = vb2_get_buffer(q, i); if (!vb) continue; if (vb->state == VB2_BUF_STATE_ACTIVE) { pr_warn("driver bug: stop_streaming operation is leaving buffer %u in active state\n", vb->index); vb2_buffer_done(vb, VB2_BUF_STATE_ERROR); } } /* Must be zero now */ WARN_ON(atomic_read(&q->owned_by_drv_count)); } q->streaming = 0; q->start_streaming_called = 0; q->queued_count = 0; q->error = 0; q->uses_requests = 0; q->uses_qbuf = 0; /* * Remove all buffers from vb2's list... */ INIT_LIST_HEAD(&q->queued_list); /* * ...and done list; userspace will not receive any buffers it * has not already dequeued before initiating cancel. */ INIT_LIST_HEAD(&q->done_list); atomic_set(&q->owned_by_drv_count, 0); wake_up_all(&q->done_wq); /* * Reinitialize all buffers for next use. * Make sure to call buf_finish for any queued buffers. Normally * that's done in dqbuf, but that's not going to happen when we * cancel the whole queue. Note: this code belongs here, not in * __vb2_dqbuf() since in vb2_core_dqbuf() there is a critical * call to __fill_user_buffer() after buf_finish(). That order can't * be changed, so we can't move the buf_finish() to __vb2_dqbuf(). 
*/ for (i = 0; i < q->max_num_buffers; i++) { struct vb2_buffer *vb; struct media_request *req; vb = vb2_get_buffer(q, i); if (!vb) continue; req = vb->req_obj.req; /* * If a request is associated with this buffer, then * call buf_request_cancel() to give the driver to complete() * related request objects. Otherwise those objects would * never complete. */ if (req) { enum media_request_state state; unsigned long flags; spin_lock_irqsave(&req->lock, flags); state = req->state; spin_unlock_irqrestore(&req->lock, flags); if (state == MEDIA_REQUEST_STATE_QUEUED) call_void_vb_qop(vb, buf_request_complete, vb); } __vb2_buf_mem_finish(vb); if (vb->prepared) { call_void_vb_qop(vb, buf_finish, vb); vb->prepared = 0; } __vb2_dqbuf(vb); if (vb->req_obj.req) { media_request_object_unbind(&vb->req_obj); media_request_object_put(&vb->req_obj); } if (vb->request) media_request_put(vb->request); vb->request = NULL; vb->copied_timestamp = 0; } } int vb2_core_streamon(struct vb2_queue *q, unsigned int type) { unsigned int q_num_bufs = vb2_get_num_buffers(q); int ret; if (type != q->type) { dprintk(q, 1, "invalid stream type\n"); return -EINVAL; } if (q->streaming) { dprintk(q, 3, "already streaming\n"); return 0; } if (!q_num_bufs) { dprintk(q, 1, "no buffers have been allocated\n"); return -EINVAL; } if (q_num_bufs < q->min_queued_buffers) { dprintk(q, 1, "need at least %u allocated buffers\n", q->min_queued_buffers); return -EINVAL; } ret = call_qop(q, prepare_streaming, q); if (ret) return ret; /* * Tell driver to start streaming provided sufficient buffers * are available. 
*/ if (q->queued_count >= q->min_queued_buffers) { ret = vb2_start_streaming(q); if (ret) goto unprepare; } q->streaming = 1; dprintk(q, 3, "successful\n"); return 0; unprepare: call_void_qop(q, unprepare_streaming, q); return ret; } EXPORT_SYMBOL_GPL(vb2_core_streamon); void vb2_queue_error(struct vb2_queue *q) { q->error = 1; wake_up_all(&q->done_wq); } EXPORT_SYMBOL_GPL(vb2_queue_error); int vb2_core_streamoff(struct vb2_queue *q, unsigned int type) { if (type != q->type) { dprintk(q, 1, "invalid stream type\n"); return -EINVAL; } /* * Cancel will pause streaming and remove all buffers from the driver * and vb2, effectively returning control over them to userspace. * * Note that we do this even if q->streaming == 0: if you prepare or * queue buffers, and then call streamoff without ever having called * streamon, you would still expect those buffers to be returned to * their normal dequeued state. */ __vb2_queue_cancel(q); q->waiting_for_buffers = !q->is_output; q->last_buffer_dequeued = false; dprintk(q, 3, "successful\n"); return 0; } EXPORT_SYMBOL_GPL(vb2_core_streamoff); /* * __find_plane_by_offset() - find plane associated with the given offset */ static int __find_plane_by_offset(struct vb2_queue *q, unsigned long offset, struct vb2_buffer **vb, unsigned int *plane) { unsigned int buffer; /* * Sanity checks to ensure the lock is held, MEMORY_MMAP is * used and fileio isn't active. 
*/ lockdep_assert_held(&q->mmap_lock); if (q->memory != VB2_MEMORY_MMAP) { dprintk(q, 1, "queue is not currently set up for mmap\n"); return -EINVAL; } if (vb2_fileio_is_active(q)) { dprintk(q, 1, "file io in progress\n"); return -EBUSY; } /* Get buffer and plane from the offset */ buffer = (offset >> PLANE_INDEX_SHIFT) & BUFFER_INDEX_MASK; *plane = (offset >> PAGE_SHIFT) & PLANE_INDEX_MASK; *vb = vb2_get_buffer(q, buffer); if (!*vb) return -EINVAL; if (*plane >= (*vb)->num_planes) return -EINVAL; return 0; } int vb2_core_expbuf(struct vb2_queue *q, int *fd, unsigned int type, struct vb2_buffer *vb, unsigned int plane, unsigned int flags) { struct vb2_plane *vb_plane; int ret; struct dma_buf *dbuf; if (q->memory != VB2_MEMORY_MMAP) { dprintk(q, 1, "queue is not currently set up for mmap\n"); return -EINVAL; } if (!q->mem_ops->get_dmabuf) { dprintk(q, 1, "queue does not support DMA buffer exporting\n"); return -EINVAL; } if (flags & ~(O_CLOEXEC | O_ACCMODE)) { dprintk(q, 1, "queue does support only O_CLOEXEC and access mode flags\n"); return -EINVAL; } if (type != q->type) { dprintk(q, 1, "invalid buffer type\n"); return -EINVAL; } if (plane >= vb->num_planes) { dprintk(q, 1, "buffer plane out of range\n"); return -EINVAL; } if (vb2_fileio_is_active(q)) { dprintk(q, 1, "expbuf: file io in progress\n"); return -EBUSY; } vb_plane = &vb->planes[plane]; dbuf = call_ptr_memop(get_dmabuf, vb, vb_plane->mem_priv, flags & O_ACCMODE); if (IS_ERR_OR_NULL(dbuf)) { dprintk(q, 1, "failed to export buffer %d, plane %d\n", vb->index, plane); return -EINVAL; } ret = dma_buf_fd(dbuf, flags & ~O_ACCMODE); if (ret < 0) { dprintk(q, 3, "buffer %d, plane %d failed to export (%d)\n", vb->index, plane, ret); dma_buf_put(dbuf); return ret; } dprintk(q, 3, "buffer %d, plane %d exported as %d descriptor\n", vb->index, plane, ret); *fd = ret; return 0; } EXPORT_SYMBOL_GPL(vb2_core_expbuf); int vb2_mmap(struct vb2_queue *q, struct vm_area_struct *vma) { unsigned long offset = vma->vm_pgoff << 
PAGE_SHIFT; struct vb2_buffer *vb; unsigned int plane = 0; int ret; unsigned long length; /* * Check memory area access mode. */ if (!(vma->vm_flags & VM_SHARED)) { dprintk(q, 1, "invalid vma flags, VM_SHARED needed\n"); return -EINVAL; } if (q->is_output) { if (!(vma->vm_flags & VM_WRITE)) { dprintk(q, 1, "invalid vma flags, VM_WRITE needed\n"); return -EINVAL; } } else { if (!(vma->vm_flags & VM_READ)) { dprintk(q, 1, "invalid vma flags, VM_READ needed\n"); return -EINVAL; } } mutex_lock(&q->mmap_lock); /* * Find the plane corresponding to the offset passed by userspace. This * will return an error if not MEMORY_MMAP or file I/O is in progress. */ ret = __find_plane_by_offset(q, offset, &vb, &plane); if (ret) goto unlock; /* * MMAP requires page_aligned buffers. * The buffer length was page_aligned at __vb2_buf_mem_alloc(), * so, we need to do the same here. */ length = PAGE_ALIGN(vb->planes[plane].length); if (length < (vma->vm_end - vma->vm_start)) { dprintk(q, 1, "MMAP invalid, as it would overflow buffer length\n"); ret = -EINVAL; goto unlock; } /* * vm_pgoff is treated in V4L2 API as a 'cookie' to select a buffer, * not as a in-buffer offset. We always want to mmap a whole buffer * from its beginning. */ vma->vm_pgoff = 0; ret = call_memop(vb, mmap, vb->planes[plane].mem_priv, vma); unlock: mutex_unlock(&q->mmap_lock); if (ret) return ret; dprintk(q, 3, "buffer %u, plane %d successfully mapped\n", vb->index, plane); return 0; } EXPORT_SYMBOL_GPL(vb2_mmap); #ifndef CONFIG_MMU unsigned long vb2_get_unmapped_area(struct vb2_queue *q, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) { unsigned long offset = pgoff << PAGE_SHIFT; struct vb2_buffer *vb; unsigned int plane; void *vaddr; int ret; mutex_lock(&q->mmap_lock); /* * Find the plane corresponding to the offset passed by userspace. This * will return an error if not MEMORY_MMAP or file I/O is in progress. 
*/ ret = __find_plane_by_offset(q, offset, &vb, &plane); if (ret) goto unlock; vaddr = vb2_plane_vaddr(vb, plane); mutex_unlock(&q->mmap_lock); return vaddr ? (unsigned long)vaddr : -EINVAL; unlock: mutex_unlock(&q->mmap_lock); return ret; } EXPORT_SYMBOL_GPL(vb2_get_unmapped_area); #endif int vb2_core_queue_init(struct vb2_queue *q) { /* * Sanity check */ /* * For drivers who don't support max_num_buffers ensure * a backward compatibility. */ if (!q->max_num_buffers) q->max_num_buffers = VB2_MAX_FRAME; /* The maximum is limited by offset cookie encoding pattern */ q->max_num_buffers = min_t(unsigned int, q->max_num_buffers, MAX_BUFFER_INDEX); if (WARN_ON(!q) || WARN_ON(!q->ops) || WARN_ON(!q->mem_ops) || WARN_ON(!q->type) || WARN_ON(!q->io_modes) || WARN_ON(!q->ops->queue_setup) || WARN_ON(!q->ops->buf_queue)) return -EINVAL; if (WARN_ON(q->max_num_buffers < VB2_MAX_FRAME) || WARN_ON(q->min_queued_buffers > q->max_num_buffers)) return -EINVAL; if (WARN_ON(q->requires_requests && !q->supports_requests)) return -EINVAL; /* * This combination is not allowed since a non-zero value of * q->min_queued_buffers can cause vb2_core_qbuf() to fail if * it has to call start_streaming(), and the Request API expects * that queueing a request (and thus queueing a buffer contained * in that request) will always succeed. There is no method of * propagating an error back to userspace. */ if (WARN_ON(q->supports_requests && q->min_queued_buffers)) return -EINVAL; /* * If the driver needs 'min_queued_buffers' in the queue before * calling start_streaming() then the minimum requirement is * 'min_queued_buffers + 1' to keep at least one buffer available * for userspace. 
*/ if (q->min_reqbufs_allocation < q->min_queued_buffers + 1) q->min_reqbufs_allocation = q->min_queued_buffers + 1; if (WARN_ON(q->min_reqbufs_allocation > q->max_num_buffers)) return -EINVAL; /* Either both or none are set */ if (WARN_ON(!q->ops->wait_prepare ^ !q->ops->wait_finish)) return -EINVAL; /* Warn if q->lock is NULL and no custom wait_prepare is provided */ if (WARN_ON(!q->lock && !q->ops->wait_prepare)) return -EINVAL; INIT_LIST_HEAD(&q->queued_list); INIT_LIST_HEAD(&q->done_list); spin_lock_init(&q->done_lock); mutex_init(&q->mmap_lock); init_waitqueue_head(&q->done_wq); q->memory = VB2_MEMORY_UNKNOWN; if (q->buf_struct_size == 0) q->buf_struct_size = sizeof(struct vb2_buffer); if (q->bidirectional) q->dma_dir = DMA_BIDIRECTIONAL; else q->dma_dir = q->is_output ? DMA_TO_DEVICE : DMA_FROM_DEVICE; if (q->name[0] == '\0') snprintf(q->name, sizeof(q->name), "%s-%p", q->is_output ? "out" : "cap", q); return 0; } EXPORT_SYMBOL_GPL(vb2_core_queue_init); static int __vb2_init_fileio(struct vb2_queue *q, int read); static int __vb2_cleanup_fileio(struct vb2_queue *q); void vb2_core_queue_release(struct vb2_queue *q) { __vb2_cleanup_fileio(q); __vb2_queue_cancel(q); mutex_lock(&q->mmap_lock); __vb2_queue_free(q, 0, q->max_num_buffers); vb2_core_free_buffers_storage(q); q->is_busy = 0; mutex_unlock(&q->mmap_lock); } EXPORT_SYMBOL_GPL(vb2_core_queue_release); __poll_t vb2_core_poll(struct vb2_queue *q, struct file *file, poll_table *wait) { __poll_t req_events = poll_requested_events(wait); struct vb2_buffer *vb = NULL; unsigned long flags; /* * poll_wait() MUST be called on the first invocation on all the * potential queues of interest, even if we are not interested in their * events during this first call. Failure to do so will result in * queue's events to be ignored because the poll_table won't be capable * of adding new wait queues thereafter. 
*/ poll_wait(file, &q->done_wq, wait); if (!q->is_output && !(req_events & (EPOLLIN | EPOLLRDNORM))) return 0; if (q->is_output && !(req_events & (EPOLLOUT | EPOLLWRNORM))) return 0; /* * Start file I/O emulator only if streaming API has not been used yet. */ if (vb2_get_num_buffers(q) == 0 && !vb2_fileio_is_active(q)) { if (!q->is_output && (q->io_modes & VB2_READ) && (req_events & (EPOLLIN | EPOLLRDNORM))) { if (__vb2_init_fileio(q, 1)) return EPOLLERR; } if (q->is_output && (q->io_modes & VB2_WRITE) && (req_events & (EPOLLOUT | EPOLLWRNORM))) { if (__vb2_init_fileio(q, 0)) return EPOLLERR; /* * Write to OUTPUT queue can be done immediately. */ return EPOLLOUT | EPOLLWRNORM; } } /* * There is nothing to wait for if the queue isn't streaming, or if the * error flag is set. */ if (!vb2_is_streaming(q) || q->error) return EPOLLERR; /* * If this quirk is set and QBUF hasn't been called yet then * return EPOLLERR as well. This only affects capture queues, output * queues will always initialize waiting_for_buffers to false. * This quirk is set by V4L2 for backwards compatibility reasons. */ if (q->quirk_poll_must_check_waiting_for_buffers && q->waiting_for_buffers && (req_events & (EPOLLIN | EPOLLRDNORM))) return EPOLLERR; /* * For output streams you can call write() as long as there are fewer * buffers queued than there are buffers available. */ if (q->is_output && q->fileio && q->queued_count < vb2_get_num_buffers(q)) return EPOLLOUT | EPOLLWRNORM; if (list_empty(&q->done_list)) { /* * If the last buffer was dequeued from a capture queue, * return immediately. DQBUF will return -EPIPE. */ if (q->last_buffer_dequeued) return EPOLLIN | EPOLLRDNORM; } /* * Take first buffer available for dequeuing. 
*/ spin_lock_irqsave(&q->done_lock, flags); if (!list_empty(&q->done_list)) vb = list_first_entry(&q->done_list, struct vb2_buffer, done_entry); spin_unlock_irqrestore(&q->done_lock, flags); if (vb && (vb->state == VB2_BUF_STATE_DONE || vb->state == VB2_BUF_STATE_ERROR)) { return (q->is_output) ? EPOLLOUT | EPOLLWRNORM : EPOLLIN | EPOLLRDNORM; } return 0; } EXPORT_SYMBOL_GPL(vb2_core_poll); /* * struct vb2_fileio_buf - buffer context used by file io emulator * * vb2 provides a compatibility layer and emulator of file io (read and * write) calls on top of streaming API. This structure is used for * tracking context related to the buffers. */ struct vb2_fileio_buf { void *vaddr; unsigned int size; unsigned int pos; unsigned int queued:1; }; /* * struct vb2_fileio_data - queue context used by file io emulator * * @cur_index: the index of the buffer currently being read from or * written to. If equal to number of buffers in the vb2_queue * then a new buffer must be dequeued. * @initial_index: in the read() case all buffers are queued up immediately * in __vb2_init_fileio() and __vb2_perform_fileio() just cycles * buffers. However, in the write() case no buffers are initially * queued, instead whenever a buffer is full it is queued up by * __vb2_perform_fileio(). Only once all available buffers have * been queued up will __vb2_perform_fileio() start to dequeue * buffers. This means that initially __vb2_perform_fileio() * needs to know what buffer index to use when it is queuing up * the buffers for the first time. That initial index is stored * in this field. Once it is equal to number of buffers in the * vb2_queue all available buffers have been queued and * __vb2_perform_fileio() should start the normal dequeue/queue cycle. * * vb2 provides a compatibility layer and emulator of file io (read and * write) calls on top of streaming API. For proper operation it required * this structure to save the driver state between each call of the read * or write function. 
*/
struct vb2_fileio_data {
	unsigned int count;
	unsigned int type;
	unsigned int memory;
	struct vb2_fileio_buf bufs[VB2_MAX_FRAME];
	unsigned int cur_index;
	unsigned int initial_index;
	unsigned int q_count;
	unsigned int dq_count;
	unsigned read_once:1;
	unsigned write_immediately:1;
};

/*
 * __vb2_init_fileio() - initialize file io emulator
 * @q:		videobuf2 queue
 * @read:	mode selector (1 means read, 0 means write)
 *
 * Returns 0 on success. Returns -EINVAL if the requested mode is not in
 * q->io_modes or a buffer has no kernel address, -EBUSY if the memops lack
 * vaddr, the streaming API is already in use or buffers are multiplanar,
 * -ENOMEM on allocation failure, -ENOSPC if more than VB2_MAX_FRAME buffers
 * were allocated, or the error from reqbufs/qbuf/streamon.
 */
static int __vb2_init_fileio(struct vb2_queue *q, int read)
{
	struct vb2_fileio_data *fileio;
	struct vb2_buffer *vb;
	int i, ret;

	/*
	 * Sanity check
	 */
	if (WARN_ON((read && !(q->io_modes & VB2_READ)) ||
		    (!read && !(q->io_modes & VB2_WRITE))))
		return -EINVAL;

	/*
	 * Check if device supports mapping buffers to kernel virtual space.
	 */
	if (!q->mem_ops->vaddr)
		return -EBUSY;

	/*
	 * Check if streaming api has not been already activated.
	 */
	if (q->streaming || vb2_get_num_buffers(q) > 0)
		return -EBUSY;

	dprintk(q, 3, "setting up file io: mode %s, count %d, read_once %d, write_immediately %d\n",
		(read) ? "read" : "write", q->min_reqbufs_allocation, q->fileio_read_once,
		q->fileio_write_immediately);

	fileio = kzalloc(sizeof(*fileio), GFP_KERNEL);
	if (fileio == NULL)
		return -ENOMEM;

	fileio->read_once = q->fileio_read_once;
	fileio->write_immediately = q->fileio_write_immediately;

	/*
	 * Request buffers and use MMAP type to force driver
	 * to allocate buffers by itself.
	 */
	fileio->count = q->min_reqbufs_allocation;
	fileio->memory = VB2_MEMORY_MMAP;
	fileio->type = q->type;
	q->fileio = fileio;
	ret = vb2_core_reqbufs(q, fileio->memory, 0, &fileio->count);
	if (ret)
		goto err_kfree;

	/* vb2_fileio_data supports max VB2_MAX_FRAME buffers */
	if (fileio->count > VB2_MAX_FRAME) {
		dprintk(q, 1, "fileio: more than VB2_MAX_FRAME buffers requested\n");
		ret = -ENOSPC;
		goto err_reqbufs;
	}

	/*
	 * Userspace can never add or delete buffers later, so there
	 * will never be holes. It is safe to assume that vb2_get_buffer(q, 0)
	 * will always return a valid vb pointer
	 */
	vb = vb2_get_buffer(q, 0);

	/*
	 * Check if plane_count is correct
	 * (multiplane buffers are not supported).
	 */
	if (vb->num_planes != 1) {
		ret = -EBUSY;
		goto err_reqbufs;
	}

	/*
	 * Get kernel address of each buffer.
	 */
	for (i = 0; i < vb2_get_num_buffers(q); i++) {
		/* vb can never be NULL when using fileio. */
		vb = vb2_get_buffer(q, i);

		fileio->bufs[i].vaddr = vb2_plane_vaddr(vb, 0);
		if (fileio->bufs[i].vaddr == NULL) {
			ret = -EINVAL;
			goto err_reqbufs;
		}
		fileio->bufs[i].size = vb2_plane_size(vb, 0);
	}

	/*
	 * Read mode requires pre queuing of all buffers.
	 */
	if (read) {
		/*
		 * Queue all buffers.
		 */
		for (i = 0; i < vb2_get_num_buffers(q); i++) {
			struct vb2_buffer *vb2 = vb2_get_buffer(q, i);

			if (!vb2)
				continue;

			ret = vb2_core_qbuf(q, vb2, NULL, NULL);
			if (ret)
				goto err_reqbufs;
			fileio->bufs[i].queued = 1;
		}
		/*
		 * All buffers have been queued, so mark that by setting
		 * initial_index to the number of buffers in the vb2_queue
		 */
		fileio->initial_index = vb2_get_num_buffers(q);
		fileio->cur_index = fileio->initial_index;
	}

	/*
	 * Start streaming.
	 */
	ret = vb2_core_streamon(q, q->type);
	if (ret)
		goto err_reqbufs;

	return ret;

err_reqbufs:
	/* Requesting zero buffers releases what was allocated above. */
	fileio->count = 0;
	vb2_core_reqbufs(q, fileio->memory, 0, &fileio->count);

err_kfree:
	q->fileio = NULL;
	kfree(fileio);
	return ret;
}

/*
 * __vb2_cleanup_fileio() - free resourced used by file io emulator
 * @q:		videobuf2 queue
 *
 * Does nothing when the emulator is not active. Always returns 0.
 */
static int __vb2_cleanup_fileio(struct vb2_queue *q)
{
	struct vb2_fileio_data *fileio = q->fileio;

	if (fileio) {
		vb2_core_streamoff(q, q->type);
		q->fileio = NULL;
		fileio->count = 0;
		vb2_core_reqbufs(q, fileio->memory, 0, &fileio->count);
		kfree(fileio);
		dprintk(q, 3, "file io emulator closed\n");
	}
	return 0;
}

/*
 * __vb2_perform_fileio() - perform a single file io (read or write) operation
 * @q:		videobuf2 queue
 * @data:	pointed to target userspace buffer
 * @count:	number of bytes to read or write
 * @ppos:	file handle position tracking pointer
 * @nonblock:	mode selector (1 means nonblocking calls, 0 means blocking)
 * @read:	access mode selector (1 means read, 0 means write)
 *
 * Returns the number of bytes copied, or a negative error code converted to
 * the size_t return type.
 */
static size_t __vb2_perform_fileio(struct vb2_queue *q, char __user *data, size_t count,
		loff_t *ppos, int nonblock, int read)
{
	struct vb2_fileio_data *fileio;
	struct vb2_fileio_buf *buf;
	bool is_multiplanar = q->is_multiplanar;
	/*
	 * When using write() to write data to an output video node the vb2 core
	 * should copy timestamps if V4L2_BUF_FLAG_TIMESTAMP_COPY is set. Nobody
	 * else is able to provide this information with the write() operation.
	 */
	bool copy_timestamp = !read && q->copy_timestamp;
	unsigned index;
	int ret;

	dprintk(q, 3, "mode %s, offset %ld, count %zd, %sblocking\n",
		read ? "read" : "write", (long)*ppos, count,
		nonblock ? "non" : "");

	if (!data)
		return -EINVAL;

	if (q->waiting_in_dqbuf) {
		dprintk(q, 3, "another dup()ped fd is %s\n",
			read ? "reading" : "writing");
		return -EBUSY;
	}

	/*
	 * Initialize emulator on first call.
	 */
	if (!vb2_fileio_is_active(q)) {
		ret = __vb2_init_fileio(q, read);
		dprintk(q, 3, "vb2_init_fileio result: %d\n", ret);
		if (ret)
			return ret;
	}
	fileio = q->fileio;

	/*
	 * Check if we need to dequeue the buffer.
	 */
	index = fileio->cur_index;
	if (index >= vb2_get_num_buffers(q)) {
		struct vb2_buffer *b;

		/*
		 * Call vb2_dqbuf to get buffer back.
		 */
		ret = vb2_core_dqbuf(q, &index, NULL, nonblock);
		dprintk(q, 5, "vb2_dqbuf result: %d\n", ret);
		if (ret)
			return ret;
		fileio->dq_count += 1;

		fileio->cur_index = index;
		buf = &fileio->bufs[index];

		/* b can never be NULL when using fileio. */
		b = vb2_get_buffer(q, index);

		/*
		 * Get number of bytes filled by the driver
		 */
		buf->pos = 0;
		buf->queued = 0;
		buf->size = read ? vb2_get_plane_payload(b, 0)
				 : vb2_plane_size(b, 0);
		/* Compensate for data_offset on read in the multiplanar case. */
		if (is_multiplanar && read &&
				b->planes[0].data_offset < buf->size) {
			buf->pos = b->planes[0].data_offset;
			buf->size -= buf->pos;
		}
	} else {
		buf = &fileio->bufs[index];
	}

	/*
	 * Limit count on last few bytes of the buffer.
	 */
	if (buf->pos + count > buf->size) {
		count = buf->size - buf->pos;
		dprintk(q, 5, "reducing read count: %zd\n", count);
	}

	/*
	 * Transfer data to userspace.
	 */
	dprintk(q, 3, "copying %zd bytes - buffer %d, offset %u\n",
		count, index, buf->pos);
	if (read)
		ret = copy_to_user(data, buf->vaddr + buf->pos, count);
	else
		ret = copy_from_user(buf->vaddr + buf->pos, data, count);
	if (ret) {
		dprintk(q, 3, "error copying data\n");
		return -EFAULT;
	}

	/*
	 * Update counters.
	 */
	buf->pos += count;
	*ppos += count;

	/*
	 * Queue next buffer if required.
	 */
	if (buf->pos == buf->size || (!read && fileio->write_immediately)) {
		/* b can never be NULL when using fileio. */
		struct vb2_buffer *b = vb2_get_buffer(q, index);

		/*
		 * Check if this is the last buffer to read.
		 */
		if (read && fileio->read_once && fileio->dq_count == 1) {
			dprintk(q, 3, "read limit reached\n");
			return __vb2_cleanup_fileio(q);
		}

		/*
		 * Call vb2_qbuf and give buffer to the driver.
		 */
		b->planes[0].bytesused = buf->pos;

		if (copy_timestamp)
			b->timestamp = ktime_get_ns();
		ret = vb2_core_qbuf(q, b, NULL, NULL);
		dprintk(q, 5, "vb2_qbuf result: %d\n", ret);
		if (ret)
			return ret;

		/*
		 * Buffer has been queued, update the status
		 */
		buf->pos = 0;
		buf->queued = 1;
		buf->size = vb2_plane_size(b, 0);
		fileio->q_count += 1;
		/*
		 * If we are queuing up buffers for the first time, then
		 * increase initial_index by one.
		 */
		if (fileio->initial_index < vb2_get_num_buffers(q))
			fileio->initial_index++;
		/*
		 * The next buffer to use is either a buffer that's going to be
		 * queued for the first time (initial_index < number of buffers in the vb2_queue)
		 * or it is equal to the number of buffers in the vb2_queue,
		 * meaning that the next time we need to dequeue a buffer since
		 * we've now queued up all the 'first time' buffers.
		 */
		fileio->cur_index = fileio->initial_index;
	}

	/*
	 * Return proper number of bytes processed.
	 */
	if (ret == 0)
		ret = count;
	return ret;
}

/* vb2_read() - read() emulation; forwards to __vb2_perform_fileio() in read mode. */
size_t vb2_read(struct vb2_queue *q, char __user *data, size_t count,
		loff_t *ppos, int nonblocking)
{
	return __vb2_perform_fileio(q, data, count, ppos, nonblocking, 1);
}
EXPORT_SYMBOL_GPL(vb2_read);

/* vb2_write() - write() emulation; forwards to __vb2_perform_fileio() in write mode. */
size_t vb2_write(struct vb2_queue *q, const char __user *data, size_t count,
		loff_t *ppos, int nonblocking)
{
	return __vb2_perform_fileio(q, (char __user *) data, count,
							ppos, nonblocking, 0);
}
EXPORT_SYMBOL_GPL(vb2_write);

/*
 * struct vb2_threadio_data - context of the vb2 kernel thread
 *
 * @thread:	task running vb2_thread()
 * @fnc:	callback invoked by vb2_thread() for each buffer not in the
 *		ERROR state
 * @priv:	opaque pointer passed to @fnc
 * @stop:	set by vb2_thread_stop() to make the thread loop exit
 */
struct vb2_threadio_data {
	struct task_struct *thread;
	vb2_thread_fnc fnc;
	void *priv;
	bool stop;
};

/*
 * vb2_thread() - kthread body that cycles buffers through threadio->fnc
 *
 * For output queues every allocated buffer is first handed to the callback
 * without dequeuing ("prequeue"); after that the loop dequeues a buffer,
 * calls the callback and queues the buffer again until an error occurs, the
 * callback returns non-zero, or threadio->stop is set. Always returns 0.
 */
static int vb2_thread(void *data)
{
	struct vb2_queue *q = data;
	struct vb2_threadio_data *threadio = q->threadio;
	bool copy_timestamp = false;
	unsigned prequeue = 0;
	unsigned index = 0;
	int ret = 0;

	if (q->is_output) {
		prequeue = vb2_get_num_buffers(q);
		copy_timestamp = q->copy_timestamp;
	}

	set_freezable();

	for (;;) {
		struct vb2_buffer *vb;

		/*
		 * Call vb2_dqbuf to get buffer back.
		 */
		if (prequeue) {
			vb = vb2_get_buffer(q, index++);
			if (!vb)
				continue;
			prequeue--;
		} else {
			if (!threadio->stop) {
				if (q->ops->wait_finish)
					call_void_qop(q, wait_finish, q);
				else if (q->lock)
					mutex_lock(q->lock);
				ret = vb2_core_dqbuf(q, &index, NULL, 0);
				if (q->ops->wait_prepare)
					call_void_qop(q, wait_prepare, q);
				else if (q->lock)
					mutex_unlock(q->lock);
			}
			dprintk(q, 5, "file io: vb2_dqbuf result: %d\n", ret);
			if (!ret)
				vb = vb2_get_buffer(q, index);
		}
		if (ret || threadio->stop)
			break;
		try_to_freeze();

		if (vb->state != VB2_BUF_STATE_ERROR)
			if (threadio->fnc(vb, threadio->priv))
				break;
		if (copy_timestamp)
			vb->timestamp = ktime_get_ns();
		if (!threadio->stop) {
			if (q->ops->wait_finish)
				call_void_qop(q, wait_finish, q);
			else if (q->lock)
				mutex_lock(q->lock);
			ret = vb2_core_qbuf(q, vb, NULL, NULL);
			if (q->ops->wait_prepare)
				call_void_qop(q, wait_prepare, q);
			else if (q->lock)
				mutex_unlock(q->lock);
		}
		if (ret || threadio->stop)
			break;
	}

	/* Hmm, linux becomes *very* unhappy without this ... */
	while (!kthread_should_stop()) {
		set_current_state(TASK_INTERRUPTIBLE);
		schedule();
	}
	return 0;
}

/*
 * This function should not be used for anything else but the videobuf2-dvb
 * support. If you think you have another good use-case for this, then please
 * contact the linux-media mailinglist first.
*/ int vb2_thread_start(struct vb2_queue *q, vb2_thread_fnc fnc, void *priv, const char *thread_name) { struct vb2_threadio_data *threadio; int ret = 0; if (q->threadio) return -EBUSY; if (vb2_is_busy(q)) return -EBUSY; if (WARN_ON(q->fileio)) return -EBUSY; threadio = kzalloc(sizeof(*threadio), GFP_KERNEL); if (threadio == NULL) return -ENOMEM; threadio->fnc = fnc; threadio->priv = priv; ret = __vb2_init_fileio(q, !q->is_output); dprintk(q, 3, "file io: vb2_init_fileio result: %d\n", ret); if (ret) goto nomem; q->threadio = threadio; threadio->thread = kthread_run(vb2_thread, q, "vb2-%s", thread_name); if (IS_ERR(threadio->thread)) { ret = PTR_ERR(threadio->thread); threadio->thread = NULL; goto nothread; } return 0; nothread: __vb2_cleanup_fileio(q); nomem: kfree(threadio); return ret; } EXPORT_SYMBOL_GPL(vb2_thread_start); int vb2_thread_stop(struct vb2_queue *q) { struct vb2_threadio_data *threadio = q->threadio; int err; if (threadio == NULL) return 0; threadio->stop = true; /* Wake up all pending sleeps in the thread */ vb2_queue_error(q); err = kthread_stop(threadio->thread); __vb2_cleanup_fileio(q); threadio->thread = NULL; kfree(threadio); q->threadio = NULL; return err; } EXPORT_SYMBOL_GPL(vb2_thread_stop); MODULE_DESCRIPTION("Media buffer core framework"); MODULE_AUTHOR("Pawel Osciak <pawel@osciak.com>, Marek Szyprowski"); MODULE_LICENSE("GPL"); MODULE_IMPORT_NS("DMA_BUF");
11 2 10 2 11 11 10 10 3 10 11 11 11 10 2 3 3 3 3 3 3 3 3 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/hpfs/buffer.c * * Mikulas Patocka (mikulas@artax.karlin.mff.cuni.cz), 1998-1999 * * general buffer i/o */ #include <linux/sched.h> #include <linux/slab.h> #include <linux/blkdev.h> #include "hpfs_fn.h" secno hpfs_search_hotfix_map(struct super_block *s, secno sec) { unsigned i; struct hpfs_sb_info *sbi = hpfs_sb(s); for (i = 0; unlikely(i < sbi->n_hotfixes); i++) { if (sbi->hotfix_from[i] == sec) { return sbi->hotfix_to[i]; } } return sec; } unsigned hpfs_search_hotfix_map_for_range(struct super_block *s, secno sec, unsigned n) { unsigned i; struct hpfs_sb_info *sbi = hpfs_sb(s); for (i = 0; unlikely(i < sbi->n_hotfixes); i++) { if (sbi->hotfix_from[i] >= sec && sbi->hotfix_from[i] < sec + n) { n = sbi->hotfix_from[i] - sec; } } return n; } void hpfs_prefetch_sectors(struct super_block *s, unsigned secno, int n) { struct buffer_head *bh; struct blk_plug plug; if (n <= 0 || unlikely(secno >= hpfs_sb(s)->sb_fs_size)) return; if (unlikely(hpfs_search_hotfix_map_for_range(s, secno, n) != n)) return; bh = sb_find_get_block(s, secno); if (bh) { if 
(buffer_uptodate(bh)) { brelse(bh); return; } brelse(bh); } blk_start_plug(&plug); while (n > 0) { if (unlikely(secno >= hpfs_sb(s)->sb_fs_size)) break; sb_breadahead(s, secno); secno++; n--; } blk_finish_plug(&plug); } /* Map a sector into a buffer and return pointers to it and to the buffer. */ void *hpfs_map_sector(struct super_block *s, unsigned secno, struct buffer_head **bhp, int ahead) { struct buffer_head *bh; hpfs_lock_assert(s); hpfs_prefetch_sectors(s, secno, ahead); cond_resched(); *bhp = bh = sb_bread(s, hpfs_search_hotfix_map(s, secno)); if (bh != NULL) return bh->b_data; else { pr_err("%s(): read error\n", __func__); return NULL; } } /* Like hpfs_map_sector but don't read anything */ void *hpfs_get_sector(struct super_block *s, unsigned secno, struct buffer_head **bhp) { struct buffer_head *bh; /*return hpfs_map_sector(s, secno, bhp, 0);*/ hpfs_lock_assert(s); cond_resched(); if ((*bhp = bh = sb_getblk(s, hpfs_search_hotfix_map(s, secno))) != NULL) { if (!buffer_uptodate(bh)) wait_on_buffer(bh); set_buffer_uptodate(bh); return bh->b_data; } else { pr_err("%s(): getblk failed\n", __func__); return NULL; } } /* Map 4 sectors into a 4buffer and return pointers to it and to the buffer. 
*/ void *hpfs_map_4sectors(struct super_block *s, unsigned secno, struct quad_buffer_head *qbh, int ahead) { char *data; hpfs_lock_assert(s); cond_resched(); if (secno & 3) { pr_err("%s(): unaligned read\n", __func__); return NULL; } hpfs_prefetch_sectors(s, secno, 4 + ahead); if (!hpfs_map_sector(s, secno + 0, &qbh->bh[0], 0)) goto bail0; if (!hpfs_map_sector(s, secno + 1, &qbh->bh[1], 0)) goto bail1; if (!hpfs_map_sector(s, secno + 2, &qbh->bh[2], 0)) goto bail2; if (!hpfs_map_sector(s, secno + 3, &qbh->bh[3], 0)) goto bail3; if (likely(qbh->bh[1]->b_data == qbh->bh[0]->b_data + 1 * 512) && likely(qbh->bh[2]->b_data == qbh->bh[0]->b_data + 2 * 512) && likely(qbh->bh[3]->b_data == qbh->bh[0]->b_data + 3 * 512)) { return qbh->data = qbh->bh[0]->b_data; } qbh->data = data = kmalloc(2048, GFP_NOFS); if (!data) { pr_err("%s(): out of memory\n", __func__); goto bail4; } memcpy(data + 0 * 512, qbh->bh[0]->b_data, 512); memcpy(data + 1 * 512, qbh->bh[1]->b_data, 512); memcpy(data + 2 * 512, qbh->bh[2]->b_data, 512); memcpy(data + 3 * 512, qbh->bh[3]->b_data, 512); return data; bail4: brelse(qbh->bh[3]); bail3: brelse(qbh->bh[2]); bail2: brelse(qbh->bh[1]); bail1: brelse(qbh->bh[0]); bail0: return NULL; } /* Don't read sectors */ void *hpfs_get_4sectors(struct super_block *s, unsigned secno, struct quad_buffer_head *qbh) { cond_resched(); hpfs_lock_assert(s); if (secno & 3) { pr_err("%s(): unaligned read\n", __func__); return NULL; } if (!hpfs_get_sector(s, secno + 0, &qbh->bh[0])) goto bail0; if (!hpfs_get_sector(s, secno + 1, &qbh->bh[1])) goto bail1; if (!hpfs_get_sector(s, secno + 2, &qbh->bh[2])) goto bail2; if (!hpfs_get_sector(s, secno + 3, &qbh->bh[3])) goto bail3; if (likely(qbh->bh[1]->b_data == qbh->bh[0]->b_data + 1 * 512) && likely(qbh->bh[2]->b_data == qbh->bh[0]->b_data + 2 * 512) && likely(qbh->bh[3]->b_data == qbh->bh[0]->b_data + 3 * 512)) { return qbh->data = qbh->bh[0]->b_data; } if (!(qbh->data = kmalloc(2048, GFP_NOFS))) { pr_err("%s(): out of 
memory\n", __func__); goto bail4; } return qbh->data; bail4: brelse(qbh->bh[3]); bail3: brelse(qbh->bh[2]); bail2: brelse(qbh->bh[1]); bail1: brelse(qbh->bh[0]); bail0: return NULL; } void hpfs_brelse4(struct quad_buffer_head *qbh) { if (unlikely(qbh->data != qbh->bh[0]->b_data)) kfree(qbh->data); brelse(qbh->bh[0]); brelse(qbh->bh[1]); brelse(qbh->bh[2]); brelse(qbh->bh[3]); } void hpfs_mark_4buffers_dirty(struct quad_buffer_head *qbh) { if (unlikely(qbh->data != qbh->bh[0]->b_data)) { memcpy(qbh->bh[0]->b_data, qbh->data + 0 * 512, 512); memcpy(qbh->bh[1]->b_data, qbh->data + 1 * 512, 512); memcpy(qbh->bh[2]->b_data, qbh->data + 2 * 512, 512); memcpy(qbh->bh[3]->b_data, qbh->data + 3 * 512, 512); } mark_buffer_dirty(qbh->bh[0]); mark_buffer_dirty(qbh->bh[1]); mark_buffer_dirty(qbh->bh[2]); mark_buffer_dirty(qbh->bh[3]); }
4 4 4 4 2 4 5 3 4 1 4 4 1 4 4 1 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 
519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 /* * hw_random/core.c: HWRNG core API * * Copyright 2006 Michael Buesch <m@bues.ch> * Copyright 2005 (c) MontaVista Software, Inc. * * Please read Documentation/admin-guide/hw_random.rst for details on use. * * This software may be used and distributed according to the terms * of the GNU General Public License, incorporated herein by reference. */ #include <linux/delay.h> #include <linux/device.h> #include <linux/err.h> #include <linux/fs.h> #include <linux/hw_random.h> #include <linux/kernel.h> #include <linux/kthread.h> #include <linux/miscdevice.h> #include <linux/module.h> #include <linux/random.h> #include <linux/sched.h> #include <linux/sched/signal.h> #include <linux/slab.h> #include <linux/string.h> #include <linux/uaccess.h> #define RNG_MODULE_NAME "hw_random" #define RNG_BUFFER_SIZE (SMP_CACHE_BYTES < 32 ? 
32 : SMP_CACHE_BYTES)

/* the rng currently used for reads; NULL when none is selected */
static struct hwrng *current_rng;
/* the current rng has been explicitly chosen by user via sysfs */
static int cur_rng_set_by_user;
/* the "hwrng" kthread started by set_current_rng(), or NULL */
static struct task_struct *hwrng_fill;
/* list of registered rngs */
static LIST_HEAD(rng_list);
/* Protects rng_list and current_rng */
static DEFINE_MUTEX(rng_mutex);
/* Protects rng read functions, data_avail, rng_buffer and rng_fillbuf */
static DEFINE_MUTEX(reading_mutex);
static int data_avail;
static u8 *rng_buffer, *rng_fillbuf;
static unsigned short current_quality;
static unsigned short default_quality = 1024; /* default to maximum */

module_param(current_quality, ushort, 0644);
MODULE_PARM_DESC(current_quality,
		 "current hwrng entropy estimation per 1024 bits of input -- obsolete, use rng_quality instead");
module_param(default_quality, ushort, 0644);
MODULE_PARM_DESC(default_quality,
		 "default maximum entropy content of hwrng per 1024 bits of input");

static void drop_current_rng(void);
static int hwrng_init(struct hwrng *rng);
static int hwrng_fillfn(void *unused);

static inline int rng_get_data(struct hwrng *rng, u8 *buffer, size_t size,
			       int wait);

/* Size in bytes of the buffers handed to rng_get_data(). */
static size_t rng_buffer_size(void)
{
	return RNG_BUFFER_SIZE;
}

/*
 * kref release callback: run the driver's optional cleanup hook and signal
 * cleanup_done.
 */
static inline void cleanup_rng(struct kref *kref)
{
	struct hwrng *rng = container_of(kref, struct hwrng, ref);

	if (rng->cleanup)
		rng->cleanup(rng);

	complete(&rng->cleanup_done);
}

/*
 * Make @rng the current rng.  Caller must hold rng_mutex.
 *
 * Returns the error from hwrng_init() if initialisation fails (the previous
 * rng is then left in place), 0 otherwise.  Failure to start the fill
 * thread is only logged.
 */
static int set_current_rng(struct hwrng *rng)
{
	int err;

	BUG_ON(!mutex_is_locked(&rng_mutex));

	err = hwrng_init(rng);
	if (err)
		return err;

	/* the new rng is initialised; only now let go of the old one */
	drop_current_rng();
	current_rng = rng;

	/* if necessary, start hwrng thread */
	if (!hwrng_fill) {
		hwrng_fill = kthread_run(hwrng_fillfn, NULL, "hwrng");
		if (IS_ERR(hwrng_fill)) {
			pr_err("hwrng_fill thread creation failed\n");
			hwrng_fill = NULL;
		}
	}

	return 0;
}

/* Drop the reference held through current_rng.  Caller must hold rng_mutex. */
static void drop_current_rng(void)
{
	BUG_ON(!mutex_is_locked(&rng_mutex));
	if (!current_rng)
		return;

	/* decrease last reference for triggering the cleanup */
	kref_put(&current_rng->ref, cleanup_rng);
	current_rng = NULL;
}

/*
Returns ERR_PTR(), NULL or refcounted hwrng */ static struct hwrng *get_current_rng_nolock(void) { if (current_rng) kref_get(&current_rng->ref); return current_rng; } static struct hwrng *get_current_rng(void) { struct hwrng *rng; if (mutex_lock_interruptible(&rng_mutex)) return ERR_PTR(-ERESTARTSYS); rng = get_current_rng_nolock(); mutex_unlock(&rng_mutex); return rng; } static void put_rng(struct hwrng *rng) { /* * Hold rng_mutex here so we serialize in case they set_current_rng * on rng again immediately. */ mutex_lock(&rng_mutex); if (rng) kref_put(&rng->ref, cleanup_rng); mutex_unlock(&rng_mutex); } static int hwrng_init(struct hwrng *rng) { if (kref_get_unless_zero(&rng->ref)) goto skip_init; if (rng->init) { int ret; ret = rng->init(rng); if (ret) return ret; } kref_init(&rng->ref); reinit_completion(&rng->cleanup_done); skip_init: current_quality = rng->quality; /* obsolete */ return 0; } static int rng_dev_open(struct inode *inode, struct file *filp) { /* enforce read-only access to this chrdev */ if ((filp->f_mode & FMODE_READ) == 0) return -EINVAL; if (filp->f_mode & FMODE_WRITE) return -EINVAL; return 0; } static inline int rng_get_data(struct hwrng *rng, u8 *buffer, size_t size, int wait) { int present; BUG_ON(!mutex_is_locked(&reading_mutex)); if (rng->read) { int err; err = rng->read(rng, buffer, size, wait); if (WARN_ON_ONCE(err > 0 && err > size)) err = size; return err; } if (rng->data_present) present = rng->data_present(rng, wait); else present = 1; if (present) return rng->data_read(rng, (u32 *)buffer); return 0; } static ssize_t rng_dev_read(struct file *filp, char __user *buf, size_t size, loff_t *offp) { u8 buffer[RNG_BUFFER_SIZE]; ssize_t ret = 0; int err = 0; int bytes_read, len; struct hwrng *rng; while (size) { rng = get_current_rng(); if (IS_ERR(rng)) { err = PTR_ERR(rng); goto out; } if (!rng) { err = -ENODEV; goto out; } if (mutex_lock_interruptible(&reading_mutex)) { err = -ERESTARTSYS; goto out_put; } if (!data_avail) { bytes_read = 
rng_get_data(rng, rng_buffer, rng_buffer_size(), !(filp->f_flags & O_NONBLOCK)); if (bytes_read < 0) { err = bytes_read; goto out_unlock_reading; } else if (bytes_read == 0 && (filp->f_flags & O_NONBLOCK)) { err = -EAGAIN; goto out_unlock_reading; } data_avail = bytes_read; } len = data_avail; if (len) { if (len > size) len = size; data_avail -= len; memcpy(buffer, rng_buffer + data_avail, len); } mutex_unlock(&reading_mutex); put_rng(rng); if (len) { if (copy_to_user(buf + ret, buffer, len)) { err = -EFAULT; goto out; } size -= len; ret += len; } if (need_resched()) schedule_timeout_interruptible(1); if (signal_pending(current)) { err = -ERESTARTSYS; goto out; } } out: memzero_explicit(buffer, sizeof(buffer)); return ret ? : err; out_unlock_reading: mutex_unlock(&reading_mutex); out_put: put_rng(rng); goto out; } static const struct file_operations rng_chrdev_ops = { .owner = THIS_MODULE, .open = rng_dev_open, .read = rng_dev_read, .llseek = noop_llseek, }; static const struct attribute_group *rng_dev_groups[]; static struct miscdevice rng_miscdev = { .minor = HWRNG_MINOR, .name = RNG_MODULE_NAME, .nodename = "hwrng", .fops = &rng_chrdev_ops, .groups = rng_dev_groups, }; static int enable_best_rng(void) { struct hwrng *rng, *new_rng = NULL; int ret = -ENODEV; BUG_ON(!mutex_is_locked(&rng_mutex)); /* no rng to use? */ if (list_empty(&rng_list)) { drop_current_rng(); cur_rng_set_by_user = 0; return 0; } /* use the rng which offers the best quality */ list_for_each_entry(rng, &rng_list, list) { if (!new_rng || rng->quality > new_rng->quality) new_rng = rng; } ret = ((new_rng == current_rng) ? 
0 : set_current_rng(new_rng)); if (!ret) cur_rng_set_by_user = 0; return ret; } static ssize_t rng_current_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t len) { int err; struct hwrng *rng, *new_rng; err = mutex_lock_interruptible(&rng_mutex); if (err) return -ERESTARTSYS; if (sysfs_streq(buf, "")) { err = enable_best_rng(); } else { list_for_each_entry(rng, &rng_list, list) { if (sysfs_streq(rng->name, buf)) { err = set_current_rng(rng); if (!err) cur_rng_set_by_user = 1; break; } } } new_rng = get_current_rng_nolock(); mutex_unlock(&rng_mutex); if (new_rng) put_rng(new_rng); return err ? : len; } static ssize_t rng_current_show(struct device *dev, struct device_attribute *attr, char *buf) { ssize_t ret; struct hwrng *rng; rng = get_current_rng(); if (IS_ERR(rng)) return PTR_ERR(rng); ret = sysfs_emit(buf, "%s\n", rng ? rng->name : "none"); put_rng(rng); return ret; } static ssize_t rng_available_show(struct device *dev, struct device_attribute *attr, char *buf) { int err; struct hwrng *rng; err = mutex_lock_interruptible(&rng_mutex); if (err) return -ERESTARTSYS; buf[0] = '\0'; list_for_each_entry(rng, &rng_list, list) { strlcat(buf, rng->name, PAGE_SIZE); strlcat(buf, " ", PAGE_SIZE); } strlcat(buf, "\n", PAGE_SIZE); mutex_unlock(&rng_mutex); return strlen(buf); } static ssize_t rng_selected_show(struct device *dev, struct device_attribute *attr, char *buf) { return sysfs_emit(buf, "%d\n", cur_rng_set_by_user); } static ssize_t rng_quality_show(struct device *dev, struct device_attribute *attr, char *buf) { ssize_t ret; struct hwrng *rng; rng = get_current_rng(); if (IS_ERR(rng)) return PTR_ERR(rng); if (!rng) /* no need to put_rng */ return -ENODEV; ret = sysfs_emit(buf, "%hu\n", rng->quality); put_rng(rng); return ret; } static ssize_t rng_quality_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t len) { u16 quality; int ret = -EINVAL; if (len < 2) return -EINVAL; ret = 
mutex_lock_interruptible(&rng_mutex); if (ret) return -ERESTARTSYS; ret = kstrtou16(buf, 0, &quality); if (ret || quality > 1024) { ret = -EINVAL; goto out; } if (!current_rng) { ret = -ENODEV; goto out; } current_rng->quality = quality; current_quality = quality; /* obsolete */ /* the best available RNG may have changed */ ret = enable_best_rng(); out: mutex_unlock(&rng_mutex); return ret ? ret : len; } static DEVICE_ATTR_RW(rng_current); static DEVICE_ATTR_RO(rng_available); static DEVICE_ATTR_RO(rng_selected); static DEVICE_ATTR_RW(rng_quality); static struct attribute *rng_dev_attrs[] = { &dev_attr_rng_current.attr, &dev_attr_rng_available.attr, &dev_attr_rng_selected.attr, &dev_attr_rng_quality.attr, NULL }; ATTRIBUTE_GROUPS(rng_dev); static int hwrng_fillfn(void *unused) { size_t entropy, entropy_credit = 0; /* in 1/1024 of a bit */ long rc; while (!kthread_should_stop()) { unsigned short quality; struct hwrng *rng; rng = get_current_rng(); if (IS_ERR(rng) || !rng) break; mutex_lock(&reading_mutex); rc = rng_get_data(rng, rng_fillbuf, rng_buffer_size(), 1); if (current_quality != rng->quality) rng->quality = current_quality; /* obsolete */ quality = rng->quality; mutex_unlock(&reading_mutex); if (rc <= 0) hwrng_msleep(rng, 10000); put_rng(rng); if (rc <= 0) continue; /* If we cannot credit at least one bit of entropy, * keep track of the remainder for the next iteration */ entropy = rc * quality * 8 + entropy_credit; if ((entropy >> 10) == 0) entropy_credit = entropy; /* Outside lock, sure, but y'know: randomness. */ add_hwgenerator_randomness((void *)rng_fillbuf, rc, entropy >> 10, true); } hwrng_fill = NULL; return 0; } int hwrng_register(struct hwrng *rng) { int err = -EINVAL; struct hwrng *tmp; if (!rng->name || (!rng->data_read && !rng->read)) goto out; mutex_lock(&rng_mutex); /* Must not register two RNGs with the same name. 
*/ err = -EEXIST; list_for_each_entry(tmp, &rng_list, list) { if (strcmp(tmp->name, rng->name) == 0) goto out_unlock; } list_add_tail(&rng->list, &rng_list); init_completion(&rng->cleanup_done); complete(&rng->cleanup_done); init_completion(&rng->dying); /* Adjust quality field to always have a proper value */ rng->quality = min_t(u16, min_t(u16, default_quality, 1024), rng->quality ?: 1024); if (!current_rng || (!cur_rng_set_by_user && rng->quality > current_rng->quality)) { /* * Set new rng as current as the new rng source * provides better entropy quality and was not * chosen by userspace. */ err = set_current_rng(rng); if (err) goto out_unlock; } mutex_unlock(&rng_mutex); return 0; out_unlock: mutex_unlock(&rng_mutex); out: return err; } EXPORT_SYMBOL_GPL(hwrng_register); void hwrng_unregister(struct hwrng *rng) { struct hwrng *new_rng; int err; mutex_lock(&rng_mutex); list_del(&rng->list); complete_all(&rng->dying); if (current_rng == rng) { err = enable_best_rng(); if (err) { drop_current_rng(); cur_rng_set_by_user = 0; } } new_rng = get_current_rng_nolock(); if (list_empty(&rng_list)) { mutex_unlock(&rng_mutex); if (hwrng_fill) kthread_stop(hwrng_fill); } else mutex_unlock(&rng_mutex); if (new_rng) put_rng(new_rng); wait_for_completion(&rng->cleanup_done); } EXPORT_SYMBOL_GPL(hwrng_unregister); static void devm_hwrng_release(struct device *dev, void *res) { hwrng_unregister(*(struct hwrng **)res); } static int devm_hwrng_match(struct device *dev, void *res, void *data) { struct hwrng **r = res; if (WARN_ON(!r || !*r)) return 0; return *r == data; } int devm_hwrng_register(struct device *dev, struct hwrng *rng) { struct hwrng **ptr; int error; ptr = devres_alloc(devm_hwrng_release, sizeof(*ptr), GFP_KERNEL); if (!ptr) return -ENOMEM; error = hwrng_register(rng); if (error) { devres_free(ptr); return error; } *ptr = rng; devres_add(dev, ptr); return 0; } EXPORT_SYMBOL_GPL(devm_hwrng_register); void devm_hwrng_unregister(struct device *dev, struct hwrng *rng) { 
devres_release(dev, devm_hwrng_release, devm_hwrng_match, rng); } EXPORT_SYMBOL_GPL(devm_hwrng_unregister); long hwrng_msleep(struct hwrng *rng, unsigned int msecs) { unsigned long timeout = msecs_to_jiffies(msecs) + 1; return wait_for_completion_interruptible_timeout(&rng->dying, timeout); } EXPORT_SYMBOL_GPL(hwrng_msleep); long hwrng_yield(struct hwrng *rng) { return wait_for_completion_interruptible_timeout(&rng->dying, 1); } EXPORT_SYMBOL_GPL(hwrng_yield); static int __init hwrng_modinit(void) { int ret; /* kmalloc makes this safe for virt_to_page() in virtio_rng.c */ rng_buffer = kmalloc(rng_buffer_size(), GFP_KERNEL); if (!rng_buffer) return -ENOMEM; rng_fillbuf = kmalloc(rng_buffer_size(), GFP_KERNEL); if (!rng_fillbuf) { kfree(rng_buffer); return -ENOMEM; } ret = misc_register(&rng_miscdev); if (ret) { kfree(rng_fillbuf); kfree(rng_buffer); } return ret; } static void __exit hwrng_modexit(void) { mutex_lock(&rng_mutex); BUG_ON(current_rng); kfree(rng_buffer); kfree(rng_fillbuf); mutex_unlock(&rng_mutex); misc_deregister(&rng_miscdev); } fs_initcall(hwrng_modinit); /* depends on misc_register() */ module_exit(hwrng_modexit); MODULE_DESCRIPTION("H/W Random Number Generator (RNG) driver"); MODULE_LICENSE("GPL");
19 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 1 1 1 1 1 1 2 2 2 2 7 7 1 1 173 173 172 173 173 171 173 172 171 173 171 20 1 5 5 4 4 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 3 1 1 4 17 17 17 17 9 8 8 1 8 5 1 10 2 2 2 2 3 12 12 8 10 10 8 4 8 2 6 4 6 15 15 2 13 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 
460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 
960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 // SPDX-License-Identifier: GPL-2.0-only #include 
"cgroup-internal.h"

#include <linux/ctype.h>
#include <linux/kmod.h>
#include <linux/sort.h>
#include <linux/delay.h>
#include <linux/mm.h>
#include <linux/sched/signal.h>
#include <linux/sched/task.h>
#include <linux/magic.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/delayacct.h>
#include <linux/pid_namespace.h>
#include <linux/cgroupstats.h>
#include <linux/fs_parser.h>

#include <trace/events/cgroup.h>

/*
 * pidlists linger the following amount before being destroyed.  The goal
 * is avoiding frequent destruction in the middle of consecutive read calls
 * Expiring in the middle is a performance problem not a correctness one.
 * 1 sec should be enough.
 */
#define CGROUP_PIDLIST_DESTROY_DELAY	HZ

/* Controllers blocked by the commandline in v1 */
static u16 cgroup_no_v1_mask;

/* disable named v1 mounts */
static bool cgroup_no_v1_named;

/*
 * pidlist destructions need to be flushed on cgroup destruction.  Use a
 * separate workqueue as flush domain.
 */
static struct workqueue_struct *cgroup_pidlist_destroy_wq;

/* protects cgroup_subsys->release_agent_path */
static DEFINE_SPINLOCK(release_agent_path_lock);

/* Returns true when bit @ssid is set in cgroup_no_v1_mask. */
bool cgroup1_ssid_disabled(int ssid)
{
	return cgroup_no_v1_mask & (1 << ssid);
}

/*
 * Returns true when @ss has no legacy (v1) cftypes but does have default
 * hierarchy cftypes.
 */
static bool cgroup1_subsys_absent(struct cgroup_subsys *ss)
{
	/* Check also dfl_cftypes for file-less controllers, i.e. perf_event */
	return ss->legacy_cftypes == NULL && ss->dfl_cftypes;
}

/**
 * cgroup_attach_task_all - attach task 'tsk' to all cgroups of task 'from'
 * @from: attach to all cgroups of a given task
 * @tsk: the task to be attached
 *
 * Return: %0 on success or a negative errno code on failure
 */
int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
{
	struct cgroup_root *root;
	int retval = 0;

	cgroup_lock();
	cgroup_attach_lock(true);
	for_each_root(root) {
		struct cgroup *from_cgrp;

		/* css_set_lock is held only for the lookup of @from's cgroup */
		spin_lock_irq(&css_set_lock);
		from_cgrp = task_cgroup_from_root(from, root);
		spin_unlock_irq(&css_set_lock);

		/* stop at the first hierarchy that refuses the attach */
		retval = cgroup_attach_task(from_cgrp, tsk, false);
		if (retval)
			break;
	}
	cgroup_attach_unlock(true);
	cgroup_unlock();

	return retval;
}
EXPORT_SYMBOL_GPL(cgroup_attach_task_all);

/**
 * cgroup_transfer_tasks - move tasks from one cgroup to another
 * @to: cgroup to which the tasks will be moved
 * @from: cgroup in which the tasks currently reside
 *
 * Locking rules between cgroup_post_fork() and the migration path
 * guarantee that, if a task is forking while being migrated, the new child
 * is guaranteed to be either visible in the source cgroup after the
 * parent's migration is complete or put into the target cgroup.  No task
 * can slip out of migration through forking.
 *
 * Return: %0 on success or a negative errno code on failure
 */
int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
{
	DEFINE_CGROUP_MGCTX(mgctx);
	struct cgrp_cset_link *link;
	struct css_task_iter it;
	struct task_struct *task;
	int ret;

	/* the destination must not be on the default hierarchy */
	if (cgroup_on_dfl(to))
		return -EINVAL;

	ret = cgroup_migrate_vet_dst(to);
	if (ret)
		return ret;

	cgroup_lock();

	cgroup_attach_lock(true);

	/* all tasks in @from are being moved, all csets are source */
	spin_lock_irq(&css_set_lock);
	list_for_each_entry(link, &from->cset_links, cset_link)
		cgroup_migrate_add_src(link->cset, to, &mgctx);
	spin_unlock_irq(&css_set_lock);

	ret = cgroup_migrate_prepare_dst(&mgctx);
	if (ret)
		goto out_err;

	/*
	 * Migrate tasks one-by-one until @from is empty.  This fails iff
	 * ->can_attach() fails.
	 */
	do {
		css_task_iter_start(&from->self, 0, &it);

		/* skip tasks that are already exiting */
		do {
			task = css_task_iter_next(&it);
		} while (task && (task->flags & PF_EXITING));

		/* pin the task before the iterator is torn down */
		if (task)
			get_task_struct(task);
		css_task_iter_end(&it);

		if (task) {
			ret = cgroup_migrate(task, false, &mgctx);
			if (!ret)
				TRACE_CGROUP_PATH(transfer_tasks, to, task, false);
			put_task_struct(task);
		}
	} while (task && !ret);
out_err:
	cgroup_migrate_finish(&mgctx);
	cgroup_attach_unlock(true);
	cgroup_unlock();
	return ret;
}

/*
 * Stuff for reading the 'tasks'/'procs' files.
 *
 * Reading this file can return large amounts of data if a cgroup has
 * *lots* of attached tasks. So it may need several calls to read(),
 * but we cannot guarantee that the information we produce is correct
 * unless we produce it entirely atomically.
 *
 */

/* which pidlist file are we talking about? */
enum cgroup_filetype {
	CGROUP_FILE_PROCS,
	CGROUP_FILE_TASKS,
};

/*
 * A pidlist is a list of pids that virtually represents the contents of one
 * of the cgroup files ("procs" or "tasks"). We keep a list of such pidlists,
 * a pair (one each for procs, tasks) for each pid namespace that's relevant
 * to the cgroup.
 */
struct cgroup_pidlist {
	/*
	 * used to find which pidlist is wanted. doesn't change as long as
	 * this particular list stays in the list.
	 */
	struct { enum cgroup_filetype type; struct pid_namespace *ns; } key;
	/* array of xids */
	pid_t *list;
	/* how many elements the above list has */
	int length;
	/* each of these stored in a list by its cgroup */
	struct list_head links;
	/* pointer to the cgroup we belong to, for list removal purposes */
	struct cgroup *owner;
	/* for delayed destruction */
	struct delayed_work destroy_dwork;
};

/*
 * Used to destroy all pidlists lingering waiting for destroy timer.  None
 * should be left afterwards.
 */
void cgroup1_pidlist_destroy_all(struct cgroup *cgrp)
{
	struct cgroup_pidlist *l, *tmp_l;

	mutex_lock(&cgrp->pidlist_mutex);
	/* re-arm every pending destruction with zero delay */
	list_for_each_entry_safe(l, tmp_l, &cgrp->pidlists, links)
		mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork, 0);
	mutex_unlock(&cgrp->pidlist_mutex);

	flush_workqueue(cgroup_pidlist_destroy_wq);
	BUG_ON(!list_empty(&cgrp->pidlists));
}

/*
 * Delayed-work handler that frees one pidlist, unless the work was queued
 * again in the meantime.
 */
static void cgroup_pidlist_destroy_work_fn(struct work_struct *work)
{
	struct delayed_work *dwork = to_delayed_work(work);
	struct cgroup_pidlist *l = container_of(dwork, struct cgroup_pidlist,
						destroy_dwork);
	struct cgroup_pidlist *tofree = NULL;

	mutex_lock(&l->owner->pidlist_mutex);

	/*
	 * Destroy iff we didn't get queued again.  The state won't change
	 * as destroy_dwork can only be queued while locked.
	 */
	if (!delayed_work_pending(dwork)) {
		list_del(&l->links);
		kvfree(l->list);
		put_pid_ns(l->key.ns);
		tofree = l;
	}

	mutex_unlock(&l->owner->pidlist_mutex);
	/* freed after unlocking: the mutex is reached through l->owner */
	kfree(tofree);
}

/*
 * pidlist_uniq - given a kmalloc()ed list, strip out all duplicate entries
 * Returns the number of unique elements.
 *
 * Only adjacent duplicates are removed (each entry is compared with its
 * predecessor).
 */
static int pidlist_uniq(pid_t *list, int length)
{
	int src, dest = 1;

	/*
	 * we presume the 0th element is unique, so src starts at 1. trivial
	 * edge cases first; no work needs to be done for either
	 */
	if (length == 0 || length == 1)
		return length;
	/* src and dest walk down the list; dest counts unique elements */
	for (src = 1; src < length; src++) {
		/* find next unique element */
		while (list[src] == list[src-1]) {
			src++;
			if (src == length)
				goto after;
		}
		/* dest always points to where the next unique element goes */
		list[dest] = list[src];
		dest++;
	}
after:
	return dest;
}

/*
 * The two pid files - task and cgroup.procs - guaranteed that the result
 * is sorted, which forced this whole pidlist fiasco.  As pid order is
 * different per namespace, each namespace needs differently sorted list,
 * making it impossible to use, for example, single rbtree of member tasks
 * sorted by task pointer.  As pidlists can be fairly large, allocating one
 * per open file is dangerous, so cgroup had to implement shared pool of
 * pidlists keyed by cgroup and namespace.
 */
/* Comparison callback: orders two pid_t values ascending by subtraction. */
static int cmppid(const void *a, const void *b)
{
	return *(pid_t *)a - *(pid_t *)b;
}

/*
 * Return the pidlist of @cgrp matching @type and the caller's active pid
 * namespace, or NULL if there is none.  Caller must hold
 * cgrp->pidlist_mutex.
 */
static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
						  enum cgroup_filetype type)
{
	struct cgroup_pidlist *l;
	/* don't need task_nsproxy() if we're looking at ourself */
	struct pid_namespace *ns = task_active_pid_ns(current);

	lockdep_assert_held(&cgrp->pidlist_mutex);

	list_for_each_entry(l, &cgrp->pidlists, links)
		if (l->key.type == type && l->key.ns == ns)
			return l;
	return NULL;
}

/*
 * find the appropriate pidlist for our purpose (given procs vs tasks)
 * returns with the lock on that pidlist already held, and takes care
 * of the use count, or returns NULL with no locks held if we're out of
 * memory.
*/ static struct cgroup_pidlist *cgroup_pidlist_find_create(struct cgroup *cgrp, enum cgroup_filetype type) { struct cgroup_pidlist *l; lockdep_assert_held(&cgrp->pidlist_mutex); l = cgroup_pidlist_find(cgrp, type); if (l) return l; /* entry not found; create a new one */ l = kzalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL); if (!l) return l; INIT_DELAYED_WORK(&l->destroy_dwork, cgroup_pidlist_destroy_work_fn); l->key.type = type; /* don't need task_nsproxy() if we're looking at ourself */ l->key.ns = get_pid_ns(task_active_pid_ns(current)); l->owner = cgrp; list_add(&l->links, &cgrp->pidlists); return l; } /* * Load a cgroup's pidarray with either procs' tgids or tasks' pids */ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type, struct cgroup_pidlist **lp) { pid_t *array; int length; int pid, n = 0; /* used for populating the array */ struct css_task_iter it; struct task_struct *tsk; struct cgroup_pidlist *l; lockdep_assert_held(&cgrp->pidlist_mutex); /* * If cgroup gets more users after we read count, we won't have * enough space - tough. This race is indistinguishable to the * caller from the case that the additional cgroup users didn't * show up until sometime later on. 
*/ length = cgroup_task_count(cgrp); array = kvmalloc_array(length, sizeof(pid_t), GFP_KERNEL); if (!array) return -ENOMEM; /* now, populate the array */ css_task_iter_start(&cgrp->self, 0, &it); while ((tsk = css_task_iter_next(&it))) { if (unlikely(n == length)) break; /* get tgid or pid for procs or tasks file respectively */ if (type == CGROUP_FILE_PROCS) pid = task_tgid_vnr(tsk); else pid = task_pid_vnr(tsk); if (pid > 0) /* make sure to only use valid results */ array[n++] = pid; } css_task_iter_end(&it); length = n; /* now sort & strip out duplicates (tgids or recycled thread PIDs) */ sort(array, length, sizeof(pid_t), cmppid, NULL); length = pidlist_uniq(array, length); l = cgroup_pidlist_find_create(cgrp, type); if (!l) { kvfree(array); return -ENOMEM; } /* store array, freeing old if necessary */ kvfree(l->list); l->list = array; l->length = length; *lp = l; return 0; } /* * seq_file methods for the tasks/procs files. The seq_file position is the * next pid to display; the seq_file iterator is a pointer to the pid * in the cgroup->l->list array. */ static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos) { /* * Initially we receive a position value that corresponds to * one more than the last pid shown (or 0 on the first call or * after a seek to the start). Use a binary-search to find the * next pid to display, if any */ struct kernfs_open_file *of = s->private; struct cgroup_file_ctx *ctx = of->priv; struct cgroup *cgrp = seq_css(s)->cgroup; struct cgroup_pidlist *l; enum cgroup_filetype type = seq_cft(s)->private; int index = 0, pid = *pos; int *iter, ret; mutex_lock(&cgrp->pidlist_mutex); /* * !NULL @ctx->procs1.pidlist indicates that this isn't the first * start() after open. If the matching pidlist is around, we can use * that. Look for it. Note that @ctx->procs1.pidlist can't be used * directly. It could already have been destroyed. 
*/ if (ctx->procs1.pidlist) ctx->procs1.pidlist = cgroup_pidlist_find(cgrp, type); /* * Either this is the first start() after open or the matching * pidlist has been destroyed inbetween. Create a new one. */ if (!ctx->procs1.pidlist) { ret = pidlist_array_load(cgrp, type, &ctx->procs1.pidlist); if (ret) return ERR_PTR(ret); } l = ctx->procs1.pidlist; if (pid) { int end = l->length; while (index < end) { int mid = (index + end) / 2; if (l->list[mid] == pid) { index = mid; break; } else if (l->list[mid] < pid) index = mid + 1; else end = mid; } } /* If we're off the end of the array, we're done */ if (index >= l->length) return NULL; /* Update the abstract position to be the actual pid that we found */ iter = l->list + index; *pos = *iter; return iter; } static void cgroup_pidlist_stop(struct seq_file *s, void *v) { struct kernfs_open_file *of = s->private; struct cgroup_file_ctx *ctx = of->priv; struct cgroup_pidlist *l = ctx->procs1.pidlist; if (l) mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork, CGROUP_PIDLIST_DESTROY_DELAY); mutex_unlock(&seq_css(s)->cgroup->pidlist_mutex); } static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos) { struct kernfs_open_file *of = s->private; struct cgroup_file_ctx *ctx = of->priv; struct cgroup_pidlist *l = ctx->procs1.pidlist; pid_t *p = v; pid_t *end = l->list + l->length; /* * Advance to the next pid in the array. 
If this goes off the * end, we're done */ p++; if (p >= end) { (*pos)++; return NULL; } else { *pos = *p; return p; } } static int cgroup_pidlist_show(struct seq_file *s, void *v) { seq_printf(s, "%d\n", *(int *)v); return 0; } static ssize_t __cgroup1_procs_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off, bool threadgroup) { struct cgroup *cgrp; struct task_struct *task; const struct cred *cred, *tcred; ssize_t ret; bool locked; cgrp = cgroup_kn_lock_live(of->kn, false); if (!cgrp) return -ENODEV; task = cgroup_procs_write_start(buf, threadgroup, &locked); ret = PTR_ERR_OR_ZERO(task); if (ret) goto out_unlock; /* * Even if we're attaching all tasks in the thread group, we only need * to check permissions on one of them. Check permissions using the * credentials from file open to protect against inherited fd attacks. */ cred = of->file->f_cred; tcred = get_task_cred(task); if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) && !uid_eq(cred->euid, tcred->uid) && !uid_eq(cred->euid, tcred->suid)) ret = -EACCES; put_cred(tcred); if (ret) goto out_finish; ret = cgroup_attach_task(cgrp, task, threadgroup); out_finish: cgroup_procs_write_finish(task, locked); out_unlock: cgroup_kn_unlock(of->kn); return ret ?: nbytes; } static ssize_t cgroup1_procs_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { return __cgroup1_procs_write(of, buf, nbytes, off, true); } static ssize_t cgroup1_tasks_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { return __cgroup1_procs_write(of, buf, nbytes, off, false); } static ssize_t cgroup_release_agent_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { struct cgroup *cgrp; struct cgroup_file_ctx *ctx; BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX); /* * Release agent gets called with all capabilities, * require capabilities to set release agent. 
*/ ctx = of->priv; if ((ctx->ns->user_ns != &init_user_ns) || !file_ns_capable(of->file, &init_user_ns, CAP_SYS_ADMIN)) return -EPERM; cgrp = cgroup_kn_lock_live(of->kn, false); if (!cgrp) return -ENODEV; spin_lock(&release_agent_path_lock); strscpy(cgrp->root->release_agent_path, strstrip(buf), sizeof(cgrp->root->release_agent_path)); spin_unlock(&release_agent_path_lock); cgroup_kn_unlock(of->kn); return nbytes; } static int cgroup_release_agent_show(struct seq_file *seq, void *v) { struct cgroup *cgrp = seq_css(seq)->cgroup; spin_lock(&release_agent_path_lock); seq_puts(seq, cgrp->root->release_agent_path); spin_unlock(&release_agent_path_lock); seq_putc(seq, '\n'); return 0; } static int cgroup_sane_behavior_show(struct seq_file *seq, void *v) { seq_puts(seq, "0\n"); return 0; } static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css, struct cftype *cft) { return notify_on_release(css->cgroup); } static int cgroup_write_notify_on_release(struct cgroup_subsys_state *css, struct cftype *cft, u64 val) { if (val) set_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags); else clear_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags); return 0; } static u64 cgroup_clone_children_read(struct cgroup_subsys_state *css, struct cftype *cft) { return test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags); } static int cgroup_clone_children_write(struct cgroup_subsys_state *css, struct cftype *cft, u64 val) { if (val) set_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags); else clear_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags); return 0; } /* cgroup core interface files for the legacy hierarchies */ struct cftype cgroup1_base_files[] = { { .name = "cgroup.procs", .seq_start = cgroup_pidlist_start, .seq_next = cgroup_pidlist_next, .seq_stop = cgroup_pidlist_stop, .seq_show = cgroup_pidlist_show, .private = CGROUP_FILE_PROCS, .write = cgroup1_procs_write, }, { .name = "cgroup.clone_children", .read_u64 = cgroup_clone_children_read, .write_u64 = 
cgroup_clone_children_write, }, { .name = "cgroup.sane_behavior", .flags = CFTYPE_ONLY_ON_ROOT, .seq_show = cgroup_sane_behavior_show, }, { .name = "tasks", .seq_start = cgroup_pidlist_start, .seq_next = cgroup_pidlist_next, .seq_stop = cgroup_pidlist_stop, .seq_show = cgroup_pidlist_show, .private = CGROUP_FILE_TASKS, .write = cgroup1_tasks_write, }, { .name = "notify_on_release", .read_u64 = cgroup_read_notify_on_release, .write_u64 = cgroup_write_notify_on_release, }, { .name = "release_agent", .flags = CFTYPE_ONLY_ON_ROOT, .seq_show = cgroup_release_agent_show, .write = cgroup_release_agent_write, .max_write_len = PATH_MAX - 1, }, { } /* terminate */ }; /* Display information about each subsystem and each hierarchy */ int proc_cgroupstats_show(struct seq_file *m, void *v) { struct cgroup_subsys *ss; int i; seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n"); /* * Grab the subsystems state racily. No need to add avenue to * cgroup_mutex contention. */ for_each_subsys(ss, i) { if (cgroup1_subsys_absent(ss)) continue; seq_printf(m, "%s\t%d\t%d\t%d\n", ss->legacy_name, ss->root->hierarchy_id, atomic_read(&ss->root->nr_cgrps), cgroup_ssid_enabled(i)); } return 0; } /** * cgroupstats_build - build and fill cgroupstats * @stats: cgroupstats to fill information into * @dentry: A dentry entry belonging to the cgroup for which stats have * been requested. * * Build and fill cgroupstats so that taskstats can export it to user * space. * * Return: %0 on success or a negative errno code on failure */ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) { struct kernfs_node *kn = kernfs_node_from_dentry(dentry); struct cgroup *cgrp; struct css_task_iter it; struct task_struct *tsk; /* it should be kernfs_node belonging to cgroupfs and is a directory */ if (dentry->d_sb->s_type != &cgroup_fs_type || !kn || kernfs_type(kn) != KERNFS_DIR) return -EINVAL; /* * We aren't being called from kernfs and there's no guarantee on * @kn->priv's validity. 
For this and css_tryget_online_from_dir(), * @kn->priv is RCU safe. Let's do the RCU dancing. */ rcu_read_lock(); cgrp = rcu_dereference(*(void __rcu __force **)&kn->priv); if (!cgrp || !cgroup_tryget(cgrp)) { rcu_read_unlock(); return -ENOENT; } rcu_read_unlock(); css_task_iter_start(&cgrp->self, 0, &it); while ((tsk = css_task_iter_next(&it))) { switch (READ_ONCE(tsk->__state)) { case TASK_RUNNING: stats->nr_running++; break; case TASK_INTERRUPTIBLE: stats->nr_sleeping++; break; case TASK_UNINTERRUPTIBLE: stats->nr_uninterruptible++; break; case TASK_STOPPED: stats->nr_stopped++; break; default: if (tsk->in_iowait) stats->nr_io_wait++; break; } } css_task_iter_end(&it); cgroup_put(cgrp); return 0; } void cgroup1_check_for_release(struct cgroup *cgrp) { if (notify_on_release(cgrp) && !cgroup_is_populated(cgrp) && !css_has_online_children(&cgrp->self) && !cgroup_is_dead(cgrp)) schedule_work(&cgrp->release_agent_work); } /* * Notify userspace when a cgroup is released, by running the * configured release agent with the name of the cgroup (path * relative to the root of cgroup file system) as the argument. * * Most likely, this user command will try to rmdir this cgroup. * * This races with the possibility that some other task will be * attached to this cgroup before it is removed, or that some other * user task will 'mkdir' a child cgroup of this cgroup. That's ok. * The presumed 'rmdir' will fail quietly if this cgroup is no longer * unused, and this cgroup will be reprieved from its death sentence, * to continue to serve a useful existence. Next time it's released, * we will get notified again, if it still has 'notify_on_release' set. * * The final arg to call_usermodehelper() is UMH_WAIT_EXEC, which * means only wait until the task is successfully execve()'d. The * separate release agent task is forked by call_usermodehelper(), * then control in this thread returns here, without waiting for the * release agent task. 
We don't bother to wait because the caller of * this routine has no use for the exit status of the release agent * task, so no sense holding our caller up for that. */ void cgroup1_release_agent(struct work_struct *work) { struct cgroup *cgrp = container_of(work, struct cgroup, release_agent_work); char *pathbuf, *agentbuf; char *argv[3], *envp[3]; int ret; /* snoop agent path and exit early if empty */ if (!cgrp->root->release_agent_path[0]) return; /* prepare argument buffers */ pathbuf = kmalloc(PATH_MAX, GFP_KERNEL); agentbuf = kmalloc(PATH_MAX, GFP_KERNEL); if (!pathbuf || !agentbuf) goto out_free; spin_lock(&release_agent_path_lock); strscpy(agentbuf, cgrp->root->release_agent_path, PATH_MAX); spin_unlock(&release_agent_path_lock); if (!agentbuf[0]) goto out_free; ret = cgroup_path_ns(cgrp, pathbuf, PATH_MAX, &init_cgroup_ns); if (ret < 0) goto out_free; argv[0] = agentbuf; argv[1] = pathbuf; argv[2] = NULL; /* minimal command environment */ envp[0] = "HOME=/"; envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin"; envp[2] = NULL; call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC); out_free: kfree(agentbuf); kfree(pathbuf); } /* * cgroup_rename - Only allow simple rename of directories in place. */ static int cgroup1_rename(struct kernfs_node *kn, struct kernfs_node *new_parent, const char *new_name_str) { struct cgroup *cgrp = kn->priv; int ret; /* do not accept '\n' to prevent making /proc/<pid>/cgroup unparsable */ if (strchr(new_name_str, '\n')) return -EINVAL; if (kernfs_type(kn) != KERNFS_DIR) return -ENOTDIR; if (kn->parent != new_parent) return -EIO; /* * We're gonna grab cgroup_mutex which nests outside kernfs * active_ref. kernfs_rename() doesn't require active_ref * protection. Break them before grabbing cgroup_mutex. 
*/ kernfs_break_active_protection(new_parent); kernfs_break_active_protection(kn); cgroup_lock(); ret = kernfs_rename(kn, new_parent, new_name_str); if (!ret) TRACE_CGROUP_PATH(rename, cgrp); cgroup_unlock(); kernfs_unbreak_active_protection(kn); kernfs_unbreak_active_protection(new_parent); return ret; } static int cgroup1_show_options(struct seq_file *seq, struct kernfs_root *kf_root) { struct cgroup_root *root = cgroup_root_from_kf(kf_root); struct cgroup_subsys *ss; int ssid; for_each_subsys(ss, ssid) if (root->subsys_mask & (1 << ssid)) seq_show_option(seq, ss->legacy_name, NULL); if (root->flags & CGRP_ROOT_NOPREFIX) seq_puts(seq, ",noprefix"); if (root->flags & CGRP_ROOT_XATTR) seq_puts(seq, ",xattr"); if (root->flags & CGRP_ROOT_CPUSET_V2_MODE) seq_puts(seq, ",cpuset_v2_mode"); if (root->flags & CGRP_ROOT_FAVOR_DYNMODS) seq_puts(seq, ",favordynmods"); spin_lock(&release_agent_path_lock); if (strlen(root->release_agent_path)) seq_show_option(seq, "release_agent", root->release_agent_path); spin_unlock(&release_agent_path_lock); if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags)) seq_puts(seq, ",clone_children"); if (strlen(root->name)) seq_show_option(seq, "name", root->name); return 0; } enum cgroup1_param { Opt_all, Opt_clone_children, Opt_cpuset_v2_mode, Opt_name, Opt_none, Opt_noprefix, Opt_release_agent, Opt_xattr, Opt_favordynmods, Opt_nofavordynmods, }; const struct fs_parameter_spec cgroup1_fs_parameters[] = { fsparam_flag ("all", Opt_all), fsparam_flag ("clone_children", Opt_clone_children), fsparam_flag ("cpuset_v2_mode", Opt_cpuset_v2_mode), fsparam_string("name", Opt_name), fsparam_flag ("none", Opt_none), fsparam_flag ("noprefix", Opt_noprefix), fsparam_string("release_agent", Opt_release_agent), fsparam_flag ("xattr", Opt_xattr), fsparam_flag ("favordynmods", Opt_favordynmods), fsparam_flag ("nofavordynmods", Opt_nofavordynmods), {} }; int cgroup1_parse_param(struct fs_context *fc, struct fs_parameter *param) { struct cgroup_fs_context 
*ctx = cgroup_fc2context(fc); struct cgroup_subsys *ss; struct fs_parse_result result; int opt, i; opt = fs_parse(fc, cgroup1_fs_parameters, param, &result); if (opt == -ENOPARAM) { int ret; ret = vfs_parse_fs_param_source(fc, param); if (ret != -ENOPARAM) return ret; for_each_subsys(ss, i) { if (strcmp(param->key, ss->legacy_name) || cgroup1_subsys_absent(ss)) continue; if (!cgroup_ssid_enabled(i) || cgroup1_ssid_disabled(i)) return invalfc(fc, "Disabled controller '%s'", param->key); ctx->subsys_mask |= (1 << i); return 0; } return invalfc(fc, "Unknown subsys name '%s'", param->key); } if (opt < 0) return opt; switch (opt) { case Opt_none: /* Explicitly have no subsystems */ ctx->none = true; break; case Opt_all: ctx->all_ss = true; break; case Opt_noprefix: ctx->flags |= CGRP_ROOT_NOPREFIX; break; case Opt_clone_children: ctx->cpuset_clone_children = true; break; case Opt_cpuset_v2_mode: ctx->flags |= CGRP_ROOT_CPUSET_V2_MODE; break; case Opt_xattr: ctx->flags |= CGRP_ROOT_XATTR; break; case Opt_favordynmods: ctx->flags |= CGRP_ROOT_FAVOR_DYNMODS; break; case Opt_nofavordynmods: ctx->flags &= ~CGRP_ROOT_FAVOR_DYNMODS; break; case Opt_release_agent: /* Specifying two release agents is forbidden */ if (ctx->release_agent) return invalfc(fc, "release_agent respecified"); /* * Release agent gets called with all capabilities, * require capabilities to set release agent. */ if ((fc->user_ns != &init_user_ns) || !capable(CAP_SYS_ADMIN)) return invalfc(fc, "Setting release_agent not allowed"); ctx->release_agent = param->string; param->string = NULL; break; case Opt_name: /* blocked by boot param? 
*/ if (cgroup_no_v1_named) return -ENOENT; /* Can't specify an empty name */ if (!param->size) return invalfc(fc, "Empty name"); if (param->size > MAX_CGROUP_ROOT_NAMELEN - 1) return invalfc(fc, "Name too long"); /* Must match [\w.-]+ */ for (i = 0; i < param->size; i++) { char c = param->string[i]; if (isalnum(c)) continue; if ((c == '.') || (c == '-') || (c == '_')) continue; return invalfc(fc, "Invalid name"); } /* Specifying two names is forbidden */ if (ctx->name) return invalfc(fc, "name respecified"); ctx->name = param->string; param->string = NULL; break; } return 0; } static int check_cgroupfs_options(struct fs_context *fc) { struct cgroup_fs_context *ctx = cgroup_fc2context(fc); u16 mask = U16_MAX; u16 enabled = 0; struct cgroup_subsys *ss; int i; #ifdef CONFIG_CPUSETS mask = ~((u16)1 << cpuset_cgrp_id); #endif for_each_subsys(ss, i) if (cgroup_ssid_enabled(i) && !cgroup1_ssid_disabled(i) && !cgroup1_subsys_absent(ss)) enabled |= 1 << i; ctx->subsys_mask &= enabled; /* * In absence of 'none', 'name=' and subsystem name options, * let's default to 'all'. */ if (!ctx->subsys_mask && !ctx->none && !ctx->name) ctx->all_ss = true; if (ctx->all_ss) { /* Mutually exclusive option 'all' + subsystem name */ if (ctx->subsys_mask) return invalfc(fc, "subsys name conflicts with all"); /* 'all' => select all the subsystems */ ctx->subsys_mask = enabled; } /* * We either have to specify by name or by subsystems. (So all * empty hierarchies must have a name). */ if (!ctx->subsys_mask && !ctx->name) return invalfc(fc, "Need name or subsystem set"); /* * Option noprefix was introduced just for backward compatibility * with the old cpuset, so we allow noprefix only if mounting just * the cpuset subsystem. 
*/ if ((ctx->flags & CGRP_ROOT_NOPREFIX) && (ctx->subsys_mask & mask)) return invalfc(fc, "noprefix used incorrectly"); /* Can't specify "none" and some subsystems */ if (ctx->subsys_mask && ctx->none) return invalfc(fc, "none used incorrectly"); return 0; } int cgroup1_reconfigure(struct fs_context *fc) { struct cgroup_fs_context *ctx = cgroup_fc2context(fc); struct kernfs_root *kf_root = kernfs_root_from_sb(fc->root->d_sb); struct cgroup_root *root = cgroup_root_from_kf(kf_root); int ret = 0; u16 added_mask, removed_mask; cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp); /* See what subsystems are wanted */ ret = check_cgroupfs_options(fc); if (ret) goto out_unlock; if (ctx->subsys_mask != root->subsys_mask || ctx->release_agent) pr_warn("option changes via remount are deprecated (pid=%d comm=%s)\n", task_tgid_nr(current), current->comm); added_mask = ctx->subsys_mask & ~root->subsys_mask; removed_mask = root->subsys_mask & ~ctx->subsys_mask; /* Don't allow flags or name to change at remount */ if ((ctx->flags ^ root->flags) || (ctx->name && strcmp(ctx->name, root->name))) { errorfc(fc, "option or name mismatch, new: 0x%x \"%s\", old: 0x%x \"%s\"", ctx->flags, ctx->name ?: "", root->flags, root->name); ret = -EINVAL; goto out_unlock; } /* remounting is not allowed for populated hierarchies */ if (!list_empty(&root->cgrp.self.children)) { ret = -EBUSY; goto out_unlock; } ret = rebind_subsystems(root, added_mask); if (ret) goto out_unlock; WARN_ON(rebind_subsystems(&cgrp_dfl_root, removed_mask)); if (ctx->release_agent) { spin_lock(&release_agent_path_lock); strcpy(root->release_agent_path, ctx->release_agent); spin_unlock(&release_agent_path_lock); } trace_cgroup_remount(root); out_unlock: cgroup_unlock(); return ret; } struct kernfs_syscall_ops cgroup1_kf_syscall_ops = { .rename = cgroup1_rename, .show_options = cgroup1_show_options, .mkdir = cgroup_mkdir, .rmdir = cgroup_rmdir, .show_path = cgroup_show_path, }; /* * The guts of cgroup1 mount - find or create 
cgroup_root to use. * Called with cgroup_mutex held; returns 0 on success, -E... on * error and positive - in case when the candidate is busy dying. * On success it stashes a reference to cgroup_root into given * cgroup_fs_context; that reference is *NOT* counting towards the * cgroup_root refcount. */ static int cgroup1_root_to_use(struct fs_context *fc) { struct cgroup_fs_context *ctx = cgroup_fc2context(fc); struct cgroup_root *root; struct cgroup_subsys *ss; int i, ret; /* First find the desired set of subsystems */ ret = check_cgroupfs_options(fc); if (ret) return ret; /* * Destruction of cgroup root is asynchronous, so subsystems may * still be dying after the previous unmount. Let's drain the * dying subsystems. We just need to ensure that the ones * unmounted previously finish dying and don't care about new ones * starting. Testing ref liveliness is good enough. */ for_each_subsys(ss, i) { if (!(ctx->subsys_mask & (1 << i)) || ss->root == &cgrp_dfl_root) continue; if (!percpu_ref_tryget_live(&ss->root->cgrp.self.refcnt)) return 1; /* restart */ cgroup_put(&ss->root->cgrp); } for_each_root(root) { bool name_match = false; if (root == &cgrp_dfl_root) continue; /* * If we asked for a name then it must match. Also, if * name matches but sybsys_mask doesn't, we should fail. * Remember whether name matched. */ if (ctx->name) { if (strcmp(ctx->name, root->name)) continue; name_match = true; } /* * If we asked for subsystems (or explicitly for no * subsystems) then they must match. */ if ((ctx->subsys_mask || ctx->none) && (ctx->subsys_mask != root->subsys_mask)) { if (!name_match) continue; return -EBUSY; } if (root->flags ^ ctx->flags) pr_warn("new mount options do not match the existing superblock, will be ignored\n"); ctx->root = root; return 0; } /* * No such thing, create a new one. name= matching without subsys * specification is allowed for already existing hierarchies but we * can't create new one without subsys specification. 
*/ if (!ctx->subsys_mask && !ctx->none) return invalfc(fc, "No subsys list or none specified"); /* Hierarchies may only be created in the initial cgroup namespace. */ if (ctx->ns != &init_cgroup_ns) return -EPERM; root = kzalloc(sizeof(*root), GFP_KERNEL); if (!root) return -ENOMEM; ctx->root = root; init_cgroup_root(ctx); ret = cgroup_setup_root(root, ctx->subsys_mask); if (!ret) cgroup_favor_dynmods(root, ctx->flags & CGRP_ROOT_FAVOR_DYNMODS); else cgroup_free_root(root); return ret; } int cgroup1_get_tree(struct fs_context *fc) { struct cgroup_fs_context *ctx = cgroup_fc2context(fc); int ret; /* Check if the caller has permission to mount. */ if (!ns_capable(ctx->ns->user_ns, CAP_SYS_ADMIN)) return -EPERM; cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp); ret = cgroup1_root_to_use(fc); if (!ret && !percpu_ref_tryget_live(&ctx->root->cgrp.self.refcnt)) ret = 1; /* restart */ cgroup_unlock(); if (!ret) ret = cgroup_do_get_tree(fc); if (!ret && percpu_ref_is_dying(&ctx->root->cgrp.self.refcnt)) { fc_drop_locked(fc); ret = 1; } if (unlikely(ret > 0)) { msleep(10); return restart_syscall(); } return ret; } /** * task_get_cgroup1 - Acquires the associated cgroup of a task within a * specific cgroup1 hierarchy. The cgroup1 hierarchy is identified by its * hierarchy ID. * @tsk: The target task * @hierarchy_id: The ID of a cgroup1 hierarchy * * On success, the cgroup is returned. On failure, ERR_PTR is returned. * We limit it to cgroup1 only. 
*/ struct cgroup *task_get_cgroup1(struct task_struct *tsk, int hierarchy_id) { struct cgroup *cgrp = ERR_PTR(-ENOENT); struct cgroup_root *root; unsigned long flags; rcu_read_lock(); for_each_root(root) { /* cgroup1 only*/ if (root == &cgrp_dfl_root) continue; if (root->hierarchy_id != hierarchy_id) continue; spin_lock_irqsave(&css_set_lock, flags); cgrp = task_cgroup_from_root(tsk, root); if (!cgrp || !cgroup_tryget(cgrp)) cgrp = ERR_PTR(-ENOENT); spin_unlock_irqrestore(&css_set_lock, flags); break; } rcu_read_unlock(); return cgrp; } static int __init cgroup1_wq_init(void) { /* * Used to destroy pidlists and separate to serve as flush domain. * Cap @max_active to 1 too. */ cgroup_pidlist_destroy_wq = alloc_workqueue("cgroup_pidlist_destroy", 0, 1); BUG_ON(!cgroup_pidlist_destroy_wq); return 0; } core_initcall(cgroup1_wq_init); static int __init cgroup_no_v1(char *str) { struct cgroup_subsys *ss; char *token; int i; while ((token = strsep(&str, ",")) != NULL) { if (!*token) continue; if (!strcmp(token, "all")) { cgroup_no_v1_mask = U16_MAX; continue; } if (!strcmp(token, "named")) { cgroup_no_v1_named = true; continue; } for_each_subsys(ss, i) { if (strcmp(token, ss->name) && strcmp(token, ss->legacy_name)) continue; cgroup_no_v1_mask |= 1 << i; break; } } return 1; } __setup("cgroup_no_v1=", cgroup_no_v1);
233 2 512 3389 233 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* fs/ internal definitions * * Copyright (C) 2006 Red Hat, Inc. All Rights Reserved. 
* Written by David Howells (dhowells@redhat.com) */

/* Forward declarations for types that are only used by pointer below. */
struct super_block;
struct file_system_type;
struct iomap;
struct iomap_ops;
struct linux_binprm;
struct path;
struct mount;
struct shrink_control;
struct fs_context;
struct pipe_inode_info;
struct iov_iter;
struct mnt_idmap;
struct ns_common;

/*
 * block/bdev.c
 */
#ifdef CONFIG_BLOCK
extern void __init bdev_cache_init(void);
#else
/* empty stub when CONFIG_BLOCK is not set */
static inline void bdev_cache_init(void)
{
}
#endif /* CONFIG_BLOCK */

/*
 * buffer.c
 */
int __block_write_begin_int(struct folio *folio, loff_t pos, unsigned len,
		get_block_t *get_block, const struct iomap *iomap);

/*
 * char_dev.c
 */
extern void __init chrdev_init(void);

/*
 * fs_context.c
 */
extern const struct fs_context_operations legacy_fs_context_ops;
extern int parse_monolithic_mount_data(struct fs_context *, void *);
extern void vfs_clean_context(struct fs_context *fc);
extern int finish_clean_context(struct fs_context *fc);

/*
 * namei.c
 */
extern int filename_lookup(int dfd, struct filename *name, unsigned flags,
			   struct path *path, struct path *root);
int do_rmdir(int dfd, struct filename *name);
int do_unlinkat(int dfd, struct filename *name);
int may_linkat(struct mnt_idmap *idmap, const struct path *link);
int do_renameat2(int olddfd, struct filename *oldname, int newdfd,
		 struct filename *newname, unsigned int flags);
int do_mkdirat(int dfd, struct filename *name, umode_t mode);
int do_symlinkat(struct filename *from, int newdfd, struct filename *to);
int do_linkat(int olddfd, struct filename *old, int newdfd,
			struct filename *new, int flags);
int vfs_tmpfile(struct mnt_idmap *idmap,
		const struct path *parentpath,
		struct file *file, umode_t mode);

/*
 * namespace.c
 */
extern struct vfsmount *lookup_mnt(const struct path *);
extern int finish_automount(struct vfsmount *, const struct path *);

extern int sb_prepare_remount_readonly(struct super_block *);

extern void __init mnt_init(void);

int mnt_get_write_access_file(struct file *file);
void mnt_put_write_access_file(struct file
*file);

extern void dissolve_on_fput(struct vfsmount *);
extern bool may_mount(void);

int path_mount(const char *dev_name, struct path *path,
		const char *type_page, unsigned long flags, void *data_page);
int path_umount(struct path *path, int flags);

int show_path(struct seq_file *m, struct dentry *root);

/*
 * fs_struct.c
 */
extern void chroot_fs_refs(const struct path *, const struct path *);

/*
 * file_table.c
 */
struct file *alloc_empty_file(int flags, const struct cred *cred);
struct file *alloc_empty_file_noaccount(int flags, const struct cred *cred);
struct file *alloc_empty_backing_file(int flags, const struct cred *cred);

/*
 * Drop the write access held through @file: on its inode, on the mount
 * of its path and, for FMODE_BACKING files, also on the mount of the
 * backing file's user path.
 */
static inline void file_put_write_access(struct file *file)
{
	put_write_access(file->f_inode);
	mnt_put_write_access(file->f_path.mnt);
	if (unlikely(file->f_mode & FMODE_BACKING))
		mnt_put_write_access(backing_file_user_path(file)->mnt);
}

/*
 * Release the access accounting of @file: a file opened with FMODE_READ
 * but not FMODE_WRITE drops its inode read count; otherwise a file
 * carrying FMODE_WRITER drops its write access.  Any other mode
 * combination is left alone.
 */
static inline void put_file_access(struct file *file)
{
	if ((file->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ) {
		i_readcount_dec(file->f_inode);
	} else if (file->f_mode & FMODE_WRITER) {
		file_put_write_access(file);
	}
}

/*
 * super.c
 */
extern int reconfigure_super(struct fs_context *);
extern bool super_trylock_shared(struct super_block *sb);
struct super_block *user_get_super(dev_t, bool excl);
void put_super(struct super_block *sb);
extern bool mount_capable(struct fs_context *);
int sb_init_dio_done_wq(struct super_block *sb);

/*
 * Prepare superblock for changing its read-only state (i.e., either remount
 * read-write superblock read-only or vice versa). After this function returns
 * mnt_is_readonly() will return true for any mount of the superblock if its
 * caller is able to observe any changes done by the remount. This holds until
 * sb_end_ro_state_change() is called.
*/ static inline void sb_start_ro_state_change(struct super_block *sb) { WRITE_ONCE(sb->s_readonly_remount, 1); /* * For RO->RW transition, the barrier pairs with the barrier in * mnt_is_readonly() making sure if mnt_is_readonly() sees SB_RDONLY * cleared, it will see s_readonly_remount set. * For RW->RO transition, the barrier pairs with the barrier in * mnt_get_write_access() before the mnt_is_readonly() check. * The barrier makes sure if mnt_get_write_access() sees MNT_WRITE_HOLD * already cleared, it will see s_readonly_remount set. */ smp_wmb(); } /* * Ends section changing read-only state of the superblock. After this function * returns if mnt_is_readonly() returns false, the caller will be able to * observe all the changes remount did to the superblock. */ static inline void sb_end_ro_state_change(struct super_block *sb) { /* * This barrier provides release semantics that pairs with * the smp_rmb() acquire semantics in mnt_is_readonly(). * This barrier pair ensure that when mnt_is_readonly() sees * 0 for sb->s_readonly_remount, it will also see all the * preceding flag changes that were made during the RO state * change. 
*/ smp_wmb(); WRITE_ONCE(sb->s_readonly_remount, 0); } /* * open.c */ struct open_flags { int open_flag; umode_t mode; int acc_mode; int intent; int lookup_flags; }; extern struct file *do_filp_open(int dfd, struct filename *pathname, const struct open_flags *op); extern struct file *do_file_open_root(const struct path *, const char *, const struct open_flags *); extern struct open_how build_open_how(int flags, umode_t mode); extern int build_open_flags(const struct open_how *how, struct open_flags *op); struct file *file_close_fd_locked(struct files_struct *files, unsigned fd); long do_ftruncate(struct file *file, loff_t length, int small); long do_sys_ftruncate(unsigned int fd, loff_t length, int small); int chmod_common(const struct path *path, umode_t mode); int do_fchownat(int dfd, const char __user *filename, uid_t user, gid_t group, int flag); int chown_common(const struct path *path, uid_t user, gid_t group); extern int vfs_open(const struct path *, struct file *); /* * inode.c */ extern long prune_icache_sb(struct super_block *sb, struct shrink_control *sc); int dentry_needs_remove_privs(struct mnt_idmap *, struct dentry *dentry); bool in_group_or_capable(struct mnt_idmap *idmap, const struct inode *inode, vfsgid_t vfsgid); /* * fs-writeback.c */ extern long get_nr_dirty_inodes(void); void invalidate_inodes(struct super_block *sb); /* * dcache.c */ extern int d_set_mounted(struct dentry *dentry); extern long prune_dcache_sb(struct super_block *sb, struct shrink_control *sc); extern struct dentry *d_alloc_cursor(struct dentry *); extern struct dentry * d_alloc_pseudo(struct super_block *, const struct qstr *); extern char *simple_dname(struct dentry *, char *, int); extern void dput_to_list(struct dentry *, struct list_head *); extern void shrink_dentry_list(struct list_head *); extern void shrink_dcache_for_umount(struct super_block *); extern struct dentry *__d_lookup(const struct dentry *, const struct qstr *); extern struct dentry *__d_lookup_rcu(const 
struct dentry *parent, const struct qstr *name, unsigned *seq); extern void d_genocide(struct dentry *); /* * pipe.c */ extern const struct file_operations pipefifo_fops; /* * fs_pin.c */ extern void group_pin_kill(struct hlist_head *p); extern void mnt_pin_kill(struct mount *m); /* * fs/nsfs.c */ extern const struct dentry_operations ns_dentry_operations; int open_namespace(struct ns_common *ns); /* * fs/stat.c: */ int do_statx(int dfd, struct filename *filename, unsigned int flags, unsigned int mask, struct statx __user *buffer); int do_statx_fd(int fd, unsigned int flags, unsigned int mask, struct statx __user *buffer); /* * fs/splice.c: */ ssize_t splice_file_to_pipe(struct file *in, struct pipe_inode_info *opipe, loff_t *offset, size_t len, unsigned int flags); /* * fs/xattr.c: */ struct xattr_name { char name[XATTR_NAME_MAX + 1]; }; struct kernel_xattr_ctx { /* Value of attribute */ union { const void __user *cvalue; void __user *value; }; void *kvalue; size_t size; /* Attribute name */ struct xattr_name *kname; unsigned int flags; }; ssize_t file_getxattr(struct file *file, struct kernel_xattr_ctx *ctx); ssize_t filename_getxattr(int dfd, struct filename *filename, unsigned int lookup_flags, struct kernel_xattr_ctx *ctx); int file_setxattr(struct file *file, struct kernel_xattr_ctx *ctx); int filename_setxattr(int dfd, struct filename *filename, unsigned int lookup_flags, struct kernel_xattr_ctx *ctx); int setxattr_copy(const char __user *name, struct kernel_xattr_ctx *ctx); int import_xattr_name(struct xattr_name *kname, const char __user *name); int may_write_xattr(struct mnt_idmap *idmap, struct inode *inode); #ifdef CONFIG_FS_POSIX_ACL int do_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name, const void *kvalue, size_t size); ssize_t do_get_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name, void *kvalue, size_t size); #else static inline int do_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, const 
char *acl_name, const void *kvalue, size_t size) { return -EOPNOTSUPP; } static inline ssize_t do_get_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name, void *kvalue, size_t size) { return -EOPNOTSUPP; } #endif ssize_t __kernel_write_iter(struct file *file, struct iov_iter *from, loff_t *pos); /* * fs/attr.c */ struct mnt_idmap *alloc_mnt_idmap(struct user_namespace *mnt_userns); struct mnt_idmap *mnt_idmap_get(struct mnt_idmap *idmap); void mnt_idmap_put(struct mnt_idmap *idmap); struct stashed_operations { void (*put_data)(void *data); int (*init_inode)(struct inode *inode, void *data); }; int path_from_stashed(struct dentry **stashed, struct vfsmount *mnt, void *data, struct path *path); void stashed_dentry_prune(struct dentry *dentry); /** * path_mounted - check whether path is mounted * @path: path to check * * Determine whether @path refers to the root of a mount. * * Return: true if @path is the root of a mount, false if not. */ static inline bool path_mounted(const struct path *path) { return path->mnt->mnt_root == path->dentry; } void file_f_owner_release(struct file *file);
35 221 22 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 /* 
SPDX-License-Identifier: GPL-2.0-or-later */ /* Authentication token and access key management * * Copyright (C) 2004, 2007 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) * * See Documentation/security/keys/core.rst for information on keys/keyrings. */ #ifndef _LINUX_KEY_H #define _LINUX_KEY_H #include <linux/types.h> #include <linux/list.h> #include <linux/rbtree.h> #include <linux/rcupdate.h> #include <linux/sysctl.h> #include <linux/rwsem.h> #include <linux/atomic.h> #include <linux/assoc_array.h> #include <linux/refcount.h> #include <linux/time64.h> #ifdef __KERNEL__ #include <linux/uidgid.h> /* key handle serial number */ typedef int32_t key_serial_t; /* key handle permissions mask */ typedef uint32_t key_perm_t; struct key; struct net; #ifdef CONFIG_KEYS #undef KEY_DEBUGGING #define KEY_POS_VIEW 0x01000000 /* possessor can view a key's attributes */ #define KEY_POS_READ 0x02000000 /* possessor can read key payload / view keyring */ #define KEY_POS_WRITE 0x04000000 /* possessor can update key payload / add link to keyring */ #define KEY_POS_SEARCH 0x08000000 /* possessor can find a key in search / search a keyring */ #define KEY_POS_LINK 0x10000000 /* possessor can create a link to a key/keyring */ #define KEY_POS_SETATTR 0x20000000 /* possessor can set key attributes */ #define KEY_POS_ALL 0x3f000000 #define KEY_USR_VIEW 0x00010000 /* user permissions... */ #define KEY_USR_READ 0x00020000 #define KEY_USR_WRITE 0x00040000 #define KEY_USR_SEARCH 0x00080000 #define KEY_USR_LINK 0x00100000 #define KEY_USR_SETATTR 0x00200000 #define KEY_USR_ALL 0x003f0000 #define KEY_GRP_VIEW 0x00000100 /* group permissions... */ #define KEY_GRP_READ 0x00000200 #define KEY_GRP_WRITE 0x00000400 #define KEY_GRP_SEARCH 0x00000800 #define KEY_GRP_LINK 0x00001000 #define KEY_GRP_SETATTR 0x00002000 #define KEY_GRP_ALL 0x00003f00 #define KEY_OTH_VIEW 0x00000001 /* third party permissions... 
*/ #define KEY_OTH_READ 0x00000002 #define KEY_OTH_WRITE 0x00000004 #define KEY_OTH_SEARCH 0x00000008 #define KEY_OTH_LINK 0x00000010 #define KEY_OTH_SETATTR 0x00000020 #define KEY_OTH_ALL 0x0000003f #define KEY_PERM_UNDEF 0xffffffff /* * The permissions required on a key that we're looking up. */ enum key_need_perm { KEY_NEED_UNSPECIFIED, /* Needed permission unspecified */ KEY_NEED_VIEW, /* Require permission to view attributes */ KEY_NEED_READ, /* Require permission to read content */ KEY_NEED_WRITE, /* Require permission to update / modify */ KEY_NEED_SEARCH, /* Require permission to search (keyring) or find (key) */ KEY_NEED_LINK, /* Require permission to link */ KEY_NEED_SETATTR, /* Require permission to change attributes */ KEY_NEED_UNLINK, /* Require permission to unlink key */ KEY_SYSADMIN_OVERRIDE, /* Special: override by CAP_SYS_ADMIN */ KEY_AUTHTOKEN_OVERRIDE, /* Special: override by possession of auth token */ KEY_DEFER_PERM_CHECK, /* Special: permission check is deferred */ }; enum key_lookup_flag { KEY_LOOKUP_CREATE = 0x01, KEY_LOOKUP_PARTIAL = 0x02, KEY_LOOKUP_ALL = (KEY_LOOKUP_CREATE | KEY_LOOKUP_PARTIAL), }; struct seq_file; struct user_struct; struct signal_struct; struct cred; struct key_type; struct key_owner; struct key_tag; struct keyring_list; struct keyring_name; struct key_tag { struct rcu_head rcu; refcount_t usage; bool removed; /* T when subject removed */ }; struct keyring_index_key { /* [!] If this structure is altered, the union in struct key must change too! 
*/ unsigned long hash; /* Hash value */ union { struct { #ifdef __LITTLE_ENDIAN /* Put desc_len at the LSB of x */ u16 desc_len; char desc[sizeof(long) - 2]; /* First few chars of description */ #else char desc[sizeof(long) - 2]; /* First few chars of description */ u16 desc_len; #endif }; unsigned long x; }; struct key_type *type; struct key_tag *domain_tag; /* Domain of operation */ const char *description; }; union key_payload { void __rcu *rcu_data0; void *data[4]; }; /*****************************************************************************/ /* * key reference with possession attribute handling * * NOTE! key_ref_t is a typedef'd pointer to a type that is not actually * defined. This is because we abuse the bottom bit of the reference to carry a * flag to indicate whether the calling process possesses that key in one of * its keyrings. * * the key_ref_t has been made a separate type so that the compiler can reject * attempts to dereference it without proper conversion. * * the three functions are used to assemble and disassemble references */ typedef struct __key_reference_with_attributes *key_ref_t; static inline key_ref_t make_key_ref(const struct key *key, bool possession) { return (key_ref_t) ((unsigned long) key | possession); } static inline struct key *key_ref_to_ptr(const key_ref_t key_ref) { return (struct key *) ((unsigned long) key_ref & ~1UL); } static inline bool is_key_possessed(const key_ref_t key_ref) { return (unsigned long) key_ref & 1UL; } typedef int (*key_restrict_link_func_t)(struct key *dest_keyring, const struct key_type *type, const union key_payload *payload, struct key *restriction_key); struct key_restriction { key_restrict_link_func_t check; struct key *key; struct key_type *keytype; }; enum key_state { KEY_IS_UNINSTANTIATED, KEY_IS_POSITIVE, /* Positively instantiated */ }; /*****************************************************************************/ /* * authentication token / access credential / keyring * - types of key 
include: * - keyrings * - disk encryption IDs * - Kerberos TGTs and tickets */ struct key { refcount_t usage; /* number of references */ key_serial_t serial; /* key serial number */ union { struct list_head graveyard_link; struct rb_node serial_node; }; #ifdef CONFIG_KEY_NOTIFICATIONS struct watch_list *watchers; /* Entities watching this key for changes */ #endif struct rw_semaphore sem; /* change vs change sem */ struct key_user *user; /* owner of this key */ void *security; /* security data for this key */ union { time64_t expiry; /* time at which key expires (or 0) */ time64_t revoked_at; /* time at which key was revoked */ }; time64_t last_used_at; /* last time used for LRU keyring discard */ kuid_t uid; kgid_t gid; key_perm_t perm; /* access permissions */ unsigned short quotalen; /* length added to quota */ unsigned short datalen; /* payload data length * - may not match RCU dereferenced payload * - payload should contain own length */ short state; /* Key state (+) or rejection error (-) */ #ifdef KEY_DEBUGGING unsigned magic; #define KEY_DEBUG_MAGIC 0x18273645u #endif unsigned long flags; /* status flags (change with bitops) */ #define KEY_FLAG_DEAD 0 /* set if key type has been deleted */ #define KEY_FLAG_REVOKED 1 /* set if key had been revoked */ #define KEY_FLAG_IN_QUOTA 2 /* set if key consumes quota */ #define KEY_FLAG_USER_CONSTRUCT 3 /* set if key is being constructed in userspace */ #define KEY_FLAG_ROOT_CAN_CLEAR 4 /* set if key can be cleared by root without permission */ #define KEY_FLAG_INVALIDATED 5 /* set if key has been invalidated */ #define KEY_FLAG_BUILTIN 6 /* set if key is built in to the kernel */ #define KEY_FLAG_ROOT_CAN_INVAL 7 /* set if key can be invalidated by root without permission */ #define KEY_FLAG_KEEP 8 /* set if key should not be removed */ #define KEY_FLAG_UID_KEYRING 9 /* set if key is a user or user session keyring */ /* the key type and key description string * - the desc is used to match a key against search criteria 
* - it should be a printable string * - eg: for krb5 AFS, this might be "afs@REDHAT.COM" */ union { struct keyring_index_key index_key; struct { unsigned long hash; unsigned long len_desc; struct key_type *type; /* type of key */ struct key_tag *domain_tag; /* Domain of operation */ char *description; }; }; /* key data * - this is used to hold the data actually used in cryptography or * whatever */ union { union key_payload payload; struct { /* Keyring bits */ struct list_head name_link; struct assoc_array keys; }; }; /* This is set on a keyring to restrict the addition of a link to a key * to it. If this structure isn't provided then it is assumed that the * keyring is open to any addition. It is ignored for non-keyring * keys. Only set this value using keyring_restrict(), keyring_alloc(), * or key_alloc(). * * This is intended for use with rings of trusted keys whereby addition * to the keyring needs to be controlled. KEY_ALLOC_BYPASS_RESTRICTION * overrides this, allowing the kernel to add extra keys without * restriction. 
*/ struct key_restriction *restrict_link; }; extern struct key *key_alloc(struct key_type *type, const char *desc, kuid_t uid, kgid_t gid, const struct cred *cred, key_perm_t perm, unsigned long flags, struct key_restriction *restrict_link); #define KEY_ALLOC_IN_QUOTA 0x0000 /* add to quota, reject if would overrun */ #define KEY_ALLOC_QUOTA_OVERRUN 0x0001 /* add to quota, permit even if overrun */ #define KEY_ALLOC_NOT_IN_QUOTA 0x0002 /* not in quota */ #define KEY_ALLOC_BUILT_IN 0x0004 /* Key is built into kernel */ #define KEY_ALLOC_BYPASS_RESTRICTION 0x0008 /* Override the check on restricted keyrings */ #define KEY_ALLOC_UID_KEYRING 0x0010 /* allocating a user or user session keyring */ #define KEY_ALLOC_SET_KEEP 0x0020 /* Set the KEEP flag on the key/keyring */ extern void key_revoke(struct key *key); extern void key_invalidate(struct key *key); extern void key_put(struct key *key); extern bool key_put_tag(struct key_tag *tag); extern void key_remove_domain(struct key_tag *domain_tag); static inline struct key *__key_get(struct key *key) { refcount_inc(&key->usage); return key; } static inline struct key *key_get(struct key *key) { return key ? __key_get(key) : key; } static inline void key_ref_put(key_ref_t key_ref) { key_put(key_ref_to_ptr(key_ref)); } extern struct key *request_key_tag(struct key_type *type, const char *description, struct key_tag *domain_tag, const char *callout_info); extern struct key *request_key_rcu(struct key_type *type, const char *description, struct key_tag *domain_tag); extern struct key *request_key_with_auxdata(struct key_type *type, const char *description, struct key_tag *domain_tag, const void *callout_info, size_t callout_len, void *aux); /** * request_key - Request a key and wait for construction * @type: Type of key. * @description: The searchable description of the key. * @callout_info: The data to pass to the instantiation upcall (or NULL). * * As for request_key_tag(), but with the default global domain tag. 
*/ static inline struct key *request_key(struct key_type *type, const char *description, const char *callout_info) { return request_key_tag(type, description, NULL, callout_info); } #ifdef CONFIG_NET /** * request_key_net - Request a key for a net namespace and wait for construction * @type: Type of key. * @description: The searchable description of the key. * @net: The network namespace that is the key's domain of operation. * @callout_info: The data to pass to the instantiation upcall (or NULL). * * As for request_key() except that it does not add the returned key to a * keyring if found, new keys are always allocated in the user's quota, the * callout_info must be a NUL-terminated string and no auxiliary data can be * passed. Only keys that operate the specified network namespace are used. * * Furthermore, it then works as wait_for_key_construction() to wait for the * completion of keys undergoing construction with a non-interruptible wait. */ #define request_key_net(type, description, net, callout_info) \ request_key_tag(type, description, net->key_domain, callout_info) /** * request_key_net_rcu - Request a key for a net namespace under RCU conditions * @type: Type of key. * @description: The searchable description of the key. * @net: The network namespace that is the key's domain of operation. * * As for request_key_rcu() except that only keys that operate the specified * network namespace are used. 
*/ #define request_key_net_rcu(type, description, net) \ request_key_rcu(type, description, net->key_domain) #endif /* CONFIG_NET */ extern int wait_for_key_construction(struct key *key, bool intr); extern int key_validate(const struct key *key); extern key_ref_t key_create(key_ref_t keyring, const char *type, const char *description, const void *payload, size_t plen, key_perm_t perm, unsigned long flags); extern key_ref_t key_create_or_update(key_ref_t keyring, const char *type, const char *description, const void *payload, size_t plen, key_perm_t perm, unsigned long flags); extern int key_update(key_ref_t key, const void *payload, size_t plen); extern int key_link(struct key *keyring, struct key *key); extern int key_move(struct key *key, struct key *from_keyring, struct key *to_keyring, unsigned int flags); extern int key_unlink(struct key *keyring, struct key *key); extern struct key *keyring_alloc(const char *description, kuid_t uid, kgid_t gid, const struct cred *cred, key_perm_t perm, unsigned long flags, struct key_restriction *restrict_link, struct key *dest); extern int restrict_link_reject(struct key *keyring, const struct key_type *type, const union key_payload *payload, struct key *restriction_key); extern int keyring_clear(struct key *keyring); extern key_ref_t keyring_search(key_ref_t keyring, struct key_type *type, const char *description, bool recurse); extern int keyring_restrict(key_ref_t keyring, const char *type, const char *restriction); extern struct key *key_lookup(key_serial_t id); static inline key_serial_t key_serial(const struct key *key) { return key ? key->serial : 0; } extern void key_set_timeout(struct key *, unsigned); extern key_ref_t lookup_user_key(key_serial_t id, unsigned long flags, enum key_need_perm need_perm); extern void key_free_user_ns(struct user_namespace *); static inline short key_read_state(const struct key *key) { /* Barrier versus mark_key_instantiated(). 
*/ return smp_load_acquire(&key->state); } /** * key_is_positive - Determine if a key has been positively instantiated * @key: The key to check. * * Return true if the specified key has been positively instantiated, false * otherwise. */ static inline bool key_is_positive(const struct key *key) { return key_read_state(key) == KEY_IS_POSITIVE; } static inline bool key_is_negative(const struct key *key) { return key_read_state(key) < 0; } #define dereference_key_rcu(KEY) \ (rcu_dereference((KEY)->payload.rcu_data0)) #define dereference_key_locked(KEY) \ (rcu_dereference_protected((KEY)->payload.rcu_data0, \ rwsem_is_locked(&((struct key *)(KEY))->sem))) #define rcu_assign_keypointer(KEY, PAYLOAD) \ do { \ rcu_assign_pointer((KEY)->payload.rcu_data0, (PAYLOAD)); \ } while (0) /* * the userspace interface */ extern int install_thread_keyring_to_cred(struct cred *cred); extern void key_fsuid_changed(struct cred *new_cred); extern void key_fsgid_changed(struct cred *new_cred); extern void key_init(void); #else /* CONFIG_KEYS */ #define key_validate(k) 0 #define key_serial(k) 0 #define key_get(k) ({ NULL; }) #define key_revoke(k) do { } while(0) #define key_invalidate(k) do { } while(0) #define key_put(k) do { } while(0) #define key_ref_put(k) do { } while(0) #define make_key_ref(k, p) NULL #define key_ref_to_ptr(k) NULL #define is_key_possessed(k) 0 #define key_fsuid_changed(c) do { } while(0) #define key_fsgid_changed(c) do { } while(0) #define key_init() do { } while(0) #define key_free_user_ns(ns) do { } while(0) #define key_remove_domain(d) do { } while(0) #define key_lookup(k) NULL #endif /* CONFIG_KEYS */ #endif /* __KERNEL__ */ #endif /* _LINUX_KEY_H */
51 2 2 2 6 1 1 1 1 2 4 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 // SPDX-License-Identifier: GPL-2.0 #include "io_uring.h" #include "napi.h" #ifdef CONFIG_NET_RX_BUSY_POLL /* Timeout for cleanout of stale entries. 
*/ #define NAPI_TIMEOUT (60 * SEC_CONVERSION) struct io_napi_entry { unsigned int napi_id; struct list_head list; unsigned long timeout; struct hlist_node node; struct rcu_head rcu; }; static struct io_napi_entry *io_napi_hash_find(struct hlist_head *hash_list, unsigned int napi_id) { struct io_napi_entry *e; hlist_for_each_entry_rcu(e, hash_list, node) { if (e->napi_id != napi_id) continue; return e; } return NULL; } static inline ktime_t net_to_ktime(unsigned long t) { /* napi approximating usecs, reverse busy_loop_current_time */ return ns_to_ktime(t << 10); } int __io_napi_add_id(struct io_ring_ctx *ctx, unsigned int napi_id) { struct hlist_head *hash_list; struct io_napi_entry *e; /* Non-NAPI IDs can be rejected. */ if (napi_id < MIN_NAPI_ID) return -EINVAL; hash_list = &ctx->napi_ht[hash_min(napi_id, HASH_BITS(ctx->napi_ht))]; scoped_guard(rcu) { e = io_napi_hash_find(hash_list, napi_id); if (e) { WRITE_ONCE(e->timeout, jiffies + NAPI_TIMEOUT); return -EEXIST; } } e = kmalloc(sizeof(*e), GFP_NOWAIT); if (!e) return -ENOMEM; e->napi_id = napi_id; e->timeout = jiffies + NAPI_TIMEOUT; /* * guard(spinlock) is not used to manually unlock it before calling * kfree() */ spin_lock(&ctx->napi_lock); if (unlikely(io_napi_hash_find(hash_list, napi_id))) { spin_unlock(&ctx->napi_lock); kfree(e); return -EEXIST; } hlist_add_tail_rcu(&e->node, hash_list); list_add_tail_rcu(&e->list, &ctx->napi_list); spin_unlock(&ctx->napi_lock); return 0; } static int __io_napi_del_id(struct io_ring_ctx *ctx, unsigned int napi_id) { struct hlist_head *hash_list; struct io_napi_entry *e; /* Non-NAPI IDs can be rejected. 
*/ if (napi_id < MIN_NAPI_ID) return -EINVAL; hash_list = &ctx->napi_ht[hash_min(napi_id, HASH_BITS(ctx->napi_ht))]; guard(spinlock)(&ctx->napi_lock); e = io_napi_hash_find(hash_list, napi_id); if (!e) return -ENOENT; list_del_rcu(&e->list); hash_del_rcu(&e->node); kfree_rcu(e, rcu); return 0; } static void __io_napi_remove_stale(struct io_ring_ctx *ctx) { struct io_napi_entry *e; guard(spinlock)(&ctx->napi_lock); /* * list_for_each_entry_safe() is not required as long as: * 1. list_del_rcu() does not reset the deleted node next pointer * 2. kfree_rcu() delays the memory freeing until the next quiescent * state */ list_for_each_entry(e, &ctx->napi_list, list) { if (time_after(jiffies, READ_ONCE(e->timeout))) { list_del_rcu(&e->list); hash_del_rcu(&e->node); kfree_rcu(e, rcu); } } } static inline void io_napi_remove_stale(struct io_ring_ctx *ctx, bool is_stale) { if (is_stale) __io_napi_remove_stale(ctx); } static inline bool io_napi_busy_loop_timeout(ktime_t start_time, ktime_t bp) { if (bp) { ktime_t end_time = ktime_add(start_time, bp); ktime_t now = net_to_ktime(busy_loop_current_time()); return ktime_after(now, end_time); } return true; } static bool io_napi_busy_loop_should_end(void *data, unsigned long start_time) { struct io_wait_queue *iowq = data; if (signal_pending(current)) return true; if (io_should_wake(iowq) || io_has_work(iowq->ctx)) return true; if (io_napi_busy_loop_timeout(net_to_ktime(start_time), iowq->napi_busy_poll_dt)) return true; return false; } /* * never report stale entries */ static bool static_tracking_do_busy_loop(struct io_ring_ctx *ctx, bool (*loop_end)(void *, unsigned long), void *loop_end_arg) { struct io_napi_entry *e; list_for_each_entry_rcu(e, &ctx->napi_list, list) napi_busy_loop_rcu(e->napi_id, loop_end, loop_end_arg, ctx->napi_prefer_busy_poll, BUSY_POLL_BUDGET); return false; } static bool dynamic_tracking_do_busy_loop(struct io_ring_ctx *ctx, bool (*loop_end)(void *, unsigned long), void *loop_end_arg) { struct 
io_napi_entry *e; bool is_stale = false; list_for_each_entry_rcu(e, &ctx->napi_list, list) { napi_busy_loop_rcu(e->napi_id, loop_end, loop_end_arg, ctx->napi_prefer_busy_poll, BUSY_POLL_BUDGET); if (time_after(jiffies, READ_ONCE(e->timeout))) is_stale = true; } return is_stale; } static inline bool __io_napi_do_busy_loop(struct io_ring_ctx *ctx, bool (*loop_end)(void *, unsigned long), void *loop_end_arg) { if (READ_ONCE(ctx->napi_track_mode) == IO_URING_NAPI_TRACKING_STATIC) return static_tracking_do_busy_loop(ctx, loop_end, loop_end_arg); return dynamic_tracking_do_busy_loop(ctx, loop_end, loop_end_arg); } static void io_napi_blocking_busy_loop(struct io_ring_ctx *ctx, struct io_wait_queue *iowq) { unsigned long start_time = busy_loop_current_time(); bool (*loop_end)(void *, unsigned long) = NULL; void *loop_end_arg = NULL; bool is_stale = false; /* Singular lists use a different napi loop end check function and are * only executed once. */ if (list_is_singular(&ctx->napi_list)) { loop_end = io_napi_busy_loop_should_end; loop_end_arg = iowq; } scoped_guard(rcu) { do { is_stale = __io_napi_do_busy_loop(ctx, loop_end, loop_end_arg); } while (!io_napi_busy_loop_should_end(iowq, start_time) && !loop_end_arg); } io_napi_remove_stale(ctx, is_stale); } /* * io_napi_init() - Init napi settings * @ctx: pointer to io-uring context structure * * Init napi settings in the io-uring context. */ void io_napi_init(struct io_ring_ctx *ctx) { u64 sys_dt = READ_ONCE(sysctl_net_busy_poll) * NSEC_PER_USEC; INIT_LIST_HEAD(&ctx->napi_list); spin_lock_init(&ctx->napi_lock); ctx->napi_prefer_busy_poll = false; ctx->napi_busy_poll_dt = ns_to_ktime(sys_dt); ctx->napi_track_mode = IO_URING_NAPI_TRACKING_INACTIVE; } /* * io_napi_free() - Deallocate napi * @ctx: pointer to io-uring context structure * * Free the napi list and the hash table in the io-uring context. 
*/ void io_napi_free(struct io_ring_ctx *ctx) { struct io_napi_entry *e; guard(spinlock)(&ctx->napi_lock); list_for_each_entry(e, &ctx->napi_list, list) { hash_del_rcu(&e->node); kfree_rcu(e, rcu); } INIT_LIST_HEAD_RCU(&ctx->napi_list); } static int io_napi_register_napi(struct io_ring_ctx *ctx, struct io_uring_napi *napi) { switch (napi->op_param) { case IO_URING_NAPI_TRACKING_DYNAMIC: case IO_URING_NAPI_TRACKING_STATIC: break; default: return -EINVAL; } /* clean the napi list for new settings */ io_napi_free(ctx); WRITE_ONCE(ctx->napi_track_mode, napi->op_param); WRITE_ONCE(ctx->napi_busy_poll_dt, napi->busy_poll_to * NSEC_PER_USEC); WRITE_ONCE(ctx->napi_prefer_busy_poll, !!napi->prefer_busy_poll); return 0; } /* * io_napi_register() - Register napi with io-uring * @ctx: pointer to io-uring context structure * @arg: pointer to io_uring_napi structure * * Register napi in the io-uring context. */ int io_register_napi(struct io_ring_ctx *ctx, void __user *arg) { const struct io_uring_napi curr = { .busy_poll_to = ktime_to_us(ctx->napi_busy_poll_dt), .prefer_busy_poll = ctx->napi_prefer_busy_poll, .op_param = ctx->napi_track_mode }; struct io_uring_napi napi; if (ctx->flags & IORING_SETUP_IOPOLL) return -EINVAL; if (copy_from_user(&napi, arg, sizeof(napi))) return -EFAULT; if (napi.pad[0] || napi.pad[1] || napi.resv) return -EINVAL; if (copy_to_user(arg, &curr, sizeof(curr))) return -EFAULT; switch (napi.opcode) { case IO_URING_NAPI_REGISTER_OP: return io_napi_register_napi(ctx, &napi); case IO_URING_NAPI_STATIC_ADD_ID: if (curr.op_param != IO_URING_NAPI_TRACKING_STATIC) return -EINVAL; return __io_napi_add_id(ctx, napi.op_param); case IO_URING_NAPI_STATIC_DEL_ID: if (curr.op_param != IO_URING_NAPI_TRACKING_STATIC) return -EINVAL; return __io_napi_del_id(ctx, napi.op_param); default: return -EINVAL; } } /* * io_napi_unregister() - Unregister napi with io-uring * @ctx: pointer to io-uring context structure * @arg: pointer to io_uring_napi structure * * Unregister 
napi. If arg has been specified copy the busy poll timeout and
 * prefer busy poll setting to the passed in structure.
 *
 * Afterwards the busy poll timeout is set to 0, prefer-busy-poll is
 * cleared and the tracking mode becomes inactive.
 *
 * Returns -EFAULT if the copy to @arg fails (nothing is reset in that
 * case), 0 otherwise.
 */
int io_unregister_napi(struct io_ring_ctx *ctx, void __user *arg)
{
	/*
	 * Only the timeout and the prefer flag are filled in; all other
	 * members of curr are zero-initialised by the designated initialiser.
	 */
	const struct io_uring_napi curr = {
		.busy_poll_to = ktime_to_us(ctx->napi_busy_poll_dt),
		.prefer_busy_poll = ctx->napi_prefer_busy_poll
	};

	if (arg && copy_to_user(arg, &curr, sizeof(curr)))
		return -EFAULT;

	/* NOTE(review): the napi list itself is not emptied here. */
	WRITE_ONCE(ctx->napi_busy_poll_dt, 0);
	WRITE_ONCE(ctx->napi_prefer_busy_poll, false);
	WRITE_ONCE(ctx->napi_track_mode, IO_URING_NAPI_TRACKING_INACTIVE);
	return 0;
}

/*
 * __io_napi_busy_loop() - execute busy poll loop
 * @ctx: pointer to io-uring context structure
 * @iowq: pointer to io wait queue
 *
 * Does nothing for IORING_SETUP_SQPOLL rings.  Otherwise snapshots the
 * context's busy poll timeout and prefer-busy-poll flag into @iowq and
 * runs the blocking busy poll loop.
 */
void __io_napi_busy_loop(struct io_ring_ctx *ctx, struct io_wait_queue *iowq)
{
	if (ctx->flags & IORING_SETUP_SQPOLL)
		return;

	iowq->napi_busy_poll_dt = READ_ONCE(ctx->napi_busy_poll_dt);
	if (iowq->timeout != KTIME_MAX) {
		/* Never busy poll for longer than the time left to wait. */
		ktime_t dt = ktime_sub(iowq->timeout, io_get_time(ctx));

		/*
		 * The comparison is carried out on u64 values.
		 * NOTE(review): a negative dt (timeout already passed) would
		 * compare as a very large value - confirm this is intended.
		 */
		iowq->napi_busy_poll_dt = min_t(u64, iowq->napi_busy_poll_dt, dt);
	}

	iowq->napi_prefer_busy_poll = READ_ONCE(ctx->napi_prefer_busy_poll);
	io_napi_blocking_busy_loop(ctx, iowq);
}

/*
 * io_napi_sqpoll_busy_poll() - busy poll loop for sqpoll
 * @ctx: pointer to io-uring context structure
 *
 * Runs a single busy poll pass over the napi list under the RCU read lock,
 * with no loop end callback, and then lets io_napi_remove_stale() act on
 * the result.
 *
 * Returns 0 without polling when the busy poll timeout is zero or the napi
 * list is empty, 1 after a pass has been run.
 */
int io_napi_sqpoll_busy_poll(struct io_ring_ctx *ctx)
{
	bool is_stale = false;

	if (!READ_ONCE(ctx->napi_busy_poll_dt))
		return 0;
	if (list_empty_careful(&ctx->napi_list))
		return 0;

	scoped_guard(rcu) {
		is_stale = __io_napi_do_busy_loop(ctx, NULL, NULL);
	}

	io_napi_remove_stale(ctx, is_stale);
	return 1;
}

#endif
8 8 3 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 // SPDX-License-Identifier: GPL-2.0-or-later /* AFS vlserver list management. * * Copyright (C) 2018 Red Hat, Inc. All Rights Reserved. 
* Written by David Howells (dhowells@redhat.com)
 */

#include <linux/kernel.h>
#include <linux/slab.h>
#include "internal.h"

/*
 * Allocate a VL server record for the given name and port.
 *
 * The record starts with a single reference, an RTT of UINT_MAX, the
 * VL_SERVICE service ID and a debug ID drawn from a function-local counter.
 * One byte more than @name_len is allocated for the name; as the
 * allocation is zeroed and only @name_len bytes are copied, the stored
 * name is NUL-terminated.
 *
 * Returns the new record, or NULL if the allocation failed.
 */
struct afs_vlserver *afs_alloc_vlserver(const char *name, size_t name_len,
					unsigned short port)
{
	struct afs_vlserver *vlserver;
	static atomic_t debug_ids;

	vlserver = kzalloc(struct_size(vlserver, name, name_len + 1),
			   GFP_KERNEL);
	if (vlserver) {
		refcount_set(&vlserver->ref, 1);
		rwlock_init(&vlserver->lock);
		init_waitqueue_head(&vlserver->probe_wq);
		spin_lock_init(&vlserver->probe_lock);
		vlserver->debug_id = atomic_inc_return(&debug_ids);
		vlserver->rtt = UINT_MAX;
		vlserver->name_len = name_len;
		vlserver->service_id = VL_SERVICE;
		vlserver->port = port;
		memcpy(vlserver->name, name, name_len);
	}
	return vlserver;
}

/*
 * RCU callback queued by afs_put_vlserver(): drop the server's reference on
 * its address list and then hand the record itself to kfree_rcu().
 */
static void afs_vlserver_rcu(struct rcu_head *rcu)
{
	struct afs_vlserver *vlserver = container_of(rcu, struct afs_vlserver, rcu);

	afs_put_addrlist(rcu_access_pointer(vlserver->addresses),
			 afs_alist_trace_put_vlserver);
	kfree_rcu(vlserver, rcu);
}

/*
 * Drop a reference on a VL server record; a NULL @vlserver is ignored.
 * When the last reference goes, destruction is deferred to an RCU callback.
 * @net is not used by this function.
 */
void afs_put_vlserver(struct afs_net *net, struct afs_vlserver *vlserver)
{
	if (vlserver &&
	    refcount_dec_and_test(&vlserver->ref))
		call_rcu(&vlserver->rcu, afs_vlserver_rcu);
}

/*
 * Allocate a zeroed VL server list with room for @nr_servers entries, one
 * reference held and its lock initialised.  Returns NULL on allocation
 * failure.
 */
struct afs_vlserver_list *afs_alloc_vlserver_list(unsigned int nr_servers)
{
	struct afs_vlserver_list *vllist;

	vllist = kzalloc(struct_size(vllist, servers, nr_servers), GFP_KERNEL);
	if (vllist) {
		refcount_set(&vllist->ref, 1);
		rwlock_init(&vllist->lock);
	}

	return vllist;
}

/*
 * Drop a reference on a VL server list; a NULL @vllist is ignored.  On the
 * final put, the reference held on each of the first nr_servers entries is
 * dropped and the list is freed after an RCU grace period.
 */
void afs_put_vlserverlist(struct afs_net *net, struct afs_vlserver_list *vllist)
{
	if (vllist) {
		if (refcount_dec_and_test(&vllist->ref)) {
			int i;

			for (i = 0; i < vllist->nr_servers; i++) {
				afs_put_vlserver(net, vllist->servers[i].server);
			}
			kfree_rcu(vllist, rcu);
		}
	}
}

/*
 * Read a 16-bit little-endian value (low byte first) from *_b and advance
 * *_b by two bytes.  No bounds checking is done here; the caller must have
 * verified that two bytes are available.
 */
static u16 afs_extract_le16(const u8 **_b)
{
	u16 val;

	val  = (u16)*(*_b)++ << 0;
	val |= (u16)*(*_b)++ << 8;
	return val;
}

/*
 * Build a VL server address list from a DNS queried server list.
*
 * @_b points at the caller's cursor into the payload and @end at the byte
 * after the payload.  Up to @nr_addrs records are consumed, each being one
 * address-type byte followed by a 4-byte (IPv4) or 16-byte (IPv6) address;
 * every address is merged into the list together with @port.
 *
 * *_b is advanced past whatever was consumed, on success and on failure.
 *
 * Returns the address list (possibly empty), or an ERR_PTR: -ENOMEM if the
 * list could not be allocated, -EINVAL for a truncated address,
 * -EADDRNOTAVAIL for an unknown address type, or the error reported by the
 * merge helper.
 */
static struct afs_addr_list *afs_extract_vl_addrs(struct afs_net *net,
						  const u8 **_b, const u8 *end,
						  u8 nr_addrs, u16 port)
{
	struct afs_addr_list *alist;
	const u8 *b = *_b;
	int ret = -EINVAL;

	alist = afs_alloc_addrlist(nr_addrs);
	if (!alist)
		return ERR_PTR(-ENOMEM);
	if (nr_addrs == 0)
		return alist;

	/*
	 * The loop condition only guarantees one byte (the type byte) for
	 * each address still to be read; the address body is length-checked
	 * per type below.
	 * NOTE(review): if the bytes run out, the loop simply stops and the
	 * addresses parsed so far are returned without an error - confirm
	 * that this is intended.
	 */
	for (; nr_addrs > 0 && end - b >= nr_addrs; nr_addrs--) {
		struct dns_server_list_v1_address hdr;
		__be32 x[4];

		hdr.address_type = *b++;

		switch (hdr.address_type) {
		case DNS_ADDRESS_IS_IPV4:
			if (end - b < 4) {
				_leave(" = -EINVAL [short inet]");
				goto error;
			}
			/* Copy out first; the payload may not be aligned. */
			memcpy(x, b, 4);
			ret = afs_merge_fs_addr4(net, alist, x[0], port);
			if (ret < 0)
				goto error;
			b += 4;
			break;

		case DNS_ADDRESS_IS_IPV6:
			if (end - b < 16) {
				_leave(" = -EINVAL [short inet6]");
				goto error;
			}
			memcpy(x, b, 16);
			ret = afs_merge_fs_addr6(net, alist, x, port);
			if (ret < 0)
				goto error;
			b += 16;
			break;

		default:
			_leave(" = -EADDRNOTAVAIL [unknown af %u]",
			       hdr.address_type);
			ret = -EADDRNOTAVAIL;
			goto error;
		}
	}

	/* Start with IPv6 if available. */
	/* presumably the IPv4 entries occupy the first nr_ipv4 slots - verify */
	if (alist->nr_ipv4 < alist->nr_addrs)
		alist->preferred = alist->nr_ipv4;

	*_b = b;
	return alist;

error:
	*_b = b;
	afs_put_addrlist(alist, afs_alist_trace_put_parse_error);
	return ERR_PTR(ret);
}

/*
 * Build a VL server list from a DNS queried server list.
*
 * @buffer/@buffer_size hold a version 1 server-list payload: a header
 * followed by server records, each made of a fixed part (four little-endian
 * 16-bit fields and four single bytes), the server name and the server's
 * addresses.
 *
 * A server whose name and port match an entry in the cell's current list is
 * reused (with an extra reference) rather than reallocated.  Servers are
 * inserted in ascending priority order and, for equal priority, descending
 * weight order.
 *
 * Returns the new list, or an ERR_PTR: -EDESTADDRREQ if the header is not a
 * v1 server list, -ENOMEM on allocation failure, -EPROTONOSUPPORT for a
 * protocol other than unspecified/UDP, or an address parsing error.  For
 * every error except -ENOMEM the payload is hex-dumped.
 */
struct afs_vlserver_list *afs_extract_vlserver_list(struct afs_cell *cell,
						    const void *buffer,
						    size_t buffer_size)
{
	const struct dns_server_list_v1_header *hdr = buffer;
	struct dns_server_list_v1_server bs;
	struct afs_vlserver_list *vllist, *previous;
	struct afs_addr_list *addrs;
	struct afs_vlserver *server;
	const u8 *b = buffer, *end = buffer + buffer_size;
	int ret = -ENOMEM, nr_servers, i, j;

	_enter("");

	/* Check that it's a server list, v1 */
	/*
	 * NOTE(review): when the payload is shorter than the header, the
	 * pr_notice() below still reads hdr->hdr.content and
	 * hdr->hdr.version - confirm that callers guarantee those bytes
	 * exist.
	 */
	if (end - b < sizeof(*hdr) ||
	    hdr->hdr.content != DNS_PAYLOAD_IS_SERVER_LIST ||
	    hdr->hdr.version != 1) {
		pr_notice("kAFS: Got DNS record [%u,%u] len %zu\n",
			  hdr->hdr.content, hdr->hdr.version, end - b);
		ret = -EDESTADDRREQ;
		goto dump;
	}

	nr_servers = hdr->nr_servers;

	vllist = afs_alloc_vlserver_list(nr_servers);
	if (!vllist)
		return ERR_PTR(-ENOMEM);

	/* Out-of-range source/status values collapse to the NR__ sentinel. */
	vllist->source = (hdr->source < NR__dns_record_source) ?
		hdr->source : NR__dns_record_source;
	vllist->status = (hdr->status < NR__dns_lookup_status) ?
		hdr->status : NR__dns_lookup_status;

	/* Pin the cell's current list so matching servers can be reused. */
	read_lock(&cell->vl_servers_lock);
	previous = afs_get_vlserverlist(
		rcu_dereference_protected(cell->vl_servers,
					  lockdep_is_held(&cell->vl_servers_lock)));
	read_unlock(&cell->vl_servers_lock);

	b += sizeof(*hdr);
	while (end - b >= sizeof(bs)) {
		/* Fixed part of the server record, in wire order. */
		bs.name_len	= afs_extract_le16(&b);
		bs.priority	= afs_extract_le16(&b);
		bs.weight	= afs_extract_le16(&b);
		bs.port		= afs_extract_le16(&b);
		bs.source	= *b++;
		bs.status	= *b++;
		bs.protocol	= *b++;
		bs.nr_addrs	= *b++;

		_debug("extract %u %u %u %u %u %u %*.*s",
		       bs.name_len, bs.priority, bs.weight,
		       bs.port, bs.protocol, bs.nr_addrs,
		       bs.name_len, bs.name_len, b);

		/* Name truncated: stop; the b != end check below rejects it. */
		if (end - b < bs.name_len)
			break;

		ret = -EPROTONOSUPPORT;
		if (bs.protocol == DNS_SERVER_PROTOCOL_UNSPECIFIED) {
			bs.protocol = DNS_SERVER_PROTOCOL_UDP;
		} else if (bs.protocol != DNS_SERVER_PROTOCOL_UDP) {
			_leave(" = [proto %u]", bs.protocol);
			goto error;
		}

		if (bs.port == 0)
			bs.port = AFS_VL_PORT;
		if (bs.source > NR__dns_record_source)
			bs.source = NR__dns_record_source;
		if (bs.status > NR__dns_lookup_status)
			bs.status = NR__dns_lookup_status;

		/* See if we can update an old server record */
		/* presumably previous is never NULL here - verify against afs_get_vlserverlist() */
		server = NULL;
		for (i = 0; i < previous->nr_servers; i++) {
			struct afs_vlserver *p = previous->servers[i].server;

			if (p->name_len == bs.name_len &&
			    p->port == bs.port &&
			    strncasecmp(b, p->name, bs.name_len) == 0) {
				server = afs_get_vlserver(p);
				break;
			}
		}

		if (!server) {
			ret = -ENOMEM;
			server = afs_alloc_vlserver(b, bs.name_len, bs.port);
			if (!server)
				goto error;
		}

		b += bs.name_len;

		/* Extract the addresses - note that we can't skip this as we
		 * have to advance the payload pointer.
		 */
		addrs = afs_extract_vl_addrs(cell->net, &b, end, bs.nr_addrs, bs.port);
		if (IS_ERR(addrs)) {
			ret = PTR_ERR(addrs);
			goto error_2;
		}

		/* More records than the header announced: parse but discard. */
		if (vllist->nr_servers >= nr_servers) {
			_debug("skip %u >= %u", vllist->nr_servers, nr_servers);
			afs_put_addrlist(addrs, afs_alist_trace_put_parse_empty);
			afs_put_vlserver(cell->net, server);
			continue;
		}

		addrs->source = bs.source;
		addrs->status = bs.status;

		if (addrs->nr_addrs == 0) {
			/*
			 * No new addresses.  A server that has none from
			 * before either is left out of the list; otherwise
			 * its existing addresses are kept.
			 */
			afs_put_addrlist(addrs, afs_alist_trace_put_parse_empty);
			if (!rcu_access_pointer(server->addresses)) {
				afs_put_vlserver(cell->net, server);
				continue;
			}
		} else {
			struct afs_addr_list *old = addrs;

			/* Swap in the new address list and drop the old one. */
			write_lock(&server->lock);
			old = rcu_replace_pointer(server->addresses, old,
						  lockdep_is_held(&server->lock));
			write_unlock(&server->lock);
			afs_put_addrlist(old, afs_alist_trace_put_vlserver_old);
		}

		/* TODO: Might want to check for duplicates */

		/* Insertion-sort by priority and weight */
		for (j = 0; j < vllist->nr_servers; j++) {
			if (bs.priority < vllist->servers[j].priority)
				break; /* Lower preferable */
			if (bs.priority == vllist->servers[j].priority &&
			    bs.weight > vllist->servers[j].weight)
				break; /* Higher preferable */
		}

		/* Open a gap at slot j by shifting the tail up one entry. */
		if (j < vllist->nr_servers) {
			memmove(vllist->servers + j + 1,
				vllist->servers + j,
				(vllist->nr_servers - j) * sizeof(struct afs_vlserver_entry));
		}

		clear_bit(AFS_VLSERVER_FL_PROBED, &server->flags);

		vllist->servers[j].priority = bs.priority;
		vllist->servers[j].weight = bs.weight;
		vllist->servers[j].server = server;
		vllist->nr_servers++;
	}

	/*
	 * NOTE(review): on this path ret still holds whatever was last
	 * assigned (-ENOMEM if no record was parsed, in which case the hex
	 * dump is skipped too) - confirm that no dedicated error code is
	 * wanted for trailing or truncated data.
	 */
	if (b != end) {
		_debug("parse error %zd", b - end);
		goto error;
	}

	afs_put_vlserverlist(cell->net, previous);
	_leave(" = ok [%u]", vllist->nr_servers);
	return vllist;

error_2:
	afs_put_vlserver(cell->net, server);
error:
	afs_put_vlserverlist(cell->net, vllist);
	afs_put_vlserverlist(cell->net, previous);
dump:
	if (ret != -ENOMEM) {
		printk(KERN_DEBUG "DNS: at %zu\n", (const void *)b - buffer);
		print_hex_dump_bytes("DNS: ", DUMP_PREFIX_NONE, buffer, buffer_size);
	}
	return ERR_PTR(ret);
}
1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 
507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 
1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 
1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 
1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 // SPDX-License-Identifier: GPL-2.0-or-later /* * Neighbour Discovery for IPv6 * Linux INET6 implementation * * Authors: * Pedro Roque <roque@di.fc.ul.pt> * Mike Shaver <shaver@ingenia.com> */ /* * Changes: * * Alexey I. Froloff : RFC6106 (DNSSL) support * Pierre Ynard : export userland ND options * through netlink (RDNSS support) * Lars Fenneberg : fixed MTU setting on receipt * of an RA. * Janos Farkas : kmalloc failure checks * Alexey Kuznetsov : state machine reworked * and moved to net/core. 
* Pekka Savola : RFC2461 validation * YOSHIFUJI Hideaki @USAGI : Verify ND options properly */ #define pr_fmt(fmt) "ICMPv6: " fmt #include <linux/module.h> #include <linux/errno.h> #include <linux/types.h> #include <linux/socket.h> #include <linux/sockios.h> #include <linux/sched.h> #include <linux/net.h> #include <linux/in6.h> #include <linux/route.h> #include <linux/init.h> #include <linux/rcupdate.h> #include <linux/slab.h> #ifdef CONFIG_SYSCTL #include <linux/sysctl.h> #endif #include <linux/if_addr.h> #include <linux/if_ether.h> #include <linux/if_arp.h> #include <linux/ipv6.h> #include <linux/icmpv6.h> #include <linux/jhash.h> #include <net/sock.h> #include <net/snmp.h> #include <net/ipv6.h> #include <net/protocol.h> #include <net/ndisc.h> #include <net/ip6_route.h> #include <net/addrconf.h> #include <net/icmp.h> #include <net/netlink.h> #include <linux/rtnetlink.h> #include <net/flow.h> #include <net/ip6_checksum.h> #include <net/inet_common.h> #include <linux/proc_fs.h> #include <linux/netfilter.h> #include <linux/netfilter_ipv6.h> static u32 ndisc_hash(const void *pkey, const struct net_device *dev, __u32 *hash_rnd); static bool ndisc_key_eq(const struct neighbour *neigh, const void *pkey); static bool ndisc_allow_add(const struct net_device *dev, struct netlink_ext_ack *extack); static int ndisc_constructor(struct neighbour *neigh); static void ndisc_solicit(struct neighbour *neigh, struct sk_buff *skb); static void ndisc_error_report(struct neighbour *neigh, struct sk_buff *skb); static int pndisc_constructor(struct pneigh_entry *n); static void pndisc_destructor(struct pneigh_entry *n); static void pndisc_redo(struct sk_buff *skb); static int ndisc_is_multicast(const void *pkey); static const struct neigh_ops ndisc_generic_ops = { .family = AF_INET6, .solicit = ndisc_solicit, .error_report = ndisc_error_report, .output = neigh_resolve_output, .connected_output = neigh_connected_output, }; static const struct neigh_ops ndisc_hh_ops = { .family = 
AF_INET6, .solicit = ndisc_solicit, .error_report = ndisc_error_report, .output = neigh_resolve_output, .connected_output = neigh_resolve_output, }; static const struct neigh_ops ndisc_direct_ops = { .family = AF_INET6, .output = neigh_direct_output, .connected_output = neigh_direct_output, }; struct neigh_table nd_tbl = { .family = AF_INET6, .key_len = sizeof(struct in6_addr), .protocol = cpu_to_be16(ETH_P_IPV6), .hash = ndisc_hash, .key_eq = ndisc_key_eq, .constructor = ndisc_constructor, .pconstructor = pndisc_constructor, .pdestructor = pndisc_destructor, .proxy_redo = pndisc_redo, .is_multicast = ndisc_is_multicast, .allow_add = ndisc_allow_add, .id = "ndisc_cache", .parms = { .tbl = &nd_tbl, .reachable_time = ND_REACHABLE_TIME, .data = { [NEIGH_VAR_MCAST_PROBES] = 3, [NEIGH_VAR_UCAST_PROBES] = 3, [NEIGH_VAR_RETRANS_TIME] = ND_RETRANS_TIMER, [NEIGH_VAR_BASE_REACHABLE_TIME] = ND_REACHABLE_TIME, [NEIGH_VAR_DELAY_PROBE_TIME] = 5 * HZ, [NEIGH_VAR_INTERVAL_PROBE_TIME_MS] = 5 * HZ, [NEIGH_VAR_GC_STALETIME] = 60 * HZ, [NEIGH_VAR_QUEUE_LEN_BYTES] = SK_WMEM_MAX, [NEIGH_VAR_PROXY_QLEN] = 64, [NEIGH_VAR_ANYCAST_DELAY] = 1 * HZ, [NEIGH_VAR_PROXY_DELAY] = (8 * HZ) / 10, }, }, .gc_interval = 30 * HZ, .gc_thresh1 = 128, .gc_thresh2 = 512, .gc_thresh3 = 1024, }; EXPORT_SYMBOL_GPL(nd_tbl); void __ndisc_fill_addr_option(struct sk_buff *skb, int type, const void *data, int data_len, int pad) { int space = __ndisc_opt_addr_space(data_len, pad); u8 *opt = skb_put(skb, space); opt[0] = type; opt[1] = space>>3; memset(opt + 2, 0, pad); opt += pad; space -= pad; memcpy(opt+2, data, data_len); data_len += 2; opt += data_len; space -= data_len; if (space > 0) memset(opt, 0, space); } EXPORT_SYMBOL_GPL(__ndisc_fill_addr_option); static inline void ndisc_fill_addr_option(struct sk_buff *skb, int type, const void *data, u8 icmp6_type) { __ndisc_fill_addr_option(skb, type, data, skb->dev->addr_len, ndisc_addr_option_pad(skb->dev->type)); ndisc_ops_fill_addr_option(skb->dev, skb, 
icmp6_type); } static inline void ndisc_fill_redirect_addr_option(struct sk_buff *skb, void *ha, const u8 *ops_data) { ndisc_fill_addr_option(skb, ND_OPT_TARGET_LL_ADDR, ha, NDISC_REDIRECT); ndisc_ops_fill_redirect_addr_option(skb->dev, skb, ops_data); } static struct nd_opt_hdr *ndisc_next_option(struct nd_opt_hdr *cur, struct nd_opt_hdr *end) { int type; if (!cur || !end || cur >= end) return NULL; type = cur->nd_opt_type; do { cur = ((void *)cur) + (cur->nd_opt_len << 3); } while (cur < end && cur->nd_opt_type != type); return cur <= end && cur->nd_opt_type == type ? cur : NULL; } static inline int ndisc_is_useropt(const struct net_device *dev, struct nd_opt_hdr *opt) { return opt->nd_opt_type == ND_OPT_PREFIX_INFO || opt->nd_opt_type == ND_OPT_RDNSS || opt->nd_opt_type == ND_OPT_DNSSL || opt->nd_opt_type == ND_OPT_6CO || opt->nd_opt_type == ND_OPT_CAPTIVE_PORTAL || opt->nd_opt_type == ND_OPT_PREF64; } static struct nd_opt_hdr *ndisc_next_useropt(const struct net_device *dev, struct nd_opt_hdr *cur, struct nd_opt_hdr *end) { if (!cur || !end || cur >= end) return NULL; do { cur = ((void *)cur) + (cur->nd_opt_len << 3); } while (cur < end && !ndisc_is_useropt(dev, cur)); return cur <= end && ndisc_is_useropt(dev, cur) ? 
cur : NULL; } struct ndisc_options *ndisc_parse_options(const struct net_device *dev, u8 *opt, int opt_len, struct ndisc_options *ndopts) { struct nd_opt_hdr *nd_opt = (struct nd_opt_hdr *)opt; if (!nd_opt || opt_len < 0 || !ndopts) return NULL; memset(ndopts, 0, sizeof(*ndopts)); while (opt_len) { bool unknown = false; int l; if (opt_len < sizeof(struct nd_opt_hdr)) return NULL; l = nd_opt->nd_opt_len << 3; if (opt_len < l || l == 0) return NULL; if (ndisc_ops_parse_options(dev, nd_opt, ndopts)) goto next_opt; switch (nd_opt->nd_opt_type) { case ND_OPT_SOURCE_LL_ADDR: case ND_OPT_TARGET_LL_ADDR: case ND_OPT_MTU: case ND_OPT_NONCE: case ND_OPT_REDIRECT_HDR: if (ndopts->nd_opt_array[nd_opt->nd_opt_type]) { ND_PRINTK(2, warn, "%s: duplicated ND6 option found: type=%d\n", __func__, nd_opt->nd_opt_type); } else { ndopts->nd_opt_array[nd_opt->nd_opt_type] = nd_opt; } break; case ND_OPT_PREFIX_INFO: ndopts->nd_opts_pi_end = nd_opt; if (!ndopts->nd_opt_array[nd_opt->nd_opt_type]) ndopts->nd_opt_array[nd_opt->nd_opt_type] = nd_opt; break; #ifdef CONFIG_IPV6_ROUTE_INFO case ND_OPT_ROUTE_INFO: ndopts->nd_opts_ri_end = nd_opt; if (!ndopts->nd_opts_ri) ndopts->nd_opts_ri = nd_opt; break; #endif default: unknown = true; } if (ndisc_is_useropt(dev, nd_opt)) { ndopts->nd_useropts_end = nd_opt; if (!ndopts->nd_useropts) ndopts->nd_useropts = nd_opt; } else if (unknown) { /* * Unknown options must be silently ignored, * to accommodate future extension to the * protocol. */ ND_PRINTK(2, notice, "%s: ignored unsupported option; type=%d, len=%d\n", __func__, nd_opt->nd_opt_type, nd_opt->nd_opt_len); } next_opt: opt_len -= l; nd_opt = ((void *)nd_opt) + l; } return ndopts; } int ndisc_mc_map(const struct in6_addr *addr, char *buf, struct net_device *dev, int dir) { switch (dev->type) { case ARPHRD_ETHER: case ARPHRD_IEEE802: /* Not sure. Check it later. 
--ANK */ case ARPHRD_FDDI: ipv6_eth_mc_map(addr, buf); return 0; case ARPHRD_ARCNET: ipv6_arcnet_mc_map(addr, buf); return 0; case ARPHRD_INFINIBAND: ipv6_ib_mc_map(addr, dev->broadcast, buf); return 0; case ARPHRD_IPGRE: return ipv6_ipgre_mc_map(addr, dev->broadcast, buf); default: if (dir) { memcpy(buf, dev->broadcast, dev->addr_len); return 0; } } return -EINVAL; } EXPORT_SYMBOL(ndisc_mc_map); static u32 ndisc_hash(const void *pkey, const struct net_device *dev, __u32 *hash_rnd) { return ndisc_hashfn(pkey, dev, hash_rnd); } static bool ndisc_key_eq(const struct neighbour *n, const void *pkey) { return neigh_key_eq128(n, pkey); } static int ndisc_constructor(struct neighbour *neigh) { struct in6_addr *addr = (struct in6_addr *)&neigh->primary_key; struct net_device *dev = neigh->dev; struct inet6_dev *in6_dev; struct neigh_parms *parms; bool is_multicast = ipv6_addr_is_multicast(addr); in6_dev = in6_dev_get(dev); if (!in6_dev) { return -EINVAL; } parms = in6_dev->nd_parms; __neigh_parms_put(neigh->parms); neigh->parms = neigh_parms_clone(parms); neigh->type = is_multicast ? 
RTN_MULTICAST : RTN_UNICAST; if (!dev->header_ops) { neigh->nud_state = NUD_NOARP; neigh->ops = &ndisc_direct_ops; neigh->output = neigh_direct_output; } else { if (is_multicast) { neigh->nud_state = NUD_NOARP; ndisc_mc_map(addr, neigh->ha, dev, 1); } else if (dev->flags&(IFF_NOARP|IFF_LOOPBACK)) { neigh->nud_state = NUD_NOARP; memcpy(neigh->ha, dev->dev_addr, dev->addr_len); if (dev->flags&IFF_LOOPBACK) neigh->type = RTN_LOCAL; } else if (dev->flags&IFF_POINTOPOINT) { neigh->nud_state = NUD_NOARP; memcpy(neigh->ha, dev->broadcast, dev->addr_len); } if (dev->header_ops->cache) neigh->ops = &ndisc_hh_ops; else neigh->ops = &ndisc_generic_ops; if (neigh->nud_state&NUD_VALID) neigh->output = neigh->ops->connected_output; else neigh->output = neigh->ops->output; } in6_dev_put(in6_dev); return 0; } static int pndisc_constructor(struct pneigh_entry *n) { struct in6_addr *addr = (struct in6_addr *)&n->key; struct in6_addr maddr; struct net_device *dev = n->dev; if (!dev || !__in6_dev_get(dev)) return -EINVAL; addrconf_addr_solict_mult(addr, &maddr); ipv6_dev_mc_inc(dev, &maddr); return 0; } static void pndisc_destructor(struct pneigh_entry *n) { struct in6_addr *addr = (struct in6_addr *)&n->key; struct in6_addr maddr; struct net_device *dev = n->dev; if (!dev || !__in6_dev_get(dev)) return; addrconf_addr_solict_mult(addr, &maddr); ipv6_dev_mc_dec(dev, &maddr); } /* called with rtnl held */ static bool ndisc_allow_add(const struct net_device *dev, struct netlink_ext_ack *extack) { struct inet6_dev *idev = __in6_dev_get(dev); if (!idev || idev->cnf.disable_ipv6) { NL_SET_ERR_MSG(extack, "IPv6 is disabled on this device"); return false; } return true; } static struct sk_buff *ndisc_alloc_skb(struct net_device *dev, int len) { int hlen = LL_RESERVED_SPACE(dev); int tlen = dev->needed_tailroom; struct sock *sk = dev_net(dev)->ipv6.ndisc_sk; struct sk_buff *skb; skb = alloc_skb(hlen + sizeof(struct ipv6hdr) + len + tlen, GFP_ATOMIC); if (!skb) { ND_PRINTK(0, err, "ndisc: %s 
failed to allocate an skb\n", __func__); return NULL; } skb->protocol = htons(ETH_P_IPV6); skb->dev = dev; skb_reserve(skb, hlen + sizeof(struct ipv6hdr)); skb_reset_transport_header(skb); /* Manually assign socket ownership as we avoid calling * sock_alloc_send_pskb() to bypass wmem buffer limits */ skb_set_owner_w(skb, sk); return skb; } static void ip6_nd_hdr(struct sk_buff *skb, const struct in6_addr *saddr, const struct in6_addr *daddr, int hop_limit, int len) { struct ipv6hdr *hdr; struct inet6_dev *idev; unsigned tclass; rcu_read_lock(); idev = __in6_dev_get(skb->dev); tclass = idev ? READ_ONCE(idev->cnf.ndisc_tclass) : 0; rcu_read_unlock(); skb_push(skb, sizeof(*hdr)); skb_reset_network_header(skb); hdr = ipv6_hdr(skb); ip6_flow_hdr(hdr, tclass, 0); hdr->payload_len = htons(len); hdr->nexthdr = IPPROTO_ICMPV6; hdr->hop_limit = hop_limit; hdr->saddr = *saddr; hdr->daddr = *daddr; } void ndisc_send_skb(struct sk_buff *skb, const struct in6_addr *daddr, const struct in6_addr *saddr) { struct dst_entry *dst = skb_dst(skb); struct net *net = dev_net(skb->dev); struct sock *sk = net->ipv6.ndisc_sk; struct inet6_dev *idev; int err; struct icmp6hdr *icmp6h = icmp6_hdr(skb); u8 type; type = icmp6h->icmp6_type; if (!dst) { struct flowi6 fl6; int oif = skb->dev->ifindex; icmpv6_flow_init(sk, &fl6, type, saddr, daddr, oif); dst = icmp6_dst_alloc(skb->dev, &fl6); if (IS_ERR(dst)) { kfree_skb(skb); return; } skb_dst_set(skb, dst); } icmp6h->icmp6_cksum = csum_ipv6_magic(saddr, daddr, skb->len, IPPROTO_ICMPV6, csum_partial(icmp6h, skb->len, 0)); ip6_nd_hdr(skb, saddr, daddr, READ_ONCE(inet6_sk(sk)->hop_limit), skb->len); rcu_read_lock(); idev = __in6_dev_get(dst->dev); IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTREQUESTS); err = NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, net, sk, skb, NULL, dst->dev, dst_output); if (!err) { ICMP6MSGOUT_INC_STATS(net, idev, type); ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS); } rcu_read_unlock(); } EXPORT_SYMBOL(ndisc_send_skb); void 
ndisc_send_na(struct net_device *dev, const struct in6_addr *daddr,
	      const struct in6_addr *solicited_addr,
	      bool router, bool solicited, bool override, bool inc_opt)
{
	/*
	 * Build and send a Neighbour Advertisement for @solicited_addr to
	 * @daddr on @dev.
	 *
	 * @router, @solicited and @override become the corresponding ICMPv6
	 * flag bits; @inc_opt asks for a target link-layer address option.
	 * The function returns silently when no source address can be
	 * selected or when the skb cannot be allocated.
	 */
	struct sk_buff *skb;
	struct in6_addr tmpaddr;
	struct inet6_ifaddr *ifp;
	const struct in6_addr *src_addr;
	struct nd_msg *msg;
	int optlen = 0;

	/* for anycast or proxy, solicited_addr != src_addr */
	ifp = ipv6_get_ifaddr(dev_net(dev), solicited_addr, dev, 1);
	if (ifp) {
		src_addr = solicited_addr;
		/* never claim override for an optimistic address */
		if (ifp->flags & IFA_F_OPTIMISTIC)
			override = false;
		/* the force_tllao setting can only switch the option on */
		inc_opt |= READ_ONCE(ifp->idev->cnf.force_tllao);
		in6_ifa_put(ifp);
	} else {
		if (ipv6_dev_get_saddr(dev_net(dev), dev, daddr,
				       inet6_sk(dev_net(dev)->ipv6.ndisc_sk)->srcprefs,
				       &tmpaddr))
			return;
		src_addr = &tmpaddr;
	}

	/* a device without a link-layer address has nothing to advertise */
	if (!dev->addr_len)
		inc_opt = false;
	if (inc_opt)
		optlen += ndisc_opt_addr_space(dev,
					       NDISC_NEIGHBOUR_ADVERTISEMENT);

	skb = ndisc_alloc_skb(dev, sizeof(*msg) + optlen);
	if (!skb)
		return;

	msg = skb_put(skb, sizeof(*msg));
	*msg = (struct nd_msg) {
		.icmph = {
			.icmp6_type = NDISC_NEIGHBOUR_ADVERTISEMENT,
			.icmp6_router = router,
			.icmp6_solicited = solicited,
			.icmp6_override = override,
		},
		.target = *solicited_addr,
	};

	if (inc_opt)
		ndisc_fill_addr_option(skb, ND_OPT_TARGET_LL_ADDR,
				       dev->dev_addr,
				       NDISC_NEIGHBOUR_ADVERTISEMENT);

	ndisc_send_skb(skb, daddr, src_addr);
}

/*
 * Send an unsolicited Neighbour Advertisement for the addresses of @dev.
 *
 * Walks idev->addr_list under read_lock_bh(&idev->lock) and, for every
 * address that is not (tentative and non-optimistic), sends an NA to
 * in6addr_linklocal_allnodes with solicited=false, override=true and the
 * link-layer option requested. The router flag mirrors
 * idev->cnf.forwarding. Does nothing if the device has no inet6_dev.
 */
static void ndisc_send_unsol_na(struct net_device *dev)
{
	struct inet6_dev *idev;
	struct inet6_ifaddr *ifa;

	idev = in6_dev_get(dev);
	if (!idev)
		return;

	read_lock_bh(&idev->lock);
	list_for_each_entry(ifa, &idev->addr_list, if_list) {
		/* skip tentative addresses until dad completes */
		if (ifa->flags & IFA_F_TENTATIVE &&
		    !(ifa->flags & IFA_F_OPTIMISTIC))
			continue;

		ndisc_send_na(dev, &in6addr_linklocal_allnodes, &ifa->addr,
			      /*router=*/ !!idev->cnf.forwarding,
			      /*solicited=*/ false, /*override=*/ true,
			      /*inc_opt=*/ true);
	}
	read_unlock_bh(&idev->lock);

	/* drop the reference taken by in6_dev_get() */
	in6_dev_put(idev);
}

/*
 * Build, but do not send, a Neighbour Solicitation for @solicit.
 *
 * A source link-layer address option is added only when the device has a
 * link-layer address (dev->addr_len != 0) and @saddr is not the unspecified
 * address. When @nonce is non-zero an 8-byte ND_OPT_NONCE option carrying
 * the first 6 bytes of @nonce is appended.
 *
 * Returns the skb, or NULL when @saddr is NULL or the allocation fails.
 */
struct sk_buff *ndisc_ns_create(struct net_device *dev,
				const struct in6_addr *solicit,
				const struct in6_addr *saddr, u64
nonce)
{
	/* a zero addr_len means there is no link-layer address to include */
	int inc_opt = dev->addr_len;
	struct sk_buff *skb;
	struct nd_msg *msg;
	int optlen = 0;

	if (!saddr)
		return NULL;

	/* no source link-layer option when sending from the unspecified address */
	if (ipv6_addr_any(saddr))
		inc_opt = false;
	if (inc_opt)
		optlen += ndisc_opt_addr_space(dev,
					       NDISC_NEIGHBOUR_SOLICITATION);
	/* room for the nonce option: 2 header bytes + 6 nonce bytes */
	if (nonce != 0)
		optlen += 8;

	skb = ndisc_alloc_skb(dev, sizeof(*msg) + optlen);
	if (!skb)
		return NULL;

	msg = skb_put(skb, sizeof(*msg));
	*msg = (struct nd_msg) {
		.icmph = {
			.icmp6_type = NDISC_NEIGHBOUR_SOLICITATION,
		},
		.target = *solicit,
	};

	if (inc_opt)
		ndisc_fill_addr_option(skb, ND_OPT_SOURCE_LL_ADDR,
				       dev->dev_addr,
				       NDISC_NEIGHBOUR_SOLICITATION);
	if (nonce != 0) {
		u8 *opt = skb_put(skb, 8);

		opt[0] = ND_OPT_NONCE;
		/* option length is expressed in units of 8 bytes */
		opt[1] = 8 >> 3;
		/* only the first 6 bytes of the u64, in memory order, are sent */
		memcpy(opt + 2, &nonce, 6);
	}

	return skb;
}
EXPORT_SYMBOL(ndisc_ns_create);

/*
 * Build a Neighbour Solicitation for @solicit and send it to @daddr.
 *
 * When @saddr is NULL a link-local address of @dev is fetched with
 * ipv6_get_lladdr(), passing IFA_F_TENTATIVE | IFA_F_OPTIMISTIC
 * (NOTE(review): presumably the flags to exclude - confirm against
 * ipv6_get_lladdr()); if that fails nothing is sent. Allocation failures in
 * ndisc_ns_create() are also silent.
 */
void ndisc_send_ns(struct net_device *dev, const struct in6_addr *solicit,
		   const struct in6_addr *daddr, const struct in6_addr *saddr,
		   u64 nonce)
{
	struct in6_addr addr_buf;
	struct sk_buff *skb;

	if (!saddr) {
		if (ipv6_get_lladdr(dev, &addr_buf,
				    (IFA_F_TENTATIVE | IFA_F_OPTIMISTIC)))
			return;
		saddr = &addr_buf;
	}

	skb = ndisc_ns_create(dev, solicit, saddr, nonce);

	if (skb)
		ndisc_send_skb(skb, daddr, saddr);
}

/*
 * Build and send a Router Solicitation from @saddr to @daddr on @dev.
 *
 * A source link-layer address option is included when the device has a
 * link-layer address, subject to the optimistic-DAD restriction below.
 */
void ndisc_send_rs(struct net_device *dev, const struct in6_addr *saddr,
		   const struct in6_addr *daddr)
{
	struct sk_buff *skb;
	struct rs_msg *msg;
	int send_sllao = dev->addr_len;
	int optlen = 0;

#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	/*
	 * According to section 2.2 of RFC 4429, we must not
	 * send router solicitations with a sllao from
	 * optimistic addresses, but we may send the solicitation
	 * if we don't include the sllao.  So here we check
	 * if our address is optimistic, and if so, we
	 * suppress the inclusion of the sllao.
*/
	if (send_sllao) {
		struct inet6_ifaddr *ifp = ipv6_get_ifaddr(dev_net(dev), saddr,
							   dev, 1);
		if (ifp) {
			if (ifp->flags & IFA_F_OPTIMISTIC)  {
				send_sllao = 0;
			}
			in6_ifa_put(ifp);
		} else {
			/* source address is not configured on this device */
			send_sllao = 0;
		}
	}
#endif
	if (send_sllao)
		optlen += ndisc_opt_addr_space(dev, NDISC_ROUTER_SOLICITATION);

	skb = ndisc_alloc_skb(dev, sizeof(*msg) + optlen);
	if (!skb)
		return;

	msg = skb_put(skb, sizeof(*msg));
	*msg = (struct rs_msg) {
		.icmph = {
			.icmp6_type = NDISC_ROUTER_SOLICITATION,
		},
	};

	if (send_sllao)
		ndisc_fill_addr_option(skb, ND_OPT_SOURCE_LL_ADDR,
				       dev->dev_addr,
				       NDISC_ROUTER_SOLICITATION);

	ndisc_send_skb(skb, daddr, saddr);
}

/*
 * Failure hook for a queued packet: reports the link failure for @skb via
 * dst_link_failure() and then frees the skb. @neigh is not used.
 */
static void ndisc_error_report(struct neighbour *neigh, struct sk_buff *skb)
{
	/*
	 *	"The sender MUST return an ICMP
	 *	 destination unreachable"
	 */
	dst_link_failure(skb);
	kfree_skb(skb);
}

/*
 * Send the next probe for @neigh.
 *
 * The probe counter selects the kind of probe: while it is below
 * UCAST_PROBES a unicast NS is sent to the target itself; for the next
 * APP_PROBES attempts neigh_app_ns() is called instead; after that the NS
 * goes to the target's solicited-node multicast address. The source address
 * of the triggering @skb is reused only when ipv6_chk_addr_and_flags()
 * accepts it for @dev; otherwise NULL is passed and ndisc_send_ns() picks
 * one.
 *
 * Called with locked neigh: either read or both
 */
static void ndisc_solicit(struct neighbour *neigh, struct sk_buff *skb)
{
	struct in6_addr *saddr = NULL;
	struct in6_addr mcaddr;
	struct net_device *dev = neigh->dev;
	struct in6_addr *target = (struct in6_addr *)&neigh->primary_key;
	int probes = atomic_read(&neigh->probes);

	if (skb && ipv6_chk_addr_and_flags(dev_net(dev), &ipv6_hdr(skb)->saddr,
					   dev, false, 1,
					   IFA_F_TENTATIVE|IFA_F_OPTIMISTIC))
		saddr = &ipv6_hdr(skb)->saddr;
	probes -= NEIGH_VAR(neigh->parms, UCAST_PROBES);
	if (probes < 0) {
		if (!(READ_ONCE(neigh->nud_state) & NUD_VALID)) {
			ND_PRINTK(1, dbg,
				  "%s: trying to ucast probe in NUD_INVALID: %pI6\n",
				  __func__, target);
		}
		ndisc_send_ns(dev, target, target, saddr, 0);
	} else if ((probes -= NEIGH_VAR(neigh->parms, APP_PROBES)) < 0) {
		neigh_app_ns(neigh);
	} else {
		addrconf_addr_solict_mult(target, &mcaddr);
		ndisc_send_ns(dev, target, &mcaddr, saddr, 0);
	}
}

/*
 * Look up @pkey in the proxy part of nd_tbl for @dev.
 *
 * Returns -1 when there is no proxy entry, otherwise 1 or 0 depending on
 * whether the entry has NTF_ROUTER set. The lookup runs under
 * read_lock_bh(&nd_tbl.lock).
 */
static int pndisc_is_router(const void *pkey,
			    struct net_device *dev)
{
	struct pneigh_entry *n;
	int ret = -1;

	read_lock_bh(&nd_tbl.lock);
	n = __pneigh_lookup(&nd_tbl, dev_net(dev), pkey, dev);
	if (n)
		ret = !!(n->flags & NTF_ROUTER);
read_unlock_bh(&nd_tbl.lock);

	return ret;
}

/*
 * Update @neigh through neigh_update() and then forward the same event to
 * the device's ndisc ops via ndisc_ops_update(). The return value of
 * neigh_update() is not propagated.
 */
void ndisc_update(const struct net_device *dev, struct neighbour *neigh,
		  const u8 *lladdr, u8 new, u32 flags, u8 icmp6_type,
		  struct ndisc_options *ndopts)
{
	neigh_update(neigh, lladdr, new, flags, 0);
	/* report ndisc ops about neighbour update */
	ndisc_ops_update(dev, neigh, flags, icmp6_type, ndopts);
}

/*
 * Handle a received Neighbour Solicitation.
 *
 * Validates the message, works out whether the target is one of our
 * addresses (or an anycast/proxied one), takes part in duplicate address
 * detection when the source is the unspecified address, and otherwise
 * updates the neighbour cache for the sender and answers with a Neighbour
 * Advertisement.
 *
 * Returns a drop reason for the caller; SKB_CONSUMED once an advertisement
 * has been sent in response to a non-DAD solicitation.
 */
static enum skb_drop_reason ndisc_recv_ns(struct sk_buff *skb)
{
	struct nd_msg *msg = (struct nd_msg *)skb_transport_header(skb);
	const struct in6_addr *saddr = &ipv6_hdr(skb)->saddr;
	const struct in6_addr *daddr = &ipv6_hdr(skb)->daddr;
	u8 *lladdr = NULL;
	/* number of bytes between the start of the options and the skb tail */
	u32 ndoptlen = skb_tail_pointer(skb) - (skb_transport_header(skb) +
				    offsetof(struct nd_msg, opt));
	struct ndisc_options ndopts;
	struct net_device *dev = skb->dev;
	struct inet6_ifaddr *ifp;
	struct inet6_dev *idev = NULL;
	struct neighbour *neigh;
	/* an unspecified source address marks a DAD probe */
	int dad = ipv6_addr_any(saddr);
	/* -1: not decided yet; filled from the proxy entry or from forwarding */
	int is_router = -1;
	SKB_DR(reason);
	u64 nonce = 0;
	bool inc;

	if (skb->len < sizeof(struct nd_msg))
		return SKB_DROP_REASON_PKT_TOO_SMALL;

	if (ipv6_addr_is_multicast(&msg->target)) {
		ND_PRINTK(2, warn, "NS: multicast target address\n");
		return reason;
	}

	/*
	 * RFC2461 7.1.1:
	 * DAD has to be destined for solicited node multicast address.
	 */
	if (dad && !ipv6_addr_is_solict_mult(daddr)) {
		ND_PRINTK(2, warn, "NS: bad DAD packet (wrong destination)\n");
		return reason;
	}

	if (!ndisc_parse_options(dev, msg->opt, ndoptlen, &ndopts))
		return SKB_DROP_REASON_IPV6_NDISC_BAD_OPTIONS;

	if (ndopts.nd_opts_src_lladdr) {
		lladdr = ndisc_opt_addr_data(ndopts.nd_opts_src_lladdr, dev);
		if (!lladdr) {
			ND_PRINTK(2, warn,
				  "NS: invalid link-layer address length\n");
			return reason;
		}

		/* RFC2461 7.1.1:
		 *	If the IP source address is the unspecified address,
		 *	there MUST NOT be source link-layer address option
		 *	in the message.
*/
		if (dad) {
			ND_PRINTK(2, warn,
				  "NS: bad DAD packet (link-layer address option)\n");
			return reason;
		}
	}
	/* a nonce option of length 1 (8 bytes) carries 6 bytes of nonce */
	if (ndopts.nd_opts_nonce && ndopts.nd_opts_nonce->nd_opt_len == 1)
		memcpy(&nonce, (u8 *)(ndopts.nd_opts_nonce + 1), 6);

	/* true when the solicitation was sent to a multicast destination */
	inc = ipv6_addr_is_multicast(daddr);

	ifp = ipv6_get_ifaddr(dev_net(dev), &msg->target, dev, 1);
	if (ifp) {
have_ifp:
		if (ifp->flags & (IFA_F_TENTATIVE|IFA_F_OPTIMISTIC)) {
			if (dad) {
				if (nonce != 0 && ifp->dad_nonce == nonce) {
					u8 *np = (u8 *)&nonce;
					/* Matching nonce if looped back */
					ND_PRINTK(2, notice,
						  "%s: IPv6 DAD loopback for address %pI6c nonce %pM ignored\n",
						  ifp->idev->dev->name,
						  &ifp->addr, np);
					goto out;
				}
				/*
				 * We are colliding with another node
				 * who is doing DAD
				 * so fail our DAD process
				 */
				/*
				 * NOTE(review): no in6_ifa_put() on this
				 * path - presumably addrconf_dad_failure()
				 * consumes the ifp reference; confirm.
				 */
				addrconf_dad_failure(skb, ifp);
				return reason;
			} else {
				/*
				 * This is not a dad solicitation.
				 * If we are an optimistic node,
				 * we should respond.
				 * Otherwise, we should ignore it.
				 */
				if (!(ifp->flags & IFA_F_OPTIMISTIC))
					goto out;
			}
		}

		/* borrowed through ifp; "out" releases ifp, not idev, here */
		idev = ifp->idev;
	} else {
		struct net *net = dev_net(dev);

		/* perhaps an address on the master device */
		if (netif_is_l3_slave(dev)) {
			struct net_device *mdev;

			mdev = netdev_master_upper_dev_get_rcu(dev);
			if (mdev) {
				ifp = ipv6_get_ifaddr(net, &msg->target, mdev, 1);
				if (ifp)
					goto have_ifp;
			}
		}

		/* reference dropped at "out" when ifp is NULL */
		idev = in6_dev_get(dev);
		if (!idev) {
			/* XXX: count this drop? */
			return reason;
		}

		/*
		 * Target is either an anycast address of ours, or a proxied
		 * address (forwarding and proxy_ndp enabled and a proxy
		 * entry exists; is_router then comes from that entry).
		 */
		if (ipv6_chk_acast_addr(net, dev, &msg->target) ||
		    (READ_ONCE(idev->cnf.forwarding) &&
		     (READ_ONCE(net->ipv6.devconf_all->proxy_ndp) ||
		      READ_ONCE(idev->cnf.proxy_ndp)) &&
		     (is_router = pndisc_is_router(&msg->target, dev)) >= 0)) {
			if (!(NEIGH_CB(skb)->flags & LOCALLY_ENQUEUED) &&
			    skb->pkt_type != PACKET_HOST &&
			    inc &&
			    NEIGH_VAR(idev->nd_parms, PROXY_DELAY) != 0) {
				/*
				 * for anycast or proxy,
				 * sender should delay its response
				 * by a random time between 0 and
				 * MAX_ANYCAST_DELAY_TIME seconds.
* (RFC2461) -- yoshfuji
				 */
				/* queue a clone; the original skb is left to the caller */
				struct sk_buff *n = skb_clone(skb, GFP_ATOMIC);
				if (n)
					pneigh_enqueue(&nd_tbl, idev->nd_parms, n);
				goto out;
			}
		} else {
			SKB_DR_SET(reason, IPV6_NDISC_NS_OTHERHOST);
			goto out;
		}
	}

	/* no proxy entry decided it: advertise as router iff we forward */
	if (is_router < 0)
		is_router = READ_ONCE(idev->cnf.forwarding);

	if (dad) {
		/* answer a DAD probe with an unsolicited NA to all-nodes */
		ndisc_send_na(dev, &in6addr_linklocal_allnodes, &msg->target,
			      !!is_router, false, (ifp != NULL), true);
		goto out;
	}

	if (inc)
		NEIGH_CACHE_STAT_INC(&nd_tbl, rcv_probes_mcast);
	else
		NEIGH_CACHE_STAT_INC(&nd_tbl, rcv_probes_ucast);

	/*
	 *	update / create cache entry
	 *	for the source address
	 */
	neigh = __neigh_lookup(&nd_tbl, saddr, dev,
			       !inc || lladdr || !dev->addr_len);
	if (neigh)
		ndisc_update(dev, neigh, lladdr, NUD_STALE,
			     NEIGH_UPDATE_F_WEAK_OVERRIDE|
			     NEIGH_UPDATE_F_OVERRIDE,
			     NDISC_NEIGHBOUR_SOLICITATION, &ndopts);
	if (neigh || !dev->header_ops) {
		ndisc_send_na(dev, saddr, &msg->target, !!is_router,
			      true, (ifp != NULL && inc), inc);
		if (neigh)
			neigh_release(neigh);
		reason = SKB_CONSUMED;
	}

out:
	/* exactly one of the two references is held at this point */
	if (ifp)
		in6_ifa_put(ifp);
	else
		in6_dev_put(idev);
	return reason;
}

/*
 * Decide, from idev->cnf.accept_untracked_na of @dev, whether an NA whose
 * target has no neighbour cache entry may create one.
 *
 * Returns non-zero when a new entry may be created. The inet6_dev pointer
 * is dereferenced without a NULL check here.
 */
static int accept_untracked_na(struct net_device *dev, struct in6_addr *saddr)
{
	struct inet6_dev *idev = __in6_dev_get(dev);

	switch (READ_ONCE(idev->cnf.accept_untracked_na)) {
	case 0: /* Don't accept untracked na (absent in neighbor cache) */
		return 0;
	case 1: /* Create new entries from na if currently untracked */
		return 1;
	case 2: /* Create new entries from untracked na only if saddr is in the
		 * same subnet as an address configured on the interface that
		 * received the na
		 */
		return !!ipv6_chk_prefix(saddr, dev);
	default:
		return 0;
	}
}

/*
 * Handle a received Neighbour Advertisement.
 *
 * Validates the message, treats an advertisement for one of our own
 * tentative addresses as a DAD failure, and otherwise updates (or, on a
 * forwarding interface that accepts untracked NAs, creates) the neighbour
 * cache entry for the advertised target.
 *
 * Returns a drop reason; SKB_CONSUMED when the cache entry was updated.
 */
static enum skb_drop_reason ndisc_recv_na(struct sk_buff *skb)
{
	struct nd_msg *msg = (struct nd_msg *)skb_transport_header(skb);
	struct in6_addr *saddr = &ipv6_hdr(skb)->saddr;
	const struct in6_addr *daddr = &ipv6_hdr(skb)->daddr;
	u8 *lladdr = NULL;
	/* number of bytes between the start of the options and the skb tail */
	u32 ndoptlen = skb_tail_pointer(skb) - (skb_transport_header(skb) +
				    offsetof(struct nd_msg, opt));
	struct ndisc_options ndopts;
	struct net_device *dev =
skb->dev;
	/* may be NULL; every use of idev in this function checks for that */
	struct inet6_dev *idev = __in6_dev_get(dev);
	struct inet6_ifaddr *ifp;
	struct neighbour *neigh;
	SKB_DR(reason);
	u8 new_state;

	if (skb->len < sizeof(struct nd_msg))
		return SKB_DROP_REASON_PKT_TOO_SMALL;

	if (ipv6_addr_is_multicast(&msg->target)) {
		ND_PRINTK(2, warn, "NA: target address is multicast\n");
		return reason;
	}

	if (ipv6_addr_is_multicast(daddr) &&
	    msg->icmph.icmp6_solicited) {
		ND_PRINTK(2, warn, "NA: solicited NA is multicasted\n");
		return reason;
	}

	/* For some 802.11 wireless deployments (and possibly other networks),
	 * there will be a NA proxy and unsolicited packets are attacks
	 * and thus should not be accepted.
	 * drop_unsolicited_na takes precedence over accept_untracked_na
	 */
	if (!msg->icmph.icmp6_solicited && idev &&
	    READ_ONCE(idev->cnf.drop_unsolicited_na))
		return reason;

	if (!ndisc_parse_options(dev, msg->opt, ndoptlen, &ndopts))
		return SKB_DROP_REASON_IPV6_NDISC_BAD_OPTIONS;

	if (ndopts.nd_opts_tgt_lladdr) {
		lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr, dev);
		if (!lladdr) {
			ND_PRINTK(2, warn,
				  "NA: invalid link-layer address length\n");
			return reason;
		}
	}
	/* is the advertised target one of our own addresses on this device? */
	ifp = ipv6_get_ifaddr(dev_net(dev), &msg->target, dev, 1);
	if (ifp) {
		if (skb->pkt_type != PACKET_LOOPBACK
		    && (ifp->flags & IFA_F_TENTATIVE)) {
				/*
				 * NOTE(review): no in6_ifa_put() on this
				 * path - presumably addrconf_dad_failure()
				 * consumes the ifp reference; confirm.
				 */
				addrconf_dad_failure(skb, ifp);
				return reason;
		}
		/* What should we make now? The advertisement
		   is invalid, but ndisc specs say nothing
		   about it. It could be misconfiguration, or
		   a smart proxy agent tries to help us :-)

		   We should not print the error if NA has been
		   received from loopback - it is just our own
		   unsolicited advertisement.
		 */
		if (skb->pkt_type != PACKET_LOOPBACK)
			ND_PRINTK(1, warn,
				  "NA: %pM advertised our address %pI6c on %s!\n",
				  eth_hdr(skb)->h_source, &ifp->addr, ifp->idev->dev->name);
		in6_ifa_put(ifp);
		return reason;
	}

	neigh = neigh_lookup(&nd_tbl, &msg->target, dev);

	/* RFC 9131 updates original Neighbour Discovery RFC 4861.
* NAs with Target LL Address option without a corresponding
	 * entry in the neighbour cache can now create a STALE neighbour
	 * cache entry on routers.
	 *
	 *   entry accept  fwding  solicited        behaviour
	 * ------- ------  ------  ---------    ----------------------
	 * present      X       X         0     Set state to STALE
	 * present      X       X         1     Set state to REACHABLE
	 *  absent      0       X         X     Do nothing
	 *  absent      1       0         X     Do nothing
	 *  absent      1       1         X     Add a new STALE entry
	 *
	 * Note that we don't do a (daddr == all-routers-mcast) check.
	 */
	new_state = msg->icmph.icmp6_solicited ? NUD_REACHABLE : NUD_STALE;
	if (!neigh && lladdr && idev && READ_ONCE(idev->cnf.forwarding)) {
		if (accept_untracked_na(dev, saddr)) {
			/* may yield an ERR_PTR; checked with IS_ERR() below */
			neigh = neigh_create(&nd_tbl, &msg->target, dev);
			new_state = NUD_STALE;
		}
	}

	if (neigh && !IS_ERR(neigh)) {
		/* snapshot to detect a router -> host transition afterwards */
		u8 old_flags = neigh->flags;
		struct net *net = dev_net(dev);

		if (READ_ONCE(neigh->nud_state) & NUD_FAILED)
			goto out;

		/*
		 * Don't update the neighbor cache entry on a proxy NA from
		 * ourselves because either the proxied node is off link or it
		 * has already sent a NA to us.
		 */
		if (lladdr && !memcmp(lladdr, dev->dev_addr, dev->addr_len) &&
		    READ_ONCE(net->ipv6.devconf_all->forwarding) &&
		    READ_ONCE(net->ipv6.devconf_all->proxy_ndp) &&
		    pneigh_lookup(&nd_tbl, net, &msg->target, dev, 0)) {
			/* XXX: idev->cnf.proxy_ndp */
			goto out;
		}

		ndisc_update(dev, neigh, lladdr,
			     new_state,
			     NEIGH_UPDATE_F_WEAK_OVERRIDE|
			     (msg->icmph.icmp6_override ? NEIGH_UPDATE_F_OVERRIDE : 0)|
			     NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
			     (msg->icmph.icmp6_router ?
NEIGH_UPDATE_F_ISROUTER : 0),
			     NDISC_NEIGHBOUR_ADVERTISEMENT, &ndopts);

		/* NTF_ROUTER was set before the update and is clear now */
		if ((old_flags & ~neigh->flags) & NTF_ROUTER) {
			/*
			 * Change: router to host
			 */
			rt6_clean_tohost(dev_net(dev),  saddr);
		}
		reason = SKB_CONSUMED;
out:
		neigh_release(neigh);
	}
	return reason;
}

/*
 * Handle a received Router Solicitation.
 *
 * Only acted upon when forwarding is enabled on the receiving interface
 * and the source address is not the unspecified address. In that case the
 * neighbour cache entry for the sender is looked up (created if missing)
 * and moved to NUD_STALE with the advertised link-layer address, if any.
 *
 * Returns a drop reason; SKB_CONSUMED when the cache entry was updated.
 */
static enum skb_drop_reason ndisc_recv_rs(struct sk_buff *skb)
{
	struct rs_msg *rs_msg = (struct rs_msg *)skb_transport_header(skb);
	/* computed before the length check below; only used after it passes */
	unsigned long ndoptlen = skb->len - sizeof(*rs_msg);
	struct neighbour *neigh;
	struct inet6_dev *idev;
	const struct in6_addr *saddr = &ipv6_hdr(skb)->saddr;
	struct ndisc_options ndopts;
	u8 *lladdr = NULL;
	SKB_DR(reason);

	if (skb->len < sizeof(*rs_msg))
		return SKB_DROP_REASON_PKT_TOO_SMALL;

	idev = __in6_dev_get(skb->dev);
	if (!idev) {
		ND_PRINTK(1, err, "RS: can't find in6 device\n");
		return reason;
	}

	/* Don't accept RS if we're not in router mode */
	if (!READ_ONCE(idev->cnf.forwarding))
		goto out;

	/*
	 * Don't update NCE if src = ::;
	 * this implies that the source node has no ip address assigned yet.
	 */
	if (ipv6_addr_any(saddr))
		goto out;

	/* Parse ND options */
	if (!ndisc_parse_options(skb->dev, rs_msg->opt, ndoptlen, &ndopts))
		return SKB_DROP_REASON_IPV6_NDISC_BAD_OPTIONS;

	if (ndopts.nd_opts_src_lladdr) {
		lladdr = ndisc_opt_addr_data(ndopts.nd_opts_src_lladdr,
					     skb->dev);
		if (!lladdr)
			goto out;
	}

	neigh = __neigh_lookup(&nd_tbl, saddr, skb->dev, 1);
	if (neigh) {
		ndisc_update(skb->dev, neigh, lladdr, NUD_STALE,
			     NEIGH_UPDATE_F_WEAK_OVERRIDE|
			     NEIGH_UPDATE_F_OVERRIDE|
			     NEIGH_UPDATE_F_OVERRIDE_ISROUTER,
			     NDISC_ROUTER_SOLICITATION, &ndopts);
		neigh_release(neigh);
		reason = SKB_CONSUMED;
	}
out:
	return reason;
}

/*
 * Relay one ND option @opt from the Router Advertisement @ra to userspace
 * as an RTM_NEWNDUSEROPT netlink message on RTNLGRP_ND_USEROPT.
 *
 * The message carries a struct nduseroptmsg followed by the raw option
 * (opt->nd_opt_len << 3 bytes) and an NDUSEROPT_SRCADDR attribute holding
 * the RA's IPv6 source address. On failure the error is recorded with
 * rtnl_set_sk_err() instead of being returned.
 */
static void ndisc_ra_useropt(struct sk_buff *ra, struct nd_opt_hdr *opt)
{
	struct icmp6hdr *icmp6h = (struct icmp6hdr *)skb_transport_header(ra);
	struct sk_buff *skb;
	struct nlmsghdr *nlh;
	struct nduseroptmsg *ndmsg;
	struct net *net = dev_net(ra->dev);
	int err;
	/* header struct plus the option itself (length is in 8-byte units) */
	int base_size = NLMSG_ALIGN(sizeof(struct nduseroptmsg)
				    + (opt->nd_opt_len << 3));
	size_t msg_size = base_size +
nla_total_size(sizeof(struct in6_addr));

	skb = nlmsg_new(msg_size, GFP_ATOMIC);
	if (!skb) {
		err = -ENOBUFS;
		goto errout;
	}

	nlh = nlmsg_put(skb, 0, 0, RTM_NEWNDUSEROPT, base_size, 0);
	if (!nlh) {
		goto nla_put_failure;
	}

	ndmsg = nlmsg_data(nlh);
	ndmsg->nduseropt_family = AF_INET6;
	ndmsg->nduseropt_ifindex = ra->dev->ifindex;
	ndmsg->nduseropt_icmp_type = icmp6h->icmp6_type;
	ndmsg->nduseropt_icmp_code = icmp6h->icmp6_code;
	ndmsg->nduseropt_opts_len = opt->nd_opt_len << 3;

	/* the raw option is copied right behind the nduseroptmsg header */
	memcpy(ndmsg + 1, opt, opt->nd_opt_len << 3);

	if (nla_put_in6_addr(skb, NDUSEROPT_SRCADDR, &ipv6_hdr(ra)->saddr))
		goto nla_put_failure;
	nlmsg_end(skb, nlh);

	rtnl_notify(skb, net, 0, RTNLGRP_ND_USEROPT, NULL, GFP_ATOMIC);
	return;

nla_put_failure:
	nlmsg_free(skb);
	err = -EMSGSIZE;
errout:
	rtnl_set_sk_err(net, RTNLGRP_ND_USEROPT, err);
}

/*
 * Handle a received Router Advertisement.
 *
 * After validation the function works through, in order: the
 * managed/other-config flags, the default router entry and its lifetime,
 * the hop limit, the retransmit and reachable timers, the neighbour cache
 * entry of the advertising router, and then the route-information, prefix,
 * MTU and user options. Each stage is gated by the matching accept_ra_*
 * setting of the receiving interface, and the skip_* / out labels bypass
 * the stages that do not apply.
 *
 * Returns a drop reason; SKB_CONSUMED once the router's neighbour entry
 * has been updated.
 */
static enum skb_drop_reason ndisc_router_discovery(struct sk_buff *skb)
{
	struct ra_msg *ra_msg = (struct ra_msg *)skb_transport_header(skb);
	bool send_ifinfo_notify = false;
	struct neighbour *neigh = NULL;
	struct ndisc_options ndopts;
	struct fib6_info *rt = NULL;
	struct inet6_dev *in6_dev;
	struct fib6_table *table;
	u32 defrtr_usr_metric;
	unsigned int pref = 0;
	__u32 old_if_flags;
	struct net *net;
	SKB_DR(reason);
	int lifetime;
	int optlen;

	/* options start right behind the fixed RA header */
	__u8 *opt = (__u8 *)(ra_msg + 1);

	/* negative when the packet is shorter than a struct ra_msg */
	optlen = (skb_tail_pointer(skb) - skb_transport_header(skb)) -
		sizeof(struct ra_msg);

	ND_PRINTK(2, info,
		  "RA: %s, dev: %s\n",
		  __func__, skb->dev->name);
	if (!(ipv6_addr_type(&ipv6_hdr(skb)->saddr) & IPV6_ADDR_LINKLOCAL)) {
		ND_PRINTK(2, warn, "RA: source address is not link-local\n");
		return reason;
	}
	if (optlen < 0)
		return SKB_DROP_REASON_PKT_TOO_SMALL;

#ifdef CONFIG_IPV6_NDISC_NODETYPE
	if (skb->ndisc_nodetype == NDISC_NODETYPE_HOST) {
		ND_PRINTK(2, warn, "RA: from host or unauthorized router\n");
		return reason;
	}
#endif

	in6_dev = __in6_dev_get(skb->dev);
	if (!in6_dev) {
		ND_PRINTK(0, err, "RA: can't find inet6 device for %s\n",
			  skb->dev->name);
		return reason;
	}

	if (!ndisc_parse_options(skb->dev, opt,
optlen, &ndopts))
		return SKB_DROP_REASON_IPV6_NDISC_BAD_OPTIONS;

	/* RA not accepted: only the neighbour entry update further down runs */
	if (!ipv6_accept_ra(in6_dev)) {
		ND_PRINTK(2, info,
			  "RA: %s, did not accept ra for dev: %s\n",
			  __func__, skb->dev->name);
		goto skip_linkparms;
	}

#ifdef CONFIG_IPV6_NDISC_NODETYPE
	/* skip link-specific parameters from interior routers */
	if (skb->ndisc_nodetype == NDISC_NODETYPE_NODEFAULT) {
		ND_PRINTK(2, info,
			  "RA: %s, nodetype is NODEFAULT, dev: %s\n",
			  __func__, skb->dev->name);
		goto skip_linkparms;
	}
#endif

	if (in6_dev->if_flags & IF_RS_SENT) {
		/*
		 *	flag that an RA was received after an RS was sent
		 *	out on this interface.
		 */
		in6_dev->if_flags |= IF_RA_RCVD;
	}

	/*
	 * Remember the managed/otherconf flags from most recently
	 * received RA message (RFC 2462) -- yoshfuji
	 */
	old_if_flags = in6_dev->if_flags;
	in6_dev->if_flags = (in6_dev->if_flags & ~(IF_RA_MANAGED |
				IF_RA_OTHERCONF)) |
				(ra_msg->icmph.icmp6_addrconf_managed ?
					IF_RA_MANAGED : 0) |
				(ra_msg->icmph.icmp6_addrconf_other ?
					IF_RA_OTHERCONF : 0);

	/* userspace is notified at "out" when the flags actually changed */
	if (old_if_flags != in6_dev->if_flags)
		send_ifinfo_notify = true;

	if (!READ_ONCE(in6_dev->cnf.accept_ra_defrtr)) {
		ND_PRINTK(2, info,
			  "RA: %s, defrtr is false for dev: %s\n",
			  __func__, skb->dev->name);
		goto skip_defrtr;
	}

	/* a lifetime of 0 is never "too short": it is handled further down */
	lifetime = ntohs(ra_msg->icmph.icmp6_rt_lifetime);
	if (lifetime != 0 &&
	    lifetime < READ_ONCE(in6_dev->cnf.accept_ra_min_lft)) {
		ND_PRINTK(2, info,
			  "RA: router lifetime (%ds) is too short: %s\n",
			  lifetime, skb->dev->name);
		goto skip_defrtr;
	}

	/* Do not accept RA with source-addr found on local machine unless
	 * accept_ra_from_local is set to true.
*/
	net = dev_net(in6_dev->dev);
	if (!READ_ONCE(in6_dev->cnf.accept_ra_from_local) &&
	    ipv6_chk_addr(net, &ipv6_hdr(skb)->saddr, in6_dev->dev, 0)) {
		ND_PRINTK(2, info,
			  "RA from local address detected on dev: %s: default router ignored\n",
			  skb->dev->name);
		goto skip_defrtr;
	}

#ifdef CONFIG_IPV6_ROUTER_PREF
	pref = ra_msg->icmph.icmp6_router_pref;
	/* 10b is handled as if it were 00b (medium) */
	if (pref == ICMPV6_ROUTER_PREF_INVALID ||
	    !READ_ONCE(in6_dev->cnf.accept_ra_rtr_pref))
		pref = ICMPV6_ROUTER_PREF_MEDIUM;
#endif
	/* routes added from RAs do not use nexthop objects */
	rt = rt6_get_dflt_router(net, &ipv6_hdr(skb)->saddr, skb->dev);
	if (rt) {
		neigh = ip6_neigh_lookup(&rt->fib6_nh->fib_nh_gw6,
					 rt->fib6_nh->fib_nh_dev, NULL,
					  &ipv6_hdr(skb)->saddr);
		if (!neigh) {
			ND_PRINTK(0, err,
				  "RA: %s got default router without neighbour\n",
				  __func__);
			fib6_info_release(rt);
			return reason;
		}
	}
	/* Set default route metric as specified by user */
	defrtr_usr_metric = in6_dev->cnf.ra_defrtr_metric;
	/* delete the route if lifetime is 0 or if metric needs change */
	if (rt && (lifetime == 0 || rt->fib6_metric != defrtr_usr_metric)) {
		/*
		 * NOTE(review): rt is cleared without fib6_info_release() -
		 * presumably ip6_del_rt() consumes the reference; confirm.
		 */
		ip6_del_rt(net, rt, false);
		rt = NULL;
	}

	ND_PRINTK(3, info, "RA: rt: %p  lifetime: %d, metric: %d, for dev: %s\n",
		  rt, lifetime, defrtr_usr_metric, skb->dev->name);
	if (!rt && lifetime) {
		ND_PRINTK(3, info, "RA: adding default router\n");

		/* drop the neighbour of a route that was just deleted above */
		if (neigh)
			neigh_release(neigh);

		rt = rt6_add_dflt_router(net, &ipv6_hdr(skb)->saddr,
					 skb->dev, pref, defrtr_usr_metric,
					 lifetime);
		if (!rt) {
			ND_PRINTK(0, err,
				  "RA: %s failed to add default route\n",
				  __func__);
			return reason;
		}

		neigh = ip6_neigh_lookup(&rt->fib6_nh->fib_nh_gw6,
					 rt->fib6_nh->fib_nh_dev, NULL,
					  &ipv6_hdr(skb)->saddr);
		if (!neigh) {
			ND_PRINTK(0, err,
				  "RA: %s got default router without neighbour\n",
				  __func__);
			fib6_info_release(rt);
			return reason;
		}
		neigh->flags |= NTF_ROUTER;
	} else if (rt && IPV6_EXTRACT_PREF(rt->fib6_flags) != pref) {
		/* existing route kept, only its preference changed */
		struct nl_info nlinfo = {
			.nl_net = net,
		};
		rt->fib6_flags = (rt->fib6_flags
& ~RTF_PREF_MASK) | RTF_PREF(pref);
		inet6_rt_notify(RTM_NEWROUTE, rt, &nlinfo, NLM_F_REPLACE);
	}

	if (rt) {
		/* (re)arm the route's expiry under the table lock */
		table = rt->fib6_table;
		spin_lock_bh(&table->tb6_lock);

		fib6_set_expires(rt, jiffies + (HZ * lifetime));
		fib6_add_gc_list(rt);

		spin_unlock_bh(&table->tb6_lock);
	}
	/* a minimum of 256 or more disables hop-limit updates altogether */
	if (READ_ONCE(in6_dev->cnf.accept_ra_min_hop_limit) < 256 &&
	    ra_msg->icmph.icmp6_hop_limit) {
		if (READ_ONCE(in6_dev->cnf.accept_ra_min_hop_limit) <=
		    ra_msg->icmph.icmp6_hop_limit) {
			WRITE_ONCE(in6_dev->cnf.hop_limit,
				   ra_msg->icmph.icmp6_hop_limit);
			/* rt may be NULL here */
			fib6_metric_set(rt, RTAX_HOPLIMIT,
					ra_msg->icmph.icmp6_hop_limit);
		} else {
			ND_PRINTK(2, warn, "RA: Got route advertisement with lower hop_limit than minimum\n");
		}
	}

skip_defrtr:

	/*
	 *	Update Reachable Time and Retrans Timer
	 */

	if (in6_dev->nd_parms) {
		/* the advertised value is divided by 1000 to get seconds below */
		unsigned long rtime = ntohl(ra_msg->retrans_timer);

		/* 0 means "unspecified"; the upper bound keeps rtime*HZ in range */
		if (rtime && rtime/1000 < MAX_SCHEDULE_TIMEOUT/HZ) {
			rtime = (rtime*HZ)/1000;
			/* clamp to a floor of HZ/100 jiffies */
			if (rtime < HZ/100)
				rtime = HZ/100;
			NEIGH_VAR_SET(in6_dev->nd_parms, RETRANS_TIME, rtime);
			in6_dev->tstamp = jiffies;
			send_ifinfo_notify = true;
		}

		rtime = ntohl(ra_msg->reachable_time);
		if (rtime && rtime/1000 < MAX_SCHEDULE_TIMEOUT/(3*HZ)) {
			rtime = (rtime*HZ)/1000;

			/* clamp to a floor of HZ/10 jiffies */
			if (rtime < HZ/10)
				rtime = HZ/10;

			if (rtime != NEIGH_VAR(in6_dev->nd_parms, BASE_REACHABLE_TIME)) {
				NEIGH_VAR_SET(in6_dev->nd_parms,
					      BASE_REACHABLE_TIME, rtime);
				NEIGH_VAR_SET(in6_dev->nd_parms,
					      GC_STALETIME, 3 * rtime);
				in6_dev->nd_parms->reachable_time = neigh_rand_reach_time(rtime);
				in6_dev->tstamp = jiffies;
				send_ifinfo_notify = true;
			}
		}
	}

skip_linkparms:

	/*
	 *	Process options.
*/

	/* no neighbour came with a default route: look up / create by source */
	if (!neigh)
		neigh = __neigh_lookup(&nd_tbl, &ipv6_hdr(skb)->saddr,
				       skb->dev, 1);
	if (neigh) {
		u8 *lladdr = NULL;
		if (ndopts.nd_opts_src_lladdr) {
			lladdr = ndisc_opt_addr_data(ndopts.nd_opts_src_lladdr,
						     skb->dev);
			if (!lladdr) {
				ND_PRINTK(2, warn,
					  "RA: invalid link-layer address length\n");
				goto out;
			}
		}
		ndisc_update(skb->dev, neigh, lladdr, NUD_STALE,
			     NEIGH_UPDATE_F_WEAK_OVERRIDE|
			     NEIGH_UPDATE_F_OVERRIDE|
			     NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
			     NEIGH_UPDATE_F_ISROUTER,
			     NDISC_ROUTER_ADVERTISEMENT, &ndopts);
		reason = SKB_CONSUMED;
	}

	/* everything below applies only when RAs are accepted on this device */
	if (!ipv6_accept_ra(in6_dev)) {
		ND_PRINTK(2, info,
			  "RA: %s, accept_ra is false for dev: %s\n",
			  __func__, skb->dev->name);
		goto out;
	}

#ifdef CONFIG_IPV6_ROUTE_INFO
	if (!READ_ONCE(in6_dev->cnf.accept_ra_from_local) &&
	    ipv6_chk_addr(dev_net(in6_dev->dev), &ipv6_hdr(skb)->saddr,
			  in6_dev->dev, 0)) {
		ND_PRINTK(2, info,
			  "RA from local address detected on dev: %s: router info ignored.\n",
			  skb->dev->name);
		goto skip_routeinfo;
	}

	if (READ_ONCE(in6_dev->cnf.accept_ra_rtr_pref) && ndopts.nd_opts_ri) {
		struct nd_opt_hdr *p;
		/* walk every route-information option in the RA */
		for (p = ndopts.nd_opts_ri;
		     p;
		     p = ndisc_next_option(p, ndopts.nd_opts_ri_end)) {
			struct route_info *ri = (struct route_info *)p;
#ifdef CONFIG_IPV6_NDISC_NODETYPE
			if (skb->ndisc_nodetype == NDISC_NODETYPE_NODEFAULT &&
			    ri->prefix_len == 0)
				continue;
#endif
			/* a zero-length prefix is a default route */
			if (ri->prefix_len == 0 &&
			    !READ_ONCE(in6_dev->cnf.accept_ra_defrtr))
				continue;
			if (ri->lifetime != 0 &&
			    ntohl(ri->lifetime) < READ_ONCE(in6_dev->cnf.accept_ra_min_lft))
				continue;
			if (ri->prefix_len < READ_ONCE(in6_dev->cnf.accept_ra_rt_info_min_plen))
				continue;
			if (ri->prefix_len > READ_ONCE(in6_dev->cnf.accept_ra_rt_info_max_plen))
				continue;
			rt6_route_rcv(skb->dev, (u8 *)p, (p->nd_opt_len) << 3,
				      &ipv6_hdr(skb)->saddr);
		}
	}

skip_routeinfo:
#endif

#ifdef CONFIG_IPV6_NDISC_NODETYPE
	/* skip link-specific ndopts from interior routers */
	if (skb->ndisc_nodetype == NDISC_NODETYPE_NODEFAULT) {
		ND_PRINTK(2, info,
			  "RA: %s, nodetype is NODEFAULT (interior routes), dev: %s\n",
			  __func__,
skb->dev->name);
		goto out;
	}
#endif

	if (READ_ONCE(in6_dev->cnf.accept_ra_pinfo) && ndopts.nd_opts_pi) {
		struct nd_opt_hdr *p;
		/* hand every prefix-information option to addrconf */
		for (p = ndopts.nd_opts_pi;
		     p;
		     p = ndisc_next_option(p, ndopts.nd_opts_pi_end)) {
			addrconf_prefix_rcv(skb->dev, (u8 *)p,
					    (p->nd_opt_len) << 3,
					    ndopts.nd_opts_src_lladdr != NULL);
		}
	}

	if (ndopts.nd_opts_mtu && READ_ONCE(in6_dev->cnf.accept_ra_mtu)) {
		__be32 n;
		u32 mtu;

		/* the MTU value sits 2 bytes past the 2-byte option header */
		memcpy(&n, ((u8 *)(ndopts.nd_opts_mtu+1))+2, sizeof(mtu));
		mtu = ntohl(n);

		/* ra_mtu records what was advertised, even if it is rejected below */
		if (in6_dev->ra_mtu != mtu) {
			in6_dev->ra_mtu = mtu;
			send_ifinfo_notify = true;
		}

		if (mtu < IPV6_MIN_MTU || mtu > skb->dev->mtu) {
			ND_PRINTK(2, warn, "RA: invalid mtu: %d\n", mtu);
		} else if (READ_ONCE(in6_dev->cnf.mtu6) != mtu) {
			WRITE_ONCE(in6_dev->cnf.mtu6, mtu);
			/* rt may be NULL here */
			fib6_metric_set(rt, RTAX_MTU, mtu);
			rt6_mtu_change(skb->dev, mtu);
		}
	}

	if (ndopts.nd_useropts) {
		struct nd_opt_hdr *p;
		/* relay every user option to userspace over netlink */
		for (p = ndopts.nd_useropts;
		     p;
		     p = ndisc_next_useropt(skb->dev, p,
					    ndopts.nd_useropts_end)) {
			ndisc_ra_useropt(skb, p);
		}
	}

	/* these options do not belong in an RA; only a warning is logged */
	if (ndopts.nd_opts_tgt_lladdr || ndopts.nd_opts_rh) {
		ND_PRINTK(2, warn, "RA: invalid RA options\n");
	}
out:
	/* Send a notify if RA changed managed/otherconf flags or
	 * timer settings or ra_mtu value
	 */
	if (send_ifinfo_notify)
		inet6_ifinfo_notify(RTM_NEWLINK, in6_dev);

	/* rt may be NULL here */
	fib6_info_release(rt);
	if (neigh)
		neigh_release(neigh);
	return reason;
}

/*
 * Handle a received Redirect message.
 *
 * The source must be a link-local address and the options must parse. A
 * redirect without a redirected-header option is handed to
 * ip6_redirect_no_header(); otherwise the skb is pulled up to the embedded
 * packet and passed to icmpv6_notify().
 */
static enum skb_drop_reason ndisc_redirect_rcv(struct sk_buff *skb)
{
	struct rd_msg *msg = (struct rd_msg *)skb_transport_header(skb);
	/* number of bytes between the start of the options and the skb tail */
	u32 ndoptlen = skb_tail_pointer(skb) - (skb_transport_header(skb) +
				    offsetof(struct rd_msg, opt));
	struct ndisc_options ndopts;
	SKB_DR(reason);
	u8 *hdr;

#ifdef CONFIG_IPV6_NDISC_NODETYPE
	switch (skb->ndisc_nodetype) {
	case NDISC_NODETYPE_HOST:
	case NDISC_NODETYPE_NODEFAULT:
		ND_PRINTK(2, warn,
			  "Redirect: from host or unauthorized router\n");
		return reason;
	}
#endif

	if (!(ipv6_addr_type(&ipv6_hdr(skb)->saddr) & IPV6_ADDR_LINKLOCAL)) {
		ND_PRINTK(2, warn,
			  "Redirect: source address is not link-local\n");
		return reason;
	}

	if
(!ndisc_parse_options(skb->dev, msg->opt, ndoptlen, &ndopts)) return SKB_DROP_REASON_IPV6_NDISC_BAD_OPTIONS; if (!ndopts.nd_opts_rh) { ip6_redirect_no_header(skb, dev_net(skb->dev), skb->dev->ifindex); return reason; } hdr = (u8 *)ndopts.nd_opts_rh; hdr += 8; if (!pskb_pull(skb, hdr - skb_transport_header(skb))) return SKB_DROP_REASON_PKT_TOO_SMALL; return icmpv6_notify(skb, NDISC_REDIRECT, 0, 0); } static void ndisc_fill_redirect_hdr_option(struct sk_buff *skb, struct sk_buff *orig_skb, int rd_len) { u8 *opt = skb_put(skb, rd_len); memset(opt, 0, 8); *(opt++) = ND_OPT_REDIRECT_HDR; *(opt++) = (rd_len >> 3); opt += 6; skb_copy_bits(orig_skb, skb_network_offset(orig_skb), opt, rd_len - 8); } void ndisc_send_redirect(struct sk_buff *skb, const struct in6_addr *target) { struct net_device *dev = skb->dev; struct net *net = dev_net(dev); struct sock *sk = net->ipv6.ndisc_sk; int optlen = 0; struct inet_peer *peer; struct sk_buff *buff; struct rd_msg *msg; struct in6_addr saddr_buf; struct rt6_info *rt; struct dst_entry *dst; struct flowi6 fl6; int rd_len; u8 ha_buf[MAX_ADDR_LEN], *ha = NULL, ops_data_buf[NDISC_OPS_REDIRECT_DATA_SPACE], *ops_data = NULL; bool ret; if (netif_is_l3_master(skb->dev)) { dev = __dev_get_by_index(dev_net(skb->dev), IPCB(skb)->iif); if (!dev) return; } if (ipv6_get_lladdr(dev, &saddr_buf, IFA_F_TENTATIVE)) { ND_PRINTK(2, warn, "Redirect: no link-local address on %s\n", dev->name); return; } if (!ipv6_addr_equal(&ipv6_hdr(skb)->daddr, target) && ipv6_addr_type(target) != (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) { ND_PRINTK(2, warn, "Redirect: target address is not link-local unicast\n"); return; } icmpv6_flow_init(sk, &fl6, NDISC_REDIRECT, &saddr_buf, &ipv6_hdr(skb)->saddr, dev->ifindex); dst = ip6_route_output(net, NULL, &fl6); if (dst->error) { dst_release(dst); return; } dst = xfrm_lookup(net, dst, flowi6_to_flowi(&fl6), NULL, 0); if (IS_ERR(dst)) return; rt = dst_rt6_info(dst); if (rt->rt6i_flags & RTF_GATEWAY) { ND_PRINTK(2, warn, 
"Redirect: destination is not a neighbour\n"); goto release; } rcu_read_lock(); peer = inet_getpeer_v6(net->ipv6.peers, &ipv6_hdr(skb)->saddr); ret = inet_peer_xrlim_allow(peer, 1*HZ); rcu_read_unlock(); if (!ret) goto release; if (dev->addr_len) { struct neighbour *neigh = dst_neigh_lookup(skb_dst(skb), target); if (!neigh) { ND_PRINTK(2, warn, "Redirect: no neigh for target address\n"); goto release; } read_lock_bh(&neigh->lock); if (neigh->nud_state & NUD_VALID) { memcpy(ha_buf, neigh->ha, dev->addr_len); read_unlock_bh(&neigh->lock); ha = ha_buf; optlen += ndisc_redirect_opt_addr_space(dev, neigh, ops_data_buf, &ops_data); } else read_unlock_bh(&neigh->lock); neigh_release(neigh); } rd_len = min_t(unsigned int, IPV6_MIN_MTU - sizeof(struct ipv6hdr) - sizeof(*msg) - optlen, skb->len + 8); rd_len &= ~0x7; optlen += rd_len; buff = ndisc_alloc_skb(dev, sizeof(*msg) + optlen); if (!buff) goto release; msg = skb_put(buff, sizeof(*msg)); *msg = (struct rd_msg) { .icmph = { .icmp6_type = NDISC_REDIRECT, }, .target = *target, .dest = ipv6_hdr(skb)->daddr, }; /* * include target_address option */ if (ha) ndisc_fill_redirect_addr_option(buff, ha, ops_data); /* * build redirect option and copy skb over to the new packet. */ if (rd_len) ndisc_fill_redirect_hdr_option(buff, skb, rd_len); skb_dst_set(buff, dst); ndisc_send_skb(buff, &ipv6_hdr(skb)->saddr, &saddr_buf); return; release: dst_release(dst); } static void pndisc_redo(struct sk_buff *skb) { enum skb_drop_reason reason = ndisc_recv_ns(skb); kfree_skb_reason(skb, reason); } static int ndisc_is_multicast(const void *pkey) { return ipv6_addr_is_multicast((struct in6_addr *)pkey); } static bool ndisc_suppress_frag_ndisc(struct sk_buff *skb) { struct inet6_dev *idev = __in6_dev_get(skb->dev); if (!idev) return true; if (IP6CB(skb)->flags & IP6SKB_FRAGMENTED && READ_ONCE(idev->cnf.suppress_frag_ndisc)) { net_warn_ratelimited("Received fragmented ndisc packet. 
Carefully consider disabling suppress_frag_ndisc.\n"); return true; } return false; } enum skb_drop_reason ndisc_rcv(struct sk_buff *skb) { struct nd_msg *msg; SKB_DR(reason); if (ndisc_suppress_frag_ndisc(skb)) return SKB_DROP_REASON_IPV6_NDISC_FRAG; if (skb_linearize(skb)) return SKB_DROP_REASON_NOMEM; msg = (struct nd_msg *)skb_transport_header(skb); __skb_push(skb, skb->data - skb_transport_header(skb)); if (ipv6_hdr(skb)->hop_limit != 255) { ND_PRINTK(2, warn, "NDISC: invalid hop-limit: %d\n", ipv6_hdr(skb)->hop_limit); return SKB_DROP_REASON_IPV6_NDISC_HOP_LIMIT; } if (msg->icmph.icmp6_code != 0) { ND_PRINTK(2, warn, "NDISC: invalid ICMPv6 code: %d\n", msg->icmph.icmp6_code); return SKB_DROP_REASON_IPV6_NDISC_BAD_CODE; } switch (msg->icmph.icmp6_type) { case NDISC_NEIGHBOUR_SOLICITATION: memset(NEIGH_CB(skb), 0, sizeof(struct neighbour_cb)); reason = ndisc_recv_ns(skb); break; case NDISC_NEIGHBOUR_ADVERTISEMENT: reason = ndisc_recv_na(skb); break; case NDISC_ROUTER_SOLICITATION: reason = ndisc_recv_rs(skb); break; case NDISC_ROUTER_ADVERTISEMENT: reason = ndisc_router_discovery(skb); break; case NDISC_REDIRECT: reason = ndisc_redirect_rcv(skb); break; } return reason; } static int ndisc_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct netdev_notifier_change_info *change_info; struct net *net = dev_net(dev); struct inet6_dev *idev; bool evict_nocarrier; switch (event) { case NETDEV_CHANGEADDR: neigh_changeaddr(&nd_tbl, dev); fib6_run_gc(0, net, false); fallthrough; case NETDEV_UP: idev = in6_dev_get(dev); if (!idev) break; if (READ_ONCE(idev->cnf.ndisc_notify) || READ_ONCE(net->ipv6.devconf_all->ndisc_notify)) ndisc_send_unsol_na(dev); in6_dev_put(idev); break; case NETDEV_CHANGE: idev = in6_dev_get(dev); if (!idev) evict_nocarrier = true; else { evict_nocarrier = READ_ONCE(idev->cnf.ndisc_evict_nocarrier) && READ_ONCE(net->ipv6.devconf_all->ndisc_evict_nocarrier); 
in6_dev_put(idev); } change_info = ptr; if (change_info->flags_changed & IFF_NOARP) neigh_changeaddr(&nd_tbl, dev); if (evict_nocarrier && !netif_carrier_ok(dev)) neigh_carrier_down(&nd_tbl, dev); break; case NETDEV_DOWN: neigh_ifdown(&nd_tbl, dev); fib6_run_gc(0, net, false); break; case NETDEV_NOTIFY_PEERS: ndisc_send_unsol_na(dev); break; default: break; } return NOTIFY_DONE; } static struct notifier_block ndisc_netdev_notifier = { .notifier_call = ndisc_netdev_event, .priority = ADDRCONF_NOTIFY_PRIORITY - 5, }; #ifdef CONFIG_SYSCTL static void ndisc_warn_deprecated_sysctl(const struct ctl_table *ctl, const char *func, const char *dev_name) { static char warncomm[TASK_COMM_LEN]; static int warned; if (strcmp(warncomm, current->comm) && warned < 5) { strscpy(warncomm, current->comm); pr_warn("process `%s' is using deprecated sysctl (%s) net.ipv6.neigh.%s.%s - use net.ipv6.neigh.%s.%s_ms instead\n", warncomm, func, dev_name, ctl->procname, dev_name, ctl->procname); warned++; } } int ndisc_ifinfo_sysctl_change(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct net_device *dev = ctl->extra1; struct inet6_dev *idev; int ret; if ((strcmp(ctl->procname, "retrans_time") == 0) || (strcmp(ctl->procname, "base_reachable_time") == 0)) ndisc_warn_deprecated_sysctl(ctl, "syscall", dev ? 
dev->name : "default"); if (strcmp(ctl->procname, "retrans_time") == 0) ret = neigh_proc_dointvec(ctl, write, buffer, lenp, ppos); else if (strcmp(ctl->procname, "base_reachable_time") == 0) ret = neigh_proc_dointvec_jiffies(ctl, write, buffer, lenp, ppos); else if ((strcmp(ctl->procname, "retrans_time_ms") == 0) || (strcmp(ctl->procname, "base_reachable_time_ms") == 0)) ret = neigh_proc_dointvec_ms_jiffies(ctl, write, buffer, lenp, ppos); else ret = -1; if (write && ret == 0 && dev && (idev = in6_dev_get(dev)) != NULL) { if (ctl->data == &NEIGH_VAR(idev->nd_parms, BASE_REACHABLE_TIME)) idev->nd_parms->reachable_time = neigh_rand_reach_time(NEIGH_VAR(idev->nd_parms, BASE_REACHABLE_TIME)); WRITE_ONCE(idev->tstamp, jiffies); inet6_ifinfo_notify(RTM_NEWLINK, idev); in6_dev_put(idev); } return ret; } #endif static int __net_init ndisc_net_init(struct net *net) { struct ipv6_pinfo *np; struct sock *sk; int err; err = inet_ctl_sock_create(&sk, PF_INET6, SOCK_RAW, IPPROTO_ICMPV6, net); if (err < 0) { ND_PRINTK(0, err, "NDISC: Failed to initialize the control socket (err %d)\n", err); return err; } net->ipv6.ndisc_sk = sk; np = inet6_sk(sk); np->hop_limit = 255; /* Do not loopback ndisc messages */ inet6_clear_bit(MC6_LOOP, sk); return 0; } static void __net_exit ndisc_net_exit(struct net *net) { inet_ctl_sock_destroy(net->ipv6.ndisc_sk); } static struct pernet_operations ndisc_net_ops = { .init = ndisc_net_init, .exit = ndisc_net_exit, }; int __init ndisc_init(void) { int err; err = register_pernet_subsys(&ndisc_net_ops); if (err) return err; /* * Initialize the neighbour table */ neigh_table_init(NEIGH_ND_TABLE, &nd_tbl); #ifdef CONFIG_SYSCTL err = neigh_sysctl_register(NULL, &nd_tbl.parms, ndisc_ifinfo_sysctl_change); if (err) goto out_unregister_pernet; out: #endif return err; #ifdef CONFIG_SYSCTL out_unregister_pernet: unregister_pernet_subsys(&ndisc_net_ops); goto out; #endif } int __init ndisc_late_init(void) { return 
register_netdevice_notifier(&ndisc_netdev_notifier); } void ndisc_late_cleanup(void) { unregister_netdevice_notifier(&ndisc_netdev_notifier); } void ndisc_cleanup(void) { #ifdef CONFIG_SYSCTL neigh_sysctl_unregister(&nd_tbl.parms); #endif neigh_table_clear(NEIGH_ND_TABLE, &nd_tbl); unregister_pernet_subsys(&ndisc_net_ops); }
3 74 13 77 132 130 88 130 5 98 5 5 36 35 100 98 98 2 74 3 3 2 31 68 115 112 49 63 8 27 35 69 64 51 43 50 68 61 3 2 65 7 5 56 53 56 55 55 69 69 69 68 6 49 14 14 62 5 54 44 56 27 40 44 13 39 15 53 2 51 24 13 11 24 69 1 68 72 71 71 71 68 66 6 26 26 50 1 61 41 1 31 1 69 68 6 77 74 75 78 78 71 16 50 8 84 79 77 31 5 49 49 9 40 50 89 2 7 1 80 8 89 1 8 80 79 82 1 2 2 1 33 2 30 29 2 62 21 1 38 2 57 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 
429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 
929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 
1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 // SPDX-License-Identifier: GPL-2.0 /* * This file contains the procedures for the handling of select and poll * * Created for Linux based loosely upon Mathius Lattner's minix * patches by Peter MacDonald. Heavily edited by Linus. * * 4 February 1994 * COFF/ELF binary emulation. If the process has the STICKY_TIMEOUTS * flag set in its personality we do *not* modify the given timeout * parameter to reflect time remaining. * * 24 January 2000 * Changed sys_poll()/do_poll() to use PAGE_SIZE chunk-based allocation * of fds to overcome nfds < 16390 descriptors limit (Tigran Aivazian). */ #include <linux/compat.h> #include <linux/kernel.h> #include <linux/sched/signal.h> #include <linux/sched/rt.h> #include <linux/syscalls.h> #include <linux/export.h> #include <linux/slab.h> #include <linux/poll.h> #include <linux/personality.h> /* for STICKY_TIMEOUTS */ #include <linux/file.h> #include <linux/fdtable.h> #include <linux/fs.h> #include <linux/rcupdate.h> #include <linux/hrtimer.h> #include <linux/freezer.h> #include <net/busy_poll.h> #include <linux/vmalloc.h> #include <linux/uaccess.h> /* * Estimate expected accuracy in ns from a timeval. * * After quite a bit of churning around, we've settled on * a simple thing of taking 0.1% of the timeout as the * slack, with a cap of 100 msec. * "nice" tasks get a 0.5% slack instead. * * Consider this comment an open invitation to come up with even * better solutions.. 
*/ #define MAX_SLACK (100 * NSEC_PER_MSEC) static long __estimate_accuracy(struct timespec64 *tv) { long slack; int divfactor = 1000; if (tv->tv_sec < 0) return 0; if (task_nice(current) > 0) divfactor = divfactor / 5; if (tv->tv_sec > MAX_SLACK / (NSEC_PER_SEC/divfactor)) return MAX_SLACK; slack = tv->tv_nsec / divfactor; slack += tv->tv_sec * (NSEC_PER_SEC/divfactor); if (slack > MAX_SLACK) return MAX_SLACK; return slack; } u64 select_estimate_accuracy(struct timespec64 *tv) { u64 ret; struct timespec64 now; u64 slack = current->timer_slack_ns; if (slack == 0) return 0; ktime_get_ts64(&now); now = timespec64_sub(*tv, now); ret = __estimate_accuracy(&now); if (ret < slack) return slack; return ret; } struct poll_table_page { struct poll_table_page * next; struct poll_table_entry * entry; struct poll_table_entry entries[]; }; #define POLL_TABLE_FULL(table) \ ((unsigned long)((table)->entry+1) > PAGE_SIZE + (unsigned long)(table)) /* * Ok, Peter made a complicated, but straightforward multiple_wait() function. * I have rewritten this, taking some shortcuts: This code may not be easy to * follow, but it should be free of race-conditions, and it's practical. If you * understand what I'm doing here, then you understand how the linux * sleep/wakeup mechanism works. * * Two very simple procedures, poll_wait() and poll_freewait() make all the * work. poll_wait() is an inline-function defined in <linux/poll.h>, * as all select/poll functions have to call it to add an entry to the * poll table. 
*/ static void __pollwait(struct file *filp, wait_queue_head_t *wait_address, poll_table *p); void poll_initwait(struct poll_wqueues *pwq) { init_poll_funcptr(&pwq->pt, __pollwait); pwq->polling_task = current; pwq->triggered = 0; pwq->error = 0; pwq->table = NULL; pwq->inline_index = 0; } EXPORT_SYMBOL(poll_initwait); static void free_poll_entry(struct poll_table_entry *entry) { remove_wait_queue(entry->wait_address, &entry->wait); fput(entry->filp); } void poll_freewait(struct poll_wqueues *pwq) { struct poll_table_page * p = pwq->table; int i; for (i = 0; i < pwq->inline_index; i++) free_poll_entry(pwq->inline_entries + i); while (p) { struct poll_table_entry * entry; struct poll_table_page *old; entry = p->entry; do { entry--; free_poll_entry(entry); } while (entry > p->entries); old = p; p = p->next; free_page((unsigned long) old); } } EXPORT_SYMBOL(poll_freewait); static struct poll_table_entry *poll_get_entry(struct poll_wqueues *p) { struct poll_table_page *table = p->table; if (p->inline_index < N_INLINE_POLL_ENTRIES) return p->inline_entries + p->inline_index++; if (!table || POLL_TABLE_FULL(table)) { struct poll_table_page *new_table; new_table = (struct poll_table_page *) __get_free_page(GFP_KERNEL); if (!new_table) { p->error = -ENOMEM; return NULL; } new_table->entry = new_table->entries; new_table->next = table; p->table = new_table; table = new_table; } return table->entry++; } static int __pollwake(wait_queue_entry_t *wait, unsigned mode, int sync, void *key) { struct poll_wqueues *pwq = wait->private; DECLARE_WAITQUEUE(dummy_wait, pwq->polling_task); /* * Although this function is called under waitqueue lock, LOCK * doesn't imply write barrier and the users expect write * barrier semantics on wakeup functions. The following * smp_wmb() is equivalent to smp_wmb() in try_to_wake_up() * and is paired with smp_store_mb() in poll_schedule_timeout. */ smp_wmb(); pwq->triggered = 1; /* * Perform the default wake up operation using a dummy * waitqueue. 
* * TODO: This is hacky but there currently is no interface to * pass in @sync. @sync is scheduled to be removed and once * that happens, wake_up_process() can be used directly. */ return default_wake_function(&dummy_wait, mode, sync, key); } static int pollwake(wait_queue_entry_t *wait, unsigned mode, int sync, void *key) { struct poll_table_entry *entry; entry = container_of(wait, struct poll_table_entry, wait); if (key && !(key_to_poll(key) & entry->key)) return 0; return __pollwake(wait, mode, sync, key); } /* Add a new entry */ static void __pollwait(struct file *filp, wait_queue_head_t *wait_address, poll_table *p) { struct poll_wqueues *pwq = container_of(p, struct poll_wqueues, pt); struct poll_table_entry *entry = poll_get_entry(pwq); if (!entry) return; entry->filp = get_file(filp); entry->wait_address = wait_address; entry->key = p->_key; init_waitqueue_func_entry(&entry->wait, pollwake); entry->wait.private = pwq; add_wait_queue(wait_address, &entry->wait); } static int poll_schedule_timeout(struct poll_wqueues *pwq, int state, ktime_t *expires, unsigned long slack) { int rc = -EINTR; set_current_state(state); if (!pwq->triggered) rc = schedule_hrtimeout_range(expires, slack, HRTIMER_MODE_ABS); __set_current_state(TASK_RUNNING); /* * Prepare for the next iteration. * * The following smp_store_mb() serves two purposes. First, it's * the counterpart rmb of the wmb in pollwake() such that data * written before wake up is always visible after wake up. * Second, the full barrier guarantees that triggered clearing * doesn't pass event check of the next iteration. Note that * this problem doesn't exist for the first iteration as * add_wait_queue() has full barrier semantics. 
*/ smp_store_mb(pwq->triggered, 0); return rc; } /** * poll_select_set_timeout - helper function to setup the timeout value * @to: pointer to timespec64 variable for the final timeout * @sec: seconds (from user space) * @nsec: nanoseconds (from user space) * * Note, we do not use a timespec for the user space value here, That * way we can use the function for timeval and compat interfaces as well. * * Returns -EINVAL if sec/nsec are not normalized. Otherwise 0. */ int poll_select_set_timeout(struct timespec64 *to, time64_t sec, long nsec) { struct timespec64 ts = {.tv_sec = sec, .tv_nsec = nsec}; if (!timespec64_valid(&ts)) return -EINVAL; /* Optimize for the zero timeout value here */ if (!sec && !nsec) { to->tv_sec = to->tv_nsec = 0; } else { ktime_get_ts64(to); *to = timespec64_add_safe(*to, ts); } return 0; } enum poll_time_type { PT_TIMEVAL = 0, PT_OLD_TIMEVAL = 1, PT_TIMESPEC = 2, PT_OLD_TIMESPEC = 3, }; static int poll_select_finish(struct timespec64 *end_time, void __user *p, enum poll_time_type pt_type, int ret) { struct timespec64 rts; restore_saved_sigmask_unless(ret == -ERESTARTNOHAND); if (!p) return ret; if (current->personality & STICKY_TIMEOUTS) goto sticky; /* No update for zero timeout */ if (!end_time->tv_sec && !end_time->tv_nsec) return ret; ktime_get_ts64(&rts); rts = timespec64_sub(*end_time, rts); if (rts.tv_sec < 0) rts.tv_sec = rts.tv_nsec = 0; switch (pt_type) { case PT_TIMEVAL: { struct __kernel_old_timeval rtv; if (sizeof(rtv) > sizeof(rtv.tv_sec) + sizeof(rtv.tv_usec)) memset(&rtv, 0, sizeof(rtv)); rtv.tv_sec = rts.tv_sec; rtv.tv_usec = rts.tv_nsec / NSEC_PER_USEC; if (!copy_to_user(p, &rtv, sizeof(rtv))) return ret; } break; case PT_OLD_TIMEVAL: { struct old_timeval32 rtv; rtv.tv_sec = rts.tv_sec; rtv.tv_usec = rts.tv_nsec / NSEC_PER_USEC; if (!copy_to_user(p, &rtv, sizeof(rtv))) return ret; } break; case PT_TIMESPEC: if (!put_timespec64(&rts, p)) return ret; break; case PT_OLD_TIMESPEC: if (!put_old_timespec32(&rts, p)) return ret; 
break; default: BUG(); } /* * If an application puts its timeval in read-only memory, we * don't want the Linux-specific update to the timeval to * cause a fault after the select has completed * successfully. However, because we're not updating the * timeval, we can't restart the system call. */ sticky: if (ret == -ERESTARTNOHAND) ret = -EINTR; return ret; } /* * Scalable version of the fd_set. */ typedef struct { unsigned long *in, *out, *ex; unsigned long *res_in, *res_out, *res_ex; } fd_set_bits; /* * How many longwords for "nr" bits? */ #define FDS_BITPERLONG (8*sizeof(long)) #define FDS_LONGS(nr) (((nr)+FDS_BITPERLONG-1)/FDS_BITPERLONG) #define FDS_BYTES(nr) (FDS_LONGS(nr)*sizeof(long)) /* * Use "unsigned long" accesses to let user-mode fd_set's be long-aligned. */ static inline int get_fd_set(unsigned long nr, void __user *ufdset, unsigned long *fdset) { nr = FDS_BYTES(nr); if (ufdset) return copy_from_user(fdset, ufdset, nr) ? -EFAULT : 0; memset(fdset, 0, nr); return 0; } static inline unsigned long __must_check set_fd_set(unsigned long nr, void __user *ufdset, unsigned long *fdset) { if (ufdset) return __copy_to_user(ufdset, fdset, FDS_BYTES(nr)); return 0; } static inline void zero_fd_set(unsigned long nr, unsigned long *fdset) { memset(fdset, 0, FDS_BYTES(nr)); } #define FDS_IN(fds, n) (fds->in + n) #define FDS_OUT(fds, n) (fds->out + n) #define FDS_EX(fds, n) (fds->ex + n) #define BITS(fds, n) (*FDS_IN(fds, n)|*FDS_OUT(fds, n)|*FDS_EX(fds, n)) static int max_select_fd(unsigned long n, fd_set_bits *fds) { unsigned long *open_fds; unsigned long set; int max; struct fdtable *fdt; /* handle last in-complete long-word first */ set = ~(~0UL << (n & (BITS_PER_LONG-1))); n /= BITS_PER_LONG; fdt = files_fdtable(current->files); open_fds = fdt->open_fds + n; max = 0; if (set) { set &= BITS(fds, n); if (set) { if (!(set & ~*open_fds)) goto get_max; return -EBADF; } } while (n) { open_fds--; n--; set = BITS(fds, n); if (!set) continue; if (set & ~*open_fds) return 
-EBADF; if (max) continue; get_max: do { max++; set >>= 1; } while (set); max += n * BITS_PER_LONG; } return max; } #define POLLIN_SET (EPOLLRDNORM | EPOLLRDBAND | EPOLLIN | EPOLLHUP | EPOLLERR |\ EPOLLNVAL) #define POLLOUT_SET (EPOLLWRBAND | EPOLLWRNORM | EPOLLOUT | EPOLLERR |\ EPOLLNVAL) #define POLLEX_SET (EPOLLPRI | EPOLLNVAL) static inline __poll_t select_poll_one(int fd, poll_table *wait, unsigned long in, unsigned long out, unsigned long bit, __poll_t ll_flag) { CLASS(fd, f)(fd); if (fd_empty(f)) return EPOLLNVAL; wait->_key = POLLEX_SET | ll_flag; if (in & bit) wait->_key |= POLLIN_SET; if (out & bit) wait->_key |= POLLOUT_SET; return vfs_poll(fd_file(f), wait); } static noinline_for_stack int do_select(int n, fd_set_bits *fds, struct timespec64 *end_time) { ktime_t expire, *to = NULL; struct poll_wqueues table; poll_table *wait; int retval, i, timed_out = 0; u64 slack = 0; __poll_t busy_flag = net_busy_loop_on() ? POLL_BUSY_LOOP : 0; unsigned long busy_start = 0; rcu_read_lock(); retval = max_select_fd(n, fds); rcu_read_unlock(); if (retval < 0) return retval; n = retval; poll_initwait(&table); wait = &table.pt; if (end_time && !end_time->tv_sec && !end_time->tv_nsec) { wait->_qproc = NULL; timed_out = 1; } if (end_time && !timed_out) slack = select_estimate_accuracy(end_time); retval = 0; for (;;) { unsigned long *rinp, *routp, *rexp, *inp, *outp, *exp; bool can_busy_loop = false; inp = fds->in; outp = fds->out; exp = fds->ex; rinp = fds->res_in; routp = fds->res_out; rexp = fds->res_ex; for (i = 0; i < n; ++rinp, ++routp, ++rexp) { unsigned long in, out, ex, all_bits, bit = 1, j; unsigned long res_in = 0, res_out = 0, res_ex = 0; __poll_t mask; in = *inp++; out = *outp++; ex = *exp++; all_bits = in | out | ex; if (all_bits == 0) { i += BITS_PER_LONG; continue; } for (j = 0; j < BITS_PER_LONG; ++j, ++i, bit <<= 1) { if (i >= n) break; if (!(bit & all_bits)) continue; mask = select_poll_one(i, wait, in, out, bit, busy_flag); if ((mask & POLLIN_SET) && (in 
& bit)) { res_in |= bit; retval++; wait->_qproc = NULL; } if ((mask & POLLOUT_SET) && (out & bit)) { res_out |= bit; retval++; wait->_qproc = NULL; } if ((mask & POLLEX_SET) && (ex & bit)) { res_ex |= bit; retval++; wait->_qproc = NULL; } /* got something, stop busy polling */ if (retval) { can_busy_loop = false; busy_flag = 0; /* * only remember a returned * POLL_BUSY_LOOP if we asked for it */ } else if (busy_flag & mask) can_busy_loop = true; } if (res_in) *rinp = res_in; if (res_out) *routp = res_out; if (res_ex) *rexp = res_ex; cond_resched(); } wait->_qproc = NULL; if (retval || timed_out || signal_pending(current)) break; if (table.error) { retval = table.error; break; } /* only if found POLL_BUSY_LOOP sockets && not out of time */ if (can_busy_loop && !need_resched()) { if (!busy_start) { busy_start = busy_loop_current_time(); continue; } if (!busy_loop_timeout(busy_start)) continue; } busy_flag = 0; /* * If this is the first loop and we have a timeout * given, then we convert to ktime_t and set the to * pointer to the expiry value. */ if (end_time && !to) { expire = timespec64_to_ktime(*end_time); to = &expire; } if (!poll_schedule_timeout(&table, TASK_INTERRUPTIBLE, to, slack)) timed_out = 1; } poll_freewait(&table); return retval; } /* * We can actually return ERESTARTSYS instead of EINTR, but I'd * like to be certain this leads to no problems. So I return * EINTR just for safety. * * Update: ERESTARTSYS breaks at least the xview clock binary, so * I'm trying ERESTARTNOHAND which restart only when you want to. 
*/ int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp, fd_set __user *exp, struct timespec64 *end_time) { fd_set_bits fds; void *bits; int ret, max_fds; size_t size, alloc_size; struct fdtable *fdt; /* Allocate small arguments on the stack to save memory and be faster */ long stack_fds[SELECT_STACK_ALLOC/sizeof(long)]; ret = -EINVAL; if (n < 0) goto out_nofds; /* max_fds can increase, so grab it once to avoid race */ rcu_read_lock(); fdt = files_fdtable(current->files); max_fds = fdt->max_fds; rcu_read_unlock(); if (n > max_fds) n = max_fds; /* * We need 6 bitmaps (in/out/ex for both incoming and outgoing), * since we used fdset we need to allocate memory in units of * long-words. */ size = FDS_BYTES(n); bits = stack_fds; if (size > sizeof(stack_fds) / 6) { /* Not enough space in on-stack array; must use kmalloc */ ret = -ENOMEM; if (size > (SIZE_MAX / 6)) goto out_nofds; alloc_size = 6 * size; bits = kvmalloc(alloc_size, GFP_KERNEL); if (!bits) goto out_nofds; } fds.in = bits; fds.out = bits + size; fds.ex = bits + 2*size; fds.res_in = bits + 3*size; fds.res_out = bits + 4*size; fds.res_ex = bits + 5*size; if ((ret = get_fd_set(n, inp, fds.in)) || (ret = get_fd_set(n, outp, fds.out)) || (ret = get_fd_set(n, exp, fds.ex))) goto out; zero_fd_set(n, fds.res_in); zero_fd_set(n, fds.res_out); zero_fd_set(n, fds.res_ex); ret = do_select(n, &fds, end_time); if (ret < 0) goto out; if (!ret) { ret = -ERESTARTNOHAND; if (signal_pending(current)) goto out; ret = 0; } if (set_fd_set(n, inp, fds.res_in) || set_fd_set(n, outp, fds.res_out) || set_fd_set(n, exp, fds.res_ex)) ret = -EFAULT; out: if (bits != stack_fds) kvfree(bits); out_nofds: return ret; } static int kern_select(int n, fd_set __user *inp, fd_set __user *outp, fd_set __user *exp, struct __kernel_old_timeval __user *tvp) { struct timespec64 end_time, *to = NULL; struct __kernel_old_timeval tv; int ret; if (tvp) { if (copy_from_user(&tv, tvp, sizeof(tv))) return -EFAULT; to = &end_time; if 
(poll_select_set_timeout(to, tv.tv_sec + (tv.tv_usec / USEC_PER_SEC), (tv.tv_usec % USEC_PER_SEC) * NSEC_PER_USEC)) return -EINVAL; } ret = core_sys_select(n, inp, outp, exp, to); return poll_select_finish(&end_time, tvp, PT_TIMEVAL, ret); } SYSCALL_DEFINE5(select, int, n, fd_set __user *, inp, fd_set __user *, outp, fd_set __user *, exp, struct __kernel_old_timeval __user *, tvp) { return kern_select(n, inp, outp, exp, tvp); } static long do_pselect(int n, fd_set __user *inp, fd_set __user *outp, fd_set __user *exp, void __user *tsp, const sigset_t __user *sigmask, size_t sigsetsize, enum poll_time_type type) { struct timespec64 ts, end_time, *to = NULL; int ret; if (tsp) { switch (type) { case PT_TIMESPEC: if (get_timespec64(&ts, tsp)) return -EFAULT; break; case PT_OLD_TIMESPEC: if (get_old_timespec32(&ts, tsp)) return -EFAULT; break; default: BUG(); } to = &end_time; if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec)) return -EINVAL; } ret = set_user_sigmask(sigmask, sigsetsize); if (ret) return ret; ret = core_sys_select(n, inp, outp, exp, to); return poll_select_finish(&end_time, tsp, type, ret); } /* * Most architectures can't handle 7-argument syscalls. So we provide a * 6-argument version where the sixth argument is a pointer to a structure * which has a pointer to the sigset_t itself followed by a size_t containing * the sigset size. 
*/ struct sigset_argpack { sigset_t __user *p; size_t size; }; static inline int get_sigset_argpack(struct sigset_argpack *to, struct sigset_argpack __user *from) { // the path is hot enough for overhead of copy_from_user() to matter if (from) { if (can_do_masked_user_access()) from = masked_user_access_begin(from); else if (!user_read_access_begin(from, sizeof(*from))) return -EFAULT; unsafe_get_user(to->p, &from->p, Efault); unsafe_get_user(to->size, &from->size, Efault); user_read_access_end(); } return 0; Efault: user_read_access_end(); return -EFAULT; } SYSCALL_DEFINE6(pselect6, int, n, fd_set __user *, inp, fd_set __user *, outp, fd_set __user *, exp, struct __kernel_timespec __user *, tsp, void __user *, sig) { struct sigset_argpack x = {NULL, 0}; if (get_sigset_argpack(&x, sig)) return -EFAULT; return do_pselect(n, inp, outp, exp, tsp, x.p, x.size, PT_TIMESPEC); } #if defined(CONFIG_COMPAT_32BIT_TIME) && !defined(CONFIG_64BIT) SYSCALL_DEFINE6(pselect6_time32, int, n, fd_set __user *, inp, fd_set __user *, outp, fd_set __user *, exp, struct old_timespec32 __user *, tsp, void __user *, sig) { struct sigset_argpack x = {NULL, 0}; if (get_sigset_argpack(&x, sig)) return -EFAULT; return do_pselect(n, inp, outp, exp, tsp, x.p, x.size, PT_OLD_TIMESPEC); } #endif #ifdef __ARCH_WANT_SYS_OLD_SELECT struct sel_arg_struct { unsigned long n; fd_set __user *inp, *outp, *exp; struct __kernel_old_timeval __user *tvp; }; SYSCALL_DEFINE1(old_select, struct sel_arg_struct __user *, arg) { struct sel_arg_struct a; if (copy_from_user(&a, arg, sizeof(a))) return -EFAULT; return kern_select(a.n, a.inp, a.outp, a.exp, a.tvp); } #endif struct poll_list { struct poll_list *next; unsigned int len; struct pollfd entries[] __counted_by(len); }; #define POLLFD_PER_PAGE ((PAGE_SIZE-sizeof(struct poll_list)) / sizeof(struct pollfd)) /* * Fish for pollable events on the pollfd->fd file descriptor. 
We're only * interested in events matching the pollfd->events mask, and the result * matching that mask is both recorded in pollfd->revents and returned. The * pwait poll_table will be used by the fd-provided poll handler for waiting, * if pwait->_qproc is non-NULL. */ static inline __poll_t do_pollfd(struct pollfd *pollfd, poll_table *pwait, bool *can_busy_poll, __poll_t busy_flag) { int fd = pollfd->fd; __poll_t mask, filter; if (fd < 0) return 0; CLASS(fd, f)(fd); if (fd_empty(f)) return EPOLLNVAL; /* userland u16 ->events contains POLL... bitmap */ filter = demangle_poll(pollfd->events) | EPOLLERR | EPOLLHUP; pwait->_key = filter | busy_flag; mask = vfs_poll(fd_file(f), pwait); if (mask & busy_flag) *can_busy_poll = true; return mask & filter; /* Mask out unneeded events. */ } static int do_poll(struct poll_list *list, struct poll_wqueues *wait, struct timespec64 *end_time) { poll_table* pt = &wait->pt; ktime_t expire, *to = NULL; int timed_out = 0, count = 0; u64 slack = 0; __poll_t busy_flag = net_busy_loop_on() ? POLL_BUSY_LOOP : 0; unsigned long busy_start = 0; /* Optimise the no-wait case */ if (end_time && !end_time->tv_sec && !end_time->tv_nsec) { pt->_qproc = NULL; timed_out = 1; } if (end_time && !timed_out) slack = select_estimate_accuracy(end_time); for (;;) { struct poll_list *walk; bool can_busy_loop = false; for (walk = list; walk != NULL; walk = walk->next) { struct pollfd * pfd, * pfd_end; pfd = walk->entries; pfd_end = pfd + walk->len; for (; pfd != pfd_end; pfd++) { __poll_t mask; /* * Fish for events. If we found one, record it * and kill poll_table->_qproc, so we don't * needlessly register any other waiters after * this. They'll get immediately deregistered * when we break out and return. 
*/ mask = do_pollfd(pfd, pt, &can_busy_loop, busy_flag); pfd->revents = mangle_poll(mask); if (mask) { count++; pt->_qproc = NULL; /* found something, stop busy polling */ busy_flag = 0; can_busy_loop = false; } } } /* * All waiters have already been registered, so don't provide * a poll_table->_qproc to them on the next loop iteration. */ pt->_qproc = NULL; if (!count) { count = wait->error; if (signal_pending(current)) count = -ERESTARTNOHAND; } if (count || timed_out) break; /* only if found POLL_BUSY_LOOP sockets && not out of time */ if (can_busy_loop && !need_resched()) { if (!busy_start) { busy_start = busy_loop_current_time(); continue; } if (!busy_loop_timeout(busy_start)) continue; } busy_flag = 0; /* * If this is the first loop and we have a timeout * given, then we convert to ktime_t and set the to * pointer to the expiry value. */ if (end_time && !to) { expire = timespec64_to_ktime(*end_time); to = &expire; } if (!poll_schedule_timeout(wait, TASK_INTERRUPTIBLE, to, slack)) timed_out = 1; } return count; } #define N_STACK_PPS ((sizeof(stack_pps) - sizeof(struct poll_list)) / \ sizeof(struct pollfd)) static int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds, struct timespec64 *end_time) { struct poll_wqueues table; int err = -EFAULT, fdcount; /* Allocate small arguments on the stack to save memory and be faster - use long to make sure the buffer is aligned properly on 64 bit archs to avoid unaligned access */ long stack_pps[POLL_STACK_ALLOC/sizeof(long)]; struct poll_list *const head = (struct poll_list *)stack_pps; struct poll_list *walk = head; unsigned int todo = nfds; unsigned int len; if (nfds > rlimit(RLIMIT_NOFILE)) return -EINVAL; len = min_t(unsigned int, nfds, N_STACK_PPS); for (;;) { walk->next = NULL; walk->len = len; if (!len) break; if (copy_from_user(walk->entries, ufds + nfds-todo, sizeof(struct pollfd) * walk->len)) goto out_fds; if (walk->len >= todo) break; todo -= walk->len; len = min(todo, POLLFD_PER_PAGE); walk = 
walk->next = kmalloc(struct_size(walk, entries, len), GFP_KERNEL); if (!walk) { err = -ENOMEM; goto out_fds; } } poll_initwait(&table); fdcount = do_poll(head, &table, end_time); poll_freewait(&table); if (!user_write_access_begin(ufds, nfds * sizeof(*ufds))) goto out_fds; for (walk = head; walk; walk = walk->next) { struct pollfd *fds = walk->entries; unsigned int j; for (j = walk->len; j; fds++, ufds++, j--) unsafe_put_user(fds->revents, &ufds->revents, Efault); } user_write_access_end(); err = fdcount; out_fds: walk = head->next; while (walk) { struct poll_list *pos = walk; walk = walk->next; kfree(pos); } return err; Efault: user_write_access_end(); err = -EFAULT; goto out_fds; } static long do_restart_poll(struct restart_block *restart_block) { struct pollfd __user *ufds = restart_block->poll.ufds; int nfds = restart_block->poll.nfds; struct timespec64 *to = NULL, end_time; int ret; if (restart_block->poll.has_timeout) { end_time.tv_sec = restart_block->poll.tv_sec; end_time.tv_nsec = restart_block->poll.tv_nsec; to = &end_time; } ret = do_sys_poll(ufds, nfds, to); if (ret == -ERESTARTNOHAND) ret = set_restart_fn(restart_block, do_restart_poll); return ret; } SYSCALL_DEFINE3(poll, struct pollfd __user *, ufds, unsigned int, nfds, int, timeout_msecs) { struct timespec64 end_time, *to = NULL; int ret; if (timeout_msecs >= 0) { to = &end_time; poll_select_set_timeout(to, timeout_msecs / MSEC_PER_SEC, NSEC_PER_MSEC * (timeout_msecs % MSEC_PER_SEC)); } ret = do_sys_poll(ufds, nfds, to); if (ret == -ERESTARTNOHAND) { struct restart_block *restart_block; restart_block = &current->restart_block; restart_block->poll.ufds = ufds; restart_block->poll.nfds = nfds; if (timeout_msecs >= 0) { restart_block->poll.tv_sec = end_time.tv_sec; restart_block->poll.tv_nsec = end_time.tv_nsec; restart_block->poll.has_timeout = 1; } else restart_block->poll.has_timeout = 0; ret = set_restart_fn(restart_block, do_restart_poll); } return ret; } SYSCALL_DEFINE5(ppoll, struct pollfd 
__user *, ufds, unsigned int, nfds, struct __kernel_timespec __user *, tsp, const sigset_t __user *, sigmask, size_t, sigsetsize) { struct timespec64 ts, end_time, *to = NULL; int ret; if (tsp) { if (get_timespec64(&ts, tsp)) return -EFAULT; to = &end_time; if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec)) return -EINVAL; } ret = set_user_sigmask(sigmask, sigsetsize); if (ret) return ret; ret = do_sys_poll(ufds, nfds, to); return poll_select_finish(&end_time, tsp, PT_TIMESPEC, ret); } #if defined(CONFIG_COMPAT_32BIT_TIME) && !defined(CONFIG_64BIT) SYSCALL_DEFINE5(ppoll_time32, struct pollfd __user *, ufds, unsigned int, nfds, struct old_timespec32 __user *, tsp, const sigset_t __user *, sigmask, size_t, sigsetsize) { struct timespec64 ts, end_time, *to = NULL; int ret; if (tsp) { if (get_old_timespec32(&ts, tsp)) return -EFAULT; to = &end_time; if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec)) return -EINVAL; } ret = set_user_sigmask(sigmask, sigsetsize); if (ret) return ret; ret = do_sys_poll(ufds, nfds, to); return poll_select_finish(&end_time, tsp, PT_OLD_TIMESPEC, ret); } #endif #ifdef CONFIG_COMPAT #define __COMPAT_NFDBITS (8 * sizeof(compat_ulong_t)) /* * Ooo, nasty. We need here to frob 32-bit unsigned longs to * 64-bit unsigned longs. */ static int compat_get_fd_set(unsigned long nr, compat_ulong_t __user *ufdset, unsigned long *fdset) { if (ufdset) { return compat_get_bitmap(fdset, ufdset, nr); } else { zero_fd_set(nr, fdset); return 0; } } static int compat_set_fd_set(unsigned long nr, compat_ulong_t __user *ufdset, unsigned long *fdset) { if (!ufdset) return 0; return compat_put_bitmap(ufdset, fdset, nr); } /* * This is a virtual copy of sys_select from fs/select.c and probably * should be compared to it from time to time */ /* * We can actually return ERESTARTSYS instead of EINTR, but I'd * like to be certain this leads to no problems. So I return * EINTR just for safety. 
*
 * Update: ERESTARTSYS breaks at least the xview clock binary, so
 * I'm trying ERESTARTNOHAND which restart only when you want to.
 */
/*
 * compat_core_sys_select() - select() core working on compat fd sets.
 * @n:        number of descriptors to examine; negative is rejected with
 *            -EINVAL, values above the fd table's max_fds are clamped
 * @inp:      user fd set read via compat_get_fd_set(), may be NULL
 * @outp:     as @inp
 * @exp:      as @inp
 * @end_time: timeout handed through to do_select()
 *
 * Returns the do_select() result on success. Errors: -EINVAL, -ENOMEM, a
 * failure from compat_get_fd_set(), a negative do_select() result,
 * -ERESTARTNOHAND when do_select() returned 0 with a signal pending, or
 * -EFAULT when writing the result sets back fails.
 */
static int compat_core_sys_select(int n, compat_ulong_t __user *inp,
	compat_ulong_t __user *outp, compat_ulong_t __user *exp,
	struct timespec64 *end_time)
{
	fd_set_bits fds;
	void *bits;
	int size, max_fds, ret = -EINVAL;
	struct fdtable *fdt;
	long stack_fds[SELECT_STACK_ALLOC/sizeof(long)];

	if (n < 0)
		goto out_nofds;

	/* max_fds can increase, so grab it once to avoid race */
	rcu_read_lock();
	fdt = files_fdtable(current->files);
	max_fds = fdt->max_fds;
	rcu_read_unlock();
	if (n > max_fds)
		n = max_fds;

	/*
	 * We need 6 bitmaps (in/out/ex for both incoming and outgoing),
	 * since we used fdset we need to allocate memory in units of
	 * long-words.
	 */
	size = FDS_BYTES(n);
	bits = stack_fds;
	/* Use the heap only when six bitmaps do not fit into stack_fds. */
	if (size > sizeof(stack_fds) / 6) {
		bits = kmalloc_array(6, size, GFP_KERNEL);
		ret = -ENOMEM;
		if (!bits)
			goto out_nofds;
	}
	/* Carve the buffer into six consecutive bitmaps of size bytes each. */
	fds.in      = (unsigned long *)  bits;
	fds.out     = (unsigned long *) (bits +   size);
	fds.ex      = (unsigned long *) (bits + 2*size);
	fds.res_in  = (unsigned long *) (bits + 3*size);
	fds.res_out = (unsigned long *) (bits + 4*size);
	fds.res_ex  = (unsigned long *) (bits + 5*size);

	/* The first failing copy-in stops the chain and its error is returned. */
	if ((ret = compat_get_fd_set(n, inp, fds.in)) ||
	    (ret = compat_get_fd_set(n, outp, fds.out)) ||
	    (ret = compat_get_fd_set(n, exp, fds.ex)))
		goto out;
	zero_fd_set(n, fds.res_in);
	zero_fd_set(n, fds.res_out);
	zero_fd_set(n, fds.res_ex);

	ret = do_select(n, &fds, end_time);

	if (ret < 0)
		goto out;
	if (!ret) {
		/* Nothing ready: only a pending signal turns this into an error. */
		ret = -ERESTARTNOHAND;
		if (signal_pending(current))
			goto out;
		ret = 0;
	}

	if (compat_set_fd_set(n, inp, fds.res_in) ||
	    compat_set_fd_set(n, outp, fds.res_out) ||
	    compat_set_fd_set(n, exp, fds.res_ex))
		ret = -EFAULT;
out:
	/* Free the bitmaps only if they were moved off the stack buffer. */
	if (bits != stack_fds)
		kfree(bits);
out_nofds:
	return ret;
}

/*
 * do_compat_select() - select() with an optional old_timeval32 timeout.
 *
 * Reads *@tvp when it is non-NULL, sets the end time from it, runs
 * compat_core_sys_select() and finishes through poll_select_finish() with
 * PT_OLD_TIMEVAL.
 */
static int do_compat_select(int n, compat_ulong_t __user *inp,
	compat_ulong_t __user *outp, compat_ulong_t __user *exp,
	struct old_timeval32 __user *tvp)
{
	struct timespec64 end_time, *to = NULL;
	struct
old_timeval32 tv; int ret; if (tvp) { if (copy_from_user(&tv, tvp, sizeof(tv))) return -EFAULT; to = &end_time; if (poll_select_set_timeout(to, tv.tv_sec + (tv.tv_usec / USEC_PER_SEC), (tv.tv_usec % USEC_PER_SEC) * NSEC_PER_USEC)) return -EINVAL; } ret = compat_core_sys_select(n, inp, outp, exp, to); return poll_select_finish(&end_time, tvp, PT_OLD_TIMEVAL, ret); } COMPAT_SYSCALL_DEFINE5(select, int, n, compat_ulong_t __user *, inp, compat_ulong_t __user *, outp, compat_ulong_t __user *, exp, struct old_timeval32 __user *, tvp) { return do_compat_select(n, inp, outp, exp, tvp); } struct compat_sel_arg_struct { compat_ulong_t n; compat_uptr_t inp; compat_uptr_t outp; compat_uptr_t exp; compat_uptr_t tvp; }; COMPAT_SYSCALL_DEFINE1(old_select, struct compat_sel_arg_struct __user *, arg) { struct compat_sel_arg_struct a; if (copy_from_user(&a, arg, sizeof(a))) return -EFAULT; return do_compat_select(a.n, compat_ptr(a.inp), compat_ptr(a.outp), compat_ptr(a.exp), compat_ptr(a.tvp)); } static long do_compat_pselect(int n, compat_ulong_t __user *inp, compat_ulong_t __user *outp, compat_ulong_t __user *exp, void __user *tsp, compat_sigset_t __user *sigmask, compat_size_t sigsetsize, enum poll_time_type type) { struct timespec64 ts, end_time, *to = NULL; int ret; if (tsp) { switch (type) { case PT_OLD_TIMESPEC: if (get_old_timespec32(&ts, tsp)) return -EFAULT; break; case PT_TIMESPEC: if (get_timespec64(&ts, tsp)) return -EFAULT; break; default: BUG(); } to = &end_time; if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec)) return -EINVAL; } ret = set_compat_user_sigmask(sigmask, sigsetsize); if (ret) return ret; ret = compat_core_sys_select(n, inp, outp, exp, to); return poll_select_finish(&end_time, tsp, type, ret); } struct compat_sigset_argpack { compat_uptr_t p; compat_size_t size; }; static inline int get_compat_sigset_argpack(struct compat_sigset_argpack *to, struct compat_sigset_argpack __user *from) { if (from) { if (!user_read_access_begin(from, sizeof(*from))) 
return -EFAULT; unsafe_get_user(to->p, &from->p, Efault); unsafe_get_user(to->size, &from->size, Efault); user_read_access_end(); } return 0; Efault: user_read_access_end(); return -EFAULT; } COMPAT_SYSCALL_DEFINE6(pselect6_time64, int, n, compat_ulong_t __user *, inp, compat_ulong_t __user *, outp, compat_ulong_t __user *, exp, struct __kernel_timespec __user *, tsp, void __user *, sig) { struct compat_sigset_argpack x = {0, 0}; if (get_compat_sigset_argpack(&x, sig)) return -EFAULT; return do_compat_pselect(n, inp, outp, exp, tsp, compat_ptr(x.p), x.size, PT_TIMESPEC); } #if defined(CONFIG_COMPAT_32BIT_TIME) COMPAT_SYSCALL_DEFINE6(pselect6_time32, int, n, compat_ulong_t __user *, inp, compat_ulong_t __user *, outp, compat_ulong_t __user *, exp, struct old_timespec32 __user *, tsp, void __user *, sig) { struct compat_sigset_argpack x = {0, 0}; if (get_compat_sigset_argpack(&x, sig)) return -EFAULT; return do_compat_pselect(n, inp, outp, exp, tsp, compat_ptr(x.p), x.size, PT_OLD_TIMESPEC); } #endif #if defined(CONFIG_COMPAT_32BIT_TIME) COMPAT_SYSCALL_DEFINE5(ppoll_time32, struct pollfd __user *, ufds, unsigned int, nfds, struct old_timespec32 __user *, tsp, const compat_sigset_t __user *, sigmask, compat_size_t, sigsetsize) { struct timespec64 ts, end_time, *to = NULL; int ret; if (tsp) { if (get_old_timespec32(&ts, tsp)) return -EFAULT; to = &end_time; if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec)) return -EINVAL; } ret = set_compat_user_sigmask(sigmask, sigsetsize); if (ret) return ret; ret = do_sys_poll(ufds, nfds, to); return poll_select_finish(&end_time, tsp, PT_OLD_TIMESPEC, ret); } #endif /* New compat syscall for 64 bit time_t*/ COMPAT_SYSCALL_DEFINE5(ppoll_time64, struct pollfd __user *, ufds, unsigned int, nfds, struct __kernel_timespec __user *, tsp, const compat_sigset_t __user *, sigmask, compat_size_t, sigsetsize) { struct timespec64 ts, end_time, *to = NULL; int ret; if (tsp) { if (get_timespec64(&ts, tsp)) return -EFAULT; to = &end_time; if 
(poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec))
			return -EINVAL;
	}

	/* Install the caller-supplied signal mask before polling. */
	ret = set_compat_user_sigmask(sigmask, sigsetsize);
	if (ret)
		return ret;

	ret = do_sys_poll(ufds, nfds, to);
	/* The do_sys_poll() result is passed through poll_select_finish(). */
	return poll_select_finish(&end_time, tsp, PT_TIMESPEC, ret);
}

#endif /* CONFIG_COMPAT */
10 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 
527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 
1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 
1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * V4L2 controls support header. * * Copyright (C) 2010 Hans Verkuil <hverkuil@xs4all.nl> */ #ifndef _V4L2_CTRLS_H #define _V4L2_CTRLS_H #include <linux/list.h> #include <linux/mutex.h> #include <linux/videodev2.h> #include <media/media-request.h> /* forward references */ struct file; struct poll_table_struct; struct v4l2_ctrl; struct v4l2_ctrl_handler; struct v4l2_ctrl_helper; struct v4l2_fh; struct v4l2_fwnode_device_properties; struct v4l2_subdev; struct v4l2_subscribed_event; struct video_device; /** * union v4l2_ctrl_ptr - A pointer to a control value. * @p_s32: Pointer to a 32-bit signed value. * @p_s64: Pointer to a 64-bit signed value. * @p_u8: Pointer to a 8-bit unsigned value. * @p_u16: Pointer to a 16-bit unsigned value. * @p_u32: Pointer to a 32-bit unsigned value. * @p_char: Pointer to a string. * @p_mpeg2_sequence: Pointer to a MPEG2 sequence structure. * @p_mpeg2_picture: Pointer to a MPEG2 picture structure. * @p_mpeg2_quantisation: Pointer to a MPEG2 quantisation data structure. * @p_fwht_params: Pointer to a FWHT stateless parameters structure. 
* @p_h264_sps: Pointer to a struct v4l2_ctrl_h264_sps. * @p_h264_pps: Pointer to a struct v4l2_ctrl_h264_pps. * @p_h264_scaling_matrix: Pointer to a struct v4l2_ctrl_h264_scaling_matrix. * @p_h264_slice_params: Pointer to a struct v4l2_ctrl_h264_slice_params. * @p_h264_decode_params: Pointer to a struct v4l2_ctrl_h264_decode_params. * @p_h264_pred_weights: Pointer to a struct v4l2_ctrl_h264_pred_weights. * @p_vp8_frame: Pointer to a VP8 frame params structure. * @p_vp9_compressed_hdr_probs: Pointer to a VP9 frame compressed header probs structure. * @p_vp9_frame: Pointer to a VP9 frame params structure. * @p_hevc_sps: Pointer to an HEVC sequence parameter set structure. * @p_hevc_pps: Pointer to an HEVC picture parameter set structure. * @p_hevc_slice_params: Pointer to an HEVC slice parameters structure. * @p_hdr10_cll: Pointer to an HDR10 Content Light Level structure. * @p_hdr10_mastering: Pointer to an HDR10 Mastering Display structure. * @p_area: Pointer to an area. * @p_av1_sequence: Pointer to an AV1 sequence structure. * @p_av1_tile_group_entry: Pointer to an AV1 tile group entry structure. * @p_av1_frame: Pointer to an AV1 frame structure. * @p_av1_film_grain: Pointer to an AV1 film grain structure. * @p: Pointer to a compound value. * @p_const: Pointer to a constant compound value. 
*/ union v4l2_ctrl_ptr { s32 *p_s32; s64 *p_s64; u8 *p_u8; u16 *p_u16; u32 *p_u32; char *p_char; struct v4l2_ctrl_mpeg2_sequence *p_mpeg2_sequence; struct v4l2_ctrl_mpeg2_picture *p_mpeg2_picture; struct v4l2_ctrl_mpeg2_quantisation *p_mpeg2_quantisation; struct v4l2_ctrl_fwht_params *p_fwht_params; struct v4l2_ctrl_h264_sps *p_h264_sps; struct v4l2_ctrl_h264_pps *p_h264_pps; struct v4l2_ctrl_h264_scaling_matrix *p_h264_scaling_matrix; struct v4l2_ctrl_h264_slice_params *p_h264_slice_params; struct v4l2_ctrl_h264_decode_params *p_h264_decode_params; struct v4l2_ctrl_h264_pred_weights *p_h264_pred_weights; struct v4l2_ctrl_vp8_frame *p_vp8_frame; struct v4l2_ctrl_hevc_sps *p_hevc_sps; struct v4l2_ctrl_hevc_pps *p_hevc_pps; struct v4l2_ctrl_hevc_slice_params *p_hevc_slice_params; struct v4l2_ctrl_vp9_compressed_hdr *p_vp9_compressed_hdr_probs; struct v4l2_ctrl_vp9_frame *p_vp9_frame; struct v4l2_ctrl_hdr10_cll_info *p_hdr10_cll; struct v4l2_ctrl_hdr10_mastering_display *p_hdr10_mastering; struct v4l2_area *p_area; struct v4l2_ctrl_av1_sequence *p_av1_sequence; struct v4l2_ctrl_av1_tile_group_entry *p_av1_tile_group_entry; struct v4l2_ctrl_av1_frame *p_av1_frame; struct v4l2_ctrl_av1_film_grain *p_av1_film_grain; void *p; const void *p_const; }; /** * v4l2_ctrl_ptr_create() - Helper function to return a v4l2_ctrl_ptr from a * void pointer * @ptr: The void pointer */ static inline union v4l2_ctrl_ptr v4l2_ctrl_ptr_create(void *ptr) { union v4l2_ctrl_ptr p = { .p = ptr }; return p; } /** * struct v4l2_ctrl_ops - The control operations that the driver has to provide. * * @g_volatile_ctrl: Get a new value for this control. Generally only relevant * for volatile (and usually read-only) controls such as a control * that returns the current signal strength which changes * continuously. * If not set, then the currently cached value will be returned. * @try_ctrl: Test whether the control's value is valid. Only relevant when * the usual min/max/step checks are not sufficient. 
* @s_ctrl:	Actually set the new control value. s_ctrl is compulsory. The
 *		ctrl->handler->lock is held when these ops are called, so no
 *		one else can access controls owned by that handler.
 */
struct v4l2_ctrl_ops {
	/* Optional: when unset the cached value is returned instead. */
	int (*g_volatile_ctrl)(struct v4l2_ctrl *ctrl);
	/* Optional extra validation beyond the min/max/step checks. */
	int (*try_ctrl)(struct v4l2_ctrl *ctrl);
	/* Compulsory. */
	int (*s_ctrl)(struct v4l2_ctrl *ctrl);
};

/**
 * struct v4l2_ctrl_type_ops - The control type operations that the driver
 *			       has to provide.
 *
 * @equal: return true if all ctrl->elems array elements are equal.
 * @init: initialize the value for array elements from from_idx to ctrl->elems.
 * @log: log the value.
 * @validate: validate the value for ctrl->new_elems array elements.
 *	Return 0 on success and a negative value otherwise.
 */
struct v4l2_ctrl_type_ops {
	bool (*equal)(const struct v4l2_ctrl *ctrl,
		      union v4l2_ctrl_ptr ptr1, union v4l2_ctrl_ptr ptr2);
	void (*init)(const struct v4l2_ctrl *ctrl, u32 from_idx,
		     union v4l2_ctrl_ptr ptr);
	void (*log)(const struct v4l2_ctrl *ctrl);
	/* 0 on success, negative value otherwise. */
	int (*validate)(const struct v4l2_ctrl *ctrl, union v4l2_ctrl_ptr ptr);
};

/**
 * typedef v4l2_ctrl_notify_fnc - typedef for a notify argument with a function
 *	that should be called when a control value has changed.
 *
 * @ctrl: pointer to struct &v4l2_ctrl
 * @priv: control private data
 *
 * This typedef definition is used as an argument to v4l2_ctrl_notify()
 * and as an argument at struct &v4l2_ctrl_handler.
 */
typedef void (*v4l2_ctrl_notify_fnc)(struct v4l2_ctrl *ctrl, void *priv);

/**
 * struct v4l2_ctrl - The control structure.
 *
 * @node:	The list node.
 * @ev_subs:	The list of control event subscriptions.
 * @handler:	The handler that owns the control.
 * @cluster:	Point to start of cluster array.
 * @ncontrols:	Number of controls in cluster array.
 * @done:	Internal flag: set for each processed control.
 * @is_new:	Set when the user specified a new value for this control. It
 *		is also set when called from v4l2_ctrl_handler_setup(). Drivers
 *		should never set this flag.
* @has_changed: Set when the current value differs from the new value. Drivers * should never use this flag. * @is_private: If set, then this control is private to its handler and it * will not be added to any other handlers. Drivers can set * this flag. * @is_auto: If set, then this control selects whether the other cluster * members are in 'automatic' mode or 'manual' mode. This is * used for autogain/gain type clusters. Drivers should never * set this flag directly. * @is_int: If set, then this control has a simple integer value (i.e. it * uses ctrl->val). * @is_string: If set, then this control has type %V4L2_CTRL_TYPE_STRING. * @is_ptr: If set, then this control is an array and/or has type >= * %V4L2_CTRL_COMPOUND_TYPES * and/or has type %V4L2_CTRL_TYPE_STRING. In other words, &struct * v4l2_ext_control uses field p to point to the data. * @is_array: If set, then this control contains an N-dimensional array. * @is_dyn_array: If set, then this control contains a dynamically sized 1-dimensional array. * If this is set, then @is_array is also set. * @has_volatiles: If set, then one or more members of the cluster are volatile. * Drivers should never touch this flag. * @call_notify: If set, then call the handler's notify function whenever the * control's value changes. * @manual_mode_value: If the is_auto flag is set, then this is the value * of the auto control that determines if that control is in * manual mode. So if the value of the auto control equals this * value, then the whole cluster is in manual mode. Drivers should * never set this flag directly. * @ops: The control ops. * @type_ops: The control type ops. * @id: The control ID. * @name: The control name. * @type: The control type. * @minimum: The control's minimum value. * @maximum: The control's maximum value. * @default_value: The control's default value. * @step: The control's step value for non-menu controls. * @elems: The number of elements in the N-dimensional array. 
* @elem_size:	The size in bytes of the control.
 * @new_elems:	The number of elements in p_new. This is the same as @elems,
 *		except for dynamic arrays. In that case it is in the range of
 *		1 to @p_array_alloc_elems.
 * @dims:	The size of each dimension.
 * @nr_of_dims: The number of dimensions in @dims.
 * @menu_skip_mask: The control's skip mask for menu controls. This makes it
 *		easy to skip menu items that are not valid. If bit X is set,
 *		then menu item X is skipped. Of course, this only works for
 *		menus with <= 64 menu items. There are no menus that come
 *		close to that number, so this is OK. Should we ever need more,
 *		then this will have to be extended to a bit array.
 * @qmenu:	A const char * array for all menu items. Array entries that are
 *		empty strings ("") correspond to non-existing menu items (this
 *		is in addition to the menu_skip_mask above). The last entry
 *		must be NULL.
 *		Used only if the @type is %V4L2_CTRL_TYPE_MENU.
 * @qmenu_int:	A 64-bit integer array with the integer menu items.
 *		The size of the array must be equal to the menu size, e.g.:
 *		:math:`ceil(\frac{maximum - minimum}{step}) + 1`.
 *		Used only if the @type is %V4L2_CTRL_TYPE_INTEGER_MENU.
 * @flags:	The control's flags.
 * @priv:	The control's private pointer. For use by the driver. It is
 *		untouched by the control framework. Note that this pointer is
 *		not freed when the control is deleted. Should this be needed
 *		then a new internal bitfield can be added to tell the framework
 *		to free this pointer.
 * @p_array:	Pointer to the allocated array. Only valid if @is_array is true.
 * @p_array_alloc_elems: The number of elements in the allocated
 *		array for both the cur and new values. So @p_array is actually
 *		sized for 2 * @p_array_alloc_elems * @elem_size. Only valid if
 *		@is_array is true.
 * @cur:	Structure to store the current value.
 * @cur.val:	The control's current value, if the @type is represented via
 *		a 32-bit signed integer (see &enum v4l2_ctrl_type).
* @val:	The control's new s32 value.
 * @p_def:	The control's default value represented via a union which
 *		provides a standard way of accessing control types
 *		through a pointer (for compound controls only).
 * @p_cur:	The control's current value represented via a union which
 *		provides a standard way of accessing control types
 *		through a pointer.
 * @p_new:	The control's new value represented via a union which provides
 *		a standard way of accessing control types
 *		through a pointer.
 */
struct v4l2_ctrl {
	/* Administrative fields */
	struct list_head node;
	struct list_head ev_subs;
	struct v4l2_ctrl_handler *handler;
	struct v4l2_ctrl **cluster;
	unsigned int ncontrols;

	/* Single-bit state flags, followed by the 8-bit manual mode value */
	unsigned int done:1;

	unsigned int is_new:1;
	unsigned int has_changed:1;
	unsigned int is_private:1;
	unsigned int is_auto:1;
	unsigned int is_int:1;
	unsigned int is_string:1;
	unsigned int is_ptr:1;
	unsigned int is_array:1;
	unsigned int is_dyn_array:1;
	unsigned int has_volatiles:1;
	unsigned int call_notify:1;
	unsigned int manual_mode_value:8;

	const struct v4l2_ctrl_ops *ops;
	const struct v4l2_ctrl_type_ops *type_ops;
	u32 id;
	const char *name;
	enum v4l2_ctrl_type type;
	s64 minimum, maximum, default_value;
	u32 elems;
	u32 elem_size;
	u32 new_elems;
	u32 dims[V4L2_CTRL_MAX_DIMS];
	u32 nr_of_dims;
	/* step (non-menu controls) and menu_skip_mask (menus) share 64 bits */
	union {
		u64 step;
		u64 menu_skip_mask;
	};
	/* Menu item tables: strings or s64 values */
	union {
		const char * const *qmenu;
		const s64 *qmenu_int;
	};
	unsigned long flags;
	void *priv;
	void *p_array;
	u32 p_array_alloc_elems;
	/* New value (val) and current value (cur.val) for integer controls */
	s32 val;
	struct {
		s32 val;
	} cur;

	union v4l2_ctrl_ptr p_def;
	union v4l2_ctrl_ptr p_new;
	union v4l2_ctrl_ptr p_cur;
};

/**
 * struct v4l2_ctrl_ref - The control reference.
 *
 * @node:	List node for the sorted list.
 * @next:	Single-link list node for the hash.
 * @ctrl:	The actual control information.
 * @helper:	Pointer to helper struct. Used internally in
 *		``prepare_ext_ctrls`` function at ``v4l2-ctrl.c``.
 * @from_other_dev: If true, then @ctrl was defined in another
 *		device than the &struct v4l2_ctrl_handler.
* @req_done: Internal flag: if the control handler containing this control * reference is bound to a media request, then this is set when * the control has been applied. This prevents applying controls * from a cluster with multiple controls twice (when the first * control of a cluster is applied, they all are). * @p_req_valid: If set, then p_req contains the control value for the request. * @p_req_array_enomem: If set, then p_req is invalid since allocating space for * an array failed. Attempting to read this value shall * result in ENOMEM. Only valid if ctrl->is_array is true. * @p_req_array_alloc_elems: The number of elements allocated for the * array. Only valid if @p_req_valid and ctrl->is_array are * true. * @p_req_elems: The number of elements in @p_req. This is the same as * ctrl->elems, except for dynamic arrays. In that case it is in * the range of 1 to @p_req_array_alloc_elems. Only valid if * @p_req_valid is true. * @p_req: If the control handler containing this control reference * is bound to a media request, then this points to the * value of the control that must be applied when the request * is executed, or to the value of the control at the time * that the request was completed. If @p_req_valid is false, * then this control was never set for this request and the * control will not be updated when this request is applied. * * Each control handler has a list of these refs. The list_head is used to * keep a sorted-by-control-ID list of all controls, while the next pointer * is used to link the control in the hash's bucket. 
*/
struct v4l2_ctrl_ref {
	/* Linkage: sorted list node and hash bucket chain */
	struct list_head node;
	struct v4l2_ctrl_ref *next;
	struct v4l2_ctrl *ctrl;
	struct v4l2_ctrl_helper *helper;
	bool from_other_dev;
	/* Request-related state */
	bool req_done;
	bool p_req_valid;
	bool p_req_array_enomem;
	u32 p_req_array_alloc_elems;
	u32 p_req_elems;
	union v4l2_ctrl_ptr p_req;
};

/**
 * struct v4l2_ctrl_handler - The control handler keeps track of all the
 *	controls: both the controls owned by the handler and those inherited
 *	from other handlers.
 *
 * @_lock:	Default for "lock".
 * @lock:	Lock to control access to this handler and its controls.
 *		May be replaced by the user right after init.
 * @ctrls:	The list of controls owned by this handler.
 * @ctrl_refs:	The list of control references.
 * @cached:	The last found control reference. It is common that the same
 *		control is needed multiple times, so this is a simple
 *		optimization.
 * @buckets:	Buckets for the hashing. Allows for quick control lookup.
 * @notify:	A notify callback that is called whenever the control changes
 *		value.
 *		Note that the handler's lock is held when the notify function
 *		is called!
 * @notify_priv: Passed as argument to the v4l2_ctrl notify callback.
 * @nr_of_buckets: Total number of buckets in the array.
 * @error:	The error code of the first failed control addition.
 * @request_is_queued: True if the request was queued.
 * @requests:	List to keep track of open control handler request objects.
 *		For the parent control handler (@req_obj.ops == NULL) this
 *		is the list header. When the parent control handler is
 *		removed, it has to unbind and put all these requests since
 *		they refer to the parent.
 * @requests_queued: List of the queued requests. This determines the order
 *		in which these controls are applied. Once the request is
 *		completed it is removed from this list.
 * @req_obj:	The &struct media_request_object, used to link into a
 *		&struct media_request. This request object has a refcount.
*/
struct v4l2_ctrl_handler {
	/* @lock is what users take; @_lock is its default backing mutex. */
	struct mutex _lock;
	struct mutex *lock;
	struct list_head ctrls;
	struct list_head ctrl_refs;
	/* Lookup acceleration: last hit plus the hash buckets. */
	struct v4l2_ctrl_ref *cached;
	struct v4l2_ctrl_ref **buckets;
	v4l2_ctrl_notify_fnc notify;
	void *notify_priv;
	u16 nr_of_buckets;
	int error;
	/* Media request bookkeeping. */
	bool request_is_queued;
	struct list_head requests;
	struct list_head requests_queued;
	struct media_request_object req_obj;
};

/**
 * struct v4l2_ctrl_config - Control configuration structure.
 *
 * @ops:	The control ops.
 * @type_ops:	The control type ops. Only needed for compound controls.
 * @id:	The control ID.
 * @name:	The control name.
 * @type:	The control type.
 * @min:	The control's minimum value.
 * @max:	The control's maximum value.
 * @step:	The control's step value for non-menu controls.
 * @def:	The control's default value.
 * @p_def:	The control's default value for compound controls.
 * @dims:	The size of each dimension.
 * @elem_size:	The size in bytes of the control.
 * @flags:	The control's flags.
 * @menu_skip_mask: The control's skip mask for menu controls. This makes it
 *		easy to skip menu items that are not valid. If bit X is set,
 *		then menu item X is skipped. Of course, this only works for
 *		menus with <= 64 menu items. There are no menus that come
 *		close to that number, so this is OK. Should we ever need more,
 *		then this will have to be extended to a bit array.
 * @qmenu:	A const char * array for all menu items. Array entries that are
 *		empty strings ("") correspond to non-existing menu items (this
 *		is in addition to the menu_skip_mask above). The last entry
 *		must be NULL.
 * @qmenu_int:	A const s64 integer array for all menu items of the type
 *		V4L2_CTRL_TYPE_INTEGER_MENU.
 * @is_private: If set, then this control is private to its handler and it
 *		will not be added to any other handlers.
*/
struct v4l2_ctrl_config {
	const struct v4l2_ctrl_ops *ops;
	const struct v4l2_ctrl_type_ops *type_ops;
	u32 id;
	const char *name;
	enum v4l2_ctrl_type type;
	s64 min;
	s64 max;
	u64 step;
	s64 def;
	union v4l2_ctrl_ptr p_def;
	u32 dims[V4L2_CTRL_MAX_DIMS];
	u32 elem_size;
	u32 flags;
	u64 menu_skip_mask;
	const char * const *qmenu;
	const s64 *qmenu_int;
	unsigned int is_private:1;
};

/**
 * v4l2_ctrl_fill() - Fill in the control fields based on the control ID.
 *
 * @id: ID of the control
 * @name: pointer to be filled with a string with the name of the control
 * @type: pointer for storing the type of the control
 * @min: pointer for storing the minimum value for the control
 * @max: pointer for storing the maximum value for the control
 * @step: pointer for storing the control step
 * @def: pointer for storing the default value for the control
 * @flags: pointer for storing the flags to be used on the control
 *
 * This works for all standard V4L2 controls.
 * For non-standard controls it will only fill in the given arguments
 * and @name content will be set to %NULL.
 *
 * This function will overwrite the contents of @name, @type and @flags.
 * The contents of @min, @max, @step and @def may be modified depending on
 * the type.
 *
 * .. note::
 *
 *    Do not use in drivers! It is used internally for backwards compatibility
 *    control handling only. Once all drivers are converted to use the new
 *    control framework this function will no longer be exported.
 */
void v4l2_ctrl_fill(u32 id, const char **name, enum v4l2_ctrl_type *type,
		    s64 *min, s64 *max, u64 *step, s64 *def, u32 *flags);

/**
 * v4l2_ctrl_handler_init_class() - Initialize the control handler.
 * @hdl:	The control handler.
 * @nr_of_controls_hint: A hint of how many controls this handler is
 *		expected to refer to. This is the total number, so including
 *		any inherited controls.
It doesn't have to be precise, but if * it is way off, then you either waste memory (too many buckets * are allocated) or the control lookup becomes slower (not enough * buckets are allocated, so there are more slow list lookups). * It will always work, though. * @key: Used by the lock validator if CONFIG_LOCKDEP is set. * @name: Used by the lock validator if CONFIG_LOCKDEP is set. * * .. attention:: * * Never use this call directly, always use the v4l2_ctrl_handler_init() * macro that hides the @key and @name arguments. * * Return: returns an error if the buckets could not be allocated. This * error will also be stored in @hdl->error. */ int v4l2_ctrl_handler_init_class(struct v4l2_ctrl_handler *hdl, unsigned int nr_of_controls_hint, struct lock_class_key *key, const char *name); #ifdef CONFIG_LOCKDEP /** * v4l2_ctrl_handler_init - helper function to create a static struct * &lock_class_key and calls v4l2_ctrl_handler_init_class() * * @hdl: The control handler. * @nr_of_controls_hint: A hint of how many controls this handler is * expected to refer to. This is the total number, so including * any inherited controls. It doesn't have to be precise, but if * it is way off, then you either waste memory (too many buckets * are allocated) or the control lookup becomes slower (not enough * buckets are allocated, so there are more slow list lookups). * It will always work, though. * * This helper function creates a static struct &lock_class_key and * calls v4l2_ctrl_handler_init_class(), providing a proper name for the lock * validador. * * Use this helper function to initialize a control handler. 
*/ #define v4l2_ctrl_handler_init(hdl, nr_of_controls_hint) \ ( \ ({ \ static struct lock_class_key _key; \ v4l2_ctrl_handler_init_class(hdl, nr_of_controls_hint, \ &_key, \ KBUILD_BASENAME ":" \ __stringify(__LINE__) ":" \ "(" #hdl ")->_lock"); \ }) \ ) #else #define v4l2_ctrl_handler_init(hdl, nr_of_controls_hint) \ v4l2_ctrl_handler_init_class(hdl, nr_of_controls_hint, NULL, NULL) #endif /** * v4l2_ctrl_handler_free() - Free all controls owned by the handler and free * the control list. * @hdl: The control handler. * * Does nothing if @hdl == NULL. */ void v4l2_ctrl_handler_free(struct v4l2_ctrl_handler *hdl); /** * v4l2_ctrl_lock() - Helper function to lock the handler * associated with the control. * @ctrl: The control to lock. */ static inline void v4l2_ctrl_lock(struct v4l2_ctrl *ctrl) { mutex_lock(ctrl->handler->lock); } /** * v4l2_ctrl_unlock() - Helper function to unlock the handler * associated with the control. * @ctrl: The control to unlock. */ static inline void v4l2_ctrl_unlock(struct v4l2_ctrl *ctrl) { mutex_unlock(ctrl->handler->lock); } /** * __v4l2_ctrl_handler_setup() - Call the s_ctrl op for all controls belonging * to the handler to initialize the hardware to the current control values. The * caller is responsible for acquiring the control handler mutex on behalf of * __v4l2_ctrl_handler_setup(). * @hdl: The control handler. * * Button controls will be skipped, as are read-only controls. * * If @hdl == NULL, then this just returns 0. */ int __v4l2_ctrl_handler_setup(struct v4l2_ctrl_handler *hdl); /** * v4l2_ctrl_handler_setup() - Call the s_ctrl op for all controls belonging * to the handler to initialize the hardware to the current control values. * @hdl: The control handler. * * Button controls will be skipped, as are read-only controls. * * If @hdl == NULL, then this just returns 0. */ int v4l2_ctrl_handler_setup(struct v4l2_ctrl_handler *hdl); /** * v4l2_ctrl_handler_log_status() - Log all controls owned by the handler. 
* @hdl:	The control handler.
 * @prefix:	The prefix to use when logging the control values. If the
 *		prefix does not end with a space, then ": " will be added
 *		after the prefix. If @prefix == NULL, then no prefix will be
 *		used.
 *
 * For use with VIDIOC_LOG_STATUS.
 *
 * Does nothing if @hdl == NULL.
 */
void v4l2_ctrl_handler_log_status(struct v4l2_ctrl_handler *hdl,
				  const char *prefix);

/**
 * v4l2_ctrl_new_custom() - Allocate and initialize a new custom V4L2
 *	control.
 *
 * @hdl:	The control handler.
 * @cfg:	The control's configuration data.
 * @priv:	The control's driver-specific private data.
 *
 * If the &v4l2_ctrl struct could not be allocated then NULL is returned
 * and @hdl->error is set to the error code (if it wasn't set already).
 */
struct v4l2_ctrl *v4l2_ctrl_new_custom(struct v4l2_ctrl_handler *hdl,
				       const struct v4l2_ctrl_config *cfg,
				       void *priv);

/**
 * v4l2_ctrl_new_std() - Allocate and initialize a new standard V4L2 non-menu
 *	control.
 *
 * @hdl:	The control handler.
 * @ops:	The control ops.
 * @id:	The control ID.
 * @min:	The control's minimum value.
 * @max:	The control's maximum value.
 * @step:	The control's step value.
 * @def:	The control's default value.
 *
 * If the &v4l2_ctrl struct could not be allocated, or the control
 * ID is not known, then NULL is returned and @hdl->error is set to the
 * appropriate error code (if it wasn't set already).
 *
 * If @id refers to a menu control, then this function will return NULL.
 *
 * Use v4l2_ctrl_new_std_menu() when adding menu controls.
 */
struct v4l2_ctrl *v4l2_ctrl_new_std(struct v4l2_ctrl_handler *hdl,
				    const struct v4l2_ctrl_ops *ops,
				    u32 id, s64 min, s64 max, u64 step,
				    s64 def);

/**
 * v4l2_ctrl_new_std_menu() - Allocate and initialize a new standard V4L2
 *	menu control.
 *
 * @hdl:	The control handler.
 * @ops:	The control ops.
 * @id:	The control ID.
 * @max:	The control's maximum value.
 * @mask:	The control's skip mask for menu controls. This makes it
 *		easy to skip menu items that are not valid.
If bit X is set,
 *		then menu item X is skipped. Of course, this only works for
 *		menus with <= 64 menu items. There are no menus that come
 *		close to that number, so this is OK. Should we ever need more,
 *		then this will have to be extended to a bit array.
 * @def:	The control's default value.
 *
 * Same as v4l2_ctrl_new_std(), but @min is set to 0 and the @mask value
 * determines which menu items are to be skipped.
 *
 * If @id refers to a non-menu control, then this function will return NULL.
 */
struct v4l2_ctrl *v4l2_ctrl_new_std_menu(struct v4l2_ctrl_handler *hdl,
					 const struct v4l2_ctrl_ops *ops,
					 u32 id, u8 max, u64 mask, u8 def);

/**
 * v4l2_ctrl_new_std_menu_items() - Create a new standard V4L2 menu control
 *	with driver specific menu.
 *
 * @hdl:	The control handler.
 * @ops:	The control ops.
 * @id:	The control ID.
 * @max:	The control's maximum value.
 * @mask:	The control's skip mask for menu controls. This makes it
 *		easy to skip menu items that are not valid. If bit X is set,
 *		then menu item X is skipped. Of course, this only works for
 *		menus with <= 64 menu items. There are no menus that come
 *		close to that number, so this is OK. Should we ever need more,
 *		then this will have to be extended to a bit array.
 * @def:	The control's default value.
 * @qmenu:	The new menu.
 *
 * Same as v4l2_ctrl_new_std_menu(), but @qmenu will be the driver specific
 * menu of this control.
 */
struct v4l2_ctrl *v4l2_ctrl_new_std_menu_items(struct v4l2_ctrl_handler *hdl,
					       const struct v4l2_ctrl_ops *ops,
					       u32 id, u8 max,
					       u64 mask, u8 def,
					       const char * const *qmenu);

/**
 * v4l2_ctrl_new_std_compound() - Allocate and initialize a new standard V4L2
 *	compound control.
 *
 * @hdl:	The control handler.
 * @ops:	The control ops.
 * @id:	The control ID.
 * @p_def:	The control's default value.
 *
 * Same as v4l2_ctrl_new_std(), but with support for compound controls, thanks
 * to the @p_def field. Use v4l2_ctrl_ptr_create() to create @p_def from a
 * pointer. Use v4l2_ctrl_ptr_create(NULL) if the default value of the
 * compound control should be all zeroes.
 */
struct v4l2_ctrl *v4l2_ctrl_new_std_compound(struct v4l2_ctrl_handler *hdl,
					     const struct v4l2_ctrl_ops *ops,
					     u32 id,
					     const union v4l2_ctrl_ptr p_def);

/**
 * v4l2_ctrl_new_int_menu() - Create a new standard V4L2 integer menu control.
 *
 * @hdl:	The control handler.
 * @ops:	The control ops.
 * @id:	The control ID.
 * @max:	The control's maximum value.
 * @def:	The control's default value.
 * @qmenu_int:	The control's menu entries.
 *
 * Same as v4l2_ctrl_new_std_menu(), but @mask is set to 0 and it additionally
 * takes as an argument an array of integers determining the menu items.
 *
 * If @id refers to a non-integer-menu control, then this function will
 * return %NULL.
 */
struct v4l2_ctrl *v4l2_ctrl_new_int_menu(struct v4l2_ctrl_handler *hdl,
					 const struct v4l2_ctrl_ops *ops,
					 u32 id, u8 max, u8 def,
					 const s64 *qmenu_int);

/**
 * typedef v4l2_ctrl_filter - Typedef to define the filter function to be
 *	used when adding a control handler.
 *
 * @ctrl: pointer to struct &v4l2_ctrl.
 */
typedef bool (*v4l2_ctrl_filter)(const struct v4l2_ctrl *ctrl);

/**
 * v4l2_ctrl_add_handler() - Add all controls from handler @add to
 *	handler @hdl.
 *
 * @hdl:	The control handler.
 * @add:	The control handler whose controls you want to add to
 *		the @hdl control handler.
 * @filter:	This function will filter which controls should be added.
 * @from_other_dev: If true, then the controls in @add were defined in another
 *		device than @hdl.
 *
 * Does nothing if either of the two handlers is a NULL pointer.
 * If @filter is NULL, then all controls are added. Otherwise only those
 * controls for which @filter returns true will be added.
 * In case of an error @hdl->error will be set to the error code (if it
 * wasn't set already).
*/
int v4l2_ctrl_add_handler(struct v4l2_ctrl_handler *hdl,
			  struct v4l2_ctrl_handler *add,
			  v4l2_ctrl_filter filter,
			  bool from_other_dev);

/**
 * v4l2_ctrl_radio_filter() - Standard filter for radio controls.
 *
 * @ctrl:	The control that is filtered.
 *
 * This function is to be used with v4l2_ctrl_add_handler().
 *
 * Return: true for any controls that are valid for radio device
 * nodes. Those are all of the V4L2_CID_AUDIO_* user controls and all FM
 * transmitter class controls.
 */
bool v4l2_ctrl_radio_filter(const struct v4l2_ctrl *ctrl);

/**
 * v4l2_ctrl_cluster() - Mark all controls in the cluster as belonging
 *	to that cluster.
 *
 * @ncontrols:	The number of controls in this cluster.
 * @controls:	The cluster control array of size @ncontrols.
 */
void v4l2_ctrl_cluster(unsigned int ncontrols, struct v4l2_ctrl **controls);

/**
 * v4l2_ctrl_auto_cluster() - Mark all controls in the cluster as belonging
 *	to that cluster and set it up for autofoo/foo-type handling.
 *
 * @ncontrols:	The number of controls in this cluster.
 * @controls:	The cluster control array of size @ncontrols. The first control
 *		must be the 'auto' control (e.g. autogain, autoexposure, etc.)
 * @manual_val: The value for the first control in the cluster that equals the
 *		manual setting.
 * @set_volatile: If true, then all controls except the first auto control will
 *		be volatile.
 *
 * Use for control groups where one control selects some automatic feature and
 * the other controls are only active whenever the automatic feature is turned
 * off (manual mode). Typical examples: autogain vs gain, auto-whitebalance vs
 * red and blue balance, etc.
 *
 * The behavior of such controls is as follows:
 *
 * When the autofoo control is set to automatic, then any manual controls
 * are set to inactive and any reads will call g_volatile_ctrl (if the control
 * was marked volatile).
*
 * When the autofoo control is set to manual, then any manual controls will
 * be marked active, and any reads will just return the current value without
 * going through g_volatile_ctrl.
 *
 * In addition, this function will set the %V4L2_CTRL_FLAG_UPDATE flag
 * on the autofoo control and %V4L2_CTRL_FLAG_INACTIVE on the foo control(s)
 * if autofoo is in auto mode.
 */
void v4l2_ctrl_auto_cluster(unsigned int ncontrols,
			    struct v4l2_ctrl **controls,
			    u8 manual_val, bool set_volatile);

/**
 * v4l2_ctrl_find() - Find a control with the given ID.
 *
 * @hdl:	The control handler.
 * @id:	The control ID to find.
 *
 * If @hdl == NULL this will return NULL as well. Will lock the handler so
 * do not use from inside &v4l2_ctrl_ops.
 */
struct v4l2_ctrl *v4l2_ctrl_find(struct v4l2_ctrl_handler *hdl, u32 id);

/**
 * v4l2_ctrl_activate() - Make the control active or inactive.
 * @ctrl:	The control to (de)activate.
 * @active:	True if the control should become active.
 *
 * This sets or clears the V4L2_CTRL_FLAG_INACTIVE flag atomically.
 * Does nothing if @ctrl == NULL.
 * This will usually be called from within the s_ctrl op.
 * The V4L2_EVENT_CTRL event will be generated afterwards.
 *
 * This function assumes that the control handler is locked.
 */
void v4l2_ctrl_activate(struct v4l2_ctrl *ctrl, bool active);

/**
 * __v4l2_ctrl_grab() - Unlocked variant of v4l2_ctrl_grab().
 *
 * @ctrl:	The control to mark as grabbed or not grabbed.
 * @grabbed:	True if the control should become grabbed.
 *
 * This sets or clears the V4L2_CTRL_FLAG_GRABBED flag atomically.
 * Does nothing if @ctrl == NULL.
 * The V4L2_EVENT_CTRL event will be generated afterwards.
 * This will usually be called when starting or stopping streaming in the
 * driver.
 *
 * This function assumes that the control handler is locked by the caller.
 */
void __v4l2_ctrl_grab(struct v4l2_ctrl *ctrl, bool grabbed);

/**
 * v4l2_ctrl_grab() - Mark the control as grabbed or not grabbed.
 *
 * @ctrl:	The control to (de)activate.
* @grabbed: True if the control should become grabbed. * * This sets or clears the V4L2_CTRL_FLAG_GRABBED flag atomically. * Does nothing if @ctrl == NULL. * The V4L2_EVENT_CTRL event will be generated afterwards. * This will usually be called when starting or stopping streaming in the * driver. * * This function assumes that the control handler is not locked and will * take the lock itself. */ static inline void v4l2_ctrl_grab(struct v4l2_ctrl *ctrl, bool grabbed) { if (!ctrl) return; v4l2_ctrl_lock(ctrl); __v4l2_ctrl_grab(ctrl, grabbed); v4l2_ctrl_unlock(ctrl); } /** *__v4l2_ctrl_modify_range() - Unlocked variant of v4l2_ctrl_modify_range() * * @ctrl: The control to update. * @min: The control's minimum value. * @max: The control's maximum value. * @step: The control's step value * @def: The control's default value. * * Update the range of a control on the fly. This works for control types * INTEGER, BOOLEAN, MENU, INTEGER MENU and BITMASK. For menu controls the * @step value is interpreted as a menu_skip_mask. * * An error is returned if one of the range arguments is invalid for this * control type. * * The caller is responsible for acquiring the control handler mutex on behalf * of __v4l2_ctrl_modify_range(). */ int __v4l2_ctrl_modify_range(struct v4l2_ctrl *ctrl, s64 min, s64 max, u64 step, s64 def); /** * v4l2_ctrl_modify_range() - Update the range of a control. * * @ctrl: The control to update. * @min: The control's minimum value. * @max: The control's maximum value. * @step: The control's step value * @def: The control's default value. * * Update the range of a control on the fly. This works for control types * INTEGER, BOOLEAN, MENU, INTEGER MENU and BITMASK. For menu controls the * @step value is interpreted as a menu_skip_mask. * * An error is returned if one of the range arguments is invalid for this * control type. * * This function assumes that the control handler is not locked and will * take the lock itself. 
*/
static inline int v4l2_ctrl_modify_range(struct v4l2_ctrl *ctrl,
					 s64 min, s64 max, u64 step, s64 def)
{
	int ret;

	/* Run the unlocked variant with the handler lock held. */
	v4l2_ctrl_lock(ctrl);
	ret = __v4l2_ctrl_modify_range(ctrl, min, max, step, def);
	v4l2_ctrl_unlock(ctrl);
	return ret;
}

/**
 * __v4l2_ctrl_modify_dimensions() - Unlocked variant of
 *	v4l2_ctrl_modify_dimensions()
 *
 * @ctrl:	The control to update.
 * @dims:	The control's new dimensions.
 *
 * Update the dimensions of an array control on the fly. The elements of the
 * array are reset to their default value, even if the dimensions are
 * unchanged.
 *
 * An error is returned if @dims is invalid for this control.
 *
 * The caller is responsible for acquiring the control handler mutex on behalf
 * of __v4l2_ctrl_modify_dimensions().
 *
 * Note: calling this function when the same control is used in pending requests
 * is untested. It should work (a request with the wrong size of the control
 * will drop that control silently), but it will be very confusing.
 */
int __v4l2_ctrl_modify_dimensions(struct v4l2_ctrl *ctrl,
				  u32 dims[V4L2_CTRL_MAX_DIMS]);

/**
 * v4l2_ctrl_modify_dimensions() - Update the dimensions of an array control.
 *
 * @ctrl:	The control to update.
 * @dims:	The control's new dimensions.
 *
 * Update the dimensions of an array control on the fly. The elements of the
 * array are reset to their default value, even if the dimensions are
 * unchanged.
 *
 * An error is returned if @dims is invalid for this control type.
 *
 * This function assumes that the control handler is not locked and will
 * take the lock itself.
 *
 * Note: calling this function when the same control is used in pending requests
 * is untested. It should work (a request with the wrong size of the control
 * will drop that control silently), but it will be very confusing.
*/ static inline int v4l2_ctrl_modify_dimensions(struct v4l2_ctrl *ctrl, u32 dims[V4L2_CTRL_MAX_DIMS]) { int rval; v4l2_ctrl_lock(ctrl); rval = __v4l2_ctrl_modify_dimensions(ctrl, dims); v4l2_ctrl_unlock(ctrl); return rval; } /** * v4l2_ctrl_notify() - Function to set a notify callback for a control. * * @ctrl: The control. * @notify: The callback function. * @priv: The callback private handle, passed as argument to the callback. * * This function sets a callback function for the control. If @ctrl is NULL, * then it will do nothing. If @notify is NULL, then the notify callback will * be removed. * * There can be only one notify. If another already exists, then a WARN_ON * will be issued and the function will do nothing. */ void v4l2_ctrl_notify(struct v4l2_ctrl *ctrl, v4l2_ctrl_notify_fnc notify, void *priv); /** * v4l2_ctrl_get_name() - Get the name of the control * * @id: The control ID. * * This function returns the name of the given control ID or NULL if it isn't * a known control. */ const char *v4l2_ctrl_get_name(u32 id); /** * v4l2_ctrl_get_menu() - Get the menu string array of the control * * @id: The control ID. * * This function returns the NULL-terminated menu string array name of the * given control ID or NULL if it isn't a known menu control. */ const char * const *v4l2_ctrl_get_menu(u32 id); /** * v4l2_ctrl_get_int_menu() - Get the integer menu array of the control * * @id: The control ID. * @len: The size of the integer array. * * This function returns the integer array of the given control ID or NULL if it * if it isn't a known integer menu control. */ const s64 *v4l2_ctrl_get_int_menu(u32 id, u32 *len); /** * v4l2_ctrl_g_ctrl() - Helper function to get the control's value from * within a driver. * * @ctrl: The control. * * This returns the control's value safely by going through the control * framework. This function will lock the control's handler, so it cannot be * used from within the &v4l2_ctrl_ops functions. 
* * This function is for integer type controls only. */ s32 v4l2_ctrl_g_ctrl(struct v4l2_ctrl *ctrl); /** * __v4l2_ctrl_s_ctrl() - Unlocked variant of v4l2_ctrl_s_ctrl(). * * @ctrl: The control. * @val: The new value. * * This sets the control's new value safely by going through the control * framework. This function assumes the control's handler is already locked, * allowing it to be used from within the &v4l2_ctrl_ops functions. * * This function is for integer type controls only. */ int __v4l2_ctrl_s_ctrl(struct v4l2_ctrl *ctrl, s32 val); /** * v4l2_ctrl_s_ctrl() - Helper function to set the control's value from * within a driver. * @ctrl: The control. * @val: The new value. * * This sets the control's new value safely by going through the control * framework. This function will lock the control's handler, so it cannot be * used from within the &v4l2_ctrl_ops functions. * * This function is for integer type controls only. */ static inline int v4l2_ctrl_s_ctrl(struct v4l2_ctrl *ctrl, s32 val) { int rval; v4l2_ctrl_lock(ctrl); rval = __v4l2_ctrl_s_ctrl(ctrl, val); v4l2_ctrl_unlock(ctrl); return rval; } /** * v4l2_ctrl_g_ctrl_int64() - Helper function to get a 64-bit control's value * from within a driver. * * @ctrl: The control. * * This returns the control's value safely by going through the control * framework. This function will lock the control's handler, so it cannot be * used from within the &v4l2_ctrl_ops functions. * * This function is for 64-bit integer type controls only. */ s64 v4l2_ctrl_g_ctrl_int64(struct v4l2_ctrl *ctrl); /** * __v4l2_ctrl_s_ctrl_int64() - Unlocked variant of v4l2_ctrl_s_ctrl_int64(). * * @ctrl: The control. * @val: The new value. * * This sets the control's new value safely by going through the control * framework. This function assumes the control's handler is already locked, * allowing it to be used from within the &v4l2_ctrl_ops functions. * * This function is for 64-bit integer type controls only. 
*/ int __v4l2_ctrl_s_ctrl_int64(struct v4l2_ctrl *ctrl, s64 val); /** * v4l2_ctrl_s_ctrl_int64() - Helper function to set a 64-bit control's value * from within a driver. * * @ctrl: The control. * @val: The new value. * * This sets the control's new value safely by going through the control * framework. This function will lock the control's handler, so it cannot be * used from within the &v4l2_ctrl_ops functions. * * This function is for 64-bit integer type controls only. */ static inline int v4l2_ctrl_s_ctrl_int64(struct v4l2_ctrl *ctrl, s64 val) { int rval; v4l2_ctrl_lock(ctrl); rval = __v4l2_ctrl_s_ctrl_int64(ctrl, val); v4l2_ctrl_unlock(ctrl); return rval; } /** * __v4l2_ctrl_s_ctrl_string() - Unlocked variant of v4l2_ctrl_s_ctrl_string(). * * @ctrl: The control. * @s: The new string. * * This sets the control's new string safely by going through the control * framework. This function assumes the control's handler is already locked, * allowing it to be used from within the &v4l2_ctrl_ops functions. * * This function is for string type controls only. */ int __v4l2_ctrl_s_ctrl_string(struct v4l2_ctrl *ctrl, const char *s); /** * v4l2_ctrl_s_ctrl_string() - Helper function to set a control's string value * from within a driver. * * @ctrl: The control. * @s: The new string. * * This sets the control's new string safely by going through the control * framework. This function will lock the control's handler, so it cannot be * used from within the &v4l2_ctrl_ops functions. * * This function is for string type controls only. */ static inline int v4l2_ctrl_s_ctrl_string(struct v4l2_ctrl *ctrl, const char *s) { int rval; v4l2_ctrl_lock(ctrl); rval = __v4l2_ctrl_s_ctrl_string(ctrl, s); v4l2_ctrl_unlock(ctrl); return rval; } /** * __v4l2_ctrl_s_ctrl_compound() - Unlocked variant to set a compound control * * @ctrl: The control. * @type: The type of the data. * @p: The new compound payload. 
* * This sets the control's new compound payload safely by going through the * control framework. This function assumes the control's handler is already * locked, allowing it to be used from within the &v4l2_ctrl_ops functions. * * This function is for compound type controls only. */ int __v4l2_ctrl_s_ctrl_compound(struct v4l2_ctrl *ctrl, enum v4l2_ctrl_type type, const void *p); /** * v4l2_ctrl_s_ctrl_compound() - Helper function to set a compound control * from within a driver. * * @ctrl: The control. * @type: The type of the data. * @p: The new compound payload. * * This sets the control's new compound payload safely by going through the * control framework. This function will lock the control's handler, so it * cannot be used from within the &v4l2_ctrl_ops functions. * * This function is for compound type controls only. */ static inline int v4l2_ctrl_s_ctrl_compound(struct v4l2_ctrl *ctrl, enum v4l2_ctrl_type type, const void *p) { int rval; v4l2_ctrl_lock(ctrl); rval = __v4l2_ctrl_s_ctrl_compound(ctrl, type, p); v4l2_ctrl_unlock(ctrl); return rval; } /* Helper defines for area type controls */ #define __v4l2_ctrl_s_ctrl_area(ctrl, area) \ __v4l2_ctrl_s_ctrl_compound((ctrl), V4L2_CTRL_TYPE_AREA, (area)) #define v4l2_ctrl_s_ctrl_area(ctrl, area) \ v4l2_ctrl_s_ctrl_compound((ctrl), V4L2_CTRL_TYPE_AREA, (area)) /* Internal helper functions that deal with control events. 
*/
extern const struct v4l2_subscribed_event_ops v4l2_ctrl_sub_ev_ops;

/**
 * v4l2_ctrl_replace - Function to be used as a callback to
 *	&struct v4l2_subscribed_event_ops replace\(\)
 *
 * @old: pointer to struct &v4l2_event with the reported
 *	 event.
 * @new: pointer to struct &v4l2_event with the modified
 *	 event.
 */
void v4l2_ctrl_replace(struct v4l2_event *old, const struct v4l2_event *new);

/**
 * v4l2_ctrl_merge - Function to be used as a callback to
 *	&struct v4l2_subscribed_event_ops merge\(\)
 *
 * @old: pointer to struct &v4l2_event with the reported
 *	 event.
 * @new: pointer to struct &v4l2_event with the merged
 *	 event.
 */
void v4l2_ctrl_merge(const struct v4l2_event *old, struct v4l2_event *new);

/**
 * v4l2_ctrl_log_status - helper function to implement %VIDIOC_LOG_STATUS ioctl
 *
 * @file: pointer to struct file
 * @fh: unused. Kept just to be compatible with the arguments expected by
 *	&struct v4l2_ioctl_ops.vidioc_log_status.
 *
 * Can be used as a vidioc_log_status function that just dumps all controls
 * associated with the filehandle.
 */
int v4l2_ctrl_log_status(struct file *file, void *fh);

/**
 * v4l2_ctrl_subscribe_event - Subscribes to an event
 *
 * @fh: pointer to struct v4l2_fh
 * @sub: pointer to &struct v4l2_event_subscription
 *
 * Can be used as a vidioc_subscribe_event function that just subscribes
 * control events.
 */
int v4l2_ctrl_subscribe_event(struct v4l2_fh *fh,
				const struct v4l2_event_subscription *sub);

/**
 * v4l2_ctrl_poll - function to be used as a callback to the poll()
 *	That just polls for control events.
*
 * @file: pointer to struct file
 * @wait: pointer to struct poll_table_struct
 */
__poll_t v4l2_ctrl_poll(struct file *file, struct poll_table_struct *wait);

/**
 * v4l2_ctrl_request_setup - helper function to apply control values in a request
 *
 * @req: The request
 * @parent: The parent control handler ('priv' in media_request_object_find())
 *
 * This is a helper function to call the control handler's s_ctrl callback with
 * the control values contained in the request. Do note that this approach of
 * applying control values in a request is only applicable to memory-to-memory
 * devices.
 */
int v4l2_ctrl_request_setup(struct media_request *req,
			     struct v4l2_ctrl_handler *parent);

/**
 * v4l2_ctrl_request_complete - Complete a control handler request object
 *
 * @req: The request
 * @parent: The parent control handler ('priv' in media_request_object_find())
 *
 * This function is to be called on each control handler that may have had a
 * request object associated with it, i.e. control handlers of a driver that
 * supports requests.
 *
 * The function first obtains the values of any volatile controls in the control
 * handler and attaches them to the request. Then, the function completes the
 * request object.
 */
void v4l2_ctrl_request_complete(struct media_request *req,
				struct v4l2_ctrl_handler *parent);

/**
 * v4l2_ctrl_request_hdl_find - Find the control handler in the request
 *
 * @req: The request
 * @parent: The parent control handler ('priv' in media_request_object_find())
 *
 * This function finds the control handler in the request. It may return
 * NULL if not found. When done, you must call v4l2_ctrl_request_hdl_put()
 * with the returned handler pointer.
 *
 * If the request is not in state VALIDATING or QUEUED, then this function
 * will always return NULL.
 *
 * Note that in state VALIDATING the req_queue_mutex is held, so
 * no objects can be added or deleted from the request.
 *
 * In state QUEUED it is the driver that will have to ensure this.
*/ struct v4l2_ctrl_handler *v4l2_ctrl_request_hdl_find(struct media_request *req, struct v4l2_ctrl_handler *parent); /** * v4l2_ctrl_request_hdl_put - Put the control handler * * @hdl: Put this control handler * * This function released the control handler previously obtained from' * v4l2_ctrl_request_hdl_find(). */ static inline void v4l2_ctrl_request_hdl_put(struct v4l2_ctrl_handler *hdl) { if (hdl) media_request_object_put(&hdl->req_obj); } /** * v4l2_ctrl_request_hdl_ctrl_find() - Find a control with the given ID. * * @hdl: The control handler from the request. * @id: The ID of the control to find. * * This function returns a pointer to the control if this control is * part of the request or NULL otherwise. */ struct v4l2_ctrl * v4l2_ctrl_request_hdl_ctrl_find(struct v4l2_ctrl_handler *hdl, u32 id); /* Helpers for ioctl_ops */ /** * v4l2_queryctrl - Helper function to implement * :ref:`VIDIOC_QUERYCTRL <vidioc_queryctrl>` ioctl * * @hdl: pointer to &struct v4l2_ctrl_handler * @qc: pointer to &struct v4l2_queryctrl * * If hdl == NULL then they will all return -EINVAL. */ int v4l2_queryctrl(struct v4l2_ctrl_handler *hdl, struct v4l2_queryctrl *qc); /** * v4l2_query_ext_ctrl - Helper function to implement * :ref:`VIDIOC_QUERY_EXT_CTRL <vidioc_queryctrl>` ioctl * * @hdl: pointer to &struct v4l2_ctrl_handler * @qc: pointer to &struct v4l2_query_ext_ctrl * * If hdl == NULL then they will all return -EINVAL. */ int v4l2_query_ext_ctrl(struct v4l2_ctrl_handler *hdl, struct v4l2_query_ext_ctrl *qc); /** * v4l2_querymenu - Helper function to implement * :ref:`VIDIOC_QUERYMENU <vidioc_queryctrl>` ioctl * * @hdl: pointer to &struct v4l2_ctrl_handler * @qm: pointer to &struct v4l2_querymenu * * If hdl == NULL then they will all return -EINVAL. 
*/ int v4l2_querymenu(struct v4l2_ctrl_handler *hdl, struct v4l2_querymenu *qm); /** * v4l2_g_ctrl - Helper function to implement * :ref:`VIDIOC_G_CTRL <vidioc_g_ctrl>` ioctl * * @hdl: pointer to &struct v4l2_ctrl_handler * @ctrl: pointer to &struct v4l2_control * * If hdl == NULL then they will all return -EINVAL. */ int v4l2_g_ctrl(struct v4l2_ctrl_handler *hdl, struct v4l2_control *ctrl); /** * v4l2_s_ctrl - Helper function to implement * :ref:`VIDIOC_S_CTRL <vidioc_g_ctrl>` ioctl * * @fh: pointer to &struct v4l2_fh * @hdl: pointer to &struct v4l2_ctrl_handler * * @ctrl: pointer to &struct v4l2_control * * If hdl == NULL then they will all return -EINVAL. */ int v4l2_s_ctrl(struct v4l2_fh *fh, struct v4l2_ctrl_handler *hdl, struct v4l2_control *ctrl); /** * v4l2_g_ext_ctrls - Helper function to implement * :ref:`VIDIOC_G_EXT_CTRLS <vidioc_g_ext_ctrls>` ioctl * * @hdl: pointer to &struct v4l2_ctrl_handler * @vdev: pointer to &struct video_device * @mdev: pointer to &struct media_device * @c: pointer to &struct v4l2_ext_controls * * If hdl == NULL then they will all return -EINVAL. */ int v4l2_g_ext_ctrls(struct v4l2_ctrl_handler *hdl, struct video_device *vdev, struct media_device *mdev, struct v4l2_ext_controls *c); /** * v4l2_try_ext_ctrls - Helper function to implement * :ref:`VIDIOC_TRY_EXT_CTRLS <vidioc_g_ext_ctrls>` ioctl * * @hdl: pointer to &struct v4l2_ctrl_handler * @vdev: pointer to &struct video_device * @mdev: pointer to &struct media_device * @c: pointer to &struct v4l2_ext_controls * * If hdl == NULL then they will all return -EINVAL. 
*/ int v4l2_try_ext_ctrls(struct v4l2_ctrl_handler *hdl, struct video_device *vdev, struct media_device *mdev, struct v4l2_ext_controls *c); /** * v4l2_s_ext_ctrls - Helper function to implement * :ref:`VIDIOC_S_EXT_CTRLS <vidioc_g_ext_ctrls>` ioctl * * @fh: pointer to &struct v4l2_fh * @hdl: pointer to &struct v4l2_ctrl_handler * @vdev: pointer to &struct video_device * @mdev: pointer to &struct media_device * @c: pointer to &struct v4l2_ext_controls * * If hdl == NULL then they will all return -EINVAL. */ int v4l2_s_ext_ctrls(struct v4l2_fh *fh, struct v4l2_ctrl_handler *hdl, struct video_device *vdev, struct media_device *mdev, struct v4l2_ext_controls *c); /** * v4l2_ctrl_subdev_subscribe_event - Helper function to implement * as a &struct v4l2_subdev_core_ops subscribe_event function * that just subscribes control events. * * @sd: pointer to &struct v4l2_subdev * @fh: pointer to &struct v4l2_fh * @sub: pointer to &struct v4l2_event_subscription */ int v4l2_ctrl_subdev_subscribe_event(struct v4l2_subdev *sd, struct v4l2_fh *fh, struct v4l2_event_subscription *sub); /** * v4l2_ctrl_subdev_log_status - Log all controls owned by subdev's control * handler. * * @sd: pointer to &struct v4l2_subdev */ int v4l2_ctrl_subdev_log_status(struct v4l2_subdev *sd); /** * v4l2_ctrl_new_fwnode_properties() - Register controls for the device * properties * * @hdl: pointer to &struct v4l2_ctrl_handler to register controls on * @ctrl_ops: pointer to &struct v4l2_ctrl_ops to register controls with * @p: pointer to &struct v4l2_fwnode_device_properties * * This function registers controls associated to device properties, using the * property values contained in @p parameter, if the property has been set to * a value. * * Currently the following v4l2 controls are parsed and registered: * - V4L2_CID_CAMERA_ORIENTATION * - V4L2_CID_CAMERA_SENSOR_ROTATION; * * Controls already registered by the caller with the @hdl control handler are * not overwritten. 
Callers should register the controls they want to handle * themselves before calling this function. * * Return: 0 on success, a negative error code on failure. */ int v4l2_ctrl_new_fwnode_properties(struct v4l2_ctrl_handler *hdl, const struct v4l2_ctrl_ops *ctrl_ops, const struct v4l2_fwnode_device_properties *p); /** * v4l2_ctrl_type_op_equal - Default v4l2_ctrl_type_ops equal callback. * * @ctrl: The v4l2_ctrl pointer. * @ptr1: A v4l2 control value. * @ptr2: A v4l2 control value. * * Return: true if values are equal, otherwise false. */ bool v4l2_ctrl_type_op_equal(const struct v4l2_ctrl *ctrl, union v4l2_ctrl_ptr ptr1, union v4l2_ctrl_ptr ptr2); /** * v4l2_ctrl_type_op_init - Default v4l2_ctrl_type_ops init callback. * * @ctrl: The v4l2_ctrl pointer. * @from_idx: Starting element index. * @ptr: The v4l2 control value. * * Return: void */ void v4l2_ctrl_type_op_init(const struct v4l2_ctrl *ctrl, u32 from_idx, union v4l2_ctrl_ptr ptr); /** * v4l2_ctrl_type_op_log - Default v4l2_ctrl_type_ops log callback. * * @ctrl: The v4l2_ctrl pointer. * * Return: void */ void v4l2_ctrl_type_op_log(const struct v4l2_ctrl *ctrl); /** * v4l2_ctrl_type_op_validate - Default v4l2_ctrl_type_ops validate callback. * * @ctrl: The v4l2_ctrl pointer. * @ptr: The v4l2 control value. * * Return: 0 on success, a negative error code on failure. */ int v4l2_ctrl_type_op_validate(const struct v4l2_ctrl *ctrl, union v4l2_ctrl_ptr ptr); #endif
309 9646 4204 11059 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177
/* SPDX-License-Identifier: GPL-2.0-only */
#ifndef _LINUX_FILE_REF_H
#define _LINUX_FILE_REF_H

#include <linux/atomic.h>
#include <linux/preempt.h>
#include <linux/types.h>

/*
 * file_ref is a reference count implementation specifically for use by
 * files. It takes inspiration from rcuref but differs in key aspects
 * such as support for SLAB_TYPESAFE_BY_RCU type caches.
 *
 * The stored value is biased by one: FILE_REF_ONEREF (0) means that a
 * single reference is held, see file_ref_init() and file_ref_read().
 *
 * FILE_REF_ONEREF                FILE_REF_MAXREF
 * 0x0000000000000000UL      0x7FFFFFFFFFFFFFFFUL
 * <-------------------valid ------------------->
 *
 *                       FILE_REF_SATURATED
 * 0x8000000000000000UL 0xA000000000000000UL 0xBFFFFFFFFFFFFFFFUL
 * <-----------------------saturation zone---------------------->
 *
 * FILE_REF_RELEASED                   FILE_REF_DEAD
 * 0xC000000000000000UL         0xE000000000000000UL
 * <-------------------dead zone------------------->
 *
 * FILE_REF_NOREF
 * 0xFFFFFFFFFFFFFFFFUL
 */

#ifdef CONFIG_64BIT
#define FILE_REF_ONEREF		0x0000000000000000UL
#define FILE_REF_MAXREF		0x7FFFFFFFFFFFFFFFUL
#define FILE_REF_SATURATED	0xA000000000000000UL
#define FILE_REF_RELEASED	0xC000000000000000UL
#define FILE_REF_DEAD		0xE000000000000000UL
#define FILE_REF_NOREF		0xFFFFFFFFFFFFFFFFUL
#else
#define FILE_REF_ONEREF		0x00000000U
#define FILE_REF_MAXREF		0x7FFFFFFFU
#define FILE_REF_SATURATED	0xA0000000U
#define FILE_REF_RELEASED	0xC0000000U
#define FILE_REF_DEAD		0xE0000000U
#define FILE_REF_NOREF		0xFFFFFFFFU
#endif

/* The counter is word sized: 64 bit with CONFIG_64BIT, 32 bit otherwise. */
typedef struct {
#ifdef CONFIG_64BIT
	atomic64_t refcnt;
#else
	atomic_t refcnt;
#endif
} file_ref_t;

/**
 * file_ref_init - Initialize a file reference count
 * @ref: Pointer to the reference count
 * @cnt: The initial reference count typically '1'
 *
 * The value stored is @cnt - 1, matching the biased-by-one encoding in
 * which FILE_REF_ONEREF (0) stands for one held reference.
 */
static inline void file_ref_init(file_ref_t *ref, unsigned long cnt)
{
	atomic_long_set(&ref->refcnt, cnt - 1);
}

bool __file_ref_put(file_ref_t *ref, unsigned long cnt);

/**
 * file_ref_get - Acquire one reference on a file
 * @ref: Pointer to the reference count
 *
 * Similar to atomic_inc_not_zero() but saturates at FILE_REF_MAXREF.
 *
 * Provides full memory ordering.
 *
 * Return: False if the attempt to acquire a reference failed. This happens
 *         when the last reference has been put already. True if a reference
 *         was successfully acquired
 */
static __always_inline __must_check bool file_ref_get(file_ref_t *ref)
{
	/*
	 * Unconditionally increase the reference count with full
	 * ordering. The saturation and dead zones provide enough
	 * tolerance for this.
	 *
	 * If the result is negative, the file in question may already
	 * have been freed and immediately reused due to
	 * SLAB_TYPESAFE_BY_RCU. Hence, unconditionally altering the file
	 * reference count, e.g., to reset it back to the middle of the
	 * dead zone, risks marking someone else's file as dead behind
	 * their back.
	 *
	 * It would be possible to do a careful:
	 *
	 * cnt = atomic_long_inc_return();
	 * if (likely(cnt >= 0))
	 *	return true;
	 *
	 * and then something like:
	 *
	 * if (cnt >= FILE_REF_RELEASED)
	 *	atomic_long_try_cmpxchg(&ref->refcnt, &cnt, FILE_REF_DEAD),
	 *
	 * to set the value back to the middle of the dead zone. But it's
	 * practically impossible to go from FILE_REF_DEAD to
	 * FILE_REF_ONEREF. With the 64-bit constants it would need
	 * 2305843009213693952 (2^61) file_ref_get()s to resurrect such a
	 * dead file.
	 */
	return !atomic_long_add_negative(1, &ref->refcnt);
}

/**
 * file_ref_inc - Acquire one reference on a file
 * @ref: Pointer to the reference count
 *
 * Acquire an additional reference on a file. Warns if the caller didn't
 * already hold a reference.
 *
 * Unlike file_ref_get() the increment is done with a relaxed atomic and
 * there is no return value.
 */
static __always_inline void file_ref_inc(file_ref_t *ref)
{
	long prior = atomic_long_fetch_inc_relaxed(&ref->refcnt);

	/* A negative prior value lies outside the valid zone. */
	WARN_ONCE(prior < 0, "file_ref_inc() on a released file reference");
}

/**
 * file_ref_put -- Release a file reference
 * @ref:	Pointer to the reference count
 *
 * Provides release memory ordering, such that prior loads and stores
 * are done before, and provides an acquire ordering on success such
 * that free() must come after.
 *
 * Return: True if this was the last reference with no future references
 *         possible. This signals the caller that it can safely release
 *         the object which is protected by the reference counter.
 *         False if there are still active references or the put() raced
 *         with a concurrent get()/put() pair. Caller is not allowed to
 *         release the protected object.
 */
static __always_inline __must_check bool file_ref_put(file_ref_t *ref)
{
	long cnt;

	/*
	 * While files are SLAB_TYPESAFE_BY_RCU and thus file_ref_put()
	 * calls don't risk UAFs when a file is recycled, it is still
	 * vulnerable to UAFs caused by freeing the whole slab page once
	 * it becomes unused. Preventing file_ref_put() from being
	 * preempted protects against this.
	 */
	guard(preempt)();

	/*
	 * Unconditionally decrease the reference count. The saturation
	 * and dead zones provide enough tolerance for this. If this
	 * fails then we need to handle the last reference drop and
	 * cases inside the saturation and dead zones.
	 */
	cnt = atomic_long_dec_return(&ref->refcnt);
	if (cnt >= 0)
		return false;

	/* Negative result: defer to the out-of-line slow path. */
	return __file_ref_put(ref, cnt);
}

/**
 * file_ref_read - Read the number of file references
 * @ref: Pointer to the reference count
 *
 * Return: The number of held references (0 ... N)
 */
static inline unsigned long file_ref_read(file_ref_t *ref)
{
	unsigned long c = atomic_long_read(&ref->refcnt);

	/* Return 0 if within the DEAD zone. */
	return c >= FILE_REF_RELEASED ? 0 : c + 1;
}

#endif
1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 
526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 
1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093
// SPDX-License-Identifier: GPL-2.0-or-later
/* Copyright (c) 2014 Mahesh Bandewar <maheshb@google.com>
 */

#include <linux/ethtool.h>

#include "ipvlan.h"

/* Switch @port to mode @nval. RTNL must be held (see ASSERT_RTNL()).
 *
 * When the mode really changes, every slave on port->ipvlans gets IFF_NOARP
 * set for L3/L3S and cleared for any other mode; afterwards
 * ipvlan_l3s_register() is called when entering L3S, or
 * ipvlan_l3s_unregister() when leaving it. port->mode is written only after
 * all of that succeeded.
 *
 * Returns 0 on success, otherwise the error from dev_change_flags() or
 * ipvlan_l3s_register(); the flag changes made so far are then reverted
 * according to the (still old) port->mode.
 */
static int ipvlan_set_port_mode(struct ipvl_port *port, u16 nval,
				struct netlink_ext_ack *extack)
{
	struct ipvl_dev *ipvlan;
	unsigned int flags;
	int err;

	ASSERT_RTNL();
	if (port->mode != nval) {
		list_for_each_entry(ipvlan, &port->ipvlans, pnode) {
			flags = ipvlan->dev->flags;
			if (nval == IPVLAN_MODE_L3 || nval == IPVLAN_MODE_L3S) {
				err = dev_change_flags(ipvlan->dev,
						       flags | IFF_NOARP,
						       extack);
			} else {
				err = dev_change_flags(ipvlan->dev,
						       flags & ~IFF_NOARP,
						       extack);
			}
			if (unlikely(err))
				goto fail;
		}
		if (nval == IPVLAN_MODE_L3S) {
			/* New mode is L3S */
			err = ipvlan_l3s_register(port);
			if (err)
				goto fail;
		} else if (port->mode == IPVLAN_MODE_L3S) {
			/* Old mode was L3S */
			ipvlan_l3s_unregister(port);
		}
		port->mode = nval;
	}
	return 0;

fail:
	/* Undo the flags changes that have been done so far. */
	list_for_each_entry_continue_reverse(ipvlan, &port->ipvlans, pnode) {
		flags = ipvlan->dev->flags;
		if (port->mode == IPVLAN_MODE_L3 ||
		    port->mode == IPVLAN_MODE_L3S)
			dev_change_flags(ipvlan->dev, flags | IFF_NOARP,
					 NULL);
		else
			dev_change_flags(ipvlan->dev, flags & ~IFF_NOARP,
					 NULL);
	}

	return err;
}

/* Allocate and initialise the ipvl_port for @dev and register
 * ipvlan_handle_frame() as the rx handler of @dev. A new port starts in
 * IPVLAN_MODE_L3 with dev_id_start set to 1. On success a tracked reference
 * on @dev is taken via netdev_hold().
 *
 * Returns 0, -ENOMEM if the allocation fails, or the error from
 * netdev_rx_handler_register() (the port is freed again in that case).
 */
static int ipvlan_port_create(struct net_device *dev)
{
	struct ipvl_port *port;
	int err, idx;

	port = kzalloc(sizeof(struct ipvl_port), GFP_KERNEL);
	if (!port)
		return -ENOMEM;

	write_pnet(&port->pnet, dev_net(dev));
	port->dev = dev;
	port->mode = IPVLAN_MODE_L3;
	INIT_LIST_HEAD(&port->ipvlans);
	for (idx = 0; idx < IPVLAN_HASH_SIZE; idx++)
		INIT_HLIST_HEAD(&port->hlhead[idx]);

	skb_queue_head_init(&port->backlog);
	INIT_WORK(&port->wq, ipvlan_process_multicast);
	ida_init(&port->ida);
	port->dev_id_start = 1;

	err = netdev_rx_handler_register(dev, ipvlan_handle_frame, port);
	if (err)
		goto err;

	netdev_hold(dev, &port->dev_tracker, GFP_KERNEL);
	return 0;

err:
	kfree(port);
	return err;
}

/* Tear down the port attached to @dev: drop the tracked device reference,
 * unregister the L3S hooks if the port is in L3S mode, unregister the rx
 * handler, cancel the multicast work, release every skb still sitting on the
 * backlog queue (together with its device reference) and free the port.
 */
static void ipvlan_port_destroy(struct net_device *dev)
{
	struct ipvl_port *port = ipvlan_port_get_rtnl(dev);
	struct sk_buff *skb;

	netdev_put(dev, &port->dev_tracker);
	if (port->mode == IPVLAN_MODE_L3S)
		ipvlan_l3s_unregister(port);
	netdev_rx_handler_unregister(dev);
	cancel_work_sync(&port->wq);
	while ((skb = __skb_dequeue(&port->backlog)) != NULL) {
		dev_put(skb->dev);
		kfree_skb(skb);
	}
	ida_destroy(&port->ida);
	kfree(port);
}

/* Feature bits that ipvlan_init() and ipvlan_fix_features() always OR in. */
#define IPVLAN_ALWAYS_ON_OFLOADS \
	(NETIF_F_SG | NETIF_F_HW_CSUM | \
	 NETIF_F_GSO_ROBUST | NETIF_F_GSO_SOFTWARE | NETIF_F_GSO_ENCAP_ALL)

#define IPVLAN_ALWAYS_ON \
	(IPVLAN_ALWAYS_ON_OFLOADS | NETIF_F_VLAN_CHALLENGED)

/* Feature bits a slave takes over from the lower device. */
#define IPVLAN_FEATURES \
	(NETIF_F_SG | NETIF_F_HW_CSUM | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST | \
	 NETIF_F_GSO | NETIF_F_ALL_TSO | NETIF_F_GSO_ROBUST | \
	 NETIF_F_GRO | NETIF_F_RXCSUM | \
	 NETIF_F_HW_VLAN_CTAG_FILTER | NETIF_F_HW_VLAN_STAG_FILTER)

	/* NETIF_F_GSO_ENCAP_ALL NETIF_F_GSO_SOFTWARE Newly added */

/* Link-state bits copied from the lower device in ipvlan_init(). */
#define IPVLAN_STATE_MASK \
	((1<<__LINK_STATE_NOCARRIER) | (1<<__LINK_STATE_DORMANT))

/* ndo_init: derive link-state bits, feature sets, TSO limits and
 * hard_header_len from the lower device, allocate the per-CPU stats, create
 * the port if the lower device is not an ipvlan port yet, and account this
 * slave in port->count.
 *
 * Returns 0, -ENOMEM, or the error from ipvlan_port_create().
 */
static int ipvlan_init(struct net_device *dev)
{
	struct ipvl_dev *ipvlan = netdev_priv(dev);
	struct net_device *phy_dev = ipvlan->phy_dev;
	struct ipvl_port *port;
	int err;

	dev->state = (dev->state & ~IPVLAN_STATE_MASK) |
		     (phy_dev->state & IPVLAN_STATE_MASK);
	dev->features = phy_dev->features & IPVLAN_FEATURES;
	dev->features |= IPVLAN_ALWAYS_ON;
	dev->vlan_features = phy_dev->vlan_features & IPVLAN_FEATURES;
	dev->vlan_features |= IPVLAN_ALWAYS_ON_OFLOADS;
	dev->hw_enc_features |= dev->features;
	dev->lltx = true;
	netif_inherit_tso_max(dev, phy_dev);
	dev->hard_header_len = phy_dev->hard_header_len;

	netdev_lockdep_set_classes(dev);

	ipvlan->pcpu_stats = netdev_alloc_pcpu_stats(struct ipvl_pcpu_stats);
	if (!ipvlan->pcpu_stats)
		return -ENOMEM;

	if (!netif_is_ipvlan_port(phy_dev)) {
		err = ipvlan_port_create(phy_dev);
		if (err < 0) {
			free_percpu(ipvlan->pcpu_stats);
			return err;
		}
	}
	port = ipvlan_port_get_rtnl(phy_dev);
	port->count += 1;
	return 0;
}

/* ndo_uninit: free the per-CPU stats, drop this slave from port->count and
 * destroy the port once the count reaches zero.
 */
static void ipvlan_uninit(struct net_device *dev)
{
	struct ipvl_dev *ipvlan = netdev_priv(dev);
	struct net_device *phy_dev = ipvlan->phy_dev;
	struct ipvl_port *port;

	free_percpu(ipvlan->pcpu_stats);

	port = ipvlan_port_get_rtnl(phy_dev);
	port->count -= 1;
	if (!port->count)
		ipvlan_port_destroy(port->dev);
}

/* ndo_open: set IFF_NOARP for L3/L3S ports (clear it otherwise) and pass
 * every address on ipvlan->addrs to ipvlan_ht_addr_add(). Always returns 0.
 */
static int ipvlan_open(struct net_device *dev)
{
	struct ipvl_dev *ipvlan = netdev_priv(dev);
	struct ipvl_addr *addr;

	if (ipvlan->port->mode == IPVLAN_MODE_L3 ||
	    ipvlan->port->mode == IPVLAN_MODE_L3S)
		dev->flags |= IFF_NOARP;
	else
		dev->flags &= ~IFF_NOARP;

	rcu_read_lock();
	list_for_each_entry_rcu(addr, &ipvlan->addrs, anode)
		ipvlan_ht_addr_add(ipvlan, addr);
	rcu_read_unlock();

	return 0;
}

/* ndo_stop: unsync the unicast/multicast address lists from the lower
 * device and pass every address on ipvlan->addrs to ipvlan_ht_addr_del().
 * Always returns 0.
 */
static int ipvlan_stop(struct net_device *dev)
{
	struct ipvl_dev *ipvlan = netdev_priv(dev);
	struct net_device *phy_dev = ipvlan->phy_dev;
	struct ipvl_addr *addr;

	dev_uc_unsync(phy_dev, dev);
	dev_mc_unsync(phy_dev, dev);

	rcu_read_lock();
	list_for_each_entry_rcu(addr, &ipvlan->addrs, anode)
		ipvlan_ht_addr_del(addr);
	rcu_read_unlock();

	return 0;
}

/* ndo_start_xmit: hand @skb to ipvlan_queue_xmit() and account the result
 * in the per-CPU stats: tx_pkts/tx_bytes on NET_XMIT_SUCCESS or NET_XMIT_CN,
 * tx_drps otherwise. Returns the value from ipvlan_queue_xmit().
 */
static netdev_tx_t ipvlan_start_xmit(struct sk_buff *skb,
				     struct net_device *dev)
{
	const struct ipvl_dev *ipvlan = netdev_priv(dev);
	/* Length is sampled before the skb is handed off. */
	int skblen = skb->len;
	int ret;

	ret = ipvlan_queue_xmit(skb, dev);
	if (likely(ret == NET_XMIT_SUCCESS || ret == NET_XMIT_CN)) {
		struct ipvl_pcpu_stats *pcptr;

		pcptr = this_cpu_ptr(ipvlan->pcpu_stats);

		u64_stats_update_begin(&pcptr->syncp);
		u64_stats_inc(&pcptr->tx_pkts);
		u64_stats_add(&pcptr->tx_bytes, skblen);
		u64_stats_update_end(&pcptr->syncp);
	} else {
		this_cpu_inc(ipvlan->pcpu_stats->tx_drps);
	}
	return ret;
}

/* ndo_fix_features: restrict the requested @features by ipvlan->sfeatures
 * (for the bits in IPVLAN_FEATURES), combine them with the lower device's
 * features through netdev_increment_features(), force IPVLAN_ALWAYS_ON and
 * finally mask the result to IPVLAN_FEATURES | IPVLAN_ALWAYS_ON.
 */
static netdev_features_t ipvlan_fix_features(struct net_device *dev,
					     netdev_features_t features)
{
	struct ipvl_dev *ipvlan = netdev_priv(dev);

	features |= NETIF_F_ALL_FOR_ALL;
	features &= (ipvlan->sfeatures | ~IPVLAN_FEATURES);
	features = netdev_increment_features(ipvlan->phy_dev->features,
					     features, features);
	features |= IPVLAN_ALWAYS_ON;
	features &= (IPVLAN_FEATURES | IPVLAN_ALWAYS_ON);

	return features;
}

/* ndo_change_rx_flags: propagate an IFF_ALLMULTI change to the lower device
 * (+1 when the flag is now set on @dev, -1 when it is now clear).
 */
static void ipvlan_change_rx_flags(struct net_device *dev, int change)
{
	struct ipvl_dev *ipvlan = netdev_priv(dev);
	struct net_device *phy_dev = ipvlan->phy_dev;

	if (change & IFF_ALLMULTI)
		dev_set_allmulti(phy_dev, dev->flags & IFF_ALLMULTI? 1 : -1);
}

/* ndo_set_rx_mode: rebuild ipvlan->mac_filters. With IFF_PROMISC or
 * IFF_ALLMULTI the bitmap is filled completely; otherwise it holds the hash
 * of every multicast address of @dev plus the broadcast address. The unicast
 * and multicast lists are then synced to the lower device.
 */
static void ipvlan_set_multicast_mac_filter(struct net_device *dev)
{
	struct ipvl_dev *ipvlan = netdev_priv(dev);

	if (dev->flags & (IFF_PROMISC | IFF_ALLMULTI)) {
		bitmap_fill(ipvlan->mac_filters, IPVLAN_MAC_FILTER_SIZE);
	} else {
		struct netdev_hw_addr *ha;
		DECLARE_BITMAP(mc_filters, IPVLAN_MAC_FILTER_SIZE);

		bitmap_zero(mc_filters, IPVLAN_MAC_FILTER_SIZE);
		netdev_for_each_mc_addr(ha, dev)
			__set_bit(ipvlan_mac_hash(ha->addr), mc_filters);

		/* Turn-on broadcast bit irrespective of address family,
		 * since broadcast is deferred to a work-queue, hence no
		 * impact on fast-path processing.
		 */
		__set_bit(ipvlan_mac_hash(dev->broadcast), mc_filters);

		bitmap_copy(ipvlan->mac_filters, mc_filters,
			    IPVLAN_MAC_FILTER_SIZE);
	}
	dev_uc_sync(ipvlan->phy_dev, dev);
	dev_mc_sync(ipvlan->phy_dev, dev);
}

/* ndo_get_stats64: add the per-CPU counters of every possible CPU into @s.
 * The u64 counters are read under the u64_stats retry loop; the u32 error
 * and drop counters are read with READ_ONCE(). Note that rx_errs feeds both
 * s->rx_errors and s->rx_dropped. tx_errors comes from the device stats.
 */
static void ipvlan_get_stats64(struct net_device *dev,
			       struct rtnl_link_stats64 *s)
{
	struct ipvl_dev *ipvlan = netdev_priv(dev);

	if (ipvlan->pcpu_stats) {
		struct ipvl_pcpu_stats *pcptr;
		u64 rx_pkts, rx_bytes, rx_mcast, tx_pkts, tx_bytes;
		u32 rx_errs = 0, tx_drps = 0;
		u32 strt;
		int idx;

		for_each_possible_cpu(idx) {
			pcptr = per_cpu_ptr(ipvlan->pcpu_stats, idx);
			do {
				strt = u64_stats_fetch_begin(&pcptr->syncp);
				rx_pkts = u64_stats_read(&pcptr->rx_pkts);
				rx_bytes = u64_stats_read(&pcptr->rx_bytes);
				rx_mcast = u64_stats_read(&pcptr->rx_mcast);
				tx_pkts = u64_stats_read(&pcptr->tx_pkts);
				tx_bytes = u64_stats_read(&pcptr->tx_bytes);
			} while (u64_stats_fetch_retry(&pcptr->syncp,
						       strt));

			s->rx_packets += rx_pkts;
			s->rx_bytes += rx_bytes;
			s->multicast += rx_mcast;
			s->tx_packets += tx_pkts;
			s->tx_bytes += tx_bytes;

			/* u32 values are updated without syncp protection. */
			rx_errs += READ_ONCE(pcptr->rx_errs);
			tx_drps += READ_ONCE(pcptr->tx_drps);
		}
		s->rx_errors = rx_errs;
		s->rx_dropped = rx_errs;
		s->tx_dropped = tx_drps;
	}
	s->tx_errors = DEV_STATS_READ(dev, tx_errors);
}

/* ndo_vlan_rx_add_vid: forward the VLAN id registration to the lower device. */
static int ipvlan_vlan_rx_add_vid(struct net_device *dev, __be16 proto, u16 vid)
{
	struct ipvl_dev *ipvlan = netdev_priv(dev);
	struct net_device *phy_dev = ipvlan->phy_dev;

	return vlan_vid_add(phy_dev, proto, vid);
}

/* ndo_vlan_rx_kill_vid: forward the VLAN id removal to the lower device.
 * Always returns 0.
 */
static int ipvlan_vlan_rx_kill_vid(struct net_device *dev, __be16 proto,
				   u16 vid)
{
	struct ipvl_dev *ipvlan = netdev_priv(dev);
	struct net_device *phy_dev = ipvlan->phy_dev;

	vlan_vid_del(phy_dev, proto, vid);
	return 0;
}

/* ndo_get_iflink: report the ifindex of the lower device. */
static int ipvlan_get_iflink(const struct net_device *dev)
{
	struct ipvl_dev *ipvlan = netdev_priv(dev);

	return READ_ONCE(ipvlan->phy_dev->ifindex);
}

static const struct net_device_ops ipvlan_netdev_ops = {
	.ndo_init		= ipvlan_init,
	.ndo_uninit		= ipvlan_uninit,
	.ndo_open		= ipvlan_open,
	.ndo_stop		= ipvlan_stop,
	.ndo_start_xmit		= ipvlan_start_xmit,
	.ndo_fix_features	= ipvlan_fix_features,
	.ndo_change_rx_flags	= ipvlan_change_rx_flags,
	.ndo_set_rx_mode	= ipvlan_set_multicast_mac_filter,
	.ndo_get_stats64	= ipvlan_get_stats64,
	.ndo_vlan_rx_add_vid	= ipvlan_vlan_rx_add_vid,
	.ndo_vlan_rx_kill_vid	= ipvlan_vlan_rx_kill_vid,
	.ndo_get_iflink		= ipvlan_get_iflink,
};

/* header_ops->create: build the link-layer header via the lower device,
 * falling back to the lower device's address when @saddr is NULL.
 */
static int ipvlan_hard_header(struct sk_buff *skb, struct net_device *dev,
			      unsigned short type, const void *daddr,
			      const void *saddr, unsigned len)
{
	const struct ipvl_dev *ipvlan = netdev_priv(dev);
	struct net_device *phy_dev = ipvlan->phy_dev;

	/* TODO Probably use a different field than dev_addr so that the
	 * mac-address on the virtual device is portable and can be carried
	 * while the packets use the mac-addr on the physical device.
	 */
	return dev_hard_header(skb, phy_dev, type, daddr,
			       saddr ? : phy_dev->dev_addr, len);
}

static const struct header_ops ipvlan_header_ops = {
	.create		= ipvlan_hard_header,
	.parse		= eth_header_parse,
	.cache		= eth_header_cache,
	.cache_update	= eth_header_cache_update,
	.parse_protocol	= eth_header_parse_protocol,
};

/* Copy the MTU of @dev (the lower device at every call site in this file)
 * to the slave.
 */
static void ipvlan_adjust_mtu(struct ipvl_dev *ipvlan, struct net_device *dev)
{
	ipvlan->dev->mtu = dev->mtu;
}

static bool netif_is_ipvlan(const struct net_device *dev)
{
	/* both ipvlan and ipvtap devices use the same netdev_ops */
	return dev->netdev_ops == &ipvlan_netdev_ops;
}

/* ethtool get_link_ksettings: report the link settings of the lower device. */
static int ipvlan_ethtool_get_link_ksettings(struct net_device *dev,
					     struct ethtool_link_ksettings *cmd)
{
	const struct ipvl_dev *ipvlan = netdev_priv(dev);

	return __ethtool_get_link_ksettings(ipvlan->phy_dev, cmd);
}

/* ethtool get_drvinfo: fill in driver name and version. */
static void ipvlan_ethtool_get_drvinfo(struct net_device *dev,
				       struct ethtool_drvinfo *drvinfo)
{
	strscpy(drvinfo->driver, IPVLAN_DRV, sizeof(drvinfo->driver));
	strscpy(drvinfo->version, IPV_DRV_VER, sizeof(drvinfo->version));
}

/* ethtool get_msglevel: return the stored message level. */
static u32 ipvlan_ethtool_get_msglevel(struct net_device *dev)
{
	const struct ipvl_dev *ipvlan = netdev_priv(dev);

	return ipvlan->msg_enable;
}

/* ethtool set_msglevel: store the message level. */
static void ipvlan_ethtool_set_msglevel(struct net_device *dev, u32 value)
{
	struct ipvl_dev *ipvlan = netdev_priv(dev);

	ipvlan->msg_enable = value;
}

static const struct ethtool_ops ipvlan_ethtool_ops = {
	.get_link	= ethtool_op_get_link,
	.get_link_ksettings	= ipvlan_ethtool_get_link_ksettings,
	.get_drvinfo	= ipvlan_ethtool_get_drvinfo,
	.get_msglevel	= ipvlan_ethtool_get_msglevel,
	.set_msglevel	= ipvlan_ethtool_set_msglevel,
};

/* rtnl changelink: apply IFLA_IPVLAN_MODE and/or IFLA_IPVLAN_FLAGS to the
 * port. Requires CAP_NET_ADMIN in the user namespace of the lower device's
 * netns. The flags are only applied when the mode change (if any) succeeded.
 *
 * Returns 0 (also when @data is NULL), -EPERM, or the error from
 * ipvlan_set_port_mode().
 */
static int ipvlan_nl_changelink(struct net_device *dev,
				struct nlattr *tb[], struct nlattr *data[],
				struct netlink_ext_ack *extack)
{
	struct ipvl_dev *ipvlan = netdev_priv(dev);
	struct ipvl_port *port = ipvlan_port_get_rtnl(ipvlan->phy_dev);
	int err = 0;

	if (!data)
		return 0;
	if (!ns_capable(dev_net(ipvlan->phy_dev)->user_ns, CAP_NET_ADMIN))
		return -EPERM;

	if (data[IFLA_IPVLAN_MODE]) {
		u16 nmode = nla_get_u16(data[IFLA_IPVLAN_MODE]);

		err = ipvlan_set_port_mode(port, nmode, extack);
	}

	if (!err && data[IFLA_IPVLAN_FLAGS]) {
		u16 flags = nla_get_u16(data[IFLA_IPVLAN_FLAGS]);

		if (flags & IPVLAN_F_PRIVATE)
			ipvlan_mark_private(port);
		else
			ipvlan_clear_private(port);

		if (flags & IPVLAN_F_VEPA)
			ipvlan_mark_vepa(port);
		else
			ipvlan_clear_vepa(port);
	}

	return err;
}

/* rtnl get_size: room for the two u16 attributes written by
 * ipvlan_nl_fillinfo().
 */
static size_t ipvlan_nl_getsize(const struct net_device *dev)
{
	return (0
		+ nla_total_size(2) /* IFLA_IPVLAN_MODE */
		+ nla_total_size(2) /* IFLA_IPVLAN_FLAGS */
		);
}

/* rtnl validate: reject a mode >= IPVLAN_MODE_MAX, unknown flag bits, and
 * the combination of IPVLAN_F_PRIVATE with IPVLAN_F_VEPA.
 * Returns 0 if acceptable (or @data is NULL), -EINVAL otherwise.
 */
static int ipvlan_nl_validate(struct nlattr *tb[], struct nlattr *data[],
			      struct netlink_ext_ack *extack)
{
	if (!data)
		return 0;

	if (data[IFLA_IPVLAN_MODE]) {
		u16 mode = nla_get_u16(data[IFLA_IPVLAN_MODE]);

		if (mode >= IPVLAN_MODE_MAX)
			return -EINVAL;
	}
	if (data[IFLA_IPVLAN_FLAGS]) {
		u16 flags = nla_get_u16(data[IFLA_IPVLAN_FLAGS]);

		/* Only two bits are used at this moment. */
		if (flags & ~(IPVLAN_F_PRIVATE | IPVLAN_F_VEPA))
			return -EINVAL;
		/* Also both flags can't be active at the same time. */
		if ((flags & (IPVLAN_F_PRIVATE | IPVLAN_F_VEPA)) ==
		    (IPVLAN_F_PRIVATE | IPVLAN_F_VEPA))
			return -EINVAL;
	}

	return 0;
}

/* rtnl fill_info: emit the port's mode and flags as u16 attributes.
 * Returns 0, -EINVAL if the lower device has no port, or -EMSGSIZE if an
 * attribute does not fit into @skb.
 */
static int ipvlan_nl_fillinfo(struct sk_buff *skb,
			      const struct net_device *dev)
{
	struct ipvl_dev *ipvlan = netdev_priv(dev);
	struct ipvl_port *port = ipvlan_port_get_rtnl(ipvlan->phy_dev);
	int ret = -EINVAL;

	if (!port)
		goto err;

	ret = -EMSGSIZE;
	if (nla_put_u16(skb, IFLA_IPVLAN_MODE, port->mode))
		goto err;
	if (nla_put_u16(skb, IFLA_IPVLAN_FLAGS, port->flags))
		goto err;

	return 0;

err:
	return ret;
}

/* rtnl newlink: create slave @dev on top of the device named by IFLA_LINK.
 *
 * If IFLA_LINK names an ipvlan slave, that slave's lower device is used
 * instead (CAP_NET_ADMIN in its netns' user namespace is required). A device
 * that is not yet an ipvlan port must be of type ARPHRD_ETHER, must not be
 * loopback, and must not already have an rx handler.
 *
 * After register_netdevice() a dev_id is allocated from the port's IDA, the
 * upper/lower link is created, the optional flags/mode attributes are
 * applied and the slave is added to port->ipvlans. Each later failure
 * unwinds the earlier steps through the labels at the end.
 *
 * Returns 0 or a negative errno.
 */
int ipvlan_link_new(struct net *src_net, struct net_device *dev,
		    struct nlattr *tb[], struct nlattr *data[],
		    struct netlink_ext_ack *extack)
{
	struct ipvl_dev *ipvlan = netdev_priv(dev);
	struct ipvl_port *port;
	struct net_device *phy_dev;
	int err;
	u16 mode = IPVLAN_MODE_L3;

	if (!tb[IFLA_LINK])
		return -EINVAL;

	phy_dev = __dev_get_by_index(src_net, nla_get_u32(tb[IFLA_LINK]));
	if (!phy_dev)
		return -ENODEV;

	if (netif_is_ipvlan(phy_dev)) {
		struct ipvl_dev *tmp = netdev_priv(phy_dev);

		phy_dev = tmp->phy_dev;
		if (!ns_capable(dev_net(phy_dev)->user_ns, CAP_NET_ADMIN))
			return -EPERM;
	} else if (!netif_is_ipvlan_port(phy_dev)) {
		/* Exit early if the underlying link is invalid or busy */
		if (phy_dev->type != ARPHRD_ETHER ||
		    phy_dev->flags & IFF_LOOPBACK) {
			netdev_err(phy_dev,
				   "Master is either lo or non-ether device\n");
			return -EINVAL;
		}

		if (netdev_is_rx_handler_busy(phy_dev)) {
			netdev_err(phy_dev, "Device is already in use.\n");
			return -EBUSY;
		}
	}

	ipvlan->phy_dev = phy_dev;
	ipvlan->dev = dev;
	ipvlan->sfeatures = IPVLAN_FEATURES;
	if (!tb[IFLA_MTU])
		ipvlan_adjust_mtu(ipvlan, phy_dev);

	INIT_LIST_HEAD(&ipvlan->addrs);
	spin_lock_init(&ipvlan->addrs_lock);

	/* TODO Probably put random address here to be presented to the
	 * world but keep using the physical-dev address for the outgoing
	 * packets.
	 */
	eth_hw_addr_set(dev, phy_dev->dev_addr);

	dev->priv_flags |= IFF_NO_RX_HANDLER;

	err = register_netdevice(dev);
	if (err < 0)
		return err;

	/* ipvlan_init() would have created the port, if required */
	port = ipvlan_port_get_rtnl(phy_dev);
	ipvlan->port = port;

	/* If the port-id base is at the MAX value, then wrap it around and
	 * begin from 0x1 again. This may be due to a busy system where lots
	 * of slaves are getting created and deleted.
	 */
	if (port->dev_id_start == 0xFFFE)
		port->dev_id_start = 0x1;

	/* Since L2 address is shared among all IPvlan slaves including
	 * master, use unique 16 bit dev-ids to differentiate among them.
	 * Assign IDs between 0x1 and 0xFFFE (used by the master) to each
	 * slave link [see addrconf_ifid_eui48()].
	 */
	err = ida_alloc_range(&port->ida, port->dev_id_start, 0xFFFD,
			      GFP_KERNEL);
	/* Nothing free above the base: retry in the range below it. */
	if (err < 0)
		err = ida_alloc_range(&port->ida, 0x1, port->dev_id_start - 1,
				      GFP_KERNEL);
	if (err < 0)
		goto unregister_netdev;
	dev->dev_id = err;

	/* Increment id-base to the next slot for the future assignment */
	port->dev_id_start = err + 1;

	err = netdev_upper_dev_link(phy_dev, dev, extack);
	if (err)
		goto remove_ida;

	/* Flags are per port and latest update overrides. User has
	 * to be consistent in setting it just like the mode attribute.
	 */
	if (data && data[IFLA_IPVLAN_FLAGS])
		port->flags = nla_get_u16(data[IFLA_IPVLAN_FLAGS]);

	if (data && data[IFLA_IPVLAN_MODE])
		mode = nla_get_u16(data[IFLA_IPVLAN_MODE]);

	err = ipvlan_set_port_mode(port, mode, extack);
	if (err)
		goto unlink_netdev;

	list_add_tail_rcu(&ipvlan->pnode, &port->ipvlans);
	netif_stacked_transfer_operstate(phy_dev, dev);
	return 0;

unlink_netdev:
	netdev_upper_dev_unlink(phy_dev, dev);
remove_ida:
	ida_free(&port->ida, dev->dev_id);
unregister_netdev:
	unregister_netdevice(dev);
	return err;
}
EXPORT_SYMBOL_GPL(ipvlan_link_new);

/* rtnl dellink: drop all addresses of the slave (under addrs_lock), release
 * its dev_id, unlink it from the port list, queue the netdevice for
 * unregistration on @head and remove the upper/lower link.
 */
void ipvlan_link_delete(struct net_device *dev, struct list_head *head)
{
	struct ipvl_dev *ipvlan = netdev_priv(dev);
	struct ipvl_addr *addr, *next;

	spin_lock_bh(&ipvlan->addrs_lock);
	list_for_each_entry_safe(addr, next, &ipvlan->addrs, anode) {
		ipvlan_ht_addr_del(addr);
		list_del_rcu(&addr->anode);
		kfree_rcu(addr, rcu);
	}
	spin_unlock_bh(&ipvlan->addrs_lock);

	ida_free(&ipvlan->port->ida, dev->dev_id);
	list_del_rcu(&ipvlan->pnode);
	unregister_netdevice_queue(dev, head);
	netdev_upper_dev_unlink(ipvlan->phy_dev, dev);
}
EXPORT_SYMBOL_GPL(ipvlan_link_delete);

/* rtnl setup: Ethernet defaults plus the ipvlan-specific priv_flags and the
 * netdev, header and ethtool operation tables defined in this file.
 */
void ipvlan_link_setup(struct net_device *dev)
{
	ether_setup(dev);

	dev->max_mtu = ETH_MAX_MTU;
	dev->priv_flags &= ~(IFF_XMIT_DST_RELEASE | IFF_TX_SKB_SHARING);
	dev->priv_flags |= IFF_UNICAST_FLT | IFF_NO_QUEUE;
	dev->netdev_ops = &ipvlan_netdev_ops;
	dev->needs_free_netdev = true;
	dev->header_ops = &ipvlan_header_ops;
	dev->ethtool_ops = &ipvlan_ethtool_ops;
}
EXPORT_SYMBOL_GPL(ipvlan_link_setup); static const struct nla_policy ipvlan_nl_policy[IFLA_IPVLAN_MAX + 1] = { [IFLA_IPVLAN_MODE] = { .type = NLA_U16 }, [IFLA_IPVLAN_FLAGS] = { .type = NLA_U16 }, }; static struct net *ipvlan_get_link_net(const struct net_device *dev) { struct ipvl_dev *ipvlan = netdev_priv(dev); return dev_net(ipvlan->phy_dev); } static struct rtnl_link_ops ipvlan_link_ops = { .kind = "ipvlan", .priv_size = sizeof(struct ipvl_dev), .setup = ipvlan_link_setup, .newlink = ipvlan_link_new, .dellink = ipvlan_link_delete, .get_link_net = ipvlan_get_link_net, }; int ipvlan_link_register(struct rtnl_link_ops *ops) { ops->get_size = ipvlan_nl_getsize; ops->policy = ipvlan_nl_policy; ops->validate = ipvlan_nl_validate; ops->fill_info = ipvlan_nl_fillinfo; ops->changelink = ipvlan_nl_changelink; ops->maxtype = IFLA_IPVLAN_MAX; return rtnl_link_register(ops); } EXPORT_SYMBOL_GPL(ipvlan_link_register); static int ipvlan_device_event(struct notifier_block *unused, unsigned long event, void *ptr) { struct netlink_ext_ack *extack = netdev_notifier_info_to_extack(ptr); struct netdev_notifier_pre_changeaddr_info *prechaddr_info; struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct ipvl_dev *ipvlan, *next; struct ipvl_port *port; LIST_HEAD(lst_kill); int err; if (!netif_is_ipvlan_port(dev)) return NOTIFY_DONE; port = ipvlan_port_get_rtnl(dev); switch (event) { case NETDEV_UP: case NETDEV_DOWN: case NETDEV_CHANGE: list_for_each_entry(ipvlan, &port->ipvlans, pnode) netif_stacked_transfer_operstate(ipvlan->phy_dev, ipvlan->dev); break; case NETDEV_REGISTER: { struct net *oldnet, *newnet = dev_net(dev); oldnet = read_pnet(&port->pnet); if (net_eq(newnet, oldnet)) break; write_pnet(&port->pnet, newnet); if (port->mode == IPVLAN_MODE_L3S) ipvlan_migrate_l3s_hook(oldnet, newnet); break; } case NETDEV_UNREGISTER: if (dev->reg_state != NETREG_UNREGISTERING) break; list_for_each_entry_safe(ipvlan, next, &port->ipvlans, pnode) 
ipvlan->dev->rtnl_link_ops->dellink(ipvlan->dev, &lst_kill); unregister_netdevice_many(&lst_kill); break; case NETDEV_FEAT_CHANGE: list_for_each_entry(ipvlan, &port->ipvlans, pnode) { netif_inherit_tso_max(ipvlan->dev, dev); netdev_update_features(ipvlan->dev); } break; case NETDEV_CHANGEMTU: list_for_each_entry(ipvlan, &port->ipvlans, pnode) ipvlan_adjust_mtu(ipvlan, dev); break; case NETDEV_PRE_CHANGEADDR: prechaddr_info = ptr; list_for_each_entry(ipvlan, &port->ipvlans, pnode) { err = dev_pre_changeaddr_notify(ipvlan->dev, prechaddr_info->dev_addr, extack); if (err) return notifier_from_errno(err); } break; case NETDEV_CHANGEADDR: list_for_each_entry(ipvlan, &port->ipvlans, pnode) { eth_hw_addr_set(ipvlan->dev, dev->dev_addr); call_netdevice_notifiers(NETDEV_CHANGEADDR, ipvlan->dev); } break; case NETDEV_PRE_TYPE_CHANGE: /* Forbid underlying device to change its type. */ return NOTIFY_BAD; case NETDEV_NOTIFY_PEERS: case NETDEV_BONDING_FAILOVER: case NETDEV_RESEND_IGMP: list_for_each_entry(ipvlan, &port->ipvlans, pnode) call_netdevice_notifiers(event, ipvlan->dev); } return NOTIFY_DONE; } /* the caller must held the addrs lock */ static int ipvlan_add_addr(struct ipvl_dev *ipvlan, void *iaddr, bool is_v6) { struct ipvl_addr *addr; addr = kzalloc(sizeof(struct ipvl_addr), GFP_ATOMIC); if (!addr) return -ENOMEM; addr->master = ipvlan; if (!is_v6) { memcpy(&addr->ip4addr, iaddr, sizeof(struct in_addr)); addr->atype = IPVL_IPV4; #if IS_ENABLED(CONFIG_IPV6) } else { memcpy(&addr->ip6addr, iaddr, sizeof(struct in6_addr)); addr->atype = IPVL_IPV6; #endif } list_add_tail_rcu(&addr->anode, &ipvlan->addrs); /* If the interface is not up, the address will be added to the hash * list by ipvlan_open. 
*/ if (netif_running(ipvlan->dev)) ipvlan_ht_addr_add(ipvlan, addr); return 0; } static void ipvlan_del_addr(struct ipvl_dev *ipvlan, void *iaddr, bool is_v6) { struct ipvl_addr *addr; spin_lock_bh(&ipvlan->addrs_lock); addr = ipvlan_find_addr(ipvlan, iaddr, is_v6); if (!addr) { spin_unlock_bh(&ipvlan->addrs_lock); return; } ipvlan_ht_addr_del(addr); list_del_rcu(&addr->anode); spin_unlock_bh(&ipvlan->addrs_lock); kfree_rcu(addr, rcu); } static bool ipvlan_is_valid_dev(const struct net_device *dev) { struct ipvl_dev *ipvlan = netdev_priv(dev); if (!netif_is_ipvlan(dev)) return false; if (!ipvlan || !ipvlan->port) return false; return true; } #if IS_ENABLED(CONFIG_IPV6) static int ipvlan_add_addr6(struct ipvl_dev *ipvlan, struct in6_addr *ip6_addr) { int ret = -EINVAL; spin_lock_bh(&ipvlan->addrs_lock); if (ipvlan_addr_busy(ipvlan->port, ip6_addr, true)) netif_err(ipvlan, ifup, ipvlan->dev, "Failed to add IPv6=%pI6c addr for %s intf\n", ip6_addr, ipvlan->dev->name); else ret = ipvlan_add_addr(ipvlan, ip6_addr, true); spin_unlock_bh(&ipvlan->addrs_lock); return ret; } static void ipvlan_del_addr6(struct ipvl_dev *ipvlan, struct in6_addr *ip6_addr) { return ipvlan_del_addr(ipvlan, ip6_addr, true); } static int ipvlan_addr6_event(struct notifier_block *unused, unsigned long event, void *ptr) { struct inet6_ifaddr *if6 = (struct inet6_ifaddr *)ptr; struct net_device *dev = (struct net_device *)if6->idev->dev; struct ipvl_dev *ipvlan = netdev_priv(dev); if (!ipvlan_is_valid_dev(dev)) return NOTIFY_DONE; switch (event) { case NETDEV_UP: if (ipvlan_add_addr6(ipvlan, &if6->addr)) return NOTIFY_BAD; break; case NETDEV_DOWN: ipvlan_del_addr6(ipvlan, &if6->addr); break; } return NOTIFY_OK; } static int ipvlan_addr6_validator_event(struct notifier_block *unused, unsigned long event, void *ptr) { struct in6_validator_info *i6vi = (struct in6_validator_info *)ptr; struct net_device *dev = (struct net_device *)i6vi->i6vi_dev->dev; struct ipvl_dev *ipvlan = netdev_priv(dev); if 
(!ipvlan_is_valid_dev(dev)) return NOTIFY_DONE; switch (event) { case NETDEV_UP: if (ipvlan_addr_busy(ipvlan->port, &i6vi->i6vi_addr, true)) { NL_SET_ERR_MSG(i6vi->extack, "Address already assigned to an ipvlan device"); return notifier_from_errno(-EADDRINUSE); } break; } return NOTIFY_OK; } #endif static int ipvlan_add_addr4(struct ipvl_dev *ipvlan, struct in_addr *ip4_addr) { int ret = -EINVAL; spin_lock_bh(&ipvlan->addrs_lock); if (ipvlan_addr_busy(ipvlan->port, ip4_addr, false)) netif_err(ipvlan, ifup, ipvlan->dev, "Failed to add IPv4=%pI4 on %s intf.\n", ip4_addr, ipvlan->dev->name); else ret = ipvlan_add_addr(ipvlan, ip4_addr, false); spin_unlock_bh(&ipvlan->addrs_lock); return ret; } static void ipvlan_del_addr4(struct ipvl_dev *ipvlan, struct in_addr *ip4_addr) { return ipvlan_del_addr(ipvlan, ip4_addr, false); } static int ipvlan_addr4_event(struct notifier_block *unused, unsigned long event, void *ptr) { struct in_ifaddr *if4 = (struct in_ifaddr *)ptr; struct net_device *dev = (struct net_device *)if4->ifa_dev->dev; struct ipvl_dev *ipvlan = netdev_priv(dev); struct in_addr ip4_addr; if (!ipvlan_is_valid_dev(dev)) return NOTIFY_DONE; switch (event) { case NETDEV_UP: ip4_addr.s_addr = if4->ifa_address; if (ipvlan_add_addr4(ipvlan, &ip4_addr)) return NOTIFY_BAD; break; case NETDEV_DOWN: ip4_addr.s_addr = if4->ifa_address; ipvlan_del_addr4(ipvlan, &ip4_addr); break; } return NOTIFY_OK; } static int ipvlan_addr4_validator_event(struct notifier_block *unused, unsigned long event, void *ptr) { struct in_validator_info *ivi = (struct in_validator_info *)ptr; struct net_device *dev = (struct net_device *)ivi->ivi_dev->dev; struct ipvl_dev *ipvlan = netdev_priv(dev); if (!ipvlan_is_valid_dev(dev)) return NOTIFY_DONE; switch (event) { case NETDEV_UP: if (ipvlan_addr_busy(ipvlan->port, &ivi->ivi_addr, false)) { NL_SET_ERR_MSG(ivi->extack, "Address already assigned to an ipvlan device"); return notifier_from_errno(-EADDRINUSE); } break; } return NOTIFY_OK; } static 
struct notifier_block ipvlan_addr4_notifier_block __read_mostly = { .notifier_call = ipvlan_addr4_event, }; static struct notifier_block ipvlan_addr4_vtor_notifier_block __read_mostly = { .notifier_call = ipvlan_addr4_validator_event, }; static struct notifier_block ipvlan_notifier_block __read_mostly = { .notifier_call = ipvlan_device_event, }; #if IS_ENABLED(CONFIG_IPV6) static struct notifier_block ipvlan_addr6_notifier_block __read_mostly = { .notifier_call = ipvlan_addr6_event, }; static struct notifier_block ipvlan_addr6_vtor_notifier_block __read_mostly = { .notifier_call = ipvlan_addr6_validator_event, }; #endif static int __init ipvlan_init_module(void) { int err; ipvlan_init_secret(); register_netdevice_notifier(&ipvlan_notifier_block); #if IS_ENABLED(CONFIG_IPV6) register_inet6addr_notifier(&ipvlan_addr6_notifier_block); register_inet6addr_validator_notifier( &ipvlan_addr6_vtor_notifier_block); #endif register_inetaddr_notifier(&ipvlan_addr4_notifier_block); register_inetaddr_validator_notifier(&ipvlan_addr4_vtor_notifier_block); err = ipvlan_l3s_init(); if (err < 0) goto error; err = ipvlan_link_register(&ipvlan_link_ops); if (err < 0) { ipvlan_l3s_cleanup(); goto error; } return 0; error: unregister_inetaddr_notifier(&ipvlan_addr4_notifier_block); unregister_inetaddr_validator_notifier( &ipvlan_addr4_vtor_notifier_block); #if IS_ENABLED(CONFIG_IPV6) unregister_inet6addr_notifier(&ipvlan_addr6_notifier_block); unregister_inet6addr_validator_notifier( &ipvlan_addr6_vtor_notifier_block); #endif unregister_netdevice_notifier(&ipvlan_notifier_block); return err; } static void __exit ipvlan_cleanup_module(void) { rtnl_link_unregister(&ipvlan_link_ops); ipvlan_l3s_cleanup(); unregister_netdevice_notifier(&ipvlan_notifier_block); unregister_inetaddr_notifier(&ipvlan_addr4_notifier_block); unregister_inetaddr_validator_notifier( &ipvlan_addr4_vtor_notifier_block); #if IS_ENABLED(CONFIG_IPV6) unregister_inet6addr_notifier(&ipvlan_addr6_notifier_block); 
unregister_inet6addr_validator_notifier( &ipvlan_addr6_vtor_notifier_block); #endif } module_init(ipvlan_init_module); module_exit(ipvlan_cleanup_module); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Mahesh Bandewar <maheshb@google.com>"); MODULE_DESCRIPTION("Driver for L3 (IPv6/IPv4) based VLANs"); MODULE_ALIAS_RTNL_LINK("ipvlan");
11 11 2 1 1 1 1 2 1 1 2 9 9 1 9 9 1 8 12 12 12 12 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 
515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 
1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 
1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 
1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 
2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 // SPDX-License-Identifier: GPL-2.0-only /* * This file is part of UBIFS. * * Copyright (C) 2006-2008 Nokia Corporation. * * Authors: Artem Bityutskiy (Битюцкий Артём) * Adrian Hunter */ /* * This file implements UBIFS initialization and VFS superblock operations. Some * initialization stuff which is rather large and complex is placed at * corresponding subsystems, but most of it is here. 
*/ #include <linux/init.h> #include <linux/slab.h> #include <linux/module.h> #include <linux/ctype.h> #include <linux/kthread.h> #include <linux/fs_context.h> #include <linux/fs_parser.h> #include <linux/seq_file.h> #include <linux/math64.h> #include <linux/writeback.h> #include "ubifs.h" static int ubifs_default_version_set(const char *val, const struct kernel_param *kp) { int n = 0, ret; ret = kstrtoint(val, 10, &n); if (ret != 0 || n < 4 || n > UBIFS_FORMAT_VERSION) return -EINVAL; return param_set_int(val, kp); } static const struct kernel_param_ops ubifs_default_version_ops = { .set = ubifs_default_version_set, .get = param_get_int, }; int ubifs_default_version = UBIFS_FORMAT_VERSION; module_param_cb(default_version, &ubifs_default_version_ops, &ubifs_default_version, 0600); /* * Maximum amount of memory we may 'kmalloc()' without worrying that we are * allocating too much. */ #define UBIFS_KMALLOC_OK (128*1024) /* Slab cache for UBIFS inodes */ static struct kmem_cache *ubifs_inode_slab; /* UBIFS TNC shrinker description */ static struct shrinker *ubifs_shrinker_info; /** * validate_inode - validate inode. * @c: UBIFS file-system description object * @inode: the inode to validate * * This is a helper function for 'ubifs_iget()' which validates various fields * of a newly built inode to make sure they contain sane values and prevent * possible vulnerabilities. Returns zero if the inode is all right and * a non-zero error code if not. 
*/ static int validate_inode(struct ubifs_info *c, const struct inode *inode) { int err; const struct ubifs_inode *ui = ubifs_inode(inode); if (inode->i_size > c->max_inode_sz) { ubifs_err(c, "inode is too large (%lld)", (long long)inode->i_size); return 1; } if (ui->compr_type >= UBIFS_COMPR_TYPES_CNT) { ubifs_err(c, "unknown compression type %d", ui->compr_type); return 2; } if (ui->xattr_names + ui->xattr_cnt > XATTR_LIST_MAX) return 3; if (ui->data_len < 0 || ui->data_len > UBIFS_MAX_INO_DATA) return 4; if (ui->xattr && !S_ISREG(inode->i_mode)) return 5; if (!ubifs_compr_present(c, ui->compr_type)) { ubifs_warn(c, "inode %lu uses '%s' compression, but it was not compiled in", inode->i_ino, ubifs_compr_name(c, ui->compr_type)); } err = dbg_check_dir(c, inode); return err; } struct inode *ubifs_iget(struct super_block *sb, unsigned long inum) { int err; union ubifs_key key; struct ubifs_ino_node *ino; struct ubifs_info *c = sb->s_fs_info; struct inode *inode; struct ubifs_inode *ui; dbg_gen("inode %lu", inum); inode = iget_locked(sb, inum); if (!inode) return ERR_PTR(-ENOMEM); if (!(inode->i_state & I_NEW)) return inode; ui = ubifs_inode(inode); ino = kmalloc(UBIFS_MAX_INO_NODE_SZ, GFP_NOFS); if (!ino) { err = -ENOMEM; goto out; } ino_key_init(c, &key, inode->i_ino); err = ubifs_tnc_lookup(c, &key, ino); if (err) goto out_ino; inode->i_flags |= S_NOCMTIME; if (!IS_ENABLED(CONFIG_UBIFS_ATIME_SUPPORT)) inode->i_flags |= S_NOATIME; set_nlink(inode, le32_to_cpu(ino->nlink)); i_uid_write(inode, le32_to_cpu(ino->uid)); i_gid_write(inode, le32_to_cpu(ino->gid)); inode_set_atime(inode, (int64_t)le64_to_cpu(ino->atime_sec), le32_to_cpu(ino->atime_nsec)); inode_set_mtime(inode, (int64_t)le64_to_cpu(ino->mtime_sec), le32_to_cpu(ino->mtime_nsec)); inode_set_ctime(inode, (int64_t)le64_to_cpu(ino->ctime_sec), le32_to_cpu(ino->ctime_nsec)); inode->i_mode = le32_to_cpu(ino->mode); inode->i_size = le64_to_cpu(ino->size); ui->data_len = le32_to_cpu(ino->data_len); ui->flags = 
le32_to_cpu(ino->flags); ui->compr_type = le16_to_cpu(ino->compr_type); ui->creat_sqnum = le64_to_cpu(ino->creat_sqnum); ui->xattr_cnt = le32_to_cpu(ino->xattr_cnt); ui->xattr_size = le32_to_cpu(ino->xattr_size); ui->xattr_names = le32_to_cpu(ino->xattr_names); ui->synced_i_size = ui->ui_size = inode->i_size; ui->xattr = (ui->flags & UBIFS_XATTR_FL) ? 1 : 0; err = validate_inode(c, inode); if (err) goto out_invalid; switch (inode->i_mode & S_IFMT) { case S_IFREG: inode->i_mapping->a_ops = &ubifs_file_address_operations; inode->i_op = &ubifs_file_inode_operations; inode->i_fop = &ubifs_file_operations; if (ui->xattr) { ui->data = kmalloc(ui->data_len + 1, GFP_NOFS); if (!ui->data) { err = -ENOMEM; goto out_ino; } memcpy(ui->data, ino->data, ui->data_len); ((char *)ui->data)[ui->data_len] = '\0'; } else if (ui->data_len != 0) { err = 10; goto out_invalid; } break; case S_IFDIR: inode->i_op = &ubifs_dir_inode_operations; inode->i_fop = &ubifs_dir_operations; if (ui->data_len != 0) { err = 11; goto out_invalid; } break; case S_IFLNK: inode->i_op = &ubifs_symlink_inode_operations; if (ui->data_len <= 0 || ui->data_len > UBIFS_MAX_INO_DATA) { err = 12; goto out_invalid; } ui->data = kmalloc(ui->data_len + 1, GFP_NOFS); if (!ui->data) { err = -ENOMEM; goto out_ino; } memcpy(ui->data, ino->data, ui->data_len); ((char *)ui->data)[ui->data_len] = '\0'; break; case S_IFBLK: case S_IFCHR: { dev_t rdev; union ubifs_dev_desc *dev; ui->data = kmalloc(sizeof(union ubifs_dev_desc), GFP_NOFS); if (!ui->data) { err = -ENOMEM; goto out_ino; } dev = (union ubifs_dev_desc *)ino->data; if (ui->data_len == sizeof(dev->new)) rdev = new_decode_dev(le32_to_cpu(dev->new)); else if (ui->data_len == sizeof(dev->huge)) rdev = huge_decode_dev(le64_to_cpu(dev->huge)); else { err = 13; goto out_invalid; } memcpy(ui->data, ino->data, ui->data_len); inode->i_op = &ubifs_file_inode_operations; init_special_inode(inode, inode->i_mode, rdev); break; } case S_IFSOCK: case S_IFIFO: inode->i_op = 
&ubifs_file_inode_operations; init_special_inode(inode, inode->i_mode, 0); if (ui->data_len != 0) { err = 14; goto out_invalid; } break; default: err = 15; goto out_invalid; } kfree(ino); ubifs_set_inode_flags(inode); unlock_new_inode(inode); return inode; out_invalid: ubifs_err(c, "inode %lu validation failed, error %d", inode->i_ino, err); ubifs_dump_node(c, ino, UBIFS_MAX_INO_NODE_SZ); ubifs_dump_inode(c, inode); err = -EINVAL; out_ino: kfree(ino); out: ubifs_err(c, "failed to read inode %lu, error %d", inode->i_ino, err); iget_failed(inode); return ERR_PTR(err); } static struct inode *ubifs_alloc_inode(struct super_block *sb) { struct ubifs_inode *ui; ui = alloc_inode_sb(sb, ubifs_inode_slab, GFP_NOFS); if (!ui) return NULL; memset((void *)ui + sizeof(struct inode), 0, sizeof(struct ubifs_inode) - sizeof(struct inode)); mutex_init(&ui->ui_mutex); init_rwsem(&ui->xattr_sem); spin_lock_init(&ui->ui_lock); return &ui->vfs_inode; }; static void ubifs_free_inode(struct inode *inode) { struct ubifs_inode *ui = ubifs_inode(inode); kfree(ui->data); fscrypt_free_inode(inode); kmem_cache_free(ubifs_inode_slab, ui); } /* * Note, Linux write-back code calls this without 'i_mutex'. */ static int ubifs_write_inode(struct inode *inode, struct writeback_control *wbc) { int err = 0; struct ubifs_info *c = inode->i_sb->s_fs_info; struct ubifs_inode *ui = ubifs_inode(inode); ubifs_assert(c, !ui->xattr); if (is_bad_inode(inode)) return 0; mutex_lock(&ui->ui_mutex); /* * Due to races between write-back forced by budgeting * (see 'sync_some_inodes()') and background write-back, the inode may * have already been synchronized, do not do this again. This might * also happen if it was synchronized in an VFS operation, e.g. * 'ubifs_link()'. */ if (!ui->dirty) { mutex_unlock(&ui->ui_mutex); return 0; } /* * As an optimization, do not write orphan inodes to the media just * because this is not needed. 
*/ dbg_gen("inode %lu, mode %#x, nlink %u", inode->i_ino, (int)inode->i_mode, inode->i_nlink); if (inode->i_nlink) { err = ubifs_jnl_write_inode(c, inode); if (err) ubifs_err(c, "can't write inode %lu, error %d", inode->i_ino, err); else err = dbg_check_inode_size(c, inode, ui->ui_size); } ui->dirty = 0; mutex_unlock(&ui->ui_mutex); ubifs_release_dirty_inode_budget(c, ui); return err; } static int ubifs_drop_inode(struct inode *inode) { int drop = generic_drop_inode(inode); if (!drop) drop = fscrypt_drop_inode(inode); return drop; } static void ubifs_evict_inode(struct inode *inode) { int err; struct ubifs_info *c = inode->i_sb->s_fs_info; struct ubifs_inode *ui = ubifs_inode(inode); if (ui->xattr) /* * Extended attribute inode deletions are fully handled in * 'ubifs_removexattr()'. These inodes are special and have * limited usage, so there is nothing to do here. */ goto out; dbg_gen("inode %lu, mode %#x", inode->i_ino, (int)inode->i_mode); ubifs_assert(c, !atomic_read(&inode->i_count)); truncate_inode_pages_final(&inode->i_data); if (inode->i_nlink) goto done; if (is_bad_inode(inode)) goto out; ui->ui_size = inode->i_size = 0; err = ubifs_jnl_delete_inode(c, inode); if (err) /* * Worst case we have a lost orphan inode wasting space, so a * simple error message is OK here. 
*/ ubifs_err(c, "can't delete inode %lu, error %d", inode->i_ino, err); out: if (ui->dirty) ubifs_release_dirty_inode_budget(c, ui); else { /* We've deleted something - clean the "no space" flags */ c->bi.nospace = c->bi.nospace_rp = 0; smp_wmb(); } done: clear_inode(inode); fscrypt_put_encryption_info(inode); } static void ubifs_dirty_inode(struct inode *inode, int flags) { struct ubifs_info *c = inode->i_sb->s_fs_info; struct ubifs_inode *ui = ubifs_inode(inode); ubifs_assert(c, mutex_is_locked(&ui->ui_mutex)); if (!ui->dirty) { ui->dirty = 1; dbg_gen("inode %lu", inode->i_ino); } } static int ubifs_statfs(struct dentry *dentry, struct kstatfs *buf) { struct ubifs_info *c = dentry->d_sb->s_fs_info; unsigned long long free; __le32 *uuid = (__le32 *)c->uuid; free = ubifs_get_free_space(c); dbg_gen("free space %lld bytes (%lld blocks)", free, free >> UBIFS_BLOCK_SHIFT); buf->f_type = UBIFS_SUPER_MAGIC; buf->f_bsize = UBIFS_BLOCK_SIZE; buf->f_blocks = c->block_cnt; buf->f_bfree = free >> UBIFS_BLOCK_SHIFT; if (free > c->report_rp_size) buf->f_bavail = (free - c->report_rp_size) >> UBIFS_BLOCK_SHIFT; else buf->f_bavail = 0; buf->f_files = 0; buf->f_ffree = 0; buf->f_namelen = UBIFS_MAX_NLEN; buf->f_fsid.val[0] = le32_to_cpu(uuid[0]) ^ le32_to_cpu(uuid[2]); buf->f_fsid.val[1] = le32_to_cpu(uuid[1]) ^ le32_to_cpu(uuid[3]); ubifs_assert(c, buf->f_bfree <= c->block_cnt); return 0; } static int ubifs_show_options(struct seq_file *s, struct dentry *root) { struct ubifs_info *c = root->d_sb->s_fs_info; if (c->mount_opts.unmount_mode == 2) seq_puts(s, ",fast_unmount"); else if (c->mount_opts.unmount_mode == 1) seq_puts(s, ",norm_unmount"); if (c->mount_opts.bulk_read == 2) seq_puts(s, ",bulk_read"); else if (c->mount_opts.bulk_read == 1) seq_puts(s, ",no_bulk_read"); if (c->mount_opts.chk_data_crc == 2) seq_puts(s, ",chk_data_crc"); else if (c->mount_opts.chk_data_crc == 1) seq_puts(s, ",no_chk_data_crc"); if (c->mount_opts.override_compr) { seq_printf(s, ",compr=%s", 
ubifs_compr_name(c, c->mount_opts.compr_type)); } seq_printf(s, ",assert=%s", ubifs_assert_action_name(c)); seq_printf(s, ",ubi=%d,vol=%d", c->vi.ubi_num, c->vi.vol_id); return 0; } static int ubifs_sync_fs(struct super_block *sb, int wait) { int i, err; struct ubifs_info *c = sb->s_fs_info; /* * Zero @wait is just an advisory thing to help the file system shove * lots of data into the queues, and there will be the second * '->sync_fs()' call, with non-zero @wait. */ if (!wait) return 0; /* * Synchronize write buffers, because 'ubifs_run_commit()' does not * do this if it waits for an already running commit. */ for (i = 0; i < c->jhead_cnt; i++) { err = ubifs_wbuf_sync(&c->jheads[i].wbuf); if (err) return err; } /* * Strictly speaking, it is not necessary to commit the journal here, * synchronizing write-buffers would be enough. But committing makes * UBIFS free space predictions much more accurate, so we want to let * the user be able to get more accurate results of 'statfs()' after * they synchronize the file system. */ err = ubifs_run_commit(c); if (err) return err; return ubi_sync(c->vi.ubi_num); } /** * init_constants_early - initialize UBIFS constants. * @c: UBIFS file-system description object * * This function initialize UBIFS constants which do not need the superblock to * be read. It also checks that the UBI volume satisfies basic UBIFS * requirements. Returns zero in case of success and a negative error code in * case of failure. 
*/
static int init_constants_early(struct ubifs_info *c)
{
	/* Any of the three conditions below marks the media as read-only */
	if (c->vi.corrupted) {
		ubifs_warn(c, "UBI volume is corrupted - read-only mode");
		c->ro_media = 1;
	}

	if (c->di.ro_mode) {
		ubifs_msg(c, "read-only UBI device");
		c->ro_media = 1;
	}

	if (c->vi.vol_type == UBI_STATIC_VOLUME) {
		ubifs_msg(c, "static UBI volume - read-only mode");
		c->ro_media = 1;
	}

	/* Copy the geometry from the UBI volume and device descriptions */
	c->leb_cnt = c->vi.size;
	c->leb_size = c->vi.usable_leb_size;
	c->leb_start = c->di.leb_start;
	c->half_leb_size = c->leb_size / 2;
	c->min_io_size = c->di.min_io_size;
	c->min_io_shift = fls(c->min_io_size) - 1;
	c->max_write_size = c->di.max_write_size;
	c->max_write_shift = fls(c->max_write_size) - 1;

	if (c->leb_size < UBIFS_MIN_LEB_SZ) {
		ubifs_errc(c, "too small LEBs (%d bytes), min. is %d bytes",
			   c->leb_size, UBIFS_MIN_LEB_SZ);
		return -EINVAL;
	}

	if (c->leb_cnt < UBIFS_MIN_LEB_CNT) {
		ubifs_errc(c, "too few LEBs (%d), min. is %d",
			   c->leb_cnt, UBIFS_MIN_LEB_CNT);
		return -EINVAL;
	}

	if (!is_power_of_2(c->min_io_size)) {
		ubifs_errc(c, "bad min. I/O size %d", c->min_io_size);
		return -EINVAL;
	}

	/*
	 * Maximum write size has to be greater than or equal to the min. I/O
	 * size, be a multiple of the min. I/O size, and be a power of 2.
	 */
	if (c->max_write_size < c->min_io_size ||
	    c->max_write_size % c->min_io_size ||
	    !is_power_of_2(c->max_write_size)) {
		ubifs_errc(c, "bad write buffer size %d for %d min. I/O unit",
			   c->max_write_size, c->min_io_size);
		return -EINVAL;
	}

	/*
	 * UBIFS aligns all nodes to an 8-byte boundary, so to make functions
	 * in io.c simpler, assume minimum I/O unit size to be 8 bytes if it
	 * is less than 8.
	 */
	if (c->min_io_size < 8) {
		c->min_io_size = 8;
		c->min_io_shift = 3;
		if (c->max_write_size < c->min_io_size) {
			c->max_write_size = c->min_io_size;
			c->max_write_shift = c->min_io_shift;
		}
	}

	/* These use the (possibly just raised) minimum I/O unit size */
	c->ref_node_alsz = ALIGN(UBIFS_REF_NODE_SZ, c->min_io_size);
	c->mst_node_alsz = ALIGN(UBIFS_MST_NODE_SZ, c->min_io_size);

	/*
	 * Initialize node length ranges which are mostly needed for node
	 * length validation.
	 */
	c->ranges[UBIFS_PAD_NODE].len = UBIFS_PAD_NODE_SZ;
	c->ranges[UBIFS_SB_NODE].len = UBIFS_SB_NODE_SZ;
	c->ranges[UBIFS_MST_NODE].len = UBIFS_MST_NODE_SZ;
	c->ranges[UBIFS_REF_NODE].len = UBIFS_REF_NODE_SZ;
	c->ranges[UBIFS_TRUN_NODE].len = UBIFS_TRUN_NODE_SZ;
	c->ranges[UBIFS_CS_NODE].len = UBIFS_CS_NODE_SZ;
	c->ranges[UBIFS_AUTH_NODE].min_len = UBIFS_AUTH_NODE_SZ;
	c->ranges[UBIFS_AUTH_NODE].max_len = UBIFS_AUTH_NODE_SZ +
				UBIFS_MAX_HMAC_LEN;
	c->ranges[UBIFS_SIG_NODE].min_len = UBIFS_SIG_NODE_SZ;
	c->ranges[UBIFS_SIG_NODE].max_len = c->leb_size - UBIFS_SB_NODE_SZ;

	c->ranges[UBIFS_INO_NODE].min_len = UBIFS_INO_NODE_SZ;
	c->ranges[UBIFS_INO_NODE].max_len = UBIFS_MAX_INO_NODE_SZ;
	c->ranges[UBIFS_ORPH_NODE].min_len =
				UBIFS_ORPH_NODE_SZ + sizeof(__le64);
	c->ranges[UBIFS_ORPH_NODE].max_len = c->leb_size;
	c->ranges[UBIFS_DENT_NODE].min_len = UBIFS_DENT_NODE_SZ;
	c->ranges[UBIFS_DENT_NODE].max_len = UBIFS_MAX_DENT_NODE_SZ;
	c->ranges[UBIFS_XENT_NODE].min_len = UBIFS_XENT_NODE_SZ;
	c->ranges[UBIFS_XENT_NODE].max_len = UBIFS_MAX_XENT_NODE_SZ;
	c->ranges[UBIFS_DATA_NODE].min_len = UBIFS_DATA_NODE_SZ;
	c->ranges[UBIFS_DATA_NODE].max_len = UBIFS_MAX_DATA_NODE_SZ;
	/*
	 * Minimum indexing node size is amended later when superblock is
	 * read and the key length is known.
	 */
	c->ranges[UBIFS_IDX_NODE].min_len = UBIFS_IDX_NODE_SZ + UBIFS_BRANCH_SZ;
	/*
	 * Maximum indexing node size is amended later when superblock is
	 * read and the fanout is known.
	 */
	c->ranges[UBIFS_IDX_NODE].max_len = INT_MAX;

	/*
	 * Initialize dead and dark LEB space watermarks. See gc.c for comments
	 * about these values.
	 */
	c->dead_wm = ALIGN(MIN_WRITE_SZ, c->min_io_size);
	c->dark_wm = ALIGN(UBIFS_MAX_NODE_SZ, c->min_io_size);

	/*
	 * Calculate how many bytes would be wasted at the end of LEB if it was
	 * fully filled with data nodes of maximum size. This is used in
	 * calculations when reporting free space.
	 */
	c->leb_overhead = c->leb_size % UBIFS_MAX_DATA_NODE_SZ;

	/* Buffer size for bulk-reads, capped at one LEB */
	c->max_bu_buf_len = UBIFS_MAX_BULK_READ * UBIFS_MAX_DATA_NODE_SZ;
	if (c->max_bu_buf_len > c->leb_size)
		c->max_bu_buf_len = c->leb_size;

	/* Log is ready, preserve one LEB for commits. */
	c->min_log_bytes = c->leb_size;

	return 0;
}

/**
 * bud_wbuf_callback - bud LEB write-buffer synchronization call-back.
 * @c: UBIFS file-system description object
 * @lnum: LEB the write-buffer was synchronized to
 * @free: how many free bytes left in this LEB
 * @pad: how many bytes were padded
 *
 * This is a callback function which is called by the I/O unit when the
 * write-buffer is synchronized. We need this to correctly maintain space
 * accounting in bud logical eraseblocks. This function returns zero in case of
 * success and a negative error code in case of failure.
 *
 * This function actually belongs to the journal, but we keep it here because
 * we want to keep it static.
 */
static int bud_wbuf_callback(struct ubifs_info *c, int lnum, int free, int pad)
{
	/* Forward the new free and padded byte counts to the LEB properties */
	return ubifs_update_one_lp(c, lnum, free, pad, 0, 0);
}

/*
 * init_constants_sb - initialize UBIFS constants.
 * @c: UBIFS file-system description object
 *
 * This is a helper function which initializes various UBIFS constants after
 * the superblock has been read. It also checks various UBIFS parameters and
 * makes sure they are all right. Returns zero in case of success and a
 * negative error code in case of failure.
*/
static int init_constants_sb(struct ubifs_info *c)
{
	int tmp, err;
	long long tmp64;

	c->main_bytes = (long long)c->main_lebs * c->leb_size;
	c->max_znode_sz = sizeof(struct ubifs_znode) +
				c->fanout * sizeof(struct ubifs_zbranch);

	/* Index node size limits: one branch at least, 'c->fanout' at most */
	tmp = ubifs_idx_node_sz(c, 1);
	c->ranges[UBIFS_IDX_NODE].min_len = tmp;
	c->min_idx_node_sz = ALIGN(tmp, 8);

	tmp = ubifs_idx_node_sz(c, c->fanout);
	c->ranges[UBIFS_IDX_NODE].max_len = tmp;
	c->max_idx_node_sz = ALIGN(tmp, 8);

	/* Make sure LEB size is large enough to fit full commit */
	tmp = UBIFS_CS_NODE_SZ + UBIFS_REF_NODE_SZ * c->jhead_cnt;
	tmp = ALIGN(tmp, c->min_io_size);
	if (tmp > c->leb_size) {
		ubifs_err(c, "too small LEB size %d, at least %d needed",
			  c->leb_size, tmp);
		return -EINVAL;
	}

	/*
	 * Make sure that the log is large enough to fit reference nodes for
	 * all buds plus one reserved LEB.
	 */
	tmp64 = c->max_bud_bytes + c->leb_size - 1;
	c->max_bud_cnt = div_u64(tmp64, c->leb_size);
	tmp = (c->ref_node_alsz * c->max_bud_cnt + c->leb_size - 1);
	tmp /= c->leb_size;
	tmp += 1;
	if (c->log_lebs < tmp) {
		ubifs_err(c, "too small log %d LEBs, required min. %d LEBs",
			  c->log_lebs, tmp);
		return -EINVAL;
	}

	/*
	 * When budgeting we assume worst-case scenarios when the pages are not
	 * compressed and direntries are of the maximum size.
	 *
	 * Note, data, which may be stored in inodes is budgeted separately, so
	 * it is not included into 'c->bi.inode_budget'.
	 */
	c->bi.page_budget = UBIFS_MAX_DATA_NODE_SZ * UBIFS_BLOCKS_PER_PAGE;
	c->bi.inode_budget = UBIFS_INO_NODE_SZ;
	c->bi.dent_budget = UBIFS_MAX_DENT_NODE_SZ;

	/*
	 * When the amount of flash space used by buds becomes
	 * 'c->max_bud_bytes', UBIFS just blocks all writers and starts commit.
	 * The writers are unblocked when the commit is finished. To avoid
	 * blocking writers, UBIFS initiates background commit in advance,
	 * when number of bud bytes becomes above the limit defined below
	 * (13/16 of 'c->max_bud_bytes').
	 */
	c->bg_bud_bytes = (c->max_bud_bytes * 13) >> 4;

	/*
	 * Ensure minimum journal size. All the bytes in the journal heads are
	 * considered to be used, when calculating the current journal usage.
	 * Consequently, if the journal is too small, UBIFS will treat it as
	 * always full.
	 */
	tmp64 = (long long)(c->jhead_cnt + 1) * c->leb_size + 1;
	if (c->bg_bud_bytes < tmp64)
		c->bg_bud_bytes = tmp64;
	if (c->max_bud_bytes < tmp64 + c->leb_size)
		c->max_bud_bytes = tmp64 + c->leb_size;

	err = ubifs_calc_lpt_geom(c);
	if (err)
		return err;

	/* Initialize effective LEB size used in budgeting calculations */
	c->idx_leb_size = c->leb_size - c->max_idx_node_sz;
	return 0;
}

/*
 * init_constants_master - initialize UBIFS constants.
 * @c: UBIFS file-system description object
 *
 * This is a helper function which initializes various UBIFS constants after
 * the master node has been read. It also checks various UBIFS parameters and
 * makes sure they are all right.
 */
static void init_constants_master(struct ubifs_info *c)
{
	long long tmp64;

	c->bi.min_idx_lebs = ubifs_calc_min_idx_lebs(c);
	c->report_rp_size = ubifs_reported_space(c, c->rp_size);

	/*
	 * Calculate total amount of FS blocks. This number is not used
	 * internally because it does not make much sense for UBIFS, but it is
	 * necessary to report something for the 'statfs()' call.
	 *
	 * Subtract the LEB reserved for GC, the LEB which is reserved for
	 * deletions, minimum LEBs for the index, the LEBs which are reserved
	 * for each journal head.
	 */
	tmp64 = c->main_lebs - 1 - 1 - MIN_INDEX_LEBS - c->jhead_cnt;
	/* Per-LEB capacity excludes the tail wasted by max-size data nodes */
	tmp64 *= (long long)c->leb_size - c->leb_overhead;
	tmp64 = ubifs_reported_space(c, tmp64);
	c->block_cnt = tmp64 >> UBIFS_BLOCK_SHIFT;
}

/**
 * take_gc_lnum - reserve GC LEB.
 * @c: UBIFS file-system description object
 *
 * This function ensures that the LEB reserved for garbage collection is marked
 * as "taken" in lprops. We also have to set free space to LEB size and dirty
 * space to zero, because lprops may contain out-of-date information if the
 * file-system was un-mounted before it has been committed.
This function * returns zero in case of success and a negative error code in case of * failure. */ static int take_gc_lnum(struct ubifs_info *c) { int err; if (c->gc_lnum == -1) { ubifs_err(c, "no LEB for GC"); return -EINVAL; } /* And we have to tell lprops that this LEB is taken */ err = ubifs_change_one_lp(c, c->gc_lnum, c->leb_size, 0, LPROPS_TAKEN, 0, 0); return err; } /** * alloc_wbufs - allocate write-buffers. * @c: UBIFS file-system description object * * This helper function allocates and initializes UBIFS write-buffers. Returns * zero in case of success and %-ENOMEM in case of failure. */ static int alloc_wbufs(struct ubifs_info *c) { int i, err; c->jheads = kcalloc(c->jhead_cnt, sizeof(struct ubifs_jhead), GFP_KERNEL); if (!c->jheads) return -ENOMEM; /* Initialize journal heads */ for (i = 0; i < c->jhead_cnt; i++) { INIT_LIST_HEAD(&c->jheads[i].buds_list); err = ubifs_wbuf_init(c, &c->jheads[i].wbuf); if (err) goto out_wbuf; c->jheads[i].wbuf.sync_callback = &bud_wbuf_callback; c->jheads[i].wbuf.jhead = i; c->jheads[i].grouped = 1; c->jheads[i].log_hash = ubifs_hash_get_desc(c); if (IS_ERR(c->jheads[i].log_hash)) { err = PTR_ERR(c->jheads[i].log_hash); goto out_log_hash; } } /* * Garbage Collector head does not need to be synchronized by timer. * Also GC head nodes are not grouped. */ c->jheads[GCHD].wbuf.no_timer = 1; c->jheads[GCHD].grouped = 0; return 0; out_log_hash: kfree(c->jheads[i].wbuf.buf); kfree(c->jheads[i].wbuf.inodes); out_wbuf: while (i--) { kfree(c->jheads[i].wbuf.buf); kfree(c->jheads[i].wbuf.inodes); kfree(c->jheads[i].log_hash); } kfree(c->jheads); c->jheads = NULL; return err; } /** * free_wbufs - free write-buffers. 
* @c: UBIFS file-system description object */ static void free_wbufs(struct ubifs_info *c) { int i; if (c->jheads) { for (i = 0; i < c->jhead_cnt; i++) { kfree(c->jheads[i].wbuf.buf); kfree(c->jheads[i].wbuf.inodes); kfree(c->jheads[i].log_hash); } kfree(c->jheads); c->jheads = NULL; } } /** * free_orphans - free orphans. * @c: UBIFS file-system description object */ static void free_orphans(struct ubifs_info *c) { struct ubifs_orphan *orph; while (c->orph_dnext) { orph = c->orph_dnext; c->orph_dnext = orph->dnext; list_del(&orph->list); kfree(orph); } while (!list_empty(&c->orph_list)) { orph = list_entry(c->orph_list.next, struct ubifs_orphan, list); list_del(&orph->list); kfree(orph); ubifs_err(c, "orphan list not empty at unmount"); } vfree(c->orph_buf); c->orph_buf = NULL; } /** * free_buds - free per-bud objects. * @c: UBIFS file-system description object */ static void free_buds(struct ubifs_info *c) { struct ubifs_bud *bud, *n; rbtree_postorder_for_each_entry_safe(bud, n, &c->buds, rb) { kfree(bud->log_hash); kfree(bud); } } /** * check_volume_empty - check if the UBI volume is empty. * @c: UBIFS file-system description object * * This function checks if the UBIFS volume is empty by looking if its LEBs are * mapped or not. The result of checking is stored in the @c->empty variable. * Returns zero in case of success and a negative error code in case of * failure. */ static int check_volume_empty(struct ubifs_info *c) { int lnum, err; c->empty = 1; for (lnum = 0; lnum < c->leb_cnt; lnum++) { err = ubifs_is_mapped(c, lnum); if (unlikely(err < 0)) return err; if (err == 1) { c->empty = 0; break; } cond_resched(); } return 0; } /* * UBIFS mount options. 
*
 * Opt_fast_unmount: do not run a journal commit before un-mounting
 * Opt_norm_unmount: run a journal commit before un-mounting
 * Opt_bulk_read: enable bulk-reads
 * Opt_no_bulk_read: disable bulk-reads
 * Opt_chk_data_crc: check CRCs when reading data nodes
 * Opt_no_chk_data_crc: do not check CRCs when reading data nodes
 * Opt_override_compr: override default compressor
 * Opt_assert: set ubifs_assert() action
 * Opt_auth_key: The key name used for authentication
 * Opt_auth_hash_name: The hash type used for authentication
 * Opt_ignore: accepted but ignored (used for the "ubi" and "vol" parameters)
 */
enum {
	Opt_fast_unmount,
	Opt_norm_unmount,
	Opt_bulk_read,
	Opt_no_bulk_read,
	Opt_chk_data_crc,
	Opt_no_chk_data_crc,
	Opt_override_compr,
	Opt_assert,
	Opt_auth_key,
	Opt_auth_hash_name,
	Opt_ignore,
};

/* Values accepted by the "compr" mount parameter */
static const struct constant_table ubifs_param_compr[] = {
	{ "none",	UBIFS_COMPR_NONE },
	{ "lzo",	UBIFS_COMPR_LZO },
	{ "zlib",	UBIFS_COMPR_ZLIB },
	{ "zstd",	UBIFS_COMPR_ZSTD },
	{}
};

/* Values accepted by the "assert" mount parameter */
static const struct constant_table ubifs_param_assert[] = {
	{ "report",	ASSACT_REPORT },
	{ "read-only",	ASSACT_RO },
	{ "panic",	ASSACT_PANIC },
	{}
};

/* Mount parameter specification handed to fs_parse() */
static const struct fs_parameter_spec ubifs_fs_param_spec[] = {
	fsparam_flag	("fast_unmount",	Opt_fast_unmount),
	fsparam_flag	("norm_unmount",	Opt_norm_unmount),
	fsparam_flag	("bulk_read",		Opt_bulk_read),
	fsparam_flag	("no_bulk_read",	Opt_no_bulk_read),
	fsparam_flag	("chk_data_crc",	Opt_chk_data_crc),
	fsparam_flag	("no_chk_data_crc",	Opt_no_chk_data_crc),
	fsparam_enum	("compr",		Opt_override_compr, ubifs_param_compr),
	fsparam_enum	("assert",		Opt_assert, ubifs_param_assert),
	fsparam_string	("auth_key",		Opt_auth_key),
	fsparam_string	("auth_hash_name",	Opt_auth_hash_name),
	fsparam_string	("ubi",			Opt_ignore),
	fsparam_string	("vol",			Opt_ignore),
	{}
};

/*
 * Mount option state collected while parsing parameters; it is reached
 * through 'fc->fs_private' and filled in by 'ubifs_parse_param()'.
 */
struct ubifs_fs_context {
	struct ubifs_mount_opts mount_opts;
	/* Strings taken over from the "auth_key"/"auth_hash_name" parameters */
	char *auth_key_name;
	char *auth_hash_name;
	unsigned int no_chk_data_crc:1;
	unsigned int bulk_read:1;
	unsigned int default_compr:2;
	unsigned int assert_action:2;
};

/**
 * ubifs_parse_param -
parse a parameter.
 * @fc: the filesystem context
 * @param: the parameter to parse
 *
 * This function parses a single UBIFS mount option and records it in the
 * UBIFS context attached to @fc. It returns zero in case of success and a
 * negative error code in case of failure.
 */
static int ubifs_parse_param(struct fs_context *fc, struct fs_parameter *param)
{
	bool is_remount = (fc->purpose & FS_CONTEXT_FOR_RECONFIGURE);
	struct ubifs_fs_context *ctx = fc->fs_private;
	struct fs_parse_result result;
	int opt;

	opt = fs_parse(fc, ubifs_fs_param_spec, param, &result);
	if (opt < 0)
		return opt;

	switch (opt) {
	/*
	 * The two unmount options below are ignored. We accept them in
	 * order to be backward-compatible. But this should be removed at
	 * some point.
	 */
	case Opt_fast_unmount:
		ctx->mount_opts.unmount_mode = 2;
		break;
	case Opt_norm_unmount:
		ctx->mount_opts.unmount_mode = 1;
		break;

	case Opt_bulk_read:
		ctx->bulk_read = 1;
		ctx->mount_opts.bulk_read = 2;
		break;
	case Opt_no_bulk_read:
		ctx->bulk_read = 0;
		ctx->mount_opts.bulk_read = 1;
		break;

	case Opt_chk_data_crc:
		ctx->no_chk_data_crc = 0;
		ctx->mount_opts.chk_data_crc = 2;
		break;
	case Opt_no_chk_data_crc:
		ctx->no_chk_data_crc = 1;
		ctx->mount_opts.chk_data_crc = 1;
		break;

	case Opt_override_compr:
		ctx->mount_opts.override_compr = 1;
		ctx->mount_opts.compr_type = result.uint_32;
		ctx->default_compr = ctx->mount_opts.compr_type;
		break;

	case Opt_assert:
		ctx->assert_action = result.uint_32;
		break;

	case Opt_auth_key:
		/* Not changed on remount */
		if (is_remount)
			break;
		/* Take ownership of the string, dropping any earlier one */
		kfree(ctx->auth_key_name);
		ctx->auth_key_name = param->string;
		param->string = NULL;
		break;

	case Opt_auth_hash_name:
		/* Not changed on remount */
		if (is_remount)
			break;
		/* Take ownership of the string, dropping any earlier one */
		kfree(ctx->auth_hash_name);
		ctx->auth_hash_name = param->string;
		param->string = NULL;
		break;

	case Opt_ignore:
		break;
	}

	return 0;
}

/*
 * ubifs_release_options - release mount parameters which have been dumped.
* @c: UBIFS file-system description object */ static void ubifs_release_options(struct ubifs_info *c) { kfree(c->auth_key_name); c->auth_key_name = NULL; kfree(c->auth_hash_name); c->auth_hash_name = NULL; } /** * destroy_journal - destroy journal data structures. * @c: UBIFS file-system description object * * This function destroys journal data structures including those that may have * been created by recovery functions. */ static void destroy_journal(struct ubifs_info *c) { while (!list_empty(&c->unclean_leb_list)) { struct ubifs_unclean_leb *ucleb; ucleb = list_entry(c->unclean_leb_list.next, struct ubifs_unclean_leb, list); list_del(&ucleb->list); kfree(ucleb); } while (!list_empty(&c->old_buds)) { struct ubifs_bud *bud; bud = list_entry(c->old_buds.next, struct ubifs_bud, list); list_del(&bud->list); kfree(bud->log_hash); kfree(bud); } ubifs_destroy_idx_gc(c); ubifs_destroy_size_tree(c); ubifs_tnc_close(c); free_buds(c); } /** * bu_init - initialize bulk-read information. * @c: UBIFS file-system description object */ static void bu_init(struct ubifs_info *c) { ubifs_assert(c, c->bulk_read == 1); if (c->bu.buf) return; /* Already initialized */ again: c->bu.buf = kmalloc(c->max_bu_buf_len, GFP_KERNEL | __GFP_NOWARN); if (!c->bu.buf) { if (c->max_bu_buf_len > UBIFS_KMALLOC_OK) { c->max_bu_buf_len = UBIFS_KMALLOC_OK; goto again; } /* Just disable bulk-read */ ubifs_warn(c, "cannot allocate %d bytes of memory for bulk-read, disabling it", c->max_bu_buf_len); c->mount_opts.bulk_read = 1; c->bulk_read = 0; return; } } /** * check_free_space - check if there is enough free space to mount. * @c: UBIFS file-system description object * * This function makes sure UBIFS has enough free space to be mounted in * read/write mode. UBIFS must always have some free space to allow deletions. 
*/ static int check_free_space(struct ubifs_info *c) { ubifs_assert(c, c->dark_wm > 0); if (c->lst.total_free + c->lst.total_dirty < c->dark_wm) { ubifs_err(c, "insufficient free space to mount in R/W mode"); ubifs_dump_budg(c, &c->bi); ubifs_dump_lprops(c); return -ENOSPC; } return 0; } /** * mount_ubifs - mount UBIFS file-system. * @c: UBIFS file-system description object * * This function mounts UBIFS file system. Returns zero in case of success and * a negative error code in case of failure. */ static int mount_ubifs(struct ubifs_info *c) { int err; long long x, y; size_t sz; c->ro_mount = !!sb_rdonly(c->vfs_sb); /* Suppress error messages while probing if SB_SILENT is set */ c->probing = !!(c->vfs_sb->s_flags & SB_SILENT); err = init_constants_early(c); if (err) return err; err = ubifs_debugging_init(c); if (err) return err; err = ubifs_sysfs_register(c); if (err) goto out_debugging; err = check_volume_empty(c); if (err) goto out_free; if (c->empty && (c->ro_mount || c->ro_media)) { /* * This UBI volume is empty, and read-only, or the file system * is mounted read-only - we cannot format it. */ ubifs_err(c, "can't format empty UBI volume: read-only %s", c->ro_media ? "UBI volume" : "mount"); err = -EROFS; goto out_free; } if (c->ro_media && !c->ro_mount) { ubifs_err(c, "cannot mount read-write - read-only media"); err = -EROFS; goto out_free; } /* * The requirement for the buffer is that it should fit indexing B-tree * height amount of integers. We assume the height if the TNC tree will * never exceed 64. 
*/ err = -ENOMEM; c->bottom_up_buf = kmalloc_array(BOTTOM_UP_HEIGHT, sizeof(int), GFP_KERNEL); if (!c->bottom_up_buf) goto out_free; c->sbuf = vmalloc(c->leb_size); if (!c->sbuf) goto out_free; if (!c->ro_mount) { c->ileb_buf = vmalloc(c->leb_size); if (!c->ileb_buf) goto out_free; } if (c->bulk_read == 1) bu_init(c); if (!c->ro_mount) { c->write_reserve_buf = kmalloc(COMPRESSED_DATA_NODE_BUF_SZ + \ UBIFS_CIPHER_BLOCK_SIZE, GFP_KERNEL); if (!c->write_reserve_buf) goto out_free; } c->mounting = 1; if (c->auth_key_name) { if (IS_ENABLED(CONFIG_UBIFS_FS_AUTHENTICATION)) { err = ubifs_init_authentication(c); if (err) goto out_free; } else { ubifs_err(c, "auth_key_name, but UBIFS is built without" " authentication support"); err = -EINVAL; goto out_free; } } err = ubifs_read_superblock(c); if (err) goto out_auth; c->probing = 0; /* * Make sure the compressor which is set as default in the superblock * or overridden by mount options is actually compiled in. */ if (!ubifs_compr_present(c, c->default_compr)) { ubifs_err(c, "'compressor \"%s\" is not compiled in", ubifs_compr_name(c, c->default_compr)); err = -ENOTSUPP; goto out_auth; } err = init_constants_sb(c); if (err) goto out_auth; sz = ALIGN(c->max_idx_node_sz, c->min_io_size) * 2; c->cbuf = kmalloc(sz, GFP_NOFS); if (!c->cbuf) { err = -ENOMEM; goto out_auth; } err = alloc_wbufs(c); if (err) goto out_cbuf; sprintf(c->bgt_name, BGT_NAME_PATTERN, c->vi.ubi_num, c->vi.vol_id); if (!c->ro_mount) { /* Create background thread */ c->bgt = kthread_run(ubifs_bg_thread, c, "%s", c->bgt_name); if (IS_ERR(c->bgt)) { err = PTR_ERR(c->bgt); c->bgt = NULL; ubifs_err(c, "cannot spawn \"%s\", error %d", c->bgt_name, err); goto out_wbufs; } } err = ubifs_read_master(c); if (err) goto out_master; init_constants_master(c); if ((c->mst_node->flags & cpu_to_le32(UBIFS_MST_DIRTY)) != 0) { ubifs_msg(c, "recovery needed"); c->need_recovery = 1; } if (c->need_recovery && !c->ro_mount) { err = ubifs_recover_inl_heads(c, c->sbuf); if (err) 
goto out_master; } err = ubifs_lpt_init(c, 1, !c->ro_mount); if (err) goto out_master; if (!c->ro_mount && c->space_fixup) { err = ubifs_fixup_free_space(c); if (err) goto out_lpt; } if (!c->ro_mount && !c->need_recovery) { /* * Set the "dirty" flag so that if we reboot uncleanly we * will notice this immediately on the next mount. */ c->mst_node->flags |= cpu_to_le32(UBIFS_MST_DIRTY); err = ubifs_write_master(c); if (err) goto out_lpt; } /* * Handle offline signed images: Now that the master node is * written and its validation no longer depends on the hash * in the superblock, we can update the offline signed * superblock with a HMAC version, */ if (ubifs_authenticated(c) && ubifs_hmac_zero(c, c->sup_node->hmac)) { err = ubifs_hmac_wkm(c, c->sup_node->hmac_wkm); if (err) goto out_lpt; c->superblock_need_write = 1; } if (!c->ro_mount && c->superblock_need_write) { err = ubifs_write_sb_node(c, c->sup_node); if (err) goto out_lpt; c->superblock_need_write = 0; } err = dbg_check_idx_size(c, c->bi.old_idx_sz); if (err) goto out_lpt; err = ubifs_replay_journal(c); if (err) goto out_journal; /* Calculate 'min_idx_lebs' after journal replay */ c->bi.min_idx_lebs = ubifs_calc_min_idx_lebs(c); err = ubifs_mount_orphans(c, c->need_recovery, c->ro_mount); if (err) goto out_orphans; if (!c->ro_mount) { int lnum; err = check_free_space(c); if (err) goto out_orphans; /* Check for enough log space */ lnum = c->lhead_lnum + 1; if (lnum >= UBIFS_LOG_LNUM + c->log_lebs) lnum = UBIFS_LOG_LNUM; if (lnum == c->ltail_lnum) { err = ubifs_consolidate_log(c); if (err) goto out_orphans; } if (c->need_recovery) { if (!ubifs_authenticated(c)) { err = ubifs_recover_size(c, true); if (err) goto out_orphans; } err = ubifs_rcvry_gc_commit(c); if (err) goto out_orphans; if (ubifs_authenticated(c)) { err = ubifs_recover_size(c, false); if (err) goto out_orphans; } } else { err = take_gc_lnum(c); if (err) goto out_orphans; /* * GC LEB may contain garbage if there was an unclean * reboot, and it 
should be un-mapped. */ err = ubifs_leb_unmap(c, c->gc_lnum); if (err) goto out_orphans; } err = dbg_check_lprops(c); if (err) goto out_orphans; } else if (c->need_recovery) { err = ubifs_recover_size(c, false); if (err) goto out_orphans; } else { /* * Even if we mount read-only, we have to set space in GC LEB * to proper value because this affects UBIFS free space * reporting. We do not want to have a situation when * re-mounting from R/O to R/W changes amount of free space. */ err = take_gc_lnum(c); if (err) goto out_orphans; } spin_lock(&ubifs_infos_lock); list_add_tail(&c->infos_list, &ubifs_infos); spin_unlock(&ubifs_infos_lock); if (c->need_recovery) { if (c->ro_mount) ubifs_msg(c, "recovery deferred"); else { c->need_recovery = 0; ubifs_msg(c, "recovery completed"); /* * GC LEB has to be empty and taken at this point. But * the journal head LEBs may also be accounted as * "empty taken" if they are empty. */ ubifs_assert(c, c->lst.taken_empty_lebs > 0); } } else ubifs_assert(c, c->lst.taken_empty_lebs > 0); err = dbg_check_filesystem(c); if (err) goto out_infos; dbg_debugfs_init_fs(c); c->mounting = 0; ubifs_msg(c, "UBIFS: mounted UBI device %d, volume %d, name \"%s\"%s", c->vi.ubi_num, c->vi.vol_id, c->vi.name, c->ro_mount ? ", R/O mode" : ""); x = (long long)c->main_lebs * c->leb_size; y = (long long)c->log_lebs * c->leb_size + c->max_bud_bytes; ubifs_msg(c, "LEB size: %d bytes (%d KiB), min./max. 
I/O unit sizes: %d bytes/%d bytes", c->leb_size, c->leb_size >> 10, c->min_io_size, c->max_write_size); ubifs_msg(c, "FS size: %lld bytes (%lld MiB, %d LEBs), max %d LEBs, journal size %lld bytes (%lld MiB, %d LEBs)", x, x >> 20, c->main_lebs, c->max_leb_cnt, y, y >> 20, c->log_lebs + c->max_bud_cnt); ubifs_msg(c, "reserved for root: %llu bytes (%llu KiB)", c->report_rp_size, c->report_rp_size >> 10); ubifs_msg(c, "media format: w%d/r%d (latest is w%d/r%d), UUID %pUB%s", c->fmt_version, c->ro_compat_version, UBIFS_FORMAT_VERSION, UBIFS_RO_COMPAT_VERSION, c->uuid, c->big_lpt ? ", big LPT model" : ", small LPT model"); dbg_gen("default compressor: %s", ubifs_compr_name(c, c->default_compr)); dbg_gen("data journal heads: %d", c->jhead_cnt - NONDATA_JHEADS_CNT); dbg_gen("log LEBs: %d (%d - %d)", c->log_lebs, UBIFS_LOG_LNUM, c->log_last); dbg_gen("LPT area LEBs: %d (%d - %d)", c->lpt_lebs, c->lpt_first, c->lpt_last); dbg_gen("orphan area LEBs: %d (%d - %d)", c->orph_lebs, c->orph_first, c->orph_last); dbg_gen("main area LEBs: %d (%d - %d)", c->main_lebs, c->main_first, c->leb_cnt - 1); dbg_gen("index LEBs: %d", c->lst.idx_lebs); dbg_gen("total index bytes: %llu (%llu KiB, %llu MiB)", c->bi.old_idx_sz, c->bi.old_idx_sz >> 10, c->bi.old_idx_sz >> 20); dbg_gen("key hash type: %d", c->key_hash_type); dbg_gen("tree fanout: %d", c->fanout); dbg_gen("reserved GC LEB: %d", c->gc_lnum); dbg_gen("max. znode size %d", c->max_znode_sz); dbg_gen("max. index node size %d", c->max_idx_node_sz); dbg_gen("node sizes: data %zu, inode %zu, dentry %zu", UBIFS_DATA_NODE_SZ, UBIFS_INO_NODE_SZ, UBIFS_DENT_NODE_SZ); dbg_gen("node sizes: trun %zu, sb %zu, master %zu", UBIFS_TRUN_NODE_SZ, UBIFS_SB_NODE_SZ, UBIFS_MST_NODE_SZ); dbg_gen("node sizes: ref %zu, cmt. start %zu, orph %zu", UBIFS_REF_NODE_SZ, UBIFS_CS_NODE_SZ, UBIFS_ORPH_NODE_SZ); dbg_gen("max. 
node sizes: data %zu, inode %zu dentry %zu, idx %d", UBIFS_MAX_DATA_NODE_SZ, UBIFS_MAX_INO_NODE_SZ, UBIFS_MAX_DENT_NODE_SZ, ubifs_idx_node_sz(c, c->fanout)); dbg_gen("dead watermark: %d", c->dead_wm); dbg_gen("dark watermark: %d", c->dark_wm); dbg_gen("LEB overhead: %d", c->leb_overhead); x = (long long)c->main_lebs * c->dark_wm; dbg_gen("max. dark space: %lld (%lld KiB, %lld MiB)", x, x >> 10, x >> 20); dbg_gen("maximum bud bytes: %lld (%lld KiB, %lld MiB)", c->max_bud_bytes, c->max_bud_bytes >> 10, c->max_bud_bytes >> 20); dbg_gen("BG commit bud bytes: %lld (%lld KiB, %lld MiB)", c->bg_bud_bytes, c->bg_bud_bytes >> 10, c->bg_bud_bytes >> 20); dbg_gen("current bud bytes %lld (%lld KiB, %lld MiB)", c->bud_bytes, c->bud_bytes >> 10, c->bud_bytes >> 20); dbg_gen("max. seq. number: %llu", c->max_sqnum); dbg_gen("commit number: %llu", c->cmt_no); dbg_gen("max. xattrs per inode: %d", ubifs_xattr_max_cnt(c)); dbg_gen("max orphans: %d", c->max_orphans); return 0; out_infos: spin_lock(&ubifs_infos_lock); list_del(&c->infos_list); spin_unlock(&ubifs_infos_lock); out_orphans: free_orphans(c); out_journal: destroy_journal(c); out_lpt: ubifs_lpt_free(c, 0); out_master: kfree(c->mst_node); kfree(c->rcvrd_mst_node); if (c->bgt) kthread_stop(c->bgt); out_wbufs: free_wbufs(c); out_cbuf: kfree(c->cbuf); out_auth: ubifs_exit_authentication(c); out_free: kfree(c->write_reserve_buf); kfree(c->bu.buf); vfree(c->ileb_buf); vfree(c->sbuf); kfree(c->bottom_up_buf); kfree(c->sup_node); ubifs_sysfs_unregister(c); out_debugging: ubifs_debugging_exit(c); return err; } /** * ubifs_umount - un-mount UBIFS file-system. * @c: UBIFS file-system description object * * Note, this function is called to free allocated resourced when un-mounting, * as well as free resources when an error occurred while we were half way * through mounting (error path cleanup function). So it has to make sure the * resource was actually allocated before freeing it. 
 */
static void ubifs_umount(struct ubifs_info *c)
{
	dbg_gen("un-mounting UBI device %d, volume %d", c->vi.ubi_num,
		c->vi.vol_id);

	dbg_debugfs_exit_fs(c);
	/* Take this file-system off the global list under its spinlock */
	spin_lock(&ubifs_infos_lock);
	list_del(&c->infos_list);
	spin_unlock(&ubifs_infos_lock);

	if (c->bgt)
		kthread_stop(c->bgt);

	destroy_journal(c);
	free_wbufs(c);
	free_orphans(c);
	ubifs_lpt_free(c, 0);
	ubifs_exit_authentication(c);

	ubifs_release_options(c);
	kfree(c->cbuf);
	kfree(c->rcvrd_mst_node);
	kfree(c->mst_node);
	kfree(c->write_reserve_buf);
	kfree(c->bu.buf);
	vfree(c->ileb_buf);
	vfree(c->sbuf);
	kfree(c->bottom_up_buf);
	kfree(c->sup_node);
	ubifs_debugging_exit(c);
	ubifs_sysfs_unregister(c);
}

/**
 * ubifs_remount_rw - re-mount in read-write mode.
 * @c: UBIFS file-system description object
 *
 * UBIFS avoids allocating many unnecessary resources when mounted in read-only
 * mode. This function allocates the needed resources and re-mounts UBIFS in
 * read-write mode.
 *
 * Returns %-EROFS when @c->rw_incompat is set. Otherwise the work is done
 * under @c->umount_mutex; if any step fails, the resources allocated here are
 * released again, @c->ro_mount is set back to %1 and the error code of the
 * failed step is returned. On the success path the return value is zero, or
 * the result of dbg_check_space_info() when no recovery was pending.
 */
static int ubifs_remount_rw(struct ubifs_info *c)
{
	int err, lnum;

	if (c->rw_incompat) {
		ubifs_err(c, "the file-system is not R/W-compatible");
		ubifs_msg(c, "on-flash format version is w%d/r%d, but software only supports up to version w%d/r%d",
			  c->fmt_version, c->ro_compat_version,
			  UBIFS_FORMAT_VERSION, UBIFS_RO_COMPAT_VERSION);
		return -EROFS;
	}

	mutex_lock(&c->umount_mutex);
	dbg_save_space_info(c);
	c->remounting_rw = 1;
	c->ro_mount = 0;

	if (c->space_fixup) {
		err = ubifs_fixup_free_space(c);
		if (err)
			goto out;
	}

	err = check_free_space(c);
	if (err)
		goto out;

	if (c->need_recovery) {
		ubifs_msg(c, "completing deferred recovery");
		err = ubifs_write_rcvrd_mst_node(c);
		if (err)
			goto out;
		/*
		 * For authenticated file-systems the size recovery is run
		 * later in this function, after ubifs_rcvry_gc_commit().
		 */
		if (!ubifs_authenticated(c)) {
			err = ubifs_recover_size(c, true);
			if (err)
				goto out;
		}
		err = ubifs_clean_lebs(c, c->sbuf);
		if (err)
			goto out;
		err = ubifs_recover_inl_heads(c, c->sbuf);
		if (err)
			goto out;
	} else {
		/* A readonly mount is not allowed to have orphans */
		ubifs_assert(c, c->tot_orphans == 0);
		err = ubifs_clear_orphans(c);
		if (err)
			goto out;
	}

	/* Set the dirty flag in the master node if it is not set yet */
	if (!(c->mst_node->flags & cpu_to_le32(UBIFS_MST_DIRTY))) {
		c->mst_node->flags |= cpu_to_le32(UBIFS_MST_DIRTY);
		err = ubifs_write_master(c);
		if (err)
			goto out;
	}

	if (c->superblock_need_write) {
		struct ubifs_sb_node *sup = c->sup_node;

		err = ubifs_write_sb_node(c, sup);
		if (err)
			goto out;

		c->superblock_need_write = 0;
	}

	c->ileb_buf = vmalloc(c->leb_size);
	if (!c->ileb_buf) {
		err = -ENOMEM;
		goto out;
	}

	c->write_reserve_buf = kmalloc(COMPRESSED_DATA_NODE_BUF_SZ + \
				       UBIFS_CIPHER_BLOCK_SIZE, GFP_KERNEL);
	if (!c->write_reserve_buf) {
		err = -ENOMEM;
		goto out;
	}

	err = ubifs_lpt_init(c, 0, 1);
	if (err)
		goto out;

	/* Create background thread */
	c->bgt = kthread_run(ubifs_bg_thread, c, "%s", c->bgt_name);
	if (IS_ERR(c->bgt)) {
		err = PTR_ERR(c->bgt);
		/* Clear the pointer so that the 'out' path does not stop it */
		c->bgt = NULL;
		ubifs_err(c, "cannot spawn \"%s\", error %d",
			  c->bgt_name, err);
		goto out;
	}

	c->orph_buf = vmalloc(c->leb_size);
	if (!c->orph_buf) {
		err = -ENOMEM;
		goto out;
	}

	/* Check for enough log space */
	lnum = c->lhead_lnum + 1;
	if (lnum >= UBIFS_LOG_LNUM + c->log_lebs)
		lnum = UBIFS_LOG_LNUM;
	if (lnum == c->ltail_lnum) {
		err = ubifs_consolidate_log(c);
		if (err)
			goto out;
	}

	if (c->need_recovery) {
		err = ubifs_rcvry_gc_commit(c);
		if (err)
			goto out;

		if (ubifs_authenticated(c)) {
			err = ubifs_recover_size(c, false);
			if (err)
				goto out;
		}
	} else {
		err = ubifs_leb_unmap(c, c->gc_lnum);
	}
	if (err)
		goto out;

	dbg_gen("re-mounted read-write");
	c->remounting_rw = 0;

	if (c->need_recovery) {
		c->need_recovery = 0;
		ubifs_msg(c, "deferred recovery completed");
	} else {
		/*
		 * Do not run the debugging space check if we were doing
		 * recovery, because when we saved the information we had the
		 * file-system in a state where the TNC and lprops has been
		 * modified in memory, but all the I/O operations (including a
		 * commit) were deferred. So the file-system was in
		 * "non-committed" state. Now the file-system is in committed
		 * state, and of course the amount of free space will change
		 * because, for example, the old index size was imprecise.
		 */
		err = dbg_check_space_info(c);
	}

	mutex_unlock(&c->umount_mutex);
	return err;

out:
	/* Undo everything done above and fall back to the read-only state */
	c->ro_mount = 1;
	vfree(c->orph_buf);
	c->orph_buf = NULL;
	if (c->bgt) {
		kthread_stop(c->bgt);
		c->bgt = NULL;
	}
	kfree(c->write_reserve_buf);
	c->write_reserve_buf = NULL;
	vfree(c->ileb_buf);
	c->ileb_buf = NULL;
	ubifs_lpt_free(c, 1);
	c->remounting_rw = 0;
	mutex_unlock(&c->umount_mutex);
	return err;
}

/**
 * ubifs_remount_ro - re-mount in read-only mode.
 * @c: UBIFS file-system description object
 *
 * We assume VFS has stopped writing. Possibly the background thread could be
 * running a commit, however kthread_stop will wait in that case.
 *
 * This function has no return value: failures of the write-buffer
 * synchronization, of the master node write and of the debugging space check
 * are handed to ubifs_ro_mode().
 */
static void ubifs_remount_ro(struct ubifs_info *c)
{
	int i, err;

	ubifs_assert(c, !c->need_recovery);
	ubifs_assert(c, !c->ro_mount);

	mutex_lock(&c->umount_mutex);
	if (c->bgt) {
		kthread_stop(c->bgt);
		c->bgt = NULL;
	}

	dbg_save_space_info(c);

	for (i = 0; i < c->jhead_cnt; i++) {
		err = ubifs_wbuf_sync(&c->jheads[i].wbuf);
		if (err)
			ubifs_ro_mode(c, err);
	}

	/* Same master node update as on a clean un-mount in ubifs_put_super() */
	c->mst_node->flags &= ~cpu_to_le32(UBIFS_MST_DIRTY);
	c->mst_node->flags |= cpu_to_le32(UBIFS_MST_NO_ORPHS);
	c->mst_node->gc_lnum = cpu_to_le32(c->gc_lnum);
	err = ubifs_write_master(c);
	if (err)
		ubifs_ro_mode(c, err);

	/* Release the buffers which ubifs_remount_rw() allocates */
	vfree(c->orph_buf);
	c->orph_buf = NULL;
	kfree(c->write_reserve_buf);
	c->write_reserve_buf = NULL;
	vfree(c->ileb_buf);
	c->ileb_buf = NULL;
	ubifs_lpt_free(c, 1);
	c->ro_mount = 1;
	err = dbg_check_space_info(c);
	if (err)
		ubifs_ro_mode(c, err);
	mutex_unlock(&c->umount_mutex);
}

/**
 * ubifs_put_super - '->put_super()' handler of 'ubifs_super_operations'.
 * @sb: VFS super block, its @s_fs_info points to the UBIFS description object
 *
 * If the file-system is not mounted read-only, the background thread is
 * stopped and, unless @c->ro_error is set, the write-buffers are synchronized
 * and the master node is written. After that the resources are released with
 * ubifs_umount() and the UBI volume is closed.
 */
static void ubifs_put_super(struct super_block *sb)
{
	int i;
	struct ubifs_info *c = sb->s_fs_info;

	ubifs_msg(c, "un-mount UBI device %d", c->vi.ubi_num);

	/*
	 * The following asserts are only valid if there has not been a failure
	 * of the media. For example, there will be dirty inodes if we failed
	 * to write them back because of I/O errors.
	 */
	if (!c->ro_error) {
		ubifs_assert(c, c->bi.idx_growth == 0);
		ubifs_assert(c, c->bi.dd_growth == 0);
		ubifs_assert(c, c->bi.data_growth == 0);
	}

	/*
	 * The 'c->umount_mutex' prevents races between UBIFS memory shrinker
	 * and file system un-mount. Namely, it prevents the shrinker from
	 * picking this superblock for shrinking - it will be just skipped if
	 * the mutex is locked.
	 */
	mutex_lock(&c->umount_mutex);
	if (!c->ro_mount) {
		/*
		 * First of all kill the background thread to make sure it does
		 * not interfere with un-mounting and freeing resources.
		 */
		if (c->bgt) {
			kthread_stop(c->bgt);
			c->bgt = NULL;
		}

		/*
		 * On fatal errors c->ro_error is set to 1, in which case we do
		 * not write the master node.
		 */
		if (!c->ro_error) {
			int err;

			/* Synchronize write-buffers */
			for (i = 0; i < c->jhead_cnt; i++) {
				err = ubifs_wbuf_sync(&c->jheads[i].wbuf);
				if (err)
					ubifs_ro_mode(c, err);
			}

			/*
			 * We are being cleanly unmounted which means the
			 * orphans were killed - indicate this in the master
			 * node. Also save the reserved GC LEB number.
			 */
			c->mst_node->flags &= ~cpu_to_le32(UBIFS_MST_DIRTY);
			c->mst_node->flags |= cpu_to_le32(UBIFS_MST_NO_ORPHS);
			c->mst_node->gc_lnum = cpu_to_le32(c->gc_lnum);
			err = ubifs_write_master(c);
			if (err)
				/*
				 * Recovery will attempt to fix the master area
				 * next mount, so we just print a message and
				 * continue to unmount normally.
				 */
				ubifs_err(c, "failed to write master node, error %d",
					  err);
		} else {
			for (i = 0; i < c->jhead_cnt; i++)
				/* Make sure write-buffer timers are canceled */
				hrtimer_cancel(&c->jheads[i].wbuf.timer);
		}
	}

	ubifs_umount(c);
	ubi_close_volume(c->ubi);
	mutex_unlock(&c->umount_mutex);
}

/**
 * ubifs_reconfigure - '->reconfigure()' handler of 'ubifs_context_ops'.
 * @fc: the filesystem context carrying the new flags and parsed options
 *
 * Copies the mount options from the context into the UBIFS description
 * object, switches between read-only and read-write mode if the %SB_RDONLY
 * flag of @fc differs from the current mode, and then enables or disables
 * bulk-read according to the new options.
 *
 * Returns zero on success, %-EROFS if the mode switch is refused, or the
 * error code of ubifs_remount_rw(). Note that the options have already been
 * copied when one of these errors is returned.
 */
static int ubifs_reconfigure(struct fs_context *fc)
{
	struct ubifs_fs_context *ctx = fc->fs_private;
	struct super_block *sb = fc->root->d_sb;
	int err;
	struct ubifs_info *c = sb->s_fs_info;

	sync_filesystem(sb);
	dbg_gen("old flags %#lx, new flags %#x", sb->s_flags, fc->sb_flags);

	/*
	 * Apply the mount option changes.
	 * auth_key_name and auth_hash_name are ignored on remount.
	 */
	c->mount_opts		= ctx->mount_opts;
	c->bulk_read		= ctx->bulk_read;
	c->no_chk_data_crc	= ctx->no_chk_data_crc;
	c->default_compr	= ctx->default_compr;
	c->assert_action	= ctx->assert_action;

	if (c->ro_mount && !(fc->sb_flags & SB_RDONLY)) {
		if (c->ro_error) {
			ubifs_msg(c, "cannot re-mount R/W due to prior errors");
			return -EROFS;
		}
		if (c->ro_media) {
			ubifs_msg(c, "cannot re-mount R/W - UBI volume is R/O");
			return -EROFS;
		}
		err = ubifs_remount_rw(c);
		if (err)
			return err;
	} else if (!c->ro_mount && (fc->sb_flags & SB_RDONLY)) {
		if (c->ro_error) {
			ubifs_msg(c, "cannot re-mount R/O due to prior errors");
			return -EROFS;
		}
		ubifs_remount_ro(c);
	}

	if (c->bulk_read == 1)
		bu_init(c);
	else {
		dbg_gen("disable bulk-read");
		/* The bulk-read buffer is released under 'c->bu_mutex' */
		mutex_lock(&c->bu_mutex);
		kfree(c->bu.buf);
		c->bu.buf = NULL;
		mutex_unlock(&c->bu_mutex);
	}

	if (!c->need_recovery)
		ubifs_assert(c, c->lst.taken_empty_lebs > 0);

	return 0;
}

/* Super block operations which UBIFS installs in ubifs_fill_super() */
const struct super_operations ubifs_super_operations = {
	.alloc_inode   = ubifs_alloc_inode,
	.free_inode    = ubifs_free_inode,
	.put_super     = ubifs_put_super,
	.write_inode   = ubifs_write_inode,
	.drop_inode    = ubifs_drop_inode,
	.evict_inode   = ubifs_evict_inode,
	.statfs        = ubifs_statfs,
	.dirty_inode   = ubifs_dirty_inode,
	.show_options  = ubifs_show_options,
	.sync_fs       = ubifs_sync_fs,
};

/**
 * open_ubi - parse UBI device name string and open the UBI device.
 * @fc: The filesystem context
 * @mode: UBI volume open mode
 *
 * The primary method of mounting UBIFS is by specifying the UBI volume
 * character device node path. However, UBIFS may also be mounted without any
 * character device node using one of the following methods:
 *
 * o ubiX_Y    - mount UBI device number X, volume Y;
 * o ubiY      - mount UBI device number 0, volume Y;
 * o ubiX:NAME - mount UBI device X, volume with name NAME;
 * o ubi:NAME  - mount UBI device 0, volume with name NAME.
 *
 * Alternative '!' separator may be used instead of ':' (because some shells
 * like busybox may interpret ':' as an NFS host name separator). This function
 * returns UBI volume description object in case of success and a negative
 * error code in case of failure.
 */
static struct ubi_volume_desc *open_ubi(struct fs_context *fc, int mode)
{
	struct ubi_volume_desc *ubi;
	const char *name = fc->source;
	int dev, vol;
	char *endptr;

	/* First, try to open using the device node path method */
	ubi = ubi_open_volume_path(name, mode);
	if (!IS_ERR(ubi))
		return ubi;

	/* Try the "nodev" method */
	if (name[0] != 'u' || name[1] != 'b' || name[2] != 'i')
		goto invalid_source;

	/* ubi:NAME method */
	if ((name[3] == ':' || name[3] == '!') && name[4] != '\0')
		return ubi_open_volume_nm(0, name + 4, mode);

	if (!isdigit(name[3]))
		goto invalid_source;

	/* Base 0 is passed, so the prefix of the number selects its radix */
	dev = simple_strtoul(name + 3, &endptr, 0);

	/* ubiY method */
	if (*endptr == '\0')
		return ubi_open_volume(0, dev, mode);

	/* ubiX_Y method */
	if (*endptr == '_' && isdigit(endptr[1])) {
		vol = simple_strtoul(endptr + 1, &endptr, 0);
		if (*endptr != '\0')
			goto invalid_source;
		return ubi_open_volume(dev, vol, mode);
	}

	/* ubiX:NAME method */
	if ((*endptr == ':' || *endptr == '!') && endptr[1] != '\0')
		return ubi_open_volume_nm(dev, ++endptr, mode);

invalid_source:
	return ERR_PTR(invalf(fc, "Invalid source name"));
}

/**
 * alloc_ubifs_info - allocate and initialize a UBIFS description object.
 * @ubi: description object of the opened UBI volume
 *
 * The object is zero-allocated, its locks, wait queues, RB-tree roots and
 * list heads are initialized, a few defaults are set and the UBI volume and
 * device information is read into @c->vi and @c->di.
 *
 * Returns the new object or %NULL if the allocation failed.
 */
static struct ubifs_info *alloc_ubifs_info(struct ubi_volume_desc *ubi)
{
	struct ubifs_info *c;

	c = kzalloc(sizeof(struct ubifs_info), GFP_KERNEL);
	if (c) {
		spin_lock_init(&c->cnt_lock);
		spin_lock_init(&c->cs_lock);
		spin_lock_init(&c->buds_lock);
		spin_lock_init(&c->space_lock);
		spin_lock_init(&c->orphan_lock);
		init_rwsem(&c->commit_sem);
		mutex_init(&c->lp_mutex);
		mutex_init(&c->tnc_mutex);
		mutex_init(&c->log_mutex);
		mutex_init(&c->umount_mutex);
		mutex_init(&c->bu_mutex);
		mutex_init(&c->write_reserve_mutex);
		init_waitqueue_head(&c->cmt_wq);
		init_waitqueue_head(&c->reserve_space_wq);
		atomic_set(&c->need_wait_space, 0);
		c->buds = RB_ROOT;
		c->old_idx = RB_ROOT;
		c->size_tree = RB_ROOT;
		c->orph_tree = RB_ROOT;
		INIT_LIST_HEAD(&c->infos_list);
		INIT_LIST_HEAD(&c->idx_gc);
		INIT_LIST_HEAD(&c->replay_list);
		INIT_LIST_HEAD(&c->replay_buds);
		INIT_LIST_HEAD(&c->uncat_list);
		INIT_LIST_HEAD(&c->empty_list);
		INIT_LIST_HEAD(&c->freeable_list);
		INIT_LIST_HEAD(&c->frdi_idx_list);
		INIT_LIST_HEAD(&c->unclean_leb_list);
		INIT_LIST_HEAD(&c->old_buds);
		INIT_LIST_HEAD(&c->orph_list);
		INIT_LIST_HEAD(&c->orph_new);
		/* The same defaults are set in ubifs_init_fs_context() */
		c->no_chk_data_crc = 1;
		c->assert_action = ASSACT_RO;

		c->highest_inum = UBIFS_FIRST_INO;
		c->lhead_lnum = c->ltail_lnum = UBIFS_LOG_LNUM;

		ubi_get_volume_info(ubi, &c->vi);
		ubi_get_device_info(c->vi.ubi_num, &c->di);
	}
	return c;
}

/**
 * ubifs_fill_super - set up a new super block and mount UBIFS on it.
 * @sb: the VFS super block, its @s_fs_info already points to the UBIFS
 *      description object
 * @fc: the filesystem context with the parsed mount options
 *
 * Opens the UBI volume in read-write mode, copies the mount options, sets up
 * the super block fields, calls mount_ubifs() and reads the root inode.
 *
 * Returns zero on success and a negative error code on failure.
 */
static int ubifs_fill_super(struct super_block *sb, struct fs_context *fc)
{
	struct ubifs_info *c = sb->s_fs_info;
	struct ubifs_fs_context *ctx = fc->fs_private;
	struct inode *root;
	int err;

	c->vfs_sb = sb;
	/* Re-open the UBI device in read-write mode */
	c->ubi = ubi_open_volume(c->vi.ubi_num, c->vi.vol_id, UBI_READWRITE);
	if (IS_ERR(c->ubi)) {
		err = PTR_ERR(c->ubi);
		goto out;
	}

	/* Copy in parsed mount options */
	c->mount_opts		= ctx->mount_opts;
	c->auth_key_name	= ctx->auth_key_name;
	c->auth_hash_name	= ctx->auth_hash_name;
	c->no_chk_data_crc	= ctx->no_chk_data_crc;
	c->bulk_read		= ctx->bulk_read;
	c->default_compr	= ctx->default_compr;
	c->assert_action	= ctx->assert_action;

	/* ubifs_info owns auth strings now */
	ctx->auth_key_name = NULL;
	ctx->auth_hash_name = NULL;

	/*
	 * UBIFS provides 'backing_dev_info' in order to disable read-ahead. For
	 * UBIFS, I/O is not deferred, it is done immediately in read_folio,
	 * which means the user would have to wait not just for their own I/O
	 * but the read-ahead I/O as well i.e. completely pointless.
	 *
	 * Read-ahead will be disabled because @sb->s_bdi->ra_pages is 0. Also
	 * @sb->s_bdi->capabilities are initialized to 0 so there won't be any
	 * writeback happening.
	 */
	err = super_setup_bdi_name(sb, "ubifs_%d_%d", c->vi.ubi_num,
				   c->vi.vol_id);
	if (err)
		goto out_close;
	sb->s_bdi->ra_pages = 0;
	sb->s_bdi->io_pages = 0;

	sb->s_fs_info = c;
	sb->s_magic = UBIFS_SUPER_MAGIC;
	sb->s_blocksize = UBIFS_BLOCK_SIZE;
	sb->s_blocksize_bits = UBIFS_BLOCK_SHIFT;
	/* The maximum file size is capped at MAX_LFS_FILESIZE */
	sb->s_maxbytes = c->max_inode_sz = key_max_inode_size(c);
	if (c->max_inode_sz > MAX_LFS_FILESIZE)
		sb->s_maxbytes = c->max_inode_sz = MAX_LFS_FILESIZE;
	sb->s_op = &ubifs_super_operations;
	sb->s_xattr = ubifs_xattr_handlers;
	fscrypt_set_ops(sb, &ubifs_crypt_operations);

	mutex_lock(&c->umount_mutex);
	err = mount_ubifs(c);
	if (err) {
		ubifs_assert(c, err < 0);
		goto out_unlock;
	}

	/* Read the root inode */
	root = ubifs_iget(sb, UBIFS_ROOT_INO);
	if (IS_ERR(root)) {
		err = PTR_ERR(root);
		goto out_umount;
	}

	generic_set_sb_d_ops(sb);
	sb->s_root = d_make_root(root);
	if (!sb->s_root) {
		err = -ENOMEM;
		goto out_umount;
	}

	super_set_uuid(sb, c->uuid, sizeof(c->uuid));
	super_set_sysfs_name_generic(sb, UBIFS_DFS_DIR_NAME,
				     c->vi.ubi_num, c->vi.vol_id);

	mutex_unlock(&c->umount_mutex);
	return 0;

out_umount:
	ubifs_umount(c);
out_unlock:
	mutex_unlock(&c->umount_mutex);
out_close:
	ubifs_release_options(c);
	ubi_close_volume(c->ubi);
out:
	return err;
}

/*
 * Super block comparison callback which ubifs_get_tree() passes to sget_fc():
 * returns non-zero if @sb and the description object stored in @fc refer to
 * the same UBI volume character device (@vi.cdev).
 */
static int sb_test(struct super_block *sb, struct fs_context *fc)
{
	struct ubifs_info *c1 = fc->s_fs_info;
	struct ubifs_info *c = sb->s_fs_info;

	return c->vi.cdev == c1->vi.cdev;
}

/**
 * ubifs_get_tree - '->get_tree()' handler of 'ubifs_context_ops'.
 * @fc: the filesystem context
 *
 * Opens the UBI volume named by @fc->source, then either finds the super
 * block of an already mounted UBIFS on that volume or fills in a new one, and
 * stores a reference to the root dentry in @fc->root.
 *
 * Returns zero on success and a negative error code on failure; %-EBUSY is
 * returned if the volume is already mounted with a different read-only mode.
 */
static int ubifs_get_tree(struct fs_context *fc)
{
	struct ubi_volume_desc *ubi;
	struct ubifs_info *c;
	struct super_block *sb;
	int err;

	if (!fc->source || !*fc->source)
		return invalf(fc, "No source specified");

	dbg_gen("name %s, flags %#x", fc->source, fc->sb_flags);

	/*
	 * Get UBI device number and volume ID. Mount it read-only so far
	 * because this might be a new mount point, and UBI allows only one
	 * read-write user at a time.
	 */
	ubi = open_ubi(fc, UBI_READONLY);
	if (IS_ERR(ubi)) {
		err = PTR_ERR(ubi);
		if (!(fc->sb_flags & SB_SILENT))
			pr_err("UBIFS error (pid: %d): cannot open \"%s\", error %d",
			       current->pid, fc->source, err);
		return err;
	}

	c = alloc_ubifs_info(ubi);
	if (!c) {
		err = -ENOMEM;
		goto out_close;
	}
	fc->s_fs_info = c;

	dbg_gen("opened ubi%d_%d", c->vi.ubi_num, c->vi.vol_id);

	sb = sget_fc(fc, sb_test, set_anon_super_fc);
	if (IS_ERR(sb)) {
		err = PTR_ERR(sb);
		kfree(c);
		goto out_close;
	}

	if (sb->s_root) {
		struct ubifs_info *c1 = sb->s_fs_info;

		/* The existing super block has its own description object */
		kfree(c);
		/* A new mount point for already mounted UBIFS */
		dbg_gen("this ubi volume is already mounted");
		if (!!(fc->sb_flags & SB_RDONLY) != c1->ro_mount) {
			err = -EBUSY;
			goto out_deact;
		}
	} else {
		err = ubifs_fill_super(sb, fc);
		if (err)
			goto out_deact;
		/* We do not support atime */
		sb->s_flags |= SB_ACTIVE;
		if (IS_ENABLED(CONFIG_UBIFS_ATIME_SUPPORT))
			ubifs_msg(c, "full atime support is enabled.");
		else
			sb->s_flags |= SB_NOATIME;
	}

	/* 'fill_super()' opens ubi again so we must close it here */
	ubi_close_volume(ubi);

	fc->root = dget(sb->s_root);
	return 0;

out_deact:
	deactivate_locked_super(sb);
out_close:
	ubi_close_volume(ubi);
	return err;
}

/*
 * '->kill_sb()' handler of 'ubifs_fs_type': kills the anonymous super block
 * and then frees the UBIFS description object which was attached to it.
 */
static void kill_ubifs_super(struct super_block *s)
{
	struct ubifs_info *c = s->s_fs_info;

	/* Read 's_fs_info' above, before the super block is killed */
	kill_anon_super(s);
	kfree(c);
}

/*
 * '->free()' handler of 'ubifs_context_ops': frees the UBIFS private context
 * together with the authentication strings it still points to.
 */
static void ubifs_free_fc(struct fs_context *fc)
{
	struct ubifs_fs_context *ctx = fc->fs_private;

	if (ctx) {
		kfree(ctx->auth_key_name);
		kfree(ctx->auth_hash_name);
		kfree(ctx);
	}
}

/* Filesystem context operations installed by ubifs_init_fs_context() */
static const struct fs_context_operations ubifs_context_ops = {
	.free		= ubifs_free_fc,
	.parse_param	= ubifs_parse_param,
	.get_tree	= ubifs_get_tree,
	.reconfigure	= ubifs_reconfigure,
};

/**
 * ubifs_init_fs_context - '->init_fs_context()' handler of 'ubifs_fs_type'.
 * @fc: the filesystem context to initialize
 *
 * Allocates the UBIFS private context. For a first mount it is filled with
 * default values, for a reconfiguration it is filled with the options of the
 * mounted file-system.
 *
 * Returns zero on success and %-ENOMEM if the allocation failed.
 */
static int ubifs_init_fs_context(struct fs_context *fc)
{
	struct ubifs_fs_context *ctx;

	ctx = kzalloc(sizeof(struct ubifs_fs_context), GFP_KERNEL);
	if (!ctx)
		return -ENOMEM;

	if (fc->purpose != FS_CONTEXT_FOR_RECONFIGURE) {
		/* Initialize for first mount */
		ctx->no_chk_data_crc = 1;
		ctx->assert_action = ASSACT_RO;
	} else {
		struct ubifs_info *c = fc->root->d_sb->s_fs_info;

		/*
		 * Preserve existing options across remounts.
		 * auth_key_name and auth_hash_name are not remountable.
		 */
		ctx->mount_opts		= c->mount_opts;
		ctx->bulk_read		= c->bulk_read;
		ctx->no_chk_data_crc	= c->no_chk_data_crc;
		ctx->default_compr	= c->default_compr;
		ctx->assert_action	= c->assert_action;
	}

	fc->ops = &ubifs_context_ops;
	fc->fs_private = ctx;

	return 0;
}

/* The file system type which ubifs_init() registers */
static struct file_system_type ubifs_fs_type = {
	.name    = "ubifs",
	.owner   = THIS_MODULE,
	.init_fs_context = ubifs_init_fs_context,
	.parameters	= ubifs_fs_param_spec,
	.kill_sb = kill_ubifs_super,
};
MODULE_ALIAS_FS("ubifs");

/*
 * Inode slab cache constructor.
 */
static void inode_slab_ctor(void *obj)
{
	struct ubifs_inode *ui = obj;
	inode_init_once(&ui->vfs_inode);
}

/**
 * ubifs_init - module initialization.
 *
 * Checks the node size constants at build time, then creates the inode slab
 * cache, allocates and registers the shrinker, initializes the compressors,
 * debugfs and sysfs support, and registers the file system type.
 *
 * Returns zero on success and a negative error code on failure, in which
 * case the error path below releases what had been set up.
 */
static int __init ubifs_init(void)
{
	/* Used as the return code if shrinker_alloc() fails */
	int err = -ENOMEM;

	BUILD_BUG_ON(sizeof(struct ubifs_ch) != 24);

	/* Make sure node sizes are 8-byte aligned */
	BUILD_BUG_ON(UBIFS_CH_SZ        & 7);
	BUILD_BUG_ON(UBIFS_INO_NODE_SZ  & 7);
	BUILD_BUG_ON(UBIFS_DENT_NODE_SZ & 7);
	BUILD_BUG_ON(UBIFS_XENT_NODE_SZ & 7);
	BUILD_BUG_ON(UBIFS_DATA_NODE_SZ & 7);
	BUILD_BUG_ON(UBIFS_TRUN_NODE_SZ & 7);
	BUILD_BUG_ON(UBIFS_SB_NODE_SZ   & 7);
	BUILD_BUG_ON(UBIFS_MST_NODE_SZ  & 7);
	BUILD_BUG_ON(UBIFS_REF_NODE_SZ  & 7);
	BUILD_BUG_ON(UBIFS_CS_NODE_SZ   & 7);
	BUILD_BUG_ON(UBIFS_ORPH_NODE_SZ & 7);

	BUILD_BUG_ON(UBIFS_MAX_DENT_NODE_SZ & 7);
	BUILD_BUG_ON(UBIFS_MAX_XENT_NODE_SZ & 7);
	BUILD_BUG_ON(UBIFS_MAX_DATA_NODE_SZ & 7);
	BUILD_BUG_ON(UBIFS_MAX_INO_NODE_SZ  & 7);
	BUILD_BUG_ON(UBIFS_MAX_NODE_SZ      & 7);
	BUILD_BUG_ON(MIN_WRITE_SZ           & 7);

	/* Check min. node size */
	BUILD_BUG_ON(UBIFS_INO_NODE_SZ  < MIN_WRITE_SZ);
	BUILD_BUG_ON(UBIFS_DENT_NODE_SZ < MIN_WRITE_SZ);
	BUILD_BUG_ON(UBIFS_XENT_NODE_SZ < MIN_WRITE_SZ);
	BUILD_BUG_ON(UBIFS_TRUN_NODE_SZ < MIN_WRITE_SZ);

	BUILD_BUG_ON(UBIFS_MAX_DENT_NODE_SZ > UBIFS_MAX_NODE_SZ);
	BUILD_BUG_ON(UBIFS_MAX_XENT_NODE_SZ > UBIFS_MAX_NODE_SZ);
	BUILD_BUG_ON(UBIFS_MAX_DATA_NODE_SZ > UBIFS_MAX_NODE_SZ);
	BUILD_BUG_ON(UBIFS_MAX_INO_NODE_SZ  > UBIFS_MAX_NODE_SZ);

	/* Defined node sizes */
	BUILD_BUG_ON(UBIFS_SB_NODE_SZ  != 4096);
	BUILD_BUG_ON(UBIFS_MST_NODE_SZ != 512);
	BUILD_BUG_ON(UBIFS_INO_NODE_SZ != 160);
	BUILD_BUG_ON(UBIFS_REF_NODE_SZ != 64);

	/*
	 * We use 2 bit wide bit-fields to store compression type, which should
	 * be amended if more compressors are added. The bit-fields are:
	 * @compr_type in 'struct ubifs_inode', @default_compr in
	 * 'struct ubifs_info' and @compr_type in 'struct ubifs_mount_opts'.
	 */
	BUILD_BUG_ON(UBIFS_COMPR_TYPES_CNT > 4);

	/*
	 * We require that PAGE_SIZE is greater-than-or-equal-to
	 * UBIFS_BLOCK_SIZE. It is assumed that both are powers of 2.
	 */
	if (PAGE_SIZE < UBIFS_BLOCK_SIZE) {
		pr_err("UBIFS error (pid %d): VFS page cache size is %u bytes, but UBIFS requires at least 4096 bytes",
		       current->pid, (unsigned int)PAGE_SIZE);
		return -EINVAL;
	}

	ubifs_inode_slab = kmem_cache_create("ubifs_inode_slab",
				sizeof(struct ubifs_inode), 0,
				SLAB_RECLAIM_ACCOUNT | SLAB_ACCOUNT,
				&inode_slab_ctor);
	if (!ubifs_inode_slab)
		return -ENOMEM;

	ubifs_shrinker_info = shrinker_alloc(0, "ubifs-slab");
	if (!ubifs_shrinker_info)
		goto out_slab;

	ubifs_shrinker_info->count_objects = ubifs_shrink_count;
	ubifs_shrinker_info->scan_objects = ubifs_shrink_scan;

	shrinker_register(ubifs_shrinker_info);

	err = ubifs_compressors_init();
	if (err)
		goto out_shrinker;

	dbg_debugfs_init();

	err = ubifs_sysfs_init();
	if (err)
		goto out_dbg;

	err = register_filesystem(&ubifs_fs_type);
	if (err) {
		pr_err("UBIFS error (pid %d): cannot register file system, error %d",
		       current->pid, err);
		goto out_sysfs;
	}
	return 0;

out_sysfs:
	ubifs_sysfs_exit();
out_dbg:
	dbg_debugfs_exit();
	ubifs_compressors_exit();
out_shrinker:
	shrinker_free(ubifs_shrinker_info);
out_slab:
	kmem_cache_destroy(ubifs_inode_slab);
	return err;
}
/* late_initcall to let compressors initialize first */
late_initcall(ubifs_init);

/**
 * ubifs_exit - module exit.
 *
 * Warns if the list of mounted UBIFS file-systems is not empty or if the
 * clean znode counter is not zero, then releases what ubifs_init() set up
 * and unregisters the file system type.
 */
static void __exit ubifs_exit(void)
{
	WARN_ON(!list_empty(&ubifs_infos));
	WARN_ON(atomic_long_read(&ubifs_clean_zn_cnt) != 0);

	dbg_debugfs_exit();
	ubifs_sysfs_exit();
	ubifs_compressors_exit();
	shrinker_free(ubifs_shrinker_info);

	/*
	 * Make sure all delayed rcu free inodes are flushed before we
	 * destroy cache.
	 */
	rcu_barrier();
	kmem_cache_destroy(ubifs_inode_slab);
	unregister_filesystem(&ubifs_fs_type);
}
module_exit(ubifs_exit);

MODULE_LICENSE("GPL");
MODULE_VERSION(__stringify(UBIFS_VERSION));
MODULE_AUTHOR("Artem Bityutskiy, Adrian Hunter");
MODULE_DESCRIPTION("UBIFS - UBI File System");
581 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __LINUX_KSM_H #define __LINUX_KSM_H /* * Memory merging support. * * This code enables dynamic sharing of identical pages found in different * memory areas, even if they are not shared by fork(). */ #include <linux/bitops.h> #include <linux/mm.h> #include <linux/pagemap.h> #include <linux/rmap.h> #include <linux/sched.h> #ifdef CONFIG_KSM int ksm_madvise(struct vm_area_struct *vma, unsigned long start, unsigned long end, int advice, unsigned long *vm_flags); void ksm_add_vma(struct vm_area_struct *vma); int ksm_enable_merge_any(struct mm_struct *mm); int ksm_disable_merge_any(struct mm_struct *mm); int ksm_disable(struct mm_struct *mm); int __ksm_enter(struct mm_struct *mm); void __ksm_exit(struct mm_struct *mm); /* * To identify zeropages that were mapped by KSM, we reuse the dirty bit * in the PTE. If the PTE is dirty, the zeropage was mapped by KSM when * deduplicating memory. 
*/ #define is_ksm_zero_pte(pte) (is_zero_pfn(pte_pfn(pte)) && pte_dirty(pte)) extern atomic_long_t ksm_zero_pages; static inline void ksm_map_zero_page(struct mm_struct *mm) { atomic_long_inc(&ksm_zero_pages); atomic_long_inc(&mm->ksm_zero_pages); } static inline void ksm_might_unmap_zero_page(struct mm_struct *mm, pte_t pte) { if (is_ksm_zero_pte(pte)) { atomic_long_dec(&ksm_zero_pages); atomic_long_dec(&mm->ksm_zero_pages); } } static inline long mm_ksm_zero_pages(struct mm_struct *mm) { return atomic_long_read(&mm->ksm_zero_pages); } static inline void ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm) { /* Adding mm to ksm is best effort on fork. */ if (test_bit(MMF_VM_MERGEABLE, &oldmm->flags)) __ksm_enter(mm); } static inline int ksm_execve(struct mm_struct *mm) { if (test_bit(MMF_VM_MERGE_ANY, &mm->flags)) return __ksm_enter(mm); return 0; } static inline void ksm_exit(struct mm_struct *mm) { if (test_bit(MMF_VM_MERGEABLE, &mm->flags)) __ksm_exit(mm); } /* * When do_swap_page() first faults in from swap what used to be a KSM page, * no problem, it will be assigned to this vma's anon_vma; but thereafter, * it might be faulted into a different anon_vma (or perhaps to a different * offset in the same anon_vma). do_swap_page() cannot do all the locking * needed to reconstitute a cross-anon_vma KSM page: for now it has to make * a copy, and leave remerging the pages to a later pass of ksmd. * * We'd like to make this conditional on vma->vm_flags & VM_MERGEABLE, * but what if the vma was unmerged while the page was swapped out? 
*/ struct folio *ksm_might_need_to_copy(struct folio *folio, struct vm_area_struct *vma, unsigned long addr); void rmap_walk_ksm(struct folio *folio, struct rmap_walk_control *rwc); void folio_migrate_ksm(struct folio *newfolio, struct folio *folio); void collect_procs_ksm(const struct folio *folio, const struct page *page, struct list_head *to_kill, int force_early); long ksm_process_profit(struct mm_struct *); bool ksm_process_mergeable(struct mm_struct *mm); #else /* !CONFIG_KSM */ static inline void ksm_add_vma(struct vm_area_struct *vma) { } static inline int ksm_disable(struct mm_struct *mm) { return 0; } static inline void ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm) { } static inline int ksm_execve(struct mm_struct *mm) { return 0; } static inline void ksm_exit(struct mm_struct *mm) { } static inline void ksm_might_unmap_zero_page(struct mm_struct *mm, pte_t pte) { } static inline void collect_procs_ksm(const struct folio *folio, const struct page *page, struct list_head *to_kill, int force_early) { } #ifdef CONFIG_MMU static inline int ksm_madvise(struct vm_area_struct *vma, unsigned long start, unsigned long end, int advice, unsigned long *vm_flags) { return 0; } static inline struct folio *ksm_might_need_to_copy(struct folio *folio, struct vm_area_struct *vma, unsigned long addr) { return folio; } static inline void rmap_walk_ksm(struct folio *folio, struct rmap_walk_control *rwc) { } static inline void folio_migrate_ksm(struct folio *newfolio, struct folio *old) { } #endif /* CONFIG_MMU */ #endif /* !CONFIG_KSM */ #endif /* __LINUX_KSM_H */
9006 9014 9035 9029 7190 7 7 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 // SPDX-License-Identifier: GPL-2.0-only /* * Access kernel or user memory without faulting. */ #include <linux/export.h> #include <linux/mm.h> #include <linux/uaccess.h> #include <asm/tlb.h> bool __weak copy_from_kernel_nofault_allowed(const void *unsafe_src, size_t size) { return true; } /* * The below only uses kmsan_check_memory() to ensure uninitialized kernel * memory isn't leaked. 
*/ #define copy_from_kernel_nofault_loop(dst, src, len, type, err_label) \ while (len >= sizeof(type)) { \ __get_kernel_nofault(dst, src, type, err_label); \ kmsan_check_memory(src, sizeof(type)); \ dst += sizeof(type); \ src += sizeof(type); \ len -= sizeof(type); \ } long copy_from_kernel_nofault(void *dst, const void *src, size_t size) { unsigned long align = 0; if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) align = (unsigned long)dst | (unsigned long)src; if (!copy_from_kernel_nofault_allowed(src, size)) return -ERANGE; pagefault_disable(); if (!(align & 7)) copy_from_kernel_nofault_loop(dst, src, size, u64, Efault); if (!(align & 3)) copy_from_kernel_nofault_loop(dst, src, size, u32, Efault); if (!(align & 1)) copy_from_kernel_nofault_loop(dst, src, size, u16, Efault); copy_from_kernel_nofault_loop(dst, src, size, u8, Efault); pagefault_enable(); return 0; Efault: pagefault_enable(); return -EFAULT; } EXPORT_SYMBOL_GPL(copy_from_kernel_nofault); #define copy_to_kernel_nofault_loop(dst, src, len, type, err_label) \ while (len >= sizeof(type)) { \ __put_kernel_nofault(dst, src, type, err_label); \ instrument_write(dst, sizeof(type)); \ dst += sizeof(type); \ src += sizeof(type); \ len -= sizeof(type); \ } long copy_to_kernel_nofault(void *dst, const void *src, size_t size) { unsigned long align = 0; if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) align = (unsigned long)dst | (unsigned long)src; pagefault_disable(); if (!(align & 7)) copy_to_kernel_nofault_loop(dst, src, size, u64, Efault); if (!(align & 3)) copy_to_kernel_nofault_loop(dst, src, size, u32, Efault); if (!(align & 1)) copy_to_kernel_nofault_loop(dst, src, size, u16, Efault); copy_to_kernel_nofault_loop(dst, src, size, u8, Efault); pagefault_enable(); return 0; Efault: pagefault_enable(); return -EFAULT; } EXPORT_SYMBOL_GPL(copy_to_kernel_nofault); long strncpy_from_kernel_nofault(char *dst, const void *unsafe_addr, long count) { const void *src = unsafe_addr; if (unlikely(count 
<= 0)) return 0; if (!copy_from_kernel_nofault_allowed(unsafe_addr, count)) return -ERANGE; pagefault_disable(); do { __get_kernel_nofault(dst, src, u8, Efault); dst++; src++; } while (dst[-1] && src - unsafe_addr < count); pagefault_enable(); dst[-1] = '\0'; return src - unsafe_addr; Efault: pagefault_enable(); dst[0] = '\0'; return -EFAULT; } /** * copy_from_user_nofault(): safely attempt to read from a user-space location * @dst: pointer to the buffer that shall take the data * @src: address to read from. This must be a user address. * @size: size of the data chunk * * Safely read from user address @src to the buffer at @dst. If a kernel fault * happens, handle that and return -EFAULT. */ long copy_from_user_nofault(void *dst, const void __user *src, size_t size) { long ret = -EFAULT; if (!__access_ok(src, size)) return ret; if (!nmi_uaccess_okay()) return ret; pagefault_disable(); ret = __copy_from_user_inatomic(dst, src, size); pagefault_enable(); if (ret) return -EFAULT; return 0; } EXPORT_SYMBOL_GPL(copy_from_user_nofault); /** * copy_to_user_nofault(): safely attempt to write to a user-space location * @dst: address to write to * @src: pointer to the data that shall be written * @size: size of the data chunk * * Safely write to address @dst from the buffer at @src. If a kernel fault * happens, handle that and return -EFAULT. */ long copy_to_user_nofault(void __user *dst, const void *src, size_t size) { long ret = -EFAULT; if (access_ok(dst, size)) { pagefault_disable(); ret = __copy_to_user_inatomic(dst, src, size); pagefault_enable(); } if (ret) return -EFAULT; return 0; } EXPORT_SYMBOL_GPL(copy_to_user_nofault); /** * strncpy_from_user_nofault: - Copy a NUL terminated string from unsafe user * address. * @dst: Destination address, in kernel space. This buffer must be at * least @count bytes long. * @unsafe_addr: Unsafe user address. * @count: Maximum number of bytes to copy, including the trailing NUL. 
* * Copies a NUL-terminated string from unsafe user address to kernel buffer. * * On success, returns the length of the string INCLUDING the trailing NUL. * * If access fails, returns -EFAULT (some data may have been copied * and the trailing NUL added). * * If @count is smaller than the length of the string, copies @count-1 bytes, * sets the last byte of @dst buffer to NUL and returns @count. */ long strncpy_from_user_nofault(char *dst, const void __user *unsafe_addr, long count) { long ret; if (unlikely(count <= 0)) return 0; pagefault_disable(); ret = strncpy_from_user(dst, unsafe_addr, count); pagefault_enable(); if (ret >= count) { ret = count; dst[ret - 1] = '\0'; } else if (ret > 0) { ret++; } return ret; } /** * strnlen_user_nofault: - Get the size of a user string INCLUDING final NUL. * @unsafe_addr: The string to measure. * @count: Maximum count (including NUL) * * Get the size of a NUL-terminated string in user space without pagefault. * * Returns the size of the string INCLUDING the terminating NUL. * * If the string is too long, returns a number larger than @count. User * has to check the return value against "> count". * On exception (or invalid count), returns 0. * * Unlike strnlen_user, this can be used from IRQ handler etc. because * it disables pagefaults. */ long strnlen_user_nofault(const void __user *unsafe_addr, long count) { int ret; pagefault_disable(); ret = strnlen_user(unsafe_addr, count); pagefault_enable(); return ret; } void __copy_overflow(int size, unsigned long count) { WARN(1, "Buffer overflow detected (%d < %lu)!\n", size, count); } EXPORT_SYMBOL(__copy_overflow);
2520 3 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * include/linux/eventpoll.h ( Efficient event polling implementation ) * Copyright (C) 2001,...,2006 Davide Libenzi * * Davide Libenzi <davidel@xmailserver.org> */ #ifndef _LINUX_EVENTPOLL_H #define _LINUX_EVENTPOLL_H #include <uapi/linux/eventpoll.h> #include <uapi/linux/kcmp.h> /* Forward declarations to avoid compiler errors */ struct file; #ifdef CONFIG_EPOLL #ifdef CONFIG_KCMP struct file *get_epoll_tfile_raw_ptr(struct file *file, int tfd, unsigned long toff); #endif /* Used to release the epoll bits inside the "struct file" */ void eventpoll_release_file(struct file *file); /* * This is called from inside fs/file_table.c:__fput() to unlink files * from the eventpoll interface. We need to have this facility to cleanup * correctly files that are closed without being removed from the eventpoll * interface. */ static inline void eventpoll_release(struct file *file) { /* * Fast check to avoid the get/release of the semaphore. Since * we're doing this outside the semaphore lock, it might return * false negatives, but we don't care. It'll help in 99.99% of cases * to avoid the semaphore lock. False positives simply cannot happen * because the file in on the way to be removed and nobody ( but * eventpoll ) has still a reference to this file. */ if (likely(!READ_ONCE(file->f_ep))) return; /* * The file is being closed while it is still linked to an epoll * descriptor. We need to handle this by correctly unlinking it * from its containers. 
*/ eventpoll_release_file(file); } int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds, bool nonblock); /* Tells if the epoll_ctl(2) operation needs an event copy from userspace */ static inline int ep_op_has_event(int op) { return op != EPOLL_CTL_DEL; } #else static inline void eventpoll_release(struct file *file) {} #endif #if defined(CONFIG_ARM) && defined(CONFIG_OABI_COMPAT) /* ARM OABI has an incompatible struct layout and needs a special handler */ extern struct epoll_event __user * epoll_put_uevent(__poll_t revents, __u64 data, struct epoll_event __user *uevent); #else static inline struct epoll_event __user * epoll_put_uevent(__poll_t revents, __u64 data, struct epoll_event __user *uevent) { if (__put_user(revents, &uevent->events) || __put_user(data, &uevent->data)) return NULL; return uevent+1; } #endif #endif /* #ifndef _LINUX_EVENTPOLL_H */
4 1 3 221 13 223 3 9 8 1 156 4 156 2 2 166 167 167 7 164 8 162 4 4 8 166 7 7 164 164 156 156 222 166 159 223 216 223 223 90 162 127 155 33 222 190 177 10 22 163 87 223 203 177 163 4 164 4 163 164 164 4 164 163 8 163 163 164 163 8 164 9 9 4 1 5 1 13 13 1 14 1 13 13 1 13 1 12 14 3 11 14 14 1 1 13 17 17 14 3 14 14 14 14 17 17 17 25 8 3 14 14 1 1 13 14 85 13 74 85 85 85 25 25 64 85 85 85 85 44 44 151 152 152 152 263 263 3 2 1 1 261 1 114 1 361 360 361 369 366 369 368 366 366 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 
409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 
909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 
1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 // SPDX-License-Identifier: GPL-2.0-or-later /* * inode.c * * vfs' aops, fops, dops and iops * * Copyright (C) 2002, 2004 Oracle. All rights reserved. 
*/ #include <linux/fs.h> #include <linux/types.h> #include <linux/highmem.h> #include <linux/pagemap.h> #include <linux/quotaops.h> #include <linux/iversion.h> #include <asm/byteorder.h> #include <cluster/masklog.h> #include "ocfs2.h" #include "alloc.h" #include "dir.h" #include "blockcheck.h" #include "dlmglue.h" #include "extent_map.h" #include "file.h" #include "heartbeat.h" #include "inode.h" #include "journal.h" #include "namei.h" #include "suballoc.h" #include "super.h" #include "symlink.h" #include "sysfile.h" #include "uptodate.h" #include "xattr.h" #include "refcounttree.h" #include "ocfs2_trace.h" #include "filecheck.h" #include "buffer_head_io.h" struct ocfs2_find_inode_args { u64 fi_blkno; unsigned long fi_ino; unsigned int fi_flags; unsigned int fi_sysfile_type; }; static struct lock_class_key ocfs2_sysfile_lock_key[NUM_SYSTEM_INODES]; static int ocfs2_read_locked_inode(struct inode *inode, struct ocfs2_find_inode_args *args); static int ocfs2_init_locked_inode(struct inode *inode, void *opaque); static int ocfs2_find_actor(struct inode *inode, void *opaque); static int ocfs2_truncate_for_delete(struct ocfs2_super *osb, struct inode *inode, struct buffer_head *fe_bh); static int ocfs2_filecheck_read_inode_block_full(struct inode *inode, struct buffer_head **bh, int flags, int type); static int ocfs2_filecheck_validate_inode_block(struct super_block *sb, struct buffer_head *bh); static int ocfs2_filecheck_repair_inode_block(struct super_block *sb, struct buffer_head *bh); void ocfs2_set_inode_flags(struct inode *inode) { unsigned int flags = OCFS2_I(inode)->ip_attr; inode->i_flags &= ~(S_IMMUTABLE | S_SYNC | S_APPEND | S_NOATIME | S_DIRSYNC); if (flags & OCFS2_IMMUTABLE_FL) inode->i_flags |= S_IMMUTABLE; if (flags & OCFS2_SYNC_FL) inode->i_flags |= S_SYNC; if (flags & OCFS2_APPEND_FL) inode->i_flags |= S_APPEND; if (flags & OCFS2_NOATIME_FL) inode->i_flags |= S_NOATIME; if (flags & OCFS2_DIRSYNC_FL) inode->i_flags |= S_DIRSYNC; } /* Propagate flags from 
i_flags to OCFS2_I(inode)->ip_attr */ void ocfs2_get_inode_flags(struct ocfs2_inode_info *oi) { unsigned int flags = oi->vfs_inode.i_flags; oi->ip_attr &= ~(OCFS2_SYNC_FL|OCFS2_APPEND_FL| OCFS2_IMMUTABLE_FL|OCFS2_NOATIME_FL|OCFS2_DIRSYNC_FL); if (flags & S_SYNC) oi->ip_attr |= OCFS2_SYNC_FL; if (flags & S_APPEND) oi->ip_attr |= OCFS2_APPEND_FL; if (flags & S_IMMUTABLE) oi->ip_attr |= OCFS2_IMMUTABLE_FL; if (flags & S_NOATIME) oi->ip_attr |= OCFS2_NOATIME_FL; if (flags & S_DIRSYNC) oi->ip_attr |= OCFS2_DIRSYNC_FL; } struct inode *ocfs2_ilookup(struct super_block *sb, u64 blkno) { struct ocfs2_find_inode_args args; args.fi_blkno = blkno; args.fi_flags = 0; args.fi_ino = ino_from_blkno(sb, blkno); args.fi_sysfile_type = 0; return ilookup5(sb, blkno, ocfs2_find_actor, &args); } struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, unsigned flags, int sysfile_type) { int rc = -ESTALE; struct inode *inode = NULL; struct super_block *sb = osb->sb; struct ocfs2_find_inode_args args; journal_t *journal = osb->journal->j_journal; trace_ocfs2_iget_begin((unsigned long long)blkno, flags, sysfile_type); /* Ok. By now we've either got the offsets passed to us by the * caller, or we just pulled them off the bh. Lets do some * sanity checks to make sure they're OK. */ if (blkno == 0) { inode = ERR_PTR(-EINVAL); mlog_errno(PTR_ERR(inode)); goto bail; } args.fi_blkno = blkno; args.fi_flags = flags; args.fi_ino = ino_from_blkno(sb, blkno); args.fi_sysfile_type = sysfile_type; inode = iget5_locked(sb, args.fi_ino, ocfs2_find_actor, ocfs2_init_locked_inode, &args); /* inode was *not* in the inode cache. 2.6.x requires * us to do our own read_inode call and unlock it * afterwards. 
*/ if (inode == NULL) { inode = ERR_PTR(-ENOMEM); mlog_errno(PTR_ERR(inode)); goto bail; } trace_ocfs2_iget5_locked(inode->i_state); if (inode->i_state & I_NEW) { rc = ocfs2_read_locked_inode(inode, &args); unlock_new_inode(inode); } if (is_bad_inode(inode)) { iput(inode); inode = ERR_PTR(rc); goto bail; } /* * Set transaction id's of transactions that have to be committed * to finish f[data]sync. We set them to currently running transaction * as we cannot be sure that the inode or some of its metadata isn't * part of the transaction - the inode could have been reclaimed and * now it is reread from disk. */ if (journal) { transaction_t *transaction; tid_t tid; struct ocfs2_inode_info *oi = OCFS2_I(inode); read_lock(&journal->j_state_lock); if (journal->j_running_transaction) transaction = journal->j_running_transaction; else transaction = journal->j_committing_transaction; if (transaction) tid = transaction->t_tid; else tid = journal->j_commit_sequence; read_unlock(&journal->j_state_lock); oi->i_sync_tid = tid; oi->i_datasync_tid = tid; } bail: if (!IS_ERR(inode)) { trace_ocfs2_iget_end(inode, (unsigned long long)OCFS2_I(inode)->ip_blkno); } return inode; } static int ocfs2_dinode_has_extents(struct ocfs2_dinode *di) { /* inodes flagged with other stuff in id2 */ if (di->i_flags & (OCFS2_SUPER_BLOCK_FL | OCFS2_LOCAL_ALLOC_FL | OCFS2_CHAIN_FL | OCFS2_DEALLOC_FL)) return 0; /* i_flags doesn't indicate when id2 is a fast symlink */ if (S_ISLNK(di->i_mode) && di->i_size && di->i_clusters == 0) return 0; if (di->i_dyn_features & OCFS2_INLINE_DATA_FL) return 0; return 1; } /* * here's how inodes get read from disk: * iget5_locked -> find_actor -> OCFS2_FIND_ACTOR * found? : return the in-memory inode * not found? 
: get_new_inode -> OCFS2_INIT_LOCKED_INODE */ static int ocfs2_find_actor(struct inode *inode, void *opaque) { struct ocfs2_find_inode_args *args = NULL; struct ocfs2_inode_info *oi = OCFS2_I(inode); int ret = 0; args = opaque; mlog_bug_on_msg(!inode, "No inode in find actor!\n"); trace_ocfs2_find_actor(inode, inode->i_ino, opaque, args->fi_blkno); if (oi->ip_blkno != args->fi_blkno) goto bail; ret = 1; bail: return ret; } /* * initialize the new inode, but don't do anything that would cause * us to sleep. * return 0 on success, 1 on failure */ static int ocfs2_init_locked_inode(struct inode *inode, void *opaque) { struct ocfs2_find_inode_args *args = opaque; static struct lock_class_key ocfs2_quota_ip_alloc_sem_key, ocfs2_file_ip_alloc_sem_key; inode->i_ino = args->fi_ino; OCFS2_I(inode)->ip_blkno = args->fi_blkno; if (args->fi_sysfile_type != 0) lockdep_set_class(&inode->i_rwsem, &ocfs2_sysfile_lock_key[args->fi_sysfile_type]); if (args->fi_sysfile_type == USER_QUOTA_SYSTEM_INODE || args->fi_sysfile_type == GROUP_QUOTA_SYSTEM_INODE || args->fi_sysfile_type == LOCAL_USER_QUOTA_SYSTEM_INODE || args->fi_sysfile_type == LOCAL_GROUP_QUOTA_SYSTEM_INODE) lockdep_set_class(&OCFS2_I(inode)->ip_alloc_sem, &ocfs2_quota_ip_alloc_sem_key); else lockdep_set_class(&OCFS2_I(inode)->ip_alloc_sem, &ocfs2_file_ip_alloc_sem_key); return 0; } void ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe, int create_ino) { struct super_block *sb; struct ocfs2_super *osb; int use_plocks = 1; sb = inode->i_sb; osb = OCFS2_SB(sb); if ((osb->s_mount_opt & OCFS2_MOUNT_LOCALFLOCKS) || ocfs2_mount_local(osb) || !ocfs2_stack_supports_plocks()) use_plocks = 0; /* * These have all been checked by ocfs2_read_inode_block() or set * by ocfs2_mknod_locked(), so a failure is a code bug. */ BUG_ON(!OCFS2_IS_VALID_DINODE(fe)); /* This means that read_inode cannot create a superblock inode today. change if that is needed. 
*/ BUG_ON(!(fe->i_flags & cpu_to_le32(OCFS2_VALID_FL))); BUG_ON(le32_to_cpu(fe->i_fs_generation) != osb->fs_generation); OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters); OCFS2_I(inode)->ip_attr = le32_to_cpu(fe->i_attr); OCFS2_I(inode)->ip_dyn_features = le16_to_cpu(fe->i_dyn_features); inode_set_iversion(inode, 1); inode->i_generation = le32_to_cpu(fe->i_generation); inode->i_rdev = huge_decode_dev(le64_to_cpu(fe->id1.dev1.i_rdev)); inode->i_mode = le16_to_cpu(fe->i_mode); i_uid_write(inode, le32_to_cpu(fe->i_uid)); i_gid_write(inode, le32_to_cpu(fe->i_gid)); /* Fast symlinks will have i_size but no allocated clusters. */ if (S_ISLNK(inode->i_mode) && !fe->i_clusters) { inode->i_blocks = 0; inode->i_mapping->a_ops = &ocfs2_fast_symlink_aops; } else { inode->i_blocks = ocfs2_inode_sector_count(inode); inode->i_mapping->a_ops = &ocfs2_aops; } inode_set_atime(inode, le64_to_cpu(fe->i_atime), le32_to_cpu(fe->i_atime_nsec)); inode_set_mtime(inode, le64_to_cpu(fe->i_mtime), le32_to_cpu(fe->i_mtime_nsec)); inode_set_ctime(inode, le64_to_cpu(fe->i_ctime), le32_to_cpu(fe->i_ctime_nsec)); if (OCFS2_I(inode)->ip_blkno != le64_to_cpu(fe->i_blkno)) mlog(ML_ERROR, "ip_blkno %llu != i_blkno %llu!\n", (unsigned long long)OCFS2_I(inode)->ip_blkno, (unsigned long long)le64_to_cpu(fe->i_blkno)); set_nlink(inode, ocfs2_read_links_count(fe)); trace_ocfs2_populate_inode(OCFS2_I(inode)->ip_blkno, le32_to_cpu(fe->i_flags)); if (fe->i_flags & cpu_to_le32(OCFS2_SYSTEM_FL)) { OCFS2_I(inode)->ip_flags |= OCFS2_INODE_SYSTEM_FILE; inode->i_flags |= S_NOQUOTA; } if (fe->i_flags & cpu_to_le32(OCFS2_LOCAL_ALLOC_FL)) { OCFS2_I(inode)->ip_flags |= OCFS2_INODE_BITMAP; } else if (fe->i_flags & cpu_to_le32(OCFS2_BITMAP_FL)) { OCFS2_I(inode)->ip_flags |= OCFS2_INODE_BITMAP; } else if (fe->i_flags & cpu_to_le32(OCFS2_QUOTA_FL)) { inode->i_flags |= S_NOQUOTA; } else if (fe->i_flags & cpu_to_le32(OCFS2_SUPER_BLOCK_FL)) { /* we can't actually hit this as read_inode can't * handle superblocks 
today ;-) */ BUG(); } switch (inode->i_mode & S_IFMT) { case S_IFREG: if (use_plocks) inode->i_fop = &ocfs2_fops; else inode->i_fop = &ocfs2_fops_no_plocks; inode->i_op = &ocfs2_file_iops; i_size_write(inode, le64_to_cpu(fe->i_size)); break; case S_IFDIR: inode->i_op = &ocfs2_dir_iops; if (use_plocks) inode->i_fop = &ocfs2_dops; else inode->i_fop = &ocfs2_dops_no_plocks; i_size_write(inode, le64_to_cpu(fe->i_size)); OCFS2_I(inode)->ip_dir_lock_gen = 1; break; case S_IFLNK: inode->i_op = &ocfs2_symlink_inode_operations; inode_nohighmem(inode); i_size_write(inode, le64_to_cpu(fe->i_size)); break; default: inode->i_op = &ocfs2_special_file_iops; init_special_inode(inode, inode->i_mode, inode->i_rdev); break; } if (create_ino) { inode->i_ino = ino_from_blkno(inode->i_sb, le64_to_cpu(fe->i_blkno)); /* * If we ever want to create system files from kernel, * the generation argument to * ocfs2_inode_lock_res_init() will have to change. */ BUG_ON(le32_to_cpu(fe->i_flags) & OCFS2_SYSTEM_FL); ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_inode_lockres, OCFS2_LOCK_TYPE_META, 0, inode); ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_open_lockres, OCFS2_LOCK_TYPE_OPEN, 0, inode); } ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_rw_lockres, OCFS2_LOCK_TYPE_RW, inode->i_generation, inode); ocfs2_set_inode_flags(inode); OCFS2_I(inode)->ip_last_used_slot = 0; OCFS2_I(inode)->ip_last_used_group = 0; if (S_ISDIR(inode->i_mode)) ocfs2_resv_set_type(&OCFS2_I(inode)->ip_la_data_resv, OCFS2_RESV_FLAG_DIR); } static int ocfs2_read_locked_inode(struct inode *inode, struct ocfs2_find_inode_args *args) { struct super_block *sb; struct ocfs2_super *osb; struct ocfs2_dinode *fe; struct buffer_head *bh = NULL; int status, can_lock, lock_level = 0; u32 generation = 0; status = -EINVAL; sb = inode->i_sb; osb = OCFS2_SB(sb); /* * To improve performance of cold-cache inode stats, we take * the cluster lock here if possible. 
* * Generally, OCFS2 never trusts the contents of an inode * unless it's holding a cluster lock, so taking it here isn't * a correctness issue as much as it is a performance * improvement. * * There are three times when taking the lock is not a good idea: * * 1) During startup, before we have initialized the DLM. * * 2) If we are reading certain system files which never get * cluster locks (local alloc, truncate log). * * 3) If the process doing the iget() is responsible for * orphan dir recovery. We're holding the orphan dir lock and * can get into a deadlock with another process on another * node in ->delete_inode(). * * #1 and #2 can be simply solved by never taking the lock * here for system files (which are the only type we read * during mount). It's a heavier approach, but our main * concern is user-accessible files anyway. * * #3 works itself out because we'll eventually take the * cluster lock before trusting anything anyway. */ can_lock = !(args->fi_flags & OCFS2_FI_FLAG_SYSFILE) && !(args->fi_flags & OCFS2_FI_FLAG_ORPHAN_RECOVERY) && !ocfs2_mount_local(osb); trace_ocfs2_read_locked_inode( (unsigned long long)OCFS2_I(inode)->ip_blkno, can_lock); /* * To maintain backwards compatibility with older versions of * ocfs2-tools, we still store the generation value for system * files. The only ones that actually matter to userspace are * the journals, but it's easier and inexpensive to just flag * all system files similarly. 
*/ if (args->fi_flags & OCFS2_FI_FLAG_SYSFILE) generation = osb->fs_generation; ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_inode_lockres, OCFS2_LOCK_TYPE_META, generation, inode); ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_open_lockres, OCFS2_LOCK_TYPE_OPEN, 0, inode); if (can_lock) { status = ocfs2_open_lock(inode); if (status) { make_bad_inode(inode); mlog_errno(status); return status; } status = ocfs2_inode_lock(inode, NULL, lock_level); if (status) { make_bad_inode(inode); mlog_errno(status); return status; } } if (args->fi_flags & OCFS2_FI_FLAG_ORPHAN_RECOVERY) { status = ocfs2_try_open_lock(inode, 0); if (status) { make_bad_inode(inode); return status; } } if (can_lock) { if (args->fi_flags & OCFS2_FI_FLAG_FILECHECK_CHK) status = ocfs2_filecheck_read_inode_block_full(inode, &bh, OCFS2_BH_IGNORE_CACHE, 0); else if (args->fi_flags & OCFS2_FI_FLAG_FILECHECK_FIX) status = ocfs2_filecheck_read_inode_block_full(inode, &bh, OCFS2_BH_IGNORE_CACHE, 1); else status = ocfs2_read_inode_block_full(inode, &bh, OCFS2_BH_IGNORE_CACHE); } else { status = ocfs2_read_blocks_sync(osb, args->fi_blkno, 1, &bh); /* * If buffer is in jbd, then its checksum may not have been * computed as yet. */ if (!status && !buffer_jbd(bh)) { if (args->fi_flags & OCFS2_FI_FLAG_FILECHECK_CHK) status = ocfs2_filecheck_validate_inode_block( osb->sb, bh); else if (args->fi_flags & OCFS2_FI_FLAG_FILECHECK_FIX) status = ocfs2_filecheck_repair_inode_block( osb->sb, bh); else status = ocfs2_validate_inode_block( osb->sb, bh); } } if (status < 0) { mlog_errno(status); goto bail; } status = -EINVAL; fe = (struct ocfs2_dinode *) bh->b_data; /* * This is a code bug. Right now the caller needs to * understand whether it is asking for a system file inode or * not so the proper lock names can be built. 
*/ mlog_bug_on_msg(!!(fe->i_flags & cpu_to_le32(OCFS2_SYSTEM_FL)) != !!(args->fi_flags & OCFS2_FI_FLAG_SYSFILE), "Inode %llu: system file state is ambiguous\n", (unsigned long long)args->fi_blkno); if (S_ISCHR(le16_to_cpu(fe->i_mode)) || S_ISBLK(le16_to_cpu(fe->i_mode))) inode->i_rdev = huge_decode_dev(le64_to_cpu(fe->id1.dev1.i_rdev)); ocfs2_populate_inode(inode, fe, 0); BUG_ON(args->fi_blkno != le64_to_cpu(fe->i_blkno)); if (buffer_dirty(bh) && !buffer_jbd(bh)) { if (can_lock) { ocfs2_inode_unlock(inode, lock_level); lock_level = 1; ocfs2_inode_lock(inode, NULL, lock_level); } status = ocfs2_write_block(osb, bh, INODE_CACHE(inode)); if (status < 0) { mlog_errno(status); goto bail; } } status = 0; bail: if (can_lock) ocfs2_inode_unlock(inode, lock_level); if (status < 0) make_bad_inode(inode); brelse(bh); return status; } void ocfs2_sync_blockdev(struct super_block *sb) { sync_blockdev(sb->s_bdev); } static int ocfs2_truncate_for_delete(struct ocfs2_super *osb, struct inode *inode, struct buffer_head *fe_bh) { int status = 0; struct ocfs2_dinode *fe; handle_t *handle = NULL; fe = (struct ocfs2_dinode *) fe_bh->b_data; /* * This check will also skip truncate of inodes with inline * data and fast symlinks. 
*/ if (fe->i_clusters) { if (ocfs2_should_order_data(inode)) ocfs2_begin_ordered_truncate(inode, 0); handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); if (IS_ERR(handle)) { status = PTR_ERR(handle); handle = NULL; mlog_errno(status); goto out; } status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), fe_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) { mlog_errno(status); goto out; } i_size_write(inode, 0); status = ocfs2_mark_inode_dirty(handle, inode, fe_bh); if (status < 0) { mlog_errno(status); goto out; } ocfs2_commit_trans(osb, handle); handle = NULL; status = ocfs2_commit_truncate(osb, inode, fe_bh); if (status < 0) mlog_errno(status); } out: if (handle) ocfs2_commit_trans(osb, handle); return status; } static int ocfs2_remove_inode(struct inode *inode, struct buffer_head *di_bh, struct inode *orphan_dir_inode, struct buffer_head *orphan_dir_bh) { int status; struct inode *inode_alloc_inode = NULL; struct buffer_head *inode_alloc_bh = NULL; handle_t *handle; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data; inode_alloc_inode = ocfs2_get_system_file_inode(osb, INODE_ALLOC_SYSTEM_INODE, le16_to_cpu(di->i_suballoc_slot)); if (!inode_alloc_inode) { status = -ENOENT; mlog_errno(status); goto bail; } inode_lock(inode_alloc_inode); status = ocfs2_inode_lock(inode_alloc_inode, &inode_alloc_bh, 1); if (status < 0) { inode_unlock(inode_alloc_inode); mlog_errno(status); goto bail; } handle = ocfs2_start_trans(osb, OCFS2_DELETE_INODE_CREDITS + ocfs2_quota_trans_credits(inode->i_sb)); if (IS_ERR(handle)) { status = PTR_ERR(handle); mlog_errno(status); goto bail_unlock; } if (!(OCFS2_I(inode)->ip_flags & OCFS2_INODE_SKIP_ORPHAN_DIR)) { status = ocfs2_orphan_del(osb, handle, orphan_dir_inode, inode, orphan_dir_bh, false); if (status < 0) { mlog_errno(status); goto bail_commit; } } /* set the inodes dtime */ status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh, 
OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) { mlog_errno(status); goto bail_commit; } di->i_dtime = cpu_to_le64(ktime_get_real_seconds()); di->i_flags &= cpu_to_le32(~(OCFS2_VALID_FL | OCFS2_ORPHANED_FL)); ocfs2_journal_dirty(handle, di_bh); ocfs2_remove_from_cache(INODE_CACHE(inode), di_bh); dquot_free_inode(inode); status = ocfs2_free_dinode(handle, inode_alloc_inode, inode_alloc_bh, di); if (status < 0) mlog_errno(status); bail_commit: ocfs2_commit_trans(osb, handle); bail_unlock: ocfs2_inode_unlock(inode_alloc_inode, 1); inode_unlock(inode_alloc_inode); brelse(inode_alloc_bh); bail: iput(inode_alloc_inode); return status; } /* * Serialize with orphan dir recovery. If the process doing * recovery on this orphan dir does an iget() with the dir * i_rwsem held, we'll deadlock here. Instead we detect this * and exit early - recovery will wipe this inode for us. */ static int ocfs2_check_orphan_recovery_state(struct ocfs2_super *osb, int slot) { int ret = 0; spin_lock(&osb->osb_lock); if (ocfs2_node_map_test_bit(osb, &osb->osb_recovering_orphan_dirs, slot)) { ret = -EDEADLK; goto out; } /* This signals to the orphan recovery process that it should * wait for us to handle the wipe. 
*/ osb->osb_orphan_wipes[slot]++; out: spin_unlock(&osb->osb_lock); trace_ocfs2_check_orphan_recovery_state(slot, ret); return ret; } static void ocfs2_signal_wipe_completion(struct ocfs2_super *osb, int slot) { spin_lock(&osb->osb_lock); osb->osb_orphan_wipes[slot]--; spin_unlock(&osb->osb_lock); wake_up(&osb->osb_wipe_event); } static int ocfs2_wipe_inode(struct inode *inode, struct buffer_head *di_bh) { int status, orphaned_slot = -1; struct inode *orphan_dir_inode = NULL; struct buffer_head *orphan_dir_bh = NULL; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data; if (!(OCFS2_I(inode)->ip_flags & OCFS2_INODE_SKIP_ORPHAN_DIR)) { orphaned_slot = le16_to_cpu(di->i_orphaned_slot); status = ocfs2_check_orphan_recovery_state(osb, orphaned_slot); if (status) return status; orphan_dir_inode = ocfs2_get_system_file_inode(osb, ORPHAN_DIR_SYSTEM_INODE, orphaned_slot); if (!orphan_dir_inode) { status = -ENOENT; mlog_errno(status); goto bail; } /* Lock the orphan dir. The lock will be held for the entire * delete_inode operation. We do this now to avoid races with * recovery completion on other nodes. */ inode_lock(orphan_dir_inode); status = ocfs2_inode_lock(orphan_dir_inode, &orphan_dir_bh, 1); if (status < 0) { inode_unlock(orphan_dir_inode); mlog_errno(status); goto bail; } } /* we do this while holding the orphan dir lock because we * don't want recovery being run from another node to try an * inode delete underneath us -- this will result in two nodes * truncating the same file! 
*/
	status = ocfs2_truncate_for_delete(osb, inode, di_bh);
	if (status < 0) {
		mlog_errno(status);
		goto bail_unlock_dir;
	}

	/* Remove any dir index tree */
	if (S_ISDIR(inode->i_mode)) {
		status = ocfs2_dx_dir_truncate(inode, di_bh);
		if (status) {
			mlog_errno(status);
			goto bail_unlock_dir;
		}
	}

	/*Free extended attribute resources associated with this inode.*/
	status = ocfs2_xattr_remove(inode, di_bh);
	if (status < 0) {
		mlog_errno(status);
		goto bail_unlock_dir;
	}

	status = ocfs2_remove_refcount_tree(inode, di_bh);
	if (status < 0) {
		mlog_errno(status);
		goto bail_unlock_dir;
	}

	status = ocfs2_remove_inode(inode, di_bh, orphan_dir_inode,
				    orphan_dir_bh);
	if (status < 0)
		mlog_errno(status);

bail_unlock_dir:
	if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_SKIP_ORPHAN_DIR)
		return status;

	ocfs2_inode_unlock(orphan_dir_inode, 1);
	inode_unlock(orphan_dir_inode);
	brelse(orphan_dir_bh);
bail:
	iput(orphan_dir_inode);
	ocfs2_signal_wipe_completion(osb, orphaned_slot);

	return status;
}

/*
 * There is a series of simple checks that should be done before a
 * trylock is even considered. Encapsulate those in this function.
 *
 * Returns 1 when the inode may be considered for deletion and 0 when
 * it must be left alone. The 0 cases are: the inode is the root inode,
 * the caller is the downconvert task (osb->dc_task), or the inode
 * carries OCFS2_INODE_SYSTEM_FILE. Only the system-file test is made
 * under ip_lock.
 */
static int ocfs2_inode_is_valid_to_delete(struct inode *inode)
{
	int ret = 0;
	struct ocfs2_inode_info *oi = OCFS2_I(inode);
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);

	trace_ocfs2_inode_is_valid_to_delete(current, osb->dc_task,
					     (unsigned long long)oi->ip_blkno,
					     oi->ip_flags);

	/*
	 * We shouldn't be getting here for the root directory
	 * inode..
	 */
	if (inode == osb->root_inode) {
		mlog(ML_ERROR, "Skipping delete of root inode.\n");
		goto bail;
	}

	/*
	 * If we're coming from downconvert_thread we can't go into our own
	 * voting [hello, deadlock city!] so we cannot delete the inode. But
	 * since we dropped last inode ref when downconverting dentry lock,
	 * we cannot have the file open and thus the node doing unlink will
	 * take care of deleting the inode.
	 */
	if (current == osb->dc_task)
		goto bail;

	spin_lock(&oi->ip_lock);
	/*
	 * OCFS2 *never* deletes system files. This should technically
	 * never get here as system file inodes should always have a
	 * positive link count.
	 */
	if (oi->ip_flags & OCFS2_INODE_SYSTEM_FILE) {
		mlog(ML_ERROR, "Skipping delete of system file %llu\n",
		     (unsigned long long)oi->ip_blkno);
		goto bail_unlock;
	}

	ret = 1;

bail_unlock:
	spin_unlock(&oi->ip_lock);
bail:
	return ret;
}

/*
 * Query the cluster to determine whether we should wipe an inode from
 * disk or not.
 *
 * Requires the inode to have the cluster lock.
 *
 * *wipe is cleared on entry and set to 1 only when every check passes
 * and the open-lock trylock succeeds. The return value is 0 or a
 * negative errno; a return of 0 with *wipe == 0 means "do not wipe,
 * but nothing went wrong". The local 'reason' (1, 2 or 3) is only fed
 * to the exit tracepoint to tell the benign skip paths apart.
 */
static int ocfs2_query_inode_wipe(struct inode *inode,
				  struct buffer_head *di_bh,
				  int *wipe)
{
	int status = 0, reason = 0;
	struct ocfs2_inode_info *oi = OCFS2_I(inode);
	struct ocfs2_dinode *di;

	*wipe = 0;

	trace_ocfs2_query_inode_wipe_begin((unsigned long long)oi->ip_blkno,
					   inode->i_nlink);

	/*
	 * While we were waiting for the cluster lock in
	 * ocfs2_delete_inode, another node might have asked to delete
	 * the inode. Recheck our flags to catch this.
	 */
	if (!ocfs2_inode_is_valid_to_delete(inode)) {
		reason = 1;
		goto bail;
	}

	/*
	 * Now that we have an up to date inode, we can double check
	 * the link count.
	 */
	if (inode->i_nlink)
		goto bail;

	/* Do some basic inode verification... */
	di = (struct ocfs2_dinode *) di_bh->b_data;
	if (!(di->i_flags & cpu_to_le32(OCFS2_ORPHANED_FL)) &&
	    !(oi->ip_flags & OCFS2_INODE_SKIP_ORPHAN_DIR)) {
		/*
		 * Inodes in the orphan dir must have ORPHANED_FL. The only
		 * inodes that come back out of the orphan dir are reflink
		 * targets. A reflink target may be moved out of the orphan
		 * dir between the time we scan the directory and the time we
		 * process it. This would lead to HAS_REFCOUNT_FL being set but
		 * ORPHANED_FL not.
		 */
		if (di->i_dyn_features & cpu_to_le16(OCFS2_HAS_REFCOUNT_FL)) {
			reason = 2;
			goto bail;
		}

		/* for lack of a better error? */
		status = -EEXIST;
		/*
		 * NOTE(review): the line break inside the first string
		 * literal below is present in the text as given and is
		 * reproduced unchanged.
		 */
		mlog(ML_ERROR,
		     "Inode %llu (on-disk %llu) not orphaned! 
" "Disk flags 0x%x, inode flags 0x%x\n",
		     (unsigned long long)oi->ip_blkno,
		     (unsigned long long)le64_to_cpu(di->i_blkno),
		     le32_to_cpu(di->i_flags), oi->ip_flags);
		goto bail;
	}

	/* has someone already deleted us?! baaad... */
	if (di->i_dtime) {
		status = -EEXIST;
		mlog_errno(status);
		goto bail;
	}

	/*
	 * This is how ocfs2 determines whether an inode is still live
	 * within the cluster. Every node takes a shared read lock on
	 * the inode open lock in ocfs2_read_locked_inode(). When we
	 * get to ->delete_inode(), each node tries to convert its
	 * lock to an exclusive. Trylocks are serialized by the inode
	 * meta data lock. If the upconvert succeeds, we know the inode
	 * is no longer live and can be deleted.
	 *
	 * Though we call this with the meta data lock held, the
	 * trylock keeps us from ABBA deadlock.
	 */
	status = ocfs2_try_open_lock(inode, 1);
	if (status == -EAGAIN) {
		/* Not an error: -EAGAIN is turned into a quiet "don't wipe". */
		status = 0;
		reason = 3;
		goto bail;
	}
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}

	*wipe = 1;
	trace_ocfs2_query_inode_wipe_succ(le16_to_cpu(di->i_orphaned_slot));

bail:
	trace_ocfs2_query_inode_wipe_end(status, reason);
	return status;
}

/*
 * Support function for ocfs2_delete_inode. Will help us keep the
 * inode data in a consistent state for clear_inode. Always truncates
 * pages, optionally sync's them first.
 *
 * A non-zero @sync_data requests filemap_write_and_wait() on the
 * inode's mapping before the final truncate; its result is not checked.
 */
static void ocfs2_cleanup_delete_inode(struct inode *inode,
				       int sync_data)
{
	trace_ocfs2_cleanup_delete_inode(
		(unsigned long long)OCFS2_I(inode)->ip_blkno, sync_data);
	if (sync_data)
		filemap_write_and_wait(inode->i_mapping);
	truncate_inode_pages_final(&inode->i_data);
}

/*
 * Try to remove @inode from disk.
 *
 * Resources are acquired in this order and released in reverse through
 * the bail_* labels: signals blocked, nfs sync lock (called with 0),
 * inode cluster lock (called with 1, also yields di_bh). Every exit
 * taken after the initial bad-inode check truncates the page cache via
 * ocfs2_cleanup_delete_inode(). OCFS2_INODE_DELETED is set only when
 * ocfs2_wipe_inode() succeeds. Nothing is returned; failures are
 * logged.
 */
static void ocfs2_delete_inode(struct inode *inode)
{
	int wipe, status;
	sigset_t oldset;
	struct buffer_head *di_bh = NULL;
	struct ocfs2_dinode *di = NULL;

	trace_ocfs2_delete_inode(inode->i_ino,
				 (unsigned long long)OCFS2_I(inode)->ip_blkno,
				 is_bad_inode(inode));

	/*
	 * When we fail in read_inode() we mark inode as bad. The second test
	 * catches the case when inode allocation fails before allocating
	 * a block for inode.
	 */
	if (is_bad_inode(inode) || !OCFS2_I(inode)->ip_blkno)
		goto bail;

	if (!ocfs2_inode_is_valid_to_delete(inode)) {
		/*
		 * It's probably not necessary to truncate_inode_pages
		 * here but we do it for safety anyway (it will most
		 * likely be a no-op anyway)
		 */
		ocfs2_cleanup_delete_inode(inode, 0);
		goto bail;
	}

	dquot_initialize(inode);

	/*
	 * We want to block signals in delete_inode as the lock and
	 * messaging paths may return us -ERESTARTSYS. Which would
	 * cause us to exit early, resulting in inodes being orphaned
	 * forever.
	 */
	ocfs2_block_signals(&oldset);

	/*
	 * Synchronize us against ocfs2_get_dentry. We take this in
	 * shared mode so that all nodes can still concurrently
	 * process deletes.
	 */
	status = ocfs2_nfs_sync_lock(OCFS2_SB(inode->i_sb), 0);
	if (status < 0) {
		mlog(ML_ERROR, "getting nfs sync lock(PR) failed %d\n", status);
		ocfs2_cleanup_delete_inode(inode, 0);
		goto bail_unblock;
	}
	/*
	 * Lock down the inode. This gives us an up to date view of
	 * its metadata (for verification), and allows us to
	 * serialize delete_inode on multiple nodes.
	 *
	 * Even though we might be doing a truncate, we don't take the
	 * allocation lock here as it won't be needed - nobody will
	 * have the file open.
	 */
	status = ocfs2_inode_lock(inode, &di_bh, 1);
	if (status < 0) {
		/* -ENOENT is not logged. */
		if (status != -ENOENT)
			mlog_errno(status);
		ocfs2_cleanup_delete_inode(inode, 0);
		goto bail_unlock_nfs_sync;
	}

	di = (struct ocfs2_dinode *)di_bh->b_data;
	/*
	 * Skip inode deletion and wait for dio orphan entry recovered
	 * first
	 */
	if (unlikely(di->i_flags & cpu_to_le32(OCFS2_DIO_ORPHANED_FL))) {
		ocfs2_cleanup_delete_inode(inode, 0);
		goto bail_unlock_inode;
	}

	/*
	 * Query the cluster. This will be the final decision made
	 * before we go ahead and wipe the inode.
	 */
	status = ocfs2_query_inode_wipe(inode, di_bh, &wipe);
	if (!wipe || status < 0) {
		/*
		 * Error and remote inode busy both mean we won't be
		 * removing the inode, so they take almost the same
		 * path.
		 */
		if (status < 0)
			mlog_errno(status);

		/*
		 * Someone in the cluster has disallowed a wipe of
		 * this inode, or it was never completely
		 * orphaned. Write out the pages and exit now.
		 */
		ocfs2_cleanup_delete_inode(inode, 1);
		goto bail_unlock_inode;
	}

	ocfs2_cleanup_delete_inode(inode, 0);

	status = ocfs2_wipe_inode(inode, di_bh);
	if (status < 0) {
		/* -EDEADLK is not logged. */
		if (status != -EDEADLK)
			mlog_errno(status);
		goto bail_unlock_inode;
	}

	/*
	 * Mark the inode as successfully deleted.
	 *
	 * This is important for ocfs2_clear_inode() as it will check
	 * this flag and skip any checkpointing work
	 *
	 * ocfs2_stuff_meta_lvb() also uses this flag to invalidate
	 * the LVB for other nodes.
	 */
	OCFS2_I(inode)->ip_flags |= OCFS2_INODE_DELETED;

bail_unlock_inode:
	ocfs2_inode_unlock(inode, 1);
	brelse(di_bh);

bail_unlock_nfs_sync:
	ocfs2_nfs_sync_unlock(OCFS2_SB(inode->i_sb), 0);

bail_unblock:
	ocfs2_unblock_signals(&oldset);
bail:
	return;
}

/*
 * Release everything ocfs2 attached to @inode.
 *
 * After clear_inode() and dquot_drop(), the open lock is released, the
 * three lock resources are marked as freeing, the local-alloc
 * reservation is discarded and re-initialised, the inode is
 * checkpointed unless it carries OCFS2_INODE_DELETED, the extent map is
 * truncated, and the cluster locks are dropped and freed. A series of
 * mlog_bug_on_msg() checks then asserts that no state is left behind
 * before the private fields are reset and the jbd2 inode is released.
 */
static void ocfs2_clear_inode(struct inode *inode)
{
	int status;
	struct ocfs2_inode_info *oi = OCFS2_I(inode);
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);

	clear_inode(inode);
	trace_ocfs2_clear_inode((unsigned long long)oi->ip_blkno,
				inode->i_nlink);

	mlog_bug_on_msg(osb == NULL,
			"Inode=%lu\n", inode->i_ino);

	dquot_drop(inode);

	/*
	 * To prevent remote deletes we hold open lock before, now it
	 * is time to unlock PR and EX open locks.
	 */
	ocfs2_open_unlock(inode);

	/*
	 * Do these before all the other work so that we don't bounce
	 * the downconvert thread while waiting to destroy the locks.
	 */
	ocfs2_mark_lockres_freeing(osb, &oi->ip_rw_lockres);
	ocfs2_mark_lockres_freeing(osb, &oi->ip_inode_lockres);
	ocfs2_mark_lockres_freeing(osb, &oi->ip_open_lockres);

	ocfs2_resv_discard(&osb->osb_la_resmap,
			   &oi->ip_la_data_resv);
	ocfs2_resv_init_once(&oi->ip_la_data_resv);

	/*
	 * We very well may get a clear_inode before all an inodes
	 * metadata has hit disk. Of course, we can't drop any cluster
	 * locks until the journal has finished with it. The only
	 * exception here are successfully wiped inodes - their
	 * metadata can now be considered to be part of the system
	 * inodes from which it came.
	 */
	if (!(oi->ip_flags & OCFS2_INODE_DELETED))
		ocfs2_checkpoint_inode(inode);

	mlog_bug_on_msg(!list_empty(&oi->ip_io_markers),
			"Clear inode of %llu, inode has io markers\n",
			(unsigned long long)oi->ip_blkno);
	mlog_bug_on_msg(!list_empty(&oi->ip_unwritten_list),
			"Clear inode of %llu, inode has unwritten extents\n",
			(unsigned long long)oi->ip_blkno);

	ocfs2_extent_map_trunc(inode, 0);

	/* A failure to drop the locks is logged but does not stop teardown. */
	status = ocfs2_drop_inode_locks(inode);
	if (status < 0)
		mlog_errno(status);

	ocfs2_lock_res_free(&oi->ip_rw_lockres);
	ocfs2_lock_res_free(&oi->ip_inode_lockres);
	ocfs2_lock_res_free(&oi->ip_open_lockres);

	ocfs2_metadata_cache_exit(INODE_CACHE(inode));

	mlog_bug_on_msg(INODE_CACHE(inode)->ci_num_cached,
			"Clear inode of %llu, inode has %u cache items\n",
			(unsigned long long)oi->ip_blkno,
			INODE_CACHE(inode)->ci_num_cached);

	mlog_bug_on_msg(!(INODE_CACHE(inode)->ci_flags & OCFS2_CACHE_FL_INLINE),
			"Clear inode of %llu, inode has a bad flag\n",
			(unsigned long long)oi->ip_blkno);

	mlog_bug_on_msg(spin_is_locked(&oi->ip_lock),
			"Clear inode of %llu, inode is locked\n",
			(unsigned long long)oi->ip_blkno);

	/* The trylock doubles as the "nobody holds it" check; undo it at once. */
	mlog_bug_on_msg(!mutex_trylock(&oi->ip_io_mutex),
			"Clear inode of %llu, io_mutex is locked\n",
			(unsigned long long)oi->ip_blkno);
	mutex_unlock(&oi->ip_io_mutex);

	/*
	 * down_trylock() returns 0, down_write_trylock() returns 1
	 * kernel 1, world 0
	 */
	mlog_bug_on_msg(!down_write_trylock(&oi->ip_alloc_sem),
			"Clear inode of %llu, alloc_sem is locked\n",
			(unsigned long long)oi->ip_blkno);
	up_write(&oi->ip_alloc_sem);

	mlog_bug_on_msg(oi->ip_open_count,
			"Clear inode of %llu has open count %d\n",
			(unsigned long long)oi->ip_blkno, oi->ip_open_count);

	/* Clear all other flags. */
	oi->ip_flags = 0;
	oi->ip_dir_start_lookup = 0;
	oi->ip_blkno = 0ULL;

	/*
	 * ip_jinode is used to track txns against this inode. We ensure that
	 * the journal is flushed before journal shutdown. Thus it is safe to
	 * have inodes get cleaned up after journal shutdown.
	 */
	jbd2_journal_release_jbd_inode(osb->journal->j_journal,
				       &oi->ip_jinode);
}

/*
 * Evict @inode: take the delete path when the link count is zero or
 * the inode is flagged OCFS2_INODE_MAYBE_ORPHANED, otherwise only
 * truncate the page cache. ocfs2_clear_inode() runs in both cases.
 */
void ocfs2_evict_inode(struct inode *inode)
{
	if (!inode->i_nlink ||
	    (OCFS2_I(inode)->ip_flags & OCFS2_INODE_MAYBE_ORPHANED)) {
		ocfs2_delete_inode(inode);
	} else {
		truncate_inode_pages_final(&inode->i_data);
	}
	ocfs2_clear_inode(inode);
}

/*
 * Called under inode_lock, with no more references on the
 * struct inode, so it's safe here to check the flags field
 * and to manipulate i_nlink without any other locks.
 *
 * Expects inode->i_lock to be held (asserted). The lock is released
 * around write_inode_now() with I_WILL_FREE set in i_state for the
 * duration, then re-taken. Always returns 1.
 */
int ocfs2_drop_inode(struct inode *inode)
{
	struct ocfs2_inode_info *oi = OCFS2_I(inode);

	trace_ocfs2_drop_inode((unsigned long long)oi->ip_blkno,
			       inode->i_nlink, oi->ip_flags);

	assert_spin_locked(&inode->i_lock);
	inode->i_state |= I_WILL_FREE;
	spin_unlock(&inode->i_lock);
	write_inode_now(inode, 1);
	spin_lock(&inode->i_lock);
	WARN_ON(inode->i_state & I_NEW);
	inode->i_state &= ~I_WILL_FREE;

	return 1;
}

/*
 * This is called from our getattr.
 *
 * Returns -ENOENT when @dentry has no inode or the inode is flagged
 * OCFS2_INODE_DELETED; otherwise takes and immediately releases the
 * inode cluster lock (called with NULL, 0) and returns 0, or the
 * negative status from ocfs2_inode_lock() on failure.
 */
int ocfs2_inode_revalidate(struct dentry *dentry)
{
	struct inode *inode = d_inode(dentry);
	int status = 0;

	/* The tracepoint is handed zeros when there is no inode. */
	trace_ocfs2_inode_revalidate(inode,
		inode ? (unsigned long long)OCFS2_I(inode)->ip_blkno : 0ULL,
		inode ? (unsigned long long)OCFS2_I(inode)->ip_flags : 0);

	if (!inode) {
		status = -ENOENT;
		goto bail;
	}

	spin_lock(&OCFS2_I(inode)->ip_lock);
	if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_DELETED) {
		spin_unlock(&OCFS2_I(inode)->ip_lock);
		status = -ENOENT;
		goto bail;
	}
	spin_unlock(&OCFS2_I(inode)->ip_lock);

	/*
	 * Let ocfs2_inode_lock do the work of updating our struct
	 * inode for us.
	 */
	status = ocfs2_inode_lock(inode, NULL, 0);
	if (status < 0) {
		if (status != -ENOENT)
			mlog_errno(status);
		goto bail;
	}
	ocfs2_inode_unlock(inode, 0);
bail:
	return status;
}

/* * Updates a disk inode from a * struct inode. * Only takes ip_lock. 
*/
/*
 * Copy the in-memory state of @inode into the on-disk dinode stored in
 * @bh and journal the change through @handle.
 *
 * Journal write access to @bh is requested first; if that fails the
 * negative status is logged and returned before any field is written.
 * Cluster count, attribute flags and dynamic features are copied while
 * holding ip_lock; the remaining fields come from the VFS inode and
 * are copied after the lock is dropped. On the success path the value
 * returned is the status from ocfs2_journal_access_di().
 */
int ocfs2_mark_inode_dirty(handle_t *handle,
			   struct inode *inode,
			   struct buffer_head *bh)
{
	struct ocfs2_inode_info *oi = OCFS2_I(inode);
	struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data;
	int ret;

	trace_ocfs2_mark_inode_dirty((unsigned long long)oi->ip_blkno);

	ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), bh,
				      OCFS2_JOURNAL_ACCESS_WRITE);
	if (ret < 0) {
		mlog_errno(ret);
		return ret;
	}

	/* Fields owned by the ocfs2 private inode: copy under ip_lock. */
	spin_lock(&oi->ip_lock);
	di->i_clusters = cpu_to_le32(oi->ip_clusters);
	/* Must run before ip_attr is read on the next line. */
	ocfs2_get_inode_flags(oi);
	di->i_attr = cpu_to_le32(oi->ip_attr);
	di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
	spin_unlock(&oi->ip_lock);

	/* Size, link count, ownership and mode from the VFS inode. */
	di->i_size = cpu_to_le64(i_size_read(inode));
	ocfs2_set_links_count(di, inode->i_nlink);
	di->i_uid = cpu_to_le32(i_uid_read(inode));
	di->i_gid = cpu_to_le32(i_gid_read(inode));
	di->i_mode = cpu_to_le16(inode->i_mode);

	/* Timestamps: seconds and nanoseconds are stored separately. */
	di->i_atime = cpu_to_le64(inode_get_atime_sec(inode));
	di->i_atime_nsec = cpu_to_le32(inode_get_atime_nsec(inode));
	di->i_ctime = cpu_to_le64(inode_get_ctime_sec(inode));
	di->i_ctime_nsec = cpu_to_le32(inode_get_ctime_nsec(inode));
	di->i_mtime = cpu_to_le64(inode_get_mtime_sec(inode));
	di->i_mtime_nsec = cpu_to_le32(inode_get_mtime_nsec(inode));

	ocfs2_journal_dirty(handle, bh);
	ocfs2_update_inode_fsync_trans(handle, inode, 1);

	return ret;
}

/* * * Updates a struct inode from a disk inode. * does no i/o, only takes ip_lock. 
*/
/*
 * Load the fields of on-disk dinode @fe into @inode and its ocfs2
 * private info. The whole update runs under ip_lock. i_blocks is
 * forced to 0 for a symlink whose dinode has no clusters; otherwise it
 * comes from ocfs2_inode_sector_count().
 */
void ocfs2_refresh_inode(struct inode *inode,
			 struct ocfs2_dinode *fe)
{
	spin_lock(&OCFS2_I(inode)->ip_lock);

	OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
	OCFS2_I(inode)->ip_attr = le32_to_cpu(fe->i_attr);
	OCFS2_I(inode)->ip_dyn_features = le16_to_cpu(fe->i_dyn_features);
	/* Called after ip_attr has been refreshed from disk. */
	ocfs2_set_inode_flags(inode);
	i_size_write(inode, le64_to_cpu(fe->i_size));
	set_nlink(inode, ocfs2_read_links_count(fe));
	i_uid_write(inode, le32_to_cpu(fe->i_uid));
	i_gid_write(inode, le32_to_cpu(fe->i_gid));
	inode->i_mode = le16_to_cpu(fe->i_mode);
	/* i_mode must already be set: S_ISLNK() reads it here. */
	if (S_ISLNK(inode->i_mode) && le32_to_cpu(fe->i_clusters) == 0)
		inode->i_blocks = 0;
	else
		inode->i_blocks = ocfs2_inode_sector_count(inode);
	inode_set_atime(inode, le64_to_cpu(fe->i_atime),
			le32_to_cpu(fe->i_atime_nsec));
	inode_set_mtime(inode, le64_to_cpu(fe->i_mtime),
			le32_to_cpu(fe->i_mtime_nsec));
	inode_set_ctime(inode, le64_to_cpu(fe->i_ctime),
			le32_to_cpu(fe->i_ctime_nsec));

	spin_unlock(&OCFS2_I(inode)->ip_lock);
}

/*
 * Validation callback for a freshly read dinode block.
 *
 * Returns 0 when the block passes every check. A checksum failure
 * returns the status from ocfs2_validate_meta_ecc() after logging it.
 * Any later failure (signature, i_blkno mismatch, OCFS2_VALID_FL
 * missing, fs_generation mismatch) returns whatever ocfs2_error()
 * returns for that message. The buffer must already be uptodate
 * (BUG_ON otherwise).
 */
int ocfs2_validate_inode_block(struct super_block *sb,
			       struct buffer_head *bh)
{
	int rc;
	struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data;

	trace_ocfs2_validate_inode_block((unsigned long long)bh->b_blocknr);

	BUG_ON(!buffer_uptodate(bh));

	/*
	 * If the ecc fails, we return the error but otherwise
	 * leave the filesystem running. We know any error is
	 * local to this block.
	 */
	rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &di->i_check);
	if (rc) {
		mlog(ML_ERROR, "Checksum failed for dinode %llu\n",
		     (unsigned long long)bh->b_blocknr);
		goto bail;
	}

	/*
	 * Errors after here are fatal.
	 */

	/* Each failing branch below overwrites this with ocfs2_error()'s result. */
	rc = -EINVAL;

	if (!OCFS2_IS_VALID_DINODE(di)) {
		rc = ocfs2_error(sb, "Invalid dinode #%llu: signature = %.*s\n",
				 (unsigned long long)bh->b_blocknr, 7,
				 di->i_signature);
		goto bail;
	}

	if (le64_to_cpu(di->i_blkno) != bh->b_blocknr) {
		rc = ocfs2_error(sb, "Invalid dinode #%llu: i_blkno is %llu\n",
				 (unsigned long long)bh->b_blocknr,
				 (unsigned long long)le64_to_cpu(di->i_blkno));
		goto bail;
	}

	if (!(di->i_flags & cpu_to_le32(OCFS2_VALID_FL))) {
		rc = ocfs2_error(sb,
				 "Invalid dinode #%llu: OCFS2_VALID_FL not set\n",
				 (unsigned long long)bh->b_blocknr);
		goto bail;
	}

	if (le32_to_cpu(di->i_fs_generation) !=
	    OCFS2_SB(sb)->fs_generation) {
		rc = ocfs2_error(sb,
				 "Invalid dinode #%llu: fs_generation is %u\n",
				 (unsigned long long)bh->b_blocknr,
				 le32_to_cpu(di->i_fs_generation));
		goto bail;
	}

	rc = 0;

bail:
	return rc;
}

/*
 * Filecheck variant of the dinode validation callback.
 *
 * Runs the same checks as ocfs2_validate_inode_block() but only logs
 * with mlog() and reports the first problem as a negative
 * OCFS2_FILECHECK_ERR_* code instead of calling ocfs2_error().
 * Returns 0 when nothing is wrong. A bad signature takes precedence
 * over a checksum failure: -OCFS2_FILECHECK_ERR_INVALIDINO replaces
 * -OCFS2_FILECHECK_ERR_BLOCKECC when both apply.
 */
static int ocfs2_filecheck_validate_inode_block(struct super_block *sb,
						struct buffer_head *bh)
{
	int rc = 0;
	struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data;

	trace_ocfs2_filecheck_validate_inode_block(
		(unsigned long long)bh->b_blocknr);

	BUG_ON(!buffer_uptodate(bh));

	/*
	 * Call ocfs2_validate_meta_ecc() first since it has ecc repair
	 * function, but we should not return error immediately when ecc
	 * validation fails, because the reason is quite likely the invalid
	 * inode number inputted.
	 */
	rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &di->i_check);
	if (rc) {
		mlog(ML_ERROR,
		     "Filecheck: checksum failed for dinode %llu\n",
		     (unsigned long long)bh->b_blocknr);
		rc = -OCFS2_FILECHECK_ERR_BLOCKECC;
	}

	if (!OCFS2_IS_VALID_DINODE(di)) {
		mlog(ML_ERROR,
		     "Filecheck: invalid dinode #%llu: signature = %.*s\n",
		     (unsigned long long)bh->b_blocknr, 7, di->i_signature);
		rc = -OCFS2_FILECHECK_ERR_INVALIDINO;
		goto bail;
	} else if (rc)
		/* Signature is fine, so the pending ECC error stands. */
		goto bail;

	if (le64_to_cpu(di->i_blkno) != bh->b_blocknr) {
		mlog(ML_ERROR,
		     "Filecheck: invalid dinode #%llu: i_blkno is %llu\n",
		     (unsigned long long)bh->b_blocknr,
		     (unsigned long long)le64_to_cpu(di->i_blkno));
		rc = -OCFS2_FILECHECK_ERR_BLOCKNO;
		goto bail;
	}

	if (!(di->i_flags & cpu_to_le32(OCFS2_VALID_FL))) {
		mlog(ML_ERROR,
		     "Filecheck: invalid dinode #%llu: OCFS2_VALID_FL "
		     "not set\n",
		     (unsigned long long)bh->b_blocknr);
		rc = -OCFS2_FILECHECK_ERR_VALIDFLAG;
		goto bail;
	}

	if (le32_to_cpu(di->i_fs_generation) !=
	    OCFS2_SB(sb)->fs_generation) {
		mlog(ML_ERROR,
		     "Filecheck: invalid dinode #%llu: fs_generation is %u\n",
		     (unsigned long long)bh->b_blocknr,
		     le32_to_cpu(di->i_fs_generation));
		rc = -OCFS2_FILECHECK_ERR_GENERATION;
	}

bail:
	return rc;
}

/*
 * Filecheck repair callback for a dinode block.
 *
 * Returns 0 immediately when the block already validates. Otherwise it
 * refuses (negative OCFS2_FILECHECK_ERR_* code) on a read-only
 * filesystem, when the buffer is under jbd, when the signature is
 * invalid, or when OCFS2_VALID_FL is missing. The repairable fields
 * are i_blkno, i_fs_generation and an l_next_free_rec larger than
 * l_count; each is rewritten in place. If anything was changed, or the
 * ECC still does not validate, the ECC is recomputed and the buffer is
 * marked dirty. Returns 0 after a repair attempt.
 */
static int ocfs2_filecheck_repair_inode_block(struct super_block *sb,
					      struct buffer_head *bh)
{
	int changed = 0;
	struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data;

	if (!ocfs2_filecheck_validate_inode_block(sb, bh))
		return 0;

	trace_ocfs2_filecheck_repair_inode_block(
		(unsigned long long)bh->b_blocknr);

	if (ocfs2_is_hard_readonly(OCFS2_SB(sb)) ||
	    ocfs2_is_soft_readonly(OCFS2_SB(sb))) {
		mlog(ML_ERROR,
		     "Filecheck: cannot repair dinode #%llu "
		     "on readonly filesystem\n",
		     (unsigned long long)bh->b_blocknr);
		return -OCFS2_FILECHECK_ERR_READONLY;
	}

	if (buffer_jbd(bh)) {
		mlog(ML_ERROR,
		     "Filecheck: cannot repair dinode #%llu, "
		     "its buffer is in jbd\n",
		     (unsigned long long)bh->b_blocknr);
		return -OCFS2_FILECHECK_ERR_INJBD;
	}

	if (!OCFS2_IS_VALID_DINODE(di)) {
		/* Cannot fix invalid inode block */
		return
			-OCFS2_FILECHECK_ERR_INVALIDINO;
	}

	if (!(di->i_flags & cpu_to_le32(OCFS2_VALID_FL))) {
		/*
		 * Cannot just add VALID_FL flag back as a fix,
		 * need more things to check here.
		 */
		return -OCFS2_FILECHECK_ERR_VALIDFLAG;
	}

	if (le64_to_cpu(di->i_blkno) != bh->b_blocknr) {
		di->i_blkno = cpu_to_le64(bh->b_blocknr);
		changed = 1;
		/* The value logged is the new one, read back after the store. */
		mlog(ML_ERROR,
		     "Filecheck: reset dinode #%llu: i_blkno to %llu\n",
		     (unsigned long long)bh->b_blocknr,
		     (unsigned long long)le64_to_cpu(di->i_blkno));
	}

	if (le32_to_cpu(di->i_fs_generation) !=
	    OCFS2_SB(sb)->fs_generation) {
		di->i_fs_generation = cpu_to_le32(OCFS2_SB(sb)->fs_generation);
		changed = 1;
		mlog(ML_ERROR,
		     "Filecheck: reset dinode #%llu: fs_generation to %u\n",
		     (unsigned long long)bh->b_blocknr,
		     le32_to_cpu(di->i_fs_generation));
	}

	/* Clamp the extent list's next-free index to its capacity. */
	if (ocfs2_dinode_has_extents(di) &&
	    le16_to_cpu(di->id2.i_list.l_next_free_rec) > le16_to_cpu(di->id2.i_list.l_count)) {
		di->id2.i_list.l_next_free_rec = di->id2.i_list.l_count;
		changed = 1;
		mlog(ML_ERROR,
		     "Filecheck: reset dinode #%llu: l_next_free_rec to %u\n",
		     (unsigned long long)bh->b_blocknr,
		     le16_to_cpu(di->id2.i_list.l_next_free_rec));
	}

	/* ECC validation is only re-run when no field was changed (short-circuit). */
	if (changed || ocfs2_validate_meta_ecc(sb, bh->b_data, &di->i_check)) {
		ocfs2_compute_meta_ecc(sb, bh->b_data, &di->i_check);
		mark_buffer_dirty(bh);
		mlog(ML_ERROR,
		     "Filecheck: reset dinode #%llu: compute meta ecc\n",
		     (unsigned long long)bh->b_blocknr);
	}

	return 0;
}

/*
 * Read the inode's dinode block for filecheck.
 *
 * @type selects the callback passed to ocfs2_read_blocks(): 0 uses the
 * validate callback, any other value uses the repair callback. On
 * success, a buffer obtained by the read is handed back through @bh
 * only if the caller passed in *bh == NULL. Returns the status of
 * ocfs2_read_blocks().
 */
static int
ocfs2_filecheck_read_inode_block_full(struct inode *inode,
				      struct buffer_head **bh,
				      int flags, int type)
{
	int rc;
	struct buffer_head *tmp = *bh;

	if (!type) /* Check inode block */
		rc = ocfs2_read_blocks(INODE_CACHE(inode),
				OCFS2_I(inode)->ip_blkno,
				1, &tmp, flags,
				ocfs2_filecheck_validate_inode_block);
	else /* Repair inode block */
		rc = ocfs2_read_blocks(INODE_CACHE(inode),
				OCFS2_I(inode)->ip_blkno,
				1, &tmp, flags,
				ocfs2_filecheck_repair_inode_block);

	/* If ocfs2_read_blocks() got us a new bh, pass it up. */
	if (!rc && !*bh)
		*bh = tmp;

	return rc;
}

/*
 * Read the inode's dinode block with ocfs2_validate_inode_block() as
 * the validation callback. @flags is forwarded to ocfs2_read_blocks().
 * On success, a buffer obtained by the read is handed back through @bh
 * only if the caller passed in *bh == NULL. Returns the status of
 * ocfs2_read_blocks().
 */
int ocfs2_read_inode_block_full(struct inode *inode, struct buffer_head **bh,
				int flags)
{
	int rc;
	struct buffer_head *tmp = *bh;

	rc = ocfs2_read_blocks(INODE_CACHE(inode), OCFS2_I(inode)->ip_blkno,
			       1, &tmp, flags, ocfs2_validate_inode_block);

	/* If ocfs2_read_blocks() got us a new bh, pass it up. */
	if (!rc && !*bh)
		*bh = tmp;

	return rc;
}

/* Convenience wrapper: ocfs2_read_inode_block_full() with flags == 0. */
int ocfs2_read_inode_block(struct inode *inode, struct buffer_head **bh)
{
	return ocfs2_read_inode_block_full(inode, bh, 0);
}


/* Caching callback: the owner reported for an inode cache is ip_blkno. */
static u64 ocfs2_inode_cache_owner(struct ocfs2_caching_info *ci)
{
	struct ocfs2_inode_info *oi = cache_info_to_inode(ci);

	return oi->ip_blkno;
}

/* Caching callback: super block of the inode that embeds @ci. */
static struct super_block *ocfs2_inode_cache_get_super(struct ocfs2_caching_info *ci)
{
	struct ocfs2_inode_info *oi = cache_info_to_inode(ci);

	return oi->vfs_inode.i_sb;
}

/* Caching callback: the cache lock is the inode's ip_lock spinlock. */
static void ocfs2_inode_cache_lock(struct ocfs2_caching_info *ci)
__acquires(&oi->ip_lock)
{
	struct ocfs2_inode_info *oi = cache_info_to_inode(ci);

	spin_lock(&oi->ip_lock);
}

/* Caching callback: releases ip_lock taken by ocfs2_inode_cache_lock(). */
static void ocfs2_inode_cache_unlock(struct ocfs2_caching_info *ci)
__releases(&oi->ip_lock)
{
	struct ocfs2_inode_info *oi = cache_info_to_inode(ci);

	spin_unlock(&oi->ip_lock);
}

/* Caching callback: the I/O lock is the inode's ip_io_mutex. */
static void ocfs2_inode_cache_io_lock(struct ocfs2_caching_info *ci)
{
	struct ocfs2_inode_info *oi = cache_info_to_inode(ci);

	mutex_lock(&oi->ip_io_mutex);
}

/* Caching callback: releases ip_io_mutex. */
static void ocfs2_inode_cache_io_unlock(struct ocfs2_caching_info *ci)
{
	struct ocfs2_inode_info *oi = cache_info_to_inode(ci);

	mutex_unlock(&oi->ip_io_mutex);
}

/* Callback table binding the metadata cache to the inode helpers above. */
const struct ocfs2_caching_operations ocfs2_inode_caching_ops = {
	.co_owner		= ocfs2_inode_cache_owner,
	.co_get_super		= ocfs2_inode_cache_get_super,
	.co_cache_lock		= ocfs2_inode_cache_lock,
	.co_cache_unlock	= ocfs2_inode_cache_unlock,
	.co_io_lock		= ocfs2_inode_cache_io_lock,
	.co_io_unlock		= ocfs2_inode_cache_io_unlock,
};
41 20 21 59 59 59 52 363 365 21 4 365 2 193 193 2 344 27 27 21 59 203 202 5 197 1 351 204 170 172 21 203 1 203 230 345 345 23 173 172 173 170 172 24 170 169 1 173 172 48 48 48 48 48 48 48 1 1 1 48 48 48 48 48 48 48 48 48 59 59 59 59 74 74 74 17 73 53 41 74 73 17 70 71 35 66 42 66 28 28 53 52 53 11 53 57 57 55 3 3 53 36 20 21 2 2 10 10 10 226 23 191 3 2 189 162 191 191 152 18 187 192 192 191 25 20 2 185 34 86 36 32 200 200 200 111 200 183 183 180 62 17 17 16 17 16 34 34 53 70 3 70 16 34 53 2 53 50 50 48 48 41 9 46 3 46 1 48 65 65 65 48 48 48 48 47 48 42 8 48 45 10 8 40 48 48 48 48 75 1 68 2 5 74 65 6 6 1 70 48 5 48 48 73 185 40 72 10 10 122 4 118 53 19 34 7 40 40 185 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 
359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 
859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 
1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 
1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 // SPDX-License-Identifier: GPL-2.0 /* * fs/f2fs/checkpoint.c * * Copyright (c) 2012 Samsung Electronics Co., Ltd. 
* http://www.samsung.com/ */ #include <linux/fs.h> #include <linux/bio.h> #include <linux/mpage.h> #include <linux/writeback.h> #include <linux/blkdev.h> #include <linux/f2fs_fs.h> #include <linux/pagevec.h> #include <linux/swap.h> #include <linux/kthread.h> #include "f2fs.h" #include "node.h" #include "segment.h" #include "iostat.h" #include <trace/events/f2fs.h> #define DEFAULT_CHECKPOINT_IOPRIO (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 3)) static struct kmem_cache *ino_entry_slab; struct kmem_cache *f2fs_inode_entry_slab; void f2fs_stop_checkpoint(struct f2fs_sb_info *sbi, bool end_io, unsigned char reason) { f2fs_build_fault_attr(sbi, 0, 0); if (!end_io) f2fs_flush_merged_writes(sbi); f2fs_handle_critical_error(sbi, reason); } /* * We guarantee no failure on the returned page. */ struct page *f2fs_grab_meta_page(struct f2fs_sb_info *sbi, pgoff_t index) { struct address_space *mapping = META_MAPPING(sbi); struct page *page; repeat: page = f2fs_grab_cache_page(mapping, index, false); if (!page) { cond_resched(); goto repeat; } f2fs_wait_on_page_writeback(page, META, true, true); if (!PageUptodate(page)) SetPageUptodate(page); return page; } static struct page *__get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index, bool is_meta) { struct address_space *mapping = META_MAPPING(sbi); struct page *page; struct f2fs_io_info fio = { .sbi = sbi, .type = META, .op = REQ_OP_READ, .op_flags = REQ_META | REQ_PRIO, .old_blkaddr = index, .new_blkaddr = index, .encrypted_page = NULL, .is_por = !is_meta ? 
1 : 0, }; int err; if (unlikely(!is_meta)) fio.op_flags &= ~REQ_META; repeat: page = f2fs_grab_cache_page(mapping, index, false); if (!page) { cond_resched(); goto repeat; } if (PageUptodate(page)) goto out; fio.page = page; err = f2fs_submit_page_bio(&fio); if (err) { f2fs_put_page(page, 1); return ERR_PTR(err); } f2fs_update_iostat(sbi, NULL, FS_META_READ_IO, F2FS_BLKSIZE); lock_page(page); if (unlikely(page->mapping != mapping)) { f2fs_put_page(page, 1); goto repeat; } if (unlikely(!PageUptodate(page))) { f2fs_handle_page_eio(sbi, page_folio(page), META); f2fs_put_page(page, 1); return ERR_PTR(-EIO); } out: return page; } struct page *f2fs_get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index) { return __get_meta_page(sbi, index, true); } struct page *f2fs_get_meta_page_retry(struct f2fs_sb_info *sbi, pgoff_t index) { struct page *page; int count = 0; retry: page = __get_meta_page(sbi, index, true); if (IS_ERR(page)) { if (PTR_ERR(page) == -EIO && ++count <= DEFAULT_RETRY_IO_COUNT) goto retry; f2fs_stop_checkpoint(sbi, false, STOP_CP_REASON_META_PAGE); } return page; } /* for POR only */ struct page *f2fs_get_tmp_page(struct f2fs_sb_info *sbi, pgoff_t index) { return __get_meta_page(sbi, index, false); } static bool __is_bitmap_valid(struct f2fs_sb_info *sbi, block_t blkaddr, int type) { struct seg_entry *se; unsigned int segno, offset; bool exist; if (type == DATA_GENERIC) return true; segno = GET_SEGNO(sbi, blkaddr); offset = GET_BLKOFF_FROM_SEG0(sbi, blkaddr); se = get_seg_entry(sbi, segno); exist = f2fs_test_bit(offset, se->cur_valid_map); /* skip data, if we already have an error in checkpoint. 
*/
	if (unlikely(f2fs_cp_error(sbi)))
		return exist;

	/*
	 * A set bit for DATA_GENERIC_ENHANCE_UPDATE, or a clear bit for
	 * DATA_GENERIC_ENHANCE, is logged as an inconsistency.
	 */
	if ((exist && type == DATA_GENERIC_ENHANCE_UPDATE) ||
		(!exist && type == DATA_GENERIC_ENHANCE))
		goto out_err;
	/* clear bit for any remaining type: record the error without logging */
	if (!exist && type != DATA_GENERIC_ENHANCE_UPDATE)
		goto out_handle;
	return exist;

out_err:
	f2fs_err(sbi, "Inconsistent error blkaddr:%u, sit bitmap:%d",
		 blkaddr, exist);
	set_sbi_flag(sbi, SBI_NEED_FSCK);
	dump_stack();
out_handle:
	f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR);
	return exist;
}

/*
 * Range-check @blkaddr against the area implied by @type.
 *
 * Returns true when the address is acceptable. META_NAT is always accepted.
 * The META_SIT/SSA/CP/POR cases fail quietly through the check_only label.
 * An out-of-range META_GENERIC address also records ERROR_INVALID_BLKADDR.
 * The DATA_GENERIC* cases are range-checked first and, when in range,
 * delegated to __is_bitmap_valid().
 */
static bool __f2fs_is_valid_blkaddr(struct f2fs_sb_info *sbi,
					block_t blkaddr, int type)
{
	switch (type) {
	case META_NAT:
		break;
	case META_SIT:
		if (unlikely(blkaddr >= SIT_BLK_CNT(sbi)))
			goto check_only;
		break;
	case META_SSA:
		if (unlikely(blkaddr >= MAIN_BLKADDR(sbi) ||
			blkaddr < SM_I(sbi)->ssa_blkaddr))
			goto check_only;
		break;
	case META_CP:
		if (unlikely(blkaddr >= SIT_I(sbi)->sit_base_addr ||
			blkaddr < __start_cp_addr(sbi)))
			goto check_only;
		break;
	case META_POR:
		if (unlikely(blkaddr >= MAX_BLKADDR(sbi) ||
			blkaddr < MAIN_BLKADDR(sbi)))
			goto check_only;
		break;
	case DATA_GENERIC:
	case DATA_GENERIC_ENHANCE:
	case DATA_GENERIC_ENHANCE_READ:
	case DATA_GENERIC_ENHANCE_UPDATE:
		if (unlikely(blkaddr >= MAX_BLKADDR(sbi) ||
				blkaddr < MAIN_BLKADDR(sbi))) {

			/* Skip to emit an error message.
*/
			if (unlikely(f2fs_cp_error(sbi)))
				return false;

			f2fs_warn(sbi, "access invalid blkaddr:%u",
				  blkaddr);
			set_sbi_flag(sbi, SBI_NEED_FSCK);
			dump_stack();
			goto err;
		} else {
			/* in range: the result is decided by the bitmap check */
			return __is_bitmap_valid(sbi, blkaddr, type);
		}
		break;
	case META_GENERIC:
		if (unlikely(blkaddr < SEG0_BLKADDR(sbi) ||
			blkaddr >= MAIN_BLKADDR(sbi)))
			goto err;
		break;
	default:
		BUG();
	}

	return true;
err:
	/* err records the failure; check_only just reports it to the caller */
	f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR);
check_only:
	return false;
}

/*
 * Validate @blkaddr for @type. Same check as f2fs_is_valid_blkaddr_raw(),
 * except that the FAULT_BLKADDR_VALIDITY injection point can force a false
 * result.
 */
bool f2fs_is_valid_blkaddr(struct f2fs_sb_info *sbi,
					block_t blkaddr, int type)
{
	if (time_to_inject(sbi, FAULT_BLKADDR_VALIDITY))
		return false;
	return __f2fs_is_valid_blkaddr(sbi, blkaddr, type);
}

/* Validate @blkaddr for @type without the fault-injection hook. */
bool f2fs_is_valid_blkaddr_raw(struct f2fs_sb_info *sbi,
					block_t blkaddr, int type)
{
	return __f2fs_is_valid_blkaddr(sbi, blkaddr, type);
}

/*
 * Readahead CP/NAT/SIT/SSA/POR pages
 *
 * Submits reads for up to @nrpages blocks of @type starting at @start and
 * returns (last block number visited - @start). @sync selects
 * REQ_META | REQ_PRIO for the requests; otherwise REQ_RAHEAD is used.
 */
int f2fs_ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages,
							int type, bool sync)
{
	struct page *page;
	block_t blkno = start;
	struct f2fs_io_info fio = {
		.sbi = sbi,
		.type = META,
		.op = REQ_OP_READ,
		.op_flags = sync ? (REQ_META | REQ_PRIO) : REQ_RAHEAD,
		.encrypted_page = NULL,
		.in_list = 0,
		.is_por = (type == META_POR) ?
1 : 0,
	};
	struct blk_plug plug;
	int err;

	if (unlikely(type == META_POR))
		fio.op_flags &= ~REQ_META;

	blk_start_plug(&plug);
	for (; nrpages-- > 0; blkno++) {

		/* stop at the first block that fails validation for @type */
		if (!f2fs_is_valid_blkaddr(sbi, blkno, type))
			goto out;

		switch (type) {
		case META_NAT:
			/* past the block holding max_nid: wrap to NAT block 0 */
			if (unlikely(blkno >=
					NAT_BLOCK_OFFSET(NM_I(sbi)->max_nid)))
				blkno = 0;
			/* get nat block addr */
			fio.new_blkaddr = current_nat_addr(sbi,
					blkno * NAT_ENTRY_PER_BLOCK);
			break;
		case META_SIT:
			if (unlikely(blkno >= TOTAL_SEGS(sbi)))
				goto out;
			/* get sit block addr */
			fio.new_blkaddr = current_sit_addr(sbi,
					blkno * SIT_ENTRY_PER_BLOCK);
			break;
		case META_SSA:
		case META_CP:
		case META_POR:
			/* these areas are addressed directly by block number */
			fio.new_blkaddr = blkno;
			break;
		default:
			BUG();
		}

		page = f2fs_grab_cache_page(META_MAPPING(sbi),
						fio.new_blkaddr, false);
		if (!page)
			continue;
		if (PageUptodate(page)) {
			f2fs_put_page(page, 1);
			continue;
		}

		fio.page = page;
		err = f2fs_submit_page_bio(&fio);
		/*
		 * The unlock argument is 1 only when submission failed
		 * (presumably a submitted read unlocks the page on
		 * completion - confirm).
		 */
		f2fs_put_page(page, err ? 1 : 0);

		if (!err)
			f2fs_update_iostat(sbi, NULL, FS_META_READ_IO,
							F2FS_BLKSIZE);
	}
out:
	blk_finish_plug(&plug);
	return blkno - start;
}

/*
 * Start a synchronous META_POR readahead of @ra_blocks blocks at @index,
 * unless the page at @index is already cached and uptodate. Does nothing
 * when @ra_blocks equals RECOVERY_MIN_RA_BLOCKS.
 */
void f2fs_ra_meta_pages_cond(struct f2fs_sb_info *sbi, pgoff_t index,
							unsigned int ra_blocks)
{
	struct page *page;
	bool readahead = false;

	if (ra_blocks == RECOVERY_MIN_RA_BLOCKS)
		return;

	page = find_get_page(META_MAPPING(sbi), index);
	if (!page || !PageUptodate(page))
		readahead = true;
	/* NOTE(review): called even when page is NULL; relies on f2fs_put_page accepting NULL - confirm */
	f2fs_put_page(page, 0);

	if (readahead)
		f2fs_ra_meta_pages(sbi, index, ra_blocks, META_POR, true);
}

/*
 * Write one dirty meta page.
 *
 * Returns 0 once the page has been handed to f2fs_do_write_meta_page(), or
 * when it is dropped because of a checkpoint error with SBI_IS_CLOSE set.
 * Returns AOP_WRITEPAGE_ACTIVATE after redirtying the page when the write
 * must not happen now.
 */
static int __f2fs_write_meta_page(struct page *page,
				struct writeback_control *wbc,
				enum iostat_type io_type)
{
	struct f2fs_sb_info *sbi = F2FS_P_SB(page);
	struct folio *folio = page_folio(page);

	trace_f2fs_writepage(folio, META);

	if (unlikely(f2fs_cp_error(sbi))) {
		if (is_sbi_flag_set(sbi, SBI_IS_CLOSE)) {
			/* drop the page instead of writing it */
			folio_clear_uptodate(folio);
			dec_page_count(sbi, F2FS_DIRTY_META);
			folio_unlock(folio);
			return 0;
		}
		goto redirty_out;
	}
	if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING)))
		goto redirty_out;
	/* reclaim does not write pages located below GET_SUM_BLOCK(sbi, 0) */
	if (wbc->for_reclaim && folio->index < GET_SUM_BLOCK(sbi, 0))
		goto
redirty_out;

	f2fs_do_write_meta_page(sbi, folio, io_type);
	dec_page_count(sbi, F2FS_DIRTY_META);

	if (wbc->for_reclaim)
		f2fs_submit_merged_write_cond(sbi, NULL, page, 0, META);

	folio_unlock(folio);

	/* a checkpoint error is set: submit the merged META writes right away */
	if (unlikely(f2fs_cp_error(sbi)))
		f2fs_submit_merged_write(sbi, META);

	return 0;

redirty_out:
	redirty_page_for_writepage(wbc, page);
	return AOP_WRITEPAGE_ACTIVATE;
}

/* ->writepage for the meta mapping: write @page accounted as FS_META_IO. */
static int f2fs_write_meta_page(struct page *page,
				struct writeback_control *wbc)
{
	return __f2fs_write_meta_page(page, wbc, FS_META_IO);
}

/*
 * ->writepages for the meta mapping.
 *
 * Always returns 0. Writeback is skipped, and the dirty meta page count is
 * added to wbc->pages_skipped, while POR is in progress, when a
 * non-WB_SYNC_ALL request finds fewer dirty meta pages than
 * nr_pages_to_skip(), or when cp_global_sem cannot be taken without
 * blocking.
 */
static int f2fs_write_meta_pages(struct address_space *mapping,
				struct writeback_control *wbc)
{
	struct f2fs_sb_info *sbi = F2FS_M_SB(mapping);
	long diff, written;

	if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING)))
		goto skip_write;

	/* collect a number of dirty meta pages and write together */
	if (wbc->sync_mode != WB_SYNC_ALL &&
			get_pages(sbi, F2FS_DIRTY_META) <
					nr_pages_to_skip(sbi, META))
		goto skip_write;

	/* if locked failed, cp will flush dirty pages instead */
	if (!f2fs_down_write_trylock(&sbi->cp_global_sem))
		goto skip_write;

	trace_f2fs_writepages(mapping->host, wbc, META);
	diff = nr_pages_to_write(sbi, META, wbc);
	written = f2fs_sync_meta_pages(sbi, META, wbc->nr_to_write, FS_META_IO);
	f2fs_up_write(&sbi->cp_global_sem);
	/* charge what was written plus diff, never going below zero */
	wbc->nr_to_write = max((long)0, wbc->nr_to_write - written - diff);
	return 0;

skip_write:
	wbc->pages_skipped += get_pages(sbi, F2FS_DIRTY_META);
	trace_f2fs_writepages(mapping->host, wbc, META);
	return 0;
}

/*
 * Write back dirty folios of the meta mapping and return the number of pages
 * written.
 *
 * @type:        passed to f2fs_submit_merged_write() when anything was written
 * @nr_to_write: page budget; when it is not LONG_MAX the scan also stops at
 *               the first folio (other than the first of a batch) that is
 *               not contiguous with the previous one
 * @io_type:     iostat class forwarded to __f2fs_write_meta_page()
 */
long f2fs_sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type,
				long nr_to_write, enum iostat_type io_type)
{
	struct address_space *mapping = META_MAPPING(sbi);
	pgoff_t index = 0, prev = ULONG_MAX;
	struct folio_batch fbatch;
	long nwritten = 0;
	int nr_folios;
	struct writeback_control wbc = {
		.for_reclaim = 0,
	};
	struct blk_plug plug;

	folio_batch_init(&fbatch);

	blk_start_plug(&plug);

	while ((nr_folios = filemap_get_folios_tag(mapping, &index,
					(pgoff_t)-1,
					PAGECACHE_TAG_DIRTY, &fbatch))) {
		int i;

		for (i = 0; i < nr_folios; i++) {
			struct
folio *folio = fbatch.folios[i];

			/* bounded request: stop at the first non-contiguous folio */
			if (nr_to_write != LONG_MAX && i != 0 &&
					folio->index != prev +
					folio_nr_pages(fbatch.folios[i-1])) {
				folio_batch_release(&fbatch);
				goto stop;
			}

			folio_lock(folio);

			if (unlikely(folio->mapping != mapping)) {
continue_unlock:
				folio_unlock(folio);
				continue;
			}
			if (!folio_test_dirty(folio)) {
				/* someone wrote it for us */
				goto continue_unlock;
			}

			f2fs_wait_on_page_writeback(&folio->page, META,
					true, true);

			if (!folio_clear_dirty_for_io(folio))
				goto continue_unlock;

			/* non-zero means the folio was not written; it is unlocked here */
			if (__f2fs_write_meta_page(&folio->page, &wbc,
						io_type)) {
				folio_unlock(folio);
				break;
			}
			nwritten += folio_nr_pages(folio);
			prev = folio->index;
			if (unlikely(nwritten >= nr_to_write))
				break;
		}
		folio_batch_release(&fbatch);
		cond_resched();
	}
stop:
	if (nwritten)
		f2fs_submit_merged_write(sbi, type);

	blk_finish_plug(&plug);

	return nwritten;
}

/*
 * ->dirty_folio for the meta mapping. Marks the folio uptodate if needed and
 * returns true only when filemap_dirty_folio() newly dirtied it, in which
 * case F2FS_DIRTY_META is incremented.
 */
static bool f2fs_dirty_meta_folio(struct address_space *mapping,
		struct folio *folio)
{
	trace_f2fs_set_page_dirty(folio, META);

	if (!folio_test_uptodate(folio))
		folio_mark_uptodate(folio);
	if (filemap_dirty_folio(mapping, folio)) {
		inc_page_count(F2FS_M_SB(mapping), F2FS_DIRTY_META);
		set_page_private_reference(&folio->page);
		return true;
	}
	return false;
}

/* Address-space operations installed on the meta mapping. */
const struct address_space_operations f2fs_meta_aops = {
	.writepage	= f2fs_write_meta_page,
	.writepages	= f2fs_write_meta_pages,
	.dirty_folio	= f2fs_dirty_meta_folio,
	.invalidate_folio = f2fs_invalidate_folio,
	.release_folio	= f2fs_release_folio,
	.migrate_folio	= filemap_migrate_folio,
};

/*
 * Make sure an ino_entry for @ino exists in sbi->im[@type]; for FLUSH_INO
 * also set bit @devidx in the entry's dirty_device field.
 *
 * A new entry is allocated before im->ino_lock is taken. For FLUSH_INO a
 * lookup under rcu_read_lock() is done first so that no allocation happens
 * when the entry is already present.
 */
static void __add_ino_entry(struct f2fs_sb_info *sbi, nid_t ino,
						unsigned int devidx, int type)
{
	struct inode_management *im = &sbi->im[type];
	struct ino_entry *e = NULL, *new = NULL;

	if (type == FLUSH_INO) {
		rcu_read_lock();
		e = radix_tree_lookup(&im->ino_root, ino);
		rcu_read_unlock();
	}

retry:
	if (!e)
		new = f2fs_kmem_cache_alloc(ino_entry_slab,
						GFP_NOFS, true, NULL);

	radix_tree_preload(GFP_NOFS | __GFP_NOFAIL);

	spin_lock(&im->ino_lock);
	/* authoritative lookup under the lock */
	e = radix_tree_lookup(&im->ino_root, ino);
	if (!e) {
		if (!new) {
spin_unlock(&im->ino_lock);
			radix_tree_preload_end();
			/* e is NULL here, so the retry allocates an entry */
			goto retry;
		}
		e = new;
		if (unlikely(radix_tree_insert(&im->ino_root, ino, e)))
			f2fs_bug_on(sbi, 1);

		memset(e, 0, sizeof(struct ino_entry));
		e->ino = ino;

		list_add_tail(&e->list, &im->ino_list);
		/* ino_num is not bumped here for ORPHAN_INO */
		if (type != ORPHAN_INO)
			im->ino_num++;
	}

	if (type == FLUSH_INO)
		f2fs_set_bit(devidx, (char *)&e->dirty_device);

	spin_unlock(&im->ino_lock);
	radix_tree_preload_end();

	/* the preallocated entry was not needed after all */
	if (new && e != new)
		kmem_cache_free(ino_entry_slab, new);
}

/*
 * Remove the entry for @ino from sbi->im[@type], if present, and free it.
 * ino_num is decremented for every @type.
 */
static void __remove_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type)
{
	struct inode_management *im = &sbi->im[type];
	struct ino_entry *e;

	spin_lock(&im->ino_lock);
	e = radix_tree_lookup(&im->ino_root, ino);
	if (e) {
		list_del(&e->list);
		radix_tree_delete(&im->ino_root, ino);
		im->ino_num--;
		spin_unlock(&im->ino_lock);
		/* freed after the spinlock has been dropped */
		kmem_cache_free(ino_entry_slab, e);
		return;
	}
	spin_unlock(&im->ino_lock);
}

/* Record @ino in the @type list (device index 0). */
void f2fs_add_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type)
{
	/* add new dirty ino entry into list */
	__add_ino_entry(sbi, ino, 0, type);
}

/* Drop @ino from the @type list; a missing entry is not an error. */
void f2fs_remove_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type)
{
	/* remove dirty ino entry from list */
	__remove_ino_entry(sbi, ino, type);
}

/*
 * mode should be APPEND_INO, UPDATE_INO or TRANS_DIR_INO
 *
 * Returns true when an entry for @ino exists in sbi->im[@mode].
 */
bool f2fs_exist_written_data(struct f2fs_sb_info *sbi, nid_t ino, int mode)
{
	struct inode_management *im = &sbi->im[mode];
	struct ino_entry *e;

	spin_lock(&im->ino_lock);
	e = radix_tree_lookup(&im->ino_root, ino);
	spin_unlock(&im->ino_lock);
	return e ? true : false;
}

/*
 * Free every ino_entry of the lists from APPEND_INO up to MAX_INO_ENTRY;
 * with @all set the walk starts at ORPHAN_INO instead.
 */
void f2fs_release_ino_entry(struct f2fs_sb_info *sbi, bool all)
{
	struct ino_entry *e, *tmp;
	int i;

	for (i = all ?
ORPHAN_INO : APPEND_INO; i < MAX_INO_ENTRY; i++) {
		struct inode_management *im = &sbi->im[i];

		spin_lock(&im->ino_lock);
		list_for_each_entry_safe(e, tmp, &im->ino_list, list) {
			list_del(&e->list);
			radix_tree_delete(&im->ino_root, e->ino);
			kmem_cache_free(ino_entry_slab, e);
			im->ino_num--;
		}
		spin_unlock(&im->ino_lock);
	}
}

/* Set bit @devidx for @ino in the @type list, creating the entry if needed. */
void f2fs_set_dirty_device(struct f2fs_sb_info *sbi, nid_t ino,
					unsigned int devidx, int type)
{
	__add_ino_entry(sbi, ino, devidx, type);
}

/*
 * Returns true when @ino has an entry in the @type list and bit @devidx of
 * its dirty_device field is set.
 */
bool f2fs_is_dirty_device(struct f2fs_sb_info *sbi, nid_t ino,
					unsigned int devidx, int type)
{
	struct inode_management *im = &sbi->im[type];
	struct ino_entry *e;
	bool is_dirty = false;

	spin_lock(&im->ino_lock);
	e = radix_tree_lookup(&im->ino_root, ino);
	if (e && f2fs_test_bit(devidx, (char *)&e->dirty_device))
		is_dirty = true;
	spin_unlock(&im->ino_lock);
	return is_dirty;
}

/*
 * Reserve one slot in the orphan count.
 *
 * Returns 0 after incrementing ino_num, or -ENOSPC when ino_num has reached
 * sbi->max_orphans or the FAULT_ORPHAN injection point fires.
 */
int f2fs_acquire_orphan_inode(struct f2fs_sb_info *sbi)
{
	struct inode_management *im = &sbi->im[ORPHAN_INO];
	int err = 0;

	spin_lock(&im->ino_lock);

	if (time_to_inject(sbi, FAULT_ORPHAN)) {
		spin_unlock(&im->ino_lock);
		return -ENOSPC;
	}

	if (unlikely(im->ino_num >= sbi->max_orphans))
		err = -ENOSPC;
	else
		im->ino_num++;
	spin_unlock(&im->ino_lock);

	return err;
}

/* Give back a slot taken by f2fs_acquire_orphan_inode(). */
void f2fs_release_orphan_inode(struct f2fs_sb_info *sbi)
{
	struct inode_management *im = &sbi->im[ORPHAN_INO];

	spin_lock(&im->ino_lock);
	/* releasing with a zero count means acquire/release are unbalanced */
	f2fs_bug_on(sbi, im->ino_num == 0);
	im->ino_num--;
	spin_unlock(&im->ino_lock);
}

/*
 * Put @inode on the orphan list and update its inode page. ino_num is not
 * changed by this call (see the ORPHAN_INO case in __add_ino_entry()).
 */
void f2fs_add_orphan_inode(struct inode *inode)
{
	/* add new orphan ino entry into list */
	__add_ino_entry(F2FS_I_SB(inode), inode->i_ino, 0, ORPHAN_INO);
	f2fs_update_inode_page(inode);
}

/* Remove @ino from the orphan list; this also decrements ino_num. */
void f2fs_remove_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino)
{
	/* remove orphan entry from orphan list */
	__remove_ino_entry(sbi, ino, ORPHAN_INO);
}

/*
 * Recover a single orphan: load the inode, initialise quota, clear its link
 * count and drop the reference so that the final iput() truncates it.
 * Afterwards the node info for @ino must hold NULL_ADDR, otherwise -EIO is
 * returned. Returns 0 on success or a negative errno.
 */
static int recover_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino)
{
	struct inode *inode;
	struct node_info ni;
	int err;

	inode = f2fs_iget_retry(sbi->sb, ino);
	if (IS_ERR(inode)) {
		/*
		 * there should be a bug that we can't find the entry
* to orphan inode. */ f2fs_bug_on(sbi, PTR_ERR(inode) == -ENOENT); return PTR_ERR(inode); } err = f2fs_dquot_initialize(inode); if (err) { iput(inode); goto err_out; } clear_nlink(inode); /* truncate all the data during iput */ iput(inode); err = f2fs_get_node_info(sbi, ino, &ni, false); if (err) goto err_out; /* ENOMEM was fully retried in f2fs_evict_inode. */ if (ni.blk_addr != NULL_ADDR) { err = -EIO; goto err_out; } return 0; err_out: set_sbi_flag(sbi, SBI_NEED_FSCK); f2fs_warn(sbi, "%s: orphan failed (ino=%x), run fsck to fix.", __func__, ino); return err; } int f2fs_recover_orphan_inodes(struct f2fs_sb_info *sbi) { block_t start_blk, orphan_blocks, i, j; int err = 0; if (!is_set_ckpt_flags(sbi, CP_ORPHAN_PRESENT_FLAG)) return 0; if (f2fs_hw_is_readonly(sbi)) { f2fs_info(sbi, "write access unavailable, skipping orphan cleanup"); return 0; } if (is_sbi_flag_set(sbi, SBI_IS_WRITABLE)) f2fs_info(sbi, "orphan cleanup on readonly fs"); start_blk = __start_cp_addr(sbi) + 1 + __cp_payload(sbi); orphan_blocks = __start_sum_addr(sbi) - 1 - __cp_payload(sbi); f2fs_ra_meta_pages(sbi, start_blk, orphan_blocks, META_CP, true); for (i = 0; i < orphan_blocks; i++) { struct page *page; struct f2fs_orphan_block *orphan_blk; page = f2fs_get_meta_page(sbi, start_blk + i); if (IS_ERR(page)) { err = PTR_ERR(page); goto out; } orphan_blk = (struct f2fs_orphan_block *)page_address(page); for (j = 0; j < le32_to_cpu(orphan_blk->entry_count); j++) { nid_t ino = le32_to_cpu(orphan_blk->ino[j]); err = recover_orphan_inode(sbi, ino); if (err) { f2fs_put_page(page, 1); goto out; } } f2fs_put_page(page, 1); } /* clear Orphan Flag */ clear_ckpt_flags(sbi, CP_ORPHAN_PRESENT_FLAG); out: set_sbi_flag(sbi, SBI_IS_RECOVERED); return err; } static void write_orphan_inodes(struct f2fs_sb_info *sbi, block_t start_blk) { struct list_head *head; struct f2fs_orphan_block *orphan_blk = NULL; unsigned int nentries = 0; unsigned short index = 1; unsigned short orphan_blocks; struct page *page = NULL; 
struct ino_entry *orphan = NULL;
	struct inode_management *im = &sbi->im[ORPHAN_INO];

	orphan_blocks = GET_ORPHAN_BLOCKS(im->ino_num);

	/*
	 * we don't need to do spin_lock(&im->ino_lock) here, since all the
	 * orphan inode operations are covered under f2fs_lock_op().
	 * And, spin_lock should be avoided due to page operations below.
	 */
	head = &im->ino_list;

	/* loop for each orphan inode entry and write them in journal block */
	list_for_each_entry(orphan, head, list) {
		if (!page) {
			/* start a fresh, zeroed orphan block */
			page = f2fs_grab_meta_page(sbi, start_blk++);
			orphan_blk =
				(struct f2fs_orphan_block *)page_address(page);
			memset(orphan_blk, 0, sizeof(*orphan_blk));
		}

		orphan_blk->ino[nentries++] = cpu_to_le32(orphan->ino);

		if (nentries == F2FS_ORPHANS_PER_BLOCK) {
			/*
			 * an orphan block is full of 1020 entries,
			 * then we need to flush current orphan blocks
			 * and bring another one in memory
			 */
			orphan_blk->blk_addr = cpu_to_le16(index);
			orphan_blk->blk_count = cpu_to_le16(orphan_blocks);
			orphan_blk->entry_count = cpu_to_le32(nentries);
			set_page_dirty(page);
			f2fs_put_page(page, 1);
			index++;
			nentries = 0;
			page = NULL;
		}
	}

	/* finish the last, partially filled block */
	if (page) {
		orphan_blk->blk_addr = cpu_to_le16(index);
		orphan_blk->blk_count = cpu_to_le16(orphan_blocks);
		orphan_blk->entry_count = cpu_to_le32(nentries);
		set_page_dirty(page);
		f2fs_put_page(page, 1);
	}
}

/*
 * Compute the checksum of checkpoint block @ckpt.
 *
 * The bytes before checksum_offset are always covered. When the offset is
 * below CP_CHKSUM_OFFSET, the bytes after the checksum field up to
 * F2FS_BLKSIZE are folded in as well; the checksum field itself is skipped.
 */
static __u32 f2fs_checkpoint_chksum(struct f2fs_sb_info *sbi,
						struct f2fs_checkpoint *ckpt)
{
	unsigned int chksum_ofs = le32_to_cpu(ckpt->checksum_offset);
	__u32 chksum;

	chksum = f2fs_crc32(sbi, ckpt, chksum_ofs);
	if (chksum_ofs < CP_CHKSUM_OFFSET) {
		/* step over the stored checksum and continue to the block end */
		chksum_ofs += sizeof(chksum);
		chksum = f2fs_chksum(sbi, chksum, (__u8 *)ckpt + chksum_ofs,
						F2FS_BLKSIZE - chksum_ofs);
	}
	return chksum;
}

/*
 * Read the checkpoint block at @cp_addr, verify its checksum offset and
 * checksum, and report its version.
 *
 * On success returns 0 with *@cp_page holding the page, *@cp_block pointing
 * at its contents and *@version set. On failure returns a negative errno;
 * the page has already been released when the failure is -EINVAL.
 */
static int get_checkpoint_version(struct f2fs_sb_info *sbi, block_t cp_addr,
		struct f2fs_checkpoint **cp_block, struct page **cp_page,
		unsigned long long *version)
{
	size_t crc_offset = 0;
	__u32 crc;

	*cp_page = f2fs_get_meta_page(sbi, cp_addr);
	if (IS_ERR(*cp_page))
		return PTR_ERR(*cp_page);

	*cp_block = (struct
f2fs_checkpoint *)page_address(*cp_page);

	crc_offset = le32_to_cpu((*cp_block)->checksum_offset);
	if (crc_offset < CP_MIN_CHKSUM_OFFSET ||
			crc_offset > CP_CHKSUM_OFFSET) {
		f2fs_put_page(*cp_page, 1);
		f2fs_warn(sbi, "invalid crc_offset: %zu", crc_offset);
		return -EINVAL;
	}

	crc = f2fs_checkpoint_chksum(sbi, *cp_block);
	if (crc != cur_cp_crc(*cp_block)) {
		f2fs_put_page(*cp_page, 1);
		f2fs_warn(sbi, "invalid crc value");
		return -EINVAL;
	}

	*version = cur_cp_version(*cp_block);
	return 0;
}

/*
 * Validate the checkpoint pack that starts at @cp_addr.
 *
 * The first block of the pack and the block at
 * @cp_addr + cp_pack_total_block_count - 1 are both read and verified; the
 * pack is accepted only when both carry the same version.
 *
 * Returns the page of the first block with *@version set, or NULL when the
 * pack is invalid. The caller releases the returned page.
 */
static struct page *validate_checkpoint(struct f2fs_sb_info *sbi,
				block_t cp_addr, unsigned long long *version)
{
	struct page *cp_page_1 = NULL, *cp_page_2 = NULL;
	struct f2fs_checkpoint *cp_block = NULL;
	unsigned long long cur_version = 0, pre_version = 0;
	unsigned int cp_blocks;
	int err;

	err = get_checkpoint_version(sbi, cp_addr, &cp_block,
					&cp_page_1, version);
	if (err)
		return NULL;

	/* the pack must be larger than F2FS_CP_PACKS and fit in one segment */
	cp_blocks = le32_to_cpu(cp_block->cp_pack_total_block_count);

	if (cp_blocks > BLKS_PER_SEG(sbi) || cp_blocks <= F2FS_CP_PACKS) {
		f2fs_warn(sbi, "invalid cp_pack_total_block_count:%u",
			  le32_to_cpu(cp_block->cp_pack_total_block_count));
		goto invalid_cp;
	}
	pre_version = *version;

	/* move to the last block of the pack */
	cp_addr += cp_blocks - 1;
	err = get_checkpoint_version(sbi, cp_addr, &cp_block,
					&cp_page_2, version);
	if (err)
		goto invalid_cp;
	cur_version = *version;

	if (cur_version == pre_version) {
		*version = cur_version;
		f2fs_put_page(cp_page_2, 1);
		return cp_page_1;
	}
	f2fs_put_page(cp_page_2, 1);
invalid_cp:
	f2fs_put_page(cp_page_1, 1);
	return NULL;
}

/*
 * Locate the valid checkpoint pack and copy it, including any payload
 * blocks, into a newly allocated sbi->ckpt buffer. sbi->cur_cp_pack is set
 * to 1 or 2 according to the pack that was chosen.
 *
 * Returns 0 on success, -ENOMEM when the buffer cannot be allocated,
 * -EFSCORRUPTED when no pack is valid or the sanity check fails, or the
 * error from reading a payload block.
 */
int f2fs_get_valid_checkpoint(struct f2fs_sb_info *sbi)
{
	struct f2fs_checkpoint *cp_block;
	struct f2fs_super_block *fsb = sbi->raw_super;
	struct page *cp1, *cp2, *cur_page;
	unsigned long blk_size = sbi->blocksize;
	unsigned long long cp1_version = 0, cp2_version = 0;
	unsigned long long cp_start_blk_no;
	unsigned int cp_blks = 1 + __cp_payload(sbi);
	block_t cp_blk_no;
	int i;
	int err;

	sbi->ckpt = f2fs_kvzalloc(sbi, array_size(blk_size, cp_blks),
				  GFP_KERNEL);
	if (!sbi->ckpt)
		return -ENOMEM;
	/*
	 *
Finding out valid cp block involves read both
	 * sets( cp pack 1 and cp pack 2)
	 */
	cp_start_blk_no = le32_to_cpu(fsb->cp_blkaddr);
	cp1 = validate_checkpoint(sbi, cp_start_blk_no, &cp1_version);

	/* The second checkpoint pack should start at the next segment */
	cp_start_blk_no += ((unsigned long long)1) <<
				le32_to_cpu(fsb->log_blocks_per_seg);
	cp2 = validate_checkpoint(sbi, cp_start_blk_no, &cp2_version);

	/* both valid: cp2 wins only when ver_after(cp2, cp1) holds */
	if (cp1 && cp2) {
		if (ver_after(cp2_version, cp1_version))
			cur_page = cp2;
		else
			cur_page = cp1;
	} else if (cp1) {
		cur_page = cp1;
	} else if (cp2) {
		cur_page = cp2;
	} else {
		err = -EFSCORRUPTED;
		goto fail_no_cp;
	}

	cp_block = (struct f2fs_checkpoint *)page_address(cur_page);
	memcpy(sbi->ckpt, cp_block, blk_size);

	if (cur_page == cp1)
		sbi->cur_cp_pack = 1;
	else
		sbi->cur_cp_pack = 2;

	/* Sanity checking of checkpoint */
	if (f2fs_sanity_check_ckpt(sbi)) {
		err = -EFSCORRUPTED;
		goto free_fail_no_cp;
	}

	/* no payload blocks to copy */
	if (cp_blks <= 1)
		goto done;

	cp_blk_no = le32_to_cpu(fsb->cp_blkaddr);
	if (cur_page == cp2)
		cp_blk_no += BIT(le32_to_cpu(fsb->log_blocks_per_seg));

	/* copy payload block i of the chosen pack to offset i * blk_size */
	for (i = 1; i < cp_blks; i++) {
		void *sit_bitmap_ptr;
		unsigned char *ckpt = (unsigned char *)sbi->ckpt;

		cur_page = f2fs_get_meta_page(sbi, cp_blk_no + i);
		if (IS_ERR(cur_page)) {
			err = PTR_ERR(cur_page);
			goto free_fail_no_cp;
		}
		sit_bitmap_ptr = page_address(cur_page);
		memcpy(ckpt + i * blk_size, sit_bitmap_ptr, blk_size);
		f2fs_put_page(cur_page, 1);
	}
done:
	/* NOTE(review): cp1 or cp2 may be NULL here; relies on f2fs_put_page accepting NULL - confirm */
	f2fs_put_page(cp1, 1);
	f2fs_put_page(cp2, 1);
	return 0;

free_fail_no_cp:
	f2fs_put_page(cp1, 1);
	f2fs_put_page(cp2, 1);
fail_no_cp:
	kvfree(sbi->ckpt);
	return err;
}

/*
 * Put @inode on the sbi->inode_list[@type] dirty list, unless its dirty flag
 * (FI_DIRTY_DIR or FI_DIRTY_FILE) is already set.
 */
static void __add_dirty_inode(struct inode *inode, enum inode_type type)
{
	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
	int flag = (type == DIR_INODE) ?
FI_DIRTY_DIR : FI_DIRTY_FILE;

	if (is_inode_flag_set(inode, flag))
		return;

	set_inode_flag(inode, flag);
	list_add_tail(&F2FS_I(inode)->dirty_list, &sbi->inode_list[type]);
	stat_inc_dirty_inode(sbi, type);
}

/*
 * Take @inode off the dirty list for @type. Nothing happens while the inode
 * still has dirty pages or when its dirty flag is not set.
 */
static void __remove_dirty_inode(struct inode *inode, enum inode_type type)
{
	int flag = (type == DIR_INODE) ? FI_DIRTY_DIR : FI_DIRTY_FILE;

	if (get_dirty_pages(inode) || !is_inode_flag_set(inode, flag))
		return;

	list_del_init(&F2FS_I(inode)->dirty_list);
	clear_inode_flag(inode, flag);
	stat_dec_dirty_inode(F2FS_I_SB(inode), type);
}

/*
 * Account a newly dirtied folio of @inode.
 *
 * Only directories, regular files and symlinks are handled. The inode's
 * dirty page count is always incremented; the inode is put on the dirty
 * list for directories, and for other inodes only with the DATA_FLUSH mount
 * option.
 */
void f2fs_update_dirty_folio(struct inode *inode, struct folio *folio)
{
	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
	enum inode_type type = S_ISDIR(inode->i_mode) ? DIR_INODE : FILE_INODE;

	if (!S_ISDIR(inode->i_mode) && !S_ISREG(inode->i_mode) &&
			!S_ISLNK(inode->i_mode))
		return;

	spin_lock(&sbi->inode_lock[type]);
	if (type != FILE_INODE || test_opt(sbi, DATA_FLUSH))
		__add_dirty_inode(inode, type);
	inode_inc_dirty_pages(inode);
	spin_unlock(&sbi->inode_lock[type]);

	set_page_private_reference(&folio->page);
}

/*
 * Counterpart of the list insertion in f2fs_update_dirty_folio(): remove
 * @inode from its dirty list under the same conditions that put it there.
 */
void f2fs_remove_dirty_inode(struct inode *inode)
{
	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
	enum inode_type type = S_ISDIR(inode->i_mode) ? DIR_INODE : FILE_INODE;

	if (!S_ISDIR(inode->i_mode) && !S_ISREG(inode->i_mode) &&
			!S_ISLNK(inode->i_mode))
		return;

	/* file inodes are only listed when DATA_FLUSH is enabled */
	if (type == FILE_INODE && !test_opt(sbi, DATA_FLUSH))
		return;

	spin_lock(&sbi->inode_lock[type]);
	__remove_dirty_inode(inode, type);
	spin_unlock(&sbi->inode_lock[type]);
}

/*
 * Write back the data of every inode on the @type dirty list.
 *
 * Loops over the head of the list until it is empty (returns 0) or a
 * checkpoint error is seen (returns -EIO). @from_cp additionally records the
 * current task in the inode's cp_task field for the duration of the write.
 */
int f2fs_sync_dirty_inodes(struct f2fs_sb_info *sbi, enum inode_type type,
						bool from_cp)
{
	struct list_head *head;
	struct inode *inode;
	struct f2fs_inode_info *fi;
	bool is_dir = (type == DIR_INODE);
	unsigned long ino = 0;

	trace_f2fs_sync_dirty_inodes_enter(sbi->sb, is_dir,
				get_pages(sbi, is_dir ?
				F2FS_DIRTY_DENTS : F2FS_DIRTY_DATA));
retry:
	if (unlikely(f2fs_cp_error(sbi))) {
		trace_f2fs_sync_dirty_inodes_exit(sbi->sb, is_dir,
				get_pages(sbi, is_dir ?
F2FS_DIRTY_DENTS : F2FS_DIRTY_DATA));
		return -EIO;
	}

	spin_lock(&sbi->inode_lock[type]);

	head = &sbi->inode_list[type];
	if (list_empty(head)) {
		spin_unlock(&sbi->inode_lock[type]);
		trace_f2fs_sync_dirty_inodes_exit(sbi->sb, is_dir,
				get_pages(sbi, is_dir ?
				F2FS_DIRTY_DENTS : F2FS_DIRTY_DATA));
		return 0;
	}
	fi = list_first_entry(head, struct f2fs_inode_info, dirty_list);
	/* take a reference before the list lock is dropped */
	inode = igrab(&fi->vfs_inode);
	spin_unlock(&sbi->inode_lock[type]);
	if (inode) {
		unsigned long cur_ino = inode->i_ino;

		/* record the writing task in the inode for the duration of the write */
		if (from_cp)
			F2FS_I(inode)->cp_task = current;
		F2FS_I(inode)->wb_task = current;

		filemap_fdatawrite(inode->i_mapping);

		F2FS_I(inode)->wb_task = NULL;
		if (from_cp)
			F2FS_I(inode)->cp_task = NULL;

		iput(inode);
		/* We need to give cpu to another writers. */
		if (ino == cur_ino)
			cond_resched();
		else
			ino = cur_ino;
	} else {
		/*
		 * We should submit bio, since it exists several
		 * writebacking dentry pages in the freeing inode.
		 */
		f2fs_submit_merged_write(sbi, DATA);
		cond_resched();
	}
	goto retry;
}

/*
 * Sync the metadata of inodes on the DIRTY_META list.
 *
 * At most as many iterations are made as there were F2FS_DIRTY_IMETA pages
 * on entry. Returns 0, or -EIO when a checkpoint error is seen.
 */
static int f2fs_sync_inode_meta(struct f2fs_sb_info *sbi)
{
	struct list_head *head = &sbi->inode_list[DIRTY_META];
	struct inode *inode;
	struct f2fs_inode_info *fi;
	s64 total = get_pages(sbi, F2FS_DIRTY_IMETA);

	while (total--) {
		if (unlikely(f2fs_cp_error(sbi)))
			return -EIO;

		spin_lock(&sbi->inode_lock[DIRTY_META]);
		if (list_empty(head)) {
			spin_unlock(&sbi->inode_lock[DIRTY_META]);
			return 0;
		}
		fi = list_first_entry(head, struct f2fs_inode_info,
							gdirty_list);
		inode = igrab(&fi->vfs_inode);
		spin_unlock(&sbi->inode_lock[DIRTY_META]);
		if (inode) {
			sync_inode_metadata(inode, 0);

			/* it's on eviction */
			if (is_inode_flag_set(inode, FI_DIRTY_INODE))
				f2fs_update_inode_page(inode);
			iput(inode);
		}
	}
	return 0;
}

/*
 * Copy the current block, node and inode counts and the next free nid into
 * the in-memory checkpoint block, then reset the per-checkpoint allocation
 * counters.
 */
static void __prepare_cp_block(struct f2fs_sb_info *sbi)
{
	struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
	struct f2fs_nm_info *nm_i = NM_I(sbi);
	nid_t last_nid = nm_i->next_scan_nid;

	next_free_nid(sbi, &last_nid);
	ckpt->valid_block_count = cpu_to_le64(valid_user_blocks(sbi));
	ckpt->valid_node_count =
cpu_to_le32(valid_node_count(sbi));
	ckpt->valid_inode_count = cpu_to_le32(valid_inode_count(sbi));
	ckpt->next_free_nid = cpu_to_le32(last_nid);

	/* update user_block_counts */
	sbi->last_valid_block_count = sbi->total_valid_block_count;
	percpu_counter_set(&sbi->alloc_valid_block_count, 0);
	percpu_counter_set(&sbi->rf_node_block_count, 0);
}

/*
 * Decide whether quota data must be flushed before the checkpoint proceeds.
 *
 * Returns false without journalled quota. Returns true when quota_sem cannot
 * be taken without blocking. Otherwise the flags are checked in priority
 * order: SKIP_FLUSH and NEED_REPAIR give false; NEED_FLUSH gives true and is
 * cleared; dirty quota data pages give true.
 */
static bool __need_flush_quota(struct f2fs_sb_info *sbi)
{
	bool ret = false;

	if (!is_journalled_quota(sbi))
		return false;

	if (!f2fs_down_write_trylock(&sbi->quota_sem))
		return true;
	if (is_sbi_flag_set(sbi, SBI_QUOTA_SKIP_FLUSH)) {
		ret = false;
	} else if (is_sbi_flag_set(sbi, SBI_QUOTA_NEED_REPAIR)) {
		ret = false;
	} else if (is_sbi_flag_set(sbi, SBI_QUOTA_NEED_FLUSH)) {
		clear_sbi_flag(sbi, SBI_QUOTA_NEED_FLUSH);
		ret = true;
	} else if (get_pages(sbi, F2FS_DIRTY_QDATA)) {
		ret = true;
	}
	f2fs_up_write(&sbi->quota_sem);
	return ret;
}

/*
 * Freeze all the FS-operations for checkpoint.
 *
 * Flushes quota data, dirty dentry pages, dirty inode metadata and dirty
 * node pages, restarting after each flush. On success returns 0 with
 * f2fs_lock_all() and sbi->node_write still held (unblock_operations()
 * releases both). On error the locks taken so far have been dropped.
 */
static int block_operations(struct f2fs_sb_info *sbi)
{
	struct writeback_control wbc = {
		.sync_mode = WB_SYNC_ALL,
		.nr_to_write = LONG_MAX,
		.for_reclaim = 0,
	};
	int err = 0, cnt = 0;

	/*
	 * Let's flush inline_data in dirty node pages.
*/
	f2fs_flush_inline_data(sbi);

retry_flush_quotas:
	f2fs_lock_all(sbi);
	if (__need_flush_quota(sbi)) {
		int locked;

		/*
		 * Too many retries: stop flushing quota for this checkpoint and
		 * leave NEED_FLUSH set.
		 */
		if (++cnt > DEFAULT_RETRY_QUOTA_FLUSH_COUNT) {
			set_sbi_flag(sbi, SBI_QUOTA_SKIP_FLUSH);
			set_sbi_flag(sbi, SBI_QUOTA_NEED_FLUSH);
			goto retry_flush_dents;
		}
		f2fs_unlock_all(sbi);

		/* only failed during mount/umount/freeze/quotactl */
		locked = down_read_trylock(&sbi->sb->s_umount);
		f2fs_quota_sync(sbi->sb, -1);
		if (locked)
			up_read(&sbi->sb->s_umount);
		cond_resched();
		goto retry_flush_quotas;
	}

retry_flush_dents:
	/* write all the dirty dentry pages */
	if (get_pages(sbi, F2FS_DIRTY_DENTS)) {
		/* writeback runs without the global lock; restart afterwards */
		f2fs_unlock_all(sbi);
		err = f2fs_sync_dirty_inodes(sbi, DIR_INODE, true);
		if (err)
			return err;
		cond_resched();
		goto retry_flush_quotas;
	}

	/*
	 * POR: we should ensure that there are no dirty node pages
	 * until finishing nat/sit flush. inode->i_blocks can be updated.
	 */
	f2fs_down_write(&sbi->node_change);

	if (get_pages(sbi, F2FS_DIRTY_IMETA)) {
		f2fs_up_write(&sbi->node_change);
		f2fs_unlock_all(sbi);
		err = f2fs_sync_inode_meta(sbi);
		if (err)
			return err;
		cond_resched();
		goto retry_flush_quotas;
	}

retry_flush_nodes:
	f2fs_down_write(&sbi->node_write);

	if (get_pages(sbi, F2FS_DIRTY_NODES)) {
		/* node_write is dropped for the writeback and retaken on retry */
		f2fs_up_write(&sbi->node_write);
		atomic_inc(&sbi->wb_sync_req[NODE]);
		err = f2fs_sync_node_pages(sbi, &wbc, false, FS_CP_NODE_IO);
		atomic_dec(&sbi->wb_sync_req[NODE]);
		if (err) {
			f2fs_up_write(&sbi->node_change);
			f2fs_unlock_all(sbi);
			return err;
		}
		cond_resched();
		goto retry_flush_nodes;
	}

	/*
	 * sbi->node_change is used only for AIO write_begin path which produces
	 * dirty node blocks and some checkpoint values by block allocation.
*/
	__prepare_cp_block(sbi);
	f2fs_up_write(&sbi->node_change);
	/* node_write and the f2fs_lock_all() lock stay held for the caller */
	return err;
}

/* Release the locks left held by a successful block_operations(). */
static void unblock_operations(struct f2fs_sb_info *sbi)
{
	f2fs_up_write(&sbi->node_write);
	f2fs_unlock_all(sbi);
}

/*
 * Wait until the page counter @type drops to zero.
 *
 * The wait is abandoned on a checkpoint error unless SBI_IS_CLOSE is set.
 * Each round first pushes pending work: dirty meta pages are synced for
 * F2FS_DIRTY_META, and merged DATA writes are submitted for F2FS_WB_CP_DATA.
 */
void f2fs_wait_on_all_pages(struct f2fs_sb_info *sbi, int type)
{
	DEFINE_WAIT(wait);

	for (;;) {
		if (!get_pages(sbi, type))
			break;

		if (unlikely(f2fs_cp_error(sbi) &&
			!is_sbi_flag_set(sbi, SBI_IS_CLOSE)))
			break;

		if (type == F2FS_DIRTY_META)
			f2fs_sync_meta_pages(sbi, META, LONG_MAX,
							FS_CP_META_IO);
		else if (type == F2FS_WB_CP_DATA)
			f2fs_submit_merged_write(sbi, DATA);

		/* sleep on cp_wait, re-checking after DEFAULT_IO_TIMEOUT at the latest */
		prepare_to_wait(&sbi->cp_wait, &wait, TASK_UNINTERRUPTIBLE);
		io_schedule_timeout(DEFAULT_IO_TIMEOUT);
	}
	finish_wait(&sbi->cp_wait, &wait);
}

/*
 * Bring the flags of the in-memory checkpoint block in line with the
 * checkpoint reason in @cpc and the current sbi flags.
 *
 * The nat_bits decision for CP_UMOUNT is made before cp_lock is taken; all
 * other flag updates happen under cp_lock with interrupts disabled.
 */
static void update_ckpt_flags(struct f2fs_sb_info *sbi, struct cp_control *cpc)
{
	unsigned long orphan_num = sbi->im[ORPHAN_INO].ino_num;
	struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
	unsigned long flags;

	if (cpc->reason & CP_UMOUNT) {
		/* nat_bits are dropped when pack + nat_bits blocks exceed a segment */
		if (le32_to_cpu(ckpt->cp_pack_total_block_count) +
			NM_I(sbi)->nat_bits_blocks > BLKS_PER_SEG(sbi)) {
			clear_ckpt_flags(sbi, CP_NAT_BITS_FLAG);
			f2fs_notice(sbi, "Disable nat_bits due to no space");
		} else if (!is_set_ckpt_flags(sbi, CP_NAT_BITS_FLAG) &&
						f2fs_nat_bitmap_enabled(sbi)) {
			f2fs_enable_nat_bits(sbi);
			set_ckpt_flags(sbi, CP_NAT_BITS_FLAG);
			f2fs_notice(sbi, "Rebuild and enable nat_bits");
		}
	}

	spin_lock_irqsave(&sbi->cp_lock, flags);

	if (cpc->reason & CP_TRIMMED)
		__set_ckpt_flags(ckpt, CP_TRIMMED_FLAG);
	else
		__clear_ckpt_flags(ckpt, CP_TRIMMED_FLAG);

	if (cpc->reason & CP_UMOUNT)
		__set_ckpt_flags(ckpt, CP_UMOUNT_FLAG);
	else
		__clear_ckpt_flags(ckpt, CP_UMOUNT_FLAG);

	if (cpc->reason & CP_FASTBOOT)
		__set_ckpt_flags(ckpt, CP_FASTBOOT_FLAG);
	else
		__clear_ckpt_flags(ckpt, CP_FASTBOOT_FLAG);

	if (orphan_num)
		__set_ckpt_flags(ckpt, CP_ORPHAN_PRESENT_FLAG);
	else
		__clear_ckpt_flags(ckpt, CP_ORPHAN_PRESENT_FLAG);

	/* CP_FSCK_FLAG is only ever set here, never cleared */
	if (is_sbi_flag_set(sbi, SBI_NEED_FSCK))
		__set_ckpt_flags(ckpt, CP_FSCK_FLAG);

	if (is_sbi_flag_set(sbi, SBI_IS_RESIZEFS))
__set_ckpt_flags(ckpt, CP_RESIZEFS_FLAG);
	else
		__clear_ckpt_flags(ckpt, CP_RESIZEFS_FLAG);

	if (is_sbi_flag_set(sbi, SBI_CP_DISABLED))
		__set_ckpt_flags(ckpt, CP_DISABLED_FLAG);
	else
		__clear_ckpt_flags(ckpt, CP_DISABLED_FLAG);

	if (is_sbi_flag_set(sbi, SBI_CP_DISABLED_QUICK))
		__set_ckpt_flags(ckpt, CP_DISABLED_QUICK_FLAG);
	else
		__clear_ckpt_flags(ckpt, CP_DISABLED_QUICK_FLAG);

	if (is_sbi_flag_set(sbi, SBI_QUOTA_SKIP_FLUSH))
		__set_ckpt_flags(ckpt, CP_QUOTA_NEED_FSCK_FLAG);
	else
		__clear_ckpt_flags(ckpt, CP_QUOTA_NEED_FSCK_FLAG);

	/* NEED_REPAIR sets the same flag again, whatever SKIP_FLUSH decided above */
	if (is_sbi_flag_set(sbi, SBI_QUOTA_NEED_REPAIR))
		__set_ckpt_flags(ckpt, CP_QUOTA_NEED_FSCK_FLAG);

	/* set this flag to activate crc|cp_ver for recovery */
	__set_ckpt_flags(ckpt, CP_CRC_RECOVERY_FLAG);
	__clear_ckpt_flags(ckpt, CP_NOCRC_RECOVERY_FLAG);

	spin_unlock_irqrestore(&sbi->cp_lock, flags);
}

/*
 * Write the final checkpoint block: copy PAGE_SIZE bytes from @src into the
 * meta page at @blk_addr, write that page out directly and then submit the
 * merged META_FLUSH write.
 */
static void commit_checkpoint(struct f2fs_sb_info *sbi,
	void *src, block_t blk_addr)
{
	struct writeback_control wbc = {
		.for_reclaim = 0,
	};

	/*
	 * filemap_get_folios_tag and lock_page again will take
	 * some extra time. Therefore, f2fs_update_meta_pages and
	 * f2fs_sync_meta_pages are combined in this function.
*/
	struct page *page = f2fs_grab_meta_page(sbi, blk_addr);
	int err;

	f2fs_wait_on_page_writeback(page, META, true, true);

	memcpy(page_address(page), src, PAGE_SIZE);

	set_page_dirty(page);
	/* the page was dirtied one line above, so clearing must succeed */
	if (unlikely(!clear_page_dirty_for_io(page)))
		f2fs_bug_on(sbi, 1);

	/* writeout cp pack 2 page */
	err = __f2fs_write_meta_page(page, &wbc, FS_CP_META_IO);
	if (unlikely(err && f2fs_cp_error(sbi))) {
		/* refused because of a checkpoint error: unlock, release, give up */
		f2fs_put_page(page, 1);
		return;
	}

	/* any other failure of the write is treated as a bug */
	f2fs_bug_on(sbi, err);
	f2fs_put_page(page, 0);

	/* submit checkpoint (with barrier if NOBARRIER is not set) */
	f2fs_submit_merged_write(sbi, META_FLUSH);
}

/* Read the STAT_WRITE sector counter of @bdev. */
static inline u64 get_sectors_written(struct block_device *bdev)
{
	return (u64)part_stat_read(bdev, sectors[STAT_WRITE]);
}

/*
 * Return the number of sectors written: the sum over all s_ndevs devices
 * for a multi-device filesystem, otherwise the counter of sb->s_bdev.
 */
u64 f2fs_get_sectors_written(struct f2fs_sb_info *sbi)
{
	if (f2fs_is_multi_device(sbi)) {
		u64 sectors = 0;
		int i;

		for (i = 0; i < sbi->s_ndevs; i++)
			sectors += get_sectors_written(FDEV(i).bdev);

		return sectors;
	}

	return get_sectors_written(sbi->sb->s_bdev);
}

/*
 * Fill in the in-memory checkpoint block and write the next checkpoint
 * pack: checkpoint block and payload, orphan blocks, data summaries and,
 * when kept, node summaries, followed by the final checkpoint block written
 * by commit_checkpoint().
 *
 * Returns 0 on success, the error from f2fs_flush_device_cache(), or -EIO
 * when a checkpoint error is set at the end.
 */
static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
{
	struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
	struct f2fs_nm_info *nm_i = NM_I(sbi);
	unsigned long orphan_num = sbi->im[ORPHAN_INO].ino_num, flags;
	block_t start_blk;
	unsigned int data_sum_blocks, orphan_blocks;
	__u32 crc32 = 0;
	int i;
	int cp_payload_blks = __cp_payload(sbi);
	struct curseg_info *seg_i = CURSEG_I(sbi, CURSEG_HOT_NODE);
	u64 kbytes_written;
	int err;

	/* Flush all the NAT/SIT pages */
	f2fs_sync_meta_pages(sbi, META, LONG_MAX, FS_CP_META_IO);

	/* start to update checkpoint, cp ver is already updated previously */
	ckpt->elapsed_time = cpu_to_le64(get_mtime(sbi, true));
	ckpt->free_segment_count = cpu_to_le32(free_segments(sbi));
	/* record segment number, block offset and alloc type of each node log */
	for (i = 0; i < NR_CURSEG_NODE_TYPE; i++) {
		struct curseg_info *curseg = CURSEG_I(sbi, i + CURSEG_HOT_NODE);

		ckpt->cur_node_segno[i] = cpu_to_le32(curseg->segno);
		ckpt->cur_node_blkoff[i] = cpu_to_le16(curseg->next_blkoff);
		ckpt->alloc_type[i + CURSEG_HOT_NODE] = curseg->alloc_type;
	}
	for (i = 0; i <
NR_CURSEG_DATA_TYPE; i++) {
		struct curseg_info *curseg = CURSEG_I(sbi, i + CURSEG_HOT_DATA);

		ckpt->cur_data_segno[i] = cpu_to_le32(curseg->segno);
		ckpt->cur_data_blkoff[i] = cpu_to_le16(curseg->next_blkoff);
		ckpt->alloc_type[i + CURSEG_HOT_DATA] = curseg->alloc_type;
	}

	/* 2 cp + n data seg summary + orphan inode blocks */
	data_sum_blocks = f2fs_npages_for_summary_flush(sbi, false);
	spin_lock_irqsave(&sbi->cp_lock, flags);
	/* fewer summary pages than data logs: flag the summaries as compact */
	if (data_sum_blocks < NR_CURSEG_DATA_TYPE)
		__set_ckpt_flags(ckpt, CP_COMPACT_SUM_FLAG);
	else
		__clear_ckpt_flags(ckpt, CP_COMPACT_SUM_FLAG);
	spin_unlock_irqrestore(&sbi->cp_lock, flags);

	orphan_blocks = GET_ORPHAN_BLOCKS(orphan_num);
	/* summaries start after the cp block, its payload and the orphan blocks */
	ckpt->cp_pack_start_sum = cpu_to_le32(1 + cp_payload_blks +
			orphan_blocks);

	/* node summaries add NR_CURSEG_NODE_TYPE blocks when they are kept */
	if (__remain_node_summaries(cpc->reason))
		ckpt->cp_pack_total_block_count = cpu_to_le32(F2FS_CP_PACKS +
				cp_payload_blks + data_sum_blocks +
				orphan_blocks + NR_CURSEG_NODE_TYPE);
	else
		ckpt->cp_pack_total_block_count = cpu_to_le32(F2FS_CP_PACKS +
				cp_payload_blks + data_sum_blocks +
				orphan_blocks);

	/* update ckpt flag for checkpoint */
	update_ckpt_flags(sbi, cpc);

	/* update SIT/NAT bitmap */
	get_sit_bitmap(sbi, __bitmap_ptr(sbi, SIT_BITMAP));
	get_nat_bitmap(sbi, __bitmap_ptr(sbi, NAT_BITMAP));

	/* store the checksum at checksum_offset inside the checkpoint block */
	crc32 = f2fs_checkpoint_chksum(sbi, ckpt);
	*((__le32 *)((unsigned char *)ckpt +
				le32_to_cpu(ckpt->checksum_offset)))
				= cpu_to_le32(crc32);

	start_blk = __start_cp_next_addr(sbi);

	/* write nat bits */
	if ((cpc->reason & CP_UMOUNT) &&
			is_set_ckpt_flags(sbi, CP_NAT_BITS_FLAG)) {
		__u64 cp_ver = cur_cp_version(ckpt);
		block_t blk;

		/* the first 8 bytes of nat_bits hold crc32 << 32 | cp version */
		cp_ver |= ((__u64)crc32 << 32);
		*(__le64 *)nm_i->nat_bits = cpu_to_le64(cp_ver);

		/* nat_bits occupy the last nat_bits_blocks blocks of the segment */
		blk = start_blk + BLKS_PER_SEG(sbi) - nm_i->nat_bits_blocks;
		for (i = 0; i < nm_i->nat_bits_blocks; i++)
			f2fs_update_meta_page(sbi, nm_i->nat_bits +
					F2FS_BLK_TO_BYTES(i), blk + i);
	}

	/* write out checkpoint buffer at block 0 */
	f2fs_update_meta_page(sbi, ckpt, start_blk++);

	/* followed by the payload blocks of the checkpoint buffer */
	for (i = 1; i < 1 + cp_payload_blks; i++)
		f2fs_update_meta_page(sbi, (char *)ckpt + i
* F2FS_BLKSIZE,
							start_blk++);

	if (orphan_num) {
		write_orphan_inodes(sbi, start_blk);
		start_blk += orphan_blocks;
	}

	f2fs_write_data_summaries(sbi, start_blk);
	start_blk += data_sum_blocks;

	/* Record write statistics in the hot node summary */
	kbytes_written = sbi->kbytes_written;
	/* sectors >> 1 gives KiB, assuming 512-byte sectors - TODO confirm */
	kbytes_written += (f2fs_get_sectors_written(sbi) -
				sbi->sectors_written_start) >> 1;
	seg_i->journal->info.kbytes_written = cpu_to_le64(kbytes_written);

	if (__remain_node_summaries(cpc->reason)) {
		f2fs_write_node_summaries(sbi, start_blk);
		start_blk += NR_CURSEG_NODE_TYPE;
	}

	/* Here, we have one bio having CP pack except cp pack 2 page */
	f2fs_sync_meta_pages(sbi, META, LONG_MAX, FS_CP_META_IO);
	/* Wait for all dirty meta pages to be submitted for IO */
	f2fs_wait_on_all_pages(sbi, F2FS_DIRTY_META);

	/* wait for previous submitted meta pages writeback */
	f2fs_wait_on_all_pages(sbi, F2FS_WB_CP_DATA);

	/* flush all device cache */
	err = f2fs_flush_device_cache(sbi);
	if (err)
		return err;

	/* barrier and flush checkpoint cp pack 2 page if it can */
	commit_checkpoint(sbi, ckpt, start_blk);
	f2fs_wait_on_all_pages(sbi, F2FS_WB_CP_DATA);

	/*
	 * invalidate intermediate page cache borrowed from meta inode which are
	 * used for migration of encrypted, verity or compressed inode's blocks.
	 */
	if (f2fs_sb_has_encrypt(sbi) || f2fs_sb_has_verity(sbi) ||
		f2fs_sb_has_compression(sbi))
		f2fs_bug_on(sbi,
			invalidate_inode_pages2_range(META_MAPPING(sbi),
				MAIN_BLKADDR(sbi), MAX_BLKADDR(sbi) - 1));

	/* the checkpoint is on disk: drop the ino lists other than ORPHAN_INO */
	f2fs_release_ino_entry(sbi, false);

	f2fs_reset_fsync_node_info(sbi);

	clear_sbi_flag(sbi, SBI_IS_DIRTY);
	clear_sbi_flag(sbi, SBI_NEED_CP);
	clear_sbi_flag(sbi, SBI_QUOTA_SKIP_FLUSH);

	spin_lock(&sbi->stat_lock);
	sbi->unusable_block_count = 0;
	spin_unlock(&sbi->stat_lock);

	__set_cp_next_pack(sbi);

	/*
	 * redirty superblock if metadata like node page or inode cache is
	 * updated during writing checkpoint.
*/
	if (get_pages(sbi, F2FS_DIRTY_NODES) ||
			get_pages(sbi, F2FS_DIRTY_IMETA))
		set_sbi_flag(sbi, SBI_IS_DIRTY);

	f2fs_bug_on(sbi, get_pages(sbi, F2FS_DIRTY_DENTS));

	return unlikely(f2fs_cp_error(sbi)) ? -EIO : 0;
}

/*
 * f2fs_write_checkpoint - write one checkpoint for @sbi
 * @sbi: filesystem instance
 * @cpc: checkpoint control; cpc->reason selects the behaviour below
 *
 * Returns -EROFS when the superblock or the device is read-only, and 0
 * without doing anything when checkpointing is disabled and the reason is
 * not CP_PAUSE.  Otherwise cp_global_sem is held for write for the rest of
 * the call, except when the reason is exactly CP_RESIZE.
 *
 * Returns 0 on success or when there was nothing to do, -EIO when a
 * checkpoint error is already recorded, or the error returned by
 * block_operations(), f2fs_flush_nat_entries() or do_checkpoint().
 */
int f2fs_write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
{
	struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
	unsigned long long ckpt_ver;
	int err = 0;

	if (f2fs_readonly(sbi->sb) || f2fs_hw_is_readonly(sbi))
		return -EROFS;

	if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED))) {
		if (cpc->reason != CP_PAUSE)
			return 0;
		f2fs_warn(sbi, "Start checkpoint disabled!");
	}
	if (cpc->reason != CP_RESIZE)
		f2fs_down_write(&sbi->cp_global_sem);

	/*
	 * Nothing is dirty: fastboot and sync requests can return right away,
	 * and so can a discard request when there are no discard blocks.
	 */
	if (!is_sbi_flag_set(sbi, SBI_IS_DIRTY) &&
		((cpc->reason & CP_FASTBOOT) || (cpc->reason & CP_SYNC) ||
		((cpc->reason & CP_DISCARD) && !sbi->discard_blks)))
		goto out;
	if (unlikely(f2fs_cp_error(sbi))) {
		err = -EIO;
		goto out;
	}

	trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, "start block_ops");

	/* on success this is undone by unblock_operations() on every path below */
	err = block_operations(sbi);
	if (err)
		goto out;

	trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, "finish block_ops");

	f2fs_flush_merged_writes(sbi);

	/* this is the case of multiple fstrims without any changes */
	if (cpc->reason & CP_DISCARD) {
		if (!f2fs_exist_trim_candidates(sbi, cpc)) {
			unblock_operations(sbi);
			goto out;
		}

		if (NM_I(sbi)->nat_cnt[DIRTY_NAT] == 0 &&
				SIT_I(sbi)->dirty_sentries == 0 &&
				prefree_segments(sbi) == 0) {
			f2fs_flush_sit_entries(sbi, cpc);
			f2fs_clear_prefree_segments(sbi, cpc);
			unblock_operations(sbi);
			goto out;
		}
	}

	/*
	 * update checkpoint pack index
	 * Increase the version number so that
	 * SIT entries and seg summaries are written at correct place
	 */
	ckpt_ver = cur_cp_version(ckpt);
	ckpt->checkpoint_ver = cpu_to_le64(++ckpt_ver);

	/* write cached NAT/SIT entries to NAT/SIT area */
	err = f2fs_flush_nat_entries(sbi, cpc);
	if (err) {
		f2fs_err(sbi, "f2fs_flush_nat_entries failed err:%d, stop checkpoint", err);
		f2fs_bug_on(sbi, !f2fs_cp_error(sbi));
		goto stop;
	}

	f2fs_flush_sit_entries(sbi, cpc);

	/* save inmem log status */
	f2fs_save_inmem_curseg(sbi);

	err = do_checkpoint(sbi, cpc);
	if (err) {
		f2fs_err(sbi, "do_checkpoint failed err:%d, stop checkpoint", err);
		f2fs_bug_on(sbi, !f2fs_cp_error(sbi));
		f2fs_release_discard_addrs(sbi);
	} else {
		f2fs_clear_prefree_segments(sbi, cpc);
	}

	f2fs_restore_inmem_curseg(sbi);
	f2fs_reinit_atgc_curseg(sbi);
	stat_inc_cp_count(sbi);
stop:
	unblock_operations(sbi);

	if (cpc->reason & CP_RECOVERY)
		f2fs_notice(sbi, "checkpoint: version = %llx", ckpt_ver);

	/* update CP_TIME to trigger checkpoint periodically */
	f2fs_update_time(sbi, CP_TIME);
	trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, "finish checkpoint");
out:
	if (cpc->reason != CP_RESIZE)
		f2fs_up_write(&sbi->cp_global_sem);
	return err;
}

/*
 * Initialise every inode_management slot in sbi->im[] (radix tree root,
 * spinlock, list head, zero count) and compute sbi->max_orphans: the blocks
 * of one segment minus F2FS_CP_PACKS, NR_CURSEG_PERSIST_TYPE and the cp
 * payload, multiplied by F2FS_ORPHANS_PER_BLOCK.
 */
void f2fs_init_ino_entry_info(struct f2fs_sb_info *sbi)
{
	int i;

	for (i = 0; i < MAX_INO_ENTRY; i++) {
		struct inode_management *im = &sbi->im[i];

		INIT_RADIX_TREE(&im->ino_root, GFP_ATOMIC);
		spin_lock_init(&im->ino_lock);
		INIT_LIST_HEAD(&im->ino_list);
		im->ino_num = 0;
	}

	sbi->max_orphans = (BLKS_PER_SEG(sbi) - F2FS_CP_PACKS -
			NR_CURSEG_PERSIST_TYPE - __cp_payload(sbi)) *
			F2FS_ORPHANS_PER_BLOCK;
}

/*
 * Create the "f2fs_ino_entry" and "f2fs_inode_entry" slab caches.
 * Returns 0 on success or -ENOMEM; if the second cache cannot be created
 * the first one is destroyed again.
 */
int __init f2fs_create_checkpoint_caches(void)
{
	ino_entry_slab = f2fs_kmem_cache_create("f2fs_ino_entry",
			sizeof(struct ino_entry));
	if (!ino_entry_slab)
		return -ENOMEM;
	f2fs_inode_entry_slab = f2fs_kmem_cache_create("f2fs_inode_entry",
			sizeof(struct inode_entry));
	if (!f2fs_inode_entry_slab) {
		kmem_cache_destroy(ino_entry_slab);
		return -ENOMEM;
	}
	return 0;
}

/* Destroy both slab caches created by f2fs_create_checkpoint_caches(). */
void f2fs_destroy_checkpoint_caches(void)
{
	kmem_cache_destroy(ino_entry_slab);
	kmem_cache_destroy(f2fs_inode_entry_slab);
}

/*
 * Run one CP_SYNC checkpoint with gc_lock held for write and return the
 * result of f2fs_write_checkpoint().
 */
static int __write_checkpoint_sync(struct f2fs_sb_info *sbi)
{
	struct cp_control cpc = { .reason = CP_SYNC, };
	int err;

	f2fs_down_write(&sbi->gc_lock);
	err = f2fs_write_checkpoint(sbi, &cpc);
	f2fs_up_write(&sbi->gc_lock);

	return err;
}

/*
 * Detach every request currently queued on cprc->issue_list, run a single
 * synchronous checkpoint on their behalf, then store its result in each
 * request and complete it.  Afterwards the queued/total counters are
 * adjusted and cur_time/peak_time are updated under stat_lock; cur_time is
 * the mean, over this batch, of the milliseconds between a request's
 * queue_time and the moment it was completed here.
 */
static void __checkpoint_and_complete_reqs(struct f2fs_sb_info *sbi)
{
	struct ckpt_req_control *cprc = &sbi->cprc_info;
	struct ckpt_req *req, *next;
	struct llist_node *dispatch_list;
	u64 sum_diff = 0, diff, count = 0;
	int ret;

	dispatch_list = llist_del_all(&cprc->issue_list);
	if (!dispatch_list)
		return;
	dispatch_list = llist_reverse_order(dispatch_list);

	ret = __write_checkpoint_sync(sbi);
	atomic_inc(&cprc->issued_ckpt);

	llist_for_each_entry_safe(req, next, dispatch_list, llnode) {
		/* sample the latency before complete(); req is not touched after it */
		diff = (u64)ktime_ms_delta(ktime_get(), req->queue_time);
		req->ret = ret;
		complete(&req->wait);

		sum_diff += diff;
		count++;
	}
	atomic_sub(count, &cprc->queued_ckpt);
	atomic_add(count, &cprc->total_ckpt);

	spin_lock(&cprc->stat_lock);
	/* dispatch_list was non-empty above, so count is expected to be non-zero */
	cprc->cur_time = (unsigned int)div64_u64(sum_diff, count);
	if (cprc->peak_time < cprc->cur_time)
		cprc->peak_time = cprc->cur_time;
	spin_unlock(&cprc->stat_lock);
}

/*
 * Thread function of the checkpoint thread (@data is the f2fs_sb_info).
 * Loops until kthread_should_stop(): services issue_list when it is not
 * empty, then sleeps on ckpt_wait_queue until there is work or a stop
 * request.  Returns 0.
 */
static int issue_checkpoint_thread(void *data)
{
	struct f2fs_sb_info *sbi = data;
	struct ckpt_req_control *cprc = &sbi->cprc_info;
	wait_queue_head_t *q = &cprc->ckpt_wait_queue;
repeat:
	if (kthread_should_stop())
		return 0;

	if (!llist_empty(&cprc->issue_list))
		__checkpoint_and_complete_reqs(sbi);

	wait_event_interruptible(*q,
		kthread_should_stop() || !llist_empty(&cprc->issue_list));
	goto repeat;
}

/*
 * If requests are still queued, service them in the calling context.
 * Otherwise, when @wait_req is non-NULL, wait for that request to be
 * completed.  @wait_req may be NULL.
 */
static void flush_remained_ckpt_reqs(struct f2fs_sb_info *sbi,
		struct ckpt_req *wait_req)
{
	struct ckpt_req_control *cprc = &sbi->cprc_info;

	if (!llist_empty(&cprc->issue_list)) {
		__checkpoint_and_complete_reqs(sbi);
	} else {
		/* already dispatched by issue_checkpoint_thread */
		if (wait_req)
			wait_for_completion(&wait_req->wait);
	}
}

/* Zero @req, initialise its completion and stamp queue_time with ktime_get(). */
static void init_ckpt_req(struct ckpt_req *req)
{
	memset(req, 0, sizeof(struct ckpt_req));

	init_completion(&req->wait);
	req->queue_time = ktime_get();
}

/*
 * f2fs_issue_checkpoint - request a checkpoint and wait for its result
 *
 * The checkpoint is written directly, under gc_lock, when the
 * MERGE_CHECKPOINT option is off or the computed reason is not CP_SYNC, and
 * via __write_checkpoint_sync() when no checkpoint thread exists.
 * Otherwise an on-stack request is queued on issue_list and the caller
 * blocks until it has been completed.
 *
 * Returns the checkpoint result.
 */
int f2fs_issue_checkpoint(struct f2fs_sb_info *sbi)
{
	struct ckpt_req_control *cprc = &sbi->cprc_info;
	struct ckpt_req req;
	struct cp_control cpc;

	cpc.reason = __get_cp_reason(sbi);
	if (!test_opt(sbi, MERGE_CHECKPOINT) || cpc.reason != CP_SYNC) {
		int ret;

		f2fs_down_write(&sbi->gc_lock);
		ret = f2fs_write_checkpoint(sbi, &cpc);
		f2fs_up_write(&sbi->gc_lock);

		return ret;
	}

	if (!cprc->f2fs_issue_ckpt)
		return __write_checkpoint_sync(sbi);

	init_ckpt_req(&req);

	llist_add(&req.llnode, &cprc->issue_list);
	atomic_inc(&cprc->queued_ckpt);

	/*
	 * update issue_list before we wake up issue_checkpoint thread,
	 * this smp_mb() pairs with another barrier in ___wait_event(),
	 * see more details in comments of waitqueue_active().
	 */
	smp_mb();

	if (waitqueue_active(&cprc->ckpt_wait_queue))
		wake_up(&cprc->ckpt_wait_queue);

	/*
	 * The thread pointer is read again here; f2fs_stop_ckpt_thread()
	 * sets it to NULL, in which case the queued requests are flushed
	 * from this context instead of waiting on the thread.
	 */
	if (cprc->f2fs_issue_ckpt)
		wait_for_completion(&req.wait);
	else
		flush_remained_ckpt_reqs(sbi, &req);

	return req.ret;
}

/*
 * Start the checkpoint thread, named "f2fs_ckpt-<major>:<minor>" after the
 * backing block device, and apply cprc->ckpt_thread_ioprio to it.
 * Returns 0 when the thread already exists or was started, otherwise the
 * error from kthread_run() (the stored pointer is reset to NULL).
 */
int f2fs_start_ckpt_thread(struct f2fs_sb_info *sbi)
{
	dev_t dev = sbi->sb->s_bdev->bd_dev;
	struct ckpt_req_control *cprc = &sbi->cprc_info;

	if (cprc->f2fs_issue_ckpt)
		return 0;

	cprc->f2fs_issue_ckpt = kthread_run(issue_checkpoint_thread, sbi,
			"f2fs_ckpt-%u:%u", MAJOR(dev), MINOR(dev));
	if (IS_ERR(cprc->f2fs_issue_ckpt)) {
		int err = PTR_ERR(cprc->f2fs_issue_ckpt);

		cprc->f2fs_issue_ckpt = NULL;
		return err;
	}

	set_task_ioprio(cprc->f2fs_issue_ckpt, cprc->ckpt_thread_ioprio);

	return 0;
}

/*
 * Stop the checkpoint thread if there is one.  The stored pointer is
 * cleared before kthread_stop() is called, and any requests left behind
 * are flushed afterwards.
 */
void f2fs_stop_ckpt_thread(struct f2fs_sb_info *sbi)
{
	struct ckpt_req_control *cprc = &sbi->cprc_info;
	struct task_struct *ckpt_task;

	if (!cprc->f2fs_issue_ckpt)
		return;

	ckpt_task = cprc->f2fs_issue_ckpt;
	cprc->f2fs_issue_ckpt = NULL;
	kthread_stop(ckpt_task);

	f2fs_flush_ckpt_thread(sbi);
}

/*
 * Service any requests still on issue_list, then poll, sleeping
 * DEFAULT_IO_TIMEOUT per round, until queued_ckpt has dropped to zero.
 */
void f2fs_flush_ckpt_thread(struct f2fs_sb_info *sbi)
{
	struct ckpt_req_control *cprc = &sbi->cprc_info;

	flush_remained_ckpt_reqs(sbi, NULL);

	/* Let's wait for the previous dispatched checkpoint. */
	while (atomic_read(&cprc->queued_ckpt))
		io_schedule_timeout(DEFAULT_IO_TIMEOUT);
}

/*
 * Reset the checkpoint request bookkeeping in sbi->cprc_info: zero the
 * three counters, set the default thread I/O priority and initialise the
 * wait queue, the lock-less request list and the statistics spinlock.
 */
void f2fs_init_ckpt_req_control(struct f2fs_sb_info *sbi)
{
	struct ckpt_req_control *cprc = &sbi->cprc_info;

	atomic_set(&cprc->issued_ckpt, 0);
	atomic_set(&cprc->total_ckpt, 0);
	atomic_set(&cprc->queued_ckpt, 0);
	cprc->ckpt_thread_ioprio = DEFAULT_CHECKPOINT_IOPRIO;
	init_waitqueue_head(&cprc->ckpt_wait_queue);
	init_llist_head(&cprc->issue_list);
	spin_lock_init(&cprc->stat_lock);
}
1 1 758 758 759 760 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 
523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 
1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 
1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 
1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 
2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 
2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 // SPDX-License-Identifier: GPL-2.0 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/mm.h> #include <linux/sched.h> #include <linux/sched/mm.h> #include <linux/mmu_notifier.h> #include <linux/rmap.h> #include <linux/swap.h> #include <linux/mm_inline.h> #include <linux/kthread.h> #include <linux/khugepaged.h> #include <linux/freezer.h> #include <linux/mman.h> #include <linux/hashtable.h> #include <linux/userfaultfd_k.h> #include <linux/page_idle.h> #include <linux/page_table_check.h> #include <linux/rcupdate_wait.h> #include <linux/swapops.h> #include <linux/shmem_fs.h> #include <linux/dax.h> #include <linux/ksm.h> #include <asm/tlb.h> #include <asm/pgalloc.h> #include "internal.h" #include "mm_slot.h" enum scan_result { SCAN_FAIL, SCAN_SUCCEED, SCAN_PMD_NULL, SCAN_PMD_NONE, SCAN_PMD_MAPPED, SCAN_EXCEED_NONE_PTE, 
SCAN_EXCEED_SWAP_PTE, SCAN_EXCEED_SHARED_PTE, SCAN_PTE_NON_PRESENT, SCAN_PTE_UFFD_WP, SCAN_PTE_MAPPED_HUGEPAGE, SCAN_PAGE_RO, SCAN_LACK_REFERENCED_PAGE, SCAN_PAGE_NULL, SCAN_SCAN_ABORT, SCAN_PAGE_COUNT, SCAN_PAGE_LRU, SCAN_PAGE_LOCK, SCAN_PAGE_ANON, SCAN_PAGE_COMPOUND, SCAN_ANY_PROCESS, SCAN_VMA_NULL, SCAN_VMA_CHECK, SCAN_ADDRESS_RANGE, SCAN_DEL_PAGE_LRU, SCAN_ALLOC_HUGE_PAGE_FAIL, SCAN_CGROUP_CHARGE_FAIL, SCAN_TRUNCATED, SCAN_PAGE_HAS_PRIVATE, SCAN_STORE_FAILED, SCAN_COPY_MC, SCAN_PAGE_FILLED, }; #define CREATE_TRACE_POINTS #include <trace/events/huge_memory.h> static struct task_struct *khugepaged_thread __read_mostly; static DEFINE_MUTEX(khugepaged_mutex); /* default scan 8*512 pte (or vmas) every 30 second */ static unsigned int khugepaged_pages_to_scan __read_mostly; static unsigned int khugepaged_pages_collapsed; static unsigned int khugepaged_full_scans; static unsigned int khugepaged_scan_sleep_millisecs __read_mostly = 10000; /* during fragmentation poll the hugepage allocator once every minute */ static unsigned int khugepaged_alloc_sleep_millisecs __read_mostly = 60000; static unsigned long khugepaged_sleep_expire; static DEFINE_SPINLOCK(khugepaged_mm_lock); static DECLARE_WAIT_QUEUE_HEAD(khugepaged_wait); /* * default collapse hugepages if there is at least one pte mapped like * it would have happened if the vma was large enough during page * fault. * * Note that these are only respected if collapse was initiated by khugepaged. 
*/ unsigned int khugepaged_max_ptes_none __read_mostly; static unsigned int khugepaged_max_ptes_swap __read_mostly; static unsigned int khugepaged_max_ptes_shared __read_mostly; #define MM_SLOTS_HASH_BITS 10 static DEFINE_READ_MOSTLY_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS); static struct kmem_cache *mm_slot_cache __ro_after_init; struct collapse_control { bool is_khugepaged; /* Num pages scanned per node */ u32 node_load[MAX_NUMNODES]; /* nodemask for allocation fallback */ nodemask_t alloc_nmask; }; /** * struct khugepaged_mm_slot - khugepaged information per mm that is being scanned * @slot: hash lookup from mm to mm_slot */ struct khugepaged_mm_slot { struct mm_slot slot; }; /** * struct khugepaged_scan - cursor for scanning * @mm_head: the head of the mm list to scan * @mm_slot: the current mm_slot we are scanning * @address: the next address inside that to be scanned * * There is only the one khugepaged_scan instance of this cursor structure. */ struct khugepaged_scan { struct list_head mm_head; struct khugepaged_mm_slot *mm_slot; unsigned long address; }; static struct khugepaged_scan khugepaged_scan = { .mm_head = LIST_HEAD_INIT(khugepaged_scan.mm_head), }; #ifdef CONFIG_SYSFS static ssize_t scan_sleep_millisecs_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%u\n", khugepaged_scan_sleep_millisecs); } static ssize_t scan_sleep_millisecs_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { unsigned int msecs; int err; err = kstrtouint(buf, 10, &msecs); if (err) return -EINVAL; khugepaged_scan_sleep_millisecs = msecs; khugepaged_sleep_expire = 0; wake_up_interruptible(&khugepaged_wait); return count; } static struct kobj_attribute scan_sleep_millisecs_attr = __ATTR_RW(scan_sleep_millisecs); static ssize_t alloc_sleep_millisecs_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%u\n", khugepaged_alloc_sleep_millisecs); } static 
ssize_t alloc_sleep_millisecs_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { unsigned int msecs; int err; err = kstrtouint(buf, 10, &msecs); if (err) return -EINVAL; khugepaged_alloc_sleep_millisecs = msecs; khugepaged_sleep_expire = 0; wake_up_interruptible(&khugepaged_wait); return count; } static struct kobj_attribute alloc_sleep_millisecs_attr = __ATTR_RW(alloc_sleep_millisecs); static ssize_t pages_to_scan_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%u\n", khugepaged_pages_to_scan); } static ssize_t pages_to_scan_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { unsigned int pages; int err; err = kstrtouint(buf, 10, &pages); if (err || !pages) return -EINVAL; khugepaged_pages_to_scan = pages; return count; } static struct kobj_attribute pages_to_scan_attr = __ATTR_RW(pages_to_scan); static ssize_t pages_collapsed_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%u\n", khugepaged_pages_collapsed); } static struct kobj_attribute pages_collapsed_attr = __ATTR_RO(pages_collapsed); static ssize_t full_scans_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%u\n", khugepaged_full_scans); } static struct kobj_attribute full_scans_attr = __ATTR_RO(full_scans); static ssize_t defrag_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return single_hugepage_flag_show(kobj, attr, buf, TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG); } static ssize_t defrag_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { return single_hugepage_flag_store(kobj, attr, buf, count, TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG); } static struct kobj_attribute khugepaged_defrag_attr = __ATTR_RW(defrag); /* * max_ptes_none controls if khugepaged should collapse hugepages over * any unmapped ptes in turn potentially increasing 
the memory * footprint of the vmas. When max_ptes_none is 0 khugepaged will not * reduce the available free memory in the system as it * runs. Increasing max_ptes_none will instead potentially reduce the * free memory in the system during the khugepaged scan. */ static ssize_t max_ptes_none_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%u\n", khugepaged_max_ptes_none); } static ssize_t max_ptes_none_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { int err; unsigned long max_ptes_none; err = kstrtoul(buf, 10, &max_ptes_none); if (err || max_ptes_none > HPAGE_PMD_NR - 1) return -EINVAL; khugepaged_max_ptes_none = max_ptes_none; return count; } static struct kobj_attribute khugepaged_max_ptes_none_attr = __ATTR_RW(max_ptes_none); static ssize_t max_ptes_swap_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%u\n", khugepaged_max_ptes_swap); } static ssize_t max_ptes_swap_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { int err; unsigned long max_ptes_swap; err = kstrtoul(buf, 10, &max_ptes_swap); if (err || max_ptes_swap > HPAGE_PMD_NR - 1) return -EINVAL; khugepaged_max_ptes_swap = max_ptes_swap; return count; } static struct kobj_attribute khugepaged_max_ptes_swap_attr = __ATTR_RW(max_ptes_swap); static ssize_t max_ptes_shared_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%u\n", khugepaged_max_ptes_shared); } static ssize_t max_ptes_shared_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { int err; unsigned long max_ptes_shared; err = kstrtoul(buf, 10, &max_ptes_shared); if (err || max_ptes_shared > HPAGE_PMD_NR - 1) return -EINVAL; khugepaged_max_ptes_shared = max_ptes_shared; return count; } static struct kobj_attribute khugepaged_max_ptes_shared_attr = __ATTR_RW(max_ptes_shared); static struct attribute 
*khugepaged_attr[] = { &khugepaged_defrag_attr.attr, &khugepaged_max_ptes_none_attr.attr, &khugepaged_max_ptes_swap_attr.attr, &khugepaged_max_ptes_shared_attr.attr, &pages_to_scan_attr.attr, &pages_collapsed_attr.attr, &full_scans_attr.attr, &scan_sleep_millisecs_attr.attr, &alloc_sleep_millisecs_attr.attr, NULL, }; struct attribute_group khugepaged_attr_group = { .attrs = khugepaged_attr, .name = "khugepaged", }; #endif /* CONFIG_SYSFS */ int hugepage_madvise(struct vm_area_struct *vma, unsigned long *vm_flags, int advice) { switch (advice) { case MADV_HUGEPAGE: #ifdef CONFIG_S390 /* * qemu blindly sets MADV_HUGEPAGE on all allocations, but s390 * can't handle this properly after s390_enable_sie, so we simply * ignore the madvise to prevent qemu from causing a SIGSEGV. */ if (mm_has_pgste(vma->vm_mm)) return 0; #endif *vm_flags &= ~VM_NOHUGEPAGE; *vm_flags |= VM_HUGEPAGE; /* * If the vma become good for khugepaged to scan, * register it here without waiting a page fault that * may not happen any time soon. */ khugepaged_enter_vma(vma, *vm_flags); break; case MADV_NOHUGEPAGE: *vm_flags &= ~VM_HUGEPAGE; *vm_flags |= VM_NOHUGEPAGE; /* * Setting VM_NOHUGEPAGE will prevent khugepaged from scanning * this vma even if we leave the mm registered in khugepaged if * it got registered before VM_NOHUGEPAGE was set. 
*/ break; } return 0; } int __init khugepaged_init(void) { mm_slot_cache = KMEM_CACHE(khugepaged_mm_slot, 0); if (!mm_slot_cache) return -ENOMEM; khugepaged_pages_to_scan = HPAGE_PMD_NR * 8; khugepaged_max_ptes_none = HPAGE_PMD_NR - 1; khugepaged_max_ptes_swap = HPAGE_PMD_NR / 8; khugepaged_max_ptes_shared = HPAGE_PMD_NR / 2; return 0; } void __init khugepaged_destroy(void) { kmem_cache_destroy(mm_slot_cache); } static inline int hpage_collapse_test_exit(struct mm_struct *mm) { return atomic_read(&mm->mm_users) == 0; } static inline int hpage_collapse_test_exit_or_disable(struct mm_struct *mm) { return hpage_collapse_test_exit(mm) || test_bit(MMF_DISABLE_THP, &mm->flags); } static bool hugepage_pmd_enabled(void) { /* * We cover the anon, shmem and the file-backed case here; file-backed * hugepages, when configured in, are determined by the global control. * Anon pmd-sized hugepages are determined by the pmd-size control. * Shmem pmd-sized hugepages are also determined by its pmd-size control, * except when the global shmem_huge is set to SHMEM_HUGE_DENY. 
*/
	if (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) &&
	    hugepage_global_enabled())
		return true;
	if (test_bit(PMD_ORDER, &huge_anon_orders_always))
		return true;
	if (test_bit(PMD_ORDER, &huge_anon_orders_madvise))
		return true;
	if (test_bit(PMD_ORDER, &huge_anon_orders_inherit) &&
	    hugepage_global_enabled())
		return true;
	if (IS_ENABLED(CONFIG_SHMEM) && shmem_hpage_pmd_enabled())
		return true;
	return false;
}

/*
 * Register @mm with khugepaged: set MMF_VM_HUGEPAGE, allocate an mm_slot,
 * hash it and queue it at the tail of khugepaged_scan.mm_head, then take an
 * mm reference with mmgrab(). khugepaged is woken only if the list was empty
 * before the insertion.
 *
 * Returns silently if the flag was already set or the slot allocation fails.
 */
void __khugepaged_enter(struct mm_struct *mm)
{
	struct khugepaged_mm_slot *mm_slot;
	struct mm_slot *slot;
	int wakeup;

	/* __khugepaged_exit() must not run from under us */
	VM_BUG_ON_MM(hpage_collapse_test_exit(mm), mm);
	if (unlikely(test_and_set_bit(MMF_VM_HUGEPAGE, &mm->flags)))
		return;

	mm_slot = mm_slot_alloc(mm_slot_cache);
	if (!mm_slot)
		return;

	slot = &mm_slot->slot;

	spin_lock(&khugepaged_mm_lock);
	mm_slot_insert(mm_slots_hash, mm, slot);
	/*
	 * Insert just behind the scanning cursor, to let the area settle
	 * down a little.
	 */
	wakeup = list_empty(&khugepaged_scan.mm_head);
	list_add_tail(&slot->mm_node, &khugepaged_scan.mm_head);
	spin_unlock(&khugepaged_mm_lock);

	mmgrab(mm);
	if (wakeup)
		wake_up_interruptible(&khugepaged_wait);
}

/*
 * Register vma->vm_mm with khugepaged when it is not registered yet,
 * PMD-sized THP is enabled, and a PMD order is allowable for @vma under
 * @vm_flags with sysfs settings enforced.
 */
void khugepaged_enter_vma(struct vm_area_struct *vma,
			  unsigned long vm_flags)
{
	if (!test_bit(MMF_VM_HUGEPAGE, &vma->vm_mm->flags) &&
	    hugepage_pmd_enabled()) {
		if (thp_vma_allowable_order(vma, vm_flags, TVA_ENFORCE_SYSFS,
					    PMD_ORDER))
			__khugepaged_enter(vma->vm_mm);
	}
}

/*
 * Unregister @mm from khugepaged. If the slot exists and is not the one
 * khugepaged is currently scanning, it is unhashed, unlinked and freed and
 * the mm reference is dropped. Otherwise, if a slot exists, the function
 * cycles mmap_lock in write mode (see the comment in the else branch).
 */
void __khugepaged_exit(struct mm_struct *mm)
{
	struct khugepaged_mm_slot *mm_slot;
	struct mm_slot *slot;
	int free = 0;

	spin_lock(&khugepaged_mm_lock);
	slot = mm_slot_lookup(mm_slots_hash, mm);
	mm_slot = mm_slot_entry(slot, struct khugepaged_mm_slot, slot);
	if (mm_slot && khugepaged_scan.mm_slot != mm_slot) {
		hash_del(&slot->hash);
		list_del(&slot->mm_node);
		free = 1;
	}
	spin_unlock(&khugepaged_mm_lock);

	if (free) {
		clear_bit(MMF_VM_HUGEPAGE, &mm->flags);
		mm_slot_free(mm_slot_cache, mm_slot);
		mmdrop(mm);
	} else if (mm_slot) {
		/*
		 * This is required to serialize against
		 *
hpage_collapse_test_exit() (which is guaranteed to run
		 * under mmap sem read mode). Stop here (after we return all
		 * pagetables will be destroyed) until khugepaged has finished
		 * working on the pagetables under the mmap_lock.
		 */
		mmap_write_lock(mm);
		mmap_write_unlock(mm);
	}
}

/*
 * Undo the isolation of one folio: subtract its pages from the
 * NR_ISOLATED_ANON/FILE node counter, unlock it and put it back on the LRU.
 */
static void release_pte_folio(struct folio *folio)
{
	node_stat_mod_folio(folio,
			NR_ISOLATED_ANON + folio_is_file_lru(folio),
			-folio_nr_pages(folio));
	folio_unlock(folio);
	folio_putback_lru(folio);
}

/*
 * Release the folios referenced by the PTEs in [@pte, @_pte), walking
 * backwards from @_pte. None and zero-pfn entries are skipped. Large folios
 * are skipped in the PTE walk and released exactly once through
 * @compound_pagelist, which is emptied.
 */
static void release_pte_pages(pte_t *pte, pte_t *_pte,
		struct list_head *compound_pagelist)
{
	struct folio *folio, *tmp;

	while (--_pte >= pte) {
		pte_t pteval = ptep_get(_pte);
		unsigned long pfn;

		if (pte_none(pteval))
			continue;
		pfn = pte_pfn(pteval);
		if (is_zero_pfn(pfn))
			continue;
		folio = pfn_folio(pfn);
		if (folio_test_large(folio))
			continue;
		release_pte_folio(folio);
	}

	list_for_each_entry_safe(folio, tmp, compound_pagelist, lru) {
		list_del(&folio->lru);
		release_pte_folio(folio);
	}
}

/*
 * Compare the folio's reference count against what is expected from its
 * mapcount, plus one reference per page when it is not anon or is in the
 * swap cache, plus one when it has private data. Returns true on an exact
 * match.
 */
static bool is_refcount_suitable(struct folio *folio)
{
	int expected_refcount = folio_mapcount(folio);

	if (!folio_test_anon(folio) || folio_test_swapcache(folio))
		expected_refcount += folio_nr_pages(folio);

	if (folio_test_private(folio))
		expected_refcount++;

	return folio_ref_count(folio) == expected_refcount;
}

/*
 * Walk the HPAGE_PMD_NR PTEs starting at @pte and lock + LRU-isolate every
 * mapped folio. Large folios are isolated once and remembered on
 * @compound_pagelist.
 *
 * Returns SCAN_SUCCEED when every PTE passed and at least one was writable
 * (and, for khugepaged, at least one was referenced). On any failure the
 * folios isolated so far are released again and a SCAN_* reason is returned.
 */
static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
					unsigned long address,
					pte_t *pte,
					struct collapse_control *cc,
					struct list_head *compound_pagelist)
{
	struct page *page = NULL;
	struct folio *folio = NULL;
	pte_t *_pte;
	int none_or_zero = 0, shared = 0, result = SCAN_FAIL, referenced = 0;
	bool writable = false;

	for (_pte = pte; _pte < pte + HPAGE_PMD_NR;
	     _pte++, address += PAGE_SIZE) {
		pte_t pteval = ptep_get(_pte);
		if (pte_none(pteval) || (pte_present(pteval) &&
				is_zero_pfn(pte_pfn(pteval)))) {
			++none_or_zero;
			if (!userfaultfd_armed(vma) &&
			    (!cc->is_khugepaged ||
			     none_or_zero <= khugepaged_max_ptes_none)) {
				continue;
			} else {
				result = SCAN_EXCEED_NONE_PTE;
				count_vm_event(THP_SCAN_EXCEED_NONE_PTE);
				goto out;
			}
		}
		if (!pte_present(pteval)) {
			result = SCAN_PTE_NON_PRESENT;
			goto out;
		}
		if (pte_uffd_wp(pteval)) {
			result = SCAN_PTE_UFFD_WP;
			goto out;
		}
		page = vm_normal_page(vma, address, pteval);
		if (unlikely(!page) || unlikely(is_zone_device_page(page))) {
			result = SCAN_PAGE_NULL;
			goto out;
		}

		folio = page_folio(page);
		VM_BUG_ON_FOLIO(!folio_test_anon(folio), folio);

		/* See hpage_collapse_scan_pmd(). */
		if (folio_likely_mapped_shared(folio)) {
			++shared;
			if (cc->is_khugepaged &&
			    shared > khugepaged_max_ptes_shared) {
				result = SCAN_EXCEED_SHARED_PTE;
				count_vm_event(THP_SCAN_EXCEED_SHARED_PTE);
				goto out;
			}
		}

		if (folio_test_large(folio)) {
			struct folio *f;

			/*
			 * Check if we have dealt with the compound page
			 * already
			 */
			list_for_each_entry(f, compound_pagelist, lru) {
				if (folio == f)
					goto next;
			}
		}

		/*
		 * We can do it before folio_isolate_lru because the
		 * folio can't be freed from under us. NOTE: PG_lock
		 * is needed to serialize against split_huge_page
		 * when invoked from the VM.
		 */
		if (!folio_trylock(folio)) {
			result = SCAN_PAGE_LOCK;
			goto out;
		}

		/*
		 * Check if the page has any GUP (or other external) pins.
		 *
		 * The page table that maps the page has been already unlinked
		 * from the page table tree and this process cannot get
		 * an additional pin on the page.
		 *
		 * New pins can come later if the page is shared across fork,
		 * but not from this process. The other process cannot write to
		 * the page, only trigger CoW.
		 */
		if (!is_refcount_suitable(folio)) {
			folio_unlock(folio);
			result = SCAN_PAGE_COUNT;
			goto out;
		}

		/*
		 * Isolate the page to avoid collapsing an hugepage
		 * currently in use by the VM.
		 */
		if (!folio_isolate_lru(folio)) {
			folio_unlock(folio);
			result = SCAN_DEL_PAGE_LRU;
			goto out;
		}
		node_stat_mod_folio(folio,
				NR_ISOLATED_ANON + folio_is_file_lru(folio),
				folio_nr_pages(folio));
		VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
		VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);

		if (folio_test_large(folio))
			list_add_tail(&folio->lru, compound_pagelist);
next:
		/*
		 * If collapse was initiated by khugepaged, check that there is
		 * enough young pte to justify collapsing the page
		 */
		if (cc->is_khugepaged &&
		    (pte_young(pteval) || folio_test_young(folio) ||
		     folio_test_referenced(folio) || mmu_notifier_test_young(vma->vm_mm,
								     address)))
			referenced++;

		if (pte_write(pteval))
			writable = true;
	}

	if (unlikely(!writable)) {
		result = SCAN_PAGE_RO;
	} else if (unlikely(cc->is_khugepaged && !referenced)) {
		result = SCAN_LACK_REFERENCED_PAGE;
	} else {
		result = SCAN_SUCCEED;
		trace_mm_collapse_huge_page_isolate(&folio->page, none_or_zero,
						    referenced, writable, result);
		return result;
	}
out:
	/* _pte marks how far the walk got; release everything before it. */
	release_pte_pages(pte, _pte, compound_pagelist);
	trace_mm_collapse_huge_page_isolate(&folio->page, none_or_zero,
					    referenced, writable, result);
	return result;
}

/*
 * Clean-up after a successful copy into the huge page: clear every source
 * PTE, drop the rmap and references of the small pages, and release the
 * large folios collected on @compound_pagelist.
 */
static void __collapse_huge_page_copy_succeeded(pte_t *pte,
						struct vm_area_struct *vma,
						unsigned long address,
						spinlock_t *ptl,
						struct list_head *compound_pagelist)
{
	struct folio *src, *tmp;
	pte_t *_pte;
	pte_t pteval;

	for (_pte = pte; _pte < pte + HPAGE_PMD_NR;
	     _pte++, address += PAGE_SIZE) {
		pteval = ptep_get(_pte);
		if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
			add_mm_counter(vma->vm_mm, MM_ANONPAGES, 1);
			if (is_zero_pfn(pte_pfn(pteval))) {
				/*
				 * ptl mostly unnecessary.
*/
				spin_lock(ptl);
				ptep_clear(vma->vm_mm, address, _pte);
				spin_unlock(ptl);
				ksm_might_unmap_zero_page(vma->vm_mm, pteval);
			}
		} else {
			struct page *src_page = pte_page(pteval);

			src = page_folio(src_page);
			/* Large folios are released via compound_pagelist below. */
			if (!folio_test_large(src))
				release_pte_folio(src);
			/*
			 * ptl mostly unnecessary, but preempt has to
			 * be disabled to update the per-cpu stats
			 * inside folio_remove_rmap_pte().
			 */
			spin_lock(ptl);
			ptep_clear(vma->vm_mm, address, _pte);
			folio_remove_rmap_pte(src, src_page, vma);
			spin_unlock(ptl);
			free_page_and_swap_cache(src_page);
		}
	}

	list_for_each_entry_safe(src, tmp, compound_pagelist, lru) {
		list_del(&src->lru);
		node_stat_sub_folio(src, NR_ISOLATED_ANON +
				folio_is_file_lru(src));
		folio_unlock(src);
		free_swap_cache(src);
		folio_putback_lru(src);
	}
}

/*
 * Roll back after a failed copy: re-install the original page table in
 * @pmd under the pmd lock, then release every folio that was isolated for
 * the collapse.
 */
static void __collapse_huge_page_copy_failed(pte_t *pte,
					     pmd_t *pmd,
					     pmd_t orig_pmd,
					     struct vm_area_struct *vma,
					     struct list_head *compound_pagelist)
{
	spinlock_t *pmd_ptl;

	/*
	 * Re-establish the PMD to point to the original page table
	 * entry. Restoring PMD needs to be done prior to releasing
	 * pages. Since pages are still isolated and locked here,
	 * acquiring anon_vma_lock_write is unnecessary.
	 */
	pmd_ptl = pmd_lock(vma->vm_mm, pmd);
	pmd_populate(vma->vm_mm, pmd, pmd_pgtable(orig_pmd));
	spin_unlock(pmd_ptl);
	/*
	 * Release both raw and compound pages isolated
	 * in __collapse_huge_page_isolate.
	 */
	release_pte_pages(pte, pte + HPAGE_PMD_NR, compound_pagelist);
}

/*
 * __collapse_huge_page_copy - attempts to copy memory contents from raw
 * pages to a hugepage. Cleans up the raw pages if copying succeeds;
 * otherwise restores the original page table and releases isolated raw pages.
 * Returns SCAN_SUCCEED if copying succeeds, otherwise returns SCAN_COPY_MC.
*
 * @pte: starting of the PTEs to copy from
 * @folio: the new hugepage to copy contents to
 * @pmd: pointer to the new hugepage's PMD
 * @orig_pmd: the original raw pages' PMD
 * @vma: the original raw pages' virtual memory area
 * @address: starting address to copy
 * @ptl: lock on raw pages' PTEs
 * @compound_pagelist: list that stores compound pages
 */
static int __collapse_huge_page_copy(pte_t *pte, struct folio *folio,
		pmd_t *pmd, pmd_t orig_pmd, struct vm_area_struct *vma,
		unsigned long address, spinlock_t *ptl,
		struct list_head *compound_pagelist)
{
	unsigned int i;
	int result = SCAN_SUCCEED;

	/*
	 * Copying pages' contents is subject to memory poison at any iteration.
	 */
	for (i = 0; i < HPAGE_PMD_NR; i++) {
		pte_t pteval = ptep_get(pte + i);
		struct page *page = folio_page(folio, i);
		unsigned long src_addr = address + i * PAGE_SIZE;
		struct page *src_page;

		/* Nothing to copy for none/zero-page entries: zero the target. */
		if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
			clear_user_highpage(page, src_addr);
			continue;
		}
		src_page = pte_page(pteval);
		if (copy_mc_user_highpage(page, src_page, src_addr, vma) > 0) {
			result = SCAN_COPY_MC;
			break;
		}
	}

	if (likely(result == SCAN_SUCCEED))
		__collapse_huge_page_copy_succeeded(pte, vma, address, ptl,
						    compound_pagelist);
	else
		__collapse_huge_page_copy_failed(pte, pmd, orig_pmd, vma,
						 compound_pagelist);

	return result;
}

/*
 * Sleep on khugepaged_wait for up to khugepaged_alloc_sleep_millisecs, in
 * interruptible and freezable state.
 */
static void khugepaged_alloc_sleep(void)
{
	DEFINE_WAIT(wait);

	add_wait_queue(&khugepaged_wait, &wait);
	__set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE);
	schedule_timeout(msecs_to_jiffies(khugepaged_alloc_sleep_millisecs));
	remove_wait_queue(&khugepaged_wait, &wait);
}

/* Collapse context with is_khugepaged set. */
struct collapse_control khugepaged_collapse_control = {
	.is_khugepaged = true,
};

/*
 * Decide whether a scan that already counted pages in cc->node_load[] should
 * be aborted because it now found a page on node @nid. Returns true when
 * some already-counted node is farther than node_reclaim_distance from @nid.
 */
static bool hpage_collapse_scan_abort(int nid, struct collapse_control *cc)
{
	int i;

	/*
	 * If node_reclaim_mode is disabled, then no extra effort is made to
	 * allocate memory locally.
*/ if (!node_reclaim_enabled()) return false; /* If there is a count for this node already, it must be acceptable */ if (cc->node_load[nid]) return false; for (i = 0; i < MAX_NUMNODES; i++) { if (!cc->node_load[i]) continue; if (node_distance(nid, i) > node_reclaim_distance) return true; } return false; } #define khugepaged_defrag() \ (transparent_hugepage_flags & \ (1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG)) /* Defrag for khugepaged will enter direct reclaim/compaction if necessary */ static inline gfp_t alloc_hugepage_khugepaged_gfpmask(void) { return khugepaged_defrag() ? GFP_TRANSHUGE : GFP_TRANSHUGE_LIGHT; } #ifdef CONFIG_NUMA static int hpage_collapse_find_target_node(struct collapse_control *cc) { int nid, target_node = 0, max_value = 0; /* find first node with max normal pages hit */ for (nid = 0; nid < MAX_NUMNODES; nid++) if (cc->node_load[nid] > max_value) { max_value = cc->node_load[nid]; target_node = nid; } for_each_online_node(nid) { if (max_value == cc->node_load[nid]) node_set(nid, cc->alloc_nmask); } return target_node; } #else static int hpage_collapse_find_target_node(struct collapse_control *cc) { return 0; } #endif /* * If mmap_lock temporarily dropped, revalidate vma * before taking mmap_lock. * Returns enum scan_result value. */ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address, bool expect_anon, struct vm_area_struct **vmap, struct collapse_control *cc) { struct vm_area_struct *vma; unsigned long tva_flags = cc->is_khugepaged ? TVA_ENFORCE_SYSFS : 0; if (unlikely(hpage_collapse_test_exit_or_disable(mm))) return SCAN_ANY_PROCESS; *vmap = vma = find_vma(mm, address); if (!vma) return SCAN_VMA_NULL; if (!thp_vma_suitable_order(vma, address, PMD_ORDER)) return SCAN_ADDRESS_RANGE; if (!thp_vma_allowable_order(vma, vma->vm_flags, tva_flags, PMD_ORDER)) return SCAN_VMA_CHECK; /* * Anon VMA expected, the address may be unmapped then * remapped to file after khugepaged reaquired the mmap_lock. 
*
	 * thp_vma_allowable_order may return true for qualified file
	 * vmas.
	 */
	if (expect_anon && (!(*vmap)->anon_vma || !vma_is_anonymous(*vmap)))
		return SCAN_PAGE_ANON;
	return SCAN_SUCCEED;
}

/*
 * Classify the entry at @pmd (read locklessly): SCAN_PMD_NONE for an empty
 * entry, SCAN_PMD_MAPPED for a huge mapping, SCAN_PMD_NULL for not-present,
 * devmap or bad entries, SCAN_SUCCEED otherwise.
 */
static inline int check_pmd_state(pmd_t *pmd)
{
	pmd_t pmde = pmdp_get_lockless(pmd);

	if (pmd_none(pmde))
		return SCAN_PMD_NONE;
	if (!pmd_present(pmde))
		return SCAN_PMD_NULL;
	if (pmd_trans_huge(pmde))
		return SCAN_PMD_MAPPED;
	if (pmd_devmap(pmde))
		return SCAN_PMD_NULL;
	if (pmd_bad(pmde))
		return SCAN_PMD_NULL;
	return SCAN_SUCCEED;
}

/*
 * Look up the pmd for @address in @mm, store it in *@pmd and classify it
 * with check_pmd_state(). Returns SCAN_PMD_NULL when no pmd is found.
 */
static int find_pmd_or_thp_or_none(struct mm_struct *mm,
				   unsigned long address,
				   pmd_t **pmd)
{
	*pmd = mm_find_pmd(mm, address);
	if (!*pmd)
		return SCAN_PMD_NULL;

	return check_pmd_state(*pmd);
}

/*
 * Repeat the pmd lookup for @address and confirm that it still yields @pmd.
 * Returns the lookup's failure code, SCAN_FAIL when a different pmd is found,
 * SCAN_SUCCEED otherwise.
 */
static int check_pmd_still_valid(struct mm_struct *mm,
				 unsigned long address,
				 pmd_t *pmd)
{
	pmd_t *new_pmd;
	int result = find_pmd_or_thp_or_none(mm, address, &new_pmd);

	if (result != SCAN_SUCCEED)
		return result;
	if (new_pmd != pmd)
		return SCAN_FAIL;
	return SCAN_SUCCEED;
}

/*
 * Bring missing pages in from swap, to complete THP collapse.
 * Only done if hpage_collapse_scan_pmd believes it is worthwhile.
 *
 * Called and returns without pte mapped or spinlocks held.
 * Returns result: if not SCAN_SUCCEED, mmap_lock has been released.
 */
static int __collapse_huge_page_swapin(struct mm_struct *mm,
				       struct vm_area_struct *vma,
				       unsigned long haddr, pmd_t *pmd,
				       int referenced)
{
	int swapped_in = 0;
	vm_fault_t ret = 0;
	unsigned long address, end = haddr + (HPAGE_PMD_NR * PAGE_SIZE);
	int result;
	pte_t *pte = NULL;
	spinlock_t *ptl;

	for (address = haddr; address < end; address += PAGE_SIZE) {
		struct vm_fault vmf = {
			.vma = vma,
			.address = address,
			.pgoff = linear_page_index(vma, address),
			.flags = FAULT_FLAG_ALLOW_RETRY,
			.pmd = pmd,
		};

		/*
		 * pte is NULL on the first pass and after each swap-in, which
		 * forces a fresh mapping; otherwise just step to the next entry.
		 */
		if (!pte++) {
			/*
			 * Here the ptl is only used to check pte_same() in
			 * do_swap_page(), so readonly version is enough.
*/
			pte = pte_offset_map_ro_nolock(mm, pmd, address, &ptl);
			if (!pte) {
				mmap_read_unlock(mm);
				result = SCAN_PMD_NULL;
				goto out;
			}
		}

		vmf.orig_pte = ptep_get_lockless(pte);
		if (!is_swap_pte(vmf.orig_pte))
			continue;

		vmf.pte = pte;
		vmf.ptl = ptl;
		ret = do_swap_page(&vmf);
		/* Which unmaps pte (after perhaps re-checking the entry) */
		pte = NULL;

		/*
		 * do_swap_page returns VM_FAULT_RETRY with released mmap_lock.
		 * Note we treat VM_FAULT_RETRY as VM_FAULT_ERROR here because
		 * we do not retry here and swap entry will remain in pagetable
		 * resulting in later failure.
		 */
		if (ret & VM_FAULT_RETRY) {
			/* Likely, but not guaranteed, that page lock failed */
			result = SCAN_PAGE_LOCK;
			goto out;
		}
		if (ret & VM_FAULT_ERROR) {
			mmap_read_unlock(mm);
			result = SCAN_FAIL;
			goto out;
		}
		swapped_in++;
	}

	if (pte)
		pte_unmap(pte);

	/* Drain LRU cache to remove extra pin on the swapped in pages */
	if (swapped_in)
		lru_add_drain();

	result = SCAN_SUCCEED;
out:
	trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, result);
	return result;
}

/*
 * Allocate a PMD-order folio on the node chosen from cc->node_load[] and
 * charge it to @mm's memcg.
 *
 * On success *@foliop holds the folio and SCAN_SUCCEED is returned. On
 * failure *@foliop is NULL and the result is SCAN_ALLOC_HUGE_PAGE_FAIL or
 * SCAN_CGROUP_CHARGE_FAIL.
 */
static int alloc_charge_folio(struct folio **foliop, struct mm_struct *mm,
			      struct collapse_control *cc)
{
	gfp_t gfp = (cc->is_khugepaged ?
alloc_hugepage_khugepaged_gfpmask() :
		     GFP_TRANSHUGE);
	int node = hpage_collapse_find_target_node(cc);
	struct folio *folio;

	folio = __folio_alloc(gfp, HPAGE_PMD_ORDER, node, &cc->alloc_nmask);
	if (!folio) {
		*foliop = NULL;
		count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
		return SCAN_ALLOC_HUGE_PAGE_FAIL;
	}

	count_vm_event(THP_COLLAPSE_ALLOC);
	if (unlikely(mem_cgroup_charge(folio, mm, gfp))) {
		folio_put(folio);
		*foliop = NULL;
		return SCAN_CGROUP_CHARGE_FAIL;
	}

	count_memcg_folio_events(folio, THP_COLLAPSE_ALLOC, 1);

	*foliop = folio;
	return SCAN_SUCCEED;
}

/*
 * Collapse the PMD-sized range at @address in @mm into one huge page.
 *
 * Entered with mmap_lock held for read; the lock is dropped for the
 * allocation, retaken for read (revalidation and optional swap-in), then
 * taken for write for the collapse itself. Always returns with mmap_lock
 * released.
 *
 * @referenced and @unmapped come from the preceding scan; @unmapped != 0
 * triggers the swap-in pass. Returns a SCAN_* result.
 */
static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
			      int referenced, int unmapped,
			      struct collapse_control *cc)
{
	LIST_HEAD(compound_pagelist);
	pmd_t *pmd, _pmd;
	pte_t *pte;
	pgtable_t pgtable;
	struct folio *folio;
	spinlock_t *pmd_ptl, *pte_ptl;
	int result = SCAN_FAIL;
	struct vm_area_struct *vma;
	struct mmu_notifier_range range;

	VM_BUG_ON(address & ~HPAGE_PMD_MASK);

	/*
	 * Before allocating the hugepage, release the mmap_lock read lock.
	 * The allocation can take potentially a long time if it involves
	 * sync compaction, and we do not need to hold the mmap_lock during
	 * that. We will recheck the vma after taking it again in write mode.
	 */
	mmap_read_unlock(mm);

	result = alloc_charge_folio(&folio, mm, cc);
	if (result != SCAN_SUCCEED)
		goto out_nolock;

	mmap_read_lock(mm);
	result = hugepage_vma_revalidate(mm, address, true, &vma, cc);
	if (result != SCAN_SUCCEED) {
		mmap_read_unlock(mm);
		goto out_nolock;
	}

	result = find_pmd_or_thp_or_none(mm, address, &pmd);
	if (result != SCAN_SUCCEED) {
		mmap_read_unlock(mm);
		goto out_nolock;
	}

	if (unmapped) {
		/*
		 * __collapse_huge_page_swapin will return with mmap_lock
		 * released when it fails. So we jump out_nolock directly in
		 * that case. Continuing to collapse causes inconsistency.
		 */
		result = __collapse_huge_page_swapin(mm, vma, address, pmd,
						     referenced);
		if (result != SCAN_SUCCEED)
			goto out_nolock;
	}

	mmap_read_unlock(mm);
	/*
	 * Prevent all access to pagetables with the exception of
	 * gup_fast later handled by the ptep_clear_flush and the VM
	 * handled by the anon_vma lock + PG_lock.
	 *
	 * UFFDIO_MOVE is prevented to race as well thanks to the
	 * mmap_lock.
	 */
	mmap_write_lock(mm);
	result = hugepage_vma_revalidate(mm, address, true, &vma, cc);
	if (result != SCAN_SUCCEED)
		goto out_up_write;
	/* check if the pmd is still valid */
	result = check_pmd_still_valid(mm, address, pmd);
	if (result != SCAN_SUCCEED)
		goto out_up_write;

	vma_start_write(vma);
	anon_vma_lock_write(vma->anon_vma);

	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, address,
				address + HPAGE_PMD_SIZE);
	mmu_notifier_invalidate_range_start(&range);

	pmd_ptl = pmd_lock(mm, pmd); /* probably unnecessary */
	/*
	 * This removes any huge TLB entry from the CPU so we won't allow
	 * huge and small TLB entries for the same virtual address to
	 * avoid the risk of CPU bugs in that area.
	 *
	 * Parallel GUP-fast is fine since GUP-fast will back off when
	 * it detects PMD is changed.
	 */
	_pmd = pmdp_collapse_flush(vma, address, pmd);
	spin_unlock(pmd_ptl);
	mmu_notifier_invalidate_range_end(&range);
	tlb_remove_table_sync_one();

	/* The page table is now reachable only through the saved _pmd. */
	pte = pte_offset_map_lock(mm, &_pmd, address, &pte_ptl);
	if (pte) {
		result = __collapse_huge_page_isolate(vma, address, pte, cc,
						      &compound_pagelist);
		spin_unlock(pte_ptl);
	} else {
		result = SCAN_PMD_NULL;
	}

	if (unlikely(result != SCAN_SUCCEED)) {
		if (pte)
			pte_unmap(pte);
		spin_lock(pmd_ptl);
		BUG_ON(!pmd_none(*pmd));
		/*
		 * We can only use set_pmd_at when establishing
		 * hugepmds and never for establishing regular pmds that
		 * points to regular pagetables. Use pmd_populate for that
		 */
		pmd_populate(mm, pmd, pmd_pgtable(_pmd));
		spin_unlock(pmd_ptl);
		anon_vma_unlock_write(vma->anon_vma);
		goto out_up_write;
	}

	/*
	 * All pages are isolated and locked so anon_vma rmap
	 * can't run anymore.
*/ anon_vma_unlock_write(vma->anon_vma); result = __collapse_huge_page_copy(pte, folio, pmd, _pmd, vma, address, pte_ptl, &compound_pagelist); pte_unmap(pte); if (unlikely(result != SCAN_SUCCEED)) goto out_up_write; /* * The smp_wmb() inside __folio_mark_uptodate() ensures the * copy_huge_page writes become visible before the set_pmd_at() * write. */ __folio_mark_uptodate(folio); pgtable = pmd_pgtable(_pmd); _pmd = mk_huge_pmd(&folio->page, vma->vm_page_prot); _pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma); spin_lock(pmd_ptl); BUG_ON(!pmd_none(*pmd)); folio_add_new_anon_rmap(folio, vma, address, RMAP_EXCLUSIVE); folio_add_lru_vma(folio, vma); pgtable_trans_huge_deposit(mm, pmd, pgtable); set_pmd_at(mm, address, pmd, _pmd); update_mmu_cache_pmd(vma, address, pmd); deferred_split_folio(folio, false); spin_unlock(pmd_ptl); folio = NULL; result = SCAN_SUCCEED; out_up_write: mmap_write_unlock(mm); out_nolock: if (folio) folio_put(folio); trace_mm_collapse_huge_page(mm, result == SCAN_SUCCEED, result); return result; } static int hpage_collapse_scan_pmd(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, bool *mmap_locked, struct collapse_control *cc) { pmd_t *pmd; pte_t *pte, *_pte; int result = SCAN_FAIL, referenced = 0; int none_or_zero = 0, shared = 0; struct page *page = NULL; struct folio *folio = NULL; unsigned long _address; spinlock_t *ptl; int node = NUMA_NO_NODE, unmapped = 0; bool writable = false; VM_BUG_ON(address & ~HPAGE_PMD_MASK); result = find_pmd_or_thp_or_none(mm, address, &pmd); if (result != SCAN_SUCCEED) goto out; memset(cc->node_load, 0, sizeof(cc->node_load)); nodes_clear(cc->alloc_nmask); pte = pte_offset_map_lock(mm, pmd, address, &ptl); if (!pte) { result = SCAN_PMD_NULL; goto out; } for (_address = address, _pte = pte; _pte < pte + HPAGE_PMD_NR; _pte++, _address += PAGE_SIZE) { pte_t pteval = ptep_get(_pte); if (is_swap_pte(pteval)) { ++unmapped; if (!cc->is_khugepaged || unmapped <= khugepaged_max_ptes_swap) { /* * 
Always be strict with uffd-wp * enabled swap entries. Please see * comment below for pte_uffd_wp(). */ if (pte_swp_uffd_wp_any(pteval)) { result = SCAN_PTE_UFFD_WP; goto out_unmap; } continue; } else { result = SCAN_EXCEED_SWAP_PTE; count_vm_event(THP_SCAN_EXCEED_SWAP_PTE); goto out_unmap; } } if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) { ++none_or_zero; if (!userfaultfd_armed(vma) && (!cc->is_khugepaged || none_or_zero <= khugepaged_max_ptes_none)) { continue; } else { result = SCAN_EXCEED_NONE_PTE; count_vm_event(THP_SCAN_EXCEED_NONE_PTE); goto out_unmap; } } if (pte_uffd_wp(pteval)) { /* * Don't collapse the page if any of the small * PTEs are armed with uffd write protection. * Here we can also mark the new huge pmd as * write protected if any of the small ones is * marked but that could bring unknown * userfault messages that falls outside of * the registered range. So, just be simple. */ result = SCAN_PTE_UFFD_WP; goto out_unmap; } if (pte_write(pteval)) writable = true; page = vm_normal_page(vma, _address, pteval); if (unlikely(!page) || unlikely(is_zone_device_page(page))) { result = SCAN_PAGE_NULL; goto out_unmap; } folio = page_folio(page); if (!folio_test_anon(folio)) { result = SCAN_PAGE_ANON; goto out_unmap; } /* * We treat a single page as shared if any part of the THP * is shared. "False negatives" from * folio_likely_mapped_shared() are not expected to matter * much in practice. */ if (folio_likely_mapped_shared(folio)) { ++shared; if (cc->is_khugepaged && shared > khugepaged_max_ptes_shared) { result = SCAN_EXCEED_SHARED_PTE; count_vm_event(THP_SCAN_EXCEED_SHARED_PTE); goto out_unmap; } } /* * Record which node the original page is from and save this * information to cc->node_load[]. * Khugepaged will allocate hugepage from the node has the max * hit record. 
*/ node = folio_nid(folio); if (hpage_collapse_scan_abort(node, cc)) { result = SCAN_SCAN_ABORT; goto out_unmap; } cc->node_load[node]++; if (!folio_test_lru(folio)) { result = SCAN_PAGE_LRU; goto out_unmap; } if (folio_test_locked(folio)) { result = SCAN_PAGE_LOCK; goto out_unmap; } /* * Check if the page has any GUP (or other external) pins. * * Here the check may be racy: * it may see folio_mapcount() > folio_ref_count(). * But such case is ephemeral we could always retry collapse * later. However it may report false positive if the page * has excessive GUP pins (i.e. 512). Anyway the same check * will be done again later the risk seems low. */ if (!is_refcount_suitable(folio)) { result = SCAN_PAGE_COUNT; goto out_unmap; } /* * If collapse was initiated by khugepaged, check that there is * enough young pte to justify collapsing the page */ if (cc->is_khugepaged && (pte_young(pteval) || folio_test_young(folio) || folio_test_referenced(folio) || mmu_notifier_test_young(vma->vm_mm, address))) referenced++; } if (!writable) { result = SCAN_PAGE_RO; } else if (cc->is_khugepaged && (!referenced || (unmapped && referenced < HPAGE_PMD_NR / 2))) { result = SCAN_LACK_REFERENCED_PAGE; } else { result = SCAN_SUCCEED; } out_unmap: pte_unmap_unlock(pte, ptl); if (result == SCAN_SUCCEED) { result = collapse_huge_page(mm, address, referenced, unmapped, cc); /* collapse_huge_page will return with the mmap_lock released */ *mmap_locked = false; } out: trace_mm_khugepaged_scan_pmd(mm, &folio->page, writable, referenced, none_or_zero, result, unmapped); return result; } static void collect_mm_slot(struct khugepaged_mm_slot *mm_slot) { struct mm_slot *slot = &mm_slot->slot; struct mm_struct *mm = slot->mm; lockdep_assert_held(&khugepaged_mm_lock); if (hpage_collapse_test_exit(mm)) { /* free mm_slot */ hash_del(&slot->hash); list_del(&slot->mm_node); /* * Not strictly needed because the mm exited already. 
* * clear_bit(MMF_VM_HUGEPAGE, &mm->flags); */ /* khugepaged_mm_lock actually not necessary for the below */ mm_slot_free(mm_slot_cache, mm_slot); mmdrop(mm); } } #ifdef CONFIG_SHMEM /* hpage must be locked, and mmap_lock must be held */ static int set_huge_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmdp, struct page *hpage) { struct vm_fault vmf = { .vma = vma, .address = addr, .flags = 0, .pmd = pmdp, }; VM_BUG_ON(!PageTransHuge(hpage)); mmap_assert_locked(vma->vm_mm); if (do_set_pmd(&vmf, hpage)) return SCAN_FAIL; get_page(hpage); return SCAN_SUCCEED; } /** * collapse_pte_mapped_thp - Try to collapse a pte-mapped THP for mm at * address haddr. * * @mm: process address space where collapse happens * @addr: THP collapse address * @install_pmd: If a huge PMD should be installed * * This function checks whether all the PTEs in the PMD are pointing to the * right THP. If so, retract the page table so the THP can refault in with * as pmd-mapped. Possibly install a huge PMD mapping the THP. */ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr, bool install_pmd) { struct mmu_notifier_range range; bool notified = false; unsigned long haddr = addr & HPAGE_PMD_MASK; struct vm_area_struct *vma = vma_lookup(mm, haddr); struct folio *folio; pte_t *start_pte, *pte; pmd_t *pmd, pgt_pmd; spinlock_t *pml = NULL, *ptl; int nr_ptes = 0, result = SCAN_FAIL; int i; mmap_assert_locked(mm); /* First check VMA found, in case page tables are being torn down */ if (!vma || !vma->vm_file || !range_in_vma(vma, haddr, haddr + HPAGE_PMD_SIZE)) return SCAN_VMA_CHECK; /* Fast check before locking page if already PMD-mapped */ result = find_pmd_or_thp_or_none(mm, haddr, &pmd); if (result == SCAN_PMD_MAPPED) return result; /* * If we are here, we've succeeded in replacing all the native pages * in the page cache with a single hugepage. 
If a mm were to fault-in * this memory (mapped by a suitably aligned VMA), we'd get the hugepage * and map it by a PMD, regardless of sysfs THP settings. As such, let's * analogously elide sysfs THP settings here. */ if (!thp_vma_allowable_order(vma, vma->vm_flags, 0, PMD_ORDER)) return SCAN_VMA_CHECK; /* Keep pmd pgtable for uffd-wp; see comment in retract_page_tables() */ if (userfaultfd_wp(vma)) return SCAN_PTE_UFFD_WP; folio = filemap_lock_folio(vma->vm_file->f_mapping, linear_page_index(vma, haddr)); if (IS_ERR(folio)) return SCAN_PAGE_NULL; if (folio_order(folio) != HPAGE_PMD_ORDER) { result = SCAN_PAGE_COMPOUND; goto drop_folio; } result = find_pmd_or_thp_or_none(mm, haddr, &pmd); switch (result) { case SCAN_SUCCEED: break; case SCAN_PMD_NONE: /* * All pte entries have been removed and pmd cleared. * Skip all the pte checks and just update the pmd mapping. */ goto maybe_install_pmd; default: goto drop_folio; } result = SCAN_FAIL; start_pte = pte_offset_map_lock(mm, pmd, haddr, &ptl); if (!start_pte) /* mmap_lock + page lock should prevent this */ goto drop_folio; /* step 1: check all mapped PTEs are to the right huge page */ for (i = 0, addr = haddr, pte = start_pte; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE, pte++) { struct page *page; pte_t ptent = ptep_get(pte); /* empty pte, skip */ if (pte_none(ptent)) continue; /* page swapped out, abort */ if (!pte_present(ptent)) { result = SCAN_PTE_NON_PRESENT; goto abort; } page = vm_normal_page(vma, addr, ptent); if (WARN_ON_ONCE(page && is_zone_device_page(page))) page = NULL; /* * Note that uprobe, debugger, or MAP_PRIVATE may change the * page table, but the new page will not be a subpage of hpage. 
*/ if (folio_page(folio, i) != page) goto abort; } pte_unmap_unlock(start_pte, ptl); mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, haddr, haddr + HPAGE_PMD_SIZE); mmu_notifier_invalidate_range_start(&range); notified = true; /* * pmd_lock covers a wider range than ptl, and (if split from mm's * page_table_lock) ptl nests inside pml. The less time we hold pml, * the better; but userfaultfd's mfill_atomic_pte() on a private VMA * inserts a valid as-if-COWed PTE without even looking up page cache. * So page lock of folio does not protect from it, so we must not drop * ptl before pgt_pmd is removed, so uffd private needs pml taken now. */ if (userfaultfd_armed(vma) && !(vma->vm_flags & VM_SHARED)) pml = pmd_lock(mm, pmd); start_pte = pte_offset_map_rw_nolock(mm, pmd, haddr, &pgt_pmd, &ptl); if (!start_pte) /* mmap_lock + page lock should prevent this */ goto abort; if (!pml) spin_lock(ptl); else if (ptl != pml) spin_lock_nested(ptl, SINGLE_DEPTH_NESTING); if (unlikely(!pmd_same(pgt_pmd, pmdp_get_lockless(pmd)))) goto abort; /* step 2: clear page table and adjust rmap */ for (i = 0, addr = haddr, pte = start_pte; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE, pte++) { struct page *page; pte_t ptent = ptep_get(pte); if (pte_none(ptent)) continue; /* * We dropped ptl after the first scan, to do the mmu_notifier: * page lock stops more PTEs of the folio being faulted in, but * does not stop write faults COWing anon copies from existing * PTEs; and does not stop those being swapped out or migrated. */ if (!pte_present(ptent)) { result = SCAN_PTE_NON_PRESENT; goto abort; } page = vm_normal_page(vma, addr, ptent); if (folio_page(folio, i) != page) goto abort; /* * Must clear entry, or a racing truncate may re-remove it. * TLB flush can be left until pmdp_collapse_flush() does it. * PTE dirty? Shmem page is already dirty; file is read-only. 
*/ ptep_clear(mm, addr, pte); folio_remove_rmap_pte(folio, page, vma); nr_ptes++; } if (!pml) spin_unlock(ptl); /* step 3: set proper refcount and mm_counters. */ if (nr_ptes) { folio_ref_sub(folio, nr_ptes); add_mm_counter(mm, mm_counter_file(folio), -nr_ptes); } /* step 4: remove empty page table */ if (!pml) { pml = pmd_lock(mm, pmd); if (ptl != pml) { spin_lock_nested(ptl, SINGLE_DEPTH_NESTING); if (unlikely(!pmd_same(pgt_pmd, pmdp_get_lockless(pmd)))) { flush_tlb_mm(mm); goto unlock; } } } pgt_pmd = pmdp_collapse_flush(vma, haddr, pmd); pmdp_get_lockless_sync(); pte_unmap_unlock(start_pte, ptl); if (ptl != pml) spin_unlock(pml); mmu_notifier_invalidate_range_end(&range); mm_dec_nr_ptes(mm); page_table_check_pte_clear_range(mm, haddr, pgt_pmd); pte_free_defer(mm, pmd_pgtable(pgt_pmd)); maybe_install_pmd: /* step 5: install pmd entry */ result = install_pmd ? set_huge_pmd(vma, haddr, pmd, &folio->page) : SCAN_SUCCEED; goto drop_folio; abort: if (nr_ptes) { flush_tlb_mm(mm); folio_ref_sub(folio, nr_ptes); add_mm_counter(mm, mm_counter_file(folio), -nr_ptes); } unlock: if (start_pte) pte_unmap_unlock(start_pte, ptl); if (pml && pml != ptl) spin_unlock(pml); if (notified) mmu_notifier_invalidate_range_end(&range); drop_folio: folio_unlock(folio); folio_put(folio); return result; } static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) { struct vm_area_struct *vma; i_mmap_lock_read(mapping); vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { struct mmu_notifier_range range; struct mm_struct *mm; unsigned long addr; pmd_t *pmd, pgt_pmd; spinlock_t *pml; spinlock_t *ptl; bool success = false; /* * Check vma->anon_vma to exclude MAP_PRIVATE mappings that * got written to. These VMAs are likely not worth removing * page tables from, as PMD-mapping is likely to be split later. 
*/
		if (READ_ONCE(vma->anon_vma))
			continue;

		/* User address at which this VMA maps file page pgoff. */
		addr = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
		/*
		 * Skip unless addr is PMD-aligned and a full HPAGE_PMD_SIZE
		 * range starting there still lies inside the VMA.
		 */
		if (addr & ~HPAGE_PMD_MASK ||
		    vma->vm_end < addr + HPAGE_PMD_SIZE)
			continue;

		mm = vma->vm_mm;
		if (find_pmd_or_thp_or_none(mm, addr, &pmd) != SCAN_SUCCEED)
			continue;

		if (hpage_collapse_test_exit(mm))
			continue;
		/*
		 * When a vma is registered with uffd-wp, we cannot recycle
		 * the page table because there may be pte markers installed.
		 * Other vmas can still have the same file mapped hugely, but
		 * skip this one: it will always be mapped in small page size
		 * for uffd-wp registered ranges.
		 */
		if (userfaultfd_wp(vma))
			continue;

		/* PTEs were notified when unmapped; but now for the PMD? */
		mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm,
					addr, addr + HPAGE_PMD_SIZE);
		mmu_notifier_invalidate_range_start(&range);

		pml = pmd_lock(mm, pmd);
		/*
		 * The lock of new_folio is still held, we will be blocked in
		 * the page fault path, which prevents the pte entries from
		 * being set again. So even though the old empty PTE page may be
		 * concurrently freed and a new PTE page is filled into the pmd
		 * entry, it is still empty and can be removed.
		 *
		 * So here we only need to recheck if the state of pmd entry
		 * still meets our requirements, rather than checking pmd_same()
		 * like elsewhere.
		 */
		if (check_pmd_state(pmd) != SCAN_SUCCEED)
			goto drop_pml;
		ptl = pte_lockptr(mm, pmd);
		/* The PTE lock is taken as well only when it is a separate lock. */
		if (ptl != pml)
			spin_lock_nested(ptl, SINGLE_DEPTH_NESTING);

		/*
		 * Huge page lock is still held, so normally the page table
		 * must remain empty; and we have already skipped anon_vma
		 * and userfaultfd_wp() vmas.  But since the mmap_lock is not
		 * held, it is still possible for a racing userfaultfd_ioctl()
		 * to have inserted ptes or markers.  Now that we hold ptlock,
		 * repeating the anon_vma check protects from one category,
		 * and repeating the userfaultfd_wp() check from another.
*/
		if (likely(!vma->anon_vma && !userfaultfd_wp(vma))) {
			pgt_pmd = pmdp_collapse_flush(vma, addr, pmd);
			pmdp_get_lockless_sync();
			success = true;
		}

		if (ptl != pml)
			spin_unlock(ptl);
drop_pml:
		spin_unlock(pml);

		mmu_notifier_invalidate_range_end(&range);

		/*
		 * The counter update and the deferred free of the page table
		 * are done only after both locks have been dropped and the
		 * notifier range has been ended.
		 */
		if (success) {
			mm_dec_nr_ptes(mm);
			page_table_check_pte_clear_range(mm, addr, pgt_pmd);
			pte_free_defer(mm, pmd_pgtable(pgt_pmd));
		}
	}
	i_mmap_unlock_read(mapping);
}

/**
 * collapse_file - collapse filemap/tmpfs/shmem pages into huge one.
 *
 * @mm: process address space where collapse happens
 * @addr: virtual collapse start address
 * @file: file whose page cache range is collapsed
 * @start: page cache index of the first page of the range; must be a
 *         multiple of HPAGE_PMD_NR
 * @cc: collapse context and scratchpad
 *
 * Basic scheme is simple, details are more complex:
 *  - allocate and lock a new huge page;
 *  - scan page cache, locking old pages
 *    + swap/gup in pages if necessary;
 *  - copy data to new page
 *  - handle shmem holes
 *    + re-validate that holes weren't filled by someone else
 *    + check for userfaultfd
 *  - finalize updates to the page cache;
 *  - if replacing succeeds:
 *    + unlock huge page;
 *    + free old pages;
 *  - if replacing failed:
 *    + unlock old pages
 *    + unlock and free huge page;
 *
 * Return: a SCAN_* result code.  A successful collapse yields SCAN_SUCCEED,
 * except that the result is changed to SCAN_PTE_MAPPED_HUGEPAGE when @cc is
 * non-NULL and is not the khugepaged context, so that the caller goes on to
 * call collapse_pte_mapped_thp().
 */
static int collapse_file(struct mm_struct *mm, unsigned long addr,
			 struct file *file, pgoff_t start,
			 struct collapse_control *cc)
{
	struct address_space *mapping = file->f_mapping;
	struct page *dst;
	struct folio *folio, *tmp, *new_folio;
	/* end is exclusive: the range handled is [start, end). */
	pgoff_t index = 0, end = start + HPAGE_PMD_NR;
	LIST_HEAD(pagelist);
	XA_STATE_ORDER(xas, &mapping->i_pages, start, HPAGE_PMD_ORDER);
	/* nr_none counts the empty slots met while scanning a shmem range. */
	int nr_none = 0, result = SCAN_SUCCEED;
	bool is_shmem = shmem_file(file);

	VM_BUG_ON(!IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && !is_shmem);
	VM_BUG_ON(start & (HPAGE_PMD_NR - 1));

	result = alloc_charge_folio(&new_folio, mm, cc);
	if (result != SCAN_SUCCEED)
		goto out;

	mapping_set_update(&xas, mapping);

	__folio_set_locked(new_folio);
	if (is_shmem)
		__folio_set_swapbacked(new_folio);
	new_folio->index = start;
	new_folio->mapping = mapping;

	/*
	 * Ensure we
have slots for all the pages in the range.  This is
	 * almost certainly a no-op because most of the pages must be present
	 */
	do {
		xas_lock_irq(&xas);
		xas_create_range(&xas);
		/* On success, leave the loop with the i_pages lock still held. */
		if (!xas_error(&xas))
			break;
		xas_unlock_irq(&xas);
		if (!xas_nomem(&xas, GFP_KERNEL)) {
			result = SCAN_FAIL;
			goto rollback;
		}
	} while (1);

	/*
	 * No increment in the for-header: index is advanced inside the
	 * body, by one for an empty shmem slot or by the size of the folio
	 * that was accepted.  Every iteration starts with the i_pages lock
	 * held.
	 */
	for (index = start; index < end;) {
		xas_set(&xas, index);
		folio = xas_load(&xas);

		VM_BUG_ON(index != xas.xa_index);
		if (is_shmem) {
			if (!folio) {
				/*
				 * Stop if extent has been truncated or
				 * hole-punched, and is now completely
				 * empty.
				 */
				if (index == start) {
					if (!xas_next_entry(&xas, end - 1)) {
						result = SCAN_TRUNCATED;
						goto xa_locked;
					}
				}
				nr_none++;
				index++;
				continue;
			}

			if (xa_is_value(folio) || !folio_test_uptodate(folio)) {
				xas_unlock_irq(&xas);
				/* swap in or instantiate fallocated page */
				if (shmem_get_folio(mapping->host, index, 0,
						&folio, SGP_NOALLOC)) {
					result = SCAN_FAIL;
					goto xa_unlocked;
				}
				/* drain lru cache to help folio_isolate_lru() */
				lru_add_drain();
			} else if (folio_trylock(folio)) {
				folio_get(folio);
				xas_unlock_irq(&xas);
			} else {
				result = SCAN_PAGE_LOCK;
				goto xa_locked;
			}
		} else {	/* !is_shmem */
			if (!folio || xa_is_value(folio)) {
				xas_unlock_irq(&xas);
				page_cache_sync_readahead(mapping, &file->f_ra,
							  file, index,
							  end - index);
				/* drain lru cache to help folio_isolate_lru() */
				lru_add_drain();
				folio = filemap_lock_folio(mapping, index);
				if (IS_ERR(folio)) {
					result = SCAN_FAIL;
					goto xa_unlocked;
				}
			} else if (folio_test_dirty(folio)) {
				/*
				 * khugepaged only works on read-only fd,
				 * so this page is dirty because it hasn't
				 * been flushed since first write. There
				 * won't be new dirty pages.
				 *
				 * Trigger async flush here and hope the
				 * writeback is done when khugepaged
				 * revisits this page.
				 *
				 * This is a one-off situation. We are not
				 * forcing writeback in loop.
*/
				xas_unlock_irq(&xas);
				filemap_flush(mapping);
				result = SCAN_FAIL;
				goto xa_unlocked;
			} else if (folio_test_writeback(folio)) {
				xas_unlock_irq(&xas);
				result = SCAN_FAIL;
				goto xa_unlocked;
			} else if (folio_trylock(folio)) {
				folio_get(folio);
				xas_unlock_irq(&xas);
			} else {
				result = SCAN_PAGE_LOCK;
				goto xa_locked;
			}
		}

		/*
		 * The folio must be locked, so we can drop the i_pages lock
		 * without racing with truncate.
		 */
		VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);

		/* make sure the folio is up to date */
		if (unlikely(!folio_test_uptodate(folio))) {
			result = SCAN_FAIL;
			goto out_unlock;
		}

		/*
		 * If file was truncated then extended, or hole-punched, before
		 * we locked the first folio, then a THP might be there already.
		 * This will be discovered on the first iteration.
		 */
		if (folio_order(folio) == HPAGE_PMD_ORDER &&
		    folio->index == start) {
			/* Maybe PMD-mapped */
			result = SCAN_PTE_MAPPED_HUGEPAGE;
			goto out_unlock;
		}

		if (folio_mapping(folio) != mapping) {
			result = SCAN_TRUNCATED;
			goto out_unlock;
		}

		if (!is_shmem && (folio_test_dirty(folio) ||
				  folio_test_writeback(folio))) {
			/*
			 * khugepaged only works on read-only fd, so this
			 * folio is dirty because it hasn't been flushed
			 * since first write.
			 */
			result = SCAN_FAIL;
			goto out_unlock;
		}

		if (!folio_isolate_lru(folio)) {
			result = SCAN_DEL_PAGE_LRU;
			goto out_unlock;
		}

		/* From here on the folio is off the LRU: put it back on failure. */
		if (!filemap_release_folio(folio, GFP_KERNEL)) {
			result = SCAN_PAGE_HAS_PRIVATE;
			folio_putback_lru(folio);
			goto out_unlock;
		}

		/*
		 * Batched unmap: the matching try_to_unmap_flush() is issued
		 * once, after the scan loop has finished.
		 */
		if (folio_mapped(folio))
			try_to_unmap(folio,
					TTU_IGNORE_MLOCK | TTU_BATCH_FLUSH);

		xas_lock_irq(&xas);

		VM_BUG_ON_FOLIO(folio != xa_load(xas.xa, index), folio);

		/*
		 * We control 2 + nr_pages references to the folio:
		 *  - we hold a pin on it;
		 *  - nr_pages reference from page cache;
		 *  - one from lru_isolate_folio;
		 * If those are the only references, then any new usage
		 * of the folio will have to fetch it from the page
		 * cache.
That requires locking the folio to handle
		 * truncate, so any new usage will be blocked until we
		 * unlock folio after collapse/during rollback.
		 */
		if (folio_ref_count(folio) != 2 + folio_nr_pages(folio)) {
			result = SCAN_PAGE_COUNT;
			xas_unlock_irq(&xas);
			folio_putback_lru(folio);
			goto out_unlock;
		}

		/*
		 * Accumulate the folios that are being collapsed.
		 */
		list_add_tail(&folio->lru, &pagelist);
		index += folio_nr_pages(folio);
		continue;
out_unlock:
		/*
		 * Failure exit for a folio that was locked and pinned but not
		 * added to pagelist: release it here, with the i_pages lock
		 * already dropped.
		 */
		folio_unlock(folio);
		folio_put(folio);
		goto xa_unlocked;
	}

	/* Reached only when the whole range was scanned without a failure. */
	if (!is_shmem) {
		filemap_nr_thps_inc(mapping);
		/*
		 * Paired with the fence in do_dentry_open() -> get_write_access()
		 * to ensure i_writecount is up to date and the update to nr_thps
		 * is visible. Ensures the page cache will be truncated if the
		 * file is opened writable.
		 */
		smp_mb();
		if (inode_is_open_for_write(mapping->host)) {
			result = SCAN_FAIL;
			filemap_nr_thps_dec(mapping);
		}
	}

xa_locked:
	xas_unlock_irq(&xas);
xa_unlocked:

	/*
	 * If collapse is successful, flush must be done now before copying.
	 * If collapse is unsuccessful, does flush actually need to be done?
	 * Do it anyway, to clear the state.
	 */
	try_to_unmap_flush();

	/* The new folio fills nr_none holes: charge them to shmem up front. */
	if (result == SCAN_SUCCEED && nr_none &&
	    !shmem_charge(mapping->host, nr_none))
		result = SCAN_FAIL;
	if (result != SCAN_SUCCEED) {
		/* Zeroed so that rollback does not undo a charge never kept. */
		nr_none = 0;
		goto rollback;
	}

	/*
	 * The old folios are locked, so they won't change anymore.
*/
	index = start;
	dst = folio_page(new_folio, 0);
	list_for_each_entry(folio, &pagelist, lru) {
		int i, nr_pages = folio_nr_pages(folio);

		/* Zero the destination pages for any gap before this folio. */
		while (index < folio->index) {
			clear_highpage(dst);
			index++;
			dst++;
		}

		for (i = 0; i < nr_pages; i++) {
			/* A positive return value is treated as a failed copy. */
			if (copy_mc_highpage(dst, folio_page(folio, i)) > 0) {
				result = SCAN_COPY_MC;
				goto rollback;
			}
			index++;
			dst++;
		}
	}
	/* Zero whatever is left of the range after the last old folio. */
	while (index < end) {
		clear_highpage(dst);
		index++;
		dst++;
	}

	if (nr_none) {
		struct vm_area_struct *vma;
		int nr_none_check = 0;

		i_mmap_lock_read(mapping);
		xas_lock_irq(&xas);

		xas_set(&xas, start);
		/* Count the empty slots again, marking each with a retry entry. */
		for (index = start; index < end; index++) {
			if (!xas_next(&xas)) {
				xas_store(&xas, XA_RETRY_ENTRY);
				if (xas_error(&xas)) {
					result = SCAN_STORE_FAILED;
					goto immap_locked;
				}
				nr_none_check++;
			}
		}

		/* A different count than in the first scan: give up. */
		if (nr_none != nr_none_check) {
			result = SCAN_PAGE_FILLED;
			goto immap_locked;
		}

		/*
		 * If userspace observed a missing page in a VMA with
		 * a MODE_MISSING userfaultfd, then it might expect a
		 * UFFD_EVENT_PAGEFAULT for that page. If so, we need to
		 * roll back to avoid suppressing such an event. Since
		 * wp/minor userfaultfds don't give userspace any
		 * guarantees that the kernel doesn't fill a missing
		 * page with a zero page, so they don't matter here.
		 *
		 * Any userfaultfds registered after this point will
		 * not be able to observe any missing pages due to the
		 * previously inserted retry entries.
*/
		vma_interval_tree_foreach(vma, &mapping->i_mmap, start, end) {
			if (userfaultfd_missing(vma)) {
				result = SCAN_EXCEED_NONE_PTE;
				goto immap_locked;
			}
		}

immap_locked:
		i_mmap_unlock_read(mapping);
		if (result != SCAN_SUCCEED) {
			/* Take the retry entries back out before rolling back. */
			xas_set(&xas, start);
			for (index = start; index < end; index++) {
				if (xas_next(&xas) == XA_RETRY_ENTRY)
					xas_store(&xas, NULL);
			}

			xas_unlock_irq(&xas);
			goto rollback;
		}
	} else {
		xas_lock_irq(&xas);
	}

	/* Either branch above leaves the i_pages lock held for what follows. */
	if (is_shmem)
		__lruvec_stat_mod_folio(new_folio, NR_SHMEM_THPS, HPAGE_PMD_NR);
	else
		__lruvec_stat_mod_folio(new_folio, NR_FILE_THPS, HPAGE_PMD_NR);

	if (nr_none) {
		__lruvec_stat_mod_folio(new_folio, NR_FILE_PAGES, nr_none);
		/* nr_none is always 0 for non-shmem. */
		__lruvec_stat_mod_folio(new_folio, NR_SHMEM, nr_none);
	}

	/*
	 * Mark new_folio as uptodate before inserting it into the
	 * page cache so that it isn't mistaken for an fallocated but
	 * unwritten page.
	 */
	folio_mark_uptodate(new_folio);
	folio_ref_add(new_folio, HPAGE_PMD_NR - 1);

	if (is_shmem)
		folio_mark_dirty(new_folio);
	folio_add_lru(new_folio);

	/* Join all the small entries into a single multi-index entry. */
	xas_set_order(&xas, start, HPAGE_PMD_ORDER);
	xas_store(&xas, new_folio);
	WARN_ON_ONCE(xas_error(&xas));
	xas_unlock_irq(&xas);

	/*
	 * Remove pte page tables, so we can re-fault the page as huge.
	 * If MADV_COLLAPSE, adjust result to call collapse_pte_mapped_thp().
	 */
	retract_page_tables(mapping, start);
	if (cc && !cc->is_khugepaged)
		result = SCAN_PTE_MAPPED_HUGEPAGE;
	folio_unlock(new_folio);

	/*
	 * The collapse has succeeded, so free the old folios.
*/
	list_for_each_entry_safe(folio, tmp, &pagelist, lru) {
		list_del(&folio->lru);
		folio->mapping = NULL;
		folio_clear_active(folio);
		folio_clear_unevictable(folio);
		folio_unlock(folio);
		/* Drop the 2 + nr_pages references checked for during the scan. */
		folio_put_refs(folio, 2 + folio_nr_pages(folio));
	}

	goto out;

rollback:
	/* Something went wrong: roll back page cache changes */
	if (nr_none) {
		xas_lock_irq(&xas);
		mapping->nrpages -= nr_none;
		xas_unlock_irq(&xas);
		shmem_uncharge(mapping->host, nr_none);
	}

	/* Unlock the old folios, return them to the LRU and drop our pin. */
	list_for_each_entry_safe(folio, tmp, &pagelist, lru) {
		list_del(&folio->lru);
		folio_unlock(folio);
		folio_putback_lru(folio);
		folio_put(folio);
	}
	/*
	 * Undo the updates of filemap_nr_thps_inc for non-SHMEM
	 * file only. This undo is not needed unless failure is
	 * due to SCAN_COPY_MC.
	 */
	if (!is_shmem && result == SCAN_COPY_MC) {
		filemap_nr_thps_dec(mapping);
		/*
		 * Paired with the fence in do_dentry_open() -> get_write_access()
		 * to ensure the update to nr_thps is visible.
		 */
		smp_mb();
	}

	new_folio->mapping = NULL;

	folio_unlock(new_folio);
	folio_put(new_folio);
out:
	VM_BUG_ON(!list_empty(&pagelist));
	trace_mm_khugepaged_collapse_file(mm, new_folio, index, addr, is_shmem, file, HPAGE_PMD_NR, result);
	return result;
}

/*
 * hpage_collapse_scan_file - check a file range and collapse it if suitable
 * @mm: address space, passed on to collapse_file() and the tracepoint
 * @addr: user address of the range, passed on to collapse_file()
 * @file: file whose page cache is examined
 * @start: page cache index of the first page of the range
 * @cc: collapse context; its node_load and alloc_nmask are reset here
 *
 * Walks the page cache entries from @start to @start + HPAGE_PMD_NR - 1
 * under rcu_read_lock(), counting value entries (as swap) and present pages
 * and stopping at the first folio that fails a check.  If the walk ends
 * with SCAN_SUCCEED, collapse_file() is called, unless this is khugepaged
 * and too few pages are present.
 *
 * Return: a SCAN_* result code.
 */
static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr,
				    struct file *file, pgoff_t start,
				    struct collapse_control *cc)
{
	struct folio *folio = NULL;
	struct address_space *mapping = file->f_mapping;
	XA_STATE(xas, &mapping->i_pages, start);
	int present, swap;
	int node = NUMA_NO_NODE;
	int result = SCAN_SUCCEED;

	present = 0;
	swap = 0;
	memset(cc->node_load, 0, sizeof(cc->node_load));
	nodes_clear(cc->alloc_nmask);
	rcu_read_lock();
	xas_for_each(&xas, folio, start + HPAGE_PMD_NR - 1) {
		if (xas_retry(&xas, folio))
			continue;

		if (xa_is_value(folio)) {
			/* A value entry accounts for 1 << order pages. */
			swap += 1 << xas_get_order(&xas);
			/* The swap limit is enforced for khugepaged only. */
			if (cc->is_khugepaged &&
			    swap > khugepaged_max_ptes_swap) {
				result = SCAN_EXCEED_SWAP_PTE;
				count_vm_event(THP_SCAN_EXCEED_SWAP_PTE);
				break;
			}
			continue;
		}

		if (folio_order(folio) == HPAGE_PMD_ORDER &&
		    folio->index == start) {
			/* Maybe
PMD-mapped */
			result = SCAN_PTE_MAPPED_HUGEPAGE;
			/*
			 * For SCAN_PTE_MAPPED_HUGEPAGE, further processing
			 * by the caller won't touch the page cache, and so
			 * it's safe to skip LRU and refcount checks before
			 * returning.
			 */
			break;
		}

		node = folio_nid(folio);
		if (hpage_collapse_scan_abort(node, cc)) {
			result = SCAN_SCAN_ABORT;
			break;
		}
		cc->node_load[node]++;

		if (!folio_test_lru(folio)) {
			result = SCAN_PAGE_LRU;
			break;
		}

		if (!is_refcount_suitable(folio)) {
			result = SCAN_PAGE_COUNT;
			break;
		}

		/*
		 * We probably should check if the folio is referenced
		 * here, but nobody would transfer pte_young() to
		 * folio_test_referenced() for us.  And rmap walk here
		 * is just too costly...
		 */

		present += folio_nr_pages(folio);

		if (need_resched()) {
			xas_pause(&xas);
			cond_resched_rcu();
		}
	}
	rcu_read_unlock();

	if (result == SCAN_SUCCEED) {
		/* The minimum-present-pages limit applies to khugepaged only. */
		if (cc->is_khugepaged &&
		    present < HPAGE_PMD_NR - khugepaged_max_ptes_none) {
			result = SCAN_EXCEED_NONE_PTE;
			count_vm_event(THP_SCAN_EXCEED_NONE_PTE);
		} else {
			result = collapse_file(mm, addr, file, start, cc);
		}
	}

	trace_mm_khugepaged_scan_file(mm, folio, file, present, swap, result);
	return result;
}
#else
/*
 * Alternative definition for the other arm of a preprocessor conditional
 * whose opening #if is outside this excerpt.  NOTE(review): presumably it
 * is never meant to be reached, since its whole body is BUILD_BUG().
 */
static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr,
				    struct file *file, pgoff_t start,
				    struct collapse_control *cc)
{
	BUILD_BUG();
}
#endif

/*
 * khugepaged_scan_mm_slot - scan part of one mm for collapse candidates
 * @pages: scan budget; the walk stops once the progress count reaches it
 * @result: set to SCAN_FAIL on entry, then to the outcome of each range
 *          that is scanned
 * @cc: collapse context, passed on to the scan helpers
 *
 * Must be called with khugepaged_mm_lock held.  The lock is dropped while
 * the mm is scanned and taken again before returning.  Scanning resumes
 * from khugepaged_scan.mm_slot and khugepaged_scan.address, or starts at
 * the first entry of khugepaged_scan.mm_head when no slot is recorded.
 *
 * Return: the progress made, which grows by HPAGE_PMD_NR for each
 * huge-page-sized range scanned and by one for cheaper steps such as
 * skipping a VMA.
 */
static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result,
					    struct collapse_control *cc)
	__releases(&khugepaged_mm_lock)
	__acquires(&khugepaged_mm_lock)
{
	struct vma_iterator vmi;
	struct khugepaged_mm_slot *mm_slot;
	struct mm_slot *slot;
	struct mm_struct *mm;
	struct vm_area_struct *vma;
	int progress = 0;

	VM_BUG_ON(!pages);
	lockdep_assert_held(&khugepaged_mm_lock);
	*result = SCAN_FAIL;

	if (khugepaged_scan.mm_slot) {
		mm_slot = khugepaged_scan.mm_slot;
		slot = &mm_slot->slot;
	} else {
		/* No slot recorded: start over from the head of the list. */
		slot = list_entry(khugepaged_scan.mm_head.next,
				     struct mm_slot, mm_node);
		mm_slot = mm_slot_entry(slot, struct khugepaged_mm_slot, slot);
		khugepaged_scan.address = 0;
		khugepaged_scan.mm_slot =
mm_slot;
	}
	spin_unlock(&khugepaged_mm_lock);

	mm = slot->mm;
	/*
	 * Don't wait for semaphore (to avoid long wait times).  Just move to
	 * the next mm on the list.
	 */
	/*
	 * vma stays NULL if the trylock fails; the "!vma" test after the
	 * breakouterloop_mmap_lock label then releases this mm_slot.
	 */
	vma = NULL;
	if (unlikely(!mmap_read_trylock(mm)))
		goto breakouterloop_mmap_lock;

	progress++;
	if (unlikely(hpage_collapse_test_exit_or_disable(mm)))
		goto breakouterloop;

	vma_iter_init(&vmi, mm, khugepaged_scan.address);
	for_each_vma(vmi, vma) {
		unsigned long hstart, hend;

		cond_resched();
		if (unlikely(hpage_collapse_test_exit_or_disable(mm))) {
			progress++;
			break;
		}
		if (!thp_vma_allowable_order(vma, vma->vm_flags,
					TVA_ENFORCE_SYSFS, PMD_ORDER)) {
			/* Also the target of the "goto skip" further down. */
skip:
			progress++;
			continue;
		}
		/* Huge-page-aligned sub-range of the VMA: [hstart, hend). */
		hstart = round_up(vma->vm_start, HPAGE_PMD_SIZE);
		hend = round_down(vma->vm_end, HPAGE_PMD_SIZE);
		if (khugepaged_scan.address > hend)
			goto skip;
		if (khugepaged_scan.address < hstart)
			khugepaged_scan.address = hstart;
		VM_BUG_ON(khugepaged_scan.address & ~HPAGE_PMD_MASK);

		while (khugepaged_scan.address < hend) {
			bool mmap_locked = true;

			cond_resched();
			if (unlikely(hpage_collapse_test_exit_or_disable(mm)))
				goto breakouterloop;

			VM_BUG_ON(khugepaged_scan.address < hstart ||
				  khugepaged_scan.address + HPAGE_PMD_SIZE >
				  hend);
			if (IS_ENABLED(CONFIG_SHMEM) && !vma_is_anonymous(vma)) {
				/*
				 * A reference to the file is taken before
				 * mmap_lock is dropped and released again
				 * with fput() after the scan.
				 */
				struct file *file = get_file(vma->vm_file);
				pgoff_t pgoff = linear_page_index(vma,
						khugepaged_scan.address);

				mmap_read_unlock(mm);
				mmap_locked = false;
				*result = hpage_collapse_scan_file(mm,
					khugepaged_scan.address, file, pgoff, cc);
				fput(file);
				if (*result == SCAN_PTE_MAPPED_HUGEPAGE) {
					mmap_read_lock(mm);
					if (hpage_collapse_test_exit_or_disable(mm))
						goto breakouterloop;
					*result = collapse_pte_mapped_thp(mm,
						khugepaged_scan.address, false);
					/* Already PMD mapped counts as success. */
					if (*result == SCAN_PMD_MAPPED)
						*result = SCAN_SUCCEED;
					mmap_read_unlock(mm);
				}
			} else {
				*result = hpage_collapse_scan_pmd(mm, vma,
					khugepaged_scan.address, &mmap_locked, cc);
			}

			if (*result == SCAN_SUCCEED)
				++khugepaged_pages_collapsed;

			/* move to next address */
			khugepaged_scan.address += HPAGE_PMD_SIZE;
			progress +=
HPAGE_PMD_NR;
			if (!mmap_locked)
				/*
				 * We released mmap_lock so break loop.  Note
				 * that we drop mmap_lock before all hugepage
				 * allocations, so if allocation fails, we are
				 * guaranteed to break here and report the
				 * correct result back to caller.
				 */
				goto breakouterloop_mmap_lock;
			if (progress >= pages)
				goto breakouterloop;
		}
	}
breakouterloop:
	mmap_read_unlock(mm); /* exit_mmap will destroy ptes after this */
breakouterloop_mmap_lock:

	spin_lock(&khugepaged_mm_lock);
	VM_BUG_ON(khugepaged_scan.mm_slot != mm_slot);
	/*
	 * Release the current mm_slot if this mm is about to die, or
	 * if we scanned all vmas of this mm.
	 */
	if (hpage_collapse_test_exit(mm) || !vma) {
		/*
		 * Make sure that if mm_users is reaching zero while
		 * khugepaged runs here, khugepaged_exit will find
		 * mm_slot not pointing to the exiting mm.
		 */
		if (slot->mm_node.next != &khugepaged_scan.mm_head) {
			slot = list_entry(slot->mm_node.next,
					  struct mm_slot, mm_node);
			khugepaged_scan.mm_slot =
				mm_slot_entry(slot, struct khugepaged_mm_slot, slot);
			khugepaged_scan.address = 0;
		} else {
			/* End of the list reached: one full scan completed. */
			khugepaged_scan.mm_slot = NULL;
			khugepaged_full_scans++;
		}

		collect_mm_slot(mm_slot);
	}

	return progress;
}

/*
 * Return non-zero when khugepaged_scan.mm_head is not empty and
 * hugepage_pmd_enabled() is true.
 */
static int khugepaged_has_work(void)
{
	return !list_empty(&khugepaged_scan.mm_head) && hugepage_pmd_enabled();
}

/*
 * Return non-zero when khugepaged_scan.mm_head is not empty or the thread
 * has been asked to stop.
 */
static int khugepaged_wait_event(void)
{
	return !list_empty(&khugepaged_scan.mm_head) ||
		kthread_should_stop();
}

/*
 * khugepaged_do_scan - run one scan pass
 * @cc: collapse context handed to khugepaged_scan_mm_slot()
 *
 * Calls khugepaged_scan_mm_slot() repeatedly until the budget read from
 * khugepaged_pages_to_scan is used up.  The pass also ends when the thread
 * is asked to stop, when there is no work or the scan position has been
 * found unset for the second time, or on the second
 * SCAN_ALLOC_HUGE_PAGE_FAIL result.
 */
static void khugepaged_do_scan(struct collapse_control *cc)
{
	unsigned int progress = 0, pass_through_head = 0;
	unsigned int pages = READ_ONCE(khugepaged_pages_to_scan);
	/* True until the one permitted sleep after a failed allocation. */
	bool wait = true;
	int result = SCAN_SUCCEED;

	lru_add_drain_all();

	while (true) {
		cond_resched();

		if (unlikely(kthread_should_stop()))
			break;

		spin_lock(&khugepaged_mm_lock);
		/* No current slot means the next scan starts from the list head. */
		if (!khugepaged_scan.mm_slot)
			pass_through_head++;
		if (khugepaged_has_work() &&
		    pass_through_head < 2)
			progress += khugepaged_scan_mm_slot(pages - progress,
							    &result, cc);
		else
			/* Nothing (more) to do: force the exit test below. */
			progress = pages;
		spin_unlock(&khugepaged_mm_lock);

		if (progress >= pages)
			break;

		if (result ==
SCAN_ALLOC_HUGE_PAGE_FAIL) {
			/*
			 * If fail to allocate the first time, try to sleep for
			 * a while.  When hit again, cancel the scan.
			 */
			if (!wait)
				break;
			wait = false;
			khugepaged_alloc_sleep();
		}
	}
}

/*
 * Wake-up condition for the timed sleep in khugepaged_wait_work(): true
 * when the thread should stop or khugepaged_sleep_expire has been reached.
 */
static bool khugepaged_should_wakeup(void)
{
	return kthread_should_stop() ||
	       time_after_eq(jiffies, khugepaged_sleep_expire);
}

/*
 * khugepaged_wait_work - sleep between two scan passes
 *
 * With work pending, sleeps for at most khugepaged_scan_sleep_millisecs
 * (and returns at once if that converts to zero jiffies).  Without work,
 * and only if hugepage_pmd_enabled(), waits until khugepaged_wait_event()
 * becomes true.
 */
static void khugepaged_wait_work(void)
{
	if (khugepaged_has_work()) {
		const unsigned long scan_sleep_jiffies =
			msecs_to_jiffies(khugepaged_scan_sleep_millisecs);

		if (!scan_sleep_jiffies)
			return;

		khugepaged_sleep_expire = jiffies + scan_sleep_jiffies;
		wait_event_freezable_timeout(khugepaged_wait,
					     khugepaged_should_wakeup(),
					     scan_sleep_jiffies);
		return;
	}

	if (hugepage_pmd_enabled())
		wait_event_freezable(khugepaged_wait, khugepaged_wait_event());
}

/*
 * khugepaged - thread function of the khugepaged kernel thread
 * @none: not used
 *
 * Alternates khugepaged_do_scan() and khugepaged_wait_work() until
 * kthread_should_stop().  On the way out, the current scan slot (if any)
 * is cleared and passed to collect_mm_slot() under khugepaged_mm_lock.
 *
 * Return: always 0.
 */
static int khugepaged(void *none)
{
	struct khugepaged_mm_slot *mm_slot;

	set_freezable();
	set_user_nice(current, MAX_NICE);

	while (!kthread_should_stop()) {
		khugepaged_do_scan(&khugepaged_collapse_control);
		khugepaged_wait_work();
	}

	spin_lock(&khugepaged_mm_lock);
	mm_slot = khugepaged_scan.mm_slot;
	khugepaged_scan.mm_slot = NULL;
	if (mm_slot)
		collect_mm_slot(mm_slot);
	spin_unlock(&khugepaged_mm_lock);
	return 0;
}

/*
 * set_recommended_min_free_kbytes - raise min_free_kbytes to suit THP
 *
 * When hugepage_pmd_enabled() is false, calculate_min_free_kbytes() is
 * called instead of the computation below.  Otherwise a recommended value
 * is computed from the number of populated zones and pageblock_nr_pages,
 * and min_free_kbytes is raised to it if it is currently lower.  Both
 * paths end in setup_per_zone_wmarks().
 */
static void set_recommended_min_free_kbytes(void)
{
	struct zone *zone;
	int nr_zones = 0;
	unsigned long recommended_min;

	if (!hugepage_pmd_enabled()) {
		calculate_min_free_kbytes();
		goto update_wmarks;
	}

	for_each_populated_zone(zone) {
		/*
		 * We don't need to worry about fragmentation of
		 * ZONE_MOVABLE since it only has movable pages.
		 */
		if (zone_idx(zone) > gfp_zone(GFP_USER))
			continue;

		nr_zones++;
	}

	/* Ensure 2 pageblocks are free to assist fragmentation avoidance */
	recommended_min = pageblock_nr_pages * nr_zones * 2;

	/*
	 * Make sure that on average at least two pageblocks are almost free
	 * of another type, one for a migratetype to fall back to and a
	 * second to avoid subsequent fallbacks of other types There are 3
	 * MIGRATE_TYPES we care about.
*/
	recommended_min += pageblock_nr_pages * nr_zones *
			   MIGRATE_PCPTYPES * MIGRATE_PCPTYPES;

	/* don't ever allow to reserve more than 5% of the lowmem */
	recommended_min = min(recommended_min,
			      (unsigned long) nr_free_buffer_pages() / 20);
	/* Convert from pages to kilobytes. */
	recommended_min <<= (PAGE_SHIFT-10);

	if (recommended_min > min_free_kbytes) {
		/* The message is printed only if user_min_free_kbytes >= 0. */
		if (user_min_free_kbytes >= 0)
			pr_info("raising min_free_kbytes from %d to %lu to help transparent hugepage allocations\n",
				min_free_kbytes, recommended_min);

		min_free_kbytes = recommended_min;
	}

update_wmarks:
	setup_per_zone_wmarks();
}

/*
 * start_stop_khugepaged - start or stop the khugepaged thread as needed
 *
 * Under khugepaged_mutex: if hugepage_pmd_enabled(), the thread is created
 * when it does not exist yet and woken when there is something on the scan
 * list; otherwise a running thread is stopped.  Afterwards
 * set_recommended_min_free_kbytes() is called, except when creating the
 * thread failed.
 *
 * Return: 0, or the error from kthread_run().
 */
int start_stop_khugepaged(void)
{
	int err = 0;

	mutex_lock(&khugepaged_mutex);
	if (hugepage_pmd_enabled()) {
		if (!khugepaged_thread)
			khugepaged_thread = kthread_run(khugepaged, NULL,
							"khugepaged");
		if (IS_ERR(khugepaged_thread)) {
			pr_err("khugepaged: kthread_run(khugepaged) failed\n");
			err = PTR_ERR(khugepaged_thread);
			/* Reset so that a later call tries to create it again. */
			khugepaged_thread = NULL;
			goto fail;
		}

		if (!list_empty(&khugepaged_scan.mm_head))
			wake_up_interruptible(&khugepaged_wait);
	} else if (khugepaged_thread) {
		kthread_stop(khugepaged_thread);
		khugepaged_thread = NULL;
	}
	set_recommended_min_free_kbytes();
fail:
	mutex_unlock(&khugepaged_mutex);
	return err;
}

/*
 * Recompute the recommended min_free_kbytes under khugepaged_mutex, but
 * only while hugepage_pmd_enabled() and the khugepaged thread exists.
 */
void khugepaged_min_free_kbytes_update(void)
{
	mutex_lock(&khugepaged_mutex);
	if (hugepage_pmd_enabled() && khugepaged_thread)
		set_recommended_min_free_kbytes();
	mutex_unlock(&khugepaged_mutex);
}

/* Return true when the thread function of current is khugepaged(). */
bool current_is_khugepaged(void)
{
	return kthread_func(current) == khugepaged;
}

/*
 * madvise_collapse_errno - translate a scan result into a negative errno
 * @r: scan result of the last failed collapse attempt
 *
 * Return: -ENOMEM, -EBUSY, -EAGAIN or -EINVAL, as selected by the switch
 * in the body.
 */
static int madvise_collapse_errno(enum scan_result r)
{
	/*
	 * MADV_COLLAPSE breaks from existing madvise(2) conventions to provide
	 * actionable feedback to caller, so they may take an appropriate
	 * fallback measure depending on the nature of the failure.
*/
	switch (r) {
	case SCAN_ALLOC_HUGE_PAGE_FAIL:
		return -ENOMEM;
	case SCAN_CGROUP_CHARGE_FAIL:
	case SCAN_EXCEED_NONE_PTE:
		return -EBUSY;
	/* Resource temporary unavailable - trying again might succeed */
	case SCAN_PAGE_COUNT:
	case SCAN_PAGE_LOCK:
	case SCAN_PAGE_LRU:
	case SCAN_DEL_PAGE_LRU:
	case SCAN_PAGE_FILLED:
		return -EAGAIN;
	/*
	 * Other: Trying again likely not to succeed / error intrinsic to
	 * specified memory range. khugepaged likely won't be able to collapse
	 * either.
	 */
	default:
		return -EINVAL;
	}
}

/*
 * madvise_collapse - try to collapse every huge-page range in [start, end)
 * @vma: VMA that contains the range (BUG_ON() otherwise)
 * @prev: set to @vma on entry and to NULL once mmap_lock has been dropped
 * @start: start of the address range; rounded up to HPAGE_PMD_SIZE
 * @end: end of the address range; rounded down to HPAGE_PMD_SIZE
 *
 * mmap_lock is held on entry and on return, but may be dropped and
 * re-taken in between.
 *
 * Return: 0 if every range was collapsed or was already PMD mapped;
 * -EINVAL if the VMA does not allow a PMD-order THP; -ENOMEM if the
 * collapse context cannot be allocated; otherwise the errno chosen by
 * madvise_collapse_errno() for the last failure.
 */
int madvise_collapse(struct vm_area_struct *vma, struct vm_area_struct **prev,
		     unsigned long start, unsigned long end)
{
	struct collapse_control *cc;
	struct mm_struct *mm = vma->vm_mm;
	unsigned long hstart, hend, addr;
	/* thps counts the ranges that ended up PMD mapped. */
	int thps = 0, last_fail = SCAN_FAIL;
	bool mmap_locked = true;

	BUG_ON(vma->vm_start > start);
	BUG_ON(vma->vm_end < end);

	*prev = vma;

	if (!thp_vma_allowable_order(vma, vma->vm_flags, 0, PMD_ORDER))
		return -EINVAL;

	cc = kmalloc(sizeof(*cc), GFP_KERNEL);
	if (!cc)
		return -ENOMEM;
	cc->is_khugepaged = false;

	mmgrab(mm);
	lru_add_drain_all();

	/* Round start up and end down to huge-page boundaries. */
	hstart = (start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
	hend = end & HPAGE_PMD_MASK;

	for (addr = hstart; addr < hend; addr += HPAGE_PMD_SIZE) {
		int result = SCAN_FAIL;

		/*
		 * The previous iteration dropped mmap_lock: take it again
		 * and revalidate the VMA before using it.
		 */
		if (!mmap_locked) {
			cond_resched();
			mmap_read_lock(mm);
			mmap_locked = true;
			result = hugepage_vma_revalidate(mm, addr, false, &vma,
							 cc);
			if (result  != SCAN_SUCCEED) {
				last_fail = result;
				goto out_nolock;
			}

			hend = min(hend, vma->vm_end & HPAGE_PMD_MASK);
		}
		mmap_assert_locked(mm);
		memset(cc->node_load, 0, sizeof(cc->node_load));
		nodes_clear(cc->alloc_nmask);
		if (IS_ENABLED(CONFIG_SHMEM) && !vma_is_anonymous(vma)) {
			struct file *file = get_file(vma->vm_file);
			pgoff_t pgoff = linear_page_index(vma, addr);

			mmap_read_unlock(mm);
			mmap_locked = false;
			result = hpage_collapse_scan_file(mm, addr, file, pgoff,
							  cc);
			fput(file);
		} else {
			result = hpage_collapse_scan_pmd(mm, vma, addr,
							 &mmap_locked, cc);
		}
		if (!mmap_locked)
			*prev = NULL;  /* Tell caller we dropped mmap_lock
*/

handle_result:
		switch (result) {
		case SCAN_SUCCEED:
		case SCAN_PMD_MAPPED:
			++thps;
			break;
		case SCAN_PTE_MAPPED_HUGEPAGE:
			BUG_ON(mmap_locked);
			BUG_ON(*prev);
			mmap_read_lock(mm);
			result = collapse_pte_mapped_thp(mm, addr, true);
			mmap_read_unlock(mm);
			/* Classify the new result with this same switch. */
			goto handle_result;
		/* Whitelisted set of results where continuing OK */
		case SCAN_PMD_NULL:
		case SCAN_PTE_NON_PRESENT:
		case SCAN_PTE_UFFD_WP:
		case SCAN_PAGE_RO:
		case SCAN_LACK_REFERENCED_PAGE:
		case SCAN_PAGE_NULL:
		case SCAN_PAGE_COUNT:
		case SCAN_PAGE_LOCK:
		case SCAN_PAGE_COMPOUND:
		case SCAN_PAGE_LRU:
		case SCAN_DEL_PAGE_LRU:
			last_fail = result;
			break;
		default:
			last_fail = result;
			/* Other error, exit */
			goto out_maybelock;
		}
	}

out_maybelock:
	/* Caller expects us to hold mmap_lock on return */
	if (!mmap_locked)
		mmap_read_lock(mm);
out_nolock:
	mmap_assert_locked(mm);
	mmdrop(mm);
	kfree(cc);

	/*
	 * Success only if the number of ranges that ended up PMD mapped
	 * equals the number of huge-page ranges in [hstart, hend).
	 */
	return thps == ((hend - hstart) >> HPAGE_PMD_SHIFT) ? 0
			: madvise_collapse_errno(last_fail);
}
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 
528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 
1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 11