| 22 4 5 14 19 20 6 6 1 5 5 1 2 1 1 2 1 2 5 5 2 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 | // SPDX-License-Identifier: GPL-2.0+ /* * NILFS regular file handling primitives including fsync(). * * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. * * Written by Amagai Yoshiji and Ryusuke Konishi. */ #include <linux/fs.h> #include <linux/filelock.h> #include <linux/mm.h> #include <linux/writeback.h> #include "nilfs.h" #include "segment.h" int nilfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) { /* * Called from fsync() system call * This is the only entry point that can catch write and synch * timing for both data blocks and intermediate blocks. * * This function should be implemented when the writeback function * will be implemented. */ struct the_nilfs *nilfs; struct inode *inode = file->f_mapping->host; int err = 0; if (nilfs_inode_dirty(inode)) { if (datasync) err = nilfs_construct_dsync_segment(inode->i_sb, inode, start, end); else err = nilfs_construct_segment(inode->i_sb); } nilfs = inode->i_sb->s_fs_info; if (!err) err = nilfs_flush_device(nilfs); return err; } static vm_fault_t nilfs_page_mkwrite(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; struct folio *folio = page_folio(vmf->page); struct inode *inode = file_inode(vma->vm_file); struct nilfs_transaction_info ti; struct buffer_head *bh, *head; int ret = 0; if (unlikely(nilfs_near_disk_full(inode->i_sb->s_fs_info))) return VM_FAULT_SIGBUS; /* -ENOSPC */ sb_start_pagefault(inode->i_sb); folio_lock(folio); if (folio->mapping != inode->i_mapping || folio_pos(folio) >= i_size_read(inode) || !folio_test_uptodate(folio)) { folio_unlock(folio); ret = -EFAULT; /* make the VM retry the fault */ goto out; } /* * check to see if the folio is mapped already (no holes) */ if (folio_test_mappedtodisk(folio)) goto mapped; head = folio_buffers(folio); if (head) { int fully_mapped = 1; bh = head; do { if (!buffer_mapped(bh)) { fully_mapped = 0; break; } } while (bh = bh->b_this_page, bh != head); if (fully_mapped) { folio_set_mappedtodisk(folio); goto mapped; } } folio_unlock(folio); /* * fill hole blocks */ ret = nilfs_transaction_begin(inode->i_sb, &ti, 1); /* never returns -ENOMEM, but may return -ENOSPC */ if (unlikely(ret)) goto out; file_update_time(vma->vm_file); ret = block_page_mkwrite(vma, vmf, nilfs_get_block); if (ret) { nilfs_transaction_abort(inode->i_sb); goto out; } nilfs_set_file_dirty(inode, 1 << (PAGE_SHIFT - inode->i_blkbits)); nilfs_transaction_commit(inode->i_sb); mapped: /* * Since checksumming including data blocks is performed to determine * the validity of the log to be written and used for recovery, it is * necessary to wait for writeback to finish here, regardless of the * stable write requirement of the backing device. */ folio_wait_writeback(folio); out: sb_end_pagefault(inode->i_sb); return vmf_fs_error(ret); } static const struct vm_operations_struct nilfs_file_vm_ops = { .fault = filemap_fault, .map_pages = filemap_map_pages, .page_mkwrite = nilfs_page_mkwrite, }; static int nilfs_file_mmap_prepare(struct vm_area_desc *desc) { file_accessed(desc->file); desc->vm_ops = &nilfs_file_vm_ops; return 0; } /* * We have mostly NULL's here: the current defaults are ok for * the nilfs filesystem. */ const struct file_operations nilfs_file_operations = { .llseek = generic_file_llseek, .read_iter = generic_file_read_iter, .write_iter = generic_file_write_iter, .unlocked_ioctl = nilfs_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = nilfs_compat_ioctl, #endif /* CONFIG_COMPAT */ .mmap_prepare = nilfs_file_mmap_prepare, .open = generic_file_open, /* .release = nilfs_release_file, */ .fsync = nilfs_sync_file, .splice_read = filemap_splice_read, .splice_write = iter_file_splice_write, .setlease = generic_setlease, }; const struct inode_operations nilfs_file_inode_operations = { .setattr = nilfs_setattr, .permission = nilfs_permission, .fiemap = nilfs_fiemap, .fileattr_get = nilfs_fileattr_get, .fileattr_set = nilfs_fileattr_set, }; /* end of file */ |
| 11 6249 6249 32 6163 4 6003 1 5231 5241 628 4 6158 599 16 16 10 16 5799 193 6159 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef INT_BLK_MQ_H #define INT_BLK_MQ_H #include <linux/blk-mq.h> #include "blk-stat.h" struct blk_mq_tag_set; struct elevator_tags; struct blk_mq_ctxs { struct kobject kobj; struct blk_mq_ctx __percpu *queue_ctx; }; /** * struct blk_mq_ctx - State for a software queue facing the submitting CPUs */ struct blk_mq_ctx { struct { spinlock_t lock; struct list_head rq_lists[HCTX_MAX_TYPES]; } ____cacheline_aligned_in_smp; unsigned int cpu; unsigned short index_hw[HCTX_MAX_TYPES]; struct blk_mq_hw_ctx *hctxs[HCTX_MAX_TYPES]; struct request_queue *queue; struct blk_mq_ctxs *ctxs; struct kobject kobj; } ____cacheline_aligned_in_smp; enum { BLK_MQ_NO_TAG = -1U, BLK_MQ_TAG_MIN = 1, BLK_MQ_TAG_MAX = BLK_MQ_NO_TAG - 1, }; #define BLK_MQ_CPU_WORK_BATCH (8) typedef unsigned int __bitwise blk_insert_t; #define BLK_MQ_INSERT_AT_HEAD ((__force blk_insert_t)0x01) void blk_mq_submit_bio(struct bio *bio); int blk_mq_poll(struct request_queue *q, blk_qc_t cookie, struct io_comp_batch *iob, unsigned int flags); void blk_mq_exit_queue(struct request_queue *q); struct elevator_tags *blk_mq_update_nr_requests(struct request_queue *q, struct elevator_tags *tags, unsigned int nr); void blk_mq_wake_waiters(struct request_queue *q); bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *, bool); void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list); struct request *blk_mq_dequeue_from_ctx(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *start); void blk_mq_put_rq_ref(struct request *rq); /* * Internal helpers for allocating/freeing the request map */ void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, unsigned int hctx_idx); void blk_mq_free_rq_map(struct blk_mq_tag_set *set, struct blk_mq_tags *tags); struct blk_mq_tags *blk_mq_alloc_map_and_rqs(struct blk_mq_tag_set *set, unsigned int hctx_idx, unsigned int depth); void blk_mq_free_map_and_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, unsigned int hctx_idx); /* * CPU -> queue mappings */ extern int blk_mq_hw_queue_to_node(struct blk_mq_queue_map *qmap, unsigned int); /* * blk_mq_map_queue_type() - map (hctx_type,cpu) to hardware queue * @q: request queue * @type: the hctx type index * @cpu: CPU */ static inline struct blk_mq_hw_ctx *blk_mq_map_queue_type(struct request_queue *q, enum hctx_type type, unsigned int cpu) { return queue_hctx((q), (q->tag_set->map[type].mq_map[cpu])); } static inline enum hctx_type blk_mq_get_hctx_type(blk_opf_t opf) { enum hctx_type type = HCTX_TYPE_DEFAULT; /* * The caller ensure that if REQ_POLLED, poll must be enabled. */ if (opf & REQ_POLLED) type = HCTX_TYPE_POLL; else if ((opf & REQ_OP_MASK) == REQ_OP_READ) type = HCTX_TYPE_READ; return type; } /* * blk_mq_map_queue() - map (cmd_flags,type) to hardware queue * @opf: operation type (REQ_OP_*) and flags (e.g. REQ_POLLED). * @ctx: software queue cpu ctx */ static inline struct blk_mq_hw_ctx *blk_mq_map_queue(blk_opf_t opf, struct blk_mq_ctx *ctx) { return ctx->hctxs[blk_mq_get_hctx_type(opf)]; } /* * Default to double of smaller one between hw queue_depth and * 128, since we don't split into sync/async like the old code * did. Additionally, this is a per-hw queue depth. */ static inline unsigned int blk_mq_default_nr_requests( struct blk_mq_tag_set *set) { return 2 * min_t(unsigned int, set->queue_depth, BLKDEV_DEFAULT_RQ); } /* * sysfs helpers */ extern void blk_mq_sysfs_init(struct request_queue *q); extern void blk_mq_sysfs_deinit(struct request_queue *q); int blk_mq_sysfs_register(struct gendisk *disk); void blk_mq_sysfs_unregister(struct gendisk *disk); int blk_mq_sysfs_register_hctxs(struct request_queue *q); void blk_mq_sysfs_unregister_hctxs(struct request_queue *q); extern void blk_mq_hctx_kobj_init(struct blk_mq_hw_ctx *hctx); void blk_mq_free_plug_rqs(struct blk_plug *plug); void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule); void blk_mq_cancel_work_sync(struct request_queue *q); void blk_mq_release(struct request_queue *q); static inline struct blk_mq_ctx *__blk_mq_get_ctx(struct request_queue *q, unsigned int cpu) { return per_cpu_ptr(q->queue_ctx, cpu); } /* * This assumes per-cpu software queueing queues. They could be per-node * as well, for instance. For now this is hardcoded as-is. Note that we don't * care about preemption, since we know the ctx's are persistent. This does * mean that we can't rely on ctx always matching the currently running CPU. */ static inline struct blk_mq_ctx *blk_mq_get_ctx(struct request_queue *q) { return __blk_mq_get_ctx(q, raw_smp_processor_id()); } struct blk_mq_alloc_data { /* input parameter */ struct request_queue *q; blk_mq_req_flags_t flags; unsigned int shallow_depth; blk_opf_t cmd_flags; req_flags_t rq_flags; /* allocate multiple requests/tags in one go */ unsigned int nr_tags; struct rq_list *cached_rqs; /* input & output parameter */ struct blk_mq_ctx *ctx; struct blk_mq_hw_ctx *hctx; }; struct blk_mq_tags *blk_mq_init_tags(unsigned int nr_tags, unsigned int reserved_tags, unsigned int flags, int node); void blk_mq_free_tags(struct blk_mq_tag_set *set, struct blk_mq_tags *tags); unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data); unsigned long blk_mq_get_tags(struct blk_mq_alloc_data *data, int nr_tags, unsigned int *offset); void blk_mq_put_tag(struct blk_mq_tags *tags, struct blk_mq_ctx *ctx, unsigned int tag); void blk_mq_put_tags(struct blk_mq_tags *tags, int *tag_array, int nr_tags); void blk_mq_tag_resize_shared_tags(struct blk_mq_tag_set *set, unsigned int size); void blk_mq_tag_update_sched_shared_tags(struct request_queue *q, unsigned int nr); void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags, bool); void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_tag_iter_fn *fn, void *priv); void blk_mq_all_tag_iter(struct blk_mq_tags *tags, busy_tag_iter_fn *fn, void *priv); static inline struct sbq_wait_state *bt_wait_ptr(struct sbitmap_queue *bt, struct blk_mq_hw_ctx *hctx) { if (!hctx) return &bt->ws[0]; return sbq_wait_ptr(bt, &hctx->wait_index); } void __blk_mq_tag_busy(struct blk_mq_hw_ctx *); void __blk_mq_tag_idle(struct blk_mq_hw_ctx *); static inline void blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx) { if (hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED) __blk_mq_tag_busy(hctx); } static inline void blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx) { if (hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED) __blk_mq_tag_idle(hctx); } static inline bool blk_mq_tag_is_reserved(struct blk_mq_tags *tags, unsigned int tag) { return tag < tags->nr_reserved_tags; } static inline bool blk_mq_is_shared_tags(unsigned int flags) { return flags & BLK_MQ_F_TAG_HCTX_SHARED; } static inline struct blk_mq_tags *blk_mq_tags_from_data(struct blk_mq_alloc_data *data) { if (data->rq_flags & RQF_SCHED_TAGS) return data->hctx->sched_tags; return data->hctx->tags; } static inline bool blk_mq_hctx_stopped(struct blk_mq_hw_ctx *hctx) { /* Fast path: hardware queue is not stopped most of the time. */ if (likely(!test_bit(BLK_MQ_S_STOPPED, &hctx->state))) return false; /* * This barrier is used to order adding of dispatch list before and * the test of BLK_MQ_S_STOPPED below. Pairs with the memory barrier * in blk_mq_start_stopped_hw_queue() so that dispatch code could * either see BLK_MQ_S_STOPPED is cleared or dispatch list is not * empty to avoid missing dispatching requests. */ smp_mb(); return test_bit(BLK_MQ_S_STOPPED, &hctx->state); } static inline bool blk_mq_hw_queue_mapped(struct blk_mq_hw_ctx *hctx) { return hctx->nr_ctx && hctx->tags; } void blk_mq_in_driver_rw(struct block_device *part, unsigned int inflight[2]); static inline void blk_mq_put_dispatch_budget(struct request_queue *q, int budget_token) { if (q->mq_ops->put_budget) q->mq_ops->put_budget(q, budget_token); } static inline int blk_mq_get_dispatch_budget(struct request_queue *q) { if (q->mq_ops->get_budget) return q->mq_ops->get_budget(q); return 0; } static inline void blk_mq_set_rq_budget_token(struct request *rq, int token) { if (token < 0) return; if (rq->q->mq_ops->set_rq_budget_token) rq->q->mq_ops->set_rq_budget_token(rq, token); } static inline int blk_mq_get_rq_budget_token(struct request *rq) { if (rq->q->mq_ops->get_rq_budget_token) return rq->q->mq_ops->get_rq_budget_token(rq); return -1; } static inline void __blk_mq_add_active_requests(struct blk_mq_hw_ctx *hctx, int val) { if (blk_mq_is_shared_tags(hctx->flags)) atomic_add(val, &hctx->queue->nr_active_requests_shared_tags); else atomic_add(val, &hctx->nr_active); } static inline void __blk_mq_inc_active_requests(struct blk_mq_hw_ctx *hctx) { __blk_mq_add_active_requests(hctx, 1); } static inline void __blk_mq_sub_active_requests(struct blk_mq_hw_ctx *hctx, int val) { if (blk_mq_is_shared_tags(hctx->flags)) atomic_sub(val, &hctx->queue->nr_active_requests_shared_tags); else atomic_sub(val, &hctx->nr_active); } static inline void __blk_mq_dec_active_requests(struct blk_mq_hw_ctx *hctx) { __blk_mq_sub_active_requests(hctx, 1); } static inline void blk_mq_add_active_requests(struct blk_mq_hw_ctx *hctx, int val) { if (hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED) __blk_mq_add_active_requests(hctx, val); } static inline void blk_mq_inc_active_requests(struct blk_mq_hw_ctx *hctx) { if (hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED) __blk_mq_inc_active_requests(hctx); } static inline void blk_mq_sub_active_requests(struct blk_mq_hw_ctx *hctx, int val) { if (hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED) __blk_mq_sub_active_requests(hctx, val); } static inline void blk_mq_dec_active_requests(struct blk_mq_hw_ctx *hctx) { if (hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED) __blk_mq_dec_active_requests(hctx); } static inline int __blk_mq_active_requests(struct blk_mq_hw_ctx *hctx) { if (blk_mq_is_shared_tags(hctx->flags)) return atomic_read(&hctx->queue->nr_active_requests_shared_tags); return atomic_read(&hctx->nr_active); } static inline void __blk_mq_put_driver_tag(struct blk_mq_hw_ctx *hctx, struct request *rq) { blk_mq_dec_active_requests(hctx); blk_mq_put_tag(hctx->tags, rq->mq_ctx, rq->tag); rq->tag = BLK_MQ_NO_TAG; } static inline void blk_mq_put_driver_tag(struct request *rq) { if (rq->tag == BLK_MQ_NO_TAG || rq->internal_tag == BLK_MQ_NO_TAG) return; __blk_mq_put_driver_tag(rq->mq_hctx, rq); } bool __blk_mq_alloc_driver_tag(struct request *rq); static inline bool blk_mq_get_driver_tag(struct request *rq) { if (rq->tag == BLK_MQ_NO_TAG && !__blk_mq_alloc_driver_tag(rq)) return false; return true; } static inline void blk_mq_clear_mq_map(struct blk_mq_queue_map *qmap) { int cpu; for_each_possible_cpu(cpu) qmap->mq_map[cpu] = 0; } /* Free all requests on the list */ static inline void blk_mq_free_requests(struct list_head *list) { while (!list_empty(list)) { struct request *rq = list_entry_rq(list->next); list_del_init(&rq->queuelist); blk_mq_free_request(rq); } } /* * For shared tag users, we track the number of currently active users * and attempt to provide a fair share of the tag depth for each of them. */ static inline bool hctx_may_queue(struct blk_mq_hw_ctx *hctx, struct sbitmap_queue *bt) { unsigned int depth, users; if (!hctx || !(hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED)) return true; /* * Don't try dividing an ant */ if (bt->sb.depth == 1) return true; if (blk_mq_is_shared_tags(hctx->flags)) { struct request_queue *q = hctx->queue; if (!test_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags)) return true; } else { if (!test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state)) return true; } users = READ_ONCE(hctx->tags->active_queues); if (!users) return true; /* * Allow at least some tags */ depth = max((bt->sb.depth + users - 1) / users, 4U); return __blk_mq_active_requests(hctx) < depth; } /* run the code block in @dispatch_ops with rcu/srcu read lock held */ #define __blk_mq_run_dispatch_ops(q, check_sleep, dispatch_ops) \ do { \ if ((q)->tag_set->flags & BLK_MQ_F_BLOCKING) { \ struct blk_mq_tag_set *__tag_set = (q)->tag_set; \ int srcu_idx; \ \ might_sleep_if(check_sleep); \ srcu_idx = srcu_read_lock(__tag_set->srcu); \ (dispatch_ops); \ srcu_read_unlock(__tag_set->srcu, srcu_idx); \ } else { \ rcu_read_lock(); \ (dispatch_ops); \ rcu_read_unlock(); \ } \ } while (0) #define blk_mq_run_dispatch_ops(q, dispatch_ops) \ __blk_mq_run_dispatch_ops(q, true, dispatch_ops) \ static inline bool blk_mq_can_poll(struct request_queue *q) { return (q->limits.features & BLK_FEAT_POLL) && q->tag_set->map[HCTX_TYPE_POLL].nr_queues; } #endif |
| 59 59 60 60 59 1 15 1 5 5 6 10 9 2 1 10 1 6 14 6 8 12 2 1 2 1 2 2 1 3 3 1 2 2 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 | // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/hfsplus/options.c * * Copyright (C) 2001 * Brad Boyer (flar@allandria.com) * (C) 2003 Ardis Technologies <roman@ardistech.com> * * Option parsing */ #include <linux/string.h> #include <linux/kernel.h> #include <linux/sched.h> #include <linux/fs_struct.h> #include <linux/fs_context.h> #include <linux/fs_parser.h> #include <linux/nls.h> #include <linux/mount.h> #include <linux/seq_file.h> #include <linux/slab.h> #include "hfsplus_fs.h" enum { opt_creator, opt_type, opt_umask, opt_uid, opt_gid, opt_part, opt_session, opt_nls, opt_decompose, opt_barrier, opt_force, }; static const struct fs_parameter_spec hfs_param_spec[] = { fsparam_string ("creator", opt_creator), fsparam_string ("type", opt_type), fsparam_u32oct ("umask", opt_umask), fsparam_u32 ("uid", opt_uid), fsparam_u32 ("gid", opt_gid), fsparam_u32 ("part", opt_part), fsparam_u32 ("session", opt_session), fsparam_string ("nls", opt_nls), fsparam_flag_no ("decompose", opt_decompose), fsparam_flag_no ("barrier", opt_barrier), fsparam_flag ("force", opt_force), {} }; /* Initialize an options object to reasonable defaults */ void hfsplus_fill_defaults(struct hfsplus_sb_info *opts) { if (!opts) return; opts->creator = HFSPLUS_DEF_CR_TYPE; opts->type = HFSPLUS_DEF_CR_TYPE; opts->umask = current_umask(); opts->uid = current_uid(); opts->gid = current_gid(); opts->part = -1; opts->session = -1; } /* Parse options from mount. Returns nonzero errno on failure */ int hfsplus_parse_param(struct fs_context *fc, struct fs_parameter *param) { struct hfsplus_sb_info *sbi = fc->s_fs_info; struct fs_parse_result result; int opt; /* * Only the force option is examined during remount, all others * are ignored. */ if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE && strncmp(param->key, "force", 5)) return 0; opt = fs_parse(fc, hfs_param_spec, param, &result); if (opt < 0) return opt; switch (opt) { case opt_creator: if (strlen(param->string) != 4) { pr_err("creator requires a 4 character value\n"); return -EINVAL; } memcpy(&sbi->creator, param->string, 4); break; case opt_type: if (strlen(param->string) != 4) { pr_err("type requires a 4 character value\n"); return -EINVAL; } memcpy(&sbi->type, param->string, 4); break; case opt_umask: sbi->umask = (umode_t)result.uint_32; break; case opt_uid: sbi->uid = result.uid; set_bit(HFSPLUS_SB_UID, &sbi->flags); break; case opt_gid: sbi->gid = result.gid; set_bit(HFSPLUS_SB_GID, &sbi->flags); break; case opt_part: sbi->part = result.uint_32; break; case opt_session: sbi->session = result.uint_32; break; case opt_nls: if (sbi->nls) { pr_err("unable to change nls mapping\n"); return -EINVAL; } sbi->nls = load_nls(param->string); if (!sbi->nls) { pr_err("unable to load nls mapping \"%s\"\n", param->string); return -EINVAL; } break; case opt_decompose: if (result.negated) set_bit(HFSPLUS_SB_NODECOMPOSE, &sbi->flags); else clear_bit(HFSPLUS_SB_NODECOMPOSE, &sbi->flags); break; case opt_barrier: if (result.negated) set_bit(HFSPLUS_SB_NOBARRIER, &sbi->flags); else clear_bit(HFSPLUS_SB_NOBARRIER, &sbi->flags); break; case opt_force: set_bit(HFSPLUS_SB_FORCE, &sbi->flags); break; default: return -EINVAL; } return 0; } int hfsplus_show_options(struct seq_file *seq, struct dentry *root) { struct hfsplus_sb_info *sbi = HFSPLUS_SB(root->d_sb); if (sbi->creator != HFSPLUS_DEF_CR_TYPE) seq_show_option_n(seq, "creator", (char *)&sbi->creator, 4); if (sbi->type != HFSPLUS_DEF_CR_TYPE) seq_show_option_n(seq, "type", (char *)&sbi->type, 4); seq_printf(seq, ",umask=%o,uid=%u,gid=%u", sbi->umask, from_kuid_munged(&init_user_ns, sbi->uid), from_kgid_munged(&init_user_ns, sbi->gid)); if (sbi->part >= 0) seq_printf(seq, ",part=%u", sbi->part); if (sbi->session >= 0) seq_printf(seq, ",session=%u", sbi->session); if (sbi->nls) seq_printf(seq, ",nls=%s", sbi->nls->charset); if (test_bit(HFSPLUS_SB_NODECOMPOSE, &sbi->flags)) seq_puts(seq, ",nodecompose"); if (test_bit(HFSPLUS_SB_NOBARRIER, &sbi->flags)) seq_puts(seq, ",nobarrier"); return 0; } |
| 1 5 5 11 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_IF_MACVLAN_H #define _LINUX_IF_MACVLAN_H #include <linux/if_link.h> #include <linux/if_vlan.h> #include <linux/list.h> #include <linux/netdevice.h> #include <linux/netlink.h> #include <net/netlink.h> #include <linux/u64_stats_sync.h> struct macvlan_port; #define MACVLAN_MC_FILTER_BITS 8 #define MACVLAN_MC_FILTER_SZ (1 << MACVLAN_MC_FILTER_BITS) struct macvlan_dev { struct net_device *dev; struct list_head list; struct hlist_node hlist; struct macvlan_port *port; struct net_device *lowerdev; netdevice_tracker dev_tracker; void *accel_priv; struct vlan_pcpu_stats __percpu *pcpu_stats; DECLARE_BITMAP(mc_filter, MACVLAN_MC_FILTER_SZ); netdev_features_t set_features; enum macvlan_mode mode; u16 flags; unsigned int macaddr_count; u32 bc_queue_len_req; #ifdef CONFIG_NET_POLL_CONTROLLER struct netpoll *netpoll; #endif }; static inline void macvlan_count_rx(const struct macvlan_dev *vlan, unsigned int len, bool success, bool multicast) { if (likely(success)) { struct vlan_pcpu_stats *pcpu_stats; pcpu_stats = get_cpu_ptr(vlan->pcpu_stats); u64_stats_update_begin(&pcpu_stats->syncp); u64_stats_inc(&pcpu_stats->rx_packets); u64_stats_add(&pcpu_stats->rx_bytes, len); if (multicast) u64_stats_inc(&pcpu_stats->rx_multicast); u64_stats_update_end(&pcpu_stats->syncp); put_cpu_ptr(vlan->pcpu_stats); } else { this_cpu_inc(vlan->pcpu_stats->rx_errors); } } extern void macvlan_common_setup(struct net_device *dev); struct rtnl_newlink_params; extern int macvlan_common_newlink(struct net_device *dev, struct rtnl_newlink_params *params, struct netlink_ext_ack *extack); extern void macvlan_dellink(struct net_device *dev, struct list_head *head); extern int macvlan_link_register(struct rtnl_link_ops *ops); #if IS_ENABLED(CONFIG_MACVLAN) static inline struct net_device * macvlan_dev_real_dev(const struct net_device *dev) { struct macvlan_dev *macvlan = netdev_priv(dev); return macvlan->lowerdev; } #else static inline struct net_device * macvlan_dev_real_dev(const struct net_device *dev) { BUG(); return NULL; } #endif static inline void *macvlan_accel_priv(struct net_device *dev) { struct macvlan_dev *macvlan = netdev_priv(dev); return macvlan->accel_priv; } static inline bool macvlan_supports_dest_filter(struct net_device *dev) { struct macvlan_dev *macvlan = netdev_priv(dev); return macvlan->mode == MACVLAN_MODE_PRIVATE || macvlan->mode == MACVLAN_MODE_VEPA || macvlan->mode == MACVLAN_MODE_BRIDGE; } static inline int macvlan_release_l2fw_offload(struct net_device *dev) { struct macvlan_dev *macvlan = netdev_priv(dev); macvlan->accel_priv = NULL; return dev_uc_add(macvlan->lowerdev, dev->dev_addr); } #endif /* _LINUX_IF_MACVLAN_H */ |
| 7 17 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_PVCLOCK_H #define _ASM_X86_PVCLOCK_H #include <asm/clocksource.h> #include <asm/pvclock-abi.h> struct timespec64; /* some helper functions for xen and kvm pv clock sources */ u64 pvclock_clocksource_read(struct pvclock_vcpu_time_info *src); u64 pvclock_clocksource_read_nowd(struct pvclock_vcpu_time_info *src); u8 pvclock_read_flags(struct pvclock_vcpu_time_info *src); void pvclock_set_flags(u8 flags); unsigned long pvclock_tsc_khz(struct pvclock_vcpu_time_info *src); void pvclock_read_wallclock(struct pvclock_wall_clock *wall, struct pvclock_vcpu_time_info *vcpu, struct timespec64 *ts); void pvclock_resume(void); void pvclock_touch_watchdogs(void); static __always_inline unsigned pvclock_read_begin(const struct pvclock_vcpu_time_info *src) { unsigned version = src->version & ~1; /* Make sure that the version is read before the data. */ virt_rmb(); return version; } static __always_inline bool pvclock_read_retry(const struct pvclock_vcpu_time_info *src, unsigned version) { /* Make sure that the version is re-read after the data. */ virt_rmb(); return unlikely(version != src->version); } /* * Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction, * yielding a 64-bit result. */ static __always_inline u64 pvclock_scale_delta(u64 delta, u32 mul_frac, int shift) { u64 product; #ifdef __i386__ u32 tmp1, tmp2; #else ulong tmp; #endif if (shift < 0) delta >>= -shift; else delta <<= shift; #ifdef __i386__ __asm__ ( "mul %5 ; " "mov %4,%%eax ; " "mov %%edx,%4 ; " "mul %5 ; " "xor %5,%5 ; " "add %4,%%eax ; " "adc %5,%%edx ; " : "=A" (product), "=r" (tmp1), "=r" (tmp2) : "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) ); #elif defined(__x86_64__) __asm__ ( "mulq %[mul_frac] ; shrd $32, %[hi], %[lo]" : [lo]"=a"(product), [hi]"=d"(tmp) : "0"(delta), [mul_frac]"rm"((u64)mul_frac)); #else #error implement me! #endif return product; } static __always_inline u64 __pvclock_read_cycles(const struct pvclock_vcpu_time_info *src, u64 tsc) { u64 delta = tsc - src->tsc_timestamp; u64 offset = pvclock_scale_delta(delta, src->tsc_to_system_mul, src->tsc_shift); return src->system_time + offset; } struct pvclock_vsyscall_time_info { struct pvclock_vcpu_time_info pvti; } __attribute__((__aligned__(SMP_CACHE_BYTES))); #define PVTI_SIZE sizeof(struct pvclock_vsyscall_time_info) #ifdef CONFIG_PARAVIRT_CLOCK void pvclock_set_pvti_cpu0_va(struct pvclock_vsyscall_time_info *pvti); struct pvclock_vsyscall_time_info *pvclock_get_pvti_cpu0_va(void); #else static inline struct pvclock_vsyscall_time_info *pvclock_get_pvti_cpu0_va(void) { return NULL; } #endif #endif /* _ASM_X86_PVCLOCK_H */ |
| 6 6 6 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 | /* SPDX-License-Identifier: GPL-2.0-only */ /* * Stream Parser * * Copyright (c) 2016 Tom Herbert <tom@herbertland.com> */ #ifndef __NET_STRPARSER_H_ #define __NET_STRPARSER_H_ #include <linux/skbuff.h> #include <net/sock.h> #define STRP_STATS_ADD(stat, count) ((stat) += (count)) #define STRP_STATS_INCR(stat) ((stat)++) struct strp_stats { unsigned long long msgs; unsigned long long bytes; unsigned int mem_fail; unsigned int need_more_hdr; unsigned int msg_too_big; unsigned int msg_timeouts; unsigned int bad_hdr_len; }; struct strp_aggr_stats { unsigned long long msgs; unsigned long long bytes; unsigned int mem_fail; unsigned int need_more_hdr; unsigned int msg_too_big; unsigned int msg_timeouts; unsigned int bad_hdr_len; unsigned int aborts; unsigned int interrupted; unsigned int unrecov_intr; }; struct strparser; /* Callbacks are called with lock held for the attached socket */ struct strp_callbacks { int (*parse_msg)(struct strparser *strp, struct sk_buff *skb); void (*rcv_msg)(struct strparser *strp, struct sk_buff *skb); int (*read_sock)(struct strparser *strp, read_descriptor_t *desc, sk_read_actor_t recv_actor); int (*read_sock_done)(struct strparser *strp, int err); void (*abort_parser)(struct strparser *strp, int err); void (*lock)(struct strparser *strp); void (*unlock)(struct strparser *strp); }; struct strp_msg { int full_len; int offset; }; struct _strp_msg { /* Internal cb structure. struct strp_msg must be first for passing * to upper layer. */ struct strp_msg strp; int accum_len; }; struct sk_skb_cb { #define SK_SKB_CB_PRIV_LEN 20 unsigned char data[SK_SKB_CB_PRIV_LEN]; /* align strp on cache line boundary within skb->cb[] */ unsigned char pad[4]; struct _strp_msg strp; /* strp users' data follows */ struct tls_msg { u8 control; } tls; /* temp_reg is a temporary register used for bpf_convert_data_end_access * when dst_reg == src_reg. */ u64 temp_reg; }; static inline struct strp_msg *strp_msg(struct sk_buff *skb) { return (struct strp_msg *)((void *)skb->cb + offsetof(struct sk_skb_cb, strp)); } /* Structure for an attached lower socket */ struct strparser { struct sock *sk; u32 stopped : 1; u32 paused : 1; u32 aborted : 1; u32 interrupted : 1; u32 unrecov_intr : 1; struct sk_buff **skb_nextp; struct sk_buff *skb_head; unsigned int need_bytes; struct delayed_work msg_timer_work; struct work_struct work; struct strp_stats stats; struct strp_callbacks cb; }; /* Must be called with lock held for attached socket */ static inline void strp_pause(struct strparser *strp) { strp->paused = 1; } /* May be called without holding lock for attached socket */ void strp_unpause(struct strparser *strp); static inline void save_strp_stats(struct strparser *strp, struct strp_aggr_stats *agg_stats) { /* Save psock statistics in the mux when psock is being unattached. */ #define SAVE_PSOCK_STATS(_stat) (agg_stats->_stat += \ strp->stats._stat) SAVE_PSOCK_STATS(msgs); SAVE_PSOCK_STATS(bytes); SAVE_PSOCK_STATS(mem_fail); SAVE_PSOCK_STATS(need_more_hdr); SAVE_PSOCK_STATS(msg_too_big); SAVE_PSOCK_STATS(msg_timeouts); SAVE_PSOCK_STATS(bad_hdr_len); #undef SAVE_PSOCK_STATS if (strp->aborted) agg_stats->aborts++; if (strp->interrupted) agg_stats->interrupted++; if (strp->unrecov_intr) agg_stats->unrecov_intr++; } static inline void aggregate_strp_stats(struct strp_aggr_stats *stats, struct strp_aggr_stats *agg_stats) { #define SAVE_PSOCK_STATS(_stat) (agg_stats->_stat += stats->_stat) SAVE_PSOCK_STATS(msgs); SAVE_PSOCK_STATS(bytes); SAVE_PSOCK_STATS(mem_fail); SAVE_PSOCK_STATS(need_more_hdr); SAVE_PSOCK_STATS(msg_too_big); SAVE_PSOCK_STATS(msg_timeouts); SAVE_PSOCK_STATS(bad_hdr_len); SAVE_PSOCK_STATS(aborts); SAVE_PSOCK_STATS(interrupted); SAVE_PSOCK_STATS(unrecov_intr); #undef SAVE_PSOCK_STATS } void strp_done(struct strparser *strp); void strp_stop(struct strparser *strp); void strp_check_rcv(struct strparser *strp); int strp_init(struct strparser *strp, struct sock *sk, const struct strp_callbacks *cb); void strp_data_ready(struct strparser *strp); int strp_process(struct strparser *strp, struct sk_buff *orig_skb, unsigned int orig_offset, size_t orig_len, size_t max_msg_size, long timeo); #endif /* __NET_STRPARSER_H_ */ |
| 2 2 2 2 2 2 2 8 8 2 2 2 2 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 | // SPDX-License-Identifier: GPL-2.0-only /* * stackglue.c * * Code which implements an OCFS2 specific interface to underlying * cluster stacks. * * Copyright (C) 2007, 2009 Oracle. All rights reserved. */ #include <linux/list.h> #include <linux/spinlock.h> #include <linux/string.h> #include <linux/module.h> #include <linux/slab.h> #include <linux/kmod.h> #include <linux/fs.h> #include <linux/kobject.h> #include <linux/sysfs.h> #include <linux/sysctl.h> #include "ocfs2_fs.h" #include "stackglue.h" #define OCFS2_STACK_PLUGIN_O2CB "o2cb" #define OCFS2_STACK_PLUGIN_USER "user" #define OCFS2_MAX_HB_CTL_PATH 256 static struct ocfs2_protocol_version locking_max_version; static DEFINE_SPINLOCK(ocfs2_stack_lock); static LIST_HEAD(ocfs2_stack_list); static char cluster_stack_name[OCFS2_STACK_LABEL_LEN + 1]; static char ocfs2_hb_ctl_path[OCFS2_MAX_HB_CTL_PATH] = "/sbin/ocfs2_hb_ctl"; /* * The stack currently in use. If not null, active_stack->sp_count > 0, * the module is pinned, and the locking protocol cannot be changed. */ static struct ocfs2_stack_plugin *active_stack; static struct ocfs2_stack_plugin *ocfs2_stack_lookup(const char *name) { struct ocfs2_stack_plugin *p; assert_spin_locked(&ocfs2_stack_lock); list_for_each_entry(p, &ocfs2_stack_list, sp_list) { if (!strcmp(p->sp_name, name)) return p; } return NULL; } static int ocfs2_stack_driver_request(const char *stack_name, const char *plugin_name) { int rc; struct ocfs2_stack_plugin *p; spin_lock(&ocfs2_stack_lock); /* * If the stack passed by the filesystem isn't the selected one, * we can't continue. */ if (strcmp(stack_name, cluster_stack_name)) { rc = -EBUSY; goto out; } if (active_stack) { /* * If the active stack isn't the one we want, it cannot * be selected right now. */ if (!strcmp(active_stack->sp_name, plugin_name)) rc = 0; else rc = -EBUSY; goto out; } p = ocfs2_stack_lookup(plugin_name); if (!p || !try_module_get(p->sp_owner)) { rc = -ENOENT; goto out; } active_stack = p; rc = 0; out: /* If we found it, pin it */ if (!rc) active_stack->sp_count++; spin_unlock(&ocfs2_stack_lock); return rc; } /* * This function looks up the appropriate stack and makes it active. If * there is no stack, it tries to load it. It will fail if the stack still * cannot be found. It will also fail if a different stack is in use. */ static int ocfs2_stack_driver_get(const char *stack_name) { int rc; char *plugin_name = OCFS2_STACK_PLUGIN_O2CB; /* * Classic stack does not pass in a stack name. This is * compatible with older tools as well. */ if (!stack_name || !*stack_name) stack_name = OCFS2_STACK_PLUGIN_O2CB; if (strlen(stack_name) != OCFS2_STACK_LABEL_LEN) { printk(KERN_ERR "ocfs2 passed an invalid cluster stack label: \"%s\"\n", stack_name); return -EINVAL; } /* Anything that isn't the classic stack is a user stack */ if (strcmp(stack_name, OCFS2_STACK_PLUGIN_O2CB)) plugin_name = OCFS2_STACK_PLUGIN_USER; rc = ocfs2_stack_driver_request(stack_name, plugin_name); if (rc == -ENOENT) { request_module("ocfs2_stack_%s", plugin_name); rc = ocfs2_stack_driver_request(stack_name, plugin_name); } if (rc == -ENOENT) { printk(KERN_ERR "ocfs2: Cluster stack driver \"%s\" cannot be found\n", plugin_name); } else if (rc == -EBUSY) { printk(KERN_ERR "ocfs2: A different cluster stack is in use\n"); } return rc; } static void ocfs2_stack_driver_put(void) { spin_lock(&ocfs2_stack_lock); BUG_ON(active_stack == NULL); BUG_ON(active_stack->sp_count == 0); active_stack->sp_count--; if (!active_stack->sp_count) { module_put(active_stack->sp_owner); active_stack = NULL; } spin_unlock(&ocfs2_stack_lock); } int ocfs2_stack_glue_register(struct ocfs2_stack_plugin *plugin) { int rc; spin_lock(&ocfs2_stack_lock); if (!ocfs2_stack_lookup(plugin->sp_name)) { plugin->sp_count = 0; plugin->sp_max_proto = locking_max_version; list_add(&plugin->sp_list, &ocfs2_stack_list); printk(KERN_INFO "ocfs2: Registered cluster interface %s\n", plugin->sp_name); rc = 0; } else { printk(KERN_ERR "ocfs2: Stack \"%s\" already registered\n", plugin->sp_name); rc = -EEXIST; } spin_unlock(&ocfs2_stack_lock); return rc; } EXPORT_SYMBOL_GPL(ocfs2_stack_glue_register); void ocfs2_stack_glue_unregister(struct ocfs2_stack_plugin *plugin) { struct ocfs2_stack_plugin *p; spin_lock(&ocfs2_stack_lock); p = ocfs2_stack_lookup(plugin->sp_name); if (p) { BUG_ON(p != plugin); BUG_ON(plugin == active_stack); BUG_ON(plugin->sp_count != 0); list_del_init(&plugin->sp_list); printk(KERN_INFO "ocfs2: Unregistered cluster interface %s\n", plugin->sp_name); } else { printk(KERN_ERR "Stack \"%s\" is not registered\n", plugin->sp_name); } spin_unlock(&ocfs2_stack_lock); } EXPORT_SYMBOL_GPL(ocfs2_stack_glue_unregister); void ocfs2_stack_glue_set_max_proto_version(struct ocfs2_protocol_version *max_proto) { struct ocfs2_stack_plugin *p; spin_lock(&ocfs2_stack_lock); if (memcmp(max_proto, &locking_max_version, sizeof(struct ocfs2_protocol_version))) { BUG_ON(locking_max_version.pv_major != 0); locking_max_version = *max_proto; list_for_each_entry(p, &ocfs2_stack_list, sp_list) { p->sp_max_proto = locking_max_version; } } spin_unlock(&ocfs2_stack_lock); } EXPORT_SYMBOL_GPL(ocfs2_stack_glue_set_max_proto_version); /* * The ocfs2_dlm_lock() and ocfs2_dlm_unlock() functions take no argument * for the ast and bast functions. They will pass the lksb to the ast * and bast. The caller can wrap the lksb with their own structure to * get more information. */ int ocfs2_dlm_lock(struct ocfs2_cluster_connection *conn, int mode, struct ocfs2_dlm_lksb *lksb, u32 flags, void *name, unsigned int namelen) { if (!lksb->lksb_conn) lksb->lksb_conn = conn; else BUG_ON(lksb->lksb_conn != conn); return active_stack->sp_ops->dlm_lock(conn, mode, lksb, flags, name, namelen); } EXPORT_SYMBOL_GPL(ocfs2_dlm_lock); int ocfs2_dlm_unlock(struct ocfs2_cluster_connection *conn, struct ocfs2_dlm_lksb *lksb, u32 flags) { BUG_ON(lksb->lksb_conn == NULL); return active_stack->sp_ops->dlm_unlock(conn, lksb, flags); } EXPORT_SYMBOL_GPL(ocfs2_dlm_unlock); int ocfs2_dlm_lock_status(struct ocfs2_dlm_lksb *lksb) { return active_stack->sp_ops->lock_status(lksb); } EXPORT_SYMBOL_GPL(ocfs2_dlm_lock_status); int ocfs2_dlm_lvb_valid(struct ocfs2_dlm_lksb *lksb) { return active_stack->sp_ops->lvb_valid(lksb); } EXPORT_SYMBOL_GPL(ocfs2_dlm_lvb_valid); void *ocfs2_dlm_lvb(struct ocfs2_dlm_lksb *lksb) { return active_stack->sp_ops->lock_lvb(lksb); } EXPORT_SYMBOL_GPL(ocfs2_dlm_lvb); void ocfs2_dlm_dump_lksb(struct ocfs2_dlm_lksb *lksb) { active_stack->sp_ops->dump_lksb(lksb); } EXPORT_SYMBOL_GPL(ocfs2_dlm_dump_lksb); int ocfs2_stack_supports_plocks(void) { return active_stack && active_stack->sp_ops->plock; } EXPORT_SYMBOL_GPL(ocfs2_stack_supports_plocks); /* * ocfs2_plock() can only be safely called if * ocfs2_stack_supports_plocks() returned true */ int ocfs2_plock(struct ocfs2_cluster_connection *conn, u64 ino, struct file *file, int cmd, struct file_lock *fl) { WARN_ON_ONCE(active_stack->sp_ops->plock == NULL); if (active_stack->sp_ops->plock) return active_stack->sp_ops->plock(conn, ino, file, cmd, fl); return -EOPNOTSUPP; } EXPORT_SYMBOL_GPL(ocfs2_plock); int ocfs2_cluster_connect(const char *stack_name, const char *cluster_name, int cluster_name_len, const char *group, int grouplen, struct ocfs2_locking_protocol *lproto, void (*recovery_handler)(int node_num, void *recovery_data), void *recovery_data, struct ocfs2_cluster_connection **conn) { int rc = 0; struct ocfs2_cluster_connection *new_conn; BUG_ON(group == NULL); BUG_ON(conn == NULL); BUG_ON(recovery_handler == NULL); if (grouplen > GROUP_NAME_MAX) { rc = -EINVAL; goto out; } if (memcmp(&lproto->lp_max_version, &locking_max_version, sizeof(struct ocfs2_protocol_version))) { rc = -EINVAL; goto out; } new_conn = kzalloc(sizeof(struct ocfs2_cluster_connection), GFP_KERNEL); if (!new_conn) { rc = -ENOMEM; goto out; } strscpy(new_conn->cc_name, group, GROUP_NAME_MAX + 1); new_conn->cc_namelen = grouplen; if (cluster_name_len) strscpy(new_conn->cc_cluster_name, cluster_name, CLUSTER_NAME_MAX + 1); new_conn->cc_cluster_name_len = cluster_name_len; new_conn->cc_recovery_handler = recovery_handler; new_conn->cc_recovery_data = recovery_data; new_conn->cc_proto = lproto; /* Start the new connection at our maximum compatibility level */ new_conn->cc_version = lproto->lp_max_version; /* This will pin the stack driver if successful */ rc = ocfs2_stack_driver_get(stack_name); if (rc) goto out_free; rc = active_stack->sp_ops->connect(new_conn); if (rc) { ocfs2_stack_driver_put(); goto out_free; } *conn = new_conn; out_free: if (rc) kfree(new_conn); out: return rc; } EXPORT_SYMBOL_GPL(ocfs2_cluster_connect); /* The caller will ensure all nodes have the same cluster stack */ int ocfs2_cluster_connect_agnostic(const char *group, int grouplen, struct ocfs2_locking_protocol *lproto, void (*recovery_handler)(int node_num, void *recovery_data), void *recovery_data, struct ocfs2_cluster_connection **conn) { char *stack_name = NULL; if (cluster_stack_name[0]) stack_name = cluster_stack_name; return ocfs2_cluster_connect(stack_name, NULL, 0, group, grouplen, lproto, recovery_handler, recovery_data, conn); } EXPORT_SYMBOL_GPL(ocfs2_cluster_connect_agnostic); /* If hangup_pending is 0, the stack driver will be dropped */ int ocfs2_cluster_disconnect(struct ocfs2_cluster_connection *conn, int hangup_pending) { int ret; BUG_ON(conn == NULL); ret = active_stack->sp_ops->disconnect(conn); /* XXX Should we free it anyway? */ if (!ret) { kfree(conn); if (!hangup_pending) ocfs2_stack_driver_put(); } return ret; } EXPORT_SYMBOL_GPL(ocfs2_cluster_disconnect); /* * Leave the group for this filesystem. This is executed by a userspace * program (stored in ocfs2_hb_ctl_path). */ static void ocfs2_leave_group(const char *group) { int ret; char *argv[5], *envp[3]; argv[0] = ocfs2_hb_ctl_path; argv[1] = "-K"; argv[2] = "-u"; argv[3] = (char *)group; argv[4] = NULL; /* minimal command environment taken from cpu_run_sbin_hotplug */ envp[0] = "HOME=/"; envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin"; envp[2] = NULL; ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC); if (ret < 0) { printk(KERN_ERR "ocfs2: Error %d running user helper " "\"%s %s %s %s\"\n", ret, argv[0], argv[1], argv[2], argv[3]); } } /* * Hangup is a required post-umount. ocfs2-tools software expects the * filesystem to call "ocfs2_hb_ctl" during unmount. This happens * regardless of whether the DLM got started, so we can't do it * in ocfs2_cluster_disconnect(). The ocfs2_leave_group() function does * the actual work. */ void ocfs2_cluster_hangup(const char *group, int grouplen) { BUG_ON(group == NULL); BUG_ON(group[grouplen] != '\0'); ocfs2_leave_group(group); /* cluster_disconnect() was called with hangup_pending==1 */ ocfs2_stack_driver_put(); } EXPORT_SYMBOL_GPL(ocfs2_cluster_hangup); int ocfs2_cluster_this_node(struct ocfs2_cluster_connection *conn, unsigned int *node) { return active_stack->sp_ops->this_node(conn, node); } EXPORT_SYMBOL_GPL(ocfs2_cluster_this_node); /* * Sysfs bits */ static ssize_t ocfs2_max_locking_protocol_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { ssize_t ret = 0; spin_lock(&ocfs2_stack_lock); if (locking_max_version.pv_major) ret = snprintf(buf, PAGE_SIZE, "%u.%u\n", locking_max_version.pv_major, locking_max_version.pv_minor); spin_unlock(&ocfs2_stack_lock); return ret; } static struct kobj_attribute ocfs2_attr_max_locking_protocol = __ATTR(max_locking_protocol, S_IRUGO, ocfs2_max_locking_protocol_show, NULL); static ssize_t ocfs2_loaded_cluster_plugins_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { ssize_t ret = 0, total = 0, remain = PAGE_SIZE; struct ocfs2_stack_plugin *p; spin_lock(&ocfs2_stack_lock); list_for_each_entry(p, &ocfs2_stack_list, sp_list) { ret = snprintf(buf, remain, "%s\n", p->sp_name); if (ret >= remain) { /* snprintf() didn't fit */ total = -E2BIG; break; } total += ret; remain -= ret; } spin_unlock(&ocfs2_stack_lock); return total; } static struct kobj_attribute ocfs2_attr_loaded_cluster_plugins = __ATTR(loaded_cluster_plugins, S_IRUGO, ocfs2_loaded_cluster_plugins_show, NULL); static ssize_t ocfs2_active_cluster_plugin_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { ssize_t ret = 0; spin_lock(&ocfs2_stack_lock); if (active_stack) { ret = snprintf(buf, PAGE_SIZE, "%s\n", active_stack->sp_name); if (ret >= PAGE_SIZE) ret = -E2BIG; } spin_unlock(&ocfs2_stack_lock); return ret; } static struct kobj_attribute ocfs2_attr_active_cluster_plugin = __ATTR(active_cluster_plugin, S_IRUGO, ocfs2_active_cluster_plugin_show, NULL); static ssize_t ocfs2_cluster_stack_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { ssize_t ret; spin_lock(&ocfs2_stack_lock); ret = snprintf(buf, PAGE_SIZE, "%s\n", cluster_stack_name); spin_unlock(&ocfs2_stack_lock); return ret; } static ssize_t ocfs2_cluster_stack_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { size_t len = count; ssize_t ret; if (len == 0) return len; if (buf[len - 1] == '\n') len--; if ((len != OCFS2_STACK_LABEL_LEN) || (strnlen(buf, len) != len)) return -EINVAL; spin_lock(&ocfs2_stack_lock); if (active_stack) { if (!strncmp(buf, cluster_stack_name, len)) ret = count; else ret = -EBUSY; } else { memcpy(cluster_stack_name, buf, len); ret = count; } spin_unlock(&ocfs2_stack_lock); return ret; } static struct kobj_attribute ocfs2_attr_cluster_stack = __ATTR(cluster_stack, S_IRUGO | S_IWUSR, ocfs2_cluster_stack_show, ocfs2_cluster_stack_store); static ssize_t ocfs2_dlm_recover_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return snprintf(buf, PAGE_SIZE, "1\n"); } static struct kobj_attribute ocfs2_attr_dlm_recover_support = __ATTR(dlm_recover_callback_support, S_IRUGO, ocfs2_dlm_recover_show, NULL); static struct attribute *ocfs2_attrs[] = { &ocfs2_attr_max_locking_protocol.attr, &ocfs2_attr_loaded_cluster_plugins.attr, &ocfs2_attr_active_cluster_plugin.attr, &ocfs2_attr_cluster_stack.attr, &ocfs2_attr_dlm_recover_support.attr, NULL, }; static const struct attribute_group ocfs2_attr_group = { .attrs = ocfs2_attrs, }; struct kset *ocfs2_kset; EXPORT_SYMBOL_GPL(ocfs2_kset); static void ocfs2_sysfs_exit(void) { kset_unregister(ocfs2_kset); } static int ocfs2_sysfs_init(void) { int ret; ocfs2_kset = kset_create_and_add("ocfs2", NULL, fs_kobj); if (!ocfs2_kset) return -ENOMEM; ret = sysfs_create_group(&ocfs2_kset->kobj, &ocfs2_attr_group); if (ret) goto error; return 0; error: kset_unregister(ocfs2_kset); return ret; } /* * Sysctl bits * * The sysctl lives at /proc/sys/fs/ocfs2/nm/hb_ctl_path. The 'nm' doesn't * make as much sense in a multiple cluster stack world, but it's safer * and easier to preserve the name. */ static const struct ctl_table ocfs2_nm_table[] = { { .procname = "hb_ctl_path", .data = ocfs2_hb_ctl_path, .maxlen = OCFS2_MAX_HB_CTL_PATH, .mode = 0644, .proc_handler = proc_dostring, }, }; static struct ctl_table_header *ocfs2_table_header; /* * Initialization */ static int __init ocfs2_stack_glue_init(void) { int ret; strscpy(cluster_stack_name, OCFS2_STACK_PLUGIN_O2CB); ocfs2_table_header = register_sysctl("fs/ocfs2/nm", ocfs2_nm_table); if (!ocfs2_table_header) { printk(KERN_ERR "ocfs2 stack glue: unable to register sysctl\n"); return -ENOMEM; /* or something. */ } ret = ocfs2_sysfs_init(); if (ret) unregister_sysctl_table(ocfs2_table_header); return ret; } static void __exit ocfs2_stack_glue_exit(void) { memset(&locking_max_version, 0, sizeof(struct ocfs2_protocol_version)); ocfs2_sysfs_exit(); unregister_sysctl_table(ocfs2_table_header); } MODULE_AUTHOR("Oracle"); MODULE_DESCRIPTION("ocfs2 cluster stack glue layer"); MODULE_LICENSE("GPL"); module_init(ocfs2_stack_glue_init); module_exit(ocfs2_stack_glue_exit); |
| 45 19 46 1 3 12 2 46 71 116 12 45 60 105 122 117 63 58 4 117 105 1 100 100 4 4 4 24 11 13 4 4 19 10 2 7 12 12 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 | // SPDX-License-Identifier: GPL-2.0-or-later /* Local endpoint object management * * Copyright (C) 2016 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/net.h> #include <linux/skbuff.h> #include <linux/slab.h> #include <linux/udp.h> #include <linux/ip.h> #include <linux/hashtable.h> #include <net/sock.h> #include <net/udp.h> #include <net/udp_tunnel.h> #include <net/af_rxrpc.h> #include "ar-internal.h" static void rxrpc_local_rcu(struct rcu_head *); /* * Handle an ICMP/ICMP6 error turning up at the tunnel. Push it through the * usual mechanism so that it gets parsed and presented through the UDP * socket's error_report(). */ static void rxrpc_encap_err_rcv(struct sock *sk, struct sk_buff *skb, int err, __be16 port, u32 info, u8 *payload) { if (ip_hdr(skb)->version == IPVERSION) return ip_icmp_error(sk, skb, err, port, info, payload); if (IS_ENABLED(CONFIG_AF_RXRPC_IPV6)) return ipv6_icmp_error(sk, skb, err, port, info, payload); } /* * Set or clear the Don't Fragment flag on a socket. */ void rxrpc_local_dont_fragment(const struct rxrpc_local *local, bool set) { if (set) ip_sock_set_mtu_discover(local->socket->sk, IP_PMTUDISC_DO); else ip_sock_set_mtu_discover(local->socket->sk, IP_PMTUDISC_DONT); } /* * Compare a local to an address. Return -ve, 0 or +ve to indicate less than, * same or greater than. * * We explicitly don't compare the RxRPC service ID as we want to reject * conflicting uses by differing services. Further, we don't want to share * addresses with different options (IPv6), so we don't compare those bits * either. */ static long rxrpc_local_cmp_key(const struct rxrpc_local *local, const struct sockaddr_rxrpc *srx) { long diff; diff = ((local->srx.transport_type - srx->transport_type) ?: (local->srx.transport_len - srx->transport_len) ?: (local->srx.transport.family - srx->transport.family)); if (diff != 0) return diff; switch (srx->transport.family) { case AF_INET: /* If the choice of UDP port is left up to the transport, then * the endpoint record doesn't match. */ return ((u16 __force)local->srx.transport.sin.sin_port - (u16 __force)srx->transport.sin.sin_port) ?: memcmp(&local->srx.transport.sin.sin_addr, &srx->transport.sin.sin_addr, sizeof(struct in_addr)); #ifdef CONFIG_AF_RXRPC_IPV6 case AF_INET6: /* If the choice of UDP6 port is left up to the transport, then * the endpoint record doesn't match. */ return ((u16 __force)local->srx.transport.sin6.sin6_port - (u16 __force)srx->transport.sin6.sin6_port) ?: memcmp(&local->srx.transport.sin6.sin6_addr, &srx->transport.sin6.sin6_addr, sizeof(struct in6_addr)); #endif default: BUG(); } } static void rxrpc_client_conn_reap_timeout(struct timer_list *timer) { struct rxrpc_local *local = container_of(timer, struct rxrpc_local, client_conn_reap_timer); if (!local->kill_all_client_conns && test_and_set_bit(RXRPC_CLIENT_CONN_REAP_TIMER, &local->client_conn_flags)) rxrpc_wake_up_io_thread(local); } /* * Allocate a new local endpoint. */ static struct rxrpc_local *rxrpc_alloc_local(struct net *net, const struct sockaddr_rxrpc *srx) { struct rxrpc_local *local; u32 tmp; local = kzalloc(sizeof(struct rxrpc_local), GFP_KERNEL); if (local) { refcount_set(&local->ref, 1); atomic_set(&local->active_users, 1); local->net = net; local->rxnet = rxrpc_net(net); INIT_HLIST_NODE(&local->link); init_completion(&local->io_thread_ready); #ifdef CONFIG_AF_RXRPC_INJECT_RX_DELAY skb_queue_head_init(&local->rx_delay_queue); #endif skb_queue_head_init(&local->rx_queue); INIT_LIST_HEAD(&local->conn_attend_q); INIT_LIST_HEAD(&local->call_attend_q); local->client_bundles = RB_ROOT; spin_lock_init(&local->client_bundles_lock); local->kill_all_client_conns = false; INIT_LIST_HEAD(&local->idle_client_conns); timer_setup(&local->client_conn_reap_timer, rxrpc_client_conn_reap_timeout, 0); spin_lock_init(&local->lock); rwlock_init(&local->services_lock); local->debug_id = atomic_inc_return(&rxrpc_debug_id); memcpy(&local->srx, srx, sizeof(*srx)); local->srx.srx_service = 0; idr_init(&local->conn_ids); get_random_bytes(&tmp, sizeof(tmp)); tmp &= 0x3fffffff; if (tmp == 0) tmp = 1; idr_set_cursor(&local->conn_ids, tmp); INIT_LIST_HEAD(&local->new_client_calls); spin_lock_init(&local->client_call_lock); trace_rxrpc_local(local->debug_id, rxrpc_local_new, 1, 1); } _leave(" = %p", local); return local; } /* * create the local socket * - must be called with rxrpc_local_mutex locked */ static int rxrpc_open_socket(struct rxrpc_local *local, struct net *net) { struct udp_tunnel_sock_cfg tuncfg = {NULL}; struct sockaddr_rxrpc *srx = &local->srx; struct udp_port_cfg udp_conf = {0}; struct task_struct *io_thread; struct sock *usk; int ret; _enter("%p{%d,%d}", local, srx->transport_type, srx->transport.family); udp_conf.family = srx->transport.family; udp_conf.use_udp_checksums = true; if (udp_conf.family == AF_INET) { udp_conf.local_ip = srx->transport.sin.sin_addr; udp_conf.local_udp_port = srx->transport.sin.sin_port; #if IS_ENABLED(CONFIG_AF_RXRPC_IPV6) } else { udp_conf.local_ip6 = srx->transport.sin6.sin6_addr; udp_conf.local_udp_port = srx->transport.sin6.sin6_port; udp_conf.use_udp6_tx_checksums = true; udp_conf.use_udp6_rx_checksums = true; #endif } ret = udp_sock_create(net, &udp_conf, &local->socket); if (ret < 0) { _leave(" = %d [socket]", ret); return ret; } tuncfg.encap_type = UDP_ENCAP_RXRPC; tuncfg.encap_rcv = rxrpc_encap_rcv; tuncfg.encap_err_rcv = rxrpc_encap_err_rcv; tuncfg.sk_user_data = local; setup_udp_tunnel_sock(net, local->socket, &tuncfg); /* set the socket up */ usk = local->socket->sk; usk->sk_error_report = rxrpc_error_report; switch (srx->transport.family) { case AF_INET6: /* we want to receive ICMPv6 errors */ ip6_sock_set_recverr(usk); /* Fall through and set IPv4 options too otherwise we don't get * errors from IPv4 packets sent through the IPv6 socket. */ fallthrough; case AF_INET: /* we want to receive ICMP errors */ ip_sock_set_recverr(usk); /* we want to set the don't fragment bit */ rxrpc_local_dont_fragment(local, true); break; default: BUG(); } io_thread = kthread_run(rxrpc_io_thread, local, "krxrpcio/%u", ntohs(udp_conf.local_udp_port)); if (IS_ERR(io_thread)) { ret = PTR_ERR(io_thread); goto error_sock; } wait_for_completion(&local->io_thread_ready); WRITE_ONCE(local->io_thread, io_thread); _leave(" = 0"); return 0; error_sock: kernel_sock_shutdown(local->socket, SHUT_RDWR); local->socket->sk->sk_user_data = NULL; sock_release(local->socket); local->socket = NULL; return ret; } /* * Look up or create a new local endpoint using the specified local address. */ struct rxrpc_local *rxrpc_lookup_local(struct net *net, const struct sockaddr_rxrpc *srx) { struct rxrpc_local *local; struct rxrpc_net *rxnet = rxrpc_net(net); struct hlist_node *cursor; long diff; int ret; _enter("{%d,%d,%pISp}", srx->transport_type, srx->transport.family, &srx->transport); mutex_lock(&rxnet->local_mutex); hlist_for_each(cursor, &rxnet->local_endpoints) { local = hlist_entry(cursor, struct rxrpc_local, link); diff = rxrpc_local_cmp_key(local, srx); if (diff != 0) continue; /* Services aren't allowed to share transport sockets, so * reject that here. It is possible that the object is dying - * but it may also still have the local transport address that * we want bound. */ if (srx->srx_service) { local = NULL; goto addr_in_use; } /* Found a match. We want to replace a dying object. * Attempting to bind the transport socket may still fail if * we're attempting to use a local address that the dying * object is still using. */ if (!rxrpc_use_local(local, rxrpc_local_use_lookup)) break; goto found; } local = rxrpc_alloc_local(net, srx); if (!local) goto nomem; ret = rxrpc_open_socket(local, net); if (ret < 0) goto sock_error; if (cursor) { hlist_replace_rcu(cursor, &local->link); cursor->pprev = NULL; } else { hlist_add_head_rcu(&local->link, &rxnet->local_endpoints); } found: mutex_unlock(&rxnet->local_mutex); _leave(" = %p", local); return local; nomem: ret = -ENOMEM; sock_error: mutex_unlock(&rxnet->local_mutex); if (local) call_rcu(&local->rcu, rxrpc_local_rcu); _leave(" = %d", ret); return ERR_PTR(ret); addr_in_use: mutex_unlock(&rxnet->local_mutex); _leave(" = -EADDRINUSE"); return ERR_PTR(-EADDRINUSE); } /* * Get a ref on a local endpoint. */ struct rxrpc_local *rxrpc_get_local(struct rxrpc_local *local, enum rxrpc_local_trace why) { int r, u; u = atomic_read(&local->active_users); __refcount_inc(&local->ref, &r); trace_rxrpc_local(local->debug_id, why, r + 1, u); return local; } /* * Get a ref on a local endpoint unless its usage has already reached 0. */ struct rxrpc_local *rxrpc_get_local_maybe(struct rxrpc_local *local, enum rxrpc_local_trace why) { int r, u; if (local && __refcount_inc_not_zero(&local->ref, &r)) { u = atomic_read(&local->active_users); trace_rxrpc_local(local->debug_id, why, r + 1, u); return local; } return NULL; } /* * Drop a ref on a local endpoint. */ void rxrpc_put_local(struct rxrpc_local *local, enum rxrpc_local_trace why) { unsigned int debug_id; bool dead; int r, u; if (local) { debug_id = local->debug_id; u = atomic_read(&local->active_users); dead = __refcount_dec_and_test(&local->ref, &r); trace_rxrpc_local(debug_id, why, r, u); if (dead) call_rcu(&local->rcu, rxrpc_local_rcu); } } /* * Start using a local endpoint. */ struct rxrpc_local *rxrpc_use_local(struct rxrpc_local *local, enum rxrpc_local_trace why) { local = rxrpc_get_local_maybe(local, rxrpc_local_get_for_use); if (!local) return NULL; if (!__rxrpc_use_local(local, why)) { rxrpc_put_local(local, rxrpc_local_put_for_use); return NULL; } return local; } /* * Cease using a local endpoint. Once the number of active users reaches 0, we * start the closure of the transport in the I/O thread.. */ void rxrpc_unuse_local(struct rxrpc_local *local, enum rxrpc_local_trace why) { unsigned int debug_id; int r, u; if (local) { debug_id = local->debug_id; r = refcount_read(&local->ref); u = atomic_dec_return(&local->active_users); trace_rxrpc_local(debug_id, why, r, u); if (u == 0) kthread_stop(local->io_thread); } } /* * Destroy a local endpoint's socket and then hand the record to RCU to dispose * of. * * Closing the socket cannot be done from bottom half context or RCU callback * context because it might sleep. */ void rxrpc_destroy_local(struct rxrpc_local *local) { struct socket *socket = local->socket; struct rxrpc_net *rxnet = local->rxnet; _enter("%d", local->debug_id); local->dead = true; mutex_lock(&rxnet->local_mutex); hlist_del_init_rcu(&local->link); mutex_unlock(&rxnet->local_mutex); rxrpc_clean_up_local_conns(local); rxrpc_service_connection_reaper(&rxnet->service_conn_reaper); ASSERT(!local->service); if (socket) { local->socket = NULL; kernel_sock_shutdown(socket, SHUT_RDWR); socket->sk->sk_user_data = NULL; sock_release(socket); } /* At this point, there should be no more packets coming in to the * local endpoint. */ #ifdef CONFIG_AF_RXRPC_INJECT_RX_DELAY rxrpc_purge_queue(&local->rx_delay_queue); #endif rxrpc_purge_queue(&local->rx_queue); rxrpc_purge_client_connections(local); page_frag_cache_drain(&local->tx_alloc); } /* * Destroy a local endpoint after the RCU grace period expires. */ static void rxrpc_local_rcu(struct rcu_head *rcu) { struct rxrpc_local *local = container_of(rcu, struct rxrpc_local, rcu); rxrpc_see_local(local, rxrpc_local_free); kfree(local); } /* * Verify the local endpoint list is empty by this point. */ void rxrpc_destroy_all_locals(struct rxrpc_net *rxnet) { struct rxrpc_local *local; _enter(""); flush_workqueue(rxrpc_workqueue); if (!hlist_empty(&rxnet->local_endpoints)) { mutex_lock(&rxnet->local_mutex); hlist_for_each_entry(local, &rxnet->local_endpoints, link) { pr_err("AF_RXRPC: Leaked local %p {%d}\n", local, refcount_read(&local->ref)); } mutex_unlock(&rxnet->local_mutex); BUG(); } } |
| 2 1 7 6 1 7 101 111 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 | /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * Symmetric key ciphers. * * Copyright (c) 2007-2015 Herbert Xu <herbert@gondor.apana.org.au> */ #ifndef _CRYPTO_SKCIPHER_H #define _CRYPTO_SKCIPHER_H #include <linux/atomic.h> #include <linux/container_of.h> #include <linux/crypto.h> #include <linux/slab.h> #include <linux/string.h> #include <linux/types.h> /* Set this bit if the lskcipher operation is a continuation. */ #define CRYPTO_LSKCIPHER_FLAG_CONT 0x00000001 /* Set this bit if the lskcipher operation is final. */ #define CRYPTO_LSKCIPHER_FLAG_FINAL 0x00000002 /* The bit CRYPTO_TFM_REQ_MAY_SLEEP can also be set if needed. */ /* Set this bit if the skcipher operation is a continuation. */ #define CRYPTO_SKCIPHER_REQ_CONT 0x00000001 /* Set this bit if the skcipher operation is not final. */ #define CRYPTO_SKCIPHER_REQ_NOTFINAL 0x00000002 struct scatterlist; /** * struct skcipher_request - Symmetric key cipher request * @cryptlen: Number of bytes to encrypt or decrypt * @iv: Initialisation Vector * @src: Source SG list * @dst: Destination SG list * @base: Underlying async request * @__ctx: Start of private context data */ struct skcipher_request { unsigned int cryptlen; u8 *iv; struct scatterlist *src; struct scatterlist *dst; struct crypto_async_request base; void *__ctx[] CRYPTO_MINALIGN_ATTR; }; struct crypto_skcipher { unsigned int reqsize; struct crypto_tfm base; }; struct crypto_sync_skcipher { struct crypto_skcipher base; }; struct crypto_lskcipher { struct crypto_tfm base; }; /* * struct skcipher_alg_common - common properties of skcipher_alg * @min_keysize: Minimum key size supported by the transformation. This is the * smallest key length supported by this transformation algorithm. * This must be set to one of the pre-defined values as this is * not hardware specific. Possible values for this field can be * found via git grep "_MIN_KEY_SIZE" include/crypto/ * @max_keysize: Maximum key size supported by the transformation. This is the * largest key length supported by this transformation algorithm. * This must be set to one of the pre-defined values as this is * not hardware specific. Possible values for this field can be * found via git grep "_MAX_KEY_SIZE" include/crypto/ * @ivsize: IV size applicable for transformation. The consumer must provide an * IV of exactly that size to perform the encrypt or decrypt operation. * @chunksize: Equal to the block size except for stream ciphers such as * CTR where it is set to the underlying block size. * @statesize: Size of the internal state for the algorithm. * @base: Definition of a generic crypto algorithm. */ #define SKCIPHER_ALG_COMMON { \ unsigned int min_keysize; \ unsigned int max_keysize; \ unsigned int ivsize; \ unsigned int chunksize; \ unsigned int statesize; \ \ struct crypto_alg base; \ } struct skcipher_alg_common SKCIPHER_ALG_COMMON; /** * struct skcipher_alg - symmetric key cipher definition * @setkey: Set key for the transformation. This function is used to either * program a supplied key into the hardware or store the key in the * transformation context for programming it later. Note that this * function does modify the transformation context. This function can * be called multiple times during the existence of the transformation * object, so one must make sure the key is properly reprogrammed into * the hardware. This function is also responsible for checking the key * length for validity. In case a software fallback was put in place in * the @cra_init call, this function might need to use the fallback if * the algorithm doesn't support all of the key sizes. * @encrypt: Encrypt a scatterlist of blocks. This function is used to encrypt * the supplied scatterlist containing the blocks of data. The crypto * API consumer is responsible for aligning the entries of the * scatterlist properly and making sure the chunks are correctly * sized. In case a software fallback was put in place in the * @cra_init call, this function might need to use the fallback if * the algorithm doesn't support all of the key sizes. In case the * key was stored in transformation context, the key might need to be * re-programmed into the hardware in this function. This function * shall not modify the transformation context, as this function may * be called in parallel with the same transformation object. * @decrypt: Decrypt a single block. This is a reverse counterpart to @encrypt * and the conditions are exactly the same. * @export: Export partial state of the transformation. This function dumps the * entire state of the ongoing transformation into a provided block of * data so it can be @import 'ed back later on. This is useful in case * you want to save partial result of the transformation after * processing certain amount of data and reload this partial result * multiple times later on for multiple re-use. No data processing * happens at this point. * @import: Import partial state of the transformation. This function loads the * entire state of the ongoing transformation from a provided block of * data so the transformation can continue from this point onward. No * data processing happens at this point. * @init: Initialize the cryptographic transformation object. This function * is used to initialize the cryptographic transformation object. * This function is called only once at the instantiation time, right * after the transformation context was allocated. In case the * cryptographic hardware has some special requirements which need to * be handled by software, this function shall check for the precise * requirement of the transformation and put any software fallbacks * in place. * @exit: Deinitialize the cryptographic transformation object. This is a * counterpart to @init, used to remove various changes set in * @init. * @walksize: Equal to the chunk size except in cases where the algorithm is * considerably more efficient if it can operate on multiple chunks * in parallel. Should be a multiple of chunksize. * @co: see struct skcipher_alg_common * * All fields except @ivsize are mandatory and must be filled. */ struct skcipher_alg { int (*setkey)(struct crypto_skcipher *tfm, const u8 *key, unsigned int keylen); int (*encrypt)(struct skcipher_request *req); int (*decrypt)(struct skcipher_request *req); int (*export)(struct skcipher_request *req, void *out); int (*import)(struct skcipher_request *req, const void *in); int (*init)(struct crypto_skcipher *tfm); void (*exit)(struct crypto_skcipher *tfm); unsigned int walksize; union { struct SKCIPHER_ALG_COMMON; struct skcipher_alg_common co; }; }; /** * struct lskcipher_alg - linear symmetric key cipher definition * @setkey: Set key for the transformation. This function is used to either * program a supplied key into the hardware or store the key in the * transformation context for programming it later. Note that this * function does modify the transformation context. This function can * be called multiple times during the existence of the transformation * object, so one must make sure the key is properly reprogrammed into * the hardware. This function is also responsible for checking the key * length for validity. In case a software fallback was put in place in * the @cra_init call, this function might need to use the fallback if * the algorithm doesn't support all of the key sizes. * @encrypt: Encrypt a number of bytes. This function is used to encrypt * the supplied data. This function shall not modify * the transformation context, as this function may be called * in parallel with the same transformation object. Data * may be left over if length is not a multiple of blocks * and there is more to come (final == false). The number of * left-over bytes should be returned in case of success. * The siv field shall be as long as ivsize + statesize with * the IV placed at the front. The state will be used by the * algorithm internally. * @decrypt: Decrypt a number of bytes. This is a reverse counterpart to * @encrypt and the conditions are exactly the same. * @init: Initialize the cryptographic transformation object. This function * is used to initialize the cryptographic transformation object. * This function is called only once at the instantiation time, right * after the transformation context was allocated. * @exit: Deinitialize the cryptographic transformation object. This is a * counterpart to @init, used to remove various changes set in * @init. * @co: see struct skcipher_alg_common */ struct lskcipher_alg { int (*setkey)(struct crypto_lskcipher *tfm, const u8 *key, unsigned int keylen); int (*encrypt)(struct crypto_lskcipher *tfm, const u8 *src, u8 *dst, unsigned len, u8 *siv, u32 flags); int (*decrypt)(struct crypto_lskcipher *tfm, const u8 *src, u8 *dst, unsigned len, u8 *siv, u32 flags); int (*init)(struct crypto_lskcipher *tfm); void (*exit)(struct crypto_lskcipher *tfm); struct skcipher_alg_common co; }; #define MAX_SYNC_SKCIPHER_REQSIZE 384 /* * This performs a type-check against the "_tfm" argument to make sure * all users have the correct skcipher tfm for doing on-stack requests. */ #define SYNC_SKCIPHER_REQUEST_ON_STACK(name, _tfm) \ char __##name##_desc[sizeof(struct skcipher_request) + \ MAX_SYNC_SKCIPHER_REQSIZE \ ] CRYPTO_MINALIGN_ATTR; \ struct skcipher_request *name = \ (((struct skcipher_request *)__##name##_desc)->base.tfm = \ crypto_sync_skcipher_tfm((_tfm)), \ (void *)__##name##_desc) /** * DOC: Symmetric Key Cipher API * * Symmetric key cipher API is used with the ciphers of type * CRYPTO_ALG_TYPE_SKCIPHER (listed as type "skcipher" in /proc/crypto). * * Asynchronous cipher operations imply that the function invocation for a * cipher request returns immediately before the completion of the operation. * The cipher request is scheduled as a separate kernel thread and therefore * load-balanced on the different CPUs via the process scheduler. To allow * the kernel crypto API to inform the caller about the completion of a cipher * request, the caller must provide a callback function. That function is * invoked with the cipher handle when the request completes. * * To support the asynchronous operation, additional information than just the * cipher handle must be supplied to the kernel crypto API. That additional * information is given by filling in the skcipher_request data structure. * * For the symmetric key cipher API, the state is maintained with the tfm * cipher handle. A single tfm can be used across multiple calls and in * parallel. For asynchronous block cipher calls, context data supplied and * only used by the caller can be referenced the request data structure in * addition to the IV used for the cipher request. The maintenance of such * state information would be important for a crypto driver implementer to * have, because when calling the callback function upon completion of the * cipher operation, that callback function may need some information about * which operation just finished if it invoked multiple in parallel. This * state information is unused by the kernel crypto API. */ static inline struct crypto_skcipher *__crypto_skcipher_cast( struct crypto_tfm *tfm) { return container_of(tfm, struct crypto_skcipher, base); } /** * crypto_alloc_skcipher() - allocate symmetric key cipher handle * @alg_name: is the cra_name / name or cra_driver_name / driver name of the * skcipher cipher * @type: specifies the type of the cipher * @mask: specifies the mask for the cipher * * Allocate a cipher handle for an skcipher. The returned struct * crypto_skcipher is the cipher handle that is required for any subsequent * API invocation for that skcipher. * * Return: allocated cipher handle in case of success; IS_ERR() is true in case * of an error, PTR_ERR() returns the error code. */ struct crypto_skcipher *crypto_alloc_skcipher(const char *alg_name, u32 type, u32 mask); struct crypto_sync_skcipher *crypto_alloc_sync_skcipher(const char *alg_name, u32 type, u32 mask); /** * crypto_alloc_lskcipher() - allocate linear symmetric key cipher handle * @alg_name: is the cra_name / name or cra_driver_name / driver name of the * lskcipher * @type: specifies the type of the cipher * @mask: specifies the mask for the cipher * * Allocate a cipher handle for an lskcipher. The returned struct * crypto_lskcipher is the cipher handle that is required for any subsequent * API invocation for that lskcipher. * * Return: allocated cipher handle in case of success; IS_ERR() is true in case * of an error, PTR_ERR() returns the error code. */ struct crypto_lskcipher *crypto_alloc_lskcipher(const char *alg_name, u32 type, u32 mask); static inline struct crypto_tfm *crypto_skcipher_tfm( struct crypto_skcipher *tfm) { return &tfm->base; } static inline struct crypto_tfm *crypto_lskcipher_tfm( struct crypto_lskcipher *tfm) { return &tfm->base; } static inline struct crypto_tfm *crypto_sync_skcipher_tfm( struct crypto_sync_skcipher *tfm) { return crypto_skcipher_tfm(&tfm->base); } /** * crypto_free_skcipher() - zeroize and free cipher handle * @tfm: cipher handle to be freed * * If @tfm is a NULL or error pointer, this function does nothing. */ static inline void crypto_free_skcipher(struct crypto_skcipher *tfm) { crypto_destroy_tfm(tfm, crypto_skcipher_tfm(tfm)); } static inline void crypto_free_sync_skcipher(struct crypto_sync_skcipher *tfm) { crypto_free_skcipher(&tfm->base); } /** * crypto_free_lskcipher() - zeroize and free cipher handle * @tfm: cipher handle to be freed * * If @tfm is a NULL or error pointer, this function does nothing. */ static inline void crypto_free_lskcipher(struct crypto_lskcipher *tfm) { crypto_destroy_tfm(tfm, crypto_lskcipher_tfm(tfm)); } /** * crypto_has_skcipher() - Search for the availability of an skcipher. * @alg_name: is the cra_name / name or cra_driver_name / driver name of the * skcipher * @type: specifies the type of the skcipher * @mask: specifies the mask for the skcipher * * Return: true when the skcipher is known to the kernel crypto API; false * otherwise */ int crypto_has_skcipher(const char *alg_name, u32 type, u32 mask); static inline const char *crypto_skcipher_driver_name( struct crypto_skcipher *tfm) { return crypto_tfm_alg_driver_name(crypto_skcipher_tfm(tfm)); } static inline const char *crypto_lskcipher_driver_name( struct crypto_lskcipher *tfm) { return crypto_tfm_alg_driver_name(crypto_lskcipher_tfm(tfm)); } static inline struct skcipher_alg_common *crypto_skcipher_alg_common( struct crypto_skcipher *tfm) { return container_of(crypto_skcipher_tfm(tfm)->__crt_alg, struct skcipher_alg_common, base); } static inline struct skcipher_alg *crypto_skcipher_alg( struct crypto_skcipher *tfm) { return container_of(crypto_skcipher_tfm(tfm)->__crt_alg, struct skcipher_alg, base); } static inline struct lskcipher_alg *crypto_lskcipher_alg( struct crypto_lskcipher *tfm) { return container_of(crypto_lskcipher_tfm(tfm)->__crt_alg, struct lskcipher_alg, co.base); } /** * crypto_skcipher_ivsize() - obtain IV size * @tfm: cipher handle * * The size of the IV for the skcipher referenced by the cipher handle is * returned. This IV size may be zero if the cipher does not need an IV. * * Return: IV size in bytes */ static inline unsigned int crypto_skcipher_ivsize(struct crypto_skcipher *tfm) { return crypto_skcipher_alg_common(tfm)->ivsize; } static inline unsigned int crypto_sync_skcipher_ivsize( struct crypto_sync_skcipher *tfm) { return crypto_skcipher_ivsize(&tfm->base); } /** * crypto_lskcipher_ivsize() - obtain IV size * @tfm: cipher handle * * The size of the IV for the lskcipher referenced by the cipher handle is * returned. This IV size may be zero if the cipher does not need an IV. * * Return: IV size in bytes */ static inline unsigned int crypto_lskcipher_ivsize( struct crypto_lskcipher *tfm) { return crypto_lskcipher_alg(tfm)->co.ivsize; } /** * crypto_skcipher_blocksize() - obtain block size of cipher * @tfm: cipher handle * * The block size for the skcipher referenced with the cipher handle is * returned. The caller may use that information to allocate appropriate * memory for the data returned by the encryption or decryption operation * * Return: block size of cipher */ static inline unsigned int crypto_skcipher_blocksize( struct crypto_skcipher *tfm) { return crypto_tfm_alg_blocksize(crypto_skcipher_tfm(tfm)); } /** * crypto_lskcipher_blocksize() - obtain block size of cipher * @tfm: cipher handle * * The block size for the lskcipher referenced with the cipher handle is * returned. The caller may use that information to allocate appropriate * memory for the data returned by the encryption or decryption operation * * Return: block size of cipher */ static inline unsigned int crypto_lskcipher_blocksize( struct crypto_lskcipher *tfm) { return crypto_tfm_alg_blocksize(crypto_lskcipher_tfm(tfm)); } /** * crypto_skcipher_chunksize() - obtain chunk size * @tfm: cipher handle * * The block size is set to one for ciphers such as CTR. However, * you still need to provide incremental updates in multiples of * the underlying block size as the IV does not have sub-block * granularity. This is known in this API as the chunk size. * * Return: chunk size in bytes */ static inline unsigned int crypto_skcipher_chunksize( struct crypto_skcipher *tfm) { return crypto_skcipher_alg_common(tfm)->chunksize; } /** * crypto_lskcipher_chunksize() - obtain chunk size * @tfm: cipher handle * * The block size is set to one for ciphers such as CTR. However, * you still need to provide incremental updates in multiples of * the underlying block size as the IV does not have sub-block * granularity. This is known in this API as the chunk size. * * Return: chunk size in bytes */ static inline unsigned int crypto_lskcipher_chunksize( struct crypto_lskcipher *tfm) { return crypto_lskcipher_alg(tfm)->co.chunksize; } /** * crypto_skcipher_statesize() - obtain state size * @tfm: cipher handle * * Some algorithms cannot be chained with the IV alone. They carry * internal state which must be replicated if data is to be processed * incrementally. The size of that state can be obtained with this * function. * * Return: state size in bytes */ static inline unsigned int crypto_skcipher_statesize( struct crypto_skcipher *tfm) { return crypto_skcipher_alg_common(tfm)->statesize; } /** * crypto_lskcipher_statesize() - obtain state size * @tfm: cipher handle * * Some algorithms cannot be chained with the IV alone. They carry * internal state which must be replicated if data is to be processed * incrementally. The size of that state can be obtained with this * function. * * Return: state size in bytes */ static inline unsigned int crypto_lskcipher_statesize( struct crypto_lskcipher *tfm) { return crypto_lskcipher_alg(tfm)->co.statesize; } static inline unsigned int crypto_sync_skcipher_blocksize( struct crypto_sync_skcipher *tfm) { return crypto_skcipher_blocksize(&tfm->base); } static inline unsigned int crypto_skcipher_alignmask( struct crypto_skcipher *tfm) { return crypto_tfm_alg_alignmask(crypto_skcipher_tfm(tfm)); } static inline unsigned int crypto_lskcipher_alignmask( struct crypto_lskcipher *tfm) { return crypto_tfm_alg_alignmask(crypto_lskcipher_tfm(tfm)); } static inline u32 crypto_skcipher_get_flags(struct crypto_skcipher *tfm) { return crypto_tfm_get_flags(crypto_skcipher_tfm(tfm)); } static inline void crypto_skcipher_set_flags(struct crypto_skcipher *tfm, u32 flags) { crypto_tfm_set_flags(crypto_skcipher_tfm(tfm), flags); } static inline void crypto_skcipher_clear_flags(struct crypto_skcipher *tfm, u32 flags) { crypto_tfm_clear_flags(crypto_skcipher_tfm(tfm), flags); } static inline u32 crypto_sync_skcipher_get_flags( struct crypto_sync_skcipher *tfm) { return crypto_skcipher_get_flags(&tfm->base); } static inline void crypto_sync_skcipher_set_flags( struct crypto_sync_skcipher *tfm, u32 flags) { crypto_skcipher_set_flags(&tfm->base, flags); } static inline void crypto_sync_skcipher_clear_flags( struct crypto_sync_skcipher *tfm, u32 flags) { crypto_skcipher_clear_flags(&tfm->base, flags); } static inline u32 crypto_lskcipher_get_flags(struct crypto_lskcipher *tfm) { return crypto_tfm_get_flags(crypto_lskcipher_tfm(tfm)); } static inline void crypto_lskcipher_set_flags(struct crypto_lskcipher *tfm, u32 flags) { crypto_tfm_set_flags(crypto_lskcipher_tfm(tfm), flags); } static inline void crypto_lskcipher_clear_flags(struct crypto_lskcipher *tfm, u32 flags) { crypto_tfm_clear_flags(crypto_lskcipher_tfm(tfm), flags); } /** * crypto_skcipher_setkey() - set key for cipher * @tfm: cipher handle * @key: buffer holding the key * @keylen: length of the key in bytes * * The caller provided key is set for the skcipher referenced by the cipher * handle. * * Note, the key length determines the cipher type. Many block ciphers implement * different cipher modes depending on the key size, such as AES-128 vs AES-192 * vs. AES-256. When providing a 16 byte key for an AES cipher handle, AES-128 * is performed. * * Return: 0 if the setting of the key was successful; < 0 if an error occurred */ int crypto_skcipher_setkey(struct crypto_skcipher *tfm, const u8 *key, unsigned int keylen); static inline int crypto_sync_skcipher_setkey(struct crypto_sync_skcipher *tfm, const u8 *key, unsigned int keylen) { return crypto_skcipher_setkey(&tfm->base, key, keylen); } /** * crypto_lskcipher_setkey() - set key for cipher * @tfm: cipher handle * @key: buffer holding the key * @keylen: length of the key in bytes * * The caller provided key is set for the lskcipher referenced by the cipher * handle. * * Note, the key length determines the cipher type. Many block ciphers implement * different cipher modes depending on the key size, such as AES-128 vs AES-192 * vs. AES-256. When providing a 16 byte key for an AES cipher handle, AES-128 * is performed. * * Return: 0 if the setting of the key was successful; < 0 if an error occurred */ int crypto_lskcipher_setkey(struct crypto_lskcipher *tfm, const u8 *key, unsigned int keylen); static inline unsigned int crypto_skcipher_min_keysize( struct crypto_skcipher *tfm) { return crypto_skcipher_alg_common(tfm)->min_keysize; } static inline unsigned int crypto_skcipher_max_keysize( struct crypto_skcipher *tfm) { return crypto_skcipher_alg_common(tfm)->max_keysize; } static inline unsigned int crypto_lskcipher_min_keysize( struct crypto_lskcipher *tfm) { return crypto_lskcipher_alg(tfm)->co.min_keysize; } static inline unsigned int crypto_lskcipher_max_keysize( struct crypto_lskcipher *tfm) { return crypto_lskcipher_alg(tfm)->co.max_keysize; } /** * crypto_skcipher_reqtfm() - obtain cipher handle from request * @req: skcipher_request out of which the cipher handle is to be obtained * * Return the crypto_skcipher handle when furnishing an skcipher_request * data structure. * * Return: crypto_skcipher handle */ static inline struct crypto_skcipher *crypto_skcipher_reqtfm( struct skcipher_request *req) { return __crypto_skcipher_cast(req->base.tfm); } static inline struct crypto_sync_skcipher *crypto_sync_skcipher_reqtfm( struct skcipher_request *req) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); return container_of(tfm, struct crypto_sync_skcipher, base); } /** * crypto_skcipher_encrypt() - encrypt plaintext * @req: reference to the skcipher_request handle that holds all information * needed to perform the cipher operation * * Encrypt plaintext data using the skcipher_request handle. That data * structure and how it is filled with data is discussed with the * skcipher_request_* functions. * * Return: 0 if the cipher operation was successful; < 0 if an error occurred */ int crypto_skcipher_encrypt(struct skcipher_request *req); /** * crypto_skcipher_decrypt() - decrypt ciphertext * @req: reference to the skcipher_request handle that holds all information * needed to perform the cipher operation * * Decrypt ciphertext data using the skcipher_request handle. That data * structure and how it is filled with data is discussed with the * skcipher_request_* functions. * * Return: 0 if the cipher operation was successful; < 0 if an error occurred */ int crypto_skcipher_decrypt(struct skcipher_request *req); /** * crypto_skcipher_export() - export partial state * @req: reference to the skcipher_request handle that holds all information * needed to perform the operation * @out: output buffer of sufficient size that can hold the state * * Export partial state of the transformation. This function dumps the * entire state of the ongoing transformation into a provided block of * data so it can be @import 'ed back later on. This is useful in case * you want to save partial result of the transformation after * processing certain amount of data and reload this partial result * multiple times later on for multiple re-use. No data processing * happens at this point. * * Return: 0 if the cipher operation was successful; < 0 if an error occurred */ int crypto_skcipher_export(struct skcipher_request *req, void *out); /** * crypto_skcipher_import() - import partial state * @req: reference to the skcipher_request handle that holds all information * needed to perform the operation * @in: buffer holding the state * * Import partial state of the transformation. This function loads the * entire state of the ongoing transformation from a provided block of * data so the transformation can continue from this point onward. No * data processing happens at this point. * * Return: 0 if the cipher operation was successful; < 0 if an error occurred */ int crypto_skcipher_import(struct skcipher_request *req, const void *in); /** * crypto_lskcipher_encrypt() - encrypt plaintext * @tfm: lskcipher handle * @src: source buffer * @dst: destination buffer * @len: number of bytes to process * @siv: IV + state for the cipher operation. The length of the IV must * comply with the IV size defined by crypto_lskcipher_ivsize. The * IV is then followed with a buffer with the length as specified by * crypto_lskcipher_statesize. * Encrypt plaintext data using the lskcipher handle. * * Return: >=0 if the cipher operation was successful, if positive * then this many bytes have been left unprocessed; * < 0 if an error occurred */ int crypto_lskcipher_encrypt(struct crypto_lskcipher *tfm, const u8 *src, u8 *dst, unsigned len, u8 *siv); /** * crypto_lskcipher_decrypt() - decrypt ciphertext * @tfm: lskcipher handle * @src: source buffer * @dst: destination buffer * @len: number of bytes to process * @siv: IV + state for the cipher operation. The length of the IV must * comply with the IV size defined by crypto_lskcipher_ivsize. The * IV is then followed with a buffer with the length as specified by * crypto_lskcipher_statesize. * * Decrypt ciphertext data using the lskcipher handle. * * Return: >=0 if the cipher operation was successful, if positive * then this many bytes have been left unprocessed; * < 0 if an error occurred */ int crypto_lskcipher_decrypt(struct crypto_lskcipher *tfm, const u8 *src, u8 *dst, unsigned len, u8 *siv); /** * DOC: Symmetric Key Cipher Request Handle * * The skcipher_request data structure contains all pointers to data * required for the symmetric key cipher operation. This includes the cipher * handle (which can be used by multiple skcipher_request instances), pointer * to plaintext and ciphertext, asynchronous callback function, etc. It acts * as a handle to the skcipher_request_* API calls in a similar way as * skcipher handle to the crypto_skcipher_* API calls. */ /** * crypto_skcipher_reqsize() - obtain size of the request data structure * @tfm: cipher handle * * Return: number of bytes */ static inline unsigned int crypto_skcipher_reqsize(struct crypto_skcipher *tfm) { return tfm->reqsize; } /** * skcipher_request_set_tfm() - update cipher handle reference in request * @req: request handle to be modified * @tfm: cipher handle that shall be added to the request handle * * Allow the caller to replace the existing skcipher handle in the request * data structure with a different one. */ static inline void skcipher_request_set_tfm(struct skcipher_request *req, struct crypto_skcipher *tfm) { req->base.tfm = crypto_skcipher_tfm(tfm); } static inline void skcipher_request_set_sync_tfm(struct skcipher_request *req, struct crypto_sync_skcipher *tfm) { skcipher_request_set_tfm(req, &tfm->base); } static inline struct skcipher_request *skcipher_request_cast( struct crypto_async_request *req) { return container_of(req, struct skcipher_request, base); } /** * skcipher_request_alloc() - allocate request data structure * @tfm: cipher handle to be registered with the request * @gfp: memory allocation flag that is handed to kmalloc by the API call. * * Allocate the request data structure that must be used with the skcipher * encrypt and decrypt API calls. During the allocation, the provided skcipher * handle is registered in the request data structure. * * Return: allocated request handle in case of success, or NULL if out of memory */ static inline struct skcipher_request *skcipher_request_alloc_noprof( struct crypto_skcipher *tfm, gfp_t gfp) { struct skcipher_request *req; req = kmalloc_noprof(sizeof(struct skcipher_request) + crypto_skcipher_reqsize(tfm), gfp); if (likely(req)) skcipher_request_set_tfm(req, tfm); return req; } #define skcipher_request_alloc(...) alloc_hooks(skcipher_request_alloc_noprof(__VA_ARGS__)) /** * skcipher_request_free() - zeroize and free request data structure * @req: request data structure cipher handle to be freed */ static inline void skcipher_request_free(struct skcipher_request *req) { kfree_sensitive(req); } static inline void skcipher_request_zero(struct skcipher_request *req) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); memzero_explicit(req, sizeof(*req) + crypto_skcipher_reqsize(tfm)); } /** * skcipher_request_set_callback() - set asynchronous callback function * @req: request handle * @flags: specify zero or an ORing of the flags * CRYPTO_TFM_REQ_MAY_BACKLOG the request queue may back log and * increase the wait queue beyond the initial maximum size; * CRYPTO_TFM_REQ_MAY_SLEEP the request processing may sleep * @compl: callback function pointer to be registered with the request handle * @data: The data pointer refers to memory that is not used by the kernel * crypto API, but provided to the callback function for it to use. Here, * the caller can provide a reference to memory the callback function can * operate on. As the callback function is invoked asynchronously to the * related functionality, it may need to access data structures of the * related functionality which can be referenced using this pointer. The * callback function can access the memory via the "data" field in the * crypto_async_request data structure provided to the callback function. * * This function allows setting the callback function that is triggered once the * cipher operation completes. * * The callback function is registered with the skcipher_request handle and * must comply with the following template:: * * void callback_function(struct crypto_async_request *req, int error) */ static inline void skcipher_request_set_callback(struct skcipher_request *req, u32 flags, crypto_completion_t compl, void *data) { req->base.complete = compl; req->base.data = data; req->base.flags = flags; } /** * skcipher_request_set_crypt() - set data buffers * @req: request handle * @src: source scatter / gather list * @dst: destination scatter / gather list * @cryptlen: number of bytes to process from @src * @iv: IV for the cipher operation which must comply with the IV size defined * by crypto_skcipher_ivsize * * This function allows setting of the source data and destination data * scatter / gather lists. * * For encryption, the source is treated as the plaintext and the * destination is the ciphertext. For a decryption operation, the use is * reversed - the source is the ciphertext and the destination is the plaintext. */ static inline void skcipher_request_set_crypt( struct skcipher_request *req, struct scatterlist *src, struct scatterlist *dst, unsigned int cryptlen, void *iv) { req->src = src; req->dst = dst; req->cryptlen = cryptlen; req->iv = iv; } #endif /* _CRYPTO_SKCIPHER_H */ |
| 6 2 1 1 1 1 1 8 5 1 1 1 7 4 1 1 1 1 5 3 1 1 1 7 2 1 1 1 2 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 | // SPDX-License-Identifier: GPL-2.0 #include <linux/kernel.h> #include <linux/errno.h> #include <linux/fs.h> #include <linux/file.h> #include <linux/mm.h> #include <linux/slab.h> #include <linux/namei.h> #include <linux/io_uring.h> #include <uapi/linux/io_uring.h> #include "../fs/internal.h" #include "io_uring.h" #include "fs.h" struct io_rename { struct file *file; int old_dfd; int new_dfd; struct delayed_filename oldpath; struct delayed_filename newpath; int flags; }; struct io_unlink { struct file *file; int dfd; int flags; struct delayed_filename filename; }; struct io_mkdir { struct file *file; int dfd; umode_t mode; struct delayed_filename filename; }; struct io_link { struct file *file; int old_dfd; int new_dfd; struct delayed_filename oldpath; struct delayed_filename newpath; int flags; }; int io_renameat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_rename *ren = io_kiocb_to_cmd(req, struct io_rename); const char __user *oldf, *newf; int err; if (sqe->buf_index || sqe->splice_fd_in) return -EINVAL; if (unlikely(req->flags & REQ_F_FIXED_FILE)) return -EBADF; ren->old_dfd = READ_ONCE(sqe->fd); oldf = u64_to_user_ptr(READ_ONCE(sqe->addr)); newf = u64_to_user_ptr(READ_ONCE(sqe->addr2)); ren->new_dfd = READ_ONCE(sqe->len); ren->flags = READ_ONCE(sqe->rename_flags); err = delayed_getname(&ren->oldpath, oldf); if (unlikely(err)) return err; err = delayed_getname(&ren->newpath, newf); if (unlikely(err)) { dismiss_delayed_filename(&ren->oldpath); return err; } req->flags |= REQ_F_NEED_CLEANUP; req->flags |= REQ_F_FORCE_ASYNC; return 0; } int io_renameat(struct io_kiocb *req, unsigned int issue_flags) { struct io_rename *ren = io_kiocb_to_cmd(req, struct io_rename); CLASS(filename_complete_delayed, old)(&ren->oldpath); CLASS(filename_complete_delayed, new)(&ren->newpath); int ret; WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK); ret = filename_renameat2(ren->old_dfd, old, ren->new_dfd, new, ren->flags); req->flags &= ~REQ_F_NEED_CLEANUP; io_req_set_res(req, ret, 0); return IOU_COMPLETE; } void io_renameat_cleanup(struct io_kiocb *req) { struct io_rename *ren = io_kiocb_to_cmd(req, struct io_rename); dismiss_delayed_filename(&ren->oldpath); dismiss_delayed_filename(&ren->newpath); } int io_unlinkat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_unlink *un = io_kiocb_to_cmd(req, struct io_unlink); const char __user *fname; int err; if (sqe->off || sqe->len || sqe->buf_index || sqe->splice_fd_in) return -EINVAL; if (unlikely(req->flags & REQ_F_FIXED_FILE)) return -EBADF; un->dfd = READ_ONCE(sqe->fd); un->flags = READ_ONCE(sqe->unlink_flags); if (un->flags & ~AT_REMOVEDIR) return -EINVAL; fname = u64_to_user_ptr(READ_ONCE(sqe->addr)); err = delayed_getname(&un->filename, fname); if (unlikely(err)) return err; req->flags |= REQ_F_NEED_CLEANUP; req->flags |= REQ_F_FORCE_ASYNC; return 0; } int io_unlinkat(struct io_kiocb *req, unsigned int issue_flags) { struct io_unlink *un = io_kiocb_to_cmd(req, struct io_unlink); CLASS(filename_complete_delayed, name)(&un->filename); int ret; WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK); if (un->flags & AT_REMOVEDIR) ret = filename_rmdir(un->dfd, name); else ret = filename_unlinkat(un->dfd, name); req->flags &= ~REQ_F_NEED_CLEANUP; io_req_set_res(req, ret, 0); return IOU_COMPLETE; } void io_unlinkat_cleanup(struct io_kiocb *req) { struct io_unlink *ul = io_kiocb_to_cmd(req, struct io_unlink); dismiss_delayed_filename(&ul->filename); } int io_mkdirat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_mkdir *mkd = io_kiocb_to_cmd(req, struct io_mkdir); const char __user *fname; int err; if (sqe->off || sqe->rw_flags || sqe->buf_index || sqe->splice_fd_in) return -EINVAL; if (unlikely(req->flags & REQ_F_FIXED_FILE)) return -EBADF; mkd->dfd = READ_ONCE(sqe->fd); mkd->mode = READ_ONCE(sqe->len); fname = u64_to_user_ptr(READ_ONCE(sqe->addr)); err = delayed_getname(&mkd->filename, fname); if (unlikely(err)) return err; req->flags |= REQ_F_NEED_CLEANUP; req->flags |= REQ_F_FORCE_ASYNC; return 0; } int io_mkdirat(struct io_kiocb *req, unsigned int issue_flags) { struct io_mkdir *mkd = io_kiocb_to_cmd(req, struct io_mkdir); CLASS(filename_complete_delayed, name)(&mkd->filename); int ret; WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK); ret = filename_mkdirat(mkd->dfd, name, mkd->mode); req->flags &= ~REQ_F_NEED_CLEANUP; io_req_set_res(req, ret, 0); return IOU_COMPLETE; } void io_mkdirat_cleanup(struct io_kiocb *req) { struct io_mkdir *md = io_kiocb_to_cmd(req, struct io_mkdir); dismiss_delayed_filename(&md->filename); } int io_symlinkat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_link *sl = io_kiocb_to_cmd(req, struct io_link); const char __user *oldpath, *newpath; int err; if (sqe->len || sqe->rw_flags || sqe->buf_index || sqe->splice_fd_in) return -EINVAL; if (unlikely(req->flags & REQ_F_FIXED_FILE)) return -EBADF; sl->new_dfd = READ_ONCE(sqe->fd); oldpath = u64_to_user_ptr(READ_ONCE(sqe->addr)); newpath = u64_to_user_ptr(READ_ONCE(sqe->addr2)); err = delayed_getname(&sl->oldpath, oldpath); if (unlikely(err)) return err; err = delayed_getname(&sl->newpath, newpath); if (unlikely(err)) { dismiss_delayed_filename(&sl->oldpath); return err; } req->flags |= REQ_F_NEED_CLEANUP; req->flags |= REQ_F_FORCE_ASYNC; return 0; } int io_symlinkat(struct io_kiocb *req, unsigned int issue_flags) { struct io_link *sl = io_kiocb_to_cmd(req, struct io_link); CLASS(filename_complete_delayed, old)(&sl->oldpath); CLASS(filename_complete_delayed, new)(&sl->newpath); int ret; WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK); ret = filename_symlinkat(old, sl->new_dfd, new); req->flags &= ~REQ_F_NEED_CLEANUP; io_req_set_res(req, ret, 0); return IOU_COMPLETE; } int io_linkat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_link *lnk = io_kiocb_to_cmd(req, struct io_link); const char __user *oldf, *newf; int err; if (sqe->buf_index || sqe->splice_fd_in) return -EINVAL; if (unlikely(req->flags & REQ_F_FIXED_FILE)) return -EBADF; lnk->old_dfd = READ_ONCE(sqe->fd); lnk->new_dfd = READ_ONCE(sqe->len); oldf = u64_to_user_ptr(READ_ONCE(sqe->addr)); newf = u64_to_user_ptr(READ_ONCE(sqe->addr2)); lnk->flags = READ_ONCE(sqe->hardlink_flags); err = delayed_getname_uflags(&lnk->oldpath, oldf, lnk->flags); if (unlikely(err)) return err; err = delayed_getname(&lnk->newpath, newf); if (unlikely(err)) { dismiss_delayed_filename(&lnk->oldpath); return err; } req->flags |= REQ_F_NEED_CLEANUP; req->flags |= REQ_F_FORCE_ASYNC; return 0; } int io_linkat(struct io_kiocb *req, unsigned int issue_flags) { struct io_link *lnk = io_kiocb_to_cmd(req, struct io_link); CLASS(filename_complete_delayed, old)(&lnk->oldpath); CLASS(filename_complete_delayed, new)(&lnk->newpath); int ret; WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK); ret = filename_linkat(lnk->old_dfd, old, lnk->new_dfd, new, lnk->flags); req->flags &= ~REQ_F_NEED_CLEANUP; io_req_set_res(req, ret, 0); return IOU_COMPLETE; } void io_link_cleanup(struct io_kiocb *req) { struct io_link *sl = io_kiocb_to_cmd(req, struct io_link); dismiss_delayed_filename(&sl->oldpath); dismiss_delayed_filename(&sl->newpath); } |
| 35 45 110 6 32 1 11 6 33 32 15 18 14 2 33 31 10 7 2 1 4 1 1 3 2 2 2 3 3 9 2 4 3 3 2 2 1 1 3 1 2 3 2 4 3 2 1 1 1 7 3 3 3 3 3 3 6 120 4 116 39 39 68 2 7 5 8 13 13 51 36 35 35 35 35 44 6 44 1 44 2 51 51 36 51 51 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 | // SPDX-License-Identifier: GPL-2.0 /* * fs/partitions/msdos.c * * Code extracted from drivers/block/genhd.c * Copyright (C) 1991-1998 Linus Torvalds * * Thanks to Branko Lankester, lankeste@fwi.uva.nl, who found a bug * in the early extended-partition checks and added DM partitions * * Support for DiskManager v6.0x added by Mark Lord, * with information provided by OnTrack. This now works for linux fdisk * and LILO, as well as loadlin and bootln. Note that disks other than * /dev/hda *must* have a "DOS" type 0x51 partition in the first slot (hda1). * * More flexible handling of extended partitions - aeb, 950831 * * Check partition table on IDE disks for common CHS translations * * Re-organised Feb 1998 Russell King * * BSD disklabel support by Yossi Gottlieb <yogo@math.tau.ac.il> * updated by Marc Espie <Marc.Espie@openbsd.org> * * Unixware slices support by Andrzej Krzysztofowicz <ankry@mif.pg.gda.pl> * and Krzysztof G. Baranowski <kgb@knm.org.pl> */ #include <linux/msdos_fs.h> #include <linux/msdos_partition.h> #include "check.h" #include "efi.h" /* * Many architectures don't like unaligned accesses, while * the nr_sects and start_sect partition table entries are * at a 2 (mod 4) address. */ #include <linux/unaligned.h> static inline sector_t nr_sects(struct msdos_partition *p) { return (sector_t)get_unaligned_le32(&p->nr_sects); } static inline sector_t start_sect(struct msdos_partition *p) { return (sector_t)get_unaligned_le32(&p->start_sect); } static inline int is_extended_partition(struct msdos_partition *p) { return (p->sys_ind == DOS_EXTENDED_PARTITION || p->sys_ind == WIN98_EXTENDED_PARTITION || p->sys_ind == LINUX_EXTENDED_PARTITION); } #define MSDOS_LABEL_MAGIC1 0x55 #define MSDOS_LABEL_MAGIC2 0xAA static inline int msdos_magic_present(unsigned char *p) { return (p[0] == MSDOS_LABEL_MAGIC1 && p[1] == MSDOS_LABEL_MAGIC2); } /* Value is EBCDIC 'IBMA' */ #define AIX_LABEL_MAGIC1 0xC9 #define AIX_LABEL_MAGIC2 0xC2 #define AIX_LABEL_MAGIC3 0xD4 #define AIX_LABEL_MAGIC4 0xC1 static int aix_magic_present(struct parsed_partitions *state, unsigned char *p) { struct msdos_partition *pt = (struct msdos_partition *) (p + 0x1be); Sector sect; unsigned char *d; int slot, ret = 0; if (!(p[0] == AIX_LABEL_MAGIC1 && p[1] == AIX_LABEL_MAGIC2 && p[2] == AIX_LABEL_MAGIC3 && p[3] == AIX_LABEL_MAGIC4)) return 0; /* * Assume the partition table is valid if Linux partitions exists. * Note that old Solaris/x86 partitions use the same indicator as * Linux swap partitions, so we consider that a Linux partition as * well. */ for (slot = 1; slot <= 4; slot++, pt++) { if (pt->sys_ind == SOLARIS_X86_PARTITION || pt->sys_ind == LINUX_RAID_PARTITION || pt->sys_ind == LINUX_DATA_PARTITION || pt->sys_ind == LINUX_LVM_PARTITION || is_extended_partition(pt)) return 0; } d = read_part_sector(state, 7, §); if (d) { if (d[0] == '_' && d[1] == 'L' && d[2] == 'V' && d[3] == 'M') ret = 1; put_dev_sector(sect); } return ret; } static void set_info(struct parsed_partitions *state, int slot, u32 disksig) { struct partition_meta_info *info = &state->parts[slot].info; snprintf(info->uuid, sizeof(info->uuid), "%08x-%02x", disksig, slot); info->volname[0] = 0; state->parts[slot].has_info = true; } /* * Create devices for each logical partition in an extended partition. * The logical partitions form a linked list, with each entry being * a partition table with two entries. The first entry * is the real data partition (with a start relative to the partition * table start). The second is a pointer to the next logical partition * (with a start relative to the entire extended partition). * We do not create a Linux partition for the partition tables, but * only for the actual data partitions. */ static void parse_extended(struct parsed_partitions *state, sector_t first_sector, sector_t first_size, u32 disksig) { struct msdos_partition *p; Sector sect; unsigned char *data; sector_t this_sector, this_size; sector_t sector_size; int loopct = 0; /* number of links followed without finding a data partition */ int i; sector_size = queue_logical_block_size(state->disk->queue) / 512; this_sector = first_sector; this_size = first_size; while (1) { if (++loopct > 100) return; if (state->next == state->limit) return; data = read_part_sector(state, this_sector, §); if (!data) return; if (!msdos_magic_present(data + 510)) goto done; p = (struct msdos_partition *) (data + 0x1be); /* * Usually, the first entry is the real data partition, * the 2nd entry is the next extended partition, or empty, * and the 3rd and 4th entries are unused. * However, DRDOS sometimes has the extended partition as * the first entry (when the data partition is empty), * and OS/2 seems to use all four entries. */ /* * First process the data partition(s) */ for (i = 0; i < 4; i++, p++) { sector_t offs, size, next; if (!nr_sects(p) || is_extended_partition(p)) continue; /* Check the 3rd and 4th entries - these sometimes contain random garbage */ offs = start_sect(p)*sector_size; size = nr_sects(p)*sector_size; next = this_sector + offs; if (i >= 2) { if (offs + size > this_size) continue; if (next < first_sector) continue; if (next + size > first_sector + first_size) continue; } put_partition(state, state->next, next, size); set_info(state, state->next, disksig); if (p->sys_ind == LINUX_RAID_PARTITION) state->parts[state->next].flags = ADDPART_FLAG_RAID; loopct = 0; if (++state->next == state->limit) goto done; } /* * Next, process the (first) extended partition, if present. * (So far, there seems to be no reason to make * parse_extended() recursive and allow a tree * of extended partitions.) * It should be a link to the next logical partition. */ p -= 4; for (i = 0; i < 4; i++, p++) if (nr_sects(p) && is_extended_partition(p)) break; if (i == 4) goto done; /* nothing left to do */ this_sector = first_sector + start_sect(p) * sector_size; this_size = nr_sects(p) * sector_size; put_dev_sector(sect); } done: put_dev_sector(sect); } #define SOLARIS_X86_NUMSLICE 16 #define SOLARIS_X86_VTOC_SANE (0x600DDEEEUL) struct solaris_x86_slice { __le16 s_tag; /* ID tag of partition */ __le16 s_flag; /* permission flags */ __le32 s_start; /* start sector no of partition */ __le32 s_size; /* # of blocks in partition */ }; struct solaris_x86_vtoc { unsigned int v_bootinfo[3]; /* info needed by mboot */ __le32 v_sanity; /* to verify vtoc sanity */ __le32 v_version; /* layout version */ char v_volume[8]; /* volume name */ __le16 v_sectorsz; /* sector size in bytes */ __le16 v_nparts; /* number of partitions */ unsigned int v_reserved[10]; /* free space */ struct solaris_x86_slice v_slice[SOLARIS_X86_NUMSLICE]; /* slice headers */ unsigned int timestamp[SOLARIS_X86_NUMSLICE]; /* timestamp */ char v_asciilabel[128]; /* for compatibility */ }; /* james@bpgc.com: Solaris has a nasty indicator: 0x82 which also indicates linux swap. Be careful before believing this is Solaris. */ static void parse_solaris_x86(struct parsed_partitions *state, sector_t offset, sector_t size, int origin) { #ifdef CONFIG_SOLARIS_X86_PARTITION Sector sect; struct solaris_x86_vtoc *v; int i; short max_nparts; v = read_part_sector(state, offset + 1, §); if (!v) return; if (le32_to_cpu(v->v_sanity) != SOLARIS_X86_VTOC_SANE) { put_dev_sector(sect); return; } { char tmp[1 + BDEVNAME_SIZE + 10 + 11 + 1]; snprintf(tmp, sizeof(tmp), " %s%d: <solaris:", state->name, origin); strlcat(state->pp_buf, tmp, PAGE_SIZE); } if (le32_to_cpu(v->v_version) != 1) { char tmp[64]; snprintf(tmp, sizeof(tmp), " cannot handle version %d vtoc>\n", le32_to_cpu(v->v_version)); strlcat(state->pp_buf, tmp, PAGE_SIZE); put_dev_sector(sect); return; } /* Ensure we can handle previous case of VTOC with 8 entries gracefully */ max_nparts = le16_to_cpu(v->v_nparts) > 8 ? SOLARIS_X86_NUMSLICE : 8; for (i = 0; i < max_nparts && state->next < state->limit; i++) { struct solaris_x86_slice *s = &v->v_slice[i]; char tmp[3 + 10 + 1 + 1]; if (s->s_size == 0) continue; snprintf(tmp, sizeof(tmp), " [s%d]", i); strlcat(state->pp_buf, tmp, PAGE_SIZE); /* solaris partitions are relative to current MS-DOS * one; must add the offset of the current partition */ put_partition(state, state->next++, le32_to_cpu(s->s_start)+offset, le32_to_cpu(s->s_size)); } put_dev_sector(sect); strlcat(state->pp_buf, " >\n", PAGE_SIZE); #endif } /* check against BSD src/sys/sys/disklabel.h for consistency */ #define BSD_DISKMAGIC (0x82564557UL) /* The disk magic number */ #define BSD_MAXPARTITIONS 16 #define OPENBSD_MAXPARTITIONS 16 #define BSD_FS_UNUSED 0 /* disklabel unused partition entry ID */ struct bsd_disklabel { __le32 d_magic; /* the magic number */ __s16 d_type; /* drive type */ __s16 d_subtype; /* controller/d_type specific */ char d_typename[16]; /* type name, e.g. "eagle" */ char d_packname[16]; /* pack identifier */ __u32 d_secsize; /* # of bytes per sector */ __u32 d_nsectors; /* # of data sectors per track */ __u32 d_ntracks; /* # of tracks per cylinder */ __u32 d_ncylinders; /* # of data cylinders per unit */ __u32 d_secpercyl; /* # of data sectors per cylinder */ __u32 d_secperunit; /* # of data sectors per unit */ __u16 d_sparespertrack; /* # of spare sectors per track */ __u16 d_sparespercyl; /* # of spare sectors per cylinder */ __u32 d_acylinders; /* # of alt. cylinders per unit */ __u16 d_rpm; /* rotational speed */ __u16 d_interleave; /* hardware sector interleave */ __u16 d_trackskew; /* sector 0 skew, per track */ __u16 d_cylskew; /* sector 0 skew, per cylinder */ __u32 d_headswitch; /* head switch time, usec */ __u32 d_trkseek; /* track-to-track seek, usec */ __u32 d_flags; /* generic flags */ #define NDDATA 5 __u32 d_drivedata[NDDATA]; /* drive-type specific information */ #define NSPARE 5 __u32 d_spare[NSPARE]; /* reserved for future use */ __le32 d_magic2; /* the magic number (again) */ __le16 d_checksum; /* xor of data incl. partitions */ /* filesystem and partition information: */ __le16 d_npartitions; /* number of partitions in following */ __le32 d_bbsize; /* size of boot area at sn0, bytes */ __le32 d_sbsize; /* max size of fs superblock, bytes */ struct bsd_partition { /* the partition table */ __le32 p_size; /* number of sectors in partition */ __le32 p_offset; /* starting sector */ __le32 p_fsize; /* filesystem basic fragment size */ __u8 p_fstype; /* filesystem type, see below */ __u8 p_frag; /* filesystem fragments per block */ __le16 p_cpg; /* filesystem cylinders per group */ } d_partitions[BSD_MAXPARTITIONS]; /* actually may be more */ }; #if defined(CONFIG_BSD_DISKLABEL) /* * Create devices for BSD partitions listed in a disklabel, under a * dos-like partition. See parse_extended() for more information. */ static void parse_bsd(struct parsed_partitions *state, sector_t offset, sector_t size, int origin, char *flavour, int max_partitions) { Sector sect; struct bsd_disklabel *l; struct bsd_partition *p; char tmp[64]; l = read_part_sector(state, offset + 1, §); if (!l) return; if (le32_to_cpu(l->d_magic) != BSD_DISKMAGIC) { put_dev_sector(sect); return; } snprintf(tmp, sizeof(tmp), " %s%d: <%s:", state->name, origin, flavour); strlcat(state->pp_buf, tmp, PAGE_SIZE); if (le16_to_cpu(l->d_npartitions) < max_partitions) max_partitions = le16_to_cpu(l->d_npartitions); for (p = l->d_partitions; p - l->d_partitions < max_partitions; p++) { sector_t bsd_start, bsd_size; if (state->next == state->limit) break; if (p->p_fstype == BSD_FS_UNUSED) continue; bsd_start = le32_to_cpu(p->p_offset); bsd_size = le32_to_cpu(p->p_size); /* FreeBSD has relative offset if C partition offset is zero */ if (memcmp(flavour, "bsd\0", 4) == 0 && le32_to_cpu(l->d_partitions[2].p_offset) == 0) bsd_start += offset; if (offset == bsd_start && size == bsd_size) /* full parent partition, we have it already */ continue; if (offset > bsd_start || offset+size < bsd_start+bsd_size) { strlcat(state->pp_buf, "bad subpartition - ignored\n", PAGE_SIZE); continue; } put_partition(state, state->next++, bsd_start, bsd_size); } put_dev_sector(sect); if (le16_to_cpu(l->d_npartitions) > max_partitions) { snprintf(tmp, sizeof(tmp), " (ignored %d more)", le16_to_cpu(l->d_npartitions) - max_partitions); strlcat(state->pp_buf, tmp, PAGE_SIZE); } strlcat(state->pp_buf, " >\n", PAGE_SIZE); } #endif static void parse_freebsd(struct parsed_partitions *state, sector_t offset, sector_t size, int origin) { #ifdef CONFIG_BSD_DISKLABEL parse_bsd(state, offset, size, origin, "bsd", BSD_MAXPARTITIONS); #endif } static void parse_netbsd(struct parsed_partitions *state, sector_t offset, sector_t size, int origin) { #ifdef CONFIG_BSD_DISKLABEL parse_bsd(state, offset, size, origin, "netbsd", BSD_MAXPARTITIONS); #endif } static void parse_openbsd(struct parsed_partitions *state, sector_t offset, sector_t size, int origin) { #ifdef CONFIG_BSD_DISKLABEL parse_bsd(state, offset, size, origin, "openbsd", OPENBSD_MAXPARTITIONS); #endif } #define UNIXWARE_DISKMAGIC (0xCA5E600DUL) /* The disk magic number */ #define UNIXWARE_DISKMAGIC2 (0x600DDEEEUL) /* The slice table magic nr */ #define UNIXWARE_NUMSLICE 16 #define UNIXWARE_FS_UNUSED 0 /* Unused slice entry ID */ struct unixware_slice { __le16 s_label; /* label */ __le16 s_flags; /* permission flags */ __le32 start_sect; /* starting sector */ __le32 nr_sects; /* number of sectors in slice */ }; struct unixware_disklabel { __le32 d_type; /* drive type */ __le32 d_magic; /* the magic number */ __le32 d_version; /* version number */ char d_serial[12]; /* serial number of the device */ __le32 d_ncylinders; /* # of data cylinders per device */ __le32 d_ntracks; /* # of tracks per cylinder */ __le32 d_nsectors; /* # of data sectors per track */ __le32 d_secsize; /* # of bytes per sector */ __le32 d_part_start; /* # of first sector of this partition*/ __le32 d_unknown1[12]; /* ? */ __le32 d_alt_tbl; /* byte offset of alternate table */ __le32 d_alt_len; /* byte length of alternate table */ __le32 d_phys_cyl; /* # of physical cylinders per device */ __le32 d_phys_trk; /* # of physical tracks per cylinder */ __le32 d_phys_sec; /* # of physical sectors per track */ __le32 d_phys_bytes; /* # of physical bytes per sector */ __le32 d_unknown2; /* ? */ __le32 d_unknown3; /* ? */ __le32 d_pad[8]; /* pad */ struct unixware_vtoc { __le32 v_magic; /* the magic number */ __le32 v_version; /* version number */ char v_name[8]; /* volume name */ __le16 v_nslices; /* # of slices */ __le16 v_unknown1; /* ? */ __le32 v_reserved[10]; /* reserved */ struct unixware_slice v_slice[UNIXWARE_NUMSLICE]; /* slice headers */ } vtoc; }; /* 408 */ /* * Create devices for Unixware partitions listed in a disklabel, under a * dos-like partition. See parse_extended() for more information. */ static void parse_unixware(struct parsed_partitions *state, sector_t offset, sector_t size, int origin) { #ifdef CONFIG_UNIXWARE_DISKLABEL Sector sect; struct unixware_disklabel *l; struct unixware_slice *p; l = read_part_sector(state, offset + 29, §); if (!l) return; if (le32_to_cpu(l->d_magic) != UNIXWARE_DISKMAGIC || le32_to_cpu(l->vtoc.v_magic) != UNIXWARE_DISKMAGIC2) { put_dev_sector(sect); return; } { char tmp[1 + BDEVNAME_SIZE + 10 + 12 + 1]; snprintf(tmp, sizeof(tmp), " %s%d: <unixware:", state->name, origin); strlcat(state->pp_buf, tmp, PAGE_SIZE); } p = &l->vtoc.v_slice[1]; /* I omit the 0th slice as it is the same as whole disk. */ while (p - &l->vtoc.v_slice[0] < UNIXWARE_NUMSLICE) { if (state->next == state->limit) break; if (p->s_label != UNIXWARE_FS_UNUSED) put_partition(state, state->next++, le32_to_cpu(p->start_sect), le32_to_cpu(p->nr_sects)); p++; } put_dev_sector(sect); strlcat(state->pp_buf, " >\n", PAGE_SIZE); #endif } #define MINIX_NR_SUBPARTITIONS 4 /* * Minix 2.0.0/2.0.2 subpartition support. * Anand Krishnamurthy <anandk@wiproge.med.ge.com> * Rajeev V. Pillai <rajeevvp@yahoo.com> */ static void parse_minix(struct parsed_partitions *state, sector_t offset, sector_t size, int origin) { #ifdef CONFIG_MINIX_SUBPARTITION Sector sect; unsigned char *data; struct msdos_partition *p; int i; data = read_part_sector(state, offset, §); if (!data) return; p = (struct msdos_partition *)(data + 0x1be); /* The first sector of a Minix partition can have either * a secondary MBR describing its subpartitions, or * the normal boot sector. */ if (msdos_magic_present(data + 510) && p->sys_ind == MINIX_PARTITION) { /* subpartition table present */ char tmp[1 + BDEVNAME_SIZE + 10 + 9 + 1]; snprintf(tmp, sizeof(tmp), " %s%d: <minix:", state->name, origin); strlcat(state->pp_buf, tmp, PAGE_SIZE); for (i = 0; i < MINIX_NR_SUBPARTITIONS; i++, p++) { if (state->next == state->limit) break; /* add each partition in use */ if (p->sys_ind == MINIX_PARTITION) put_partition(state, state->next++, start_sect(p), nr_sects(p)); } strlcat(state->pp_buf, " >\n", PAGE_SIZE); } put_dev_sector(sect); #endif /* CONFIG_MINIX_SUBPARTITION */ } static struct { unsigned char id; void (*parse)(struct parsed_partitions *, sector_t, sector_t, int); } subtypes[] = { {FREEBSD_PARTITION, parse_freebsd}, {NETBSD_PARTITION, parse_netbsd}, {OPENBSD_PARTITION, parse_openbsd}, {MINIX_PARTITION, parse_minix}, {UNIXWARE_PARTITION, parse_unixware}, {SOLARIS_X86_PARTITION, parse_solaris_x86}, {NEW_SOLARIS_X86_PARTITION, parse_solaris_x86}, {0, NULL}, }; int msdos_partition(struct parsed_partitions *state) { sector_t sector_size; Sector sect; unsigned char *data; struct msdos_partition *p; struct fat_boot_sector *fb; int slot; u32 disksig; sector_size = queue_logical_block_size(state->disk->queue) / 512; data = read_part_sector(state, 0, §); if (!data) return -1; /* * Note order! (some AIX disks, e.g. unbootable kind, * have no MSDOS 55aa) */ if (aix_magic_present(state, data)) { put_dev_sector(sect); #ifdef CONFIG_AIX_PARTITION return aix_partition(state); #else strlcat(state->pp_buf, " [AIX]", PAGE_SIZE); return 0; #endif } if (!msdos_magic_present(data + 510)) { put_dev_sector(sect); return 0; } /* * Now that the 55aa signature is present, this is probably * either the boot sector of a FAT filesystem or a DOS-type * partition table. Reject this in case the boot indicator * is not 0 or 0x80. */ p = (struct msdos_partition *) (data + 0x1be); for (slot = 1; slot <= 4; slot++, p++) { if (p->boot_ind != 0 && p->boot_ind != 0x80) { /* * Even without a valid boot indicator value * its still possible this is valid FAT filesystem * without a partition table. */ fb = (struct fat_boot_sector *) data; if (slot == 1 && fb->reserved && fb->fats && fat_valid_media(fb->media)) { strlcat(state->pp_buf, "\n", PAGE_SIZE); put_dev_sector(sect); return 1; } else { put_dev_sector(sect); return 0; } } } #ifdef CONFIG_EFI_PARTITION p = (struct msdos_partition *) (data + 0x1be); for (slot = 1 ; slot <= 4 ; slot++, p++) { /* If this is an EFI GPT disk, msdos should ignore it. */ if (p->sys_ind == EFI_PMBR_OSTYPE_EFI_GPT) { put_dev_sector(sect); return 0; } } #endif p = (struct msdos_partition *) (data + 0x1be); disksig = le32_to_cpup((__le32 *)(data + 0x1b8)); /* * Look for partitions in two passes: * First find the primary and DOS-type extended partitions. * On the second pass look inside *BSD, Unixware and Solaris partitions. */ state->next = 5; for (slot = 1 ; slot <= 4 ; slot++, p++) { sector_t start = start_sect(p)*sector_size; sector_t size = nr_sects(p)*sector_size; if (!size) continue; if (is_extended_partition(p)) { /* * prevent someone doing mkfs or mkswap on an * extended partition, but leave room for LILO * FIXME: this uses one logical sector for > 512b * sector, although it may not be enough/proper. */ sector_t n = 2; n = min(size, max(sector_size, n)); put_partition(state, slot, start, n); strlcat(state->pp_buf, " <", PAGE_SIZE); parse_extended(state, start, size, disksig); strlcat(state->pp_buf, " >", PAGE_SIZE); continue; } put_partition(state, slot, start, size); set_info(state, slot, disksig); if (p->sys_ind == LINUX_RAID_PARTITION) state->parts[slot].flags = ADDPART_FLAG_RAID; if (p->sys_ind == DM6_PARTITION) strlcat(state->pp_buf, "[DM]", PAGE_SIZE); if (p->sys_ind == EZD_PARTITION) strlcat(state->pp_buf, "[EZD]", PAGE_SIZE); } strlcat(state->pp_buf, "\n", PAGE_SIZE); /* second pass - output for each on a separate line */ p = (struct msdos_partition *) (0x1be + data); for (slot = 1 ; slot <= 4 ; slot++, p++) { unsigned char id = p->sys_ind; int n; if (!nr_sects(p)) continue; for (n = 0; subtypes[n].parse && id != subtypes[n].id; n++) ; if (!subtypes[n].parse) continue; subtypes[n].parse(state, start_sect(p) * sector_size, nr_sects(p) * sector_size, slot); } put_dev_sector(sect); return 1; } |
| 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 | // SPDX-License-Identifier: GPL-2.0-only /* * 32-bit compatibility support for ELF format executables and core dumps. * * Copyright (C) 2007 Red Hat, Inc. All rights reserved. * * Red Hat Author: Roland McGrath. * * This file is used in a 64-bit kernel that wants to support 32-bit ELF. * asm/elf.h is responsible for defining the compat_* and COMPAT_* macros * used below, with definitions appropriate for 32-bit ABI compatibility. * * We use macros to rename the ABI types and machine-dependent * functions used in binfmt_elf.c to compat versions. */ #include <linux/elfcore-compat.h> #include <linux/time.h> #define ELF_COMPAT 1 /* * Rename the basic ELF layout types to refer to the 32-bit class of files. */ #undef ELF_CLASS #define ELF_CLASS ELFCLASS32 #undef elfhdr #undef elf_phdr #undef elf_shdr #undef elf_note #undef elf_addr_t #undef ELF_GNU_PROPERTY_ALIGN #define elfhdr elf32_hdr #define elf_phdr elf32_phdr #define elf_shdr elf32_shdr #define elf_note elf32_note #define elf_addr_t Elf32_Addr #define ELF_GNU_PROPERTY_ALIGN ELF32_GNU_PROPERTY_ALIGN /* * Some data types as stored in coredump. */ #define user_long_t compat_long_t #define user_siginfo_t compat_siginfo_t #define copy_siginfo_to_external copy_siginfo_to_external32 /* * The machine-dependent core note format types are defined in elfcore-compat.h, * which requires asm/elf.h to define compat_elf_gregset_t et al. */ #define elf_prstatus compat_elf_prstatus #define elf_prstatus_common compat_elf_prstatus_common #define elf_prpsinfo compat_elf_prpsinfo #undef ns_to_kernel_old_timeval #define ns_to_kernel_old_timeval ns_to_old_timeval32 /* * To use this file, asm/elf.h must define compat_elf_check_arch. * The other following macros can be defined if the compat versions * differ from the native ones, or omitted when they match. */ #undef elf_check_arch #define elf_check_arch compat_elf_check_arch #ifdef COMPAT_ELF_PLATFORM #undef ELF_PLATFORM #define ELF_PLATFORM COMPAT_ELF_PLATFORM #endif #ifdef COMPAT_ELF_HWCAP #undef ELF_HWCAP #define ELF_HWCAP COMPAT_ELF_HWCAP #endif #ifdef COMPAT_ELF_HWCAP2 #undef ELF_HWCAP2 #define ELF_HWCAP2 COMPAT_ELF_HWCAP2 #endif #ifdef COMPAT_ELF_HWCAP3 #undef ELF_HWCAP3 #define ELF_HWCAP3 COMPAT_ELF_HWCAP3 #endif #ifdef COMPAT_ELF_HWCAP4 #undef ELF_HWCAP4 #define ELF_HWCAP4 COMPAT_ELF_HWCAP4 #endif #ifdef COMPAT_ARCH_DLINFO #undef ARCH_DLINFO #define ARCH_DLINFO COMPAT_ARCH_DLINFO #endif #ifdef COMPAT_ELF_ET_DYN_BASE #undef ELF_ET_DYN_BASE #define ELF_ET_DYN_BASE COMPAT_ELF_ET_DYN_BASE #endif #ifdef COMPAT_ELF_PLAT_INIT #undef ELF_PLAT_INIT #define ELF_PLAT_INIT COMPAT_ELF_PLAT_INIT #endif #ifdef COMPAT_SET_PERSONALITY #undef SET_PERSONALITY #define SET_PERSONALITY COMPAT_SET_PERSONALITY #endif #ifdef compat_start_thread #define COMPAT_START_THREAD(ex, regs, new_ip, new_sp) \ compat_start_thread(regs, new_ip, new_sp) #endif #ifdef COMPAT_START_THREAD #undef START_THREAD #define START_THREAD COMPAT_START_THREAD #endif #ifdef compat_arch_setup_additional_pages #define COMPAT_ARCH_SETUP_ADDITIONAL_PAGES(bprm, ex, interpreter) \ compat_arch_setup_additional_pages(bprm, interpreter) #endif #ifdef COMPAT_ARCH_SETUP_ADDITIONAL_PAGES #undef ARCH_HAS_SETUP_ADDITIONAL_PAGES #define ARCH_HAS_SETUP_ADDITIONAL_PAGES 1 #undef ARCH_SETUP_ADDITIONAL_PAGES #define ARCH_SETUP_ADDITIONAL_PAGES COMPAT_ARCH_SETUP_ADDITIONAL_PAGES #endif #ifdef compat_elf_read_implies_exec #undef elf_read_implies_exec #define elf_read_implies_exec compat_elf_read_implies_exec #endif /* * Rename a few of the symbols that binfmt_elf.c will define. * These are all local so the names don't really matter, but it * might make some debugging less confusing not to duplicate them. */ #define elf_format compat_elf_format #define init_elf_binfmt init_compat_elf_binfmt #define exit_elf_binfmt exit_compat_elf_binfmt #define binfmt_elf_test_cases compat_binfmt_elf_test_cases #define binfmt_elf_test_suite compat_binfmt_elf_test_suite /* * We share all the actual code with the native (64-bit) version. */ #include "binfmt_elf.c" |
| 1 2 8 5 3 34 31 1 36 6 4 4 35 35 16 16 36 36 16 10 10 36 36 36 36 20 1 7 30 30 3 5 5 32 2 32 11 15 16 16 16 32 15 15 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 | // SPDX-License-Identifier: GPL-2.0+ /* * NILFS recovery logic * * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. * * Written by Ryusuke Konishi. */ #include <linux/buffer_head.h> #include <linux/blkdev.h> #include <linux/swap.h> #include <linux/slab.h> #include <linux/crc32.h> #include "nilfs.h" #include "segment.h" #include "sufile.h" #include "page.h" #include "segbuf.h" /* * Segment check result */ enum { NILFS_SEG_VALID, NILFS_SEG_NO_SUPER_ROOT, NILFS_SEG_FAIL_IO, NILFS_SEG_FAIL_MAGIC, NILFS_SEG_FAIL_SEQ, NILFS_SEG_FAIL_CHECKSUM_SUPER_ROOT, NILFS_SEG_FAIL_CHECKSUM_FULL, NILFS_SEG_FAIL_CONSISTENCY, }; /* work structure for recovery */ struct nilfs_recovery_block { ino_t ino; /* * Inode number of the file that this block * belongs to */ sector_t blocknr; /* block number */ __u64 vblocknr; /* virtual block number */ unsigned long blkoff; /* File offset of the data block (per block) */ struct list_head list; }; static int nilfs_warn_segment_error(struct super_block *sb, int err) { const char *msg = NULL; switch (err) { case NILFS_SEG_FAIL_IO: nilfs_err(sb, "I/O error reading segment"); return -EIO; case NILFS_SEG_FAIL_MAGIC: msg = "Magic number mismatch"; break; case NILFS_SEG_FAIL_SEQ: msg = "Sequence number mismatch"; break; case NILFS_SEG_FAIL_CHECKSUM_SUPER_ROOT: msg = "Checksum error in super root"; break; case NILFS_SEG_FAIL_CHECKSUM_FULL: msg = "Checksum error in segment payload"; break; case NILFS_SEG_FAIL_CONSISTENCY: msg = "Inconsistency found"; break; case NILFS_SEG_NO_SUPER_ROOT: msg = "No super root in the last segment"; break; default: nilfs_err(sb, "unrecognized segment error %d", err); return -EINVAL; } nilfs_warn(sb, "invalid segment: %s", msg); return -EINVAL; } /** * nilfs_compute_checksum - compute checksum of blocks continuously * @nilfs: nilfs object * @bhs: buffer head of start block * @sum: place to store result * @offset: offset bytes in the first block * @check_bytes: number of bytes to be checked * @start: DBN of start block * @nblock: number of blocks to be checked * * Return: 0 on success, or %-EIO if an I/O error occurs. */ static int nilfs_compute_checksum(struct the_nilfs *nilfs, struct buffer_head *bhs, u32 *sum, unsigned long offset, u64 check_bytes, sector_t start, unsigned long nblock) { unsigned int blocksize = nilfs->ns_blocksize; unsigned long size; u32 crc; BUG_ON(offset >= blocksize); check_bytes -= offset; size = min_t(u64, check_bytes, blocksize - offset); crc = crc32_le(nilfs->ns_crc_seed, (unsigned char *)bhs->b_data + offset, size); if (--nblock > 0) { do { struct buffer_head *bh; bh = __bread(nilfs->ns_bdev, ++start, blocksize); if (!bh) return -EIO; check_bytes -= size; size = min_t(u64, check_bytes, blocksize); crc = crc32_le(crc, bh->b_data, size); brelse(bh); } while (--nblock > 0); } *sum = crc; return 0; } /** * nilfs_read_super_root_block - read super root block * @nilfs: nilfs object * @sr_block: disk block number of the super root block * @pbh: address of a buffer_head pointer to return super root buffer * @check: CRC check flag * * Return: 0 on success, or one of the following negative error codes on * failure: * * %-EINVAL - Super root block corrupted. * * %-EIO - I/O error. */ int nilfs_read_super_root_block(struct the_nilfs *nilfs, sector_t sr_block, struct buffer_head **pbh, int check) { struct buffer_head *bh_sr; struct nilfs_super_root *sr; u32 crc; int ret; *pbh = NULL; bh_sr = __bread(nilfs->ns_bdev, sr_block, nilfs->ns_blocksize); if (unlikely(!bh_sr)) { ret = NILFS_SEG_FAIL_IO; goto failed; } sr = (struct nilfs_super_root *)bh_sr->b_data; if (check) { unsigned int bytes = le16_to_cpu(sr->sr_bytes); if (bytes == 0 || bytes > nilfs->ns_blocksize) { ret = NILFS_SEG_FAIL_CHECKSUM_SUPER_ROOT; goto failed_bh; } if (nilfs_compute_checksum( nilfs, bh_sr, &crc, sizeof(sr->sr_sum), bytes, sr_block, 1)) { ret = NILFS_SEG_FAIL_IO; goto failed_bh; } if (crc != le32_to_cpu(sr->sr_sum)) { ret = NILFS_SEG_FAIL_CHECKSUM_SUPER_ROOT; goto failed_bh; } } *pbh = bh_sr; return 0; failed_bh: brelse(bh_sr); failed: return nilfs_warn_segment_error(nilfs->ns_sb, ret); } /** * nilfs_read_log_header - read summary header of the specified log * @nilfs: nilfs object * @start_blocknr: start block number of the log * @sum: pointer to return segment summary structure * * Return: Buffer head pointer, or NULL if an I/O error occurs. */ static struct buffer_head * nilfs_read_log_header(struct the_nilfs *nilfs, sector_t start_blocknr, struct nilfs_segment_summary **sum) { struct buffer_head *bh_sum; bh_sum = __bread(nilfs->ns_bdev, start_blocknr, nilfs->ns_blocksize); if (bh_sum) *sum = (struct nilfs_segment_summary *)bh_sum->b_data; return bh_sum; } /** * nilfs_validate_log - verify consistency of log * @nilfs: nilfs object * @seg_seq: sequence number of segment * @bh_sum: buffer head of summary block * @sum: segment summary struct * * Return: 0 on success, or one of the following internal codes on failure: * * %NILFS_SEG_FAIL_MAGIC - Magic number mismatch. * * %NILFS_SEG_FAIL_SEQ - Sequence number mismatch. * * %NIFLS_SEG_FAIL_CONSISTENCY - Block count out of range. * * %NILFS_SEG_FAIL_IO - I/O error. * * %NILFS_SEG_FAIL_CHECKSUM_FULL - Full log checksum verification failed. */ static int nilfs_validate_log(struct the_nilfs *nilfs, u64 seg_seq, struct buffer_head *bh_sum, struct nilfs_segment_summary *sum) { unsigned long nblock; u32 crc; int ret; ret = NILFS_SEG_FAIL_MAGIC; if (le32_to_cpu(sum->ss_magic) != NILFS_SEGSUM_MAGIC) goto out; ret = NILFS_SEG_FAIL_SEQ; if (le64_to_cpu(sum->ss_seq) != seg_seq) goto out; nblock = le32_to_cpu(sum->ss_nblocks); ret = NILFS_SEG_FAIL_CONSISTENCY; if (unlikely(nblock == 0 || nblock > nilfs->ns_blocks_per_segment)) /* This limits the number of blocks read in the CRC check */ goto out; ret = NILFS_SEG_FAIL_IO; if (nilfs_compute_checksum(nilfs, bh_sum, &crc, sizeof(sum->ss_datasum), ((u64)nblock << nilfs->ns_blocksize_bits), bh_sum->b_blocknr, nblock)) goto out; ret = NILFS_SEG_FAIL_CHECKSUM_FULL; if (crc != le32_to_cpu(sum->ss_datasum)) goto out; ret = 0; out: return ret; } /** * nilfs_read_summary_info - read an item on summary blocks of a log * @nilfs: nilfs object * @pbh: the current buffer head on summary blocks [in, out] * @offset: the current byte offset on summary blocks [in, out] * @bytes: byte size of the item to be read * * Return: Kernel space address of current segment summary entry, or * NULL if an I/O error occurs. */ static void *nilfs_read_summary_info(struct the_nilfs *nilfs, struct buffer_head **pbh, unsigned int *offset, unsigned int bytes) { void *ptr; sector_t blocknr; BUG_ON((*pbh)->b_size < *offset); if (bytes > (*pbh)->b_size - *offset) { blocknr = (*pbh)->b_blocknr; brelse(*pbh); *pbh = __bread(nilfs->ns_bdev, blocknr + 1, nilfs->ns_blocksize); if (unlikely(!*pbh)) return NULL; *offset = 0; } ptr = (*pbh)->b_data + *offset; *offset += bytes; return ptr; } /** * nilfs_skip_summary_info - skip items on summary blocks of a log * @nilfs: nilfs object * @pbh: the current buffer head on summary blocks [in, out] * @offset: the current byte offset on summary blocks [in, out] * @bytes: byte size of the item to be skipped * @count: number of items to be skipped */ static void nilfs_skip_summary_info(struct the_nilfs *nilfs, struct buffer_head **pbh, unsigned int *offset, unsigned int bytes, unsigned long count) { unsigned int rest_item_in_current_block = ((*pbh)->b_size - *offset) / bytes; if (count <= rest_item_in_current_block) { *offset += bytes * count; } else { sector_t blocknr = (*pbh)->b_blocknr; unsigned int nitem_per_block = (*pbh)->b_size / bytes; unsigned int bcnt; count -= rest_item_in_current_block; bcnt = DIV_ROUND_UP(count, nitem_per_block); *offset = bytes * (count - (bcnt - 1) * nitem_per_block); brelse(*pbh); *pbh = __bread(nilfs->ns_bdev, blocknr + bcnt, nilfs->ns_blocksize); } } /** * nilfs_scan_dsync_log - get block information of a log written for data sync * @nilfs: nilfs object * @start_blocknr: start block number of the log * @sum: log summary information * @head: list head to add nilfs_recovery_block struct * * Return: 0 on success, or one of the following negative error codes on * failure: * * %-EIO - I/O error. * * %-ENOMEM - Insufficient memory available. */ static int nilfs_scan_dsync_log(struct the_nilfs *nilfs, sector_t start_blocknr, struct nilfs_segment_summary *sum, struct list_head *head) { struct buffer_head *bh; unsigned int offset; u32 nfinfo, sumbytes; sector_t blocknr; ino_t ino; int err = -EIO; nfinfo = le32_to_cpu(sum->ss_nfinfo); if (!nfinfo) return 0; sumbytes = le32_to_cpu(sum->ss_sumbytes); blocknr = start_blocknr + DIV_ROUND_UP(sumbytes, nilfs->ns_blocksize); bh = __bread(nilfs->ns_bdev, start_blocknr, nilfs->ns_blocksize); if (unlikely(!bh)) goto out; offset = le16_to_cpu(sum->ss_bytes); for (;;) { unsigned long nblocks, ndatablk, nnodeblk; struct nilfs_finfo *finfo; finfo = nilfs_read_summary_info(nilfs, &bh, &offset, sizeof(*finfo)); if (unlikely(!finfo)) goto out; ino = le64_to_cpu(finfo->fi_ino); nblocks = le32_to_cpu(finfo->fi_nblocks); ndatablk = le32_to_cpu(finfo->fi_ndatablk); nnodeblk = nblocks - ndatablk; while (ndatablk-- > 0) { struct nilfs_recovery_block *rb; struct nilfs_binfo_v *binfo; binfo = nilfs_read_summary_info(nilfs, &bh, &offset, sizeof(*binfo)); if (unlikely(!binfo)) goto out; rb = kmalloc(sizeof(*rb), GFP_NOFS); if (unlikely(!rb)) { err = -ENOMEM; goto out; } rb->ino = ino; rb->blocknr = blocknr++; rb->vblocknr = le64_to_cpu(binfo->bi_vblocknr); rb->blkoff = le64_to_cpu(binfo->bi_blkoff); /* INIT_LIST_HEAD(&rb->list); */ list_add_tail(&rb->list, head); } if (--nfinfo == 0) break; blocknr += nnodeblk; /* always 0 for data sync logs */ nilfs_skip_summary_info(nilfs, &bh, &offset, sizeof(__le64), nnodeblk); if (unlikely(!bh)) goto out; } err = 0; out: brelse(bh); /* brelse(NULL) is just ignored */ return err; } static void dispose_recovery_list(struct list_head *head) { while (!list_empty(head)) { struct nilfs_recovery_block *rb; rb = list_first_entry(head, struct nilfs_recovery_block, list); list_del(&rb->list); kfree(rb); } } struct nilfs_segment_entry { struct list_head list; __u64 segnum; }; static int nilfs_segment_list_add(struct list_head *head, __u64 segnum) { struct nilfs_segment_entry *ent = kmalloc(sizeof(*ent), GFP_NOFS); if (unlikely(!ent)) return -ENOMEM; ent->segnum = segnum; INIT_LIST_HEAD(&ent->list); list_add_tail(&ent->list, head); return 0; } void nilfs_dispose_segment_list(struct list_head *head) { while (!list_empty(head)) { struct nilfs_segment_entry *ent; ent = list_first_entry(head, struct nilfs_segment_entry, list); list_del(&ent->list); kfree(ent); } } static int nilfs_prepare_segment_for_recovery(struct the_nilfs *nilfs, struct super_block *sb, struct nilfs_recovery_info *ri) { struct list_head *head = &ri->ri_used_segments; struct nilfs_segment_entry *ent, *n; struct inode *sufile = nilfs->ns_sufile; __u64 segnum[4]; int err; int i; segnum[0] = nilfs->ns_segnum; segnum[1] = nilfs->ns_nextnum; segnum[2] = ri->ri_segnum; segnum[3] = ri->ri_nextnum; /* * Releasing the next segment of the latest super root. * The next segment is invalidated by this recovery. */ err = nilfs_sufile_free(sufile, segnum[1]); if (unlikely(err)) { if (err == -ENOENT) { nilfs_err(sb, "checkpoint log inconsistency at block %llu (segment %llu): next segment %llu is unallocated", (unsigned long long)nilfs->ns_last_pseg, (unsigned long long)nilfs->ns_segnum, (unsigned long long)segnum[1]); err = -EINVAL; } goto failed; } for (i = 1; i < 4; i++) { err = nilfs_segment_list_add(head, segnum[i]); if (unlikely(err)) goto failed; } /* * Collecting segments written after the latest super root. * These are marked dirty to avoid being reallocated in the next write. */ list_for_each_entry_safe(ent, n, head, list) { if (ent->segnum != segnum[0]) { err = nilfs_sufile_scrap(sufile, ent->segnum); if (unlikely(err)) goto failed; } list_del(&ent->list); kfree(ent); } /* Allocate new segments for recovery */ err = nilfs_sufile_alloc(sufile, &segnum[0]); if (unlikely(err)) goto failed; nilfs->ns_pseg_offset = 0; nilfs->ns_seg_seq = ri->ri_seq + 2; nilfs->ns_nextnum = nilfs->ns_segnum = segnum[0]; failed: /* No need to recover sufile because it will be destroyed on error */ return err; } static int nilfs_recovery_copy_block(struct the_nilfs *nilfs, struct nilfs_recovery_block *rb, loff_t pos, struct folio *folio) { struct buffer_head *bh_org; size_t from = offset_in_folio(folio, pos); bh_org = __bread(nilfs->ns_bdev, rb->blocknr, nilfs->ns_blocksize); if (unlikely(!bh_org)) return -EIO; memcpy_to_folio(folio, from, bh_org->b_data, bh_org->b_size); brelse(bh_org); return 0; } static int nilfs_recover_dsync_blocks(struct the_nilfs *nilfs, struct super_block *sb, struct nilfs_root *root, struct list_head *head, unsigned long *nr_salvaged_blocks) { struct inode *inode; struct nilfs_recovery_block *rb, *n; unsigned int blocksize = nilfs->ns_blocksize; struct folio *folio; loff_t pos; int err = 0, err2 = 0; list_for_each_entry_safe(rb, n, head, list) { inode = nilfs_iget(sb, root, rb->ino); if (IS_ERR(inode)) { err = PTR_ERR(inode); inode = NULL; goto failed_inode; } pos = rb->blkoff << inode->i_blkbits; err = block_write_begin(inode->i_mapping, pos, blocksize, &folio, nilfs_get_block); if (unlikely(err)) { loff_t isize = inode->i_size; if (pos + blocksize > isize) nilfs_write_failed(inode->i_mapping, pos + blocksize); goto failed_inode; } err = nilfs_recovery_copy_block(nilfs, rb, pos, folio); if (unlikely(err)) goto failed_folio; err = nilfs_set_file_dirty(inode, 1); if (unlikely(err)) goto failed_folio; block_write_end(pos, blocksize, blocksize, folio); folio_unlock(folio); folio_put(folio); (*nr_salvaged_blocks)++; goto next; failed_folio: folio_unlock(folio); folio_put(folio); failed_inode: nilfs_warn(sb, "error %d recovering data block (ino=%lu, block-offset=%llu)", err, (unsigned long)rb->ino, (unsigned long long)rb->blkoff); if (!err2) err2 = err; next: iput(inode); /* iput(NULL) is just ignored */ list_del_init(&rb->list); kfree(rb); } return err2; } /** * nilfs_do_roll_forward - salvage logical segments newer than the latest * checkpoint * @nilfs: nilfs object * @sb: super block instance * @root: NILFS root instance * @ri: pointer to a nilfs_recovery_info * * Return: 0 on success, or one of the following negative error codes on * failure: * * %-EINVAL - Log format error. * * %-EIO - I/O error. * * %-ENOMEM - Insufficient memory available. */ static int nilfs_do_roll_forward(struct the_nilfs *nilfs, struct super_block *sb, struct nilfs_root *root, struct nilfs_recovery_info *ri) { struct buffer_head *bh_sum = NULL; struct nilfs_segment_summary *sum = NULL; sector_t pseg_start; sector_t seg_start, seg_end; /* Starting/ending DBN of full segment */ unsigned long nsalvaged_blocks = 0; unsigned int flags; u64 seg_seq; __u64 segnum, nextnum = 0; int empty_seg = 0; int err = 0, ret; LIST_HEAD(dsync_blocks); /* list of data blocks to be recovered */ enum { RF_INIT_ST, RF_DSYNC_ST, /* scanning data-sync segments */ }; int state = RF_INIT_ST; pseg_start = ri->ri_lsegs_start; seg_seq = ri->ri_lsegs_start_seq; segnum = nilfs_get_segnum_of_block(nilfs, pseg_start); nilfs_get_segment_range(nilfs, segnum, &seg_start, &seg_end); while (segnum != ri->ri_segnum || pseg_start <= ri->ri_pseg_start) { brelse(bh_sum); bh_sum = nilfs_read_log_header(nilfs, pseg_start, &sum); if (!bh_sum) { err = -EIO; goto failed; } ret = nilfs_validate_log(nilfs, seg_seq, bh_sum, sum); if (ret) { if (ret == NILFS_SEG_FAIL_IO) { err = -EIO; goto failed; } goto strayed; } flags = le16_to_cpu(sum->ss_flags); if (flags & NILFS_SS_SR) goto confused; /* Found a valid partial segment; do recovery actions */ nextnum = nilfs_get_segnum_of_block(nilfs, le64_to_cpu(sum->ss_next)); empty_seg = 0; nilfs->ns_ctime = le64_to_cpu(sum->ss_create); if (!(flags & NILFS_SS_GC)) nilfs->ns_nongc_ctime = nilfs->ns_ctime; switch (state) { case RF_INIT_ST: if (!(flags & NILFS_SS_LOGBGN) || !(flags & NILFS_SS_SYNDT)) goto try_next_pseg; state = RF_DSYNC_ST; fallthrough; case RF_DSYNC_ST: if (!(flags & NILFS_SS_SYNDT)) goto confused; err = nilfs_scan_dsync_log(nilfs, pseg_start, sum, &dsync_blocks); if (unlikely(err)) goto failed; if (flags & NILFS_SS_LOGEND) { err = nilfs_recover_dsync_blocks( nilfs, sb, root, &dsync_blocks, &nsalvaged_blocks); if (unlikely(err)) goto failed; state = RF_INIT_ST; } break; /* Fall through to try_next_pseg */ } try_next_pseg: if (pseg_start == ri->ri_lsegs_end) break; pseg_start += le32_to_cpu(sum->ss_nblocks); if (pseg_start < seg_end) continue; goto feed_segment; strayed: if (pseg_start == ri->ri_lsegs_end) break; feed_segment: /* Looking to the next full segment */ if (empty_seg++) break; seg_seq++; segnum = nextnum; nilfs_get_segment_range(nilfs, segnum, &seg_start, &seg_end); pseg_start = seg_start; } if (nsalvaged_blocks) { nilfs_info(sb, "salvaged %lu blocks", nsalvaged_blocks); ri->ri_need_recovery = NILFS_RECOVERY_ROLLFORWARD_DONE; } out: brelse(bh_sum); dispose_recovery_list(&dsync_blocks); return err; confused: err = -EINVAL; failed: nilfs_err(sb, "error %d roll-forwarding partial segment at blocknr = %llu", err, (unsigned long long)pseg_start); goto out; } static void nilfs_finish_roll_forward(struct the_nilfs *nilfs, struct nilfs_recovery_info *ri) { struct buffer_head *bh; int err; if (nilfs_get_segnum_of_block(nilfs, ri->ri_lsegs_start) != nilfs_get_segnum_of_block(nilfs, ri->ri_super_root)) return; bh = __getblk(nilfs->ns_bdev, ri->ri_lsegs_start, nilfs->ns_blocksize); if (WARN_ON(!bh)) return; /* should never happen */ lock_buffer(bh); memset(bh->b_data, 0, bh->b_size); set_buffer_uptodate(bh); set_buffer_dirty(bh); unlock_buffer(bh); err = sync_dirty_buffer(bh); if (unlikely(err)) nilfs_warn(nilfs->ns_sb, "buffer sync write failed during post-cleaning of recovery."); brelse(bh); } /** * nilfs_abort_roll_forward - cleaning up after a failed rollforward recovery * @nilfs: nilfs object */ static void nilfs_abort_roll_forward(struct the_nilfs *nilfs) { struct nilfs_inode_info *ii, *n; LIST_HEAD(head); /* Abandon inodes that have read recovery data */ spin_lock(&nilfs->ns_inode_lock); list_splice_init(&nilfs->ns_dirty_files, &head); spin_unlock(&nilfs->ns_inode_lock); if (list_empty(&head)) return; set_nilfs_purging(nilfs); list_for_each_entry_safe(ii, n, &head, i_dirty) { spin_lock(&nilfs->ns_inode_lock); list_del_init(&ii->i_dirty); spin_unlock(&nilfs->ns_inode_lock); iput(&ii->vfs_inode); } clear_nilfs_purging(nilfs); } /** * nilfs_salvage_orphan_logs - salvage logs written after the latest checkpoint * @nilfs: nilfs object * @sb: super block instance * @ri: pointer to a nilfs_recovery_info struct to store search results. * * Return: 0 on success, or one of the following negative error codes on * failure: * * %-EINVAL - Inconsistent filesystem state. * * %-EIO - I/O error. * * %-ENOMEM - Insufficient memory available. * * %-ENOSPC - No space left on device (only in a panic state). * * %-ERESTARTSYS - Interrupted. */ int nilfs_salvage_orphan_logs(struct the_nilfs *nilfs, struct super_block *sb, struct nilfs_recovery_info *ri) { struct nilfs_root *root; int err; if (ri->ri_lsegs_start == 0 || ri->ri_lsegs_end == 0) return 0; err = nilfs_attach_checkpoint(sb, ri->ri_cno, true, &root); if (unlikely(err)) { nilfs_err(sb, "error %d loading the latest checkpoint", err); return err; } err = nilfs_do_roll_forward(nilfs, sb, root, ri); if (unlikely(err)) goto failed; if (ri->ri_need_recovery == NILFS_RECOVERY_ROLLFORWARD_DONE) { err = nilfs_prepare_segment_for_recovery(nilfs, sb, ri); if (unlikely(err)) { nilfs_err(sb, "error %d preparing segment for recovery", err); goto failed; } err = nilfs_attach_log_writer(sb, root); if (unlikely(err)) goto failed; set_nilfs_discontinued(nilfs); err = nilfs_construct_segment(sb); nilfs_detach_log_writer(sb); if (unlikely(err)) { nilfs_err(sb, "error %d writing segment for recovery", err); goto put_root; } nilfs_finish_roll_forward(nilfs, ri); } put_root: nilfs_put_root(root); return err; failed: nilfs_abort_roll_forward(nilfs); goto put_root; } /** * nilfs_search_super_root - search the latest valid super root * @nilfs: the_nilfs * @ri: pointer to a nilfs_recovery_info struct to store search results. * * nilfs_search_super_root() looks for the latest super-root from a partial * segment pointed by the superblock. It sets up struct the_nilfs through * this search. It fills nilfs_recovery_info (ri) required for recovery. * * Return: 0 on success, or one of the following negative error codes on * failure: * * %-EINVAL - No valid segment found. * * %-EIO - I/O error. * * %-ENOMEM - Insufficient memory available. */ int nilfs_search_super_root(struct the_nilfs *nilfs, struct nilfs_recovery_info *ri) { struct buffer_head *bh_sum = NULL; struct nilfs_segment_summary *sum = NULL; sector_t pseg_start, pseg_end, sr_pseg_start = 0; sector_t seg_start, seg_end; /* range of full segment (block number) */ sector_t b, end; unsigned long nblocks; unsigned int flags; u64 seg_seq; __u64 segnum, nextnum = 0; __u64 cno; LIST_HEAD(segments); int empty_seg = 0, scan_newer = 0; int ret; pseg_start = nilfs->ns_last_pseg; seg_seq = nilfs->ns_last_seq; cno = nilfs->ns_last_cno; segnum = nilfs_get_segnum_of_block(nilfs, pseg_start); /* Calculate range of segment */ nilfs_get_segment_range(nilfs, segnum, &seg_start, &seg_end); /* Read ahead segment */ b = seg_start; while (b <= seg_end) __breadahead(nilfs->ns_bdev, b++, nilfs->ns_blocksize); for (;;) { brelse(bh_sum); ret = NILFS_SEG_FAIL_IO; bh_sum = nilfs_read_log_header(nilfs, pseg_start, &sum); if (!bh_sum) goto failed; ret = nilfs_validate_log(nilfs, seg_seq, bh_sum, sum); if (ret) { if (ret == NILFS_SEG_FAIL_IO) goto failed; goto strayed; } nblocks = le32_to_cpu(sum->ss_nblocks); pseg_end = pseg_start + nblocks - 1; if (unlikely(pseg_end > seg_end)) { ret = NILFS_SEG_FAIL_CONSISTENCY; goto strayed; } /* A valid partial segment */ ri->ri_pseg_start = pseg_start; ri->ri_seq = seg_seq; ri->ri_segnum = segnum; nextnum = nilfs_get_segnum_of_block(nilfs, le64_to_cpu(sum->ss_next)); ri->ri_nextnum = nextnum; empty_seg = 0; flags = le16_to_cpu(sum->ss_flags); if (!(flags & NILFS_SS_SR) && !scan_newer) { /* * This will never happen because a superblock * (last_segment) always points to a pseg with * a super root. */ ret = NILFS_SEG_FAIL_CONSISTENCY; goto failed; } if (pseg_start == seg_start) { nilfs_get_segment_range(nilfs, nextnum, &b, &end); while (b <= end) __breadahead(nilfs->ns_bdev, b++, nilfs->ns_blocksize); } if (!(flags & NILFS_SS_SR)) { if (!ri->ri_lsegs_start && (flags & NILFS_SS_LOGBGN)) { ri->ri_lsegs_start = pseg_start; ri->ri_lsegs_start_seq = seg_seq; } if (flags & NILFS_SS_LOGEND) ri->ri_lsegs_end = pseg_start; goto try_next_pseg; } /* A valid super root was found. */ ri->ri_cno = cno++; ri->ri_super_root = pseg_end; ri->ri_lsegs_start = ri->ri_lsegs_end = 0; nilfs_dispose_segment_list(&segments); sr_pseg_start = pseg_start; nilfs->ns_pseg_offset = pseg_start + nblocks - seg_start; nilfs->ns_seg_seq = seg_seq; nilfs->ns_segnum = segnum; nilfs->ns_cno = cno; /* nilfs->ns_cno = ri->ri_cno + 1 */ nilfs->ns_ctime = le64_to_cpu(sum->ss_create); nilfs->ns_nextnum = nextnum; if (scan_newer) ri->ri_need_recovery = NILFS_RECOVERY_SR_UPDATED; else { if (nilfs->ns_mount_state & NILFS_VALID_FS) goto super_root_found; scan_newer = 1; } try_next_pseg: /* Standing on a course, or met an inconsistent state */ pseg_start += nblocks; if (pseg_start < seg_end) continue; goto feed_segment; strayed: /* Off the trail */ if (!scan_newer) /* * This can happen if a checkpoint was written without * barriers, or as a result of an I/O failure. */ goto failed; feed_segment: /* Looking to the next full segment */ if (empty_seg++) goto super_root_found; /* found a valid super root */ ret = nilfs_segment_list_add(&segments, segnum); if (unlikely(ret)) goto failed; seg_seq++; segnum = nextnum; nilfs_get_segment_range(nilfs, segnum, &seg_start, &seg_end); pseg_start = seg_start; } super_root_found: /* Updating pointers relating to the latest checkpoint */ brelse(bh_sum); list_splice_tail(&segments, &ri->ri_used_segments); nilfs->ns_last_pseg = sr_pseg_start; nilfs->ns_last_seq = nilfs->ns_seg_seq; nilfs->ns_last_cno = ri->ri_cno; return 0; failed: brelse(bh_sum); nilfs_dispose_segment_list(&segments); return ret < 0 ? ret : nilfs_warn_segment_error(nilfs->ns_sb, ret); } |
| 33 67 2 25 4 65 90 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 | /* SPDX-License-Identifier: GPL-2.0-only */ /* * ocfs2_fs.h * * On-disk structures for OCFS2. * * Copyright (C) 2002, 2004 Oracle. All rights reserved. */ #ifndef _OCFS2_FS_H #define _OCFS2_FS_H #include <linux/magic.h> /* Version */ #define OCFS2_MAJOR_REV_LEVEL 0 #define OCFS2_MINOR_REV_LEVEL 90 /* * An OCFS2 volume starts this way: * Sector 0: Valid ocfs1_vol_disk_hdr that cleanly fails to mount OCFS. * Sector 1: Valid ocfs1_vol_label that cleanly fails to mount OCFS. * Block OCFS2_SUPER_BLOCK_BLKNO: OCFS2 superblock. * * All other structures are found from the superblock information. * * OCFS2_SUPER_BLOCK_BLKNO is in blocks, not sectors. eg, for a * blocksize of 2K, it is 4096 bytes into disk. */ #define OCFS2_SUPER_BLOCK_BLKNO 2 /* * Cluster size limits. The maximum is kept arbitrarily at 1 MB, and could * grow if needed. */ #define OCFS2_MIN_CLUSTERSIZE 4096 #define OCFS2_MAX_CLUSTERSIZE 1048576 /* * Blocks cannot be bigger than clusters, so the maximum blocksize is the * minimum cluster size. */ #define OCFS2_MIN_BLOCKSIZE 512 #define OCFS2_MAX_BLOCKSIZE OCFS2_MIN_CLUSTERSIZE /* Object signatures */ #define OCFS2_SUPER_BLOCK_SIGNATURE "OCFSV2" #define OCFS2_INODE_SIGNATURE "INODE01" #define OCFS2_EXTENT_BLOCK_SIGNATURE "EXBLK01" #define OCFS2_GROUP_DESC_SIGNATURE "GROUP01" #define OCFS2_XATTR_BLOCK_SIGNATURE "XATTR01" #define OCFS2_DIR_TRAILER_SIGNATURE "DIRTRL1" #define OCFS2_DX_ROOT_SIGNATURE "DXDIR01" #define OCFS2_DX_LEAF_SIGNATURE "DXLEAF1" #define OCFS2_REFCOUNT_BLOCK_SIGNATURE "REFCNT1" /* Compatibility flags */ #define OCFS2_HAS_COMPAT_FEATURE(sb,mask) \ ( OCFS2_SB(sb)->s_feature_compat & (mask) ) #define OCFS2_HAS_RO_COMPAT_FEATURE(sb,mask) \ ( OCFS2_SB(sb)->s_feature_ro_compat & (mask) ) #define OCFS2_HAS_INCOMPAT_FEATURE(sb,mask) \ ( OCFS2_SB(sb)->s_feature_incompat & (mask) ) #define OCFS2_SET_COMPAT_FEATURE(sb,mask) \ OCFS2_SB(sb)->s_feature_compat |= (mask) #define OCFS2_SET_RO_COMPAT_FEATURE(sb,mask) \ OCFS2_SB(sb)->s_feature_ro_compat |= (mask) #define OCFS2_SET_INCOMPAT_FEATURE(sb,mask) \ OCFS2_SB(sb)->s_feature_incompat |= (mask) #define OCFS2_CLEAR_COMPAT_FEATURE(sb,mask) \ OCFS2_SB(sb)->s_feature_compat &= ~(mask) #define OCFS2_CLEAR_RO_COMPAT_FEATURE(sb,mask) \ OCFS2_SB(sb)->s_feature_ro_compat &= ~(mask) #define OCFS2_CLEAR_INCOMPAT_FEATURE(sb,mask) \ OCFS2_SB(sb)->s_feature_incompat &= ~(mask) #define OCFS2_FEATURE_COMPAT_SUPP (OCFS2_FEATURE_COMPAT_BACKUP_SB \ | OCFS2_FEATURE_COMPAT_JBD2_SB) #define OCFS2_FEATURE_INCOMPAT_SUPP (OCFS2_FEATURE_INCOMPAT_LOCAL_MOUNT \ | OCFS2_FEATURE_INCOMPAT_SPARSE_ALLOC \ | OCFS2_FEATURE_INCOMPAT_INLINE_DATA \ | OCFS2_FEATURE_INCOMPAT_EXTENDED_SLOT_MAP \ | OCFS2_FEATURE_INCOMPAT_USERSPACE_STACK \ | OCFS2_FEATURE_INCOMPAT_XATTR \ | OCFS2_FEATURE_INCOMPAT_META_ECC \ | OCFS2_FEATURE_INCOMPAT_INDEXED_DIRS \ | OCFS2_FEATURE_INCOMPAT_REFCOUNT_TREE \ | OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG \ | OCFS2_FEATURE_INCOMPAT_CLUSTERINFO \ | OCFS2_FEATURE_INCOMPAT_APPEND_DIO) #define OCFS2_FEATURE_RO_COMPAT_SUPP (OCFS2_FEATURE_RO_COMPAT_UNWRITTEN \ | OCFS2_FEATURE_RO_COMPAT_USRQUOTA \ | OCFS2_FEATURE_RO_COMPAT_GRPQUOTA) /* * Heartbeat-only devices are missing journals and other files. The * filesystem driver can't load them, but the library can. Never put * this in OCFS2_FEATURE_INCOMPAT_SUPP, *ever*. */ #define OCFS2_FEATURE_INCOMPAT_HEARTBEAT_DEV 0x0002 /* * tunefs sets this incompat flag before starting the resize and clears it * at the end. This flag protects users from inadvertently mounting the fs * after an aborted run without fsck-ing. */ #define OCFS2_FEATURE_INCOMPAT_RESIZE_INPROG 0x0004 /* Used to denote a non-clustered volume */ #define OCFS2_FEATURE_INCOMPAT_LOCAL_MOUNT 0x0008 /* Support for sparse allocation in b-trees */ #define OCFS2_FEATURE_INCOMPAT_SPARSE_ALLOC 0x0010 /* * Tunefs sets this incompat flag before starting an operation which * would require cleanup on abort. This is done to protect users from * inadvertently mounting the fs after an aborted run without * fsck-ing. * * s_tunefs_flags on the super block describes precisely which * operations were in progress. */ #define OCFS2_FEATURE_INCOMPAT_TUNEFS_INPROG 0x0020 /* Support for data packed into inode blocks */ #define OCFS2_FEATURE_INCOMPAT_INLINE_DATA 0x0040 /* * Support for alternate, userspace cluster stacks. If set, the superblock * field s_cluster_info contains a tag for the alternate stack in use as * well as the name of the cluster being joined. * mount.ocfs2 must pass in a matching stack name. * * If not set, the classic stack will be used. This is compatible with * all older versions. */ #define OCFS2_FEATURE_INCOMPAT_USERSPACE_STACK 0x0080 /* Support for the extended slot map */ #define OCFS2_FEATURE_INCOMPAT_EXTENDED_SLOT_MAP 0x100 /* Support for extended attributes */ #define OCFS2_FEATURE_INCOMPAT_XATTR 0x0200 /* Support for indexed directories */ #define OCFS2_FEATURE_INCOMPAT_INDEXED_DIRS 0x0400 /* Metadata checksum and error correction */ #define OCFS2_FEATURE_INCOMPAT_META_ECC 0x0800 /* Refcount tree support */ #define OCFS2_FEATURE_INCOMPAT_REFCOUNT_TREE 0x1000 /* Discontiguous block groups */ #define OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG 0x2000 /* * Incompat bit to indicate usable clusterinfo with stackflags for all * cluster stacks (userspace adnd o2cb). If this bit is set, * INCOMPAT_USERSPACE_STACK becomes superfluous and thus should not be set. */ #define OCFS2_FEATURE_INCOMPAT_CLUSTERINFO 0x4000 /* * Append Direct IO support */ #define OCFS2_FEATURE_INCOMPAT_APPEND_DIO 0x8000 /* * backup superblock flag is used to indicate that this volume * has backup superblocks. */ #define OCFS2_FEATURE_COMPAT_BACKUP_SB 0x0001 /* * The filesystem will correctly handle journal feature bits. */ #define OCFS2_FEATURE_COMPAT_JBD2_SB 0x0002 /* * Unwritten extents support. */ #define OCFS2_FEATURE_RO_COMPAT_UNWRITTEN 0x0001 /* * Maintain quota information for this filesystem */ #define OCFS2_FEATURE_RO_COMPAT_USRQUOTA 0x0002 #define OCFS2_FEATURE_RO_COMPAT_GRPQUOTA 0x0004 /* The byte offset of the first backup block will be 1G. * The following will be 4G, 16G, 64G, 256G and 1T. */ #define OCFS2_BACKUP_SB_START 1 << 30 /* the max backup superblock nums */ #define OCFS2_MAX_BACKUP_SUPERBLOCKS 6 /* * Flags on ocfs2_super_block.s_tunefs_flags */ #define OCFS2_TUNEFS_INPROG_REMOVE_SLOT 0x0001 /* Removing slots */ /* * Flags on ocfs2_dinode.i_flags */ #define OCFS2_VALID_FL (0x00000001) /* Inode is valid */ #define OCFS2_UNUSED2_FL (0x00000002) #define OCFS2_ORPHANED_FL (0x00000004) /* On the orphan list */ #define OCFS2_UNUSED3_FL (0x00000008) /* System inode flags */ #define OCFS2_SYSTEM_FL (0x00000010) /* System inode */ #define OCFS2_SUPER_BLOCK_FL (0x00000020) /* Super block */ #define OCFS2_LOCAL_ALLOC_FL (0x00000040) /* Slot local alloc bitmap */ #define OCFS2_BITMAP_FL (0x00000080) /* Allocation bitmap */ #define OCFS2_JOURNAL_FL (0x00000100) /* Slot local journal */ #define OCFS2_HEARTBEAT_FL (0x00000200) /* Heartbeat area */ #define OCFS2_CHAIN_FL (0x00000400) /* Chain allocator */ #define OCFS2_DEALLOC_FL (0x00000800) /* Truncate log */ #define OCFS2_QUOTA_FL (0x00001000) /* Quota file */ #define OCFS2_DIO_ORPHANED_FL (0X00002000) /* On the orphan list especially * for dio */ /* * Flags on ocfs2_dinode.i_dyn_features * * These can change much more often than i_flags. When adding flags, * keep in mind that i_dyn_features is only 16 bits wide. */ #define OCFS2_INLINE_DATA_FL (0x0001) /* Data stored in inode block */ #define OCFS2_HAS_XATTR_FL (0x0002) #define OCFS2_INLINE_XATTR_FL (0x0004) #define OCFS2_INDEXED_DIR_FL (0x0008) #define OCFS2_HAS_REFCOUNT_FL (0x0010) /* Inode attributes, keep in sync with EXT2 */ #define OCFS2_SECRM_FL FS_SECRM_FL /* Secure deletion */ #define OCFS2_UNRM_FL FS_UNRM_FL /* Undelete */ #define OCFS2_COMPR_FL FS_COMPR_FL /* Compress file */ #define OCFS2_SYNC_FL FS_SYNC_FL /* Synchronous updates */ #define OCFS2_IMMUTABLE_FL FS_IMMUTABLE_FL /* Immutable file */ #define OCFS2_APPEND_FL FS_APPEND_FL /* writes to file may only append */ #define OCFS2_NODUMP_FL FS_NODUMP_FL /* do not dump file */ #define OCFS2_NOATIME_FL FS_NOATIME_FL /* do not update atime */ /* Reserved for compression usage... */ #define OCFS2_DIRTY_FL FS_DIRTY_FL #define OCFS2_COMPRBLK_FL FS_COMPRBLK_FL /* One or more compressed clusters */ #define OCFS2_NOCOMP_FL FS_NOCOMP_FL /* Don't compress */ #define OCFS2_ECOMPR_FL FS_ECOMPR_FL /* Compression error */ /* End compression flags --- maybe not all used */ #define OCFS2_BTREE_FL FS_BTREE_FL /* btree format dir */ #define OCFS2_INDEX_FL FS_INDEX_FL /* hash-indexed directory */ #define OCFS2_IMAGIC_FL FS_IMAGIC_FL /* AFS directory */ #define OCFS2_JOURNAL_DATA_FL FS_JOURNAL_DATA_FL /* Reserved for ext3 */ #define OCFS2_NOTAIL_FL FS_NOTAIL_FL /* file tail should not be merged */ #define OCFS2_DIRSYNC_FL FS_DIRSYNC_FL /* dirsync behaviour (directories only) */ #define OCFS2_TOPDIR_FL FS_TOPDIR_FL /* Top of directory hierarchies*/ #define OCFS2_RESERVED_FL FS_RESERVED_FL /* reserved for ext2 lib */ #define OCFS2_FL_VISIBLE FS_FL_USER_VISIBLE /* User visible flags */ #define OCFS2_FL_MODIFIABLE FS_FL_USER_MODIFIABLE /* User modifiable flags */ /* * Extent record flags (e_node.leaf.flags) */ #define OCFS2_EXT_UNWRITTEN (0x01) /* Extent is allocated but * unwritten */ #define OCFS2_EXT_REFCOUNTED (0x02) /* Extent is reference * counted in an associated * refcount tree */ /* * Journal Flags (ocfs2_dinode.id1.journal1.i_flags) */ #define OCFS2_JOURNAL_DIRTY_FL (0x00000001) /* Journal needs recovery */ /* * superblock s_state flags */ #define OCFS2_ERROR_FS (0x00000001) /* FS saw errors */ /* Limit of space in ocfs2_dir_entry */ #define OCFS2_MAX_FILENAME_LEN 255 /* Maximum slots on an ocfs2 file system */ #define OCFS2_MAX_SLOTS 255 /* Slot map indicator for an empty slot */ #define OCFS2_INVALID_SLOT ((u16)-1) #define OCFS2_VOL_UUID_LEN 16 #define OCFS2_MAX_VOL_LABEL_LEN 64 /* The cluster stack fields */ #define OCFS2_STACK_LABEL_LEN 4 #define OCFS2_CLUSTER_NAME_LEN 16 /* Classic (historically speaking) cluster stack */ #define OCFS2_CLASSIC_CLUSTER_STACK "o2cb" /* Journal limits (in bytes) */ #define OCFS2_MIN_JOURNAL_SIZE (4 * 1024 * 1024) /* * Inline extended attribute size (in bytes) * The value chosen should be aligned to 16 byte boundaries. */ #define OCFS2_MIN_XATTR_INLINE_SIZE 256 /* * Cluster info flags (ocfs2_cluster_info.ci_stackflags) */ #define OCFS2_CLUSTER_O2CB_GLOBAL_HEARTBEAT (0x01) struct ocfs2_system_inode_info { char *si_name; int si_iflags; int si_mode; }; /* System file index */ enum { BAD_BLOCK_SYSTEM_INODE = 0, GLOBAL_INODE_ALLOC_SYSTEM_INODE, #define OCFS2_FIRST_ONLINE_SYSTEM_INODE GLOBAL_INODE_ALLOC_SYSTEM_INODE SLOT_MAP_SYSTEM_INODE, HEARTBEAT_SYSTEM_INODE, GLOBAL_BITMAP_SYSTEM_INODE, USER_QUOTA_SYSTEM_INODE, GROUP_QUOTA_SYSTEM_INODE, #define OCFS2_LAST_GLOBAL_SYSTEM_INODE GROUP_QUOTA_SYSTEM_INODE #define OCFS2_FIRST_LOCAL_SYSTEM_INODE ORPHAN_DIR_SYSTEM_INODE ORPHAN_DIR_SYSTEM_INODE, EXTENT_ALLOC_SYSTEM_INODE, INODE_ALLOC_SYSTEM_INODE, JOURNAL_SYSTEM_INODE, LOCAL_ALLOC_SYSTEM_INODE, TRUNCATE_LOG_SYSTEM_INODE, LOCAL_USER_QUOTA_SYSTEM_INODE, LOCAL_GROUP_QUOTA_SYSTEM_INODE, #define OCFS2_LAST_LOCAL_SYSTEM_INODE LOCAL_GROUP_QUOTA_SYSTEM_INODE NUM_SYSTEM_INODES }; #define NUM_GLOBAL_SYSTEM_INODES OCFS2_FIRST_LOCAL_SYSTEM_INODE #define NUM_LOCAL_SYSTEM_INODES \ (NUM_SYSTEM_INODES - OCFS2_FIRST_LOCAL_SYSTEM_INODE) static struct ocfs2_system_inode_info ocfs2_system_inodes[NUM_SYSTEM_INODES] = { /* Global system inodes (single copy) */ /* The first two are only used from userspace mfks/tunefs */ [BAD_BLOCK_SYSTEM_INODE] = { "bad_blocks", 0, S_IFREG | 0644 }, [GLOBAL_INODE_ALLOC_SYSTEM_INODE] = { "global_inode_alloc", OCFS2_BITMAP_FL | OCFS2_CHAIN_FL, S_IFREG | 0644 }, /* These are used by the running filesystem */ [SLOT_MAP_SYSTEM_INODE] = { "slot_map", 0, S_IFREG | 0644 }, [HEARTBEAT_SYSTEM_INODE] = { "heartbeat", OCFS2_HEARTBEAT_FL, S_IFREG | 0644 }, [GLOBAL_BITMAP_SYSTEM_INODE] = { "global_bitmap", 0, S_IFREG | 0644 }, [USER_QUOTA_SYSTEM_INODE] = { "aquota.user", OCFS2_QUOTA_FL, S_IFREG | 0644 }, [GROUP_QUOTA_SYSTEM_INODE] = { "aquota.group", OCFS2_QUOTA_FL, S_IFREG | 0644 }, /* Slot-specific system inodes (one copy per slot) */ [ORPHAN_DIR_SYSTEM_INODE] = { "orphan_dir:%04d", 0, S_IFDIR | 0755 }, [EXTENT_ALLOC_SYSTEM_INODE] = { "extent_alloc:%04d", OCFS2_BITMAP_FL | OCFS2_CHAIN_FL, S_IFREG | 0644 }, [INODE_ALLOC_SYSTEM_INODE] = { "inode_alloc:%04d", OCFS2_BITMAP_FL | OCFS2_CHAIN_FL, S_IFREG | 0644 }, [JOURNAL_SYSTEM_INODE] = { "journal:%04d", OCFS2_JOURNAL_FL, S_IFREG | 0644 }, [LOCAL_ALLOC_SYSTEM_INODE] = { "local_alloc:%04d", OCFS2_BITMAP_FL | OCFS2_LOCAL_ALLOC_FL, S_IFREG | 0644 }, [TRUNCATE_LOG_SYSTEM_INODE] = { "truncate_log:%04d", OCFS2_DEALLOC_FL, S_IFREG | 0644 }, [LOCAL_USER_QUOTA_SYSTEM_INODE] = { "aquota.user:%04d", OCFS2_QUOTA_FL, S_IFREG | 0644 }, [LOCAL_GROUP_QUOTA_SYSTEM_INODE] = { "aquota.group:%04d", OCFS2_QUOTA_FL, S_IFREG | 0644 }, }; /* Parameter passed from mount.ocfs2 to module */ #define OCFS2_HB_NONE "heartbeat=none" #define OCFS2_HB_LOCAL "heartbeat=local" #define OCFS2_HB_GLOBAL "heartbeat=global" /* * OCFS2_DIR_PAD defines the directory entries boundaries * * NOTE: It must be a multiple of 4 */ #define OCFS2_DIR_PAD 4 #define OCFS2_DIR_ROUND (OCFS2_DIR_PAD - 1) #define OCFS2_DIR_MEMBER_LEN offsetof(struct ocfs2_dir_entry, name) #define OCFS2_DIR_REC_LEN(name_len) (((name_len) + OCFS2_DIR_MEMBER_LEN + \ OCFS2_DIR_ROUND) & \ ~OCFS2_DIR_ROUND) #define OCFS2_DIR_MIN_REC_LEN OCFS2_DIR_REC_LEN(1) #define OCFS2_LINK_MAX 32000 #define OCFS2_DX_LINK_MAX ((1U << 31) - 1U) #define OCFS2_LINKS_HI_SHIFT 16 #define OCFS2_DX_ENTRIES_MAX (0xffffffffU) /* * Convenience casts */ #define OCFS2_RAW_SB(dinode) (&((dinode)->id2.i_super)) /* * Block checking structure. This is used in metadata to validate the * contents. If OCFS2_FEATURE_INCOMPAT_META_ECC is not set, it is all * zeros. */ struct ocfs2_block_check { /*00*/ __le32 bc_crc32e; /* 802.3 Ethernet II CRC32 */ __le16 bc_ecc; /* Single-error-correction parity vector. This is a simple Hamming code dependent on the blocksize. OCFS2's maximum blocksize, 4K, requires 16 parity bits, so we fit in __le16. */ __le16 bc_reserved1; /*08*/ }; /* * On disk extent record for OCFS2 * It describes a range of clusters on disk. * * Length fields are divided into interior and leaf node versions. * This leaves room for a flags field (OCFS2_EXT_*) in the leaf nodes. */ struct ocfs2_extent_rec { /*00*/ __le32 e_cpos; /* Offset into the file, in clusters */ union { __le32 e_int_clusters; /* Clusters covered by all children */ struct { __le16 e_leaf_clusters; /* Clusters covered by this extent */ __u8 e_reserved1; __u8 e_flags; /* Extent flags */ }; }; __le64 e_blkno; /* Physical disk offset, in blocks */ /*10*/ }; struct ocfs2_chain_rec { __le32 c_free; /* Number of free bits in this chain. */ __le32 c_total; /* Number of total bits in this chain */ __le64 c_blkno; /* Physical disk offset (blocks) of 1st group */ }; struct ocfs2_truncate_rec { __le32 t_start; /* 1st cluster in this log */ __le32 t_clusters; /* Number of total clusters covered */ }; /* * On disk extent list for OCFS2 (node in the tree). Note that this * is contained inside ocfs2_dinode or ocfs2_extent_block, so the * offsets are relative to ocfs2_dinode.id2.i_list or * ocfs2_extent_block.h_list, respectively. */ struct ocfs2_extent_list { /*00*/ __le16 l_tree_depth; /* Extent tree depth from this point. 0 means data extents hang directly off this header (a leaf) NOTE: The high 8 bits cannot be used - tree_depth is never that big. */ __le16 l_count; /* Number of extent records */ __le16 l_next_free_rec; /* Next unused extent slot */ __le16 l_reserved1; __le64 l_reserved2; /* Pad to sizeof(ocfs2_extent_rec) */ /* Extent records */ /*10*/ struct ocfs2_extent_rec l_recs[] __counted_by_le(l_count); }; /* * On disk allocation chain list for OCFS2. Note that this is * contained inside ocfs2_dinode, so the offsets are relative to * ocfs2_dinode.id2.i_chain. */ struct ocfs2_chain_list { /*00*/ __le16 cl_cpg; /* Clusters per Block Group */ __le16 cl_bpc; /* Bits per cluster */ __le16 cl_count; /* Total chains in this list */ __le16 cl_next_free_rec; /* Next unused chain slot */ __le64 cl_reserved1; /* Chain records */ /*10*/ struct ocfs2_chain_rec cl_recs[] __counted_by_le(cl_count); }; /* * On disk deallocation log for OCFS2. Note that this is * contained inside ocfs2_dinode, so the offsets are relative to * ocfs2_dinode.id2.i_dealloc. */ struct ocfs2_truncate_log { /*00*/ __le16 tl_count; /* Total records in this log */ __le16 tl_used; /* Number of records in use */ __le32 tl_reserved1; /* Truncate records */ /*08*/ struct ocfs2_truncate_rec tl_recs[] __counted_by_le(tl_count); }; /* * On disk extent block (indirect block) for OCFS2 */ struct ocfs2_extent_block { /*00*/ __u8 h_signature[8]; /* Signature for verification */ struct ocfs2_block_check h_check; /* Error checking */ /*10*/ __le16 h_suballoc_slot; /* Slot suballocator this extent_header belongs to */ __le16 h_suballoc_bit; /* Bit offset in suballocator block group */ __le32 h_fs_generation; /* Must match super block */ __le64 h_blkno; /* Offset on disk, in blocks */ /*20*/ __le64 h_suballoc_loc; /* Suballocator block group this eb belongs to. Only valid if allocated from a discontiguous block group */ __le64 h_next_leaf_blk; /* Offset on disk, in blocks, of next leaf header pointing to data */ /*30*/ struct ocfs2_extent_list h_list; /* Extent record list */ /* Actual on-disk size is one block */ }; /* * On disk slot map for OCFS2. This defines the contents of the "slot_map" * system file. A slot is valid if it contains a node number >= 0. The * value -1 (0xFFFF) is OCFS2_INVALID_SLOT. This marks a slot empty. */ struct ocfs2_slot_map { /*00*/ DECLARE_FLEX_ARRAY(__le16, sm_slots); /* * Actual on-disk size is one block. OCFS2_MAX_SLOTS is 255, * 255 * sizeof(__le16) == 512B, within the 512B block minimum blocksize. */ }; struct ocfs2_extended_slot { /*00*/ __u8 es_valid; __u8 es_reserved1[3]; __le32 es_node_num; /*08*/ }; /* * The extended slot map, used when OCFS2_FEATURE_INCOMPAT_EXTENDED_SLOT_MAP * is set. It separates out the valid marker from the node number, and * has room to grow. Unlike the old slot map, this format is defined by * i_size. */ struct ocfs2_slot_map_extended { /*00*/ DECLARE_FLEX_ARRAY(struct ocfs2_extended_slot, se_slots); /* * Actual size is i_size of the slot_map system file. It should * match s_max_slots * sizeof(struct ocfs2_extended_slot) */ }; /* * ci_stackflags is only valid if the incompat bit * OCFS2_FEATURE_INCOMPAT_CLUSTERINFO is set. */ struct ocfs2_cluster_info { /*00*/ __u8 ci_stack[OCFS2_STACK_LABEL_LEN]; union { __le32 ci_reserved; struct { __u8 ci_stackflags; __u8 ci_reserved1; __u8 ci_reserved2; __u8 ci_reserved3; }; }; /*08*/ __u8 ci_cluster[OCFS2_CLUSTER_NAME_LEN]; /*18*/ }; /* * On disk superblock for OCFS2 * Note that it is contained inside an ocfs2_dinode, so all offsets * are relative to the start of ocfs2_dinode.id2. */ struct ocfs2_super_block { /*00*/ __le16 s_major_rev_level; __le16 s_minor_rev_level; __le16 s_mnt_count; __le16 s_max_mnt_count; __le16 s_state; /* File system state */ __le16 s_errors; /* Behaviour when detecting errors */ __le32 s_checkinterval; /* Max time between checks */ /*10*/ __le64 s_lastcheck; /* Time of last check */ __le32 s_creator_os; /* OS */ __le32 s_feature_compat; /* Compatible feature set */ /*20*/ __le32 s_feature_incompat; /* Incompatible feature set */ __le32 s_feature_ro_compat; /* Readonly-compatible feature set */ __le64 s_root_blkno; /* Offset, in blocks, of root directory dinode */ /*30*/ __le64 s_system_dir_blkno; /* Offset, in blocks, of system directory dinode */ __le32 s_blocksize_bits; /* Blocksize for this fs */ __le32 s_clustersize_bits; /* Clustersize for this fs */ /*40*/ __le16 s_max_slots; /* Max number of simultaneous mounts before tunefs required */ __le16 s_tunefs_flag; __le32 s_uuid_hash; /* hash value of uuid */ __le64 s_first_cluster_group; /* Block offset of 1st cluster * group header */ /*50*/ __u8 s_label[OCFS2_MAX_VOL_LABEL_LEN]; /* Label for mounting, etc. */ /*90*/ __u8 s_uuid[OCFS2_VOL_UUID_LEN]; /* 128-bit uuid */ /*A0*/ struct ocfs2_cluster_info s_cluster_info; /* Only valid if either userspace or clusterinfo INCOMPAT flag set. */ /*B8*/ __le16 s_xattr_inline_size; /* extended attribute inline size for this fs*/ __le16 s_reserved0; __le32 s_dx_seed[3]; /* seed[0-2] for dx dir hash. * s_uuid_hash serves as seed[3]. */ /*C8*/ __le64 s_reserved2[15]; /* Fill out superblock */ /*140*/ /* * NOTE: As stated above, all offsets are relative to * ocfs2_dinode.id2, which is at 0xC0 in the inode. * 0xC0 + 0x140 = 0x200 or 512 bytes. A superblock must fit within * our smallest blocksize, which is 512 bytes. To ensure this, * we reserve the space in s_reserved2. Anything past s_reserved2 * will not be available on the smallest blocksize. */ }; /* * Local allocation bitmap for OCFS2 slots * Note that it exists inside an ocfs2_dinode, so all offsets are * relative to the start of ocfs2_dinode.id2. */ struct ocfs2_local_alloc { /*00*/ __le32 la_bm_off; /* Starting bit offset in main bitmap */ __le16 la_size; /* Size of included bitmap, in bytes */ __le16 la_reserved1; __le64 la_reserved2; /*10*/ __u8 la_bitmap[] __counted_by_le(la_size); }; /* * Data-in-inode header. This is only used if i_dyn_features has * OCFS2_INLINE_DATA_FL set. */ struct ocfs2_inline_data { /*00*/ __le16 id_count; /* Number of bytes that can be used * for data, starting at id_data */ __le16 id_reserved0; __le32 id_reserved1; __u8 id_data[] __counted_by_le(id_count); /* Start of user data */ }; /* * On disk inode for OCFS2 */ struct ocfs2_dinode { /*00*/ __u8 i_signature[8]; /* Signature for validation */ __le32 i_generation; /* Generation number */ __le16 i_suballoc_slot; /* Slot suballocator this inode belongs to */ __le16 i_suballoc_bit; /* Bit offset in suballocator block group */ /*10*/ __le16 i_links_count_hi; /* High 16 bits of links count */ __le16 i_xattr_inline_size; __le32 i_clusters; /* Cluster count */ __le32 i_uid; /* Owner UID */ __le32 i_gid; /* Owning GID */ /*20*/ __le64 i_size; /* Size in bytes */ __le16 i_mode; /* File mode */ __le16 i_links_count; /* Links count */ __le32 i_flags; /* File flags */ /*30*/ __le64 i_atime; /* Access time */ __le64 i_ctime; /* Creation time */ /*40*/ __le64 i_mtime; /* Modification time */ __le64 i_dtime; /* Deletion time */ /*50*/ __le64 i_blkno; /* Offset on disk, in blocks */ __le64 i_last_eb_blk; /* Pointer to last extent block */ /*60*/ __le32 i_fs_generation; /* Generation per fs-instance */ __le32 i_atime_nsec; __le32 i_ctime_nsec; __le32 i_mtime_nsec; /*70*/ __le32 i_attr; __le16 i_orphaned_slot; /* Only valid when OCFS2_ORPHANED_FL was set in i_flags */ __le16 i_dyn_features; __le64 i_xattr_loc; /*80*/ struct ocfs2_block_check i_check; /* Error checking */ /*88*/ __le64 i_dx_root; /* Pointer to dir index root block */ /*90*/ __le64 i_refcount_loc; __le64 i_suballoc_loc; /* Suballocator block group this inode belongs to. Only valid if allocated from a discontiguous block group */ /*A0*/ __le16 i_dio_orphaned_slot; /* only used for append dio write */ __le16 i_reserved1[3]; __le64 i_reserved2[2]; /*B8*/ union { __le64 i_pad1; /* Generic way to refer to this 64bit union */ struct { __le64 i_rdev; /* Device number */ } dev1; struct { /* Info for bitmap system inodes */ __le32 i_used; /* Bits (ie, clusters) used */ __le32 i_total; /* Total bits (clusters) available */ } bitmap1; struct { /* Info for journal system inodes */ __le32 ij_flags; /* Mounted, version, etc. */ __le32 ij_recovery_generation; /* Incremented when the journal is recovered after an unclean shutdown */ } journal1; } id1; /* Inode type dependent 1 */ /*C0*/ union { struct ocfs2_super_block i_super; struct ocfs2_local_alloc i_lab; struct ocfs2_chain_list i_chain; struct ocfs2_extent_list i_list; struct ocfs2_truncate_log i_dealloc; struct ocfs2_inline_data i_data; DECLARE_FLEX_ARRAY(__u8, i_symlink); } id2; /* Actual on-disk size is one block */ }; /* * On-disk directory entry structure for OCFS2 * * Packed as this structure could be accessed unaligned on 64-bit platforms */ struct ocfs2_dir_entry { /*00*/ __le64 inode; /* Inode number */ __le16 rec_len; /* Directory entry length */ __u8 name_len; /* Name length */ __u8 file_type; /*0C*/ char name[OCFS2_MAX_FILENAME_LEN]; /* File name */ /* Actual on-disk length specified by rec_len */ } __attribute__ ((packed)); /* * Per-block record for the unindexed directory btree. This is carefully * crafted so that the rec_len and name_len records of an ocfs2_dir_entry are * mirrored. That way, the directory manipulation code needs a minimal amount * of update. * * NOTE: Keep this structure aligned to a multiple of 4 bytes. */ struct ocfs2_dir_block_trailer { /*00*/ __le64 db_compat_inode; /* Always zero. Was inode */ __le16 db_compat_rec_len; /* Backwards compatible with * ocfs2_dir_entry. */ __u8 db_compat_name_len; /* Always zero. Was name_len */ __u8 db_reserved0; __le16 db_reserved1; __le16 db_free_rec_len; /* Size of largest empty hole * in this block. (unused) */ /*10*/ __u8 db_signature[8]; /* Signature for verification */ __le64 db_reserved2; /*20*/ __le64 db_free_next; /* Next block in list (unused) */ __le64 db_blkno; /* Offset on disk, in blocks */ /*30*/ __le64 db_parent_dinode; /* dinode which owns me, in blocks */ struct ocfs2_block_check db_check; /* Error checking */ /*40*/ }; /* * A directory entry in the indexed tree. We don't store the full name here, * but instead provide a pointer to the full dirent in the unindexed tree. * * We also store name_len here so as to reduce the number of leaf blocks we * need to search in case of collisions. */ struct ocfs2_dx_entry { __le32 dx_major_hash; /* Used to find logical * cluster in index */ __le32 dx_minor_hash; /* Lower bits used to find * block in cluster */ __le64 dx_dirent_blk; /* Physical block in unindexed * tree holding this dirent. */ }; struct ocfs2_dx_entry_list { __le32 de_reserved; __le16 de_count; /* Maximum number of entries * possible in de_entries */ __le16 de_num_used; /* Current number of * de_entries entries */ /* Indexed dir entries in a packed * array of length de_num_used. */ struct ocfs2_dx_entry de_entries[] __counted_by_le(de_count); }; #define OCFS2_DX_FLAG_INLINE 0x01 /* * A directory indexing block. Each indexed directory has one of these, * pointed to by ocfs2_dinode. * * This block stores an indexed btree root, and a set of free space * start-of-list pointers. */ struct ocfs2_dx_root_block { __u8 dr_signature[8]; /* Signature for verification */ struct ocfs2_block_check dr_check; /* Error checking */ __le16 dr_suballoc_slot; /* Slot suballocator this * block belongs to. */ __le16 dr_suballoc_bit; /* Bit offset in suballocator * block group */ __le32 dr_fs_generation; /* Must match super block */ __le64 dr_blkno; /* Offset on disk, in blocks */ __le64 dr_last_eb_blk; /* Pointer to last * extent block */ __le32 dr_clusters; /* Clusters allocated * to the indexed tree. */ __u8 dr_flags; /* OCFS2_DX_FLAG_* flags */ __u8 dr_reserved0; __le16 dr_reserved1; __le64 dr_dir_blkno; /* Pointer to parent inode */ __le32 dr_num_entries; /* Total number of * names stored in * this directory.*/ __le32 dr_reserved2; __le64 dr_free_blk; /* Pointer to head of free * unindexed block list. */ __le64 dr_suballoc_loc; /* Suballocator block group this root belongs to. Only valid if allocated from a discontiguous block group */ __le64 dr_reserved3[14]; union { struct ocfs2_extent_list dr_list; /* Keep this aligned to 128 * bits for maximum space * efficiency. */ struct ocfs2_dx_entry_list dr_entries; /* In-root-block list of * entries. We grow out * to extents if this * gets too big. */ }; }; /* * The header of a leaf block in the indexed tree. */ struct ocfs2_dx_leaf { __u8 dl_signature[8];/* Signature for verification */ struct ocfs2_block_check dl_check; /* Error checking */ __le64 dl_blkno; /* Offset on disk, in blocks */ __le32 dl_fs_generation;/* Must match super block */ __le32 dl_reserved0; __le64 dl_reserved1; struct ocfs2_dx_entry_list dl_list; }; /* * Largest bitmap for a block (suballocator) group in bytes. This limit * does not affect cluster groups (global allocator). Cluster group * bitmaps run to the end of the block. */ #define OCFS2_MAX_BG_BITMAP_SIZE 256 /* * On disk allocator group structure for OCFS2 */ struct ocfs2_group_desc { /*00*/ __u8 bg_signature[8]; /* Signature for validation */ __le16 bg_size; /* Size of included bitmap in bytes. */ __le16 bg_bits; /* Bits represented by this group. */ __le16 bg_free_bits_count; /* Free bits count */ __le16 bg_chain; /* What chain I am in. */ /*10*/ __le32 bg_generation; __le16 bg_contig_free_bits; /* max contig free bits length */ __le16 bg_reserved1; __le64 bg_next_group; /* Next group in my list, in blocks */ /*20*/ __le64 bg_parent_dinode; /* dinode which owns me, in blocks */ __le64 bg_blkno; /* Offset on disk, in blocks */ /*30*/ struct ocfs2_block_check bg_check; /* Error checking */ __le64 bg_reserved2; /*40*/ union { DECLARE_FLEX_ARRAY(__u8, bg_bitmap); struct { /* * Block groups may be discontiguous when * OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG is set. * The extents of a discontiguous block group are * stored in bg_list. It is a flat list. * l_tree_depth must always be zero. A * discontiguous group is signified by a non-zero * bg_list->l_next_free_rec. Only block groups * can be discontiguous; Cluster groups cannot. * We've never made a block group with more than * 2048 blocks (256 bytes of bg_bitmap). This * codifies that limit so that we can fit bg_list. * bg_size of a discontiguous block group will * be 256 to match bg_bitmap_filler. */ __u8 bg_bitmap_filler[OCFS2_MAX_BG_BITMAP_SIZE]; /*140*/ struct ocfs2_extent_list bg_list; }; }; /* Actual on-disk size is one block */ }; struct ocfs2_refcount_rec { /*00*/ __le64 r_cpos; /* Physical offset, in clusters */ __le32 r_clusters; /* Clusters covered by this extent */ __le32 r_refcount; /* Reference count of this extent */ /*10*/ }; #define OCFS2_32BIT_POS_MASK (0xffffffffULL) #define OCFS2_REFCOUNT_LEAF_FL (0x00000001) #define OCFS2_REFCOUNT_TREE_FL (0x00000002) struct ocfs2_refcount_list { /*00*/ __le16 rl_count; /* Maximum number of entries possible in rl_records */ __le16 rl_used; /* Current number of used records */ __le32 rl_reserved2; __le64 rl_reserved1; /* Pad to sizeof(ocfs2_refcount_record) */ /* Refcount records */ /*10*/ struct ocfs2_refcount_rec rl_recs[] __counted_by_le(rl_count); }; struct ocfs2_refcount_block { /*00*/ __u8 rf_signature[8]; /* Signature for verification */ __le16 rf_suballoc_slot; /* Slot suballocator this block belongs to */ __le16 rf_suballoc_bit; /* Bit offset in suballocator block group */ __le32 rf_fs_generation; /* Must match superblock */ /*10*/ __le64 rf_blkno; /* Offset on disk, in blocks */ __le64 rf_parent; /* Parent block, only valid if OCFS2_REFCOUNT_LEAF_FL is set in rf_flags */ /*20*/ struct ocfs2_block_check rf_check; /* Error checking */ __le64 rf_last_eb_blk; /* Pointer to last extent block */ /*30*/ __le32 rf_count; /* Number of inodes sharing this refcount tree */ __le32 rf_flags; /* See the flags above */ __le32 rf_clusters; /* clusters covered by refcount tree. */ __le32 rf_cpos; /* cluster offset in refcount tree.*/ /*40*/ __le32 rf_generation; /* generation number. all be the same * for the same refcount tree. */ __le32 rf_reserved0; __le64 rf_suballoc_loc; /* Suballocator block group this refcount block belongs to. Only valid if allocated from a discontiguous block group */ /*50*/ __le64 rf_reserved1[6]; /*80*/ union { struct ocfs2_refcount_list rf_records; /* List of refcount records */ struct ocfs2_extent_list rf_list; /* Extent record list, only valid if OCFS2_REFCOUNT_TREE_FL is set in rf_flags */ }; /* Actual on-disk size is one block */ }; /* * On disk extended attribute structure for OCFS2. */ /* * ocfs2_xattr_entry indicates one extend attribute. * * Note that it can be stored in inode, one block or one xattr bucket. */ struct ocfs2_xattr_entry { __le32 xe_name_hash; /* hash value of xattr prefix+suffix. */ __le16 xe_name_offset; /* byte offset from the 1st entry in the local xattr storage(inode, xattr block or xattr bucket). */ __u8 xe_name_len; /* xattr name len, doesn't include prefix. */ __u8 xe_type; /* the low 7 bits indicate the name prefix * type and the highest bit indicates whether * the EA is stored in the local storage. */ __le64 xe_value_size; /* real xattr value length. */ }; /* * On disk structure for xattr header. * * One ocfs2_xattr_header describes how many ocfs2_xattr_entry records in * the local xattr storage. */ struct ocfs2_xattr_header { __le16 xh_count; /* contains the count of how many records are in the local xattr storage. */ __le16 xh_free_start; /* current offset for storing xattr. */ __le16 xh_name_value_len; /* total length of name/value length in this bucket. */ __le16 xh_num_buckets; /* Number of xattr buckets in this extent record, only valid in the first bucket. */ struct ocfs2_block_check xh_check; /* Error checking (Note, this is only used for xattr buckets. A block uses xb_check and sets this field to zero.) */ /* xattr entry list. */ struct ocfs2_xattr_entry xh_entries[] __counted_by_le(xh_count); }; /* * On disk structure for xattr value root. * * When an xattr's value is large enough, it is stored in an external * b-tree like file data. The xattr value root points to this structure. */ struct ocfs2_xattr_value_root { /*00*/ __le32 xr_clusters; /* clusters covered by xattr value. */ __le32 xr_reserved0; __le64 xr_last_eb_blk; /* Pointer to last extent block */ /*10*/ struct ocfs2_extent_list xr_list; /* Extent record list */ }; /* * On disk structure for xattr tree root. * * It is used when there are too many extended attributes for one file. These * attributes will be organized and stored in an indexed-btree. */ struct ocfs2_xattr_tree_root { /*00*/ __le32 xt_clusters; /* clusters covered by xattr. */ __le32 xt_reserved0; __le64 xt_last_eb_blk; /* Pointer to last extent block */ /*10*/ struct ocfs2_extent_list xt_list; /* Extent record list */ }; #define OCFS2_XATTR_INDEXED 0x1 #define OCFS2_HASH_SHIFT 5 #define OCFS2_XATTR_ROUND 3 #define OCFS2_XATTR_SIZE(size) (((size) + OCFS2_XATTR_ROUND) & \ ~(OCFS2_XATTR_ROUND)) #define OCFS2_XATTR_BUCKET_SIZE 4096 #define OCFS2_XATTR_MAX_BLOCKS_PER_BUCKET (OCFS2_XATTR_BUCKET_SIZE \ / OCFS2_MIN_BLOCKSIZE) /* * On disk structure for xattr block. */ struct ocfs2_xattr_block { /*00*/ __u8 xb_signature[8]; /* Signature for verification */ __le16 xb_suballoc_slot; /* Slot suballocator this block belongs to. */ __le16 xb_suballoc_bit; /* Bit offset in suballocator block group */ __le32 xb_fs_generation; /* Must match super block */ /*10*/ __le64 xb_blkno; /* Offset on disk, in blocks */ struct ocfs2_block_check xb_check; /* Error checking */ /*20*/ __le16 xb_flags; /* Indicates whether this block contains real xattr or a xattr tree. */ __le16 xb_reserved0; __le32 xb_reserved1; __le64 xb_suballoc_loc; /* Suballocator block group this xattr block belongs to. Only valid if allocated from a discontiguous block group */ /*30*/ union { struct ocfs2_xattr_header xb_header; /* xattr header if this block contains xattr */ struct ocfs2_xattr_tree_root xb_root;/* xattr tree root if this block contains xattr tree. */ } xb_attrs; }; #define OCFS2_XATTR_ENTRY_LOCAL 0x80 #define OCFS2_XATTR_TYPE_MASK 0x7F static inline void ocfs2_xattr_set_local(struct ocfs2_xattr_entry *xe, int local) { if (local) xe->xe_type |= OCFS2_XATTR_ENTRY_LOCAL; else xe->xe_type &= ~OCFS2_XATTR_ENTRY_LOCAL; } static inline int ocfs2_xattr_is_local(struct ocfs2_xattr_entry *xe) { return xe->xe_type & OCFS2_XATTR_ENTRY_LOCAL; } static inline void ocfs2_xattr_set_type(struct ocfs2_xattr_entry *xe, int type) { xe->xe_type |= type & OCFS2_XATTR_TYPE_MASK; } static inline int ocfs2_xattr_get_type(struct ocfs2_xattr_entry *xe) { return xe->xe_type & OCFS2_XATTR_TYPE_MASK; } /* * On disk structures for global quota file */ /* Magic numbers and known versions for global quota files */ #define OCFS2_GLOBAL_QMAGICS {\ 0x0cf52470, /* USRQUOTA */ \ 0x0cf52471 /* GRPQUOTA */ \ } #define OCFS2_GLOBAL_QVERSIONS {\ 0, \ 0, \ } /* Each block of each quota file has a certain fixed number of bytes reserved * for OCFS2 internal use at its end. OCFS2 can use it for things like * checksums, etc. */ #define OCFS2_QBLK_RESERVED_SPACE 8 /* Generic header of all quota files */ struct ocfs2_disk_dqheader { __le32 dqh_magic; /* Magic number identifying file */ __le32 dqh_version; /* Quota format version */ }; #define OCFS2_GLOBAL_INFO_OFF (sizeof(struct ocfs2_disk_dqheader)) /* Information header of global quota file (immediately follows the generic * header) */ struct ocfs2_global_disk_dqinfo { /*00*/ __le32 dqi_bgrace; /* Grace time for space softlimit excess */ __le32 dqi_igrace; /* Grace time for inode softlimit excess */ __le32 dqi_syncms; /* Time after which we sync local changes to * global quota file */ __le32 dqi_blocks; /* Number of blocks in quota file */ /*10*/ __le32 dqi_free_blk; /* First free block in quota file */ __le32 dqi_free_entry; /* First block with free dquot entry in quota * file */ }; /* Structure with global user / group information. We reserve some space * for future use. */ struct ocfs2_global_disk_dqblk { /*00*/ __le32 dqb_id; /* ID the structure belongs to */ __le32 dqb_use_count; /* Number of nodes having reference to this structure */ __le64 dqb_ihardlimit; /* absolute limit on allocated inodes */ /*10*/ __le64 dqb_isoftlimit; /* preferred inode limit */ __le64 dqb_curinodes; /* current # allocated inodes */ /*20*/ __le64 dqb_bhardlimit; /* absolute limit on disk space */ __le64 dqb_bsoftlimit; /* preferred limit on disk space */ /*30*/ __le64 dqb_curspace; /* current space occupied */ __le64 dqb_btime; /* time limit for excessive disk use */ /*40*/ __le64 dqb_itime; /* time limit for excessive inode use */ __le64 dqb_pad1; /*50*/ __le64 dqb_pad2; }; /* * On-disk structures for local quota file */ /* Magic numbers and known versions for local quota files */ #define OCFS2_LOCAL_QMAGICS {\ 0x0cf524c0, /* USRQUOTA */ \ 0x0cf524c1 /* GRPQUOTA */ \ } #define OCFS2_LOCAL_QVERSIONS {\ 0, \ 0, \ } /* Quota flags in dqinfo header */ #define OLQF_CLEAN 0x0001 /* Quota file is empty (this should be after\ * quota has been cleanly turned off) */ #define OCFS2_LOCAL_INFO_OFF (sizeof(struct ocfs2_disk_dqheader)) /* Information header of local quota file (immediately follows the generic * header) */ struct ocfs2_local_disk_dqinfo { __le32 dqi_flags; /* Flags for quota file */ __le32 dqi_chunks; /* Number of chunks of quota structures * with a bitmap */ __le32 dqi_blocks; /* Number of blocks allocated for quota file */ }; /* Header of one chunk of a quota file */ struct ocfs2_local_disk_chunk { __le32 dqc_free; /* Number of free entries in the bitmap */ __u8 dqc_bitmap[]; /* Bitmap of entries in the corresponding * chunk of quota file */ }; /* One entry in local quota file */ struct ocfs2_local_disk_dqblk { /*00*/ __le64 dqb_id; /* id this quota applies to */ __le64 dqb_spacemod; /* Change in the amount of used space */ /*10*/ __le64 dqb_inodemod; /* Change in the amount of used inodes */ }; /* * The quota trailer lives at the end of each quota block. */ struct ocfs2_disk_dqtrailer { /*00*/ struct ocfs2_block_check dq_check; /* Error checking */ /*08*/ /* Cannot be larger than OCFS2_QBLK_RESERVED_SPACE */ }; static inline struct ocfs2_disk_dqtrailer *ocfs2_block_dqtrailer(int blocksize, void *buf) { char *ptr = buf; ptr += blocksize - OCFS2_QBLK_RESERVED_SPACE; return (struct ocfs2_disk_dqtrailer *)ptr; } #ifdef __KERNEL__ static inline int ocfs2_fast_symlink_chars(struct super_block *sb) { return sb->s_blocksize - offsetof(struct ocfs2_dinode, id2.i_symlink); } static inline int ocfs2_max_inline_data_with_xattr(struct super_block *sb, struct ocfs2_dinode *di) { unsigned int xattrsize = le16_to_cpu(di->i_xattr_inline_size); if (le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_XATTR_FL) return sb->s_blocksize - offsetof(struct ocfs2_dinode, id2.i_data.id_data) - xattrsize; else return sb->s_blocksize - offsetof(struct ocfs2_dinode, id2.i_data.id_data); } static inline int ocfs2_extent_recs_per_inode(struct super_block *sb) { int size; size = sb->s_blocksize - offsetof(struct ocfs2_dinode, id2.i_list.l_recs); return size / sizeof(struct ocfs2_extent_rec); } static inline int ocfs2_extent_recs_per_inode_with_xattr( struct super_block *sb, struct ocfs2_dinode *di) { int size; unsigned int xattrsize = le16_to_cpu(di->i_xattr_inline_size); if (le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_XATTR_FL) size = sb->s_blocksize - offsetof(struct ocfs2_dinode, id2.i_list.l_recs) - xattrsize; else size = sb->s_blocksize - offsetof(struct ocfs2_dinode, id2.i_list.l_recs); return size / sizeof(struct ocfs2_extent_rec); } static inline int ocfs2_extent_recs_per_dx_root(struct super_block *sb) { int size; size = sb->s_blocksize - offsetof(struct ocfs2_dx_root_block, dr_list.l_recs); return size / sizeof(struct ocfs2_extent_rec); } static inline int ocfs2_chain_recs_per_inode(struct super_block *sb) { int size; size = sb->s_blocksize - offsetof(struct ocfs2_dinode, id2.i_chain.cl_recs); return size / sizeof(struct ocfs2_chain_rec); } static inline u16 ocfs2_extent_recs_per_eb(struct super_block *sb) { int size; size = sb->s_blocksize - offsetof(struct ocfs2_extent_block, h_list.l_recs); return size / sizeof(struct ocfs2_extent_rec); } static inline u16 ocfs2_extent_recs_per_gd(struct super_block *sb) { int size; size = sb->s_blocksize - offsetof(struct ocfs2_group_desc, bg_list.l_recs); return size / sizeof(struct ocfs2_extent_rec); } static inline int ocfs2_dx_entries_per_leaf(struct super_block *sb) { int size; size = sb->s_blocksize - offsetof(struct ocfs2_dx_leaf, dl_list.de_entries); return size / sizeof(struct ocfs2_dx_entry); } static inline int ocfs2_dx_entries_per_root(struct super_block *sb) { int size; size = sb->s_blocksize - offsetof(struct ocfs2_dx_root_block, dr_entries.de_entries); return size / sizeof(struct ocfs2_dx_entry); } static inline u16 ocfs2_local_alloc_size(struct super_block *sb) { u16 size; size = sb->s_blocksize - offsetof(struct ocfs2_dinode, id2.i_lab.la_bitmap); return size; } static inline int ocfs2_group_bitmap_size(struct super_block *sb, int suballocator, u32 feature_incompat) { int size = sb->s_blocksize - offsetof(struct ocfs2_group_desc, bg_bitmap); /* * The cluster allocator uses the entire block. Suballocators have * never used more than OCFS2_MAX_BG_BITMAP_SIZE. Unfortunately, older * code expects bg_size set to the maximum. Thus we must keep * bg_size as-is unless discontig_bg is enabled. */ if (suballocator && (feature_incompat & OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG)) size = OCFS2_MAX_BG_BITMAP_SIZE; return size; } static inline int ocfs2_truncate_recs_per_inode(struct super_block *sb) { int size; size = sb->s_blocksize - offsetof(struct ocfs2_dinode, id2.i_dealloc.tl_recs); return size / sizeof(struct ocfs2_truncate_rec); } static inline u64 ocfs2_backup_super_blkno(struct super_block *sb, int index) { u64 offset = OCFS2_BACKUP_SB_START; if (index >= 0 && index < OCFS2_MAX_BACKUP_SUPERBLOCKS) { offset <<= (2 * index); offset >>= sb->s_blocksize_bits; return offset; } return 0; } static inline u16 ocfs2_xattr_recs_per_xb(struct super_block *sb) { int size; size = sb->s_blocksize - offsetof(struct ocfs2_xattr_block, xb_attrs.xb_root.xt_list.l_recs); return size / sizeof(struct ocfs2_extent_rec); } static inline u16 ocfs2_extent_recs_per_rb(struct super_block *sb) { int size; size = sb->s_blocksize - offsetof(struct ocfs2_refcount_block, rf_list.l_recs); return size / sizeof(struct ocfs2_extent_rec); } static inline u16 ocfs2_refcount_recs_per_rb(struct super_block *sb) { int size; size = sb->s_blocksize - offsetof(struct ocfs2_refcount_block, rf_records.rl_recs); return size / sizeof(struct ocfs2_refcount_rec); } static inline u32 ocfs2_get_ref_rec_low_cpos(const struct ocfs2_refcount_rec *rec) { return le64_to_cpu(rec->r_cpos) & OCFS2_32BIT_POS_MASK; } #else static inline int ocfs2_fast_symlink_chars(int blocksize) { return blocksize - offsetof(struct ocfs2_dinode, id2.i_symlink); } static inline int ocfs2_max_inline_data_with_xattr(int blocksize, struct ocfs2_dinode *di) { if (di && (di->i_dyn_features & OCFS2_INLINE_XATTR_FL)) return blocksize - offsetof(struct ocfs2_dinode, id2.i_data.id_data) - di->i_xattr_inline_size; else return blocksize - offsetof(struct ocfs2_dinode, id2.i_data.id_data); } static inline int ocfs2_extent_recs_per_inode(int blocksize) { int size; size = blocksize - offsetof(struct ocfs2_dinode, id2.i_list.l_recs); return size / sizeof(struct ocfs2_extent_rec); } static inline int ocfs2_chain_recs_per_inode(int blocksize) { int size; size = blocksize - offsetof(struct ocfs2_dinode, id2.i_chain.cl_recs); return size / sizeof(struct ocfs2_chain_rec); } static inline int ocfs2_extent_recs_per_eb(int blocksize) { int size; size = blocksize - offsetof(struct ocfs2_extent_block, h_list.l_recs); return size / sizeof(struct ocfs2_extent_rec); } static inline int ocfs2_extent_recs_per_gd(int blocksize) { int size; size = blocksize - offsetof(struct ocfs2_group_desc, bg_list.l_recs); return size / sizeof(struct ocfs2_extent_rec); } static inline int ocfs2_local_alloc_size(int blocksize) { int size; size = blocksize - offsetof(struct ocfs2_dinode, id2.i_lab.la_bitmap); return size; } static inline int ocfs2_group_bitmap_size(int blocksize, int suballocator, uint32_t feature_incompat) { int size = sb->s_blocksize - offsetof(struct ocfs2_group_desc, bg_bitmap); /* * The cluster allocator uses the entire block. Suballocators have * never used more than OCFS2_MAX_BG_BITMAP_SIZE. Unfortunately, older * code expects bg_size set to the maximum. Thus we must keep * bg_size as-is unless discontig_bg is enabled. */ if (suballocator && (feature_incompat & OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG)) size = OCFS2_MAX_BG_BITMAP_SIZE; return size; } static inline int ocfs2_truncate_recs_per_inode(int blocksize) { int size; size = blocksize - offsetof(struct ocfs2_dinode, id2.i_dealloc.tl_recs); return size / sizeof(struct ocfs2_truncate_rec); } static inline uint64_t ocfs2_backup_super_blkno(int blocksize, int index) { uint64_t offset = OCFS2_BACKUP_SB_START; if (index >= 0 && index < OCFS2_MAX_BACKUP_SUPERBLOCKS) { offset <<= (2 * index); offset /= blocksize; return offset; } return 0; } static inline int ocfs2_xattr_recs_per_xb(int blocksize) { int size; size = blocksize - offsetof(struct ocfs2_xattr_block, xb_attrs.xb_root.xt_list.l_recs); return size / sizeof(struct ocfs2_extent_rec); } #endif /* __KERNEL__ */ static inline int ocfs2_system_inode_is_global(int type) { return ((type >= 0) && (type <= OCFS2_LAST_GLOBAL_SYSTEM_INODE)); } static inline int ocfs2_sprintf_system_inode_name(char *buf, int len, int type, int slot) { int chars; /* * Global system inodes can only have one copy. Everything * after OCFS2_LAST_GLOBAL_SYSTEM_INODE in the system inode * list has a copy per slot. */ if (type <= OCFS2_LAST_GLOBAL_SYSTEM_INODE) chars = snprintf(buf, len, "%s", ocfs2_system_inodes[type].si_name); else chars = snprintf(buf, len, ocfs2_system_inodes[type].si_name, slot); return chars; } static inline void ocfs2_set_de_type(struct ocfs2_dir_entry *de, umode_t mode) { de->file_type = fs_umode_to_ftype(mode); } static inline int ocfs2_gd_is_discontig(struct ocfs2_group_desc *gd) { if ((offsetof(struct ocfs2_group_desc, bg_bitmap) + le16_to_cpu(gd->bg_size)) != offsetof(struct ocfs2_group_desc, bg_list)) return 0; /* * Only valid to check l_next_free_rec if * bg_bitmap + bg_size == bg_list. */ if (!gd->bg_list.l_next_free_rec) return 0; return 1; } #endif /* _OCFS2_FS_H */ |
| 1 4 2 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 | // SPDX-License-Identifier: GPL-2.0-only #define pr_fmt(fmt) KBUILD_MODNAME ":%s: " fmt, __func__ #include <linux/kernel.h> #include <linux/string.h> #include <linux/slab.h> #include <linux/timer.h> #include <linux/init.h> #include <linux/bitops.h> #include <linux/capability.h> #include <linux/seq_file.h> /* We are an ethernet device */ #include <linux/if_ether.h> #include <linux/netdevice.h> #include <linux/etherdevice.h> #include <net/sock.h> #include <linux/skbuff.h> #include <linux/ip.h> #include <linux/uaccess.h> #include <asm/byteorder.h> #include <net/checksum.h> /* for ip_fast_csum() */ #include <net/arp.h> #include <net/dst.h> #include <linux/proc_fs.h> /* And atm device */ #include <linux/atmdev.h> #include <linux/atmlec.h> #include <linux/atmmpc.h> /* Modular too */ #include <linux/module.h> #include "lec.h" #include "mpc.h" #include "resources.h" /* * mpc.c: Implementation of MPOA client kernel part */ #if 0 #define dprintk(format, args...) \ printk(KERN_DEBUG "mpoa:%s: " format, __func__, ##args) #define dprintk_cont(format, args...) printk(KERN_CONT format, ##args) #else #define dprintk(format, args...) \ do { if (0) \ printk(KERN_DEBUG "mpoa:%s: " format, __func__, ##args);\ } while (0) #define dprintk_cont(format, args...) \ do { if (0) printk(KERN_CONT format, ##args); } while (0) #endif #if 0 #define ddprintk(format, args...) \ printk(KERN_DEBUG "mpoa:%s: " format, __func__, ##args) #define ddprintk_cont(format, args...) printk(KERN_CONT format, ##args) #else #define ddprintk(format, args...) \ do { if (0) \ printk(KERN_DEBUG "mpoa:%s: " format, __func__, ##args);\ } while (0) #define ddprintk_cont(format, args...) \ do { if (0) printk(KERN_CONT format, ##args); } while (0) #endif /* mpc_daemon -> kernel */ static void MPOA_trigger_rcvd(struct k_message *msg, struct mpoa_client *mpc); static void MPOA_res_reply_rcvd(struct k_message *msg, struct mpoa_client *mpc); static void ingress_purge_rcvd(struct k_message *msg, struct mpoa_client *mpc); static void egress_purge_rcvd(struct k_message *msg, struct mpoa_client *mpc); static void mps_death(struct k_message *msg, struct mpoa_client *mpc); static void clean_up(struct k_message *msg, struct mpoa_client *mpc, int action); static void MPOA_cache_impos_rcvd(struct k_message *msg, struct mpoa_client *mpc); static void set_mpc_ctrl_addr_rcvd(struct k_message *mesg, struct mpoa_client *mpc); static void set_mps_mac_addr_rcvd(struct k_message *mesg, struct mpoa_client *mpc); static const uint8_t *copy_macs(struct mpoa_client *mpc, const uint8_t *router_mac, const uint8_t *tlvs, uint8_t mps_macs, uint8_t device_type); static void purge_egress_shortcut(struct atm_vcc *vcc, eg_cache_entry *entry); static void send_set_mps_ctrl_addr(const char *addr, struct mpoa_client *mpc); static void mpoad_close(struct atm_vcc *vcc); static int msg_from_mpoad(struct atm_vcc *vcc, struct sk_buff *skb); static void mpc_push(struct atm_vcc *vcc, struct sk_buff *skb); static netdev_tx_t mpc_send_packet(struct sk_buff *skb, struct net_device *dev); static int mpoa_event_listener(struct notifier_block *mpoa_notifier, unsigned long event, void *dev); static void mpc_timer_refresh(void); static void mpc_cache_check(struct timer_list *unused); static struct llc_snap_hdr llc_snap_mpoa_ctrl = { 0xaa, 0xaa, 0x03, {0x00, 0x00, 0x5e}, {0x00, 0x03} /* For MPOA control PDUs */ }; static struct llc_snap_hdr llc_snap_mpoa_data = { 0xaa, 0xaa, 0x03, {0x00, 0x00, 0x00}, {0x08, 0x00} /* This is for IP PDUs only */ }; static struct llc_snap_hdr llc_snap_mpoa_data_tagged = { 0xaa, 0xaa, 0x03, {0x00, 0x00, 0x00}, {0x88, 0x4c} /* This is for tagged data PDUs */ }; static struct notifier_block mpoa_notifier = { mpoa_event_listener, NULL, 0 }; struct mpoa_client *mpcs = NULL; /* FIXME */ static struct atm_mpoa_qos *qos_head = NULL; static DEFINE_TIMER(mpc_timer, mpc_cache_check); static struct mpoa_client *find_mpc_by_itfnum(int itf) { struct mpoa_client *mpc; mpc = mpcs; /* our global linked list */ while (mpc != NULL) { if (mpc->dev_num == itf) return mpc; mpc = mpc->next; } return NULL; /* not found */ } static struct mpoa_client *find_mpc_by_vcc(struct atm_vcc *vcc) { struct mpoa_client *mpc; mpc = mpcs; /* our global linked list */ while (mpc != NULL) { if (mpc->mpoad_vcc == vcc) return mpc; mpc = mpc->next; } return NULL; /* not found */ } static struct mpoa_client *find_mpc_by_lec(struct net_device *dev) { struct mpoa_client *mpc; mpc = mpcs; /* our global linked list */ while (mpc != NULL) { if (mpc->dev == dev) return mpc; mpc = mpc->next; } return NULL; /* not found */ } /* * Functions for managing QoS list */ /* * Overwrites the old entry or makes a new one. */ struct atm_mpoa_qos *atm_mpoa_add_qos(__be32 dst_ip, struct atm_qos *qos) { struct atm_mpoa_qos *entry; entry = atm_mpoa_search_qos(dst_ip); if (entry != NULL) { entry->qos = *qos; return entry; } entry = kmalloc(sizeof(struct atm_mpoa_qos), GFP_KERNEL); if (entry == NULL) { pr_info("mpoa: out of memory\n"); return entry; } entry->ipaddr = dst_ip; entry->qos = *qos; entry->next = qos_head; qos_head = entry; return entry; } struct atm_mpoa_qos *atm_mpoa_search_qos(__be32 dst_ip) { struct atm_mpoa_qos *qos; qos = qos_head; while (qos) { if (qos->ipaddr == dst_ip) break; qos = qos->next; } return qos; } /* * Returns 0 for failure */ int atm_mpoa_delete_qos(struct atm_mpoa_qos *entry) { struct atm_mpoa_qos *curr; if (entry == NULL) return 0; if (entry == qos_head) { qos_head = qos_head->next; kfree(entry); return 1; } curr = qos_head; while (curr != NULL) { if (curr->next == entry) { curr->next = entry->next; kfree(entry); return 1; } curr = curr->next; } return 0; } /* this is buggered - we need locking for qos_head */ void atm_mpoa_disp_qos(struct seq_file *m) { struct atm_mpoa_qos *qos; qos = qos_head; seq_printf(m, "QoS entries for shortcuts:\n"); seq_printf(m, "IP address\n TX:max_pcr pcr min_pcr max_cdv max_sdu\n RX:max_pcr pcr min_pcr max_cdv max_sdu\n"); while (qos != NULL) { seq_printf(m, "%pI4\n %-7d %-7d %-7d %-7d %-7d\n %-7d %-7d %-7d %-7d %-7d\n", &qos->ipaddr, qos->qos.txtp.max_pcr, qos->qos.txtp.pcr, qos->qos.txtp.min_pcr, qos->qos.txtp.max_cdv, qos->qos.txtp.max_sdu, qos->qos.rxtp.max_pcr, qos->qos.rxtp.pcr, qos->qos.rxtp.min_pcr, qos->qos.rxtp.max_cdv, qos->qos.rxtp.max_sdu); qos = qos->next; } } static struct net_device *find_lec_by_itfnum(int itf) { struct net_device *dev; char name[IFNAMSIZ]; sprintf(name, "lec%d", itf); dev = dev_get_by_name(&init_net, name); return dev; } static struct mpoa_client *alloc_mpc(void) { struct mpoa_client *mpc; mpc = kzalloc(sizeof(struct mpoa_client), GFP_KERNEL); if (mpc == NULL) return NULL; rwlock_init(&mpc->ingress_lock); rwlock_init(&mpc->egress_lock); mpc->next = mpcs; atm_mpoa_init_cache(mpc); mpc->parameters.mpc_p1 = MPC_P1; mpc->parameters.mpc_p2 = MPC_P2; memset(mpc->parameters.mpc_p3, 0, sizeof(mpc->parameters.mpc_p3)); mpc->parameters.mpc_p4 = MPC_P4; mpc->parameters.mpc_p5 = MPC_P5; mpc->parameters.mpc_p6 = MPC_P6; mpcs = mpc; return mpc; } /* * * start_mpc() puts the MPC on line. All the packets destined * to the lec underneath us are now being monitored and * shortcuts will be established. * */ static void start_mpc(struct mpoa_client *mpc, struct net_device *dev) { dprintk("(%s)\n", mpc->dev->name); if (!dev->netdev_ops) pr_info("(%s) not starting\n", dev->name); else { mpc->old_ops = dev->netdev_ops; mpc->new_ops = *mpc->old_ops; mpc->new_ops.ndo_start_xmit = mpc_send_packet; dev->netdev_ops = &mpc->new_ops; } } static void stop_mpc(struct mpoa_client *mpc) { struct net_device *dev = mpc->dev; dprintk("(%s)", mpc->dev->name); /* Lets not nullify lec device's dev->hard_start_xmit */ if (dev->netdev_ops != &mpc->new_ops) { dprintk_cont(" mpc already stopped, not fatal\n"); return; } dprintk_cont("\n"); dev->netdev_ops = mpc->old_ops; mpc->old_ops = NULL; /* close_shortcuts(mpc); ??? FIXME */ } static const char *mpoa_device_type_string(char type) __attribute__ ((unused)); static const char *mpoa_device_type_string(char type) { switch (type) { case NON_MPOA: return "non-MPOA device"; case MPS: return "MPS"; case MPC: return "MPC"; case MPS_AND_MPC: return "both MPS and MPC"; } return "unspecified (non-MPOA) device"; } /* * lec device calls this via its netdev_priv(dev)->lane2_ops * ->associate_indicator() when it sees a TLV in LE_ARP packet. * We fill in the pointer above when we see a LANE2 lec initializing * See LANE2 spec 3.1.5 * * Quite a big and ugly function but when you look at it * all it does is to try to locate and parse MPOA Device * Type TLV. * We give our lec a pointer to this function and when the * lec sees a TLV it uses the pointer to call this function. * */ static void lane2_assoc_ind(struct net_device *dev, const u8 *mac_addr, const u8 *tlvs, u32 sizeoftlvs) { uint32_t type; uint8_t length, mpoa_device_type, number_of_mps_macs; const uint8_t *end_of_tlvs; struct mpoa_client *mpc; mpoa_device_type = number_of_mps_macs = 0; /* silence gcc */ dprintk("(%s) received TLV(s), ", dev->name); dprintk("total length of all TLVs %d\n", sizeoftlvs); mpc = find_mpc_by_lec(dev); /* Sampo-Fix: moved here from below */ if (mpc == NULL) { pr_info("(%s) no mpc\n", dev->name); return; } end_of_tlvs = tlvs + sizeoftlvs; while (end_of_tlvs - tlvs >= 5) { type = ((tlvs[0] << 24) | (tlvs[1] << 16) | (tlvs[2] << 8) | tlvs[3]); length = tlvs[4]; tlvs += 5; dprintk(" type 0x%x length %02x\n", type, length); if (tlvs + length > end_of_tlvs) { pr_info("TLV value extends past its buffer, aborting parse\n"); return; } if (type == 0) { pr_info("mpoa: (%s) TLV type was 0, returning\n", dev->name); return; } if (type != TLV_MPOA_DEVICE_TYPE) { tlvs += length; continue; /* skip other TLVs */ } mpoa_device_type = *tlvs++; number_of_mps_macs = *tlvs++; dprintk("(%s) MPOA device type '%s', ", dev->name, mpoa_device_type_string(mpoa_device_type)); if (mpoa_device_type == MPS_AND_MPC && length < (42 + number_of_mps_macs*ETH_ALEN)) { /* :) */ pr_info("(%s) short MPOA Device Type TLV\n", dev->name); continue; } if ((mpoa_device_type == MPS || mpoa_device_type == MPC) && length < 22 + number_of_mps_macs*ETH_ALEN) { pr_info("(%s) short MPOA Device Type TLV\n", dev->name); continue; } if (mpoa_device_type != MPS && mpoa_device_type != MPS_AND_MPC) { dprintk("ignoring non-MPS device "); if (mpoa_device_type == MPC) tlvs += 20; continue; /* we are only interested in MPSs */ } if (number_of_mps_macs == 0 && mpoa_device_type == MPS_AND_MPC) { pr_info("(%s) MPS_AND_MPC has zero MACs\n", dev->name); continue; /* someone should read the spec */ } dprintk_cont("this MPS has %d MAC addresses\n", number_of_mps_macs); /* * ok, now we can go and tell our daemon * the control address of MPS */ send_set_mps_ctrl_addr(tlvs, mpc); tlvs = copy_macs(mpc, mac_addr, tlvs, number_of_mps_macs, mpoa_device_type); if (tlvs == NULL) return; } if (end_of_tlvs - tlvs != 0) pr_info("(%s) ignoring %zd bytes of trailing TLV garbage\n", dev->name, end_of_tlvs - tlvs); } /* * Store at least advertizing router's MAC address * plus the possible MAC address(es) to mpc->mps_macs. * For a freshly allocated MPOA client mpc->mps_macs == 0. */ static const uint8_t *copy_macs(struct mpoa_client *mpc, const uint8_t *router_mac, const uint8_t *tlvs, uint8_t mps_macs, uint8_t device_type) { int num_macs; num_macs = (mps_macs > 1) ? mps_macs : 1; if (mpc->number_of_mps_macs != num_macs) { /* need to reallocate? */ if (mpc->number_of_mps_macs != 0) kfree(mpc->mps_macs); mpc->number_of_mps_macs = 0; mpc->mps_macs = kmalloc_array(ETH_ALEN, num_macs, GFP_KERNEL); if (mpc->mps_macs == NULL) { pr_info("(%s) out of mem\n", mpc->dev->name); return NULL; } } ether_addr_copy(mpc->mps_macs, router_mac); tlvs += 20; if (device_type == MPS_AND_MPC) tlvs += 20; if (mps_macs > 0) memcpy(mpc->mps_macs, tlvs, mps_macs*ETH_ALEN); tlvs += mps_macs*ETH_ALEN; mpc->number_of_mps_macs = num_macs; return tlvs; } static int send_via_shortcut(struct sk_buff *skb, struct mpoa_client *mpc) { in_cache_entry *entry; struct iphdr *iph; char *buff; __be32 ipaddr = 0; static struct { struct llc_snap_hdr hdr; __be32 tag; } tagged_llc_snap_hdr = { {0xaa, 0xaa, 0x03, {0x00, 0x00, 0x00}, {0x88, 0x4c}}, 0 }; buff = skb->data + mpc->dev->hard_header_len; iph = (struct iphdr *)buff; ipaddr = iph->daddr; ddprintk("(%s) ipaddr 0x%x\n", mpc->dev->name, ipaddr); entry = mpc->in_ops->get(ipaddr, mpc); if (entry == NULL) { entry = mpc->in_ops->add_entry(ipaddr, mpc); if (entry != NULL) mpc->in_ops->put(entry); return 1; } /* threshold not exceeded or VCC not ready */ if (mpc->in_ops->cache_hit(entry, mpc) != OPEN) { ddprintk("(%s) cache_hit: returns != OPEN\n", mpc->dev->name); mpc->in_ops->put(entry); return 1; } ddprintk("(%s) using shortcut\n", mpc->dev->name); /* MPOA spec A.1.4, MPOA client must decrement IP ttl at least by one */ if (iph->ttl <= 1) { ddprintk("(%s) IP ttl = %u, using LANE\n", mpc->dev->name, iph->ttl); mpc->in_ops->put(entry); return 1; } iph->ttl--; iph->check = 0; iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl); if (entry->ctrl_info.tag != 0) { ddprintk("(%s) adding tag 0x%x\n", mpc->dev->name, entry->ctrl_info.tag); tagged_llc_snap_hdr.tag = entry->ctrl_info.tag; skb_pull(skb, ETH_HLEN); /* get rid of Eth header */ skb_push(skb, sizeof(tagged_llc_snap_hdr)); /* add LLC/SNAP header */ skb_copy_to_linear_data(skb, &tagged_llc_snap_hdr, sizeof(tagged_llc_snap_hdr)); } else { skb_pull(skb, ETH_HLEN); /* get rid of Eth header */ skb_push(skb, sizeof(struct llc_snap_hdr)); /* add LLC/SNAP header + tag */ skb_copy_to_linear_data(skb, &llc_snap_mpoa_data, sizeof(struct llc_snap_hdr)); } atm_account_tx(entry->shortcut, skb); entry->shortcut->send(entry->shortcut, skb); entry->packets_fwded++; mpc->in_ops->put(entry); return 0; } /* * Probably needs some error checks and locking, not sure... */ static netdev_tx_t mpc_send_packet(struct sk_buff *skb, struct net_device *dev) { struct mpoa_client *mpc; struct ethhdr *eth; int i = 0; mpc = find_mpc_by_lec(dev); /* this should NEVER fail */ if (mpc == NULL) { pr_info("(%s) no MPC found\n", dev->name); goto non_ip; } eth = (struct ethhdr *)skb->data; if (eth->h_proto != htons(ETH_P_IP)) goto non_ip; /* Multi-Protocol Over ATM :-) */ /* Weed out funny packets (e.g., AF_PACKET or raw). */ if (skb->len < ETH_HLEN + sizeof(struct iphdr)) goto non_ip; skb_set_network_header(skb, ETH_HLEN); if (skb->len < ETH_HLEN + ip_hdr(skb)->ihl * 4 || ip_hdr(skb)->ihl < 5) goto non_ip; while (i < mpc->number_of_mps_macs) { if (ether_addr_equal(eth->h_dest, mpc->mps_macs + i * ETH_ALEN)) if (send_via_shortcut(skb, mpc) == 0) /* try shortcut */ return NETDEV_TX_OK; i++; } non_ip: return __netdev_start_xmit(mpc->old_ops, skb, dev, false); } static int atm_mpoa_vcc_attach(struct atm_vcc *vcc, void __user *arg) { int bytes_left; struct mpoa_client *mpc; struct atmmpc_ioc ioc_data; in_cache_entry *in_entry; __be32 ipaddr; bytes_left = copy_from_user(&ioc_data, arg, sizeof(struct atmmpc_ioc)); if (bytes_left != 0) { pr_info("mpoa:Short read (missed %d bytes) from userland\n", bytes_left); return -EFAULT; } ipaddr = ioc_data.ipaddr; if (ioc_data.dev_num < 0 || ioc_data.dev_num >= MAX_LEC_ITF) return -EINVAL; mpc = find_mpc_by_itfnum(ioc_data.dev_num); if (mpc == NULL) return -EINVAL; if (ioc_data.type == MPC_SOCKET_INGRESS) { in_entry = mpc->in_ops->get(ipaddr, mpc); if (in_entry == NULL || in_entry->entry_state < INGRESS_RESOLVED) { pr_info("(%s) did not find RESOLVED entry from ingress cache\n", mpc->dev->name); if (in_entry != NULL) mpc->in_ops->put(in_entry); return -EINVAL; } pr_info("(%s) attaching ingress SVC, entry = %pI4\n", mpc->dev->name, &in_entry->ctrl_info.in_dst_ip); in_entry->shortcut = vcc; mpc->in_ops->put(in_entry); } else { pr_info("(%s) attaching egress SVC\n", mpc->dev->name); } vcc->proto_data = mpc->dev; vcc->push = mpc_push; return 0; } /* * */ static void mpc_vcc_close(struct atm_vcc *vcc, struct net_device *dev) { struct mpoa_client *mpc; in_cache_entry *in_entry; eg_cache_entry *eg_entry; mpc = find_mpc_by_lec(dev); if (mpc == NULL) { pr_info("(%s) close for unknown MPC\n", dev->name); return; } dprintk("(%s)\n", dev->name); in_entry = mpc->in_ops->get_by_vcc(vcc, mpc); if (in_entry) { dprintk("(%s) ingress SVC closed ip = %pI4\n", mpc->dev->name, &in_entry->ctrl_info.in_dst_ip); in_entry->shortcut = NULL; mpc->in_ops->put(in_entry); } eg_entry = mpc->eg_ops->get_by_vcc(vcc, mpc); if (eg_entry) { dprintk("(%s) egress SVC closed\n", mpc->dev->name); eg_entry->shortcut = NULL; mpc->eg_ops->put(eg_entry); } if (in_entry == NULL && eg_entry == NULL) dprintk("(%s) unused vcc closed\n", dev->name); } static void mpc_push(struct atm_vcc *vcc, struct sk_buff *skb) { struct net_device *dev = (struct net_device *)vcc->proto_data; struct sk_buff *new_skb; eg_cache_entry *eg; struct mpoa_client *mpc; __be32 tag; char *tmp; ddprintk("(%s)\n", dev->name); if (skb == NULL) { dprintk("(%s) null skb, closing VCC\n", dev->name); mpc_vcc_close(vcc, dev); return; } skb->dev = dev; if (memcmp(skb->data, &llc_snap_mpoa_ctrl, sizeof(struct llc_snap_hdr)) == 0) { struct sock *sk = sk_atm(vcc); dprintk("(%s) control packet arrived\n", dev->name); /* Pass control packets to daemon */ skb_queue_tail(&sk->sk_receive_queue, skb); sk->sk_data_ready(sk); return; } /* data coming over the shortcut */ atm_return(vcc, skb->truesize); mpc = find_mpc_by_lec(dev); if (mpc == NULL) { pr_info("(%s) unknown MPC\n", dev->name); return; } if (memcmp(skb->data, &llc_snap_mpoa_data_tagged, sizeof(struct llc_snap_hdr)) == 0) { /* MPOA tagged data */ ddprintk("(%s) tagged data packet arrived\n", dev->name); } else if (memcmp(skb->data, &llc_snap_mpoa_data, sizeof(struct llc_snap_hdr)) == 0) { /* MPOA data */ pr_info("(%s) Unsupported non-tagged data packet arrived. Purging\n", dev->name); dev_kfree_skb_any(skb); return; } else { pr_info("(%s) garbage arrived, purging\n", dev->name); dev_kfree_skb_any(skb); return; } tmp = skb->data + sizeof(struct llc_snap_hdr); tag = *(__be32 *)tmp; eg = mpc->eg_ops->get_by_tag(tag, mpc); if (eg == NULL) { pr_info("mpoa: (%s) Didn't find egress cache entry, tag = %u\n", dev->name, tag); purge_egress_shortcut(vcc, NULL); dev_kfree_skb_any(skb); return; } /* * See if ingress MPC is using shortcut we opened as a return channel. * This means we have a bi-directional vcc opened by us. */ if (eg->shortcut == NULL) { eg->shortcut = vcc; pr_info("(%s) egress SVC in use\n", dev->name); } skb_pull(skb, sizeof(struct llc_snap_hdr) + sizeof(tag)); /* get rid of LLC/SNAP header */ new_skb = skb_realloc_headroom(skb, eg->ctrl_info.DH_length); /* LLC/SNAP is shorter than MAC header :( */ dev_kfree_skb_any(skb); if (new_skb == NULL) { mpc->eg_ops->put(eg); return; } skb_push(new_skb, eg->ctrl_info.DH_length); /* add MAC header */ skb_copy_to_linear_data(new_skb, eg->ctrl_info.DLL_header, eg->ctrl_info.DH_length); new_skb->protocol = eth_type_trans(new_skb, dev); skb_reset_network_header(new_skb); eg->latest_ip_addr = ip_hdr(new_skb)->saddr; eg->packets_rcvd++; mpc->eg_ops->put(eg); memset(ATM_SKB(new_skb), 0, sizeof(struct atm_skb_data)); netif_rx(new_skb); } static const struct atmdev_ops mpc_ops = { /* only send is required */ .close = mpoad_close, .send = msg_from_mpoad }; static struct atm_dev mpc_dev = { .ops = &mpc_ops, .type = "mpc", .number = 42, .lock = __SPIN_LOCK_UNLOCKED(mpc_dev.lock) /* members not explicitly initialised will be 0 */ }; static int atm_mpoa_mpoad_attach(struct atm_vcc *vcc, int arg) { struct mpoa_client *mpc; struct lec_priv *priv; int err; if (mpcs == NULL) { mpc_timer_refresh(); /* This lets us now how our LECs are doing */ err = register_netdevice_notifier(&mpoa_notifier); if (err < 0) { timer_delete(&mpc_timer); return err; } } mpc = find_mpc_by_itfnum(arg); if (mpc == NULL) { dprintk("allocating new mpc for itf %d\n", arg); mpc = alloc_mpc(); if (mpc == NULL) return -ENOMEM; mpc->dev_num = arg; mpc->dev = find_lec_by_itfnum(arg); /* NULL if there was no lec */ } if (mpc->mpoad_vcc) { pr_info("mpoad is already present for itf %d\n", arg); return -EADDRINUSE; } if (mpc->dev) { /* check if the lec is LANE2 capable */ priv = netdev_priv(mpc->dev); if (priv->lane_version < 2) { dev_put(mpc->dev); mpc->dev = NULL; } else priv->lane2_ops->associate_indicator = lane2_assoc_ind; } mpc->mpoad_vcc = vcc; vcc->dev = &mpc_dev; vcc_insert_socket(sk_atm(vcc)); set_bit(ATM_VF_META, &vcc->flags); set_bit(ATM_VF_READY, &vcc->flags); if (mpc->dev) { char empty[ATM_ESA_LEN]; memset(empty, 0, ATM_ESA_LEN); start_mpc(mpc, mpc->dev); /* set address if mpcd e.g. gets killed and restarted. * If we do not do it now we have to wait for the next LE_ARP */ if (memcmp(mpc->mps_ctrl_addr, empty, ATM_ESA_LEN) != 0) send_set_mps_ctrl_addr(mpc->mps_ctrl_addr, mpc); } __module_get(THIS_MODULE); return arg; } static void send_set_mps_ctrl_addr(const char *addr, struct mpoa_client *mpc) { struct k_message mesg; memcpy(mpc->mps_ctrl_addr, addr, ATM_ESA_LEN); mesg.type = SET_MPS_CTRL_ADDR; memcpy(mesg.MPS_ctrl, addr, ATM_ESA_LEN); msg_to_mpoad(&mesg, mpc); } static void mpoad_close(struct atm_vcc *vcc) { struct mpoa_client *mpc; struct sk_buff *skb; mpc = find_mpc_by_vcc(vcc); if (mpc == NULL) { pr_info("did not find MPC\n"); return; } if (!mpc->mpoad_vcc) { pr_info("close for non-present mpoad\n"); return; } mpc->mpoad_vcc = NULL; if (mpc->dev) { struct lec_priv *priv = netdev_priv(mpc->dev); priv->lane2_ops->associate_indicator = NULL; stop_mpc(mpc); dev_put(mpc->dev); } mpc->in_ops->destroy_cache(mpc); mpc->eg_ops->destroy_cache(mpc); while ((skb = skb_dequeue(&sk_atm(vcc)->sk_receive_queue))) { atm_return(vcc, skb->truesize); kfree_skb(skb); } pr_info("(%s) going down\n", (mpc->dev) ? mpc->dev->name : "<unknown>"); module_put(THIS_MODULE); } /* * */ static int msg_from_mpoad(struct atm_vcc *vcc, struct sk_buff *skb) { struct mpoa_client *mpc = find_mpc_by_vcc(vcc); struct k_message *mesg = (struct k_message *)skb->data; WARN_ON(refcount_sub_and_test(skb->truesize, &sk_atm(vcc)->sk_wmem_alloc)); if (mpc == NULL) { pr_info("no mpc found\n"); return 0; } dprintk("(%s)", mpc->dev ? mpc->dev->name : "<unknown>"); switch (mesg->type) { case MPOA_RES_REPLY_RCVD: dprintk_cont("mpoa_res_reply_rcvd\n"); MPOA_res_reply_rcvd(mesg, mpc); break; case MPOA_TRIGGER_RCVD: dprintk_cont("mpoa_trigger_rcvd\n"); MPOA_trigger_rcvd(mesg, mpc); break; case INGRESS_PURGE_RCVD: dprintk_cont("nhrp_purge_rcvd\n"); ingress_purge_rcvd(mesg, mpc); break; case EGRESS_PURGE_RCVD: dprintk_cont("egress_purge_reply_rcvd\n"); egress_purge_rcvd(mesg, mpc); break; case MPS_DEATH: dprintk_cont("mps_death\n"); mps_death(mesg, mpc); break; case CACHE_IMPOS_RCVD: dprintk_cont("cache_impos_rcvd\n"); MPOA_cache_impos_rcvd(mesg, mpc); break; case SET_MPC_CTRL_ADDR: dprintk_cont("set_mpc_ctrl_addr\n"); set_mpc_ctrl_addr_rcvd(mesg, mpc); break; case SET_MPS_MAC_ADDR: dprintk_cont("set_mps_mac_addr\n"); set_mps_mac_addr_rcvd(mesg, mpc); break; case CLEAN_UP_AND_EXIT: dprintk_cont("clean_up_and_exit\n"); clean_up(mesg, mpc, DIE); break; case RELOAD: dprintk_cont("reload\n"); clean_up(mesg, mpc, RELOAD); break; case SET_MPC_PARAMS: dprintk_cont("set_mpc_params\n"); mpc->parameters = mesg->content.params; break; default: dprintk_cont("unknown message %d\n", mesg->type); break; } kfree_skb(skb); return 0; } /* Remember that this function may not do things that sleep */ int msg_to_mpoad(struct k_message *mesg, struct mpoa_client *mpc) { struct sk_buff *skb; struct sock *sk; if (mpc == NULL || !mpc->mpoad_vcc) { pr_info("mesg %d to a non-existent mpoad\n", mesg->type); return -ENXIO; } skb = alloc_skb(sizeof(struct k_message), GFP_ATOMIC); if (skb == NULL) return -ENOMEM; skb_put(skb, sizeof(struct k_message)); skb_copy_to_linear_data(skb, mesg, sizeof(*mesg)); atm_force_charge(mpc->mpoad_vcc, skb->truesize); sk = sk_atm(mpc->mpoad_vcc); skb_queue_tail(&sk->sk_receive_queue, skb); sk->sk_data_ready(sk); return 0; } static int mpoa_event_listener(struct notifier_block *mpoa_notifier, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct mpoa_client *mpc; struct lec_priv *priv; if (!net_eq(dev_net(dev), &init_net)) return NOTIFY_DONE; if (strncmp(dev->name, "lec", 3)) return NOTIFY_DONE; /* we are only interested in lec:s */ switch (event) { case NETDEV_REGISTER: /* a new lec device was allocated */ priv = netdev_priv(dev); if (priv->lane_version < 2) break; priv->lane2_ops->associate_indicator = lane2_assoc_ind; mpc = find_mpc_by_itfnum(priv->itfnum); if (mpc == NULL) { dprintk("allocating new mpc for %s\n", dev->name); mpc = alloc_mpc(); if (mpc == NULL) { pr_info("no new mpc"); break; } } mpc->dev_num = priv->itfnum; mpc->dev = dev; dev_hold(dev); dprintk("(%s) was initialized\n", dev->name); break; case NETDEV_UNREGISTER: /* the lec device was deallocated */ mpc = find_mpc_by_lec(dev); if (mpc == NULL) break; dprintk("device (%s) was deallocated\n", dev->name); stop_mpc(mpc); dev_put(mpc->dev); mpc->dev = NULL; break; case NETDEV_UP: /* the dev was ifconfig'ed up */ mpc = find_mpc_by_lec(dev); if (mpc == NULL) break; if (mpc->mpoad_vcc != NULL) start_mpc(mpc, dev); break; case NETDEV_DOWN: /* the dev was ifconfig'ed down */ /* this means that the flow of packets from the * upper layer stops */ mpc = find_mpc_by_lec(dev); if (mpc == NULL) break; if (mpc->mpoad_vcc != NULL) stop_mpc(mpc); break; case NETDEV_REBOOT: case NETDEV_CHANGE: case NETDEV_CHANGEMTU: case NETDEV_CHANGEADDR: case NETDEV_GOING_DOWN: break; default: break; } return NOTIFY_DONE; } /* * Functions which are called after a message is received from mpcd. * Msg is reused on purpose. */ static void MPOA_trigger_rcvd(struct k_message *msg, struct mpoa_client *mpc) { __be32 dst_ip = msg->content.in_info.in_dst_ip; in_cache_entry *entry; entry = mpc->in_ops->get(dst_ip, mpc); if (entry == NULL) { entry = mpc->in_ops->add_entry(dst_ip, mpc); entry->entry_state = INGRESS_RESOLVING; msg->type = SND_MPOA_RES_RQST; msg->content.in_info = entry->ctrl_info; msg_to_mpoad(msg, mpc); entry->reply_wait = ktime_get_seconds(); mpc->in_ops->put(entry); return; } if (entry->entry_state == INGRESS_INVALID) { entry->entry_state = INGRESS_RESOLVING; msg->type = SND_MPOA_RES_RQST; msg->content.in_info = entry->ctrl_info; msg_to_mpoad(msg, mpc); entry->reply_wait = ktime_get_seconds(); mpc->in_ops->put(entry); return; } pr_info("(%s) entry already in resolving state\n", (mpc->dev) ? mpc->dev->name : "<unknown>"); mpc->in_ops->put(entry); } /* * Things get complicated because we have to check if there's an egress * shortcut with suitable traffic parameters we could use. */ static void check_qos_and_open_shortcut(struct k_message *msg, struct mpoa_client *client, in_cache_entry *entry) { __be32 dst_ip = msg->content.in_info.in_dst_ip; struct atm_mpoa_qos *qos = atm_mpoa_search_qos(dst_ip); eg_cache_entry *eg_entry = client->eg_ops->get_by_src_ip(dst_ip, client); if (eg_entry && eg_entry->shortcut) { if (eg_entry->shortcut->qos.txtp.traffic_class & msg->qos.txtp.traffic_class & (qos ? qos->qos.txtp.traffic_class : ATM_UBR | ATM_CBR)) { if (eg_entry->shortcut->qos.txtp.traffic_class == ATM_UBR) entry->shortcut = eg_entry->shortcut; else if (eg_entry->shortcut->qos.txtp.max_pcr > 0) entry->shortcut = eg_entry->shortcut; } if (entry->shortcut) { dprintk("(%s) using egress SVC to reach %pI4\n", client->dev->name, &dst_ip); client->eg_ops->put(eg_entry); return; } } if (eg_entry != NULL) client->eg_ops->put(eg_entry); /* No luck in the egress cache we must open an ingress SVC */ msg->type = OPEN_INGRESS_SVC; if (qos && (qos->qos.txtp.traffic_class == msg->qos.txtp.traffic_class)) { msg->qos = qos->qos; pr_info("(%s) trying to get a CBR shortcut\n", client->dev->name); } else memset(&msg->qos, 0, sizeof(struct atm_qos)); msg_to_mpoad(msg, client); } static void MPOA_res_reply_rcvd(struct k_message *msg, struct mpoa_client *mpc) { __be32 dst_ip = msg->content.in_info.in_dst_ip; in_cache_entry *entry = mpc->in_ops->get(dst_ip, mpc); dprintk("(%s) ip %pI4\n", mpc->dev->name, &dst_ip); ddprintk("(%s) entry = %p", mpc->dev->name, entry); if (entry == NULL) { pr_info("(%s) ARGH, received res. reply for an entry that doesn't exist.\n", mpc->dev->name); return; } ddprintk_cont(" entry_state = %d ", entry->entry_state); if (entry->entry_state == INGRESS_RESOLVED) { pr_info("(%s) RESOLVED entry!\n", mpc->dev->name); mpc->in_ops->put(entry); return; } entry->ctrl_info = msg->content.in_info; entry->time = ktime_get_seconds(); /* Used in refreshing func from now on */ entry->reply_wait = ktime_get_seconds(); entry->refresh_time = 0; ddprintk_cont("entry->shortcut = %p\n", entry->shortcut); if (entry->entry_state == INGRESS_RESOLVING && entry->shortcut != NULL) { entry->entry_state = INGRESS_RESOLVED; mpc->in_ops->put(entry); return; /* Shortcut already open... */ } if (entry->shortcut != NULL) { pr_info("(%s) entry->shortcut != NULL, impossible!\n", mpc->dev->name); mpc->in_ops->put(entry); return; } check_qos_and_open_shortcut(msg, mpc, entry); entry->entry_state = INGRESS_RESOLVED; mpc->in_ops->put(entry); return; } static void ingress_purge_rcvd(struct k_message *msg, struct mpoa_client *mpc) { __be32 dst_ip = msg->content.in_info.in_dst_ip; __be32 mask = msg->ip_mask; in_cache_entry *entry = mpc->in_ops->get_with_mask(dst_ip, mpc, mask); if (entry == NULL) { pr_info("(%s) purge for a non-existing entry, ip = %pI4\n", mpc->dev->name, &dst_ip); return; } do { dprintk("(%s) removing an ingress entry, ip = %pI4\n", mpc->dev->name, &dst_ip); write_lock_bh(&mpc->ingress_lock); mpc->in_ops->remove_entry(entry, mpc); write_unlock_bh(&mpc->ingress_lock); mpc->in_ops->put(entry); entry = mpc->in_ops->get_with_mask(dst_ip, mpc, mask); } while (entry != NULL); } static void egress_purge_rcvd(struct k_message *msg, struct mpoa_client *mpc) { __be32 cache_id = msg->content.eg_info.cache_id; eg_cache_entry *entry = mpc->eg_ops->get_by_cache_id(cache_id, mpc); if (entry == NULL) { dprintk("(%s) purge for a non-existing entry\n", mpc->dev->name); return; } write_lock_irq(&mpc->egress_lock); mpc->eg_ops->remove_entry(entry, mpc); write_unlock_irq(&mpc->egress_lock); mpc->eg_ops->put(entry); } static void purge_egress_shortcut(struct atm_vcc *vcc, eg_cache_entry *entry) { struct sock *sk; struct k_message *purge_msg; struct sk_buff *skb; dprintk("entering\n"); if (vcc == NULL) { pr_info("vcc == NULL\n"); return; } skb = alloc_skb(sizeof(struct k_message), GFP_ATOMIC); if (skb == NULL) { pr_info("out of memory\n"); return; } skb_put(skb, sizeof(struct k_message)); memset(skb->data, 0, sizeof(struct k_message)); purge_msg = (struct k_message *)skb->data; purge_msg->type = DATA_PLANE_PURGE; if (entry != NULL) purge_msg->content.eg_info = entry->ctrl_info; atm_force_charge(vcc, skb->truesize); sk = sk_atm(vcc); skb_queue_tail(&sk->sk_receive_queue, skb); sk->sk_data_ready(sk); dprintk("exiting\n"); } /* * Our MPS died. Tell our daemon to send NHRP data plane purge to each * of the egress shortcuts we have. */ static void mps_death(struct k_message *msg, struct mpoa_client *mpc) { eg_cache_entry *entry; dprintk("(%s)\n", mpc->dev->name); if (memcmp(msg->MPS_ctrl, mpc->mps_ctrl_addr, ATM_ESA_LEN)) { pr_info("(%s) wrong MPS\n", mpc->dev->name); return; } /* FIXME: This knows too much of the cache structure */ read_lock_irq(&mpc->egress_lock); entry = mpc->eg_cache; while (entry != NULL) { purge_egress_shortcut(entry->shortcut, entry); entry = entry->next; } read_unlock_irq(&mpc->egress_lock); mpc->in_ops->destroy_cache(mpc); mpc->eg_ops->destroy_cache(mpc); } static void MPOA_cache_impos_rcvd(struct k_message *msg, struct mpoa_client *mpc) { uint16_t holding_time; eg_cache_entry *entry = mpc->eg_ops->get_by_cache_id(msg->content.eg_info.cache_id, mpc); holding_time = msg->content.eg_info.holding_time; dprintk("(%s) entry = %p, holding_time = %u\n", mpc->dev->name, entry, holding_time); if (entry == NULL && !holding_time) return; if (entry == NULL && holding_time) { entry = mpc->eg_ops->add_entry(msg, mpc); mpc->eg_ops->put(entry); return; } if (holding_time) { mpc->eg_ops->update(entry, holding_time); return; } write_lock_irq(&mpc->egress_lock); mpc->eg_ops->remove_entry(entry, mpc); write_unlock_irq(&mpc->egress_lock); mpc->eg_ops->put(entry); } static void set_mpc_ctrl_addr_rcvd(struct k_message *mesg, struct mpoa_client *mpc) { struct lec_priv *priv; int i, retval ; uint8_t tlv[4 + 1 + 1 + 1 + ATM_ESA_LEN]; tlv[0] = 00; tlv[1] = 0xa0; tlv[2] = 0x3e; tlv[3] = 0x2a; /* type */ tlv[4] = 1 + 1 + ATM_ESA_LEN; /* length */ tlv[5] = 0x02; /* MPOA client */ tlv[6] = 0x00; /* number of MPS MAC addresses */ memcpy(&tlv[7], mesg->MPS_ctrl, ATM_ESA_LEN); /* MPC ctrl ATM addr */ memcpy(mpc->our_ctrl_addr, mesg->MPS_ctrl, ATM_ESA_LEN); dprintk("(%s) setting MPC ctrl ATM address to", mpc->dev ? mpc->dev->name : "<unknown>"); for (i = 7; i < sizeof(tlv); i++) dprintk_cont(" %02x", tlv[i]); dprintk_cont("\n"); if (mpc->dev) { priv = netdev_priv(mpc->dev); retval = priv->lane2_ops->associate_req(mpc->dev, mpc->dev->dev_addr, tlv, sizeof(tlv)); if (retval == 0) pr_info("(%s) MPOA device type TLV association failed\n", mpc->dev->name); retval = priv->lane2_ops->resolve(mpc->dev, NULL, 1, NULL, NULL); if (retval < 0) pr_info("(%s) targetless LE_ARP request failed\n", mpc->dev->name); } } static void set_mps_mac_addr_rcvd(struct k_message *msg, struct mpoa_client *client) { if (client->number_of_mps_macs) kfree(client->mps_macs); client->number_of_mps_macs = 0; client->mps_macs = kmemdup(msg->MPS_ctrl, ETH_ALEN, GFP_KERNEL); if (client->mps_macs == NULL) { pr_info("out of memory\n"); return; } client->number_of_mps_macs = 1; } /* * purge egress cache and tell daemon to 'action' (DIE, RELOAD) */ static void clean_up(struct k_message *msg, struct mpoa_client *mpc, int action) { eg_cache_entry *entry; msg->type = SND_EGRESS_PURGE; /* FIXME: This knows too much of the cache structure */ read_lock_irq(&mpc->egress_lock); entry = mpc->eg_cache; while (entry != NULL) { msg->content.eg_info = entry->ctrl_info; dprintk("cache_id %u\n", entry->ctrl_info.cache_id); msg_to_mpoad(msg, mpc); entry = entry->next; } read_unlock_irq(&mpc->egress_lock); msg->type = action; msg_to_mpoad(msg, mpc); } static unsigned long checking_time; static void mpc_timer_refresh(void) { mpc_timer.expires = jiffies + (MPC_P2 * HZ); checking_time = mpc_timer.expires; add_timer(&mpc_timer); } static void mpc_cache_check(struct timer_list *unused) { struct mpoa_client *mpc = mpcs; static unsigned long previous_resolving_check_time; static unsigned long previous_refresh_time; while (mpc != NULL) { mpc->in_ops->clear_count(mpc); mpc->eg_ops->clear_expired(mpc); if (checking_time - previous_resolving_check_time > mpc->parameters.mpc_p4 * HZ) { mpc->in_ops->check_resolving(mpc); previous_resolving_check_time = checking_time; } if (checking_time - previous_refresh_time > mpc->parameters.mpc_p5 * HZ) { mpc->in_ops->refresh(mpc); previous_refresh_time = checking_time; } mpc = mpc->next; } mpc_timer_refresh(); } static int atm_mpoa_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { int err = 0; struct atm_vcc *vcc = ATM_SD(sock); if (cmd != ATMMPC_CTRL && cmd != ATMMPC_DATA) return -ENOIOCTLCMD; if (!capable(CAP_NET_ADMIN)) return -EPERM; switch (cmd) { case ATMMPC_CTRL: err = atm_mpoa_mpoad_attach(vcc, (int)arg); if (err >= 0) sock->state = SS_CONNECTED; break; case ATMMPC_DATA: err = atm_mpoa_vcc_attach(vcc, (void __user *)arg); break; default: break; } return err; } static struct atm_ioctl atm_ioctl_ops = { .owner = THIS_MODULE, .ioctl = atm_mpoa_ioctl, }; static __init int atm_mpoa_init(void) { register_atm_ioctl(&atm_ioctl_ops); if (mpc_proc_init() != 0) pr_info("failed to initialize /proc/mpoa\n"); pr_info("mpc.c: initialized\n"); return 0; } static void __exit atm_mpoa_cleanup(void) { struct mpoa_client *mpc, *tmp; struct atm_mpoa_qos *qos, *nextqos; struct lec_priv *priv; mpc_proc_clean(); timer_delete_sync(&mpc_timer); unregister_netdevice_notifier(&mpoa_notifier); deregister_atm_ioctl(&atm_ioctl_ops); mpc = mpcs; mpcs = NULL; while (mpc != NULL) { tmp = mpc->next; if (mpc->dev != NULL) { stop_mpc(mpc); priv = netdev_priv(mpc->dev); if (priv->lane2_ops != NULL) priv->lane2_ops->associate_indicator = NULL; } ddprintk("about to clear caches\n"); mpc->in_ops->destroy_cache(mpc); mpc->eg_ops->destroy_cache(mpc); ddprintk("caches cleared\n"); kfree(mpc->mps_macs); memset(mpc, 0, sizeof(struct mpoa_client)); ddprintk("about to kfree %p\n", mpc); kfree(mpc); ddprintk("next mpc is at %p\n", tmp); mpc = tmp; } qos = qos_head; qos_head = NULL; while (qos != NULL) { nextqos = qos->next; dprintk("freeing qos entry %p\n", qos); kfree(qos); qos = nextqos; } } module_init(atm_mpoa_init); module_exit(atm_mpoa_cleanup); MODULE_DESCRIPTION("Multi-Protocol Over ATM (MPOA) driver"); MODULE_LICENSE("GPL"); |
| 1 1 910 906 13 2 1724 1728 1734 963 19 105 743 195 80 248 4 859 448 1433 345 31 29 2 1686 3 9 8 11 2 1 8 2 3 11 12 3 15 14 1 1717 1723 15 8 7 38 3 19 71 5 11533 782 768 14 10760 2471 3 1171 2346 5 4131 1 347 7 3 2 311 2175 9 2715 2524 733 16 5055 417 27 1 8 2470 8529 8555 11673 1737 76 212 211 13489 1681 13344 13430 344 30 853 11864 1611 11667 12560 1317 1309 1319 333 21 21 21 21 21 21 13359 258 166 236 2976 2980 669 665 668 1280 1242 36 16 2 15 2229 2234 1364 2219 2507 2179 2 47 2412 2409 2415 2178 2174 2179 47 47 47 5923 3611 197 5927 5926 5917 3607 3607 3602 197 197 197 3 3 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 | // SPDX-License-Identifier: GPL-2.0 /* * NETLINK Netlink attributes * * Authors: Thomas Graf <tgraf@suug.ch> * Alexey Kuznetsov <kuznet@ms2.inr.ac.ru> */ #include <linux/export.h> #include <linux/kernel.h> #include <linux/errno.h> #include <linux/jiffies.h> #include <linux/nospec.h> #include <linux/skbuff.h> #include <linux/string.h> #include <linux/types.h> #include <net/netlink.h> /* For these data types, attribute length should be exactly the given * size. However, to maintain compatibility with broken commands, if the * attribute length does not match the expected size a warning is emitted * to the user that the command is sending invalid data and needs to be fixed. */ static const u8 nla_attr_len[NLA_TYPE_MAX+1] = { [NLA_U8] = sizeof(u8), [NLA_U16] = sizeof(u16), [NLA_U32] = sizeof(u32), [NLA_U64] = sizeof(u64), [NLA_S8] = sizeof(s8), [NLA_S16] = sizeof(s16), [NLA_S32] = sizeof(s32), [NLA_S64] = sizeof(s64), [NLA_BE16] = sizeof(__be16), [NLA_BE32] = sizeof(__be32), }; static const u8 nla_attr_minlen[NLA_TYPE_MAX+1] = { [NLA_U8] = sizeof(u8), [NLA_U16] = sizeof(u16), [NLA_U32] = sizeof(u32), [NLA_U64] = sizeof(u64), [NLA_MSECS] = sizeof(u64), [NLA_NESTED] = NLA_HDRLEN, [NLA_S8] = sizeof(s8), [NLA_S16] = sizeof(s16), [NLA_S32] = sizeof(s32), [NLA_S64] = sizeof(s64), [NLA_BE16] = sizeof(__be16), [NLA_BE32] = sizeof(__be32), }; /* * Nested policies might refer back to the original * policy in some cases, and userspace could try to * abuse that and recurse by nesting in the right * ways. Limit recursion to avoid this problem. */ #define MAX_POLICY_RECURSION_DEPTH 10 static int __nla_validate_parse(const struct nlattr *head, int len, int maxtype, const struct nla_policy *policy, unsigned int validate, struct netlink_ext_ack *extack, struct nlattr **tb, unsigned int depth); static int validate_nla_bitfield32(const struct nlattr *nla, const u32 valid_flags_mask) { const struct nla_bitfield32 *bf = nla_data(nla); if (!valid_flags_mask) return -EINVAL; /*disallow invalid bit selector */ if (bf->selector & ~valid_flags_mask) return -EINVAL; /*disallow invalid bit values */ if (bf->value & ~valid_flags_mask) return -EINVAL; /*disallow valid bit values that are not selected*/ if (bf->value & ~bf->selector) return -EINVAL; return 0; } static int nla_validate_array(const struct nlattr *head, int len, int maxtype, const struct nla_policy *policy, struct netlink_ext_ack *extack, unsigned int validate, unsigned int depth) { const struct nlattr *entry; int rem; nla_for_each_attr(entry, head, len, rem) { int ret; if (nla_len(entry) == 0) continue; if (nla_len(entry) < NLA_HDRLEN) { NL_SET_ERR_MSG_ATTR_POL(extack, entry, policy, "Array element too short"); return -ERANGE; } ret = __nla_validate_parse(nla_data(entry), nla_len(entry), maxtype, policy, validate, extack, NULL, depth + 1); if (ret < 0) return ret; } return 0; } void nla_get_range_unsigned(const struct nla_policy *pt, struct netlink_range_validation *range) { WARN_ON_ONCE(pt->validation_type != NLA_VALIDATE_RANGE_PTR && (pt->min < 0 || pt->max < 0)); range->min = 0; switch (pt->type) { case NLA_U8: range->max = U8_MAX; break; case NLA_U16: case NLA_BE16: case NLA_BINARY: range->max = U16_MAX; break; case NLA_U32: case NLA_BE32: range->max = U32_MAX; break; case NLA_U64: case NLA_UINT: case NLA_MSECS: range->max = U64_MAX; break; default: WARN_ON_ONCE(1); return; } switch (pt->validation_type) { case NLA_VALIDATE_RANGE: case NLA_VALIDATE_RANGE_WARN_TOO_LONG: range->min = pt->min; range->max = pt->max; break; case NLA_VALIDATE_RANGE_PTR: *range = *pt->range; break; case NLA_VALIDATE_MIN: range->min = pt->min; break; case NLA_VALIDATE_MAX: range->max = pt->max; break; default: break; } } static int nla_validate_range_unsigned(const struct nla_policy *pt, const struct nlattr *nla, struct netlink_ext_ack *extack, unsigned int validate) { struct netlink_range_validation range; u64 value; switch (pt->type) { case NLA_U8: value = nla_get_u8(nla); break; case NLA_U16: value = nla_get_u16(nla); break; case NLA_U32: value = nla_get_u32(nla); break; case NLA_U64: value = nla_get_u64(nla); break; case NLA_UINT: value = nla_get_uint(nla); break; case NLA_MSECS: value = nla_get_u64(nla); break; case NLA_BINARY: value = nla_len(nla); break; case NLA_BE16: value = ntohs(nla_get_be16(nla)); break; case NLA_BE32: value = ntohl(nla_get_be32(nla)); break; default: return -EINVAL; } nla_get_range_unsigned(pt, &range); if (pt->validation_type == NLA_VALIDATE_RANGE_WARN_TOO_LONG && pt->type == NLA_BINARY && value > range.max) { pr_warn_ratelimited("netlink: '%s': attribute type %d has an invalid length.\n", current->comm, pt->type); if (validate & NL_VALIDATE_STRICT_ATTRS) { NL_SET_ERR_MSG_ATTR_POL(extack, nla, pt, "invalid attribute length"); return -EINVAL; } /* this assumes min <= max (don't validate against min) */ return 0; } if (value < range.min || value > range.max) { bool binary = pt->type == NLA_BINARY; if (binary) NL_SET_ERR_MSG_ATTR_POL(extack, nla, pt, "binary attribute size out of range"); else NL_SET_ERR_MSG_ATTR_POL(extack, nla, pt, "integer out of range"); return -ERANGE; } return 0; } void nla_get_range_signed(const struct nla_policy *pt, struct netlink_range_validation_signed *range) { switch (pt->type) { case NLA_S8: range->min = S8_MIN; range->max = S8_MAX; break; case NLA_S16: range->min = S16_MIN; range->max = S16_MAX; break; case NLA_S32: range->min = S32_MIN; range->max = S32_MAX; break; case NLA_S64: case NLA_SINT: range->min = S64_MIN; range->max = S64_MAX; break; default: WARN_ON_ONCE(1); return; } switch (pt->validation_type) { case NLA_VALIDATE_RANGE: range->min = pt->min; range->max = pt->max; break; case NLA_VALIDATE_RANGE_PTR: *range = *pt->range_signed; break; case NLA_VALIDATE_MIN: range->min = pt->min; break; case NLA_VALIDATE_MAX: range->max = pt->max; break; default: break; } } static int nla_validate_int_range_signed(const struct nla_policy *pt, const struct nlattr *nla, struct netlink_ext_ack *extack) { struct netlink_range_validation_signed range; s64 value; switch (pt->type) { case NLA_S8: value = nla_get_s8(nla); break; case NLA_S16: value = nla_get_s16(nla); break; case NLA_S32: value = nla_get_s32(nla); break; case NLA_S64: value = nla_get_s64(nla); break; case NLA_SINT: value = nla_get_sint(nla); break; default: return -EINVAL; } nla_get_range_signed(pt, &range); if (value < range.min || value > range.max) { NL_SET_ERR_MSG_ATTR_POL(extack, nla, pt, "integer out of range"); return -ERANGE; } return 0; } static int nla_validate_int_range(const struct nla_policy *pt, const struct nlattr *nla, struct netlink_ext_ack *extack, unsigned int validate) { switch (pt->type) { case NLA_U8: case NLA_U16: case NLA_U32: case NLA_U64: case NLA_UINT: case NLA_MSECS: case NLA_BINARY: case NLA_BE16: case NLA_BE32: return nla_validate_range_unsigned(pt, nla, extack, validate); case NLA_S8: case NLA_S16: case NLA_S32: case NLA_S64: case NLA_SINT: return nla_validate_int_range_signed(pt, nla, extack); default: WARN_ON(1); return -EINVAL; } } static int nla_validate_mask(const struct nla_policy *pt, const struct nlattr *nla, struct netlink_ext_ack *extack) { u64 value; switch (pt->type) { case NLA_U8: value = nla_get_u8(nla); break; case NLA_U16: value = nla_get_u16(nla); break; case NLA_U32: value = nla_get_u32(nla); break; case NLA_U64: value = nla_get_u64(nla); break; case NLA_UINT: value = nla_get_uint(nla); break; case NLA_BE16: value = ntohs(nla_get_be16(nla)); break; case NLA_BE32: value = ntohl(nla_get_be32(nla)); break; default: return -EINVAL; } if (value & ~(u64)pt->mask) { NL_SET_ERR_MSG_ATTR(extack, nla, "reserved bit set"); return -EINVAL; } return 0; } static int validate_nla(const struct nlattr *nla, int maxtype, const struct nla_policy *policy, unsigned int validate, struct netlink_ext_ack *extack, unsigned int depth) { u16 strict_start_type = policy[0].strict_start_type; const struct nla_policy *pt; int minlen = 0, attrlen = nla_len(nla), type = nla_type(nla); int err = -ERANGE; if (strict_start_type && type >= strict_start_type) validate |= NL_VALIDATE_STRICT; if (type <= 0 || type > maxtype) return 0; type = array_index_nospec(type, maxtype + 1); pt = &policy[type]; BUG_ON(pt->type > NLA_TYPE_MAX); if (nla_attr_len[pt->type] && attrlen != nla_attr_len[pt->type]) { pr_warn_ratelimited("netlink: '%s': attribute type %d has an invalid length.\n", current->comm, type); if (validate & NL_VALIDATE_STRICT_ATTRS) { NL_SET_ERR_MSG_ATTR_POL(extack, nla, pt, "invalid attribute length"); return -EINVAL; } } if (validate & NL_VALIDATE_NESTED) { if ((pt->type == NLA_NESTED || pt->type == NLA_NESTED_ARRAY) && !(nla->nla_type & NLA_F_NESTED)) { NL_SET_ERR_MSG_ATTR_POL(extack, nla, pt, "NLA_F_NESTED is missing"); return -EINVAL; } if (pt->type != NLA_NESTED && pt->type != NLA_NESTED_ARRAY && pt->type != NLA_UNSPEC && (nla->nla_type & NLA_F_NESTED)) { NL_SET_ERR_MSG_ATTR_POL(extack, nla, pt, "NLA_F_NESTED not expected"); return -EINVAL; } } switch (pt->type) { case NLA_REJECT: if (extack && pt->reject_message) { NL_SET_BAD_ATTR(extack, nla); extack->_msg = pt->reject_message; return -EINVAL; } err = -EINVAL; goto out_err; case NLA_FLAG: if (attrlen > 0) goto out_err; break; case NLA_SINT: case NLA_UINT: if (attrlen != sizeof(u32) && attrlen != sizeof(u64)) { NL_SET_ERR_MSG_ATTR_POL(extack, nla, pt, "invalid attribute length"); return -EINVAL; } break; case NLA_BITFIELD32: if (attrlen != sizeof(struct nla_bitfield32)) goto out_err; err = validate_nla_bitfield32(nla, pt->bitfield32_valid); if (err) goto out_err; break; case NLA_NUL_STRING: if (pt->len) minlen = min_t(int, attrlen, pt->len + 1); else minlen = attrlen; if (!minlen || memchr(nla_data(nla), '\0', minlen) == NULL) { err = -EINVAL; goto out_err; } fallthrough; case NLA_STRING: if (attrlen < 1) goto out_err; if (pt->len) { char *buf = nla_data(nla); if (buf[attrlen - 1] == '\0') attrlen--; if (attrlen > pt->len) goto out_err; } break; case NLA_BINARY: if (pt->len && attrlen > pt->len) goto out_err; break; case NLA_NESTED: /* a nested attributes is allowed to be empty; if its not, * it must have a size of at least NLA_HDRLEN. */ if (attrlen == 0) break; if (attrlen < NLA_HDRLEN) goto out_err; if (pt->nested_policy) { err = __nla_validate_parse(nla_data(nla), nla_len(nla), pt->len, pt->nested_policy, validate, extack, NULL, depth + 1); if (err < 0) { /* * return directly to preserve the inner * error message/attribute pointer */ return err; } } break; case NLA_NESTED_ARRAY: /* a nested array attribute is allowed to be empty; if its not, * it must have a size of at least NLA_HDRLEN. */ if (attrlen == 0) break; if (attrlen < NLA_HDRLEN) goto out_err; if (pt->nested_policy) { int err; err = nla_validate_array(nla_data(nla), nla_len(nla), pt->len, pt->nested_policy, extack, validate, depth); if (err < 0) { /* * return directly to preserve the inner * error message/attribute pointer */ return err; } } break; case NLA_UNSPEC: if (validate & NL_VALIDATE_UNSPEC) { NL_SET_ERR_MSG_ATTR(extack, nla, "Unsupported attribute"); return -EINVAL; } if (attrlen < pt->len) goto out_err; break; default: if (pt->len) minlen = pt->len; else minlen = nla_attr_minlen[pt->type]; if (attrlen < minlen) goto out_err; } /* further validation */ switch (pt->validation_type) { case NLA_VALIDATE_NONE: /* nothing to do */ break; case NLA_VALIDATE_RANGE_PTR: case NLA_VALIDATE_RANGE: case NLA_VALIDATE_RANGE_WARN_TOO_LONG: case NLA_VALIDATE_MIN: case NLA_VALIDATE_MAX: err = nla_validate_int_range(pt, nla, extack, validate); if (err) return err; break; case NLA_VALIDATE_MASK: err = nla_validate_mask(pt, nla, extack); if (err) return err; break; case NLA_VALIDATE_FUNCTION: if (pt->validate) { err = pt->validate(nla, extack); if (err) return err; } break; } return 0; out_err: NL_SET_ERR_MSG_ATTR_POL(extack, nla, pt, "Attribute failed policy validation"); return err; } static int __nla_validate_parse(const struct nlattr *head, int len, int maxtype, const struct nla_policy *policy, unsigned int validate, struct netlink_ext_ack *extack, struct nlattr **tb, unsigned int depth) { const struct nlattr *nla; int rem; if (depth >= MAX_POLICY_RECURSION_DEPTH) { NL_SET_ERR_MSG(extack, "allowed policy recursion depth exceeded"); return -EINVAL; } if (tb) memset(tb, 0, sizeof(struct nlattr *) * (maxtype + 1)); nla_for_each_attr(nla, head, len, rem) { u16 type = nla_type(nla); if (type == 0 || type > maxtype) { if (validate & NL_VALIDATE_MAXTYPE) { NL_SET_ERR_MSG_ATTR(extack, nla, "Unknown attribute type"); return -EINVAL; } continue; } type = array_index_nospec(type, maxtype + 1); if (policy) { int err = validate_nla(nla, maxtype, policy, validate, extack, depth); if (err < 0) return err; } if (tb) tb[type] = (struct nlattr *)nla; } if (unlikely(rem > 0)) { pr_warn_ratelimited("netlink: %d bytes leftover after parsing attributes in process `%s'.\n", rem, current->comm); NL_SET_ERR_MSG(extack, "bytes leftover after parsing attributes"); if (validate & NL_VALIDATE_TRAILING) return -EINVAL; } return 0; } /** * __nla_validate - Validate a stream of attributes * @head: head of attribute stream * @len: length of attribute stream * @maxtype: maximum attribute type to be expected * @policy: validation policy * @validate: validation strictness * @extack: extended ACK report struct * * Validates all attributes in the specified attribute stream against the * specified policy. Validation depends on the validate flags passed, see * &enum netlink_validation for more details on that. * See documentation of struct nla_policy for more details. * * Returns 0 on success or a negative error code. */ int __nla_validate(const struct nlattr *head, int len, int maxtype, const struct nla_policy *policy, unsigned int validate, struct netlink_ext_ack *extack) { return __nla_validate_parse(head, len, maxtype, policy, validate, extack, NULL, 0); } EXPORT_SYMBOL(__nla_validate); /** * nla_policy_len - Determine the max. length of a policy * @p: policy to use * @n: number of policies * * Determines the max. length of the policy. It is currently used * to allocated Netlink buffers roughly the size of the actual * message. * * Returns 0 on success or a negative error code. */ int nla_policy_len(const struct nla_policy *p, int n) { int i, len = 0; for (i = 0; i < n; i++, p++) { if (p->len) len += nla_total_size(p->len); else if (nla_attr_len[p->type]) len += nla_total_size(nla_attr_len[p->type]); else if (nla_attr_minlen[p->type]) len += nla_total_size(nla_attr_minlen[p->type]); } return len; } EXPORT_SYMBOL(nla_policy_len); /** * __nla_parse - Parse a stream of attributes into a tb buffer * @tb: destination array with maxtype+1 elements * @maxtype: maximum attribute type to be expected * @head: head of attribute stream * @len: length of attribute stream * @policy: validation policy * @validate: validation strictness * @extack: extended ACK pointer * * Parses a stream of attributes and stores a pointer to each attribute in * the tb array accessible via the attribute type. * Validation is controlled by the @validate parameter. * * Returns 0 on success or a negative error code. */ int __nla_parse(struct nlattr **tb, int maxtype, const struct nlattr *head, int len, const struct nla_policy *policy, unsigned int validate, struct netlink_ext_ack *extack) { return __nla_validate_parse(head, len, maxtype, policy, validate, extack, tb, 0); } EXPORT_SYMBOL(__nla_parse); /** * nla_find - Find a specific attribute in a stream of attributes * @head: head of attribute stream * @len: length of attribute stream * @attrtype: type of attribute to look for * * Returns the first attribute in the stream matching the specified type. */ struct nlattr *nla_find(const struct nlattr *head, int len, int attrtype) { const struct nlattr *nla; int rem; nla_for_each_attr(nla, head, len, rem) if (nla_type(nla) == attrtype) return (struct nlattr *)nla; return NULL; } EXPORT_SYMBOL(nla_find); /** * nla_strscpy - Copy string attribute payload into a sized buffer * @dst: Where to copy the string to. * @nla: Attribute to copy the string from. * @dstsize: Size of destination buffer. * * Copies at most dstsize - 1 bytes into the destination buffer. * Unlike strscpy() the destination buffer is always padded out. * * Return: * * srclen - Returns @nla length (not including the trailing %NUL). * * -E2BIG - If @dstsize is 0 or greater than U16_MAX or @nla length greater * than @dstsize. */ ssize_t nla_strscpy(char *dst, const struct nlattr *nla, size_t dstsize) { size_t srclen = nla_len(nla); char *src = nla_data(nla); ssize_t ret; size_t len; if (dstsize == 0 || WARN_ON_ONCE(dstsize > U16_MAX)) return -E2BIG; if (srclen > 0 && src[srclen - 1] == '\0') srclen--; if (srclen >= dstsize) { len = dstsize - 1; ret = -E2BIG; } else { len = srclen; ret = len; } memcpy(dst, src, len); /* Zero pad end of dst. */ memset(dst + len, 0, dstsize - len); return ret; } EXPORT_SYMBOL(nla_strscpy); /** * nla_strdup - Copy string attribute payload into a newly allocated buffer * @nla: attribute to copy the string from * @flags: the type of memory to allocate (see kmalloc). * * Returns a pointer to the allocated buffer or NULL on error. */ char *nla_strdup(const struct nlattr *nla, gfp_t flags) { size_t srclen = nla_len(nla); char *src = nla_data(nla), *dst; if (srclen > 0 && src[srclen - 1] == '\0') srclen--; dst = kmalloc(srclen + 1, flags); if (dst != NULL) { memcpy(dst, src, srclen); dst[srclen] = '\0'; } return dst; } EXPORT_SYMBOL(nla_strdup); /** * nla_memcpy - Copy a netlink attribute into another memory area * @dest: where to copy to memcpy * @src: netlink attribute to copy from * @count: size of the destination area * * Note: The number of bytes copied is limited by the length of * attribute's payload. memcpy * * Returns the number of bytes copied. */ int nla_memcpy(void *dest, const struct nlattr *src, int count) { int minlen = min_t(int, count, nla_len(src)); memcpy(dest, nla_data(src), minlen); if (count > minlen) memset(dest + minlen, 0, count - minlen); return minlen; } EXPORT_SYMBOL(nla_memcpy); /** * nla_memcmp - Compare an attribute with sized memory area * @nla: netlink attribute * @data: memory area * @size: size of memory area */ int nla_memcmp(const struct nlattr *nla, const void *data, size_t size) { int d = nla_len(nla) - size; if (d == 0) d = memcmp(nla_data(nla), data, size); return d; } EXPORT_SYMBOL(nla_memcmp); /** * nla_strcmp - Compare a string attribute against a string * @nla: netlink string attribute * @str: another string */ int nla_strcmp(const struct nlattr *nla, const char *str) { int len = strlen(str); char *buf = nla_data(nla); int attrlen = nla_len(nla); int d; while (attrlen > 0 && buf[attrlen - 1] == '\0') attrlen--; d = attrlen - len; if (d == 0) d = memcmp(nla_data(nla), str, len); return d; } EXPORT_SYMBOL(nla_strcmp); #ifdef CONFIG_NET /** * __nla_reserve - reserve room for attribute on the skb * @skb: socket buffer to reserve room on * @attrtype: attribute type * @attrlen: length of attribute payload * * Adds a netlink attribute header to a socket buffer and reserves * room for the payload but does not copy it. * * The caller is responsible to ensure that the skb provides enough * tailroom for the attribute header and payload. */ struct nlattr *__nla_reserve(struct sk_buff *skb, int attrtype, int attrlen) { struct nlattr *nla; nla = skb_put(skb, nla_total_size(attrlen)); nla->nla_type = attrtype; nla->nla_len = nla_attr_size(attrlen); memset((unsigned char *) nla + nla->nla_len, 0, nla_padlen(attrlen)); return nla; } EXPORT_SYMBOL(__nla_reserve); /** * __nla_reserve_64bit - reserve room for attribute on the skb and align it * @skb: socket buffer to reserve room on * @attrtype: attribute type * @attrlen: length of attribute payload * @padattr: attribute type for the padding * * Adds a netlink attribute header to a socket buffer and reserves * room for the payload but does not copy it. It also ensure that this * attribute will have a 64-bit aligned nla_data() area. * * The caller is responsible to ensure that the skb provides enough * tailroom for the attribute header and payload. */ struct nlattr *__nla_reserve_64bit(struct sk_buff *skb, int attrtype, int attrlen, int padattr) { nla_align_64bit(skb, padattr); return __nla_reserve(skb, attrtype, attrlen); } EXPORT_SYMBOL(__nla_reserve_64bit); /** * __nla_reserve_nohdr - reserve room for attribute without header * @skb: socket buffer to reserve room on * @attrlen: length of attribute payload * * Reserves room for attribute payload without a header. * * The caller is responsible to ensure that the skb provides enough * tailroom for the payload. */ void *__nla_reserve_nohdr(struct sk_buff *skb, int attrlen) { return skb_put_zero(skb, NLA_ALIGN(attrlen)); } EXPORT_SYMBOL(__nla_reserve_nohdr); /** * nla_reserve - reserve room for attribute on the skb * @skb: socket buffer to reserve room on * @attrtype: attribute type * @attrlen: length of attribute payload * * Adds a netlink attribute header to a socket buffer and reserves * room for the payload but does not copy it. * * Returns NULL if the tailroom of the skb is insufficient to store * the attribute header and payload. */ struct nlattr *nla_reserve(struct sk_buff *skb, int attrtype, int attrlen) { if (unlikely(skb_tailroom(skb) < nla_total_size(attrlen))) return NULL; return __nla_reserve(skb, attrtype, attrlen); } EXPORT_SYMBOL(nla_reserve); /** * nla_reserve_64bit - reserve room for attribute on the skb and align it * @skb: socket buffer to reserve room on * @attrtype: attribute type * @attrlen: length of attribute payload * @padattr: attribute type for the padding * * Adds a netlink attribute header to a socket buffer and reserves * room for the payload but does not copy it. It also ensure that this * attribute will have a 64-bit aligned nla_data() area. * * Returns NULL if the tailroom of the skb is insufficient to store * the attribute header and payload. */ struct nlattr *nla_reserve_64bit(struct sk_buff *skb, int attrtype, int attrlen, int padattr) { size_t len; if (nla_need_padding_for_64bit(skb)) len = nla_total_size_64bit(attrlen); else len = nla_total_size(attrlen); if (unlikely(skb_tailroom(skb) < len)) return NULL; return __nla_reserve_64bit(skb, attrtype, attrlen, padattr); } EXPORT_SYMBOL(nla_reserve_64bit); /** * nla_reserve_nohdr - reserve room for attribute without header * @skb: socket buffer to reserve room on * @attrlen: length of attribute payload * * Reserves room for attribute payload without a header. * * Returns NULL if the tailroom of the skb is insufficient to store * the attribute payload. */ void *nla_reserve_nohdr(struct sk_buff *skb, int attrlen) { if (unlikely(skb_tailroom(skb) < NLA_ALIGN(attrlen))) return NULL; return __nla_reserve_nohdr(skb, attrlen); } EXPORT_SYMBOL(nla_reserve_nohdr); /** * __nla_put - Add a netlink attribute to a socket buffer * @skb: socket buffer to add attribute to * @attrtype: attribute type * @attrlen: length of attribute payload * @data: head of attribute payload * * The caller is responsible to ensure that the skb provides enough * tailroom for the attribute header and payload. */ void __nla_put(struct sk_buff *skb, int attrtype, int attrlen, const void *data) { struct nlattr *nla; nla = __nla_reserve(skb, attrtype, attrlen); memcpy(nla_data(nla), data, attrlen); } EXPORT_SYMBOL(__nla_put); /** * __nla_put_64bit - Add a netlink attribute to a socket buffer and align it * @skb: socket buffer to add attribute to * @attrtype: attribute type * @attrlen: length of attribute payload * @data: head of attribute payload * @padattr: attribute type for the padding * * The caller is responsible to ensure that the skb provides enough * tailroom for the attribute header and payload. */ void __nla_put_64bit(struct sk_buff *skb, int attrtype, int attrlen, const void *data, int padattr) { struct nlattr *nla; nla = __nla_reserve_64bit(skb, attrtype, attrlen, padattr); memcpy(nla_data(nla), data, attrlen); } EXPORT_SYMBOL(__nla_put_64bit); /** * __nla_put_nohdr - Add a netlink attribute without header * @skb: socket buffer to add attribute to * @attrlen: length of attribute payload * @data: head of attribute payload * * The caller is responsible to ensure that the skb provides enough * tailroom for the attribute payload. */ void __nla_put_nohdr(struct sk_buff *skb, int attrlen, const void *data) { void *start; start = __nla_reserve_nohdr(skb, attrlen); memcpy(start, data, attrlen); } EXPORT_SYMBOL(__nla_put_nohdr); /** * nla_put - Add a netlink attribute to a socket buffer * @skb: socket buffer to add attribute to * @attrtype: attribute type * @attrlen: length of attribute payload * @data: head of attribute payload * * Returns -EMSGSIZE if the tailroom of the skb is insufficient to store * the attribute header and payload. */ int nla_put(struct sk_buff *skb, int attrtype, int attrlen, const void *data) { if (unlikely(skb_tailroom(skb) < nla_total_size(attrlen))) return -EMSGSIZE; __nla_put(skb, attrtype, attrlen, data); return 0; } EXPORT_SYMBOL(nla_put); /** * nla_put_64bit - Add a netlink attribute to a socket buffer and align it * @skb: socket buffer to add attribute to * @attrtype: attribute type * @attrlen: length of attribute payload * @data: head of attribute payload * @padattr: attribute type for the padding * * Returns -EMSGSIZE if the tailroom of the skb is insufficient to store * the attribute header and payload. */ int nla_put_64bit(struct sk_buff *skb, int attrtype, int attrlen, const void *data, int padattr) { size_t len; if (nla_need_padding_for_64bit(skb)) len = nla_total_size_64bit(attrlen); else len = nla_total_size(attrlen); if (unlikely(skb_tailroom(skb) < len)) return -EMSGSIZE; __nla_put_64bit(skb, attrtype, attrlen, data, padattr); return 0; } EXPORT_SYMBOL(nla_put_64bit); /** * nla_put_nohdr - Add a netlink attribute without header * @skb: socket buffer to add attribute to * @attrlen: length of attribute payload * @data: head of attribute payload * * Returns -EMSGSIZE if the tailroom of the skb is insufficient to store * the attribute payload. */ int nla_put_nohdr(struct sk_buff *skb, int attrlen, const void *data) { if (unlikely(skb_tailroom(skb) < NLA_ALIGN(attrlen))) return -EMSGSIZE; __nla_put_nohdr(skb, attrlen, data); return 0; } EXPORT_SYMBOL(nla_put_nohdr); /** * nla_append - Add a netlink attribute without header or padding * @skb: socket buffer to add attribute to * @attrlen: length of attribute payload * @data: head of attribute payload * * Returns -EMSGSIZE if the tailroom of the skb is insufficient to store * the attribute payload. */ int nla_append(struct sk_buff *skb, int attrlen, const void *data) { if (unlikely(skb_tailroom(skb) < NLA_ALIGN(attrlen))) return -EMSGSIZE; skb_put_data(skb, data, attrlen); return 0; } EXPORT_SYMBOL(nla_append); #endif |
| 81 81 58 46 47 11 9 2 4 1 54 56 56 56 6 6 6 1 27 27 5 5 5 22 22 23 8 1 1 1 6 6 5 5 5 3 2 5 2 3 2 1 3 4 4 4 3 1 2 1 15 13 6 4 3 1 1 13 13 2 9 2 1 4 1 1 7 7 1 3 5 6 31 31 1 2 14 14 8 6 5 9 8 6 16 2 2 27 21 6 27 8 7 1 1 1 1 5 2 3 1 24 3 21 22 2 2 2 22 20 1 28 28 5 1 1 22 6 18 1 8 23 24 23 1 24 2 24 1 1 25 7 2 5 4 16 25 2 1 24 21 22 2 1 24 24 2 3 12 12 1 1 2 2 8 1 9 1 1 1 1 1 5 1 7 1 1 6 11 3 3 2 1 19 19 19 3 16 10 10 10 6 9 1 7 10 7 10 10 5 10 10 7 10 3 5 5 3 61 61 12 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 | // SPDX-License-Identifier: GPL-2.0-or-later /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * "Ping" sockets * * Based on ipv4/udp.c code. * * Authors: Vasiliy Kulikov / Openwall (for Linux 2.6), * Pavel Kankovsky (for Linux 2.4.32) * * Pavel gave all rights to bugs to Vasiliy, * none of the bugs are Pavel's now. */ #include <linux/uaccess.h> #include <linux/types.h> #include <linux/fcntl.h> #include <linux/socket.h> #include <linux/sockios.h> #include <linux/in.h> #include <linux/errno.h> #include <linux/timer.h> #include <linux/mm.h> #include <linux/inet.h> #include <linux/netdevice.h> #include <net/snmp.h> #include <net/ip.h> #include <net/icmp.h> #include <net/protocol.h> #include <linux/skbuff.h> #include <linux/proc_fs.h> #include <linux/export.h> #include <linux/bpf-cgroup.h> #include <net/sock.h> #include <net/ping.h> #include <net/udp.h> #include <net/route.h> #include <net/inet_common.h> #include <net/checksum.h> #if IS_ENABLED(CONFIG_IPV6) #include <linux/in6.h> #include <linux/icmpv6.h> #include <net/addrconf.h> #include <net/ipv6.h> #include <net/transp_v6.h> #endif struct ping_table { struct hlist_head hash[PING_HTABLE_SIZE]; spinlock_t lock; }; static struct ping_table ping_table; struct pingv6_ops pingv6_ops; EXPORT_IPV6_MOD_GPL(pingv6_ops); static inline u32 ping_hashfn(const struct net *net, u32 num, u32 mask) { u32 res = (num + net_hash_mix(net)) & mask; pr_debug("hash(%u) = %u\n", num, res); return res; } static inline struct hlist_head *ping_hashslot(struct ping_table *table, struct net *net, unsigned int num) { return &table->hash[ping_hashfn(net, num, PING_HTABLE_MASK)]; } int ping_get_port(struct sock *sk, unsigned short ident) { struct net *net = sock_net(sk); struct inet_sock *isk, *isk2; struct hlist_head *hlist; struct sock *sk2 = NULL; isk = inet_sk(sk); spin_lock(&ping_table.lock); if (ident == 0) { u16 result = net->ipv4.ping_port_rover + 1; u32 i; for (i = 0; i < (1L << 16); i++, result++) { if (!result) continue; /* avoid zero */ hlist = ping_hashslot(&ping_table, net, result); sk_for_each(sk2, hlist) { if (!net_eq(sock_net(sk2), net)) continue; isk2 = inet_sk(sk2); if (isk2->inet_num == result) goto next_port; } /* found */ net->ipv4.ping_port_rover = ident = result; break; next_port: ; } if (i >= (1L << 16)) goto fail; } else { hlist = ping_hashslot(&ping_table, net, ident); sk_for_each(sk2, hlist) { if (!net_eq(sock_net(sk2), net)) continue; isk2 = inet_sk(sk2); /* BUG? Why is this reuse and not reuseaddr? ping.c * doesn't turn off SO_REUSEADDR, and it doesn't expect * that other ping processes can steal its packets. */ if ((isk2->inet_num == ident) && (sk2 != sk) && (!sk2->sk_reuse || !sk->sk_reuse)) goto fail; } } pr_debug("found port/ident = %d\n", ident); isk->inet_num = ident; if (sk_unhashed(sk)) { pr_debug("was not hashed\n"); sk_add_node_rcu(sk, hlist); sock_set_flag(sk, SOCK_RCU_FREE); sock_prot_inuse_add(net, sk->sk_prot, 1); } spin_unlock(&ping_table.lock); return 0; fail: spin_unlock(&ping_table.lock); return -EADDRINUSE; } EXPORT_IPV6_MOD_GPL(ping_get_port); void ping_unhash(struct sock *sk) { struct inet_sock *isk = inet_sk(sk); pr_debug("ping_unhash(isk=%p,isk->num=%u)\n", isk, isk->inet_num); spin_lock(&ping_table.lock); if (sk_del_node_init_rcu(sk)) { isk->inet_num = 0; isk->inet_sport = 0; sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); } spin_unlock(&ping_table.lock); } EXPORT_IPV6_MOD_GPL(ping_unhash); /* Called under rcu_read_lock() */ static struct sock *ping_lookup(struct net *net, struct sk_buff *skb, u16 ident) { struct hlist_head *hslot = ping_hashslot(&ping_table, net, ident); struct sock *sk = NULL; struct inet_sock *isk; int dif, sdif; if (skb->protocol == htons(ETH_P_IP)) { dif = inet_iif(skb); sdif = inet_sdif(skb); pr_debug("try to find: num = %d, daddr = %pI4, dif = %d\n", (int)ident, &ip_hdr(skb)->daddr, dif); #if IS_ENABLED(CONFIG_IPV6) } else if (skb->protocol == htons(ETH_P_IPV6)) { dif = inet6_iif(skb); sdif = inet6_sdif(skb); pr_debug("try to find: num = %d, daddr = %pI6c, dif = %d\n", (int)ident, &ipv6_hdr(skb)->daddr, dif); #endif } else { return NULL; } sk_for_each_rcu(sk, hslot) { if (!net_eq(sock_net(sk), net)) continue; isk = inet_sk(sk); pr_debug("iterate\n"); if (isk->inet_num != ident) continue; if (skb->protocol == htons(ETH_P_IP) && sk->sk_family == AF_INET) { pr_debug("found: %p: num=%d, daddr=%pI4, dif=%d\n", sk, (int) isk->inet_num, &isk->inet_rcv_saddr, sk->sk_bound_dev_if); if (isk->inet_rcv_saddr && isk->inet_rcv_saddr != ip_hdr(skb)->daddr) continue; #if IS_ENABLED(CONFIG_IPV6) } else if (skb->protocol == htons(ETH_P_IPV6) && sk->sk_family == AF_INET6) { pr_debug("found: %p: num=%d, daddr=%pI6c, dif=%d\n", sk, (int) isk->inet_num, &sk->sk_v6_rcv_saddr, sk->sk_bound_dev_if); if (!ipv6_addr_any(&sk->sk_v6_rcv_saddr) && !ipv6_addr_equal(&sk->sk_v6_rcv_saddr, &ipv6_hdr(skb)->daddr)) continue; #endif } else { continue; } if (sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif && sk->sk_bound_dev_if != sdif) continue; goto exit; } sk = NULL; exit: return sk; } static void inet_get_ping_group_range_net(struct net *net, kgid_t *low, kgid_t *high) { kgid_t *data = net->ipv4.ping_group_range.range; unsigned int seq; do { seq = read_seqbegin(&net->ipv4.ping_group_range.lock); *low = data[0]; *high = data[1]; } while (read_seqretry(&net->ipv4.ping_group_range.lock, seq)); } int ping_init_sock(struct sock *sk) { struct net *net = sock_net(sk); kgid_t group = current_egid(); struct group_info *group_info; int i; kgid_t low, high; int ret = 0; if (sk->sk_family == AF_INET6) sk->sk_ipv6only = 1; inet_get_ping_group_range_net(net, &low, &high); if (gid_lte(low, group) && gid_lte(group, high)) return 0; group_info = get_current_groups(); for (i = 0; i < group_info->ngroups; i++) { kgid_t gid = group_info->gid[i]; if (gid_lte(low, gid) && gid_lte(gid, high)) goto out_release_group; } ret = -EACCES; out_release_group: put_group_info(group_info); return ret; } EXPORT_IPV6_MOD_GPL(ping_init_sock); void ping_close(struct sock *sk, long timeout) { pr_debug("ping_close(sk=%p,sk->num=%u)\n", inet_sk(sk), inet_sk(sk)->inet_num); pr_debug("isk->refcnt = %d\n", refcount_read(&sk->sk_refcnt)); sk_common_release(sk); } EXPORT_IPV6_MOD_GPL(ping_close); static int ping_pre_connect(struct sock *sk, struct sockaddr_unsized *uaddr, int addr_len) { /* This check is replicated from __ip4_datagram_connect() and * intended to prevent BPF program called below from accessing bytes * that are out of the bound specified by user in addr_len. */ if (addr_len < sizeof(struct sockaddr_in)) return -EINVAL; return BPF_CGROUP_RUN_PROG_INET4_CONNECT_LOCK(sk, uaddr, &addr_len); } /* Checks the bind address and possibly modifies sk->sk_bound_dev_if. */ static int ping_check_bind_addr(struct sock *sk, struct inet_sock *isk, struct sockaddr_unsized *uaddr, int addr_len) { struct net *net = sock_net(sk); if (sk->sk_family == AF_INET) { struct sockaddr_in *addr = (struct sockaddr_in *) uaddr; u32 tb_id = RT_TABLE_LOCAL; int chk_addr_ret; if (addr_len < sizeof(*addr)) return -EINVAL; if (addr->sin_family != AF_INET && !(addr->sin_family == AF_UNSPEC && addr->sin_addr.s_addr == htonl(INADDR_ANY))) return -EAFNOSUPPORT; pr_debug("ping_check_bind_addr(sk=%p,addr=%pI4,port=%d)\n", sk, &addr->sin_addr.s_addr, ntohs(addr->sin_port)); if (addr->sin_addr.s_addr == htonl(INADDR_ANY)) return 0; tb_id = l3mdev_fib_table_by_index(net, sk->sk_bound_dev_if) ? : tb_id; chk_addr_ret = inet_addr_type_table(net, addr->sin_addr.s_addr, tb_id); if (chk_addr_ret == RTN_MULTICAST || chk_addr_ret == RTN_BROADCAST || (chk_addr_ret != RTN_LOCAL && !inet_can_nonlocal_bind(net, isk))) return -EADDRNOTAVAIL; #if IS_ENABLED(CONFIG_IPV6) } else if (sk->sk_family == AF_INET6) { struct sockaddr_in6 *addr = (struct sockaddr_in6 *) uaddr; int addr_type, scoped, has_addr; struct net_device *dev = NULL; if (addr_len < sizeof(*addr)) return -EINVAL; if (addr->sin6_family != AF_INET6) return -EAFNOSUPPORT; pr_debug("ping_check_bind_addr(sk=%p,addr=%pI6c,port=%d)\n", sk, addr->sin6_addr.s6_addr, ntohs(addr->sin6_port)); addr_type = ipv6_addr_type(&addr->sin6_addr); scoped = __ipv6_addr_needs_scope_id(addr_type); if ((addr_type != IPV6_ADDR_ANY && !(addr_type & IPV6_ADDR_UNICAST)) || (scoped && !addr->sin6_scope_id)) return -EINVAL; rcu_read_lock(); if (addr->sin6_scope_id) { dev = dev_get_by_index_rcu(net, addr->sin6_scope_id); if (!dev) { rcu_read_unlock(); return -ENODEV; } } if (!dev && sk->sk_bound_dev_if) { dev = dev_get_by_index_rcu(net, sk->sk_bound_dev_if); if (!dev) { rcu_read_unlock(); return -ENODEV; } } has_addr = pingv6_ops.ipv6_chk_addr(net, &addr->sin6_addr, dev, scoped); rcu_read_unlock(); if (!(ipv6_can_nonlocal_bind(net, isk) || has_addr || addr_type == IPV6_ADDR_ANY)) return -EADDRNOTAVAIL; if (scoped) sk->sk_bound_dev_if = addr->sin6_scope_id; #endif } else { return -EAFNOSUPPORT; } return 0; } static void ping_set_saddr(struct sock *sk, struct sockaddr_unsized *saddr) { if (saddr->sa_family == AF_INET) { struct inet_sock *isk = inet_sk(sk); struct sockaddr_in *addr = (struct sockaddr_in *) saddr; isk->inet_rcv_saddr = isk->inet_saddr = addr->sin_addr.s_addr; #if IS_ENABLED(CONFIG_IPV6) } else if (saddr->sa_family == AF_INET6) { struct sockaddr_in6 *addr = (struct sockaddr_in6 *) saddr; struct ipv6_pinfo *np = inet6_sk(sk); sk->sk_v6_rcv_saddr = np->saddr = addr->sin6_addr; #endif } } /* * We need our own bind because there are no privileged id's == local ports. * Moreover, we don't allow binding to multi- and broadcast addresses. */ int ping_bind(struct sock *sk, struct sockaddr_unsized *uaddr, int addr_len) { struct inet_sock *isk = inet_sk(sk); unsigned short snum; int err; int dif = sk->sk_bound_dev_if; err = ping_check_bind_addr(sk, isk, uaddr, addr_len); if (err) return err; lock_sock(sk); err = -EINVAL; if (isk->inet_num != 0) goto out; err = -EADDRINUSE; snum = ntohs(((struct sockaddr_in *)uaddr)->sin_port); if (ping_get_port(sk, snum) != 0) { /* Restore possibly modified sk->sk_bound_dev_if by ping_check_bind_addr(). */ sk->sk_bound_dev_if = dif; goto out; } ping_set_saddr(sk, uaddr); pr_debug("after bind(): num = %hu, dif = %d\n", isk->inet_num, sk->sk_bound_dev_if); err = 0; if (sk->sk_family == AF_INET && isk->inet_rcv_saddr) sk->sk_userlocks |= SOCK_BINDADDR_LOCK; #if IS_ENABLED(CONFIG_IPV6) if (sk->sk_family == AF_INET6 && !ipv6_addr_any(&sk->sk_v6_rcv_saddr)) sk->sk_userlocks |= SOCK_BINDADDR_LOCK; #endif if (snum) sk->sk_userlocks |= SOCK_BINDPORT_LOCK; isk->inet_sport = htons(isk->inet_num); isk->inet_daddr = 0; isk->inet_dport = 0; #if IS_ENABLED(CONFIG_IPV6) if (sk->sk_family == AF_INET6) memset(&sk->sk_v6_daddr, 0, sizeof(sk->sk_v6_daddr)); #endif sk_dst_reset(sk); out: release_sock(sk); pr_debug("ping_v4_bind -> %d\n", err); return err; } EXPORT_IPV6_MOD_GPL(ping_bind); /* * Is this a supported type of ICMP message? */ static inline int ping_supported(int family, int type, int code) { return (family == AF_INET && type == ICMP_ECHO && code == 0) || (family == AF_INET && type == ICMP_EXT_ECHO && code == 0) || (family == AF_INET6 && type == ICMPV6_ECHO_REQUEST && code == 0) || (family == AF_INET6 && type == ICMPV6_EXT_ECHO_REQUEST && code == 0); } /* * This routine is called by the ICMP module when it gets some * sort of error condition. */ void ping_err(struct sk_buff *skb, int offset, u32 info) { int family; struct icmphdr *icmph; struct inet_sock *inet_sock; int type; int code; struct net *net = dev_net(skb->dev); struct sock *sk; int harderr; int err; if (skb->protocol == htons(ETH_P_IP)) { family = AF_INET; type = icmp_hdr(skb)->type; code = icmp_hdr(skb)->code; icmph = (struct icmphdr *)(skb->data + offset); } else if (skb->protocol == htons(ETH_P_IPV6)) { family = AF_INET6; type = icmp6_hdr(skb)->icmp6_type; code = icmp6_hdr(skb)->icmp6_code; icmph = (struct icmphdr *) (skb->data + offset); } else { BUG(); } /* We assume the packet has already been checked by icmp_unreach */ if (!ping_supported(family, icmph->type, icmph->code)) return; pr_debug("ping_err(proto=0x%x,type=%d,code=%d,id=%04x,seq=%04x)\n", skb->protocol, type, code, ntohs(icmph->un.echo.id), ntohs(icmph->un.echo.sequence)); sk = ping_lookup(net, skb, ntohs(icmph->un.echo.id)); if (!sk) { pr_debug("no socket, dropping\n"); return; /* No socket for error */ } pr_debug("err on socket %p\n", sk); err = 0; harderr = 0; inet_sock = inet_sk(sk); if (skb->protocol == htons(ETH_P_IP)) { switch (type) { default: case ICMP_TIME_EXCEEDED: err = EHOSTUNREACH; break; case ICMP_SOURCE_QUENCH: /* This is not a real error but ping wants to see it. * Report it with some fake errno. */ err = EREMOTEIO; break; case ICMP_PARAMETERPROB: err = EPROTO; harderr = 1; break; case ICMP_DEST_UNREACH: if (code == ICMP_FRAG_NEEDED) { /* Path MTU discovery */ ipv4_sk_update_pmtu(skb, sk, info); if (READ_ONCE(inet_sock->pmtudisc) != IP_PMTUDISC_DONT) { err = EMSGSIZE; harderr = 1; break; } goto out; } err = EHOSTUNREACH; if (code <= NR_ICMP_UNREACH) { harderr = icmp_err_convert[code].fatal; err = icmp_err_convert[code].errno; } break; case ICMP_REDIRECT: /* See ICMP_SOURCE_QUENCH */ ipv4_sk_redirect(skb, sk); err = EREMOTEIO; break; } #if IS_ENABLED(CONFIG_IPV6) } else if (skb->protocol == htons(ETH_P_IPV6)) { harderr = pingv6_ops.icmpv6_err_convert(type, code, &err); #endif } /* * RFC1122: OK. Passes ICMP errors back to application, as per * 4.1.3.3. */ if ((family == AF_INET && !inet_test_bit(RECVERR, sk)) || (family == AF_INET6 && !inet6_test_bit(RECVERR6, sk))) { if (!harderr || sk->sk_state != TCP_ESTABLISHED) goto out; } else { if (family == AF_INET) { ip_icmp_error(sk, skb, err, 0 /* no remote port */, info, (u8 *)icmph); #if IS_ENABLED(CONFIG_IPV6) } else if (family == AF_INET6) { pingv6_ops.ipv6_icmp_error(sk, skb, err, 0, info, (u8 *)icmph); #endif } } sk->sk_err = err; sk_error_report(sk); out: return; } EXPORT_IPV6_MOD_GPL(ping_err); /* * Copy and checksum an ICMP Echo packet from user space into a buffer * starting from the payload. */ int ping_getfrag(void *from, char *to, int offset, int fraglen, int odd, struct sk_buff *skb) { struct pingfakehdr *pfh = from; if (!csum_and_copy_from_iter_full(to, fraglen, &pfh->wcheck, &pfh->msg->msg_iter)) return -EFAULT; #if IS_ENABLED(CONFIG_IPV6) /* For IPv6, checksum each skb as we go along, as expected by * icmpv6_push_pending_frames. For IPv4, accumulate the checksum in * wcheck, it will be finalized in ping_v4_push_pending_frames. */ if (pfh->family == AF_INET6) { skb->csum = csum_block_add(skb->csum, pfh->wcheck, odd); skb->ip_summed = CHECKSUM_NONE; pfh->wcheck = 0; } #endif return 0; } EXPORT_IPV6_MOD_GPL(ping_getfrag); static int ping_v4_push_pending_frames(struct sock *sk, struct pingfakehdr *pfh, struct flowi4 *fl4) { struct sk_buff *skb = skb_peek(&sk->sk_write_queue); if (!skb) return 0; pfh->wcheck = csum_partial((char *)&pfh->icmph, sizeof(struct icmphdr), pfh->wcheck); pfh->icmph.checksum = csum_fold(pfh->wcheck); memcpy(icmp_hdr(skb), &pfh->icmph, sizeof(struct icmphdr)); skb->ip_summed = CHECKSUM_NONE; return ip_push_pending_frames(sk, fl4); } int ping_common_sendmsg(int family, struct msghdr *msg, size_t len, void *user_icmph, size_t icmph_len) { u8 type, code; if (len > 0xFFFF) return -EMSGSIZE; /* Must have at least a full ICMP header. */ if (len < icmph_len) return -EINVAL; /* * Check the flags. */ /* Mirror BSD error message compatibility */ if (msg->msg_flags & MSG_OOB) return -EOPNOTSUPP; /* * Fetch the ICMP header provided by the userland. * iovec is modified! The ICMP header is consumed. */ if (memcpy_from_msg(user_icmph, msg, icmph_len)) return -EFAULT; if (family == AF_INET) { type = ((struct icmphdr *) user_icmph)->type; code = ((struct icmphdr *) user_icmph)->code; #if IS_ENABLED(CONFIG_IPV6) } else if (family == AF_INET6) { type = ((struct icmp6hdr *) user_icmph)->icmp6_type; code = ((struct icmp6hdr *) user_icmph)->icmp6_code; #endif } else { BUG(); } if (!ping_supported(family, type, code)) return -EINVAL; return 0; } EXPORT_IPV6_MOD_GPL(ping_common_sendmsg); static int ping_v4_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) { DEFINE_RAW_FLEX(struct ip_options_rcu, opt_copy, opt.__data, IP_OPTIONS_DATA_FIXED_SIZE); struct net *net = sock_net(sk); struct flowi4 fl4; struct inet_sock *inet = inet_sk(sk); struct ipcm_cookie ipc; struct icmphdr user_icmph; struct pingfakehdr pfh; struct rtable *rt = NULL; int free = 0; __be32 saddr, daddr, faddr; u8 scope; int err; pr_debug("ping_v4_sendmsg(sk=%p,sk->num=%u)\n", inet, inet->inet_num); err = ping_common_sendmsg(AF_INET, msg, len, &user_icmph, sizeof(user_icmph)); if (err) return err; /* * Get and verify the address. */ if (msg->msg_name) { DECLARE_SOCKADDR(struct sockaddr_in *, usin, msg->msg_name); if (msg->msg_namelen < sizeof(*usin)) return -EINVAL; if (usin->sin_family != AF_INET) return -EAFNOSUPPORT; daddr = usin->sin_addr.s_addr; /* no remote port */ } else { if (sk->sk_state != TCP_ESTABLISHED) return -EDESTADDRREQ; daddr = inet->inet_daddr; /* no remote port */ } ipcm_init_sk(&ipc, inet); if (msg->msg_controllen) { err = ip_cmsg_send(sk, msg, &ipc, false); if (unlikely(err)) { kfree(ipc.opt); return err; } if (ipc.opt) free = 1; } if (!ipc.opt) { struct ip_options_rcu *inet_opt; rcu_read_lock(); inet_opt = rcu_dereference(inet->inet_opt); if (inet_opt) { memcpy(opt_copy, inet_opt, sizeof(*inet_opt) + inet_opt->opt.optlen); ipc.opt = opt_copy; } rcu_read_unlock(); } saddr = ipc.addr; ipc.addr = faddr = daddr; if (ipc.opt && ipc.opt->opt.srr) { if (!daddr) { err = -EINVAL; goto out_free; } faddr = ipc.opt->opt.faddr; } scope = ip_sendmsg_scope(inet, &ipc, msg); if (ipv4_is_multicast(daddr)) { if (!ipc.oif || netif_index_is_l3_master(sock_net(sk), ipc.oif)) ipc.oif = READ_ONCE(inet->mc_index); if (!saddr) saddr = READ_ONCE(inet->mc_addr); } else if (!ipc.oif) ipc.oif = READ_ONCE(inet->uc_index); flowi4_init_output(&fl4, ipc.oif, ipc.sockc.mark, ipc.tos & INET_DSCP_MASK, scope, sk->sk_protocol, inet_sk_flowi_flags(sk), faddr, saddr, 0, 0, sk_uid(sk)); fl4.fl4_icmp_type = user_icmph.type; fl4.fl4_icmp_code = user_icmph.code; security_sk_classify_flow(sk, flowi4_to_flowi_common(&fl4)); rt = ip_route_output_flow(net, &fl4, sk); if (IS_ERR(rt)) { err = PTR_ERR(rt); rt = NULL; if (err == -ENETUNREACH) IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES); goto out; } err = -EACCES; if ((rt->rt_flags & RTCF_BROADCAST) && !sock_flag(sk, SOCK_BROADCAST)) goto out; if (msg->msg_flags & MSG_CONFIRM) goto do_confirm; back_from_confirm: if (!ipc.addr) ipc.addr = fl4.daddr; lock_sock(sk); pfh.icmph.type = user_icmph.type; /* already checked */ pfh.icmph.code = user_icmph.code; /* ditto */ pfh.icmph.checksum = 0; pfh.icmph.un.echo.id = inet->inet_sport; pfh.icmph.un.echo.sequence = user_icmph.un.echo.sequence; pfh.msg = msg; pfh.wcheck = 0; pfh.family = AF_INET; err = ip_append_data(sk, &fl4, ping_getfrag, &pfh, len, sizeof(struct icmphdr), &ipc, &rt, msg->msg_flags); if (err) ip_flush_pending_frames(sk); else err = ping_v4_push_pending_frames(sk, &pfh, &fl4); release_sock(sk); out: ip_rt_put(rt); out_free: if (free) kfree(ipc.opt); if (!err) return len; return err; do_confirm: if (msg->msg_flags & MSG_PROBE) dst_confirm_neigh(&rt->dst, &fl4.daddr); if (!(msg->msg_flags & MSG_PROBE) || len) goto back_from_confirm; err = 0; goto out; } int ping_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags, int *addr_len) { struct inet_sock *isk = inet_sk(sk); int family = sk->sk_family; struct sk_buff *skb; int copied, err; pr_debug("ping_recvmsg(sk=%p,sk->num=%u)\n", isk, isk->inet_num); err = -EOPNOTSUPP; if (flags & MSG_OOB) goto out; if (flags & MSG_ERRQUEUE) return inet_recv_error(sk, msg, len, addr_len); skb = skb_recv_datagram(sk, flags, &err); if (!skb) goto out; copied = skb->len; if (copied > len) { msg->msg_flags |= MSG_TRUNC; copied = len; } /* Don't bother checking the checksum */ err = skb_copy_datagram_msg(skb, 0, msg, copied); if (err) goto done; sock_recv_timestamp(msg, sk, skb); /* Copy the address and add cmsg data. */ if (family == AF_INET) { DECLARE_SOCKADDR(struct sockaddr_in *, sin, msg->msg_name); if (sin) { sin->sin_family = AF_INET; sin->sin_port = 0 /* skb->h.uh->source */; sin->sin_addr.s_addr = ip_hdr(skb)->saddr; memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); *addr_len = sizeof(*sin); } if (inet_cmsg_flags(isk)) ip_cmsg_recv(msg, skb); #if IS_ENABLED(CONFIG_IPV6) } else if (family == AF_INET6) { struct ipv6hdr *ip6 = ipv6_hdr(skb); DECLARE_SOCKADDR(struct sockaddr_in6 *, sin6, msg->msg_name); if (sin6) { sin6->sin6_family = AF_INET6; sin6->sin6_port = 0; sin6->sin6_addr = ip6->saddr; sin6->sin6_flowinfo = 0; if (inet6_test_bit(SNDFLOW, sk)) sin6->sin6_flowinfo = ip6_flowinfo(ip6); sin6->sin6_scope_id = ipv6_iface_scope_id(&sin6->sin6_addr, inet6_iif(skb)); *addr_len = sizeof(*sin6); } if (inet6_sk(sk)->rxopt.all) pingv6_ops.ip6_datagram_recv_common_ctl(sk, msg, skb); if (skb->protocol == htons(ETH_P_IPV6) && inet6_sk(sk)->rxopt.all) pingv6_ops.ip6_datagram_recv_specific_ctl(sk, msg, skb); else if (skb->protocol == htons(ETH_P_IP) && inet_cmsg_flags(isk)) ip_cmsg_recv(msg, skb); #endif } else { BUG(); } err = copied; done: skb_free_datagram(sk, skb); out: pr_debug("ping_recvmsg -> %d\n", err); return err; } EXPORT_IPV6_MOD_GPL(ping_recvmsg); static enum skb_drop_reason __ping_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) { enum skb_drop_reason reason; pr_debug("ping_queue_rcv_skb(sk=%p,sk->num=%d,skb=%p)\n", inet_sk(sk), inet_sk(sk)->inet_num, skb); if (sock_queue_rcv_skb_reason(sk, skb, &reason) < 0) { sk_skb_reason_drop(sk, skb, reason); pr_debug("ping_queue_rcv_skb -> failed\n"); return reason; } return SKB_NOT_DROPPED_YET; } int ping_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) { return __ping_queue_rcv_skb(sk, skb) ? -1 : 0; } EXPORT_IPV6_MOD_GPL(ping_queue_rcv_skb); /* * All we need to do is get the socket. */ enum skb_drop_reason ping_rcv(struct sk_buff *skb) { struct net *net = dev_net(skb->dev); struct icmphdr *icmph = icmp_hdr(skb); struct sock *sk; /* We assume the packet has already been checked by icmp_rcv */ pr_debug("ping_rcv(skb=%p,id=%04x,seq=%04x)\n", skb, ntohs(icmph->un.echo.id), ntohs(icmph->un.echo.sequence)); /* Push ICMP header back */ skb_push(skb, skb->data - (u8 *)icmph); sk = ping_lookup(net, skb, ntohs(icmph->un.echo.id)); if (sk) return __ping_queue_rcv_skb(sk, skb); kfree_skb_reason(skb, SKB_DROP_REASON_NO_SOCKET); return SKB_DROP_REASON_NO_SOCKET; } EXPORT_IPV6_MOD_GPL(ping_rcv); struct proto ping_prot = { .name = "PING", .owner = THIS_MODULE, .init = ping_init_sock, .close = ping_close, .pre_connect = ping_pre_connect, .connect = ip4_datagram_connect, .disconnect = __udp_disconnect, .setsockopt = ip_setsockopt, .getsockopt = ip_getsockopt, .sendmsg = ping_v4_sendmsg, .recvmsg = ping_recvmsg, .bind = ping_bind, .backlog_rcv = ping_queue_rcv_skb, .release_cb = ip4_datagram_release_cb, .unhash = ping_unhash, .get_port = ping_get_port, .put_port = ping_unhash, .obj_size = sizeof(struct inet_sock), }; EXPORT_IPV6_MOD(ping_prot); #ifdef CONFIG_PROC_FS static struct sock *ping_get_first(struct seq_file *seq, int start) { struct sock *sk; struct ping_iter_state *state = seq->private; struct net *net = seq_file_net(seq); for (state->bucket = start; state->bucket < PING_HTABLE_SIZE; ++state->bucket) { struct hlist_head *hslot; hslot = &ping_table.hash[state->bucket]; if (hlist_empty(hslot)) continue; sk_for_each(sk, hslot) { if (net_eq(sock_net(sk), net) && sk->sk_family == state->family) goto found; } } sk = NULL; found: return sk; } static struct sock *ping_get_next(struct seq_file *seq, struct sock *sk) { struct ping_iter_state *state = seq->private; struct net *net = seq_file_net(seq); do { sk = sk_next(sk); } while (sk && (!net_eq(sock_net(sk), net))); if (!sk) return ping_get_first(seq, state->bucket + 1); return sk; } static struct sock *ping_get_idx(struct seq_file *seq, loff_t pos) { struct sock *sk = ping_get_first(seq, 0); if (sk) while (pos && (sk = ping_get_next(seq, sk)) != NULL) --pos; return pos ? NULL : sk; } void *ping_seq_start(struct seq_file *seq, loff_t *pos, sa_family_t family) __acquires(ping_table.lock) { struct ping_iter_state *state = seq->private; state->bucket = 0; state->family = family; spin_lock(&ping_table.lock); return *pos ? ping_get_idx(seq, *pos-1) : SEQ_START_TOKEN; } EXPORT_IPV6_MOD_GPL(ping_seq_start); static void *ping_v4_seq_start(struct seq_file *seq, loff_t *pos) { return ping_seq_start(seq, pos, AF_INET); } void *ping_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct sock *sk; if (v == SEQ_START_TOKEN) sk = ping_get_idx(seq, 0); else sk = ping_get_next(seq, v); ++*pos; return sk; } EXPORT_IPV6_MOD_GPL(ping_seq_next); void ping_seq_stop(struct seq_file *seq, void *v) __releases(ping_table.lock) { spin_unlock(&ping_table.lock); } EXPORT_IPV6_MOD_GPL(ping_seq_stop); static void ping_v4_format_sock(struct sock *sp, struct seq_file *f, int bucket) { struct inet_sock *inet = inet_sk(sp); __be32 dest = inet->inet_daddr; __be32 src = inet->inet_rcv_saddr; __u16 destp = ntohs(inet->inet_dport); __u16 srcp = ntohs(inet->inet_sport); seq_printf(f, "%5d: %08X:%04X %08X:%04X" " %02X %08X:%08X %02X:%08lX %08X %5u %8d %lu %d %pK %u", bucket, src, srcp, dest, destp, sp->sk_state, sk_wmem_alloc_get(sp), sk_rmem_alloc_get(sp), 0, 0L, 0, from_kuid_munged(seq_user_ns(f), sk_uid(sp)), 0, sock_i_ino(sp), refcount_read(&sp->sk_refcnt), sp, sk_drops_read(sp)); } static int ping_v4_seq_show(struct seq_file *seq, void *v) { seq_setwidth(seq, 127); if (v == SEQ_START_TOKEN) seq_puts(seq, " sl local_address rem_address st tx_queue " "rx_queue tr tm->when retrnsmt uid timeout " "inode ref pointer drops"); else { struct ping_iter_state *state = seq->private; ping_v4_format_sock(v, seq, state->bucket); } seq_pad(seq, '\n'); return 0; } static const struct seq_operations ping_v4_seq_ops = { .start = ping_v4_seq_start, .show = ping_v4_seq_show, .next = ping_seq_next, .stop = ping_seq_stop, }; static int __net_init ping_v4_proc_init_net(struct net *net) { if (!proc_create_net("icmp", 0444, net->proc_net, &ping_v4_seq_ops, sizeof(struct ping_iter_state))) return -ENOMEM; net->ipv4.ping_port_rover = get_random_u16(); return 0; } static void __net_exit ping_v4_proc_exit_net(struct net *net) { remove_proc_entry("icmp", net->proc_net); } static struct pernet_operations ping_v4_net_ops = { .init = ping_v4_proc_init_net, .exit = ping_v4_proc_exit_net, }; int __init ping_proc_init(void) { return register_pernet_subsys(&ping_v4_net_ops); } void ping_proc_exit(void) { unregister_pernet_subsys(&ping_v4_net_ops); } #endif void __init ping_init(void) { int i; for (i = 0; i < PING_HTABLE_SIZE; i++) INIT_HLIST_HEAD(&ping_table.hash[i]); spin_lock_init(&ping_table.lock); } |
| 98 30 53 125 76 53 38 19 123 78 54 38 19 15 12 9 19 17 6 9 4 19 17 1 1 2 2 12 12 7 7 24 63 18 60 64 4 1 18 5 15 18 47 37 3 1 60 1 39 39 62 1 5 5 65 64 65 65 65 79 50 9 26 3 7 79 63 80 81 54 54 54 69 68 68 69 81 31 27 17 1 2 3 3 2 5 4 8 4 2 6 2 2 2 110 110 81 53 37 28 28 56 10 10 10 73 73 16 16 13 16 68 60 1 1 10 10 10 10 3 7 48 10 11 6 7 7 6 1 1 7 39 38 39 85 85 3 81 84 84 84 85 85 85 85 83 49 85 21 76 77 75 2 77 26 20 20 20 17 3 11 10 19 20 20 20 20 17 17 11 9 11 11 9 23 49 49 40 41 41 30 11 6 37 40 41 41 39 37 6 11 7 28 41 41 41 11 11 61 61 61 49 14 61 41 21 3 3 3 3 3 3 57 13 19 33 1 33 2 10 10 44 42 2 44 2 2 23 11 1 13 12 32 32 2 31 16 14 16 5 14 6 12 16 11 11 1 3 89 88 24 66 66 66 57 5 2 60 12 31 5 8 1 1 2 4 86 86 3 82 66 66 68 86 86 77 44 20 20 61 18 80 19 60 3 77 80 79 27 2 25 25 1 26 28 27 28 27 1 1 1 1 1 1 29 29 29 29 14 1 1 1 1 57 22 32 28 10 34 34 30 4 3 18 13 3 24 24 24 19 7 7 7 1 7 34 23 23 22 23 35 35 81 82 51 34 2 1 1 1 9 9 9 1 6 7 2 9 8 9 9 9 5 7 1 1 7 1 6 1 1 1 1 1 1 4 5 7 2 5 5 5 5 4 4 1 5 5 7 7 7 7 7 7 6 7 1 1 5 5 7 7 7 34 34 34 34 35 35 34 2 34 20 1 1 1 22 23 23 5 19 23 23 23 20 4 23 23 23 26 27 27 27 27 3 24 3 27 9 9 9 9 1 1 1 22 23 22 23 17 2 1 2 15 17 8 5 3 1 1 1 1 1 1 1 1 3 5 5 5 5 5 5 5 5 5 5 8 8 5 4 1 51 51 50 7 25 79 1 78 73 3 5 5 45 21 18 18 45 22 7 2 2 2 57 14 71 22 49 71 71 71 58 12 1 7 67 1 70 75 75 35 88 88 87 88 79 2 2 2 2 2 10 9 12 12 2 10 9 9 2 6 2 14 14 14 7 7 7 20 20 20 5 1 4 4 1 11 2 5 7 7 1 6 6 3 5 2 1 9 9 4 5 9 9 9 3 6 2 2 3 7 7 7 7 7 7 7 7 6 1 6 4 3 1 1 3 3 3 3 7 7 7 7 80 64 25 25 10 10 35 2 23 10 33 6 28 19 6 5 6 3 5 8 1 1 23 1 23 24 64 31 16 57 50 48 50 50 50 42 41 42 114 114 114 113 112 114 114 113 96 97 97 96 6 93 50 18 3 18 17 18 18 14 14 14 4 5 9 1 13 2 2 2 2 66 59 7 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 5321 5322 5323 5324 5325 5326 5327 5328 5329 5330 5331 5332 5333 5334 5335 5336 5337 5338 5339 5340 5341 5342 5343 5344 5345 5346 5347 5348 5349 5350 5351 5352 5353 5354 5355 5356 5357 5358 5359 5360 5361 5362 5363 5364 5365 5366 5367 5368 5369 5370 5371 5372 5373 5374 5375 5376 5377 5378 5379 5380 5381 5382 5383 5384 5385 5386 5387 5388 5389 5390 5391 5392 5393 5394 5395 5396 5397 5398 5399 5400 5401 5402 5403 5404 5405 5406 5407 5408 5409 5410 5411 5412 5413 5414 5415 5416 5417 5418 5419 5420 5421 5422 5423 5424 5425 5426 5427 5428 5429 5430 5431 5432 5433 5434 5435 5436 5437 5438 5439 5440 5441 5442 5443 5444 5445 5446 5447 5448 5449 5450 5451 5452 5453 5454 5455 5456 5457 5458 5459 5460 5461 5462 5463 5464 5465 5466 5467 5468 5469 5470 5471 5472 5473 5474 5475 5476 5477 5478 5479 5480 5481 5482 5483 5484 5485 5486 5487 5488 5489 5490 5491 5492 5493 5494 5495 5496 5497 5498 5499 5500 5501 5502 5503 5504 5505 5506 5507 5508 5509 5510 5511 5512 5513 5514 5515 5516 5517 5518 5519 5520 5521 5522 5523 5524 5525 5526 5527 5528 5529 5530 5531 5532 5533 5534 5535 5536 5537 5538 5539 5540 5541 5542 5543 5544 5545 5546 5547 5548 5549 5550 5551 5552 5553 5554 5555 5556 5557 5558 5559 5560 5561 5562 5563 5564 5565 5566 5567 5568 5569 5570 5571 5572 5573 5574 5575 5576 5577 5578 5579 5580 5581 5582 5583 5584 5585 5586 5587 5588 5589 5590 5591 5592 5593 5594 5595 5596 5597 5598 5599 5600 5601 5602 5603 5604 5605 5606 5607 5608 5609 5610 5611 5612 5613 5614 5615 5616 5617 5618 5619 5620 5621 5622 5623 5624 5625 5626 5627 5628 5629 5630 5631 5632 5633 5634 5635 5636 5637 5638 5639 5640 5641 5642 5643 5644 5645 5646 5647 5648 5649 5650 5651 5652 5653 5654 5655 5656 5657 5658 5659 5660 5661 5662 5663 5664 5665 5666 5667 5668 5669 5670 5671 5672 5673 5674 5675 5676 5677 5678 5679 5680 5681 5682 5683 5684 5685 5686 5687 5688 5689 5690 5691 5692 5693 5694 5695 5696 5697 5698 5699 5700 5701 5702 5703 5704 5705 5706 5707 5708 5709 5710 5711 5712 5713 5714 5715 5716 5717 5718 5719 5720 5721 5722 5723 5724 5725 5726 5727 5728 5729 5730 5731 5732 5733 5734 5735 5736 5737 5738 5739 5740 5741 5742 5743 5744 5745 5746 5747 5748 5749 5750 5751 5752 5753 5754 5755 5756 5757 5758 5759 5760 5761 5762 5763 5764 5765 5766 5767 5768 5769 5770 5771 5772 5773 5774 5775 5776 5777 5778 5779 5780 5781 5782 5783 5784 5785 5786 5787 5788 5789 5790 5791 5792 5793 5794 5795 5796 5797 5798 5799 5800 5801 5802 5803 5804 5805 5806 5807 5808 5809 5810 5811 5812 5813 5814 5815 5816 5817 5818 5819 5820 5821 5822 5823 5824 5825 5826 5827 5828 5829 5830 5831 5832 5833 5834 5835 5836 5837 5838 5839 5840 5841 5842 5843 5844 5845 5846 5847 5848 5849 5850 5851 5852 5853 5854 5855 5856 5857 5858 5859 5860 5861 5862 5863 5864 5865 5866 5867 5868 5869 5870 5871 5872 5873 5874 5875 5876 5877 5878 5879 5880 5881 5882 5883 5884 5885 5886 5887 5888 5889 5890 5891 5892 5893 5894 5895 5896 5897 5898 5899 5900 5901 5902 5903 5904 5905 5906 5907 5908 5909 5910 5911 5912 5913 5914 5915 5916 5917 5918 5919 5920 5921 5922 5923 5924 5925 5926 5927 5928 5929 5930 5931 5932 5933 5934 5935 5936 5937 5938 5939 5940 5941 5942 5943 5944 5945 5946 5947 5948 5949 5950 5951 5952 5953 5954 5955 5956 5957 5958 5959 5960 5961 5962 5963 5964 5965 5966 5967 5968 5969 5970 5971 5972 5973 5974 5975 5976 5977 5978 5979 5980 5981 5982 5983 5984 5985 5986 5987 5988 5989 5990 5991 5992 5993 5994 5995 5996 5997 5998 5999 6000 6001 6002 6003 6004 6005 6006 6007 6008 6009 6010 6011 6012 6013 6014 6015 6016 6017 6018 6019 6020 6021 6022 6023 6024 6025 6026 6027 6028 6029 6030 6031 6032 6033 6034 6035 6036 6037 6038 6039 6040 6041 6042 6043 6044 6045 6046 6047 6048 6049 6050 6051 6052 6053 6054 6055 6056 6057 6058 6059 6060 6061 6062 6063 6064 6065 6066 6067 6068 6069 6070 6071 6072 6073 6074 6075 6076 6077 6078 6079 6080 6081 6082 6083 6084 6085 6086 6087 6088 6089 6090 6091 6092 6093 6094 6095 6096 6097 6098 6099 6100 6101 6102 6103 6104 6105 6106 6107 6108 6109 6110 6111 6112 6113 6114 6115 6116 6117 6118 6119 6120 6121 6122 6123 6124 6125 6126 6127 6128 6129 6130 6131 6132 6133 6134 6135 6136 6137 6138 6139 6140 6141 6142 6143 6144 6145 6146 6147 6148 6149 6150 6151 6152 6153 6154 6155 6156 6157 6158 6159 6160 6161 6162 6163 6164 6165 6166 6167 6168 6169 6170 6171 6172 6173 6174 6175 6176 6177 6178 6179 6180 6181 6182 6183 6184 6185 6186 6187 6188 6189 6190 6191 6192 6193 6194 6195 6196 6197 6198 6199 6200 6201 6202 6203 6204 6205 6206 6207 6208 6209 6210 6211 6212 6213 6214 6215 6216 6217 6218 6219 6220 6221 6222 6223 6224 6225 6226 6227 6228 6229 6230 6231 6232 6233 6234 6235 6236 6237 6238 6239 6240 6241 6242 6243 6244 6245 6246 6247 6248 6249 6250 6251 6252 6253 6254 6255 6256 6257 6258 6259 6260 6261 6262 6263 6264 6265 6266 6267 6268 6269 6270 6271 6272 6273 6274 6275 6276 6277 6278 6279 6280 6281 6282 6283 6284 6285 6286 6287 6288 6289 6290 6291 6292 6293 6294 6295 6296 6297 6298 6299 6300 6301 6302 6303 6304 6305 6306 6307 6308 6309 6310 6311 6312 6313 6314 6315 6316 6317 6318 6319 6320 6321 6322 6323 6324 6325 6326 6327 6328 6329 6330 6331 6332 6333 6334 6335 6336 6337 6338 6339 6340 6341 6342 6343 6344 6345 6346 6347 6348 6349 6350 6351 6352 6353 6354 6355 6356 6357 6358 6359 6360 6361 6362 6363 6364 6365 6366 6367 6368 6369 6370 6371 6372 6373 6374 6375 6376 6377 6378 6379 6380 6381 6382 6383 6384 6385 6386 6387 6388 6389 6390 6391 6392 6393 6394 6395 6396 6397 6398 6399 6400 6401 6402 6403 6404 6405 6406 6407 6408 6409 6410 6411 6412 6413 6414 6415 6416 6417 6418 6419 6420 6421 6422 6423 6424 6425 6426 6427 6428 6429 6430 6431 6432 6433 6434 6435 6436 6437 6438 6439 6440 6441 6442 6443 6444 6445 6446 6447 6448 6449 6450 6451 6452 6453 6454 6455 6456 6457 6458 6459 6460 6461 6462 6463 6464 6465 6466 6467 6468 6469 6470 6471 6472 6473 6474 6475 6476 6477 6478 6479 6480 6481 6482 6483 6484 6485 6486 6487 6488 6489 6490 6491 6492 6493 6494 6495 6496 6497 6498 6499 6500 6501 6502 6503 6504 6505 6506 6507 6508 6509 6510 6511 6512 6513 6514 6515 6516 6517 6518 6519 6520 6521 6522 6523 6524 6525 6526 6527 6528 6529 6530 6531 6532 6533 6534 6535 6536 6537 6538 6539 6540 6541 6542 6543 6544 6545 6546 6547 6548 6549 6550 6551 6552 6553 6554 6555 6556 6557 6558 6559 6560 6561 6562 6563 6564 6565 6566 6567 6568 6569 6570 6571 6572 6573 6574 6575 6576 6577 6578 6579 6580 6581 6582 6583 6584 6585 6586 6587 6588 6589 6590 6591 6592 6593 6594 6595 6596 6597 6598 6599 6600 6601 6602 6603 6604 6605 6606 6607 6608 6609 6610 6611 6612 6613 6614 6615 6616 6617 6618 6619 6620 6621 6622 6623 6624 6625 6626 6627 6628 6629 6630 6631 6632 6633 6634 6635 6636 6637 6638 6639 6640 6641 6642 6643 6644 6645 6646 6647 6648 6649 6650 6651 6652 6653 6654 6655 6656 6657 6658 6659 6660 6661 6662 6663 6664 6665 6666 6667 6668 6669 6670 6671 6672 6673 6674 6675 6676 6677 6678 6679 6680 6681 6682 6683 6684 6685 6686 6687 6688 6689 6690 6691 6692 6693 6694 6695 6696 6697 6698 6699 6700 6701 6702 6703 6704 6705 6706 6707 6708 6709 6710 6711 6712 6713 6714 6715 6716 6717 6718 6719 6720 6721 6722 6723 6724 6725 6726 6727 6728 6729 6730 6731 6732 6733 6734 6735 6736 6737 6738 6739 6740 6741 6742 6743 6744 6745 6746 6747 6748 6749 6750 6751 6752 6753 6754 6755 6756 6757 6758 6759 6760 6761 6762 6763 6764 6765 6766 6767 6768 6769 6770 6771 6772 6773 6774 6775 6776 6777 6778 6779 6780 6781 6782 6783 6784 6785 6786 6787 6788 6789 6790 6791 6792 6793 6794 6795 6796 6797 6798 6799 6800 6801 6802 6803 6804 6805 6806 6807 6808 6809 6810 6811 6812 6813 6814 6815 6816 6817 6818 6819 6820 6821 6822 6823 6824 6825 6826 6827 6828 6829 6830 6831 6832 6833 6834 6835 6836 6837 6838 6839 6840 6841 6842 6843 6844 6845 6846 6847 6848 6849 6850 6851 6852 6853 6854 6855 6856 6857 6858 6859 6860 6861 6862 6863 6864 6865 6866 6867 6868 6869 6870 6871 6872 6873 6874 6875 6876 6877 6878 6879 6880 6881 6882 6883 6884 6885 6886 6887 6888 6889 6890 6891 6892 6893 6894 6895 6896 6897 6898 6899 6900 6901 6902 6903 6904 6905 6906 6907 6908 6909 6910 6911 6912 6913 6914 6915 6916 6917 6918 6919 6920 6921 6922 6923 6924 6925 6926 6927 6928 6929 6930 6931 6932 6933 6934 6935 6936 6937 6938 6939 6940 6941 6942 6943 6944 6945 6946 6947 6948 6949 6950 6951 6952 6953 6954 6955 6956 6957 6958 6959 6960 6961 6962 6963 6964 6965 6966 6967 6968 6969 6970 6971 6972 6973 6974 6975 6976 6977 6978 6979 6980 6981 6982 6983 6984 6985 6986 6987 6988 6989 6990 6991 6992 6993 6994 6995 6996 6997 6998 6999 7000 7001 7002 7003 7004 7005 7006 7007 7008 7009 7010 7011 7012 7013 7014 7015 7016 7017 7018 7019 7020 7021 7022 7023 7024 7025 7026 7027 7028 7029 7030 7031 7032 7033 7034 7035 7036 7037 7038 7039 7040 7041 7042 7043 7044 7045 7046 7047 7048 7049 7050 7051 7052 7053 7054 7055 7056 7057 7058 7059 7060 7061 7062 7063 7064 7065 7066 7067 7068 7069 7070 7071 7072 7073 7074 7075 7076 7077 7078 7079 7080 7081 7082 7083 7084 7085 7086 7087 7088 7089 7090 7091 7092 7093 7094 7095 7096 7097 7098 7099 7100 7101 7102 7103 7104 7105 7106 7107 7108 7109 7110 7111 7112 7113 7114 7115 7116 7117 7118 7119 7120 7121 7122 7123 7124 7125 7126 7127 7128 7129 7130 7131 7132 7133 7134 7135 7136 7137 7138 7139 7140 7141 7142 7143 7144 7145 7146 7147 7148 7149 7150 7151 7152 7153 7154 7155 7156 7157 7158 7159 7160 7161 7162 7163 7164 7165 7166 7167 7168 7169 7170 7171 7172 7173 7174 7175 7176 7177 7178 7179 7180 7181 7182 7183 7184 7185 7186 7187 7188 7189 7190 7191 7192 7193 7194 7195 7196 7197 7198 7199 7200 7201 7202 7203 7204 7205 7206 7207 7208 7209 7210 7211 7212 7213 7214 7215 7216 7217 7218 7219 7220 7221 7222 7223 7224 7225 7226 7227 7228 7229 7230 7231 7232 7233 7234 7235 7236 7237 7238 7239 7240 7241 7242 7243 7244 7245 7246 7247 7248 7249 7250 7251 7252 7253 7254 7255 7256 7257 7258 7259 7260 7261 7262 7263 7264 7265 7266 7267 7268 7269 7270 7271 7272 7273 7274 7275 7276 7277 7278 7279 7280 7281 7282 7283 7284 7285 7286 7287 7288 7289 7290 7291 7292 7293 7294 7295 7296 7297 7298 7299 7300 7301 7302 7303 7304 7305 7306 7307 7308 7309 7310 7311 7312 7313 7314 7315 7316 7317 7318 7319 7320 7321 7322 7323 7324 7325 7326 7327 7328 7329 7330 7331 7332 7333 7334 7335 7336 7337 7338 7339 7340 7341 | // SPDX-License-Identifier: GPL-2.0-only /* * Generic hugetlb support. * (C) Nadia Yvette Chambers, April 2004 */ #include <linux/list.h> #include <linux/init.h> #include <linux/mm.h> #include <linux/seq_file.h> #include <linux/highmem.h> #include <linux/mmu_notifier.h> #include <linux/nodemask.h> #include <linux/pagemap.h> #include <linux/mempolicy.h> #include <linux/compiler.h> #include <linux/cpumask.h> #include <linux/cpuset.h> #include <linux/mutex.h> #include <linux/memblock.h> #include <linux/minmax.h> #include <linux/slab.h> #include <linux/sched/mm.h> #include <linux/mmdebug.h> #include <linux/sched/signal.h> #include <linux/rmap.h> #include <linux/string_choices.h> #include <linux/string_helpers.h> #include <linux/swap.h> #include <linux/leafops.h> #include <linux/jhash.h> #include <linux/numa.h> #include <linux/llist.h> #include <linux/cma.h> #include <linux/migrate.h> #include <linux/nospec.h> #include <linux/delayacct.h> #include <linux/memory.h> #include <linux/mm_inline.h> #include <linux/padata.h> #include <linux/pgalloc.h> #include <asm/page.h> #include <asm/tlb.h> #include <asm/setup.h> #include <linux/io.h> #include <linux/node.h> #include <linux/page_owner.h> #include "internal.h" #include "hugetlb_vmemmap.h" #include "hugetlb_cma.h" #include "hugetlb_internal.h" #include <linux/page-isolation.h> int hugetlb_max_hstate __read_mostly; unsigned int default_hstate_idx; struct hstate hstates[HUGE_MAX_HSTATE]; __initdata nodemask_t hugetlb_bootmem_nodes; __initdata struct list_head huge_boot_pages[MAX_NUMNODES]; static unsigned long hstate_boot_nrinvalid[HUGE_MAX_HSTATE] __initdata; /* * Due to ordering constraints across the init code for various * architectures, hugetlb hstate cmdline parameters can't simply * be early_param. early_param might call the setup function * before valid hugetlb page sizes are determined, leading to * incorrect rejection of valid hugepagesz= options. * * So, record the parameters early and consume them whenever the * init code is ready for them, by calling hugetlb_parse_params(). */ /* one (hugepagesz=,hugepages=) pair per hstate, one default_hugepagesz */ #define HUGE_MAX_CMDLINE_ARGS (2 * HUGE_MAX_HSTATE + 1) struct hugetlb_cmdline { char *val; int (*setup)(char *val); }; /* for command line parsing */ static struct hstate * __initdata parsed_hstate; static unsigned long __initdata default_hstate_max_huge_pages; static bool __initdata parsed_valid_hugepagesz = true; static bool __initdata parsed_default_hugepagesz; static unsigned int default_hugepages_in_node[MAX_NUMNODES] __initdata; static unsigned long hugepage_allocation_threads __initdata; static char hstate_cmdline_buf[COMMAND_LINE_SIZE] __initdata; static int hstate_cmdline_index __initdata; static struct hugetlb_cmdline hugetlb_params[HUGE_MAX_CMDLINE_ARGS] __initdata; static int hugetlb_param_index __initdata; static __init int hugetlb_add_param(char *s, int (*setup)(char *val)); static __init void hugetlb_parse_params(void); #define hugetlb_early_param(str, func) \ static __init int func##args(char *s) \ { \ return hugetlb_add_param(s, func); \ } \ early_param(str, func##args) /* * Protects updates to hugepage_freelists, hugepage_activelist, nr_huge_pages, * free_huge_pages, and surplus_huge_pages. */ __cacheline_aligned_in_smp DEFINE_SPINLOCK(hugetlb_lock); /* * Serializes faults on the same logical page. This is used to * prevent spurious OOMs when the hugepage pool is fully utilized. */ static int num_fault_mutexes __ro_after_init; struct mutex *hugetlb_fault_mutex_table __ro_after_init; /* Forward declaration */ static int hugetlb_acct_memory(struct hstate *h, long delta); static void hugetlb_vma_lock_free(struct vm_area_struct *vma); static void __hugetlb_vma_unlock_write_free(struct vm_area_struct *vma); static void hugetlb_unshare_pmds(struct vm_area_struct *vma, unsigned long start, unsigned long end, bool take_locks); static struct resv_map *vma_resv_map(struct vm_area_struct *vma); static inline bool subpool_is_free(struct hugepage_subpool *spool) { if (spool->count) return false; if (spool->max_hpages != -1) return spool->used_hpages == 0; if (spool->min_hpages != -1) return spool->rsv_hpages == spool->min_hpages; return true; } static inline void unlock_or_release_subpool(struct hugepage_subpool *spool, unsigned long irq_flags) { spin_unlock_irqrestore(&spool->lock, irq_flags); /* If no pages are used, and no other handles to the subpool * remain, give up any reservations based on minimum size and * free the subpool */ if (subpool_is_free(spool)) { if (spool->min_hpages != -1) hugetlb_acct_memory(spool->hstate, -spool->min_hpages); kfree(spool); } } struct hugepage_subpool *hugepage_new_subpool(struct hstate *h, long max_hpages, long min_hpages) { struct hugepage_subpool *spool; spool = kzalloc(sizeof(*spool), GFP_KERNEL); if (!spool) return NULL; spin_lock_init(&spool->lock); spool->count = 1; spool->max_hpages = max_hpages; spool->hstate = h; spool->min_hpages = min_hpages; if (min_hpages != -1 && hugetlb_acct_memory(h, min_hpages)) { kfree(spool); return NULL; } spool->rsv_hpages = min_hpages; return spool; } void hugepage_put_subpool(struct hugepage_subpool *spool) { unsigned long flags; spin_lock_irqsave(&spool->lock, flags); BUG_ON(!spool->count); spool->count--; unlock_or_release_subpool(spool, flags); } /* * Subpool accounting for allocating and reserving pages. * Return -ENOMEM if there are not enough resources to satisfy the * request. Otherwise, return the number of pages by which the * global pools must be adjusted (upward). The returned value may * only be different than the passed value (delta) in the case where * a subpool minimum size must be maintained. */ static long hugepage_subpool_get_pages(struct hugepage_subpool *spool, long delta) { long ret = delta; if (!spool) return ret; spin_lock_irq(&spool->lock); if (spool->max_hpages != -1) { /* maximum size accounting */ if ((spool->used_hpages + delta) <= spool->max_hpages) spool->used_hpages += delta; else { ret = -ENOMEM; goto unlock_ret; } } /* minimum size accounting */ if (spool->min_hpages != -1 && spool->rsv_hpages) { if (delta > spool->rsv_hpages) { /* * Asking for more reserves than those already taken on * behalf of subpool. Return difference. */ ret = delta - spool->rsv_hpages; spool->rsv_hpages = 0; } else { ret = 0; /* reserves already accounted for */ spool->rsv_hpages -= delta; } } unlock_ret: spin_unlock_irq(&spool->lock); return ret; } /* * Subpool accounting for freeing and unreserving pages. * Return the number of global page reservations that must be dropped. * The return value may only be different than the passed value (delta) * in the case where a subpool minimum size must be maintained. */ static long hugepage_subpool_put_pages(struct hugepage_subpool *spool, long delta) { long ret = delta; unsigned long flags; if (!spool) return delta; spin_lock_irqsave(&spool->lock, flags); if (spool->max_hpages != -1) /* maximum size accounting */ spool->used_hpages -= delta; /* minimum size accounting */ if (spool->min_hpages != -1 && spool->used_hpages < spool->min_hpages) { if (spool->rsv_hpages + delta <= spool->min_hpages) ret = 0; else ret = spool->rsv_hpages + delta - spool->min_hpages; spool->rsv_hpages += delta; if (spool->rsv_hpages > spool->min_hpages) spool->rsv_hpages = spool->min_hpages; } /* * If hugetlbfs_put_super couldn't free spool due to an outstanding * quota reference, free it now. */ unlock_or_release_subpool(spool, flags); return ret; } static inline struct hugepage_subpool *subpool_vma(struct vm_area_struct *vma) { return subpool_inode(file_inode(vma->vm_file)); } /* * hugetlb vma_lock helper routines */ void hugetlb_vma_lock_read(struct vm_area_struct *vma) { if (__vma_shareable_lock(vma)) { struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; down_read(&vma_lock->rw_sema); } else if (__vma_private_lock(vma)) { struct resv_map *resv_map = vma_resv_map(vma); down_read(&resv_map->rw_sema); } } void hugetlb_vma_unlock_read(struct vm_area_struct *vma) { if (__vma_shareable_lock(vma)) { struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; up_read(&vma_lock->rw_sema); } else if (__vma_private_lock(vma)) { struct resv_map *resv_map = vma_resv_map(vma); up_read(&resv_map->rw_sema); } } void hugetlb_vma_lock_write(struct vm_area_struct *vma) { if (__vma_shareable_lock(vma)) { struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; down_write(&vma_lock->rw_sema); } else if (__vma_private_lock(vma)) { struct resv_map *resv_map = vma_resv_map(vma); down_write(&resv_map->rw_sema); } } void hugetlb_vma_unlock_write(struct vm_area_struct *vma) { if (__vma_shareable_lock(vma)) { struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; up_write(&vma_lock->rw_sema); } else if (__vma_private_lock(vma)) { struct resv_map *resv_map = vma_resv_map(vma); up_write(&resv_map->rw_sema); } } int hugetlb_vma_trylock_write(struct vm_area_struct *vma) { if (__vma_shareable_lock(vma)) { struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; return down_write_trylock(&vma_lock->rw_sema); } else if (__vma_private_lock(vma)) { struct resv_map *resv_map = vma_resv_map(vma); return down_write_trylock(&resv_map->rw_sema); } return 1; } void hugetlb_vma_assert_locked(struct vm_area_struct *vma) { if (__vma_shareable_lock(vma)) { struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; lockdep_assert_held(&vma_lock->rw_sema); } else if (__vma_private_lock(vma)) { struct resv_map *resv_map = vma_resv_map(vma); lockdep_assert_held(&resv_map->rw_sema); } } void hugetlb_vma_lock_release(struct kref *kref) { struct hugetlb_vma_lock *vma_lock = container_of(kref, struct hugetlb_vma_lock, refs); kfree(vma_lock); } static void __hugetlb_vma_unlock_write_put(struct hugetlb_vma_lock *vma_lock) { struct vm_area_struct *vma = vma_lock->vma; /* * vma_lock structure may or not be released as a result of put, * it certainly will no longer be attached to vma so clear pointer. * Semaphore synchronizes access to vma_lock->vma field. */ vma_lock->vma = NULL; vma->vm_private_data = NULL; up_write(&vma_lock->rw_sema); kref_put(&vma_lock->refs, hugetlb_vma_lock_release); } static void __hugetlb_vma_unlock_write_free(struct vm_area_struct *vma) { if (__vma_shareable_lock(vma)) { struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; __hugetlb_vma_unlock_write_put(vma_lock); } else if (__vma_private_lock(vma)) { struct resv_map *resv_map = vma_resv_map(vma); /* no free for anon vmas, but still need to unlock */ up_write(&resv_map->rw_sema); } } static void hugetlb_vma_lock_free(struct vm_area_struct *vma) { /* * Only present in sharable vmas. */ if (!vma || !__vma_shareable_lock(vma)) return; if (vma->vm_private_data) { struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; down_write(&vma_lock->rw_sema); __hugetlb_vma_unlock_write_put(vma_lock); } } /* * vma specific semaphore used for pmd sharing and fault/truncation * synchronization */ int hugetlb_vma_lock_alloc(struct vm_area_struct *vma) { struct hugetlb_vma_lock *vma_lock; /* Only establish in (flags) sharable vmas */ if (!vma || !(vma->vm_flags & VM_MAYSHARE)) return 0; /* Should never get here with non-NULL vm_private_data */ if (vma->vm_private_data) return -EINVAL; vma_lock = kmalloc(sizeof(*vma_lock), GFP_KERNEL); if (!vma_lock) { /* * If we can not allocate structure, then vma can not * participate in pmd sharing. This is only a possible * performance enhancement and memory saving issue. * However, the lock is also used to synchronize page * faults with truncation. If the lock is not present, * unlikely races could leave pages in a file past i_size * until the file is removed. Warn in the unlikely case of * allocation failure. */ pr_warn_once("HugeTLB: unable to allocate vma specific lock\n"); return -EINVAL; } kref_init(&vma_lock->refs); init_rwsem(&vma_lock->rw_sema); vma_lock->vma = vma; vma->vm_private_data = vma_lock; return 0; } /* Helper that removes a struct file_region from the resv_map cache and returns * it for use. */ static struct file_region * get_file_region_entry_from_cache(struct resv_map *resv, long from, long to) { struct file_region *nrg; VM_BUG_ON(resv->region_cache_count <= 0); resv->region_cache_count--; nrg = list_first_entry(&resv->region_cache, struct file_region, link); list_del(&nrg->link); nrg->from = from; nrg->to = to; return nrg; } static void copy_hugetlb_cgroup_uncharge_info(struct file_region *nrg, struct file_region *rg) { #ifdef CONFIG_CGROUP_HUGETLB nrg->reservation_counter = rg->reservation_counter; nrg->css = rg->css; if (rg->css) css_get(rg->css); #endif } /* Helper that records hugetlb_cgroup uncharge info. */ static void record_hugetlb_cgroup_uncharge_info(struct hugetlb_cgroup *h_cg, struct hstate *h, struct resv_map *resv, struct file_region *nrg) { #ifdef CONFIG_CGROUP_HUGETLB if (h_cg) { nrg->reservation_counter = &h_cg->rsvd_hugepage[hstate_index(h)]; nrg->css = &h_cg->css; /* * The caller will hold exactly one h_cg->css reference for the * whole contiguous reservation region. But this area might be * scattered when there are already some file_regions reside in * it. As a result, many file_regions may share only one css * reference. In order to ensure that one file_region must hold * exactly one h_cg->css reference, we should do css_get for * each file_region and leave the reference held by caller * untouched. */ css_get(&h_cg->css); if (!resv->pages_per_hpage) resv->pages_per_hpage = pages_per_huge_page(h); /* pages_per_hpage should be the same for all entries in * a resv_map. */ VM_BUG_ON(resv->pages_per_hpage != pages_per_huge_page(h)); } else { nrg->reservation_counter = NULL; nrg->css = NULL; } #endif } static void put_uncharge_info(struct file_region *rg) { #ifdef CONFIG_CGROUP_HUGETLB if (rg->css) css_put(rg->css); #endif } static bool has_same_uncharge_info(struct file_region *rg, struct file_region *org) { #ifdef CONFIG_CGROUP_HUGETLB return rg->reservation_counter == org->reservation_counter && rg->css == org->css; #else return true; #endif } static void coalesce_file_region(struct resv_map *resv, struct file_region *rg) { struct file_region *nrg, *prg; prg = list_prev_entry(rg, link); if (&prg->link != &resv->regions && prg->to == rg->from && has_same_uncharge_info(prg, rg)) { prg->to = rg->to; list_del(&rg->link); put_uncharge_info(rg); kfree(rg); rg = prg; } nrg = list_next_entry(rg, link); if (&nrg->link != &resv->regions && nrg->from == rg->to && has_same_uncharge_info(nrg, rg)) { nrg->from = rg->from; list_del(&rg->link); put_uncharge_info(rg); kfree(rg); } } static inline long hugetlb_resv_map_add(struct resv_map *map, struct list_head *rg, long from, long to, struct hstate *h, struct hugetlb_cgroup *cg, long *regions_needed) { struct file_region *nrg; if (!regions_needed) { nrg = get_file_region_entry_from_cache(map, from, to); record_hugetlb_cgroup_uncharge_info(cg, h, map, nrg); list_add(&nrg->link, rg); coalesce_file_region(map, nrg); } else { *regions_needed += 1; } return to - from; } /* * Must be called with resv->lock held. * * Calling this with regions_needed != NULL will count the number of pages * to be added but will not modify the linked list. And regions_needed will * indicate the number of file_regions needed in the cache to carry out to add * the regions for this range. */ static long add_reservation_in_range(struct resv_map *resv, long f, long t, struct hugetlb_cgroup *h_cg, struct hstate *h, long *regions_needed) { long add = 0; struct list_head *head = &resv->regions; long last_accounted_offset = f; struct file_region *iter, *trg = NULL; struct list_head *rg = NULL; if (regions_needed) *regions_needed = 0; /* In this loop, we essentially handle an entry for the range * [last_accounted_offset, iter->from), at every iteration, with some * bounds checking. */ list_for_each_entry_safe(iter, trg, head, link) { /* Skip irrelevant regions that start before our range. */ if (iter->from < f) { /* If this region ends after the last accounted offset, * then we need to update last_accounted_offset. */ if (iter->to > last_accounted_offset) last_accounted_offset = iter->to; continue; } /* When we find a region that starts beyond our range, we've * finished. */ if (iter->from >= t) { rg = iter->link.prev; break; } /* Add an entry for last_accounted_offset -> iter->from, and * update last_accounted_offset. */ if (iter->from > last_accounted_offset) add += hugetlb_resv_map_add(resv, iter->link.prev, last_accounted_offset, iter->from, h, h_cg, regions_needed); last_accounted_offset = iter->to; } /* Handle the case where our range extends beyond * last_accounted_offset. */ if (!rg) rg = head->prev; if (last_accounted_offset < t) add += hugetlb_resv_map_add(resv, rg, last_accounted_offset, t, h, h_cg, regions_needed); return add; } /* Must be called with resv->lock acquired. Will drop lock to allocate entries. */ static int allocate_file_region_entries(struct resv_map *resv, int regions_needed) __must_hold(&resv->lock) { LIST_HEAD(allocated_regions); int to_allocate = 0, i = 0; struct file_region *trg = NULL, *rg = NULL; VM_BUG_ON(regions_needed < 0); /* * Check for sufficient descriptors in the cache to accommodate * the number of in progress add operations plus regions_needed. * * This is a while loop because when we drop the lock, some other call * to region_add or region_del may have consumed some region_entries, * so we keep looping here until we finally have enough entries for * (adds_in_progress + regions_needed). */ while (resv->region_cache_count < (resv->adds_in_progress + regions_needed)) { to_allocate = resv->adds_in_progress + regions_needed - resv->region_cache_count; /* At this point, we should have enough entries in the cache * for all the existing adds_in_progress. We should only be * needing to allocate for regions_needed. */ VM_BUG_ON(resv->region_cache_count < resv->adds_in_progress); spin_unlock(&resv->lock); for (i = 0; i < to_allocate; i++) { trg = kmalloc(sizeof(*trg), GFP_KERNEL); if (!trg) goto out_of_memory; list_add(&trg->link, &allocated_regions); } spin_lock(&resv->lock); list_splice(&allocated_regions, &resv->region_cache); resv->region_cache_count += to_allocate; } return 0; out_of_memory: list_for_each_entry_safe(rg, trg, &allocated_regions, link) { list_del(&rg->link); kfree(rg); } return -ENOMEM; } /* * Add the huge page range represented by [f, t) to the reserve * map. Regions will be taken from the cache to fill in this range. * Sufficient regions should exist in the cache due to the previous * call to region_chg with the same range, but in some cases the cache will not * have sufficient entries due to races with other code doing region_add or * region_del. The extra needed entries will be allocated. * * regions_needed is the out value provided by a previous call to region_chg. * * Return the number of new huge pages added to the map. This number is greater * than or equal to zero. If file_region entries needed to be allocated for * this operation and we were not able to allocate, it returns -ENOMEM. * region_add of regions of length 1 never allocate file_regions and cannot * fail; region_chg will always allocate at least 1 entry and a region_add for * 1 page will only require at most 1 entry. */ static long region_add(struct resv_map *resv, long f, long t, long in_regions_needed, struct hstate *h, struct hugetlb_cgroup *h_cg) { long add = 0, actual_regions_needed = 0; spin_lock(&resv->lock); retry: /* Count how many regions are actually needed to execute this add. */ add_reservation_in_range(resv, f, t, NULL, NULL, &actual_regions_needed); /* * Check for sufficient descriptors in the cache to accommodate * this add operation. Note that actual_regions_needed may be greater * than in_regions_needed, as the resv_map may have been modified since * the region_chg call. In this case, we need to make sure that we * allocate extra entries, such that we have enough for all the * existing adds_in_progress, plus the excess needed for this * operation. */ if (actual_regions_needed > in_regions_needed && resv->region_cache_count < resv->adds_in_progress + (actual_regions_needed - in_regions_needed)) { /* region_add operation of range 1 should never need to * allocate file_region entries. */ VM_BUG_ON(t - f <= 1); if (allocate_file_region_entries( resv, actual_regions_needed - in_regions_needed)) { return -ENOMEM; } goto retry; } add = add_reservation_in_range(resv, f, t, h_cg, h, NULL); resv->adds_in_progress -= in_regions_needed; spin_unlock(&resv->lock); return add; } /* * Examine the existing reserve map and determine how many * huge pages in the specified range [f, t) are NOT currently * represented. This routine is called before a subsequent * call to region_add that will actually modify the reserve * map to add the specified range [f, t). region_chg does * not change the number of huge pages represented by the * map. A number of new file_region structures is added to the cache as a * placeholder, for the subsequent region_add call to use. At least 1 * file_region structure is added. * * out_regions_needed is the number of regions added to the * resv->adds_in_progress. This value needs to be provided to a follow up call * to region_add or region_abort for proper accounting. * * Returns the number of huge pages that need to be added to the existing * reservation map for the range [f, t). This number is greater or equal to * zero. -ENOMEM is returned if a new file_region structure or cache entry * is needed and can not be allocated. */ static long region_chg(struct resv_map *resv, long f, long t, long *out_regions_needed) { long chg = 0; spin_lock(&resv->lock); /* Count how many hugepages in this range are NOT represented. */ chg = add_reservation_in_range(resv, f, t, NULL, NULL, out_regions_needed); if (*out_regions_needed == 0) *out_regions_needed = 1; if (allocate_file_region_entries(resv, *out_regions_needed)) return -ENOMEM; resv->adds_in_progress += *out_regions_needed; spin_unlock(&resv->lock); return chg; } /* * Abort the in progress add operation. The adds_in_progress field * of the resv_map keeps track of the operations in progress between * calls to region_chg and region_add. Operations are sometimes * aborted after the call to region_chg. In such cases, region_abort * is called to decrement the adds_in_progress counter. regions_needed * is the value returned by the region_chg call, it is used to decrement * the adds_in_progress counter. * * NOTE: The range arguments [f, t) are not needed or used in this * routine. They are kept to make reading the calling code easier as * arguments will match the associated region_chg call. */ static void region_abort(struct resv_map *resv, long f, long t, long regions_needed) { spin_lock(&resv->lock); VM_BUG_ON(!resv->region_cache_count); resv->adds_in_progress -= regions_needed; spin_unlock(&resv->lock); } /* * Delete the specified range [f, t) from the reserve map. If the * t parameter is LONG_MAX, this indicates that ALL regions after f * should be deleted. Locate the regions which intersect [f, t) * and either trim, delete or split the existing regions. * * Returns the number of huge pages deleted from the reserve map. * In the normal case, the return value is zero or more. In the * case where a region must be split, a new region descriptor must * be allocated. If the allocation fails, -ENOMEM will be returned. * NOTE: If the parameter t == LONG_MAX, then we will never split * a region and possibly return -ENOMEM. Callers specifying * t == LONG_MAX do not need to check for -ENOMEM error. */ static long region_del(struct resv_map *resv, long f, long t) { struct list_head *head = &resv->regions; struct file_region *rg, *trg; struct file_region *nrg = NULL; long del = 0; retry: spin_lock(&resv->lock); list_for_each_entry_safe(rg, trg, head, link) { /* * Skip regions before the range to be deleted. file_region * ranges are normally of the form [from, to). However, there * may be a "placeholder" entry in the map which is of the form * (from, to) with from == to. Check for placeholder entries * at the beginning of the range to be deleted. */ if (rg->to <= f && (rg->to != rg->from || rg->to != f)) continue; if (rg->from >= t) break; if (f > rg->from && t < rg->to) { /* Must split region */ /* * Check for an entry in the cache before dropping * lock and attempting allocation. */ if (!nrg && resv->region_cache_count > resv->adds_in_progress) { nrg = list_first_entry(&resv->region_cache, struct file_region, link); list_del(&nrg->link); resv->region_cache_count--; } if (!nrg) { spin_unlock(&resv->lock); nrg = kmalloc(sizeof(*nrg), GFP_KERNEL); if (!nrg) return -ENOMEM; goto retry; } del += t - f; hugetlb_cgroup_uncharge_file_region( resv, rg, t - f, false); /* New entry for end of split region */ nrg->from = t; nrg->to = rg->to; copy_hugetlb_cgroup_uncharge_info(nrg, rg); INIT_LIST_HEAD(&nrg->link); /* Original entry is trimmed */ rg->to = f; list_add(&nrg->link, &rg->link); nrg = NULL; break; } if (f <= rg->from && t >= rg->to) { /* Remove entire region */ del += rg->to - rg->from; hugetlb_cgroup_uncharge_file_region(resv, rg, rg->to - rg->from, true); list_del(&rg->link); kfree(rg); continue; } if (f <= rg->from) { /* Trim beginning of region */ hugetlb_cgroup_uncharge_file_region(resv, rg, t - rg->from, false); del += t - rg->from; rg->from = t; } else { /* Trim end of region */ hugetlb_cgroup_uncharge_file_region(resv, rg, rg->to - f, false); del += rg->to - f; rg->to = f; } } spin_unlock(&resv->lock); kfree(nrg); return del; } /* * A rare out of memory error was encountered which prevented removal of * the reserve map region for a page. The huge page itself was free'ed * and removed from the page cache. This routine will adjust the subpool * usage count, and the global reserve count if needed. By incrementing * these counts, the reserve map entry which could not be deleted will * appear as a "reserved" entry instead of simply dangling with incorrect * counts. */ void hugetlb_fix_reserve_counts(struct inode *inode) { struct hugepage_subpool *spool = subpool_inode(inode); long rsv_adjust; bool reserved = false; rsv_adjust = hugepage_subpool_get_pages(spool, 1); if (rsv_adjust > 0) { struct hstate *h = hstate_inode(inode); if (!hugetlb_acct_memory(h, 1)) reserved = true; } else if (!rsv_adjust) { reserved = true; } if (!reserved) pr_warn("hugetlb: Huge Page Reserved count may go negative.\n"); } /* * Count and return the number of huge pages in the reserve map * that intersect with the range [f, t). */ static long region_count(struct resv_map *resv, long f, long t) { struct list_head *head = &resv->regions; struct file_region *rg; long chg = 0; spin_lock(&resv->lock); /* Locate each segment we overlap with, and count that overlap. */ list_for_each_entry(rg, head, link) { long seg_from; long seg_to; if (rg->to <= f) continue; if (rg->from >= t) break; seg_from = max(rg->from, f); seg_to = min(rg->to, t); chg += seg_to - seg_from; } spin_unlock(&resv->lock); return chg; } /* * Convert the address within this vma to the page offset within * the mapping, huge page units here. */ static pgoff_t vma_hugecache_offset(struct hstate *h, struct vm_area_struct *vma, unsigned long address) { return ((address - vma->vm_start) >> huge_page_shift(h)) + (vma->vm_pgoff >> huge_page_order(h)); } /** * vma_kernel_pagesize - Page size granularity for this VMA. * @vma: The user mapping. * * Folios in this VMA will be aligned to, and at least the size of the * number of bytes returned by this function. * * Return: The default size of the folios allocated when backing a VMA. */ unsigned long vma_kernel_pagesize(struct vm_area_struct *vma) { if (vma->vm_ops && vma->vm_ops->pagesize) return vma->vm_ops->pagesize(vma); return PAGE_SIZE; } EXPORT_SYMBOL_GPL(vma_kernel_pagesize); /* * Return the page size being used by the MMU to back a VMA. In the majority * of cases, the page size used by the kernel matches the MMU size. On * architectures where it differs, an architecture-specific 'strong' * version of this symbol is required. */ __weak unsigned long vma_mmu_pagesize(struct vm_area_struct *vma) { return vma_kernel_pagesize(vma); } /* * Flags for MAP_PRIVATE reservations. These are stored in the bottom * bits of the reservation map pointer, which are always clear due to * alignment. */ #define HPAGE_RESV_OWNER (1UL << 0) #define HPAGE_RESV_UNMAPPED (1UL << 1) #define HPAGE_RESV_MASK (HPAGE_RESV_OWNER | HPAGE_RESV_UNMAPPED) /* * These helpers are used to track how many pages are reserved for * faults in a MAP_PRIVATE mapping. Only the process that called mmap() * is guaranteed to have their future faults succeed. * * With the exception of hugetlb_dup_vma_private() which is called at fork(), * the reserve counters are updated with the hugetlb_lock held. It is safe * to reset the VMA at fork() time as it is not in use yet and there is no * chance of the global counters getting corrupted as a result of the values. * * The private mapping reservation is represented in a subtly different * manner to a shared mapping. A shared mapping has a region map associated * with the underlying file, this region map represents the backing file * pages which have ever had a reservation assigned which this persists even * after the page is instantiated. A private mapping has a region map * associated with the original mmap which is attached to all VMAs which * reference it, this region map represents those offsets which have consumed * reservation ie. where pages have been instantiated. */ static unsigned long get_vma_private_data(struct vm_area_struct *vma) { return (unsigned long)vma->vm_private_data; } static void set_vma_private_data(struct vm_area_struct *vma, unsigned long value) { vma->vm_private_data = (void *)value; } static void resv_map_set_hugetlb_cgroup_uncharge_info(struct resv_map *resv_map, struct hugetlb_cgroup *h_cg, struct hstate *h) { #ifdef CONFIG_CGROUP_HUGETLB if (!h_cg || !h) { resv_map->reservation_counter = NULL; resv_map->pages_per_hpage = 0; resv_map->css = NULL; } else { resv_map->reservation_counter = &h_cg->rsvd_hugepage[hstate_index(h)]; resv_map->pages_per_hpage = pages_per_huge_page(h); resv_map->css = &h_cg->css; } #endif } struct resv_map *resv_map_alloc(void) { struct resv_map *resv_map = kmalloc(sizeof(*resv_map), GFP_KERNEL); struct file_region *rg = kmalloc(sizeof(*rg), GFP_KERNEL); if (!resv_map || !rg) { kfree(resv_map); kfree(rg); return NULL; } kref_init(&resv_map->refs); spin_lock_init(&resv_map->lock); INIT_LIST_HEAD(&resv_map->regions); init_rwsem(&resv_map->rw_sema); resv_map->adds_in_progress = 0; /* * Initialize these to 0. On shared mappings, 0's here indicate these * fields don't do cgroup accounting. On private mappings, these will be * re-initialized to the proper values, to indicate that hugetlb cgroup * reservations are to be un-charged from here. */ resv_map_set_hugetlb_cgroup_uncharge_info(resv_map, NULL, NULL); INIT_LIST_HEAD(&resv_map->region_cache); list_add(&rg->link, &resv_map->region_cache); resv_map->region_cache_count = 1; return resv_map; } void resv_map_release(struct kref *ref) { struct resv_map *resv_map = container_of(ref, struct resv_map, refs); struct list_head *head = &resv_map->region_cache; struct file_region *rg, *trg; /* Clear out any active regions before we release the map. */ region_del(resv_map, 0, LONG_MAX); /* ... and any entries left in the cache */ list_for_each_entry_safe(rg, trg, head, link) { list_del(&rg->link); kfree(rg); } VM_BUG_ON(resv_map->adds_in_progress); kfree(resv_map); } static inline struct resv_map *inode_resv_map(struct inode *inode) { /* * At inode evict time, i_mapping may not point to the original * address space within the inode. This original address space * contains the pointer to the resv_map. So, always use the * address space embedded within the inode. * The VERY common case is inode->mapping == &inode->i_data but, * this may not be true for device special inodes. */ return (struct resv_map *)(&inode->i_data)->i_private_data; } static struct resv_map *vma_resv_map(struct vm_area_struct *vma) { VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma); if (vma->vm_flags & VM_MAYSHARE) { struct address_space *mapping = vma->vm_file->f_mapping; struct inode *inode = mapping->host; return inode_resv_map(inode); } else { return (struct resv_map *)(get_vma_private_data(vma) & ~HPAGE_RESV_MASK); } } static void set_vma_resv_flags(struct vm_area_struct *vma, unsigned long flags) { VM_WARN_ON_ONCE_VMA(!is_vm_hugetlb_page(vma), vma); VM_WARN_ON_ONCE_VMA(vma->vm_flags & VM_MAYSHARE, vma); set_vma_private_data(vma, get_vma_private_data(vma) | flags); } static void set_vma_desc_resv_map(struct vm_area_desc *desc, struct resv_map *map) { VM_WARN_ON_ONCE(!is_vm_hugetlb_flags(desc->vm_flags)); VM_WARN_ON_ONCE(desc->vm_flags & VM_MAYSHARE); desc->private_data = map; } static void set_vma_desc_resv_flags(struct vm_area_desc *desc, unsigned long flags) { VM_WARN_ON_ONCE(!is_vm_hugetlb_flags(desc->vm_flags)); VM_WARN_ON_ONCE(desc->vm_flags & VM_MAYSHARE); desc->private_data = (void *)((unsigned long)desc->private_data | flags); } static int is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag) { VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma); return (get_vma_private_data(vma) & flag) != 0; } static bool is_vma_desc_resv_set(struct vm_area_desc *desc, unsigned long flag) { VM_WARN_ON_ONCE(!is_vm_hugetlb_flags(desc->vm_flags)); return ((unsigned long)desc->private_data) & flag; } bool __vma_private_lock(struct vm_area_struct *vma) { return !(vma->vm_flags & VM_MAYSHARE) && get_vma_private_data(vma) & ~HPAGE_RESV_MASK && is_vma_resv_set(vma, HPAGE_RESV_OWNER); } void hugetlb_dup_vma_private(struct vm_area_struct *vma) { VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma); /* * Clear vm_private_data * - For shared mappings this is a per-vma semaphore that may be * allocated in a subsequent call to hugetlb_vm_op_open. * Before clearing, make sure pointer is not associated with vma * as this will leak the structure. This is the case when called * via clear_vma_resv_huge_pages() and hugetlb_vm_op_open has already * been called to allocate a new structure. * - For MAP_PRIVATE mappings, this is the reserve map which does * not apply to children. Faults generated by the children are * not guaranteed to succeed, even if read-only. */ if (vma->vm_flags & VM_MAYSHARE) { struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; if (vma_lock && vma_lock->vma != vma) vma->vm_private_data = NULL; } else { vma->vm_private_data = NULL; } } /* * Reset and decrement one ref on hugepage private reservation. * Called with mm->mmap_lock writer semaphore held. * This function should be only used by mremap and operate on * same sized vma. It should never come here with last ref on the * reservation. */ void clear_vma_resv_huge_pages(struct vm_area_struct *vma) { /* * Clear the old hugetlb private page reservation. * It has already been transferred to new_vma. * * During a mremap() operation of a hugetlb vma we call move_vma() * which copies vma into new_vma and unmaps vma. After the copy * operation both new_vma and vma share a reference to the resv_map * struct, and at that point vma is about to be unmapped. We don't * want to return the reservation to the pool at unmap of vma because * the reservation still lives on in new_vma, so simply decrement the * ref here and remove the resv_map reference from this vma. */ struct resv_map *reservations = vma_resv_map(vma); if (reservations && is_vma_resv_set(vma, HPAGE_RESV_OWNER)) { resv_map_put_hugetlb_cgroup_uncharge_info(reservations); kref_put(&reservations->refs, resv_map_release); } hugetlb_dup_vma_private(vma); } static void enqueue_hugetlb_folio(struct hstate *h, struct folio *folio) { int nid = folio_nid(folio); lockdep_assert_held(&hugetlb_lock); VM_BUG_ON_FOLIO(folio_ref_count(folio), folio); list_move(&folio->lru, &h->hugepage_freelists[nid]); h->free_huge_pages++; h->free_huge_pages_node[nid]++; folio_set_hugetlb_freed(folio); } static struct folio *dequeue_hugetlb_folio_node_exact(struct hstate *h, int nid) { struct folio *folio; bool pin = !!(current->flags & PF_MEMALLOC_PIN); lockdep_assert_held(&hugetlb_lock); list_for_each_entry(folio, &h->hugepage_freelists[nid], lru) { if (pin && !folio_is_longterm_pinnable(folio)) continue; if (folio_test_hwpoison(folio)) continue; if (is_migrate_isolate_page(&folio->page)) continue; list_move(&folio->lru, &h->hugepage_activelist); folio_ref_unfreeze(folio, 1); folio_clear_hugetlb_freed(folio); h->free_huge_pages--; h->free_huge_pages_node[nid]--; return folio; } return NULL; } static struct folio *dequeue_hugetlb_folio_nodemask(struct hstate *h, gfp_t gfp_mask, int nid, nodemask_t *nmask) { unsigned int cpuset_mems_cookie; struct zonelist *zonelist; struct zone *zone; struct zoneref *z; int node = NUMA_NO_NODE; /* 'nid' should not be NUMA_NO_NODE. Try to catch any misuse of it and rectifiy. */ if (nid == NUMA_NO_NODE) nid = numa_node_id(); zonelist = node_zonelist(nid, gfp_mask); retry_cpuset: cpuset_mems_cookie = read_mems_allowed_begin(); for_each_zone_zonelist_nodemask(zone, z, zonelist, gfp_zone(gfp_mask), nmask) { struct folio *folio; if (!cpuset_zone_allowed(zone, gfp_mask)) continue; /* * no need to ask again on the same node. Pool is node rather than * zone aware */ if (zone_to_nid(zone) == node) continue; node = zone_to_nid(zone); folio = dequeue_hugetlb_folio_node_exact(h, node); if (folio) return folio; } if (unlikely(read_mems_allowed_retry(cpuset_mems_cookie))) goto retry_cpuset; return NULL; } static unsigned long available_huge_pages(struct hstate *h) { return h->free_huge_pages - h->resv_huge_pages; } static struct folio *dequeue_hugetlb_folio_vma(struct hstate *h, struct vm_area_struct *vma, unsigned long address, long gbl_chg) { struct folio *folio = NULL; struct mempolicy *mpol; gfp_t gfp_mask; nodemask_t *nodemask; int nid; /* * gbl_chg==1 means the allocation requires a new page that was not * reserved before. Making sure there's at least one free page. */ if (gbl_chg && !available_huge_pages(h)) goto err; gfp_mask = htlb_alloc_mask(h); nid = huge_node(vma, address, gfp_mask, &mpol, &nodemask); if (mpol_is_preferred_many(mpol)) { folio = dequeue_hugetlb_folio_nodemask(h, gfp_mask, nid, nodemask); /* Fallback to all nodes if page==NULL */ nodemask = NULL; } if (!folio) folio = dequeue_hugetlb_folio_nodemask(h, gfp_mask, nid, nodemask); mpol_cond_put(mpol); return folio; err: return NULL; } #if defined(CONFIG_ARCH_HAS_GIGANTIC_PAGE) && defined(CONFIG_CONTIG_ALLOC) static struct folio *alloc_gigantic_frozen_folio(int order, gfp_t gfp_mask, int nid, nodemask_t *nodemask) { struct folio *folio; folio = hugetlb_cma_alloc_frozen_folio(order, gfp_mask, nid, nodemask); if (folio) return folio; if (hugetlb_cma_exclusive_alloc()) return NULL; folio = (struct folio *)alloc_contig_frozen_pages(1 << order, gfp_mask, nid, nodemask); return folio; } #else /* !CONFIG_ARCH_HAS_GIGANTIC_PAGE || !CONFIG_CONTIG_ALLOC */ static struct folio *alloc_gigantic_frozen_folio(int order, gfp_t gfp_mask, int nid, nodemask_t *nodemask) { return NULL; } #endif /* * Remove hugetlb folio from lists. * If vmemmap exists for the folio, clear the hugetlb flag so that the * folio appears as just a compound page. Otherwise, wait until after * allocating vmemmap to clear the flag. * * Must be called with hugetlb lock held. */ void remove_hugetlb_folio(struct hstate *h, struct folio *folio, bool adjust_surplus) { int nid = folio_nid(folio); VM_BUG_ON_FOLIO(hugetlb_cgroup_from_folio(folio), folio); VM_BUG_ON_FOLIO(hugetlb_cgroup_from_folio_rsvd(folio), folio); lockdep_assert_held(&hugetlb_lock); if (hstate_is_gigantic_no_runtime(h)) return; list_del(&folio->lru); if (folio_test_hugetlb_freed(folio)) { folio_clear_hugetlb_freed(folio); h->free_huge_pages--; h->free_huge_pages_node[nid]--; } if (adjust_surplus) { h->surplus_huge_pages--; h->surplus_huge_pages_node[nid]--; } /* * We can only clear the hugetlb flag after allocating vmemmap * pages. Otherwise, someone (memory error handling) may try to write * to tail struct pages. */ if (!folio_test_hugetlb_vmemmap_optimized(folio)) __folio_clear_hugetlb(folio); h->nr_huge_pages--; h->nr_huge_pages_node[nid]--; } void add_hugetlb_folio(struct hstate *h, struct folio *folio, bool adjust_surplus) { int nid = folio_nid(folio); VM_BUG_ON_FOLIO(!folio_test_hugetlb_vmemmap_optimized(folio), folio); lockdep_assert_held(&hugetlb_lock); INIT_LIST_HEAD(&folio->lru); h->nr_huge_pages++; h->nr_huge_pages_node[nid]++; if (adjust_surplus) { h->surplus_huge_pages++; h->surplus_huge_pages_node[nid]++; } __folio_set_hugetlb(folio); folio_change_private(folio, NULL); /* * We have to set hugetlb_vmemmap_optimized again as above * folio_change_private(folio, NULL) cleared it. */ folio_set_hugetlb_vmemmap_optimized(folio); arch_clear_hugetlb_flags(folio); enqueue_hugetlb_folio(h, folio); } static void __update_and_free_hugetlb_folio(struct hstate *h, struct folio *folio) { bool clear_flag = folio_test_hugetlb_vmemmap_optimized(folio); if (hstate_is_gigantic_no_runtime(h)) return; /* * If we don't know which subpages are hwpoisoned, we can't free * the hugepage, so it's leaked intentionally. */ if (folio_test_hugetlb_raw_hwp_unreliable(folio)) return; /* * If folio is not vmemmap optimized (!clear_flag), then the folio * is no longer identified as a hugetlb page. hugetlb_vmemmap_restore_folio * can only be passed hugetlb pages and will BUG otherwise. */ if (clear_flag && hugetlb_vmemmap_restore_folio(h, folio)) { spin_lock_irq(&hugetlb_lock); /* * If we cannot allocate vmemmap pages, just refuse to free the * page and put the page back on the hugetlb free list and treat * as a surplus page. */ add_hugetlb_folio(h, folio, true); spin_unlock_irq(&hugetlb_lock); return; } /* * If vmemmap pages were allocated above, then we need to clear the * hugetlb flag under the hugetlb lock. */ if (folio_test_hugetlb(folio)) { spin_lock_irq(&hugetlb_lock); __folio_clear_hugetlb(folio); spin_unlock_irq(&hugetlb_lock); } /* * Move PageHWPoison flag from head page to the raw error pages, * which makes any healthy subpages reusable. */ if (unlikely(folio_test_hwpoison(folio))) folio_clear_hugetlb_hwpoison(folio); VM_BUG_ON_FOLIO(folio_ref_count(folio), folio); if (folio_test_hugetlb_cma(folio)) hugetlb_cma_free_frozen_folio(folio); else free_frozen_pages(&folio->page, folio_order(folio)); } /* * As update_and_free_hugetlb_folio() can be called under any context, so we cannot * use GFP_KERNEL to allocate vmemmap pages. However, we can defer the * actual freeing in a workqueue to prevent from using GFP_ATOMIC to allocate * the vmemmap pages. * * free_hpage_workfn() locklessly retrieves the linked list of pages to be * freed and frees them one-by-one. As the page->mapping pointer is going * to be cleared in free_hpage_workfn() anyway, it is reused as the llist_node * structure of a lockless linked list of huge pages to be freed. */ static LLIST_HEAD(hpage_freelist); static void free_hpage_workfn(struct work_struct *work) { struct llist_node *node; node = llist_del_all(&hpage_freelist); while (node) { struct folio *folio; struct hstate *h; folio = container_of((struct address_space **)node, struct folio, mapping); node = node->next; folio->mapping = NULL; /* * The VM_BUG_ON_FOLIO(!folio_test_hugetlb(folio), folio) in * folio_hstate() is going to trigger because a previous call to * remove_hugetlb_folio() will clear the hugetlb bit, so do * not use folio_hstate() directly. */ h = size_to_hstate(folio_size(folio)); __update_and_free_hugetlb_folio(h, folio); cond_resched(); } } static DECLARE_WORK(free_hpage_work, free_hpage_workfn); static inline void flush_free_hpage_work(struct hstate *h) { if (hugetlb_vmemmap_optimizable(h)) flush_work(&free_hpage_work); } static void update_and_free_hugetlb_folio(struct hstate *h, struct folio *folio, bool atomic) { if (!folio_test_hugetlb_vmemmap_optimized(folio) || !atomic) { __update_and_free_hugetlb_folio(h, folio); return; } /* * Defer freeing to avoid using GFP_ATOMIC to allocate vmemmap pages. * * Only call schedule_work() if hpage_freelist is previously * empty. Otherwise, schedule_work() had been called but the workfn * hasn't retrieved the list yet. */ if (llist_add((struct llist_node *)&folio->mapping, &hpage_freelist)) schedule_work(&free_hpage_work); } static void bulk_vmemmap_restore_error(struct hstate *h, struct list_head *folio_list, struct list_head *non_hvo_folios) { struct folio *folio, *t_folio; if (!list_empty(non_hvo_folios)) { /* * Free any restored hugetlb pages so that restore of the * entire list can be retried. * The idea is that in the common case of ENOMEM errors freeing * hugetlb pages with vmemmap we will free up memory so that we * can allocate vmemmap for more hugetlb pages. */ list_for_each_entry_safe(folio, t_folio, non_hvo_folios, lru) { list_del(&folio->lru); spin_lock_irq(&hugetlb_lock); __folio_clear_hugetlb(folio); spin_unlock_irq(&hugetlb_lock); update_and_free_hugetlb_folio(h, folio, false); cond_resched(); } } else { /* * In the case where there are no folios which can be * immediately freed, we loop through the list trying to restore * vmemmap individually in the hope that someone elsewhere may * have done something to cause success (such as freeing some * memory). If unable to restore a hugetlb page, the hugetlb * page is made a surplus page and removed from the list. * If are able to restore vmemmap and free one hugetlb page, we * quit processing the list to retry the bulk operation. */ list_for_each_entry_safe(folio, t_folio, folio_list, lru) if (hugetlb_vmemmap_restore_folio(h, folio)) { list_del(&folio->lru); spin_lock_irq(&hugetlb_lock); add_hugetlb_folio(h, folio, true); spin_unlock_irq(&hugetlb_lock); } else { list_del(&folio->lru); spin_lock_irq(&hugetlb_lock); __folio_clear_hugetlb(folio); spin_unlock_irq(&hugetlb_lock); update_and_free_hugetlb_folio(h, folio, false); cond_resched(); break; } } } static void update_and_free_pages_bulk(struct hstate *h, struct list_head *folio_list) { long ret; struct folio *folio, *t_folio; LIST_HEAD(non_hvo_folios); /* * First allocate required vmemmmap (if necessary) for all folios. * Carefully handle errors and free up any available hugetlb pages * in an effort to make forward progress. */ retry: ret = hugetlb_vmemmap_restore_folios(h, folio_list, &non_hvo_folios); if (ret < 0) { bulk_vmemmap_restore_error(h, folio_list, &non_hvo_folios); goto retry; } /* * At this point, list should be empty, ret should be >= 0 and there * should only be pages on the non_hvo_folios list. * Do note that the non_hvo_folios list could be empty. * Without HVO enabled, ret will be 0 and there is no need to call * __folio_clear_hugetlb as this was done previously. */ VM_WARN_ON(!list_empty(folio_list)); VM_WARN_ON(ret < 0); if (!list_empty(&non_hvo_folios) && ret) { spin_lock_irq(&hugetlb_lock); list_for_each_entry(folio, &non_hvo_folios, lru) __folio_clear_hugetlb(folio); spin_unlock_irq(&hugetlb_lock); } list_for_each_entry_safe(folio, t_folio, &non_hvo_folios, lru) { update_and_free_hugetlb_folio(h, folio, false); cond_resched(); } } struct hstate *size_to_hstate(unsigned long size) { struct hstate *h; for_each_hstate(h) { if (huge_page_size(h) == size) return h; } return NULL; } void free_huge_folio(struct folio *folio) { /* * Can't pass hstate in here because it is called from the * generic mm code. */ struct hstate *h = folio_hstate(folio); int nid = folio_nid(folio); struct hugepage_subpool *spool = hugetlb_folio_subpool(folio); bool restore_reserve; unsigned long flags; VM_BUG_ON_FOLIO(folio_ref_count(folio), folio); VM_BUG_ON_FOLIO(folio_mapcount(folio), folio); hugetlb_set_folio_subpool(folio, NULL); if (folio_test_anon(folio)) __ClearPageAnonExclusive(&folio->page); folio->mapping = NULL; restore_reserve = folio_test_hugetlb_restore_reserve(folio); folio_clear_hugetlb_restore_reserve(folio); /* * If HPageRestoreReserve was set on page, page allocation consumed a * reservation. If the page was associated with a subpool, there * would have been a page reserved in the subpool before allocation * via hugepage_subpool_get_pages(). Since we are 'restoring' the * reservation, do not call hugepage_subpool_put_pages() as this will * remove the reserved page from the subpool. */ if (!restore_reserve) { /* * A return code of zero implies that the subpool will be * under its minimum size if the reservation is not restored * after page is free. Therefore, force restore_reserve * operation. */ if (hugepage_subpool_put_pages(spool, 1) == 0) restore_reserve = true; } spin_lock_irqsave(&hugetlb_lock, flags); folio_clear_hugetlb_migratable(folio); hugetlb_cgroup_uncharge_folio(hstate_index(h), pages_per_huge_page(h), folio); hugetlb_cgroup_uncharge_folio_rsvd(hstate_index(h), pages_per_huge_page(h), folio); lruvec_stat_mod_folio(folio, NR_HUGETLB, -pages_per_huge_page(h)); mem_cgroup_uncharge(folio); if (restore_reserve) h->resv_huge_pages++; if (folio_test_hugetlb_temporary(folio)) { remove_hugetlb_folio(h, folio, false); spin_unlock_irqrestore(&hugetlb_lock, flags); update_and_free_hugetlb_folio(h, folio, true); } else if (h->surplus_huge_pages_node[nid]) { /* remove the page from active list */ remove_hugetlb_folio(h, folio, true); spin_unlock_irqrestore(&hugetlb_lock, flags); update_and_free_hugetlb_folio(h, folio, true); } else { arch_clear_hugetlb_flags(folio); enqueue_hugetlb_folio(h, folio); spin_unlock_irqrestore(&hugetlb_lock, flags); } } /* * Must be called with the hugetlb lock held */ static void account_new_hugetlb_folio(struct hstate *h, struct folio *folio) { lockdep_assert_held(&hugetlb_lock); h->nr_huge_pages++; h->nr_huge_pages_node[folio_nid(folio)]++; } void init_new_hugetlb_folio(struct folio *folio) { __folio_set_hugetlb(folio); INIT_LIST_HEAD(&folio->lru); hugetlb_set_folio_subpool(folio, NULL); set_hugetlb_cgroup(folio, NULL); set_hugetlb_cgroup_rsvd(folio, NULL); } /* * Find and lock address space (mapping) in write mode. * * Upon entry, the folio is locked which means that folio_mapping() is * stable. Due to locking order, we can only trylock_write. If we can * not get the lock, simply return NULL to caller. */ struct address_space *hugetlb_folio_mapping_lock_write(struct folio *folio) { struct address_space *mapping = folio_mapping(folio); if (!mapping) return mapping; if (i_mmap_trylock_write(mapping)) return mapping; return NULL; } static struct folio *alloc_buddy_frozen_folio(int order, gfp_t gfp_mask, int nid, nodemask_t *nmask, nodemask_t *node_alloc_noretry) { struct folio *folio; bool alloc_try_hard = true; /* * By default we always try hard to allocate the folio with * __GFP_RETRY_MAYFAIL flag. However, if we are allocating folios in * a loop (to adjust global huge page counts) and previous allocation * failed, do not continue to try hard on the same node. Use the * node_alloc_noretry bitmap to manage this state information. */ if (node_alloc_noretry && node_isset(nid, *node_alloc_noretry)) alloc_try_hard = false; if (alloc_try_hard) gfp_mask |= __GFP_RETRY_MAYFAIL; folio = (struct folio *)__alloc_frozen_pages(gfp_mask, order, nid, nmask); /* * If we did not specify __GFP_RETRY_MAYFAIL, but still got a * folio this indicates an overall state change. Clear bit so * that we resume normal 'try hard' allocations. */ if (node_alloc_noretry && folio && !alloc_try_hard) node_clear(nid, *node_alloc_noretry); /* * If we tried hard to get a folio but failed, set bit so that * subsequent attempts will not try as hard until there is an * overall state change. */ if (node_alloc_noretry && !folio && alloc_try_hard) node_set(nid, *node_alloc_noretry); if (!folio) { __count_vm_event(HTLB_BUDDY_PGALLOC_FAIL); return NULL; } __count_vm_event(HTLB_BUDDY_PGALLOC); return folio; } static struct folio *only_alloc_fresh_hugetlb_folio(struct hstate *h, gfp_t gfp_mask, int nid, nodemask_t *nmask, nodemask_t *node_alloc_noretry) { struct folio *folio; int order = huge_page_order(h); if (nid == NUMA_NO_NODE) nid = numa_mem_id(); if (order_is_gigantic(order)) folio = alloc_gigantic_frozen_folio(order, gfp_mask, nid, nmask); else folio = alloc_buddy_frozen_folio(order, gfp_mask, nid, nmask, node_alloc_noretry); if (folio) init_new_hugetlb_folio(folio); return folio; } /* * Common helper to allocate a fresh hugetlb folio. All specific allocators * should use this function to get new hugetlb folio * * Note that returned folio is 'frozen': ref count of head page and all tail * pages is zero, and the accounting must be done in the caller. */ static struct folio *alloc_fresh_hugetlb_folio(struct hstate *h, gfp_t gfp_mask, int nid, nodemask_t *nmask) { struct folio *folio; folio = only_alloc_fresh_hugetlb_folio(h, gfp_mask, nid, nmask, NULL); if (folio) hugetlb_vmemmap_optimize_folio(h, folio); return folio; } void prep_and_add_allocated_folios(struct hstate *h, struct list_head *folio_list) { unsigned long flags; struct folio *folio, *tmp_f; /* Send list for bulk vmemmap optimization processing */ hugetlb_vmemmap_optimize_folios(h, folio_list); /* Add all new pool pages to free lists in one lock cycle */ spin_lock_irqsave(&hugetlb_lock, flags); list_for_each_entry_safe(folio, tmp_f, folio_list, lru) { account_new_hugetlb_folio(h, folio); enqueue_hugetlb_folio(h, folio); } spin_unlock_irqrestore(&hugetlb_lock, flags); } /* * Allocates a fresh hugetlb page in a node interleaved manner. The page * will later be added to the appropriate hugetlb pool. */ static struct folio *alloc_pool_huge_folio(struct hstate *h, nodemask_t *nodes_allowed, nodemask_t *node_alloc_noretry, int *next_node) { gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE; int nr_nodes, node; for_each_node_mask_to_alloc(next_node, nr_nodes, node, nodes_allowed) { struct folio *folio; folio = only_alloc_fresh_hugetlb_folio(h, gfp_mask, node, nodes_allowed, node_alloc_noretry); if (folio) return folio; } return NULL; } /* * Remove huge page from pool from next node to free. Attempt to keep * persistent huge pages more or less balanced over allowed nodes. * This routine only 'removes' the hugetlb page. The caller must make * an additional call to free the page to low level allocators. * Called with hugetlb_lock locked. */ static struct folio *remove_pool_hugetlb_folio(struct hstate *h, nodemask_t *nodes_allowed, bool acct_surplus) { int nr_nodes, node; struct folio *folio = NULL; lockdep_assert_held(&hugetlb_lock); for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) { /* * If we're returning unused surplus pages, only examine * nodes with surplus pages. */ if ((!acct_surplus || h->surplus_huge_pages_node[node]) && !list_empty(&h->hugepage_freelists[node])) { folio = list_entry(h->hugepage_freelists[node].next, struct folio, lru); remove_hugetlb_folio(h, folio, acct_surplus); break; } } return folio; } /* * Dissolve a given free hugetlb folio into free buddy pages. This function * does nothing for in-use hugetlb folios and non-hugetlb folios. * This function returns values like below: * * -ENOMEM: failed to allocate vmemmap pages to free the freed hugepages * when the system is under memory pressure and the feature of * freeing unused vmemmap pages associated with each hugetlb page * is enabled. * -EBUSY: failed to dissolved free hugepages or the hugepage is in-use * (allocated or reserved.) * 0: successfully dissolved free hugepages or the page is not a * hugepage (considered as already dissolved) */ int dissolve_free_hugetlb_folio(struct folio *folio) { int rc = -EBUSY; retry: /* Not to disrupt normal path by vainly holding hugetlb_lock */ if (!folio_test_hugetlb(folio)) return 0; spin_lock_irq(&hugetlb_lock); if (!folio_test_hugetlb(folio)) { rc = 0; goto out; } if (!folio_ref_count(folio)) { struct hstate *h = folio_hstate(folio); bool adjust_surplus = false; if (!available_huge_pages(h)) goto out; /* * We should make sure that the page is already on the free list * when it is dissolved. */ if (unlikely(!folio_test_hugetlb_freed(folio))) { spin_unlock_irq(&hugetlb_lock); cond_resched(); /* * Theoretically, we should return -EBUSY when we * encounter this race. In fact, we have a chance * to successfully dissolve the page if we do a * retry. Because the race window is quite small. * If we seize this opportunity, it is an optimization * for increasing the success rate of dissolving page. */ goto retry; } if (h->surplus_huge_pages_node[folio_nid(folio)]) adjust_surplus = true; remove_hugetlb_folio(h, folio, adjust_surplus); h->max_huge_pages--; spin_unlock_irq(&hugetlb_lock); /* * Normally update_and_free_hugtlb_folio will allocate required vmemmmap * before freeing the page. update_and_free_hugtlb_folio will fail to * free the page if it can not allocate required vmemmap. We * need to adjust max_huge_pages if the page is not freed. * Attempt to allocate vmemmmap here so that we can take * appropriate action on failure. * * The folio_test_hugetlb check here is because * remove_hugetlb_folio will clear hugetlb folio flag for * non-vmemmap optimized hugetlb folios. */ if (folio_test_hugetlb(folio)) { rc = hugetlb_vmemmap_restore_folio(h, folio); if (rc) { spin_lock_irq(&hugetlb_lock); add_hugetlb_folio(h, folio, adjust_surplus); h->max_huge_pages++; goto out; } } else { rc = 0; } update_and_free_hugetlb_folio(h, folio, false); return rc; } out: spin_unlock_irq(&hugetlb_lock); return rc; } /* * Dissolve free hugepages in a given pfn range. Used by memory hotplug to * make specified memory blocks removable from the system. * Note that this will dissolve a free gigantic hugepage completely, if any * part of it lies within the given range. * Also note that if dissolve_free_hugetlb_folio() returns with an error, all * free hugetlb folios that were dissolved before that error are lost. */ int dissolve_free_hugetlb_folios(unsigned long start_pfn, unsigned long end_pfn) { unsigned long pfn; struct folio *folio; int rc = 0; unsigned int order; struct hstate *h; if (!hugepages_supported()) return rc; order = huge_page_order(&default_hstate); for_each_hstate(h) order = min(order, huge_page_order(h)); for (pfn = start_pfn; pfn < end_pfn; pfn += 1 << order) { folio = pfn_folio(pfn); rc = dissolve_free_hugetlb_folio(folio); if (rc) break; } return rc; } /* * Allocates a fresh surplus page from the page allocator. */ static struct folio *alloc_surplus_hugetlb_folio(struct hstate *h, gfp_t gfp_mask, int nid, nodemask_t *nmask) { struct folio *folio = NULL; if (hstate_is_gigantic_no_runtime(h)) return NULL; spin_lock_irq(&hugetlb_lock); if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages) goto out_unlock; spin_unlock_irq(&hugetlb_lock); folio = alloc_fresh_hugetlb_folio(h, gfp_mask, nid, nmask); if (!folio) return NULL; spin_lock_irq(&hugetlb_lock); /* * nr_huge_pages needs to be adjusted within the same lock cycle * as surplus_pages, otherwise it might confuse * persistent_huge_pages() momentarily. */ account_new_hugetlb_folio(h, folio); /* * We could have raced with the pool size change. * Double check that and simply deallocate the new page * if we would end up overcommiting the surpluses. Abuse * temporary page to workaround the nasty free_huge_folio * codeflow */ if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages) { folio_set_hugetlb_temporary(folio); spin_unlock_irq(&hugetlb_lock); free_huge_folio(folio); return NULL; } h->surplus_huge_pages++; h->surplus_huge_pages_node[folio_nid(folio)]++; out_unlock: spin_unlock_irq(&hugetlb_lock); return folio; } static struct folio *alloc_migrate_hugetlb_folio(struct hstate *h, gfp_t gfp_mask, int nid, nodemask_t *nmask) { struct folio *folio; if (hstate_is_gigantic(h)) return NULL; folio = alloc_fresh_hugetlb_folio(h, gfp_mask, nid, nmask); if (!folio) return NULL; spin_lock_irq(&hugetlb_lock); account_new_hugetlb_folio(h, folio); spin_unlock_irq(&hugetlb_lock); /* fresh huge pages are frozen */ folio_ref_unfreeze(folio, 1); /* * We do not account these pages as surplus because they are only * temporary and will be released properly on the last reference */ folio_set_hugetlb_temporary(folio); return folio; } /* * Use the VMA's mpolicy to allocate a huge page from the buddy. */ static struct folio *alloc_buddy_hugetlb_folio_with_mpol(struct hstate *h, struct vm_area_struct *vma, unsigned long addr) { struct folio *folio = NULL; struct mempolicy *mpol; gfp_t gfp_mask = htlb_alloc_mask(h); int nid; nodemask_t *nodemask; nid = huge_node(vma, addr, gfp_mask, &mpol, &nodemask); if (mpol_is_preferred_many(mpol)) { gfp_t gfp = gfp_mask & ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL); folio = alloc_surplus_hugetlb_folio(h, gfp, nid, nodemask); /* Fallback to all nodes if page==NULL */ nodemask = NULL; } if (!folio) folio = alloc_surplus_hugetlb_folio(h, gfp_mask, nid, nodemask); mpol_cond_put(mpol); return folio; } struct folio *alloc_hugetlb_folio_reserve(struct hstate *h, int preferred_nid, nodemask_t *nmask, gfp_t gfp_mask) { struct folio *folio; spin_lock_irq(&hugetlb_lock); if (!h->resv_huge_pages) { spin_unlock_irq(&hugetlb_lock); return NULL; } folio = dequeue_hugetlb_folio_nodemask(h, gfp_mask, preferred_nid, nmask); if (folio) h->resv_huge_pages--; spin_unlock_irq(&hugetlb_lock); return folio; } /* folio migration callback function */ struct folio *alloc_hugetlb_folio_nodemask(struct hstate *h, int preferred_nid, nodemask_t *nmask, gfp_t gfp_mask, bool allow_alloc_fallback) { spin_lock_irq(&hugetlb_lock); if (available_huge_pages(h)) { struct folio *folio; folio = dequeue_hugetlb_folio_nodemask(h, gfp_mask, preferred_nid, nmask); if (folio) { spin_unlock_irq(&hugetlb_lock); return folio; } } spin_unlock_irq(&hugetlb_lock); /* We cannot fallback to other nodes, as we could break the per-node pool. */ if (!allow_alloc_fallback) gfp_mask |= __GFP_THISNODE; return alloc_migrate_hugetlb_folio(h, gfp_mask, preferred_nid, nmask); } static nodemask_t *policy_mbind_nodemask(gfp_t gfp) { #ifdef CONFIG_NUMA struct mempolicy *mpol = get_task_policy(current); /* * Only enforce MPOL_BIND policy which overlaps with cpuset policy * (from policy_nodemask) specifically for hugetlb case */ if (mpol->mode == MPOL_BIND && (apply_policy_zone(mpol, gfp_zone(gfp)) && cpuset_nodemask_valid_mems_allowed(&mpol->nodes))) return &mpol->nodes; #endif return NULL; } /* * Increase the hugetlb pool such that it can accommodate a reservation * of size 'delta'. */ static int gather_surplus_pages(struct hstate *h, long delta) __must_hold(&hugetlb_lock) { LIST_HEAD(surplus_list); struct folio *folio, *tmp; int ret; long i; long needed, allocated; bool alloc_ok = true; nodemask_t *mbind_nodemask, alloc_nodemask; mbind_nodemask = policy_mbind_nodemask(htlb_alloc_mask(h)); if (mbind_nodemask) nodes_and(alloc_nodemask, *mbind_nodemask, cpuset_current_mems_allowed); else alloc_nodemask = cpuset_current_mems_allowed; lockdep_assert_held(&hugetlb_lock); needed = (h->resv_huge_pages + delta) - h->free_huge_pages; if (needed <= 0) { h->resv_huge_pages += delta; return 0; } allocated = 0; ret = -ENOMEM; retry: spin_unlock_irq(&hugetlb_lock); for (i = 0; i < needed; i++) { folio = NULL; /* * It is okay to use NUMA_NO_NODE because we use numa_mem_id() * down the road to pick the current node if that is the case. */ folio = alloc_surplus_hugetlb_folio(h, htlb_alloc_mask(h), NUMA_NO_NODE, &alloc_nodemask); if (!folio) { alloc_ok = false; break; } list_add(&folio->lru, &surplus_list); cond_resched(); } allocated += i; /* * After retaking hugetlb_lock, we need to recalculate 'needed' * because either resv_huge_pages or free_huge_pages may have changed. */ spin_lock_irq(&hugetlb_lock); needed = (h->resv_huge_pages + delta) - (h->free_huge_pages + allocated); if (needed > 0) { if (alloc_ok) goto retry; /* * We were not able to allocate enough pages to * satisfy the entire reservation so we free what * we've allocated so far. */ goto free; } /* * The surplus_list now contains _at_least_ the number of extra pages * needed to accommodate the reservation. Add the appropriate number * of pages to the hugetlb pool and free the extras back to the buddy * allocator. Commit the entire reservation here to prevent another * process from stealing the pages as they are added to the pool but * before they are reserved. */ needed += allocated; h->resv_huge_pages += delta; ret = 0; /* Free the needed pages to the hugetlb pool */ list_for_each_entry_safe(folio, tmp, &surplus_list, lru) { if ((--needed) < 0) break; /* Add the page to the hugetlb allocator */ enqueue_hugetlb_folio(h, folio); } free: spin_unlock_irq(&hugetlb_lock); /* * Free unnecessary surplus pages to the buddy allocator. * Pages have no ref count, call free_huge_folio directly. */ list_for_each_entry_safe(folio, tmp, &surplus_list, lru) free_huge_folio(folio); spin_lock_irq(&hugetlb_lock); return ret; } /* * This routine has two main purposes: * 1) Decrement the reservation count (resv_huge_pages) by the value passed * in unused_resv_pages. This corresponds to the prior adjustments made * to the associated reservation map. * 2) Free any unused surplus pages that may have been allocated to satisfy * the reservation. As many as unused_resv_pages may be freed. */ static void return_unused_surplus_pages(struct hstate *h, unsigned long unused_resv_pages) { unsigned long nr_pages; LIST_HEAD(page_list); lockdep_assert_held(&hugetlb_lock); /* Uncommit the reservation */ h->resv_huge_pages -= unused_resv_pages; if (hstate_is_gigantic_no_runtime(h)) goto out; /* * Part (or even all) of the reservation could have been backed * by pre-allocated pages. Only free surplus pages. */ nr_pages = min(unused_resv_pages, h->surplus_huge_pages); /* * We want to release as many surplus pages as possible, spread * evenly across all nodes with memory. Iterate across these nodes * until we can no longer free unreserved surplus pages. This occurs * when the nodes with surplus pages have no free pages. * remove_pool_hugetlb_folio() will balance the freed pages across the * on-line nodes with memory and will handle the hstate accounting. */ while (nr_pages--) { struct folio *folio; folio = remove_pool_hugetlb_folio(h, &node_states[N_MEMORY], 1); if (!folio) goto out; list_add(&folio->lru, &page_list); } out: spin_unlock_irq(&hugetlb_lock); update_and_free_pages_bulk(h, &page_list); spin_lock_irq(&hugetlb_lock); } /* * vma_needs_reservation, vma_commit_reservation and vma_end_reservation * are used by the huge page allocation routines to manage reservations. * * vma_needs_reservation is called to determine if the huge page at addr * within the vma has an associated reservation. If a reservation is * needed, the value 1 is returned. The caller is then responsible for * managing the global reservation and subpool usage counts. After * the huge page has been allocated, vma_commit_reservation is called * to add the page to the reservation map. If the page allocation fails, * the reservation must be ended instead of committed. vma_end_reservation * is called in such cases. * * In the normal case, vma_commit_reservation returns the same value * as the preceding vma_needs_reservation call. The only time this * is not the case is if a reserve map was changed between calls. It * is the responsibility of the caller to notice the difference and * take appropriate action. * * vma_add_reservation is used in error paths where a reservation must * be restored when a newly allocated huge page must be freed. It is * to be called after calling vma_needs_reservation to determine if a * reservation exists. * * vma_del_reservation is used in error paths where an entry in the reserve * map was created during huge page allocation and must be removed. It is to * be called after calling vma_needs_reservation to determine if a reservation * exists. */ enum vma_resv_mode { VMA_NEEDS_RESV, VMA_COMMIT_RESV, VMA_END_RESV, VMA_ADD_RESV, VMA_DEL_RESV, }; static long __vma_reservation_common(struct hstate *h, struct vm_area_struct *vma, unsigned long addr, enum vma_resv_mode mode) { struct resv_map *resv; pgoff_t idx; long ret; long dummy_out_regions_needed; resv = vma_resv_map(vma); if (!resv) return 1; idx = vma_hugecache_offset(h, vma, addr); switch (mode) { case VMA_NEEDS_RESV: ret = region_chg(resv, idx, idx + 1, &dummy_out_regions_needed); /* We assume that vma_reservation_* routines always operate on * 1 page, and that adding to resv map a 1 page entry can only * ever require 1 region. */ VM_BUG_ON(dummy_out_regions_needed != 1); break; case VMA_COMMIT_RESV: ret = region_add(resv, idx, idx + 1, 1, NULL, NULL); /* region_add calls of range 1 should never fail. */ VM_BUG_ON(ret < 0); break; case VMA_END_RESV: region_abort(resv, idx, idx + 1, 1); ret = 0; break; case VMA_ADD_RESV: if (vma->vm_flags & VM_MAYSHARE) { ret = region_add(resv, idx, idx + 1, 1, NULL, NULL); /* region_add calls of range 1 should never fail. */ VM_BUG_ON(ret < 0); } else { region_abort(resv, idx, idx + 1, 1); ret = region_del(resv, idx, idx + 1); } break; case VMA_DEL_RESV: if (vma->vm_flags & VM_MAYSHARE) { region_abort(resv, idx, idx + 1, 1); ret = region_del(resv, idx, idx + 1); } else { ret = region_add(resv, idx, idx + 1, 1, NULL, NULL); /* region_add calls of range 1 should never fail. */ VM_BUG_ON(ret < 0); } break; default: BUG(); } if (vma->vm_flags & VM_MAYSHARE || mode == VMA_DEL_RESV) return ret; /* * We know private mapping must have HPAGE_RESV_OWNER set. * * In most cases, reserves always exist for private mappings. * However, a file associated with mapping could have been * hole punched or truncated after reserves were consumed. * As subsequent fault on such a range will not use reserves. * Subtle - The reserve map for private mappings has the * opposite meaning than that of shared mappings. If NO * entry is in the reserve map, it means a reservation exists. * If an entry exists in the reserve map, it means the * reservation has already been consumed. As a result, the * return value of this routine is the opposite of the * value returned from reserve map manipulation routines above. */ if (ret > 0) return 0; if (ret == 0) return 1; return ret; } static long vma_needs_reservation(struct hstate *h, struct vm_area_struct *vma, unsigned long addr) { return __vma_reservation_common(h, vma, addr, VMA_NEEDS_RESV); } static long vma_commit_reservation(struct hstate *h, struct vm_area_struct *vma, unsigned long addr) { return __vma_reservation_common(h, vma, addr, VMA_COMMIT_RESV); } static void vma_end_reservation(struct hstate *h, struct vm_area_struct *vma, unsigned long addr) { (void)__vma_reservation_common(h, vma, addr, VMA_END_RESV); } static long vma_add_reservation(struct hstate *h, struct vm_area_struct *vma, unsigned long addr) { return __vma_reservation_common(h, vma, addr, VMA_ADD_RESV); } static long vma_del_reservation(struct hstate *h, struct vm_area_struct *vma, unsigned long addr) { return __vma_reservation_common(h, vma, addr, VMA_DEL_RESV); } /* * This routine is called to restore reservation information on error paths. * It should ONLY be called for folios allocated via alloc_hugetlb_folio(), * and the hugetlb mutex should remain held when calling this routine. * * It handles two specific cases: * 1) A reservation was in place and the folio consumed the reservation. * hugetlb_restore_reserve is set in the folio. * 2) No reservation was in place for the page, so hugetlb_restore_reserve is * not set. However, alloc_hugetlb_folio always updates the reserve map. * * In case 1, free_huge_folio later in the error path will increment the * global reserve count. But, free_huge_folio does not have enough context * to adjust the reservation map. This case deals primarily with private * mappings. Adjust the reserve map here to be consistent with global * reserve count adjustments to be made by free_huge_folio. Make sure the * reserve map indicates there is a reservation present. * * In case 2, simply undo reserve map modifications done by alloc_hugetlb_folio. */ void restore_reserve_on_error(struct hstate *h, struct vm_area_struct *vma, unsigned long address, struct folio *folio) { long rc = vma_needs_reservation(h, vma, address); if (folio_test_hugetlb_restore_reserve(folio)) { if (unlikely(rc < 0)) /* * Rare out of memory condition in reserve map * manipulation. Clear hugetlb_restore_reserve so * that global reserve count will not be incremented * by free_huge_folio. This will make it appear * as though the reservation for this folio was * consumed. This may prevent the task from * faulting in the folio at a later time. This * is better than inconsistent global huge page * accounting of reserve counts. */ folio_clear_hugetlb_restore_reserve(folio); else if (rc) (void)vma_add_reservation(h, vma, address); else vma_end_reservation(h, vma, address); } else { if (!rc) { /* * This indicates there is an entry in the reserve map * not added by alloc_hugetlb_folio. We know it was added * before the alloc_hugetlb_folio call, otherwise * hugetlb_restore_reserve would be set on the folio. * Remove the entry so that a subsequent allocation * does not consume a reservation. */ rc = vma_del_reservation(h, vma, address); if (rc < 0) /* * VERY rare out of memory condition. Since * we can not delete the entry, set * hugetlb_restore_reserve so that the reserve * count will be incremented when the folio * is freed. This reserve will be consumed * on a subsequent allocation. */ folio_set_hugetlb_restore_reserve(folio); } else if (rc < 0) { /* * Rare out of memory condition from * vma_needs_reservation call. Memory allocation is * only attempted if a new entry is needed. Therefore, * this implies there is not an entry in the * reserve map. * * For shared mappings, no entry in the map indicates * no reservation. We are done. */ if (!(vma->vm_flags & VM_MAYSHARE)) /* * For private mappings, no entry indicates * a reservation is present. Since we can * not add an entry, set hugetlb_restore_reserve * on the folio so reserve count will be * incremented when freed. This reserve will * be consumed on a subsequent allocation. */ folio_set_hugetlb_restore_reserve(folio); } else { /* * No reservation present, do nothing */ vma_end_reservation(h, vma, address); } } } /* * alloc_and_dissolve_hugetlb_folio - Allocate a new folio and dissolve * the old one * @old_folio: Old folio to dissolve * @list: List to isolate the page in case we need to * Returns 0 on success, otherwise negated error. */ static int alloc_and_dissolve_hugetlb_folio(struct folio *old_folio, struct list_head *list) { gfp_t gfp_mask; struct hstate *h; int nid = folio_nid(old_folio); struct folio *new_folio = NULL; int ret = 0; retry: /* * The old_folio might have been dissolved from under our feet, so make sure * to carefully check the state under the lock. */ spin_lock_irq(&hugetlb_lock); if (!folio_test_hugetlb(old_folio)) { /* * Freed from under us. Drop new_folio too. */ goto free_new; } else if (folio_ref_count(old_folio)) { bool isolated; /* * Someone has grabbed the folio, try to isolate it here. * Fail with -EBUSY if not possible. */ spin_unlock_irq(&hugetlb_lock); isolated = folio_isolate_hugetlb(old_folio, list); ret = isolated ? 0 : -EBUSY; spin_lock_irq(&hugetlb_lock); goto free_new; } else if (!folio_test_hugetlb_freed(old_folio)) { /* * Folio's refcount is 0 but it has not been enqueued in the * freelist yet. Race window is small, so we can succeed here if * we retry. */ spin_unlock_irq(&hugetlb_lock); cond_resched(); goto retry; } else { h = folio_hstate(old_folio); if (!new_folio) { spin_unlock_irq(&hugetlb_lock); gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE; new_folio = alloc_fresh_hugetlb_folio(h, gfp_mask, nid, NULL); if (!new_folio) return -ENOMEM; goto retry; } /* * Ok, old_folio is still a genuine free hugepage. Remove it from * the freelist and decrease the counters. These will be * incremented again when calling account_new_hugetlb_folio() * and enqueue_hugetlb_folio() for new_folio. The counters will * remain stable since this happens under the lock. */ remove_hugetlb_folio(h, old_folio, false); /* * Ref count on new_folio is already zero as it was dropped * earlier. It can be directly added to the pool free list. */ account_new_hugetlb_folio(h, new_folio); enqueue_hugetlb_folio(h, new_folio); /* * Folio has been replaced, we can safely free the old one. */ spin_unlock_irq(&hugetlb_lock); update_and_free_hugetlb_folio(h, old_folio, false); } return ret; free_new: spin_unlock_irq(&hugetlb_lock); if (new_folio) update_and_free_hugetlb_folio(h, new_folio, false); return ret; } int isolate_or_dissolve_huge_folio(struct folio *folio, struct list_head *list) { int ret = -EBUSY; /* Not to disrupt normal path by vainly holding hugetlb_lock */ if (!folio_test_hugetlb(folio)) return 0; /* * Fence off gigantic pages as there is a cyclic dependency between * alloc_contig_range and them. Return -ENOMEM as this has the effect * of bailing out right away without further retrying. */ if (order_is_gigantic(folio_order(folio))) return -ENOMEM; if (folio_ref_count(folio) && folio_isolate_hugetlb(folio, list)) ret = 0; else if (!folio_ref_count(folio)) ret = alloc_and_dissolve_hugetlb_folio(folio, list); return ret; } /* * replace_free_hugepage_folios - Replace free hugepage folios in a given pfn * range with new folios. * @start_pfn: start pfn of the given pfn range * @end_pfn: end pfn of the given pfn range * Returns 0 on success, otherwise negated error. */ int replace_free_hugepage_folios(unsigned long start_pfn, unsigned long end_pfn) { unsigned long nr = 0; struct page *page; struct hstate *h; LIST_HEAD(list); int ret = 0; /* Avoid pfn iterations if no free non-gigantic huge pages */ for_each_hstate(h) { if (hstate_is_gigantic(h)) continue; nr += h->free_huge_pages; if (nr) break; } if (!nr) return 0; while (start_pfn < end_pfn) { page = pfn_to_page(start_pfn); nr = 1; if (PageHuge(page) || PageCompound(page)) { struct folio *folio = page_folio(page); nr = folio_nr_pages(folio) - folio_page_idx(folio, page); /* * Don't disrupt normal path by vainly holding * hugetlb_lock */ if (folio_test_hugetlb(folio) && !folio_ref_count(folio)) { if (order_is_gigantic(folio_order(folio))) { ret = -ENOMEM; break; } ret = alloc_and_dissolve_hugetlb_folio(folio, &list); if (ret) break; putback_movable_pages(&list); } } else if (PageBuddy(page)) { /* * Buddy order check without zone lock is unsafe and * the order is maybe invalid, but race should be * small, and the worst thing is skipping free hugetlb. */ const unsigned int order = buddy_order_unsafe(page); if (order <= MAX_PAGE_ORDER) nr = 1UL << order; } start_pfn += nr; } return ret; } void wait_for_freed_hugetlb_folios(void) { if (llist_empty(&hpage_freelist)) return; flush_work(&free_hpage_work); } typedef enum { /* * For either 0/1: we checked the per-vma resv map, and one resv * count either can be reused (0), or an extra needed (1). */ MAP_CHG_REUSE = 0, MAP_CHG_NEEDED = 1, /* * Cannot use per-vma resv count can be used, hence a new resv * count is enforced. * * NOTE: This is mostly identical to MAP_CHG_NEEDED, except * that currently vma_needs_reservation() has an unwanted side * effect to either use end() or commit() to complete the * transaction. Hence it needs to differentiate from NEEDED. */ MAP_CHG_ENFORCED = 2, } map_chg_state; /* * NOTE! "cow_from_owner" represents a very hacky usage only used in CoW * faults of hugetlb private mappings on top of a non-page-cache folio (in * which case even if there's a private vma resv map it won't cover such * allocation). New call sites should (probably) never set it to true!! * When it's set, the allocation will bypass all vma level reservations. */ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, unsigned long addr, bool cow_from_owner) { struct hugepage_subpool *spool = subpool_vma(vma); struct hstate *h = hstate_vma(vma); struct folio *folio; long retval, gbl_chg, gbl_reserve; map_chg_state map_chg; int ret, idx; struct hugetlb_cgroup *h_cg = NULL; gfp_t gfp = htlb_alloc_mask(h) | __GFP_RETRY_MAYFAIL; idx = hstate_index(h); /* Whether we need a separate per-vma reservation? */ if (cow_from_owner) { /* * Special case! Since it's a CoW on top of a reserved * page, the private resv map doesn't count. So it cannot * consume the per-vma resv map even if it's reserved. */ map_chg = MAP_CHG_ENFORCED; } else { /* * Examine the region/reserve map to determine if the process * has a reservation for the page to be allocated. A return * code of zero indicates a reservation exists (no change). */ retval = vma_needs_reservation(h, vma, addr); if (retval < 0) return ERR_PTR(-ENOMEM); map_chg = retval ? MAP_CHG_NEEDED : MAP_CHG_REUSE; } /* * Whether we need a separate global reservation? * * Processes that did not create the mapping will have no * reserves as indicated by the region/reserve map. Check * that the allocation will not exceed the subpool limit. * Or if it can get one from the pool reservation directly. */ if (map_chg) { gbl_chg = hugepage_subpool_get_pages(spool, 1); if (gbl_chg < 0) goto out_end_reservation; } else { /* * If we have the vma reservation ready, no need for extra * global reservation. */ gbl_chg = 0; } /* * If this allocation is not consuming a per-vma reservation, * charge the hugetlb cgroup now. */ if (map_chg) { ret = hugetlb_cgroup_charge_cgroup_rsvd( idx, pages_per_huge_page(h), &h_cg); if (ret) goto out_subpool_put; } ret = hugetlb_cgroup_charge_cgroup(idx, pages_per_huge_page(h), &h_cg); if (ret) goto out_uncharge_cgroup_reservation; spin_lock_irq(&hugetlb_lock); /* * glb_chg is passed to indicate whether or not a page must be taken * from the global free pool (global change). gbl_chg == 0 indicates * a reservation exists for the allocation. */ folio = dequeue_hugetlb_folio_vma(h, vma, addr, gbl_chg); if (!folio) { spin_unlock_irq(&hugetlb_lock); folio = alloc_buddy_hugetlb_folio_with_mpol(h, vma, addr); if (!folio) goto out_uncharge_cgroup; spin_lock_irq(&hugetlb_lock); list_add(&folio->lru, &h->hugepage_activelist); folio_ref_unfreeze(folio, 1); /* Fall through */ } /* * Either dequeued or buddy-allocated folio needs to add special * mark to the folio when it consumes a global reservation. */ if (!gbl_chg) { folio_set_hugetlb_restore_reserve(folio); h->resv_huge_pages--; } hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h), h_cg, folio); /* If allocation is not consuming a reservation, also store the * hugetlb_cgroup pointer on the page. */ if (map_chg) { hugetlb_cgroup_commit_charge_rsvd(idx, pages_per_huge_page(h), h_cg, folio); } spin_unlock_irq(&hugetlb_lock); hugetlb_set_folio_subpool(folio, spool); if (map_chg != MAP_CHG_ENFORCED) { /* commit() is only needed if the map_chg is not enforced */ retval = vma_commit_reservation(h, vma, addr); /* * Check for possible race conditions. When it happens.. * The page was added to the reservation map between * vma_needs_reservation and vma_commit_reservation. * This indicates a race with hugetlb_reserve_pages. * Adjust for the subpool count incremented above AND * in hugetlb_reserve_pages for the same page. Also, * the reservation count added in hugetlb_reserve_pages * no longer applies. */ if (unlikely(map_chg == MAP_CHG_NEEDED && retval == 0)) { long rsv_adjust; rsv_adjust = hugepage_subpool_put_pages(spool, 1); hugetlb_acct_memory(h, -rsv_adjust); spin_lock_irq(&hugetlb_lock); hugetlb_cgroup_uncharge_folio_rsvd( hstate_index(h), pages_per_huge_page(h), folio); spin_unlock_irq(&hugetlb_lock); } } ret = mem_cgroup_charge_hugetlb(folio, gfp); /* * Unconditionally increment NR_HUGETLB here. If it turns out that * mem_cgroup_charge_hugetlb failed, then immediately free the page and * decrement NR_HUGETLB. */ lruvec_stat_mod_folio(folio, NR_HUGETLB, pages_per_huge_page(h)); if (ret == -ENOMEM) { free_huge_folio(folio); return ERR_PTR(-ENOMEM); } return folio; out_uncharge_cgroup: hugetlb_cgroup_uncharge_cgroup(idx, pages_per_huge_page(h), h_cg); out_uncharge_cgroup_reservation: if (map_chg) hugetlb_cgroup_uncharge_cgroup_rsvd(idx, pages_per_huge_page(h), h_cg); out_subpool_put: /* * put page to subpool iff the quota of subpool's rsv_hpages is used * during hugepage_subpool_get_pages. */ if (map_chg && !gbl_chg) { gbl_reserve = hugepage_subpool_put_pages(spool, 1); hugetlb_acct_memory(h, -gbl_reserve); } out_end_reservation: if (map_chg != MAP_CHG_ENFORCED) vma_end_reservation(h, vma, addr); return ERR_PTR(-ENOSPC); } static __init void *alloc_bootmem(struct hstate *h, int nid, bool node_exact) { struct huge_bootmem_page *m; int listnode = nid; if (hugetlb_early_cma(h)) m = hugetlb_cma_alloc_bootmem(h, &listnode, node_exact); else { if (node_exact) m = memblock_alloc_exact_nid_raw(huge_page_size(h), huge_page_size(h), 0, MEMBLOCK_ALLOC_ACCESSIBLE, nid); else { m = memblock_alloc_try_nid_raw(huge_page_size(h), huge_page_size(h), 0, MEMBLOCK_ALLOC_ACCESSIBLE, nid); /* * For pre-HVO to work correctly, pages need to be on * the list for the node they were actually allocated * from. That node may be different in the case of * fallback by memblock_alloc_try_nid_raw. So, * extract the actual node first. */ if (m) listnode = early_pfn_to_nid(PHYS_PFN(virt_to_phys(m))); } if (m) { m->flags = 0; m->cma = NULL; } } if (m) { /* * Use the beginning of the huge page to store the * huge_bootmem_page struct (until gather_bootmem * puts them into the mem_map). * * Put them into a private list first because mem_map * is not up yet. */ INIT_LIST_HEAD(&m->list); list_add(&m->list, &huge_boot_pages[listnode]); m->hstate = h; } return m; } int alloc_bootmem_huge_page(struct hstate *h, int nid) __attribute__ ((weak, alias("__alloc_bootmem_huge_page"))); int __alloc_bootmem_huge_page(struct hstate *h, int nid) { struct huge_bootmem_page *m = NULL; /* initialize for clang */ int nr_nodes, node = nid; /* do node specific alloc */ if (nid != NUMA_NO_NODE) { m = alloc_bootmem(h, node, true); if (!m) return 0; goto found; } /* allocate from next node when distributing huge pages */ for_each_node_mask_to_alloc(&h->next_nid_to_alloc, nr_nodes, node, &hugetlb_bootmem_nodes) { m = alloc_bootmem(h, node, false); if (!m) return 0; goto found; } found: /* * Only initialize the head struct page in memmap_init_reserved_pages, * rest of the struct pages will be initialized by the HugeTLB * subsystem itself. * The head struct page is used to get folio information by the HugeTLB * subsystem like zone id and node id. */ memblock_reserved_mark_noinit(virt_to_phys((void *)m + PAGE_SIZE), huge_page_size(h) - PAGE_SIZE); return 1; } /* Initialize [start_page:end_page_number] tail struct pages of a hugepage */ static void __init hugetlb_folio_init_tail_vmemmap(struct folio *folio, unsigned long start_page_number, unsigned long end_page_number) { enum zone_type zone = folio_zonenum(folio); int nid = folio_nid(folio); struct page *page = folio_page(folio, start_page_number); unsigned long head_pfn = folio_pfn(folio); unsigned long pfn, end_pfn = head_pfn + end_page_number; /* * As we marked all tail pages with memblock_reserved_mark_noinit(), * we must initialize them ourselves here. */ for (pfn = head_pfn + start_page_number; pfn < end_pfn; page++, pfn++) { __init_single_page(page, pfn, zone, nid); prep_compound_tail((struct page *)folio, pfn - head_pfn); set_page_count(page, 0); } } static void __init hugetlb_folio_init_vmemmap(struct folio *folio, struct hstate *h, unsigned long nr_pages) { int ret; /* * This is an open-coded prep_compound_page() whereby we avoid * walking pages twice by initializing/preparing+freezing them in the * same go. */ __folio_clear_reserved(folio); __folio_set_head(folio); ret = folio_ref_freeze(folio, 1); VM_BUG_ON(!ret); hugetlb_folio_init_tail_vmemmap(folio, 1, nr_pages); prep_compound_head(&folio->page, huge_page_order(h)); } static bool __init hugetlb_bootmem_page_prehvo(struct huge_bootmem_page *m) { return m->flags & HUGE_BOOTMEM_HVO; } static bool __init hugetlb_bootmem_page_earlycma(struct huge_bootmem_page *m) { return m->flags & HUGE_BOOTMEM_CMA; } /* * memblock-allocated pageblocks might not have the migrate type set * if marked with the 'noinit' flag. Set it to the default (MIGRATE_MOVABLE) * here, or MIGRATE_CMA if this was a page allocated through an early CMA * reservation. * * In case of vmemmap optimized folios, the tail vmemmap pages are mapped * read-only, but that's ok - for sparse vmemmap this does not write to * the page structure. */ static void __init hugetlb_bootmem_init_migratetype(struct folio *folio, struct hstate *h) { unsigned long nr_pages = pages_per_huge_page(h), i; WARN_ON_ONCE(!pageblock_aligned(folio_pfn(folio))); for (i = 0; i < nr_pages; i += pageblock_nr_pages) { if (folio_test_hugetlb_cma(folio)) init_cma_pageblock(folio_page(folio, i)); else init_pageblock_migratetype(folio_page(folio, i), MIGRATE_MOVABLE, false); } } static void __init prep_and_add_bootmem_folios(struct hstate *h, struct list_head *folio_list) { unsigned long flags; struct folio *folio, *tmp_f; /* Send list for bulk vmemmap optimization processing */ hugetlb_vmemmap_optimize_bootmem_folios(h, folio_list); list_for_each_entry_safe(folio, tmp_f, folio_list, lru) { if (!folio_test_hugetlb_vmemmap_optimized(folio)) { /* * If HVO fails, initialize all tail struct pages * We do not worry about potential long lock hold * time as this is early in boot and there should * be no contention. */ hugetlb_folio_init_tail_vmemmap(folio, HUGETLB_VMEMMAP_RESERVE_PAGES, pages_per_huge_page(h)); } hugetlb_bootmem_init_migratetype(folio, h); /* Subdivide locks to achieve better parallel performance */ spin_lock_irqsave(&hugetlb_lock, flags); account_new_hugetlb_folio(h, folio); enqueue_hugetlb_folio(h, folio); spin_unlock_irqrestore(&hugetlb_lock, flags); } } bool __init hugetlb_bootmem_page_zones_valid(int nid, struct huge_bootmem_page *m) { unsigned long start_pfn; bool valid; if (m->flags & HUGE_BOOTMEM_ZONES_VALID) { /* * Already validated, skip check. */ return true; } if (hugetlb_bootmem_page_earlycma(m)) { valid = cma_validate_zones(m->cma); goto out; } start_pfn = virt_to_phys(m) >> PAGE_SHIFT; valid = !pfn_range_intersects_zones(nid, start_pfn, pages_per_huge_page(m->hstate)); out: if (!valid) hstate_boot_nrinvalid[hstate_index(m->hstate)]++; return valid; } /* * Free a bootmem page that was found to be invalid (intersecting with * multiple zones). * * Since it intersects with multiple zones, we can't just do a free * operation on all pages at once, but instead have to walk all * pages, freeing them one by one. */ static void __init hugetlb_bootmem_free_invalid_page(int nid, struct page *page, struct hstate *h) { unsigned long npages = pages_per_huge_page(h); unsigned long pfn; while (npages--) { pfn = page_to_pfn(page); __init_page_from_nid(pfn, nid); free_reserved_page(page); page++; } } /* * Put bootmem huge pages into the standard lists after mem_map is up. * Note: This only applies to gigantic (order > MAX_PAGE_ORDER) pages. */ static void __init gather_bootmem_prealloc_node(unsigned long nid) { LIST_HEAD(folio_list); struct huge_bootmem_page *m, *tm; struct hstate *h = NULL, *prev_h = NULL; list_for_each_entry_safe(m, tm, &huge_boot_pages[nid], list) { struct page *page = virt_to_page(m); struct folio *folio = (void *)page; h = m->hstate; if (!hugetlb_bootmem_page_zones_valid(nid, m)) { /* * Can't use this page. Initialize the * page structures if that hasn't already * been done, and give them to the page * allocator. */ hugetlb_bootmem_free_invalid_page(nid, page, h); continue; } /* * It is possible to have multiple huge page sizes (hstates) * in this list. If so, process each size separately. */ if (h != prev_h && prev_h != NULL) prep_and_add_bootmem_folios(prev_h, &folio_list); prev_h = h; VM_BUG_ON(!hstate_is_gigantic(h)); WARN_ON(folio_ref_count(folio) != 1); hugetlb_folio_init_vmemmap(folio, h, HUGETLB_VMEMMAP_RESERVE_PAGES); init_new_hugetlb_folio(folio); if (hugetlb_bootmem_page_prehvo(m)) /* * If pre-HVO was done, just set the * flag, the HVO code will then skip * this folio. */ folio_set_hugetlb_vmemmap_optimized(folio); if (hugetlb_bootmem_page_earlycma(m)) folio_set_hugetlb_cma(folio); list_add(&folio->lru, &folio_list); /* * We need to restore the 'stolen' pages to totalram_pages * in order to fix confusing memory reports from free(1) and * other side-effects, like CommitLimit going negative. * * For CMA pages, this is done in init_cma_pageblock * (via hugetlb_bootmem_init_migratetype), so skip it here. */ if (!folio_test_hugetlb_cma(folio)) adjust_managed_page_count(page, pages_per_huge_page(h)); cond_resched(); } prep_and_add_bootmem_folios(h, &folio_list); } static void __init gather_bootmem_prealloc_parallel(unsigned long start, unsigned long end, void *arg) { int nid; for (nid = start; nid < end; nid++) gather_bootmem_prealloc_node(nid); } static void __init gather_bootmem_prealloc(void) { struct padata_mt_job job = { .thread_fn = gather_bootmem_prealloc_parallel, .fn_arg = NULL, .start = 0, .size = nr_node_ids, .align = 1, .min_chunk = 1, .max_threads = num_node_state(N_MEMORY), .numa_aware = true, }; padata_do_multithreaded(&job); } static void __init hugetlb_hstate_alloc_pages_onenode(struct hstate *h, int nid) { unsigned long i; char buf[32]; LIST_HEAD(folio_list); for (i = 0; i < h->max_huge_pages_node[nid]; ++i) { if (hstate_is_gigantic(h)) { if (!alloc_bootmem_huge_page(h, nid)) break; } else { struct folio *folio; gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE; folio = only_alloc_fresh_hugetlb_folio(h, gfp_mask, nid, &node_states[N_MEMORY], NULL); if (!folio && !list_empty(&folio_list) && hugetlb_vmemmap_optimizable_size(h)) { prep_and_add_allocated_folios(h, &folio_list); INIT_LIST_HEAD(&folio_list); folio = only_alloc_fresh_hugetlb_folio(h, gfp_mask, nid, &node_states[N_MEMORY], NULL); } if (!folio) break; list_add(&folio->lru, &folio_list); } cond_resched(); } if (!list_empty(&folio_list)) prep_and_add_allocated_folios(h, &folio_list); if (i == h->max_huge_pages_node[nid]) return; string_get_size(huge_page_size(h), 1, STRING_UNITS_2, buf, 32); pr_warn("HugeTLB: allocating %u of page size %s failed node%d. Only allocated %lu hugepages.\n", h->max_huge_pages_node[nid], buf, nid, i); h->max_huge_pages -= (h->max_huge_pages_node[nid] - i); h->max_huge_pages_node[nid] = i; } static bool __init hugetlb_hstate_alloc_pages_specific_nodes(struct hstate *h) { int i; bool node_specific_alloc = false; for_each_online_node(i) { if (h->max_huge_pages_node[i] > 0) { hugetlb_hstate_alloc_pages_onenode(h, i); node_specific_alloc = true; } } return node_specific_alloc; } static void __init hugetlb_hstate_alloc_pages_errcheck(unsigned long allocated, struct hstate *h) { if (allocated < h->max_huge_pages) { char buf[32]; string_get_size(huge_page_size(h), 1, STRING_UNITS_2, buf, 32); pr_warn("HugeTLB: allocating %lu of page size %s failed. Only allocated %lu hugepages.\n", h->max_huge_pages, buf, allocated); h->max_huge_pages = allocated; } } static void __init hugetlb_pages_alloc_boot_node(unsigned long start, unsigned long end, void *arg) { struct hstate *h = (struct hstate *)arg; int i, num = end - start; nodemask_t node_alloc_noretry; LIST_HEAD(folio_list); int next_node = first_online_node; /* Bit mask controlling how hard we retry per-node allocations.*/ nodes_clear(node_alloc_noretry); for (i = 0; i < num; ++i) { struct folio *folio; if (hugetlb_vmemmap_optimizable_size(h) && (si_mem_available() == 0) && !list_empty(&folio_list)) { prep_and_add_allocated_folios(h, &folio_list); INIT_LIST_HEAD(&folio_list); } folio = alloc_pool_huge_folio(h, &node_states[N_MEMORY], &node_alloc_noretry, &next_node); if (!folio) break; list_move(&folio->lru, &folio_list); cond_resched(); } prep_and_add_allocated_folios(h, &folio_list); } static unsigned long __init hugetlb_gigantic_pages_alloc_boot(struct hstate *h) { unsigned long i; for (i = 0; i < h->max_huge_pages; ++i) { if (!alloc_bootmem_huge_page(h, NUMA_NO_NODE)) break; cond_resched(); } return i; } static unsigned long __init hugetlb_pages_alloc_boot(struct hstate *h) { struct padata_mt_job job = { .fn_arg = h, .align = 1, .numa_aware = true }; unsigned long jiffies_start; unsigned long jiffies_end; unsigned long remaining; job.thread_fn = hugetlb_pages_alloc_boot_node; /* * job.max_threads is 25% of the available cpu threads by default. * * On large servers with terabytes of memory, huge page allocation * can consume a considerably amount of time. * * Tests below show how long it takes to allocate 1 TiB of memory with 2MiB huge pages. * 2MiB huge pages. Using more threads can significantly improve allocation time. * * +-----------------------+-------+-------+-------+-------+-------+ * | threads | 8 | 16 | 32 | 64 | 128 | * +-----------------------+-------+-------+-------+-------+-------+ * | skylake 144 cpus | 44s | 22s | 16s | 19s | 20s | * | cascade lake 192 cpus | 39s | 20s | 11s | 10s | 9s | * +-----------------------+-------+-------+-------+-------+-------+ */ if (hugepage_allocation_threads == 0) { hugepage_allocation_threads = num_online_cpus() / 4; hugepage_allocation_threads = max(hugepage_allocation_threads, 1); } job.max_threads = hugepage_allocation_threads; jiffies_start = jiffies; do { remaining = h->max_huge_pages - h->nr_huge_pages; job.start = h->nr_huge_pages; job.size = remaining; job.min_chunk = remaining / hugepage_allocation_threads; padata_do_multithreaded(&job); if (h->nr_huge_pages == h->max_huge_pages) break; /* * Retry only if the vmemmap optimization might have been able to free * some memory back to the system. */ if (!hugetlb_vmemmap_optimizable(h)) break; /* Continue if progress was made in last iteration */ } while (remaining != (h->max_huge_pages - h->nr_huge_pages)); jiffies_end = jiffies; pr_info("HugeTLB: allocation took %dms with hugepage_allocation_threads=%ld\n", jiffies_to_msecs(jiffies_end - jiffies_start), hugepage_allocation_threads); return h->nr_huge_pages; } /* * NOTE: this routine is called in different contexts for gigantic and * non-gigantic pages. * - For gigantic pages, this is called early in the boot process and * pages are allocated from memblock allocated or something similar. * Gigantic pages are actually added to pools later with the routine * gather_bootmem_prealloc. * - For non-gigantic pages, this is called later in the boot process after * all of mm is up and functional. Pages are allocated from buddy and * then added to hugetlb pools. */ static void __init hugetlb_hstate_alloc_pages(struct hstate *h) { unsigned long allocated; /* * Skip gigantic hugepages allocation if early CMA * reservations are not available. */ if (hstate_is_gigantic(h) && hugetlb_cma_total_size() && !hugetlb_early_cma(h)) { pr_warn_once("HugeTLB: hugetlb_cma is enabled, skip boot time allocation\n"); return; } if (!h->max_huge_pages) return; /* do node specific alloc */ if (hugetlb_hstate_alloc_pages_specific_nodes(h)) return; /* below will do all node balanced alloc */ if (hstate_is_gigantic(h)) allocated = hugetlb_gigantic_pages_alloc_boot(h); else allocated = hugetlb_pages_alloc_boot(h); hugetlb_hstate_alloc_pages_errcheck(allocated, h); } static void __init hugetlb_init_hstates(void) { struct hstate *h, *h2; for_each_hstate(h) { /* * Always reset to first_memory_node here, even if * next_nid_to_alloc was set before - we can't * reference hugetlb_bootmem_nodes after init, and * first_memory_node is right for all further allocations. */ h->next_nid_to_alloc = first_memory_node; h->next_nid_to_free = first_memory_node; /* oversize hugepages were init'ed in early boot */ if (!hstate_is_gigantic(h)) hugetlb_hstate_alloc_pages(h); /* * Set demote order for each hstate. Note that * h->demote_order is initially 0. * - We can not demote gigantic pages if runtime freeing * is not supported, so skip this. * - If CMA allocation is possible, we can not demote * HUGETLB_PAGE_ORDER or smaller size pages. */ if (hstate_is_gigantic_no_runtime(h)) continue; if (hugetlb_cma_total_size() && h->order <= HUGETLB_PAGE_ORDER) continue; for_each_hstate(h2) { if (h2 == h) continue; if (h2->order < h->order && h2->order > h->demote_order) h->demote_order = h2->order; } } } static void __init report_hugepages(void) { struct hstate *h; unsigned long nrinvalid; for_each_hstate(h) { char buf[32]; nrinvalid = hstate_boot_nrinvalid[hstate_index(h)]; h->max_huge_pages -= nrinvalid; string_get_size(huge_page_size(h), 1, STRING_UNITS_2, buf, 32); pr_info("HugeTLB: registered %s page size, pre-allocated %ld pages\n", buf, h->nr_huge_pages); if (nrinvalid) pr_info("HugeTLB: %s page size: %lu invalid page%s discarded\n", buf, nrinvalid, str_plural(nrinvalid)); pr_info("HugeTLB: %d KiB vmemmap can be freed for a %s page\n", hugetlb_vmemmap_optimizable_size(h) / SZ_1K, buf); } } #ifdef CONFIG_HIGHMEM static void try_to_free_low(struct hstate *h, unsigned long count, nodemask_t *nodes_allowed) { int i; LIST_HEAD(page_list); lockdep_assert_held(&hugetlb_lock); if (hstate_is_gigantic(h)) return; /* * Collect pages to be freed on a list, and free after dropping lock */ for_each_node_mask(i, *nodes_allowed) { struct folio *folio, *next; struct list_head *freel = &h->hugepage_freelists[i]; list_for_each_entry_safe(folio, next, freel, lru) { if (count >= h->nr_huge_pages) goto out; if (folio_test_highmem(folio)) continue; remove_hugetlb_folio(h, folio, false); list_add(&folio->lru, &page_list); } } out: spin_unlock_irq(&hugetlb_lock); update_and_free_pages_bulk(h, &page_list); spin_lock_irq(&hugetlb_lock); } #else static inline void try_to_free_low(struct hstate *h, unsigned long count, nodemask_t *nodes_allowed) { } #endif /* * Increment or decrement surplus_huge_pages. Keep node-specific counters * balanced by operating on them in a round-robin fashion. * Returns 1 if an adjustment was made. */ static int adjust_pool_surplus(struct hstate *h, nodemask_t *nodes_allowed, int delta) { int nr_nodes, node; lockdep_assert_held(&hugetlb_lock); VM_BUG_ON(delta != -1 && delta != 1); if (delta < 0) { for_each_node_mask_to_alloc(&h->next_nid_to_alloc, nr_nodes, node, nodes_allowed) { if (h->surplus_huge_pages_node[node]) goto found; } } else { for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) { if (h->surplus_huge_pages_node[node] < h->nr_huge_pages_node[node]) goto found; } } return 0; found: h->surplus_huge_pages += delta; h->surplus_huge_pages_node[node] += delta; return 1; } #define persistent_huge_pages(h) (h->nr_huge_pages - h->surplus_huge_pages) static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid, nodemask_t *nodes_allowed) { unsigned long persistent_free_count; unsigned long min_count; unsigned long allocated; struct folio *folio; LIST_HEAD(page_list); NODEMASK_ALLOC(nodemask_t, node_alloc_noretry, GFP_KERNEL); /* * Bit mask controlling how hard we retry per-node allocations. * If we can not allocate the bit mask, do not attempt to allocate * the requested huge pages. */ if (node_alloc_noretry) nodes_clear(*node_alloc_noretry); else return -ENOMEM; /* * resize_lock mutex prevents concurrent adjustments to number of * pages in hstate via the proc/sysfs interfaces. */ mutex_lock(&h->resize_lock); flush_free_hpage_work(h); spin_lock_irq(&hugetlb_lock); /* * Check for a node specific request. * Changing node specific huge page count may require a corresponding * change to the global count. In any case, the passed node mask * (nodes_allowed) will restrict alloc/free to the specified node. */ if (nid != NUMA_NO_NODE) { unsigned long old_count = count; count += persistent_huge_pages(h) - (h->nr_huge_pages_node[nid] - h->surplus_huge_pages_node[nid]); /* * User may have specified a large count value which caused the * above calculation to overflow. In this case, they wanted * to allocate as many huge pages as possible. Set count to * largest possible value to align with their intention. */ if (count < old_count) count = ULONG_MAX; } /* * Gigantic pages runtime allocation depend on the capability for large * page range allocation. * If the system does not provide this feature, return an error when * the user tries to allocate gigantic pages but let the user free the * boottime allocated gigantic pages. */ if (hstate_is_gigantic(h) && !IS_ENABLED(CONFIG_CONTIG_ALLOC)) { if (count > persistent_huge_pages(h)) { spin_unlock_irq(&hugetlb_lock); mutex_unlock(&h->resize_lock); NODEMASK_FREE(node_alloc_noretry); return -EINVAL; } /* Fall through to decrease pool */ } /* * Increase the pool size * First take pages out of surplus state. Then make up the * remaining difference by allocating fresh huge pages. * * We might race with alloc_surplus_hugetlb_folio() here and be unable * to convert a surplus huge page to a normal huge page. That is * not critical, though, it just means the overall size of the * pool might be one hugepage larger than it needs to be, but * within all the constraints specified by the sysctls. */ while (h->surplus_huge_pages && count > persistent_huge_pages(h)) { if (!adjust_pool_surplus(h, nodes_allowed, -1)) break; } allocated = 0; while (count > (persistent_huge_pages(h) + allocated)) { /* * If this allocation races such that we no longer need the * page, free_huge_folio will handle it by freeing the page * and reducing the surplus. */ spin_unlock_irq(&hugetlb_lock); /* yield cpu to avoid soft lockup */ cond_resched(); folio = alloc_pool_huge_folio(h, nodes_allowed, node_alloc_noretry, &h->next_nid_to_alloc); if (!folio) { prep_and_add_allocated_folios(h, &page_list); spin_lock_irq(&hugetlb_lock); goto out; } list_add(&folio->lru, &page_list); allocated++; /* Bail for signals. Probably ctrl-c from user */ if (signal_pending(current)) { prep_and_add_allocated_folios(h, &page_list); spin_lock_irq(&hugetlb_lock); goto out; } spin_lock_irq(&hugetlb_lock); } /* Add allocated pages to the pool */ if (!list_empty(&page_list)) { spin_unlock_irq(&hugetlb_lock); prep_and_add_allocated_folios(h, &page_list); spin_lock_irq(&hugetlb_lock); } /* * Decrease the pool size * First return free pages to the buddy allocator (being careful * to keep enough around to satisfy reservations). Then place * pages into surplus state as needed so the pool will shrink * to the desired size as pages become free. * * By placing pages into the surplus state independent of the * overcommit value, we are allowing the surplus pool size to * exceed overcommit. There are few sane options here. Since * alloc_surplus_hugetlb_folio() is checking the global counter, * though, we'll note that we're not allowed to exceed surplus * and won't grow the pool anywhere else. Not until one of the * sysctls are changed, or the surplus pages go out of use. * * min_count is the expected number of persistent pages, we * shouldn't calculate min_count by using * resv_huge_pages + persistent_huge_pages() - free_huge_pages, * because there may exist free surplus huge pages, and this will * lead to subtracting twice. Free surplus huge pages come from HVO * failing to restore vmemmap, see comments in the callers of * hugetlb_vmemmap_restore_folio(). Thus, we should calculate * persistent free count first. */ persistent_free_count = h->free_huge_pages; if (h->free_huge_pages > persistent_huge_pages(h)) { if (h->free_huge_pages > h->surplus_huge_pages) persistent_free_count -= h->surplus_huge_pages; else persistent_free_count = 0; } min_count = h->resv_huge_pages + persistent_huge_pages(h) - persistent_free_count; min_count = max(count, min_count); try_to_free_low(h, min_count, nodes_allowed); /* * Collect pages to be removed on list without dropping lock */ while (min_count < persistent_huge_pages(h)) { folio = remove_pool_hugetlb_folio(h, nodes_allowed, 0); if (!folio) break; list_add(&folio->lru, &page_list); } /* free the pages after dropping lock */ spin_unlock_irq(&hugetlb_lock); update_and_free_pages_bulk(h, &page_list); flush_free_hpage_work(h); spin_lock_irq(&hugetlb_lock); while (count < persistent_huge_pages(h)) { if (!adjust_pool_surplus(h, nodes_allowed, 1)) break; } out: h->max_huge_pages = persistent_huge_pages(h); spin_unlock_irq(&hugetlb_lock); mutex_unlock(&h->resize_lock); NODEMASK_FREE(node_alloc_noretry); return 0; } static long demote_free_hugetlb_folios(struct hstate *src, struct hstate *dst, struct list_head *src_list) { long rc; struct folio *folio, *next; LIST_HEAD(dst_list); LIST_HEAD(ret_list); rc = hugetlb_vmemmap_restore_folios(src, src_list, &ret_list); list_splice_init(&ret_list, src_list); /* * Taking target hstate mutex synchronizes with set_max_huge_pages. * Without the mutex, pages added to target hstate could be marked * as surplus. * * Note that we already hold src->resize_lock. To prevent deadlock, * use the convention of always taking larger size hstate mutex first. */ mutex_lock(&dst->resize_lock); list_for_each_entry_safe(folio, next, src_list, lru) { int i; bool cma; if (folio_test_hugetlb_vmemmap_optimized(folio)) continue; cma = folio_test_hugetlb_cma(folio); list_del(&folio->lru); split_page_owner(&folio->page, huge_page_order(src), huge_page_order(dst)); pgalloc_tag_split(folio, huge_page_order(src), huge_page_order(dst)); for (i = 0; i < pages_per_huge_page(src); i += pages_per_huge_page(dst)) { struct page *page = folio_page(folio, i); /* Careful: see __split_huge_page_tail() */ struct folio *new_folio = (struct folio *)page; clear_compound_head(page); prep_compound_page(page, dst->order); new_folio->mapping = NULL; init_new_hugetlb_folio(new_folio); /* Copy the CMA flag so that it is freed correctly */ if (cma) folio_set_hugetlb_cma(new_folio); list_add(&new_folio->lru, &dst_list); } } prep_and_add_allocated_folios(dst, &dst_list); mutex_unlock(&dst->resize_lock); return rc; } long demote_pool_huge_page(struct hstate *src, nodemask_t *nodes_allowed, unsigned long nr_to_demote) __must_hold(&hugetlb_lock) { int nr_nodes, node; struct hstate *dst; long rc = 0; long nr_demoted = 0; lockdep_assert_held(&hugetlb_lock); /* We should never get here if no demote order */ if (!src->demote_order) { pr_warn("HugeTLB: NULL demote order passed to demote_pool_huge_page.\n"); return -EINVAL; /* internal error */ } dst = size_to_hstate(PAGE_SIZE << src->demote_order); for_each_node_mask_to_free(src, nr_nodes, node, nodes_allowed) { LIST_HEAD(list); struct folio *folio, *next; list_for_each_entry_safe(folio, next, &src->hugepage_freelists[node], lru) { if (folio_test_hwpoison(folio)) continue; remove_hugetlb_folio(src, folio, false); list_add(&folio->lru, &list); if (++nr_demoted == nr_to_demote) break; } spin_unlock_irq(&hugetlb_lock); rc = demote_free_hugetlb_folios(src, dst, &list); spin_lock_irq(&hugetlb_lock); list_for_each_entry_safe(folio, next, &list, lru) { list_del(&folio->lru); add_hugetlb_folio(src, folio, false); nr_demoted--; } if (rc < 0 || nr_demoted == nr_to_demote) break; } /* * Not absolutely necessary, but for consistency update max_huge_pages * based on pool changes for the demoted page. */ src->max_huge_pages -= nr_demoted; dst->max_huge_pages += nr_demoted << (huge_page_order(src) - huge_page_order(dst)); if (rc < 0) return rc; if (nr_demoted) return nr_demoted; /* * Only way to get here is if all pages on free lists are poisoned. * Return -EBUSY so that caller will not retry. */ return -EBUSY; } ssize_t __nr_hugepages_store_common(bool obey_mempolicy, struct hstate *h, int nid, unsigned long count, size_t len) { int err; nodemask_t nodes_allowed, *n_mask; if (hstate_is_gigantic_no_runtime(h)) return -EINVAL; if (nid == NUMA_NO_NODE) { /* * global hstate attribute */ if (!(obey_mempolicy && init_nodemask_of_mempolicy(&nodes_allowed))) n_mask = &node_states[N_MEMORY]; else n_mask = &nodes_allowed; } else { /* * Node specific request. count adjustment happens in * set_max_huge_pages() after acquiring hugetlb_lock. */ init_nodemask_of_node(&nodes_allowed, nid); n_mask = &nodes_allowed; } err = set_max_huge_pages(h, count, nid, n_mask); return err ? err : len; } static int __init hugetlb_init(void) { int i; BUILD_BUG_ON(sizeof_field(struct page, private) * BITS_PER_BYTE < __NR_HPAGEFLAGS); BUILD_BUG_ON_INVALID(HUGETLB_PAGE_ORDER > MAX_FOLIO_ORDER); if (!hugepages_supported()) { if (hugetlb_max_hstate || default_hstate_max_huge_pages) pr_warn("HugeTLB: huge pages not supported, ignoring associated command-line parameters\n"); return 0; } /* * Make sure HPAGE_SIZE (HUGETLB_PAGE_ORDER) hstate exists. Some * architectures depend on setup being done here. */ hugetlb_add_hstate(HUGETLB_PAGE_ORDER); if (!parsed_default_hugepagesz) { /* * If we did not parse a default huge page size, set * default_hstate_idx to HPAGE_SIZE hstate. And, if the * number of huge pages for this default size was implicitly * specified, set that here as well. * Note that the implicit setting will overwrite an explicit * setting. A warning will be printed in this case. */ default_hstate_idx = hstate_index(size_to_hstate(HPAGE_SIZE)); if (default_hstate_max_huge_pages) { if (default_hstate.max_huge_pages) { char buf[32]; string_get_size(huge_page_size(&default_hstate), 1, STRING_UNITS_2, buf, 32); pr_warn("HugeTLB: Ignoring hugepages=%lu associated with %s page size\n", default_hstate.max_huge_pages, buf); pr_warn("HugeTLB: Using hugepages=%lu for number of default huge pages\n", default_hstate_max_huge_pages); } default_hstate.max_huge_pages = default_hstate_max_huge_pages; for_each_online_node(i) default_hstate.max_huge_pages_node[i] = default_hugepages_in_node[i]; } } hugetlb_init_hstates(); gather_bootmem_prealloc(); report_hugepages(); hugetlb_sysfs_init(); hugetlb_cgroup_file_init(); hugetlb_sysctl_init(); #ifdef CONFIG_SMP num_fault_mutexes = roundup_pow_of_two(8 * num_possible_cpus()); #else num_fault_mutexes = 1; #endif hugetlb_fault_mutex_table = kmalloc_array(num_fault_mutexes, sizeof(struct mutex), GFP_KERNEL); BUG_ON(!hugetlb_fault_mutex_table); for (i = 0; i < num_fault_mutexes; i++) mutex_init(&hugetlb_fault_mutex_table[i]); return 0; } subsys_initcall(hugetlb_init); /* Overwritten by architectures with more huge page sizes */ bool __init __attribute((weak)) arch_hugetlb_valid_size(unsigned long size) { return size == HPAGE_SIZE; } void __init hugetlb_add_hstate(unsigned int order) { struct hstate *h; unsigned long i; if (size_to_hstate(PAGE_SIZE << order)) { return; } BUG_ON(hugetlb_max_hstate >= HUGE_MAX_HSTATE); BUG_ON(order < order_base_2(__NR_USED_SUBPAGE)); WARN_ON(order > MAX_FOLIO_ORDER); h = &hstates[hugetlb_max_hstate++]; __mutex_init(&h->resize_lock, "resize mutex", &h->resize_key); h->order = order; h->mask = ~(huge_page_size(h) - 1); for (i = 0; i < MAX_NUMNODES; ++i) INIT_LIST_HEAD(&h->hugepage_freelists[i]); INIT_LIST_HEAD(&h->hugepage_activelist); snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB", huge_page_size(h)/SZ_1K); parsed_hstate = h; } bool __init __weak hugetlb_node_alloc_supported(void) { return true; } static void __init hugepages_clear_pages_in_node(void) { if (!hugetlb_max_hstate) { default_hstate_max_huge_pages = 0; memset(default_hugepages_in_node, 0, sizeof(default_hugepages_in_node)); } else { parsed_hstate->max_huge_pages = 0; memset(parsed_hstate->max_huge_pages_node, 0, sizeof(parsed_hstate->max_huge_pages_node)); } } static __init int hugetlb_add_param(char *s, int (*setup)(char *)) { size_t len; char *p; if (hugetlb_param_index >= HUGE_MAX_CMDLINE_ARGS) return -EINVAL; len = strlen(s) + 1; if (len + hstate_cmdline_index > sizeof(hstate_cmdline_buf)) return -EINVAL; p = &hstate_cmdline_buf[hstate_cmdline_index]; memcpy(p, s, len); hstate_cmdline_index += len; hugetlb_params[hugetlb_param_index].val = p; hugetlb_params[hugetlb_param_index].setup = setup; hugetlb_param_index++; return 0; } static __init void hugetlb_parse_params(void) { int i; struct hugetlb_cmdline *hcp; for (i = 0; i < hugetlb_param_index; i++) { hcp = &hugetlb_params[i]; hcp->setup(hcp->val); } hugetlb_cma_validate_params(); } /* * hugepages command line processing * hugepages normally follows a valid hugepagsz or default_hugepagsz * specification. If not, ignore the hugepages value. hugepages can also * be the first huge page command line option in which case it implicitly * specifies the number of huge pages for the default size. */ static int __init hugepages_setup(char *s) { unsigned long *mhp; static unsigned long *last_mhp; int node = NUMA_NO_NODE; int count; unsigned long tmp; char *p = s; if (!hugepages_supported()) { pr_warn("HugeTLB: hugepages unsupported, ignoring hugepages=%s cmdline\n", s); return 0; } if (!parsed_valid_hugepagesz) { pr_warn("HugeTLB: hugepages=%s does not follow a valid hugepagesz, ignoring\n", s); parsed_valid_hugepagesz = true; return -EINVAL; } /* * !hugetlb_max_hstate means we haven't parsed a hugepagesz= parameter * yet, so this hugepages= parameter goes to the "default hstate". * Otherwise, it goes with the previously parsed hugepagesz or * default_hugepagesz. */ else if (!hugetlb_max_hstate) mhp = &default_hstate_max_huge_pages; else mhp = &parsed_hstate->max_huge_pages; if (mhp == last_mhp) { pr_warn("HugeTLB: hugepages= specified twice without interleaving hugepagesz=, ignoring hugepages=%s\n", s); return 1; } while (*p) { count = 0; if (sscanf(p, "%lu%n", &tmp, &count) != 1) goto invalid; /* Parameter is node format */ if (p[count] == ':') { if (!hugetlb_node_alloc_supported()) { pr_warn("HugeTLB: architecture can't support node specific alloc, ignoring!\n"); return 1; } if (tmp >= MAX_NUMNODES || !node_online(tmp)) goto invalid; node = array_index_nospec(tmp, MAX_NUMNODES); p += count + 1; /* Parse hugepages */ if (sscanf(p, "%lu%n", &tmp, &count) != 1) goto invalid; if (!hugetlb_max_hstate) default_hugepages_in_node[node] = tmp; else parsed_hstate->max_huge_pages_node[node] = tmp; *mhp += tmp; /* Go to parse next node*/ if (p[count] == ',') p += count + 1; else break; } else { if (p != s) goto invalid; *mhp = tmp; break; } } last_mhp = mhp; return 0; invalid: pr_warn("HugeTLB: Invalid hugepages parameter %s\n", p); hugepages_clear_pages_in_node(); return -EINVAL; } hugetlb_early_param("hugepages", hugepages_setup); /* * hugepagesz command line processing * A specific huge page size can only be specified once with hugepagesz. * hugepagesz is followed by hugepages on the command line. The global * variable 'parsed_valid_hugepagesz' is used to determine if prior * hugepagesz argument was valid. */ static int __init hugepagesz_setup(char *s) { unsigned long size; struct hstate *h; if (!hugepages_supported()) { pr_warn("HugeTLB: hugepages unsupported, ignoring hugepagesz=%s cmdline\n", s); return 0; } parsed_valid_hugepagesz = false; size = (unsigned long)memparse(s, NULL); if (!arch_hugetlb_valid_size(size)) { pr_err("HugeTLB: unsupported hugepagesz=%s\n", s); return -EINVAL; } h = size_to_hstate(size); if (h) { /* * hstate for this size already exists. This is normally * an error, but is allowed if the existing hstate is the * default hstate. More specifically, it is only allowed if * the number of huge pages for the default hstate was not * previously specified. */ if (!parsed_default_hugepagesz || h != &default_hstate || default_hstate.max_huge_pages) { pr_warn("HugeTLB: hugepagesz=%s specified twice, ignoring\n", s); return -EINVAL; } /* * No need to call hugetlb_add_hstate() as hstate already * exists. But, do set parsed_hstate so that a following * hugepages= parameter will be applied to this hstate. */ parsed_hstate = h; parsed_valid_hugepagesz = true; return 0; } hugetlb_add_hstate(ilog2(size) - PAGE_SHIFT); parsed_valid_hugepagesz = true; return 0; } hugetlb_early_param("hugepagesz", hugepagesz_setup); /* * default_hugepagesz command line input * Only one instance of default_hugepagesz allowed on command line. */ static int __init default_hugepagesz_setup(char *s) { unsigned long size; int i; if (!hugepages_supported()) { pr_warn("HugeTLB: hugepages unsupported, ignoring default_hugepagesz=%s cmdline\n", s); return 0; } parsed_valid_hugepagesz = false; if (parsed_default_hugepagesz) { pr_err("HugeTLB: default_hugepagesz previously specified, ignoring %s\n", s); return -EINVAL; } size = (unsigned long)memparse(s, NULL); if (!arch_hugetlb_valid_size(size)) { pr_err("HugeTLB: unsupported default_hugepagesz=%s\n", s); return -EINVAL; } hugetlb_add_hstate(ilog2(size) - PAGE_SHIFT); parsed_valid_hugepagesz = true; parsed_default_hugepagesz = true; default_hstate_idx = hstate_index(size_to_hstate(size)); /* * The number of default huge pages (for this size) could have been * specified as the first hugetlb parameter: hugepages=X. If so, * then default_hstate_max_huge_pages is set. If the default huge * page size is gigantic (> MAX_PAGE_ORDER), then the pages must be * allocated here from bootmem allocator. */ if (default_hstate_max_huge_pages) { default_hstate.max_huge_pages = default_hstate_max_huge_pages; /* * Since this is an early parameter, we can't check * NUMA node state yet, so loop through MAX_NUMNODES. */ for (i = 0; i < MAX_NUMNODES; i++) { if (default_hugepages_in_node[i] != 0) default_hstate.max_huge_pages_node[i] = default_hugepages_in_node[i]; } default_hstate_max_huge_pages = 0; } return 0; } hugetlb_early_param("default_hugepagesz", default_hugepagesz_setup); void __init hugetlb_bootmem_set_nodes(void) { int i, nid; unsigned long start_pfn, end_pfn; if (!nodes_empty(hugetlb_bootmem_nodes)) return; for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) { if (end_pfn > start_pfn) node_set(nid, hugetlb_bootmem_nodes); } } void __init hugetlb_bootmem_alloc(void) { struct hstate *h; int i; hugetlb_bootmem_set_nodes(); for (i = 0; i < MAX_NUMNODES; i++) INIT_LIST_HEAD(&huge_boot_pages[i]); hugetlb_parse_params(); for_each_hstate(h) { h->next_nid_to_alloc = first_online_node; if (hstate_is_gigantic(h)) hugetlb_hstate_alloc_pages(h); } } /* * hugepage_alloc_threads command line parsing. * * When set, use this specific number of threads for the boot * allocation of hugepages. */ static int __init hugepage_alloc_threads_setup(char *s) { unsigned long allocation_threads; if (kstrtoul(s, 0, &allocation_threads) != 0) return 1; if (allocation_threads == 0) return 1; hugepage_allocation_threads = allocation_threads; return 1; } __setup("hugepage_alloc_threads=", hugepage_alloc_threads_setup); static unsigned int allowed_mems_nr(struct hstate *h) { int node; unsigned int nr = 0; nodemask_t *mbind_nodemask; unsigned int *array = h->free_huge_pages_node; gfp_t gfp_mask = htlb_alloc_mask(h); mbind_nodemask = policy_mbind_nodemask(gfp_mask); for_each_node_mask(node, cpuset_current_mems_allowed) { if (!mbind_nodemask || node_isset(node, *mbind_nodemask)) nr += array[node]; } return nr; } void hugetlb_report_meminfo(struct seq_file *m) { struct hstate *h; unsigned long total = 0; if (!hugepages_supported()) return; for_each_hstate(h) { unsigned long count = h->nr_huge_pages; total += huge_page_size(h) * count; if (h == &default_hstate) seq_printf(m, "HugePages_Total: %5lu\n" "HugePages_Free: %5lu\n" "HugePages_Rsvd: %5lu\n" "HugePages_Surp: %5lu\n" "Hugepagesize: %8lu kB\n", count, h->free_huge_pages, h->resv_huge_pages, h->surplus_huge_pages, huge_page_size(h) / SZ_1K); } seq_printf(m, "Hugetlb: %8lu kB\n", total / SZ_1K); } int hugetlb_report_node_meminfo(char *buf, int len, int nid) { struct hstate *h = &default_hstate; if (!hugepages_supported()) return 0; return sysfs_emit_at(buf, len, "Node %d HugePages_Total: %5u\n" "Node %d HugePages_Free: %5u\n" "Node %d HugePages_Surp: %5u\n", nid, h->nr_huge_pages_node[nid], nid, h->free_huge_pages_node[nid], nid, h->surplus_huge_pages_node[nid]); } void hugetlb_show_meminfo_node(int nid) { struct hstate *h; if (!hugepages_supported()) return; for_each_hstate(h) printk("Node %d hugepages_total=%u hugepages_free=%u hugepages_surp=%u hugepages_size=%lukB\n", nid, h->nr_huge_pages_node[nid], h->free_huge_pages_node[nid], h->surplus_huge_pages_node[nid], huge_page_size(h) / SZ_1K); } void hugetlb_report_usage(struct seq_file *m, struct mm_struct *mm) { seq_printf(m, "HugetlbPages:\t%8lu kB\n", K(atomic_long_read(&mm->hugetlb_usage))); } /* Return the number pages of memory we physically have, in PAGE_SIZE units. */ unsigned long hugetlb_total_pages(void) { struct hstate *h; unsigned long nr_total_pages = 0; for_each_hstate(h) nr_total_pages += h->nr_huge_pages * pages_per_huge_page(h); return nr_total_pages; } static int hugetlb_acct_memory(struct hstate *h, long delta) { int ret = -ENOMEM; if (!delta) return 0; spin_lock_irq(&hugetlb_lock); /* * When cpuset is configured, it breaks the strict hugetlb page * reservation as the accounting is done on a global variable. Such * reservation is completely rubbish in the presence of cpuset because * the reservation is not checked against page availability for the * current cpuset. Application can still potentially OOM'ed by kernel * with lack of free htlb page in cpuset that the task is in. * Attempt to enforce strict accounting with cpuset is almost * impossible (or too ugly) because cpuset is too fluid that * task or memory node can be dynamically moved between cpusets. * * The change of semantics for shared hugetlb mapping with cpuset is * undesirable. However, in order to preserve some of the semantics, * we fall back to check against current free page availability as * a best attempt and hopefully to minimize the impact of changing * semantics that cpuset has. * * Apart from cpuset, we also have memory policy mechanism that * also determines from which node the kernel will allocate memory * in a NUMA system. So similar to cpuset, we also should consider * the memory policy of the current task. Similar to the description * above. */ if (delta > 0) { if (gather_surplus_pages(h, delta) < 0) goto out; if (delta > allowed_mems_nr(h)) { return_unused_surplus_pages(h, delta); goto out; } } ret = 0; if (delta < 0) return_unused_surplus_pages(h, (unsigned long) -delta); out: spin_unlock_irq(&hugetlb_lock); return ret; } static void hugetlb_vm_op_open(struct vm_area_struct *vma) { struct resv_map *resv = vma_resv_map(vma); /* * HPAGE_RESV_OWNER indicates a private mapping. * This new VMA should share its siblings reservation map if present. * The VMA will only ever have a valid reservation map pointer where * it is being copied for another still existing VMA. As that VMA * has a reference to the reservation map it cannot disappear until * after this open call completes. It is therefore safe to take a * new reference here without additional locking. */ if (resv && is_vma_resv_set(vma, HPAGE_RESV_OWNER)) { resv_map_dup_hugetlb_cgroup_uncharge_info(resv); kref_get(&resv->refs); } /* * vma_lock structure for sharable mappings is vma specific. * Clear old pointer (if copied via vm_area_dup) and allocate * new structure. Before clearing, make sure vma_lock is not * for this vma. */ if (vma->vm_flags & VM_MAYSHARE) { struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; if (vma_lock) { if (vma_lock->vma != vma) { vma->vm_private_data = NULL; hugetlb_vma_lock_alloc(vma); } else { pr_warn("HugeTLB: vma_lock already exists in %s.\n", __func__); } } else { hugetlb_vma_lock_alloc(vma); } } } static void hugetlb_vm_op_close(struct vm_area_struct *vma) { struct hstate *h = hstate_vma(vma); struct resv_map *resv; struct hugepage_subpool *spool = subpool_vma(vma); unsigned long reserve, start, end; long gbl_reserve; hugetlb_vma_lock_free(vma); resv = vma_resv_map(vma); if (!resv || !is_vma_resv_set(vma, HPAGE_RESV_OWNER)) return; start = vma_hugecache_offset(h, vma, vma->vm_start); end = vma_hugecache_offset(h, vma, vma->vm_end); reserve = (end - start) - region_count(resv, start, end); hugetlb_cgroup_uncharge_counter(resv, start, end); if (reserve) { /* * Decrement reserve counts. The global reserve count may be * adjusted if the subpool has a minimum size. */ gbl_reserve = hugepage_subpool_put_pages(spool, reserve); hugetlb_acct_memory(h, -gbl_reserve); } kref_put(&resv->refs, resv_map_release); } static int hugetlb_vm_op_split(struct vm_area_struct *vma, unsigned long addr) { if (addr & ~(huge_page_mask(hstate_vma(vma)))) return -EINVAL; return 0; } void hugetlb_split(struct vm_area_struct *vma, unsigned long addr) { /* * PMD sharing is only possible for PUD_SIZE-aligned address ranges * in HugeTLB VMAs. If we will lose PUD_SIZE alignment due to this * split, unshare PMDs in the PUD_SIZE interval surrounding addr now. * This function is called in the middle of a VMA split operation, with * MM, VMA and rmap all write-locked to prevent concurrent page table * walks (except hardware and gup_fast()). */ vma_assert_write_locked(vma); i_mmap_assert_write_locked(vma->vm_file->f_mapping); if (addr & ~PUD_MASK) { unsigned long floor = addr & PUD_MASK; unsigned long ceil = floor + PUD_SIZE; if (floor >= vma->vm_start && ceil <= vma->vm_end) { /* * Locking: * Use take_locks=false here. * The file rmap lock is already held. * The hugetlb VMA lock can't be taken when we already * hold the file rmap lock, and we don't need it because * its purpose is to synchronize against concurrent page * table walks, which are not possible thanks to the * locks held by our caller. */ hugetlb_unshare_pmds(vma, floor, ceil, /* take_locks = */ false); } } } static unsigned long hugetlb_vm_op_pagesize(struct vm_area_struct *vma) { return huge_page_size(hstate_vma(vma)); } /* * We cannot handle pagefaults against hugetlb pages at all. They cause * handle_mm_fault() to try to instantiate regular-sized pages in the * hugepage VMA. do_page_fault() is supposed to trap this, so BUG is we get * this far. */ static vm_fault_t hugetlb_vm_op_fault(struct vm_fault *vmf) { BUG(); return 0; } /* * When a new function is introduced to vm_operations_struct and added * to hugetlb_vm_ops, please consider adding the function to shm_vm_ops. * This is because under System V memory model, mappings created via * shmget/shmat with "huge page" specified are backed by hugetlbfs files, * their original vm_ops are overwritten with shm_vm_ops. */ const struct vm_operations_struct hugetlb_vm_ops = { .fault = hugetlb_vm_op_fault, .open = hugetlb_vm_op_open, .close = hugetlb_vm_op_close, .may_split = hugetlb_vm_op_split, .pagesize = hugetlb_vm_op_pagesize, }; static pte_t make_huge_pte(struct vm_area_struct *vma, struct folio *folio, bool try_mkwrite) { pte_t entry = folio_mk_pte(folio, vma->vm_page_prot); unsigned int shift = huge_page_shift(hstate_vma(vma)); if (try_mkwrite && (vma->vm_flags & VM_WRITE)) { entry = pte_mkwrite_novma(pte_mkdirty(entry)); } else { entry = pte_wrprotect(entry); } entry = pte_mkyoung(entry); entry = arch_make_huge_pte(entry, shift, vma->vm_flags); return entry; } static void set_huge_ptep_writable(struct vm_area_struct *vma, unsigned long address, pte_t *ptep) { pte_t entry; entry = huge_pte_mkwrite(huge_pte_mkdirty(huge_ptep_get(vma->vm_mm, address, ptep))); if (huge_ptep_set_access_flags(vma, address, ptep, entry, 1)) update_mmu_cache(vma, address, ptep); } static void set_huge_ptep_maybe_writable(struct vm_area_struct *vma, unsigned long address, pte_t *ptep) { if (vma->vm_flags & VM_WRITE) set_huge_ptep_writable(vma, address, ptep); } static void hugetlb_install_folio(struct vm_area_struct *vma, pte_t *ptep, unsigned long addr, struct folio *new_folio, pte_t old, unsigned long sz) { pte_t newpte = make_huge_pte(vma, new_folio, true); __folio_mark_uptodate(new_folio); hugetlb_add_new_anon_rmap(new_folio, vma, addr); if (userfaultfd_wp(vma) && huge_pte_uffd_wp(old)) newpte = huge_pte_mkuffd_wp(newpte); set_huge_pte_at(vma->vm_mm, addr, ptep, newpte, sz); hugetlb_count_add(pages_per_huge_page(hstate_vma(vma)), vma->vm_mm); folio_set_hugetlb_migratable(new_folio); } int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma) { pte_t *src_pte, *dst_pte, entry; struct folio *pte_folio; unsigned long addr; bool cow = is_cow_mapping(src_vma->vm_flags); struct hstate *h = hstate_vma(src_vma); unsigned long sz = huge_page_size(h); unsigned long npages = pages_per_huge_page(h); struct mmu_notifier_range range; unsigned long last_addr_mask; softleaf_t softleaf; int ret = 0; if (cow) { mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, src, src_vma->vm_start, src_vma->vm_end); mmu_notifier_invalidate_range_start(&range); vma_assert_write_locked(src_vma); raw_write_seqcount_begin(&src->write_protect_seq); } else { /* * For shared mappings the vma lock must be held before * calling hugetlb_walk() in the src vma. Otherwise, the * returned ptep could go away if part of a shared pmd and * another thread calls huge_pmd_unshare. */ hugetlb_vma_lock_read(src_vma); } last_addr_mask = hugetlb_mask_last_page(h); for (addr = src_vma->vm_start; addr < src_vma->vm_end; addr += sz) { spinlock_t *src_ptl, *dst_ptl; src_pte = hugetlb_walk(src_vma, addr, sz); if (!src_pte) { addr |= last_addr_mask; continue; } dst_pte = huge_pte_alloc(dst, dst_vma, addr, sz); if (!dst_pte) { ret = -ENOMEM; break; } #ifdef CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING /* If the pagetables are shared, there is nothing to do */ if (ptdesc_pmd_is_shared(virt_to_ptdesc(dst_pte))) { addr |= last_addr_mask; continue; } #endif dst_ptl = huge_pte_lock(h, dst, dst_pte); src_ptl = huge_pte_lockptr(h, src, src_pte); spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); entry = huge_ptep_get(src_vma->vm_mm, addr, src_pte); again: if (huge_pte_none(entry)) { /* Skip if src entry none. */ goto next; } softleaf = softleaf_from_pte(entry); if (unlikely(softleaf_is_hwpoison(softleaf))) { if (!userfaultfd_wp(dst_vma)) entry = huge_pte_clear_uffd_wp(entry); set_huge_pte_at(dst, addr, dst_pte, entry, sz); } else if (unlikely(softleaf_is_migration(softleaf))) { bool uffd_wp = pte_swp_uffd_wp(entry); if (!softleaf_is_migration_read(softleaf) && cow) { /* * COW mappings require pages in both * parent and child to be set to read. */ softleaf = make_readable_migration_entry( swp_offset(softleaf)); entry = swp_entry_to_pte(softleaf); if (userfaultfd_wp(src_vma) && uffd_wp) entry = pte_swp_mkuffd_wp(entry); set_huge_pte_at(src, addr, src_pte, entry, sz); } if (!userfaultfd_wp(dst_vma)) entry = huge_pte_clear_uffd_wp(entry); set_huge_pte_at(dst, addr, dst_pte, entry, sz); } else if (unlikely(pte_is_marker(entry))) { const pte_marker marker = copy_pte_marker(softleaf, dst_vma); if (marker) set_huge_pte_at(dst, addr, dst_pte, make_pte_marker(marker), sz); } else { entry = huge_ptep_get(src_vma->vm_mm, addr, src_pte); pte_folio = page_folio(pte_page(entry)); folio_get(pte_folio); /* * Failing to duplicate the anon rmap is a rare case * where we see pinned hugetlb pages while they're * prone to COW. We need to do the COW earlier during * fork. * * When pre-allocating the page or copying data, we * need to be without the pgtable locks since we could * sleep during the process. */ if (!folio_test_anon(pte_folio)) { hugetlb_add_file_rmap(pte_folio); } else if (hugetlb_try_dup_anon_rmap(pte_folio, src_vma)) { pte_t src_pte_old = entry; struct folio *new_folio; spin_unlock(src_ptl); spin_unlock(dst_ptl); /* Do not use reserve as it's private owned */ new_folio = alloc_hugetlb_folio(dst_vma, addr, false); if (IS_ERR(new_folio)) { folio_put(pte_folio); ret = PTR_ERR(new_folio); break; } ret = copy_user_large_folio(new_folio, pte_folio, addr, dst_vma); folio_put(pte_folio); if (ret) { folio_put(new_folio); break; } /* Install the new hugetlb folio if src pte stable */ dst_ptl = huge_pte_lock(h, dst, dst_pte); src_ptl = huge_pte_lockptr(h, src, src_pte); spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); entry = huge_ptep_get(src_vma->vm_mm, addr, src_pte); if (!pte_same(src_pte_old, entry)) { restore_reserve_on_error(h, dst_vma, addr, new_folio); folio_put(new_folio); /* huge_ptep of dst_pte won't change as in child */ goto again; } hugetlb_install_folio(dst_vma, dst_pte, addr, new_folio, src_pte_old, sz); goto next; } if (cow) { /* * No need to notify as we are downgrading page * table protection not changing it to point * to a new page. * * See Documentation/mm/mmu_notifier.rst */ huge_ptep_set_wrprotect(src, addr, src_pte); entry = huge_pte_wrprotect(entry); } if (!userfaultfd_wp(dst_vma)) entry = huge_pte_clear_uffd_wp(entry); set_huge_pte_at(dst, addr, dst_pte, entry, sz); hugetlb_count_add(npages, dst); } next: spin_unlock(src_ptl); spin_unlock(dst_ptl); } if (cow) { raw_write_seqcount_end(&src->write_protect_seq); mmu_notifier_invalidate_range_end(&range); } else { hugetlb_vma_unlock_read(src_vma); } return ret; } static void move_huge_pte(struct vm_area_struct *vma, unsigned long old_addr, unsigned long new_addr, pte_t *src_pte, pte_t *dst_pte, unsigned long sz) { bool need_clear_uffd_wp = vma_has_uffd_without_event_remap(vma); struct hstate *h = hstate_vma(vma); struct mm_struct *mm = vma->vm_mm; spinlock_t *src_ptl, *dst_ptl; pte_t pte; dst_ptl = huge_pte_lock(h, mm, dst_pte); src_ptl = huge_pte_lockptr(h, mm, src_pte); /* * We don't have to worry about the ordering of src and dst ptlocks * because exclusive mmap_lock (or the i_mmap_lock) prevents deadlock. */ if (src_ptl != dst_ptl) spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); pte = huge_ptep_get_and_clear(mm, old_addr, src_pte, sz); if (need_clear_uffd_wp && pte_is_uffd_wp_marker(pte)) { huge_pte_clear(mm, new_addr, dst_pte, sz); } else { if (need_clear_uffd_wp) { if (pte_present(pte)) pte = huge_pte_clear_uffd_wp(pte); else pte = pte_swp_clear_uffd_wp(pte); } set_huge_pte_at(mm, new_addr, dst_pte, pte, sz); } if (src_ptl != dst_ptl) spin_unlock(src_ptl); spin_unlock(dst_ptl); } int move_hugetlb_page_tables(struct vm_area_struct *vma, struct vm_area_struct *new_vma, unsigned long old_addr, unsigned long new_addr, unsigned long len) { struct hstate *h = hstate_vma(vma); struct address_space *mapping = vma->vm_file->f_mapping; unsigned long sz = huge_page_size(h); struct mm_struct *mm = vma->vm_mm; unsigned long old_end = old_addr + len; unsigned long last_addr_mask; pte_t *src_pte, *dst_pte; struct mmu_notifier_range range; struct mmu_gather tlb; mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, old_addr, old_end); adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end); /* * In case of shared PMDs, we should cover the maximum possible * range. */ flush_cache_range(vma, range.start, range.end); tlb_gather_mmu_vma(&tlb, vma); mmu_notifier_invalidate_range_start(&range); last_addr_mask = hugetlb_mask_last_page(h); /* Prevent race with file truncation */ hugetlb_vma_lock_write(vma); i_mmap_lock_write(mapping); for (; old_addr < old_end; old_addr += sz, new_addr += sz) { src_pte = hugetlb_walk(vma, old_addr, sz); if (!src_pte) { old_addr |= last_addr_mask; new_addr |= last_addr_mask; continue; } if (huge_pte_none(huge_ptep_get(mm, old_addr, src_pte))) continue; if (huge_pmd_unshare(&tlb, vma, old_addr, src_pte)) { old_addr |= last_addr_mask; new_addr |= last_addr_mask; continue; } dst_pte = huge_pte_alloc(mm, new_vma, new_addr, sz); if (!dst_pte) break; move_huge_pte(vma, old_addr, new_addr, src_pte, dst_pte, sz); tlb_remove_huge_tlb_entry(h, &tlb, src_pte, old_addr); } tlb_flush_mmu_tlbonly(&tlb); huge_pmd_unshare_flush(&tlb, vma); mmu_notifier_invalidate_range_end(&range); i_mmap_unlock_write(mapping); hugetlb_vma_unlock_write(vma); tlb_finish_mmu(&tlb); return len + old_addr - old_end; } void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long start, unsigned long end, struct folio *folio, zap_flags_t zap_flags) { struct mm_struct *mm = vma->vm_mm; const bool folio_provided = !!folio; unsigned long address; pte_t *ptep; pte_t pte; spinlock_t *ptl; struct hstate *h = hstate_vma(vma); unsigned long sz = huge_page_size(h); bool adjust_reservation; unsigned long last_addr_mask; WARN_ON(!is_vm_hugetlb_page(vma)); BUG_ON(start & ~huge_page_mask(h)); BUG_ON(end & ~huge_page_mask(h)); /* * This is a hugetlb vma, all the pte entries should point * to huge page. */ tlb_change_page_size(tlb, sz); tlb_start_vma(tlb, vma); last_addr_mask = hugetlb_mask_last_page(h); address = start; for (; address < end; address += sz) { ptep = hugetlb_walk(vma, address, sz); if (!ptep) { address |= last_addr_mask; continue; } ptl = huge_pte_lock(h, mm, ptep); if (huge_pmd_unshare(tlb, vma, address, ptep)) { spin_unlock(ptl); address |= last_addr_mask; continue; } pte = huge_ptep_get(mm, address, ptep); if (huge_pte_none(pte)) { spin_unlock(ptl); continue; } /* * Migrating hugepage or HWPoisoned hugepage is already * unmapped and its refcount is dropped, so just clear pte here. */ if (unlikely(!pte_present(pte))) { /* * If the pte was wr-protected by uffd-wp in any of the * swap forms, meanwhile the caller does not want to * drop the uffd-wp bit in this zap, then replace the * pte with a marker. */ if (pte_swp_uffd_wp_any(pte) && !(zap_flags & ZAP_FLAG_DROP_MARKER)) set_huge_pte_at(mm, address, ptep, make_pte_marker(PTE_MARKER_UFFD_WP), sz); else huge_pte_clear(mm, address, ptep, sz); spin_unlock(ptl); continue; } /* * If a folio is supplied, it is because a specific * folio is being unmapped, not a range. Ensure the folio we * are about to unmap is the actual folio of interest. */ if (folio_provided) { if (folio != page_folio(pte_page(pte))) { spin_unlock(ptl); continue; } /* * Mark the VMA as having unmapped its page so that * future faults in this VMA will fail rather than * looking like data was lost */ set_vma_resv_flags(vma, HPAGE_RESV_UNMAPPED); } else { folio = page_folio(pte_page(pte)); } pte = huge_ptep_get_and_clear(mm, address, ptep, sz); tlb_remove_huge_tlb_entry(h, tlb, ptep, address); if (huge_pte_dirty(pte)) folio_mark_dirty(folio); /* Leave a uffd-wp pte marker if needed */ if (huge_pte_uffd_wp(pte) && !(zap_flags & ZAP_FLAG_DROP_MARKER)) set_huge_pte_at(mm, address, ptep, make_pte_marker(PTE_MARKER_UFFD_WP), sz); hugetlb_count_sub(pages_per_huge_page(h), mm); hugetlb_remove_rmap(folio); spin_unlock(ptl); /* * Restore the reservation for anonymous page, otherwise the * backing page could be stolen by someone. * If there we are freeing a surplus, do not set the restore * reservation bit. */ adjust_reservation = false; spin_lock_irq(&hugetlb_lock); if (!h->surplus_huge_pages && __vma_private_lock(vma) && folio_test_anon(folio)) { folio_set_hugetlb_restore_reserve(folio); /* Reservation to be adjusted after the spin lock */ adjust_reservation = true; } spin_unlock_irq(&hugetlb_lock); /* * Adjust the reservation for the region that will have the * reserve restored. Keep in mind that vma_needs_reservation() changes * resv->adds_in_progress if it succeeds. If this is not done, * do_exit() will not see it, and will keep the reservation * forever. */ if (adjust_reservation) { int rc = vma_needs_reservation(h, vma, address); if (rc < 0) /* Pressumably allocate_file_region_entries failed * to allocate a file_region struct. Clear * hugetlb_restore_reserve so that global reserve * count will not be incremented by free_huge_folio. * Act as if we consumed the reservation. */ folio_clear_hugetlb_restore_reserve(folio); else if (rc) vma_add_reservation(h, vma, address); } tlb_remove_page_size(tlb, folio_page(folio, 0), folio_size(folio)); /* * If we were instructed to unmap a specific folio, we're done. */ if (folio_provided) break; } tlb_end_vma(tlb, vma); huge_pmd_unshare_flush(tlb, vma); } void __hugetlb_zap_begin(struct vm_area_struct *vma, unsigned long *start, unsigned long *end) { if (!vma->vm_file) /* hugetlbfs_file_mmap error */ return; adjust_range_if_pmd_sharing_possible(vma, start, end); hugetlb_vma_lock_write(vma); if (vma->vm_file) i_mmap_lock_write(vma->vm_file->f_mapping); } void __hugetlb_zap_end(struct vm_area_struct *vma, struct zap_details *details) { zap_flags_t zap_flags = details ? details->zap_flags : 0; if (!vma->vm_file) /* hugetlbfs_file_mmap error */ return; if (zap_flags & ZAP_FLAG_UNMAP) { /* final unmap */ /* * Unlock and free the vma lock before releasing i_mmap_rwsem. * When the vma_lock is freed, this makes the vma ineligible * for pmd sharing. And, i_mmap_rwsem is required to set up * pmd sharing. This is important as page tables for this * unmapped range will be asynchrously deleted. If the page * tables are shared, there will be issues when accessed by * someone else. */ __hugetlb_vma_unlock_write_free(vma); } else { hugetlb_vma_unlock_write(vma); } if (vma->vm_file) i_mmap_unlock_write(vma->vm_file->f_mapping); } void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, unsigned long end, struct folio *folio, zap_flags_t zap_flags) { struct mmu_notifier_range range; struct mmu_gather tlb; mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm, start, end); adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end); mmu_notifier_invalidate_range_start(&range); tlb_gather_mmu(&tlb, vma->vm_mm); __unmap_hugepage_range(&tlb, vma, start, end, folio, zap_flags); mmu_notifier_invalidate_range_end(&range); tlb_finish_mmu(&tlb); } /* * This is called when the original mapper is failing to COW a MAP_PRIVATE * mapping it owns the reserve page for. The intention is to unmap the page * from other VMAs and let the children be SIGKILLed if they are faulting the * same region. */ static void unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, struct folio *folio, unsigned long address) { struct hstate *h = hstate_vma(vma); struct vm_area_struct *iter_vma; struct address_space *mapping; pgoff_t pgoff; /* * vm_pgoff is in PAGE_SIZE units, hence the different calculation * from page cache lookup which is in HPAGE_SIZE units. */ address = address & huge_page_mask(h); pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; mapping = vma->vm_file->f_mapping; /* * Take the mapping lock for the duration of the table walk. As * this mapping should be shared between all the VMAs, * __unmap_hugepage_range() is called as the lock is already held */ i_mmap_lock_write(mapping); vma_interval_tree_foreach(iter_vma, &mapping->i_mmap, pgoff, pgoff) { /* Do not unmap the current VMA */ if (iter_vma == vma) continue; /* * Shared VMAs have their own reserves and do not affect * MAP_PRIVATE accounting but it is possible that a shared * VMA is using the same page so check and skip such VMAs. */ if (iter_vma->vm_flags & VM_MAYSHARE) continue; /* * Unmap the page from other VMAs without their own reserves. * They get marked to be SIGKILLed if they fault in these * areas. This is because a future no-page fault on this VMA * could insert a zeroed page instead of the data existing * from the time of fork. This would look like data corruption */ if (!is_vma_resv_set(iter_vma, HPAGE_RESV_OWNER)) unmap_hugepage_range(iter_vma, address, address + huge_page_size(h), folio, 0); } i_mmap_unlock_write(mapping); } /* * hugetlb_wp() should be called with page lock of the original hugepage held. * Called with hugetlb_fault_mutex_table held and pte_page locked so we * cannot race with other handlers or page migration. * Keep the pte_same checks anyway to make transition from the mutex easier. */ static vm_fault_t hugetlb_wp(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; struct mm_struct *mm = vma->vm_mm; const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE; pte_t pte = huge_ptep_get(mm, vmf->address, vmf->pte); struct hstate *h = hstate_vma(vma); struct folio *old_folio; struct folio *new_folio; bool cow_from_owner = 0; vm_fault_t ret = 0; struct mmu_notifier_range range; /* * Never handle CoW for uffd-wp protected pages. It should be only * handled when the uffd-wp protection is removed. * * Note that only the CoW optimization path (in hugetlb_no_page()) * can trigger this, because hugetlb_fault() will always resolve * uffd-wp bit first. */ if (!unshare && huge_pte_uffd_wp(pte)) return 0; /* Let's take out MAP_SHARED mappings first. */ if (vma->vm_flags & VM_MAYSHARE) { set_huge_ptep_writable(vma, vmf->address, vmf->pte); return 0; } old_folio = page_folio(pte_page(pte)); delayacct_wpcopy_start(); retry_avoidcopy: /* * If no-one else is actually using this page, we're the exclusive * owner and can reuse this page. * * Note that we don't rely on the (safer) folio refcount here, because * copying the hugetlb folio when there are unexpected (temporary) * folio references could harm simple fork()+exit() users when * we run out of free hugetlb folios: we would have to kill processes * in scenarios that used to work. As a side effect, there can still * be leaks between processes, for example, with FOLL_GET users. */ if (folio_mapcount(old_folio) == 1 && folio_test_anon(old_folio)) { if (!PageAnonExclusive(&old_folio->page)) { folio_move_anon_rmap(old_folio, vma); SetPageAnonExclusive(&old_folio->page); } if (likely(!unshare)) set_huge_ptep_maybe_writable(vma, vmf->address, vmf->pte); delayacct_wpcopy_end(); return 0; } VM_BUG_ON_PAGE(folio_test_anon(old_folio) && PageAnonExclusive(&old_folio->page), &old_folio->page); /* * If the process that created a MAP_PRIVATE mapping is about to perform * a COW due to a shared page count, attempt to satisfy the allocation * without using the existing reserves. * In order to determine where this is a COW on a MAP_PRIVATE mapping it * is enough to check whether the old_folio is anonymous. This means that * the reserve for this address was consumed. If reserves were used, a * partial faulted mapping at the fime of fork() could consume its reserves * on COW instead of the full address range. */ if (is_vma_resv_set(vma, HPAGE_RESV_OWNER) && folio_test_anon(old_folio)) cow_from_owner = true; folio_get(old_folio); /* * Drop page table lock as buddy allocator may be called. It will * be acquired again before returning to the caller, as expected. */ spin_unlock(vmf->ptl); new_folio = alloc_hugetlb_folio(vma, vmf->address, cow_from_owner); if (IS_ERR(new_folio)) { /* * If a process owning a MAP_PRIVATE mapping fails to COW, * it is due to references held by a child and an insufficient * huge page pool. To guarantee the original mappers * reliability, unmap the page from child processes. The child * may get SIGKILLed if it later faults. */ if (cow_from_owner) { struct address_space *mapping = vma->vm_file->f_mapping; pgoff_t idx; u32 hash; folio_put(old_folio); /* * Drop hugetlb_fault_mutex and vma_lock before * unmapping. unmapping needs to hold vma_lock * in write mode. Dropping vma_lock in read mode * here is OK as COW mappings do not interact with * PMD sharing. * * Reacquire both after unmap operation. */ idx = vma_hugecache_offset(h, vma, vmf->address); hash = hugetlb_fault_mutex_hash(mapping, idx); hugetlb_vma_unlock_read(vma); mutex_unlock(&hugetlb_fault_mutex_table[hash]); unmap_ref_private(mm, vma, old_folio, vmf->address); mutex_lock(&hugetlb_fault_mutex_table[hash]); hugetlb_vma_lock_read(vma); spin_lock(vmf->ptl); vmf->pte = hugetlb_walk(vma, vmf->address, huge_page_size(h)); if (likely(vmf->pte && pte_same(huge_ptep_get(mm, vmf->address, vmf->pte), pte))) goto retry_avoidcopy; /* * race occurs while re-acquiring page table * lock, and our job is done. */ delayacct_wpcopy_end(); return 0; } ret = vmf_error(PTR_ERR(new_folio)); goto out_release_old; } /* * When the original hugepage is shared one, it does not have * anon_vma prepared. */ ret = __vmf_anon_prepare(vmf); if (unlikely(ret)) goto out_release_all; if (copy_user_large_folio(new_folio, old_folio, vmf->real_address, vma)) { ret = VM_FAULT_HWPOISON_LARGE | VM_FAULT_SET_HINDEX(hstate_index(h)); goto out_release_all; } __folio_mark_uptodate(new_folio); mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, vmf->address, vmf->address + huge_page_size(h)); mmu_notifier_invalidate_range_start(&range); /* * Retake the page table lock to check for racing updates * before the page tables are altered */ spin_lock(vmf->ptl); vmf->pte = hugetlb_walk(vma, vmf->address, huge_page_size(h)); if (likely(vmf->pte && pte_same(huge_ptep_get(mm, vmf->address, vmf->pte), pte))) { pte_t newpte = make_huge_pte(vma, new_folio, !unshare); /* Break COW or unshare */ huge_ptep_clear_flush(vma, vmf->address, vmf->pte); hugetlb_remove_rmap(old_folio); hugetlb_add_new_anon_rmap(new_folio, vma, vmf->address); if (huge_pte_uffd_wp(pte)) newpte = huge_pte_mkuffd_wp(newpte); set_huge_pte_at(mm, vmf->address, vmf->pte, newpte, huge_page_size(h)); folio_set_hugetlb_migratable(new_folio); /* Make the old page be freed below */ new_folio = old_folio; } spin_unlock(vmf->ptl); mmu_notifier_invalidate_range_end(&range); out_release_all: /* * No restore in case of successful pagetable update (Break COW or * unshare) */ if (new_folio != old_folio) restore_reserve_on_error(h, vma, vmf->address, new_folio); folio_put(new_folio); out_release_old: folio_put(old_folio); spin_lock(vmf->ptl); /* Caller expects lock to be held */ delayacct_wpcopy_end(); return ret; } /* * Return whether there is a pagecache page to back given address within VMA. */ bool hugetlbfs_pagecache_present(struct hstate *h, struct vm_area_struct *vma, unsigned long address) { struct address_space *mapping = vma->vm_file->f_mapping; pgoff_t idx = linear_page_index(vma, address); struct folio *folio; folio = filemap_get_folio(mapping, idx); if (IS_ERR(folio)) return false; folio_put(folio); return true; } int hugetlb_add_to_page_cache(struct folio *folio, struct address_space *mapping, pgoff_t idx) { struct inode *inode = mapping->host; struct hstate *h = hstate_inode(inode); int err; idx <<= huge_page_order(h); __folio_set_locked(folio); err = __filemap_add_folio(mapping, folio, idx, GFP_KERNEL, NULL); if (unlikely(err)) { __folio_clear_locked(folio); return err; } folio_clear_hugetlb_restore_reserve(folio); /* * mark folio dirty so that it will not be removed from cache/file * by non-hugetlbfs specific code paths. */ folio_mark_dirty(folio); spin_lock(&inode->i_lock); inode->i_blocks += blocks_per_huge_page(h); spin_unlock(&inode->i_lock); return 0; } static inline vm_fault_t hugetlb_handle_userfault(struct vm_fault *vmf, struct address_space *mapping, unsigned long reason) { u32 hash; /* * vma_lock and hugetlb_fault_mutex must be dropped before handling * userfault. Also mmap_lock could be dropped due to handling * userfault, any vma operation should be careful from here. */ hugetlb_vma_unlock_read(vmf->vma); hash = hugetlb_fault_mutex_hash(mapping, vmf->pgoff); mutex_unlock(&hugetlb_fault_mutex_table[hash]); return handle_userfault(vmf, reason); } /* * Recheck pte with pgtable lock. Returns true if pte didn't change, or * false if pte changed or is changing. */ static bool hugetlb_pte_stable(struct hstate *h, struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t old_pte) { spinlock_t *ptl; bool same; ptl = huge_pte_lock(h, mm, ptep); same = pte_same(huge_ptep_get(mm, addr, ptep), old_pte); spin_unlock(ptl); return same; } static vm_fault_t hugetlb_no_page(struct address_space *mapping, struct vm_fault *vmf) { u32 hash = hugetlb_fault_mutex_hash(mapping, vmf->pgoff); bool new_folio, new_anon_folio = false; struct vm_area_struct *vma = vmf->vma; struct mm_struct *mm = vma->vm_mm; struct hstate *h = hstate_vma(vma); vm_fault_t ret = VM_FAULT_SIGBUS; bool folio_locked = true; struct folio *folio; unsigned long size; pte_t new_pte; /* * Currently, we are forced to kill the process in the event the * original mapper has unmapped pages from the child due to a failed * COW/unsharing. Warn that such a situation has occurred as it may not * be obvious. */ if (is_vma_resv_set(vma, HPAGE_RESV_UNMAPPED)) { pr_warn_ratelimited("PID %d killed due to inadequate hugepage pool\n", current->pid); goto out; } /* * Use page lock to guard against racing truncation * before we get page_table_lock. */ new_folio = false; folio = filemap_lock_hugetlb_folio(h, mapping, vmf->pgoff); if (IS_ERR(folio)) { size = i_size_read(mapping->host) >> huge_page_shift(h); if (vmf->pgoff >= size) goto out; /* Check for page in userfault range */ if (userfaultfd_missing(vma)) { /* * Since hugetlb_no_page() was examining pte * without pgtable lock, we need to re-test under * lock because the pte may not be stable and could * have changed from under us. Try to detect * either changed or during-changing ptes and retry * properly when needed. * * Note that userfaultfd is actually fine with * false positives (e.g. caused by pte changed), * but not wrong logical events (e.g. caused by * reading a pte during changing). The latter can * confuse the userspace, so the strictness is very * much preferred. E.g., MISSING event should * never happen on the page after UFFDIO_COPY has * correctly installed the page and returned. */ if (!hugetlb_pte_stable(h, mm, vmf->address, vmf->pte, vmf->orig_pte)) { ret = 0; goto out; } return hugetlb_handle_userfault(vmf, mapping, VM_UFFD_MISSING); } if (!(vma->vm_flags & VM_MAYSHARE)) { ret = __vmf_anon_prepare(vmf); if (unlikely(ret)) goto out; } folio = alloc_hugetlb_folio(vma, vmf->address, false); if (IS_ERR(folio)) { /* * Returning error will result in faulting task being * sent SIGBUS. The hugetlb fault mutex prevents two * tasks from racing to fault in the same page which * could result in false unable to allocate errors. * Page migration does not take the fault mutex, but * does a clear then write of pte's under page table * lock. Page fault code could race with migration, * notice the clear pte and try to allocate a page * here. Before returning error, get ptl and make * sure there really is no pte entry. */ if (hugetlb_pte_stable(h, mm, vmf->address, vmf->pte, vmf->orig_pte)) ret = vmf_error(PTR_ERR(folio)); else ret = 0; goto out; } folio_zero_user(folio, vmf->real_address); __folio_mark_uptodate(folio); new_folio = true; if (vma->vm_flags & VM_MAYSHARE) { int err = hugetlb_add_to_page_cache(folio, mapping, vmf->pgoff); if (err) { /* * err can't be -EEXIST which implies someone * else consumed the reservation since hugetlb * fault mutex is held when add a hugetlb page * to the page cache. So it's safe to call * restore_reserve_on_error() here. */ restore_reserve_on_error(h, vma, vmf->address, folio); folio_put(folio); ret = VM_FAULT_SIGBUS; goto out; } } else { new_anon_folio = true; folio_lock(folio); } } else { /* * If memory error occurs between mmap() and fault, some process * don't have hwpoisoned swap entry for errored virtual address. * So we need to block hugepage fault by PG_hwpoison bit check. */ if (unlikely(folio_test_hwpoison(folio))) { ret = VM_FAULT_HWPOISON_LARGE | VM_FAULT_SET_HINDEX(hstate_index(h)); goto backout_unlocked; } /* Check for page in userfault range. */ if (userfaultfd_minor(vma)) { folio_unlock(folio); folio_put(folio); /* See comment in userfaultfd_missing() block above */ if (!hugetlb_pte_stable(h, mm, vmf->address, vmf->pte, vmf->orig_pte)) { ret = 0; goto out; } return hugetlb_handle_userfault(vmf, mapping, VM_UFFD_MINOR); } } /* * If we are going to COW a private mapping later, we examine the * pending reservations for this page now. This will ensure that * any allocations necessary to record that reservation occur outside * the spinlock. */ if ((vmf->flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) { if (vma_needs_reservation(h, vma, vmf->address) < 0) { ret = VM_FAULT_OOM; goto backout_unlocked; } /* Just decrements count, does not deallocate */ vma_end_reservation(h, vma, vmf->address); } vmf->ptl = huge_pte_lock(h, mm, vmf->pte); ret = 0; /* If pte changed from under us, retry */ if (!pte_same(huge_ptep_get(mm, vmf->address, vmf->pte), vmf->orig_pte)) goto backout; if (new_anon_folio) hugetlb_add_new_anon_rmap(folio, vma, vmf->address); else hugetlb_add_file_rmap(folio); new_pte = make_huge_pte(vma, folio, vma->vm_flags & VM_SHARED); /* * If this pte was previously wr-protected, keep it wr-protected even * if populated. */ if (unlikely(pte_is_uffd_wp_marker(vmf->orig_pte))) new_pte = huge_pte_mkuffd_wp(new_pte); set_huge_pte_at(mm, vmf->address, vmf->pte, new_pte, huge_page_size(h)); hugetlb_count_add(pages_per_huge_page(h), mm); if ((vmf->flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) { /* * No need to keep file folios locked. See comment in * hugetlb_fault(). */ if (!new_anon_folio) { folio_locked = false; folio_unlock(folio); } /* Optimization, do the COW without a second fault */ ret = hugetlb_wp(vmf); } spin_unlock(vmf->ptl); /* * Only set hugetlb_migratable in newly allocated pages. Existing pages * found in the pagecache may not have hugetlb_migratable if they have * been isolated for migration. */ if (new_folio) folio_set_hugetlb_migratable(folio); if (folio_locked) folio_unlock(folio); out: hugetlb_vma_unlock_read(vma); /* * We must check to release the per-VMA lock. __vmf_anon_prepare() is * the only way ret can be set to VM_FAULT_RETRY. */ if (unlikely(ret & VM_FAULT_RETRY)) vma_end_read(vma); mutex_unlock(&hugetlb_fault_mutex_table[hash]); return ret; backout: spin_unlock(vmf->ptl); backout_unlocked: /* We only need to restore reservations for private mappings */ if (new_anon_folio) restore_reserve_on_error(h, vma, vmf->address, folio); folio_unlock(folio); folio_put(folio); goto out; } #ifdef CONFIG_SMP u32 hugetlb_fault_mutex_hash(struct address_space *mapping, pgoff_t idx) { unsigned long key[2]; u32 hash; key[0] = (unsigned long) mapping; key[1] = idx; hash = jhash2((u32 *)&key, sizeof(key)/(sizeof(u32)), 0); return hash & (num_fault_mutexes - 1); } #else /* * For uniprocessor systems we always use a single mutex, so just * return 0 and avoid the hashing overhead. */ u32 hugetlb_fault_mutex_hash(struct address_space *mapping, pgoff_t idx) { return 0; } #endif vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, unsigned int flags) { vm_fault_t ret; u32 hash; struct folio *folio = NULL; struct hstate *h = hstate_vma(vma); struct address_space *mapping; bool need_wait_lock = false; struct vm_fault vmf = { .vma = vma, .address = address & huge_page_mask(h), .real_address = address, .flags = flags, .pgoff = vma_hugecache_offset(h, vma, address & huge_page_mask(h)), /* TODO: Track hugetlb faults using vm_fault */ /* * Some fields may not be initialized, be careful as it may * be hard to debug if called functions make assumptions */ }; /* * Serialize hugepage allocation and instantiation, so that we don't * get spurious allocation failures if two CPUs race to instantiate * the same page in the page cache. */ mapping = vma->vm_file->f_mapping; hash = hugetlb_fault_mutex_hash(mapping, vmf.pgoff); mutex_lock(&hugetlb_fault_mutex_table[hash]); /* * Acquire vma lock before calling huge_pte_alloc and hold * until finished with vmf.pte. This prevents huge_pmd_unshare from * being called elsewhere and making the vmf.pte no longer valid. */ hugetlb_vma_lock_read(vma); vmf.pte = huge_pte_alloc(mm, vma, vmf.address, huge_page_size(h)); if (!vmf.pte) { hugetlb_vma_unlock_read(vma); mutex_unlock(&hugetlb_fault_mutex_table[hash]); return VM_FAULT_OOM; } vmf.orig_pte = huge_ptep_get(mm, vmf.address, vmf.pte); if (huge_pte_none(vmf.orig_pte)) /* * hugetlb_no_page will drop vma lock and hugetlb fault * mutex internally, which make us return immediately. */ return hugetlb_no_page(mapping, &vmf); if (pte_is_marker(vmf.orig_pte)) { const pte_marker marker = softleaf_to_marker(softleaf_from_pte(vmf.orig_pte)); if (marker & PTE_MARKER_POISONED) { ret = VM_FAULT_HWPOISON_LARGE | VM_FAULT_SET_HINDEX(hstate_index(h)); goto out_mutex; } else if (WARN_ON_ONCE(marker & PTE_MARKER_GUARD)) { /* This isn't supported in hugetlb. */ ret = VM_FAULT_SIGSEGV; goto out_mutex; } return hugetlb_no_page(mapping, &vmf); } ret = 0; /* Not present, either a migration or a hwpoisoned entry */ if (!pte_present(vmf.orig_pte) && !huge_pte_none(vmf.orig_pte)) { const softleaf_t softleaf = softleaf_from_pte(vmf.orig_pte); if (softleaf_is_migration(softleaf)) { /* * Release the hugetlb fault lock now, but retain * the vma lock, because it is needed to guard the * huge_pte_lockptr() later in * migration_entry_wait_huge(). The vma lock will * be released there. */ mutex_unlock(&hugetlb_fault_mutex_table[hash]); migration_entry_wait_huge(vma, vmf.address, vmf.pte); return 0; } if (softleaf_is_hwpoison(softleaf)) { ret = VM_FAULT_HWPOISON_LARGE | VM_FAULT_SET_HINDEX(hstate_index(h)); } goto out_mutex; } /* * If we are going to COW/unshare the mapping later, we examine the * pending reservations for this page now. This will ensure that any * allocations necessary to record that reservation occur outside the * spinlock. */ if ((flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) && !(vma->vm_flags & VM_MAYSHARE) && !huge_pte_write(vmf.orig_pte)) { if (vma_needs_reservation(h, vma, vmf.address) < 0) { ret = VM_FAULT_OOM; goto out_mutex; } /* Just decrements count, does not deallocate */ vma_end_reservation(h, vma, vmf.address); } vmf.ptl = huge_pte_lock(h, mm, vmf.pte); /* Check for a racing update before calling hugetlb_wp() */ if (unlikely(!pte_same(vmf.orig_pte, huge_ptep_get(mm, vmf.address, vmf.pte)))) goto out_ptl; /* Handle userfault-wp first, before trying to lock more pages */ if (userfaultfd_wp(vma) && huge_pte_uffd_wp(huge_ptep_get(mm, vmf.address, vmf.pte)) && (flags & FAULT_FLAG_WRITE) && !huge_pte_write(vmf.orig_pte)) { if (!userfaultfd_wp_async(vma)) { spin_unlock(vmf.ptl); hugetlb_vma_unlock_read(vma); mutex_unlock(&hugetlb_fault_mutex_table[hash]); return handle_userfault(&vmf, VM_UFFD_WP); } vmf.orig_pte = huge_pte_clear_uffd_wp(vmf.orig_pte); set_huge_pte_at(mm, vmf.address, vmf.pte, vmf.orig_pte, huge_page_size(hstate_vma(vma))); /* Fallthrough to CoW */ } if (flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) { if (!huge_pte_write(vmf.orig_pte)) { /* * Anonymous folios need to be lock since hugetlb_wp() * checks whether we can re-use the folio exclusively * for us in case we are the only user of it. */ folio = page_folio(pte_page(vmf.orig_pte)); if (folio_test_anon(folio) && !folio_trylock(folio)) { need_wait_lock = true; goto out_ptl; } folio_get(folio); ret = hugetlb_wp(&vmf); if (folio_test_anon(folio)) folio_unlock(folio); folio_put(folio); goto out_ptl; } else if (likely(flags & FAULT_FLAG_WRITE)) { vmf.orig_pte = huge_pte_mkdirty(vmf.orig_pte); } } vmf.orig_pte = pte_mkyoung(vmf.orig_pte); if (huge_ptep_set_access_flags(vma, vmf.address, vmf.pte, vmf.orig_pte, flags & FAULT_FLAG_WRITE)) update_mmu_cache(vma, vmf.address, vmf.pte); out_ptl: spin_unlock(vmf.ptl); out_mutex: hugetlb_vma_unlock_read(vma); /* * We must check to release the per-VMA lock. __vmf_anon_prepare() in * hugetlb_wp() is the only way ret can be set to VM_FAULT_RETRY. */ if (unlikely(ret & VM_FAULT_RETRY)) vma_end_read(vma); mutex_unlock(&hugetlb_fault_mutex_table[hash]); /* * hugetlb_wp drops all the locks, but the folio lock, before trying to * unmap the folio from other processes. During that window, if another * process mapping that folio faults in, it will take the mutex and then * it will wait on folio_lock, causing an ABBA deadlock. * Use trylock instead and bail out if we fail. * * Ideally, we should hold a refcount on the folio we wait for, but we do * not want to use the folio after it becomes unlocked, but rather just * wait for it to become unlocked, so hopefully next fault successes on * the trylock. */ if (need_wait_lock) folio_wait_locked(folio); return ret; } #ifdef CONFIG_USERFAULTFD /* * Can probably be eliminated, but still used by hugetlb_mfill_atomic_pte(). */ static struct folio *alloc_hugetlb_folio_vma(struct hstate *h, struct vm_area_struct *vma, unsigned long address) { struct mempolicy *mpol; nodemask_t *nodemask; struct folio *folio; gfp_t gfp_mask; int node; gfp_mask = htlb_alloc_mask(h); node = huge_node(vma, address, gfp_mask, &mpol, &nodemask); /* * This is used to allocate a temporary hugetlb to hold the copied * content, which will then be copied again to the final hugetlb * consuming a reservation. Set the alloc_fallback to false to indicate * that breaking the per-node hugetlb pool is not allowed in this case. */ folio = alloc_hugetlb_folio_nodemask(h, node, nodemask, gfp_mask, false); mpol_cond_put(mpol); return folio; } /* * Used by userfaultfd UFFDIO_* ioctls. Based on userfaultfd's mfill_atomic_pte * with modifications for hugetlb pages. */ int hugetlb_mfill_atomic_pte(pte_t *dst_pte, struct vm_area_struct *dst_vma, unsigned long dst_addr, unsigned long src_addr, uffd_flags_t flags, struct folio **foliop) { struct mm_struct *dst_mm = dst_vma->vm_mm; bool is_continue = uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE); bool wp_enabled = (flags & MFILL_ATOMIC_WP); struct hstate *h = hstate_vma(dst_vma); struct address_space *mapping = dst_vma->vm_file->f_mapping; pgoff_t idx = vma_hugecache_offset(h, dst_vma, dst_addr); unsigned long size = huge_page_size(h); int vm_shared = dst_vma->vm_flags & VM_SHARED; pte_t _dst_pte; spinlock_t *ptl; int ret = -ENOMEM; struct folio *folio; bool folio_in_pagecache = false; pte_t dst_ptep; if (uffd_flags_mode_is(flags, MFILL_ATOMIC_POISON)) { ptl = huge_pte_lock(h, dst_mm, dst_pte); /* Don't overwrite any existing PTEs (even markers) */ if (!huge_pte_none(huge_ptep_get(dst_mm, dst_addr, dst_pte))) { spin_unlock(ptl); return -EEXIST; } _dst_pte = make_pte_marker(PTE_MARKER_POISONED); set_huge_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte, size); /* No need to invalidate - it was non-present before */ update_mmu_cache(dst_vma, dst_addr, dst_pte); spin_unlock(ptl); return 0; } if (is_continue) { ret = -EFAULT; folio = filemap_lock_hugetlb_folio(h, mapping, idx); if (IS_ERR(folio)) goto out; folio_in_pagecache = true; } else if (!*foliop) { /* If a folio already exists, then it's UFFDIO_COPY for * a non-missing case. Return -EEXIST. */ if (vm_shared && hugetlbfs_pagecache_present(h, dst_vma, dst_addr)) { ret = -EEXIST; goto out; } folio = alloc_hugetlb_folio(dst_vma, dst_addr, false); if (IS_ERR(folio)) { pte_t *actual_pte = hugetlb_walk(dst_vma, dst_addr, PMD_SIZE); if (actual_pte) { ret = -EEXIST; goto out; } ret = -ENOMEM; goto out; } ret = copy_folio_from_user(folio, (const void __user *) src_addr, false); /* fallback to copy_from_user outside mmap_lock */ if (unlikely(ret)) { ret = -ENOENT; /* Free the allocated folio which may have * consumed a reservation. */ restore_reserve_on_error(h, dst_vma, dst_addr, folio); folio_put(folio); /* Allocate a temporary folio to hold the copied * contents. */ folio = alloc_hugetlb_folio_vma(h, dst_vma, dst_addr); if (!folio) { ret = -ENOMEM; goto out; } *foliop = folio; /* Set the outparam foliop and return to the caller to * copy the contents outside the lock. Don't free the * folio. */ goto out; } } else { if (vm_shared && hugetlbfs_pagecache_present(h, dst_vma, dst_addr)) { folio_put(*foliop); ret = -EEXIST; *foliop = NULL; goto out; } folio = alloc_hugetlb_folio(dst_vma, dst_addr, false); if (IS_ERR(folio)) { folio_put(*foliop); ret = -ENOMEM; *foliop = NULL; goto out; } ret = copy_user_large_folio(folio, *foliop, dst_addr, dst_vma); folio_put(*foliop); *foliop = NULL; if (ret) { folio_put(folio); goto out; } } /* * If we just allocated a new page, we need a memory barrier to ensure * that preceding stores to the page become visible before the * set_pte_at() write. The memory barrier inside __folio_mark_uptodate * is what we need. * * In the case where we have not allocated a new page (is_continue), * the page must already be uptodate. UFFDIO_CONTINUE already includes * an earlier smp_wmb() to ensure that prior stores will be visible * before the set_pte_at() write. */ if (!is_continue) __folio_mark_uptodate(folio); else WARN_ON_ONCE(!folio_test_uptodate(folio)); /* Add shared, newly allocated pages to the page cache. */ if (vm_shared && !is_continue) { ret = -EFAULT; if (idx >= (i_size_read(mapping->host) >> huge_page_shift(h))) goto out_release_nounlock; /* * Serialization between remove_inode_hugepages() and * hugetlb_add_to_page_cache() below happens through the * hugetlb_fault_mutex_table that here must be hold by * the caller. */ ret = hugetlb_add_to_page_cache(folio, mapping, idx); if (ret) goto out_release_nounlock; folio_in_pagecache = true; } ptl = huge_pte_lock(h, dst_mm, dst_pte); ret = -EIO; if (folio_test_hwpoison(folio)) goto out_release_unlock; ret = -EEXIST; dst_ptep = huge_ptep_get(dst_mm, dst_addr, dst_pte); /* * See comment about UFFD marker overwriting in * mfill_atomic_install_pte(). */ if (!huge_pte_none(dst_ptep) && !pte_is_uffd_marker(dst_ptep)) goto out_release_unlock; if (folio_in_pagecache) hugetlb_add_file_rmap(folio); else hugetlb_add_new_anon_rmap(folio, dst_vma, dst_addr); /* * For either: (1) CONTINUE on a non-shared VMA, or (2) UFFDIO_COPY * with wp flag set, don't set pte write bit. */ _dst_pte = make_huge_pte(dst_vma, folio, !wp_enabled && !(is_continue && !vm_shared)); /* * Always mark UFFDIO_COPY page dirty; note that this may not be * extremely important for hugetlbfs for now since swapping is not * supported, but we should still be clear in that this page cannot be * thrown away at will, even if write bit not set. */ _dst_pte = huge_pte_mkdirty(_dst_pte); _dst_pte = pte_mkyoung(_dst_pte); if (wp_enabled) _dst_pte = huge_pte_mkuffd_wp(_dst_pte); set_huge_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte, size); hugetlb_count_add(pages_per_huge_page(h), dst_mm); /* No need to invalidate - it was non-present before */ update_mmu_cache(dst_vma, dst_addr, dst_pte); spin_unlock(ptl); if (!is_continue) folio_set_hugetlb_migratable(folio); if (vm_shared || is_continue) folio_unlock(folio); ret = 0; out: return ret; out_release_unlock: spin_unlock(ptl); if (vm_shared || is_continue) folio_unlock(folio); out_release_nounlock: if (!folio_in_pagecache) restore_reserve_on_error(h, dst_vma, dst_addr, folio); folio_put(folio); goto out; } #endif /* CONFIG_USERFAULTFD */ long hugetlb_change_protection(struct vm_area_struct *vma, unsigned long address, unsigned long end, pgprot_t newprot, unsigned long cp_flags) { struct mm_struct *mm = vma->vm_mm; unsigned long start = address; pte_t *ptep; pte_t pte; struct hstate *h = hstate_vma(vma); long pages = 0, psize = huge_page_size(h); struct mmu_notifier_range range; unsigned long last_addr_mask; bool uffd_wp = cp_flags & MM_CP_UFFD_WP; bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE; struct mmu_gather tlb; /* * In the case of shared PMDs, the area to flush could be beyond * start/end. Set range.start/range.end to cover the maximum possible * range if PMD sharing is possible. */ mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_VMA, 0, mm, start, end); adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end); BUG_ON(address >= end); flush_cache_range(vma, range.start, range.end); tlb_gather_mmu_vma(&tlb, vma); mmu_notifier_invalidate_range_start(&range); hugetlb_vma_lock_write(vma); i_mmap_lock_write(vma->vm_file->f_mapping); last_addr_mask = hugetlb_mask_last_page(h); for (; address < end; address += psize) { softleaf_t entry; spinlock_t *ptl; ptep = hugetlb_walk(vma, address, psize); if (!ptep) { if (!uffd_wp) { address |= last_addr_mask; continue; } /* * Userfaultfd wr-protect requires pgtable * pre-allocations to install pte markers. */ ptep = huge_pte_alloc(mm, vma, address, psize); if (!ptep) { pages = -ENOMEM; break; } } ptl = huge_pte_lock(h, mm, ptep); if (huge_pmd_unshare(&tlb, vma, address, ptep)) { /* * When uffd-wp is enabled on the vma, unshare * shouldn't happen at all. Warn about it if it * happened due to some reason. */ WARN_ON_ONCE(uffd_wp || uffd_wp_resolve); pages++; spin_unlock(ptl); address |= last_addr_mask; continue; } pte = huge_ptep_get(mm, address, ptep); if (huge_pte_none(pte)) { if (unlikely(uffd_wp)) /* Safe to modify directly (none->non-present). */ set_huge_pte_at(mm, address, ptep, make_pte_marker(PTE_MARKER_UFFD_WP), psize); goto next; } entry = softleaf_from_pte(pte); if (unlikely(softleaf_is_hwpoison(entry))) { /* Nothing to do. */ } else if (unlikely(softleaf_is_migration(entry))) { struct folio *folio = softleaf_to_folio(entry); pte_t newpte = pte; if (softleaf_is_migration_write(entry)) { if (folio_test_anon(folio)) entry = make_readable_exclusive_migration_entry( swp_offset(entry)); else entry = make_readable_migration_entry( swp_offset(entry)); newpte = swp_entry_to_pte(entry); pages++; } if (uffd_wp) newpte = pte_swp_mkuffd_wp(newpte); else if (uffd_wp_resolve) newpte = pte_swp_clear_uffd_wp(newpte); if (!pte_same(pte, newpte)) set_huge_pte_at(mm, address, ptep, newpte, psize); } else if (unlikely(pte_is_marker(pte))) { /* * Do nothing on a poison marker; page is * corrupted, permissions do not apply. Here * pte_marker_uffd_wp()==true implies !poison * because they're mutual exclusive. */ if (pte_is_uffd_wp_marker(pte) && uffd_wp_resolve) /* Safe to modify directly (non-present->none). */ huge_pte_clear(mm, address, ptep, psize); } else { pte_t old_pte; unsigned int shift = huge_page_shift(hstate_vma(vma)); old_pte = huge_ptep_modify_prot_start(vma, address, ptep); pte = huge_pte_modify(old_pte, newprot); pte = arch_make_huge_pte(pte, shift, vma->vm_flags); if (uffd_wp) pte = huge_pte_mkuffd_wp(pte); else if (uffd_wp_resolve) pte = huge_pte_clear_uffd_wp(pte); huge_ptep_modify_prot_commit(vma, address, ptep, old_pte, pte); pages++; tlb_remove_huge_tlb_entry(h, &tlb, ptep, address); } next: spin_unlock(ptl); cond_resched(); } tlb_flush_mmu_tlbonly(&tlb); huge_pmd_unshare_flush(&tlb, vma); /* * No need to call mmu_notifier_arch_invalidate_secondary_tlbs() we are * downgrading page table protection not changing it to point to a new * page. * * See Documentation/mm/mmu_notifier.rst */ i_mmap_unlock_write(vma->vm_file->f_mapping); hugetlb_vma_unlock_write(vma); mmu_notifier_invalidate_range_end(&range); tlb_finish_mmu(&tlb); return pages > 0 ? (pages << h->order) : pages; } /* * Update the reservation map for the range [from, to]. * * Returns the number of entries that would be added to the reservation map * associated with the range [from, to]. This number is greater or equal to * zero. -EINVAL or -ENOMEM is returned in case of any errors. */ long hugetlb_reserve_pages(struct inode *inode, long from, long to, struct vm_area_desc *desc, vm_flags_t vm_flags) { long chg = -1, add = -1, spool_resv, gbl_resv; struct hstate *h = hstate_inode(inode); struct hugepage_subpool *spool = subpool_inode(inode); struct resv_map *resv_map; struct hugetlb_cgroup *h_cg = NULL; long gbl_reserve, regions_needed = 0; int err; /* This should never happen */ if (from > to) { VM_WARN(1, "%s called with a negative range\n", __func__); return -EINVAL; } /* * Only apply hugepage reservation if asked. At fault time, an * attempt will be made for VM_NORESERVE to allocate a page * without using reserves */ if (vm_flags & VM_NORESERVE) return 0; /* * Shared mappings base their reservation on the number of pages that * are already allocated on behalf of the file. Private mappings need * to reserve the full area even if read-only as mprotect() may be * called to make the mapping read-write. Assume !desc is a shm mapping */ if (!desc || desc->vm_flags & VM_MAYSHARE) { /* * resv_map can not be NULL as hugetlb_reserve_pages is only * called for inodes for which resv_maps were created (see * hugetlbfs_get_inode). */ resv_map = inode_resv_map(inode); chg = region_chg(resv_map, from, to, ®ions_needed); } else { /* Private mapping. */ resv_map = resv_map_alloc(); if (!resv_map) { err = -ENOMEM; goto out_err; } chg = to - from; set_vma_desc_resv_map(desc, resv_map); set_vma_desc_resv_flags(desc, HPAGE_RESV_OWNER); } if (chg < 0) { /* region_chg() above can return -ENOMEM */ err = (chg == -ENOMEM) ? -ENOMEM : -EINVAL; goto out_err; } err = hugetlb_cgroup_charge_cgroup_rsvd(hstate_index(h), chg * pages_per_huge_page(h), &h_cg); if (err < 0) goto out_err; if (desc && !(desc->vm_flags & VM_MAYSHARE) && h_cg) { /* For private mappings, the hugetlb_cgroup uncharge info hangs * of the resv_map. */ resv_map_set_hugetlb_cgroup_uncharge_info(resv_map, h_cg, h); } /* * There must be enough pages in the subpool for the mapping. If * the subpool has a minimum size, there may be some global * reservations already in place (gbl_reserve). */ gbl_reserve = hugepage_subpool_get_pages(spool, chg); if (gbl_reserve < 0) { err = gbl_reserve; goto out_uncharge_cgroup; } /* * Check enough hugepages are available for the reservation. * Hand the pages back to the subpool if there are not */ err = hugetlb_acct_memory(h, gbl_reserve); if (err < 0) goto out_put_pages; /* * Account for the reservations made. Shared mappings record regions * that have reservations as they are shared by multiple VMAs. * When the last VMA disappears, the region map says how much * the reservation was and the page cache tells how much of * the reservation was consumed. Private mappings are per-VMA and * only the consumed reservations are tracked. When the VMA * disappears, the original reservation is the VMA size and the * consumed reservations are stored in the map. Hence, nothing * else has to be done for private mappings here */ if (!desc || desc->vm_flags & VM_MAYSHARE) { add = region_add(resv_map, from, to, regions_needed, h, h_cg); if (unlikely(add < 0)) { hugetlb_acct_memory(h, -gbl_reserve); err = add; goto out_put_pages; } else if (unlikely(chg > add)) { /* * pages in this range were added to the reserve * map between region_chg and region_add. This * indicates a race with alloc_hugetlb_folio. Adjust * the subpool and reserve counts modified above * based on the difference. */ long rsv_adjust; /* * hugetlb_cgroup_uncharge_cgroup_rsvd() will put the * reference to h_cg->css. See comment below for detail. */ hugetlb_cgroup_uncharge_cgroup_rsvd( hstate_index(h), (chg - add) * pages_per_huge_page(h), h_cg); rsv_adjust = hugepage_subpool_put_pages(spool, chg - add); hugetlb_acct_memory(h, -rsv_adjust); } else if (h_cg) { /* * The file_regions will hold their own reference to * h_cg->css. So we should release the reference held * via hugetlb_cgroup_charge_cgroup_rsvd() when we are * done. */ hugetlb_cgroup_put_rsvd_cgroup(h_cg); } } return chg; out_put_pages: spool_resv = chg - gbl_reserve; if (spool_resv) { /* put sub pool's reservation back, chg - gbl_reserve */ gbl_resv = hugepage_subpool_put_pages(spool, spool_resv); /* * subpool's reserved pages can not be put back due to race, * return to hstate. */ hugetlb_acct_memory(h, -gbl_resv); } /* Restore used_hpages for pages that failed global reservation */ if (gbl_reserve && spool) { unsigned long flags; spin_lock_irqsave(&spool->lock, flags); if (spool->max_hpages != -1) spool->used_hpages -= gbl_reserve; unlock_or_release_subpool(spool, flags); } out_uncharge_cgroup: hugetlb_cgroup_uncharge_cgroup_rsvd(hstate_index(h), chg * pages_per_huge_page(h), h_cg); out_err: if (!desc || desc->vm_flags & VM_MAYSHARE) /* Only call region_abort if the region_chg succeeded but the * region_add failed or didn't run. */ if (chg >= 0 && add < 0) region_abort(resv_map, from, to, regions_needed); if (desc && is_vma_desc_resv_set(desc, HPAGE_RESV_OWNER)) { kref_put(&resv_map->refs, resv_map_release); set_vma_desc_resv_map(desc, NULL); } return err; } long hugetlb_unreserve_pages(struct inode *inode, long start, long end, long freed) { struct hstate *h = hstate_inode(inode); struct resv_map *resv_map = inode_resv_map(inode); long chg = 0; struct hugepage_subpool *spool = subpool_inode(inode); long gbl_reserve; /* * Since this routine can be called in the evict inode path for all * hugetlbfs inodes, resv_map could be NULL. */ if (resv_map) { chg = region_del(resv_map, start, end); /* * region_del() can fail in the rare case where a region * must be split and another region descriptor can not be * allocated. If end == LONG_MAX, it will not fail. */ if (chg < 0) return chg; } spin_lock(&inode->i_lock); inode->i_blocks -= (blocks_per_huge_page(h) * freed); spin_unlock(&inode->i_lock); /* * If the subpool has a minimum size, the number of global * reservations to be released may be adjusted. * * Note that !resv_map implies freed == 0. So (chg - freed) * won't go negative. */ gbl_reserve = hugepage_subpool_put_pages(spool, (chg - freed)); hugetlb_acct_memory(h, -gbl_reserve); return 0; } #ifdef CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING static unsigned long page_table_shareable(struct vm_area_struct *svma, struct vm_area_struct *vma, unsigned long addr, pgoff_t idx) { unsigned long saddr = ((idx - svma->vm_pgoff) << PAGE_SHIFT) + svma->vm_start; unsigned long sbase = saddr & PUD_MASK; unsigned long s_end = sbase + PUD_SIZE; /* Allow segments to share if only one is marked locked */ vm_flags_t vm_flags = vma->vm_flags & ~VM_LOCKED_MASK; vm_flags_t svm_flags = svma->vm_flags & ~VM_LOCKED_MASK; /* * match the virtual addresses, permission and the alignment of the * page table page. * * Also, vma_lock (vm_private_data) is required for sharing. */ if (pmd_index(addr) != pmd_index(saddr) || vm_flags != svm_flags || !range_in_vma(svma, sbase, s_end) || !svma->vm_private_data) return 0; return saddr; } bool want_pmd_share(struct vm_area_struct *vma, unsigned long addr) { unsigned long start = addr & PUD_MASK; unsigned long end = start + PUD_SIZE; #ifdef CONFIG_USERFAULTFD if (uffd_disable_huge_pmd_share(vma)) return false; #endif /* * check on proper vm_flags and page table alignment */ if (!(vma->vm_flags & VM_MAYSHARE)) return false; if (!vma->vm_private_data) /* vma lock required for sharing */ return false; if (!range_in_vma(vma, start, end)) return false; return true; } /* * Determine if start,end range within vma could be mapped by shared pmd. * If yes, adjust start and end to cover range associated with possible * shared pmd mappings. */ void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma, unsigned long *start, unsigned long *end) { unsigned long v_start = ALIGN(vma->vm_start, PUD_SIZE), v_end = ALIGN_DOWN(vma->vm_end, PUD_SIZE); /* * vma needs to span at least one aligned PUD size, and the range * must be at least partially within in. */ if (!(vma->vm_flags & VM_MAYSHARE) || !(v_end > v_start) || (*end <= v_start) || (*start >= v_end)) return; /* Extend the range to be PUD aligned for a worst case scenario */ if (*start > v_start) *start = ALIGN_DOWN(*start, PUD_SIZE); if (*end < v_end) *end = ALIGN(*end, PUD_SIZE); } /* * Search for a shareable pmd page for hugetlb. In any case calls pmd_alloc() * and returns the corresponding pte. While this is not necessary for the * !shared pmd case because we can allocate the pmd later as well, it makes the * code much cleaner. pmd allocation is essential for the shared case because * pud has to be populated inside the same i_mmap_rwsem section - otherwise * racing tasks could either miss the sharing (see huge_pte_offset) or select a * bad pmd for sharing. */ pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, pud_t *pud) { struct address_space *mapping = vma->vm_file->f_mapping; pgoff_t idx = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; struct vm_area_struct *svma; unsigned long saddr; pte_t *spte = NULL; pte_t *pte; i_mmap_lock_read(mapping); vma_interval_tree_foreach(svma, &mapping->i_mmap, idx, idx) { if (svma == vma) continue; saddr = page_table_shareable(svma, vma, addr, idx); if (saddr) { spte = hugetlb_walk(svma, saddr, vma_mmu_pagesize(svma)); if (spte) { ptdesc_pmd_pts_inc(virt_to_ptdesc(spte)); break; } } } if (!spte) goto out; spin_lock(&mm->page_table_lock); if (pud_none(*pud)) { pud_populate(mm, pud, (pmd_t *)((unsigned long)spte & PAGE_MASK)); mm_inc_nr_pmds(mm); } else { ptdesc_pmd_pts_dec(virt_to_ptdesc(spte)); } spin_unlock(&mm->page_table_lock); out: pte = (pte_t *)pmd_alloc(mm, pud, addr); i_mmap_unlock_read(mapping); return pte; } /** * huge_pmd_unshare - Unmap a pmd table if it is shared by multiple users * @tlb: the current mmu_gather. * @vma: the vma covering the pmd table. * @addr: the address we are trying to unshare. * @ptep: pointer into the (pmd) page table. * * Called with the page table lock held, the i_mmap_rwsem held in write mode * and the hugetlb vma lock held in write mode. * * Note: The caller must call huge_pmd_unshare_flush() before dropping the * i_mmap_rwsem. * * Returns: 1 if it was a shared PMD table and it got unmapped, or 0 if it * was not a shared PMD table. */ int huge_pmd_unshare(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { unsigned long sz = huge_page_size(hstate_vma(vma)); struct mm_struct *mm = vma->vm_mm; pgd_t *pgd = pgd_offset(mm, addr); p4d_t *p4d = p4d_offset(pgd, addr); pud_t *pud = pud_offset(p4d, addr); if (sz != PMD_SIZE) return 0; if (!ptdesc_pmd_is_shared(virt_to_ptdesc(ptep))) return 0; i_mmap_assert_write_locked(vma->vm_file->f_mapping); hugetlb_vma_assert_locked(vma); pud_clear(pud); tlb_unshare_pmd_ptdesc(tlb, virt_to_ptdesc(ptep), addr); mm_dec_nr_pmds(mm); return 1; } /* * huge_pmd_unshare_flush - Complete a sequence of huge_pmd_unshare() calls * @tlb: the current mmu_gather. * @vma: the vma covering the pmd table. * * Perform necessary TLB flushes or IPI broadcasts to synchronize PMD table * unsharing with concurrent page table walkers. * * This function must be called after a sequence of huge_pmd_unshare() * calls while still holding the i_mmap_rwsem. */ void huge_pmd_unshare_flush(struct mmu_gather *tlb, struct vm_area_struct *vma) { /* * We must synchronize page table unsharing such that nobody will * try reusing a previously-shared page table while it might still * be in use by previous sharers (TLB, GUP_fast). */ i_mmap_assert_write_locked(vma->vm_file->f_mapping); tlb_flush_unshared_tables(tlb); } #else /* !CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING */ pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, pud_t *pud) { return NULL; } int huge_pmd_unshare(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { return 0; } void huge_pmd_unshare_flush(struct mmu_gather *tlb, struct vm_area_struct *vma) { } void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma, unsigned long *start, unsigned long *end) { } bool want_pmd_share(struct vm_area_struct *vma, unsigned long addr) { return false; } #endif /* CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING */ #ifdef CONFIG_ARCH_WANT_GENERAL_HUGETLB pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, unsigned long sz) { pgd_t *pgd; p4d_t *p4d; pud_t *pud; pte_t *pte = NULL; pgd = pgd_offset(mm, addr); p4d = p4d_alloc(mm, pgd, addr); if (!p4d) return NULL; pud = pud_alloc(mm, p4d, addr); if (pud) { if (sz == PUD_SIZE) { pte = (pte_t *)pud; } else { BUG_ON(sz != PMD_SIZE); if (want_pmd_share(vma, addr) && pud_none(*pud)) pte = huge_pmd_share(mm, vma, addr, pud); else pte = (pte_t *)pmd_alloc(mm, pud, addr); } } if (pte) { pte_t pteval = ptep_get_lockless(pte); BUG_ON(pte_present(pteval) && !pte_huge(pteval)); } return pte; } /* * huge_pte_offset() - Walk the page table to resolve the hugepage * entry at address @addr * * Return: Pointer to page table entry (PUD or PMD) for * address @addr, or NULL if a !p*d_present() entry is encountered and the * size @sz doesn't match the hugepage size at this level of the page * table. */ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr, unsigned long sz) { pgd_t *pgd; p4d_t *p4d; pud_t *pud; pmd_t *pmd; pgd = pgd_offset(mm, addr); if (!pgd_present(*pgd)) return NULL; p4d = p4d_offset(pgd, addr); if (!p4d_present(*p4d)) return NULL; pud = pud_offset(p4d, addr); if (sz == PUD_SIZE) /* must be pud huge, non-present or none */ return (pte_t *)pud; if (!pud_present(*pud)) return NULL; /* must have a valid entry and size to go further */ pmd = pmd_offset(pud, addr); /* must be pmd huge, non-present or none */ return (pte_t *)pmd; } /* * Return a mask that can be used to update an address to the last huge * page in a page table page mapping size. Used to skip non-present * page table entries when linearly scanning address ranges. Architectures * with unique huge page to page table relationships can define their own * version of this routine. */ unsigned long hugetlb_mask_last_page(struct hstate *h) { unsigned long hp_size = huge_page_size(h); if (hp_size == PUD_SIZE) return P4D_SIZE - PUD_SIZE; else if (hp_size == PMD_SIZE) return PUD_SIZE - PMD_SIZE; else return 0UL; } #else /* See description above. Architectures can provide their own version. */ __weak unsigned long hugetlb_mask_last_page(struct hstate *h) { #ifdef CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING if (huge_page_size(h) == PMD_SIZE) return PUD_SIZE - PMD_SIZE; #endif return 0UL; } #endif /* CONFIG_ARCH_WANT_GENERAL_HUGETLB */ /** * folio_isolate_hugetlb - try to isolate an allocated hugetlb folio * @folio: the folio to isolate * @list: the list to add the folio to on success * * Isolate an allocated (refcount > 0) hugetlb folio, marking it as * isolated/non-migratable, and moving it from the active list to the * given list. * * Isolation will fail if @folio is not an allocated hugetlb folio, or if * it is already isolated/non-migratable. * * On success, an additional folio reference is taken that must be dropped * using folio_putback_hugetlb() to undo the isolation. * * Return: True if isolation worked, otherwise False. */ bool folio_isolate_hugetlb(struct folio *folio, struct list_head *list) { bool ret = true; spin_lock_irq(&hugetlb_lock); if (!folio_test_hugetlb(folio) || !folio_test_hugetlb_migratable(folio) || !folio_try_get(folio)) { ret = false; goto unlock; } folio_clear_hugetlb_migratable(folio); list_move_tail(&folio->lru, list); unlock: spin_unlock_irq(&hugetlb_lock); return ret; } int get_hwpoison_hugetlb_folio(struct folio *folio, bool *hugetlb, bool unpoison) { int ret = 0; *hugetlb = false; spin_lock_irq(&hugetlb_lock); if (folio_test_hugetlb(folio)) { *hugetlb = true; if (folio_test_hugetlb_freed(folio)) ret = 0; else if (folio_test_hugetlb_migratable(folio) || unpoison) ret = folio_try_get(folio); else ret = -EBUSY; } spin_unlock_irq(&hugetlb_lock); return ret; } int get_huge_page_for_hwpoison(unsigned long pfn, int flags, bool *migratable_cleared) { int ret; spin_lock_irq(&hugetlb_lock); ret = __get_huge_page_for_hwpoison(pfn, flags, migratable_cleared); spin_unlock_irq(&hugetlb_lock); return ret; } /** * folio_putback_hugetlb - unisolate a hugetlb folio * @folio: the isolated hugetlb folio * * Putback/un-isolate the hugetlb folio that was previous isolated using * folio_isolate_hugetlb(): marking it non-isolated/migratable and putting it * back onto the active list. * * Will drop the additional folio reference obtained through * folio_isolate_hugetlb(). */ void folio_putback_hugetlb(struct folio *folio) { spin_lock_irq(&hugetlb_lock); folio_set_hugetlb_migratable(folio); list_move_tail(&folio->lru, &(folio_hstate(folio))->hugepage_activelist); spin_unlock_irq(&hugetlb_lock); folio_put(folio); } void move_hugetlb_state(struct folio *old_folio, struct folio *new_folio, int reason) { struct hstate *h = folio_hstate(old_folio); hugetlb_cgroup_migrate(old_folio, new_folio); folio_set_owner_migrate_reason(new_folio, reason); /* * transfer temporary state of the new hugetlb folio. This is * reverse to other transitions because the newpage is going to * be final while the old one will be freed so it takes over * the temporary status. * * Also note that we have to transfer the per-node surplus state * here as well otherwise the global surplus count will not match * the per-node's. */ if (folio_test_hugetlb_temporary(new_folio)) { int old_nid = folio_nid(old_folio); int new_nid = folio_nid(new_folio); folio_set_hugetlb_temporary(old_folio); folio_clear_hugetlb_temporary(new_folio); /* * There is no need to transfer the per-node surplus state * when we do not cross the node. */ if (new_nid == old_nid) return; spin_lock_irq(&hugetlb_lock); if (h->surplus_huge_pages_node[old_nid]) { h->surplus_huge_pages_node[old_nid]--; h->surplus_huge_pages_node[new_nid]++; } spin_unlock_irq(&hugetlb_lock); } /* * Our old folio is isolated and has "migratable" cleared until it * is putback. As migration succeeded, set the new folio "migratable" * and add it to the active list. */ spin_lock_irq(&hugetlb_lock); folio_set_hugetlb_migratable(new_folio); list_move_tail(&new_folio->lru, &(folio_hstate(new_folio))->hugepage_activelist); spin_unlock_irq(&hugetlb_lock); } /* * If @take_locks is false, the caller must ensure that no concurrent page table * access can happen (except for gup_fast() and hardware page walks). * If @take_locks is true, we take the hugetlb VMA lock (to lock out things like * concurrent page fault handling) and the file rmap lock. */ static void hugetlb_unshare_pmds(struct vm_area_struct *vma, unsigned long start, unsigned long end, bool take_locks) { struct hstate *h = hstate_vma(vma); unsigned long sz = huge_page_size(h); struct mm_struct *mm = vma->vm_mm; struct mmu_notifier_range range; struct mmu_gather tlb; unsigned long address; spinlock_t *ptl; pte_t *ptep; if (!(vma->vm_flags & VM_MAYSHARE)) return; if (start >= end) return; flush_cache_range(vma, start, end); tlb_gather_mmu_vma(&tlb, vma); /* * No need to call adjust_range_if_pmd_sharing_possible(), because * we have already done the PUD_SIZE alignment. */ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, start, end); mmu_notifier_invalidate_range_start(&range); if (take_locks) { hugetlb_vma_lock_write(vma); i_mmap_lock_write(vma->vm_file->f_mapping); } else { i_mmap_assert_write_locked(vma->vm_file->f_mapping); } for (address = start; address < end; address += PUD_SIZE) { ptep = hugetlb_walk(vma, address, sz); if (!ptep) continue; ptl = huge_pte_lock(h, mm, ptep); huge_pmd_unshare(&tlb, vma, address, ptep); spin_unlock(ptl); } huge_pmd_unshare_flush(&tlb, vma); if (take_locks) { i_mmap_unlock_write(vma->vm_file->f_mapping); hugetlb_vma_unlock_write(vma); } /* * No need to call mmu_notifier_arch_invalidate_secondary_tlbs(), see * Documentation/mm/mmu_notifier.rst. */ mmu_notifier_invalidate_range_end(&range); tlb_finish_mmu(&tlb); } /* * This function will unconditionally remove all the shared pmd pgtable entries * within the specific vma for a hugetlbfs memory range. */ void hugetlb_unshare_all_pmds(struct vm_area_struct *vma) { hugetlb_unshare_pmds(vma, ALIGN(vma->vm_start, PUD_SIZE), ALIGN_DOWN(vma->vm_end, PUD_SIZE), /* take_locks = */ true); } /* * For hugetlb, mremap() is an odd edge case - while the VMA copying is * performed, we permit both the old and new VMAs to reference the same * reservation. * * We fix this up after the operation succeeds, or if a newly allocated VMA * is closed as a result of a failure to allocate memory. */ void fixup_hugetlb_reservations(struct vm_area_struct *vma) { if (is_vm_hugetlb_page(vma)) clear_vma_resv_huge_pages(vma); } |
| 5 195 1 93 70 1909 1913 80 4170 208 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_SWAP_H #define _LINUX_SWAP_H #include <linux/spinlock.h> #include <linux/linkage.h> #include <linux/mmzone.h> #include <linux/list.h> #include <linux/memcontrol.h> #include <linux/sched.h> #include <linux/node.h> #include <linux/fs.h> #include <linux/pagemap.h> #include <linux/atomic.h> #include <linux/page-flags.h> #include <uapi/linux/mempolicy.h> #include <asm/page.h> struct notifier_block; struct bio; struct pagevec; #define SWAP_FLAG_PREFER 0x8000 /* set if swap priority specified */ #define SWAP_FLAG_PRIO_MASK 0x7fff #define SWAP_FLAG_DISCARD 0x10000 /* enable discard for swap */ #define SWAP_FLAG_DISCARD_ONCE 0x20000 /* discard swap area at swapon-time */ #define SWAP_FLAG_DISCARD_PAGES 0x40000 /* discard page-clusters after use */ #define SWAP_FLAGS_VALID (SWAP_FLAG_PRIO_MASK | SWAP_FLAG_PREFER | \ SWAP_FLAG_DISCARD | SWAP_FLAG_DISCARD_ONCE | \ SWAP_FLAG_DISCARD_PAGES) #define SWAP_BATCH 64 static inline int current_is_kswapd(void) { return current->flags & PF_KSWAPD; } /* * MAX_SWAPFILES defines the maximum number of swaptypes: things which can * be swapped to. The swap type and the offset into that swap type are * encoded into pte's and into pgoff_t's in the swapcache. Using five bits * for the type means that the maximum number of swapcache pages is 27 bits * on 32-bit-pgoff_t architectures. And that assumes that the architecture packs * the type/offset into the pte as 5/27 as well. */ #define MAX_SWAPFILES_SHIFT 5 /* * Use some of the swap files numbers for other purposes. This * is a convenient way to hook into the VM to trigger special * actions on faults. */ /* * PTE markers are used to persist information onto PTEs that otherwise * should be a none pte. As its name "PTE" hints, it should only be * applied to the leaves of pgtables. */ #define SWP_PTE_MARKER_NUM 1 #define SWP_PTE_MARKER (MAX_SWAPFILES + SWP_HWPOISON_NUM + \ SWP_MIGRATION_NUM + SWP_DEVICE_NUM) /* * Unaddressable device memory support. See include/linux/hmm.h and * Documentation/mm/hmm.rst. Short description is we need struct pages for * device memory that is unaddressable (inaccessible) by CPU, so that we can * migrate part of a process memory to device memory. * * When a page is migrated from CPU to device, we set the CPU page table entry * to a special SWP_DEVICE_{READ|WRITE} entry. * * When a page is mapped by the device for exclusive access we set the CPU page * table entries to a special SWP_DEVICE_EXCLUSIVE entry. */ #ifdef CONFIG_DEVICE_PRIVATE #define SWP_DEVICE_NUM 3 #define SWP_DEVICE_WRITE (MAX_SWAPFILES+SWP_HWPOISON_NUM+SWP_MIGRATION_NUM) #define SWP_DEVICE_READ (MAX_SWAPFILES+SWP_HWPOISON_NUM+SWP_MIGRATION_NUM+1) #define SWP_DEVICE_EXCLUSIVE (MAX_SWAPFILES+SWP_HWPOISON_NUM+SWP_MIGRATION_NUM+2) #else #define SWP_DEVICE_NUM 0 #endif /* * Page migration support. * * SWP_MIGRATION_READ_EXCLUSIVE is only applicable to anonymous pages and * indicates that the referenced (part of) an anonymous page is exclusive to * a single process. For SWP_MIGRATION_WRITE, that information is implicit: * (part of) an anonymous page that are mapped writable are exclusive to a * single process. */ #ifdef CONFIG_MIGRATION #define SWP_MIGRATION_NUM 3 #define SWP_MIGRATION_READ (MAX_SWAPFILES + SWP_HWPOISON_NUM) #define SWP_MIGRATION_READ_EXCLUSIVE (MAX_SWAPFILES + SWP_HWPOISON_NUM + 1) #define SWP_MIGRATION_WRITE (MAX_SWAPFILES + SWP_HWPOISON_NUM + 2) #else #define SWP_MIGRATION_NUM 0 #endif /* * Handling of hardware poisoned pages with memory corruption. */ #ifdef CONFIG_MEMORY_FAILURE #define SWP_HWPOISON_NUM 1 #define SWP_HWPOISON MAX_SWAPFILES #else #define SWP_HWPOISON_NUM 0 #endif #define MAX_SWAPFILES \ ((1 << MAX_SWAPFILES_SHIFT) - SWP_DEVICE_NUM - \ SWP_MIGRATION_NUM - SWP_HWPOISON_NUM - \ SWP_PTE_MARKER_NUM) /* * Magic header for a swap area. The first part of the union is * what the swap magic looks like for the old (limited to 128MB) * swap area format, the second part of the union adds - in the * old reserved area - some extra information. Note that the first * kilobyte is reserved for boot loader or disk label stuff... * * Having the magic at the end of the PAGE_SIZE makes detecting swap * areas somewhat tricky on machines that support multiple page sizes. * For 2.5 we'll probably want to move the magic to just beyond the * bootbits... */ union swap_header { struct { char reserved[PAGE_SIZE - 10]; char magic[10]; /* SWAP-SPACE or SWAPSPACE2 */ } magic; struct { char bootbits[1024]; /* Space for disklabel etc. */ __u32 version; __u32 last_page; __u32 nr_badpages; unsigned char sws_uuid[16]; unsigned char sws_volume[16]; __u32 padding[117]; __u32 badpages[1]; } info; }; /* * current->reclaim_state points to one of these when a task is running * memory reclaim */ struct reclaim_state { /* pages reclaimed outside of LRU-based reclaim */ unsigned long reclaimed; #ifdef CONFIG_LRU_GEN /* per-thread mm walk data */ struct lru_gen_mm_walk *mm_walk; #endif }; /* * mm_account_reclaimed_pages(): account reclaimed pages outside of LRU-based * reclaim * @pages: number of pages reclaimed * * If the current process is undergoing a reclaim operation, increment the * number of reclaimed pages by @pages. */ static inline void mm_account_reclaimed_pages(unsigned long pages) { if (current->reclaim_state) current->reclaim_state->reclaimed += pages; } #ifdef __KERNEL__ struct address_space; struct sysinfo; struct writeback_control; struct zone; /* * A swap extent maps a range of a swapfile's PAGE_SIZE pages onto a range of * disk blocks. A rbtree of swap extents maps the entire swapfile (Where the * term `swapfile' refers to either a blockdevice or an IS_REG file). Apart * from setup, they're handled identically. * * We always assume that blocks are of size PAGE_SIZE. */ struct swap_extent { struct rb_node rb_node; pgoff_t start_page; pgoff_t nr_pages; sector_t start_block; }; /* * Max bad pages in the new format.. */ #define MAX_SWAP_BADPAGES \ ((offsetof(union swap_header, magic.magic) - \ offsetof(union swap_header, info.badpages)) / sizeof(int)) enum { SWP_USED = (1 << 0), /* is slot in swap_info[] used? */ SWP_WRITEOK = (1 << 1), /* ok to write to this swap? */ SWP_DISCARDABLE = (1 << 2), /* blkdev support discard */ SWP_DISCARDING = (1 << 3), /* now discarding a free cluster */ SWP_SOLIDSTATE = (1 << 4), /* blkdev seeks are cheap */ SWP_CONTINUED = (1 << 5), /* swap_map has count continuation */ SWP_BLKDEV = (1 << 6), /* its a block device */ SWP_ACTIVATED = (1 << 7), /* set after swap_activate success */ SWP_FS_OPS = (1 << 8), /* swapfile operations go through fs */ SWP_AREA_DISCARD = (1 << 9), /* single-time swap area discards */ SWP_PAGE_DISCARD = (1 << 10), /* freed swap page-cluster discards */ SWP_STABLE_WRITES = (1 << 11), /* no overwrite PG_writeback pages */ SWP_SYNCHRONOUS_IO = (1 << 12), /* synchronous IO is efficient */ /* add others here before... */ }; #define SWAP_CLUSTER_MAX 32UL #define SWAP_CLUSTER_MAX_SKIPPED (SWAP_CLUSTER_MAX << 10) #define COMPACT_CLUSTER_MAX SWAP_CLUSTER_MAX /* Bit flag in swap_map */ #define COUNT_CONTINUED 0x80 /* Flag swap_map continuation for full count */ /* Special value in first swap_map */ #define SWAP_MAP_MAX 0x3e /* Max count */ #define SWAP_MAP_BAD 0x3f /* Note page is bad */ /* Special value in each swap_map continuation */ #define SWAP_CONT_MAX 0x7f /* Max count */ /* * The first page in the swap file is the swap header, which is always marked * bad to prevent it from being allocated as an entry. This also prevents the * cluster to which it belongs being marked free. Therefore 0 is safe to use as * a sentinel to indicate an entry is not valid. */ #define SWAP_ENTRY_INVALID 0 #ifdef CONFIG_THP_SWAP #define SWAP_NR_ORDERS (PMD_ORDER + 1) #else #define SWAP_NR_ORDERS 1 #endif /* * We keep using same cluster for rotational device so IO will be sequential. * The purpose is to optimize SWAP throughput on these device. */ struct swap_sequential_cluster { unsigned int next[SWAP_NR_ORDERS]; /* Likely next allocation offset */ }; /* * The in-memory structure used to track swap areas. */ struct swap_info_struct { struct percpu_ref users; /* indicate and keep swap device valid. */ unsigned long flags; /* SWP_USED etc: see above */ signed short prio; /* swap priority of this type */ struct plist_node list; /* entry in swap_active_head */ signed char type; /* strange name for an index */ unsigned int max; /* extent of the swap_map */ unsigned char *swap_map; /* vmalloc'ed array of usage counts */ unsigned long *zeromap; /* kvmalloc'ed bitmap to track zero pages */ struct swap_cluster_info *cluster_info; /* cluster info. Only for SSD */ struct list_head free_clusters; /* free clusters list */ struct list_head full_clusters; /* full clusters list */ struct list_head nonfull_clusters[SWAP_NR_ORDERS]; /* list of cluster that contains at least one free slot */ struct list_head frag_clusters[SWAP_NR_ORDERS]; /* list of cluster that are fragmented or contented */ unsigned int pages; /* total of usable pages of swap */ atomic_long_t inuse_pages; /* number of those currently in use */ struct swap_sequential_cluster *global_cluster; /* Use one global cluster for rotating device */ spinlock_t global_cluster_lock; /* Serialize usage of global cluster */ struct rb_root swap_extent_root;/* root of the swap extent rbtree */ struct block_device *bdev; /* swap device or bdev of swap file */ struct file *swap_file; /* seldom referenced */ struct completion comp; /* seldom referenced */ spinlock_t lock; /* * protect map scan related fields like * swap_map, inuse_pages and all cluster * lists. other fields are only changed * at swapon/swapoff, so are protected * by swap_lock. changing flags need * hold this lock and swap_lock. If * both locks need hold, hold swap_lock * first. */ spinlock_t cont_lock; /* * protect swap count continuation page * list. */ struct work_struct discard_work; /* discard worker */ struct work_struct reclaim_work; /* reclaim worker */ struct list_head discard_clusters; /* discard clusters list */ struct plist_node avail_list; /* entry in swap_avail_head */ }; static inline swp_entry_t page_swap_entry(struct page *page) { struct folio *folio = page_folio(page); swp_entry_t entry = folio->swap; entry.val += folio_page_idx(folio, page); return entry; } /* linux/mm/workingset.c */ bool workingset_test_recent(void *shadow, bool file, bool *workingset, bool flush); void workingset_age_nonresident(struct lruvec *lruvec, unsigned long nr_pages); void *workingset_eviction(struct folio *folio, struct mem_cgroup *target_memcg); void workingset_refault(struct folio *folio, void *shadow); void workingset_activation(struct folio *folio); /* linux/mm/page_alloc.c */ extern unsigned long totalreserve_pages; /* Definition of global_zone_page_state not available yet */ #define nr_free_pages() global_zone_page_state(NR_FREE_PAGES) /* linux/mm/swap.c */ void lru_note_cost_unlock_irq(struct lruvec *lruvec, bool file, unsigned int nr_io, unsigned int nr_rotated) __releases(lruvec->lru_lock); void lru_note_cost_refault(struct folio *); void folio_add_lru(struct folio *); void folio_add_lru_vma(struct folio *, struct vm_area_struct *); void mark_page_accessed(struct page *); void folio_mark_accessed(struct folio *); static inline bool folio_may_be_lru_cached(struct folio *folio) { /* * Holding PMD-sized folios in per-CPU LRU cache unbalances accounting. * Holding small numbers of low-order mTHP folios in per-CPU LRU cache * will be sensible, but nobody has implemented and tested that yet. */ return !folio_test_large(folio); } extern atomic_t lru_disable_count; static inline bool lru_cache_disabled(void) { return atomic_read(&lru_disable_count); } static inline void lru_cache_enable(void) { atomic_dec(&lru_disable_count); } extern void lru_cache_disable(void); extern void lru_add_drain(void); extern void lru_add_drain_cpu(int cpu); extern void lru_add_drain_cpu_zone(struct zone *zone); extern void lru_add_drain_all(void); void folio_deactivate(struct folio *folio); void folio_mark_lazyfree(struct folio *folio); extern void swap_setup(void); /* linux/mm/vmscan.c */ extern unsigned long zone_reclaimable_pages(struct zone *zone); extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order, gfp_t gfp_mask, nodemask_t *mask); #define MEMCG_RECLAIM_MAY_SWAP (1 << 1) #define MEMCG_RECLAIM_PROACTIVE (1 << 2) #define MIN_SWAPPINESS 0 #define MAX_SWAPPINESS 200 /* Just reclaim from anon folios in proactive memory reclaim */ #define SWAPPINESS_ANON_ONLY (MAX_SWAPPINESS + 1) extern unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, unsigned long nr_pages, gfp_t gfp_mask, unsigned int reclaim_options, int *swappiness); extern unsigned long mem_cgroup_shrink_node(struct mem_cgroup *mem, gfp_t gfp_mask, bool noswap, pg_data_t *pgdat, unsigned long *nr_scanned); extern unsigned long shrink_all_memory(unsigned long nr_pages); extern int vm_swappiness; long remove_mapping(struct address_space *mapping, struct folio *folio); #if defined(CONFIG_SYSFS) && defined(CONFIG_NUMA) extern int reclaim_register_node(struct node *node); extern void reclaim_unregister_node(struct node *node); #else static inline int reclaim_register_node(struct node *node) { return 0; } static inline void reclaim_unregister_node(struct node *node) { } #endif /* CONFIG_SYSFS && CONFIG_NUMA */ #ifdef CONFIG_NUMA extern int sysctl_min_unmapped_ratio; extern int sysctl_min_slab_ratio; #endif void check_move_unevictable_folios(struct folio_batch *fbatch); extern void __meminit kswapd_run(int nid); extern void __meminit kswapd_stop(int nid); #ifdef CONFIG_SWAP int add_swap_extent(struct swap_info_struct *sis, unsigned long start_page, unsigned long nr_pages, sector_t start_block); int generic_swapfile_activate(struct swap_info_struct *, struct file *, sector_t *); static inline unsigned long total_swapcache_pages(void) { return global_node_page_state(NR_SWAPCACHE); } void free_swap_cache(struct folio *folio); void free_folio_and_swap_cache(struct folio *folio); void free_pages_and_swap_cache(struct encoded_page **, int); /* linux/mm/swapfile.c */ extern atomic_long_t nr_swap_pages; extern long total_swap_pages; extern atomic_t nr_rotate_swap; /* Swap 50% full? Release swapcache more aggressively.. */ static inline bool vm_swap_full(void) { return atomic_long_read(&nr_swap_pages) * 2 < total_swap_pages; } static inline long get_nr_swap_pages(void) { return atomic_long_read(&nr_swap_pages); } extern void si_swapinfo(struct sysinfo *); extern int add_swap_count_continuation(swp_entry_t, gfp_t); int swap_type_of(dev_t device, sector_t offset); int find_first_swap(dev_t *device); extern unsigned int count_swap_pages(int, int); extern sector_t swapdev_block(int, pgoff_t); extern int __swap_count(swp_entry_t entry); extern bool swap_entry_swapped(struct swap_info_struct *si, swp_entry_t entry); extern int swp_swapcount(swp_entry_t entry); struct backing_dev_info; extern struct swap_info_struct *get_swap_device(swp_entry_t entry); sector_t swap_folio_sector(struct folio *folio); /* * If there is an existing swap slot reference (swap entry) and the caller * guarantees that there is no race modification of it (e.g., PTL * protecting the swap entry in page table; shmem's cmpxchg protects t * he swap entry in shmem mapping), these two helpers below can be used * to put/dup the entries directly. * * All entries must be allocated by folio_alloc_swap(). And they must have * a swap count > 1. See comments of folio_*_swap helpers for more info. */ int swap_dup_entry_direct(swp_entry_t entry); void swap_put_entries_direct(swp_entry_t entry, int nr); /* * folio_free_swap tries to free the swap entries pinned by a swap cache * folio, it has to be here to be called by other components. */ bool folio_free_swap(struct folio *folio); /* Allocate / free (hibernation) exclusive entries */ swp_entry_t swap_alloc_hibernation_slot(int type); void swap_free_hibernation_slot(swp_entry_t entry); static inline void put_swap_device(struct swap_info_struct *si) { percpu_ref_put(&si->users); } #else /* CONFIG_SWAP */ static inline struct swap_info_struct *get_swap_device(swp_entry_t entry) { return NULL; } static inline void put_swap_device(struct swap_info_struct *si) { } #define get_nr_swap_pages() 0L #define total_swap_pages 0L #define total_swapcache_pages() 0UL #define vm_swap_full() 0 #define si_swapinfo(val) \ do { (val)->freeswap = (val)->totalswap = 0; } while (0) #define free_folio_and_swap_cache(folio) \ folio_put(folio) #define free_pages_and_swap_cache(pages, nr) \ release_pages((pages), (nr)); static inline void free_swap_cache(struct folio *folio) { } static inline int add_swap_count_continuation(swp_entry_t swp, gfp_t gfp_mask) { return 0; } static inline int swap_dup_entry_direct(swp_entry_t ent) { return 0; } static inline void swap_put_entries_direct(swp_entry_t ent, int nr) { } static inline int __swap_count(swp_entry_t entry) { return 0; } static inline bool swap_entry_swapped(struct swap_info_struct *si, swp_entry_t entry) { return false; } static inline int swp_swapcount(swp_entry_t entry) { return 0; } static inline bool folio_free_swap(struct folio *folio) { return false; } static inline int add_swap_extent(struct swap_info_struct *sis, unsigned long start_page, unsigned long nr_pages, sector_t start_block) { return -EINVAL; } #endif /* CONFIG_SWAP */ #ifdef CONFIG_MEMCG static inline int mem_cgroup_swappiness(struct mem_cgroup *memcg) { /* Cgroup2 doesn't have per-cgroup swappiness */ if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) return READ_ONCE(vm_swappiness); /* root ? */ if (mem_cgroup_disabled() || mem_cgroup_is_root(memcg)) return READ_ONCE(vm_swappiness); return READ_ONCE(memcg->swappiness); } #else static inline int mem_cgroup_swappiness(struct mem_cgroup *mem) { return READ_ONCE(vm_swappiness); } #endif #if defined(CONFIG_SWAP) && defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP) void __folio_throttle_swaprate(struct folio *folio, gfp_t gfp); static inline void folio_throttle_swaprate(struct folio *folio, gfp_t gfp) { if (mem_cgroup_disabled()) return; __folio_throttle_swaprate(folio, gfp); } #else static inline void folio_throttle_swaprate(struct folio *folio, gfp_t gfp) { } #endif #if defined(CONFIG_MEMCG) && defined(CONFIG_SWAP) int __mem_cgroup_try_charge_swap(struct folio *folio, swp_entry_t entry); static inline int mem_cgroup_try_charge_swap(struct folio *folio, swp_entry_t entry) { if (mem_cgroup_disabled()) return 0; return __mem_cgroup_try_charge_swap(folio, entry); } extern void __mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages); static inline void mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages) { if (mem_cgroup_disabled()) return; __mem_cgroup_uncharge_swap(entry, nr_pages); } extern long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg); extern bool mem_cgroup_swap_full(struct folio *folio); #else static inline int mem_cgroup_try_charge_swap(struct folio *folio, swp_entry_t entry) { return 0; } static inline void mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages) { } static inline long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg) { return get_nr_swap_pages(); } static inline bool mem_cgroup_swap_full(struct folio *folio) { return vm_swap_full(); } #endif #endif /* __KERNEL__*/ #endif /* _LINUX_SWAP_H */ |
| 3 545 547 3 3 3 60 61 12 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 | // SPDX-License-Identifier: GPL-2.0 #include <linux/rtnetlink.h> #include <linux/notifier.h> #include <linux/socket.h> #include <linux/kernel.h> #include <linux/export.h> #include <net/net_namespace.h> #include <net/fib_notifier.h> #include <net/ip_fib.h> int call_fib4_notifier(struct notifier_block *nb, enum fib_event_type event_type, struct fib_notifier_info *info) { info->family = AF_INET; return call_fib_notifier(nb, event_type, info); } int call_fib4_notifiers(struct net *net, enum fib_event_type event_type, struct fib_notifier_info *info) { ASSERT_RTNL(); info->family = AF_INET; /* Paired with READ_ONCE() in fib4_seq_read() */ WRITE_ONCE(net->ipv4.fib_seq, net->ipv4.fib_seq + 1); return call_fib_notifiers(net, event_type, info); } static unsigned int fib4_seq_read(const struct net *net) { /* Paired with WRITE_ONCE() in call_fib4_notifiers() */ return READ_ONCE(net->ipv4.fib_seq) + fib4_rules_seq_read(net); } static int fib4_dump(struct net *net, struct notifier_block *nb, struct netlink_ext_ack *extack) { int err; err = fib4_rules_dump(net, nb, extack); if (err) return err; return fib_notify(net, nb, extack); } static const struct fib_notifier_ops fib4_notifier_ops_template = { .family = AF_INET, .fib_seq_read = fib4_seq_read, .fib_dump = fib4_dump, .owner = THIS_MODULE, }; int __net_init fib4_notifier_init(struct net *net) { struct fib_notifier_ops *ops; net->ipv4.fib_seq = 0; ops = fib_notifier_ops_register(&fib4_notifier_ops_template, net); if (IS_ERR(ops)) return PTR_ERR(ops); net->ipv4.notifier_ops = ops; return 0; } void __net_exit fib4_notifier_exit(struct net *net) { fib_notifier_ops_unregister(net->ipv4.notifier_ops); } |
| 7 1 6 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 | // SPDX-License-Identifier: GPL-2.0-only /* * iptables module for DCCP protocol header matching * * (C) 2005 by Harald Welte <laforge@netfilter.org> */ #include <linux/module.h> #include <linux/skbuff.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <net/ip.h> #include <linux/dccp.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter/xt_dccp.h> #include <linux/netfilter_ipv4/ip_tables.h> #include <linux/netfilter_ipv6/ip6_tables.h> MODULE_LICENSE("GPL"); MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>"); MODULE_DESCRIPTION("Xtables: DCCP protocol packet match"); MODULE_ALIAS("ipt_dccp"); MODULE_ALIAS("ip6t_dccp"); #define DCCHECK(cond, option, flag, invflag) (!((flag) & (option)) \ || (!!((invflag) & (option)) ^ (cond))) static unsigned char *dccp_optbuf; static DEFINE_SPINLOCK(dccp_buflock); static inline bool dccp_find_option(u_int8_t option, const struct sk_buff *skb, unsigned int protoff, const struct dccp_hdr *dh, bool *hotdrop) { /* tcp.doff is only 4 bits, ie. max 15 * 4 bytes */ const unsigned char *op; unsigned int optoff = __dccp_hdr_len(dh); unsigned int optlen = dh->dccph_doff*4 - __dccp_hdr_len(dh); unsigned int i; if (dh->dccph_doff * 4 < __dccp_hdr_len(dh)) goto invalid; if (!optlen) return false; spin_lock_bh(&dccp_buflock); op = skb_header_pointer(skb, protoff + optoff, optlen, dccp_optbuf); if (op == NULL) { /* If we don't have the whole header, drop packet. */ goto partial; } for (i = 0; i < optlen; ) { if (op[i] == option) { spin_unlock_bh(&dccp_buflock); return true; } if (op[i] < 2) i++; else i += op[i+1]?:1; } spin_unlock_bh(&dccp_buflock); return false; partial: spin_unlock_bh(&dccp_buflock); invalid: *hotdrop = true; return false; } static inline bool match_types(const struct dccp_hdr *dh, u_int16_t typemask) { return typemask & (1 << dh->dccph_type); } static inline bool match_option(u_int8_t option, const struct sk_buff *skb, unsigned int protoff, const struct dccp_hdr *dh, bool *hotdrop) { return dccp_find_option(option, skb, protoff, dh, hotdrop); } static bool dccp_mt(const struct sk_buff *skb, struct xt_action_param *par) { const struct xt_dccp_info *info = par->matchinfo; const struct dccp_hdr *dh; struct dccp_hdr _dh; if (par->fragoff != 0) return false; dh = skb_header_pointer(skb, par->thoff, sizeof(_dh), &_dh); if (dh == NULL) { par->hotdrop = true; return false; } return DCCHECK(ntohs(dh->dccph_sport) >= info->spts[0] && ntohs(dh->dccph_sport) <= info->spts[1], XT_DCCP_SRC_PORTS, info->flags, info->invflags) && DCCHECK(ntohs(dh->dccph_dport) >= info->dpts[0] && ntohs(dh->dccph_dport) <= info->dpts[1], XT_DCCP_DEST_PORTS, info->flags, info->invflags) && DCCHECK(match_types(dh, info->typemask), XT_DCCP_TYPE, info->flags, info->invflags) && DCCHECK(match_option(info->option, skb, par->thoff, dh, &par->hotdrop), XT_DCCP_OPTION, info->flags, info->invflags); } static int dccp_mt_check(const struct xt_mtchk_param *par) { const struct xt_dccp_info *info = par->matchinfo; if (info->flags & ~XT_DCCP_VALID_FLAGS) return -EINVAL; if (info->invflags & ~XT_DCCP_VALID_FLAGS) return -EINVAL; if (info->invflags & ~info->flags) return -EINVAL; return 0; } static struct xt_match dccp_mt_reg[] __read_mostly = { { .name = "dccp", .family = NFPROTO_IPV4, .checkentry = dccp_mt_check, .match = dccp_mt, .matchsize = sizeof(struct xt_dccp_info), .proto = IPPROTO_DCCP, .me = THIS_MODULE, }, { .name = "dccp", .family = NFPROTO_IPV6, .checkentry = dccp_mt_check, .match = dccp_mt, .matchsize = sizeof(struct xt_dccp_info), .proto = IPPROTO_DCCP, .me = THIS_MODULE, }, }; static int __init dccp_mt_init(void) { int ret; /* doff is 8 bits, so the maximum option size is (4*256). Don't put * this in BSS since DaveM is worried about locked TLB's for kernel * BSS. */ dccp_optbuf = kmalloc(256 * 4, GFP_KERNEL); if (!dccp_optbuf) return -ENOMEM; ret = xt_register_matches(dccp_mt_reg, ARRAY_SIZE(dccp_mt_reg)); if (ret) goto out_kfree; return ret; out_kfree: kfree(dccp_optbuf); return ret; } static void __exit dccp_mt_exit(void) { xt_unregister_matches(dccp_mt_reg, ARRAY_SIZE(dccp_mt_reg)); kfree(dccp_optbuf); } module_init(dccp_mt_init); module_exit(dccp_mt_exit); |
| 61 12 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 | // SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) /* Copyright (C) 2019 Netronome Systems, Inc. */ #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <net/snmp.h> #include <net/tls.h> #include "tls.h" #ifdef CONFIG_PROC_FS static const struct snmp_mib tls_mib_list[] = { SNMP_MIB_ITEM("TlsCurrTxSw", LINUX_MIB_TLSCURRTXSW), SNMP_MIB_ITEM("TlsCurrRxSw", LINUX_MIB_TLSCURRRXSW), SNMP_MIB_ITEM("TlsCurrTxDevice", LINUX_MIB_TLSCURRTXDEVICE), SNMP_MIB_ITEM("TlsCurrRxDevice", LINUX_MIB_TLSCURRRXDEVICE), SNMP_MIB_ITEM("TlsTxSw", LINUX_MIB_TLSTXSW), SNMP_MIB_ITEM("TlsRxSw", LINUX_MIB_TLSRXSW), SNMP_MIB_ITEM("TlsTxDevice", LINUX_MIB_TLSTXDEVICE), SNMP_MIB_ITEM("TlsRxDevice", LINUX_MIB_TLSRXDEVICE), SNMP_MIB_ITEM("TlsDecryptError", LINUX_MIB_TLSDECRYPTERROR), SNMP_MIB_ITEM("TlsRxDeviceResync", LINUX_MIB_TLSRXDEVICERESYNC), SNMP_MIB_ITEM("TlsDecryptRetry", LINUX_MIB_TLSDECRYPTRETRY), SNMP_MIB_ITEM("TlsRxNoPadViolation", LINUX_MIB_TLSRXNOPADVIOL), SNMP_MIB_ITEM("TlsRxRekeyOk", LINUX_MIB_TLSRXREKEYOK), SNMP_MIB_ITEM("TlsRxRekeyError", LINUX_MIB_TLSRXREKEYERROR), SNMP_MIB_ITEM("TlsTxRekeyOk", LINUX_MIB_TLSTXREKEYOK), SNMP_MIB_ITEM("TlsTxRekeyError", LINUX_MIB_TLSTXREKEYERROR), SNMP_MIB_ITEM("TlsRxRekeyReceived", LINUX_MIB_TLSRXREKEYRECEIVED), }; static int tls_statistics_seq_show(struct seq_file *seq, void *v) { unsigned long buf[ARRAY_SIZE(tls_mib_list)]; const int cnt = ARRAY_SIZE(tls_mib_list); struct net *net = seq->private; int i; memset(buf, 0, sizeof(buf)); snmp_get_cpu_field_batch_cnt(buf, tls_mib_list, cnt, net->mib.tls_statistics); for (i = 0; i < cnt; i++) seq_printf(seq, "%-32s\t%lu\n", tls_mib_list[i].name, buf[i]); return 0; } #endif int __net_init tls_proc_init(struct net *net) { #ifdef CONFIG_PROC_FS if (!proc_create_net_single("tls_stat", 0444, net->proc_net, tls_statistics_seq_show, NULL)) return -ENOMEM; #endif /* CONFIG_PROC_FS */ return 0; } void __net_exit tls_proc_fini(struct net *net) { remove_proc_entry("tls_stat", net->proc_net); } |
| 1 3 4 2 2 2 3 2 1 3 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 | // SPDX-License-Identifier: GPL-2.0-only /* * Line 6 Linux USB driver * * Copyright (C) 2004-2010 Markus Grabner (line6@grabner-graz.at) */ #include <linux/kernel.h> #include <linux/module.h> #include <linux/export.h> #include <linux/slab.h> #include <linux/usb.h> #include <sound/core.h> #include <sound/initval.h> #include <sound/hwdep.h> #include "capture.h" #include "driver.h" #include "midi.h" #include "playback.h" #define DRIVER_AUTHOR "Markus Grabner <line6@grabner-graz.at>" #define DRIVER_DESC "Line 6 USB Driver" /* This is Line 6's MIDI manufacturer ID. */ const unsigned char line6_midi_id[3] = { 0x00, 0x01, 0x0c }; EXPORT_SYMBOL_GPL(line6_midi_id); /* Code to request version of POD, Variax interface (and maybe other devices). */ static const char line6_request_version[] = { 0xf0, 0x7e, 0x7f, 0x06, 0x01, 0xf7 }; /* Class for asynchronous messages. */ struct message { struct usb_line6 *line6; const char *buffer; int size; int done; }; /* Forward declarations. */ static void line6_data_received(struct urb *urb); static int line6_send_raw_message_async_part(struct message *msg, struct urb *urb); /* Start to listen on endpoint. */ static int line6_start_listen(struct usb_line6 *line6) { int err; if (line6->properties->capabilities & LINE6_CAP_CONTROL_MIDI) { usb_fill_int_urb(line6->urb_listen, line6->usbdev, usb_rcvintpipe(line6->usbdev, line6->properties->ep_ctrl_r), line6->buffer_listen, LINE6_BUFSIZE_LISTEN, line6_data_received, line6, line6->interval); } else { usb_fill_bulk_urb(line6->urb_listen, line6->usbdev, usb_rcvbulkpipe(line6->usbdev, line6->properties->ep_ctrl_r), line6->buffer_listen, LINE6_BUFSIZE_LISTEN, line6_data_received, line6); } /* sanity checks of EP before actually submitting */ if (usb_urb_ep_type_check(line6->urb_listen)) { dev_err(line6->ifcdev, "invalid control EP\n"); return -EINVAL; } line6->urb_listen->actual_length = 0; err = usb_submit_urb(line6->urb_listen, GFP_ATOMIC); return err; } /* Stop listening on endpoint. */ static void line6_stop_listen(struct usb_line6 *line6) { usb_kill_urb(line6->urb_listen); } /* Send raw message in pieces of wMaxPacketSize bytes. */ int line6_send_raw_message(struct usb_line6 *line6, const char *buffer, int size) { int i, done = 0; const struct line6_properties *properties = line6->properties; for (i = 0; i < size; i += line6->max_packet_size) { int partial; const char *frag_buf = buffer + i; int frag_size = min(line6->max_packet_size, size - i); int retval; if (properties->capabilities & LINE6_CAP_CONTROL_MIDI) { retval = usb_interrupt_msg(line6->usbdev, usb_sndintpipe(line6->usbdev, properties->ep_ctrl_w), (char *)frag_buf, frag_size, &partial, LINE6_TIMEOUT); } else { retval = usb_bulk_msg(line6->usbdev, usb_sndbulkpipe(line6->usbdev, properties->ep_ctrl_w), (char *)frag_buf, frag_size, &partial, LINE6_TIMEOUT); } if (retval) { dev_err(line6->ifcdev, "usb_bulk_msg failed (%d)\n", retval); break; } done += frag_size; } return done; } EXPORT_SYMBOL_GPL(line6_send_raw_message); /* Notification of completion of asynchronous request transmission. */ static void line6_async_request_sent(struct urb *urb) { struct message *msg = (struct message *)urb->context; if (msg->done >= msg->size) { usb_free_urb(urb); kfree(msg); } else line6_send_raw_message_async_part(msg, urb); } /* Asynchronously send part of a raw message. */ static int line6_send_raw_message_async_part(struct message *msg, struct urb *urb) { int retval; struct usb_line6 *line6 = msg->line6; int done = msg->done; int bytes = min(msg->size - done, line6->max_packet_size); if (line6->properties->capabilities & LINE6_CAP_CONTROL_MIDI) { usb_fill_int_urb(urb, line6->usbdev, usb_sndintpipe(line6->usbdev, line6->properties->ep_ctrl_w), (char *)msg->buffer + done, bytes, line6_async_request_sent, msg, line6->interval); } else { usb_fill_bulk_urb(urb, line6->usbdev, usb_sndbulkpipe(line6->usbdev, line6->properties->ep_ctrl_w), (char *)msg->buffer + done, bytes, line6_async_request_sent, msg); } msg->done += bytes; /* sanity checks of EP before actually submitting */ retval = usb_urb_ep_type_check(urb); if (retval < 0) goto error; retval = usb_submit_urb(urb, GFP_ATOMIC); if (retval < 0) goto error; return 0; error: dev_err(line6->ifcdev, "%s: usb_submit_urb failed (%d)\n", __func__, retval); usb_free_urb(urb); kfree(msg); return retval; } /* Asynchronously send raw message. */ int line6_send_raw_message_async(struct usb_line6 *line6, const char *buffer, int size) { struct message *msg; struct urb *urb; /* create message: */ msg = kzalloc(sizeof(struct message), GFP_ATOMIC); if (msg == NULL) return -ENOMEM; /* create URB: */ urb = usb_alloc_urb(0, GFP_ATOMIC); if (urb == NULL) { kfree(msg); return -ENOMEM; } /* set message data: */ msg->line6 = line6; msg->buffer = buffer; msg->size = size; msg->done = 0; /* start sending: */ return line6_send_raw_message_async_part(msg, urb); } EXPORT_SYMBOL_GPL(line6_send_raw_message_async); /* Send asynchronous device version request. */ int line6_version_request_async(struct usb_line6 *line6) { char *buffer; int retval; buffer = kmemdup(line6_request_version, sizeof(line6_request_version), GFP_ATOMIC); if (buffer == NULL) return -ENOMEM; retval = line6_send_raw_message_async(line6, buffer, sizeof(line6_request_version)); kfree(buffer); return retval; } EXPORT_SYMBOL_GPL(line6_version_request_async); /* Send sysex message in pieces of wMaxPacketSize bytes. */ int line6_send_sysex_message(struct usb_line6 *line6, const char *buffer, int size) { return line6_send_raw_message(line6, buffer, size + SYSEX_EXTRA_SIZE) - SYSEX_EXTRA_SIZE; } EXPORT_SYMBOL_GPL(line6_send_sysex_message); /* Allocate buffer for sysex message and prepare header. @param code sysex message code @param size number of bytes between code and sysex end */ char *line6_alloc_sysex_buffer(struct usb_line6 *line6, int code1, int code2, int size) { char *buffer = kmalloc(size + SYSEX_EXTRA_SIZE, GFP_ATOMIC); if (!buffer) return NULL; buffer[0] = LINE6_SYSEX_BEGIN; memcpy(buffer + 1, line6_midi_id, sizeof(line6_midi_id)); buffer[sizeof(line6_midi_id) + 1] = code1; buffer[sizeof(line6_midi_id) + 2] = code2; buffer[sizeof(line6_midi_id) + 3 + size] = LINE6_SYSEX_END; return buffer; } EXPORT_SYMBOL_GPL(line6_alloc_sysex_buffer); /* Notification of data received from the Line 6 device. */ static void line6_data_received(struct urb *urb) { struct usb_line6 *line6 = (struct usb_line6 *)urb->context; struct midi_buffer *mb = &line6->line6midi->midibuf_in; int done; if (urb->status == -ESHUTDOWN) return; if (line6->properties->capabilities & LINE6_CAP_CONTROL_MIDI) { scoped_guard(spinlock_irqsave, &line6->line6midi->lock) { done = line6_midibuf_write(mb, urb->transfer_buffer, urb->actual_length); if (done < urb->actual_length) { line6_midibuf_ignore(mb, done); dev_dbg(line6->ifcdev, "%d %d buffer overflow - message skipped\n", done, urb->actual_length); } } for (;;) { scoped_guard(spinlock_irqsave, &line6->line6midi->lock) { done = line6_midibuf_read(mb, line6->buffer_message, LINE6_MIDI_MESSAGE_MAXLEN, LINE6_MIDIBUF_READ_RX); } if (done <= 0) break; line6->message_length = done; line6_midi_receive(line6, line6->buffer_message, done); if (line6->process_message) line6->process_message(line6); } } else { line6->buffer_message = urb->transfer_buffer; line6->message_length = urb->actual_length; if (line6->process_message) line6->process_message(line6); line6->buffer_message = NULL; } line6_start_listen(line6); } #define LINE6_READ_WRITE_STATUS_DELAY 2 /* milliseconds */ #define LINE6_READ_WRITE_MAX_RETRIES 50 /* Read data from device. */ int line6_read_data(struct usb_line6 *line6, unsigned address, void *data, unsigned datalen) { struct usb_device *usbdev = line6->usbdev; int ret; u8 len; unsigned count; if (address > 0xffff || datalen > 0xff) return -EINVAL; /* query the serial number: */ ret = usb_control_msg_send(usbdev, 0, 0x67, USB_TYPE_VENDOR | USB_RECIP_DEVICE | USB_DIR_OUT, (datalen << 8) | 0x21, address, NULL, 0, LINE6_TIMEOUT, GFP_KERNEL); if (ret) { dev_err(line6->ifcdev, "read request failed (error %d)\n", ret); goto exit; } /* Wait for data length. We'll get 0xff until length arrives. */ for (count = 0; count < LINE6_READ_WRITE_MAX_RETRIES; count++) { mdelay(LINE6_READ_WRITE_STATUS_DELAY); ret = usb_control_msg_recv(usbdev, 0, 0x67, USB_TYPE_VENDOR | USB_RECIP_DEVICE | USB_DIR_IN, 0x0012, 0x0000, &len, 1, LINE6_TIMEOUT, GFP_KERNEL); if (ret) { dev_err(line6->ifcdev, "receive length failed (error %d)\n", ret); goto exit; } if (len != 0xff) break; } ret = -EIO; if (len == 0xff) { dev_err(line6->ifcdev, "read failed after %d retries\n", count); goto exit; } else if (len != datalen) { /* should be equal or something went wrong */ dev_err(line6->ifcdev, "length mismatch (expected %d, got %d)\n", (int)datalen, len); goto exit; } /* receive the result: */ ret = usb_control_msg_recv(usbdev, 0, 0x67, USB_TYPE_VENDOR | USB_RECIP_DEVICE | USB_DIR_IN, 0x0013, 0x0000, data, datalen, LINE6_TIMEOUT, GFP_KERNEL); if (ret) dev_err(line6->ifcdev, "read failed (error %d)\n", ret); exit: return ret; } EXPORT_SYMBOL_GPL(line6_read_data); /* Write data to device. */ int line6_write_data(struct usb_line6 *line6, unsigned address, void *data, unsigned datalen) { struct usb_device *usbdev = line6->usbdev; int ret; unsigned char *status; int count; if (address > 0xffff || datalen > 0xffff) return -EINVAL; status = kmalloc(1, GFP_KERNEL); if (!status) return -ENOMEM; ret = usb_control_msg_send(usbdev, 0, 0x67, USB_TYPE_VENDOR | USB_RECIP_DEVICE | USB_DIR_OUT, 0x0022, address, data, datalen, LINE6_TIMEOUT, GFP_KERNEL); if (ret) { dev_err(line6->ifcdev, "write request failed (error %d)\n", ret); goto exit; } for (count = 0; count < LINE6_READ_WRITE_MAX_RETRIES; count++) { mdelay(LINE6_READ_WRITE_STATUS_DELAY); ret = usb_control_msg_recv(usbdev, 0, 0x67, USB_TYPE_VENDOR | USB_RECIP_DEVICE | USB_DIR_IN, 0x0012, 0x0000, status, 1, LINE6_TIMEOUT, GFP_KERNEL); if (ret) { dev_err(line6->ifcdev, "receiving status failed (error %d)\n", ret); goto exit; } if (*status != 0xff) break; } if (*status == 0xff) { dev_err(line6->ifcdev, "write failed after %d retries\n", count); ret = -EIO; } else if (*status != 0) { dev_err(line6->ifcdev, "write failed (error %d)\n", ret); ret = -EIO; } exit: kfree(status); return ret; } EXPORT_SYMBOL_GPL(line6_write_data); /* Read Line 6 device serial number. (POD, TonePort, GuitarPort) */ int line6_read_serial_number(struct usb_line6 *line6, u32 *serial_number) { return line6_read_data(line6, 0x80d0, serial_number, sizeof(*serial_number)); } EXPORT_SYMBOL_GPL(line6_read_serial_number); /* Card destructor. */ static void line6_destruct(struct snd_card *card) { struct usb_line6 *line6 = card->private_data; struct usb_device *usbdev = line6->usbdev; /* Free buffer memory first. We cannot depend on the existence of private * data from the (podhd) module, it may be gone already during this call */ kfree(line6->buffer_message); kfree(line6->buffer_listen); /* then free URBs: */ usb_free_urb(line6->urb_listen); line6->urb_listen = NULL; /* decrement reference counters: */ usb_put_dev(usbdev); } static void line6_get_usb_properties(struct usb_line6 *line6) { struct usb_device *usbdev = line6->usbdev; const struct line6_properties *properties = line6->properties; int pipe; struct usb_host_endpoint *ep = NULL; if (properties->capabilities & LINE6_CAP_CONTROL) { if (properties->capabilities & LINE6_CAP_CONTROL_MIDI) { pipe = usb_rcvintpipe(line6->usbdev, line6->properties->ep_ctrl_r); } else { pipe = usb_rcvbulkpipe(line6->usbdev, line6->properties->ep_ctrl_r); } ep = usbdev->ep_in[usb_pipeendpoint(pipe)]; } /* Control data transfer properties */ if (ep) { line6->interval = ep->desc.bInterval; line6->max_packet_size = le16_to_cpu(ep->desc.wMaxPacketSize); } else { if (properties->capabilities & LINE6_CAP_CONTROL) { dev_err(line6->ifcdev, "endpoint not available, using fallback values"); } line6->interval = LINE6_FALLBACK_INTERVAL; line6->max_packet_size = LINE6_FALLBACK_MAXPACKETSIZE; } /* Isochronous transfer properties */ if (usbdev->speed == USB_SPEED_LOW) { line6->intervals_per_second = USB_LOW_INTERVALS_PER_SECOND; line6->iso_buffers = USB_LOW_ISO_BUFFERS; } else { line6->intervals_per_second = USB_HIGH_INTERVALS_PER_SECOND; line6->iso_buffers = USB_HIGH_ISO_BUFFERS; } } /* Enable buffering of incoming messages, flush the buffer */ static int line6_hwdep_open(struct snd_hwdep *hw, struct file *file) { struct usb_line6 *line6 = hw->private_data; /* NOTE: hwdep layer provides atomicity here */ line6->messages.active = 1; line6->messages.nonblock = file->f_flags & O_NONBLOCK ? 1 : 0; return 0; } /* Stop buffering */ static int line6_hwdep_release(struct snd_hwdep *hw, struct file *file) { struct usb_line6 *line6 = hw->private_data; line6->messages.active = 0; return 0; } /* Read from circular buffer, return to user */ static long line6_hwdep_read(struct snd_hwdep *hwdep, char __user *buf, long count, loff_t *offset) { struct usb_line6 *line6 = hwdep->private_data; long rv = 0; unsigned int out_count; if (mutex_lock_interruptible(&line6->messages.read_lock)) return -ERESTARTSYS; while (kfifo_len(&line6->messages.fifo) == 0) { mutex_unlock(&line6->messages.read_lock); if (line6->messages.nonblock) return -EAGAIN; rv = wait_event_interruptible( line6->messages.wait_queue, kfifo_len(&line6->messages.fifo) != 0); if (rv < 0) return rv; if (mutex_lock_interruptible(&line6->messages.read_lock)) return -ERESTARTSYS; } if (kfifo_peek_len(&line6->messages.fifo) > count) { /* Buffer too small; allow re-read of the current item... */ rv = -EINVAL; } else { rv = kfifo_to_user(&line6->messages.fifo, buf, count, &out_count); if (rv == 0) rv = out_count; } mutex_unlock(&line6->messages.read_lock); return rv; } /* Write directly (no buffering) to device by user*/ static long line6_hwdep_write(struct snd_hwdep *hwdep, const char __user *data, long count, loff_t *offset) { struct usb_line6 *line6 = hwdep->private_data; int rv; char *data_copy; if (count > line6->max_packet_size * LINE6_RAW_MESSAGES_MAXCOUNT) { /* This is an arbitrary limit - still better than nothing... */ return -EINVAL; } data_copy = memdup_user(data, count); if (IS_ERR(data_copy)) return PTR_ERR(data_copy); rv = line6_send_raw_message(line6, data_copy, count); kfree(data_copy); return rv; } static __poll_t line6_hwdep_poll(struct snd_hwdep *hwdep, struct file *file, poll_table *wait) { struct usb_line6 *line6 = hwdep->private_data; poll_wait(file, &line6->messages.wait_queue, wait); guard(mutex)(&line6->messages.read_lock); return kfifo_len(&line6->messages.fifo) == 0 ? 0 : EPOLLIN | EPOLLRDNORM; } static const struct snd_hwdep_ops hwdep_ops = { .open = line6_hwdep_open, .release = line6_hwdep_release, .read = line6_hwdep_read, .write = line6_hwdep_write, .poll = line6_hwdep_poll, }; /* Insert into circular buffer */ static void line6_hwdep_push_message(struct usb_line6 *line6) { if (!line6->messages.active) return; if (kfifo_avail(&line6->messages.fifo) >= line6->message_length) { /* No race condition here, there's only one writer */ kfifo_in(&line6->messages.fifo, line6->buffer_message, line6->message_length); } /* else TODO: signal overflow */ wake_up_interruptible(&line6->messages.wait_queue); } static int line6_hwdep_init(struct usb_line6 *line6) { int err; struct snd_hwdep *hwdep; /* TODO: usb_driver_claim_interface(); */ line6->process_message = line6_hwdep_push_message; line6->messages.active = 0; init_waitqueue_head(&line6->messages.wait_queue); mutex_init(&line6->messages.read_lock); INIT_KFIFO(line6->messages.fifo); err = snd_hwdep_new(line6->card, "config", 0, &hwdep); if (err < 0) goto end; strscpy(hwdep->name, "config"); hwdep->iface = SNDRV_HWDEP_IFACE_LINE6; hwdep->ops = hwdep_ops; hwdep->private_data = line6; hwdep->exclusive = true; end: return err; } static int line6_init_cap_control(struct usb_line6 *line6) { int ret; /* initialize USB buffers: */ line6->buffer_listen = kzalloc(LINE6_BUFSIZE_LISTEN, GFP_KERNEL); if (!line6->buffer_listen) return -ENOMEM; line6->urb_listen = usb_alloc_urb(0, GFP_KERNEL); if (!line6->urb_listen) return -ENOMEM; if (line6->properties->capabilities & LINE6_CAP_CONTROL_MIDI) { line6->buffer_message = kzalloc(LINE6_MIDI_MESSAGE_MAXLEN, GFP_KERNEL); if (!line6->buffer_message) return -ENOMEM; ret = line6_init_midi(line6); if (ret < 0) return ret; } else { ret = line6_hwdep_init(line6); if (ret < 0) return ret; } ret = line6_start_listen(line6); if (ret < 0) { dev_err(line6->ifcdev, "cannot start listening: %d\n", ret); return ret; } return 0; } static void line6_startup_work(struct work_struct *work) { struct usb_line6 *line6 = container_of(work, struct usb_line6, startup_work.work); if (line6->startup) line6->startup(line6); } /* Probe USB device. */ int line6_probe(struct usb_interface *interface, const struct usb_device_id *id, const char *driver_name, const struct line6_properties *properties, int (*private_init)(struct usb_line6 *, const struct usb_device_id *id), size_t data_size) { struct usb_device *usbdev = interface_to_usbdev(interface); struct snd_card *card; struct usb_line6 *line6; int interface_number; int ret; if (WARN_ON(data_size < sizeof(*line6))) return -EINVAL; /* we don't handle multiple configurations */ if (usbdev->descriptor.bNumConfigurations != 1) return -ENODEV; ret = snd_card_new(&interface->dev, SNDRV_DEFAULT_IDX1, SNDRV_DEFAULT_STR1, THIS_MODULE, data_size, &card); if (ret < 0) return ret; /* store basic data: */ line6 = card->private_data; line6->card = card; line6->properties = properties; line6->usbdev = usbdev; line6->ifcdev = &interface->dev; INIT_DELAYED_WORK(&line6->startup_work, line6_startup_work); strscpy(card->id, properties->id); strscpy(card->driver, driver_name); strscpy(card->shortname, properties->name); sprintf(card->longname, "Line 6 %s at USB %s", properties->name, dev_name(line6->ifcdev)); card->private_free = line6_destruct; usb_set_intfdata(interface, line6); /* increment reference counters: */ usb_get_dev(usbdev); /* initialize device info: */ dev_info(&interface->dev, "Line 6 %s found\n", properties->name); /* query interface number */ interface_number = interface->cur_altsetting->desc.bInterfaceNumber; /* TODO reserves the bus bandwidth even without actual transfer */ ret = usb_set_interface(usbdev, interface_number, properties->altsetting); if (ret < 0) { dev_err(&interface->dev, "set_interface failed\n"); goto error; } line6_get_usb_properties(line6); if (properties->capabilities & LINE6_CAP_CONTROL) { ret = line6_init_cap_control(line6); if (ret < 0) goto error; } /* initialize device data based on device: */ ret = private_init(line6, id); if (ret < 0) goto error; /* creation of additional special files should go here */ dev_info(&interface->dev, "Line 6 %s now attached\n", properties->name); return 0; error: /* we can call disconnect callback here because no close-sync is * needed yet at this point */ line6_disconnect(interface); return ret; } EXPORT_SYMBOL_GPL(line6_probe); /* Line 6 device disconnected. */ void line6_disconnect(struct usb_interface *interface) { struct usb_line6 *line6 = usb_get_intfdata(interface); struct usb_device *usbdev = interface_to_usbdev(interface); if (!line6) return; if (WARN_ON(usbdev != line6->usbdev)) return; cancel_delayed_work_sync(&line6->startup_work); if (line6->urb_listen != NULL) line6_stop_listen(line6); snd_card_disconnect(line6->card); if (line6->line6pcm) line6_pcm_disconnect(line6->line6pcm); if (line6->disconnect) line6->disconnect(line6); dev_info(&interface->dev, "Line 6 %s now disconnected\n", line6->properties->name); /* make sure the device isn't destructed twice: */ usb_set_intfdata(interface, NULL); snd_card_free_when_closed(line6->card); } EXPORT_SYMBOL_GPL(line6_disconnect); #ifdef CONFIG_PM /* Suspend Line 6 device. */ int line6_suspend(struct usb_interface *interface, pm_message_t message) { struct usb_line6 *line6 = usb_get_intfdata(interface); struct snd_line6_pcm *line6pcm = line6->line6pcm; snd_power_change_state(line6->card, SNDRV_CTL_POWER_D3hot); if (line6->properties->capabilities & LINE6_CAP_CONTROL) line6_stop_listen(line6); if (line6pcm != NULL) line6pcm->flags = 0; return 0; } EXPORT_SYMBOL_GPL(line6_suspend); /* Resume Line 6 device. */ int line6_resume(struct usb_interface *interface) { struct usb_line6 *line6 = usb_get_intfdata(interface); if (line6->properties->capabilities & LINE6_CAP_CONTROL) line6_start_listen(line6); snd_power_change_state(line6->card, SNDRV_CTL_POWER_D0); return 0; } EXPORT_SYMBOL_GPL(line6_resume); #endif /* CONFIG_PM */ MODULE_AUTHOR(DRIVER_AUTHOR); MODULE_DESCRIPTION(DRIVER_DESC); MODULE_LICENSE("GPL"); |
| 47 10200 21 5923 2182 47 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_RCULIST_BL_H #define _LINUX_RCULIST_BL_H /* * RCU-protected bl list version. See include/linux/list_bl.h. */ #include <linux/list_bl.h> #include <linux/rcupdate.h> static inline void hlist_bl_set_first_rcu(struct hlist_bl_head *h, struct hlist_bl_node *n) { LIST_BL_BUG_ON((unsigned long)n & LIST_BL_LOCKMASK); LIST_BL_BUG_ON(((unsigned long)h->first & LIST_BL_LOCKMASK) != LIST_BL_LOCKMASK); rcu_assign_pointer(h->first, (struct hlist_bl_node *)((unsigned long)n | LIST_BL_LOCKMASK)); } static inline struct hlist_bl_node *hlist_bl_first_rcu(struct hlist_bl_head *h) { return (struct hlist_bl_node *) ((unsigned long)rcu_dereference_check(h->first, hlist_bl_is_locked(h)) & ~LIST_BL_LOCKMASK); } /** * hlist_bl_del_rcu - deletes entry from hash list without re-initialization * @n: the element to delete from the hash list. * * Note: hlist_bl_unhashed() on entry does not return true after this, * the entry is in an undefined state. It is useful for RCU based * lockfree traversal. * * In particular, it means that we can not poison the forward * pointers that may still be used for walking the hash list. * * The caller must take whatever precautions are necessary * (such as holding appropriate locks) to avoid racing * with another list-mutation primitive, such as hlist_bl_add_head_rcu() * or hlist_bl_del_rcu(), running on this same list. * However, it is perfectly legal to run concurrently with * the _rcu list-traversal primitives, such as * hlist_bl_for_each_entry(). */ static inline void hlist_bl_del_rcu(struct hlist_bl_node *n) { __hlist_bl_del(n); n->pprev = LIST_POISON2; } /** * hlist_bl_add_head_rcu * @n: the element to add to the hash list. * @h: the list to add to. * * Description: * Adds the specified element to the specified hlist_bl, * while permitting racing traversals. * * The caller must take whatever precautions are necessary * (such as holding appropriate locks) to avoid racing * with another list-mutation primitive, such as hlist_bl_add_head_rcu() * or hlist_bl_del_rcu(), running on this same list. * However, it is perfectly legal to run concurrently with * the _rcu list-traversal primitives, such as * hlist_bl_for_each_entry_rcu(), used to prevent memory-consistency * problems on Alpha CPUs. Regardless of the type of CPU, the * list-traversal primitive must be guarded by rcu_read_lock(). */ static inline void hlist_bl_add_head_rcu(struct hlist_bl_node *n, struct hlist_bl_head *h) { struct hlist_bl_node *first; /* don't need hlist_bl_first_rcu because we're under lock */ first = hlist_bl_first(h); n->next = first; if (first) first->pprev = &n->next; n->pprev = &h->first; /* need _rcu because we can have concurrent lock free readers */ hlist_bl_set_first_rcu(h, n); } /** * hlist_bl_for_each_entry_rcu - iterate over rcu list of given type * @tpos: the type * to use as a loop cursor. * @pos: the &struct hlist_bl_node to use as a loop cursor. * @head: the head for your list. * @member: the name of the hlist_bl_node within the struct. * */ #define hlist_bl_for_each_entry_rcu(tpos, pos, head, member) \ for (pos = hlist_bl_first_rcu(head); \ pos && \ ({ tpos = hlist_bl_entry(pos, typeof(*tpos), member); 1; }); \ pos = rcu_dereference_raw(pos->next)) #endif |
| 5 4 4 2 2 2 1 2 2 1 1 1 3 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 | // SPDX-License-Identifier: GPL-2.0 #include <linux/file.h> #include <linux/fs.h> #include <linux/fsnotify_backend.h> #include <linux/idr.h> #include <linux/init.h> #include <linux/inotify.h> #include <linux/fanotify.h> #include <linux/kernel.h> #include <linux/namei.h> #include <linux/sched.h> #include <linux/types.h> #include <linux/seq_file.h> #include <linux/exportfs.h> #include "inotify/inotify.h" #include "fanotify/fanotify.h" #include "fdinfo.h" #include "fsnotify.h" #include "../internal.h" #if defined(CONFIG_PROC_FS) #if defined(CONFIG_INOTIFY_USER) || defined(CONFIG_FANOTIFY) static void show_fdinfo(struct seq_file *m, struct file *f, void (*show)(struct seq_file *m, struct fsnotify_mark *mark)) { struct fsnotify_group *group = f->private_data; struct fsnotify_mark *mark; fsnotify_group_lock(group); list_for_each_entry(mark, &group->marks_list, g_list) { show(m, mark); if (seq_has_overflowed(m)) break; } fsnotify_group_unlock(group); } #if defined(CONFIG_EXPORTFS) static void show_mark_fhandle(struct seq_file *m, struct inode *inode) { DEFINE_FLEX(struct file_handle, f, f_handle, handle_bytes, MAX_HANDLE_SZ); int size, ret, i; size = f->handle_bytes >> 2; if (!super_trylock_shared(inode->i_sb)) return; ret = exportfs_encode_fid(inode, (struct fid *)f->f_handle, &size); up_read(&inode->i_sb->s_umount); if ((ret == FILEID_INVALID) || (ret < 0)) return; f->handle_type = ret; f->handle_bytes = size * sizeof(u32); seq_printf(m, "fhandle-bytes:%x fhandle-type:%x f_handle:", f->handle_bytes, f->handle_type); for (i = 0; i < f->handle_bytes; i++) seq_printf(m, "%02x", (int)f->f_handle[i]); } #else static void show_mark_fhandle(struct seq_file *m, struct inode *inode) { } #endif #ifdef CONFIG_INOTIFY_USER static void inotify_fdinfo(struct seq_file *m, struct fsnotify_mark *mark) { struct inotify_inode_mark *inode_mark; struct inode *inode; if (mark->connector->type != FSNOTIFY_OBJ_TYPE_INODE) return; inode_mark = container_of(mark, struct inotify_inode_mark, fsn_mark); inode = igrab(fsnotify_conn_inode(mark->connector)); if (inode) { seq_printf(m, "inotify wd:%x ino:%lx sdev:%x mask:%x ignored_mask:0 ", inode_mark->wd, inode->i_ino, inode->i_sb->s_dev, inotify_mark_user_mask(mark)); show_mark_fhandle(m, inode); seq_putc(m, '\n'); iput(inode); } } void inotify_show_fdinfo(struct seq_file *m, struct file *f) { show_fdinfo(m, f, inotify_fdinfo); } #endif /* CONFIG_INOTIFY_USER */ #ifdef CONFIG_FANOTIFY static void fanotify_fdinfo(struct seq_file *m, struct fsnotify_mark *mark) { unsigned int mflags = fanotify_mark_user_flags(mark); struct inode *inode; if (mark->connector->type == FSNOTIFY_OBJ_TYPE_INODE) { inode = igrab(fsnotify_conn_inode(mark->connector)); if (!inode) return; seq_printf(m, "fanotify ino:%lx sdev:%x mflags:%x mask:%x ignored_mask:%x ", inode->i_ino, inode->i_sb->s_dev, mflags, mark->mask, mark->ignore_mask); show_mark_fhandle(m, inode); seq_putc(m, '\n'); iput(inode); } else if (mark->connector->type == FSNOTIFY_OBJ_TYPE_VFSMOUNT) { struct mount *mnt = fsnotify_conn_mount(mark->connector); seq_printf(m, "fanotify mnt_id:%x mflags:%x mask:%x ignored_mask:%x\n", mnt->mnt_id, mflags, mark->mask, mark->ignore_mask); } else if (mark->connector->type == FSNOTIFY_OBJ_TYPE_SB) { struct super_block *sb = fsnotify_conn_sb(mark->connector); seq_printf(m, "fanotify sdev:%x mflags:%x mask:%x ignored_mask:%x\n", sb->s_dev, mflags, mark->mask, mark->ignore_mask); } else if (mark->connector->type == FSNOTIFY_OBJ_TYPE_MNTNS) { struct mnt_namespace *mnt_ns = fsnotify_conn_mntns(mark->connector); seq_printf(m, "fanotify mnt_ns:%u mflags:%x mask:%x ignored_mask:%x\n", mnt_ns->ns.inum, mflags, mark->mask, mark->ignore_mask); } } void fanotify_show_fdinfo(struct seq_file *m, struct file *f) { struct fsnotify_group *group = f->private_data; seq_printf(m, "fanotify flags:%x event-flags:%x\n", group->fanotify_data.flags & FANOTIFY_INIT_FLAGS, group->fanotify_data.f_flags); show_fdinfo(m, f, fanotify_fdinfo); } #endif /* CONFIG_FANOTIFY */ #endif /* CONFIG_INOTIFY_USER || CONFIG_FANOTIFY */ #endif /* CONFIG_PROC_FS */ |
| 18 16 1 1 48 49 8 8 49 21 11 49 34 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 | /* * net/tipc/addr.c: TIPC address utility routines * * Copyright (c) 2000-2006, 2018, Ericsson AB * Copyright (c) 2004-2005, 2010-2011, Wind River Systems * Copyright (c) 2020-2021, Red Hat Inc * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the names of the copyright holders nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * Alternatively, this software may be distributed under the terms of the * GNU General Public License ("GPL") version 2 as published by the Free * Software Foundation. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #include "addr.h" #include "core.h" bool tipc_in_scope(bool legacy_format, u32 domain, u32 addr) { if (!domain || (domain == addr)) return true; if (!legacy_format) return false; if (domain == tipc_cluster_mask(addr)) /* domain <Z.C.0> */ return true; if (domain == (addr & TIPC_ZONE_CLUSTER_MASK)) /* domain <Z.C.0> */ return true; if (domain == (addr & TIPC_ZONE_MASK)) /* domain <Z.0.0> */ return true; return false; } void tipc_set_node_id(struct net *net, u8 *id) { struct tipc_net *tn = tipc_net(net); memcpy(tn->node_id, id, NODE_ID_LEN); tipc_nodeid2string(tn->node_id_string, id); tn->trial_addr = hash128to32(id); pr_info("Node identity %s, cluster identity %u\n", tipc_own_id_string(net), tn->net_id); } void tipc_set_node_addr(struct net *net, u32 addr) { struct tipc_net *tn = tipc_net(net); u8 node_id[NODE_ID_LEN] = {0,}; tn->node_addr = addr; if (!tipc_own_id(net)) { sprintf(node_id, "%x", addr); tipc_set_node_id(net, node_id); } tn->trial_addr = addr; tn->addr_trial_end = jiffies; pr_info("Node number set to %u\n", addr); } int tipc_nodeid2string(char *str, u8 *id) { int i; u8 c; /* Already a string ? */ for (i = 0; i < NODE_ID_LEN; i++) { c = id[i]; if (c >= '0' && c <= '9') continue; if (c >= 'A' && c <= 'Z') continue; if (c >= 'a' && c <= 'z') continue; if (c == '.') continue; if (c == ':') continue; if (c == '_') continue; if (c == '-') continue; if (c == '@') continue; if (c != 0) break; } if (i == NODE_ID_LEN) { memcpy(str, id, NODE_ID_LEN); str[NODE_ID_LEN] = 0; return i; } /* Translate to hex string */ for (i = 0; i < NODE_ID_LEN; i++) sprintf(&str[2 * i], "%02x", id[i]); /* Strip off trailing zeroes */ for (i = NODE_ID_STR_LEN - 2; str[i] == '0'; i--) str[i] = 0; return i + 1; } |
| 18 12 1 6 6 8 2 13 13 2 10 2 2 5 5 12 12 17 14 2 12 14 14 14 16 15 1 17 17 1 1 1 14 5 5 1 4 11 1 10 9 1 1 1 3 3 1 2 2 2 2 1 1 4 4 1 1 1 1 1 2 6 6 21 1 20 28 28 1 27 1 25 22 1 2 23 2 1 23 3 1 4 3 25 20 5 5 2 25 25 1 26 17 12 22 4 26 26 1 1 25 22 1 3 21 26 26 1 3 4 9 9 1 1 1 3 5 1 7 4 2 1 6 1 4 3 60 12 12 12 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 | // SPDX-License-Identifier: GPL-2.0-or-later /* L2TPv3 IP encapsulation support for IPv6 * * Copyright (c) 2012 Katalix Systems Ltd */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/icmp.h> #include <linux/module.h> #include <linux/skbuff.h> #include <linux/random.h> #include <linux/socket.h> #include <linux/l2tp.h> #include <linux/in.h> #include <linux/in6.h> #include <net/sock.h> #include <net/ip.h> #include <net/icmp.h> #include <net/udp.h> #include <net/inet_common.h> #include <net/tcp_states.h> #include <net/protocol.h> #include <net/xfrm.h> #include <net/net_namespace.h> #include <net/netns/generic.h> #include <net/transp_v6.h> #include <net/addrconf.h> #include <net/ip6_route.h> #include "l2tp_core.h" /* per-net private data for this module */ static unsigned int l2tp_ip6_net_id; struct l2tp_ip6_net { rwlock_t l2tp_ip6_lock; struct hlist_head l2tp_ip6_table; struct hlist_head l2tp_ip6_bind_table; }; struct l2tp_ip6_sock { /* inet_sock has to be the first member of l2tp_ip6_sock */ struct inet_sock inet; u32 conn_id; u32 peer_conn_id; struct ipv6_pinfo inet6; }; static struct l2tp_ip6_sock *l2tp_ip6_sk(const struct sock *sk) { return (struct l2tp_ip6_sock *)sk; } static struct l2tp_ip6_net *l2tp_ip6_pernet(const struct net *net) { return net_generic(net, l2tp_ip6_net_id); } static struct sock *__l2tp_ip6_bind_lookup(const struct net *net, const struct in6_addr *laddr, const struct in6_addr *raddr, int dif, u32 tunnel_id) { struct l2tp_ip6_net *pn = l2tp_ip6_pernet(net); struct sock *sk; sk_for_each_bound(sk, &pn->l2tp_ip6_bind_table) { const struct in6_addr *sk_laddr = inet6_rcv_saddr(sk); const struct in6_addr *sk_raddr = &sk->sk_v6_daddr; const struct l2tp_ip6_sock *l2tp = l2tp_ip6_sk(sk); int bound_dev_if; if (!net_eq(sock_net(sk), net)) continue; bound_dev_if = READ_ONCE(sk->sk_bound_dev_if); if (bound_dev_if && dif && bound_dev_if != dif) continue; if (sk_laddr && !ipv6_addr_any(sk_laddr) && !ipv6_addr_any(laddr) && !ipv6_addr_equal(sk_laddr, laddr)) continue; if (!ipv6_addr_any(sk_raddr) && raddr && !ipv6_addr_any(raddr) && !ipv6_addr_equal(sk_raddr, raddr)) continue; if (l2tp->conn_id != tunnel_id) continue; goto found; } sk = NULL; found: return sk; } /* When processing receive frames, there are two cases to * consider. Data frames consist of a non-zero session-id and an * optional cookie. Control frames consist of a regular L2TP header * preceded by 32-bits of zeros. * * L2TPv3 Session Header Over IP * * 0 1 2 3 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Session ID | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Cookie (optional, maximum 64 bits)... * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * * L2TPv3 Control Message Header Over IP * * 0 1 2 3 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | (32 bits of zeros) | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * |T|L|x|x|S|x|x|x|x|x|x|x| Ver | Length | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Control Connection ID | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Ns | Nr | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * * All control frames are passed to userspace. */ static int l2tp_ip6_recv(struct sk_buff *skb) { struct net *net = dev_net(skb->dev); struct l2tp_ip6_net *pn; struct sock *sk; u32 session_id; u32 tunnel_id; unsigned char *ptr, *optr; struct l2tp_session *session; struct l2tp_tunnel *tunnel = NULL; struct ipv6hdr *iph; pn = l2tp_ip6_pernet(net); if (!pskb_may_pull(skb, 4)) goto discard; /* Point to L2TP header */ optr = skb->data; ptr = skb->data; session_id = ntohl(*((__be32 *)ptr)); ptr += 4; /* RFC3931: L2TP/IP packets have the first 4 bytes containing * the session_id. If it is 0, the packet is a L2TP control * frame and the session_id value can be discarded. */ if (session_id == 0) { __skb_pull(skb, 4); goto pass_up; } /* Ok, this is a data packet. Lookup the session. */ session = l2tp_v3_session_get(net, NULL, session_id); if (!session) goto discard; tunnel = session->tunnel; if (!tunnel) goto discard_sess; if (l2tp_v3_ensure_opt_in_linear(session, skb, &ptr, &optr)) goto discard_sess; l2tp_recv_common(session, skb, ptr, optr, 0, skb->len); l2tp_session_put(session); return 0; pass_up: /* Get the tunnel_id from the L2TP header */ if (!pskb_may_pull(skb, 12)) goto discard; if ((skb->data[0] & 0xc0) != 0xc0) goto discard; tunnel_id = ntohl(*(__be32 *)&skb->data[4]); iph = ipv6_hdr(skb); read_lock_bh(&pn->l2tp_ip6_lock); sk = __l2tp_ip6_bind_lookup(net, &iph->daddr, &iph->saddr, inet6_iif(skb), tunnel_id); if (!sk) { read_unlock_bh(&pn->l2tp_ip6_lock); goto discard; } sock_hold(sk); read_unlock_bh(&pn->l2tp_ip6_lock); if (!xfrm6_policy_check(sk, XFRM_POLICY_IN, skb)) goto discard_put; nf_reset_ct(skb); return sk_receive_skb(sk, skb, 1); discard_sess: l2tp_session_put(session); goto discard; discard_put: sock_put(sk); discard: kfree_skb(skb); return 0; } static int l2tp_ip6_hash(struct sock *sk) { struct l2tp_ip6_net *pn = l2tp_ip6_pernet(sock_net(sk)); if (sk_unhashed(sk)) { write_lock_bh(&pn->l2tp_ip6_lock); sk_add_node(sk, &pn->l2tp_ip6_table); write_unlock_bh(&pn->l2tp_ip6_lock); } return 0; } static void l2tp_ip6_unhash(struct sock *sk) { struct l2tp_ip6_net *pn = l2tp_ip6_pernet(sock_net(sk)); if (sk_unhashed(sk)) return; write_lock_bh(&pn->l2tp_ip6_lock); sk_del_node_init(sk); write_unlock_bh(&pn->l2tp_ip6_lock); } static int l2tp_ip6_open(struct sock *sk) { /* Prevent autobind. We don't have ports. */ inet_sk(sk)->inet_num = IPPROTO_L2TP; l2tp_ip6_hash(sk); return 0; } static void l2tp_ip6_close(struct sock *sk, long timeout) { struct l2tp_ip6_net *pn = l2tp_ip6_pernet(sock_net(sk)); write_lock_bh(&pn->l2tp_ip6_lock); hlist_del_init(&sk->sk_bind_node); sk_del_node_init(sk); write_unlock_bh(&pn->l2tp_ip6_lock); sk_common_release(sk); } static void l2tp_ip6_destroy_sock(struct sock *sk) { struct l2tp_tunnel *tunnel; lock_sock(sk); ip6_flush_pending_frames(sk); release_sock(sk); tunnel = l2tp_sk_to_tunnel(sk); if (tunnel) { l2tp_tunnel_delete(tunnel); l2tp_tunnel_put(tunnel); } } static int l2tp_ip6_bind(struct sock *sk, struct sockaddr_unsized *uaddr, int addr_len) { struct inet_sock *inet = inet_sk(sk); struct ipv6_pinfo *np = inet6_sk(sk); struct sockaddr_l2tpip6 *addr = (struct sockaddr_l2tpip6 *)uaddr; struct net *net = sock_net(sk); struct l2tp_ip6_net *pn; __be32 v4addr = 0; int bound_dev_if; int addr_type; int err; pn = l2tp_ip6_pernet(net); if (addr->l2tp_family != AF_INET6) return -EINVAL; if (addr_len < sizeof(*addr)) return -EINVAL; addr_type = ipv6_addr_type(&addr->l2tp_addr); /* l2tp_ip6 sockets are IPv6 only */ if (addr_type == IPV6_ADDR_MAPPED) return -EADDRNOTAVAIL; /* L2TP is point-point, not multicast */ if (addr_type & IPV6_ADDR_MULTICAST) return -EADDRNOTAVAIL; lock_sock(sk); err = -EINVAL; if (!sock_flag(sk, SOCK_ZAPPED)) goto out_unlock; if (sk->sk_state != TCP_CLOSE) goto out_unlock; bound_dev_if = sk->sk_bound_dev_if; /* Check if the address belongs to the host. */ rcu_read_lock(); if (addr_type != IPV6_ADDR_ANY) { struct net_device *dev = NULL; if (addr_type & IPV6_ADDR_LINKLOCAL) { if (addr->l2tp_scope_id) bound_dev_if = addr->l2tp_scope_id; /* Binding to link-local address requires an * interface. */ if (!bound_dev_if) goto out_unlock_rcu; err = -ENODEV; dev = dev_get_by_index_rcu(sock_net(sk), bound_dev_if); if (!dev) goto out_unlock_rcu; } /* ipv4 addr of the socket is invalid. Only the * unspecified and mapped address have a v4 equivalent. */ v4addr = LOOPBACK4_IPV6; err = -EADDRNOTAVAIL; if (!ipv6_chk_addr(sock_net(sk), &addr->l2tp_addr, dev, 0)) goto out_unlock_rcu; } rcu_read_unlock(); write_lock_bh(&pn->l2tp_ip6_lock); if (__l2tp_ip6_bind_lookup(net, &addr->l2tp_addr, NULL, bound_dev_if, addr->l2tp_conn_id)) { write_unlock_bh(&pn->l2tp_ip6_lock); err = -EADDRINUSE; goto out_unlock; } inet->inet_saddr = v4addr; inet->inet_rcv_saddr = v4addr; sk->sk_bound_dev_if = bound_dev_if; sk->sk_v6_rcv_saddr = addr->l2tp_addr; np->saddr = addr->l2tp_addr; l2tp_ip6_sk(sk)->conn_id = addr->l2tp_conn_id; sk_add_bind_node(sk, &pn->l2tp_ip6_bind_table); sk_del_node_init(sk); write_unlock_bh(&pn->l2tp_ip6_lock); sock_reset_flag(sk, SOCK_ZAPPED); release_sock(sk); return 0; out_unlock_rcu: rcu_read_unlock(); out_unlock: release_sock(sk); return err; } static int l2tp_ip6_connect(struct sock *sk, struct sockaddr_unsized *uaddr, int addr_len) { struct sockaddr_l2tpip6 *lsa = (struct sockaddr_l2tpip6 *)uaddr; struct sockaddr_in6 *usin = (struct sockaddr_in6 *)uaddr; struct in6_addr *daddr; int addr_type; int rc; struct l2tp_ip6_net *pn; if (addr_len < sizeof(*lsa)) return -EINVAL; if (usin->sin6_family != AF_INET6) return -EINVAL; addr_type = ipv6_addr_type(&usin->sin6_addr); if (addr_type & IPV6_ADDR_MULTICAST) return -EINVAL; if (addr_type & IPV6_ADDR_MAPPED) { daddr = &usin->sin6_addr; if (ipv4_is_multicast(daddr->s6_addr32[3])) return -EINVAL; } lock_sock(sk); /* Must bind first - autobinding does not work */ if (sock_flag(sk, SOCK_ZAPPED)) { rc = -EINVAL; goto out_sk; } rc = __ip6_datagram_connect(sk, uaddr, addr_len); if (rc < 0) goto out_sk; l2tp_ip6_sk(sk)->peer_conn_id = lsa->l2tp_conn_id; pn = l2tp_ip6_pernet(sock_net(sk)); write_lock_bh(&pn->l2tp_ip6_lock); hlist_del_init(&sk->sk_bind_node); sk_add_bind_node(sk, &pn->l2tp_ip6_bind_table); write_unlock_bh(&pn->l2tp_ip6_lock); out_sk: release_sock(sk); return rc; } static int l2tp_ip6_disconnect(struct sock *sk, int flags) { if (sock_flag(sk, SOCK_ZAPPED)) return 0; return __udp_disconnect(sk, flags); } static int l2tp_ip6_getname(struct socket *sock, struct sockaddr *uaddr, int peer) { struct sockaddr_l2tpip6 *lsa = (struct sockaddr_l2tpip6 *)uaddr; struct sock *sk = sock->sk; struct ipv6_pinfo *np = inet6_sk(sk); struct l2tp_ip6_sock *lsk = l2tp_ip6_sk(sk); lsa->l2tp_family = AF_INET6; lsa->l2tp_flowinfo = 0; lsa->l2tp_scope_id = 0; lsa->l2tp_unused = 0; if (peer) { if (!lsk->peer_conn_id) return -ENOTCONN; lsa->l2tp_conn_id = lsk->peer_conn_id; lsa->l2tp_addr = sk->sk_v6_daddr; if (inet6_test_bit(SNDFLOW, sk)) lsa->l2tp_flowinfo = np->flow_label; } else { if (ipv6_addr_any(&sk->sk_v6_rcv_saddr)) lsa->l2tp_addr = np->saddr; else lsa->l2tp_addr = sk->sk_v6_rcv_saddr; lsa->l2tp_conn_id = lsk->conn_id; } if (ipv6_addr_type(&lsa->l2tp_addr) & IPV6_ADDR_LINKLOCAL) lsa->l2tp_scope_id = READ_ONCE(sk->sk_bound_dev_if); return sizeof(*lsa); } static int l2tp_ip6_backlog_recv(struct sock *sk, struct sk_buff *skb) { int rc; /* Charge it to the socket, dropping if the queue is full. */ rc = sock_queue_rcv_skb(sk, skb); if (rc < 0) goto drop; return 0; drop: IP_INC_STATS(sock_net(sk), IPSTATS_MIB_INDISCARDS); kfree_skb(skb); return -1; } static int l2tp_ip6_push_pending_frames(struct sock *sk) { struct sk_buff *skb; __be32 *transhdr = NULL; int err = 0; skb = skb_peek(&sk->sk_write_queue); if (!skb) goto out; transhdr = (__be32 *)skb_transport_header(skb); *transhdr = 0; err = ip6_push_pending_frames(sk); out: return err; } /* Userspace will call sendmsg() on the tunnel socket to send L2TP * control frames. */ static int l2tp_ip6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) { struct ipv6_txoptions opt_space; DECLARE_SOCKADDR(struct sockaddr_l2tpip6 *, lsa, msg->msg_name); struct in6_addr *daddr, *final_p, final; struct ipv6_pinfo *np = inet6_sk(sk); struct ipv6_txoptions *opt_to_free = NULL; struct ipv6_txoptions *opt = NULL; struct ip6_flowlabel *flowlabel = NULL; struct dst_entry *dst = NULL; struct flowi6 fl6; struct ipcm6_cookie ipc6; int addr_len = msg->msg_namelen; int transhdrlen = 4; /* zero session-id */ int ulen; int err; /* Rough check on arithmetic overflow, * better check is made in ip6_append_data(). */ if (len > INT_MAX - transhdrlen) return -EMSGSIZE; /* Mirror BSD error message compatibility */ if (msg->msg_flags & MSG_OOB) return -EOPNOTSUPP; /* Get and verify the address */ memset(&fl6, 0, sizeof(fl6)); fl6.flowi6_mark = READ_ONCE(sk->sk_mark); fl6.flowi6_uid = sk_uid(sk); ipcm6_init_sk(&ipc6, sk); if (lsa) { if (addr_len < SIN6_LEN_RFC2133) return -EINVAL; if (lsa->l2tp_family && lsa->l2tp_family != AF_INET6) return -EAFNOSUPPORT; daddr = &lsa->l2tp_addr; if (inet6_test_bit(SNDFLOW, sk)) { fl6.flowlabel = lsa->l2tp_flowinfo & IPV6_FLOWINFO_MASK; if (fl6.flowlabel & IPV6_FLOWLABEL_MASK) { flowlabel = fl6_sock_lookup(sk, fl6.flowlabel); if (IS_ERR(flowlabel)) return -EINVAL; } } /* Otherwise it will be difficult to maintain * sk->sk_dst_cache. */ if (sk->sk_state == TCP_ESTABLISHED && ipv6_addr_equal(daddr, &sk->sk_v6_daddr)) daddr = &sk->sk_v6_daddr; if (addr_len >= sizeof(struct sockaddr_in6) && lsa->l2tp_scope_id && ipv6_addr_type(daddr) & IPV6_ADDR_LINKLOCAL) fl6.flowi6_oif = lsa->l2tp_scope_id; } else { if (sk->sk_state != TCP_ESTABLISHED) return -EDESTADDRREQ; daddr = &sk->sk_v6_daddr; fl6.flowlabel = np->flow_label; } if (fl6.flowi6_oif == 0) fl6.flowi6_oif = READ_ONCE(sk->sk_bound_dev_if); if (msg->msg_controllen) { opt = &opt_space; memset(opt, 0, sizeof(struct ipv6_txoptions)); opt->tot_len = sizeof(struct ipv6_txoptions); ipc6.opt = opt; err = ip6_datagram_send_ctl(sock_net(sk), sk, msg, &fl6, &ipc6); if (err < 0) { fl6_sock_release(flowlabel); return err; } if ((fl6.flowlabel & IPV6_FLOWLABEL_MASK) && !flowlabel) { flowlabel = fl6_sock_lookup(sk, fl6.flowlabel); if (IS_ERR(flowlabel)) return -EINVAL; } if (!(opt->opt_nflen | opt->opt_flen)) opt = NULL; } if (!opt) { opt = txopt_get(np); opt_to_free = opt; } if (flowlabel) opt = fl6_merge_options(&opt_space, flowlabel, opt); opt = ipv6_fixup_options(&opt_space, opt); ipc6.opt = opt; fl6.flowi6_proto = sk->sk_protocol; if (!ipv6_addr_any(daddr)) fl6.daddr = *daddr; else fl6.daddr.s6_addr[15] = 0x1; /* :: means loopback (BSD'ism) */ if (ipv6_addr_any(&fl6.saddr) && !ipv6_addr_any(&np->saddr)) fl6.saddr = np->saddr; final_p = fl6_update_dst(&fl6, opt, &final); if (!fl6.flowi6_oif && ipv6_addr_is_multicast(&fl6.daddr)) fl6.flowi6_oif = READ_ONCE(np->mcast_oif); else if (!fl6.flowi6_oif) fl6.flowi6_oif = READ_ONCE(np->ucast_oif); security_sk_classify_flow(sk, flowi6_to_flowi_common(&fl6)); fl6.flowlabel = ip6_make_flowinfo(ipc6.tclass, fl6.flowlabel); dst = ip6_dst_lookup_flow(sock_net(sk), sk, &fl6, final_p); if (IS_ERR(dst)) { err = PTR_ERR(dst); goto out; } if (ipc6.hlimit < 0) ipc6.hlimit = ip6_sk_dst_hoplimit(np, &fl6, dst); if (msg->msg_flags & MSG_CONFIRM) goto do_confirm; back_from_confirm: lock_sock(sk); ulen = len + (skb_queue_empty(&sk->sk_write_queue) ? transhdrlen : 0); err = ip6_append_data(sk, ip_generic_getfrag, msg, ulen, transhdrlen, &ipc6, &fl6, dst_rt6_info(dst), msg->msg_flags); if (err) ip6_flush_pending_frames(sk); else if (!(msg->msg_flags & MSG_MORE)) err = l2tp_ip6_push_pending_frames(sk); release_sock(sk); done: dst_release(dst); out: fl6_sock_release(flowlabel); txopt_put(opt_to_free); return err < 0 ? err : len; do_confirm: if (msg->msg_flags & MSG_PROBE) dst_confirm_neigh(dst, &fl6.daddr); if (!(msg->msg_flags & MSG_PROBE) || len) goto back_from_confirm; err = 0; goto done; } static int l2tp_ip6_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags, int *addr_len) { struct ipv6_pinfo *np = inet6_sk(sk); DECLARE_SOCKADDR(struct sockaddr_l2tpip6 *, lsa, msg->msg_name); size_t copied = 0; int err = -EOPNOTSUPP; struct sk_buff *skb; if (flags & MSG_OOB) goto out; if (flags & MSG_ERRQUEUE) return ipv6_recv_error(sk, msg, len, addr_len); skb = skb_recv_datagram(sk, flags, &err); if (!skb) goto out; copied = skb->len; if (len < copied) { msg->msg_flags |= MSG_TRUNC; copied = len; } err = skb_copy_datagram_msg(skb, 0, msg, copied); if (err) goto done; sock_recv_timestamp(msg, sk, skb); /* Copy the address. */ if (lsa) { lsa->l2tp_family = AF_INET6; lsa->l2tp_unused = 0; lsa->l2tp_addr = ipv6_hdr(skb)->saddr; lsa->l2tp_flowinfo = 0; lsa->l2tp_scope_id = 0; lsa->l2tp_conn_id = 0; if (ipv6_addr_type(&lsa->l2tp_addr) & IPV6_ADDR_LINKLOCAL) lsa->l2tp_scope_id = inet6_iif(skb); *addr_len = sizeof(*lsa); } if (np->rxopt.all) ip6_datagram_recv_ctl(sk, msg, skb); if (flags & MSG_TRUNC) copied = skb->len; done: skb_free_datagram(sk, skb); out: return err ? err : copied; } static struct proto l2tp_ip6_prot = { .name = "L2TP/IPv6", .owner = THIS_MODULE, .init = l2tp_ip6_open, .close = l2tp_ip6_close, .bind = l2tp_ip6_bind, .connect = l2tp_ip6_connect, .disconnect = l2tp_ip6_disconnect, .ioctl = l2tp_ioctl, .destroy = l2tp_ip6_destroy_sock, .setsockopt = ipv6_setsockopt, .getsockopt = ipv6_getsockopt, .sendmsg = l2tp_ip6_sendmsg, .recvmsg = l2tp_ip6_recvmsg, .backlog_rcv = l2tp_ip6_backlog_recv, .hash = l2tp_ip6_hash, .unhash = l2tp_ip6_unhash, .obj_size = sizeof(struct l2tp_ip6_sock), .ipv6_pinfo_offset = offsetof(struct l2tp_ip6_sock, inet6), }; static const struct proto_ops l2tp_ip6_ops = { .family = PF_INET6, .owner = THIS_MODULE, .release = inet6_release, .bind = inet6_bind, .connect = inet_dgram_connect, .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = l2tp_ip6_getname, .poll = datagram_poll, .ioctl = inet6_ioctl, .gettstamp = sock_gettstamp, .listen = sock_no_listen, .shutdown = inet_shutdown, .setsockopt = sock_common_setsockopt, .getsockopt = sock_common_getsockopt, .sendmsg = inet_sendmsg, .recvmsg = sock_common_recvmsg, .mmap = sock_no_mmap, #ifdef CONFIG_COMPAT .compat_ioctl = inet6_compat_ioctl, #endif }; static struct inet_protosw l2tp_ip6_protosw = { .type = SOCK_DGRAM, .protocol = IPPROTO_L2TP, .prot = &l2tp_ip6_prot, .ops = &l2tp_ip6_ops, }; static struct inet6_protocol l2tp_ip6_protocol __read_mostly = { .handler = l2tp_ip6_recv, }; static __net_init int l2tp_ip6_init_net(struct net *net) { struct l2tp_ip6_net *pn = net_generic(net, l2tp_ip6_net_id); rwlock_init(&pn->l2tp_ip6_lock); INIT_HLIST_HEAD(&pn->l2tp_ip6_table); INIT_HLIST_HEAD(&pn->l2tp_ip6_bind_table); return 0; } static __net_exit void l2tp_ip6_exit_net(struct net *net) { struct l2tp_ip6_net *pn = l2tp_ip6_pernet(net); write_lock_bh(&pn->l2tp_ip6_lock); WARN_ON_ONCE(hlist_count_nodes(&pn->l2tp_ip6_table) != 0); WARN_ON_ONCE(hlist_count_nodes(&pn->l2tp_ip6_bind_table) != 0); write_unlock_bh(&pn->l2tp_ip6_lock); } static struct pernet_operations l2tp_ip6_net_ops = { .init = l2tp_ip6_init_net, .exit = l2tp_ip6_exit_net, .id = &l2tp_ip6_net_id, .size = sizeof(struct l2tp_ip6_net), }; static int __init l2tp_ip6_init(void) { int err; pr_info("L2TP IP encapsulation support for IPv6 (L2TPv3)\n"); err = register_pernet_device(&l2tp_ip6_net_ops); if (err) goto out; err = proto_register(&l2tp_ip6_prot, 1); if (err != 0) goto out1; err = inet6_add_protocol(&l2tp_ip6_protocol, IPPROTO_L2TP); if (err) goto out2; inet6_register_protosw(&l2tp_ip6_protosw); return 0; out2: proto_unregister(&l2tp_ip6_prot); out1: unregister_pernet_device(&l2tp_ip6_net_ops); out: return err; } static void __exit l2tp_ip6_exit(void) { inet6_unregister_protosw(&l2tp_ip6_protosw); inet6_del_protocol(&l2tp_ip6_protocol, IPPROTO_L2TP); proto_unregister(&l2tp_ip6_prot); unregister_pernet_device(&l2tp_ip6_net_ops); } module_init(l2tp_ip6_init); module_exit(l2tp_ip6_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Chris Elston <celston@katalix.com>"); MODULE_DESCRIPTION("L2TP IP encapsulation for IPv6"); MODULE_VERSION("1.0"); /* Use the values of SOCK_DGRAM (2) as type and IPPROTO_L2TP (115) as protocol, * because __stringify doesn't like enums */ MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_INET6, 115, 2); MODULE_ALIAS_NET_PF_PROTO(PF_INET6, 115); |
| 12 11 12 12 12 12 12 10 2 2 2 1 12 1 11 13 13 13 8 5 11 1 13 2 9 14 1 13 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 | // SPDX-License-Identifier: GPL-2.0-or-later /* Direct I/O support. * * Copyright (C) 2023 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ #include <linux/export.h> #include <linux/fs.h> #include <linux/mm.h> #include <linux/pagemap.h> #include <linux/slab.h> #include <linux/uio.h> #include <linux/sched/mm.h> #include <linux/task_io_accounting_ops.h> #include <linux/netfs.h> #include "internal.h" static void netfs_prepare_dio_read_iterator(struct netfs_io_subrequest *subreq) { struct netfs_io_request *rreq = subreq->rreq; size_t rsize; rsize = umin(subreq->len, rreq->io_streams[0].sreq_max_len); subreq->len = rsize; if (unlikely(rreq->io_streams[0].sreq_max_segs)) { size_t limit = netfs_limit_iter(&rreq->buffer.iter, 0, rsize, rreq->io_streams[0].sreq_max_segs); if (limit < rsize) { subreq->len = limit; trace_netfs_sreq(subreq, netfs_sreq_trace_limited); } } trace_netfs_sreq(subreq, netfs_sreq_trace_prepare); subreq->io_iter = rreq->buffer.iter; iov_iter_truncate(&subreq->io_iter, subreq->len); iov_iter_advance(&rreq->buffer.iter, subreq->len); } /* * Perform a read to a buffer from the server, slicing up the region to be read * according to the network rsize. */ static int netfs_dispatch_unbuffered_reads(struct netfs_io_request *rreq) { struct netfs_io_stream *stream = &rreq->io_streams[0]; unsigned long long start = rreq->start; ssize_t size = rreq->len; int ret = 0; do { struct netfs_io_subrequest *subreq; ssize_t slice; subreq = netfs_alloc_subrequest(rreq); if (!subreq) { ret = -ENOMEM; break; } subreq->source = NETFS_DOWNLOAD_FROM_SERVER; subreq->start = start; subreq->len = size; __set_bit(NETFS_SREQ_IN_PROGRESS, &subreq->flags); spin_lock(&rreq->lock); list_add_tail(&subreq->rreq_link, &stream->subrequests); if (list_is_first(&subreq->rreq_link, &stream->subrequests)) { stream->front = subreq; if (!stream->active) { stream->collected_to = stream->front->start; /* Store list pointers before active flag */ smp_store_release(&stream->active, true); } } trace_netfs_sreq(subreq, netfs_sreq_trace_added); spin_unlock(&rreq->lock); netfs_stat(&netfs_n_rh_download); if (rreq->netfs_ops->prepare_read) { ret = rreq->netfs_ops->prepare_read(subreq); if (ret < 0) { netfs_put_subrequest(subreq, netfs_sreq_trace_put_cancel); break; } } netfs_prepare_dio_read_iterator(subreq); slice = subreq->len; size -= slice; start += slice; rreq->submitted += slice; if (size <= 0) { smp_wmb(); /* Write lists before ALL_QUEUED. */ set_bit(NETFS_RREQ_ALL_QUEUED, &rreq->flags); } rreq->netfs_ops->issue_read(subreq); if (test_bit(NETFS_RREQ_PAUSE, &rreq->flags)) netfs_wait_for_paused_read(rreq); if (test_bit(NETFS_RREQ_FAILED, &rreq->flags)) break; cond_resched(); } while (size > 0); if (unlikely(size > 0)) { smp_wmb(); /* Write lists before ALL_QUEUED. */ set_bit(NETFS_RREQ_ALL_QUEUED, &rreq->flags); netfs_wake_collector(rreq); } return ret; } /* * Perform a read to an application buffer, bypassing the pagecache and the * local disk cache. */ static ssize_t netfs_unbuffered_read(struct netfs_io_request *rreq, bool sync) { ssize_t ret; _enter("R=%x %llx-%llx", rreq->debug_id, rreq->start, rreq->start + rreq->len - 1); if (rreq->len == 0) { pr_err("Zero-sized read [R=%x]\n", rreq->debug_id); netfs_put_request(rreq, netfs_rreq_trace_put_discard); return -EIO; } // TODO: Use bounce buffer if requested inode_dio_begin(rreq->inode); ret = netfs_dispatch_unbuffered_reads(rreq); if (!rreq->submitted) { netfs_put_request(rreq, netfs_rreq_trace_put_no_submit); inode_dio_end(rreq->inode); ret = 0; goto out; } if (sync) ret = netfs_wait_for_read(rreq); else ret = -EIOCBQUEUED; out: _leave(" = %zd", ret); return ret; } /** * netfs_unbuffered_read_iter_locked - Perform an unbuffered or direct I/O read * @iocb: The I/O control descriptor describing the read * @iter: The output buffer (also specifies read length) * * Perform an unbuffered I/O or direct I/O from the file in @iocb to the * output buffer. No use is made of the pagecache. * * The caller must hold any appropriate locks. */ ssize_t netfs_unbuffered_read_iter_locked(struct kiocb *iocb, struct iov_iter *iter) { struct netfs_io_request *rreq; ssize_t ret; size_t orig_count = iov_iter_count(iter); bool sync = is_sync_kiocb(iocb); _enter(""); if (!orig_count) return 0; /* Don't update atime */ ret = kiocb_write_and_wait(iocb, orig_count); if (ret < 0) return ret; file_accessed(iocb->ki_filp); rreq = netfs_alloc_request(iocb->ki_filp->f_mapping, iocb->ki_filp, iocb->ki_pos, orig_count, iocb->ki_flags & IOCB_DIRECT ? NETFS_DIO_READ : NETFS_UNBUFFERED_READ); if (IS_ERR(rreq)) return PTR_ERR(rreq); netfs_stat(&netfs_n_rh_dio_read); trace_netfs_read(rreq, rreq->start, rreq->len, netfs_read_trace_dio_read); /* If this is an async op, we have to keep track of the destination * buffer for ourselves as the caller's iterator will be trashed when * we return. * * In such a case, extract an iterator to represent as much of the the * output buffer as we can manage. Note that the extraction might not * be able to allocate a sufficiently large bvec array and may shorten * the request. */ if (user_backed_iter(iter)) { ret = netfs_extract_user_iter(iter, rreq->len, &rreq->buffer.iter, 0); if (ret < 0) goto error_put; rreq->direct_bv = (struct bio_vec *)rreq->buffer.iter.bvec; rreq->direct_bv_count = ret; rreq->direct_bv_unpin = iov_iter_extract_will_pin(iter); rreq->len = iov_iter_count(&rreq->buffer.iter); } else { rreq->buffer.iter = *iter; rreq->len = orig_count; rreq->direct_bv_unpin = false; iov_iter_advance(iter, orig_count); } // TODO: Set up bounce buffer if needed if (!sync) { rreq->iocb = iocb; __set_bit(NETFS_RREQ_OFFLOAD_COLLECTION, &rreq->flags); } ret = netfs_unbuffered_read(rreq, sync); if (ret < 0) goto out; /* May be -EIOCBQUEUED */ if (sync) { // TODO: Copy from bounce buffer iocb->ki_pos += rreq->transferred; ret = rreq->transferred; } out: netfs_put_request(rreq, netfs_rreq_trace_put_return); if (ret > 0) orig_count -= ret; return ret; error_put: netfs_put_failed_request(rreq); return ret; } EXPORT_SYMBOL(netfs_unbuffered_read_iter_locked); /** * netfs_unbuffered_read_iter - Perform an unbuffered or direct I/O read * @iocb: The I/O control descriptor describing the read * @iter: The output buffer (also specifies read length) * * Perform an unbuffered I/O or direct I/O from the file in @iocb to the * output buffer. No use is made of the pagecache. */ ssize_t netfs_unbuffered_read_iter(struct kiocb *iocb, struct iov_iter *iter) { struct inode *inode = file_inode(iocb->ki_filp); ssize_t ret; if (!iter->count) return 0; /* Don't update atime */ ret = netfs_start_io_direct(inode); if (ret == 0) { ret = netfs_unbuffered_read_iter_locked(iocb, iter); netfs_end_io_direct(inode); } return ret; } EXPORT_SYMBOL(netfs_unbuffered_read_iter); |
| 5 5 5 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 | /* * Copyright (c) 2005 Intel Inc. All rights reserved. * Copyright (c) 2005-2006 Voltaire, Inc. All rights reserved. * Copyright (c) 2014 Intel Corporation. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #include <linux/slab.h> #include "mad_priv.h" #include "mad_rmpp.h" enum rmpp_state { RMPP_STATE_ACTIVE, RMPP_STATE_TIMEOUT, RMPP_STATE_COMPLETE }; struct mad_rmpp_recv { struct ib_mad_agent_private *agent; struct list_head list; struct delayed_work timeout_work; struct delayed_work cleanup_work; struct completion comp; enum rmpp_state state; spinlock_t lock; refcount_t refcount; struct ib_ah *ah; struct ib_mad_recv_wc *rmpp_wc; struct ib_mad_recv_buf *cur_seg_buf; int last_ack; int seg_num; int newwin; int repwin; __be64 tid; u32 src_qp; u32 slid; u8 mgmt_class; u8 class_version; u8 method; u8 base_version; }; static inline void deref_rmpp_recv(struct mad_rmpp_recv *rmpp_recv) { if (refcount_dec_and_test(&rmpp_recv->refcount)) complete(&rmpp_recv->comp); } static void destroy_rmpp_recv(struct mad_rmpp_recv *rmpp_recv) { deref_rmpp_recv(rmpp_recv); wait_for_completion(&rmpp_recv->comp); rdma_destroy_ah(rmpp_recv->ah, RDMA_DESTROY_AH_SLEEPABLE); kfree(rmpp_recv); } void ib_cancel_rmpp_recvs(struct ib_mad_agent_private *agent) { struct mad_rmpp_recv *rmpp_recv, *temp_rmpp_recv; unsigned long flags; spin_lock_irqsave(&agent->lock, flags); list_for_each_entry(rmpp_recv, &agent->rmpp_list, list) { cancel_delayed_work(&rmpp_recv->timeout_work); cancel_delayed_work(&rmpp_recv->cleanup_work); } spin_unlock_irqrestore(&agent->lock, flags); flush_workqueue(agent->qp_info->port_priv->wq); list_for_each_entry_safe(rmpp_recv, temp_rmpp_recv, &agent->rmpp_list, list) { list_del(&rmpp_recv->list); if (rmpp_recv->state != RMPP_STATE_COMPLETE) ib_free_recv_mad(rmpp_recv->rmpp_wc); destroy_rmpp_recv(rmpp_recv); } } static void format_ack(struct ib_mad_send_buf *msg, struct ib_rmpp_mad *data, struct mad_rmpp_recv *rmpp_recv) { struct ib_rmpp_mad *ack = msg->mad; unsigned long flags; memcpy(ack, &data->mad_hdr, msg->hdr_len); ack->mad_hdr.method ^= IB_MGMT_METHOD_RESP; ack->rmpp_hdr.rmpp_type = IB_MGMT_RMPP_TYPE_ACK; ib_set_rmpp_flags(&ack->rmpp_hdr, IB_MGMT_RMPP_FLAG_ACTIVE); spin_lock_irqsave(&rmpp_recv->lock, flags); rmpp_recv->last_ack = rmpp_recv->seg_num; ack->rmpp_hdr.seg_num = cpu_to_be32(rmpp_recv->seg_num); ack->rmpp_hdr.paylen_newwin = cpu_to_be32(rmpp_recv->newwin); spin_unlock_irqrestore(&rmpp_recv->lock, flags); } static void ack_recv(struct mad_rmpp_recv *rmpp_recv, struct ib_mad_recv_wc *recv_wc) { struct ib_mad_send_buf *msg; int ret, hdr_len; hdr_len = ib_get_mad_data_offset(recv_wc->recv_buf.mad->mad_hdr.mgmt_class); msg = ib_create_send_mad(&rmpp_recv->agent->agent, recv_wc->wc->src_qp, recv_wc->wc->pkey_index, 1, hdr_len, 0, GFP_KERNEL, IB_MGMT_BASE_VERSION); if (IS_ERR(msg)) return; format_ack(msg, (struct ib_rmpp_mad *) recv_wc->recv_buf.mad, rmpp_recv); msg->ah = rmpp_recv->ah; ret = ib_post_send_mad(msg, NULL); if (ret) ib_free_send_mad(msg); } static struct ib_mad_send_buf *alloc_response_msg(struct ib_mad_agent *agent, struct ib_mad_recv_wc *recv_wc) { struct ib_mad_send_buf *msg; struct ib_ah *ah; int hdr_len; ah = ib_create_ah_from_wc(agent->qp->pd, recv_wc->wc, recv_wc->recv_buf.grh, agent->port_num); if (IS_ERR(ah)) return ERR_CAST(ah); hdr_len = ib_get_mad_data_offset(recv_wc->recv_buf.mad->mad_hdr.mgmt_class); msg = ib_create_send_mad(agent, recv_wc->wc->src_qp, recv_wc->wc->pkey_index, 1, hdr_len, 0, GFP_KERNEL, IB_MGMT_BASE_VERSION); if (IS_ERR(msg)) rdma_destroy_ah(ah, RDMA_DESTROY_AH_SLEEPABLE); else { msg->ah = ah; msg->context[0] = ah; } return msg; } static void ack_ds_ack(struct ib_mad_agent_private *agent, struct ib_mad_recv_wc *recv_wc) { struct ib_mad_send_buf *msg; struct ib_rmpp_mad *rmpp_mad; int ret; msg = alloc_response_msg(&agent->agent, recv_wc); if (IS_ERR(msg)) return; rmpp_mad = msg->mad; memcpy(rmpp_mad, recv_wc->recv_buf.mad, msg->hdr_len); rmpp_mad->mad_hdr.method ^= IB_MGMT_METHOD_RESP; ib_set_rmpp_flags(&rmpp_mad->rmpp_hdr, IB_MGMT_RMPP_FLAG_ACTIVE); rmpp_mad->rmpp_hdr.seg_num = 0; rmpp_mad->rmpp_hdr.paylen_newwin = cpu_to_be32(1); ret = ib_post_send_mad(msg, NULL); if (ret) { rdma_destroy_ah(msg->ah, RDMA_DESTROY_AH_SLEEPABLE); ib_free_send_mad(msg); } } void ib_rmpp_send_handler(struct ib_mad_send_wc *mad_send_wc) { if (mad_send_wc->send_buf->context[0] == mad_send_wc->send_buf->ah) rdma_destroy_ah(mad_send_wc->send_buf->ah, RDMA_DESTROY_AH_SLEEPABLE); ib_free_send_mad(mad_send_wc->send_buf); } static void nack_recv(struct ib_mad_agent_private *agent, struct ib_mad_recv_wc *recv_wc, u8 rmpp_status) { struct ib_mad_send_buf *msg; struct ib_rmpp_mad *rmpp_mad; int ret; msg = alloc_response_msg(&agent->agent, recv_wc); if (IS_ERR(msg)) return; rmpp_mad = msg->mad; memcpy(rmpp_mad, recv_wc->recv_buf.mad, msg->hdr_len); rmpp_mad->mad_hdr.method ^= IB_MGMT_METHOD_RESP; rmpp_mad->rmpp_hdr.rmpp_version = IB_MGMT_RMPP_VERSION; rmpp_mad->rmpp_hdr.rmpp_type = IB_MGMT_RMPP_TYPE_ABORT; ib_set_rmpp_flags(&rmpp_mad->rmpp_hdr, IB_MGMT_RMPP_FLAG_ACTIVE); rmpp_mad->rmpp_hdr.rmpp_status = rmpp_status; rmpp_mad->rmpp_hdr.seg_num = 0; rmpp_mad->rmpp_hdr.paylen_newwin = 0; ret = ib_post_send_mad(msg, NULL); if (ret) { rdma_destroy_ah(msg->ah, RDMA_DESTROY_AH_SLEEPABLE); ib_free_send_mad(msg); } } static void recv_timeout_handler(struct work_struct *work) { struct mad_rmpp_recv *rmpp_recv = container_of(work, struct mad_rmpp_recv, timeout_work.work); struct ib_mad_recv_wc *rmpp_wc; unsigned long flags; spin_lock_irqsave(&rmpp_recv->agent->lock, flags); if (rmpp_recv->state != RMPP_STATE_ACTIVE) { spin_unlock_irqrestore(&rmpp_recv->agent->lock, flags); return; } rmpp_recv->state = RMPP_STATE_TIMEOUT; list_del(&rmpp_recv->list); spin_unlock_irqrestore(&rmpp_recv->agent->lock, flags); rmpp_wc = rmpp_recv->rmpp_wc; nack_recv(rmpp_recv->agent, rmpp_wc, IB_MGMT_RMPP_STATUS_T2L); destroy_rmpp_recv(rmpp_recv); ib_free_recv_mad(rmpp_wc); } static void recv_cleanup_handler(struct work_struct *work) { struct mad_rmpp_recv *rmpp_recv = container_of(work, struct mad_rmpp_recv, cleanup_work.work); unsigned long flags; spin_lock_irqsave(&rmpp_recv->agent->lock, flags); list_del(&rmpp_recv->list); spin_unlock_irqrestore(&rmpp_recv->agent->lock, flags); destroy_rmpp_recv(rmpp_recv); } static struct mad_rmpp_recv * create_rmpp_recv(struct ib_mad_agent_private *agent, struct ib_mad_recv_wc *mad_recv_wc) { struct mad_rmpp_recv *rmpp_recv; struct ib_mad_hdr *mad_hdr; rmpp_recv = kmalloc(sizeof *rmpp_recv, GFP_KERNEL); if (!rmpp_recv) return NULL; rmpp_recv->ah = ib_create_ah_from_wc(agent->agent.qp->pd, mad_recv_wc->wc, mad_recv_wc->recv_buf.grh, agent->agent.port_num); if (IS_ERR(rmpp_recv->ah)) goto error; rmpp_recv->agent = agent; init_completion(&rmpp_recv->comp); INIT_DELAYED_WORK(&rmpp_recv->timeout_work, recv_timeout_handler); INIT_DELAYED_WORK(&rmpp_recv->cleanup_work, recv_cleanup_handler); spin_lock_init(&rmpp_recv->lock); rmpp_recv->state = RMPP_STATE_ACTIVE; refcount_set(&rmpp_recv->refcount, 1); rmpp_recv->rmpp_wc = mad_recv_wc; rmpp_recv->cur_seg_buf = &mad_recv_wc->recv_buf; rmpp_recv->newwin = 1; rmpp_recv->seg_num = 1; rmpp_recv->last_ack = 0; rmpp_recv->repwin = 1; mad_hdr = &mad_recv_wc->recv_buf.mad->mad_hdr; rmpp_recv->tid = mad_hdr->tid; rmpp_recv->src_qp = mad_recv_wc->wc->src_qp; rmpp_recv->slid = mad_recv_wc->wc->slid; rmpp_recv->mgmt_class = mad_hdr->mgmt_class; rmpp_recv->class_version = mad_hdr->class_version; rmpp_recv->method = mad_hdr->method; rmpp_recv->base_version = mad_hdr->base_version; return rmpp_recv; error: kfree(rmpp_recv); return NULL; } static struct mad_rmpp_recv * find_rmpp_recv(struct ib_mad_agent_private *agent, struct ib_mad_recv_wc *mad_recv_wc) { struct mad_rmpp_recv *rmpp_recv; struct ib_mad_hdr *mad_hdr = &mad_recv_wc->recv_buf.mad->mad_hdr; list_for_each_entry(rmpp_recv, &agent->rmpp_list, list) { if (rmpp_recv->tid == mad_hdr->tid && rmpp_recv->src_qp == mad_recv_wc->wc->src_qp && rmpp_recv->slid == mad_recv_wc->wc->slid && rmpp_recv->mgmt_class == mad_hdr->mgmt_class && rmpp_recv->class_version == mad_hdr->class_version && rmpp_recv->method == mad_hdr->method) return rmpp_recv; } return NULL; } static struct mad_rmpp_recv * acquire_rmpp_recv(struct ib_mad_agent_private *agent, struct ib_mad_recv_wc *mad_recv_wc) { struct mad_rmpp_recv *rmpp_recv; unsigned long flags; spin_lock_irqsave(&agent->lock, flags); rmpp_recv = find_rmpp_recv(agent, mad_recv_wc); if (rmpp_recv) refcount_inc(&rmpp_recv->refcount); spin_unlock_irqrestore(&agent->lock, flags); return rmpp_recv; } static struct mad_rmpp_recv * insert_rmpp_recv(struct ib_mad_agent_private *agent, struct mad_rmpp_recv *rmpp_recv) { struct mad_rmpp_recv *cur_rmpp_recv; cur_rmpp_recv = find_rmpp_recv(agent, rmpp_recv->rmpp_wc); if (!cur_rmpp_recv) list_add_tail(&rmpp_recv->list, &agent->rmpp_list); return cur_rmpp_recv; } static inline int get_last_flag(struct ib_mad_recv_buf *seg) { struct ib_rmpp_mad *rmpp_mad; rmpp_mad = (struct ib_rmpp_mad *) seg->mad; return ib_get_rmpp_flags(&rmpp_mad->rmpp_hdr) & IB_MGMT_RMPP_FLAG_LAST; } static inline int get_seg_num(struct ib_mad_recv_buf *seg) { struct ib_rmpp_mad *rmpp_mad; rmpp_mad = (struct ib_rmpp_mad *) seg->mad; return be32_to_cpu(rmpp_mad->rmpp_hdr.seg_num); } static inline struct ib_mad_recv_buf *get_next_seg(struct list_head *rmpp_list, struct ib_mad_recv_buf *seg) { if (seg->list.next == rmpp_list) return NULL; return container_of(seg->list.next, struct ib_mad_recv_buf, list); } static inline int window_size(struct ib_mad_agent_private *agent) { return max(agent->qp_info->recv_queue.max_active >> 3, 1); } static struct ib_mad_recv_buf *find_seg_location(struct list_head *rmpp_list, int seg_num) { struct ib_mad_recv_buf *seg_buf; int cur_seg_num; list_for_each_entry_reverse(seg_buf, rmpp_list, list) { cur_seg_num = get_seg_num(seg_buf); if (seg_num > cur_seg_num) return seg_buf; if (seg_num == cur_seg_num) break; } return NULL; } static void update_seg_num(struct mad_rmpp_recv *rmpp_recv, struct ib_mad_recv_buf *new_buf) { struct list_head *rmpp_list = &rmpp_recv->rmpp_wc->rmpp_list; while (new_buf && (get_seg_num(new_buf) == rmpp_recv->seg_num + 1)) { rmpp_recv->cur_seg_buf = new_buf; rmpp_recv->seg_num++; new_buf = get_next_seg(rmpp_list, new_buf); } } static inline int get_mad_len(struct mad_rmpp_recv *rmpp_recv) { struct ib_rmpp_mad *rmpp_mad; int hdr_size, data_size, pad; bool opa = rdma_cap_opa_mad(rmpp_recv->agent->qp_info->port_priv->device, rmpp_recv->agent->qp_info->port_priv->port_num); rmpp_mad = (struct ib_rmpp_mad *)rmpp_recv->cur_seg_buf->mad; hdr_size = ib_get_mad_data_offset(rmpp_mad->mad_hdr.mgmt_class); if (opa && rmpp_recv->base_version == OPA_MGMT_BASE_VERSION) { data_size = sizeof(struct opa_rmpp_mad) - hdr_size; pad = OPA_MGMT_RMPP_DATA - be32_to_cpu(rmpp_mad->rmpp_hdr.paylen_newwin); if (pad > OPA_MGMT_RMPP_DATA || pad < 0) pad = 0; } else { data_size = sizeof(struct ib_rmpp_mad) - hdr_size; pad = IB_MGMT_RMPP_DATA - be32_to_cpu(rmpp_mad->rmpp_hdr.paylen_newwin); if (pad > IB_MGMT_RMPP_DATA || pad < 0) pad = 0; } return hdr_size + rmpp_recv->seg_num * data_size - pad; } static struct ib_mad_recv_wc *complete_rmpp(struct mad_rmpp_recv *rmpp_recv) { struct ib_mad_recv_wc *rmpp_wc; ack_recv(rmpp_recv, rmpp_recv->rmpp_wc); if (rmpp_recv->seg_num > 1) cancel_delayed_work(&rmpp_recv->timeout_work); rmpp_wc = rmpp_recv->rmpp_wc; rmpp_wc->mad_len = get_mad_len(rmpp_recv); /* 10 seconds until we can find the packet lifetime */ queue_delayed_work(rmpp_recv->agent->qp_info->port_priv->wq, &rmpp_recv->cleanup_work, msecs_to_jiffies(10000)); return rmpp_wc; } static struct ib_mad_recv_wc * continue_rmpp(struct ib_mad_agent_private *agent, struct ib_mad_recv_wc *mad_recv_wc) { struct mad_rmpp_recv *rmpp_recv; struct ib_mad_recv_buf *prev_buf; struct ib_mad_recv_wc *done_wc; int seg_num; unsigned long flags; rmpp_recv = acquire_rmpp_recv(agent, mad_recv_wc); if (!rmpp_recv) goto drop1; seg_num = get_seg_num(&mad_recv_wc->recv_buf); spin_lock_irqsave(&rmpp_recv->lock, flags); if ((rmpp_recv->state == RMPP_STATE_TIMEOUT) || (seg_num > rmpp_recv->newwin)) goto drop3; if ((seg_num <= rmpp_recv->last_ack) || (rmpp_recv->state == RMPP_STATE_COMPLETE)) { spin_unlock_irqrestore(&rmpp_recv->lock, flags); ack_recv(rmpp_recv, mad_recv_wc); goto drop2; } prev_buf = find_seg_location(&rmpp_recv->rmpp_wc->rmpp_list, seg_num); if (!prev_buf) goto drop3; done_wc = NULL; list_add(&mad_recv_wc->recv_buf.list, &prev_buf->list); if (rmpp_recv->cur_seg_buf == prev_buf) { update_seg_num(rmpp_recv, &mad_recv_wc->recv_buf); if (get_last_flag(rmpp_recv->cur_seg_buf)) { rmpp_recv->state = RMPP_STATE_COMPLETE; spin_unlock_irqrestore(&rmpp_recv->lock, flags); done_wc = complete_rmpp(rmpp_recv); goto out; } else if (rmpp_recv->seg_num == rmpp_recv->newwin) { rmpp_recv->newwin += window_size(agent); spin_unlock_irqrestore(&rmpp_recv->lock, flags); ack_recv(rmpp_recv, mad_recv_wc); goto out; } } spin_unlock_irqrestore(&rmpp_recv->lock, flags); out: deref_rmpp_recv(rmpp_recv); return done_wc; drop3: spin_unlock_irqrestore(&rmpp_recv->lock, flags); drop2: deref_rmpp_recv(rmpp_recv); drop1: ib_free_recv_mad(mad_recv_wc); return NULL; } static struct ib_mad_recv_wc * start_rmpp(struct ib_mad_agent_private *agent, struct ib_mad_recv_wc *mad_recv_wc) { struct mad_rmpp_recv *rmpp_recv; unsigned long flags; rmpp_recv = create_rmpp_recv(agent, mad_recv_wc); if (!rmpp_recv) { ib_free_recv_mad(mad_recv_wc); return NULL; } spin_lock_irqsave(&agent->lock, flags); if (insert_rmpp_recv(agent, rmpp_recv)) { spin_unlock_irqrestore(&agent->lock, flags); /* duplicate first MAD */ destroy_rmpp_recv(rmpp_recv); return continue_rmpp(agent, mad_recv_wc); } refcount_inc(&rmpp_recv->refcount); if (get_last_flag(&mad_recv_wc->recv_buf)) { rmpp_recv->state = RMPP_STATE_COMPLETE; spin_unlock_irqrestore(&agent->lock, flags); complete_rmpp(rmpp_recv); } else { spin_unlock_irqrestore(&agent->lock, flags); /* 40 seconds until we can find the packet lifetimes */ queue_delayed_work(agent->qp_info->port_priv->wq, &rmpp_recv->timeout_work, msecs_to_jiffies(40000)); rmpp_recv->newwin += window_size(agent); ack_recv(rmpp_recv, mad_recv_wc); mad_recv_wc = NULL; } deref_rmpp_recv(rmpp_recv); return mad_recv_wc; } static int send_next_seg(struct ib_mad_send_wr_private *mad_send_wr) { struct ib_rmpp_mad *rmpp_mad; int timeout; u32 paylen = 0; rmpp_mad = mad_send_wr->send_buf.mad; ib_set_rmpp_flags(&rmpp_mad->rmpp_hdr, IB_MGMT_RMPP_FLAG_ACTIVE); rmpp_mad->rmpp_hdr.seg_num = cpu_to_be32(++mad_send_wr->seg_num); if (mad_send_wr->seg_num == 1) { rmpp_mad->rmpp_hdr.rmpp_rtime_flags |= IB_MGMT_RMPP_FLAG_FIRST; paylen = (mad_send_wr->send_buf.seg_count * mad_send_wr->send_buf.seg_rmpp_size) - mad_send_wr->pad; } if (mad_send_wr->seg_num == mad_send_wr->send_buf.seg_count) { rmpp_mad->rmpp_hdr.rmpp_rtime_flags |= IB_MGMT_RMPP_FLAG_LAST; paylen = mad_send_wr->send_buf.seg_rmpp_size - mad_send_wr->pad; } rmpp_mad->rmpp_hdr.paylen_newwin = cpu_to_be32(paylen); /* 2 seconds for an ACK until we can find the packet lifetime */ timeout = mad_send_wr->send_buf.timeout_ms; if (!timeout || timeout > 2000) mad_send_wr->timeout = msecs_to_jiffies(2000); return ib_send_mad(mad_send_wr); } static void abort_send(struct ib_mad_agent_private *agent, struct ib_mad_recv_wc *mad_recv_wc, u8 rmpp_status) { struct ib_mad_send_wr_private *mad_send_wr; struct ib_mad_send_wc wc; unsigned long flags; spin_lock_irqsave(&agent->lock, flags); mad_send_wr = ib_find_send_mad(agent, mad_recv_wc); if (!mad_send_wr) goto out; /* Unmatched send */ if ((mad_send_wr->last_ack == mad_send_wr->send_buf.seg_count) || (!mad_send_wr->timeout) || (mad_send_wr->state == IB_MAD_STATE_CANCELED)) goto out; /* Send is already done */ ib_mark_mad_done(mad_send_wr); if (mad_send_wr->state == IB_MAD_STATE_DONE) { spin_unlock_irqrestore(&agent->lock, flags); wc.status = IB_WC_REM_ABORT_ERR; wc.vendor_err = rmpp_status; wc.send_buf = &mad_send_wr->send_buf; ib_mad_complete_send_wr(mad_send_wr, &wc); return; } spin_unlock_irqrestore(&agent->lock, flags); return; out: spin_unlock_irqrestore(&agent->lock, flags); } static inline void adjust_last_ack(struct ib_mad_send_wr_private *wr, int seg_num) { struct list_head *list; wr->last_ack = seg_num; list = &wr->last_ack_seg->list; list_for_each_entry(wr->last_ack_seg, list, list) if (wr->last_ack_seg->num == seg_num) break; } static void process_ds_ack(struct ib_mad_agent_private *agent, struct ib_mad_recv_wc *mad_recv_wc, int newwin) { struct mad_rmpp_recv *rmpp_recv; rmpp_recv = find_rmpp_recv(agent, mad_recv_wc); if (rmpp_recv && rmpp_recv->state == RMPP_STATE_COMPLETE) rmpp_recv->repwin = newwin; } static void process_rmpp_ack(struct ib_mad_agent_private *agent, struct ib_mad_recv_wc *mad_recv_wc) { struct ib_mad_send_wr_private *mad_send_wr; struct ib_rmpp_mad *rmpp_mad; unsigned long flags; int seg_num, newwin, ret; rmpp_mad = (struct ib_rmpp_mad *)mad_recv_wc->recv_buf.mad; if (rmpp_mad->rmpp_hdr.rmpp_status) { abort_send(agent, mad_recv_wc, IB_MGMT_RMPP_STATUS_BAD_STATUS); nack_recv(agent, mad_recv_wc, IB_MGMT_RMPP_STATUS_BAD_STATUS); return; } seg_num = be32_to_cpu(rmpp_mad->rmpp_hdr.seg_num); newwin = be32_to_cpu(rmpp_mad->rmpp_hdr.paylen_newwin); if (newwin < seg_num) { abort_send(agent, mad_recv_wc, IB_MGMT_RMPP_STATUS_W2S); nack_recv(agent, mad_recv_wc, IB_MGMT_RMPP_STATUS_W2S); return; } spin_lock_irqsave(&agent->lock, flags); mad_send_wr = ib_find_send_mad(agent, mad_recv_wc); if (!mad_send_wr) { if (!seg_num) process_ds_ack(agent, mad_recv_wc, newwin); goto out; /* Unmatched or DS RMPP ACK */ } if ((mad_send_wr->last_ack == mad_send_wr->send_buf.seg_count) && (mad_send_wr->timeout)) { spin_unlock_irqrestore(&agent->lock, flags); ack_ds_ack(agent, mad_recv_wc); return; /* Repeated ACK for DS RMPP transaction */ } if ((mad_send_wr->last_ack == mad_send_wr->send_buf.seg_count) || (!mad_send_wr->timeout) || (mad_send_wr->state == IB_MAD_STATE_CANCELED)) goto out; /* Send is already done */ if (seg_num > mad_send_wr->send_buf.seg_count || seg_num > mad_send_wr->newwin) { spin_unlock_irqrestore(&agent->lock, flags); abort_send(agent, mad_recv_wc, IB_MGMT_RMPP_STATUS_S2B); nack_recv(agent, mad_recv_wc, IB_MGMT_RMPP_STATUS_S2B); return; } if (newwin < mad_send_wr->newwin || seg_num < mad_send_wr->last_ack) goto out; /* Old ACK */ if (seg_num > mad_send_wr->last_ack) { adjust_last_ack(mad_send_wr, seg_num); mad_send_wr->retries_left = mad_send_wr->max_retries; } mad_send_wr->newwin = newwin; if (mad_send_wr->last_ack == mad_send_wr->send_buf.seg_count) { /* If no response is expected, the ACK completes the send */ if (!mad_send_wr->send_buf.timeout_ms) { struct ib_mad_send_wc wc; ib_mark_mad_done(mad_send_wr); if (mad_send_wr->state == IB_MAD_STATE_DONE) { spin_unlock_irqrestore(&agent->lock, flags); wc.status = IB_WC_SUCCESS; wc.vendor_err = 0; wc.send_buf = &mad_send_wr->send_buf; ib_mad_complete_send_wr(mad_send_wr, &wc); return; } spin_unlock_irqrestore(&agent->lock, flags); return; } if (mad_send_wr->state == IB_MAD_STATE_WAIT_RESP) ib_reset_mad_timeout(mad_send_wr, mad_send_wr->send_buf.timeout_ms); spin_unlock_irqrestore(&agent->lock, flags); ack_ds_ack(agent, mad_recv_wc); return; } else if (mad_send_wr->state == IB_MAD_STATE_WAIT_RESP && mad_send_wr->seg_num < mad_send_wr->newwin && mad_send_wr->seg_num < mad_send_wr->send_buf.seg_count) { /* Send failure will just result in a timeout/retry */ ret = send_next_seg(mad_send_wr); if (ret) goto out; change_mad_state(mad_send_wr, IB_MAD_STATE_SEND_START); list_move_tail(&mad_send_wr->agent_list, &mad_send_wr->mad_agent_priv->send_list); } out: spin_unlock_irqrestore(&agent->lock, flags); } static struct ib_mad_recv_wc * process_rmpp_data(struct ib_mad_agent_private *agent, struct ib_mad_recv_wc *mad_recv_wc) { struct ib_rmpp_hdr *rmpp_hdr; u8 rmpp_status; rmpp_hdr = &((struct ib_rmpp_mad *)mad_recv_wc->recv_buf.mad)->rmpp_hdr; if (rmpp_hdr->rmpp_status) { rmpp_status = IB_MGMT_RMPP_STATUS_BAD_STATUS; goto bad; } if (rmpp_hdr->seg_num == cpu_to_be32(1)) { if (!(ib_get_rmpp_flags(rmpp_hdr) & IB_MGMT_RMPP_FLAG_FIRST)) { rmpp_status = IB_MGMT_RMPP_STATUS_BAD_SEG; goto bad; } return start_rmpp(agent, mad_recv_wc); } else { if (ib_get_rmpp_flags(rmpp_hdr) & IB_MGMT_RMPP_FLAG_FIRST) { rmpp_status = IB_MGMT_RMPP_STATUS_BAD_SEG; goto bad; } return continue_rmpp(agent, mad_recv_wc); } bad: nack_recv(agent, mad_recv_wc, rmpp_status); ib_free_recv_mad(mad_recv_wc); return NULL; } static void process_rmpp_stop(struct ib_mad_agent_private *agent, struct ib_mad_recv_wc *mad_recv_wc) { struct ib_rmpp_mad *rmpp_mad; rmpp_mad = (struct ib_rmpp_mad *)mad_recv_wc->recv_buf.mad; if (rmpp_mad->rmpp_hdr.rmpp_status != IB_MGMT_RMPP_STATUS_RESX) { abort_send(agent, mad_recv_wc, IB_MGMT_RMPP_STATUS_BAD_STATUS); nack_recv(agent, mad_recv_wc, IB_MGMT_RMPP_STATUS_BAD_STATUS); } else abort_send(agent, mad_recv_wc, rmpp_mad->rmpp_hdr.rmpp_status); } static void process_rmpp_abort(struct ib_mad_agent_private *agent, struct ib_mad_recv_wc *mad_recv_wc) { struct ib_rmpp_mad *rmpp_mad; rmpp_mad = (struct ib_rmpp_mad *)mad_recv_wc->recv_buf.mad; if (rmpp_mad->rmpp_hdr.rmpp_status < IB_MGMT_RMPP_STATUS_ABORT_MIN || rmpp_mad->rmpp_hdr.rmpp_status > IB_MGMT_RMPP_STATUS_ABORT_MAX) { abort_send(agent, mad_recv_wc, IB_MGMT_RMPP_STATUS_BAD_STATUS); nack_recv(agent, mad_recv_wc, IB_MGMT_RMPP_STATUS_BAD_STATUS); } else abort_send(agent, mad_recv_wc, rmpp_mad->rmpp_hdr.rmpp_status); } struct ib_mad_recv_wc * ib_process_rmpp_recv_wc(struct ib_mad_agent_private *agent, struct ib_mad_recv_wc *mad_recv_wc) { struct ib_rmpp_mad *rmpp_mad; rmpp_mad = (struct ib_rmpp_mad *)mad_recv_wc->recv_buf.mad; if (!(rmpp_mad->rmpp_hdr.rmpp_rtime_flags & IB_MGMT_RMPP_FLAG_ACTIVE)) return mad_recv_wc; if (rmpp_mad->rmpp_hdr.rmpp_version != IB_MGMT_RMPP_VERSION) { abort_send(agent, mad_recv_wc, IB_MGMT_RMPP_STATUS_UNV); nack_recv(agent, mad_recv_wc, IB_MGMT_RMPP_STATUS_UNV); goto out; } switch (rmpp_mad->rmpp_hdr.rmpp_type) { case IB_MGMT_RMPP_TYPE_DATA: return process_rmpp_data(agent, mad_recv_wc); case IB_MGMT_RMPP_TYPE_ACK: process_rmpp_ack(agent, mad_recv_wc); break; case IB_MGMT_RMPP_TYPE_STOP: process_rmpp_stop(agent, mad_recv_wc); break; case IB_MGMT_RMPP_TYPE_ABORT: process_rmpp_abort(agent, mad_recv_wc); break; default: abort_send(agent, mad_recv_wc, IB_MGMT_RMPP_STATUS_BADT); nack_recv(agent, mad_recv_wc, IB_MGMT_RMPP_STATUS_BADT); break; } out: ib_free_recv_mad(mad_recv_wc); return NULL; } static int init_newwin(struct ib_mad_send_wr_private *mad_send_wr) { struct ib_mad_agent_private *agent = mad_send_wr->mad_agent_priv; struct ib_mad_hdr *mad_hdr = mad_send_wr->send_buf.mad; struct mad_rmpp_recv *rmpp_recv; struct rdma_ah_attr ah_attr; unsigned long flags; int newwin = 1; if (!(mad_hdr->method & IB_MGMT_METHOD_RESP)) goto out; spin_lock_irqsave(&agent->lock, flags); list_for_each_entry(rmpp_recv, &agent->rmpp_list, list) { if (rmpp_recv->tid != mad_hdr->tid || rmpp_recv->mgmt_class != mad_hdr->mgmt_class || rmpp_recv->class_version != mad_hdr->class_version || (rmpp_recv->method & IB_MGMT_METHOD_RESP)) continue; if (rdma_query_ah(mad_send_wr->send_buf.ah, &ah_attr)) continue; if (rmpp_recv->slid == rdma_ah_get_dlid(&ah_attr)) { newwin = rmpp_recv->repwin; break; } } spin_unlock_irqrestore(&agent->lock, flags); out: return newwin; } int ib_send_rmpp_mad(struct ib_mad_send_wr_private *mad_send_wr) { struct ib_rmpp_mad *rmpp_mad; int ret; rmpp_mad = mad_send_wr->send_buf.mad; if (!(ib_get_rmpp_flags(&rmpp_mad->rmpp_hdr) & IB_MGMT_RMPP_FLAG_ACTIVE)) return IB_RMPP_RESULT_UNHANDLED; if (rmpp_mad->rmpp_hdr.rmpp_type != IB_MGMT_RMPP_TYPE_DATA) { mad_send_wr->seg_num = 1; return IB_RMPP_RESULT_INTERNAL; } mad_send_wr->newwin = init_newwin(mad_send_wr); /* We need to wait for the final ACK even if there isn't a response */ ret = send_next_seg(mad_send_wr); if (!ret) return IB_RMPP_RESULT_CONSUMED; return ret; } int ib_process_rmpp_send_wc(struct ib_mad_send_wr_private *mad_send_wr, struct ib_mad_send_wc *mad_send_wc) { struct ib_rmpp_mad *rmpp_mad; int ret; rmpp_mad = mad_send_wr->send_buf.mad; if (!(ib_get_rmpp_flags(&rmpp_mad->rmpp_hdr) & IB_MGMT_RMPP_FLAG_ACTIVE)) return IB_RMPP_RESULT_UNHANDLED; /* RMPP not active */ if (rmpp_mad->rmpp_hdr.rmpp_type != IB_MGMT_RMPP_TYPE_DATA) return IB_RMPP_RESULT_INTERNAL; /* ACK, STOP, or ABORT */ if (mad_send_wc->status != IB_WC_SUCCESS || mad_send_wr->state == IB_MAD_STATE_CANCELED) return IB_RMPP_RESULT_PROCESSED; /* Canceled or send error */ if (!mad_send_wr->timeout) return IB_RMPP_RESULT_PROCESSED; /* Response received */ if (mad_send_wr->last_ack == mad_send_wr->send_buf.seg_count) { mad_send_wr->timeout = msecs_to_jiffies(mad_send_wr->send_buf.timeout_ms); return IB_RMPP_RESULT_PROCESSED; /* Send done */ } if (mad_send_wr->seg_num == mad_send_wr->newwin || mad_send_wr->seg_num == mad_send_wr->send_buf.seg_count) return IB_RMPP_RESULT_PROCESSED; /* Wait for ACK */ ret = send_next_seg(mad_send_wr); if (ret) { mad_send_wc->status = IB_WC_GENERAL_ERR; return IB_RMPP_RESULT_PROCESSED; } return IB_RMPP_RESULT_CONSUMED; } int ib_retry_rmpp(struct ib_mad_send_wr_private *mad_send_wr) { struct ib_rmpp_mad *rmpp_mad; int ret; rmpp_mad = mad_send_wr->send_buf.mad; if (!(ib_get_rmpp_flags(&rmpp_mad->rmpp_hdr) & IB_MGMT_RMPP_FLAG_ACTIVE)) return IB_RMPP_RESULT_UNHANDLED; /* RMPP not active */ if (mad_send_wr->last_ack == mad_send_wr->send_buf.seg_count) return IB_RMPP_RESULT_PROCESSED; mad_send_wr->seg_num = mad_send_wr->last_ack; mad_send_wr->cur_seg = mad_send_wr->last_ack_seg; ret = send_next_seg(mad_send_wr); if (ret) return IB_RMPP_RESULT_PROCESSED; return IB_RMPP_RESULT_CONSUMED; } |
| 164 170 170 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (c) 2021-2024 Oracle. All Rights Reserved. * Author: Darrick J. Wong <djwong@kernel.org> */ #include "xfs_platform.h" #include "xfs_fs.h" #include "xfs_shared.h" #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_bit.h" #include "xfs_sb.h" #include "xfs_mount.h" #include "xfs_defer.h" #include "xfs_inode.h" #include "xfs_trans.h" #include "xfs_alloc.h" #include "xfs_btree.h" #include "xfs_btree_staging.h" #include "xfs_rtrefcount_btree.h" #include "xfs_refcount.h" #include "xfs_trace.h" #include "xfs_cksum.h" #include "xfs_error.h" #include "xfs_extent_busy.h" #include "xfs_rtgroup.h" #include "xfs_rtbitmap.h" #include "xfs_metafile.h" #include "xfs_health.h" static struct kmem_cache *xfs_rtrefcountbt_cur_cache; /* * Realtime Reference Count btree. * * This is a btree used to track the owner(s) of a given extent in the realtime * device. See the comments in xfs_refcount_btree.c for more information. * * This tree is basically the same as the regular refcount btree except that * it's rooted in an inode. */ static struct xfs_btree_cur * xfs_rtrefcountbt_dup_cursor( struct xfs_btree_cur *cur) { return xfs_rtrefcountbt_init_cursor(cur->bc_tp, to_rtg(cur->bc_group)); } STATIC int xfs_rtrefcountbt_get_minrecs( struct xfs_btree_cur *cur, int level) { if (level == cur->bc_nlevels - 1) { struct xfs_ifork *ifp = xfs_btree_ifork_ptr(cur); return xfs_rtrefcountbt_maxrecs(cur->bc_mp, ifp->if_broot_bytes, level == 0) / 2; } return cur->bc_mp->m_rtrefc_mnr[level != 0]; } STATIC int xfs_rtrefcountbt_get_maxrecs( struct xfs_btree_cur *cur, int level) { if (level == cur->bc_nlevels - 1) { struct xfs_ifork *ifp = xfs_btree_ifork_ptr(cur); return xfs_rtrefcountbt_maxrecs(cur->bc_mp, ifp->if_broot_bytes, level == 0); } return cur->bc_mp->m_rtrefc_mxr[level != 0]; } /* * Calculate number of records in a realtime refcount btree inode root. */ unsigned int xfs_rtrefcountbt_droot_maxrecs( unsigned int blocklen, bool leaf) { blocklen -= sizeof(struct xfs_rtrefcount_root); if (leaf) return blocklen / sizeof(struct xfs_refcount_rec); return blocklen / (2 * sizeof(struct xfs_refcount_key) + sizeof(xfs_rtrefcount_ptr_t)); } /* * Get the maximum records we could store in the on-disk format. * * For non-root nodes this is equivalent to xfs_rtrefcountbt_get_maxrecs, but * for the root node this checks the available space in the dinode fork so that * we can resize the in-memory buffer to match it. After a resize to the * maximum size this function returns the same value as * xfs_rtrefcountbt_get_maxrecs for the root node, too. */ STATIC int xfs_rtrefcountbt_get_dmaxrecs( struct xfs_btree_cur *cur, int level) { if (level != cur->bc_nlevels - 1) return cur->bc_mp->m_rtrefc_mxr[level != 0]; return xfs_rtrefcountbt_droot_maxrecs(cur->bc_ino.forksize, level == 0); } STATIC void xfs_rtrefcountbt_init_key_from_rec( union xfs_btree_key *key, const union xfs_btree_rec *rec) { key->refc.rc_startblock = rec->refc.rc_startblock; } STATIC void xfs_rtrefcountbt_init_high_key_from_rec( union xfs_btree_key *key, const union xfs_btree_rec *rec) { __u32 x; x = be32_to_cpu(rec->refc.rc_startblock); x += be32_to_cpu(rec->refc.rc_blockcount) - 1; key->refc.rc_startblock = cpu_to_be32(x); } STATIC void xfs_rtrefcountbt_init_rec_from_cur( struct xfs_btree_cur *cur, union xfs_btree_rec *rec) { const struct xfs_refcount_irec *irec = &cur->bc_rec.rc; uint32_t start; start = xfs_refcount_encode_startblock(irec->rc_startblock, irec->rc_domain); rec->refc.rc_startblock = cpu_to_be32(start); rec->refc.rc_blockcount = cpu_to_be32(cur->bc_rec.rc.rc_blockcount); rec->refc.rc_refcount = cpu_to_be32(cur->bc_rec.rc.rc_refcount); } STATIC void xfs_rtrefcountbt_init_ptr_from_cur( struct xfs_btree_cur *cur, union xfs_btree_ptr *ptr) { ptr->l = 0; } STATIC int xfs_rtrefcountbt_cmp_key_with_cur( struct xfs_btree_cur *cur, const union xfs_btree_key *key) { const struct xfs_refcount_key *kp = &key->refc; const struct xfs_refcount_irec *irec = &cur->bc_rec.rc; uint32_t start; start = xfs_refcount_encode_startblock(irec->rc_startblock, irec->rc_domain); return cmp_int(be32_to_cpu(kp->rc_startblock), start); } STATIC int xfs_rtrefcountbt_cmp_two_keys( struct xfs_btree_cur *cur, const union xfs_btree_key *k1, const union xfs_btree_key *k2, const union xfs_btree_key *mask) { ASSERT(!mask || mask->refc.rc_startblock); return cmp_int(be32_to_cpu(k1->refc.rc_startblock), be32_to_cpu(k2->refc.rc_startblock)); } static xfs_failaddr_t xfs_rtrefcountbt_verify( struct xfs_buf *bp) { struct xfs_mount *mp = bp->b_target->bt_mount; struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); xfs_failaddr_t fa; int level; if (!xfs_verify_magic(bp, block->bb_magic)) return __this_address; if (!xfs_has_reflink(mp)) return __this_address; fa = xfs_btree_fsblock_v5hdr_verify(bp, XFS_RMAP_OWN_UNKNOWN); if (fa) return fa; level = be16_to_cpu(block->bb_level); if (level > mp->m_rtrefc_maxlevels) return __this_address; return xfs_btree_fsblock_verify(bp, mp->m_rtrefc_mxr[level != 0]); } static void xfs_rtrefcountbt_read_verify( struct xfs_buf *bp) { xfs_failaddr_t fa; if (!xfs_btree_fsblock_verify_crc(bp)) xfs_verifier_error(bp, -EFSBADCRC, __this_address); else { fa = xfs_rtrefcountbt_verify(bp); if (fa) xfs_verifier_error(bp, -EFSCORRUPTED, fa); } if (bp->b_error) trace_xfs_btree_corrupt(bp, _RET_IP_); } static void xfs_rtrefcountbt_write_verify( struct xfs_buf *bp) { xfs_failaddr_t fa; fa = xfs_rtrefcountbt_verify(bp); if (fa) { trace_xfs_btree_corrupt(bp, _RET_IP_); xfs_verifier_error(bp, -EFSCORRUPTED, fa); return; } xfs_btree_fsblock_calc_crc(bp); } const struct xfs_buf_ops xfs_rtrefcountbt_buf_ops = { .name = "xfs_rtrefcountbt", .magic = { 0, cpu_to_be32(XFS_RTREFC_CRC_MAGIC) }, .verify_read = xfs_rtrefcountbt_read_verify, .verify_write = xfs_rtrefcountbt_write_verify, .verify_struct = xfs_rtrefcountbt_verify, }; STATIC int xfs_rtrefcountbt_keys_inorder( struct xfs_btree_cur *cur, const union xfs_btree_key *k1, const union xfs_btree_key *k2) { return be32_to_cpu(k1->refc.rc_startblock) < be32_to_cpu(k2->refc.rc_startblock); } STATIC int xfs_rtrefcountbt_recs_inorder( struct xfs_btree_cur *cur, const union xfs_btree_rec *r1, const union xfs_btree_rec *r2) { return be32_to_cpu(r1->refc.rc_startblock) + be32_to_cpu(r1->refc.rc_blockcount) <= be32_to_cpu(r2->refc.rc_startblock); } STATIC enum xbtree_key_contig xfs_rtrefcountbt_keys_contiguous( struct xfs_btree_cur *cur, const union xfs_btree_key *key1, const union xfs_btree_key *key2, const union xfs_btree_key *mask) { ASSERT(!mask || mask->refc.rc_startblock); return xbtree_key_contig(be32_to_cpu(key1->refc.rc_startblock), be32_to_cpu(key2->refc.rc_startblock)); } static inline void xfs_rtrefcountbt_move_ptrs( struct xfs_mount *mp, struct xfs_btree_block *broot, short old_size, size_t new_size, unsigned int numrecs) { void *dptr; void *sptr; sptr = xfs_rtrefcount_broot_ptr_addr(mp, broot, 1, old_size); dptr = xfs_rtrefcount_broot_ptr_addr(mp, broot, 1, new_size); memmove(dptr, sptr, numrecs * sizeof(xfs_rtrefcount_ptr_t)); } static struct xfs_btree_block * xfs_rtrefcountbt_broot_realloc( struct xfs_btree_cur *cur, unsigned int new_numrecs) { struct xfs_mount *mp = cur->bc_mp; struct xfs_ifork *ifp = xfs_btree_ifork_ptr(cur); struct xfs_btree_block *broot; unsigned int new_size; unsigned int old_size = ifp->if_broot_bytes; const unsigned int level = cur->bc_nlevels - 1; new_size = xfs_rtrefcount_broot_space_calc(mp, level, new_numrecs); /* Handle the nop case quietly. */ if (new_size == old_size) return ifp->if_broot; if (new_size > old_size) { unsigned int old_numrecs; /* * If there wasn't any memory allocated before, just allocate * it now and get out. */ if (old_size == 0) return xfs_broot_realloc(ifp, new_size); /* * If there is already an existing if_broot, then we need to * realloc it and possibly move the node block pointers because * those are not butted up against the btree block header. */ old_numrecs = xfs_rtrefcountbt_maxrecs(mp, old_size, level); broot = xfs_broot_realloc(ifp, new_size); if (level > 0) xfs_rtrefcountbt_move_ptrs(mp, broot, old_size, new_size, old_numrecs); goto out_broot; } /* * We're reducing numrecs. If we're going all the way to zero, just * free the block. */ ASSERT(ifp->if_broot != NULL && old_size > 0); if (new_size == 0) return xfs_broot_realloc(ifp, 0); /* * Shrink the btree root by possibly moving the rtrmapbt pointers, * since they are not butted up against the btree block header. Then * reallocate broot. */ if (level > 0) xfs_rtrefcountbt_move_ptrs(mp, ifp->if_broot, old_size, new_size, new_numrecs); broot = xfs_broot_realloc(ifp, new_size); out_broot: ASSERT(xfs_rtrefcount_droot_space(broot) <= xfs_inode_fork_size(cur->bc_ino.ip, cur->bc_ino.whichfork)); return broot; } const struct xfs_btree_ops xfs_rtrefcountbt_ops = { .name = "rtrefcount", .type = XFS_BTREE_TYPE_INODE, .geom_flags = XFS_BTGEO_IROOT_RECORDS, .rec_len = sizeof(struct xfs_refcount_rec), .key_len = sizeof(struct xfs_refcount_key), .ptr_len = XFS_BTREE_LONG_PTR_LEN, .lru_refs = XFS_REFC_BTREE_REF, .statoff = XFS_STATS_CALC_INDEX(xs_rtrefcbt_2), .sick_mask = XFS_SICK_RG_REFCNTBT, .dup_cursor = xfs_rtrefcountbt_dup_cursor, .alloc_block = xfs_btree_alloc_metafile_block, .free_block = xfs_btree_free_metafile_block, .get_minrecs = xfs_rtrefcountbt_get_minrecs, .get_maxrecs = xfs_rtrefcountbt_get_maxrecs, .get_dmaxrecs = xfs_rtrefcountbt_get_dmaxrecs, .init_key_from_rec = xfs_rtrefcountbt_init_key_from_rec, .init_high_key_from_rec = xfs_rtrefcountbt_init_high_key_from_rec, .init_rec_from_cur = xfs_rtrefcountbt_init_rec_from_cur, .init_ptr_from_cur = xfs_rtrefcountbt_init_ptr_from_cur, .cmp_key_with_cur = xfs_rtrefcountbt_cmp_key_with_cur, .buf_ops = &xfs_rtrefcountbt_buf_ops, .cmp_two_keys = xfs_rtrefcountbt_cmp_two_keys, .keys_inorder = xfs_rtrefcountbt_keys_inorder, .recs_inorder = xfs_rtrefcountbt_recs_inorder, .keys_contiguous = xfs_rtrefcountbt_keys_contiguous, .broot_realloc = xfs_rtrefcountbt_broot_realloc, }; /* Allocate a new rt refcount btree cursor. */ struct xfs_btree_cur * xfs_rtrefcountbt_init_cursor( struct xfs_trans *tp, struct xfs_rtgroup *rtg) { struct xfs_inode *ip = rtg_refcount(rtg); struct xfs_mount *mp = rtg_mount(rtg); struct xfs_btree_cur *cur; xfs_assert_ilocked(ip, XFS_ILOCK_SHARED | XFS_ILOCK_EXCL); cur = xfs_btree_alloc_cursor(mp, tp, &xfs_rtrefcountbt_ops, mp->m_rtrefc_maxlevels, xfs_rtrefcountbt_cur_cache); cur->bc_ino.ip = ip; cur->bc_refc.nr_ops = 0; cur->bc_refc.shape_changes = 0; cur->bc_group = xfs_group_hold(rtg_group(rtg)); cur->bc_nlevels = be16_to_cpu(ip->i_df.if_broot->bb_level) + 1; cur->bc_ino.forksize = xfs_inode_fork_size(ip, XFS_DATA_FORK); cur->bc_ino.whichfork = XFS_DATA_FORK; return cur; } /* * Install a new rt reverse mapping btree root. Caller is responsible for * invalidating and freeing the old btree blocks. */ void xfs_rtrefcountbt_commit_staged_btree( struct xfs_btree_cur *cur, struct xfs_trans *tp) { struct xbtree_ifakeroot *ifake = cur->bc_ino.ifake; struct xfs_ifork *ifp; int flags = XFS_ILOG_CORE | XFS_ILOG_DBROOT; ASSERT(cur->bc_flags & XFS_BTREE_STAGING); ASSERT(ifake->if_fork->if_format == XFS_DINODE_FMT_META_BTREE); /* * Free any resources hanging off the real fork, then shallow-copy the * staging fork's contents into the real fork to transfer everything * we just built. */ ifp = xfs_ifork_ptr(cur->bc_ino.ip, XFS_DATA_FORK); xfs_idestroy_fork(ifp); memcpy(ifp, ifake->if_fork, sizeof(struct xfs_ifork)); cur->bc_ino.ip->i_projid = cur->bc_group->xg_gno; xfs_trans_log_inode(tp, cur->bc_ino.ip, flags); xfs_btree_commit_ifakeroot(cur, tp, XFS_DATA_FORK); } /* Calculate number of records in a realtime refcount btree block. */ static inline unsigned int xfs_rtrefcountbt_block_maxrecs( unsigned int blocklen, bool leaf) { if (leaf) return blocklen / sizeof(struct xfs_refcount_rec); return blocklen / (sizeof(struct xfs_refcount_key) + sizeof(xfs_rtrefcount_ptr_t)); } /* * Calculate number of records in an refcount btree block. */ unsigned int xfs_rtrefcountbt_maxrecs( struct xfs_mount *mp, unsigned int blocklen, bool leaf) { blocklen -= XFS_RTREFCOUNT_BLOCK_LEN; return xfs_rtrefcountbt_block_maxrecs(blocklen, leaf); } /* Compute the max possible height for realtime refcount btrees. */ unsigned int xfs_rtrefcountbt_maxlevels_ondisk(void) { unsigned int minrecs[2]; unsigned int blocklen; blocklen = XFS_MIN_CRC_BLOCKSIZE - XFS_BTREE_LBLOCK_CRC_LEN; minrecs[0] = xfs_rtrefcountbt_block_maxrecs(blocklen, true) / 2; minrecs[1] = xfs_rtrefcountbt_block_maxrecs(blocklen, false) / 2; /* We need at most one record for every block in an rt group. */ return xfs_btree_compute_maxlevels(minrecs, XFS_MAX_RGBLOCKS); } int __init xfs_rtrefcountbt_init_cur_cache(void) { xfs_rtrefcountbt_cur_cache = kmem_cache_create("xfs_rtrefcountbt_cur", xfs_btree_cur_sizeof( xfs_rtrefcountbt_maxlevels_ondisk()), 0, 0, NULL); if (!xfs_rtrefcountbt_cur_cache) return -ENOMEM; return 0; } void xfs_rtrefcountbt_destroy_cur_cache(void) { kmem_cache_destroy(xfs_rtrefcountbt_cur_cache); xfs_rtrefcountbt_cur_cache = NULL; } /* Compute the maximum height of a realtime refcount btree. */ void xfs_rtrefcountbt_compute_maxlevels( struct xfs_mount *mp) { unsigned int d_maxlevels, r_maxlevels; if (!xfs_has_rtreflink(mp)) { mp->m_rtrefc_maxlevels = 0; return; } /* * The realtime refcountbt lives on the data device, which means that * its maximum height is constrained by the size of the data device and * the height required to store one refcount record for each rtextent * in an rt group. */ d_maxlevels = xfs_btree_space_to_height(mp->m_rtrefc_mnr, mp->m_sb.sb_dblocks); r_maxlevels = xfs_btree_compute_maxlevels(mp->m_rtrefc_mnr, mp->m_sb.sb_rgextents); /* Add one level to handle the inode root level. */ mp->m_rtrefc_maxlevels = min(d_maxlevels, r_maxlevels) + 1; } /* Calculate the rtrefcount btree size for some records. */ unsigned long long xfs_rtrefcountbt_calc_size( struct xfs_mount *mp, unsigned long long len) { return xfs_btree_calc_size(mp->m_rtrefc_mnr, len); } /* * Calculate the maximum refcount btree size. */ static unsigned long long xfs_rtrefcountbt_max_size( struct xfs_mount *mp, xfs_rtblock_t rtblocks) { /* Bail out if we're uninitialized, which can happen in mkfs. */ if (mp->m_rtrefc_mxr[0] == 0) return 0; return xfs_rtrefcountbt_calc_size(mp, rtblocks); } /* * Figure out how many blocks to reserve and how many are used by this btree. * We need enough space to hold one record for every rt extent in the rtgroup. */ xfs_filblks_t xfs_rtrefcountbt_calc_reserves( struct xfs_mount *mp) { if (!xfs_has_rtreflink(mp)) return 0; return xfs_rtrefcountbt_max_size(mp, mp->m_sb.sb_rgextents); } /* * Convert on-disk form of btree root to in-memory form. */ STATIC void xfs_rtrefcountbt_from_disk( struct xfs_inode *ip, struct xfs_rtrefcount_root *dblock, int dblocklen, struct xfs_btree_block *rblock) { struct xfs_mount *mp = ip->i_mount; struct xfs_refcount_key *fkp; __be64 *fpp; struct xfs_refcount_key *tkp; __be64 *tpp; struct xfs_refcount_rec *frp; struct xfs_refcount_rec *trp; unsigned int numrecs; unsigned int maxrecs; unsigned int rblocklen; rblocklen = xfs_rtrefcount_broot_space(mp, dblock); xfs_btree_init_block(mp, rblock, &xfs_rtrefcountbt_ops, 0, 0, ip->i_ino); rblock->bb_level = dblock->bb_level; rblock->bb_numrecs = dblock->bb_numrecs; if (be16_to_cpu(rblock->bb_level) > 0) { maxrecs = xfs_rtrefcountbt_droot_maxrecs(dblocklen, false); fkp = xfs_rtrefcount_droot_key_addr(dblock, 1); tkp = xfs_rtrefcount_key_addr(rblock, 1); fpp = xfs_rtrefcount_droot_ptr_addr(dblock, 1, maxrecs); tpp = xfs_rtrefcount_broot_ptr_addr(mp, rblock, 1, rblocklen); numrecs = be16_to_cpu(dblock->bb_numrecs); memcpy(tkp, fkp, 2 * sizeof(*fkp) * numrecs); memcpy(tpp, fpp, sizeof(*fpp) * numrecs); } else { frp = xfs_rtrefcount_droot_rec_addr(dblock, 1); trp = xfs_rtrefcount_rec_addr(rblock, 1); numrecs = be16_to_cpu(dblock->bb_numrecs); memcpy(trp, frp, sizeof(*frp) * numrecs); } } /* Load a realtime reference count btree root in from disk. */ int xfs_iformat_rtrefcount( struct xfs_inode *ip, struct xfs_dinode *dip) { struct xfs_mount *mp = ip->i_mount; struct xfs_rtrefcount_root *dfp = XFS_DFORK_PTR(dip, XFS_DATA_FORK); struct xfs_btree_block *broot; unsigned int numrecs; unsigned int level; int dsize; /* * growfs must create the rtrefcount inodes before adding a realtime * volume to the filesystem, so we cannot use the rtrefcount predicate * here. */ if (!xfs_has_reflink(ip->i_mount)) { xfs_inode_mark_sick(ip, XFS_SICK_INO_CORE); return -EFSCORRUPTED; } dsize = XFS_DFORK_SIZE(dip, mp, XFS_DATA_FORK); numrecs = be16_to_cpu(dfp->bb_numrecs); level = be16_to_cpu(dfp->bb_level); if (level > mp->m_rtrefc_maxlevels || xfs_rtrefcount_droot_space_calc(level, numrecs) > dsize) { xfs_inode_mark_sick(ip, XFS_SICK_INO_CORE); return -EFSCORRUPTED; } broot = xfs_broot_alloc(xfs_ifork_ptr(ip, XFS_DATA_FORK), xfs_rtrefcount_broot_space_calc(mp, level, numrecs)); if (broot) xfs_rtrefcountbt_from_disk(ip, dfp, dsize, broot); return 0; } /* * Convert in-memory form of btree root to on-disk form. */ void xfs_rtrefcountbt_to_disk( struct xfs_mount *mp, struct xfs_btree_block *rblock, int rblocklen, struct xfs_rtrefcount_root *dblock, int dblocklen) { struct xfs_refcount_key *fkp; __be64 *fpp; struct xfs_refcount_key *tkp; __be64 *tpp; struct xfs_refcount_rec *frp; struct xfs_refcount_rec *trp; unsigned int maxrecs; unsigned int numrecs; ASSERT(rblock->bb_magic == cpu_to_be32(XFS_RTREFC_CRC_MAGIC)); ASSERT(uuid_equal(&rblock->bb_u.l.bb_uuid, &mp->m_sb.sb_meta_uuid)); ASSERT(rblock->bb_u.l.bb_blkno == cpu_to_be64(XFS_BUF_DADDR_NULL)); ASSERT(rblock->bb_u.l.bb_leftsib == cpu_to_be64(NULLFSBLOCK)); ASSERT(rblock->bb_u.l.bb_rightsib == cpu_to_be64(NULLFSBLOCK)); dblock->bb_level = rblock->bb_level; dblock->bb_numrecs = rblock->bb_numrecs; if (be16_to_cpu(rblock->bb_level) > 0) { maxrecs = xfs_rtrefcountbt_droot_maxrecs(dblocklen, false); fkp = xfs_rtrefcount_key_addr(rblock, 1); tkp = xfs_rtrefcount_droot_key_addr(dblock, 1); fpp = xfs_rtrefcount_broot_ptr_addr(mp, rblock, 1, rblocklen); tpp = xfs_rtrefcount_droot_ptr_addr(dblock, 1, maxrecs); numrecs = be16_to_cpu(rblock->bb_numrecs); memcpy(tkp, fkp, 2 * sizeof(*fkp) * numrecs); memcpy(tpp, fpp, sizeof(*fpp) * numrecs); } else { frp = xfs_rtrefcount_rec_addr(rblock, 1); trp = xfs_rtrefcount_droot_rec_addr(dblock, 1); numrecs = be16_to_cpu(rblock->bb_numrecs); memcpy(trp, frp, sizeof(*frp) * numrecs); } } /* Flush a realtime reference count btree root out to disk. */ void xfs_iflush_rtrefcount( struct xfs_inode *ip, struct xfs_dinode *dip) { struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK); struct xfs_rtrefcount_root *dfp = XFS_DFORK_PTR(dip, XFS_DATA_FORK); ASSERT(ifp->if_broot != NULL); ASSERT(ifp->if_broot_bytes > 0); ASSERT(xfs_rtrefcount_droot_space(ifp->if_broot) <= xfs_inode_fork_size(ip, XFS_DATA_FORK)); xfs_rtrefcountbt_to_disk(ip->i_mount, ifp->if_broot, ifp->if_broot_bytes, dfp, XFS_DFORK_SIZE(dip, ip->i_mount, XFS_DATA_FORK)); } /* * Create a realtime refcount btree inode. */ int xfs_rtrefcountbt_create( struct xfs_rtgroup *rtg, struct xfs_inode *ip, struct xfs_trans *tp, bool init) { struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK); struct xfs_mount *mp = ip->i_mount; struct xfs_btree_block *broot; ifp->if_format = XFS_DINODE_FMT_META_BTREE; ASSERT(ifp->if_broot_bytes == 0); ASSERT(ifp->if_bytes == 0); /* Initialize the empty incore btree root. */ broot = xfs_broot_realloc(ifp, xfs_rtrefcount_broot_space_calc(mp, 0, 0)); if (broot) xfs_btree_init_block(mp, broot, &xfs_rtrefcountbt_ops, 0, 0, ip->i_ino); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE | XFS_ILOG_DBROOT); return 0; } |
| 1067 1376 1350 491 349 473 327 3 344 1 70 138 223 407 1287 755 755 751 2 718 274 321 734 24 594 567 753 544 758 784 785 756 137 250 3 275 243 243 14 223 266 266 164 233 11 286 286 18 268 715 674 676 28 1588 1588 675 675 1076 1076 1062 1067 1067 1067 4 1 4 71 71 126 32 94 205 204 243 242 183 184 440 438 1081 4 2 3 817 812 814 3 3 5 708 707 709 3 3 206 205 243 243 182 184 440 439 1080 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 | // SPDX-License-Identifier: GPL-2.0-or-later /* * net/core/dev_addr_lists.c - Functions for handling net device lists * Copyright (c) 2010 Jiri Pirko <jpirko@redhat.com> * * This file contains functions for working with unicast, multicast and device * addresses lists. */ #include <linux/netdevice.h> #include <linux/rtnetlink.h> #include <linux/export.h> #include <linux/list.h> #include "dev.h" /* * General list handling functions */ static int __hw_addr_insert(struct netdev_hw_addr_list *list, struct netdev_hw_addr *new, int addr_len) { struct rb_node **ins_point = &list->tree.rb_node, *parent = NULL; struct netdev_hw_addr *ha; while (*ins_point) { int diff; ha = rb_entry(*ins_point, struct netdev_hw_addr, node); diff = memcmp(new->addr, ha->addr, addr_len); if (diff == 0) diff = memcmp(&new->type, &ha->type, sizeof(new->type)); parent = *ins_point; if (diff < 0) ins_point = &parent->rb_left; else if (diff > 0) ins_point = &parent->rb_right; else return -EEXIST; } rb_link_node_rcu(&new->node, parent, ins_point); rb_insert_color(&new->node, &list->tree); return 0; } static struct netdev_hw_addr* __hw_addr_create(const unsigned char *addr, int addr_len, unsigned char addr_type, bool global, bool sync) { struct netdev_hw_addr *ha; int alloc_size; alloc_size = sizeof(*ha); if (alloc_size < L1_CACHE_BYTES) alloc_size = L1_CACHE_BYTES; ha = kmalloc(alloc_size, GFP_ATOMIC); if (!ha) return NULL; memcpy(ha->addr, addr, addr_len); ha->type = addr_type; ha->refcount = 1; ha->global_use = global; ha->synced = sync ? 1 : 0; ha->sync_cnt = 0; return ha; } static int __hw_addr_add_ex(struct netdev_hw_addr_list *list, const unsigned char *addr, int addr_len, unsigned char addr_type, bool global, bool sync, int sync_count, bool exclusive) { struct rb_node **ins_point = &list->tree.rb_node, *parent = NULL; struct netdev_hw_addr *ha; if (addr_len > MAX_ADDR_LEN) return -EINVAL; while (*ins_point) { int diff; ha = rb_entry(*ins_point, struct netdev_hw_addr, node); diff = memcmp(addr, ha->addr, addr_len); if (diff == 0) diff = memcmp(&addr_type, &ha->type, sizeof(addr_type)); parent = *ins_point; if (diff < 0) { ins_point = &parent->rb_left; } else if (diff > 0) { ins_point = &parent->rb_right; } else { if (exclusive) return -EEXIST; if (global) { /* check if addr is already used as global */ if (ha->global_use) return 0; else ha->global_use = true; } if (sync) { if (ha->synced && sync_count) return -EEXIST; else ha->synced++; } ha->refcount++; return 0; } } ha = __hw_addr_create(addr, addr_len, addr_type, global, sync); if (!ha) return -ENOMEM; rb_link_node(&ha->node, parent, ins_point); rb_insert_color(&ha->node, &list->tree); list_add_tail_rcu(&ha->list, &list->list); list->count++; return 0; } static int __hw_addr_add(struct netdev_hw_addr_list *list, const unsigned char *addr, int addr_len, unsigned char addr_type) { return __hw_addr_add_ex(list, addr, addr_len, addr_type, false, false, 0, false); } static int __hw_addr_del_entry(struct netdev_hw_addr_list *list, struct netdev_hw_addr *ha, bool global, bool sync) { if (global && !ha->global_use) return -ENOENT; if (sync && !ha->synced) return -ENOENT; if (global) ha->global_use = false; if (sync) ha->synced--; if (--ha->refcount) return 0; rb_erase(&ha->node, &list->tree); list_del_rcu(&ha->list); kfree_rcu(ha, rcu_head); list->count--; return 0; } static struct netdev_hw_addr *__hw_addr_lookup(struct netdev_hw_addr_list *list, const unsigned char *addr, int addr_len, unsigned char addr_type) { struct rb_node *node; node = list->tree.rb_node; while (node) { struct netdev_hw_addr *ha = rb_entry(node, struct netdev_hw_addr, node); int diff = memcmp(addr, ha->addr, addr_len); if (diff == 0 && addr_type) diff = memcmp(&addr_type, &ha->type, sizeof(addr_type)); if (diff < 0) node = node->rb_left; else if (diff > 0) node = node->rb_right; else return ha; } return NULL; } static int __hw_addr_del_ex(struct netdev_hw_addr_list *list, const unsigned char *addr, int addr_len, unsigned char addr_type, bool global, bool sync) { struct netdev_hw_addr *ha = __hw_addr_lookup(list, addr, addr_len, addr_type); if (!ha) return -ENOENT; return __hw_addr_del_entry(list, ha, global, sync); } static int __hw_addr_del(struct netdev_hw_addr_list *list, const unsigned char *addr, int addr_len, unsigned char addr_type) { return __hw_addr_del_ex(list, addr, addr_len, addr_type, false, false); } static int __hw_addr_sync_one(struct netdev_hw_addr_list *to_list, struct netdev_hw_addr *ha, int addr_len) { int err; err = __hw_addr_add_ex(to_list, ha->addr, addr_len, ha->type, false, true, ha->sync_cnt, false); if (err && err != -EEXIST) return err; if (!err) { ha->sync_cnt++; ha->refcount++; } return 0; } static void __hw_addr_unsync_one(struct netdev_hw_addr_list *to_list, struct netdev_hw_addr_list *from_list, struct netdev_hw_addr *ha, int addr_len) { int err; err = __hw_addr_del_ex(to_list, ha->addr, addr_len, ha->type, false, true); if (err) return; ha->sync_cnt--; /* address on from list is not marked synced */ __hw_addr_del_entry(from_list, ha, false, false); } int __hw_addr_sync_multiple(struct netdev_hw_addr_list *to_list, struct netdev_hw_addr_list *from_list, int addr_len) { int err = 0; struct netdev_hw_addr *ha, *tmp; list_for_each_entry_safe(ha, tmp, &from_list->list, list) { if (ha->sync_cnt == ha->refcount) { __hw_addr_unsync_one(to_list, from_list, ha, addr_len); } else { err = __hw_addr_sync_one(to_list, ha, addr_len); if (err) break; } } return err; } EXPORT_SYMBOL(__hw_addr_sync_multiple); /* This function only works where there is a strict 1-1 relationship * between source and destination of they synch. If you ever need to * sync addresses to more then 1 destination, you need to use * __hw_addr_sync_multiple(). */ int __hw_addr_sync(struct netdev_hw_addr_list *to_list, struct netdev_hw_addr_list *from_list, int addr_len) { int err = 0; struct netdev_hw_addr *ha, *tmp; list_for_each_entry_safe(ha, tmp, &from_list->list, list) { if (!ha->sync_cnt) { err = __hw_addr_sync_one(to_list, ha, addr_len); if (err) break; } else if (ha->refcount == 1) __hw_addr_unsync_one(to_list, from_list, ha, addr_len); } return err; } EXPORT_SYMBOL(__hw_addr_sync); void __hw_addr_unsync(struct netdev_hw_addr_list *to_list, struct netdev_hw_addr_list *from_list, int addr_len) { struct netdev_hw_addr *ha, *tmp; list_for_each_entry_safe(ha, tmp, &from_list->list, list) { if (ha->sync_cnt) __hw_addr_unsync_one(to_list, from_list, ha, addr_len); } } EXPORT_SYMBOL(__hw_addr_unsync); /** * __hw_addr_sync_dev - Synchronize device's multicast list * @list: address list to synchronize * @dev: device to sync * @sync: function to call if address should be added * @unsync: function to call if address should be removed * * This function is intended to be called from the ndo_set_rx_mode * function of devices that require explicit address add/remove * notifications. The unsync function may be NULL in which case * the addresses requiring removal will simply be removed without * any notification to the device. **/ int __hw_addr_sync_dev(struct netdev_hw_addr_list *list, struct net_device *dev, int (*sync)(struct net_device *, const unsigned char *), int (*unsync)(struct net_device *, const unsigned char *)) { struct netdev_hw_addr *ha, *tmp; int err; /* first go through and flush out any stale entries */ list_for_each_entry_safe(ha, tmp, &list->list, list) { if (!ha->sync_cnt || ha->refcount != 1) continue; /* if unsync is defined and fails defer unsyncing address */ if (unsync && unsync(dev, ha->addr)) continue; ha->sync_cnt--; __hw_addr_del_entry(list, ha, false, false); } /* go through and sync new entries to the list */ list_for_each_entry_safe(ha, tmp, &list->list, list) { if (ha->sync_cnt) continue; err = sync(dev, ha->addr); if (err) return err; ha->sync_cnt++; ha->refcount++; } return 0; } EXPORT_SYMBOL(__hw_addr_sync_dev); /** * __hw_addr_ref_sync_dev - Synchronize device's multicast address list taking * into account references * @list: address list to synchronize * @dev: device to sync * @sync: function to call if address or reference on it should be added * @unsync: function to call if address or some reference on it should removed * * This function is intended to be called from the ndo_set_rx_mode * function of devices that require explicit address or references on it * add/remove notifications. The unsync function may be NULL in which case * the addresses or references on it requiring removal will simply be * removed without any notification to the device. That is responsibility of * the driver to identify and distribute address or references on it between * internal address tables. **/ int __hw_addr_ref_sync_dev(struct netdev_hw_addr_list *list, struct net_device *dev, int (*sync)(struct net_device *, const unsigned char *, int), int (*unsync)(struct net_device *, const unsigned char *, int)) { struct netdev_hw_addr *ha, *tmp; int err, ref_cnt; /* first go through and flush out any unsynced/stale entries */ list_for_each_entry_safe(ha, tmp, &list->list, list) { /* sync if address is not used */ if ((ha->sync_cnt << 1) <= ha->refcount) continue; /* if fails defer unsyncing address */ ref_cnt = ha->refcount - ha->sync_cnt; if (unsync && unsync(dev, ha->addr, ref_cnt)) continue; ha->refcount = (ref_cnt << 1) + 1; ha->sync_cnt = ref_cnt; __hw_addr_del_entry(list, ha, false, false); } /* go through and sync updated/new entries to the list */ list_for_each_entry_safe(ha, tmp, &list->list, list) { /* sync if address added or reused */ if ((ha->sync_cnt << 1) >= ha->refcount) continue; ref_cnt = ha->refcount - ha->sync_cnt; err = sync(dev, ha->addr, ref_cnt); if (err) return err; ha->refcount = ref_cnt << 1; ha->sync_cnt = ref_cnt; } return 0; } EXPORT_SYMBOL(__hw_addr_ref_sync_dev); /** * __hw_addr_ref_unsync_dev - Remove synchronized addresses and references on * it from device * @list: address list to remove synchronized addresses (references on it) from * @dev: device to sync * @unsync: function to call if address and references on it should be removed * * Remove all addresses that were added to the device by * __hw_addr_ref_sync_dev(). This function is intended to be called from the * ndo_stop or ndo_open functions on devices that require explicit address (or * references on it) add/remove notifications. If the unsync function pointer * is NULL then this function can be used to just reset the sync_cnt for the * addresses in the list. **/ void __hw_addr_ref_unsync_dev(struct netdev_hw_addr_list *list, struct net_device *dev, int (*unsync)(struct net_device *, const unsigned char *, int)) { struct netdev_hw_addr *ha, *tmp; list_for_each_entry_safe(ha, tmp, &list->list, list) { if (!ha->sync_cnt) continue; /* if fails defer unsyncing address */ if (unsync && unsync(dev, ha->addr, ha->sync_cnt)) continue; ha->refcount -= ha->sync_cnt - 1; ha->sync_cnt = 0; __hw_addr_del_entry(list, ha, false, false); } } EXPORT_SYMBOL(__hw_addr_ref_unsync_dev); /** * __hw_addr_unsync_dev - Remove synchronized addresses from device * @list: address list to remove synchronized addresses from * @dev: device to sync * @unsync: function to call if address should be removed * * Remove all addresses that were added to the device by __hw_addr_sync_dev(). * This function is intended to be called from the ndo_stop or ndo_open * functions on devices that require explicit address add/remove * notifications. If the unsync function pointer is NULL then this function * can be used to just reset the sync_cnt for the addresses in the list. **/ void __hw_addr_unsync_dev(struct netdev_hw_addr_list *list, struct net_device *dev, int (*unsync)(struct net_device *, const unsigned char *)) { struct netdev_hw_addr *ha, *tmp; list_for_each_entry_safe(ha, tmp, &list->list, list) { if (!ha->sync_cnt) continue; /* if unsync is defined and fails defer unsyncing address */ if (unsync && unsync(dev, ha->addr)) continue; ha->sync_cnt--; __hw_addr_del_entry(list, ha, false, false); } } EXPORT_SYMBOL(__hw_addr_unsync_dev); static void __hw_addr_flush(struct netdev_hw_addr_list *list) { struct netdev_hw_addr *ha, *tmp; list->tree = RB_ROOT; list_for_each_entry_safe(ha, tmp, &list->list, list) { list_del_rcu(&ha->list); kfree_rcu(ha, rcu_head); } list->count = 0; } void __hw_addr_init(struct netdev_hw_addr_list *list) { INIT_LIST_HEAD(&list->list); list->count = 0; list->tree = RB_ROOT; } EXPORT_SYMBOL(__hw_addr_init); /* * Device addresses handling functions */ /* Check that netdev->dev_addr is not written to directly as this would * break the rbtree layout. All changes should go thru dev_addr_set() and co. * Remove this check in mid-2024. */ void dev_addr_check(struct net_device *dev) { if (!memcmp(dev->dev_addr, dev->dev_addr_shadow, MAX_ADDR_LEN)) return; netdev_warn(dev, "Current addr: %*ph\n", MAX_ADDR_LEN, dev->dev_addr); netdev_warn(dev, "Expected addr: %*ph\n", MAX_ADDR_LEN, dev->dev_addr_shadow); netdev_WARN(dev, "Incorrect netdev->dev_addr\n"); } /** * dev_addr_flush - Flush device address list * @dev: device * * Flush device address list and reset ->dev_addr. * * The caller must hold the rtnl_mutex. */ void dev_addr_flush(struct net_device *dev) { /* rtnl_mutex must be held here */ dev_addr_check(dev); __hw_addr_flush(&dev->dev_addrs); dev->dev_addr = NULL; } /** * dev_addr_init - Init device address list * @dev: device * * Init device address list and create the first element, * used by ->dev_addr. * * The caller must hold the rtnl_mutex. */ int dev_addr_init(struct net_device *dev) { unsigned char addr[MAX_ADDR_LEN]; struct netdev_hw_addr *ha; int err; /* rtnl_mutex must be held here */ __hw_addr_init(&dev->dev_addrs); memset(addr, 0, sizeof(addr)); err = __hw_addr_add(&dev->dev_addrs, addr, sizeof(addr), NETDEV_HW_ADDR_T_LAN); if (!err) { /* * Get the first (previously created) address from the list * and set dev_addr pointer to this location. */ ha = list_first_entry(&dev->dev_addrs.list, struct netdev_hw_addr, list); dev->dev_addr = ha->addr; } return err; } void dev_addr_mod(struct net_device *dev, unsigned int offset, const void *addr, size_t len) { struct netdev_hw_addr *ha; dev_addr_check(dev); ha = container_of(dev->dev_addr, struct netdev_hw_addr, addr[0]); rb_erase(&ha->node, &dev->dev_addrs.tree); memcpy(&ha->addr[offset], addr, len); memcpy(&dev->dev_addr_shadow[offset], addr, len); WARN_ON(__hw_addr_insert(&dev->dev_addrs, ha, dev->addr_len)); } EXPORT_SYMBOL(dev_addr_mod); /** * dev_addr_add - Add a device address * @dev: device * @addr: address to add * @addr_type: address type * * Add a device address to the device or increase the reference count if * it already exists. * * The caller must hold the rtnl_mutex. */ int dev_addr_add(struct net_device *dev, const unsigned char *addr, unsigned char addr_type) { int err; ASSERT_RTNL(); err = netif_pre_changeaddr_notify(dev, addr, NULL); if (err) return err; err = __hw_addr_add(&dev->dev_addrs, addr, dev->addr_len, addr_type); if (!err) call_netdevice_notifiers(NETDEV_CHANGEADDR, dev); return err; } EXPORT_SYMBOL(dev_addr_add); /** * dev_addr_del - Release a device address. * @dev: device * @addr: address to delete * @addr_type: address type * * Release reference to a device address and remove it from the device * if the reference count drops to zero. * * The caller must hold the rtnl_mutex. */ int dev_addr_del(struct net_device *dev, const unsigned char *addr, unsigned char addr_type) { int err; struct netdev_hw_addr *ha; ASSERT_RTNL(); /* * We can not remove the first address from the list because * dev->dev_addr points to that. */ ha = list_first_entry(&dev->dev_addrs.list, struct netdev_hw_addr, list); if (!memcmp(ha->addr, addr, dev->addr_len) && ha->type == addr_type && ha->refcount == 1) return -ENOENT; err = __hw_addr_del(&dev->dev_addrs, addr, dev->addr_len, addr_type); if (!err) call_netdevice_notifiers(NETDEV_CHANGEADDR, dev); return err; } EXPORT_SYMBOL(dev_addr_del); /* * Unicast list handling functions */ /** * dev_uc_add_excl - Add a global secondary unicast address * @dev: device * @addr: address to add */ int dev_uc_add_excl(struct net_device *dev, const unsigned char *addr) { int err; netif_addr_lock_bh(dev); err = __hw_addr_add_ex(&dev->uc, addr, dev->addr_len, NETDEV_HW_ADDR_T_UNICAST, true, false, 0, true); if (!err) __dev_set_rx_mode(dev); netif_addr_unlock_bh(dev); return err; } EXPORT_SYMBOL(dev_uc_add_excl); /** * dev_uc_add - Add a secondary unicast address * @dev: device * @addr: address to add * * Add a secondary unicast address to the device or increase * the reference count if it already exists. */ int dev_uc_add(struct net_device *dev, const unsigned char *addr) { int err; netif_addr_lock_bh(dev); err = __hw_addr_add(&dev->uc, addr, dev->addr_len, NETDEV_HW_ADDR_T_UNICAST); if (!err) __dev_set_rx_mode(dev); netif_addr_unlock_bh(dev); return err; } EXPORT_SYMBOL(dev_uc_add); /** * dev_uc_del - Release secondary unicast address. * @dev: device * @addr: address to delete * * Release reference to a secondary unicast address and remove it * from the device if the reference count drops to zero. */ int dev_uc_del(struct net_device *dev, const unsigned char *addr) { int err; netif_addr_lock_bh(dev); err = __hw_addr_del(&dev->uc, addr, dev->addr_len, NETDEV_HW_ADDR_T_UNICAST); if (!err) __dev_set_rx_mode(dev); netif_addr_unlock_bh(dev); return err; } EXPORT_SYMBOL(dev_uc_del); /** * dev_uc_sync - Synchronize device's unicast list to another device * @to: destination device * @from: source device * * Add newly added addresses to the destination device and release * addresses that have no users left. The source device must be * locked by netif_addr_lock_bh. * * This function is intended to be called from the dev->set_rx_mode * function of layered software devices. This function assumes that * addresses will only ever be synced to the @to devices and no other. */ int dev_uc_sync(struct net_device *to, struct net_device *from) { int err = 0; if (to->addr_len != from->addr_len) return -EINVAL; netif_addr_lock(to); err = __hw_addr_sync(&to->uc, &from->uc, to->addr_len); if (!err) __dev_set_rx_mode(to); netif_addr_unlock(to); return err; } EXPORT_SYMBOL(dev_uc_sync); /** * dev_uc_sync_multiple - Synchronize device's unicast list to another * device, but allow for multiple calls to sync to multiple devices. * @to: destination device * @from: source device * * Add newly added addresses to the destination device and release * addresses that have been deleted from the source. The source device * must be locked by netif_addr_lock_bh. * * This function is intended to be called from the dev->set_rx_mode * function of layered software devices. It allows for a single source * device to be synced to multiple destination devices. */ int dev_uc_sync_multiple(struct net_device *to, struct net_device *from) { int err = 0; if (to->addr_len != from->addr_len) return -EINVAL; netif_addr_lock(to); err = __hw_addr_sync_multiple(&to->uc, &from->uc, to->addr_len); if (!err) __dev_set_rx_mode(to); netif_addr_unlock(to); return err; } EXPORT_SYMBOL(dev_uc_sync_multiple); /** * dev_uc_unsync - Remove synchronized addresses from the destination device * @to: destination device * @from: source device * * Remove all addresses that were added to the destination device by * dev_uc_sync(). This function is intended to be called from the * dev->stop function of layered software devices. */ void dev_uc_unsync(struct net_device *to, struct net_device *from) { if (to->addr_len != from->addr_len) return; /* netif_addr_lock_bh() uses lockdep subclass 0, this is okay for two * reasons: * 1) This is always called without any addr_list_lock, so as the * outermost one here, it must be 0. * 2) This is called by some callers after unlinking the upper device, * so the dev->lower_level becomes 1 again. * Therefore, the subclass for 'from' is 0, for 'to' is either 1 or * larger. */ netif_addr_lock_bh(from); netif_addr_lock(to); __hw_addr_unsync(&to->uc, &from->uc, to->addr_len); __dev_set_rx_mode(to); netif_addr_unlock(to); netif_addr_unlock_bh(from); } EXPORT_SYMBOL(dev_uc_unsync); /** * dev_uc_flush - Flush unicast addresses * @dev: device * * Flush unicast addresses. */ void dev_uc_flush(struct net_device *dev) { netif_addr_lock_bh(dev); __hw_addr_flush(&dev->uc); netif_addr_unlock_bh(dev); } EXPORT_SYMBOL(dev_uc_flush); /** * dev_uc_init - Init unicast address list * @dev: device * * Init unicast address list. */ void dev_uc_init(struct net_device *dev) { __hw_addr_init(&dev->uc); } EXPORT_SYMBOL(dev_uc_init); /* * Multicast list handling functions */ /** * dev_mc_add_excl - Add a global secondary multicast address * @dev: device * @addr: address to add */ int dev_mc_add_excl(struct net_device *dev, const unsigned char *addr) { int err; netif_addr_lock_bh(dev); err = __hw_addr_add_ex(&dev->mc, addr, dev->addr_len, NETDEV_HW_ADDR_T_MULTICAST, true, false, 0, true); if (!err) __dev_set_rx_mode(dev); netif_addr_unlock_bh(dev); return err; } EXPORT_SYMBOL(dev_mc_add_excl); static int __dev_mc_add(struct net_device *dev, const unsigned char *addr, bool global) { int err; netif_addr_lock_bh(dev); err = __hw_addr_add_ex(&dev->mc, addr, dev->addr_len, NETDEV_HW_ADDR_T_MULTICAST, global, false, 0, false); if (!err) __dev_set_rx_mode(dev); netif_addr_unlock_bh(dev); return err; } /** * dev_mc_add - Add a multicast address * @dev: device * @addr: address to add * * Add a multicast address to the device or increase * the reference count if it already exists. */ int dev_mc_add(struct net_device *dev, const unsigned char *addr) { return __dev_mc_add(dev, addr, false); } EXPORT_SYMBOL(dev_mc_add); /** * dev_mc_add_global - Add a global multicast address * @dev: device * @addr: address to add * * Add a global multicast address to the device. */ int dev_mc_add_global(struct net_device *dev, const unsigned char *addr) { return __dev_mc_add(dev, addr, true); } EXPORT_SYMBOL(dev_mc_add_global); static int __dev_mc_del(struct net_device *dev, const unsigned char *addr, bool global) { int err; netif_addr_lock_bh(dev); err = __hw_addr_del_ex(&dev->mc, addr, dev->addr_len, NETDEV_HW_ADDR_T_MULTICAST, global, false); if (!err) __dev_set_rx_mode(dev); netif_addr_unlock_bh(dev); return err; } /** * dev_mc_del - Delete a multicast address. * @dev: device * @addr: address to delete * * Release reference to a multicast address and remove it * from the device if the reference count drops to zero. */ int dev_mc_del(struct net_device *dev, const unsigned char *addr) { return __dev_mc_del(dev, addr, false); } EXPORT_SYMBOL(dev_mc_del); /** * dev_mc_del_global - Delete a global multicast address. * @dev: device * @addr: address to delete * * Release reference to a multicast address and remove it * from the device if the reference count drops to zero. */ int dev_mc_del_global(struct net_device *dev, const unsigned char *addr) { return __dev_mc_del(dev, addr, true); } EXPORT_SYMBOL(dev_mc_del_global); /** * dev_mc_sync - Synchronize device's multicast list to another device * @to: destination device * @from: source device * * Add newly added addresses to the destination device and release * addresses that have no users left. The source device must be * locked by netif_addr_lock_bh. * * This function is intended to be called from the ndo_set_rx_mode * function of layered software devices. */ int dev_mc_sync(struct net_device *to, struct net_device *from) { int err = 0; if (to->addr_len != from->addr_len) return -EINVAL; netif_addr_lock(to); err = __hw_addr_sync(&to->mc, &from->mc, to->addr_len); if (!err) __dev_set_rx_mode(to); netif_addr_unlock(to); return err; } EXPORT_SYMBOL(dev_mc_sync); /** * dev_mc_sync_multiple - Synchronize device's multicast list to another * device, but allow for multiple calls to sync to multiple devices. * @to: destination device * @from: source device * * Add newly added addresses to the destination device and release * addresses that have no users left. The source device must be * locked by netif_addr_lock_bh. * * This function is intended to be called from the ndo_set_rx_mode * function of layered software devices. It allows for a single * source device to be synced to multiple destination devices. */ int dev_mc_sync_multiple(struct net_device *to, struct net_device *from) { int err = 0; if (to->addr_len != from->addr_len) return -EINVAL; netif_addr_lock(to); err = __hw_addr_sync_multiple(&to->mc, &from->mc, to->addr_len); if (!err) __dev_set_rx_mode(to); netif_addr_unlock(to); return err; } EXPORT_SYMBOL(dev_mc_sync_multiple); /** * dev_mc_unsync - Remove synchronized addresses from the destination device * @to: destination device * @from: source device * * Remove all addresses that were added to the destination device by * dev_mc_sync(). This function is intended to be called from the * dev->stop function of layered software devices. */ void dev_mc_unsync(struct net_device *to, struct net_device *from) { if (to->addr_len != from->addr_len) return; /* See the above comments inside dev_uc_unsync(). */ netif_addr_lock_bh(from); netif_addr_lock(to); __hw_addr_unsync(&to->mc, &from->mc, to->addr_len); __dev_set_rx_mode(to); netif_addr_unlock(to); netif_addr_unlock_bh(from); } EXPORT_SYMBOL(dev_mc_unsync); /** * dev_mc_flush - Flush multicast addresses * @dev: device * * Flush multicast addresses. */ void dev_mc_flush(struct net_device *dev) { netif_addr_lock_bh(dev); __hw_addr_flush(&dev->mc); netif_addr_unlock_bh(dev); } EXPORT_SYMBOL(dev_mc_flush); /** * dev_mc_init - Init multicast address list * @dev: device * * Init multicast address list. */ void dev_mc_init(struct net_device *dev) { __hw_addr_init(&dev->mc); } EXPORT_SYMBOL(dev_mc_init); |
| 25 25 25 25 25 25 25 25 25 25 25 25 25 25 25 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 | // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2007 * * Author: Eric Biederman <ebiederm@xmision.com> */ #include <linux/module.h> #include <linux/ipc.h> #include <linux/nsproxy.h> #include <linux/sysctl.h> #include <linux/uaccess.h> #include <linux/capability.h> #include <linux/ipc_namespace.h> #include <linux/msg.h> #include <linux/slab.h> #include <linux/cred.h> #include "util.h" static int proc_ipc_dointvec_minmax_orphans(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct ipc_namespace *ns = container_of(table->data, struct ipc_namespace, shm_rmid_forced); int err; err = proc_dointvec_minmax(table, write, buffer, lenp, ppos); if (err < 0) return err; if (ns->shm_rmid_forced) shm_destroy_orphaned(ns); return err; } static int proc_ipc_auto_msgmni(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table ipc_table; int dummy = 0; memcpy(&ipc_table, table, sizeof(ipc_table)); ipc_table.data = &dummy; if (write) pr_info_once("writing to auto_msgmni has no effect"); return proc_dointvec_minmax(&ipc_table, write, buffer, lenp, ppos); } static int proc_ipc_sem_dointvec(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct ipc_namespace *ns = container_of(table->data, struct ipc_namespace, sem_ctls); int ret, semmni; semmni = ns->sem_ctls[3]; ret = proc_dointvec(table, write, buffer, lenp, ppos); if (!ret) ret = sem_check_semmni(ns); /* * Reset the semmni value if an error happens. */ if (ret) ns->sem_ctls[3] = semmni; return ret; } int ipc_mni = IPCMNI; int ipc_mni_shift = IPCMNI_SHIFT; int ipc_min_cycle = RADIX_TREE_MAP_SIZE; static const struct ctl_table ipc_sysctls[] = { { .procname = "shmmax", .data = &init_ipc_ns.shm_ctlmax, .maxlen = sizeof(init_ipc_ns.shm_ctlmax), .mode = 0644, .proc_handler = proc_doulongvec_minmax, }, { .procname = "shmall", .data = &init_ipc_ns.shm_ctlall, .maxlen = sizeof(init_ipc_ns.shm_ctlall), .mode = 0644, .proc_handler = proc_doulongvec_minmax, }, { .procname = "shmmni", .data = &init_ipc_ns.shm_ctlmni, .maxlen = sizeof(init_ipc_ns.shm_ctlmni), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = &ipc_mni, }, { .procname = "shm_rmid_forced", .data = &init_ipc_ns.shm_rmid_forced, .maxlen = sizeof(init_ipc_ns.shm_rmid_forced), .mode = 0644, .proc_handler = proc_ipc_dointvec_minmax_orphans, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, { .procname = "msgmax", .data = &init_ipc_ns.msg_ctlmax, .maxlen = sizeof(init_ipc_ns.msg_ctlmax), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_INT_MAX, }, { .procname = "msgmni", .data = &init_ipc_ns.msg_ctlmni, .maxlen = sizeof(init_ipc_ns.msg_ctlmni), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = &ipc_mni, }, { .procname = "auto_msgmni", .data = NULL, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_ipc_auto_msgmni, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, { .procname = "msgmnb", .data = &init_ipc_ns.msg_ctlmnb, .maxlen = sizeof(init_ipc_ns.msg_ctlmnb), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_INT_MAX, }, { .procname = "sem", .data = &init_ipc_ns.sem_ctls, .maxlen = 4*sizeof(int), .mode = 0644, .proc_handler = proc_ipc_sem_dointvec, }, #ifdef CONFIG_CHECKPOINT_RESTORE { .procname = "sem_next_id", .data = &init_ipc_ns.ids[IPC_SEM_IDS].next_id, .maxlen = sizeof(init_ipc_ns.ids[IPC_SEM_IDS].next_id), .mode = 0444, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_INT_MAX, }, { .procname = "msg_next_id", .data = &init_ipc_ns.ids[IPC_MSG_IDS].next_id, .maxlen = sizeof(init_ipc_ns.ids[IPC_MSG_IDS].next_id), .mode = 0444, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_INT_MAX, }, { .procname = "shm_next_id", .data = &init_ipc_ns.ids[IPC_SHM_IDS].next_id, .maxlen = sizeof(init_ipc_ns.ids[IPC_SHM_IDS].next_id), .mode = 0444, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_INT_MAX, }, #endif }; static struct ctl_table_set *set_lookup(struct ctl_table_root *root) { return ¤t->nsproxy->ipc_ns->ipc_set; } static int set_is_seen(struct ctl_table_set *set) { return ¤t->nsproxy->ipc_ns->ipc_set == set; } static void ipc_set_ownership(struct ctl_table_header *head, kuid_t *uid, kgid_t *gid) { struct ipc_namespace *ns = container_of(head->set, struct ipc_namespace, ipc_set); kuid_t ns_root_uid = make_kuid(ns->user_ns, 0); kgid_t ns_root_gid = make_kgid(ns->user_ns, 0); *uid = uid_valid(ns_root_uid) ? ns_root_uid : GLOBAL_ROOT_UID; *gid = gid_valid(ns_root_gid) ? ns_root_gid : GLOBAL_ROOT_GID; } static int ipc_permissions(struct ctl_table_header *head, const struct ctl_table *table) { int mode = table->mode; #ifdef CONFIG_CHECKPOINT_RESTORE struct ipc_namespace *ns = container_of(head->set, struct ipc_namespace, ipc_set); if (((table->data == &ns->ids[IPC_SEM_IDS].next_id) || (table->data == &ns->ids[IPC_MSG_IDS].next_id) || (table->data == &ns->ids[IPC_SHM_IDS].next_id)) && checkpoint_restore_ns_capable_noaudit(ns->user_ns)) mode = 0666; else #endif { kuid_t ns_root_uid; kgid_t ns_root_gid; ipc_set_ownership(head, &ns_root_uid, &ns_root_gid); if (uid_eq(current_euid(), ns_root_uid)) mode >>= 6; else if (in_egroup_p(ns_root_gid)) mode >>= 3; } mode &= 7; return (mode << 6) | (mode << 3) | mode; } static struct ctl_table_root set_root = { .lookup = set_lookup, .permissions = ipc_permissions, .set_ownership = ipc_set_ownership, }; bool setup_ipc_sysctls(struct ipc_namespace *ns) { struct ctl_table *tbl; setup_sysctl_set(&ns->ipc_set, &set_root, set_is_seen); tbl = kmemdup(ipc_sysctls, sizeof(ipc_sysctls), GFP_KERNEL); if (tbl) { int i; for (i = 0; i < ARRAY_SIZE(ipc_sysctls); i++) { if (tbl[i].data == &init_ipc_ns.shm_ctlmax) tbl[i].data = &ns->shm_ctlmax; else if (tbl[i].data == &init_ipc_ns.shm_ctlall) tbl[i].data = &ns->shm_ctlall; else if (tbl[i].data == &init_ipc_ns.shm_ctlmni) tbl[i].data = &ns->shm_ctlmni; else if (tbl[i].data == &init_ipc_ns.shm_rmid_forced) tbl[i].data = &ns->shm_rmid_forced; else if (tbl[i].data == &init_ipc_ns.msg_ctlmax) tbl[i].data = &ns->msg_ctlmax; else if (tbl[i].data == &init_ipc_ns.msg_ctlmni) tbl[i].data = &ns->msg_ctlmni; else if (tbl[i].data == &init_ipc_ns.msg_ctlmnb) tbl[i].data = &ns->msg_ctlmnb; else if (tbl[i].data == &init_ipc_ns.sem_ctls) tbl[i].data = &ns->sem_ctls; #ifdef CONFIG_CHECKPOINT_RESTORE else if (tbl[i].data == &init_ipc_ns.ids[IPC_SEM_IDS].next_id) tbl[i].data = &ns->ids[IPC_SEM_IDS].next_id; else if (tbl[i].data == &init_ipc_ns.ids[IPC_MSG_IDS].next_id) tbl[i].data = &ns->ids[IPC_MSG_IDS].next_id; else if (tbl[i].data == &init_ipc_ns.ids[IPC_SHM_IDS].next_id) tbl[i].data = &ns->ids[IPC_SHM_IDS].next_id; #endif else tbl[i].data = NULL; } ns->ipc_sysctls = __register_sysctl_table(&ns->ipc_set, "kernel", tbl, ARRAY_SIZE(ipc_sysctls)); } if (!ns->ipc_sysctls) { kfree(tbl); retire_sysctl_set(&ns->ipc_set); return false; } return true; } void retire_ipc_sysctls(struct ipc_namespace *ns) { const struct ctl_table *tbl; tbl = ns->ipc_sysctls->ctl_table_arg; unregister_sysctl_table(ns->ipc_sysctls); retire_sysctl_set(&ns->ipc_set); kfree(tbl); } static int __init ipc_sysctl_init(void) { if (!setup_ipc_sysctls(&init_ipc_ns)) { pr_warn("ipc sysctl registration failed\n"); return -ENOMEM; } return 0; } device_initcall(ipc_sysctl_init); static int __init ipc_mni_extend(char *str) { ipc_mni = IPCMNI_EXTEND; ipc_mni_shift = IPCMNI_EXTEND_SHIFT; ipc_min_cycle = IPCMNI_EXTEND_MIN_CYCLE; pr_info("IPCMNI extended to %d.\n", ipc_mni); return 0; } early_param("ipcmni_extend", ipc_mni_extend); |
| 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 | // SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000,2005 Silicon Graphics, Inc. * All Rights Reserved. */ #ifndef __XFS_EXTFREE_ITEM_H__ #define __XFS_EXTFREE_ITEM_H__ /* kernel only EFI/EFD definitions */ struct xfs_mount; struct kmem_cache; /* * Max number of extents in fast allocation path. */ #define XFS_EFI_MAX_FAST_EXTENTS 16 /* * This is the "extent free intention" log item. It is used to log the fact * that some extents need to be free. It is used in conjunction with the * "extent free done" log item described below. * * The EFI is reference counted so that it is not freed prior to both the EFI * and EFD being committed and unpinned. This ensures the EFI is inserted into * the AIL even in the event of out of order EFI/EFD processing. In other words, * an EFI is born with two references: * * 1.) an EFI held reference to track EFI AIL insertion * 2.) an EFD held reference to track EFD commit * * On allocation, both references are the responsibility of the caller. Once the * EFI is added to and dirtied in a transaction, ownership of reference one * transfers to the transaction. The reference is dropped once the EFI is * inserted to the AIL or in the event of failure along the way (e.g., commit * failure, log I/O error, etc.). Note that the caller remains responsible for * the EFD reference under all circumstances to this point. The caller has no * means to detect failure once the transaction is committed, however. * Therefore, an EFD is required after this point, even in the event of * unrelated failure. * * Once an EFD is allocated and dirtied in a transaction, reference two * transfers to the transaction. The EFD reference is dropped once it reaches * the unpin handler. Similar to the EFI, the reference also drops in the event * of commit failure or log I/O errors. Note that the EFD is not inserted in the * AIL, so at this point both the EFI and EFD are freed. */ struct xfs_efi_log_item { struct xfs_log_item efi_item; atomic_t efi_refcount; atomic_t efi_next_extent; struct xfs_efi_log_format efi_format; }; static inline size_t xfs_efi_log_item_sizeof( unsigned int nr) { return offsetof(struct xfs_efi_log_item, efi_format) + xfs_efi_log_format_sizeof(nr); } /* * This is the "extent free done" log item. It is used to log * the fact that some extents earlier mentioned in an efi item * have been freed. */ struct xfs_efd_log_item { struct xfs_log_item efd_item; struct xfs_efi_log_item *efd_efip; uint efd_next_extent; struct xfs_efd_log_format efd_format; }; static inline size_t xfs_efd_log_item_sizeof( unsigned int nr) { return offsetof(struct xfs_efd_log_item, efd_format) + xfs_efd_log_format_sizeof(nr); } /* * Max number of extents in fast allocation path. */ #define XFS_EFD_MAX_FAST_EXTENTS 16 extern struct kmem_cache *xfs_efi_cache; extern struct kmem_cache *xfs_efd_cache; struct xfs_extent_free_item; void xfs_extent_free_defer_add(struct xfs_trans *tp, struct xfs_extent_free_item *xefi, struct xfs_defer_pending **dfpp); unsigned int xfs_efi_log_space(unsigned int nr); unsigned int xfs_efd_log_space(unsigned int nr); #endif /* __XFS_EXTFREE_ITEM_H__ */ |
| 58 23 22 36 22 22 23 42 42 42 42 7 6 7 8 8 8 8 8 8 8 8 8 8 6 6 6 6 6 6 8 2 6 13 13 17 22 4 20 20 20 20 20 20 20 20 20 20 5 5 5 5 5 5 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 1 17 16 6 1 1 1 1 1 1 1 2 45 45 45 29 16 45 20 5 5 5 5 5 5 5 20 20 6 5 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 | /* * Copyright (c) 2004 Mellanox Technologies Ltd. All rights reserved. * Copyright (c) 2004 Infinicon Corporation. All rights reserved. * Copyright (c) 2004 Intel Corporation. All rights reserved. * Copyright (c) 2004 Topspin Corporation. All rights reserved. * Copyright (c) 2004 Voltaire Corporation. All rights reserved. * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. * Copyright (c) 2005, 2006 Cisco Systems. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #include <linux/errno.h> #include <linux/err.h> #include <linux/export.h> #include <linux/string.h> #include <linux/slab.h> #include <linux/in.h> #include <linux/in6.h> #include <net/addrconf.h> #include <linux/security.h> #include <rdma/ib_verbs.h> #include <rdma/ib_cache.h> #include <rdma/ib_addr.h> #include <rdma/rw.h> #include <rdma/lag.h> #include "core_priv.h" #include <trace/events/rdma_core.h> static int ib_resolve_eth_dmac(struct ib_device *device, struct rdma_ah_attr *ah_attr); static const char * const ib_events[] = { [IB_EVENT_CQ_ERR] = "CQ error", [IB_EVENT_QP_FATAL] = "QP fatal error", [IB_EVENT_QP_REQ_ERR] = "QP request error", [IB_EVENT_QP_ACCESS_ERR] = "QP access error", [IB_EVENT_COMM_EST] = "communication established", [IB_EVENT_SQ_DRAINED] = "send queue drained", [IB_EVENT_PATH_MIG] = "path migration successful", [IB_EVENT_PATH_MIG_ERR] = "path migration error", [IB_EVENT_DEVICE_FATAL] = "device fatal error", [IB_EVENT_PORT_ACTIVE] = "port active", [IB_EVENT_PORT_ERR] = "port error", [IB_EVENT_LID_CHANGE] = "LID change", [IB_EVENT_PKEY_CHANGE] = "P_key change", [IB_EVENT_SM_CHANGE] = "SM change", [IB_EVENT_SRQ_ERR] = "SRQ error", [IB_EVENT_SRQ_LIMIT_REACHED] = "SRQ limit reached", [IB_EVENT_QP_LAST_WQE_REACHED] = "last WQE reached", [IB_EVENT_CLIENT_REREGISTER] = "client reregister", [IB_EVENT_GID_CHANGE] = "GID changed", [IB_EVENT_DEVICE_SPEED_CHANGE] = "device speed change" }; const char *__attribute_const__ ib_event_msg(enum ib_event_type event) { size_t index = event; return (index < ARRAY_SIZE(ib_events) && ib_events[index]) ? ib_events[index] : "unrecognized event"; } EXPORT_SYMBOL(ib_event_msg); static const char * const wc_statuses[] = { [IB_WC_SUCCESS] = "success", [IB_WC_LOC_LEN_ERR] = "local length error", [IB_WC_LOC_QP_OP_ERR] = "local QP operation error", [IB_WC_LOC_EEC_OP_ERR] = "local EE context operation error", [IB_WC_LOC_PROT_ERR] = "local protection error", [IB_WC_WR_FLUSH_ERR] = "WR flushed", [IB_WC_MW_BIND_ERR] = "memory bind operation error", [IB_WC_BAD_RESP_ERR] = "bad response error", [IB_WC_LOC_ACCESS_ERR] = "local access error", [IB_WC_REM_INV_REQ_ERR] = "remote invalid request error", [IB_WC_REM_ACCESS_ERR] = "remote access error", [IB_WC_REM_OP_ERR] = "remote operation error", [IB_WC_RETRY_EXC_ERR] = "transport retry counter exceeded", [IB_WC_RNR_RETRY_EXC_ERR] = "RNR retry counter exceeded", [IB_WC_LOC_RDD_VIOL_ERR] = "local RDD violation error", [IB_WC_REM_INV_RD_REQ_ERR] = "remote invalid RD request", [IB_WC_REM_ABORT_ERR] = "operation aborted", [IB_WC_INV_EECN_ERR] = "invalid EE context number", [IB_WC_INV_EEC_STATE_ERR] = "invalid EE context state", [IB_WC_FATAL_ERR] = "fatal error", [IB_WC_RESP_TIMEOUT_ERR] = "response timeout error", [IB_WC_GENERAL_ERR] = "general error", }; const char *__attribute_const__ ib_wc_status_msg(enum ib_wc_status status) { size_t index = status; return (index < ARRAY_SIZE(wc_statuses) && wc_statuses[index]) ? wc_statuses[index] : "unrecognized status"; } EXPORT_SYMBOL(ib_wc_status_msg); __attribute_const__ int ib_rate_to_mult(enum ib_rate rate) { switch (rate) { case IB_RATE_2_5_GBPS: return 1; case IB_RATE_5_GBPS: return 2; case IB_RATE_10_GBPS: return 4; case IB_RATE_20_GBPS: return 8; case IB_RATE_30_GBPS: return 12; case IB_RATE_40_GBPS: return 16; case IB_RATE_60_GBPS: return 24; case IB_RATE_80_GBPS: return 32; case IB_RATE_120_GBPS: return 48; case IB_RATE_14_GBPS: return 6; case IB_RATE_56_GBPS: return 22; case IB_RATE_112_GBPS: return 45; case IB_RATE_168_GBPS: return 67; case IB_RATE_25_GBPS: return 10; case IB_RATE_100_GBPS: return 40; case IB_RATE_200_GBPS: return 80; case IB_RATE_300_GBPS: return 120; case IB_RATE_28_GBPS: return 11; case IB_RATE_50_GBPS: return 20; case IB_RATE_400_GBPS: return 160; case IB_RATE_600_GBPS: return 240; case IB_RATE_800_GBPS: return 320; case IB_RATE_1600_GBPS: return 640; default: return -1; } } EXPORT_SYMBOL(ib_rate_to_mult); __attribute_const__ enum ib_rate mult_to_ib_rate(int mult) { switch (mult) { case 1: return IB_RATE_2_5_GBPS; case 2: return IB_RATE_5_GBPS; case 4: return IB_RATE_10_GBPS; case 8: return IB_RATE_20_GBPS; case 12: return IB_RATE_30_GBPS; case 16: return IB_RATE_40_GBPS; case 24: return IB_RATE_60_GBPS; case 32: return IB_RATE_80_GBPS; case 48: return IB_RATE_120_GBPS; case 6: return IB_RATE_14_GBPS; case 22: return IB_RATE_56_GBPS; case 45: return IB_RATE_112_GBPS; case 67: return IB_RATE_168_GBPS; case 10: return IB_RATE_25_GBPS; case 40: return IB_RATE_100_GBPS; case 80: return IB_RATE_200_GBPS; case 120: return IB_RATE_300_GBPS; case 11: return IB_RATE_28_GBPS; case 20: return IB_RATE_50_GBPS; case 160: return IB_RATE_400_GBPS; case 240: return IB_RATE_600_GBPS; case 320: return IB_RATE_800_GBPS; case 640: return IB_RATE_1600_GBPS; default: return IB_RATE_PORT_CURRENT; } } EXPORT_SYMBOL(mult_to_ib_rate); __attribute_const__ int ib_rate_to_mbps(enum ib_rate rate) { switch (rate) { case IB_RATE_2_5_GBPS: return 2500; case IB_RATE_5_GBPS: return 5000; case IB_RATE_10_GBPS: return 10000; case IB_RATE_20_GBPS: return 20000; case IB_RATE_30_GBPS: return 30000; case IB_RATE_40_GBPS: return 40000; case IB_RATE_60_GBPS: return 60000; case IB_RATE_80_GBPS: return 80000; case IB_RATE_120_GBPS: return 120000; case IB_RATE_14_GBPS: return 14062; case IB_RATE_56_GBPS: return 56250; case IB_RATE_112_GBPS: return 112500; case IB_RATE_168_GBPS: return 168750; case IB_RATE_25_GBPS: return 25781; case IB_RATE_100_GBPS: return 103125; case IB_RATE_200_GBPS: return 206250; case IB_RATE_300_GBPS: return 309375; case IB_RATE_28_GBPS: return 28125; case IB_RATE_50_GBPS: return 53125; case IB_RATE_400_GBPS: return 425000; case IB_RATE_600_GBPS: return 637500; case IB_RATE_800_GBPS: return 850000; case IB_RATE_1600_GBPS: return 1700000; default: return -1; } } EXPORT_SYMBOL(ib_rate_to_mbps); struct ib_speed_attr { const char *str; int speed; }; #define IB_SPEED_ATTR(speed_type, _str, _speed) \ [speed_type] = {.str = _str, .speed = _speed} static const struct ib_speed_attr ib_speed_attrs[] = { IB_SPEED_ATTR(IB_SPEED_SDR, " SDR", 25), IB_SPEED_ATTR(IB_SPEED_DDR, " DDR", 50), IB_SPEED_ATTR(IB_SPEED_QDR, " QDR", 100), IB_SPEED_ATTR(IB_SPEED_FDR10, " FDR10", 100), IB_SPEED_ATTR(IB_SPEED_FDR, " FDR", 140), IB_SPEED_ATTR(IB_SPEED_EDR, " EDR", 250), IB_SPEED_ATTR(IB_SPEED_HDR, " HDR", 500), IB_SPEED_ATTR(IB_SPEED_NDR, " NDR", 1000), IB_SPEED_ATTR(IB_SPEED_XDR, " XDR", 2000), }; int ib_port_attr_to_speed_info(struct ib_port_attr *attr, struct ib_port_speed_info *speed_info) { int speed_idx = attr->active_speed; switch (attr->active_speed) { case IB_SPEED_DDR: case IB_SPEED_QDR: case IB_SPEED_FDR10: case IB_SPEED_FDR: case IB_SPEED_EDR: case IB_SPEED_HDR: case IB_SPEED_NDR: case IB_SPEED_XDR: case IB_SPEED_SDR: break; default: speed_idx = IB_SPEED_SDR; /* Default to SDR for invalid rates */ break; } speed_info->str = ib_speed_attrs[speed_idx].str; speed_info->rate = ib_speed_attrs[speed_idx].speed; speed_info->rate *= ib_width_enum_to_int(attr->active_width); if (speed_info->rate < 0) return -EINVAL; return 0; } EXPORT_SYMBOL(ib_port_attr_to_speed_info); __attribute_const__ enum rdma_transport_type rdma_node_get_transport(unsigned int node_type) { if (node_type == RDMA_NODE_USNIC) return RDMA_TRANSPORT_USNIC; if (node_type == RDMA_NODE_USNIC_UDP) return RDMA_TRANSPORT_USNIC_UDP; if (node_type == RDMA_NODE_RNIC) return RDMA_TRANSPORT_IWARP; if (node_type == RDMA_NODE_UNSPECIFIED) return RDMA_TRANSPORT_UNSPECIFIED; return RDMA_TRANSPORT_IB; } EXPORT_SYMBOL(rdma_node_get_transport); enum rdma_link_layer rdma_port_get_link_layer(struct ib_device *device, u32 port_num) { enum rdma_transport_type lt; if (device->ops.get_link_layer) return device->ops.get_link_layer(device, port_num); lt = rdma_node_get_transport(device->node_type); if (lt == RDMA_TRANSPORT_IB) return IB_LINK_LAYER_INFINIBAND; return IB_LINK_LAYER_ETHERNET; } EXPORT_SYMBOL(rdma_port_get_link_layer); /* Protection domains */ /** * __ib_alloc_pd - Allocates an unused protection domain. * @device: The device on which to allocate the protection domain. * @flags: protection domain flags * @caller: caller's build-time module name * * A protection domain object provides an association between QPs, shared * receive queues, address handles, memory regions, and memory windows. * * Every PD has a local_dma_lkey which can be used as the lkey value for local * memory operations. */ struct ib_pd *__ib_alloc_pd(struct ib_device *device, unsigned int flags, const char *caller) { struct ib_pd *pd; int mr_access_flags = 0; int ret; pd = rdma_zalloc_drv_obj(device, ib_pd); if (!pd) return ERR_PTR(-ENOMEM); pd->device = device; pd->flags = flags; rdma_restrack_new(&pd->res, RDMA_RESTRACK_PD); rdma_restrack_set_name(&pd->res, caller); ret = device->ops.alloc_pd(pd, NULL); if (ret) { rdma_restrack_put(&pd->res); kfree(pd); return ERR_PTR(ret); } rdma_restrack_add(&pd->res); if (device->attrs.kernel_cap_flags & IBK_LOCAL_DMA_LKEY) pd->local_dma_lkey = device->local_dma_lkey; else mr_access_flags |= IB_ACCESS_LOCAL_WRITE; if (flags & IB_PD_UNSAFE_GLOBAL_RKEY) { pr_warn("%s: enabling unsafe global rkey\n", caller); mr_access_flags |= IB_ACCESS_REMOTE_READ | IB_ACCESS_REMOTE_WRITE; } if (mr_access_flags) { struct ib_mr *mr; mr = pd->device->ops.get_dma_mr(pd, mr_access_flags); if (IS_ERR(mr)) { ib_dealloc_pd(pd); return ERR_CAST(mr); } mr->device = pd->device; mr->pd = pd; mr->type = IB_MR_TYPE_DMA; mr->uobject = NULL; mr->need_inval = false; pd->__internal_mr = mr; if (!(device->attrs.kernel_cap_flags & IBK_LOCAL_DMA_LKEY)) pd->local_dma_lkey = pd->__internal_mr->lkey; if (flags & IB_PD_UNSAFE_GLOBAL_RKEY) pd->unsafe_global_rkey = pd->__internal_mr->rkey; } return pd; } EXPORT_SYMBOL(__ib_alloc_pd); /** * ib_dealloc_pd_user - Deallocates a protection domain. * @pd: The protection domain to deallocate. * @udata: Valid user data or NULL for kernel object * * It is an error to call this function while any resources in the pd still * exist. The caller is responsible to synchronously destroy them and * guarantee no new allocations will happen. */ int ib_dealloc_pd_user(struct ib_pd *pd, struct ib_udata *udata) { int ret; if (pd->__internal_mr) { ret = pd->device->ops.dereg_mr(pd->__internal_mr, NULL); WARN_ON(ret); pd->__internal_mr = NULL; } ret = pd->device->ops.dealloc_pd(pd, udata); if (ret) return ret; rdma_restrack_del(&pd->res); kfree(pd); return ret; } EXPORT_SYMBOL(ib_dealloc_pd_user); /* Address handles */ /** * rdma_copy_ah_attr - Copy rdma ah attribute from source to destination. * @dest: Pointer to destination ah_attr. Contents of the destination * pointer is assumed to be invalid and attribute are overwritten. * @src: Pointer to source ah_attr. */ void rdma_copy_ah_attr(struct rdma_ah_attr *dest, const struct rdma_ah_attr *src) { *dest = *src; if (dest->grh.sgid_attr) rdma_hold_gid_attr(dest->grh.sgid_attr); } EXPORT_SYMBOL(rdma_copy_ah_attr); /** * rdma_replace_ah_attr - Replace valid ah_attr with new one. * @old: Pointer to existing ah_attr which needs to be replaced. * old is assumed to be valid or zero'd * @new: Pointer to the new ah_attr. * * rdma_replace_ah_attr() first releases any reference in the old ah_attr if * old the ah_attr is valid; after that it copies the new attribute and holds * the reference to the replaced ah_attr. */ void rdma_replace_ah_attr(struct rdma_ah_attr *old, const struct rdma_ah_attr *new) { rdma_destroy_ah_attr(old); *old = *new; if (old->grh.sgid_attr) rdma_hold_gid_attr(old->grh.sgid_attr); } EXPORT_SYMBOL(rdma_replace_ah_attr); /** * rdma_move_ah_attr - Move ah_attr pointed by source to destination. * @dest: Pointer to destination ah_attr to copy to. * dest is assumed to be valid or zero'd * @src: Pointer to the new ah_attr. * * rdma_move_ah_attr() first releases any reference in the destination ah_attr * if it is valid. This also transfers ownership of internal references from * src to dest, making src invalid in the process. No new reference of the src * ah_attr is taken. */ void rdma_move_ah_attr(struct rdma_ah_attr *dest, struct rdma_ah_attr *src) { rdma_destroy_ah_attr(dest); *dest = *src; src->grh.sgid_attr = NULL; } EXPORT_SYMBOL(rdma_move_ah_attr); /* * Validate that the rdma_ah_attr is valid for the device before passing it * off to the driver. */ static int rdma_check_ah_attr(struct ib_device *device, struct rdma_ah_attr *ah_attr) { if (!rdma_is_port_valid(device, ah_attr->port_num)) return -EINVAL; if ((rdma_is_grh_required(device, ah_attr->port_num) || ah_attr->type == RDMA_AH_ATTR_TYPE_ROCE) && !(ah_attr->ah_flags & IB_AH_GRH)) return -EINVAL; if (ah_attr->grh.sgid_attr) { /* * Make sure the passed sgid_attr is consistent with the * parameters */ if (ah_attr->grh.sgid_attr->index != ah_attr->grh.sgid_index || ah_attr->grh.sgid_attr->port_num != ah_attr->port_num) return -EINVAL; } return 0; } /* * If the ah requires a GRH then ensure that sgid_attr pointer is filled in. * On success the caller is responsible to call rdma_unfill_sgid_attr(). */ static int rdma_fill_sgid_attr(struct ib_device *device, struct rdma_ah_attr *ah_attr, const struct ib_gid_attr **old_sgid_attr) { const struct ib_gid_attr *sgid_attr; struct ib_global_route *grh; int ret; *old_sgid_attr = ah_attr->grh.sgid_attr; ret = rdma_check_ah_attr(device, ah_attr); if (ret) return ret; if (!(ah_attr->ah_flags & IB_AH_GRH)) return 0; grh = rdma_ah_retrieve_grh(ah_attr); if (grh->sgid_attr) return 0; sgid_attr = rdma_get_gid_attr(device, ah_attr->port_num, grh->sgid_index); if (IS_ERR(sgid_attr)) return PTR_ERR(sgid_attr); /* Move ownerhip of the kref into the ah_attr */ grh->sgid_attr = sgid_attr; return 0; } static void rdma_unfill_sgid_attr(struct rdma_ah_attr *ah_attr, const struct ib_gid_attr *old_sgid_attr) { /* * Fill didn't change anything, the caller retains ownership of * whatever it passed */ if (ah_attr->grh.sgid_attr == old_sgid_attr) return; /* * Otherwise, we need to undo what rdma_fill_sgid_attr so the caller * doesn't see any change in the rdma_ah_attr. If we get here * old_sgid_attr is NULL. */ rdma_destroy_ah_attr(ah_attr); } static const struct ib_gid_attr * rdma_update_sgid_attr(struct rdma_ah_attr *ah_attr, const struct ib_gid_attr *old_attr) { if (old_attr) rdma_put_gid_attr(old_attr); if (ah_attr->ah_flags & IB_AH_GRH) { rdma_hold_gid_attr(ah_attr->grh.sgid_attr); return ah_attr->grh.sgid_attr; } return NULL; } static struct ib_ah *_rdma_create_ah(struct ib_pd *pd, struct rdma_ah_attr *ah_attr, u32 flags, struct ib_udata *udata, struct net_device *xmit_slave) { struct rdma_ah_init_attr init_attr = {}; struct ib_device *device = pd->device; struct ib_ah *ah; int ret; might_sleep_if(flags & RDMA_CREATE_AH_SLEEPABLE); if (!udata && !device->ops.create_ah) return ERR_PTR(-EOPNOTSUPP); ah = rdma_zalloc_drv_obj_gfp( device, ib_ah, (flags & RDMA_CREATE_AH_SLEEPABLE) ? GFP_KERNEL : GFP_ATOMIC); if (!ah) return ERR_PTR(-ENOMEM); ah->device = device; ah->pd = pd; ah->type = ah_attr->type; ah->sgid_attr = rdma_update_sgid_attr(ah_attr, NULL); init_attr.ah_attr = ah_attr; init_attr.flags = flags; init_attr.xmit_slave = xmit_slave; if (udata) ret = device->ops.create_user_ah(ah, &init_attr, udata); else ret = device->ops.create_ah(ah, &init_attr, NULL); if (ret) { if (ah->sgid_attr) rdma_put_gid_attr(ah->sgid_attr); kfree(ah); return ERR_PTR(ret); } atomic_inc(&pd->usecnt); return ah; } /** * rdma_create_ah - Creates an address handle for the * given address vector. * @pd: The protection domain associated with the address handle. * @ah_attr: The attributes of the address vector. * @flags: Create address handle flags (see enum rdma_create_ah_flags). * * It returns 0 on success and returns appropriate error code on error. * The address handle is used to reference a local or global destination * in all UD QP post sends. */ struct ib_ah *rdma_create_ah(struct ib_pd *pd, struct rdma_ah_attr *ah_attr, u32 flags) { const struct ib_gid_attr *old_sgid_attr; struct net_device *slave; struct ib_ah *ah; int ret; ret = rdma_fill_sgid_attr(pd->device, ah_attr, &old_sgid_attr); if (ret) return ERR_PTR(ret); slave = rdma_lag_get_ah_roce_slave(pd->device, ah_attr, (flags & RDMA_CREATE_AH_SLEEPABLE) ? GFP_KERNEL : GFP_ATOMIC); if (IS_ERR(slave)) { rdma_unfill_sgid_attr(ah_attr, old_sgid_attr); return ERR_CAST(slave); } ah = _rdma_create_ah(pd, ah_attr, flags, NULL, slave); rdma_lag_put_ah_roce_slave(slave); rdma_unfill_sgid_attr(ah_attr, old_sgid_attr); return ah; } EXPORT_SYMBOL(rdma_create_ah); /** * rdma_create_user_ah - Creates an address handle for the * given address vector. * It resolves destination mac address for ah attribute of RoCE type. * @pd: The protection domain associated with the address handle. * @ah_attr: The attributes of the address vector. * @udata: pointer to user's input output buffer information need by * provider driver. * * It returns 0 on success and returns appropriate error code on error. * The address handle is used to reference a local or global destination * in all UD QP post sends. */ struct ib_ah *rdma_create_user_ah(struct ib_pd *pd, struct rdma_ah_attr *ah_attr, struct ib_udata *udata) { const struct ib_gid_attr *old_sgid_attr; struct ib_ah *ah; int err; err = rdma_fill_sgid_attr(pd->device, ah_attr, &old_sgid_attr); if (err) return ERR_PTR(err); if (ah_attr->type == RDMA_AH_ATTR_TYPE_ROCE) { err = ib_resolve_eth_dmac(pd->device, ah_attr); if (err) { ah = ERR_PTR(err); goto out; } } ah = _rdma_create_ah(pd, ah_attr, RDMA_CREATE_AH_SLEEPABLE, udata, NULL); out: rdma_unfill_sgid_attr(ah_attr, old_sgid_attr); return ah; } EXPORT_SYMBOL(rdma_create_user_ah); int ib_get_rdma_header_version(const union rdma_network_hdr *hdr) { const struct iphdr *ip4h = (struct iphdr *)&hdr->roce4grh; struct iphdr ip4h_checked; const struct ipv6hdr *ip6h = (struct ipv6hdr *)&hdr->ibgrh; /* If it's IPv6, the version must be 6, otherwise, the first * 20 bytes (before the IPv4 header) are garbled. */ if (ip6h->version != 6) return (ip4h->version == 4) ? 4 : 0; /* version may be 6 or 4 because the first 20 bytes could be garbled */ /* RoCE v2 requires no options, thus header length * must be 5 words */ if (ip4h->ihl != 5) return 6; /* Verify checksum. * We can't write on scattered buffers so we need to copy to * temp buffer. */ memcpy(&ip4h_checked, ip4h, sizeof(ip4h_checked)); ip4h_checked.check = 0; ip4h_checked.check = ip_fast_csum((u8 *)&ip4h_checked, 5); /* if IPv4 header checksum is OK, believe it */ if (ip4h->check == ip4h_checked.check) return 4; return 6; } EXPORT_SYMBOL(ib_get_rdma_header_version); static enum rdma_network_type ib_get_net_type_by_grh(struct ib_device *device, u32 port_num, const struct ib_grh *grh) { int grh_version; if (rdma_protocol_ib(device, port_num)) return RDMA_NETWORK_IB; grh_version = ib_get_rdma_header_version((union rdma_network_hdr *)grh); if (grh_version == 4) return RDMA_NETWORK_IPV4; if (grh->next_hdr == IPPROTO_UDP) return RDMA_NETWORK_IPV6; return RDMA_NETWORK_ROCE_V1; } struct find_gid_index_context { u16 vlan_id; enum ib_gid_type gid_type; }; static bool find_gid_index(const union ib_gid *gid, const struct ib_gid_attr *gid_attr, void *context) { struct find_gid_index_context *ctx = context; u16 vlan_id = 0xffff; int ret; if (ctx->gid_type != gid_attr->gid_type) return false; ret = rdma_read_gid_l2_fields(gid_attr, &vlan_id, NULL); if (ret) return false; return ctx->vlan_id == vlan_id; } static const struct ib_gid_attr * get_sgid_attr_from_eth(struct ib_device *device, u32 port_num, u16 vlan_id, const union ib_gid *sgid, enum ib_gid_type gid_type) { struct find_gid_index_context context = {.vlan_id = vlan_id, .gid_type = gid_type}; return rdma_find_gid_by_filter(device, sgid, port_num, find_gid_index, &context); } int ib_get_gids_from_rdma_hdr(const union rdma_network_hdr *hdr, enum rdma_network_type net_type, union ib_gid *sgid, union ib_gid *dgid) { struct sockaddr_in src_in; struct sockaddr_in dst_in; __be32 src_saddr, dst_saddr; if (!sgid || !dgid) return -EINVAL; if (net_type == RDMA_NETWORK_IPV4) { memcpy(&src_in.sin_addr.s_addr, &hdr->roce4grh.saddr, 4); memcpy(&dst_in.sin_addr.s_addr, &hdr->roce4grh.daddr, 4); src_saddr = src_in.sin_addr.s_addr; dst_saddr = dst_in.sin_addr.s_addr; ipv6_addr_set_v4mapped(src_saddr, (struct in6_addr *)sgid); ipv6_addr_set_v4mapped(dst_saddr, (struct in6_addr *)dgid); return 0; } else if (net_type == RDMA_NETWORK_IPV6 || net_type == RDMA_NETWORK_IB || net_type == RDMA_NETWORK_ROCE_V1) { *dgid = hdr->ibgrh.dgid; *sgid = hdr->ibgrh.sgid; return 0; } else { return -EINVAL; } } EXPORT_SYMBOL(ib_get_gids_from_rdma_hdr); /* Resolve destination mac address and hop limit for unicast destination * GID entry, considering the source GID entry as well. * ah_attribute must have valid port_num, sgid_index. */ static int ib_resolve_unicast_gid_dmac(struct ib_device *device, struct rdma_ah_attr *ah_attr) { struct ib_global_route *grh = rdma_ah_retrieve_grh(ah_attr); const struct ib_gid_attr *sgid_attr = grh->sgid_attr; int hop_limit = 0xff; int ret = 0; /* If destination is link local and source GID is RoCEv1, * IP stack is not used. */ if (rdma_link_local_addr((struct in6_addr *)grh->dgid.raw) && sgid_attr->gid_type == IB_GID_TYPE_ROCE) { rdma_get_ll_mac((struct in6_addr *)grh->dgid.raw, ah_attr->roce.dmac); return ret; } ret = rdma_addr_find_l2_eth_by_grh(&sgid_attr->gid, &grh->dgid, ah_attr->roce.dmac, sgid_attr, &hop_limit); grh->hop_limit = hop_limit; return ret; } /* * This function initializes address handle attributes from the incoming packet. * Incoming packet has dgid of the receiver node on which this code is * getting executed and, sgid contains the GID of the sender. * * When resolving mac address of destination, the arrived dgid is used * as sgid and, sgid is used as dgid because sgid contains destinations * GID whom to respond to. * * On success the caller is responsible to call rdma_destroy_ah_attr on the * attr. */ int ib_init_ah_attr_from_wc(struct ib_device *device, u32 port_num, const struct ib_wc *wc, const struct ib_grh *grh, struct rdma_ah_attr *ah_attr) { u32 flow_class; int ret; enum rdma_network_type net_type = RDMA_NETWORK_IB; enum ib_gid_type gid_type = IB_GID_TYPE_IB; const struct ib_gid_attr *sgid_attr; int hoplimit = 0xff; union ib_gid dgid; union ib_gid sgid; might_sleep(); memset(ah_attr, 0, sizeof *ah_attr); ah_attr->type = rdma_ah_find_type(device, port_num); if (rdma_cap_eth_ah(device, port_num)) { if (wc->wc_flags & IB_WC_WITH_NETWORK_HDR_TYPE) net_type = wc->network_hdr_type; else net_type = ib_get_net_type_by_grh(device, port_num, grh); gid_type = ib_network_to_gid_type(net_type); } ret = ib_get_gids_from_rdma_hdr((union rdma_network_hdr *)grh, net_type, &sgid, &dgid); if (ret) return ret; rdma_ah_set_sl(ah_attr, wc->sl); rdma_ah_set_port_num(ah_attr, port_num); if (rdma_protocol_roce(device, port_num)) { u16 vlan_id = wc->wc_flags & IB_WC_WITH_VLAN ? wc->vlan_id : 0xffff; if (!(wc->wc_flags & IB_WC_GRH)) return -EPROTOTYPE; sgid_attr = get_sgid_attr_from_eth(device, port_num, vlan_id, &dgid, gid_type); if (IS_ERR(sgid_attr)) return PTR_ERR(sgid_attr); flow_class = be32_to_cpu(grh->version_tclass_flow); rdma_move_grh_sgid_attr(ah_attr, &sgid, flow_class & 0xFFFFF, hoplimit, (flow_class >> 20) & 0xFF, sgid_attr); ret = ib_resolve_unicast_gid_dmac(device, ah_attr); if (ret) rdma_destroy_ah_attr(ah_attr); return ret; } else { rdma_ah_set_dlid(ah_attr, wc->slid); rdma_ah_set_path_bits(ah_attr, wc->dlid_path_bits); if ((wc->wc_flags & IB_WC_GRH) == 0) return 0; if (dgid.global.interface_id != cpu_to_be64(IB_SA_WELL_KNOWN_GUID)) { sgid_attr = rdma_find_gid_by_port( device, &dgid, IB_GID_TYPE_IB, port_num, NULL); } else sgid_attr = rdma_get_gid_attr(device, port_num, 0); if (IS_ERR(sgid_attr)) return PTR_ERR(sgid_attr); flow_class = be32_to_cpu(grh->version_tclass_flow); rdma_move_grh_sgid_attr(ah_attr, &sgid, flow_class & 0xFFFFF, hoplimit, (flow_class >> 20) & 0xFF, sgid_attr); return 0; } } EXPORT_SYMBOL(ib_init_ah_attr_from_wc); /** * rdma_move_grh_sgid_attr - Sets the sgid attribute of GRH, taking ownership * of the reference * * @attr: Pointer to AH attribute structure * @dgid: Destination GID * @flow_label: Flow label * @hop_limit: Hop limit * @traffic_class: traffic class * @sgid_attr: Pointer to SGID attribute * * This takes ownership of the sgid_attr reference. The caller must ensure * rdma_destroy_ah_attr() is called before destroying the rdma_ah_attr after * calling this function. */ void rdma_move_grh_sgid_attr(struct rdma_ah_attr *attr, union ib_gid *dgid, u32 flow_label, u8 hop_limit, u8 traffic_class, const struct ib_gid_attr *sgid_attr) { rdma_ah_set_grh(attr, dgid, flow_label, sgid_attr->index, hop_limit, traffic_class); attr->grh.sgid_attr = sgid_attr; } EXPORT_SYMBOL(rdma_move_grh_sgid_attr); /** * rdma_destroy_ah_attr - Release reference to SGID attribute of * ah attribute. * @ah_attr: Pointer to ah attribute * * Release reference to the SGID attribute of the ah attribute if it is * non NULL. It is safe to call this multiple times, and safe to call it on * a zero initialized ah_attr. */ void rdma_destroy_ah_attr(struct rdma_ah_attr *ah_attr) { if (ah_attr->grh.sgid_attr) { rdma_put_gid_attr(ah_attr->grh.sgid_attr); ah_attr->grh.sgid_attr = NULL; } } EXPORT_SYMBOL(rdma_destroy_ah_attr); struct ib_ah *ib_create_ah_from_wc(struct ib_pd *pd, const struct ib_wc *wc, const struct ib_grh *grh, u32 port_num) { struct rdma_ah_attr ah_attr; struct ib_ah *ah; int ret; ret = ib_init_ah_attr_from_wc(pd->device, port_num, wc, grh, &ah_attr); if (ret) return ERR_PTR(ret); ah = rdma_create_ah(pd, &ah_attr, RDMA_CREATE_AH_SLEEPABLE); rdma_destroy_ah_attr(&ah_attr); return ah; } EXPORT_SYMBOL(ib_create_ah_from_wc); int rdma_modify_ah(struct ib_ah *ah, struct rdma_ah_attr *ah_attr) { const struct ib_gid_attr *old_sgid_attr; int ret; if (ah->type != ah_attr->type) return -EINVAL; ret = rdma_fill_sgid_attr(ah->device, ah_attr, &old_sgid_attr); if (ret) return ret; ret = ah->device->ops.modify_ah ? ah->device->ops.modify_ah(ah, ah_attr) : -EOPNOTSUPP; ah->sgid_attr = rdma_update_sgid_attr(ah_attr, ah->sgid_attr); rdma_unfill_sgid_attr(ah_attr, old_sgid_attr); return ret; } EXPORT_SYMBOL(rdma_modify_ah); int rdma_query_ah(struct ib_ah *ah, struct rdma_ah_attr *ah_attr) { ah_attr->grh.sgid_attr = NULL; return ah->device->ops.query_ah ? ah->device->ops.query_ah(ah, ah_attr) : -EOPNOTSUPP; } EXPORT_SYMBOL(rdma_query_ah); int rdma_destroy_ah_user(struct ib_ah *ah, u32 flags, struct ib_udata *udata) { const struct ib_gid_attr *sgid_attr = ah->sgid_attr; struct ib_pd *pd; int ret; might_sleep_if(flags & RDMA_DESTROY_AH_SLEEPABLE); pd = ah->pd; ret = ah->device->ops.destroy_ah(ah, flags); if (ret) return ret; atomic_dec(&pd->usecnt); if (sgid_attr) rdma_put_gid_attr(sgid_attr); kfree(ah); return ret; } EXPORT_SYMBOL(rdma_destroy_ah_user); /* Shared receive queues */ /** * ib_create_srq_user - Creates a SRQ associated with the specified protection * domain. * @pd: The protection domain associated with the SRQ. * @srq_init_attr: A list of initial attributes required to create the * SRQ. If SRQ creation succeeds, then the attributes are updated to * the actual capabilities of the created SRQ. * @uobject: uobject pointer if this is not a kernel SRQ * @udata: udata pointer if this is not a kernel SRQ * * srq_attr->max_wr and srq_attr->max_sge are read the determine the * requested size of the SRQ, and set to the actual values allocated * on return. If ib_create_srq() succeeds, then max_wr and max_sge * will always be at least as large as the requested values. */ struct ib_srq *ib_create_srq_user(struct ib_pd *pd, struct ib_srq_init_attr *srq_init_attr, struct ib_usrq_object *uobject, struct ib_udata *udata) { struct ib_srq *srq; int ret; srq = rdma_zalloc_drv_obj(pd->device, ib_srq); if (!srq) return ERR_PTR(-ENOMEM); srq->device = pd->device; srq->pd = pd; srq->event_handler = srq_init_attr->event_handler; srq->srq_context = srq_init_attr->srq_context; srq->srq_type = srq_init_attr->srq_type; srq->uobject = uobject; if (ib_srq_has_cq(srq->srq_type)) { srq->ext.cq = srq_init_attr->ext.cq; atomic_inc(&srq->ext.cq->usecnt); } if (srq->srq_type == IB_SRQT_XRC) { srq->ext.xrc.xrcd = srq_init_attr->ext.xrc.xrcd; if (srq->ext.xrc.xrcd) atomic_inc(&srq->ext.xrc.xrcd->usecnt); } atomic_inc(&pd->usecnt); rdma_restrack_new(&srq->res, RDMA_RESTRACK_SRQ); rdma_restrack_parent_name(&srq->res, &pd->res); ret = pd->device->ops.create_srq(srq, srq_init_attr, udata); if (ret) { rdma_restrack_put(&srq->res); atomic_dec(&pd->usecnt); if (srq->srq_type == IB_SRQT_XRC && srq->ext.xrc.xrcd) atomic_dec(&srq->ext.xrc.xrcd->usecnt); if (ib_srq_has_cq(srq->srq_type)) atomic_dec(&srq->ext.cq->usecnt); kfree(srq); return ERR_PTR(ret); } rdma_restrack_add(&srq->res); return srq; } EXPORT_SYMBOL(ib_create_srq_user); int ib_modify_srq(struct ib_srq *srq, struct ib_srq_attr *srq_attr, enum ib_srq_attr_mask srq_attr_mask) { return srq->device->ops.modify_srq ? srq->device->ops.modify_srq(srq, srq_attr, srq_attr_mask, NULL) : -EOPNOTSUPP; } EXPORT_SYMBOL(ib_modify_srq); int ib_query_srq(struct ib_srq *srq, struct ib_srq_attr *srq_attr) { return srq->device->ops.query_srq ? srq->device->ops.query_srq(srq, srq_attr) : -EOPNOTSUPP; } EXPORT_SYMBOL(ib_query_srq); int ib_destroy_srq_user(struct ib_srq *srq, struct ib_udata *udata) { int ret; if (atomic_read(&srq->usecnt)) return -EBUSY; ret = srq->device->ops.destroy_srq(srq, udata); if (ret) return ret; atomic_dec(&srq->pd->usecnt); if (srq->srq_type == IB_SRQT_XRC && srq->ext.xrc.xrcd) atomic_dec(&srq->ext.xrc.xrcd->usecnt); if (ib_srq_has_cq(srq->srq_type)) atomic_dec(&srq->ext.cq->usecnt); rdma_restrack_del(&srq->res); kfree(srq); return ret; } EXPORT_SYMBOL(ib_destroy_srq_user); /* Queue pairs */ static void __ib_qp_event_handler(struct ib_event *event, void *context) { struct ib_qp *qp = event->element.qp; if (event->event == IB_EVENT_QP_LAST_WQE_REACHED) complete(&qp->srq_completion); if (qp->registered_event_handler) qp->registered_event_handler(event, qp->qp_context); } static void __ib_shared_qp_event_handler(struct ib_event *event, void *context) { struct ib_qp *qp = context; unsigned long flags; spin_lock_irqsave(&qp->device->qp_open_list_lock, flags); list_for_each_entry(event->element.qp, &qp->open_list, open_list) if (event->element.qp->event_handler) event->element.qp->event_handler(event, event->element.qp->qp_context); spin_unlock_irqrestore(&qp->device->qp_open_list_lock, flags); } static struct ib_qp *__ib_open_qp(struct ib_qp *real_qp, void (*event_handler)(struct ib_event *, void *), void *qp_context) { struct ib_qp *qp; unsigned long flags; int err; qp = kzalloc(sizeof *qp, GFP_KERNEL); if (!qp) return ERR_PTR(-ENOMEM); qp->real_qp = real_qp; err = ib_open_shared_qp_security(qp, real_qp->device); if (err) { kfree(qp); return ERR_PTR(err); } qp->real_qp = real_qp; atomic_inc(&real_qp->usecnt); qp->device = real_qp->device; qp->event_handler = event_handler; qp->qp_context = qp_context; qp->qp_num = real_qp->qp_num; qp->qp_type = real_qp->qp_type; spin_lock_irqsave(&real_qp->device->qp_open_list_lock, flags); list_add(&qp->open_list, &real_qp->open_list); spin_unlock_irqrestore(&real_qp->device->qp_open_list_lock, flags); return qp; } struct ib_qp *ib_open_qp(struct ib_xrcd *xrcd, struct ib_qp_open_attr *qp_open_attr) { struct ib_qp *qp, *real_qp; if (qp_open_attr->qp_type != IB_QPT_XRC_TGT) return ERR_PTR(-EINVAL); down_read(&xrcd->tgt_qps_rwsem); real_qp = xa_load(&xrcd->tgt_qps, qp_open_attr->qp_num); if (!real_qp) { up_read(&xrcd->tgt_qps_rwsem); return ERR_PTR(-EINVAL); } qp = __ib_open_qp(real_qp, qp_open_attr->event_handler, qp_open_attr->qp_context); up_read(&xrcd->tgt_qps_rwsem); return qp; } EXPORT_SYMBOL(ib_open_qp); static struct ib_qp *create_xrc_qp_user(struct ib_qp *qp, struct ib_qp_init_attr *qp_init_attr) { struct ib_qp *real_qp = qp; int err; qp->event_handler = __ib_shared_qp_event_handler; qp->qp_context = qp; qp->pd = NULL; qp->send_cq = qp->recv_cq = NULL; qp->srq = NULL; qp->xrcd = qp_init_attr->xrcd; atomic_inc(&qp_init_attr->xrcd->usecnt); INIT_LIST_HEAD(&qp->open_list); qp = __ib_open_qp(real_qp, qp_init_attr->event_handler, qp_init_attr->qp_context); if (IS_ERR(qp)) return qp; err = xa_err(xa_store(&qp_init_attr->xrcd->tgt_qps, real_qp->qp_num, real_qp, GFP_KERNEL)); if (err) { ib_close_qp(qp); return ERR_PTR(err); } return qp; } static struct ib_qp *create_qp(struct ib_device *dev, struct ib_pd *pd, struct ib_qp_init_attr *attr, struct ib_udata *udata, struct ib_uqp_object *uobj, const char *caller) { struct ib_udata dummy = {}; struct ib_qp *qp; int ret; if (!dev->ops.create_qp) return ERR_PTR(-EOPNOTSUPP); qp = rdma_zalloc_drv_obj_numa(dev, ib_qp); if (!qp) return ERR_PTR(-ENOMEM); qp->device = dev; qp->pd = pd; qp->uobject = uobj; qp->real_qp = qp; qp->qp_type = attr->qp_type; qp->rwq_ind_tbl = attr->rwq_ind_tbl; qp->srq = attr->srq; qp->event_handler = __ib_qp_event_handler; qp->registered_event_handler = attr->event_handler; qp->port = attr->port_num; qp->qp_context = attr->qp_context; spin_lock_init(&qp->mr_lock); INIT_LIST_HEAD(&qp->rdma_mrs); INIT_LIST_HEAD(&qp->sig_mrs); init_completion(&qp->srq_completion); qp->send_cq = attr->send_cq; qp->recv_cq = attr->recv_cq; rdma_restrack_new(&qp->res, RDMA_RESTRACK_QP); WARN_ONCE(!udata && !caller, "Missing kernel QP owner"); rdma_restrack_set_name(&qp->res, udata ? NULL : caller); ret = dev->ops.create_qp(qp, attr, udata); if (ret) goto err_create; /* * TODO: The mlx4 internally overwrites send_cq and recv_cq. * Unfortunately, it is not an easy task to fix that driver. */ qp->send_cq = attr->send_cq; qp->recv_cq = attr->recv_cq; ret = ib_create_qp_security(qp, dev); if (ret) goto err_security; rdma_restrack_add(&qp->res); return qp; err_security: qp->device->ops.destroy_qp(qp, udata ? &dummy : NULL); err_create: rdma_restrack_put(&qp->res); kfree(qp); return ERR_PTR(ret); } /** * ib_create_qp_user - Creates a QP associated with the specified protection * domain. * @dev: IB device * @pd: The protection domain associated with the QP. * @attr: A list of initial attributes required to create the * QP. If QP creation succeeds, then the attributes are updated to * the actual capabilities of the created QP. * @udata: User data * @uobj: uverbs obect * @caller: caller's build-time module name */ struct ib_qp *ib_create_qp_user(struct ib_device *dev, struct ib_pd *pd, struct ib_qp_init_attr *attr, struct ib_udata *udata, struct ib_uqp_object *uobj, const char *caller) { struct ib_qp *qp, *xrc_qp; if (attr->qp_type == IB_QPT_XRC_TGT) qp = create_qp(dev, pd, attr, NULL, NULL, caller); else qp = create_qp(dev, pd, attr, udata, uobj, NULL); if (attr->qp_type != IB_QPT_XRC_TGT || IS_ERR(qp)) return qp; xrc_qp = create_xrc_qp_user(qp, attr); if (IS_ERR(xrc_qp)) { ib_destroy_qp(qp); return xrc_qp; } xrc_qp->uobject = uobj; return xrc_qp; } EXPORT_SYMBOL(ib_create_qp_user); void ib_qp_usecnt_inc(struct ib_qp *qp) { if (qp->pd) atomic_inc(&qp->pd->usecnt); if (qp->send_cq) atomic_inc(&qp->send_cq->usecnt); if (qp->recv_cq) atomic_inc(&qp->recv_cq->usecnt); if (qp->srq) atomic_inc(&qp->srq->usecnt); if (qp->rwq_ind_tbl) atomic_inc(&qp->rwq_ind_tbl->usecnt); } EXPORT_SYMBOL(ib_qp_usecnt_inc); void ib_qp_usecnt_dec(struct ib_qp *qp) { if (qp->rwq_ind_tbl) atomic_dec(&qp->rwq_ind_tbl->usecnt); if (qp->srq) atomic_dec(&qp->srq->usecnt); if (qp->recv_cq) atomic_dec(&qp->recv_cq->usecnt); if (qp->send_cq) atomic_dec(&qp->send_cq->usecnt); if (qp->pd) atomic_dec(&qp->pd->usecnt); } EXPORT_SYMBOL(ib_qp_usecnt_dec); struct ib_qp *ib_create_qp_kernel(struct ib_pd *pd, struct ib_qp_init_attr *qp_init_attr, const char *caller) { struct ib_device *device = pd->device; struct ib_qp *qp; int ret; /* * If the callers is using the RDMA API calculate the resources * needed for the RDMA READ/WRITE operations. * * Note that these callers need to pass in a port number. */ if (qp_init_attr->cap.max_rdma_ctxs) rdma_rw_init_qp(device, qp_init_attr); qp = create_qp(device, pd, qp_init_attr, NULL, NULL, caller); if (IS_ERR(qp)) return qp; ib_qp_usecnt_inc(qp); if (qp_init_attr->cap.max_rdma_ctxs) { ret = rdma_rw_init_mrs(qp, qp_init_attr); if (ret) goto err; } /* * Note: all hw drivers guarantee that max_send_sge is lower than * the device RDMA WRITE SGE limit but not all hw drivers ensure that * max_send_sge <= max_sge_rd. */ qp->max_write_sge = qp_init_attr->cap.max_send_sge; qp->max_read_sge = min_t(u32, qp_init_attr->cap.max_send_sge, device->attrs.max_sge_rd); if (qp_init_attr->create_flags & IB_QP_CREATE_INTEGRITY_EN) qp->integrity_en = true; return qp; err: ib_destroy_qp(qp); return ERR_PTR(ret); } EXPORT_SYMBOL(ib_create_qp_kernel); static const struct { int valid; enum ib_qp_attr_mask req_param[IB_QPT_MAX]; enum ib_qp_attr_mask opt_param[IB_QPT_MAX]; } qp_state_table[IB_QPS_ERR + 1][IB_QPS_ERR + 1] = { [IB_QPS_RESET] = { [IB_QPS_RESET] = { .valid = 1 }, [IB_QPS_INIT] = { .valid = 1, .req_param = { [IB_QPT_UD] = (IB_QP_PKEY_INDEX | IB_QP_PORT | IB_QP_QKEY), [IB_QPT_RAW_PACKET] = IB_QP_PORT, [IB_QPT_UC] = (IB_QP_PKEY_INDEX | IB_QP_PORT | IB_QP_ACCESS_FLAGS), [IB_QPT_RC] = (IB_QP_PKEY_INDEX | IB_QP_PORT | IB_QP_ACCESS_FLAGS), [IB_QPT_XRC_INI] = (IB_QP_PKEY_INDEX | IB_QP_PORT | IB_QP_ACCESS_FLAGS), [IB_QPT_XRC_TGT] = (IB_QP_PKEY_INDEX | IB_QP_PORT | IB_QP_ACCESS_FLAGS), [IB_QPT_SMI] = (IB_QP_PKEY_INDEX | IB_QP_QKEY), [IB_QPT_GSI] = (IB_QP_PKEY_INDEX | IB_QP_QKEY), } }, }, [IB_QPS_INIT] = { [IB_QPS_RESET] = { .valid = 1 }, [IB_QPS_ERR] = { .valid = 1 }, [IB_QPS_INIT] = { .valid = 1, .opt_param = { [IB_QPT_UD] = (IB_QP_PKEY_INDEX | IB_QP_PORT | IB_QP_QKEY), [IB_QPT_UC] = (IB_QP_PKEY_INDEX | IB_QP_PORT | IB_QP_ACCESS_FLAGS), [IB_QPT_RC] = (IB_QP_PKEY_INDEX | IB_QP_PORT | IB_QP_ACCESS_FLAGS), [IB_QPT_XRC_INI] = (IB_QP_PKEY_INDEX | IB_QP_PORT | IB_QP_ACCESS_FLAGS), [IB_QPT_XRC_TGT] = (IB_QP_PKEY_INDEX | IB_QP_PORT | IB_QP_ACCESS_FLAGS), [IB_QPT_SMI] = (IB_QP_PKEY_INDEX | IB_QP_QKEY), [IB_QPT_GSI] = (IB_QP_PKEY_INDEX | IB_QP_QKEY), } }, [IB_QPS_RTR] = { .valid = 1, .req_param = { [IB_QPT_UC] = (IB_QP_AV | IB_QP_PATH_MTU | IB_QP_DEST_QPN | IB_QP_RQ_PSN), [IB_QPT_RC] = (IB_QP_AV | IB_QP_PATH_MTU | IB_QP_DEST_QPN | IB_QP_RQ_PSN | IB_QP_MAX_DEST_RD_ATOMIC | IB_QP_MIN_RNR_TIMER), [IB_QPT_XRC_INI] = (IB_QP_AV | IB_QP_PATH_MTU | IB_QP_DEST_QPN | IB_QP_RQ_PSN), [IB_QPT_XRC_TGT] = (IB_QP_AV | IB_QP_PATH_MTU | IB_QP_DEST_QPN | IB_QP_RQ_PSN | IB_QP_MAX_DEST_RD_ATOMIC | IB_QP_MIN_RNR_TIMER), }, .opt_param = { [IB_QPT_UD] = (IB_QP_PKEY_INDEX | IB_QP_QKEY), [IB_QPT_UC] = (IB_QP_ALT_PATH | IB_QP_ACCESS_FLAGS | IB_QP_PKEY_INDEX), [IB_QPT_RC] = (IB_QP_ALT_PATH | IB_QP_ACCESS_FLAGS | IB_QP_PKEY_INDEX | IB_QP_RATE_LIMIT), [IB_QPT_XRC_INI] = (IB_QP_ALT_PATH | IB_QP_ACCESS_FLAGS | IB_QP_PKEY_INDEX), [IB_QPT_XRC_TGT] = (IB_QP_ALT_PATH | IB_QP_ACCESS_FLAGS | IB_QP_PKEY_INDEX), [IB_QPT_SMI] = (IB_QP_PKEY_INDEX | IB_QP_QKEY), [IB_QPT_GSI] = (IB_QP_PKEY_INDEX | IB_QP_QKEY), }, }, }, [IB_QPS_RTR] = { [IB_QPS_RESET] = { .valid = 1 }, [IB_QPS_ERR] = { .valid = 1 }, [IB_QPS_RTS] = { .valid = 1, .req_param = { [IB_QPT_UD] = IB_QP_SQ_PSN, [IB_QPT_UC] = IB_QP_SQ_PSN, [IB_QPT_RC] = (IB_QP_TIMEOUT | IB_QP_RETRY_CNT | IB_QP_RNR_RETRY | IB_QP_SQ_PSN | IB_QP_MAX_QP_RD_ATOMIC), [IB_QPT_XRC_INI] = (IB_QP_TIMEOUT | IB_QP_RETRY_CNT | IB_QP_RNR_RETRY | IB_QP_SQ_PSN | IB_QP_MAX_QP_RD_ATOMIC), [IB_QPT_XRC_TGT] = (IB_QP_TIMEOUT | IB_QP_SQ_PSN), [IB_QPT_SMI] = IB_QP_SQ_PSN, [IB_QPT_GSI] = IB_QP_SQ_PSN, }, .opt_param = { [IB_QPT_UD] = (IB_QP_CUR_STATE | IB_QP_QKEY), [IB_QPT_UC] = (IB_QP_CUR_STATE | IB_QP_ALT_PATH | IB_QP_ACCESS_FLAGS | IB_QP_PATH_MIG_STATE), [IB_QPT_RC] = (IB_QP_CUR_STATE | IB_QP_ALT_PATH | IB_QP_ACCESS_FLAGS | IB_QP_MIN_RNR_TIMER | IB_QP_PATH_MIG_STATE | IB_QP_RATE_LIMIT), [IB_QPT_XRC_INI] = (IB_QP_CUR_STATE | IB_QP_ALT_PATH | IB_QP_ACCESS_FLAGS | IB_QP_PATH_MIG_STATE), [IB_QPT_XRC_TGT] = (IB_QP_CUR_STATE | IB_QP_ALT_PATH | IB_QP_ACCESS_FLAGS | IB_QP_MIN_RNR_TIMER | IB_QP_PATH_MIG_STATE), [IB_QPT_SMI] = (IB_QP_CUR_STATE | IB_QP_QKEY), [IB_QPT_GSI] = (IB_QP_CUR_STATE | IB_QP_QKEY), [IB_QPT_RAW_PACKET] = IB_QP_RATE_LIMIT, } } }, [IB_QPS_RTS] = { [IB_QPS_RESET] = { .valid = 1 }, [IB_QPS_ERR] = { .valid = 1 }, [IB_QPS_RTS] = { .valid = 1, .opt_param = { [IB_QPT_UD] = (IB_QP_CUR_STATE | IB_QP_QKEY), [IB_QPT_UC] = (IB_QP_CUR_STATE | IB_QP_ACCESS_FLAGS | IB_QP_ALT_PATH | IB_QP_PATH_MIG_STATE), [IB_QPT_RC] = (IB_QP_CUR_STATE | IB_QP_ACCESS_FLAGS | IB_QP_ALT_PATH | IB_QP_PATH_MIG_STATE | IB_QP_MIN_RNR_TIMER | IB_QP_RATE_LIMIT), [IB_QPT_XRC_INI] = (IB_QP_CUR_STATE | IB_QP_ACCESS_FLAGS | IB_QP_ALT_PATH | IB_QP_PATH_MIG_STATE), [IB_QPT_XRC_TGT] = (IB_QP_CUR_STATE | IB_QP_ACCESS_FLAGS | IB_QP_ALT_PATH | IB_QP_PATH_MIG_STATE | IB_QP_MIN_RNR_TIMER), [IB_QPT_SMI] = (IB_QP_CUR_STATE | IB_QP_QKEY), [IB_QPT_GSI] = (IB_QP_CUR_STATE | IB_QP_QKEY), [IB_QPT_RAW_PACKET] = IB_QP_RATE_LIMIT, } }, [IB_QPS_SQD] = { .valid = 1, .opt_param = { [IB_QPT_UD] = IB_QP_EN_SQD_ASYNC_NOTIFY, [IB_QPT_UC] = IB_QP_EN_SQD_ASYNC_NOTIFY, [IB_QPT_RC] = IB_QP_EN_SQD_ASYNC_NOTIFY, [IB_QPT_XRC_INI] = IB_QP_EN_SQD_ASYNC_NOTIFY, [IB_QPT_XRC_TGT] = IB_QP_EN_SQD_ASYNC_NOTIFY, /* ??? */ [IB_QPT_SMI] = IB_QP_EN_SQD_ASYNC_NOTIFY, [IB_QPT_GSI] = IB_QP_EN_SQD_ASYNC_NOTIFY } }, }, [IB_QPS_SQD] = { [IB_QPS_RESET] = { .valid = 1 }, [IB_QPS_ERR] = { .valid = 1 }, [IB_QPS_RTS] = { .valid = 1, .opt_param = { [IB_QPT_UD] = (IB_QP_CUR_STATE | IB_QP_QKEY), [IB_QPT_UC] = (IB_QP_CUR_STATE | IB_QP_ALT_PATH | IB_QP_ACCESS_FLAGS | IB_QP_PATH_MIG_STATE), [IB_QPT_RC] = (IB_QP_CUR_STATE | IB_QP_ALT_PATH | IB_QP_ACCESS_FLAGS | IB_QP_MIN_RNR_TIMER | IB_QP_PATH_MIG_STATE), [IB_QPT_XRC_INI] = (IB_QP_CUR_STATE | IB_QP_ALT_PATH | IB_QP_ACCESS_FLAGS | IB_QP_PATH_MIG_STATE), [IB_QPT_XRC_TGT] = (IB_QP_CUR_STATE | IB_QP_ALT_PATH | IB_QP_ACCESS_FLAGS | IB_QP_MIN_RNR_TIMER | IB_QP_PATH_MIG_STATE), [IB_QPT_SMI] = (IB_QP_CUR_STATE | IB_QP_QKEY), [IB_QPT_GSI] = (IB_QP_CUR_STATE | IB_QP_QKEY), } }, [IB_QPS_SQD] = { .valid = 1, .opt_param = { [IB_QPT_UD] = (IB_QP_PKEY_INDEX | IB_QP_QKEY), [IB_QPT_UC] = (IB_QP_AV | IB_QP_ALT_PATH | IB_QP_ACCESS_FLAGS | IB_QP_PKEY_INDEX | IB_QP_PATH_MIG_STATE), [IB_QPT_RC] = (IB_QP_PORT | IB_QP_AV | IB_QP_TIMEOUT | IB_QP_RETRY_CNT | IB_QP_RNR_RETRY | IB_QP_MAX_QP_RD_ATOMIC | IB_QP_MAX_DEST_RD_ATOMIC | IB_QP_ALT_PATH | IB_QP_ACCESS_FLAGS | IB_QP_PKEY_INDEX | IB_QP_MIN_RNR_TIMER | IB_QP_PATH_MIG_STATE), [IB_QPT_XRC_INI] = (IB_QP_PORT | IB_QP_AV | IB_QP_TIMEOUT | IB_QP_RETRY_CNT | IB_QP_RNR_RETRY | IB_QP_MAX_QP_RD_ATOMIC | IB_QP_ALT_PATH | IB_QP_ACCESS_FLAGS | IB_QP_PKEY_INDEX | IB_QP_PATH_MIG_STATE), [IB_QPT_XRC_TGT] = (IB_QP_PORT | IB_QP_AV | IB_QP_TIMEOUT | IB_QP_MAX_DEST_RD_ATOMIC | IB_QP_ALT_PATH | IB_QP_ACCESS_FLAGS | IB_QP_PKEY_INDEX | IB_QP_MIN_RNR_TIMER | IB_QP_PATH_MIG_STATE), [IB_QPT_SMI] = (IB_QP_PKEY_INDEX | IB_QP_QKEY), [IB_QPT_GSI] = (IB_QP_PKEY_INDEX | IB_QP_QKEY), } } }, [IB_QPS_SQE] = { [IB_QPS_RESET] = { .valid = 1 }, [IB_QPS_ERR] = { .valid = 1 }, [IB_QPS_RTS] = { .valid = 1, .opt_param = { [IB_QPT_UD] = (IB_QP_CUR_STATE | IB_QP_QKEY), [IB_QPT_UC] = (IB_QP_CUR_STATE | IB_QP_ACCESS_FLAGS), [IB_QPT_SMI] = (IB_QP_CUR_STATE | IB_QP_QKEY), [IB_QPT_GSI] = (IB_QP_CUR_STATE | IB_QP_QKEY), } } }, [IB_QPS_ERR] = { [IB_QPS_RESET] = { .valid = 1 }, [IB_QPS_ERR] = { .valid = 1 } } }; bool ib_modify_qp_is_ok(enum ib_qp_state cur_state, enum ib_qp_state next_state, enum ib_qp_type type, enum ib_qp_attr_mask mask) { enum ib_qp_attr_mask req_param, opt_param; if (mask & IB_QP_CUR_STATE && cur_state != IB_QPS_RTR && cur_state != IB_QPS_RTS && cur_state != IB_QPS_SQD && cur_state != IB_QPS_SQE) return false; if (!qp_state_table[cur_state][next_state].valid) return false; req_param = qp_state_table[cur_state][next_state].req_param[type]; opt_param = qp_state_table[cur_state][next_state].opt_param[type]; if ((mask & req_param) != req_param) return false; if (mask & ~(req_param | opt_param | IB_QP_STATE)) return false; return true; } EXPORT_SYMBOL(ib_modify_qp_is_ok); /** * ib_resolve_eth_dmac - Resolve destination mac address * @device: Device to consider * @ah_attr: address handle attribute which describes the * source and destination parameters * ib_resolve_eth_dmac() resolves destination mac address and L3 hop limit It * returns 0 on success or appropriate error code. It initializes the * necessary ah_attr fields when call is successful. */ static int ib_resolve_eth_dmac(struct ib_device *device, struct rdma_ah_attr *ah_attr) { int ret = 0; if (rdma_is_multicast_addr((struct in6_addr *)ah_attr->grh.dgid.raw)) { if (ipv6_addr_v4mapped((struct in6_addr *)ah_attr->grh.dgid.raw)) { __be32 addr = 0; memcpy(&addr, ah_attr->grh.dgid.raw + 12, 4); ip_eth_mc_map(addr, (char *)ah_attr->roce.dmac); } else { ipv6_eth_mc_map((struct in6_addr *)ah_attr->grh.dgid.raw, (char *)ah_attr->roce.dmac); } } else { ret = ib_resolve_unicast_gid_dmac(device, ah_attr); } return ret; } static bool is_qp_type_connected(const struct ib_qp *qp) { return (qp->qp_type == IB_QPT_UC || qp->qp_type == IB_QPT_RC || qp->qp_type == IB_QPT_XRC_INI || qp->qp_type == IB_QPT_XRC_TGT); } /* * IB core internal function to perform QP attributes modification. */ static int _ib_modify_qp(struct ib_qp *qp, struct ib_qp_attr *attr, int attr_mask, struct ib_udata *udata) { u32 port = attr_mask & IB_QP_PORT ? attr->port_num : qp->port; const struct ib_gid_attr *old_sgid_attr_av; const struct ib_gid_attr *old_sgid_attr_alt_av; int ret; attr->xmit_slave = NULL; if (attr_mask & IB_QP_AV) { ret = rdma_fill_sgid_attr(qp->device, &attr->ah_attr, &old_sgid_attr_av); if (ret) return ret; if (attr->ah_attr.type == RDMA_AH_ATTR_TYPE_ROCE && is_qp_type_connected(qp)) { struct net_device *slave; /* * If the user provided the qp_attr then we have to * resolve it. Kerne users have to provide already * resolved rdma_ah_attr's. */ if (udata) { ret = ib_resolve_eth_dmac(qp->device, &attr->ah_attr); if (ret) goto out_av; } slave = rdma_lag_get_ah_roce_slave(qp->device, &attr->ah_attr, GFP_KERNEL); if (IS_ERR(slave)) { ret = PTR_ERR(slave); goto out_av; } attr->xmit_slave = slave; } } if (attr_mask & IB_QP_ALT_PATH) { /* * FIXME: This does not track the migration state, so if the * user loads a new alternate path after the HW has migrated * from primary->alternate we will keep the wrong * references. This is OK for IB because the reference * counting does not serve any functional purpose. */ ret = rdma_fill_sgid_attr(qp->device, &attr->alt_ah_attr, &old_sgid_attr_alt_av); if (ret) goto out_av; /* * Today the core code can only handle alternate paths and APM * for IB. Ban them in roce mode. */ if (!(rdma_protocol_ib(qp->device, attr->alt_ah_attr.port_num) && rdma_protocol_ib(qp->device, port))) { ret = -EINVAL; goto out; } } if (rdma_ib_or_roce(qp->device, port)) { if (attr_mask & IB_QP_RQ_PSN && attr->rq_psn & ~0xffffff) { dev_warn(&qp->device->dev, "%s rq_psn overflow, masking to 24 bits\n", __func__); attr->rq_psn &= 0xffffff; } if (attr_mask & IB_QP_SQ_PSN && attr->sq_psn & ~0xffffff) { dev_warn(&qp->device->dev, " %s sq_psn overflow, masking to 24 bits\n", __func__); attr->sq_psn &= 0xffffff; } } /* * Bind this qp to a counter automatically based on the rdma counter * rules. This only set in RST2INIT with port specified */ if (!qp->counter && (attr_mask & IB_QP_PORT) && ((attr_mask & IB_QP_STATE) && attr->qp_state == IB_QPS_INIT)) rdma_counter_bind_qp_auto(qp, attr->port_num); ret = ib_security_modify_qp(qp, attr, attr_mask, udata); if (ret) goto out; if (attr_mask & IB_QP_PORT) qp->port = attr->port_num; if (attr_mask & IB_QP_AV) qp->av_sgid_attr = rdma_update_sgid_attr(&attr->ah_attr, qp->av_sgid_attr); if (attr_mask & IB_QP_ALT_PATH) qp->alt_path_sgid_attr = rdma_update_sgid_attr( &attr->alt_ah_attr, qp->alt_path_sgid_attr); out: if (attr_mask & IB_QP_ALT_PATH) rdma_unfill_sgid_attr(&attr->alt_ah_attr, old_sgid_attr_alt_av); out_av: if (attr_mask & IB_QP_AV) { rdma_lag_put_ah_roce_slave(attr->xmit_slave); rdma_unfill_sgid_attr(&attr->ah_attr, old_sgid_attr_av); } return ret; } /** * ib_modify_qp_with_udata - Modifies the attributes for the specified QP. * @ib_qp: The QP to modify. * @attr: On input, specifies the QP attributes to modify. On output, * the current values of selected QP attributes are returned. * @attr_mask: A bit-mask used to specify which attributes of the QP * are being modified. * @udata: pointer to user's input output buffer information * are being modified. * It returns 0 on success and returns appropriate error code on error. */ int ib_modify_qp_with_udata(struct ib_qp *ib_qp, struct ib_qp_attr *attr, int attr_mask, struct ib_udata *udata) { return _ib_modify_qp(ib_qp->real_qp, attr, attr_mask, udata); } EXPORT_SYMBOL(ib_modify_qp_with_udata); static void ib_get_width_and_speed(u32 netdev_speed, u32 lanes, u16 *speed, u8 *width) { if (!lanes) { if (netdev_speed <= SPEED_1000) { *width = IB_WIDTH_1X; *speed = IB_SPEED_SDR; } else if (netdev_speed <= SPEED_10000) { *width = IB_WIDTH_1X; *speed = IB_SPEED_FDR10; } else if (netdev_speed <= SPEED_20000) { *width = IB_WIDTH_4X; *speed = IB_SPEED_DDR; } else if (netdev_speed <= SPEED_25000) { *width = IB_WIDTH_1X; *speed = IB_SPEED_EDR; } else if (netdev_speed <= SPEED_40000) { *width = IB_WIDTH_4X; *speed = IB_SPEED_FDR10; } else if (netdev_speed <= SPEED_50000) { *width = IB_WIDTH_2X; *speed = IB_SPEED_EDR; } else if (netdev_speed <= SPEED_100000) { *width = IB_WIDTH_4X; *speed = IB_SPEED_EDR; } else if (netdev_speed <= SPEED_200000) { *width = IB_WIDTH_4X; *speed = IB_SPEED_HDR; } else { *width = IB_WIDTH_4X; *speed = IB_SPEED_NDR; } return; } switch (lanes) { case 1: *width = IB_WIDTH_1X; break; case 2: *width = IB_WIDTH_2X; break; case 4: *width = IB_WIDTH_4X; break; case 8: *width = IB_WIDTH_8X; break; case 12: *width = IB_WIDTH_12X; break; default: *width = IB_WIDTH_1X; } switch (netdev_speed / lanes) { case SPEED_2500: *speed = IB_SPEED_SDR; break; case SPEED_5000: *speed = IB_SPEED_DDR; break; case SPEED_10000: *speed = IB_SPEED_FDR10; break; case SPEED_14000: *speed = IB_SPEED_FDR; break; case SPEED_25000: *speed = IB_SPEED_EDR; break; case SPEED_50000: *speed = IB_SPEED_HDR; break; case SPEED_100000: *speed = IB_SPEED_NDR; break; default: *speed = IB_SPEED_SDR; } } int ib_get_eth_speed(struct ib_device *dev, u32 port_num, u16 *speed, u8 *width) { int rc; u32 netdev_speed; struct net_device *netdev; struct ethtool_link_ksettings lksettings = {}; if (rdma_port_get_link_layer(dev, port_num) != IB_LINK_LAYER_ETHERNET) return -EINVAL; netdev = ib_device_get_netdev(dev, port_num); if (!netdev) return -ENODEV; rtnl_lock(); rc = __ethtool_get_link_ksettings(netdev, &lksettings); rtnl_unlock(); dev_put(netdev); if (!rc && lksettings.base.speed != (u32)SPEED_UNKNOWN) { netdev_speed = lksettings.base.speed; } else { netdev_speed = SPEED_1000; if (rc) pr_warn("%s speed is unknown, defaulting to %u\n", netdev->name, netdev_speed); } ib_get_width_and_speed(netdev_speed, lksettings.lanes, speed, width); return 0; } EXPORT_SYMBOL(ib_get_eth_speed); int ib_modify_qp(struct ib_qp *qp, struct ib_qp_attr *qp_attr, int qp_attr_mask) { return _ib_modify_qp(qp->real_qp, qp_attr, qp_attr_mask, NULL); } EXPORT_SYMBOL(ib_modify_qp); int ib_query_qp(struct ib_qp *qp, struct ib_qp_attr *qp_attr, int qp_attr_mask, struct ib_qp_init_attr *qp_init_attr) { qp_attr->ah_attr.grh.sgid_attr = NULL; qp_attr->alt_ah_attr.grh.sgid_attr = NULL; return qp->device->ops.query_qp ? qp->device->ops.query_qp(qp->real_qp, qp_attr, qp_attr_mask, qp_init_attr) : -EOPNOTSUPP; } EXPORT_SYMBOL(ib_query_qp); int ib_close_qp(struct ib_qp *qp) { struct ib_qp *real_qp; unsigned long flags; real_qp = qp->real_qp; if (real_qp == qp) return -EINVAL; spin_lock_irqsave(&real_qp->device->qp_open_list_lock, flags); list_del(&qp->open_list); spin_unlock_irqrestore(&real_qp->device->qp_open_list_lock, flags); atomic_dec(&real_qp->usecnt); if (qp->qp_sec) ib_close_shared_qp_security(qp->qp_sec); kfree(qp); return 0; } EXPORT_SYMBOL(ib_close_qp); static int __ib_destroy_shared_qp(struct ib_qp *qp) { struct ib_xrcd *xrcd; struct ib_qp *real_qp; int ret; real_qp = qp->real_qp; xrcd = real_qp->xrcd; down_write(&xrcd->tgt_qps_rwsem); ib_close_qp(qp); if (atomic_read(&real_qp->usecnt) == 0) xa_erase(&xrcd->tgt_qps, real_qp->qp_num); else real_qp = NULL; up_write(&xrcd->tgt_qps_rwsem); if (real_qp) { ret = ib_destroy_qp(real_qp); if (!ret) atomic_dec(&xrcd->usecnt); } return 0; } int ib_destroy_qp_user(struct ib_qp *qp, struct ib_udata *udata) { const struct ib_gid_attr *alt_path_sgid_attr = qp->alt_path_sgid_attr; const struct ib_gid_attr *av_sgid_attr = qp->av_sgid_attr; struct ib_qp_security *sec; int ret; WARN_ON_ONCE(qp->mrs_used > 0); if (atomic_read(&qp->usecnt)) return -EBUSY; if (qp->real_qp != qp) return __ib_destroy_shared_qp(qp); sec = qp->qp_sec; if (sec) ib_destroy_qp_security_begin(sec); if (!qp->uobject) rdma_rw_cleanup_mrs(qp); rdma_counter_unbind_qp(qp, qp->port, true); ret = qp->device->ops.destroy_qp(qp, udata); if (ret) { if (sec) ib_destroy_qp_security_abort(sec); return ret; } if (alt_path_sgid_attr) rdma_put_gid_attr(alt_path_sgid_attr); if (av_sgid_attr) rdma_put_gid_attr(av_sgid_attr); ib_qp_usecnt_dec(qp); if (sec) ib_destroy_qp_security_end(sec); rdma_restrack_del(&qp->res); kfree(qp); return ret; } EXPORT_SYMBOL(ib_destroy_qp_user); /* Completion queues */ struct ib_cq *__ib_create_cq(struct ib_device *device, ib_comp_handler comp_handler, void (*event_handler)(struct ib_event *, void *), void *cq_context, const struct ib_cq_init_attr *cq_attr, const char *caller) { struct ib_cq *cq; int ret; cq = rdma_zalloc_drv_obj(device, ib_cq); if (!cq) return ERR_PTR(-ENOMEM); cq->device = device; cq->uobject = NULL; cq->comp_handler = comp_handler; cq->event_handler = event_handler; cq->cq_context = cq_context; atomic_set(&cq->usecnt, 0); rdma_restrack_new(&cq->res, RDMA_RESTRACK_CQ); rdma_restrack_set_name(&cq->res, caller); ret = device->ops.create_cq(cq, cq_attr, NULL); if (ret) { rdma_restrack_put(&cq->res); kfree(cq); return ERR_PTR(ret); } rdma_restrack_add(&cq->res); return cq; } EXPORT_SYMBOL(__ib_create_cq); int rdma_set_cq_moderation(struct ib_cq *cq, u16 cq_count, u16 cq_period) { if (cq->shared) return -EOPNOTSUPP; return cq->device->ops.modify_cq ? cq->device->ops.modify_cq(cq, cq_count, cq_period) : -EOPNOTSUPP; } EXPORT_SYMBOL(rdma_set_cq_moderation); int ib_destroy_cq_user(struct ib_cq *cq, struct ib_udata *udata) { int ret; if (WARN_ON_ONCE(cq->shared)) return -EOPNOTSUPP; if (atomic_read(&cq->usecnt)) return -EBUSY; ret = cq->device->ops.destroy_cq(cq, udata); if (ret) return ret; rdma_restrack_del(&cq->res); kfree(cq); return ret; } EXPORT_SYMBOL(ib_destroy_cq_user); int ib_resize_cq(struct ib_cq *cq, int cqe) { if (cq->shared) return -EOPNOTSUPP; return cq->device->ops.resize_cq ? cq->device->ops.resize_cq(cq, cqe, NULL) : -EOPNOTSUPP; } EXPORT_SYMBOL(ib_resize_cq); /* Memory regions */ struct ib_mr *ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, u64 virt_addr, int access_flags) { struct ib_mr *mr; if (access_flags & IB_ACCESS_ON_DEMAND) { if (!(pd->device->attrs.kernel_cap_flags & IBK_ON_DEMAND_PAGING)) { pr_debug("ODP support not available\n"); return ERR_PTR(-EINVAL); } } mr = pd->device->ops.reg_user_mr(pd, start, length, virt_addr, access_flags, NULL, NULL); if (IS_ERR(mr)) return mr; mr->device = pd->device; mr->type = IB_MR_TYPE_USER; mr->pd = pd; mr->dm = NULL; atomic_inc(&pd->usecnt); mr->iova = virt_addr; mr->length = length; rdma_restrack_new(&mr->res, RDMA_RESTRACK_MR); rdma_restrack_parent_name(&mr->res, &pd->res); rdma_restrack_add(&mr->res); return mr; } EXPORT_SYMBOL(ib_reg_user_mr); int ib_advise_mr(struct ib_pd *pd, enum ib_uverbs_advise_mr_advice advice, u32 flags, struct ib_sge *sg_list, u32 num_sge) { if (!pd->device->ops.advise_mr) return -EOPNOTSUPP; if (!num_sge) return 0; return pd->device->ops.advise_mr(pd, advice, flags, sg_list, num_sge, NULL); } EXPORT_SYMBOL(ib_advise_mr); int ib_dereg_mr_user(struct ib_mr *mr, struct ib_udata *udata) { struct ib_pd *pd = mr->pd; struct ib_dm *dm = mr->dm; struct ib_dmah *dmah = mr->dmah; struct ib_sig_attrs *sig_attrs = mr->sig_attrs; int ret; trace_mr_dereg(mr); rdma_restrack_del(&mr->res); ret = mr->device->ops.dereg_mr(mr, udata); if (!ret) { atomic_dec(&pd->usecnt); if (dm) atomic_dec(&dm->usecnt); if (dmah) atomic_dec(&dmah->usecnt); kfree(sig_attrs); } return ret; } EXPORT_SYMBOL(ib_dereg_mr_user); /** * ib_alloc_mr() - Allocates a memory region * @pd: protection domain associated with the region * @mr_type: memory region type * @max_num_sg: maximum sg entries available for registration. * * Notes: * Memory registeration page/sg lists must not exceed max_num_sg. * For mr_type IB_MR_TYPE_MEM_REG, the total length cannot exceed * max_num_sg * used_page_size. * */ struct ib_mr *ib_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type, u32 max_num_sg) { struct ib_mr *mr; if (!pd->device->ops.alloc_mr) { mr = ERR_PTR(-EOPNOTSUPP); goto out; } if (mr_type == IB_MR_TYPE_INTEGRITY) { WARN_ON_ONCE(1); mr = ERR_PTR(-EINVAL); goto out; } mr = pd->device->ops.alloc_mr(pd, mr_type, max_num_sg); if (IS_ERR(mr)) goto out; mr->device = pd->device; mr->pd = pd; mr->dm = NULL; mr->uobject = NULL; atomic_inc(&pd->usecnt); mr->need_inval = false; mr->type = mr_type; mr->sig_attrs = NULL; rdma_restrack_new(&mr->res, RDMA_RESTRACK_MR); rdma_restrack_parent_name(&mr->res, &pd->res); rdma_restrack_add(&mr->res); out: trace_mr_alloc(pd, mr_type, max_num_sg, mr); return mr; } EXPORT_SYMBOL(ib_alloc_mr); /** * ib_alloc_mr_integrity() - Allocates an integrity memory region * @pd: protection domain associated with the region * @max_num_data_sg: maximum data sg entries available for registration * @max_num_meta_sg: maximum metadata sg entries available for * registration * * Notes: * Memory registration page/sg lists must not exceed max_num_sg, * also the integrity page/sg lists must not exceed max_num_meta_sg. * */ struct ib_mr *ib_alloc_mr_integrity(struct ib_pd *pd, u32 max_num_data_sg, u32 max_num_meta_sg) { struct ib_mr *mr; struct ib_sig_attrs *sig_attrs; if (!pd->device->ops.alloc_mr_integrity || !pd->device->ops.map_mr_sg_pi) { mr = ERR_PTR(-EOPNOTSUPP); goto out; } if (!max_num_meta_sg) { mr = ERR_PTR(-EINVAL); goto out; } sig_attrs = kzalloc(sizeof(struct ib_sig_attrs), GFP_KERNEL); if (!sig_attrs) { mr = ERR_PTR(-ENOMEM); goto out; } mr = pd->device->ops.alloc_mr_integrity(pd, max_num_data_sg, max_num_meta_sg); if (IS_ERR(mr)) { kfree(sig_attrs); goto out; } mr->device = pd->device; mr->pd = pd; mr->dm = NULL; mr->uobject = NULL; atomic_inc(&pd->usecnt); mr->need_inval = false; mr->type = IB_MR_TYPE_INTEGRITY; mr->sig_attrs = sig_attrs; rdma_restrack_new(&mr->res, RDMA_RESTRACK_MR); rdma_restrack_parent_name(&mr->res, &pd->res); rdma_restrack_add(&mr->res); out: trace_mr_integ_alloc(pd, max_num_data_sg, max_num_meta_sg, mr); return mr; } EXPORT_SYMBOL(ib_alloc_mr_integrity); /* Multicast groups */ static bool is_valid_mcast_lid(struct ib_qp *qp, u16 lid) { struct ib_qp_init_attr init_attr = {}; struct ib_qp_attr attr = {}; int num_eth_ports = 0; unsigned int port; /* If QP state >= init, it is assigned to a port and we can check this * port only. */ if (!ib_query_qp(qp, &attr, IB_QP_STATE | IB_QP_PORT, &init_attr)) { if (attr.qp_state >= IB_QPS_INIT) { if (rdma_port_get_link_layer(qp->device, attr.port_num) != IB_LINK_LAYER_INFINIBAND) return true; goto lid_check; } } /* Can't get a quick answer, iterate over all ports */ rdma_for_each_port(qp->device, port) if (rdma_port_get_link_layer(qp->device, port) != IB_LINK_LAYER_INFINIBAND) num_eth_ports++; /* If we have at lease one Ethernet port, RoCE annex declares that * multicast LID should be ignored. We can't tell at this step if the * QP belongs to an IB or Ethernet port. */ if (num_eth_ports) return true; /* If all the ports are IB, we can check according to IB spec. */ lid_check: return !(lid < be16_to_cpu(IB_MULTICAST_LID_BASE) || lid == be16_to_cpu(IB_LID_PERMISSIVE)); } int ib_attach_mcast(struct ib_qp *qp, union ib_gid *gid, u16 lid) { int ret; if (!qp->device->ops.attach_mcast) return -EOPNOTSUPP; if (!rdma_is_multicast_addr((struct in6_addr *)gid->raw) || qp->qp_type != IB_QPT_UD || !is_valid_mcast_lid(qp, lid)) return -EINVAL; ret = qp->device->ops.attach_mcast(qp, gid, lid); if (!ret) atomic_inc(&qp->usecnt); return ret; } EXPORT_SYMBOL(ib_attach_mcast); int ib_detach_mcast(struct ib_qp *qp, union ib_gid *gid, u16 lid) { int ret; if (!qp->device->ops.detach_mcast) return -EOPNOTSUPP; if (!rdma_is_multicast_addr((struct in6_addr *)gid->raw) || qp->qp_type != IB_QPT_UD || !is_valid_mcast_lid(qp, lid)) return -EINVAL; ret = qp->device->ops.detach_mcast(qp, gid, lid); if (!ret) atomic_dec(&qp->usecnt); return ret; } EXPORT_SYMBOL(ib_detach_mcast); /** * ib_alloc_xrcd_user - Allocates an XRC domain. * @device: The device on which to allocate the XRC domain. * @inode: inode to connect XRCD * @udata: Valid user data or NULL for kernel object */ struct ib_xrcd *ib_alloc_xrcd_user(struct ib_device *device, struct inode *inode, struct ib_udata *udata) { struct ib_xrcd *xrcd; int ret; if (!device->ops.alloc_xrcd) return ERR_PTR(-EOPNOTSUPP); xrcd = rdma_zalloc_drv_obj(device, ib_xrcd); if (!xrcd) return ERR_PTR(-ENOMEM); xrcd->device = device; xrcd->inode = inode; atomic_set(&xrcd->usecnt, 0); init_rwsem(&xrcd->tgt_qps_rwsem); xa_init(&xrcd->tgt_qps); ret = device->ops.alloc_xrcd(xrcd, udata); if (ret) goto err; return xrcd; err: kfree(xrcd); return ERR_PTR(ret); } EXPORT_SYMBOL(ib_alloc_xrcd_user); /** * ib_dealloc_xrcd_user - Deallocates an XRC domain. * @xrcd: The XRC domain to deallocate. * @udata: Valid user data or NULL for kernel object */ int ib_dealloc_xrcd_user(struct ib_xrcd *xrcd, struct ib_udata *udata) { int ret; if (atomic_read(&xrcd->usecnt)) return -EBUSY; WARN_ON(!xa_empty(&xrcd->tgt_qps)); ret = xrcd->device->ops.dealloc_xrcd(xrcd, udata); if (ret) return ret; kfree(xrcd); return ret; } EXPORT_SYMBOL(ib_dealloc_xrcd_user); /** * ib_create_wq - Creates a WQ associated with the specified protection * domain. * @pd: The protection domain associated with the WQ. * @wq_attr: A list of initial attributes required to create the * WQ. If WQ creation succeeds, then the attributes are updated to * the actual capabilities of the created WQ. * * wq_attr->max_wr and wq_attr->max_sge determine * the requested size of the WQ, and set to the actual values allocated * on return. * If ib_create_wq() succeeds, then max_wr and max_sge will always be * at least as large as the requested values. */ struct ib_wq *ib_create_wq(struct ib_pd *pd, struct ib_wq_init_attr *wq_attr) { struct ib_wq *wq; if (!pd->device->ops.create_wq) return ERR_PTR(-EOPNOTSUPP); wq = pd->device->ops.create_wq(pd, wq_attr, NULL); if (!IS_ERR(wq)) { wq->event_handler = wq_attr->event_handler; wq->wq_context = wq_attr->wq_context; wq->wq_type = wq_attr->wq_type; wq->cq = wq_attr->cq; wq->device = pd->device; wq->pd = pd; wq->uobject = NULL; atomic_inc(&pd->usecnt); atomic_inc(&wq_attr->cq->usecnt); atomic_set(&wq->usecnt, 0); } return wq; } EXPORT_SYMBOL(ib_create_wq); /** * ib_destroy_wq_user - Destroys the specified user WQ. * @wq: The WQ to destroy. * @udata: Valid user data */ int ib_destroy_wq_user(struct ib_wq *wq, struct ib_udata *udata) { struct ib_cq *cq = wq->cq; struct ib_pd *pd = wq->pd; int ret; if (atomic_read(&wq->usecnt)) return -EBUSY; ret = wq->device->ops.destroy_wq(wq, udata); if (ret) return ret; atomic_dec(&pd->usecnt); atomic_dec(&cq->usecnt); return ret; } EXPORT_SYMBOL(ib_destroy_wq_user); int ib_check_mr_status(struct ib_mr *mr, u32 check_mask, struct ib_mr_status *mr_status) { if (!mr->device->ops.check_mr_status) return -EOPNOTSUPP; return mr->device->ops.check_mr_status(mr, check_mask, mr_status); } EXPORT_SYMBOL(ib_check_mr_status); int ib_set_vf_link_state(struct ib_device *device, int vf, u32 port, int state) { if (!device->ops.set_vf_link_state) return -EOPNOTSUPP; return device->ops.set_vf_link_state(device, vf, port, state); } EXPORT_SYMBOL(ib_set_vf_link_state); int ib_get_vf_config(struct ib_device *device, int vf, u32 port, struct ifla_vf_info *info) { if (!device->ops.get_vf_config) return -EOPNOTSUPP; return device->ops.get_vf_config(device, vf, port, info); } EXPORT_SYMBOL(ib_get_vf_config); int ib_get_vf_stats(struct ib_device *device, int vf, u32 port, struct ifla_vf_stats *stats) { if (!device->ops.get_vf_stats) return -EOPNOTSUPP; return device->ops.get_vf_stats(device, vf, port, stats); } EXPORT_SYMBOL(ib_get_vf_stats); int ib_set_vf_guid(struct ib_device *device, int vf, u32 port, u64 guid, int type) { if (!device->ops.set_vf_guid) return -EOPNOTSUPP; return device->ops.set_vf_guid(device, vf, port, guid, type); } EXPORT_SYMBOL(ib_set_vf_guid); int ib_get_vf_guid(struct ib_device *device, int vf, u32 port, struct ifla_vf_guid *node_guid, struct ifla_vf_guid *port_guid) { if (!device->ops.get_vf_guid) return -EOPNOTSUPP; return device->ops.get_vf_guid(device, vf, port, node_guid, port_guid); } EXPORT_SYMBOL(ib_get_vf_guid); /** * ib_map_mr_sg_pi() - Map the dma mapped SG lists for PI (protection * information) and set an appropriate memory region for registration. * @mr: memory region * @data_sg: dma mapped scatterlist for data * @data_sg_nents: number of entries in data_sg * @data_sg_offset: offset in bytes into data_sg * @meta_sg: dma mapped scatterlist for metadata * @meta_sg_nents: number of entries in meta_sg * @meta_sg_offset: offset in bytes into meta_sg * @page_size: page vector desired page size * * Constraints: * - The MR must be allocated with type IB_MR_TYPE_INTEGRITY. * * Return: 0 on success. * * After this completes successfully, the memory region * is ready for registration. */ int ib_map_mr_sg_pi(struct ib_mr *mr, struct scatterlist *data_sg, int data_sg_nents, unsigned int *data_sg_offset, struct scatterlist *meta_sg, int meta_sg_nents, unsigned int *meta_sg_offset, unsigned int page_size) { if (unlikely(!mr->device->ops.map_mr_sg_pi || WARN_ON_ONCE(mr->type != IB_MR_TYPE_INTEGRITY))) return -EOPNOTSUPP; mr->page_size = page_size; return mr->device->ops.map_mr_sg_pi(mr, data_sg, data_sg_nents, data_sg_offset, meta_sg, meta_sg_nents, meta_sg_offset); } EXPORT_SYMBOL(ib_map_mr_sg_pi); /** * ib_map_mr_sg() - Map the largest prefix of a dma mapped SG list * and set it the memory region. * @mr: memory region * @sg: dma mapped scatterlist * @sg_nents: number of entries in sg * @sg_offset: offset in bytes into sg * @page_size: page vector desired page size * * Constraints: * * - The first sg element is allowed to have an offset. * - Each sg element must either be aligned to page_size or virtually * contiguous to the previous element. In case an sg element has a * non-contiguous offset, the mapping prefix will not include it. * - The last sg element is allowed to have length less than page_size. * - If sg_nents total byte length exceeds the mr max_num_sge * page_size * then only max_num_sg entries will be mapped. * - If the MR was allocated with type IB_MR_TYPE_SG_GAPS, none of these * constraints holds and the page_size argument is ignored. * * Returns the number of sg elements that were mapped to the memory region. * * After this completes successfully, the memory region * is ready for registration. */ int ib_map_mr_sg(struct ib_mr *mr, struct scatterlist *sg, int sg_nents, unsigned int *sg_offset, unsigned int page_size) { if (unlikely(!mr->device->ops.map_mr_sg)) return -EOPNOTSUPP; mr->page_size = page_size; return mr->device->ops.map_mr_sg(mr, sg, sg_nents, sg_offset); } EXPORT_SYMBOL(ib_map_mr_sg); /** * ib_sg_to_pages() - Convert the largest prefix of a sg list * to a page vector * @mr: memory region * @sgl: dma mapped scatterlist * @sg_nents: number of entries in sg * @sg_offset_p: ==== ======================================================= * IN start offset in bytes into sg * OUT offset in bytes for element n of the sg of the first * byte that has not been processed where n is the return * value of this function. * ==== ======================================================= * @set_page: driver page assignment function pointer * * Core service helper for drivers to convert the largest * prefix of given sg list to a page vector. The sg list * prefix converted is the prefix that meet the requirements * of ib_map_mr_sg. * * Returns the number of sg elements that were assigned to * a page vector. */ int ib_sg_to_pages(struct ib_mr *mr, struct scatterlist *sgl, int sg_nents, unsigned int *sg_offset_p, int (*set_page)(struct ib_mr *, u64)) { struct scatterlist *sg; u64 last_end_dma_addr = 0; unsigned int sg_offset = sg_offset_p ? *sg_offset_p : 0; unsigned int last_page_off = 0; u64 page_mask = ~((u64)mr->page_size - 1); int i, ret; if (unlikely(sg_nents <= 0 || sg_offset > sg_dma_len(&sgl[0]))) return -EINVAL; mr->iova = sg_dma_address(&sgl[0]) + sg_offset; mr->length = 0; for_each_sg(sgl, sg, sg_nents, i) { u64 dma_addr = sg_dma_address(sg) + sg_offset; u64 prev_addr = dma_addr; unsigned int dma_len = sg_dma_len(sg) - sg_offset; u64 end_dma_addr = dma_addr + dma_len; u64 page_addr = dma_addr & page_mask; /* * For the second and later elements, check whether either the * end of element i-1 or the start of element i is not aligned * on a page boundary. */ if (i && (last_page_off != 0 || page_addr != dma_addr)) { /* Stop mapping if there is a gap. */ if (last_end_dma_addr != dma_addr) break; /* * Coalesce this element with the last. If it is small * enough just update mr->length. Otherwise start * mapping from the next page. */ goto next_page; } do { ret = set_page(mr, page_addr); if (unlikely(ret < 0)) { sg_offset = prev_addr - sg_dma_address(sg); mr->length += prev_addr - dma_addr; if (sg_offset_p) *sg_offset_p = sg_offset; return i || sg_offset ? i : ret; } prev_addr = page_addr; next_page: page_addr += mr->page_size; } while (page_addr < end_dma_addr); mr->length += dma_len; last_end_dma_addr = end_dma_addr; last_page_off = end_dma_addr & ~page_mask; sg_offset = 0; } if (sg_offset_p) *sg_offset_p = 0; return i; } EXPORT_SYMBOL(ib_sg_to_pages); struct ib_drain_cqe { struct ib_cqe cqe; struct completion done; }; static void ib_drain_qp_done(struct ib_cq *cq, struct ib_wc *wc) { struct ib_drain_cqe *cqe = container_of(wc->wr_cqe, struct ib_drain_cqe, cqe); complete(&cqe->done); } /* * Post a WR and block until its completion is reaped for the SQ. */ static void __ib_drain_sq(struct ib_qp *qp) { struct ib_cq *cq = qp->send_cq; struct ib_qp_attr attr = { .qp_state = IB_QPS_ERR }; struct ib_drain_cqe sdrain; struct ib_rdma_wr swr = { .wr = { .next = NULL, { .wr_cqe = &sdrain.cqe, }, .opcode = IB_WR_RDMA_WRITE, }, }; int ret; ret = ib_modify_qp(qp, &attr, IB_QP_STATE); if (ret) { WARN_ONCE(ret, "failed to drain send queue: %d\n", ret); return; } sdrain.cqe.done = ib_drain_qp_done; init_completion(&sdrain.done); ret = ib_post_send(qp, &swr.wr, NULL); if (ret) { WARN_ONCE(ret, "failed to drain send queue: %d\n", ret); return; } if (cq->poll_ctx == IB_POLL_DIRECT) while (wait_for_completion_timeout(&sdrain.done, HZ / 10) <= 0) ib_process_cq_direct(cq, -1); else wait_for_completion(&sdrain.done); } /* * Post a WR and block until its completion is reaped for the RQ. */ static void __ib_drain_rq(struct ib_qp *qp) { struct ib_cq *cq = qp->recv_cq; struct ib_qp_attr attr = { .qp_state = IB_QPS_ERR }; struct ib_drain_cqe rdrain; struct ib_recv_wr rwr = {}; int ret; ret = ib_modify_qp(qp, &attr, IB_QP_STATE); if (ret) { WARN_ONCE(ret, "failed to drain recv queue: %d\n", ret); return; } rwr.wr_cqe = &rdrain.cqe; rdrain.cqe.done = ib_drain_qp_done; init_completion(&rdrain.done); ret = ib_post_recv(qp, &rwr, NULL); if (ret) { WARN_ONCE(ret, "failed to drain recv queue: %d\n", ret); return; } if (cq->poll_ctx == IB_POLL_DIRECT) while (wait_for_completion_timeout(&rdrain.done, HZ / 10) <= 0) ib_process_cq_direct(cq, -1); else wait_for_completion(&rdrain.done); } /* * __ib_drain_srq() - Block until Last WQE Reached event arrives, or timeout * expires. * @qp: queue pair associated with SRQ to drain * * Quoting 10.3.1 Queue Pair and EE Context States: * * Note, for QPs that are associated with an SRQ, the Consumer should take the * QP through the Error State before invoking a Destroy QP or a Modify QP to the * Reset State. The Consumer may invoke the Destroy QP without first performing * a Modify QP to the Error State and waiting for the Affiliated Asynchronous * Last WQE Reached Event. However, if the Consumer does not wait for the * Affiliated Asynchronous Last WQE Reached Event, then WQE and Data Segment * leakage may occur. Therefore, it is good programming practice to tear down a * QP that is associated with an SRQ by using the following process: * * - Put the QP in the Error State * - Wait for the Affiliated Asynchronous Last WQE Reached Event; * - either: * drain the CQ by invoking the Poll CQ verb and either wait for CQ * to be empty or the number of Poll CQ operations has exceeded * CQ capacity size; * - or * post another WR that completes on the same CQ and wait for this * WR to return as a WC; * - and then invoke a Destroy QP or Reset QP. * * We use the first option. */ static void __ib_drain_srq(struct ib_qp *qp) { struct ib_qp_attr attr = { .qp_state = IB_QPS_ERR }; struct ib_cq *cq; int n, polled = 0; int ret; if (!qp->srq) { WARN_ONCE(1, "QP 0x%p is not associated with SRQ\n", qp); return; } ret = ib_modify_qp(qp, &attr, IB_QP_STATE); if (ret) { WARN_ONCE(ret, "failed to drain shared recv queue: %d\n", ret); return; } if (ib_srq_has_cq(qp->srq->srq_type)) { cq = qp->srq->ext.cq; } else if (qp->recv_cq) { cq = qp->recv_cq; } else { WARN_ONCE(1, "QP 0x%p has no CQ associated with SRQ\n", qp); return; } if (wait_for_completion_timeout(&qp->srq_completion, 60 * HZ) > 0) { while (polled != cq->cqe) { n = ib_process_cq_direct(cq, cq->cqe - polled); if (!n) return; polled += n; } } } /** * ib_drain_sq() - Block until all SQ CQEs have been consumed by the * application. * @qp: queue pair to drain * * If the device has a provider-specific drain function, then * call that. Otherwise call the generic drain function * __ib_drain_sq(). * * The caller must: * * ensure there is room in the CQ and SQ for the drain work request and * completion. * * allocate the CQ using ib_alloc_cq(). * * ensure that there are no other contexts that are posting WRs concurrently. * Otherwise the drain is not guaranteed. */ void ib_drain_sq(struct ib_qp *qp) { if (qp->device->ops.drain_sq) qp->device->ops.drain_sq(qp); else __ib_drain_sq(qp); trace_cq_drain_complete(qp->send_cq); } EXPORT_SYMBOL(ib_drain_sq); /** * ib_drain_rq() - Block until all RQ CQEs have been consumed by the * application. * @qp: queue pair to drain * * If the device has a provider-specific drain function, then * call that. Otherwise call the generic drain function * __ib_drain_rq(). * * The caller must: * * ensure there is room in the CQ and RQ for the drain work request and * completion. * * allocate the CQ using ib_alloc_cq(). * * ensure that there are no other contexts that are posting WRs concurrently. * Otherwise the drain is not guaranteed. */ void ib_drain_rq(struct ib_qp *qp) { if (qp->device->ops.drain_rq) qp->device->ops.drain_rq(qp); else __ib_drain_rq(qp); trace_cq_drain_complete(qp->recv_cq); } EXPORT_SYMBOL(ib_drain_rq); /** * ib_drain_qp() - Block until all CQEs have been consumed by the * application on both the RQ and SQ. * @qp: queue pair to drain * * The caller must: * * ensure there is room in the CQ(s), SQ, and RQ for drain work requests * and completions. * * allocate the CQs using ib_alloc_cq(). * * ensure that there are no other contexts that are posting WRs concurrently. * Otherwise the drain is not guaranteed. */ void ib_drain_qp(struct ib_qp *qp) { ib_drain_sq(qp); if (!qp->srq) ib_drain_rq(qp); else __ib_drain_srq(qp); } EXPORT_SYMBOL(ib_drain_qp); struct net_device *rdma_alloc_netdev(struct ib_device *device, u32 port_num, enum rdma_netdev_t type, const char *name, unsigned char name_assign_type, void (*setup)(struct net_device *)) { struct rdma_netdev_alloc_params params; struct net_device *netdev; int rc; if (!device->ops.rdma_netdev_get_params) return ERR_PTR(-EOPNOTSUPP); rc = device->ops.rdma_netdev_get_params(device, port_num, type, ¶ms); if (rc) return ERR_PTR(rc); netdev = alloc_netdev_mqs(params.sizeof_priv, name, name_assign_type, setup, params.txqs, params.rxqs); if (!netdev) return ERR_PTR(-ENOMEM); return netdev; } EXPORT_SYMBOL(rdma_alloc_netdev); int rdma_init_netdev(struct ib_device *device, u32 port_num, enum rdma_netdev_t type, const char *name, unsigned char name_assign_type, void (*setup)(struct net_device *), struct net_device *netdev) { struct rdma_netdev_alloc_params params; int rc; if (!device->ops.rdma_netdev_get_params) return -EOPNOTSUPP; rc = device->ops.rdma_netdev_get_params(device, port_num, type, ¶ms); if (rc) return rc; return params.initialize_rdma_netdev(device, port_num, netdev, params.param); } EXPORT_SYMBOL(rdma_init_netdev); void __rdma_block_iter_start(struct ib_block_iter *biter, struct scatterlist *sglist, unsigned int nents, unsigned long pgsz) { memset(biter, 0, sizeof(struct ib_block_iter)); biter->__sg = sglist; biter->__sg_nents = nents; /* Driver provides best block size to use */ biter->__pg_bit = __fls(pgsz); } EXPORT_SYMBOL(__rdma_block_iter_start); bool __rdma_block_iter_next(struct ib_block_iter *biter) { unsigned int block_offset; unsigned int delta; if (!biter->__sg_nents || !biter->__sg) return false; biter->__dma_addr = sg_dma_address(biter->__sg) + biter->__sg_advance; block_offset = biter->__dma_addr & (BIT_ULL(biter->__pg_bit) - 1); delta = BIT_ULL(biter->__pg_bit) - block_offset; while (biter->__sg_nents && biter->__sg && sg_dma_len(biter->__sg) - biter->__sg_advance <= delta) { delta -= sg_dma_len(biter->__sg) - biter->__sg_advance; biter->__sg_advance = 0; biter->__sg = sg_next(biter->__sg); biter->__sg_nents--; } biter->__sg_advance += delta; return true; } EXPORT_SYMBOL(__rdma_block_iter_next); /** * rdma_alloc_hw_stats_struct - Helper function to allocate dynamic struct * for the drivers. * @descs: array of static descriptors * @num_counters: number of elements in array * @lifespan: milliseconds between updates */ struct rdma_hw_stats *rdma_alloc_hw_stats_struct( const struct rdma_stat_desc *descs, int num_counters, unsigned long lifespan) { struct rdma_hw_stats *stats; stats = kzalloc(struct_size(stats, value, num_counters), GFP_KERNEL); if (!stats) return NULL; stats->is_disabled = kcalloc(BITS_TO_LONGS(num_counters), sizeof(*stats->is_disabled), GFP_KERNEL); if (!stats->is_disabled) goto err; stats->descs = descs; stats->num_counters = num_counters; stats->lifespan = msecs_to_jiffies(lifespan); mutex_init(&stats->lock); return stats; err: kfree(stats); return NULL; } EXPORT_SYMBOL(rdma_alloc_hw_stats_struct); /** * rdma_free_hw_stats_struct - Helper function to release rdma_hw_stats * @stats: statistics to release */ void rdma_free_hw_stats_struct(struct rdma_hw_stats *stats) { if (!stats) return; kfree(stats->is_disabled); kfree(stats); } EXPORT_SYMBOL(rdma_free_hw_stats_struct); |
| 232 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 | /* SPDX-License-Identifier: GPL-2.0+ */ /* * Persistent object (dat entry/disk inode) allocator/deallocator * * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation. * * Originally written by Koji Sato. * Two allocators were unified by Ryusuke Konishi and Amagai Yoshiji. */ #ifndef _NILFS_ALLOC_H #define _NILFS_ALLOC_H #include <linux/types.h> #include <linux/buffer_head.h> #include <linux/fs.h> /** * nilfs_palloc_entries_per_group - get the number of entries per group * @inode: inode of metadata file using this allocator * * The number of entries per group is defined by the number of bits * that a bitmap block can maintain. * * Return: Number of entries per group. */ static inline unsigned long nilfs_palloc_entries_per_group(const struct inode *inode) { return 1UL << (inode->i_blkbits + 3 /* log2(8 = CHAR_BITS) */); } int nilfs_palloc_init_blockgroup(struct inode *, unsigned int); int nilfs_palloc_get_entry_block(struct inode *, __u64, int, struct buffer_head **); size_t nilfs_palloc_entry_offset(const struct inode *inode, __u64 nr, const struct buffer_head *bh); int nilfs_palloc_count_max_entries(struct inode *, u64, u64 *); /** * struct nilfs_palloc_req - persistent allocator request and reply * @pr_entry_nr: entry number (vblocknr or inode number) * @pr_desc_bh: buffer head of the buffer containing block group descriptors * @pr_bitmap_bh: buffer head of the buffer containing a block group bitmap * @pr_entry_bh: buffer head of the buffer containing translation entries */ struct nilfs_palloc_req { __u64 pr_entry_nr; struct buffer_head *pr_desc_bh; struct buffer_head *pr_bitmap_bh; struct buffer_head *pr_entry_bh; }; int nilfs_palloc_prepare_alloc_entry(struct inode *inode, struct nilfs_palloc_req *req, bool wrap); void nilfs_palloc_commit_alloc_entry(struct inode *, struct nilfs_palloc_req *); void nilfs_palloc_abort_alloc_entry(struct inode *, struct nilfs_palloc_req *); void nilfs_palloc_commit_free_entry(struct inode *, struct nilfs_palloc_req *); int nilfs_palloc_prepare_free_entry(struct inode *, struct nilfs_palloc_req *); void nilfs_palloc_abort_free_entry(struct inode *, struct nilfs_palloc_req *); int nilfs_palloc_freev(struct inode *, __u64 *, size_t); #define nilfs_set_bit_atomic ext2_set_bit_atomic #define nilfs_clear_bit_atomic ext2_clear_bit_atomic #define nilfs_find_next_zero_bit find_next_zero_bit_le #define nilfs_find_next_bit find_next_bit_le /** * struct nilfs_bh_assoc - block offset and buffer head association * @blkoff: block offset * @bh: buffer head */ struct nilfs_bh_assoc { unsigned long blkoff; struct buffer_head *bh; }; /** * struct nilfs_palloc_cache - persistent object allocator cache * @lock: cache protecting lock * @prev_desc: blockgroup descriptors cache * @prev_bitmap: blockgroup bitmap cache * @prev_entry: translation entries cache */ struct nilfs_palloc_cache { spinlock_t lock; struct nilfs_bh_assoc prev_desc; struct nilfs_bh_assoc prev_bitmap; struct nilfs_bh_assoc prev_entry; }; void nilfs_palloc_setup_cache(struct inode *inode, struct nilfs_palloc_cache *cache); void nilfs_palloc_clear_cache(struct inode *inode); void nilfs_palloc_destroy_cache(struct inode *inode); #endif /* _NILFS_ALLOC_H */ |
| 58 4 1 52 19 1 2 30 8 8 11 1 3 1 6 6 9 9 28 28 27 6 8 11 2 27 28 30 30 30 1 27 1 56 58 52 30 28 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 | // SPDX-License-Identifier: GPL-2.0-only /* Copyright (C) 2010: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org> * Copyright (C) 2015: Linus Lüssing <linus.luessing@c0d3.blue> * * Based on the MLD support added to br_multicast.c by YOSHIFUJI Hideaki. */ #include <linux/skbuff.h> #include <net/ipv6.h> #include <net/mld.h> #include <net/addrconf.h> #include <net/ip6_checksum.h> static int ipv6_mc_check_ip6hdr(struct sk_buff *skb) { const struct ipv6hdr *ip6h; unsigned int len; unsigned int offset = skb_network_offset(skb) + sizeof(*ip6h); if (!pskb_may_pull(skb, offset)) return -EINVAL; ip6h = ipv6_hdr(skb); if (ip6h->version != 6) return -EINVAL; len = offset + ntohs(ip6h->payload_len); if (skb->len < len || len <= offset) return -EINVAL; skb_set_transport_header(skb, offset); return 0; } static int ipv6_mc_check_exthdrs(struct sk_buff *skb) { const struct ipv6hdr *ip6h; int offset; u8 nexthdr; __be16 frag_off; ip6h = ipv6_hdr(skb); if (ip6h->nexthdr != IPPROTO_HOPOPTS) return -ENOMSG; nexthdr = ip6h->nexthdr; offset = skb_network_offset(skb) + sizeof(*ip6h); offset = ipv6_skip_exthdr(skb, offset, &nexthdr, &frag_off); if (offset < 0) return -EINVAL; if (nexthdr != IPPROTO_ICMPV6) return -ENOMSG; skb_set_transport_header(skb, offset); return 0; } static int ipv6_mc_check_mld_reportv2(struct sk_buff *skb) { unsigned int len = skb_transport_offset(skb); len += sizeof(struct mld2_report); return ipv6_mc_may_pull(skb, len) ? 0 : -EINVAL; } static int ipv6_mc_check_mld_query(struct sk_buff *skb) { unsigned int transport_len = ipv6_transport_len(skb); struct mld_msg *mld; unsigned int len; /* RFC2710+RFC3810 (MLDv1+MLDv2) require link-local source addresses */ if (!(ipv6_addr_type(&ipv6_hdr(skb)->saddr) & IPV6_ADDR_LINKLOCAL)) return -EINVAL; /* MLDv1? */ if (transport_len != sizeof(struct mld_msg)) { /* or MLDv2? */ if (transport_len < sizeof(struct mld2_query)) return -EINVAL; len = skb_transport_offset(skb) + sizeof(struct mld2_query); if (!ipv6_mc_may_pull(skb, len)) return -EINVAL; } mld = (struct mld_msg *)skb_transport_header(skb); /* RFC2710+RFC3810 (MLDv1+MLDv2) require the multicast link layer * all-nodes destination address (ff02::1) for general queries */ if (ipv6_addr_any(&mld->mld_mca) && !ipv6_addr_is_ll_all_nodes(&ipv6_hdr(skb)->daddr)) return -EINVAL; return 0; } static int ipv6_mc_check_mld_msg(struct sk_buff *skb) { unsigned int len = skb_transport_offset(skb) + sizeof(struct mld_msg); struct mld_msg *mld; if (!ipv6_mc_may_pull(skb, len)) return -ENODATA; mld = (struct mld_msg *)skb_transport_header(skb); switch (mld->mld_type) { case ICMPV6_MGM_REDUCTION: case ICMPV6_MGM_REPORT: return 0; case ICMPV6_MLD2_REPORT: return ipv6_mc_check_mld_reportv2(skb); case ICMPV6_MGM_QUERY: return ipv6_mc_check_mld_query(skb); default: return -ENODATA; } } static inline __sum16 ipv6_mc_validate_checksum(struct sk_buff *skb) { return skb_checksum_validate(skb, IPPROTO_ICMPV6, ip6_compute_pseudo); } static int ipv6_mc_check_icmpv6(struct sk_buff *skb) { unsigned int len = skb_transport_offset(skb) + sizeof(struct icmp6hdr); unsigned int transport_len = ipv6_transport_len(skb); struct sk_buff *skb_chk; if (!ipv6_mc_may_pull(skb, len)) return -EINVAL; skb_chk = skb_checksum_trimmed(skb, transport_len, ipv6_mc_validate_checksum); if (!skb_chk) return -EINVAL; if (skb_chk != skb) kfree_skb(skb_chk); return 0; } /** * ipv6_mc_check_mld - checks whether this is a sane MLD packet * @skb: the skb to validate * * Checks whether an IPv6 packet is a valid MLD packet. If so sets * skb transport header accordingly and returns zero. * * -EINVAL: A broken packet was detected, i.e. it violates some internet * standard * -ENOMSG: IP header validation succeeded but it is not an ICMPv6 packet * with a hop-by-hop option. * -ENODATA: IP+ICMPv6 header with hop-by-hop option validation succeeded * but it is not an MLD packet. * -ENOMEM: A memory allocation failure happened. * * Caller needs to set the skb network header and free any returned skb if it * differs from the provided skb. */ int ipv6_mc_check_mld(struct sk_buff *skb) { int ret; ret = ipv6_mc_check_ip6hdr(skb); if (ret < 0) return ret; ret = ipv6_mc_check_exthdrs(skb); if (ret < 0) return ret; ret = ipv6_mc_check_icmpv6(skb); if (ret < 0) return ret; return ipv6_mc_check_mld_msg(skb); } EXPORT_SYMBOL(ipv6_mc_check_mld); |
| 17 17 17 17 16 17 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 | // SPDX-License-Identifier: GPL-2.0 /* * PCI Message Signaled Interrupt (MSI) - irqdomain support */ #include <linux/acpi_iort.h> #include <linux/irqdomain.h> #include <linux/of_irq.h> #include "msi.h" int pci_msi_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) { struct irq_domain *domain; domain = dev_get_msi_domain(&dev->dev); if (domain && irq_domain_is_hierarchy(domain)) return msi_domain_alloc_irqs_all_locked(&dev->dev, MSI_DEFAULT_DOMAIN, nvec); return pci_msi_legacy_setup_msi_irqs(dev, nvec, type); } void pci_msi_teardown_msi_irqs(struct pci_dev *dev) { struct irq_domain *domain; domain = dev_get_msi_domain(&dev->dev); if (domain && irq_domain_is_hierarchy(domain)) { msi_domain_free_irqs_all_locked(&dev->dev, MSI_DEFAULT_DOMAIN); } else { pci_msi_legacy_teardown_msi_irqs(dev); msi_free_msi_descs(&dev->dev); } } /** * pci_msi_domain_write_msg - Helper to write MSI message to PCI config space * @irq_data: Pointer to interrupt data of the MSI interrupt * @msg: Pointer to the message */ static void pci_msi_domain_write_msg(struct irq_data *irq_data, struct msi_msg *msg) { struct msi_desc *desc = irq_data_get_msi_desc(irq_data); /* * For MSI-X desc->irq is always equal to irq_data->irq. For * MSI only the first interrupt of MULTI MSI passes the test. */ if (desc->irq == irq_data->irq) __pci_write_msi_msg(desc, msg); } /* * Per device MSI[-X] domain functionality */ static void pci_device_domain_set_desc(msi_alloc_info_t *arg, struct msi_desc *desc) { arg->desc = desc; arg->hwirq = desc->msi_index; } static void cond_shutdown_parent(struct irq_data *data) { struct msi_domain_info *info = data->domain->host_data; if (unlikely(info->flags & MSI_FLAG_PCI_MSI_STARTUP_PARENT)) irq_chip_shutdown_parent(data); else if (unlikely(info->flags & MSI_FLAG_PCI_MSI_MASK_PARENT)) irq_chip_mask_parent(data); } static unsigned int cond_startup_parent(struct irq_data *data) { struct msi_domain_info *info = data->domain->host_data; if (unlikely(info->flags & MSI_FLAG_PCI_MSI_STARTUP_PARENT)) return irq_chip_startup_parent(data); else if (unlikely(info->flags & MSI_FLAG_PCI_MSI_MASK_PARENT)) irq_chip_unmask_parent(data); return 0; } static void pci_irq_shutdown_msi(struct irq_data *data) { struct msi_desc *desc = irq_data_get_msi_desc(data); pci_msi_mask(desc, BIT(data->irq - desc->irq)); cond_shutdown_parent(data); } static unsigned int pci_irq_startup_msi(struct irq_data *data) { struct msi_desc *desc = irq_data_get_msi_desc(data); unsigned int ret = cond_startup_parent(data); pci_msi_unmask(desc, BIT(data->irq - desc->irq)); return ret; } static void pci_irq_mask_msi(struct irq_data *data) { struct msi_desc *desc = irq_data_get_msi_desc(data); pci_msi_mask(desc, BIT(data->irq - desc->irq)); } static void pci_irq_unmask_msi(struct irq_data *data) { struct msi_desc *desc = irq_data_get_msi_desc(data); pci_msi_unmask(desc, BIT(data->irq - desc->irq)); } #ifdef CONFIG_GENERIC_IRQ_RESERVATION_MODE # define MSI_REACTIVATE MSI_FLAG_MUST_REACTIVATE #else # define MSI_REACTIVATE 0 #endif #define MSI_COMMON_FLAGS (MSI_FLAG_FREE_MSI_DESCS | \ MSI_FLAG_ACTIVATE_EARLY | \ MSI_FLAG_DEV_SYSFS | \ MSI_REACTIVATE) static const struct msi_domain_template pci_msi_template = { .chip = { .name = "PCI-MSI", .irq_startup = pci_irq_startup_msi, .irq_shutdown = pci_irq_shutdown_msi, .irq_mask = pci_irq_mask_msi, .irq_unmask = pci_irq_unmask_msi, .irq_write_msi_msg = pci_msi_domain_write_msg, .flags = IRQCHIP_ONESHOT_SAFE, }, .ops = { .set_desc = pci_device_domain_set_desc, }, .info = { .flags = MSI_COMMON_FLAGS | MSI_FLAG_MULTI_PCI_MSI, .bus_token = DOMAIN_BUS_PCI_DEVICE_MSI, }, }; static void pci_irq_shutdown_msix(struct irq_data *data) { pci_msix_mask(irq_data_get_msi_desc(data)); cond_shutdown_parent(data); } static unsigned int pci_irq_startup_msix(struct irq_data *data) { unsigned int ret = cond_startup_parent(data); pci_msix_unmask(irq_data_get_msi_desc(data)); return ret; } static void pci_irq_mask_msix(struct irq_data *data) { pci_msix_mask(irq_data_get_msi_desc(data)); } static void pci_irq_unmask_msix(struct irq_data *data) { pci_msix_unmask(irq_data_get_msi_desc(data)); } void pci_msix_prepare_desc(struct irq_domain *domain, msi_alloc_info_t *arg, struct msi_desc *desc) { /* Don't fiddle with preallocated MSI descriptors */ if (!desc->pci.mask_base) msix_prepare_msi_desc(to_pci_dev(desc->dev), desc); } EXPORT_SYMBOL_GPL(pci_msix_prepare_desc); static const struct msi_domain_template pci_msix_template = { .chip = { .name = "PCI-MSIX", .irq_startup = pci_irq_startup_msix, .irq_shutdown = pci_irq_shutdown_msix, .irq_mask = pci_irq_mask_msix, .irq_unmask = pci_irq_unmask_msix, .irq_write_msi_msg = pci_msi_domain_write_msg, .flags = IRQCHIP_ONESHOT_SAFE, }, .ops = { .prepare_desc = pci_msix_prepare_desc, .set_desc = pci_device_domain_set_desc, }, .info = { .flags = MSI_COMMON_FLAGS | MSI_FLAG_PCI_MSIX | MSI_FLAG_PCI_MSIX_ALLOC_DYN, .bus_token = DOMAIN_BUS_PCI_DEVICE_MSIX, }, }; static bool pci_match_device_domain(struct pci_dev *pdev, enum irq_domain_bus_token bus_token) { return msi_match_device_irq_domain(&pdev->dev, MSI_DEFAULT_DOMAIN, bus_token); } static bool pci_create_device_domain(struct pci_dev *pdev, const struct msi_domain_template *tmpl, unsigned int hwsize) { struct irq_domain *domain = dev_get_msi_domain(&pdev->dev); if (!domain || !irq_domain_is_msi_parent(domain)) return true; return msi_create_device_irq_domain(&pdev->dev, MSI_DEFAULT_DOMAIN, tmpl, hwsize, NULL, NULL); } /** * pci_setup_msi_device_domain - Setup a device MSI interrupt domain * @pdev: The PCI device to create the domain on * @hwsize: The maximum number of MSI vectors * * Return: * True when: * - The device does not have a MSI parent irq domain associated, * which keeps the legacy architecture specific and the global * PCI/MSI domain models working * - The MSI domain exists already * - The MSI domain was successfully allocated * False when: * - MSI-X is enabled * - The domain creation fails. * * The created MSI domain is preserved until: * - The device is removed * - MSI is disabled and a MSI-X domain is created */ bool pci_setup_msi_device_domain(struct pci_dev *pdev, unsigned int hwsize) { if (WARN_ON_ONCE(pdev->msix_enabled)) return false; if (pci_match_device_domain(pdev, DOMAIN_BUS_PCI_DEVICE_MSI)) return true; if (pci_match_device_domain(pdev, DOMAIN_BUS_PCI_DEVICE_MSIX)) msi_remove_device_irq_domain(&pdev->dev, MSI_DEFAULT_DOMAIN); return pci_create_device_domain(pdev, &pci_msi_template, hwsize); } /** * pci_setup_msix_device_domain - Setup a device MSI-X interrupt domain * @pdev: The PCI device to create the domain on * @hwsize: The size of the MSI-X vector table * * Return: * True when: * - The device does not have a MSI parent irq domain associated, * which keeps the legacy architecture specific and the global * PCI/MSI domain models working * - The MSI-X domain exists already * - The MSI-X domain was successfully allocated * False when: * - MSI is enabled * - The domain creation fails. * * The created MSI-X domain is preserved until: * - The device is removed * - MSI-X is disabled and a MSI domain is created */ bool pci_setup_msix_device_domain(struct pci_dev *pdev, unsigned int hwsize) { if (WARN_ON_ONCE(pdev->msi_enabled)) return false; if (pci_match_device_domain(pdev, DOMAIN_BUS_PCI_DEVICE_MSIX)) return true; if (pci_match_device_domain(pdev, DOMAIN_BUS_PCI_DEVICE_MSI)) msi_remove_device_irq_domain(&pdev->dev, MSI_DEFAULT_DOMAIN); return pci_create_device_domain(pdev, &pci_msix_template, hwsize); } /** * pci_msi_domain_supports - Check for support of a particular feature flag * @pdev: The PCI device to operate on * @feature_mask: The feature mask to check for (full match) * @mode: If ALLOW_LEGACY this grants the feature when there is no irq domain * associated to the device. If DENY_LEGACY the lack of an irq domain * makes the feature unsupported */ bool pci_msi_domain_supports(struct pci_dev *pdev, unsigned int feature_mask, enum support_mode mode) { struct msi_domain_info *info; struct irq_domain *domain; unsigned int supported; domain = dev_get_msi_domain(&pdev->dev); if (!domain || !irq_domain_is_hierarchy(domain)) { if (IS_ENABLED(CONFIG_PCI_MSI_ARCH_FALLBACKS)) return mode == ALLOW_LEGACY; return false; } if (!irq_domain_is_msi_parent(domain)) { /* * For "global" PCI/MSI interrupt domains the associated * msi_domain_info::flags is the authoritative source of * information. */ info = domain->host_data; supported = info->flags; } else { /* * For MSI parent domains the supported feature set * is available in the parent ops. This makes checks * possible before actually instantiating the * per device domain because the parent is never * expanding the PCI/MSI functionality. */ supported = domain->msi_parent_ops->supported_flags; } return (supported & feature_mask) == feature_mask; } /* * Users of the generic MSI infrastructure expect a device to have a single ID, * so with DMA aliases we have to pick the least-worst compromise. Devices with * DMA phantom functions tend to still emit MSIs from the real function number, * so we ignore those and only consider topological aliases where either the * alias device or RID appears on a different bus number. We also make the * reasonable assumption that bridges are walked in an upstream direction (so * the last one seen wins), and the much braver assumption that the most likely * case is that of PCI->PCIe so we should always use the alias RID. This echoes * the logic from intel_irq_remapping's set_msi_sid(), which presumably works * well enough in practice; in the face of the horrible PCIe<->PCI-X conditions * for taking ownership all we can really do is close our eyes and hope... */ static int get_msi_id_cb(struct pci_dev *pdev, u16 alias, void *data) { u32 *pa = data; u8 bus = PCI_BUS_NUM(*pa); if (pdev->bus->number != bus || PCI_BUS_NUM(alias) != bus) *pa = alias; return 0; } /** * pci_msi_domain_get_msi_rid - Get the MSI requester id (RID) * @domain: The interrupt domain * @pdev: The PCI device. * * The RID for a device is formed from the alias, with a firmware * supplied mapping applied * * Returns: The RID. */ u32 pci_msi_domain_get_msi_rid(struct irq_domain *domain, struct pci_dev *pdev) { struct device_node *of_node; u32 rid = pci_dev_id(pdev); pci_for_each_dma_alias(pdev, get_msi_id_cb, &rid); of_node = irq_domain_get_of_node(domain); rid = of_node ? of_msi_xlate(&pdev->dev, &of_node, rid) : iort_msi_map_id(&pdev->dev, rid); return rid; } /** * pci_msi_map_rid_ctlr_node - Get the MSI controller fwnode_handle and MSI requester id (RID) * @domain: The interrupt domain * @pdev: The PCI device * @node: Pointer to store the MSI controller fwnode_handle * * Use the firmware data to find the MSI controller fwnode_handle for @pdev. * If found map the RID and initialize @node with it. @node value must * be set to NULL on entry. * * Returns: The RID. */ u32 pci_msi_map_rid_ctlr_node(struct irq_domain *domain, struct pci_dev *pdev, struct fwnode_handle **node) { u32 rid = pci_dev_id(pdev); pci_for_each_dma_alias(pdev, get_msi_id_cb, &rid); /* Check whether the domain fwnode is an OF node */ if (irq_domain_get_of_node(domain)) { struct device_node *msi_ctlr_node = NULL; rid = of_msi_xlate(&pdev->dev, &msi_ctlr_node, rid); if (msi_ctlr_node) *node = of_fwnode_handle(msi_ctlr_node); } else { rid = iort_msi_xlate(&pdev->dev, rid, node); } return rid; } /** * pci_msi_get_device_domain - Get the MSI domain for a given PCI device * @pdev: The PCI device * * Use the firmware data to find a device-specific MSI domain * (i.e. not one that is set as a default). * * Returns: The corresponding MSI domain or NULL if none has been found. */ struct irq_domain *pci_msi_get_device_domain(struct pci_dev *pdev) { struct irq_domain *dom; u32 rid = pci_dev_id(pdev); pci_for_each_dma_alias(pdev, get_msi_id_cb, &rid); dom = of_msi_map_get_device_domain(&pdev->dev, rid, DOMAIN_BUS_PCI_MSI); if (!dom) dom = iort_get_device_domain(&pdev->dev, rid, DOMAIN_BUS_PCI_MSI); return dom; } |
| 1 981 805 174 14 789 1 174 1047 982 986 526 970 17 624 638 616 14 628 227 267 20 642 20 640 19 510 632 204 204 202 203 858 810 839 1022 978 981 175 811 39 1013 1 899 1019 3 108 18 99 13 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef IOU_CORE_H #define IOU_CORE_H #include <linux/errno.h> #include <linux/lockdep.h> #include <linux/resume_user_mode.h> #include <linux/poll.h> #include <linux/io_uring_types.h> #include <uapi/linux/eventpoll.h> #include "alloc_cache.h" #include "io-wq.h" #include "slist.h" #include "tw.h" #include "opdef.h" #ifndef CREATE_TRACE_POINTS #include <trace/events/io_uring.h> #endif struct io_rings_layout { /* size of CQ + headers + SQ offset array */ size_t rings_size; size_t sq_size; size_t sq_array_offset; }; struct io_ctx_config { struct io_uring_params p; struct io_rings_layout layout; struct io_uring_params __user *uptr; }; #define IORING_FEAT_FLAGS (IORING_FEAT_SINGLE_MMAP |\ IORING_FEAT_NODROP |\ IORING_FEAT_SUBMIT_STABLE |\ IORING_FEAT_RW_CUR_POS |\ IORING_FEAT_CUR_PERSONALITY |\ IORING_FEAT_FAST_POLL |\ IORING_FEAT_POLL_32BITS |\ IORING_FEAT_SQPOLL_NONFIXED |\ IORING_FEAT_EXT_ARG |\ IORING_FEAT_NATIVE_WORKERS |\ IORING_FEAT_RSRC_TAGS |\ IORING_FEAT_CQE_SKIP |\ IORING_FEAT_LINKED_FILE |\ IORING_FEAT_REG_REG_RING |\ IORING_FEAT_RECVSEND_BUNDLE |\ IORING_FEAT_MIN_TIMEOUT |\ IORING_FEAT_RW_ATTR |\ IORING_FEAT_NO_IOWAIT) #define IORING_SETUP_FLAGS (IORING_SETUP_IOPOLL |\ IORING_SETUP_SQPOLL |\ IORING_SETUP_SQ_AFF |\ IORING_SETUP_CQSIZE |\ IORING_SETUP_CLAMP |\ IORING_SETUP_ATTACH_WQ |\ IORING_SETUP_R_DISABLED |\ IORING_SETUP_SUBMIT_ALL |\ IORING_SETUP_COOP_TASKRUN |\ IORING_SETUP_TASKRUN_FLAG |\ IORING_SETUP_SQE128 |\ IORING_SETUP_CQE32 |\ IORING_SETUP_SINGLE_ISSUER |\ IORING_SETUP_DEFER_TASKRUN |\ IORING_SETUP_NO_MMAP |\ IORING_SETUP_REGISTERED_FD_ONLY |\ IORING_SETUP_NO_SQARRAY |\ IORING_SETUP_HYBRID_IOPOLL |\ IORING_SETUP_CQE_MIXED |\ IORING_SETUP_SQE_MIXED |\ IORING_SETUP_SQ_REWIND) #define IORING_ENTER_FLAGS (IORING_ENTER_GETEVENTS |\ IORING_ENTER_SQ_WAKEUP |\ IORING_ENTER_SQ_WAIT |\ IORING_ENTER_EXT_ARG |\ IORING_ENTER_REGISTERED_RING |\ IORING_ENTER_ABS_TIMER |\ IORING_ENTER_EXT_ARG_REG |\ IORING_ENTER_NO_IOWAIT) #define SQE_VALID_FLAGS (IOSQE_FIXED_FILE |\ IOSQE_IO_DRAIN |\ IOSQE_IO_LINK |\ IOSQE_IO_HARDLINK |\ IOSQE_ASYNC |\ IOSQE_BUFFER_SELECT |\ IOSQE_CQE_SKIP_SUCCESS) #define IO_REQ_LINK_FLAGS (REQ_F_LINK | REQ_F_HARDLINK) /* * Complaint timeout for io_uring cancelation exits, and for io-wq exit * worker waiting. */ #define IO_URING_EXIT_WAIT_MAX (HZ * 60 * 5) enum { IOU_COMPLETE = 0, IOU_ISSUE_SKIP_COMPLETE = -EIOCBQUEUED, /* * The request has more work to do and should be retried. io_uring will * attempt to wait on the file for eligible opcodes, but otherwise * it'll be handed to iowq for blocking execution. It works for normal * requests as well as for the multi shot mode. */ IOU_RETRY = -EAGAIN, /* * Requeue the task_work to restart operations on this request. The * actual value isn't important, should just be not an otherwise * valid error code, yet less than -MAX_ERRNO and valid internally. */ IOU_REQUEUE = -3072, }; struct io_defer_entry { struct list_head list; struct io_kiocb *req; }; struct io_wait_queue { struct wait_queue_entry wq; struct io_ring_ctx *ctx; unsigned cq_tail; unsigned cq_min_tail; unsigned nr_timeouts; int hit_timeout; ktime_t min_timeout; ktime_t timeout; struct hrtimer t; #ifdef CONFIG_NET_RX_BUSY_POLL ktime_t napi_busy_poll_dt; bool napi_prefer_busy_poll; #endif }; static inline bool io_should_wake(struct io_wait_queue *iowq) { struct io_ring_ctx *ctx = iowq->ctx; int dist = READ_ONCE(ctx->rings->cq.tail) - (int) iowq->cq_tail; /* * Wake up if we have enough events, or if a timeout occurred since we * started waiting. For timeouts, we always want to return to userspace, * regardless of event count. */ return dist >= 0 || atomic_read(&ctx->cq_timeouts) != iowq->nr_timeouts; } #define IORING_MAX_ENTRIES 32768 #define IORING_MAX_CQ_ENTRIES (2 * IORING_MAX_ENTRIES) int io_prepare_config(struct io_ctx_config *config); bool io_cqe_cache_refill(struct io_ring_ctx *ctx, bool overflow, bool cqe32); void io_req_defer_failed(struct io_kiocb *req, s32 res); bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags); void io_add_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags); bool io_req_post_cqe(struct io_kiocb *req, s32 res, u32 cflags); bool io_req_post_cqe32(struct io_kiocb *req, struct io_uring_cqe src_cqe[2]); void __io_commit_cqring_flush(struct io_ring_ctx *ctx); unsigned io_linked_nr(struct io_kiocb *req); void io_req_track_inflight(struct io_kiocb *req); struct file *io_file_get_normal(struct io_kiocb *req, int fd); struct file *io_file_get_fixed(struct io_kiocb *req, int fd, unsigned issue_flags); void io_req_task_queue(struct io_kiocb *req); void io_req_task_complete(struct io_tw_req tw_req, io_tw_token_t tw); void io_req_task_queue_fail(struct io_kiocb *req, int ret); void io_req_task_submit(struct io_tw_req tw_req, io_tw_token_t tw); __cold void io_uring_drop_tctx_refs(struct task_struct *task); int io_ring_add_registered_file(struct io_uring_task *tctx, struct file *file, int start, int end); void io_req_queue_iowq(struct io_kiocb *req); int io_poll_issue(struct io_kiocb *req, io_tw_token_t tw); int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr); int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin); __cold void io_iopoll_try_reap_events(struct io_ring_ctx *ctx); void __io_submit_flush_completions(struct io_ring_ctx *ctx); struct io_wq_work *io_wq_free_work(struct io_wq_work *work); void io_wq_submit_work(struct io_wq_work *work); void io_free_req(struct io_kiocb *req); void io_queue_next(struct io_kiocb *req); void io_task_refs_refill(struct io_uring_task *tctx); bool __io_alloc_req_refill(struct io_ring_ctx *ctx); void io_activate_pollwq(struct io_ring_ctx *ctx); void io_restriction_clone(struct io_restriction *dst, struct io_restriction *src); static inline void io_lockdep_assert_cq_locked(struct io_ring_ctx *ctx) { #if defined(CONFIG_PROVE_LOCKING) lockdep_assert(in_task()); if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) lockdep_assert_held(&ctx->uring_lock); if (ctx->flags & IORING_SETUP_IOPOLL) { lockdep_assert_held(&ctx->uring_lock); } else if (!ctx->task_complete) { lockdep_assert_held(&ctx->completion_lock); } else if (ctx->submitter_task) { /* * ->submitter_task may be NULL and we can still post a CQE, * if the ring has been setup with IORING_SETUP_R_DISABLED. * Not from an SQE, as those cannot be submitted, but via * updating tagged resources. */ if (!percpu_ref_is_dying(&ctx->refs)) lockdep_assert(current == ctx->submitter_task); } #endif } static inline bool io_is_compat(struct io_ring_ctx *ctx) { return IS_ENABLED(CONFIG_COMPAT) && unlikely(ctx->compat); } static inline void io_submit_flush_completions(struct io_ring_ctx *ctx) { if (!wq_list_empty(&ctx->submit_state.compl_reqs) || ctx->submit_state.cq_flush) __io_submit_flush_completions(ctx); } #define io_for_each_link(pos, head) \ for (pos = (head); pos; pos = pos->link) static inline bool io_get_cqe_overflow(struct io_ring_ctx *ctx, struct io_uring_cqe **ret, bool overflow, bool cqe32) { io_lockdep_assert_cq_locked(ctx); if (unlikely(ctx->cqe_sentinel - ctx->cqe_cached < (cqe32 + 1))) { if (unlikely(!io_cqe_cache_refill(ctx, overflow, cqe32))) return false; } *ret = ctx->cqe_cached; ctx->cached_cq_tail++; ctx->cqe_cached++; if (ctx->flags & IORING_SETUP_CQE32) { ctx->cqe_cached++; } else if (cqe32 && ctx->flags & IORING_SETUP_CQE_MIXED) { ctx->cqe_cached++; ctx->cached_cq_tail++; } WARN_ON_ONCE(ctx->cqe_cached > ctx->cqe_sentinel); return true; } static inline bool io_get_cqe(struct io_ring_ctx *ctx, struct io_uring_cqe **ret, bool cqe32) { return io_get_cqe_overflow(ctx, ret, false, cqe32); } static inline bool io_defer_get_uncommited_cqe(struct io_ring_ctx *ctx, struct io_uring_cqe **cqe_ret) { io_lockdep_assert_cq_locked(ctx); ctx->submit_state.cq_flush = true; return io_get_cqe(ctx, cqe_ret, ctx->flags & IORING_SETUP_CQE_MIXED); } static __always_inline bool io_fill_cqe_req(struct io_ring_ctx *ctx, struct io_kiocb *req) { bool is_cqe32 = req->cqe.flags & IORING_CQE_F_32; struct io_uring_cqe *cqe; /* * If we can't get a cq entry, userspace overflowed the submission * (by quite a lot). */ if (unlikely(!io_get_cqe(ctx, &cqe, is_cqe32))) return false; memcpy(cqe, &req->cqe, sizeof(*cqe)); if (ctx->flags & IORING_SETUP_CQE32 || is_cqe32) { memcpy(cqe->big_cqe, &req->big_cqe, sizeof(*cqe)); memset(&req->big_cqe, 0, sizeof(req->big_cqe)); } if (trace_io_uring_complete_enabled()) trace_io_uring_complete(req->ctx, req, cqe); return true; } static inline void req_set_fail(struct io_kiocb *req) { req->flags |= REQ_F_FAIL; if (req->flags & REQ_F_CQE_SKIP) { req->flags &= ~REQ_F_CQE_SKIP; req->flags |= REQ_F_SKIP_LINK_CQES; } } static inline void io_req_set_res(struct io_kiocb *req, s32 res, u32 cflags) { req->cqe.res = res; req->cqe.flags = cflags; } static inline u32 ctx_cqe32_flags(struct io_ring_ctx *ctx) { if (ctx->flags & IORING_SETUP_CQE_MIXED) return IORING_CQE_F_32; return 0; } static inline void io_req_set_res32(struct io_kiocb *req, s32 res, u32 cflags, __u64 extra1, __u64 extra2) { req->cqe.res = res; req->cqe.flags = cflags | ctx_cqe32_flags(req->ctx); req->big_cqe.extra1 = extra1; req->big_cqe.extra2 = extra2; } static inline void *io_uring_alloc_async_data(struct io_alloc_cache *cache, struct io_kiocb *req) { if (cache) { req->async_data = io_cache_alloc(cache, GFP_KERNEL); } else { const struct io_issue_def *def = &io_issue_defs[req->opcode]; WARN_ON_ONCE(!def->async_size); req->async_data = kmalloc(def->async_size, GFP_KERNEL); } if (req->async_data) req->flags |= REQ_F_ASYNC_DATA; return req->async_data; } static inline bool req_has_async_data(struct io_kiocb *req) { return req->flags & REQ_F_ASYNC_DATA; } static inline void io_req_async_data_clear(struct io_kiocb *req, io_req_flags_t extra_flags) { req->flags &= ~(REQ_F_ASYNC_DATA|extra_flags); req->async_data = NULL; } static inline void io_req_async_data_free(struct io_kiocb *req) { kfree(req->async_data); io_req_async_data_clear(req, 0); } static inline void io_put_file(struct io_kiocb *req) { if (!(req->flags & REQ_F_FIXED_FILE) && req->file) fput(req->file); } static inline void io_ring_submit_unlock(struct io_ring_ctx *ctx, unsigned issue_flags) { lockdep_assert_held(&ctx->uring_lock); if (unlikely(issue_flags & IO_URING_F_UNLOCKED)) mutex_unlock(&ctx->uring_lock); } static inline void io_ring_submit_lock(struct io_ring_ctx *ctx, unsigned issue_flags) { /* * "Normal" inline submissions always hold the uring_lock, since we * grab it from the system call. Same is true for the SQPOLL offload. * The only exception is when we've detached the request and issue it * from an async worker thread, grab the lock for that case. */ if (unlikely(issue_flags & IO_URING_F_UNLOCKED)) mutex_lock(&ctx->uring_lock); lockdep_assert_held(&ctx->uring_lock); } static inline void io_commit_cqring(struct io_ring_ctx *ctx) { /* order cqe stores with ring update */ smp_store_release(&ctx->rings->cq.tail, ctx->cached_cq_tail); } static inline void __io_wq_wake(struct wait_queue_head *wq) { /* * * Pass in EPOLLIN|EPOLL_URING_WAKE as the poll wakeup key. The latter * set in the mask so that if we recurse back into our own poll * waitqueue handlers, we know we have a dependency between eventfd or * epoll and should terminate multishot poll at that point. */ if (wq_has_sleeper(wq)) __wake_up(wq, TASK_NORMAL, 0, poll_to_key(EPOLL_URING_WAKE | EPOLLIN)); } static inline void io_poll_wq_wake(struct io_ring_ctx *ctx) { __io_wq_wake(&ctx->poll_wq); } static inline void io_cqring_wake(struct io_ring_ctx *ctx) { /* * Trigger waitqueue handler on all waiters on our waitqueue. This * won't necessarily wake up all the tasks, io_should_wake() will make * that decision. */ __io_wq_wake(&ctx->cq_wait); } static inline bool io_sqring_full(struct io_ring_ctx *ctx) { struct io_rings *r = ctx->rings; /* * SQPOLL must use the actual sqring head, as using the cached_sq_head * is race prone if the SQPOLL thread has grabbed entries but not yet * committed them to the ring. For !SQPOLL, this doesn't matter, but * since this helper is just used for SQPOLL sqring waits (or POLLOUT), * just read the actual sqring head unconditionally. */ return READ_ONCE(r->sq.tail) - READ_ONCE(r->sq.head) == ctx->sq_entries; } static inline unsigned int io_sqring_entries(struct io_ring_ctx *ctx) { struct io_rings *rings = ctx->rings; unsigned int entries; /* make sure SQ entry isn't read before tail */ entries = smp_load_acquire(&rings->sq.tail) - ctx->cached_sq_head; return min(entries, ctx->sq_entries); } /* * Don't complete immediately but use deferred completion infrastructure. * Protected by ->uring_lock and can only be used either with * IO_URING_F_COMPLETE_DEFER or inside a tw handler holding the mutex. */ static inline void io_req_complete_defer(struct io_kiocb *req) __must_hold(&req->ctx->uring_lock) { struct io_submit_state *state = &req->ctx->submit_state; lockdep_assert_held(&req->ctx->uring_lock); wq_list_add_tail(&req->comp_list, &state->compl_reqs); } static inline void io_commit_cqring_flush(struct io_ring_ctx *ctx) { if (unlikely(ctx->off_timeout_used || ctx->has_evfd || ctx->poll_activated)) __io_commit_cqring_flush(ctx); } static inline void io_get_task_refs(int nr) { struct io_uring_task *tctx = current->io_uring; tctx->cached_refs -= nr; if (unlikely(tctx->cached_refs < 0)) io_task_refs_refill(tctx); } static inline bool io_req_cache_empty(struct io_ring_ctx *ctx) { return !ctx->submit_state.free_list.next; } extern struct kmem_cache *req_cachep; static inline struct io_kiocb *io_extract_req(struct io_ring_ctx *ctx) { struct io_kiocb *req; req = container_of(ctx->submit_state.free_list.next, struct io_kiocb, comp_list); wq_stack_extract(&ctx->submit_state.free_list); return req; } static inline bool io_alloc_req(struct io_ring_ctx *ctx, struct io_kiocb **req) { if (unlikely(io_req_cache_empty(ctx))) { if (!__io_alloc_req_refill(ctx)) return false; } *req = io_extract_req(ctx); return true; } static inline void io_req_queue_tw_complete(struct io_kiocb *req, s32 res) { io_req_set_res(req, res, 0); req->io_task_work.func = io_req_task_complete; io_req_task_work_add(req); } static inline bool io_file_can_poll(struct io_kiocb *req) { if (req->flags & REQ_F_CAN_POLL) return true; if (req->file && file_can_poll(req->file)) { req->flags |= REQ_F_CAN_POLL; return true; } return false; } static inline ktime_t io_get_time(struct io_ring_ctx *ctx) { if (ctx->clockid == CLOCK_MONOTONIC) return ktime_get(); return ktime_get_with_offset(ctx->clock_offset); } enum { IO_CHECK_CQ_OVERFLOW_BIT, IO_CHECK_CQ_DROPPED_BIT, }; static inline bool io_has_work(struct io_ring_ctx *ctx) { return test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq) || io_local_work_pending(ctx); } #endif |
| 2 2 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 | // SPDX-License-Identifier: GPL-2.0-only /* * stack_o2cb.c * * Code which interfaces ocfs2 with the o2cb stack. * * Copyright (C) 2007 Oracle. All rights reserved. */ #include <linux/kernel.h> #include <linux/crc32.h> #include <linux/slab.h> #include <linux/module.h> /* Needed for AOP_TRUNCATED_PAGE in mlog_errno() */ #include <linux/fs.h> #include "cluster/masklog.h" #include "cluster/nodemanager.h" #include "cluster/heartbeat.h" #include "cluster/tcp.h" #include "stackglue.h" struct o2dlm_private { struct dlm_eviction_cb op_eviction_cb; }; static struct ocfs2_stack_plugin o2cb_stack; /* These should be identical */ #if (DLM_LOCK_IV != LKM_IVMODE) # error Lock modes do not match #endif #if (DLM_LOCK_NL != LKM_NLMODE) # error Lock modes do not match #endif #if (DLM_LOCK_CR != LKM_CRMODE) # error Lock modes do not match #endif #if (DLM_LOCK_CW != LKM_CWMODE) # error Lock modes do not match #endif #if (DLM_LOCK_PR != LKM_PRMODE) # error Lock modes do not match #endif #if (DLM_LOCK_PW != LKM_PWMODE) # error Lock modes do not match #endif #if (DLM_LOCK_EX != LKM_EXMODE) # error Lock modes do not match #endif static inline int mode_to_o2dlm(int mode) { BUG_ON(mode > LKM_MAXMODE); return mode; } static int flags_to_o2dlm(u32 flags) { int o2dlm_flags = 0; if (flags & DLM_LKF_NOQUEUE) o2dlm_flags |= LKM_NOQUEUE; if (flags & DLM_LKF_CANCEL) o2dlm_flags |= LKM_CANCEL; if (flags & DLM_LKF_CONVERT) o2dlm_flags |= LKM_CONVERT; if (flags & DLM_LKF_VALBLK) o2dlm_flags |= LKM_VALBLK; if (flags & DLM_LKF_IVVALBLK) o2dlm_flags |= LKM_INVVALBLK; if (flags & DLM_LKF_ORPHAN) o2dlm_flags |= LKM_ORPHAN; if (flags & DLM_LKF_FORCEUNLOCK) o2dlm_flags |= LKM_FORCE; if (flags & DLM_LKF_TIMEOUT) o2dlm_flags |= LKM_TIMEOUT; if (flags & DLM_LKF_LOCAL) o2dlm_flags |= LKM_LOCAL; return o2dlm_flags; } /* * Map an o2dlm status to standard errno values. * * o2dlm only uses a handful of these, and returns even fewer to the * caller. Still, we try to assign sane values to each error. * * The following value pairs have special meanings to dlmglue, thus * the right hand side needs to stay unique - never duplicate the * mapping elsewhere in the table! * * DLM_NORMAL: 0 * DLM_NOTQUEUED: -EAGAIN * DLM_CANCELGRANT: -EBUSY * DLM_CANCEL: -DLM_ECANCEL */ /* Keep in sync with dlmapi.h */ static int status_map[] = { [DLM_NORMAL] = 0, /* Success */ [DLM_GRANTED] = -EINVAL, [DLM_DENIED] = -EACCES, [DLM_DENIED_NOLOCKS] = -EACCES, [DLM_WORKING] = -EACCES, [DLM_BLOCKED] = -EINVAL, [DLM_BLOCKED_ORPHAN] = -EINVAL, [DLM_DENIED_GRACE_PERIOD] = -EACCES, [DLM_SYSERR] = -ENOMEM, /* It is what it is */ [DLM_NOSUPPORT] = -EPROTO, [DLM_CANCELGRANT] = -EBUSY, /* Cancel after grant */ [DLM_IVLOCKID] = -EINVAL, [DLM_SYNC] = -EINVAL, [DLM_BADTYPE] = -EINVAL, [DLM_BADRESOURCE] = -EINVAL, [DLM_MAXHANDLES] = -ENOMEM, [DLM_NOCLINFO] = -EINVAL, [DLM_NOLOCKMGR] = -EINVAL, [DLM_NOPURGED] = -EINVAL, [DLM_BADARGS] = -EINVAL, [DLM_VOID] = -EINVAL, [DLM_NOTQUEUED] = -EAGAIN, /* Trylock failed */ [DLM_IVBUFLEN] = -EINVAL, [DLM_CVTUNGRANT] = -EPERM, [DLM_BADPARAM] = -EINVAL, [DLM_VALNOTVALID] = -EINVAL, [DLM_REJECTED] = -EPERM, [DLM_ABORT] = -EINVAL, [DLM_CANCEL] = -DLM_ECANCEL, /* Successful cancel */ [DLM_IVRESHANDLE] = -EINVAL, [DLM_DEADLOCK] = -EDEADLK, [DLM_DENIED_NOASTS] = -EINVAL, [DLM_FORWARD] = -EINVAL, [DLM_TIMEOUT] = -ETIMEDOUT, [DLM_IVGROUPID] = -EINVAL, [DLM_VERS_CONFLICT] = -EOPNOTSUPP, [DLM_BAD_DEVICE_PATH] = -ENOENT, [DLM_NO_DEVICE_PERMISSION] = -EPERM, [DLM_NO_CONTROL_DEVICE] = -ENOENT, [DLM_RECOVERING] = -ENOTCONN, [DLM_MIGRATING] = -ERESTART, [DLM_MAXSTATS] = -EINVAL, }; static int dlm_status_to_errno(enum dlm_status status) { BUG_ON(status < 0 || status >= ARRAY_SIZE(status_map)); return status_map[status]; } static void o2dlm_lock_ast_wrapper(void *astarg) { struct ocfs2_dlm_lksb *lksb = astarg; lksb->lksb_conn->cc_proto->lp_lock_ast(lksb); } static void o2dlm_blocking_ast_wrapper(void *astarg, int level) { struct ocfs2_dlm_lksb *lksb = astarg; lksb->lksb_conn->cc_proto->lp_blocking_ast(lksb, level); } static void o2dlm_unlock_ast_wrapper(void *astarg, enum dlm_status status) { struct ocfs2_dlm_lksb *lksb = astarg; int error = dlm_status_to_errno(status); /* * In o2dlm, you can get both the lock_ast() for the lock being * granted and the unlock_ast() for the CANCEL failing. A * successful cancel sends DLM_NORMAL here. If the * lock grant happened before the cancel arrived, you get * DLM_CANCELGRANT. * * There's no need for the double-ast. If we see DLM_CANCELGRANT, * we just ignore it. We expect the lock_ast() to handle the * granted lock. */ if (status == DLM_CANCELGRANT) return; lksb->lksb_conn->cc_proto->lp_unlock_ast(lksb, error); } static int o2cb_dlm_lock(struct ocfs2_cluster_connection *conn, int mode, struct ocfs2_dlm_lksb *lksb, u32 flags, void *name, unsigned int namelen) { enum dlm_status status; int o2dlm_mode = mode_to_o2dlm(mode); int o2dlm_flags = flags_to_o2dlm(flags); int ret; status = dlmlock(conn->cc_lockspace, o2dlm_mode, &lksb->lksb_o2dlm, o2dlm_flags, name, namelen, o2dlm_lock_ast_wrapper, lksb, o2dlm_blocking_ast_wrapper); ret = dlm_status_to_errno(status); return ret; } static int o2cb_dlm_unlock(struct ocfs2_cluster_connection *conn, struct ocfs2_dlm_lksb *lksb, u32 flags) { enum dlm_status status; int o2dlm_flags = flags_to_o2dlm(flags); int ret; status = dlmunlock(conn->cc_lockspace, &lksb->lksb_o2dlm, o2dlm_flags, o2dlm_unlock_ast_wrapper, lksb); ret = dlm_status_to_errno(status); return ret; } static int o2cb_dlm_lock_status(struct ocfs2_dlm_lksb *lksb) { return dlm_status_to_errno(lksb->lksb_o2dlm.status); } /* * o2dlm always has a "valid" LVB. If the dlm loses track of the LVB * contents, it will zero out the LVB. Thus the caller can always trust * the contents. */ static int o2cb_dlm_lvb_valid(struct ocfs2_dlm_lksb *lksb) { return 1; } static void *o2cb_dlm_lvb(struct ocfs2_dlm_lksb *lksb) { return (void *)(lksb->lksb_o2dlm.lvb); } static void o2cb_dump_lksb(struct ocfs2_dlm_lksb *lksb) { dlm_print_one_lock(lksb->lksb_o2dlm.lockid); } /* * Check if this node is heartbeating and is connected to all other * heartbeating nodes. */ static int o2cb_cluster_check(void) { u8 node_num; int i; unsigned long hbmap[BITS_TO_LONGS(O2NM_MAX_NODES)]; unsigned long netmap[BITS_TO_LONGS(O2NM_MAX_NODES)]; node_num = o2nm_this_node(); if (node_num == O2NM_MAX_NODES) { printk(KERN_ERR "o2cb: This node has not been configured.\n"); return -EINVAL; } /* * o2dlm expects o2net sockets to be created. If not, then * dlm_join_domain() fails with a stack of errors which are both cryptic * and incomplete. The idea here is to detect upfront whether we have * managed to connect to all nodes or not. If not, then list the nodes * to allow the user to check the configuration (incorrect IP, firewall, * etc.) Yes, this is racy. But its not the end of the world. */ #define O2CB_MAP_STABILIZE_COUNT 60 for (i = 0; i < O2CB_MAP_STABILIZE_COUNT; ++i) { o2hb_fill_node_map(hbmap, O2NM_MAX_NODES); if (!test_bit(node_num, hbmap)) { printk(KERN_ERR "o2cb: %s heartbeat has not been " "started.\n", (o2hb_global_heartbeat_active() ? "Global" : "Local")); return -EINVAL; } o2net_fill_node_map(netmap, O2NM_MAX_NODES); /* Force set the current node to allow easy compare */ set_bit(node_num, netmap); if (bitmap_equal(hbmap, netmap, O2NM_MAX_NODES)) return 0; if (i < O2CB_MAP_STABILIZE_COUNT - 1) msleep(1000); } printk(KERN_ERR "o2cb: This node could not connect to nodes:"); i = -1; while ((i = find_next_bit(hbmap, O2NM_MAX_NODES, i + 1)) < O2NM_MAX_NODES) { if (!test_bit(i, netmap)) printk(" %u", i); } printk(".\n"); return -ENOTCONN; } /* * Called from the dlm when it's about to evict a node. This is how the * classic stack signals node death. */ static void o2dlm_eviction_cb(int node_num, void *data) { struct ocfs2_cluster_connection *conn = data; printk(KERN_NOTICE "o2cb: o2dlm has evicted node %d from domain %.*s\n", node_num, conn->cc_namelen, conn->cc_name); conn->cc_recovery_handler(node_num, conn->cc_recovery_data); } static int o2cb_cluster_connect(struct ocfs2_cluster_connection *conn) { int rc = 0; u32 dlm_key; struct dlm_ctxt *dlm; struct o2dlm_private *priv; struct dlm_protocol_version fs_version; BUG_ON(conn == NULL); BUG_ON(conn->cc_proto == NULL); /* Ensure cluster stack is up and all nodes are connected */ rc = o2cb_cluster_check(); if (rc) { printk(KERN_ERR "o2cb: Cluster check failed. Fix errors " "before retrying.\n"); goto out; } priv = kzalloc(sizeof(struct o2dlm_private), GFP_KERNEL); if (!priv) { rc = -ENOMEM; goto out_free; } /* This just fills the structure in. It is safe to pass conn. */ dlm_setup_eviction_cb(&priv->op_eviction_cb, o2dlm_eviction_cb, conn); conn->cc_private = priv; /* used by the dlm code to make message headers unique, each * node in this domain must agree on this. */ dlm_key = crc32_le(0, conn->cc_name, conn->cc_namelen); fs_version.pv_major = conn->cc_version.pv_major; fs_version.pv_minor = conn->cc_version.pv_minor; dlm = dlm_register_domain(conn->cc_name, dlm_key, &fs_version); if (IS_ERR(dlm)) { rc = PTR_ERR(dlm); mlog_errno(rc); goto out_free; } conn->cc_version.pv_major = fs_version.pv_major; conn->cc_version.pv_minor = fs_version.pv_minor; conn->cc_lockspace = dlm; dlm_register_eviction_cb(dlm, &priv->op_eviction_cb); out_free: if (rc) kfree(conn->cc_private); out: return rc; } static int o2cb_cluster_disconnect(struct ocfs2_cluster_connection *conn) { struct dlm_ctxt *dlm = conn->cc_lockspace; struct o2dlm_private *priv = conn->cc_private; dlm_unregister_eviction_cb(&priv->op_eviction_cb); conn->cc_private = NULL; kfree(priv); dlm_unregister_domain(dlm); conn->cc_lockspace = NULL; return 0; } static int o2cb_cluster_this_node(struct ocfs2_cluster_connection *conn, unsigned int *node) { int node_num; node_num = o2nm_this_node(); if (node_num == O2NM_INVALID_NODE_NUM) return -ENOENT; if (node_num >= O2NM_MAX_NODES) return -EOVERFLOW; *node = node_num; return 0; } static const struct ocfs2_stack_operations o2cb_stack_ops = { .connect = o2cb_cluster_connect, .disconnect = o2cb_cluster_disconnect, .this_node = o2cb_cluster_this_node, .dlm_lock = o2cb_dlm_lock, .dlm_unlock = o2cb_dlm_unlock, .lock_status = o2cb_dlm_lock_status, .lvb_valid = o2cb_dlm_lvb_valid, .lock_lvb = o2cb_dlm_lvb, .dump_lksb = o2cb_dump_lksb, }; static struct ocfs2_stack_plugin o2cb_stack = { .sp_name = "o2cb", .sp_ops = &o2cb_stack_ops, .sp_owner = THIS_MODULE, }; static int __init o2cb_stack_init(void) { return ocfs2_stack_glue_register(&o2cb_stack); } static void __exit o2cb_stack_exit(void) { ocfs2_stack_glue_unregister(&o2cb_stack); } MODULE_AUTHOR("Oracle"); MODULE_DESCRIPTION("ocfs2 driver for the classic o2cb stack"); MODULE_LICENSE("GPL"); module_init(o2cb_stack_init); module_exit(o2cb_stack_exit); |
| 1827 931 1132 4 26 4 6 6 6 203 110 137 1132 929 218 1372 294 1508 1511 342 344 344 1399 556 6363 3 3 3 79 3 276 296 174 174 23 298 299 5495 5494 61 299 296 61 5499 5330 300 6108 6123 5515 2103 275 2210 1829 1133 1827 370 298 193 193 10 10 13 3 10 1 3963 142 69 3801 13 6 751 6 753 750 6 6 1037 1031 26 753 340 348 2 338 11 348 344 73 73 48 1087 1090 1089 985 146 644 292 6320 6325 633 850 2215 2196 1088 988 709 34 2221 4499 4496 4515 1823 1818 2 13 13 61 72 50 9 98 113 2 1 222 98 4 122 125 38 116 115 109 1107 1104 371 370 371 371 275 274 274 275 267 267 25 25 25 11 11 11 1 817 81 82 817 818 818 816 816 794 25 82 182 636 118 696 276 274 275 2 275 71 71 6 6 5 6 26 26 26 26 26 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 | // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2001 Jens Axboe <axboe@kernel.dk> */ #include <linux/mm.h> #include <linux/swap.h> #include <linux/bio-integrity.h> #include <linux/blkdev.h> #include <linux/uio.h> #include <linux/iocontext.h> #include <linux/slab.h> #include <linux/init.h> #include <linux/kernel.h> #include <linux/export.h> #include <linux/mempool.h> #include <linux/workqueue.h> #include <linux/cgroup.h> #include <linux/highmem.h> #include <linux/blk-crypto.h> #include <linux/xarray.h> #include <trace/events/block.h> #include "blk.h" #include "blk-rq-qos.h" #include "blk-cgroup.h" #define ALLOC_CACHE_THRESHOLD 16 #define ALLOC_CACHE_MAX 256 struct bio_alloc_cache { struct bio *free_list; struct bio *free_list_irq; unsigned int nr; unsigned int nr_irq; }; static struct biovec_slab { int nr_vecs; char *name; struct kmem_cache *slab; } bvec_slabs[] __read_mostly = { { .nr_vecs = 16, .name = "biovec-16" }, { .nr_vecs = 64, .name = "biovec-64" }, { .nr_vecs = 128, .name = "biovec-128" }, { .nr_vecs = BIO_MAX_VECS, .name = "biovec-max" }, }; static struct biovec_slab *biovec_slab(unsigned short nr_vecs) { switch (nr_vecs) { /* smaller bios use inline vecs */ case 5 ... 16: return &bvec_slabs[0]; case 17 ... 64: return &bvec_slabs[1]; case 65 ... 128: return &bvec_slabs[2]; case 129 ... BIO_MAX_VECS: return &bvec_slabs[3]; default: BUG(); return NULL; } } /* * fs_bio_set is the bio_set containing bio and iovec memory pools used by * IO code that does not need private memory pools. */ struct bio_set fs_bio_set; EXPORT_SYMBOL(fs_bio_set); /* * Our slab pool management */ struct bio_slab { struct kmem_cache *slab; unsigned int slab_ref; unsigned int slab_size; char name[12]; }; static DEFINE_MUTEX(bio_slab_lock); static DEFINE_XARRAY(bio_slabs); static struct bio_slab *create_bio_slab(unsigned int size) { struct bio_slab *bslab = kzalloc(sizeof(*bslab), GFP_KERNEL); if (!bslab) return NULL; snprintf(bslab->name, sizeof(bslab->name), "bio-%d", size); bslab->slab = kmem_cache_create(bslab->name, size, ARCH_KMALLOC_MINALIGN, SLAB_HWCACHE_ALIGN | SLAB_TYPESAFE_BY_RCU, NULL); if (!bslab->slab) goto fail_alloc_slab; bslab->slab_ref = 1; bslab->slab_size = size; if (!xa_err(xa_store(&bio_slabs, size, bslab, GFP_KERNEL))) return bslab; kmem_cache_destroy(bslab->slab); fail_alloc_slab: kfree(bslab); return NULL; } static inline unsigned int bs_bio_slab_size(struct bio_set *bs) { return bs->front_pad + sizeof(struct bio) + bs->back_pad; } static struct kmem_cache *bio_find_or_create_slab(struct bio_set *bs) { unsigned int size = bs_bio_slab_size(bs); struct bio_slab *bslab; mutex_lock(&bio_slab_lock); bslab = xa_load(&bio_slabs, size); if (bslab) bslab->slab_ref++; else bslab = create_bio_slab(size); mutex_unlock(&bio_slab_lock); if (bslab) return bslab->slab; return NULL; } static void bio_put_slab(struct bio_set *bs) { struct bio_slab *bslab = NULL; unsigned int slab_size = bs_bio_slab_size(bs); mutex_lock(&bio_slab_lock); bslab = xa_load(&bio_slabs, slab_size); if (WARN(!bslab, KERN_ERR "bio: unable to find slab!\n")) goto out; WARN_ON_ONCE(bslab->slab != bs->bio_slab); WARN_ON(!bslab->slab_ref); if (--bslab->slab_ref) goto out; xa_erase(&bio_slabs, slab_size); kmem_cache_destroy(bslab->slab); kfree(bslab); out: mutex_unlock(&bio_slab_lock); } void bvec_free(mempool_t *pool, struct bio_vec *bv, unsigned short nr_vecs) { BUG_ON(nr_vecs > BIO_MAX_VECS); if (nr_vecs == BIO_MAX_VECS) mempool_free(bv, pool); else if (nr_vecs > BIO_INLINE_VECS) kmem_cache_free(biovec_slab(nr_vecs)->slab, bv); } /* * Make the first allocation restricted and don't dump info on allocation * failures, since we'll fall back to the mempool in case of failure. */ static inline gfp_t bvec_alloc_gfp(gfp_t gfp) { return (gfp & ~(__GFP_DIRECT_RECLAIM | __GFP_IO)) | __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN; } struct bio_vec *bvec_alloc(mempool_t *pool, unsigned short *nr_vecs, gfp_t gfp_mask) { struct biovec_slab *bvs = biovec_slab(*nr_vecs); if (WARN_ON_ONCE(!bvs)) return NULL; /* * Upgrade the nr_vecs request to take full advantage of the allocation. * We also rely on this in the bvec_free path. */ *nr_vecs = bvs->nr_vecs; /* * Try a slab allocation first for all smaller allocations. If that * fails and __GFP_DIRECT_RECLAIM is set retry with the mempool. * The mempool is sized to handle up to BIO_MAX_VECS entries. */ if (*nr_vecs < BIO_MAX_VECS) { struct bio_vec *bvl; bvl = kmem_cache_alloc(bvs->slab, bvec_alloc_gfp(gfp_mask)); if (likely(bvl) || !(gfp_mask & __GFP_DIRECT_RECLAIM)) return bvl; *nr_vecs = BIO_MAX_VECS; } return mempool_alloc(pool, gfp_mask); } void bio_uninit(struct bio *bio) { #ifdef CONFIG_BLK_CGROUP if (bio->bi_blkg) { blkg_put(bio->bi_blkg); bio->bi_blkg = NULL; } #endif if (bio_integrity(bio)) bio_integrity_free(bio); bio_crypt_free_ctx(bio); } EXPORT_SYMBOL(bio_uninit); static void bio_free(struct bio *bio) { struct bio_set *bs = bio->bi_pool; void *p = bio; WARN_ON_ONCE(!bs); bio_uninit(bio); bvec_free(&bs->bvec_pool, bio->bi_io_vec, bio->bi_max_vecs); mempool_free(p - bs->front_pad, &bs->bio_pool); } /* * Users of this function have their own bio allocation. Subsequently, * they must remember to pair any call to bio_init() with bio_uninit() * when IO has completed, or when the bio is released. */ void bio_init(struct bio *bio, struct block_device *bdev, struct bio_vec *table, unsigned short max_vecs, blk_opf_t opf) { bio->bi_next = NULL; bio->bi_bdev = bdev; bio->bi_opf = opf; bio->bi_flags = 0; bio->bi_ioprio = 0; bio->bi_write_hint = 0; bio->bi_write_stream = 0; bio->bi_status = 0; bio->bi_bvec_gap_bit = 0; bio->bi_iter.bi_sector = 0; bio->bi_iter.bi_size = 0; bio->bi_iter.bi_idx = 0; bio->bi_iter.bi_bvec_done = 0; bio->bi_end_io = NULL; bio->bi_private = NULL; #ifdef CONFIG_BLK_CGROUP bio->bi_blkg = NULL; bio->issue_time_ns = 0; if (bdev) bio_associate_blkg(bio); #ifdef CONFIG_BLK_CGROUP_IOCOST bio->bi_iocost_cost = 0; #endif #endif #ifdef CONFIG_BLK_INLINE_ENCRYPTION bio->bi_crypt_context = NULL; #endif #ifdef CONFIG_BLK_DEV_INTEGRITY bio->bi_integrity = NULL; #endif bio->bi_vcnt = 0; atomic_set(&bio->__bi_remaining, 1); atomic_set(&bio->__bi_cnt, 1); bio->bi_cookie = BLK_QC_T_NONE; bio->bi_max_vecs = max_vecs; bio->bi_io_vec = table; bio->bi_pool = NULL; } EXPORT_SYMBOL(bio_init); /** * bio_reset - reinitialize a bio * @bio: bio to reset * @bdev: block device to use the bio for * @opf: operation and flags for bio * * Description: * After calling bio_reset(), @bio will be in the same state as a freshly * allocated bio returned bio bio_alloc_bioset() - the only fields that are * preserved are the ones that are initialized by bio_alloc_bioset(). See * comment in struct bio. */ void bio_reset(struct bio *bio, struct block_device *bdev, blk_opf_t opf) { struct bio_vec *bv = bio->bi_io_vec; bio_uninit(bio); memset(bio, 0, BIO_RESET_BYTES); atomic_set(&bio->__bi_remaining, 1); bio->bi_io_vec = bv; bio->bi_bdev = bdev; if (bio->bi_bdev) bio_associate_blkg(bio); bio->bi_opf = opf; } EXPORT_SYMBOL(bio_reset); /** * bio_reuse - reuse a bio with the payload left intact * @bio: bio to reuse * @opf: operation and flags for the next I/O * * Allow reusing an existing bio for another operation with all set up * fields including the payload, device and end_io handler left intact. * * Typically used when @bio is first used to read data which is then written * to another location without modification. @bio must not be in-flight and * owned by the caller. Can't be used for cloned bios. * * Note: Can't be used when @bio has integrity or blk-crypto contexts for now. * Feel free to add that support when you need it, though. */ void bio_reuse(struct bio *bio, blk_opf_t opf) { unsigned short vcnt = bio->bi_vcnt, i; bio_end_io_t *end_io = bio->bi_end_io; void *private = bio->bi_private; WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED)); WARN_ON_ONCE(bio_integrity(bio)); WARN_ON_ONCE(bio_has_crypt_ctx(bio)); bio_reset(bio, bio->bi_bdev, opf); for (i = 0; i < vcnt; i++) bio->bi_iter.bi_size += bio->bi_io_vec[i].bv_len; bio->bi_vcnt = vcnt; bio->bi_private = private; bio->bi_end_io = end_io; } EXPORT_SYMBOL_GPL(bio_reuse); static struct bio *__bio_chain_endio(struct bio *bio) { struct bio *parent = bio->bi_private; if (bio->bi_status && !parent->bi_status) parent->bi_status = bio->bi_status; bio_put(bio); return parent; } /* * This function should only be used as a flag and must never be called. * If execution reaches here, it indicates a serious programming error. */ static void bio_chain_endio(struct bio *bio) { BUG(); } /** * bio_chain - chain bio completions * @bio: the target bio * @parent: the parent bio of @bio * * The caller won't have a bi_end_io called when @bio completes - instead, * @parent's bi_end_io won't be called until both @parent and @bio have * completed; the chained bio will also be freed when it completes. * * The caller must not set bi_private or bi_end_io in @bio. */ void bio_chain(struct bio *bio, struct bio *parent) { BUG_ON(bio->bi_private || bio->bi_end_io); bio->bi_private = parent; bio->bi_end_io = bio_chain_endio; bio_inc_remaining(parent); } EXPORT_SYMBOL(bio_chain); /** * bio_chain_and_submit - submit a bio after chaining it to another one * @prev: bio to chain and submit * @new: bio to chain to * * If @prev is non-NULL, chain it to @new and submit it. * * Return: @new. */ struct bio *bio_chain_and_submit(struct bio *prev, struct bio *new) { if (prev) { bio_chain(prev, new); submit_bio(prev); } return new; } struct bio *blk_next_bio(struct bio *bio, struct block_device *bdev, unsigned int nr_pages, blk_opf_t opf, gfp_t gfp) { return bio_chain_and_submit(bio, bio_alloc(bdev, nr_pages, opf, gfp)); } EXPORT_SYMBOL_GPL(blk_next_bio); static void bio_alloc_rescue(struct work_struct *work) { struct bio_set *bs = container_of(work, struct bio_set, rescue_work); struct bio *bio; while (1) { spin_lock(&bs->rescue_lock); bio = bio_list_pop(&bs->rescue_list); spin_unlock(&bs->rescue_lock); if (!bio) break; submit_bio_noacct(bio); } } static void punt_bios_to_rescuer(struct bio_set *bs) { struct bio_list punt, nopunt; struct bio *bio; if (WARN_ON_ONCE(!bs->rescue_workqueue)) return; /* * In order to guarantee forward progress we must punt only bios that * were allocated from this bio_set; otherwise, if there was a bio on * there for a stacking driver higher up in the stack, processing it * could require allocating bios from this bio_set, and doing that from * our own rescuer would be bad. * * Since bio lists are singly linked, pop them all instead of trying to * remove from the middle of the list: */ bio_list_init(&punt); bio_list_init(&nopunt); while ((bio = bio_list_pop(¤t->bio_list[0]))) bio_list_add(bio->bi_pool == bs ? &punt : &nopunt, bio); current->bio_list[0] = nopunt; bio_list_init(&nopunt); while ((bio = bio_list_pop(¤t->bio_list[1]))) bio_list_add(bio->bi_pool == bs ? &punt : &nopunt, bio); current->bio_list[1] = nopunt; spin_lock(&bs->rescue_lock); bio_list_merge(&bs->rescue_list, &punt); spin_unlock(&bs->rescue_lock); queue_work(bs->rescue_workqueue, &bs->rescue_work); } static void bio_alloc_irq_cache_splice(struct bio_alloc_cache *cache) { unsigned long flags; /* cache->free_list must be empty */ if (WARN_ON_ONCE(cache->free_list)) return; local_irq_save(flags); cache->free_list = cache->free_list_irq; cache->free_list_irq = NULL; cache->nr += cache->nr_irq; cache->nr_irq = 0; local_irq_restore(flags); } static struct bio *bio_alloc_percpu_cache(struct block_device *bdev, unsigned short nr_vecs, blk_opf_t opf, gfp_t gfp, struct bio_set *bs) { struct bio_alloc_cache *cache; struct bio *bio; cache = per_cpu_ptr(bs->cache, get_cpu()); if (!cache->free_list) { if (READ_ONCE(cache->nr_irq) >= ALLOC_CACHE_THRESHOLD) bio_alloc_irq_cache_splice(cache); if (!cache->free_list) { put_cpu(); return NULL; } } bio = cache->free_list; cache->free_list = bio->bi_next; cache->nr--; put_cpu(); if (nr_vecs) bio_init_inline(bio, bdev, nr_vecs, opf); else bio_init(bio, bdev, NULL, nr_vecs, opf); bio->bi_pool = bs; return bio; } /** * bio_alloc_bioset - allocate a bio for I/O * @bdev: block device to allocate the bio for (can be %NULL) * @nr_vecs: number of bvecs to pre-allocate * @opf: operation and flags for bio * @gfp_mask: the GFP_* mask given to the slab allocator * @bs: the bio_set to allocate from. * * Allocate a bio from the mempools in @bs. * * If %__GFP_DIRECT_RECLAIM is set then bio_alloc will always be able to * allocate a bio. This is due to the mempool guarantees. To make this work, * callers must never allocate more than 1 bio at a time from the general pool. * Callers that need to allocate more than 1 bio must always submit the * previously allocated bio for IO before attempting to allocate a new one. * Failure to do so can cause deadlocks under memory pressure. * * Note that when running under submit_bio_noacct() (i.e. any block driver), * bios are not submitted until after you return - see the code in * submit_bio_noacct() that converts recursion into iteration, to prevent * stack overflows. * * This would normally mean allocating multiple bios under submit_bio_noacct() * would be susceptible to deadlocks, but we have * deadlock avoidance code that resubmits any blocked bios from a rescuer * thread. * * However, we do not guarantee forward progress for allocations from other * mempools. Doing multiple allocations from the same mempool under * submit_bio_noacct() should be avoided - instead, use bio_set's front_pad * for per bio allocations. * * Returns: Pointer to new bio on success, NULL on failure. */ struct bio *bio_alloc_bioset(struct block_device *bdev, unsigned short nr_vecs, blk_opf_t opf, gfp_t gfp_mask, struct bio_set *bs) { gfp_t saved_gfp = gfp_mask; struct bio *bio; void *p; /* should not use nobvec bioset for nr_vecs > 0 */ if (WARN_ON_ONCE(!mempool_initialized(&bs->bvec_pool) && nr_vecs > 0)) return NULL; if (bs->cache && nr_vecs <= BIO_INLINE_VECS) { opf |= REQ_ALLOC_CACHE; bio = bio_alloc_percpu_cache(bdev, nr_vecs, opf, gfp_mask, bs); if (bio) return bio; /* * No cached bio available, bio returned below marked with * REQ_ALLOC_CACHE to participate in per-cpu alloc cache. */ } else opf &= ~REQ_ALLOC_CACHE; /* * submit_bio_noacct() converts recursion to iteration; this means if * we're running beneath it, any bios we allocate and submit will not be * submitted (and thus freed) until after we return. * * This exposes us to a potential deadlock if we allocate multiple bios * from the same bio_set() while running underneath submit_bio_noacct(). * If we were to allocate multiple bios (say a stacking block driver * that was splitting bios), we would deadlock if we exhausted the * mempool's reserve. * * We solve this, and guarantee forward progress, with a rescuer * workqueue per bio_set. If we go to allocate and there are bios on * current->bio_list, we first try the allocation without * __GFP_DIRECT_RECLAIM; if that fails, we punt those bios we would be * blocking to the rescuer workqueue before we retry with the original * gfp_flags. */ if (current->bio_list && (!bio_list_empty(¤t->bio_list[0]) || !bio_list_empty(¤t->bio_list[1])) && bs->rescue_workqueue) gfp_mask &= ~__GFP_DIRECT_RECLAIM; p = mempool_alloc(&bs->bio_pool, gfp_mask); if (!p && gfp_mask != saved_gfp) { punt_bios_to_rescuer(bs); gfp_mask = saved_gfp; p = mempool_alloc(&bs->bio_pool, gfp_mask); } if (unlikely(!p)) return NULL; if (!mempool_is_saturated(&bs->bio_pool)) opf &= ~REQ_ALLOC_CACHE; bio = p + bs->front_pad; if (nr_vecs > BIO_INLINE_VECS) { struct bio_vec *bvl = NULL; bvl = bvec_alloc(&bs->bvec_pool, &nr_vecs, gfp_mask); if (!bvl && gfp_mask != saved_gfp) { punt_bios_to_rescuer(bs); gfp_mask = saved_gfp; bvl = bvec_alloc(&bs->bvec_pool, &nr_vecs, gfp_mask); } if (unlikely(!bvl)) goto err_free; bio_init(bio, bdev, bvl, nr_vecs, opf); } else if (nr_vecs) { bio_init_inline(bio, bdev, BIO_INLINE_VECS, opf); } else { bio_init(bio, bdev, NULL, 0, opf); } bio->bi_pool = bs; return bio; err_free: mempool_free(p, &bs->bio_pool); return NULL; } EXPORT_SYMBOL(bio_alloc_bioset); /** * bio_kmalloc - kmalloc a bio * @nr_vecs: number of bio_vecs to allocate * @gfp_mask: the GFP_* mask given to the slab allocator * * Use kmalloc to allocate a bio (including bvecs). The bio must be initialized * using bio_init() before use. To free a bio returned from this function use * kfree() after calling bio_uninit(). A bio returned from this function can * be reused by calling bio_uninit() before calling bio_init() again. * * Note that unlike bio_alloc() or bio_alloc_bioset() allocations from this * function are not backed by a mempool can fail. Do not use this function * for allocations in the file system I/O path. * * Returns: Pointer to new bio on success, NULL on failure. */ struct bio *bio_kmalloc(unsigned short nr_vecs, gfp_t gfp_mask) { struct bio *bio; if (nr_vecs > BIO_MAX_INLINE_VECS) return NULL; return kmalloc(sizeof(*bio) + nr_vecs * sizeof(struct bio_vec), gfp_mask); } EXPORT_SYMBOL(bio_kmalloc); void zero_fill_bio_iter(struct bio *bio, struct bvec_iter start) { struct bio_vec bv; struct bvec_iter iter; __bio_for_each_segment(bv, bio, iter, start) memzero_bvec(&bv); } EXPORT_SYMBOL(zero_fill_bio_iter); /** * bio_truncate - truncate the bio to small size of @new_size * @bio: the bio to be truncated * @new_size: new size for truncating the bio * * Description: * Truncate the bio to new size of @new_size. If bio_op(bio) is * REQ_OP_READ, zero the truncated part. This function should only * be used for handling corner cases, such as bio eod. */ static void bio_truncate(struct bio *bio, unsigned new_size) { struct bio_vec bv; struct bvec_iter iter; unsigned int done = 0; bool truncated = false; if (new_size >= bio->bi_iter.bi_size) return; if (bio_op(bio) != REQ_OP_READ) goto exit; bio_for_each_segment(bv, bio, iter) { if (done + bv.bv_len > new_size) { size_t offset; if (!truncated) offset = new_size - done; else offset = 0; memzero_page(bv.bv_page, bv.bv_offset + offset, bv.bv_len - offset); truncated = true; } done += bv.bv_len; } exit: /* * Don't touch bvec table here and make it really immutable, since * fs bio user has to retrieve all pages via bio_for_each_segment_all * in its .end_bio() callback. * * It is enough to truncate bio by updating .bi_size since we can make * correct bvec with the updated .bi_size for drivers. */ bio->bi_iter.bi_size = new_size; } /** * guard_bio_eod - truncate a BIO to fit the block device * @bio: bio to truncate * * This allows us to do IO even on the odd last sectors of a device, even if the * block size is some multiple of the physical sector size. * * We'll just truncate the bio to the size of the device, and clear the end of * the buffer head manually. Truly out-of-range accesses will turn into actual * I/O errors, this only handles the "we need to be able to do I/O at the final * sector" case. */ void guard_bio_eod(struct bio *bio) { sector_t maxsector = bdev_nr_sectors(bio->bi_bdev); if (!maxsector) return; /* * If the *whole* IO is past the end of the device, * let it through, and the IO layer will turn it into * an EIO. */ if (unlikely(bio->bi_iter.bi_sector >= maxsector)) return; maxsector -= bio->bi_iter.bi_sector; if (likely((bio->bi_iter.bi_size >> 9) <= maxsector)) return; bio_truncate(bio, maxsector << 9); } static int __bio_alloc_cache_prune(struct bio_alloc_cache *cache, unsigned int nr) { unsigned int i = 0; struct bio *bio; while ((bio = cache->free_list) != NULL) { cache->free_list = bio->bi_next; cache->nr--; bio_free(bio); if (++i == nr) break; } return i; } static void bio_alloc_cache_prune(struct bio_alloc_cache *cache, unsigned int nr) { nr -= __bio_alloc_cache_prune(cache, nr); if (!READ_ONCE(cache->free_list)) { bio_alloc_irq_cache_splice(cache); __bio_alloc_cache_prune(cache, nr); } } static int bio_cpu_dead(unsigned int cpu, struct hlist_node *node) { struct bio_set *bs; bs = hlist_entry_safe(node, struct bio_set, cpuhp_dead); if (bs->cache) { struct bio_alloc_cache *cache = per_cpu_ptr(bs->cache, cpu); bio_alloc_cache_prune(cache, -1U); } return 0; } static void bio_alloc_cache_destroy(struct bio_set *bs) { int cpu; if (!bs->cache) return; cpuhp_state_remove_instance_nocalls(CPUHP_BIO_DEAD, &bs->cpuhp_dead); for_each_possible_cpu(cpu) { struct bio_alloc_cache *cache; cache = per_cpu_ptr(bs->cache, cpu); bio_alloc_cache_prune(cache, -1U); } free_percpu(bs->cache); bs->cache = NULL; } static inline void bio_put_percpu_cache(struct bio *bio) { struct bio_alloc_cache *cache; cache = per_cpu_ptr(bio->bi_pool->cache, get_cpu()); if (READ_ONCE(cache->nr_irq) + cache->nr > ALLOC_CACHE_MAX) goto out_free; if (in_task()) { bio_uninit(bio); bio->bi_next = cache->free_list; /* Not necessary but helps not to iopoll already freed bios */ bio->bi_bdev = NULL; cache->free_list = bio; cache->nr++; } else if (in_hardirq()) { lockdep_assert_irqs_disabled(); bio_uninit(bio); bio->bi_next = cache->free_list_irq; cache->free_list_irq = bio; cache->nr_irq++; } else { goto out_free; } put_cpu(); return; out_free: put_cpu(); bio_free(bio); } /** * bio_put - release a reference to a bio * @bio: bio to release reference to * * Description: * Put a reference to a &struct bio, either one you have gotten with * bio_alloc, bio_get or bio_clone_*. The last put of a bio will free it. **/ void bio_put(struct bio *bio) { if (unlikely(bio_flagged(bio, BIO_REFFED))) { BUG_ON(!atomic_read(&bio->__bi_cnt)); if (!atomic_dec_and_test(&bio->__bi_cnt)) return; } if (bio->bi_opf & REQ_ALLOC_CACHE) bio_put_percpu_cache(bio); else bio_free(bio); } EXPORT_SYMBOL(bio_put); static int __bio_clone(struct bio *bio, struct bio *bio_src, gfp_t gfp) { bio_set_flag(bio, BIO_CLONED); bio->bi_ioprio = bio_src->bi_ioprio; bio->bi_write_hint = bio_src->bi_write_hint; bio->bi_write_stream = bio_src->bi_write_stream; bio->bi_iter = bio_src->bi_iter; if (bio->bi_bdev) { if (bio->bi_bdev == bio_src->bi_bdev && bio_flagged(bio_src, BIO_REMAPPED)) bio_set_flag(bio, BIO_REMAPPED); bio_clone_blkg_association(bio, bio_src); } if (bio_crypt_clone(bio, bio_src, gfp) < 0) return -ENOMEM; if (bio_integrity(bio_src) && bio_integrity_clone(bio, bio_src, gfp) < 0) return -ENOMEM; return 0; } /** * bio_alloc_clone - clone a bio that shares the original bio's biovec * @bdev: block_device to clone onto * @bio_src: bio to clone from * @gfp: allocation priority * @bs: bio_set to allocate from * * Allocate a new bio that is a clone of @bio_src. The caller owns the returned * bio, but not the actual data it points to. * * The caller must ensure that the return bio is not freed before @bio_src. */ struct bio *bio_alloc_clone(struct block_device *bdev, struct bio *bio_src, gfp_t gfp, struct bio_set *bs) { struct bio *bio; bio = bio_alloc_bioset(bdev, 0, bio_src->bi_opf, gfp, bs); if (!bio) return NULL; if (__bio_clone(bio, bio_src, gfp) < 0) { bio_put(bio); return NULL; } bio->bi_io_vec = bio_src->bi_io_vec; return bio; } EXPORT_SYMBOL(bio_alloc_clone); /** * bio_init_clone - clone a bio that shares the original bio's biovec * @bdev: block_device to clone onto * @bio: bio to clone into * @bio_src: bio to clone from * @gfp: allocation priority * * Initialize a new bio in caller provided memory that is a clone of @bio_src. * The caller owns the returned bio, but not the actual data it points to. * * The caller must ensure that @bio_src is not freed before @bio. */ int bio_init_clone(struct block_device *bdev, struct bio *bio, struct bio *bio_src, gfp_t gfp) { int ret; bio_init(bio, bdev, bio_src->bi_io_vec, 0, bio_src->bi_opf); ret = __bio_clone(bio, bio_src, gfp); if (ret) bio_uninit(bio); return ret; } EXPORT_SYMBOL(bio_init_clone); /** * bio_full - check if the bio is full * @bio: bio to check * @len: length of one segment to be added * * Return true if @bio is full and one segment with @len bytes can't be * added to the bio, otherwise return false */ static inline bool bio_full(struct bio *bio, unsigned len) { if (bio->bi_vcnt >= bio->bi_max_vecs) return true; if (bio->bi_iter.bi_size > BIO_MAX_SIZE - len) return true; return false; } static bool bvec_try_merge_page(struct bio_vec *bv, struct page *page, unsigned int len, unsigned int off) { size_t bv_end = bv->bv_offset + bv->bv_len; phys_addr_t vec_end_addr = page_to_phys(bv->bv_page) + bv_end - 1; phys_addr_t page_addr = page_to_phys(page); if (vec_end_addr + 1 != page_addr + off) return false; if (xen_domain() && !xen_biovec_phys_mergeable(bv, page)) return false; if ((vec_end_addr & PAGE_MASK) != ((page_addr + off) & PAGE_MASK)) { if (IS_ENABLED(CONFIG_KMSAN)) return false; if (bv->bv_page + bv_end / PAGE_SIZE != page + off / PAGE_SIZE) return false; } bv->bv_len += len; return true; } /* * Try to merge a page into a segment, while obeying the hardware segment * size limit. * * This is kept around for the integrity metadata, which is still tries * to build the initial bio to the hardware limit and doesn't have proper * helpers to split. Hopefully this will go away soon. */ bool bvec_try_merge_hw_page(struct request_queue *q, struct bio_vec *bv, struct page *page, unsigned len, unsigned offset) { unsigned long mask = queue_segment_boundary(q); phys_addr_t addr1 = bvec_phys(bv); phys_addr_t addr2 = page_to_phys(page) + offset + len - 1; if ((addr1 | mask) != (addr2 | mask)) return false; if (len > queue_max_segment_size(q) - bv->bv_len) return false; return bvec_try_merge_page(bv, page, len, offset); } /** * __bio_add_page - add page(s) to a bio in a new segment * @bio: destination bio * @page: start page to add * @len: length of the data to add, may cross pages * @off: offset of the data relative to @page, may cross pages * * Add the data at @page + @off to @bio as a new bvec. The caller must ensure * that @bio has space for another bvec. */ void __bio_add_page(struct bio *bio, struct page *page, unsigned int len, unsigned int off) { WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED)); WARN_ON_ONCE(bio_full(bio, len)); if (is_pci_p2pdma_page(page)) bio->bi_opf |= REQ_NOMERGE; bvec_set_page(&bio->bi_io_vec[bio->bi_vcnt], page, len, off); bio->bi_iter.bi_size += len; bio->bi_vcnt++; } EXPORT_SYMBOL_GPL(__bio_add_page); /** * bio_add_virt_nofail - add data in the direct kernel mapping to a bio * @bio: destination bio * @vaddr: data to add * @len: length of the data to add, may cross pages * * Add the data at @vaddr to @bio. The caller must have ensure a segment * is available for the added data. No merging into an existing segment * will be performed. */ void bio_add_virt_nofail(struct bio *bio, void *vaddr, unsigned len) { __bio_add_page(bio, virt_to_page(vaddr), len, offset_in_page(vaddr)); } EXPORT_SYMBOL_GPL(bio_add_virt_nofail); /** * bio_add_page - attempt to add page(s) to bio * @bio: destination bio * @page: start page to add * @len: vec entry length, may cross pages * @offset: vec entry offset relative to @page, may cross pages * * Attempt to add page(s) to the bio_vec maplist. This will only fail * if either bio->bi_vcnt == bio->bi_max_vecs or it's a cloned bio. */ int bio_add_page(struct bio *bio, struct page *page, unsigned int len, unsigned int offset) { if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED))) return 0; if (bio->bi_iter.bi_size > BIO_MAX_SIZE - len) return 0; if (bio->bi_vcnt > 0) { struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt - 1]; if (!zone_device_pages_have_same_pgmap(bv->bv_page, page)) return 0; if (bvec_try_merge_page(bv, page, len, offset)) { bio->bi_iter.bi_size += len; return len; } } if (bio->bi_vcnt >= bio->bi_max_vecs) return 0; __bio_add_page(bio, page, len, offset); return len; } EXPORT_SYMBOL(bio_add_page); void bio_add_folio_nofail(struct bio *bio, struct folio *folio, size_t len, size_t off) { unsigned long nr = off / PAGE_SIZE; WARN_ON_ONCE(len > BIO_MAX_SIZE); __bio_add_page(bio, folio_page(folio, nr), len, off % PAGE_SIZE); } EXPORT_SYMBOL_GPL(bio_add_folio_nofail); /** * bio_add_folio - Attempt to add part of a folio to a bio. * @bio: BIO to add to. * @folio: Folio to add. * @len: How many bytes from the folio to add. * @off: First byte in this folio to add. * * Filesystems that use folios can call this function instead of calling * bio_add_page() for each page in the folio. If @off is bigger than * PAGE_SIZE, this function can create a bio_vec that starts in a page * after the bv_page. BIOs do not support folios that are 4GiB or larger. * * Return: Whether the addition was successful. */ bool bio_add_folio(struct bio *bio, struct folio *folio, size_t len, size_t off) { unsigned long nr = off / PAGE_SIZE; if (len > BIO_MAX_SIZE) return false; return bio_add_page(bio, folio_page(folio, nr), len, off % PAGE_SIZE) > 0; } EXPORT_SYMBOL(bio_add_folio); /** * bio_add_vmalloc_chunk - add a vmalloc chunk to a bio * @bio: destination bio * @vaddr: vmalloc address to add * @len: total length in bytes of the data to add * * Add data starting at @vaddr to @bio and return how many bytes were added. * This may be less than the amount originally asked. Returns 0 if no data * could be added to @bio. * * This helper calls flush_kernel_vmap_range() for the range added. For reads * the caller still needs to manually call invalidate_kernel_vmap_range() in * the completion handler. */ unsigned int bio_add_vmalloc_chunk(struct bio *bio, void *vaddr, unsigned len) { unsigned int offset = offset_in_page(vaddr); len = min(len, PAGE_SIZE - offset); if (bio_add_page(bio, vmalloc_to_page(vaddr), len, offset) < len) return 0; if (op_is_write(bio_op(bio))) flush_kernel_vmap_range(vaddr, len); return len; } EXPORT_SYMBOL_GPL(bio_add_vmalloc_chunk); /** * bio_add_vmalloc - add a vmalloc region to a bio * @bio: destination bio * @vaddr: vmalloc address to add * @len: total length in bytes of the data to add * * Add data starting at @vaddr to @bio. Return %true on success or %false if * @bio does not have enough space for the payload. * * This helper calls flush_kernel_vmap_range() for the range added. For reads * the caller still needs to manually call invalidate_kernel_vmap_range() in * the completion handler. */ bool bio_add_vmalloc(struct bio *bio, void *vaddr, unsigned int len) { do { unsigned int added = bio_add_vmalloc_chunk(bio, vaddr, len); if (!added) return false; vaddr += added; len -= added; } while (len); return true; } EXPORT_SYMBOL_GPL(bio_add_vmalloc); void __bio_release_pages(struct bio *bio, bool mark_dirty) { struct folio_iter fi; bio_for_each_folio_all(fi, bio) { size_t nr_pages; if (mark_dirty) { folio_lock(fi.folio); folio_mark_dirty(fi.folio); folio_unlock(fi.folio); } nr_pages = (fi.offset + fi.length - 1) / PAGE_SIZE - fi.offset / PAGE_SIZE + 1; unpin_user_folio(fi.folio, nr_pages); } } EXPORT_SYMBOL_GPL(__bio_release_pages); void bio_iov_bvec_set(struct bio *bio, const struct iov_iter *iter) { WARN_ON_ONCE(bio->bi_max_vecs); bio->bi_io_vec = (struct bio_vec *)iter->bvec; bio->bi_iter.bi_idx = 0; bio->bi_iter.bi_bvec_done = iter->iov_offset; bio->bi_iter.bi_size = iov_iter_count(iter); bio_set_flag(bio, BIO_CLONED); } /* * Aligns the bio size to the len_align_mask, releasing excessive bio vecs that * __bio_iov_iter_get_pages may have inserted, and reverts the trimmed length * for the next iteration. */ static int bio_iov_iter_align_down(struct bio *bio, struct iov_iter *iter, unsigned len_align_mask) { size_t nbytes = bio->bi_iter.bi_size & len_align_mask; if (!nbytes) return 0; iov_iter_revert(iter, nbytes); bio->bi_iter.bi_size -= nbytes; do { struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt - 1]; if (nbytes < bv->bv_len) { bv->bv_len -= nbytes; break; } if (bio_flagged(bio, BIO_PAGE_PINNED)) unpin_user_page(bv->bv_page); bio->bi_vcnt--; nbytes -= bv->bv_len; } while (nbytes); if (!bio->bi_vcnt) return -EFAULT; return 0; } /** * bio_iov_iter_get_pages - add user or kernel pages to a bio * @bio: bio to add pages to * @iter: iov iterator describing the region to be added * @len_align_mask: the mask to align the total size to, 0 for any length * * This takes either an iterator pointing to user memory, or one pointing to * kernel pages (BVEC iterator). If we're adding user pages, we pin them and * map them into the kernel. On IO completion, the caller should put those * pages. For bvec based iterators bio_iov_iter_get_pages() uses the provided * bvecs rather than copying them. Hence anyone issuing kiocb based IO needs * to ensure the bvecs and pages stay referenced until the submitted I/O is * completed by a call to ->ki_complete() or returns with an error other than * -EIOCBQUEUED. The caller needs to check if the bio is flagged BIO_NO_PAGE_REF * on IO completion. If it isn't, then pages should be released. * * The function tries, but does not guarantee, to pin as many pages as * fit into the bio, or are requested in @iter, whatever is smaller. If * MM encounters an error pinning the requested pages, it stops. Error * is returned only if 0 pages could be pinned. */ int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter, unsigned len_align_mask) { iov_iter_extraction_t flags = 0; if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED))) return -EIO; if (iov_iter_is_bvec(iter)) { bio_iov_bvec_set(bio, iter); iov_iter_advance(iter, bio->bi_iter.bi_size); return 0; } if (iov_iter_extract_will_pin(iter)) bio_set_flag(bio, BIO_PAGE_PINNED); if (bio->bi_bdev && blk_queue_pci_p2pdma(bio->bi_bdev->bd_disk->queue)) flags |= ITER_ALLOW_P2PDMA; do { ssize_t ret; ret = iov_iter_extract_bvecs(iter, bio->bi_io_vec, BIO_MAX_SIZE - bio->bi_iter.bi_size, &bio->bi_vcnt, bio->bi_max_vecs, flags); if (ret <= 0) { if (!bio->bi_vcnt) return ret; break; } bio->bi_iter.bi_size += ret; } while (iov_iter_count(iter) && !bio_full(bio, 0)); if (is_pci_p2pdma_page(bio->bi_io_vec->bv_page)) bio->bi_opf |= REQ_NOMERGE; return bio_iov_iter_align_down(bio, iter, len_align_mask); } static struct folio *folio_alloc_greedy(gfp_t gfp, size_t *size) { struct folio *folio; while (*size > PAGE_SIZE) { folio = folio_alloc(gfp | __GFP_NORETRY, get_order(*size)); if (folio) return folio; *size = rounddown_pow_of_two(*size - 1); } return folio_alloc(gfp, get_order(*size)); } static void bio_free_folios(struct bio *bio) { struct bio_vec *bv; int i; bio_for_each_bvec_all(bv, bio, i) { struct folio *folio = page_folio(bv->bv_page); if (!is_zero_folio(folio)) folio_put(folio); } } static int bio_iov_iter_bounce_write(struct bio *bio, struct iov_iter *iter) { size_t total_len = iov_iter_count(iter); if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED))) return -EINVAL; if (WARN_ON_ONCE(bio->bi_iter.bi_size)) return -EINVAL; if (WARN_ON_ONCE(bio->bi_vcnt >= bio->bi_max_vecs)) return -EINVAL; do { size_t this_len = min(total_len, SZ_1M); struct folio *folio; if (this_len > PAGE_SIZE * 2) this_len = rounddown_pow_of_two(this_len); if (bio->bi_iter.bi_size > BIO_MAX_SIZE - this_len) break; folio = folio_alloc_greedy(GFP_KERNEL, &this_len); if (!folio) break; bio_add_folio_nofail(bio, folio, this_len, 0); if (copy_from_iter(folio_address(folio), this_len, iter) != this_len) { bio_free_folios(bio); return -EFAULT; } total_len -= this_len; } while (total_len && bio->bi_vcnt < bio->bi_max_vecs); if (!bio->bi_iter.bi_size) return -ENOMEM; return 0; } static int bio_iov_iter_bounce_read(struct bio *bio, struct iov_iter *iter) { size_t len = min(iov_iter_count(iter), SZ_1M); struct folio *folio; folio = folio_alloc_greedy(GFP_KERNEL, &len); if (!folio) return -ENOMEM; do { ssize_t ret; ret = iov_iter_extract_bvecs(iter, bio->bi_io_vec + 1, len, &bio->bi_vcnt, bio->bi_max_vecs - 1, 0); if (ret <= 0) { if (!bio->bi_vcnt) { folio_put(folio); return ret; } break; } len -= ret; bio->bi_iter.bi_size += ret; } while (len && bio->bi_vcnt < bio->bi_max_vecs - 1); /* * Set the folio directly here. The above loop has already calculated * the correct bi_size, and we use bi_vcnt for the user buffers. That * is safe as bi_vcnt is only used by the submitter and not the actual * I/O path. */ bvec_set_folio(&bio->bi_io_vec[0], folio, bio->bi_iter.bi_size, 0); if (iov_iter_extract_will_pin(iter)) bio_set_flag(bio, BIO_PAGE_PINNED); return 0; } /** * bio_iov_iter_bounce - bounce buffer data from an iter into a bio * @bio: bio to send * @iter: iter to read from / write into * * Helper for direct I/O implementations that need to bounce buffer because * we need to checksum the data or perform other operations that require * consistency. Allocates folios to back the bounce buffer, and for writes * copies the data into it. Needs to be paired with bio_iov_iter_unbounce() * called on completion. */ int bio_iov_iter_bounce(struct bio *bio, struct iov_iter *iter) { if (op_is_write(bio_op(bio))) return bio_iov_iter_bounce_write(bio, iter); return bio_iov_iter_bounce_read(bio, iter); } static void bvec_unpin(struct bio_vec *bv, bool mark_dirty) { struct folio *folio = page_folio(bv->bv_page); size_t nr_pages = (bv->bv_offset + bv->bv_len - 1) / PAGE_SIZE - bv->bv_offset / PAGE_SIZE + 1; if (mark_dirty) folio_mark_dirty_lock(folio); unpin_user_folio(folio, nr_pages); } static void bio_iov_iter_unbounce_read(struct bio *bio, bool is_error, bool mark_dirty) { unsigned int len = bio->bi_io_vec[0].bv_len; if (likely(!is_error)) { void *buf = bvec_virt(&bio->bi_io_vec[0]); struct iov_iter to; iov_iter_bvec(&to, ITER_DEST, bio->bi_io_vec + 1, bio->bi_vcnt, len); /* copying to pinned pages should always work */ WARN_ON_ONCE(copy_to_iter(buf, len, &to) != len); } else { /* No need to mark folios dirty if never copied to them */ mark_dirty = false; } if (bio_flagged(bio, BIO_PAGE_PINNED)) { int i; for (i = 0; i < bio->bi_vcnt; i++) bvec_unpin(&bio->bi_io_vec[1 + i], mark_dirty); } folio_put(page_folio(bio->bi_io_vec[0].bv_page)); } /** * bio_iov_iter_unbounce - finish a bounce buffer operation * @bio: completed bio * @is_error: %true if an I/O error occurred and data should not be copied * @mark_dirty: If %true, folios will be marked dirty. * * Helper for direct I/O implementations that need to bounce buffer because * we need to checksum the data or perform other operations that require * consistency. Called to complete a bio set up by bio_iov_iter_bounce(). * Copies data back for reads, and marks the original folios dirty if * requested and then frees the bounce buffer. */ void bio_iov_iter_unbounce(struct bio *bio, bool is_error, bool mark_dirty) { if (op_is_write(bio_op(bio))) bio_free_folios(bio); else bio_iov_iter_unbounce_read(bio, is_error, mark_dirty); } static void submit_bio_wait_endio(struct bio *bio) { complete(bio->bi_private); } /** * submit_bio_wait - submit a bio, and wait until it completes * @bio: The &struct bio which describes the I/O * * Simple wrapper around submit_bio(). Returns 0 on success, or the error from * bio_endio() on failure. * * WARNING: Unlike to how submit_bio() is usually used, this function does not * result in bio reference to be consumed. The caller must drop the reference * on his own. */ int submit_bio_wait(struct bio *bio) { DECLARE_COMPLETION_ONSTACK_MAP(done, bio->bi_bdev->bd_disk->lockdep_map); bio->bi_private = &done; bio->bi_end_io = submit_bio_wait_endio; bio->bi_opf |= REQ_SYNC; submit_bio(bio); blk_wait_io(&done); return blk_status_to_errno(bio->bi_status); } EXPORT_SYMBOL(submit_bio_wait); /** * bdev_rw_virt - synchronously read into / write from kernel mapping * @bdev: block device to access * @sector: sector to access * @data: data to read/write * @len: length in byte to read/write * @op: operation (e.g. REQ_OP_READ/REQ_OP_WRITE) * * Performs synchronous I/O to @bdev for @data/@len. @data must be in * the kernel direct mapping and not a vmalloc address. */ int bdev_rw_virt(struct block_device *bdev, sector_t sector, void *data, size_t len, enum req_op op) { struct bio_vec bv; struct bio bio; int error; if (WARN_ON_ONCE(is_vmalloc_addr(data))) return -EIO; bio_init(&bio, bdev, &bv, 1, op); bio.bi_iter.bi_sector = sector; bio_add_virt_nofail(&bio, data, len); error = submit_bio_wait(&bio); bio_uninit(&bio); return error; } EXPORT_SYMBOL_GPL(bdev_rw_virt); static void bio_wait_end_io(struct bio *bio) { complete(bio->bi_private); bio_put(bio); } /* * bio_await_chain - ends @bio and waits for every chained bio to complete */ void bio_await_chain(struct bio *bio) { DECLARE_COMPLETION_ONSTACK_MAP(done, bio->bi_bdev->bd_disk->lockdep_map); bio->bi_private = &done; bio->bi_end_io = bio_wait_end_io; bio_endio(bio); blk_wait_io(&done); } void __bio_advance(struct bio *bio, unsigned bytes) { if (bio_integrity(bio)) bio_integrity_advance(bio, bytes); bio_crypt_advance(bio, bytes); bio_advance_iter(bio, &bio->bi_iter, bytes); } EXPORT_SYMBOL(__bio_advance); void bio_copy_data_iter(struct bio *dst, struct bvec_iter *dst_iter, struct bio *src, struct bvec_iter *src_iter) { while (src_iter->bi_size && dst_iter->bi_size) { struct bio_vec src_bv = bio_iter_iovec(src, *src_iter); struct bio_vec dst_bv = bio_iter_iovec(dst, *dst_iter); unsigned int bytes = min(src_bv.bv_len, dst_bv.bv_len); void *src_buf = bvec_kmap_local(&src_bv); void *dst_buf = bvec_kmap_local(&dst_bv); memcpy(dst_buf, src_buf, bytes); kunmap_local(dst_buf); kunmap_local(src_buf); bio_advance_iter_single(src, src_iter, bytes); bio_advance_iter_single(dst, dst_iter, bytes); } } EXPORT_SYMBOL(bio_copy_data_iter); /** * bio_copy_data - copy contents of data buffers from one bio to another * @src: source bio * @dst: destination bio * * Stops when it reaches the end of either @src or @dst - that is, copies * min(src->bi_size, dst->bi_size) bytes (or the equivalent for lists of bios). */ void bio_copy_data(struct bio *dst, struct bio *src) { struct bvec_iter src_iter = src->bi_iter; struct bvec_iter dst_iter = dst->bi_iter; bio_copy_data_iter(dst, &dst_iter, src, &src_iter); } EXPORT_SYMBOL(bio_copy_data); void bio_free_pages(struct bio *bio) { struct bio_vec *bvec; struct bvec_iter_all iter_all; bio_for_each_segment_all(bvec, bio, iter_all) __free_page(bvec->bv_page); } EXPORT_SYMBOL(bio_free_pages); /* * bio_set_pages_dirty() and bio_check_pages_dirty() are support functions * for performing direct-IO in BIOs. * * The problem is that we cannot run folio_mark_dirty() from interrupt context * because the required locks are not interrupt-safe. So what we can do is to * mark the pages dirty _before_ performing IO. And in interrupt context, * check that the pages are still dirty. If so, fine. If not, redirty them * in process context. * * Note that this code is very hard to test under normal circumstances because * direct-io pins the pages with get_user_pages(). This makes * is_page_cache_freeable return false, and the VM will not clean the pages. * But other code (eg, flusher threads) could clean the pages if they are mapped * pagecache. * * Simply disabling the call to bio_set_pages_dirty() is a good way to test the * deferred bio dirtying paths. */ /* * bio_set_pages_dirty() will mark all the bio's pages as dirty. */ void bio_set_pages_dirty(struct bio *bio) { struct folio_iter fi; bio_for_each_folio_all(fi, bio) { folio_lock(fi.folio); folio_mark_dirty(fi.folio); folio_unlock(fi.folio); } } EXPORT_SYMBOL_GPL(bio_set_pages_dirty); /* * bio_check_pages_dirty() will check that all the BIO's pages are still dirty. * If they are, then fine. If, however, some pages are clean then they must * have been written out during the direct-IO read. So we take another ref on * the BIO and re-dirty the pages in process context. * * It is expected that bio_check_pages_dirty() will wholly own the BIO from * here on. It will unpin each page and will run one bio_put() against the * BIO. */ static void bio_dirty_fn(struct work_struct *work); static DECLARE_WORK(bio_dirty_work, bio_dirty_fn); static DEFINE_SPINLOCK(bio_dirty_lock); static struct bio *bio_dirty_list; /* * This runs in process context */ static void bio_dirty_fn(struct work_struct *work) { struct bio *bio, *next; spin_lock_irq(&bio_dirty_lock); next = bio_dirty_list; bio_dirty_list = NULL; spin_unlock_irq(&bio_dirty_lock); while ((bio = next) != NULL) { next = bio->bi_private; bio_release_pages(bio, true); bio_put(bio); } } void bio_check_pages_dirty(struct bio *bio) { struct folio_iter fi; unsigned long flags; bio_for_each_folio_all(fi, bio) { if (!folio_test_dirty(fi.folio)) goto defer; } bio_release_pages(bio, false); bio_put(bio); return; defer: spin_lock_irqsave(&bio_dirty_lock, flags); bio->bi_private = bio_dirty_list; bio_dirty_list = bio; spin_unlock_irqrestore(&bio_dirty_lock, flags); schedule_work(&bio_dirty_work); } EXPORT_SYMBOL_GPL(bio_check_pages_dirty); static inline bool bio_remaining_done(struct bio *bio) { /* * If we're not chaining, then ->__bi_remaining is always 1 and * we always end io on the first invocation. */ if (!bio_flagged(bio, BIO_CHAIN)) return true; BUG_ON(atomic_read(&bio->__bi_remaining) <= 0); if (atomic_dec_and_test(&bio->__bi_remaining)) { bio_clear_flag(bio, BIO_CHAIN); return true; } return false; } /** * bio_endio - end I/O on a bio * @bio: bio * * Description: * bio_endio() will end I/O on the whole bio. bio_endio() is the preferred * way to end I/O on a bio. No one should call bi_end_io() directly on a * bio unless they own it and thus know that it has an end_io function. * * bio_endio() can be called several times on a bio that has been chained * using bio_chain(). The ->bi_end_io() function will only be called the * last time. **/ void bio_endio(struct bio *bio) { again: if (!bio_remaining_done(bio)) return; if (!bio_integrity_endio(bio)) return; blk_zone_bio_endio(bio); rq_qos_done_bio(bio); if (bio->bi_bdev && bio_flagged(bio, BIO_TRACE_COMPLETION)) { trace_block_bio_complete(bdev_get_queue(bio->bi_bdev), bio); bio_clear_flag(bio, BIO_TRACE_COMPLETION); } /* * Need to have a real endio function for chained bios, otherwise * various corner cases will break (like stacking block devices that * save/restore bi_end_io) - however, we want to avoid unbounded * recursion and blowing the stack. Tail call optimization would * handle this, but compiling with frame pointers also disables * gcc's sibling call optimization. */ if (bio->bi_end_io == bio_chain_endio) { bio = __bio_chain_endio(bio); goto again; } #ifdef CONFIG_BLK_CGROUP /* * Release cgroup info. We shouldn't have to do this here, but quite * a few callers of bio_init fail to call bio_uninit, so we cover up * for that here at least for now. */ if (bio->bi_blkg) { blkg_put(bio->bi_blkg); bio->bi_blkg = NULL; } #endif if (bio->bi_end_io) bio->bi_end_io(bio); } EXPORT_SYMBOL(bio_endio); /** * bio_split - split a bio * @bio: bio to split * @sectors: number of sectors to split from the front of @bio * @gfp: gfp mask * @bs: bio set to allocate from * * Allocates and returns a new bio which represents @sectors from the start of * @bio, and updates @bio to represent the remaining sectors. * * Unless this is a discard request the newly allocated bio will point * to @bio's bi_io_vec. It is the caller's responsibility to ensure that * neither @bio nor @bs are freed before the split bio. */ struct bio *bio_split(struct bio *bio, int sectors, gfp_t gfp, struct bio_set *bs) { struct bio *split; if (WARN_ON_ONCE(sectors <= 0)) return ERR_PTR(-EINVAL); if (WARN_ON_ONCE(sectors >= bio_sectors(bio))) return ERR_PTR(-EINVAL); /* Zone append commands cannot be split */ if (WARN_ON_ONCE(bio_op(bio) == REQ_OP_ZONE_APPEND)) return ERR_PTR(-EINVAL); /* atomic writes cannot be split */ if (bio->bi_opf & REQ_ATOMIC) return ERR_PTR(-EINVAL); split = bio_alloc_clone(bio->bi_bdev, bio, gfp, bs); if (!split) return ERR_PTR(-ENOMEM); split->bi_iter.bi_size = sectors << 9; if (bio_integrity(split)) bio_integrity_trim(split); bio_advance(bio, split->bi_iter.bi_size); if (bio_flagged(bio, BIO_TRACE_COMPLETION)) bio_set_flag(split, BIO_TRACE_COMPLETION); return split; } EXPORT_SYMBOL(bio_split); /** * bio_trim - trim a bio * @bio: bio to trim * @offset: number of sectors to trim from the front of @bio * @size: size we want to trim @bio to, in sectors * * This function is typically used for bios that are cloned and submitted * to the underlying device in parts. */ void bio_trim(struct bio *bio, sector_t offset, sector_t size) { /* We should never trim an atomic write */ if (WARN_ON_ONCE(bio->bi_opf & REQ_ATOMIC && size)) return; if (WARN_ON_ONCE(offset > BIO_MAX_SECTORS || size > BIO_MAX_SECTORS || offset + size > bio_sectors(bio))) return; size <<= 9; if (offset == 0 && size == bio->bi_iter.bi_size) return; bio_advance(bio, offset << 9); bio->bi_iter.bi_size = size; if (bio_integrity(bio)) bio_integrity_trim(bio); } EXPORT_SYMBOL_GPL(bio_trim); /* * create memory pools for biovec's in a bio_set. * use the global biovec slabs created for general use. */ int biovec_init_pool(mempool_t *pool, int pool_entries) { struct biovec_slab *bp = bvec_slabs + ARRAY_SIZE(bvec_slabs) - 1; return mempool_init_slab_pool(pool, pool_entries, bp->slab); } /* * bioset_exit - exit a bioset initialized with bioset_init() * * May be called on a zeroed but uninitialized bioset (i.e. allocated with * kzalloc()). */ void bioset_exit(struct bio_set *bs) { bio_alloc_cache_destroy(bs); if (bs->rescue_workqueue) destroy_workqueue(bs->rescue_workqueue); bs->rescue_workqueue = NULL; mempool_exit(&bs->bio_pool); mempool_exit(&bs->bvec_pool); if (bs->bio_slab) bio_put_slab(bs); bs->bio_slab = NULL; } EXPORT_SYMBOL(bioset_exit); /** * bioset_init - Initialize a bio_set * @bs: pool to initialize * @pool_size: Number of bio and bio_vecs to cache in the mempool * @front_pad: Number of bytes to allocate in front of the returned bio * @flags: Flags to modify behavior, currently %BIOSET_NEED_BVECS * and %BIOSET_NEED_RESCUER * * Description: * Set up a bio_set to be used with @bio_alloc_bioset. Allows the caller * to ask for a number of bytes to be allocated in front of the bio. * Front pad allocation is useful for embedding the bio inside * another structure, to avoid allocating extra data to go with the bio. * Note that the bio must be embedded at the END of that structure always, * or things will break badly. * If %BIOSET_NEED_BVECS is set in @flags, a separate pool will be allocated * for allocating iovecs. This pool is not needed e.g. for bio_init_clone(). * If %BIOSET_NEED_RESCUER is set, a workqueue is created which can be used * to dispatch queued requests when the mempool runs out of space. * */ int bioset_init(struct bio_set *bs, unsigned int pool_size, unsigned int front_pad, int flags) { bs->front_pad = front_pad; if (flags & BIOSET_NEED_BVECS) bs->back_pad = BIO_INLINE_VECS * sizeof(struct bio_vec); else bs->back_pad = 0; spin_lock_init(&bs->rescue_lock); bio_list_init(&bs->rescue_list); INIT_WORK(&bs->rescue_work, bio_alloc_rescue); bs->bio_slab = bio_find_or_create_slab(bs); if (!bs->bio_slab) return -ENOMEM; if (mempool_init_slab_pool(&bs->bio_pool, pool_size, bs->bio_slab)) goto bad; if ((flags & BIOSET_NEED_BVECS) && biovec_init_pool(&bs->bvec_pool, pool_size)) goto bad; if (flags & BIOSET_NEED_RESCUER) { bs->rescue_workqueue = alloc_workqueue("bioset", WQ_MEM_RECLAIM, 0); if (!bs->rescue_workqueue) goto bad; } if (flags & BIOSET_PERCPU_CACHE) { bs->cache = alloc_percpu(struct bio_alloc_cache); if (!bs->cache) goto bad; cpuhp_state_add_instance_nocalls(CPUHP_BIO_DEAD, &bs->cpuhp_dead); } return 0; bad: bioset_exit(bs); return -ENOMEM; } EXPORT_SYMBOL(bioset_init); static int __init init_bio(void) { int i; BUILD_BUG_ON(BIO_FLAG_LAST > 8 * sizeof_field(struct bio, bi_flags)); for (i = 0; i < ARRAY_SIZE(bvec_slabs); i++) { struct biovec_slab *bvs = bvec_slabs + i; bvs->slab = kmem_cache_create(bvs->name, bvs->nr_vecs * sizeof(struct bio_vec), 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL); } cpuhp_setup_state_multi(CPUHP_BIO_DEAD, "block/bio:dead", NULL, bio_cpu_dead); if (bioset_init(&fs_bio_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS | BIOSET_PERCPU_CACHE)) panic("bio: can't allocate bios\n"); return 0; } subsys_initcall(init_bio); |
| 4 4 4 4 4 4 4 4 4 4 4 30 8 9 22 10 1 10 5 5 2 2 4 3 4 2 4 2 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 | // SPDX-License-Identifier: GPL-2.0+ /* * Enhanced Host Controller Interface (EHCI) driver for USB. * * Maintainer: Alan Stern <stern@rowland.harvard.edu> * * Copyright (c) 2000-2004 by David Brownell */ #include <linux/module.h> #include <linux/pci.h> #include <linux/dmapool.h> #include <linux/kernel.h> #include <linux/delay.h> #include <linux/ioport.h> #include <linux/sched.h> #include <linux/vmalloc.h> #include <linux/errno.h> #include <linux/init.h> #include <linux/hrtimer.h> #include <linux/list.h> #include <linux/interrupt.h> #include <linux/usb.h> #include <linux/usb/hcd.h> #include <linux/usb/otg.h> #include <linux/moduleparam.h> #include <linux/dma-mapping.h> #include <linux/debugfs.h> #include <linux/platform_device.h> #include <linux/slab.h> #include <asm/byteorder.h> #include <asm/io.h> #include <asm/irq.h> #include <linux/unaligned.h> #if defined(CONFIG_PPC_PS3) #include <asm/firmware.h> #endif /*-------------------------------------------------------------------------*/ /* * EHCI hc_driver implementation ... experimental, incomplete. * Based on the final 1.0 register interface specification. * * USB 2.0 shows up in upcoming www.pcmcia.org technology. * First was PCMCIA, like ISA; then CardBus, which is PCI. * Next comes "CardBay", using USB 2.0 signals. * * Contains additional contributions by Brad Hards, Rory Bolt, and others. * Special thanks to Intel and VIA for providing host controllers to * test this driver on, and Cypress (including In-System Design) for * providing early devices for those host controllers to talk to! */ #define DRIVER_AUTHOR "David Brownell" #define DRIVER_DESC "USB 2.0 'Enhanced' Host Controller (EHCI) Driver" static const char hcd_name [] = "ehci_hcd"; #undef EHCI_URB_TRACE /* magic numbers that can affect system performance */ #define EHCI_TUNE_CERR 3 /* 0-3 qtd retries; 0 == don't stop */ #define EHCI_TUNE_RL_HS 4 /* nak throttle; see 4.9 */ #define EHCI_TUNE_RL_TT 0 #define EHCI_TUNE_MULT_HS 1 /* 1-3 transactions/uframe; 4.10.3 */ #define EHCI_TUNE_MULT_TT 1 /* * Some drivers think it's safe to schedule isochronous transfers more than * 256 ms into the future (partly as a result of an old bug in the scheduling * code). In an attempt to avoid trouble, we will use a minimum scheduling * length of 512 frames instead of 256. */ #define EHCI_TUNE_FLS 1 /* (medium) 512-frame schedule */ /* Initial IRQ latency: faster than hw default */ static int log2_irq_thresh; // 0 to 6 module_param (log2_irq_thresh, int, S_IRUGO); MODULE_PARM_DESC (log2_irq_thresh, "log2 IRQ latency, 1-64 microframes"); /* initial park setting: slower than hw default */ static unsigned park; module_param (park, uint, S_IRUGO); MODULE_PARM_DESC (park, "park setting; 1-3 back-to-back async packets"); /* for flakey hardware, ignore overcurrent indicators */ static bool ignore_oc; module_param (ignore_oc, bool, S_IRUGO); MODULE_PARM_DESC (ignore_oc, "ignore bogus hardware overcurrent indications"); #define INTR_MASK (STS_IAA | STS_FATAL | STS_PCD | STS_ERR | STS_INT) /*-------------------------------------------------------------------------*/ #include "ehci.h" #include "pci-quirks.h" static void compute_tt_budget(u8 budget_table[EHCI_BANDWIDTH_SIZE], struct ehci_tt *tt); /* * The MosChip MCS9990 controller updates its microframe counter * a little before the frame counter, and occasionally we will read * the invalid intermediate value. Avoid problems by checking the * microframe number (the low-order 3 bits); if they are 0 then * re-read the register to get the correct value. */ static unsigned ehci_moschip_read_frame_index(struct ehci_hcd *ehci) { unsigned uf; uf = ehci_readl(ehci, &ehci->regs->frame_index); if (unlikely((uf & 7) == 0)) uf = ehci_readl(ehci, &ehci->regs->frame_index); return uf; } static inline unsigned ehci_read_frame_index(struct ehci_hcd *ehci) { if (ehci->frame_index_bug) return ehci_moschip_read_frame_index(ehci); return ehci_readl(ehci, &ehci->regs->frame_index); } #include "ehci-dbg.c" /*-------------------------------------------------------------------------*/ /* * ehci_handshake - spin reading hc until handshake completes or fails * @ptr: address of hc register to be read * @mask: bits to look at in result of read * @done: value of those bits when handshake succeeds * @usec: timeout in microseconds * * Returns negative errno, or zero on success * * Success happens when the "mask" bits have the specified value (hardware * handshake done). There are two failure modes: "usec" have passed (major * hardware flakeout), or the register reads as all-ones (hardware removed). * * That last failure should_only happen in cases like physical cardbus eject * before driver shutdown. But it also seems to be caused by bugs in cardbus * bridge shutdown: shutting down the bridge before the devices using it. */ int ehci_handshake(struct ehci_hcd *ehci, void __iomem *ptr, u32 mask, u32 done, int usec) { u32 result; do { result = ehci_readl(ehci, ptr); if (result == ~(u32)0) /* card removed */ return -ENODEV; result &= mask; if (result == done) return 0; udelay (1); usec--; } while (usec > 0); return -ETIMEDOUT; } EXPORT_SYMBOL_GPL(ehci_handshake); /* check TDI/ARC silicon is in host mode */ static int tdi_in_host_mode (struct ehci_hcd *ehci) { u32 tmp; tmp = ehci_readl(ehci, &ehci->regs->usbmode); return (tmp & 3) == USBMODE_CM_HC; } /* * Force HC to halt state from unknown (EHCI spec section 2.3). * Must be called with interrupts enabled and the lock not held. */ static int ehci_halt (struct ehci_hcd *ehci) { u32 temp; spin_lock_irq(&ehci->lock); /* disable any irqs left enabled by previous code */ ehci_writel(ehci, 0, &ehci->regs->intr_enable); if (ehci_is_TDI(ehci) && !tdi_in_host_mode(ehci)) { spin_unlock_irq(&ehci->lock); return 0; } /* * This routine gets called during probe before ehci->command * has been initialized, so we can't rely on its value. */ ehci->command &= ~CMD_RUN; temp = ehci_readl(ehci, &ehci->regs->command); temp &= ~(CMD_RUN | CMD_IAAD); ehci_writel(ehci, temp, &ehci->regs->command); spin_unlock_irq(&ehci->lock); synchronize_irq(ehci_to_hcd(ehci)->irq); return ehci_handshake(ehci, &ehci->regs->status, STS_HALT, STS_HALT, 16 * 125); } /* put TDI/ARC silicon into EHCI mode */ static void tdi_reset (struct ehci_hcd *ehci) { u32 tmp; tmp = ehci_readl(ehci, &ehci->regs->usbmode); tmp |= USBMODE_CM_HC; /* The default byte access to MMR space is LE after * controller reset. Set the required endian mode * for transfer buffers to match the host microprocessor */ if (ehci_big_endian_mmio(ehci)) tmp |= USBMODE_BE; ehci_writel(ehci, tmp, &ehci->regs->usbmode); } /* * Reset a non-running (STS_HALT == 1) controller. * Must be called with interrupts enabled and the lock not held. */ int ehci_reset(struct ehci_hcd *ehci) { int retval; u32 command = ehci_readl(ehci, &ehci->regs->command); /* If the EHCI debug controller is active, special care must be * taken before and after a host controller reset */ if (ehci->debug && !dbgp_reset_prep(ehci_to_hcd(ehci))) ehci->debug = NULL; command |= CMD_RESET; dbg_cmd (ehci, "reset", command); ehci_writel(ehci, command, &ehci->regs->command); ehci->rh_state = EHCI_RH_HALTED; ehci->next_statechange = jiffies; retval = ehci_handshake(ehci, &ehci->regs->command, CMD_RESET, 0, 250 * 1000); if (ehci->has_hostpc) { ehci_writel(ehci, USBMODE_EX_HC | USBMODE_EX_VBPS, &ehci->regs->usbmode_ex); ehci_writel(ehci, TXFIFO_DEFAULT, &ehci->regs->txfill_tuning); } if (retval) return retval; if (ehci_is_TDI(ehci)) tdi_reset (ehci); if (ehci->debug) dbgp_external_startup(ehci_to_hcd(ehci)); ehci->port_c_suspend = ehci->suspended_ports = ehci->resuming_ports = 0; return retval; } EXPORT_SYMBOL_GPL(ehci_reset); /* * Idle the controller (turn off the schedules). * Must be called with interrupts enabled and the lock not held. */ static void ehci_quiesce (struct ehci_hcd *ehci) { u32 temp; if (ehci->rh_state != EHCI_RH_RUNNING) return; /* wait for any schedule enables/disables to take effect */ temp = (ehci->command << 10) & (STS_ASS | STS_PSS); ehci_handshake(ehci, &ehci->regs->status, STS_ASS | STS_PSS, temp, 16 * 125); /* then disable anything that's still active */ spin_lock_irq(&ehci->lock); ehci->command &= ~(CMD_ASE | CMD_PSE); ehci_writel(ehci, ehci->command, &ehci->regs->command); spin_unlock_irq(&ehci->lock); /* hardware can take 16 microframes to turn off ... */ ehci_handshake(ehci, &ehci->regs->status, STS_ASS | STS_PSS, 0, 16 * 125); } /*-------------------------------------------------------------------------*/ static void end_iaa_cycle(struct ehci_hcd *ehci); static void end_unlink_async(struct ehci_hcd *ehci); static void unlink_empty_async(struct ehci_hcd *ehci); static void ehci_work(struct ehci_hcd *ehci); static void start_unlink_intr(struct ehci_hcd *ehci, struct ehci_qh *qh); static void end_unlink_intr(struct ehci_hcd *ehci, struct ehci_qh *qh); static int ehci_port_power(struct ehci_hcd *ehci, int portnum, bool enable); #include "ehci-timer.c" #include "ehci-hub.c" #include "ehci-mem.c" #include "ehci-q.c" #include "ehci-sched.c" #include "ehci-sysfs.c" /*-------------------------------------------------------------------------*/ /* On some systems, leaving remote wakeup enabled prevents system shutdown. * The firmware seems to think that powering off is a wakeup event! * This routine turns off remote wakeup and everything else, on all ports. */ static void ehci_turn_off_all_ports(struct ehci_hcd *ehci) { int port = HCS_N_PORTS(ehci->hcs_params); while (port--) { spin_unlock_irq(&ehci->lock); ehci_port_power(ehci, port, false); spin_lock_irq(&ehci->lock); ehci_writel(ehci, PORT_RWC_BITS, &ehci->regs->port_status[port]); } } /* * Halt HC, turn off all ports, and let the BIOS use the companion controllers. * Must be called with interrupts enabled and the lock not held. */ static void ehci_silence_controller(struct ehci_hcd *ehci) { ehci_halt(ehci); spin_lock_irq(&ehci->lock); ehci->rh_state = EHCI_RH_HALTED; ehci_turn_off_all_ports(ehci); /* make BIOS/etc use companion controller during reboot */ ehci_writel(ehci, 0, &ehci->regs->configured_flag); /* unblock posted writes */ ehci_readl(ehci, &ehci->regs->configured_flag); spin_unlock_irq(&ehci->lock); } /* ehci_shutdown kick in for silicon on any bus (not just pci, etc). * This forcibly disables dma and IRQs, helping kexec and other cases * where the next system software may expect clean state. */ static void ehci_shutdown(struct usb_hcd *hcd) { struct ehci_hcd *ehci = hcd_to_ehci(hcd); /** * Protect the system from crashing at system shutdown in cases where * usb host is not added yet from OTG controller driver. * As ehci_setup() not done yet, so stop accessing registers or * variables initialized in ehci_setup() */ if (!ehci->sbrn) return; spin_lock_irq(&ehci->lock); ehci->shutdown = true; ehci->rh_state = EHCI_RH_STOPPING; ehci->enabled_hrtimer_events = 0; spin_unlock_irq(&ehci->lock); ehci_silence_controller(ehci); hrtimer_cancel(&ehci->hrtimer); } /*-------------------------------------------------------------------------*/ /* * ehci_work is called from some interrupts, timers, and so on. * it calls driver completion functions, after dropping ehci->lock. */ static void ehci_work (struct ehci_hcd *ehci) { /* another CPU may drop ehci->lock during a schedule scan while * it reports urb completions. this flag guards against bogus * attempts at re-entrant schedule scanning. */ if (ehci->scanning) { ehci->need_rescan = true; return; } ehci->scanning = true; rescan: ehci->need_rescan = false; if (ehci->async_count) scan_async(ehci); if (ehci->intr_count > 0) scan_intr(ehci); if (ehci->isoc_count > 0) scan_isoc(ehci); if (ehci->need_rescan) goto rescan; ehci->scanning = false; /* the IO watchdog guards against hardware or driver bugs that * misplace IRQs, and should let us run completely without IRQs. * such lossage has been observed on both VT6202 and VT8235. */ turn_on_io_watchdog(ehci); } /* * Called when the ehci_hcd module is removed. */ static void ehci_stop (struct usb_hcd *hcd) { struct ehci_hcd *ehci = hcd_to_ehci (hcd); ehci_dbg (ehci, "stop\n"); /* no more interrupts ... */ spin_lock_irq(&ehci->lock); ehci->enabled_hrtimer_events = 0; spin_unlock_irq(&ehci->lock); ehci_quiesce(ehci); ehci_silence_controller(ehci); ehci_reset (ehci); hrtimer_cancel(&ehci->hrtimer); remove_sysfs_files(ehci); remove_debug_files (ehci); /* root hub is shut down separately (first, when possible) */ spin_lock_irq (&ehci->lock); end_free_itds(ehci); spin_unlock_irq (&ehci->lock); ehci_mem_cleanup (ehci); if (ehci->amd_pll_fix == 1) usb_amd_dev_put(); dbg_status (ehci, "ehci_stop completed", ehci_readl(ehci, &ehci->regs->status)); } /* one-time init, only for memory state */ static int ehci_init(struct usb_hcd *hcd) { struct ehci_hcd *ehci = hcd_to_ehci(hcd); u32 temp; int retval; u32 hcc_params; struct ehci_qh_hw *hw; spin_lock_init(&ehci->lock); /* * keep io watchdog by default, those good HCDs could turn off it later */ ehci->need_io_watchdog = 1; hrtimer_setup(&ehci->hrtimer, ehci_hrtimer_func, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); ehci->next_hrtimer_event = EHCI_HRTIMER_NO_EVENT; hcc_params = ehci_readl(ehci, &ehci->caps->hcc_params); /* * by default set standard 80% (== 100 usec/uframe) max periodic * bandwidth as required by USB 2.0 */ ehci->uframe_periodic_max = 100; /* * hw default: 1K periodic list heads, one per frame. * periodic_size can shrink by USBCMD update if hcc_params allows. */ ehci->periodic_size = DEFAULT_I_TDPS; INIT_LIST_HEAD(&ehci->async_unlink); INIT_LIST_HEAD(&ehci->async_idle); INIT_LIST_HEAD(&ehci->intr_unlink_wait); INIT_LIST_HEAD(&ehci->intr_unlink); INIT_LIST_HEAD(&ehci->intr_qh_list); INIT_LIST_HEAD(&ehci->cached_itd_list); INIT_LIST_HEAD(&ehci->cached_sitd_list); INIT_LIST_HEAD(&ehci->tt_list); if (HCC_PGM_FRAMELISTLEN(hcc_params)) { /* periodic schedule size can be smaller than default */ switch (EHCI_TUNE_FLS) { case 0: ehci->periodic_size = 1024; break; case 1: ehci->periodic_size = 512; break; case 2: ehci->periodic_size = 256; break; default: BUG(); } } if ((retval = ehci_mem_init(ehci, GFP_KERNEL)) < 0) return retval; /* controllers may cache some of the periodic schedule ... */ if (HCC_ISOC_CACHE(hcc_params)) // full frame cache ehci->i_thresh = 0; else // N microframes cached ehci->i_thresh = 2 + HCC_ISOC_THRES(hcc_params); /* * dedicate a qh for the async ring head, since we couldn't unlink * a 'real' qh without stopping the async schedule [4.8]. use it * as the 'reclamation list head' too. * its dummy is used in hw_alt_next of many tds, to prevent the qh * from automatically advancing to the next td after short reads. */ ehci->async->qh_next.qh = NULL; hw = ehci->async->hw; hw->hw_next = QH_NEXT(ehci, ehci->async->qh_dma); hw->hw_info1 = cpu_to_hc32(ehci, QH_HEAD); #if defined(CONFIG_PPC_PS3) hw->hw_info1 |= cpu_to_hc32(ehci, QH_INACTIVATE); #endif hw->hw_token = cpu_to_hc32(ehci, QTD_STS_HALT); hw->hw_qtd_next = EHCI_LIST_END(ehci); ehci->async->qh_state = QH_STATE_LINKED; hw->hw_alt_next = QTD_NEXT(ehci, ehci->async->dummy->qtd_dma); /* clear interrupt enables, set irq latency */ if (log2_irq_thresh < 0 || log2_irq_thresh > 6) log2_irq_thresh = 0; temp = 1 << (16 + log2_irq_thresh); if (HCC_PER_PORT_CHANGE_EVENT(hcc_params)) { ehci->has_ppcd = 1; ehci_dbg(ehci, "enable per-port change event\n"); temp |= CMD_PPCEE; } if (HCC_CANPARK(hcc_params)) { /* HW default park == 3, on hardware that supports it (like * NVidia and ALI silicon), maximizes throughput on the async * schedule by avoiding QH fetches between transfers. * * With fast usb storage devices and NForce2, "park" seems to * make problems: throughput reduction (!), data errors... */ if (park) { park = min_t(unsigned int, park, 3); temp |= CMD_PARK; temp |= park << 8; } ehci_dbg(ehci, "park %d\n", park); } if (HCC_PGM_FRAMELISTLEN(hcc_params)) { /* periodic schedule size can be smaller than default */ temp &= ~(3 << 2); temp |= (EHCI_TUNE_FLS << 2); } ehci->command = temp; /* Accept arbitrarily long scatter-gather lists */ if (!hcd->localmem_pool) hcd->self.sg_tablesize = ~0; /* Prepare for unlinking active QHs */ ehci->old_current = ~0; return 0; } /* start HC running; it's halted, ehci_init() has been run (once) */ static int ehci_run (struct usb_hcd *hcd) { struct ehci_hcd *ehci = hcd_to_ehci (hcd); u32 temp; u32 hcc_params; int rc; hcd->uses_new_polling = 1; /* EHCI spec section 4.1 */ ehci_writel(ehci, ehci->periodic_dma, &ehci->regs->frame_list); ehci_writel(ehci, (u32)ehci->async->qh_dma, &ehci->regs->async_next); /* * hcc_params controls whether ehci->regs->segment must (!!!) * be used; it constrains QH/ITD/SITD and QTD locations. * dma_pool consistent memory always uses segment zero. * streaming mappings for I/O buffers, like dma_map_single(), * can return segments above 4GB, if the device allows. * * NOTE: the dma mask is visible through dev->dma_mask, so * drivers can pass this info along ... like NETIF_F_HIGHDMA, * Scsi_Host.highmem_io, and so forth. It's readonly to all * host side drivers though. */ hcc_params = ehci_readl(ehci, &ehci->caps->hcc_params); if (HCC_64BIT_ADDR(hcc_params)) { ehci_writel(ehci, 0, &ehci->regs->segment); #if 0 // this is deeply broken on almost all architectures if (!dma_set_mask(hcd->self.controller, DMA_BIT_MASK(64))) ehci_info(ehci, "enabled 64bit DMA\n"); #endif } // Philips, Intel, and maybe others need CMD_RUN before the // root hub will detect new devices (why?); NEC doesn't ehci->command &= ~(CMD_LRESET|CMD_IAAD|CMD_PSE|CMD_ASE|CMD_RESET); ehci->command |= CMD_RUN; ehci_writel(ehci, ehci->command, &ehci->regs->command); dbg_cmd (ehci, "init", ehci->command); /* * Start, enabling full USB 2.0 functionality ... usb 1.1 devices * are explicitly handed to companion controller(s), so no TT is * involved with the root hub. (Except where one is integrated, * and there's no companion controller unless maybe for USB OTG.) * * Turning on the CF flag will transfer ownership of all ports * from the companions to the EHCI controller. If any of the * companions are in the middle of a port reset at the time, it * could cause trouble. Write-locking ehci_cf_port_reset_rwsem * guarantees that no resets are in progress. After we set CF, * a short delay lets the hardware catch up; new resets shouldn't * be started before the port switching actions could complete. */ down_write(&ehci_cf_port_reset_rwsem); ehci->rh_state = EHCI_RH_RUNNING; ehci_writel(ehci, FLAG_CF, &ehci->regs->configured_flag); /* Wait until HC become operational */ ehci_readl(ehci, &ehci->regs->command); /* unblock posted writes */ msleep(5); /* For Aspeed, STS_HALT also depends on ASS/PSS status. * Check CMD_RUN instead. */ if (ehci->is_aspeed) rc = ehci_handshake(ehci, &ehci->regs->command, CMD_RUN, 1, 100 * 1000); else rc = ehci_handshake(ehci, &ehci->regs->status, STS_HALT, 0, 100 * 1000); up_write(&ehci_cf_port_reset_rwsem); if (rc) { ehci_err(ehci, "USB %x.%x, controller refused to start: %d\n", ((ehci->sbrn & 0xf0)>>4), (ehci->sbrn & 0x0f), rc); return rc; } ehci->last_periodic_enable = ktime_get_real(); temp = HC_VERSION(ehci, ehci_readl(ehci, &ehci->caps->hc_capbase)); ehci_info (ehci, "USB %x.%x started, EHCI %x.%02x%s\n", ((ehci->sbrn & 0xf0)>>4), (ehci->sbrn & 0x0f), temp >> 8, temp & 0xff, (ignore_oc || ehci->spurious_oc) ? ", overcurrent ignored" : ""); ehci_writel(ehci, INTR_MASK, &ehci->regs->intr_enable); /* Turn On Interrupts */ /* GRR this is run-once init(), being done every time the HC starts. * So long as they're part of class devices, we can't do it init() * since the class device isn't created that early. */ create_debug_files(ehci); create_sysfs_files(ehci); return 0; } int ehci_setup(struct usb_hcd *hcd) { struct ehci_hcd *ehci = hcd_to_ehci(hcd); int retval; ehci->regs = (void __iomem *)ehci->caps + HC_LENGTH(ehci, ehci_readl(ehci, &ehci->caps->hc_capbase)); dbg_hcs_params(ehci, "reset"); dbg_hcc_params(ehci, "reset"); /* cache this readonly data; minimize chip reads */ ehci->hcs_params = ehci_readl(ehci, &ehci->caps->hcs_params); ehci->sbrn = HCD_USB2; /* data structure init */ retval = ehci_init(hcd); if (retval) return retval; retval = ehci_halt(ehci); if (retval) { ehci_mem_cleanup(ehci); return retval; } ehci_reset(ehci); return 0; } EXPORT_SYMBOL_GPL(ehci_setup); /*-------------------------------------------------------------------------*/ static irqreturn_t ehci_irq (struct usb_hcd *hcd) { struct ehci_hcd *ehci = hcd_to_ehci (hcd); u32 status, current_status, masked_status, pcd_status = 0; u32 cmd; int bh; spin_lock(&ehci->lock); status = 0; current_status = ehci_readl(ehci, &ehci->regs->status); restart: /* e.g. cardbus physical eject */ if (current_status == ~(u32) 0) { ehci_dbg (ehci, "device removed\n"); goto dead; } status |= current_status; /* * We don't use STS_FLR, but some controllers don't like it to * remain on, so mask it out along with the other status bits. */ masked_status = current_status & (INTR_MASK | STS_FLR); /* Shared IRQ? */ if (!masked_status || unlikely(ehci->rh_state == EHCI_RH_HALTED)) { spin_unlock(&ehci->lock); return IRQ_NONE; } /* clear (just) interrupts */ ehci_writel(ehci, masked_status, &ehci->regs->status); /* For edge interrupts, don't race with an interrupt bit being raised */ current_status = ehci_readl(ehci, &ehci->regs->status); if (current_status & INTR_MASK) goto restart; cmd = ehci_readl(ehci, &ehci->regs->command); bh = 0; /* normal [4.15.1.2] or error [4.15.1.1] completion */ if (likely ((status & (STS_INT|STS_ERR)) != 0)) { if (likely ((status & STS_ERR) == 0)) { INCR(ehci->stats.normal); } else { /* Force to check port status */ if (ehci->has_ci_pec_bug) status |= STS_PCD; INCR(ehci->stats.error); } bh = 1; } /* complete the unlinking of some qh [4.15.2.3] */ if (status & STS_IAA) { /* Turn off the IAA watchdog */ ehci->enabled_hrtimer_events &= ~BIT(EHCI_HRTIMER_IAA_WATCHDOG); /* * Mild optimization: Allow another IAAD to reset the * hrtimer, if one occurs before the next expiration. * In theory we could always cancel the hrtimer, but * tests show that about half the time it will be reset * for some other event anyway. */ if (ehci->next_hrtimer_event == EHCI_HRTIMER_IAA_WATCHDOG) ++ehci->next_hrtimer_event; /* guard against (alleged) silicon errata */ if (cmd & CMD_IAAD) ehci_dbg(ehci, "IAA with IAAD still set?\n"); if (ehci->iaa_in_progress) INCR(ehci->stats.iaa); end_iaa_cycle(ehci); } /* remote wakeup [4.3.1] */ if (status & STS_PCD) { unsigned i = HCS_N_PORTS (ehci->hcs_params); u32 ppcd = ~0; /* kick root hub later */ pcd_status = status; /* resume root hub? */ if (ehci->rh_state == EHCI_RH_SUSPENDED) usb_hcd_resume_root_hub(hcd); /* get per-port change detect bits */ if (ehci->has_ppcd) ppcd = status >> 16; while (i--) { int pstatus; /* leverage per-port change bits feature */ if (!(ppcd & (1 << i))) continue; pstatus = ehci_readl(ehci, &ehci->regs->port_status[i]); if (pstatus & PORT_OWNER) continue; if (!(test_bit(i, &ehci->suspended_ports) && ((pstatus & PORT_RESUME) || !(pstatus & PORT_SUSPEND)) && (pstatus & PORT_PE) && ehci->reset_done[i] == 0)) continue; /* start USB_RESUME_TIMEOUT msec resume signaling from * this port, and make hub_wq collect * PORT_STAT_C_SUSPEND to stop that signaling. */ ehci->reset_done[i] = jiffies + msecs_to_jiffies(USB_RESUME_TIMEOUT); set_bit(i, &ehci->resuming_ports); ehci_dbg (ehci, "port %d remote wakeup\n", i + 1); usb_hcd_start_port_resume(&hcd->self, i); mod_timer(&hcd->rh_timer, ehci->reset_done[i]); } } /* PCI errors [4.15.2.4] */ if (unlikely ((status & STS_FATAL) != 0)) { ehci_err(ehci, "fatal error\n"); dbg_cmd(ehci, "fatal", cmd); dbg_status(ehci, "fatal", status); dead: usb_hc_died(hcd); /* Don't let the controller do anything more */ ehci->shutdown = true; ehci->rh_state = EHCI_RH_STOPPING; ehci->command &= ~(CMD_RUN | CMD_ASE | CMD_PSE); ehci_writel(ehci, ehci->command, &ehci->regs->command); ehci_writel(ehci, 0, &ehci->regs->intr_enable); ehci_handle_controller_death(ehci); /* Handle completions when the controller stops */ bh = 0; } if (bh) ehci_work (ehci); spin_unlock(&ehci->lock); if (pcd_status) usb_hcd_poll_rh_status(hcd); return IRQ_HANDLED; } /*-------------------------------------------------------------------------*/ /* * non-error returns are a promise to giveback() the urb later * we drop ownership so next owner (or urb unlink) can get it * * urb + dev is in hcd.self.controller.urb_list * we're queueing TDs onto software and hardware lists * * hcd-specific init for hcpriv hasn't been done yet * * NOTE: control, bulk, and interrupt share the same code to append TDs * to a (possibly active) QH, and the same QH scanning code. */ static int ehci_urb_enqueue ( struct usb_hcd *hcd, struct urb *urb, gfp_t mem_flags ) { struct ehci_hcd *ehci = hcd_to_ehci (hcd); struct list_head qtd_list; INIT_LIST_HEAD (&qtd_list); switch (usb_pipetype (urb->pipe)) { case PIPE_CONTROL: /* qh_completions() code doesn't handle all the fault cases * in multi-TD control transfers. Even 1KB is rare anyway. */ if (urb->transfer_buffer_length > (16 * 1024)) return -EMSGSIZE; fallthrough; /* case PIPE_BULK: */ default: if (!qh_urb_transaction (ehci, urb, &qtd_list, mem_flags)) return -ENOMEM; return submit_async(ehci, urb, &qtd_list, mem_flags); case PIPE_INTERRUPT: if (!qh_urb_transaction (ehci, urb, &qtd_list, mem_flags)) return -ENOMEM; return intr_submit(ehci, urb, &qtd_list, mem_flags); case PIPE_ISOCHRONOUS: if (urb->dev->speed == USB_SPEED_HIGH) return itd_submit (ehci, urb, mem_flags); else return sitd_submit (ehci, urb, mem_flags); } } /* remove from hardware lists * completions normally happen asynchronously */ static int ehci_urb_dequeue(struct usb_hcd *hcd, struct urb *urb, int status) { struct ehci_hcd *ehci = hcd_to_ehci (hcd); struct ehci_qh *qh; unsigned long flags; int rc; spin_lock_irqsave (&ehci->lock, flags); rc = usb_hcd_check_unlink_urb(hcd, urb, status); if (rc) goto done; if (usb_pipetype(urb->pipe) == PIPE_ISOCHRONOUS) { /* * We don't expedite dequeue for isochronous URBs. * Just wait until they complete normally or their * time slot expires. */ } else { qh = (struct ehci_qh *) urb->hcpriv; qh->unlink_reason |= QH_UNLINK_REQUESTED; switch (qh->qh_state) { case QH_STATE_LINKED: if (usb_pipetype(urb->pipe) == PIPE_INTERRUPT) start_unlink_intr(ehci, qh); else start_unlink_async(ehci, qh); break; case QH_STATE_COMPLETING: qh->dequeue_during_giveback = 1; break; case QH_STATE_UNLINK: case QH_STATE_UNLINK_WAIT: /* already started */ break; case QH_STATE_IDLE: /* QH might be waiting for a Clear-TT-Buffer */ qh_completions(ehci, qh); break; } } done: spin_unlock_irqrestore (&ehci->lock, flags); return rc; } /*-------------------------------------------------------------------------*/ // bulk qh holds the data toggle static void ehci_endpoint_disable (struct usb_hcd *hcd, struct usb_host_endpoint *ep) { struct ehci_hcd *ehci = hcd_to_ehci (hcd); unsigned long flags; struct ehci_qh *qh; /* ASSERT: any requests/urbs are being unlinked */ /* ASSERT: nobody can be submitting urbs for this any more */ rescan: spin_lock_irqsave (&ehci->lock, flags); qh = ep->hcpriv; if (!qh) goto done; /* endpoints can be iso streams. for now, we don't * accelerate iso completions ... so spin a while. */ if (qh->hw == NULL) { struct ehci_iso_stream *stream = ep->hcpriv; if (!list_empty(&stream->td_list)) goto idle_timeout; /* BUG_ON(!list_empty(&stream->free_list)); */ reserve_release_iso_bandwidth(ehci, stream, -1); kfree(stream); goto done; } qh->unlink_reason |= QH_UNLINK_REQUESTED; switch (qh->qh_state) { case QH_STATE_LINKED: if (list_empty(&qh->qtd_list)) qh->unlink_reason |= QH_UNLINK_QUEUE_EMPTY; else WARN_ON(1); if (usb_endpoint_type(&ep->desc) != USB_ENDPOINT_XFER_INT) start_unlink_async(ehci, qh); else start_unlink_intr(ehci, qh); fallthrough; case QH_STATE_COMPLETING: /* already in unlinking */ case QH_STATE_UNLINK: /* wait for hw to finish? */ case QH_STATE_UNLINK_WAIT: idle_timeout: spin_unlock_irqrestore (&ehci->lock, flags); schedule_timeout_uninterruptible(1); goto rescan; case QH_STATE_IDLE: /* fully unlinked */ if (qh->clearing_tt) goto idle_timeout; if (list_empty (&qh->qtd_list)) { if (qh->ps.bw_uperiod) reserve_release_intr_bandwidth(ehci, qh, -1); qh_destroy(ehci, qh); break; } fallthrough; default: /* caller was supposed to have unlinked any requests; * that's not our job. just leak this memory. */ ehci_err (ehci, "qh %p (#%02x) state %d%s\n", qh, ep->desc.bEndpointAddress, qh->qh_state, list_empty (&qh->qtd_list) ? "" : "(has tds)"); break; } done: ep->hcpriv = NULL; spin_unlock_irqrestore (&ehci->lock, flags); } static void ehci_endpoint_reset(struct usb_hcd *hcd, struct usb_host_endpoint *ep) { struct ehci_hcd *ehci = hcd_to_ehci(hcd); struct ehci_qh *qh; int eptype = usb_endpoint_type(&ep->desc); int epnum = usb_endpoint_num(&ep->desc); int is_out = usb_endpoint_dir_out(&ep->desc); unsigned long flags; if (eptype != USB_ENDPOINT_XFER_BULK && eptype != USB_ENDPOINT_XFER_INT) return; spin_lock_irqsave(&ehci->lock, flags); qh = ep->hcpriv; /* For Bulk and Interrupt endpoints we maintain the toggle state * in the hardware; the toggle bits in udev aren't used at all. * When an endpoint is reset by usb_clear_halt() we must reset * the toggle bit in the QH. */ if (qh) { if (!list_empty(&qh->qtd_list)) { WARN_ONCE(1, "clear_halt for a busy endpoint\n"); } else { /* The toggle value in the QH can't be updated * while the QH is active. Unlink it now; * re-linking will call qh_refresh(). */ usb_settoggle(qh->ps.udev, epnum, is_out, 0); qh->unlink_reason |= QH_UNLINK_REQUESTED; if (eptype == USB_ENDPOINT_XFER_BULK) start_unlink_async(ehci, qh); else start_unlink_intr(ehci, qh); } } spin_unlock_irqrestore(&ehci->lock, flags); } static int ehci_get_frame (struct usb_hcd *hcd) { struct ehci_hcd *ehci = hcd_to_ehci (hcd); return (ehci_read_frame_index(ehci) >> 3) % ehci->periodic_size; } /*-------------------------------------------------------------------------*/ /* Device addition and removal */ static void ehci_remove_device(struct usb_hcd *hcd, struct usb_device *udev) { struct ehci_hcd *ehci = hcd_to_ehci(hcd); spin_lock_irq(&ehci->lock); drop_tt(udev); spin_unlock_irq(&ehci->lock); } /*-------------------------------------------------------------------------*/ #ifdef CONFIG_PM /* Clear wakeup signal locked in zhaoxin platform when device plug in. */ static void ehci_zx_wakeup_clear(struct ehci_hcd *ehci) { u32 __iomem *reg = &ehci->regs->port_status[4]; u32 t1 = ehci_readl(ehci, reg); t1 &= (u32)~0xf0000; t1 |= PORT_TEST_FORCE; ehci_writel(ehci, t1, reg); t1 = ehci_readl(ehci, reg); msleep(1); t1 &= (u32)~0xf0000; ehci_writel(ehci, t1, reg); ehci_readl(ehci, reg); msleep(1); t1 = ehci_readl(ehci, reg); ehci_writel(ehci, t1 | PORT_CSC, reg); ehci_readl(ehci, reg); } /* suspend/resume, section 4.3 */ /* These routines handle the generic parts of controller suspend/resume */ int ehci_suspend(struct usb_hcd *hcd, bool do_wakeup) { struct ehci_hcd *ehci = hcd_to_ehci(hcd); if (time_before(jiffies, ehci->next_statechange)) msleep(10); /* * Root hub was already suspended. Disable IRQ emission and * mark HW unaccessible. The PM and USB cores make sure that * the root hub is either suspended or stopped. */ ehci_prepare_ports_for_controller_suspend(ehci, do_wakeup); spin_lock_irq(&ehci->lock); ehci_writel(ehci, 0, &ehci->regs->intr_enable); (void) ehci_readl(ehci, &ehci->regs->intr_enable); clear_bit(HCD_FLAG_HW_ACCESSIBLE, &hcd->flags); spin_unlock_irq(&ehci->lock); synchronize_irq(hcd->irq); /* Check for race with a wakeup request */ if (do_wakeup && HCD_WAKEUP_PENDING(hcd)) { ehci_resume(hcd, false); return -EBUSY; } return 0; } EXPORT_SYMBOL_GPL(ehci_suspend); /* Returns 0 if power was preserved, 1 if power was lost */ int ehci_resume(struct usb_hcd *hcd, bool force_reset) { struct ehci_hcd *ehci = hcd_to_ehci(hcd); if (time_before(jiffies, ehci->next_statechange)) msleep(100); /* Mark hardware accessible again as we are back to full power by now */ set_bit(HCD_FLAG_HW_ACCESSIBLE, &hcd->flags); if (ehci->shutdown) return 0; /* Controller is dead */ if (ehci->zx_wakeup_clear_needed) ehci_zx_wakeup_clear(ehci); /* * If CF is still set and reset isn't forced * then we maintained suspend power. * Just undo the effect of ehci_suspend(). */ if (ehci_readl(ehci, &ehci->regs->configured_flag) == FLAG_CF && !force_reset) { int mask = INTR_MASK; ehci_prepare_ports_for_controller_resume(ehci); spin_lock_irq(&ehci->lock); if (ehci->shutdown) goto skip; if (!hcd->self.root_hub->do_remote_wakeup) mask &= ~STS_PCD; ehci_writel(ehci, mask, &ehci->regs->intr_enable); ehci_readl(ehci, &ehci->regs->intr_enable); skip: spin_unlock_irq(&ehci->lock); return 0; } /* * Else reset, to cope with power loss or resume from hibernation * having let the firmware kick in during reboot. */ usb_root_hub_lost_power(hcd->self.root_hub); (void) ehci_halt(ehci); (void) ehci_reset(ehci); spin_lock_irq(&ehci->lock); if (ehci->shutdown) goto skip; ehci_writel(ehci, ehci->command, &ehci->regs->command); ehci_writel(ehci, FLAG_CF, &ehci->regs->configured_flag); ehci_readl(ehci, &ehci->regs->command); /* unblock posted writes */ ehci->rh_state = EHCI_RH_SUSPENDED; spin_unlock_irq(&ehci->lock); return 1; } EXPORT_SYMBOL_GPL(ehci_resume); #endif /*-------------------------------------------------------------------------*/ /* * Generic structure: This gets copied for platform drivers so that * individual entries can be overridden as needed. */ static const struct hc_driver ehci_hc_driver = { .description = hcd_name, .product_desc = "EHCI Host Controller", .hcd_priv_size = sizeof(struct ehci_hcd), /* * generic hardware linkage */ .irq = ehci_irq, .flags = HCD_MEMORY | HCD_DMA | HCD_USB2 | HCD_BH, /* * basic lifecycle operations */ .reset = ehci_setup, .start = ehci_run, .stop = ehci_stop, .shutdown = ehci_shutdown, /* * managing i/o requests and associated device resources */ .urb_enqueue = ehci_urb_enqueue, .urb_dequeue = ehci_urb_dequeue, .endpoint_disable = ehci_endpoint_disable, .endpoint_reset = ehci_endpoint_reset, .clear_tt_buffer_complete = ehci_clear_tt_buffer_complete, /* * scheduling support */ .get_frame_number = ehci_get_frame, /* * root hub support */ .hub_status_data = ehci_hub_status_data, .hub_control = ehci_hub_control, .bus_suspend = ehci_bus_suspend, .bus_resume = ehci_bus_resume, .relinquish_port = ehci_relinquish_port, .port_handed_over = ehci_port_handed_over, .get_resuming_ports = ehci_get_resuming_ports, /* * device support */ .free_dev = ehci_remove_device, #ifdef CONFIG_USB_HCD_TEST_MODE /* EH SINGLE_STEP_SET_FEATURE test support */ .submit_single_step_set_feature = ehci_submit_single_step_set_feature, #endif }; void ehci_init_driver(struct hc_driver *drv, const struct ehci_driver_overrides *over) { /* Copy the generic table to drv and then apply the overrides */ *drv = ehci_hc_driver; if (over) { drv->hcd_priv_size += over->extra_priv_size; if (over->reset) drv->reset = over->reset; if (over->port_power) drv->port_power = over->port_power; } } EXPORT_SYMBOL_GPL(ehci_init_driver); /*-------------------------------------------------------------------------*/ MODULE_DESCRIPTION(DRIVER_DESC); MODULE_AUTHOR (DRIVER_AUTHOR); MODULE_LICENSE ("GPL"); #ifdef CONFIG_USB_EHCI_SH #include "ehci-sh.c" #endif #ifdef CONFIG_PPC_PS3 #include "ehci-ps3.c" #endif #ifdef CONFIG_USB_EHCI_HCD_PPC_OF #include "ehci-ppc-of.c" #endif #ifdef CONFIG_XPS_USB_HCD_XILINX #include "ehci-xilinx-of.c" #endif #ifdef CONFIG_SPARC_LEON #include "ehci-grlib.c" #endif static struct platform_driver * const platform_drivers[] = { #ifdef CONFIG_USB_EHCI_SH &ehci_hcd_sh_driver, #endif #ifdef CONFIG_USB_EHCI_HCD_PPC_OF &ehci_hcd_ppc_of_driver, #endif #ifdef CONFIG_XPS_USB_HCD_XILINX &ehci_hcd_xilinx_of_driver, #endif #ifdef CONFIG_SPARC_LEON &ehci_grlib_driver, #endif }; static int __init ehci_hcd_init(void) { int retval = 0; if (usb_disabled()) return -ENODEV; pr_debug("%s: block sizes: qh %zd qtd %zd itd %zd sitd %zd\n", hcd_name, sizeof(struct ehci_qh), sizeof(struct ehci_qtd), sizeof(struct ehci_itd), sizeof(struct ehci_sitd)); #ifdef CONFIG_DYNAMIC_DEBUG ehci_debug_root = debugfs_create_dir("ehci", usb_debug_root); #endif retval = platform_register_drivers(platform_drivers, ARRAY_SIZE(platform_drivers)); if (retval < 0) goto clean0; #ifdef CONFIG_PPC_PS3 retval = ps3_ehci_driver_register(&ps3_ehci_driver); if (retval < 0) goto clean1; #endif return 0; #ifdef CONFIG_PPC_PS3 clean1: #endif platform_unregister_drivers(platform_drivers, ARRAY_SIZE(platform_drivers)); clean0: #ifdef CONFIG_DYNAMIC_DEBUG debugfs_remove(ehci_debug_root); ehci_debug_root = NULL; #endif return retval; } module_init(ehci_hcd_init); static void __exit ehci_hcd_cleanup(void) { #ifdef CONFIG_PPC_PS3 ps3_ehci_driver_unregister(&ps3_ehci_driver); #endif platform_unregister_drivers(platform_drivers, ARRAY_SIZE(platform_drivers)); #ifdef CONFIG_DYNAMIC_DEBUG debugfs_remove(ehci_debug_root); #endif } module_exit(ehci_hcd_cleanup); |
| 703 704 704 565 601 723 22 721 22 22 106 10 10 105 100 7 100 100 760 319 701 163 15 64 32 12 306 697 724 299 677 55 459 665 29 29 16 538 537 538 539 49 50 49 1 1 6 6 6 6 6 572 566 22 15 21 5 9 7 21 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 | // SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc. * All Rights Reserved. */ #include "xfs_platform.h" #include "xfs_fs.h" #include "xfs_shared.h" #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_mount.h" #include "xfs_trans.h" #include "xfs_buf_item.h" #include "xfs_trans_priv.h" #include "xfs_trace.h" /* * Check to see if a buffer matching the given parameters is already * a part of the given transaction. */ STATIC struct xfs_buf * xfs_trans_buf_item_match( struct xfs_trans *tp, struct xfs_buftarg *target, struct xfs_buf_map *map, int nmaps) { struct xfs_log_item *lip; struct xfs_buf_log_item *blip; int len = 0; int i; for (i = 0; i < nmaps; i++) len += map[i].bm_len; list_for_each_entry(lip, &tp->t_items, li_trans) { blip = (struct xfs_buf_log_item *)lip; if (blip->bli_item.li_type == XFS_LI_BUF && blip->bli_buf->b_target == target && xfs_buf_daddr(blip->bli_buf) == map[0].bm_bn && blip->bli_buf->b_length == len) { ASSERT(blip->bli_buf->b_map_count == nmaps); return blip->bli_buf; } } return NULL; } /* * Add the locked buffer to the transaction. * * The buffer must be locked, and it cannot be associated with any * transaction. * * If the buffer does not yet have a buf log item associated with it, * then allocate one for it. Then add the buf item to the transaction. */ STATIC void _xfs_trans_bjoin( struct xfs_trans *tp, struct xfs_buf *bp, int reset_recur) { struct xfs_buf_log_item *bip; ASSERT(bp->b_transp == NULL); /* * The xfs_buf_log_item pointer is stored in b_log_item. If * it doesn't have one yet, then allocate one and initialize it. * The checks to see if one is there are in xfs_buf_item_init(). */ xfs_buf_item_init(bp, tp->t_mountp); bip = bp->b_log_item; ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); ASSERT(!(bip->__bli_format.blf_flags & XFS_BLF_CANCEL)); ASSERT(!(bip->bli_flags & XFS_BLI_LOGGED)); if (reset_recur) bip->bli_recur = 0; /* * Take a reference for this transaction on the buf item. */ atomic_inc(&bip->bli_refcount); /* * Attach the item to the transaction so we can find it in * xfs_trans_get_buf() and friends. */ xfs_trans_add_item(tp, &bip->bli_item); bp->b_transp = tp; } void xfs_trans_bjoin( struct xfs_trans *tp, struct xfs_buf *bp) { _xfs_trans_bjoin(tp, bp, 0); trace_xfs_trans_bjoin(bp->b_log_item); } /* * Get and lock the buffer for the caller if it is not already * locked within the given transaction. If it is already locked * within the transaction, just increment its lock recursion count * and return a pointer to it. * * If the transaction pointer is NULL, make this just a normal * get_buf() call. */ int xfs_trans_get_buf_map( struct xfs_trans *tp, struct xfs_buftarg *target, struct xfs_buf_map *map, int nmaps, xfs_buf_flags_t flags, struct xfs_buf **bpp) { struct xfs_buf *bp; struct xfs_buf_log_item *bip; int error; *bpp = NULL; if (!tp) return xfs_buf_get_map(target, map, nmaps, flags, bpp); /* * If we find the buffer in the cache with this transaction * pointer in its b_fsprivate2 field, then we know we already * have it locked. In this case we just increment the lock * recursion count and return the buffer to the caller. */ bp = xfs_trans_buf_item_match(tp, target, map, nmaps); if (bp != NULL) { ASSERT(xfs_buf_islocked(bp)); if (xfs_is_shutdown(tp->t_mountp)) { xfs_buf_stale(bp); bp->b_flags |= XBF_DONE; } ASSERT(bp->b_transp == tp); bip = bp->b_log_item; ASSERT(bip != NULL); ASSERT(atomic_read(&bip->bli_refcount) > 0); bip->bli_recur++; trace_xfs_trans_get_buf_recur(bip); *bpp = bp; return 0; } error = xfs_buf_get_map(target, map, nmaps, flags, &bp); if (error) return error; ASSERT(!bp->b_error); _xfs_trans_bjoin(tp, bp, 1); trace_xfs_trans_get_buf(bp->b_log_item); *bpp = bp; return 0; } /* * Get and lock the superblock buffer for the given transaction. */ static struct xfs_buf * __xfs_trans_getsb( struct xfs_trans *tp, struct xfs_buf *bp) { /* * Just increment the lock recursion count if the buffer is already * attached to this transaction. */ if (bp->b_transp == tp) { struct xfs_buf_log_item *bip = bp->b_log_item; ASSERT(bip != NULL); ASSERT(atomic_read(&bip->bli_refcount) > 0); bip->bli_recur++; trace_xfs_trans_getsb_recur(bip); } else { xfs_buf_lock(bp); xfs_buf_hold(bp); _xfs_trans_bjoin(tp, bp, 1); trace_xfs_trans_getsb(bp->b_log_item); } return bp; } struct xfs_buf * xfs_trans_getsb( struct xfs_trans *tp) { return __xfs_trans_getsb(tp, tp->t_mountp->m_sb_bp); } struct xfs_buf * xfs_trans_getrtsb( struct xfs_trans *tp) { if (!tp->t_mountp->m_rtsb_bp) return NULL; return __xfs_trans_getsb(tp, tp->t_mountp->m_rtsb_bp); } /* * Get and lock the buffer for the caller if it is not already * locked within the given transaction. If it has not yet been * read in, read it from disk. If it is already locked * within the transaction and already read in, just increment its * lock recursion count and return a pointer to it. * * If the transaction pointer is NULL, make this just a normal * read_buf() call. */ int xfs_trans_read_buf_map( struct xfs_mount *mp, struct xfs_trans *tp, struct xfs_buftarg *target, struct xfs_buf_map *map, int nmaps, xfs_buf_flags_t flags, struct xfs_buf **bpp, const struct xfs_buf_ops *ops) { struct xfs_buf *bp = NULL; struct xfs_buf_log_item *bip; int error; *bpp = NULL; /* * If we find the buffer in the cache with this transaction * pointer in its b_fsprivate2 field, then we know we already * have it locked. If it is already read in we just increment * the lock recursion count and return the buffer to the caller. * If the buffer is not yet read in, then we read it in, increment * the lock recursion count, and return it to the caller. */ if (tp) bp = xfs_trans_buf_item_match(tp, target, map, nmaps); if (bp) { ASSERT(xfs_buf_islocked(bp)); ASSERT(bp->b_transp == tp); ASSERT(bp->b_log_item != NULL); ASSERT(!bp->b_error); ASSERT(bp->b_flags & XBF_DONE); /* * We never locked this buf ourselves, so we shouldn't * brelse it either. Just get out. */ if (xfs_is_shutdown(mp)) { trace_xfs_trans_read_buf_shut(bp, _RET_IP_); return -EIO; } /* * Check if the caller is trying to read a buffer that is * already attached to the transaction yet has no buffer ops * assigned. Ops are usually attached when the buffer is * attached to the transaction, or by the read caller if * special circumstances. That didn't happen, which is not * how this is supposed to go. * * If the buffer passes verification we'll let this go, but if * not we have to shut down. Let the transaction cleanup code * release this buffer when it kills the tranaction. */ ASSERT(bp->b_ops != NULL); error = xfs_buf_reverify(bp, ops); if (error) { xfs_buf_ioerror_alert(bp, __return_address); if (tp->t_flags & XFS_TRANS_DIRTY) xfs_force_shutdown(tp->t_mountp, SHUTDOWN_META_IO_ERROR); /* bad CRC means corrupted metadata */ if (error == -EFSBADCRC) error = -EFSCORRUPTED; return error; } bip = bp->b_log_item; bip->bli_recur++; ASSERT(atomic_read(&bip->bli_refcount) > 0); trace_xfs_trans_read_buf_recur(bip); ASSERT(bp->b_ops != NULL || ops == NULL); *bpp = bp; return 0; } error = xfs_buf_read_map(target, map, nmaps, flags, &bp, ops, __return_address); switch (error) { case 0: break; default: if (tp && (tp->t_flags & XFS_TRANS_DIRTY)) xfs_force_shutdown(tp->t_mountp, SHUTDOWN_META_IO_ERROR); fallthrough; case -ENOMEM: case -EAGAIN: return error; } if (xfs_is_shutdown(mp)) { xfs_buf_relse(bp); trace_xfs_trans_read_buf_shut(bp, _RET_IP_); return -EIO; } if (tp) { _xfs_trans_bjoin(tp, bp, 1); trace_xfs_trans_read_buf(bp->b_log_item); } ASSERT(bp->b_ops != NULL || ops == NULL); *bpp = bp; return 0; } /* Has this buffer been dirtied by anyone? */ bool xfs_trans_buf_is_dirty( struct xfs_buf *bp) { struct xfs_buf_log_item *bip = bp->b_log_item; if (!bip) return false; ASSERT(bip->bli_item.li_type == XFS_LI_BUF); return test_bit(XFS_LI_DIRTY, &bip->bli_item.li_flags); } /* * Release a buffer previously joined to the transaction. If the buffer is * modified within this transaction, decrement the recursion count but do not * release the buffer even if the count goes to 0. If the buffer is not modified * within the transaction, decrement the recursion count and release the buffer * if the recursion count goes to 0. * * If the buffer is to be released and it was not already dirty before this * transaction began, then also free the buf_log_item associated with it. * * If the transaction pointer is NULL, this is a normal xfs_buf_relse() call. */ void xfs_trans_brelse( struct xfs_trans *tp, struct xfs_buf *bp) { struct xfs_buf_log_item *bip = bp->b_log_item; ASSERT(bp->b_transp == tp); if (!tp) { xfs_buf_relse(bp); return; } trace_xfs_trans_brelse(bip); ASSERT(bip->bli_item.li_type == XFS_LI_BUF); ASSERT(atomic_read(&bip->bli_refcount) > 0); /* * If the release is for a recursive lookup, then decrement the count * and return. */ if (bip->bli_recur > 0) { bip->bli_recur--; return; } /* * If the buffer is invalidated or dirty in this transaction, we can't * release it until we commit. */ if (test_bit(XFS_LI_DIRTY, &bip->bli_item.li_flags)) return; if (bip->bli_flags & XFS_BLI_STALE) return; /* * Unlink the log item from the transaction and clear the hold flag, if * set. We wouldn't want the next user of the buffer to get confused. */ ASSERT(!(bip->bli_flags & XFS_BLI_LOGGED)); xfs_trans_del_item(&bip->bli_item); bip->bli_flags &= ~XFS_BLI_HOLD; /* drop the reference to the bli */ xfs_buf_item_put(bip); bp->b_transp = NULL; xfs_buf_relse(bp); } /* * Forcibly detach a buffer previously joined to the transaction. The caller * will retain its locked reference to the buffer after this function returns. * The buffer must be completely clean and must not be held to the transaction. */ void xfs_trans_bdetach( struct xfs_trans *tp, struct xfs_buf *bp) { struct xfs_buf_log_item *bip = bp->b_log_item; ASSERT(tp != NULL); ASSERT(bp->b_transp == tp); ASSERT(bip->bli_item.li_type == XFS_LI_BUF); ASSERT(atomic_read(&bip->bli_refcount) > 0); trace_xfs_trans_bdetach(bip); /* * Erase all recursion count, since we're removing this buffer from the * transaction. */ bip->bli_recur = 0; /* * The buffer must be completely clean. Specifically, it had better * not be dirty, stale, logged, ordered, or held to the transaction. */ ASSERT(!test_bit(XFS_LI_DIRTY, &bip->bli_item.li_flags)); ASSERT(!(bip->bli_flags & XFS_BLI_DIRTY)); ASSERT(!(bip->bli_flags & XFS_BLI_HOLD)); ASSERT(!(bip->bli_flags & XFS_BLI_LOGGED)); ASSERT(!(bip->bli_flags & XFS_BLI_ORDERED)); ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); /* Unlink the log item from the transaction and drop the log item. */ xfs_trans_del_item(&bip->bli_item); xfs_buf_item_put(bip); bp->b_transp = NULL; } /* * Mark the buffer as not needing to be unlocked when the buf item's * iop_committing() routine is called. The buffer must already be locked * and associated with the given transaction. */ /* ARGSUSED */ void xfs_trans_bhold( xfs_trans_t *tp, struct xfs_buf *bp) { struct xfs_buf_log_item *bip = bp->b_log_item; ASSERT(bp->b_transp == tp); ASSERT(bip != NULL); ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); ASSERT(!(bip->__bli_format.blf_flags & XFS_BLF_CANCEL)); ASSERT(atomic_read(&bip->bli_refcount) > 0); bip->bli_flags |= XFS_BLI_HOLD; trace_xfs_trans_bhold(bip); } /* * Cancel the previous buffer hold request made on this buffer * for this transaction. */ void xfs_trans_bhold_release( xfs_trans_t *tp, struct xfs_buf *bp) { struct xfs_buf_log_item *bip = bp->b_log_item; ASSERT(bp->b_transp == tp); ASSERT(bip != NULL); ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); ASSERT(!(bip->__bli_format.blf_flags & XFS_BLF_CANCEL)); ASSERT(atomic_read(&bip->bli_refcount) > 0); ASSERT(bip->bli_flags & XFS_BLI_HOLD); bip->bli_flags &= ~XFS_BLI_HOLD; trace_xfs_trans_bhold_release(bip); } /* * Mark a buffer dirty in the transaction. */ void xfs_trans_dirty_buf( struct xfs_trans *tp, struct xfs_buf *bp) { struct xfs_buf_log_item *bip = bp->b_log_item; ASSERT(bp->b_transp == tp); ASSERT(bip != NULL); /* * Mark the buffer as needing to be written out eventually, * and set its iodone function to remove the buffer's buf log * item from the AIL and free it when the buffer is flushed * to disk. */ bp->b_flags |= XBF_DONE; ASSERT(atomic_read(&bip->bli_refcount) > 0); /* * If we invalidated the buffer within this transaction, then * cancel the invalidation now that we're dirtying the buffer * again. There are no races with the code in xfs_buf_item_unpin(), * because we have a reference to the buffer this entire time. */ if (bip->bli_flags & XFS_BLI_STALE) { bip->bli_flags &= ~XFS_BLI_STALE; ASSERT(bp->b_flags & XBF_STALE); bp->b_flags &= ~XBF_STALE; bip->__bli_format.blf_flags &= ~XFS_BLF_CANCEL; } bip->bli_flags |= XFS_BLI_DIRTY | XFS_BLI_LOGGED; tp->t_flags |= XFS_TRANS_DIRTY; set_bit(XFS_LI_DIRTY, &bip->bli_item.li_flags); } /* * This is called to mark bytes first through last inclusive of the given * buffer as needing to be logged when the transaction is committed. * The buffer must already be associated with the given transaction. * * First and last are numbers relative to the beginning of this buffer, * so the first byte in the buffer is numbered 0 regardless of the * value of b_blkno. */ void xfs_trans_log_buf( struct xfs_trans *tp, struct xfs_buf *bp, uint first, uint last) { struct xfs_buf_log_item *bip = bp->b_log_item; ASSERT(first <= last && last < BBTOB(bp->b_length)); ASSERT(!(bip->bli_flags & XFS_BLI_ORDERED)); xfs_trans_dirty_buf(tp, bp); trace_xfs_trans_log_buf(bip); xfs_buf_item_log(bip, first, last); } /* * Invalidate a buffer that is being used within a transaction. * * Typically this is because the blocks in the buffer are being freed, so we * need to prevent it from being written out when we're done. Allowing it * to be written again might overwrite data in the free blocks if they are * reallocated to a file. * * We prevent the buffer from being written out by marking it stale. We can't * get rid of the buf log item at this point because the buffer may still be * pinned by another transaction. If that is the case, then we'll wait until * the buffer is committed to disk for the last time (we can tell by the ref * count) and free it in xfs_buf_item_unpin(). Until that happens we will * keep the buffer locked so that the buffer and buf log item are not reused. * * We also set the XFS_BLF_CANCEL flag in the buf log format structure and log * the buf item. This will be used at recovery time to determine that copies * of the buffer in the log before this should not be replayed. * * We mark the item descriptor and the transaction dirty so that we'll hold * the buffer until after the commit. * * Since we're invalidating the buffer, we also clear the state about which * parts of the buffer have been logged. We also clear the flag indicating * that this is an inode buffer since the data in the buffer will no longer * be valid. * * We set the stale bit in the buffer as well since we're getting rid of it. */ void xfs_trans_binval( xfs_trans_t *tp, struct xfs_buf *bp) { struct xfs_buf_log_item *bip = bp->b_log_item; int i; ASSERT(bp->b_transp == tp); ASSERT(bip != NULL); ASSERT(atomic_read(&bip->bli_refcount) > 0); trace_xfs_trans_binval(bip); if (bip->bli_flags & XFS_BLI_STALE) { /* * If the buffer is already invalidated, then * just return. */ ASSERT(bp->b_flags & XBF_STALE); ASSERT(!(bip->bli_flags & (XFS_BLI_LOGGED | XFS_BLI_DIRTY))); ASSERT(!(bip->__bli_format.blf_flags & XFS_BLF_INODE_BUF)); ASSERT(!(bip->__bli_format.blf_flags & XFS_BLFT_MASK)); ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL); ASSERT(test_bit(XFS_LI_DIRTY, &bip->bli_item.li_flags)); ASSERT(tp->t_flags & XFS_TRANS_DIRTY); return; } xfs_buf_stale(bp); bip->bli_flags |= XFS_BLI_STALE; bip->bli_flags &= ~(XFS_BLI_INODE_BUF | XFS_BLI_LOGGED | XFS_BLI_DIRTY); bip->__bli_format.blf_flags &= ~XFS_BLF_INODE_BUF; bip->__bli_format.blf_flags |= XFS_BLF_CANCEL; bip->__bli_format.blf_flags &= ~XFS_BLFT_MASK; for (i = 0; i < bip->bli_format_count; i++) { memset(bip->bli_formats[i].blf_data_map, 0, (bip->bli_formats[i].blf_map_size * sizeof(uint))); } set_bit(XFS_LI_DIRTY, &bip->bli_item.li_flags); tp->t_flags |= XFS_TRANS_DIRTY; } /* * This call is used to indicate that the buffer contains on-disk inodes which * must be handled specially during recovery. They require special handling * because only the di_next_unlinked from the inodes in the buffer should be * recovered. The rest of the data in the buffer is logged via the inodes * themselves. * * All we do is set the XFS_BLI_INODE_BUF flag in the items flags so it can be * transferred to the buffer's log format structure so that we'll know what to * do at recovery time. */ void xfs_trans_inode_buf( xfs_trans_t *tp, struct xfs_buf *bp) { struct xfs_buf_log_item *bip = bp->b_log_item; ASSERT(bp->b_transp == tp); ASSERT(bip != NULL); ASSERT(atomic_read(&bip->bli_refcount) > 0); bip->bli_flags |= XFS_BLI_INODE_BUF; bp->b_iodone = xfs_buf_inode_iodone; xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DINO_BUF); } /* * This call is used to indicate that the buffer is going to * be staled and was an inode buffer. This means it gets * special processing during unpin - where any inodes * associated with the buffer should be removed from ail. * There is also special processing during recovery, * any replay of the inodes in the buffer needs to be * prevented as the buffer may have been reused. */ void xfs_trans_stale_inode_buf( xfs_trans_t *tp, struct xfs_buf *bp) { struct xfs_buf_log_item *bip = bp->b_log_item; ASSERT(bp->b_transp == tp); ASSERT(bip != NULL); ASSERT(atomic_read(&bip->bli_refcount) > 0); bip->bli_flags |= XFS_BLI_STALE_INODE; bp->b_iodone = xfs_buf_inode_iodone; xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DINO_BUF); } /* * Mark the buffer as being one which contains newly allocated * inodes. We need to make sure that even if this buffer is * relogged as an 'inode buf' we still recover all of the inode * images in the face of a crash. This works in coordination with * xfs_buf_item_committed() to ensure that the buffer remains in the * AIL at its original location even after it has been relogged. */ /* ARGSUSED */ void xfs_trans_inode_alloc_buf( xfs_trans_t *tp, struct xfs_buf *bp) { struct xfs_buf_log_item *bip = bp->b_log_item; ASSERT(bp->b_transp == tp); ASSERT(bip != NULL); ASSERT(atomic_read(&bip->bli_refcount) > 0); bip->bli_flags |= XFS_BLI_INODE_ALLOC_BUF; bp->b_iodone = xfs_buf_inode_iodone; xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DINO_BUF); } /* * Mark the buffer as ordered for this transaction. This means that the contents * of the buffer are not recorded in the transaction but it is tracked in the * AIL as though it was. This allows us to record logical changes in * transactions rather than the physical changes we make to the buffer without * changing writeback ordering constraints of metadata buffers. */ bool xfs_trans_ordered_buf( struct xfs_trans *tp, struct xfs_buf *bp) { struct xfs_buf_log_item *bip = bp->b_log_item; ASSERT(bp->b_transp == tp); ASSERT(bip != NULL); ASSERT(atomic_read(&bip->bli_refcount) > 0); if (xfs_buf_item_dirty_format(bip)) return false; bip->bli_flags |= XFS_BLI_ORDERED; trace_xfs_buf_item_ordered(bip); /* * We don't log a dirty range of an ordered buffer but it still needs * to be marked dirty and that it has been logged. */ xfs_trans_dirty_buf(tp, bp); return true; } /* * Set the type of the buffer for log recovery so that it can correctly identify * and hence attach the correct buffer ops to the buffer after replay. */ void xfs_trans_buf_set_type( struct xfs_trans *tp, struct xfs_buf *bp, enum xfs_blft type) { struct xfs_buf_log_item *bip = bp->b_log_item; if (!tp) return; ASSERT(bp->b_transp == tp); ASSERT(bip != NULL); ASSERT(atomic_read(&bip->bli_refcount) > 0); xfs_blft_to_flags(&bip->__bli_format, type); } void xfs_trans_buf_copy_type( struct xfs_buf *dst_bp, struct xfs_buf *src_bp) { struct xfs_buf_log_item *sbip = src_bp->b_log_item; struct xfs_buf_log_item *dbip = dst_bp->b_log_item; enum xfs_blft type; type = xfs_blft_from_flags(&sbip->__bli_format); xfs_blft_to_flags(&dbip->__bli_format, type); } /* * Similar to xfs_trans_inode_buf(), this marks the buffer as a cluster of * dquots. However, unlike in inode buffer recovery, dquot buffers get * recovered in their entirety. (Hence, no XFS_BLI_DQUOT_ALLOC_BUF flag). * The only thing that makes dquot buffers different from regular * buffers is that we must not replay dquot bufs when recovering * if a _corresponding_ quotaoff has happened. We also have to distinguish * between usr dquot bufs and grp dquot bufs, because usr and grp quotas * can be turned off independently. */ /* ARGSUSED */ void xfs_trans_dquot_buf( xfs_trans_t *tp, struct xfs_buf *bp, uint type) { struct xfs_buf_log_item *bip = bp->b_log_item; ASSERT(type == XFS_BLF_UDQUOT_BUF || type == XFS_BLF_PDQUOT_BUF || type == XFS_BLF_GDQUOT_BUF); bip->__bli_format.blf_flags |= type; switch (type) { case XFS_BLF_UDQUOT_BUF: type = XFS_BLFT_UDQUOT_BUF; break; case XFS_BLF_PDQUOT_BUF: type = XFS_BLFT_PDQUOT_BUF; break; case XFS_BLF_GDQUOT_BUF: type = XFS_BLFT_GDQUOT_BUF; break; default: type = XFS_BLFT_UNKNOWN_BUF; break; } bp->b_iodone = xfs_buf_dquot_iodone; xfs_trans_buf_set_type(tp, bp, type); } |
| 52 1 50 1 2 28 28 2 1 1 23 3 1 7 1 1 1 3 1 1 4 1 19 19 17 1 4 2 2 1 9 1 1 7 1 1 1 3 1 1 4 1 8 8 5 1 4 2 6 1 1 12 1 1 14 59 2 1 27 14 16 5 1 3 1 3 2 1 1 3 1 3 3 4 1 1 4 1 3 2 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 | // SPDX-License-Identifier: GPL-2.0-or-later /* * NetLabel CIPSO/IPv4 Support * * This file defines the CIPSO/IPv4 functions for the NetLabel system. The * NetLabel system manages static and dynamic label mappings for network * protocols such as CIPSO and RIPSO. * * Author: Paul Moore <paul@paul-moore.com> */ /* * (c) Copyright Hewlett-Packard Development Company, L.P., 2006 */ #include <linux/types.h> #include <linux/socket.h> #include <linux/string.h> #include <linux/skbuff.h> #include <linux/audit.h> #include <linux/slab.h> #include <net/sock.h> #include <net/netlink.h> #include <net/genetlink.h> #include <net/netlabel.h> #include <net/cipso_ipv4.h> #include <linux/atomic.h> #include "netlabel_user.h" #include "netlabel_cipso_v4.h" #include "netlabel_mgmt.h" #include "netlabel_domainhash.h" /* Argument struct for cipso_v4_doi_walk() */ struct netlbl_cipsov4_doiwalk_arg { struct netlink_callback *nl_cb; struct sk_buff *skb; u32 seq; }; /* Argument struct for netlbl_domhsh_walk() */ struct netlbl_domhsh_walk_arg { struct netlbl_audit *audit_info; u32 doi; }; /* NetLabel Generic NETLINK CIPSOv4 family */ static struct genl_family netlbl_cipsov4_gnl_family; /* NetLabel Netlink attribute policy */ static const struct nla_policy netlbl_cipsov4_genl_policy[NLBL_CIPSOV4_A_MAX + 1] = { [NLBL_CIPSOV4_A_DOI] = { .type = NLA_U32 }, [NLBL_CIPSOV4_A_MTYPE] = { .type = NLA_U32 }, [NLBL_CIPSOV4_A_TAG] = { .type = NLA_U8 }, [NLBL_CIPSOV4_A_TAGLST] = { .type = NLA_NESTED }, [NLBL_CIPSOV4_A_MLSLVLLOC] = { .type = NLA_U32 }, [NLBL_CIPSOV4_A_MLSLVLREM] = { .type = NLA_U32 }, [NLBL_CIPSOV4_A_MLSLVL] = { .type = NLA_NESTED }, [NLBL_CIPSOV4_A_MLSLVLLST] = { .type = NLA_NESTED }, [NLBL_CIPSOV4_A_MLSCATLOC] = { .type = NLA_U32 }, [NLBL_CIPSOV4_A_MLSCATREM] = { .type = NLA_U32 }, [NLBL_CIPSOV4_A_MLSCAT] = { .type = NLA_NESTED }, [NLBL_CIPSOV4_A_MLSCATLST] = { .type = NLA_NESTED }, }; /* * Helper Functions */ /** * netlbl_cipsov4_add_common - Parse the common sections of a ADD message * @info: the Generic NETLINK info block * @doi_def: the CIPSO V4 DOI definition * * Description: * Parse the common sections of a ADD message and fill in the related values * in @doi_def. Returns zero on success, negative values on failure. * */ static int netlbl_cipsov4_add_common(struct genl_info *info, struct cipso_v4_doi *doi_def) { struct nlattr *nla; int nla_rem; u32 iter = 0; doi_def->doi = nla_get_u32(info->attrs[NLBL_CIPSOV4_A_DOI]); if (nla_validate_nested_deprecated(info->attrs[NLBL_CIPSOV4_A_TAGLST], NLBL_CIPSOV4_A_MAX, netlbl_cipsov4_genl_policy, NULL) != 0) return -EINVAL; nla_for_each_nested(nla, info->attrs[NLBL_CIPSOV4_A_TAGLST], nla_rem) if (nla_type(nla) == NLBL_CIPSOV4_A_TAG) { if (iter >= CIPSO_V4_TAG_MAXCNT) return -EINVAL; doi_def->tags[iter++] = nla_get_u8(nla); } while (iter < CIPSO_V4_TAG_MAXCNT) doi_def->tags[iter++] = CIPSO_V4_TAG_INVALID; return 0; } /* * NetLabel Command Handlers */ /** * netlbl_cipsov4_add_std - Adds a CIPSO V4 DOI definition * @info: the Generic NETLINK info block * @audit_info: NetLabel audit information * * Description: * Create a new CIPSO_V4_MAP_TRANS DOI definition based on the given ADD * message and add it to the CIPSO V4 engine. Return zero on success and * non-zero on error. * */ static int netlbl_cipsov4_add_std(struct genl_info *info, struct netlbl_audit *audit_info) { int ret_val = -EINVAL; struct cipso_v4_doi *doi_def = NULL; struct nlattr *nla_a; struct nlattr *nla_b; int nla_a_rem; int nla_b_rem; u32 iter; if (!info->attrs[NLBL_CIPSOV4_A_TAGLST] || !info->attrs[NLBL_CIPSOV4_A_MLSLVLLST]) return -EINVAL; if (nla_validate_nested_deprecated(info->attrs[NLBL_CIPSOV4_A_MLSLVLLST], NLBL_CIPSOV4_A_MAX, netlbl_cipsov4_genl_policy, NULL) != 0) return -EINVAL; doi_def = kmalloc(sizeof(*doi_def), GFP_KERNEL); if (doi_def == NULL) return -ENOMEM; doi_def->map.std = kzalloc(sizeof(*doi_def->map.std), GFP_KERNEL); if (doi_def->map.std == NULL) { kfree(doi_def); return -ENOMEM; } doi_def->type = CIPSO_V4_MAP_TRANS; ret_val = netlbl_cipsov4_add_common(info, doi_def); if (ret_val != 0) goto add_std_failure; ret_val = -EINVAL; nla_for_each_nested(nla_a, info->attrs[NLBL_CIPSOV4_A_MLSLVLLST], nla_a_rem) if (nla_type(nla_a) == NLBL_CIPSOV4_A_MLSLVL) { if (nla_validate_nested_deprecated(nla_a, NLBL_CIPSOV4_A_MAX, netlbl_cipsov4_genl_policy, NULL) != 0) goto add_std_failure; nla_for_each_nested(nla_b, nla_a, nla_b_rem) switch (nla_type(nla_b)) { case NLBL_CIPSOV4_A_MLSLVLLOC: if (nla_get_u32(nla_b) > CIPSO_V4_MAX_LOC_LVLS) goto add_std_failure; if (nla_get_u32(nla_b) >= doi_def->map.std->lvl.local_size) doi_def->map.std->lvl.local_size = nla_get_u32(nla_b) + 1; break; case NLBL_CIPSOV4_A_MLSLVLREM: if (nla_get_u32(nla_b) > CIPSO_V4_MAX_REM_LVLS) goto add_std_failure; if (nla_get_u32(nla_b) >= doi_def->map.std->lvl.cipso_size) doi_def->map.std->lvl.cipso_size = nla_get_u32(nla_b) + 1; break; } } doi_def->map.std->lvl.local = kcalloc(doi_def->map.std->lvl.local_size, sizeof(u32), GFP_KERNEL | __GFP_NOWARN); if (doi_def->map.std->lvl.local == NULL) { ret_val = -ENOMEM; goto add_std_failure; } doi_def->map.std->lvl.cipso = kcalloc(doi_def->map.std->lvl.cipso_size, sizeof(u32), GFP_KERNEL | __GFP_NOWARN); if (doi_def->map.std->lvl.cipso == NULL) { ret_val = -ENOMEM; goto add_std_failure; } for (iter = 0; iter < doi_def->map.std->lvl.local_size; iter++) doi_def->map.std->lvl.local[iter] = CIPSO_V4_INV_LVL; for (iter = 0; iter < doi_def->map.std->lvl.cipso_size; iter++) doi_def->map.std->lvl.cipso[iter] = CIPSO_V4_INV_LVL; nla_for_each_nested(nla_a, info->attrs[NLBL_CIPSOV4_A_MLSLVLLST], nla_a_rem) if (nla_type(nla_a) == NLBL_CIPSOV4_A_MLSLVL) { struct nlattr *lvl_loc; struct nlattr *lvl_rem; lvl_loc = nla_find_nested(nla_a, NLBL_CIPSOV4_A_MLSLVLLOC); lvl_rem = nla_find_nested(nla_a, NLBL_CIPSOV4_A_MLSLVLREM); if (lvl_loc == NULL || lvl_rem == NULL) goto add_std_failure; doi_def->map.std->lvl.local[nla_get_u32(lvl_loc)] = nla_get_u32(lvl_rem); doi_def->map.std->lvl.cipso[nla_get_u32(lvl_rem)] = nla_get_u32(lvl_loc); } if (info->attrs[NLBL_CIPSOV4_A_MLSCATLST]) { if (nla_validate_nested_deprecated(info->attrs[NLBL_CIPSOV4_A_MLSCATLST], NLBL_CIPSOV4_A_MAX, netlbl_cipsov4_genl_policy, NULL) != 0) goto add_std_failure; nla_for_each_nested(nla_a, info->attrs[NLBL_CIPSOV4_A_MLSCATLST], nla_a_rem) if (nla_type(nla_a) == NLBL_CIPSOV4_A_MLSCAT) { if (nla_validate_nested_deprecated(nla_a, NLBL_CIPSOV4_A_MAX, netlbl_cipsov4_genl_policy, NULL) != 0) goto add_std_failure; nla_for_each_nested(nla_b, nla_a, nla_b_rem) switch (nla_type(nla_b)) { case NLBL_CIPSOV4_A_MLSCATLOC: if (nla_get_u32(nla_b) > CIPSO_V4_MAX_LOC_CATS) goto add_std_failure; if (nla_get_u32(nla_b) >= doi_def->map.std->cat.local_size) doi_def->map.std->cat.local_size = nla_get_u32(nla_b) + 1; break; case NLBL_CIPSOV4_A_MLSCATREM: if (nla_get_u32(nla_b) > CIPSO_V4_MAX_REM_CATS) goto add_std_failure; if (nla_get_u32(nla_b) >= doi_def->map.std->cat.cipso_size) doi_def->map.std->cat.cipso_size = nla_get_u32(nla_b) + 1; break; } } doi_def->map.std->cat.local = kcalloc( doi_def->map.std->cat.local_size, sizeof(u32), GFP_KERNEL | __GFP_NOWARN); if (doi_def->map.std->cat.local == NULL) { ret_val = -ENOMEM; goto add_std_failure; } doi_def->map.std->cat.cipso = kcalloc( doi_def->map.std->cat.cipso_size, sizeof(u32), GFP_KERNEL | __GFP_NOWARN); if (doi_def->map.std->cat.cipso == NULL) { ret_val = -ENOMEM; goto add_std_failure; } for (iter = 0; iter < doi_def->map.std->cat.local_size; iter++) doi_def->map.std->cat.local[iter] = CIPSO_V4_INV_CAT; for (iter = 0; iter < doi_def->map.std->cat.cipso_size; iter++) doi_def->map.std->cat.cipso[iter] = CIPSO_V4_INV_CAT; nla_for_each_nested(nla_a, info->attrs[NLBL_CIPSOV4_A_MLSCATLST], nla_a_rem) if (nla_type(nla_a) == NLBL_CIPSOV4_A_MLSCAT) { struct nlattr *cat_loc; struct nlattr *cat_rem; cat_loc = nla_find_nested(nla_a, NLBL_CIPSOV4_A_MLSCATLOC); cat_rem = nla_find_nested(nla_a, NLBL_CIPSOV4_A_MLSCATREM); if (cat_loc == NULL || cat_rem == NULL) goto add_std_failure; doi_def->map.std->cat.local[ nla_get_u32(cat_loc)] = nla_get_u32(cat_rem); doi_def->map.std->cat.cipso[ nla_get_u32(cat_rem)] = nla_get_u32(cat_loc); } } ret_val = cipso_v4_doi_add(doi_def, audit_info); if (ret_val != 0) goto add_std_failure; return 0; add_std_failure: cipso_v4_doi_free(doi_def); return ret_val; } /** * netlbl_cipsov4_add_pass - Adds a CIPSO V4 DOI definition * @info: the Generic NETLINK info block * @audit_info: NetLabel audit information * * Description: * Create a new CIPSO_V4_MAP_PASS DOI definition based on the given ADD message * and add it to the CIPSO V4 engine. Return zero on success and non-zero on * error. * */ static int netlbl_cipsov4_add_pass(struct genl_info *info, struct netlbl_audit *audit_info) { int ret_val; struct cipso_v4_doi *doi_def = NULL; if (!info->attrs[NLBL_CIPSOV4_A_TAGLST]) return -EINVAL; doi_def = kmalloc(sizeof(*doi_def), GFP_KERNEL); if (doi_def == NULL) return -ENOMEM; doi_def->type = CIPSO_V4_MAP_PASS; ret_val = netlbl_cipsov4_add_common(info, doi_def); if (ret_val != 0) goto add_pass_failure; ret_val = cipso_v4_doi_add(doi_def, audit_info); if (ret_val != 0) goto add_pass_failure; return 0; add_pass_failure: cipso_v4_doi_free(doi_def); return ret_val; } /** * netlbl_cipsov4_add_local - Adds a CIPSO V4 DOI definition * @info: the Generic NETLINK info block * @audit_info: NetLabel audit information * * Description: * Create a new CIPSO_V4_MAP_LOCAL DOI definition based on the given ADD * message and add it to the CIPSO V4 engine. Return zero on success and * non-zero on error. * */ static int netlbl_cipsov4_add_local(struct genl_info *info, struct netlbl_audit *audit_info) { int ret_val; struct cipso_v4_doi *doi_def = NULL; if (!info->attrs[NLBL_CIPSOV4_A_TAGLST]) return -EINVAL; doi_def = kmalloc(sizeof(*doi_def), GFP_KERNEL); if (doi_def == NULL) return -ENOMEM; doi_def->type = CIPSO_V4_MAP_LOCAL; ret_val = netlbl_cipsov4_add_common(info, doi_def); if (ret_val != 0) goto add_local_failure; ret_val = cipso_v4_doi_add(doi_def, audit_info); if (ret_val != 0) goto add_local_failure; return 0; add_local_failure: cipso_v4_doi_free(doi_def); return ret_val; } /** * netlbl_cipsov4_add - Handle an ADD message * @skb: the NETLINK buffer * @info: the Generic NETLINK info block * * Description: * Create a new DOI definition based on the given ADD message and add it to the * CIPSO V4 engine. Returns zero on success, negative values on failure. * */ static int netlbl_cipsov4_add(struct sk_buff *skb, struct genl_info *info) { int ret_val = -EINVAL; struct netlbl_audit audit_info; if (!info->attrs[NLBL_CIPSOV4_A_DOI] || !info->attrs[NLBL_CIPSOV4_A_MTYPE]) return -EINVAL; netlbl_netlink_auditinfo(&audit_info); switch (nla_get_u32(info->attrs[NLBL_CIPSOV4_A_MTYPE])) { case CIPSO_V4_MAP_TRANS: ret_val = netlbl_cipsov4_add_std(info, &audit_info); break; case CIPSO_V4_MAP_PASS: ret_val = netlbl_cipsov4_add_pass(info, &audit_info); break; case CIPSO_V4_MAP_LOCAL: ret_val = netlbl_cipsov4_add_local(info, &audit_info); break; } if (ret_val == 0) atomic_inc(&netlabel_mgmt_protocount); return ret_val; } /** * netlbl_cipsov4_list - Handle a LIST message * @skb: the NETLINK buffer * @info: the Generic NETLINK info block * * Description: * Process a user generated LIST message and respond accordingly. While the * response message generated by the kernel is straightforward, determining * before hand the size of the buffer to allocate is not (we have to generate * the message to know the size). In order to keep this function sane what we * do is allocate a buffer of NLMSG_GOODSIZE and try to fit the response in * that size, if we fail then we restart with a larger buffer and try again. * We continue in this manner until we hit a limit of failed attempts then we * give up and just send an error message. Returns zero on success and * negative values on error. * */ static int netlbl_cipsov4_list(struct sk_buff *skb, struct genl_info *info) { int ret_val; struct sk_buff *ans_skb = NULL; u32 nlsze_mult = 1; void *data; u32 doi; struct nlattr *nla_a; struct nlattr *nla_b; struct cipso_v4_doi *doi_def; u32 iter; if (!info->attrs[NLBL_CIPSOV4_A_DOI]) { ret_val = -EINVAL; goto list_failure; } list_start: ans_skb = nlmsg_new(NLMSG_DEFAULT_SIZE * nlsze_mult, GFP_KERNEL); if (ans_skb == NULL) { ret_val = -ENOMEM; goto list_failure; } data = genlmsg_put_reply(ans_skb, info, &netlbl_cipsov4_gnl_family, 0, NLBL_CIPSOV4_C_LIST); if (data == NULL) { ret_val = -ENOMEM; goto list_failure; } doi = nla_get_u32(info->attrs[NLBL_CIPSOV4_A_DOI]); rcu_read_lock(); doi_def = cipso_v4_doi_getdef(doi); if (doi_def == NULL) { ret_val = -EINVAL; goto list_failure_lock; } ret_val = nla_put_u32(ans_skb, NLBL_CIPSOV4_A_MTYPE, doi_def->type); if (ret_val != 0) goto list_failure_lock; nla_a = nla_nest_start_noflag(ans_skb, NLBL_CIPSOV4_A_TAGLST); if (nla_a == NULL) { ret_val = -ENOMEM; goto list_failure_lock; } for (iter = 0; iter < CIPSO_V4_TAG_MAXCNT && doi_def->tags[iter] != CIPSO_V4_TAG_INVALID; iter++) { ret_val = nla_put_u8(ans_skb, NLBL_CIPSOV4_A_TAG, doi_def->tags[iter]); if (ret_val != 0) goto list_failure_lock; } nla_nest_end(ans_skb, nla_a); switch (doi_def->type) { case CIPSO_V4_MAP_TRANS: nla_a = nla_nest_start_noflag(ans_skb, NLBL_CIPSOV4_A_MLSLVLLST); if (nla_a == NULL) { ret_val = -ENOMEM; goto list_failure_lock; } for (iter = 0; iter < doi_def->map.std->lvl.local_size; iter++) { if (doi_def->map.std->lvl.local[iter] == CIPSO_V4_INV_LVL) continue; nla_b = nla_nest_start_noflag(ans_skb, NLBL_CIPSOV4_A_MLSLVL); if (nla_b == NULL) { ret_val = -ENOMEM; goto list_retry; } ret_val = nla_put_u32(ans_skb, NLBL_CIPSOV4_A_MLSLVLLOC, iter); if (ret_val != 0) goto list_retry; ret_val = nla_put_u32(ans_skb, NLBL_CIPSOV4_A_MLSLVLREM, doi_def->map.std->lvl.local[iter]); if (ret_val != 0) goto list_retry; nla_nest_end(ans_skb, nla_b); } nla_nest_end(ans_skb, nla_a); nla_a = nla_nest_start_noflag(ans_skb, NLBL_CIPSOV4_A_MLSCATLST); if (nla_a == NULL) { ret_val = -ENOMEM; goto list_retry; } for (iter = 0; iter < doi_def->map.std->cat.local_size; iter++) { if (doi_def->map.std->cat.local[iter] == CIPSO_V4_INV_CAT) continue; nla_b = nla_nest_start_noflag(ans_skb, NLBL_CIPSOV4_A_MLSCAT); if (nla_b == NULL) { ret_val = -ENOMEM; goto list_retry; } ret_val = nla_put_u32(ans_skb, NLBL_CIPSOV4_A_MLSCATLOC, iter); if (ret_val != 0) goto list_retry; ret_val = nla_put_u32(ans_skb, NLBL_CIPSOV4_A_MLSCATREM, doi_def->map.std->cat.local[iter]); if (ret_val != 0) goto list_retry; nla_nest_end(ans_skb, nla_b); } nla_nest_end(ans_skb, nla_a); break; } cipso_v4_doi_putdef(doi_def); rcu_read_unlock(); genlmsg_end(ans_skb, data); return genlmsg_reply(ans_skb, info); list_retry: /* XXX - this limit is a guesstimate */ if (nlsze_mult < 4) { cipso_v4_doi_putdef(doi_def); rcu_read_unlock(); kfree_skb(ans_skb); nlsze_mult *= 2; goto list_start; } list_failure_lock: cipso_v4_doi_putdef(doi_def); rcu_read_unlock(); list_failure: kfree_skb(ans_skb); return ret_val; } /** * netlbl_cipsov4_listall_cb - cipso_v4_doi_walk() callback for LISTALL * @doi_def: the CIPSOv4 DOI definition * @arg: the netlbl_cipsov4_doiwalk_arg structure * * Description: * This function is designed to be used as a callback to the * cipso_v4_doi_walk() function for use in generating a response for a LISTALL * message. Returns the size of the message on success, negative values on * failure. * */ static int netlbl_cipsov4_listall_cb(struct cipso_v4_doi *doi_def, void *arg) { int ret_val = -ENOMEM; struct netlbl_cipsov4_doiwalk_arg *cb_arg = arg; void *data; data = genlmsg_put(cb_arg->skb, NETLINK_CB(cb_arg->nl_cb->skb).portid, cb_arg->seq, &netlbl_cipsov4_gnl_family, NLM_F_MULTI, NLBL_CIPSOV4_C_LISTALL); if (data == NULL) goto listall_cb_failure; ret_val = nla_put_u32(cb_arg->skb, NLBL_CIPSOV4_A_DOI, doi_def->doi); if (ret_val != 0) goto listall_cb_failure; ret_val = nla_put_u32(cb_arg->skb, NLBL_CIPSOV4_A_MTYPE, doi_def->type); if (ret_val != 0) goto listall_cb_failure; genlmsg_end(cb_arg->skb, data); return 0; listall_cb_failure: genlmsg_cancel(cb_arg->skb, data); return ret_val; } /** * netlbl_cipsov4_listall - Handle a LISTALL message * @skb: the NETLINK buffer * @cb: the NETLINK callback * * Description: * Process a user generated LISTALL message and respond accordingly. Returns * zero on success and negative values on error. * */ static int netlbl_cipsov4_listall(struct sk_buff *skb, struct netlink_callback *cb) { struct netlbl_cipsov4_doiwalk_arg cb_arg; u32 doi_skip = cb->args[0]; cb_arg.nl_cb = cb; cb_arg.skb = skb; cb_arg.seq = cb->nlh->nlmsg_seq; cipso_v4_doi_walk(&doi_skip, netlbl_cipsov4_listall_cb, &cb_arg); cb->args[0] = doi_skip; return skb->len; } /** * netlbl_cipsov4_remove_cb - netlbl_cipsov4_remove() callback for REMOVE * @entry: LSM domain mapping entry * @arg: the netlbl_domhsh_walk_arg structure * * Description: * This function is intended for use by netlbl_cipsov4_remove() as the callback * for the netlbl_domhsh_walk() function; it removes LSM domain map entries * which are associated with the CIPSO DOI specified in @arg. Returns zero on * success, negative values on failure. * */ static int netlbl_cipsov4_remove_cb(struct netlbl_dom_map *entry, void *arg) { struct netlbl_domhsh_walk_arg *cb_arg = arg; if (entry->def.type == NETLBL_NLTYPE_CIPSOV4 && entry->def.cipso->doi == cb_arg->doi) return netlbl_domhsh_remove_entry(entry, cb_arg->audit_info); return 0; } /** * netlbl_cipsov4_remove - Handle a REMOVE message * @skb: the NETLINK buffer * @info: the Generic NETLINK info block * * Description: * Process a user generated REMOVE message and respond accordingly. Returns * zero on success, negative values on failure. * */ static int netlbl_cipsov4_remove(struct sk_buff *skb, struct genl_info *info) { int ret_val = -EINVAL; struct netlbl_domhsh_walk_arg cb_arg; struct netlbl_audit audit_info; u32 skip_bkt = 0; u32 skip_chain = 0; if (!info->attrs[NLBL_CIPSOV4_A_DOI]) return -EINVAL; netlbl_netlink_auditinfo(&audit_info); cb_arg.doi = nla_get_u32(info->attrs[NLBL_CIPSOV4_A_DOI]); cb_arg.audit_info = &audit_info; ret_val = netlbl_domhsh_walk(&skip_bkt, &skip_chain, netlbl_cipsov4_remove_cb, &cb_arg); if (ret_val == 0 || ret_val == -ENOENT) { ret_val = cipso_v4_doi_remove(cb_arg.doi, &audit_info); if (ret_val == 0) atomic_dec(&netlabel_mgmt_protocount); } return ret_val; } /* * NetLabel Generic NETLINK Command Definitions */ static const struct genl_small_ops netlbl_cipsov4_ops[] = { { .cmd = NLBL_CIPSOV4_C_ADD, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_ADMIN_PERM, .doit = netlbl_cipsov4_add, .dumpit = NULL, }, { .cmd = NLBL_CIPSOV4_C_REMOVE, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_ADMIN_PERM, .doit = netlbl_cipsov4_remove, .dumpit = NULL, }, { .cmd = NLBL_CIPSOV4_C_LIST, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = 0, .doit = netlbl_cipsov4_list, .dumpit = NULL, }, { .cmd = NLBL_CIPSOV4_C_LISTALL, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = 0, .doit = NULL, .dumpit = netlbl_cipsov4_listall, }, }; static struct genl_family netlbl_cipsov4_gnl_family __ro_after_init = { .hdrsize = 0, .name = NETLBL_NLTYPE_CIPSOV4_NAME, .version = NETLBL_PROTO_VERSION, .maxattr = NLBL_CIPSOV4_A_MAX, .policy = netlbl_cipsov4_genl_policy, .module = THIS_MODULE, .small_ops = netlbl_cipsov4_ops, .n_small_ops = ARRAY_SIZE(netlbl_cipsov4_ops), .resv_start_op = NLBL_CIPSOV4_C_LISTALL + 1, }; /* * NetLabel Generic NETLINK Protocol Functions */ /** * netlbl_cipsov4_genl_init - Register the CIPSOv4 NetLabel component * * Description: * Register the CIPSOv4 packet NetLabel component with the Generic NETLINK * mechanism. Returns zero on success, negative values on failure. * */ int __init netlbl_cipsov4_genl_init(void) { return genl_register_family(&netlbl_cipsov4_gnl_family); } |
| 52 52 51 1 1 4 1 45 1 220 221 3 1 7 1 198 14 207 214 4 3 1 1 7 4 2 44 1 3 308 276 30 24 160 31 31 191 191 3 8 3 3 11 11 173 3 27 3 23 1 1 22 19 1 4 3 14 14 10 10 10 18 5 4 3 4 28 1 27 23 4 26 27 26 4 25 13 19 4 23 27 42 2 34 24 15 2 2 3 3 3 4 4 3 3 4 479 11 2 4 11 11 9 8 8 6 9 7 7 9 9 557 557 529 132 520 118 510 515 4 388 515 9 290 10 3 3 554 505 113 48 119 495 17 481 24 8 16 489 10 482 484 7 32 479 4 263 299 283 3 471 472 472 26 194 194 481 8 124 1 185 3 512 12 11 13 13 6 6 13 525 1 522 21 256 340 1 1 333 7 2 495 11 11 1 1 9 9 9 9 9 9 9 9 3 9 9 12 1 11 9 9 7 9 9 9 9 7 2 3 6 9 2 2 163 162 141 6 1 157 157 134 147 147 148 52 119 119 148 147 148 131 18 128 121 121 1 1 13 1 2 10 15 15 10 1 9 9 9 7 2 9 1 8 9 2 7 9 9 9 172 171 1 129 12 38 128 7 2 131 12 1 2 4 15 4 8 1 105 9 29 148 9 2 4 3 1 2 7 7 7 103 83 7 76 76 76 76 34 83 83 103 74 8 67 2 6 11 86 4 1 50 97 54 32 12 90 1 90 11 10 70 8 81 81 10 90 13 1 12 3 1 8 3 9 10 1 12 68 1 66 4 63 3 64 4 63 37 37 37 37 9 28 29 7 37 37 58 26 32 3 28 77 77 3 72 16 3 6 7 2 51 50 35 15 7 58 72 64 45 1 1 1 3 4 39 39 1 2 2 5 34 32 29 8 8 3 3 3 27 31 30 2 1 1 26 1 1 1 10 14 7 6 1 12 1 13 13 14 57 57 1 1 54 1 54 38 16 1 53 52 1 51 2 51 1 51 62 3 59 1 1 57 59 1 5 5 18 2 2 1 13 1 1 5 4 2 6 6 5 3 2 10 1 13 7 6 31 31 1 29 2 31 9 19 5 31 36 1 31 4 33 1 31 18 1 1 1 2 4 15 15 1 2 2 5 10 6 40 18 24 1 15 22 12 5 5 9 1 73 1 72 30 50 4 4 1 3 11 2 9 42 6 2 1 9 31 2 34 8 13 9 4 4 39 8 31 4 1 5 34 104 1 104 1 1 1 70 30 1 2 67 28 1 73 24 96 57 1 34 5 90 22 62 8 20 25 17 65 49 1 32 58 22 51 21 1 30 42 49 22 55 1 15 4 11 1 14 37 17 54 47 1 2 18 33 57 98 98 98 30 3 29 1 1 28 1 2 2 23 23 4 12 11 9 12 13 5 3 1 1 1 1 8 5 10 3 27 27 26 27 4 23 141 2 140 11 32 105 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 | // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/ext4/namei.c * * Copyright (C) 1992, 1993, 1994, 1995 * Remy Card (card@masi.ibp.fr) * Laboratoire MASI - Institut Blaise Pascal * Universite Pierre et Marie Curie (Paris VI) * * from * * linux/fs/minix/namei.c * * Copyright (C) 1991, 1992 Linus Torvalds * * Big-endian to little-endian byte-swapping/bitmaps by * David S. Miller (davem@caip.rutgers.edu), 1995 * Directory entry file type support and forward compatibility hooks * for B-tree directories by Theodore Ts'o (tytso@mit.edu), 1998 * Hash Tree Directory indexing (c) * Daniel Phillips, 2001 * Hash Tree Directory indexing porting * Christopher Li, 2002 * Hash Tree Directory indexing cleanup * Theodore Ts'o, 2002 */ #include <linux/fs.h> #include <linux/pagemap.h> #include <linux/time.h> #include <linux/fcntl.h> #include <linux/stat.h> #include <linux/string.h> #include <linux/quotaops.h> #include <linux/buffer_head.h> #include <linux/bio.h> #include <linux/iversion.h> #include <linux/unicode.h> #include "ext4.h" #include "ext4_jbd2.h" #include "xattr.h" #include "acl.h" #include <trace/events/ext4.h> /* * define how far ahead to read directories while searching them. */ #define NAMEI_RA_CHUNKS 2 #define NAMEI_RA_BLOCKS 4 #define NAMEI_RA_SIZE (NAMEI_RA_CHUNKS * NAMEI_RA_BLOCKS) static struct buffer_head *ext4_append(handle_t *handle, struct inode *inode, ext4_lblk_t *block) { struct ext4_map_blocks map; struct buffer_head *bh; int err; if (unlikely(EXT4_SB(inode->i_sb)->s_max_dir_size_kb && ((inode->i_size >> 10) >= EXT4_SB(inode->i_sb)->s_max_dir_size_kb))) return ERR_PTR(-ENOSPC); *block = inode->i_size >> inode->i_sb->s_blocksize_bits; map.m_lblk = *block; map.m_len = 1; /* * We're appending new directory block. Make sure the block is not * allocated yet, otherwise we will end up corrupting the * directory. */ err = ext4_map_blocks(NULL, inode, &map, 0); if (err < 0) return ERR_PTR(err); if (err) { EXT4_ERROR_INODE(inode, "Logical block already allocated"); return ERR_PTR(-EFSCORRUPTED); } bh = ext4_bread(handle, inode, *block, EXT4_GET_BLOCKS_CREATE); if (IS_ERR(bh)) return bh; inode->i_size += inode->i_sb->s_blocksize; EXT4_I(inode)->i_disksize = inode->i_size; err = ext4_mark_inode_dirty(handle, inode); if (err) goto out; BUFFER_TRACE(bh, "get_write_access"); err = ext4_journal_get_write_access(handle, inode->i_sb, bh, EXT4_JTR_NONE); if (err) goto out; return bh; out: brelse(bh); ext4_std_error(inode->i_sb, err); return ERR_PTR(err); } static int ext4_dx_csum_verify(struct inode *inode, struct ext4_dir_entry *dirent); /* * Hints to ext4_read_dirblock regarding whether we expect a directory * block being read to be an index block, or a block containing * directory entries (and if the latter, whether it was found via a * logical block in an htree index block). This is used to control * what sort of sanity checkinig ext4_read_dirblock() will do on the * directory block read from the storage device. EITHER will means * the caller doesn't know what kind of directory block will be read, * so no specific verification will be done. */ typedef enum { EITHER, INDEX, DIRENT, DIRENT_HTREE } dirblock_type_t; #define ext4_read_dirblock(inode, block, type) \ __ext4_read_dirblock((inode), (block), (type), __func__, __LINE__) static struct buffer_head *__ext4_read_dirblock(struct inode *inode, ext4_lblk_t block, dirblock_type_t type, const char *func, unsigned int line) { struct buffer_head *bh; struct ext4_dir_entry *dirent; int is_dx_block = 0; if (block >= inode->i_size >> inode->i_blkbits) { ext4_error_inode(inode, func, line, block, "Attempting to read directory block (%u) that is past i_size (%llu)", block, inode->i_size); return ERR_PTR(-EFSCORRUPTED); } if (ext4_simulate_fail(inode->i_sb, EXT4_SIM_DIRBLOCK_EIO)) bh = ERR_PTR(-EIO); else bh = ext4_bread(NULL, inode, block, 0); if (IS_ERR(bh)) { __ext4_warning(inode->i_sb, func, line, "inode #%lu: lblock %lu: comm %s: " "error %ld reading directory block", inode->i_ino, (unsigned long)block, current->comm, PTR_ERR(bh)); return bh; } /* The first directory block must not be a hole. */ if (!bh && (type == INDEX || type == DIRENT_HTREE || block == 0)) { ext4_error_inode(inode, func, line, block, "Directory hole found for htree %s block %u", (type == INDEX) ? "index" : "leaf", block); return ERR_PTR(-EFSCORRUPTED); } if (!bh) return NULL; dirent = (struct ext4_dir_entry *) bh->b_data; /* Determine whether or not we have an index block */ if (is_dx(inode)) { if (block == 0) is_dx_block = 1; else if (ext4_rec_len_from_disk(dirent->rec_len, inode->i_sb->s_blocksize) == inode->i_sb->s_blocksize) is_dx_block = 1; } if (!is_dx_block && type == INDEX) { ext4_error_inode(inode, func, line, block, "directory leaf block found instead of index block"); brelse(bh); return ERR_PTR(-EFSCORRUPTED); } if (!ext4_has_feature_metadata_csum(inode->i_sb) || buffer_verified(bh)) return bh; /* * An empty leaf block can get mistaken for a index block; for * this reason, we can only check the index checksum when the * caller is sure it should be an index block. */ if (is_dx_block && type == INDEX) { if (ext4_dx_csum_verify(inode, dirent) && !ext4_simulate_fail(inode->i_sb, EXT4_SIM_DIRBLOCK_CRC)) set_buffer_verified(bh); else { ext4_error_inode_err(inode, func, line, block, EFSBADCRC, "Directory index failed checksum"); brelse(bh); return ERR_PTR(-EFSBADCRC); } } if (!is_dx_block) { if (ext4_dirblock_csum_verify(inode, bh) && !ext4_simulate_fail(inode->i_sb, EXT4_SIM_DIRBLOCK_CRC)) set_buffer_verified(bh); else { ext4_error_inode_err(inode, func, line, block, EFSBADCRC, "Directory block failed checksum"); brelse(bh); return ERR_PTR(-EFSBADCRC); } } return bh; } #ifdef DX_DEBUG #define dxtrace(command) command #else #define dxtrace(command) #endif struct fake_dirent { __le32 inode; __le16 rec_len; u8 name_len; u8 file_type; }; struct dx_countlimit { __le16 limit; __le16 count; }; struct dx_entry { __le32 hash; __le32 block; }; /* * dx_root_info is laid out so that if it should somehow get overlaid by a * dirent the two low bits of the hash version will be zero. Therefore, the * hash version mod 4 should never be 0. Sincerely, the paranoia department. */ struct dx_root { struct fake_dirent dot; char dot_name[4]; struct fake_dirent dotdot; char dotdot_name[4]; struct dx_root_info { __le32 reserved_zero; u8 hash_version; u8 info_length; /* 8 */ u8 indirect_levels; u8 unused_flags; } info; struct dx_entry entries[]; }; struct dx_node { struct fake_dirent fake; struct dx_entry entries[]; }; struct dx_frame { struct buffer_head *bh; struct dx_entry *entries; struct dx_entry *at; }; struct dx_map_entry { u32 hash; u16 offs; u16 size; }; /* * This goes at the end of each htree block. */ struct dx_tail { u32 dt_reserved; __le32 dt_checksum; /* crc32c(uuid+inum+dirblock) */ }; static struct buffer_head * ext4_dx_find_entry(struct inode *dir, struct ext4_filename *fname, struct ext4_dir_entry_2 **res_dir); static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname, struct inode *dir, struct inode *inode); /* checksumming functions */ void ext4_initialize_dirent_tail(struct buffer_head *bh, unsigned int blocksize) { struct ext4_dir_entry_tail *t = EXT4_DIRENT_TAIL(bh->b_data, blocksize); memset(t, 0, sizeof(struct ext4_dir_entry_tail)); t->det_rec_len = ext4_rec_len_to_disk( sizeof(struct ext4_dir_entry_tail), blocksize); t->det_reserved_ft = EXT4_FT_DIR_CSUM; } /* Walk through a dirent block to find a checksum "dirent" at the tail */ static struct ext4_dir_entry_tail *get_dirent_tail(struct inode *inode, struct buffer_head *bh) { struct ext4_dir_entry_tail *t; int blocksize = EXT4_BLOCK_SIZE(inode->i_sb); #ifdef PARANOID struct ext4_dir_entry *d, *top; d = (struct ext4_dir_entry *)bh->b_data; top = (struct ext4_dir_entry *)(bh->b_data + (blocksize - sizeof(struct ext4_dir_entry_tail))); while (d < top && ext4_rec_len_from_disk(d->rec_len, blocksize)) d = (struct ext4_dir_entry *)(((void *)d) + ext4_rec_len_from_disk(d->rec_len, blocksize)); if (d != top) return NULL; t = (struct ext4_dir_entry_tail *)d; #else t = EXT4_DIRENT_TAIL(bh->b_data, EXT4_BLOCK_SIZE(inode->i_sb)); #endif if (t->det_reserved_zero1 || (ext4_rec_len_from_disk(t->det_rec_len, blocksize) != sizeof(struct ext4_dir_entry_tail)) || t->det_reserved_zero2 || t->det_reserved_ft != EXT4_FT_DIR_CSUM) return NULL; return t; } static __le32 ext4_dirblock_csum(struct inode *inode, void *dirent, int size) { struct ext4_inode_info *ei = EXT4_I(inode); __u32 csum; csum = ext4_chksum(ei->i_csum_seed, (__u8 *)dirent, size); return cpu_to_le32(csum); } #define warn_no_space_for_csum(inode) \ __warn_no_space_for_csum((inode), __func__, __LINE__) static void __warn_no_space_for_csum(struct inode *inode, const char *func, unsigned int line) { __ext4_warning_inode(inode, func, line, "No space for directory leaf checksum. Please run e2fsck -D."); } int ext4_dirblock_csum_verify(struct inode *inode, struct buffer_head *bh) { struct ext4_dir_entry_tail *t; if (!ext4_has_feature_metadata_csum(inode->i_sb)) return 1; t = get_dirent_tail(inode, bh); if (!t) { warn_no_space_for_csum(inode); return 0; } if (t->det_checksum != ext4_dirblock_csum(inode, bh->b_data, (char *)t - bh->b_data)) return 0; return 1; } static void ext4_dirblock_csum_set(struct inode *inode, struct buffer_head *bh) { struct ext4_dir_entry_tail *t; if (!ext4_has_feature_metadata_csum(inode->i_sb)) return; t = get_dirent_tail(inode, bh); if (!t) { warn_no_space_for_csum(inode); return; } t->det_checksum = ext4_dirblock_csum(inode, bh->b_data, (char *)t - bh->b_data); } int ext4_handle_dirty_dirblock(handle_t *handle, struct inode *inode, struct buffer_head *bh) { ext4_dirblock_csum_set(inode, bh); return ext4_handle_dirty_metadata(handle, inode, bh); } static struct dx_countlimit *get_dx_countlimit(struct inode *inode, struct ext4_dir_entry *dirent, int *offset) { struct ext4_dir_entry *dp; struct dx_root_info *root; int count_offset; int blocksize = EXT4_BLOCK_SIZE(inode->i_sb); unsigned int rlen = ext4_rec_len_from_disk(dirent->rec_len, blocksize); if (rlen == blocksize) count_offset = 8; else if (rlen == 12) { dp = (struct ext4_dir_entry *)(((void *)dirent) + 12); if (ext4_rec_len_from_disk(dp->rec_len, blocksize) != blocksize - 12) return NULL; root = (struct dx_root_info *)(((void *)dp + 12)); if (root->reserved_zero || root->info_length != sizeof(struct dx_root_info)) return NULL; count_offset = 32; } else return NULL; if (offset) *offset = count_offset; return (struct dx_countlimit *)(((void *)dirent) + count_offset); } static __le32 ext4_dx_csum(struct inode *inode, struct ext4_dir_entry *dirent, int count_offset, int count, struct dx_tail *t) { struct ext4_inode_info *ei = EXT4_I(inode); __u32 csum; int size; __u32 dummy_csum = 0; int offset = offsetof(struct dx_tail, dt_checksum); size = count_offset + (count * sizeof(struct dx_entry)); csum = ext4_chksum(ei->i_csum_seed, (__u8 *)dirent, size); csum = ext4_chksum(csum, (__u8 *)t, offset); csum = ext4_chksum(csum, (__u8 *)&dummy_csum, sizeof(dummy_csum)); return cpu_to_le32(csum); } static int ext4_dx_csum_verify(struct inode *inode, struct ext4_dir_entry *dirent) { struct dx_countlimit *c; struct dx_tail *t; int count_offset, limit, count; if (!ext4_has_feature_metadata_csum(inode->i_sb)) return 1; c = get_dx_countlimit(inode, dirent, &count_offset); if (!c) { EXT4_ERROR_INODE(inode, "dir seems corrupt? Run e2fsck -D."); return 0; } limit = le16_to_cpu(c->limit); count = le16_to_cpu(c->count); if (count_offset + (limit * sizeof(struct dx_entry)) > EXT4_BLOCK_SIZE(inode->i_sb) - sizeof(struct dx_tail)) { warn_no_space_for_csum(inode); return 0; } t = (struct dx_tail *)(((struct dx_entry *)c) + limit); if (t->dt_checksum != ext4_dx_csum(inode, dirent, count_offset, count, t)) return 0; return 1; } static void ext4_dx_csum_set(struct inode *inode, struct ext4_dir_entry *dirent) { struct dx_countlimit *c; struct dx_tail *t; int count_offset, limit, count; if (!ext4_has_feature_metadata_csum(inode->i_sb)) return; c = get_dx_countlimit(inode, dirent, &count_offset); if (!c) { EXT4_ERROR_INODE(inode, "dir seems corrupt? Run e2fsck -D."); return; } limit = le16_to_cpu(c->limit); count = le16_to_cpu(c->count); if (count_offset + (limit * sizeof(struct dx_entry)) > EXT4_BLOCK_SIZE(inode->i_sb) - sizeof(struct dx_tail)) { warn_no_space_for_csum(inode); return; } t = (struct dx_tail *)(((struct dx_entry *)c) + limit); t->dt_checksum = ext4_dx_csum(inode, dirent, count_offset, count, t); } static inline int ext4_handle_dirty_dx_node(handle_t *handle, struct inode *inode, struct buffer_head *bh) { ext4_dx_csum_set(inode, (struct ext4_dir_entry *)bh->b_data); return ext4_handle_dirty_metadata(handle, inode, bh); } /* * p is at least 6 bytes before the end of page */ static inline struct ext4_dir_entry_2 * ext4_next_entry(struct ext4_dir_entry_2 *p, unsigned long blocksize) { return (struct ext4_dir_entry_2 *)((char *)p + ext4_rec_len_from_disk(p->rec_len, blocksize)); } /* * Future: use high four bits of block for coalesce-on-delete flags * Mask them off for now. */ static inline ext4_lblk_t dx_get_block(struct dx_entry *entry) { return le32_to_cpu(entry->block) & 0x0fffffff; } static inline void dx_set_block(struct dx_entry *entry, ext4_lblk_t value) { entry->block = cpu_to_le32(value); } static inline unsigned dx_get_hash(struct dx_entry *entry) { return le32_to_cpu(entry->hash); } static inline void dx_set_hash(struct dx_entry *entry, unsigned value) { entry->hash = cpu_to_le32(value); } static inline unsigned dx_get_count(struct dx_entry *entries) { return le16_to_cpu(((struct dx_countlimit *) entries)->count); } static inline unsigned dx_get_limit(struct dx_entry *entries) { return le16_to_cpu(((struct dx_countlimit *) entries)->limit); } static inline void dx_set_count(struct dx_entry *entries, unsigned value) { ((struct dx_countlimit *) entries)->count = cpu_to_le16(value); } static inline void dx_set_limit(struct dx_entry *entries, unsigned value) { ((struct dx_countlimit *) entries)->limit = cpu_to_le16(value); } static inline unsigned dx_root_limit(struct inode *dir, unsigned infosize) { unsigned int entry_space = dir->i_sb->s_blocksize - ext4_dir_rec_len(1, NULL) - ext4_dir_rec_len(2, NULL) - infosize; if (ext4_has_feature_metadata_csum(dir->i_sb)) entry_space -= sizeof(struct dx_tail); return entry_space / sizeof(struct dx_entry); } static inline unsigned dx_node_limit(struct inode *dir) { unsigned int entry_space = dir->i_sb->s_blocksize - ext4_dir_rec_len(0, dir); if (ext4_has_feature_metadata_csum(dir->i_sb)) entry_space -= sizeof(struct dx_tail); return entry_space / sizeof(struct dx_entry); } /* * Debug */ #ifdef DX_DEBUG static void dx_show_index(char * label, struct dx_entry *entries) { int i, n = dx_get_count (entries); printk(KERN_DEBUG "%s index", label); for (i = 0; i < n; i++) { printk(KERN_CONT " %x->%lu", i ? dx_get_hash(entries + i) : 0, (unsigned long)dx_get_block(entries + i)); } printk(KERN_CONT "\n"); } struct stats { unsigned names; unsigned space; unsigned bcount; }; static struct stats dx_show_leaf(struct inode *dir, struct dx_hash_info *hinfo, struct ext4_dir_entry_2 *de, int size, int show_names) { unsigned names = 0, space = 0; char *base = (char *) de; struct dx_hash_info h = *hinfo; printk("names: "); while ((char *) de < base + size) { if (de->inode) { if (show_names) { #ifdef CONFIG_FS_ENCRYPTION int len; char *name; struct fscrypt_str fname_crypto_str = FSTR_INIT(NULL, 0); int res = 0; name = de->name; len = de->name_len; if (!IS_ENCRYPTED(dir)) { /* Directory is not encrypted */ (void) ext4fs_dirhash(dir, de->name, de->name_len, &h); printk("%*.s:(U)%x.%u ", len, name, h.hash, (unsigned) ((char *) de - base)); } else { struct fscrypt_str de_name = FSTR_INIT(name, len); /* Directory is encrypted */ res = fscrypt_fname_alloc_buffer( len, &fname_crypto_str); if (res) printk(KERN_WARNING "Error " "allocating crypto " "buffer--skipping " "crypto\n"); res = fscrypt_fname_disk_to_usr(dir, 0, 0, &de_name, &fname_crypto_str); if (res) { printk(KERN_WARNING "Error " "converting filename " "from disk to usr" "\n"); name = "??"; len = 2; } else { name = fname_crypto_str.name; len = fname_crypto_str.len; } if (IS_CASEFOLDED(dir)) h.hash = EXT4_DIRENT_HASH(de); else (void) ext4fs_dirhash(dir, de->name, de->name_len, &h); printk("%*.s:(E)%x.%u ", len, name, h.hash, (unsigned) ((char *) de - base)); fscrypt_fname_free_buffer( &fname_crypto_str); } #else int len = de->name_len; char *name = de->name; (void) ext4fs_dirhash(dir, de->name, de->name_len, &h); printk("%*.s:%x.%u ", len, name, h.hash, (unsigned) ((char *) de - base)); #endif } space += ext4_dir_rec_len(de->name_len, dir); names++; } de = ext4_next_entry(de, size); } printk(KERN_CONT "(%i)\n", names); return (struct stats) { names, space, 1 }; } struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir, struct dx_entry *entries, int levels) { unsigned blocksize = dir->i_sb->s_blocksize; unsigned count = dx_get_count(entries), names = 0, space = 0, i; unsigned bcount = 0; struct buffer_head *bh; printk("%i indexed blocks...\n", count); for (i = 0; i < count; i++, entries++) { ext4_lblk_t block = dx_get_block(entries); ext4_lblk_t hash = i ? dx_get_hash(entries): 0; u32 range = i < count - 1? (dx_get_hash(entries + 1) - hash): ~hash; struct stats stats; printk("%s%3u:%03u hash %8x/%8x ",levels?"":" ", i, block, hash, range); bh = ext4_bread(NULL,dir, block, 0); if (!bh || IS_ERR(bh)) continue; stats = levels? dx_show_entries(hinfo, dir, ((struct dx_node *) bh->b_data)->entries, levels - 1): dx_show_leaf(dir, hinfo, (struct ext4_dir_entry_2 *) bh->b_data, blocksize, 0); names += stats.names; space += stats.space; bcount += stats.bcount; brelse(bh); } if (bcount) printk(KERN_DEBUG "%snames %u, fullness %u (%u%%)\n", levels ? "" : " ", names, space/bcount, (space/bcount)*100/blocksize); return (struct stats) { names, space, bcount}; } /* * Linear search cross check */ static inline void htree_rep_invariant_check(struct dx_entry *at, struct dx_entry *target, u32 hash, unsigned int n) { while (n--) { dxtrace(printk(KERN_CONT ",")); if (dx_get_hash(++at) > hash) { at--; break; } } ASSERT(at == target - 1); } #else /* DX_DEBUG */ static inline void htree_rep_invariant_check(struct dx_entry *at, struct dx_entry *target, u32 hash, unsigned int n) { } #endif /* DX_DEBUG */ /* * Probe for a directory leaf block to search. * * dx_probe can return ERR_BAD_DX_DIR, which means there was a format * error in the directory index, and the caller should fall back to * searching the directory normally. The callers of dx_probe **MUST** * check for this error code, and make sure it never gets reflected * back to userspace. */ static struct dx_frame * dx_probe(struct ext4_filename *fname, struct inode *dir, struct dx_hash_info *hinfo, struct dx_frame *frame_in) { unsigned count, indirect, level, i; struct dx_entry *at, *entries, *p, *q, *m; struct dx_root *root; struct dx_frame *frame = frame_in; struct dx_frame *ret_err = ERR_PTR(ERR_BAD_DX_DIR); u32 hash; ext4_lblk_t block; ext4_lblk_t blocks[EXT4_HTREE_LEVEL]; memset(frame_in, 0, EXT4_HTREE_LEVEL * sizeof(frame_in[0])); frame->bh = ext4_read_dirblock(dir, 0, INDEX); if (IS_ERR(frame->bh)) return (struct dx_frame *) frame->bh; root = (struct dx_root *) frame->bh->b_data; if (root->info.hash_version != DX_HASH_TEA && root->info.hash_version != DX_HASH_HALF_MD4 && root->info.hash_version != DX_HASH_LEGACY && root->info.hash_version != DX_HASH_SIPHASH) { ext4_warning_inode(dir, "Unrecognised inode hash code %u", root->info.hash_version); goto fail; } if (ext4_hash_in_dirent(dir)) { if (root->info.hash_version != DX_HASH_SIPHASH) { ext4_warning_inode(dir, "Hash in dirent, but hash is not SIPHASH"); goto fail; } } else { if (root->info.hash_version == DX_HASH_SIPHASH) { ext4_warning_inode(dir, "Hash code is SIPHASH, but hash not in dirent"); goto fail; } } if (fname) hinfo = &fname->hinfo; hinfo->hash_version = root->info.hash_version; if (hinfo->hash_version <= DX_HASH_TEA) hinfo->hash_version += EXT4_SB(dir->i_sb)->s_hash_unsigned; hinfo->seed = EXT4_SB(dir->i_sb)->s_hash_seed; /* hash is already computed for encrypted casefolded directory */ if (fname && fname_name(fname) && !(IS_ENCRYPTED(dir) && IS_CASEFOLDED(dir))) { int ret = ext4fs_dirhash(dir, fname_name(fname), fname_len(fname), hinfo); if (ret < 0) { ret_err = ERR_PTR(ret); goto fail; } } hash = hinfo->hash; if (root->info.unused_flags & 1) { ext4_warning_inode(dir, "Unimplemented hash flags: %#06x", root->info.unused_flags); goto fail; } indirect = root->info.indirect_levels; if (indirect >= ext4_dir_htree_level(dir->i_sb)) { ext4_warning(dir->i_sb, "Directory (ino: %lu) htree depth %#06x exceed" "supported value", dir->i_ino, ext4_dir_htree_level(dir->i_sb)); if (ext4_dir_htree_level(dir->i_sb) < EXT4_HTREE_LEVEL) { ext4_warning(dir->i_sb, "Enable large directory " "feature to access it"); } goto fail; } entries = (struct dx_entry *)(((char *)&root->info) + root->info.info_length); if (dx_get_limit(entries) != dx_root_limit(dir, root->info.info_length)) { ext4_warning_inode(dir, "dx entry: limit %u != root limit %u", dx_get_limit(entries), dx_root_limit(dir, root->info.info_length)); goto fail; } dxtrace(printk("Look up %x", hash)); level = 0; blocks[0] = 0; while (1) { count = dx_get_count(entries); if (!count || count > dx_get_limit(entries)) { ext4_warning_inode(dir, "dx entry: count %u beyond limit %u", count, dx_get_limit(entries)); goto fail; } p = entries + 1; q = entries + count - 1; while (p <= q) { m = p + (q - p) / 2; dxtrace(printk(KERN_CONT ".")); if (dx_get_hash(m) > hash) q = m - 1; else p = m + 1; } htree_rep_invariant_check(entries, p, hash, count - 1); at = p - 1; dxtrace(printk(KERN_CONT " %x->%u\n", at == entries ? 0 : dx_get_hash(at), dx_get_block(at))); frame->entries = entries; frame->at = at; block = dx_get_block(at); for (i = 0; i <= level; i++) { if (blocks[i] == block) { ext4_warning_inode(dir, "dx entry: tree cycle block %u points back to block %u", blocks[level], block); goto fail; } } if (++level > indirect) return frame; blocks[level] = block; frame++; frame->bh = ext4_read_dirblock(dir, block, INDEX); if (IS_ERR(frame->bh)) { ret_err = (struct dx_frame *) frame->bh; frame->bh = NULL; goto fail; } entries = ((struct dx_node *) frame->bh->b_data)->entries; if (dx_get_limit(entries) != dx_node_limit(dir)) { ext4_warning_inode(dir, "dx entry: limit %u != node limit %u", dx_get_limit(entries), dx_node_limit(dir)); goto fail; } } fail: while (frame >= frame_in) { brelse(frame->bh); frame--; } if (ret_err == ERR_PTR(ERR_BAD_DX_DIR)) ext4_warning_inode(dir, "Corrupt directory, running e2fsck is recommended"); return ret_err; } static void dx_release(struct dx_frame *frames) { struct dx_root_info *info; int i; unsigned int indirect_levels; if (frames[0].bh == NULL) return; info = &((struct dx_root *)frames[0].bh->b_data)->info; /* save local copy, "info" may be freed after brelse() */ indirect_levels = info->indirect_levels; for (i = 0; i <= indirect_levels; i++) { if (frames[i].bh == NULL) break; brelse(frames[i].bh); frames[i].bh = NULL; } } /* * This function increments the frame pointer to search the next leaf * block, and reads in the necessary intervening nodes if the search * should be necessary. Whether or not the search is necessary is * controlled by the hash parameter. If the hash value is even, then * the search is only continued if the next block starts with that * hash value. This is used if we are searching for a specific file. * * If the hash value is HASH_NB_ALWAYS, then always go to the next block. * * This function returns 1 if the caller should continue to search, * or 0 if it should not. If there is an error reading one of the * index blocks, it will a negative error code. * * If start_hash is non-null, it will be filled in with the starting * hash of the next page. */ static int ext4_htree_next_block(struct inode *dir, __u32 hash, struct dx_frame *frame, struct dx_frame *frames, __u32 *start_hash) { struct dx_frame *p; struct buffer_head *bh; int num_frames = 0; __u32 bhash; p = frame; /* * Find the next leaf page by incrementing the frame pointer. * If we run out of entries in the interior node, loop around and * increment pointer in the parent node. When we break out of * this loop, num_frames indicates the number of interior * nodes need to be read. */ while (1) { if (++(p->at) < p->entries + dx_get_count(p->entries)) break; if (p == frames) return 0; num_frames++; p--; } /* * If the hash is 1, then continue only if the next page has a * continuation hash of any value. This is used for readdir * handling. Otherwise, check to see if the hash matches the * desired continuation hash. If it doesn't, return since * there's no point to read in the successive index pages. */ bhash = dx_get_hash(p->at); if (start_hash) *start_hash = bhash; if ((hash & 1) == 0) { if ((bhash & ~1) != hash) return 0; } /* * If the hash is HASH_NB_ALWAYS, we always go to the next * block so no check is necessary */ while (num_frames--) { bh = ext4_read_dirblock(dir, dx_get_block(p->at), INDEX); if (IS_ERR(bh)) return PTR_ERR(bh); p++; brelse(p->bh); p->bh = bh; p->at = p->entries = ((struct dx_node *) bh->b_data)->entries; } return 1; } /* * This function fills a red-black tree with information from a * directory block. It returns the number directory entries loaded * into the tree. If there is an error it is returned in err. */ static int htree_dirblock_to_tree(struct file *dir_file, struct inode *dir, ext4_lblk_t block, struct dx_hash_info *hinfo, __u32 start_hash, __u32 start_minor_hash) { struct buffer_head *bh; struct ext4_dir_entry_2 *de, *top; int err = 0, count = 0; struct fscrypt_str fname_crypto_str = FSTR_INIT(NULL, 0), tmp_str; int csum = ext4_has_feature_metadata_csum(dir->i_sb); dxtrace(printk(KERN_INFO "In htree dirblock_to_tree: block %lu\n", (unsigned long)block)); bh = ext4_read_dirblock(dir, block, DIRENT_HTREE); if (IS_ERR(bh)) return PTR_ERR(bh); de = (struct ext4_dir_entry_2 *) bh->b_data; /* csum entries are not larger in the casefolded encrypted case */ top = (struct ext4_dir_entry_2 *) ((char *) de + dir->i_sb->s_blocksize - ext4_dir_rec_len(0, csum ? NULL : dir)); /* Check if the directory is encrypted */ if (IS_ENCRYPTED(dir)) { err = fscrypt_prepare_readdir(dir); if (err < 0) { brelse(bh); return err; } err = fscrypt_fname_alloc_buffer(EXT4_NAME_LEN, &fname_crypto_str); if (err < 0) { brelse(bh); return err; } } for (; de < top; de = ext4_next_entry(de, dir->i_sb->s_blocksize)) { if (ext4_check_dir_entry(dir, NULL, de, bh, bh->b_data, bh->b_size, EXT4_LBLK_TO_B(dir, block) + ((char *)de - bh->b_data))) { /* silently ignore the rest of the block */ break; } if (ext4_hash_in_dirent(dir)) { if (de->name_len && de->inode) { hinfo->hash = EXT4_DIRENT_HASH(de); hinfo->minor_hash = EXT4_DIRENT_MINOR_HASH(de); } else { hinfo->hash = 0; hinfo->minor_hash = 0; } } else { err = ext4fs_dirhash(dir, de->name, de->name_len, hinfo); if (err < 0) { count = err; goto errout; } } if ((hinfo->hash < start_hash) || ((hinfo->hash == start_hash) && (hinfo->minor_hash < start_minor_hash))) continue; if (de->inode == 0) continue; if (!IS_ENCRYPTED(dir)) { tmp_str.name = de->name; tmp_str.len = de->name_len; err = ext4_htree_store_dirent(dir_file, hinfo->hash, hinfo->minor_hash, de, &tmp_str); } else { int save_len = fname_crypto_str.len; struct fscrypt_str de_name = FSTR_INIT(de->name, de->name_len); /* Directory is encrypted */ err = fscrypt_fname_disk_to_usr(dir, hinfo->hash, hinfo->minor_hash, &de_name, &fname_crypto_str); if (err) { count = err; goto errout; } err = ext4_htree_store_dirent(dir_file, hinfo->hash, hinfo->minor_hash, de, &fname_crypto_str); fname_crypto_str.len = save_len; } if (err != 0) { count = err; goto errout; } count++; } errout: brelse(bh); fscrypt_fname_free_buffer(&fname_crypto_str); return count; } /* * This function fills a red-black tree with information from a * directory. We start scanning the directory in hash order, starting * at start_hash and start_minor_hash. * * This function returns the number of entries inserted into the tree, * or a negative error code. */ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash, __u32 start_minor_hash, __u32 *next_hash) { struct dx_hash_info hinfo; struct ext4_dir_entry_2 *de; struct dx_frame frames[EXT4_HTREE_LEVEL], *frame; struct inode *dir; ext4_lblk_t block; int count = 0; int ret, err; __u32 hashval; struct fscrypt_str tmp_str; dxtrace(printk(KERN_DEBUG "In htree_fill_tree, start hash: %x:%x\n", start_hash, start_minor_hash)); dir = file_inode(dir_file); if (!(ext4_test_inode_flag(dir, EXT4_INODE_INDEX))) { if (ext4_hash_in_dirent(dir)) hinfo.hash_version = DX_HASH_SIPHASH; else hinfo.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version; if (hinfo.hash_version <= DX_HASH_TEA) hinfo.hash_version += EXT4_SB(dir->i_sb)->s_hash_unsigned; hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed; if (ext4_has_inline_data(dir)) { int has_inline_data = 1; count = ext4_inlinedir_to_tree(dir_file, dir, 0, &hinfo, start_hash, start_minor_hash, &has_inline_data); if (has_inline_data) { *next_hash = ~0; return count; } } count = htree_dirblock_to_tree(dir_file, dir, 0, &hinfo, start_hash, start_minor_hash); *next_hash = ~0; return count; } hinfo.hash = start_hash; hinfo.minor_hash = 0; frame = dx_probe(NULL, dir, &hinfo, frames); if (IS_ERR(frame)) return PTR_ERR(frame); /* Add '.' and '..' from the htree header */ if (!start_hash && !start_minor_hash) { de = (struct ext4_dir_entry_2 *) frames[0].bh->b_data; tmp_str.name = de->name; tmp_str.len = de->name_len; err = ext4_htree_store_dirent(dir_file, 0, 0, de, &tmp_str); if (err != 0) goto errout; count++; } if (start_hash < 2 || (start_hash ==2 && start_minor_hash==0)) { de = (struct ext4_dir_entry_2 *) frames[0].bh->b_data; de = ext4_next_entry(de, dir->i_sb->s_blocksize); tmp_str.name = de->name; tmp_str.len = de->name_len; err = ext4_htree_store_dirent(dir_file, 2, 0, de, &tmp_str); if (err != 0) goto errout; count++; } while (1) { if (fatal_signal_pending(current)) { err = -ERESTARTSYS; goto errout; } cond_resched(); block = dx_get_block(frame->at); ret = htree_dirblock_to_tree(dir_file, dir, block, &hinfo, start_hash, start_minor_hash); if (ret < 0) { err = ret; goto errout; } count += ret; hashval = ~0; ret = ext4_htree_next_block(dir, HASH_NB_ALWAYS, frame, frames, &hashval); *next_hash = hashval; if (ret < 0) { err = ret; goto errout; } /* * Stop if: (a) there are no more entries, or * (b) we have inserted at least one entry and the * next hash value is not a continuation */ if ((ret == 0) || (count && ((hashval & 1) == 0))) break; } dx_release(frames); dxtrace(printk(KERN_DEBUG "Fill tree: returned %d entries, " "next hash: %x\n", count, *next_hash)); return count; errout: dx_release(frames); return (err); } static inline int search_dirblock(struct buffer_head *bh, struct inode *dir, struct ext4_filename *fname, unsigned int offset, struct ext4_dir_entry_2 **res_dir) { return ext4_search_dir(bh, bh->b_data, dir->i_sb->s_blocksize, dir, fname, offset, res_dir); } /* * Directory block splitting, compacting */ /* * Create map of hash values, offsets, and sizes, stored at end of block. * Returns number of entries mapped. */ static int dx_make_map(struct inode *dir, struct buffer_head *bh, struct dx_hash_info *hinfo, struct dx_map_entry *map_tail) { int count = 0; struct ext4_dir_entry_2 *de = (struct ext4_dir_entry_2 *)bh->b_data; unsigned int buflen = bh->b_size; char *base = bh->b_data; struct dx_hash_info h = *hinfo; int blocksize = EXT4_BLOCK_SIZE(dir->i_sb); if (ext4_has_feature_metadata_csum(dir->i_sb)) buflen -= sizeof(struct ext4_dir_entry_tail); while ((char *) de < base + buflen) { if (ext4_check_dir_entry(dir, NULL, de, bh, base, buflen, ((char *)de) - base)) return -EFSCORRUPTED; if (de->name_len && de->inode) { if (ext4_hash_in_dirent(dir)) h.hash = EXT4_DIRENT_HASH(de); else { int err = ext4fs_dirhash(dir, de->name, de->name_len, &h); if (err < 0) return err; } map_tail--; map_tail->hash = h.hash; map_tail->offs = ((char *) de - base)>>2; map_tail->size = ext4_rec_len_from_disk(de->rec_len, blocksize); count++; cond_resched(); } de = ext4_next_entry(de, blocksize); } return count; } /* Sort map by hash value */ static void dx_sort_map (struct dx_map_entry *map, unsigned count) { struct dx_map_entry *p, *q, *top = map + count - 1; int more; /* Combsort until bubble sort doesn't suck */ while (count > 2) { count = count*10/13; if (count - 9 < 2) /* 9, 10 -> 11 */ count = 11; for (p = top, q = p - count; q >= map; p--, q--) if (p->hash < q->hash) swap(*p, *q); } /* Garden variety bubble sort */ do { more = 0; q = top; while (q-- > map) { if (q[1].hash >= q[0].hash) continue; swap(*(q+1), *q); more = 1; } } while(more); } static void dx_insert_block(struct dx_frame *frame, u32 hash, ext4_lblk_t block) { struct dx_entry *entries = frame->entries; struct dx_entry *old = frame->at, *new = old + 1; int count = dx_get_count(entries); ASSERT(count < dx_get_limit(entries)); ASSERT(old < entries + count); memmove(new + 1, new, (char *)(entries + count) - (char *)(new)); dx_set_hash(new, hash); dx_set_block(new, block); dx_set_count(entries, count + 1); } #if IS_ENABLED(CONFIG_UNICODE) int ext4_fname_setup_ci_filename(struct inode *dir, const struct qstr *iname, struct ext4_filename *name) { struct qstr *cf_name = &name->cf_name; unsigned char *buf; struct dx_hash_info *hinfo = &name->hinfo; int len; if (!IS_CASEFOLDED(dir) || (IS_ENCRYPTED(dir) && !fscrypt_has_encryption_key(dir))) { cf_name->name = NULL; return 0; } buf = kmalloc(EXT4_NAME_LEN, GFP_NOFS); if (!buf) return -ENOMEM; len = utf8_casefold(dir->i_sb->s_encoding, iname, buf, EXT4_NAME_LEN); if (len <= 0) { kfree(buf); buf = NULL; } cf_name->name = buf; cf_name->len = (unsigned) len; if (!IS_ENCRYPTED(dir)) return 0; hinfo->hash_version = DX_HASH_SIPHASH; hinfo->seed = NULL; if (cf_name->name) return ext4fs_dirhash(dir, cf_name->name, cf_name->len, hinfo); else return ext4fs_dirhash(dir, iname->name, iname->len, hinfo); } #endif /* * Test whether a directory entry matches the filename being searched for. * * Return: %true if the directory entry matches, otherwise %false. */ static bool ext4_match(struct inode *parent, const struct ext4_filename *fname, struct ext4_dir_entry_2 *de) { struct fscrypt_name f; if (!de->inode) return false; f.usr_fname = fname->usr_fname; f.disk_name = fname->disk_name; #ifdef CONFIG_FS_ENCRYPTION f.crypto_buf = fname->crypto_buf; #endif #if IS_ENABLED(CONFIG_UNICODE) if (IS_CASEFOLDED(parent) && (!IS_ENCRYPTED(parent) || fscrypt_has_encryption_key(parent))) { /* * Just checking IS_ENCRYPTED(parent) below is not * sufficient to decide whether one can use the hash for * skipping the string comparison, because the key might * have been added right after * ext4_fname_setup_ci_filename(). In this case, a hash * mismatch will be a false negative. Therefore, make * sure cf_name was properly initialized before * considering the calculated hash. */ if (sb_no_casefold_compat_fallback(parent->i_sb) && IS_ENCRYPTED(parent) && fname->cf_name.name && (fname->hinfo.hash != EXT4_DIRENT_HASH(de) || fname->hinfo.minor_hash != EXT4_DIRENT_MINOR_HASH(de))) return false; /* * Treat comparison errors as not a match. The * only case where it happens is on a disk * corruption or ENOMEM. */ return generic_ci_match(parent, fname->usr_fname, &fname->cf_name, de->name, de->name_len) > 0; } #endif return fscrypt_match_name(&f, de->name, de->name_len); } /* * Returns 0 if not found, -EFSCORRUPTED on failure, and 1 on success */ int ext4_search_dir(struct buffer_head *bh, char *search_buf, int buf_size, struct inode *dir, struct ext4_filename *fname, unsigned int offset, struct ext4_dir_entry_2 **res_dir) { struct ext4_dir_entry_2 * de; char * dlimit; int de_len; de = (struct ext4_dir_entry_2 *)search_buf; dlimit = search_buf + buf_size; while ((char *) de < dlimit - EXT4_BASE_DIR_LEN) { /* this code is executed quadratically often */ /* do minimal checking `by hand' */ if (de->name + de->name_len <= dlimit && ext4_match(dir, fname, de)) { /* found a match - just to be sure, do * a full check */ if (ext4_check_dir_entry(dir, NULL, de, bh, search_buf, buf_size, offset)) return -EFSCORRUPTED; *res_dir = de; return 1; } /* prevent looping on a bad block */ de_len = ext4_rec_len_from_disk(de->rec_len, dir->i_sb->s_blocksize); if (de_len <= 0) return -EFSCORRUPTED; offset += de_len; de = (struct ext4_dir_entry_2 *) ((char *) de + de_len); } return 0; } static int is_dx_internal_node(struct inode *dir, ext4_lblk_t block, struct ext4_dir_entry *de) { struct super_block *sb = dir->i_sb; if (!is_dx(dir)) return 0; if (block == 0) return 1; if (de->inode == 0 && ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize) == sb->s_blocksize) return 1; return 0; } /* * __ext4_find_entry() * * finds an entry in the specified directory with the wanted name. It * returns the cache buffer in which the entry was found, and the entry * itself (as a parameter - res_dir). It does NOT read the inode of the * entry - you'll have to do that yourself if you want to. * * The returned buffer_head has ->b_count elevated. The caller is expected * to brelse() it when appropriate. */ static struct buffer_head *__ext4_find_entry(struct inode *dir, struct ext4_filename *fname, struct ext4_dir_entry_2 **res_dir, int *inlined) { struct super_block *sb; struct buffer_head *bh_use[NAMEI_RA_SIZE]; struct buffer_head *bh, *ret = NULL; ext4_lblk_t start, block; const u8 *name = fname->usr_fname->name; size_t ra_max = 0; /* Number of bh's in the readahead buffer, bh_use[] */ size_t ra_ptr = 0; /* Current index into readahead buffer */ ext4_lblk_t nblocks; int i, namelen, retval; *res_dir = NULL; sb = dir->i_sb; namelen = fname->usr_fname->len; if (namelen > EXT4_NAME_LEN) return NULL; if (ext4_has_inline_data(dir)) { int has_inline_data = 1; ret = ext4_find_inline_entry(dir, fname, res_dir, &has_inline_data); if (inlined) *inlined = has_inline_data; if (has_inline_data || IS_ERR(ret)) goto cleanup_and_exit; } if ((namelen <= 2) && (name[0] == '.') && (name[1] == '.' || name[1] == '\0')) { /* * "." or ".." will only be in the first block * NFS may look up ".."; "." should be handled by the VFS */ block = start = 0; nblocks = 1; goto restart; } if (is_dx(dir)) { ret = ext4_dx_find_entry(dir, fname, res_dir); /* * On success, or if the error was file not found, * return. Otherwise, fall back to doing a search the * old fashioned way. */ if (IS_ERR(ret) && PTR_ERR(ret) == ERR_BAD_DX_DIR) dxtrace(printk(KERN_DEBUG "ext4_find_entry: dx failed, " "falling back\n")); else if (!sb_no_casefold_compat_fallback(dir->i_sb) && *res_dir == NULL && IS_CASEFOLDED(dir)) dxtrace(printk(KERN_DEBUG "ext4_find_entry: casefold " "failed, falling back\n")); else goto cleanup_and_exit; ret = NULL; } nblocks = dir->i_size >> EXT4_BLOCK_SIZE_BITS(sb); if (!nblocks) { ret = NULL; goto cleanup_and_exit; } start = EXT4_I(dir)->i_dir_start_lookup; if (start >= nblocks) start = 0; block = start; restart: do { /* * We deal with the read-ahead logic here. */ cond_resched(); if (ra_ptr >= ra_max) { /* Refill the readahead buffer */ ra_ptr = 0; if (block < start) ra_max = start - block; else ra_max = nblocks - block; ra_max = min(ra_max, ARRAY_SIZE(bh_use)); retval = ext4_bread_batch(dir, block, ra_max, false /* wait */, bh_use); if (retval) { ret = ERR_PTR(retval); ra_max = 0; goto cleanup_and_exit; } } if ((bh = bh_use[ra_ptr++]) == NULL) goto next; wait_on_buffer(bh); if (!buffer_uptodate(bh)) { EXT4_ERROR_INODE_ERR(dir, EIO, "reading directory lblock %lu", (unsigned long) block); brelse(bh); ret = ERR_PTR(-EIO); goto cleanup_and_exit; } if (!buffer_verified(bh) && !is_dx_internal_node(dir, block, (struct ext4_dir_entry *)bh->b_data) && !ext4_dirblock_csum_verify(dir, bh)) { EXT4_ERROR_INODE_ERR(dir, EFSBADCRC, "checksumming directory " "block %lu", (unsigned long)block); brelse(bh); ret = ERR_PTR(-EFSBADCRC); goto cleanup_and_exit; } set_buffer_verified(bh); i = search_dirblock(bh, dir, fname, EXT4_LBLK_TO_B(dir, block), res_dir); if (i == 1) { EXT4_I(dir)->i_dir_start_lookup = block; ret = bh; goto cleanup_and_exit; } else { brelse(bh); if (i < 0) { ret = ERR_PTR(i); goto cleanup_and_exit; } } next: if (++block >= nblocks) block = 0; } while (block != start); /* * If the directory has grown while we were searching, then * search the last part of the directory before giving up. */ block = nblocks; nblocks = dir->i_size >> EXT4_BLOCK_SIZE_BITS(sb); if (block < nblocks) { start = 0; goto restart; } cleanup_and_exit: /* Clean up the read-ahead blocks */ for (; ra_ptr < ra_max; ra_ptr++) brelse(bh_use[ra_ptr]); return ret; } static struct buffer_head *ext4_find_entry(struct inode *dir, const struct qstr *d_name, struct ext4_dir_entry_2 **res_dir, int *inlined) { int err; struct ext4_filename fname; struct buffer_head *bh; err = ext4_fname_setup_filename(dir, d_name, 1, &fname); if (err == -ENOENT) return NULL; if (err) return ERR_PTR(err); bh = __ext4_find_entry(dir, &fname, res_dir, inlined); ext4_fname_free_filename(&fname); return bh; } static struct buffer_head *ext4_lookup_entry(struct inode *dir, struct dentry *dentry, struct ext4_dir_entry_2 **res_dir) { int err; struct ext4_filename fname; struct buffer_head *bh; err = ext4_fname_prepare_lookup(dir, dentry, &fname); if (err == -ENOENT) return NULL; if (err) return ERR_PTR(err); bh = __ext4_find_entry(dir, &fname, res_dir, NULL); ext4_fname_free_filename(&fname); return bh; } static struct buffer_head * ext4_dx_find_entry(struct inode *dir, struct ext4_filename *fname, struct ext4_dir_entry_2 **res_dir) { struct dx_frame frames[EXT4_HTREE_LEVEL], *frame; struct buffer_head *bh; ext4_lblk_t block; int retval; #ifdef CONFIG_FS_ENCRYPTION *res_dir = NULL; #endif frame = dx_probe(fname, dir, NULL, frames); if (IS_ERR(frame)) return ERR_CAST(frame); do { block = dx_get_block(frame->at); bh = ext4_read_dirblock(dir, block, DIRENT_HTREE); if (IS_ERR(bh)) goto errout; retval = search_dirblock(bh, dir, fname, EXT4_LBLK_TO_B(dir, block), res_dir); if (retval == 1) goto success; brelse(bh); if (retval < 0) { bh = ERR_PTR(ERR_BAD_DX_DIR); goto errout; } /* Check to see if we should continue to search */ retval = ext4_htree_next_block(dir, fname->hinfo.hash, frame, frames, NULL); if (retval < 0) { ext4_warning_inode(dir, "error %d reading directory index block", retval); bh = ERR_PTR(retval); goto errout; } } while (retval == 1); bh = NULL; errout: dxtrace(printk(KERN_DEBUG "%s not found\n", fname->usr_fname->name)); success: dx_release(frames); return bh; } static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { struct inode *inode; struct ext4_dir_entry_2 *de = NULL; struct buffer_head *bh; if (dentry->d_name.len > EXT4_NAME_LEN) return ERR_PTR(-ENAMETOOLONG); bh = ext4_lookup_entry(dir, dentry, &de); if (IS_ERR(bh)) return ERR_CAST(bh); inode = NULL; if (bh) { __u32 ino = le32_to_cpu(de->inode); brelse(bh); if (!ext4_valid_inum(dir->i_sb, ino)) { EXT4_ERROR_INODE(dir, "bad inode number: %u", ino); return ERR_PTR(-EFSCORRUPTED); } if (unlikely(ino == dir->i_ino)) { EXT4_ERROR_INODE(dir, "'%pd' linked to parent dir", dentry); return ERR_PTR(-EFSCORRUPTED); } inode = ext4_iget(dir->i_sb, ino, EXT4_IGET_NORMAL); if (inode == ERR_PTR(-ESTALE)) { EXT4_ERROR_INODE(dir, "deleted inode referenced: %u", ino); return ERR_PTR(-EFSCORRUPTED); } if (!IS_ERR(inode) && IS_ENCRYPTED(dir) && (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) && !fscrypt_has_permitted_context(dir, inode)) { ext4_warning(inode->i_sb, "Inconsistent encryption contexts: %lu/%lu", dir->i_ino, inode->i_ino); iput(inode); return ERR_PTR(-EPERM); } } if (IS_ENABLED(CONFIG_UNICODE) && !inode && IS_CASEFOLDED(dir)) { /* Eventually we want to call d_add_ci(dentry, NULL) * for negative dentries in the encoding case as * well. For now, prevent the negative dentry * from being cached. */ return NULL; } return d_splice_alias(inode, dentry); } struct dentry *ext4_get_parent(struct dentry *child) { __u32 ino; struct ext4_dir_entry_2 * de = NULL; struct buffer_head *bh; bh = ext4_find_entry(d_inode(child), &dotdot_name, &de, NULL); if (IS_ERR(bh)) return ERR_CAST(bh); if (!bh) return ERR_PTR(-ENOENT); ino = le32_to_cpu(de->inode); brelse(bh); if (!ext4_valid_inum(child->d_sb, ino)) { EXT4_ERROR_INODE(d_inode(child), "bad parent inode number: %u", ino); return ERR_PTR(-EFSCORRUPTED); } return d_obtain_alias(ext4_iget(child->d_sb, ino, EXT4_IGET_NORMAL)); } /* * Move count entries from end of map between two memory locations. * Returns pointer to last entry moved. */ static struct ext4_dir_entry_2 * dx_move_dirents(struct inode *dir, char *from, char *to, struct dx_map_entry *map, int count, unsigned blocksize) { unsigned rec_len = 0; while (count--) { struct ext4_dir_entry_2 *de = (struct ext4_dir_entry_2 *) (from + (map->offs<<2)); rec_len = ext4_dir_rec_len(de->name_len, dir); memcpy (to, de, rec_len); ((struct ext4_dir_entry_2 *) to)->rec_len = ext4_rec_len_to_disk(rec_len, blocksize); /* wipe dir_entry excluding the rec_len field */ de->inode = 0; memset(&de->name_len, 0, ext4_rec_len_from_disk(de->rec_len, blocksize) - offsetof(struct ext4_dir_entry_2, name_len)); map++; to += rec_len; } return (struct ext4_dir_entry_2 *) (to - rec_len); } /* * Compact each dir entry in the range to the minimal rec_len. * Returns pointer to last entry in range. */ static struct ext4_dir_entry_2 *dx_pack_dirents(struct inode *dir, char *base, unsigned int blocksize) { struct ext4_dir_entry_2 *next, *to, *prev, *de = (struct ext4_dir_entry_2 *) base; unsigned rec_len = 0; prev = to = de; while ((char*)de < base + blocksize) { next = ext4_next_entry(de, blocksize); if (de->inode && de->name_len) { rec_len = ext4_dir_rec_len(de->name_len, dir); if (de > to) memmove(to, de, rec_len); to->rec_len = ext4_rec_len_to_disk(rec_len, blocksize); prev = to; to = (struct ext4_dir_entry_2 *) (((char *) to) + rec_len); } de = next; } return prev; } /* * Split a full leaf block to make room for a new dir entry. * Allocate a new block, and move entries so that they are approx. equally full. * Returns pointer to de in block into which the new entry will be inserted. */ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir, struct buffer_head **bh,struct dx_frame *frame, struct dx_hash_info *hinfo) { unsigned blocksize = dir->i_sb->s_blocksize; unsigned continued; int count; struct buffer_head *bh2; ext4_lblk_t newblock; u32 hash2; struct dx_map_entry *map; char *data1 = (*bh)->b_data, *data2; unsigned split, move, size; struct ext4_dir_entry_2 *de = NULL, *de2; int csum_size = 0; int err = 0, i; if (ext4_has_feature_metadata_csum(dir->i_sb)) csum_size = sizeof(struct ext4_dir_entry_tail); bh2 = ext4_append(handle, dir, &newblock); if (IS_ERR(bh2)) { brelse(*bh); *bh = NULL; return ERR_CAST(bh2); } BUFFER_TRACE(*bh, "get_write_access"); err = ext4_journal_get_write_access(handle, dir->i_sb, *bh, EXT4_JTR_NONE); if (err) goto journal_error; BUFFER_TRACE(frame->bh, "get_write_access"); err = ext4_journal_get_write_access(handle, dir->i_sb, frame->bh, EXT4_JTR_NONE); if (err) goto journal_error; data2 = bh2->b_data; /* create map in the end of data2 block */ map = (struct dx_map_entry *) (data2 + blocksize); count = dx_make_map(dir, *bh, hinfo, map); if (count < 0) { err = count; goto journal_error; } map -= count; dx_sort_map(map, count); /* Ensure that neither split block is over half full */ size = 0; move = 0; for (i = count-1; i >= 0; i--) { /* is more than half of this entry in 2nd half of the block? */ if (size + map[i].size/2 > blocksize/2) break; size += map[i].size; move++; } /* * map index at which we will split * * If the sum of active entries didn't exceed half the block size, just |